首页 > 脚本专栏 > python > Python提取C++代码依赖

Python代码实现C++代码依赖提取工具

2026-03-12 08:40:38 作者：weixin_30777913

这篇文章主要为大家详细介绍了如何使用Python代码实现一个C++代码依赖提取工具,文中的示例代码讲解详细,感兴趣的小伙伴可以跟随小编一起学习一下

编写一个Python程序：输入C/C++源代码所在的目录和目标函数名，解析该目录下所有的C/C++源代码，找到目标函数用到的宏、变量和函数的具体实现代码，以及这些实现进一步依赖的代码，按依赖顺序写入一个standalone.cpp文件里。

下面是一个使用 libclang 提取 C/C++ 函数及其依赖并生成 standalone.cpp 的 Python 程序框架。由于完整实现非常复杂，这里给出了核心思路和关键代码，实际使用时需要根据项目情况调整。

import argparse
import os
import sys
from collections import defaultdict, deque

from clang.cindex import (
    Config,
    CursorKind,
    Index,
    TokenKind,
    TranslationUnit,
    TranslationUnitLoadError,
)

# 如果 libclang 不在默认路径，需要设置 Config.set_library_path()
# Config.set_library_path('/usr/lib/llvm-10/lib')

class DependencyExtractor:
    def __init__(self, root_dir, target_func):
        """Set up an extractor for one project tree and one target function.

        Args:
            root_dir: Directory whose C/C++ sources are scanned; it is stored
                as an absolute path.
            target_func: Name of the function whose dependency closure is
                to be extracted.
        """
        # Bookkeeping that is filled in while files are parsed.
        self.file_content_cache = {}           # file path -> list of source lines
        self.referenced_by = defaultdict(set)  # symbol name -> names that depend on it
        self.dependencies = defaultdict(set)   # symbol name -> names it depends on
        self.symbols = {}                      # symbol name -> info dict (type, file, lines, code, cursor)

        # Inputs, plus the libclang index shared by every parse.
        self.index = Index.create()
        self.target_func = target_func
        self.root_dir = os.path.abspath(root_dir)

    def parse_files(self):
        """遍历目录下所有 .c/.cpp/.h/.hpp 文件，解析并收集符号定义和引用关系"""
        for root, _, files in os.walk(self.root_dir):
            for file in files:
                if file.endswith(('.c', '.cpp', '.h', '.hpp')):
                    full_path = os.path.join(root, file)
                    self.parse_file(full_path)

    def parse_file(self, filepath):
        """Parse one file and merge its symbols and dependency edges.

        Args:
            filepath: Path of the source or header file to parse.

        A file that libclang cannot load is reported on stderr and skipped,
        so one broken file does not abort the whole scan.
        """
        try:
            tu = self.index.parse(
                filepath,
                # Every file is parsed as C++11; adjust to the project's real flags.
                args=['-x', 'c++', '-std=c++11'],
                # Macro definitions are only exposed as cursors when libclang
                # keeps a detailed preprocessing record; without this option
                # _collect_macros has nothing to find.
                options=TranslationUnit.PARSE_DETAILED_PROCESSING_RECORD,
            )
        except TranslationUnitLoadError as exc:
            print(f"警告：无法解析 {filepath}：{exc}", file=sys.stderr)
            return
        if not tu:
            return

        # Functions, global variables and type definitions come from the AST.
        self._visit_children(tu.cursor, filepath)

        # Macro definitions are collected in a separate pass.
        self._collect_macros(tu, filepath)

    def _visit_children(self, cursor, filepath):
        """Walk the AST below ``cursor`` and record definitions and references.

        Function definitions, global variable definitions and struct / enum /
        typedef / class definitions are added to the symbol table; for every
        function definition the references made inside it are collected too.

        Args:
            cursor: Root cursor of the subtree to walk (normally ``tu.cursor``).
            filepath: Main file of the translation unit; used as the owning
                file for nodes that carry no file of their own.

        The walk is iterative (explicit stack, pre-order) so deeply nested
        ASTs cannot exhaust Python's recursion limit. Nodes located in files
        outside ``self.root_dir`` are pruned together with their subtrees:
        they come from system or third-party headers and are not part of the
        project being extracted.
        """
        project_prefix = os.path.join(self.root_dir, '')
        type_kinds = (CursorKind.STRUCT_DECL, CursorKind.ENUM_DECL,
                      CursorKind.TYPEDEF_DECL, CursorKind.CLASS_DECL)

        pending = [cursor]
        while pending:
            node = pending.pop()

            node_file = node.location.file
            if node_file is None:
                # e.g. the translation-unit cursor itself
                origin = filepath
            else:
                origin = os.path.abspath(node_file.name)
                if not origin.startswith(project_prefix):
                    continue

            if node.is_definition():
                kind = node.kind
                if kind == CursorKind.FUNCTION_DECL:
                    # Pass the file the node really lives in, so a definition
                    # reached through #include is cut from its header rather
                    # than from the including source file.
                    self._add_symbol(node, 'function', origin)
                    self._collect_references(node)
                elif kind == CursorKind.VAR_DECL:
                    # Locals are part of their function's snippet already.
                    if self._is_global(node):
                        self._add_symbol(node, 'variable', origin)
                elif kind in type_kinds:
                    self._add_symbol(node, 'type', origin)
                # Macro definitions are handled by _collect_macros.

            # Reversed so that popping from the end visits children in
            # source order (first definition wins in the symbol table).
            pending.extend(reversed(list(node.get_children())))

    def _is_global(self, cursor):
        """判断变量是否为全局（没有父函数）"""
        parent = cursor.semantic_parent
        while parent:
            if parent.kind in (CursorKind.FUNCTION_DECL, CursorKind.CXX_METHOD):
                return False
            parent = parent.semantic_parent
        return True

    def _add_symbol(self, cursor, sym_type, filepath):
        """添加符号到符号表，记录其代码范围"""
        name = cursor.spelling
        if not name:
            return
        # 如果已存在，保留最早的定义（简单去重）
        if name in self.symbols:
            return
        start = cursor.extent.start
        end = cursor.extent.end
        # 提取代码片段
        code = self._extract_code(filepath, start.line, end.line)
        self.symbols[name] = {
            'type': sym_type,
            'file': filepath,
            'start_line': start.line,
            'end_line': end.line,
            'code': code,
            'cursor': cursor
        }

    def _extract_code(self, filepath, start_line, end_line):
        """从文件中提取指定行范围的代码"""
        if filepath not in self.file_content_cache:
            try:
                with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                    self.file_content_cache[filepath] = f.readlines()
            except Exception:
                return ""
        lines = self.file_content_cache[filepath]
        # 行号从1开始，转换为索引
        start_idx = start_line - 1
        end_idx = end_line - 1
        return ''.join(lines[start_idx:end_idx+1])

    def _collect_references(self, func_cursor):
        """Record which functions, global variables and types a function uses.

        Every node below ``func_cursor`` is inspected and, for each use found,
        an edge is added to ``self.dependencies`` (function -> used name) and
        to ``self.referenced_by`` (used name -> function).

        Args:
            func_cursor: Cursor of the function definition to analyse.

        Recognised uses:
          * call expressions that resolve to a function declaration;
          * expressions naming a global variable;
          * expressions naming a function without calling it (function
            pointers, callbacks);
          * explicit type references anywhere in the function, which also
            covers pointer/reference declarations, casts and the return type;
          * the declared type of local variables and parameters.

        Names are matched by spelling only, so overloads and same-named
        entities in different scopes are not told apart. The walk uses an
        explicit stack so deep expression trees cannot hit the recursion limit.
        """
        func_name = func_cursor.spelling
        type_kinds = (CursorKind.STRUCT_DECL, CursorKind.ENUM_DECL,
                      CursorKind.TYPEDEF_DECL, CursorKind.CLASS_DECL)

        def link(dep_name):
            # A symbol never depends on itself: a self edge (recursion) would
            # look like a cycle to the topological sort.
            if dep_name and dep_name != func_name:
                self.dependencies[func_name].add(dep_name)
                self.referenced_by[dep_name].add(func_name)

        pending = list(func_cursor.get_children())
        while pending:
            node = pending.pop()
            kind = node.kind

            if kind == CursorKind.CALL_EXPR:
                target = node.referenced
                if target and target.kind == CursorKind.FUNCTION_DECL:
                    link(target.spelling)
            elif kind == CursorKind.DECL_REF_EXPR:
                target = node.referenced
                if target:
                    if target.kind == CursorKind.VAR_DECL:
                        if self._is_global(target):
                            link(target.spelling)
                    elif target.kind == CursorKind.FUNCTION_DECL:
                        # Function named without being called (callback).
                        link(target.spelling)
            elif kind == CursorKind.TYPE_REF:
                # Explicit mention of a type; unlike asking a declaration for
                # its type, this also works when the type is only used
                # through a pointer or reference.
                target = node.referenced
                if target and target.kind in type_kinds:
                    link(target.spelling)
            elif kind in (CursorKind.VAR_DECL, CursorKind.PARM_DECL):
                type_decl = node.type.get_declaration()
                if type_decl and type_decl.kind in type_kinds:
                    link(type_decl.spelling)

            pending.extend(node.get_children())

    def _collect_macros(self, tu, filepath):
        """Record the project's macro definitions in the symbol table.

        Args:
            tu: Parsed translation unit whose top-level cursors are scanned
                for macro definitions.
            filepath: Main file of the translation unit. Kept for interface
                compatibility; each macro is read from the file it is
                actually defined in.

        Macro-definition cursors only show up when the translation unit was
        parsed with a detailed preprocessing record. Skipped are: macros
        without a file (predefined or command-line macros), macros defined
        outside ``self.root_dir`` (system headers), and names that are already
        in the symbol table (first definition wins, as for other symbols).
        """
        project_prefix = os.path.join(self.root_dir, '')
        for cursor in tu.cursor.get_children():
            if cursor.kind != CursorKind.MACRO_DEFINITION:
                continue

            name = cursor.spelling
            macro_file = cursor.location.file
            if not name or macro_file is None:
                continue

            macro_path = os.path.abspath(macro_file.name)
            if not macro_path.startswith(project_prefix):
                continue
            if name in self.symbols:
                continue

            extent = cursor.extent
            first_line = extent.start.line
            last_line = extent.end.line
            self.symbols[name] = {
                'type': 'macro',
                'file': macro_path,
                'start_line': first_line,
                'end_line': last_line,
                'code': self._extract_code(macro_path, first_line, last_line),
                'cursor': cursor,
            }

    def resolve_dependencies(self):
        """从目标函数出发，收集所有依赖的符号（广度优先）"""
        if self.target_func not in self.symbols:
            print(f"错误：未找到函数 {self.target_func} 的定义")
            sys.exit(1)

        visited = set()
        queue = deque([self.target_func])
        while queue:
            sym = queue.popleft()
            if sym in visited:
                continue
            visited.add(sym)
            # 获取该符号的依赖
            deps = self.dependencies.get(sym, set())
            for dep in deps:
                if dep not in visited:
                    queue.append(dep)
        return visited

    def topological_sort(self, symbols):
        """对符号集合进行拓扑排序（被依赖的先输出）"""
        # 构建子图
        graph = {s: set() for s in symbols}
        for s in symbols:
            for dep in self.dependencies.get(s, set()):
                if dep in symbols:
                    graph[s].add(dep)

        # 计算入度
        in_degree = {s: 0 for s in symbols}
        for s in symbols:
            for dep in graph[s]:
                in_degree[dep] += 1

        # Kahn 算法
        zero_in = deque([s for s in symbols if in_degree[s] == 0])
        topo = []
        while zero_in:
            node = zero_in.popleft()
            topo.append(node)
            for dep in graph[node]:
                in_degree[dep] -= 1
                if in_degree[dep] == 0:
                    zero_in.append(dep)

        if len(topo) != len(symbols):
            print("警告：存在循环依赖，将按任意顺序输出")
            # 简单返回原集合
            return list(symbols)
        return topo

    def write_standalone(self, output_file='standalone.cpp'):
        """生成 standalone.cpp 文件"""
        # 获取依赖的所有符号
        symbols_to_include = self.resolve_dependencies()
        # 按类型分组，以便将类型定义放在前面
        type_symbols = [s for s in symbols_to_include if self.symbols[s]['type'] == 'type']
        var_symbols = [s for s in symbols_to_include if self.symbols[s]['type'] == 'variable']
        func_symbols = [s for s in symbols_to_include if self.symbols[s]['type'] == 'function']
        macro_symbols = [s for s in symbols_to_include if self.symbols[s]['type'] == 'macro']

        # 拓扑排序：类型和变量之间可能有依赖，但简单起见，我们先输出类型，再输出变量，再输出函数
        # 更精确的做法是对所有符号一起拓扑排序
        all_symbols = list(symbols_to_include)
        topo_order = self.topological_sort(all_symbols)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write("// Standalone generated file\n")
            f.write("// Include necessary standard headers (you may need to add more)\n")
            f.write("#include <stdio.h>\n")
            f.write("#include <stdlib.h>\n")
            f.write("#include <string.h>\n\n")

            # 按照拓扑顺序输出代码片段
            for sym in topo_order:
                info = self.symbols[sym]
                f.write(f"// {info['type']}: {sym} (from {info['file']}:{info['start_line']})\n")
                f.write(info['code'])
                f.write("\n\n")

        print(f"成功生成 {output_file}，包含 {len(topo_order)} 个符号。")

def main():
    """Command-line entry point: parse a project tree and write standalone.cpp.

    Takes two positional arguments, the source root directory and the name of
    the target function.
    """
    cli = argparse.ArgumentParser(description='提取 C/C++ 函数及其依赖，生成 standalone.cpp')
    positional = (
        ('directory', '源代码根目录'),
        ('function', '目标函数名'),
    )
    for arg_name, arg_help in positional:
        cli.add_argument(arg_name, help=arg_help)
    opts = cli.parse_args()

    # Scan the tree first, then emit the closure of the target function.
    tool = DependencyExtractor(opts.directory, opts.function)
    tool.parse_files()
    tool.write_standalone()

if __name__ == '__main__':
    main()

使用说明

1.安装依赖：

安装 libclang（例如通过 apt install libclang-10-dev）和 Python 绑定 clang（pip install clang）。
如果 libclang 不在默认路径，需要在代码开头设置 Config.set_library_path()。

2.运行：

python extract.py /path/to/project my_function

3.输出：生成 standalone.cpp，包含目标函数及其依赖的所有函数、全局变量、类型定义和宏定义，按拓扑顺序排列（被依赖的在前）。

注意事项

此实现为概念验证，实际使用需处理更多边界情况（如函数重载、命名空间、类成员函数、模板等）。
宏定义的收集依赖预处理令牌，当前代码未完整实现，可能需要改用其他方法（如遍历预处理实体）。
依赖分析仅包含直接的函数调用、全局变量引用和类型引用，未考虑通过指针、回调等间接调用。
代码提取以整行为单位（行范围取自 libclang 给出的起止位置），可能不准确：例如同一行上还有其他声明，或定义从某一行的中间开始时，会把无关代码一并带出；跨多行的宏定义和大括号嵌套较深的函数体也建议在生成后人工核对。
对于标准库函数（如 printf），不会提取其实现；生成的文件开头只固定写入了 <stdio.h>、<stdlib.h>、<string.h> 三条包含语句，用到其他头文件时需要手动补充。

如果需要更完善的解决方案，可以考虑使用成熟的工具如 gcc -E 预处理后分析，或基于 clang 的 ASTMatcher。

到此，这篇关于Python代码实现C++代码依赖提取工具的文章就介绍到这了。更多相关Python提取C++代码依赖的内容，请搜索脚本之家以前的文章或继续浏览下面的相关文章，希望大家以后多多支持脚本之家！

Python代码实现C++代码依赖提取工具

您可能感兴趣的文章: