[Concept,6/9] codman: Provide a DWARF analyser

Message ID 20251124134932.1991031-7-sjg@u-boot.org
State New
Headers
Series codman: Add a new source-code analysis tool |

Commit Message

Simon Glass Nov. 24, 2025, 1:49 p.m. UTC
  From: Simon Glass <simon.glass@canonical.com>

Add a way to do static preprocessor analysis using debug information
from compiled code. This reads the DWARF tables to determine which lines
produced code.

Co-developed-by: Claude <noreply@anthropic.com>
Signed-off-by: Simon Glass <simon.glass@canonical.com>
---

 tools/codman/dwarf.py | 200 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 200 insertions(+)
 create mode 100644 tools/codman/dwarf.py
  

Patch

diff --git a/tools/codman/dwarf.py b/tools/codman/dwarf.py
new file mode 100644
index 00000000000..adceac9d20a
--- /dev/null
+++ b/tools/codman/dwarf.py
@@ -0,0 +1,200 @@ 
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright 2025 Canonical Ltd
+#
+"""DWARF debug info-based line-level analysis for source code.
+
+This module provides functionality to analyse which lines in source files
+were compiled by extracting line information from DWARF debug data in
+object files.
+"""
+
+import multiprocessing
+import os
+import subprocess
+from collections import defaultdict
+
+from u_boot_pylib import tout
+from analyser import Analyser, FileResult
+
+
+def worker(args):
+    """Extract line numbers from DWARF debug info in an object file.
+
+    Uses readelf --debug-dump=decodedline to get the line table, then parses
+    section headers and line entries to determine which source lines were
+    compiled into the object. A header whose file cannot be found clears the
+    current file, so its rows are not credited to the previous file.
+
+    Args:
+        args (tuple): Tuple of (obj_path, build_dir, srcdir)
+
+    Returns:
+        tuple: (source_lines_dict, error_msg) where source_lines_dict is a
+            mapping of source file paths to sets of line numbers, and
+            error_msg is None on success or an error string on failure
+    """
+    obj_path, build_dir, srcdir = args
+    source_lines = defaultdict(set)
+
+    # Get the directory of the .o file relative to build_dir
+    rel_to_build = os.path.relpath(obj_path, build_dir)
+    obj_dir = os.path.dirname(rel_to_build)
+
+    # Use readelf to extract decoded line information
+    try:
+        result = subprocess.run(
+            ['readelf', '--debug-dump=decodedline', obj_path],
+            capture_output=True, text=True, check=False,
+            encoding='utf-8', errors='ignore')
+    except (OSError, subprocess.SubprocessError) as e:
+        error_msg = f'Failed to execute readelf on {obj_path}: {e}'
+        return (source_lines, error_msg)
+    if result.returncode != 0:
+        error_msg = (f'readelf failed on {obj_path} with return code '
+                     f'{result.returncode}\nstderr: {result.stderr}')
+        return (source_lines, error_msg)
+
+    # Parse the output
+    # Format is: Section header with full path, then data lines
+    current_file = None
+    for line in result.stdout.splitlines():
+        # Skip blank lines, banner/column-header lines and indented lines
+        if not line.strip() or line.startswith(
+                ('Contents of', 'File name', ' ')):
+            continue
+
+        # Look for section headers with full path (e.g., '/path/to/file.c:')
+        if line.endswith(':'):
+            header_path = line.rstrip(':')
+            # readelf announces the first file of a unit as 'CU: <path>:'
+            if header_path.startswith('CU: '):
+                header_path = header_path[len('CU: '):]
+            if os.path.isabs(header_path):
+                # Absolute path in DWARF
+                abs_path = os.path.realpath(header_path)
+            else:
+                # Relative path - try relative to srcdir and obj_dir
+                abs_path = os.path.realpath(
+                    os.path.join(srcdir, obj_dir, header_path))
+                if not os.path.exists(abs_path):
+                    abs_path = os.path.realpath(
+                        os.path.join(srcdir, header_path))
+
+            # Forget the previous file when this one cannot be resolved
+            current_file = abs_path if os.path.exists(abs_path) else None
+            continue
+
+        # Parse data lines - use current_file from section header
+        parts = line.split()
+        if current_file and len(parts) >= 2:
+            try:
+                line_num = int(parts[1])
+            except ValueError:  # not a number, e.g. '-'
+                continue
+            if line_num > 0:
+                source_lines[current_file].add(line_num)
+
+    return (source_lines, None)
+
+
+# pylint: disable=too-few-public-methods
+class DwarfAnalyser(Analyser):
+    """Analyser that uses DWARF debug info to determine active lines.
+
+    This analyser extracts line number information from DWARF debug data in
+    compiled object files to determine which source lines generated code.
+    """
+    def __init__(self, build_dir, srcdir, used_sources, keep_temps=False):
+        """Initialise the DWARF analyser.
+
+        Args:
+            build_dir (str): Build directory containing .o files
+            srcdir (str): Path to source root directory
+            used_sources (set): Set of source files that are compiled
+            keep_temps (bool): If True, keep temporary files for debugging
+        """
+        super().__init__(srcdir, keep_temps)
+        self.build_dir = build_dir
+        self.used_sources = used_sources
+
+    def extract_lines(self, jobs=None):
+        """Extract used line numbers from DWARF debug info in object files.
+
+        Args:
+            jobs (int): Number of parallel jobs (None = use all CPUs)
+
+        Returns:
+            dict: Mapping of source file paths to sets of line numbers that
+                generated code
+        """
+        # Find all .o files
+        obj_files = self.find_object_files(self.build_dir)
+
+        if not obj_files:
+            return defaultdict(set)
+
+        # Prepare arguments for parallel processing
+        args_list = [(obj_path, self.build_dir, self.srcdir)
+                     for obj_path in obj_files]
+
+        # Process in parallel
+        num_jobs = jobs if jobs else multiprocessing.cpu_count()
+        with multiprocessing.Pool(num_jobs) as pool:
+            results = pool.map(worker, args_list)
+
+        # Merge results from all workers and check for errors
+        source_lines = defaultdict(set)
+        errors = []
+        for result_dict, error_msg in results:
+            if error_msg:
+                errors.append(error_msg)
+            else:
+                for source_file, lines in result_dict.items():
+                    source_lines[source_file].update(lines)
+
+        # Report any errors
+        if errors:
+            for error in errors:
+                tout.error(error)
+            tout.fatal(f'readelf failed on {len(errors)} object file(s)')
+
+        return source_lines
+
+    def process(self, jobs=None):
+        """Perform line-level analysis using DWARF debug info.
+
+        Args:
+            jobs (int): Number of parallel jobs (None = use all CPUs)
+
+        Returns:
+            dict: Mapping of source file paths to FileResult named tuples
+        """
+        tout.progress('Extracting DWARF line information...')
+        dwarf_line_map = self.extract_lines(jobs)
+
+        file_results = {}
+        for source_file in self.used_sources:
+            abs_path = os.path.realpath(source_file)
+            used_lines = dwarf_line_map.get(abs_path, set())
+
+            # Count total lines in the file
+            total_lines = self.count_lines(abs_path)
+            # Ignore line numbers past the end of the file, so the counts
+            # always agree with line_status and never go negative
+            active_lines = sum(1 <= i <= total_lines for i in used_lines)
+            inactive_lines = total_lines - active_lines
+
+            # Create line status dict
+            line_status = {i: 'active' if i in used_lines else 'inactive'
+                           for i in range(1, total_lines + 1)}
+
+            file_results[abs_path] = FileResult(
+                total_lines=total_lines,
+                active_lines=active_lines,
+                inactive_lines=inactive_lines,
+                line_status=line_status
+            )
+
+        tout.info(f'Analysed {len(file_results)} files using DWARF debug info')
+        return file_results