[Concept,19/20] buildman: Wire up distributed builds with WorkerPool

Message ID 20260316154733.1587261-20-sjg@u-boot.org
State New
Headers
Series buildman: Add distributed builds |

Commit Message

Simon Glass March 16, 2026, 3:47 p.m. UTC
  From: Simon Glass <sjg@chromium.org>

Add the plumbing to connect the boss and worker modules to the build
flow via control.py and cmdline.py.

Add --distribute (--dist) flag to activate distributed builds and
--use-machines to select a subset of configured machines (implies
--dist). Add --no-local to skip local building entirely, sending all
boards to remote workers. Add --kill-workers to clean up stale worker
processes on remote machines.

In control.py, _setup_remote_builds() probes machines, collects
toolchains, resolves toolchain aliases, checks gcc versions across
machines, splits boards between local and remote, and starts the
worker pool. Build settings (verbose, no_lto, allow_missing, etc.)
are forwarded to remote workers via the configure command.

Show the machine name in the build-progress line for remote results
(other results show the builder's progress string if set, otherwise
'[local]') and in verbose output when a remote result has an error or
warning.

Signed-off-by: Simon Glass <sjg@chromium.org>
---

 tools/buildman/builder.py       |  12 +-
 tools/buildman/builderthread.py |   2 +
 tools/buildman/cmdline.py       |  16 ++
 tools/buildman/control.py       | 297 ++++++++++++++++++++++++++++++--
 4 files changed, 315 insertions(+), 12 deletions(-)
  

Patch

diff --git a/tools/buildman/builder.py b/tools/buildman/builder.py
index 3264978a616..b3cc136d036 100644
--- a/tools/buildman/builder.py
+++ b/tools/buildman/builder.py
@@ -374,6 +374,7 @@  class Builder:
         self.count = 0
         self.timestamps = collections.deque()
         self.verbose = False
+        self.progress = ''
 
         # Note: baseline state for result summaries is now in ResultHandler
 
@@ -591,6 +592,9 @@  class Builder:
                     sys.stderr.write(result.stderr)
             elif self.verbose:
                 terminal.print_clear()
+                machine = result.remote
+                if machine and (result.return_code or result.stderr):
+                    tprint(f'[{machine}]')
                 boards_selected = {target : result.brd}
                 self._result_handler.reset_result_summary(boards_selected)
                 self._result_handler.produce_result_summary(
@@ -616,7 +620,13 @@  class Builder:
         if self._complete_delay:
             line += f'{self._complete_delay}  : '
 
-        line += target
+        machine = result.remote if result else None
+        if machine:
+            line += f'{target} [{machine}]'
+        elif self.progress:
+            line += f'{target} [{self.progress}]'
+        else:
+            line += f'{target} [local]'
         if not self._opts.ide:
             terminal.print_clear()
             tprint(line, newline=False, limit_to_line=True)
diff --git a/tools/buildman/builderthread.py b/tools/buildman/builderthread.py
index 6f4f257dedb..3f487ba57c7 100644
--- a/tools/buildman/builderthread.py
+++ b/tools/buildman/builderthread.py
@@ -434,6 +434,7 @@  class BuilderThread(threading.Thread):
                     - result.stderr set to 'bad' if stderr output was recorded
         """
         result = command.CommandResult()
+        result.remote = None
         done_file = self.builder.get_done_file(commit_upto, brd.target)
         result.already_done = os.path.exists(done_file)
         result.kconfig_reconfig = False
@@ -728,6 +729,7 @@  class BuilderThread(threading.Thread):
                 req, commit_upto, do_config, mrproper, config_only,
                 out_dir, out_rel_dir, result)
 
+        result.remote = None
         result.toolchain = self.toolchain
         result.brd = req.brd
         result.commit_upto = commit_upto
diff --git a/tools/buildman/cmdline.py b/tools/buildman/cmdline.py
index 5f3c47bf7fe..5396ee640fa 100644
--- a/tools/buildman/cmdline.py
+++ b/tools/buildman/cmdline.py
@@ -105,6 +105,18 @@  def add_upto_m(parser):
     parser.add_argument(
           '-M', '--allow-missing', action='store_true', default=False,
           help='Tell binman to allow missing blobs and generate fake ones as needed')
+    parser.add_argument('--dist', '--distribute', action='store_true',
+          dest='distribute',
+          default=False,
+          help='Distribute builds to remote machines from [machines] config')
+    parser.add_argument('--use-machines', type=str, default=None,
+          dest='use_machines',
+          help='Comma-separated list of machine names to use for '
+               'distributed builds (default: all from [machines] config)')
+    parser.add_argument('--no-local', action='store_true', default=False,
+          dest='no_local',
+          help='Do not build on the local machine; send all boards to '
+               'remote workers (requires --dist)')
     parser.add_argument('--mach', '--machines', action='store_true',
           default=False, dest='machines',
           help='Probe all remote machines from [machines] config and show '
@@ -119,6 +131,10 @@  def add_upto_m(parser):
     parser.add_argument(
           '--maintainer-check', action='store_true',
           help='Check that maintainer entries exist for each board')
+    parser.add_argument('--kill-workers', action='store_true', default=False,
+          dest='kill_workers',
+          help='Kill stale worker processes and remove lock files on all '
+               'remote machines, then exit')
     parser.add_argument('--worker', action='store_true', default=False,
           help='Run in worker mode, accepting build commands on stdin '
                '(used internally for distributed builds)')
diff --git a/tools/buildman/control.py b/tools/buildman/control.py
index 082db377293..bb866910491 100644
--- a/tools/buildman/control.py
+++ b/tools/buildman/control.py
@@ -10,9 +10,11 @@  This holds the main control logic for buildman, when not running tests.
 import getpass
 import multiprocessing
 import os
+import signal
 import shutil
 import sys
 import tempfile
+import threading
 import time
 
 from buildman import boards
@@ -67,7 +69,8 @@  def count_build_commits(commits, step):
     return 0
 
 
-def get_action_summary(is_summary, commit_count, selected, threads, jobs):
+def get_action_summary(is_summary, commit_count, selected, threads, jobs,
+                       no_local=False):
     """Return a string summarising the intended action.
 
     Args:
@@ -76,6 +79,7 @@  def get_action_summary(is_summary, commit_count, selected, threads, jobs):
         selected (list of Board): List of Board objects that are marked
         threads (int): Number of processor threads being used
         jobs (int): Number of jobs to build at once
+        no_local (bool): True if all builds are remote (no local threads)
 
     Returns:
         str: Summary string
@@ -86,8 +90,9 @@  def get_action_summary(is_summary, commit_count, selected, threads, jobs):
         commit_str = 'current source'
     msg = (f"{'Summary of' if is_summary else 'Building'} "
            f'{commit_str} for {len(selected)} boards')
-    msg += (f' ({threads} thread{get_plural(threads)}, '
-            f'{jobs} job{get_plural(jobs)} per thread)')
+    if not no_local:
+        msg += (f' ({threads} thread{get_plural(threads)}, '
+                f'{jobs} job{get_plural(jobs)} per thread)')
     return msg
 
 # pylint: disable=R0913,R0917
@@ -377,7 +382,8 @@  def get_toolchains(toolchains, col, override_toolchain, fetch_arch,
 
     if no_toolchains:
         toolchains.get_settings()
-        toolchains.scan(list_tool_chains and verbose)
+        toolchains.scan(list_tool_chains and verbose,
+                        raise_on_error=not list_tool_chains)
     if list_tool_chains:
         toolchains.list()
         print()
@@ -538,12 +544,240 @@  def setup_output_dir(output_dir, work_in_output, branch, no_subdirs, col,
     return output_dir
 
 
+def _filter_mismatched_toolchains(machines, local_toolchains):
+    """Remove remote toolchains whose gcc version differs from local
+
+    Compares the gcc version directory (e.g. gcc-13.1.0-nolibc) in
+    each toolchain path. If a remote machine has a different version
+    for an architecture, that architecture is removed from the
+    machine's toolchain list so no boards are sent to it for that arch.
+
+    Args:
+        machines (list of Machine): Remote machines with toolchains
+        local_toolchains (dict): arch -> gcc path on the local machine
+    """
+    local_versions = {}
+    for arch, gcc in local_toolchains.items():
+        ver = machine.gcc_version(gcc)
+        if ver:
+            local_versions[arch] = ver
+
+    for mach in machines:
+        mismatched = []
+        for arch, gcc in mach.toolchains.items():
+            local_ver = local_versions.get(arch)
+            if not local_ver:
+                continue
+            remote_ver = machine.gcc_version(gcc)
+            if remote_ver and remote_ver != local_ver:
+                mismatched.append(arch)
+        for arch in mismatched:
+            del mach.toolchains[arch]
+
+
+def _collect_worker_settings(args):
+    """Collect build settings to send to remote workers
+
+    Gathers the command-line flags that affect how make is invoked and
+    returns them as a dict for the worker's 'configure' command.
+
+    Args:
+        args (Namespace): Command-line arguments
+
+    Returns:
+        dict: Settings dict (only includes flags that are not None)
+    """
+    settings = {}
+    flag_names = [
+        'verbose_build', 'allow_missing', 'no_lto',
+        'reproducible_builds', 'warnings_as_errors',
+        'mrproper', 'fallback_mrproper', 'config_only',
+        'force_build', 'kconfig_check',
+    ]
+    for name in flag_names:
+        val = getattr(args, name, None)
+        if val is not None:
+            settings[name] = val
+    return settings
+
+
+def _setup_remote_builds(board_selected, args, git_dir):
+    """Set up remote workers if machines are configured
+
+    Probes machines, checks toolchains and splits boards into local
+    and remote sets. Returns a WorkerPool for the remote boards.
+
+    Args:
+        board_selected (dict): All selected boards
+        args (Namespace): Command-line arguments
+        git_dir (str): Path to local .git directory
+
+    Returns:
+        tuple:
+            dict: Boards to build locally
+            dict: Boards to build remotely
+            WorkerPool or None: Pool of remote workers, or None
+    """
+    from buildman import boss  # pylint: disable=C0415
+
+    # Parse machine name filter from --use-machines
+    machine_names = None
+    if args.use_machines:
+        machine_names = [n.strip() for n in args.use_machines.split(',')]
+
+    no_local = args.no_local
+
+    def _fail(msg):
+        """Handle a failure to set up remote builds
+
+        With --no-local, prints the error and returns empty dicts so
+        nothing is built. Otherwise falls back to building everything
+        locally.
+        """
+        if no_local:
+            tprint(msg)
+            return {}, {}, None
+        return board_selected, {}, None
+
+    machines_config = machine.get_machines_config()
+    if not machines_config:
+        return _fail('No machines configured')
+
+    # Probe machines and their toolchains
+    pool = machine.MachinePool(names=machine_names)
+    available = pool.probe_all()
+    if not available:
+        return _fail('No machines available')
+
+    # Check which of the boss's toolchains exist on each remote
+    # machine. This makes workers use the boss's toolchain choices
+    # rather than their own .buildman config.
+    local_tc = toolchain.Toolchains()
+    local_tc.get_settings(show_warning=False)
+    local_tc.scan(verbose=False)
+    local_gcc = {arch: tc.gcc for arch, tc in local_tc.toolchains.items()}
+
+    # Resolve toolchain aliases (e.g. x86->i386) so that board
+    # architectures using alias names are recognised by split_boards()
+    machine.resolve_toolchain_aliases(local_gcc)
+
+    pool.check_toolchains(
+        set(), buildman_path=args.machines_buildman_path,
+        local_gcc=local_gcc)
+    remote_toolchains = {}
+    for mach in available:
+        remote_toolchains.update(mach.toolchains)
+
+    if not remote_toolchains:
+        return _fail('No remote toolchains available')
+
+    if no_local:
+        local = {}
+        remote = board_selected
+    else:
+        local, remote = boss.split_boards(
+            board_selected, remote_toolchains)
+
+    if not remote:
+        return board_selected, {}, None
+
+    # Collect build settings to send to workers. Resolve allow_missing
+    # using the .buildman config, since workers don't have it.
+    settings = _collect_worker_settings(args)
+    settings['allow_missing'] = get_allow_missing(
+        args.allow_missing, args.no_allow_missing,
+        len(board_selected), args.branch)
+
+    # Start workers: init git, push source, start from tree
+    worker_pool = boss.WorkerPool(available)
+    workers = worker_pool.start_all(git_dir, 'HEAD:refs/heads/work',
+                                     debug=args.debug,
+                                     settings=settings)
+    if not workers:
+        return _fail('No remote workers available')
+
+    return local, remote, worker_pool
+
+
+def _start_remote_builds(builder, commits, board_selected, args):
+    """Start remote builds in a background thread
+
+    Splits boards between local and remote machines, launches remote
+    builds in a background thread, and installs a SIGINT handler for
+    clean shutdown.
+
+    Args:
+        builder (Builder): Builder to use
+        commits (list of Commit): Commits to build, or None
+        board_selected (dict): target -> Board for all selected boards
+        args (Namespace): Command-line arguments
+
+    Returns:
+        tuple: (local_boards, remote_thread, worker_pool, extra_count,
+            old_sigint)
+    """
+    local_boards, remote_boards, worker_pool = (
+        _setup_remote_builds(board_selected, args, builder.git_dir))
+
+    extra_count = 0
+    if worker_pool and remote_boards:
+        commit_count = len(commits) if commits else 1
+        extra_count = len(remote_boards) * commit_count
+
+    remote_thread = None
+    if worker_pool and remote_boards:
+        remote_thread = threading.Thread(
+            target=worker_pool.build_boards,
+            args=(remote_boards, commits, builder,
+                  len(local_boards)))
+        remote_thread.daemon = True
+        remote_thread.start()
+
+    # Install a SIGINT handler that cleanly shuts down workers.
+    # This is more reliable than try/except KeyboardInterrupt since
+    # SIGINT may terminate the process before the exception handler
+    # runs.
+    old_sigint = None
+    if worker_pool:
+        def _sigint_handler(_signum, _frame):
+            worker_pool.close_all()
+            signal.signal(signal.SIGINT, old_sigint or signal.SIG_DFL)
+            os.kill(os.getpid(), signal.SIGINT)
+        old_sigint = signal.signal(signal.SIGINT, _sigint_handler)
+
+    return local_boards, remote_thread, worker_pool, extra_count, old_sigint
+
+
+def _finish_remote_builds(remote_thread, worker_pool, old_sigint, builder):
+    """Wait for remote builds to finish and clean up
+
+    Args:
+        remote_thread (Thread or None): Background remote build thread
+        worker_pool (WorkerPool or None): Worker pool to shut down
+        old_sigint: Previous SIGINT handler to restore
+        builder (Builder): Builder for printing the summary
+    """
+    if remote_thread:
+        try:
+            while remote_thread.is_alive():
+                remote_thread.join(timeout=0.5)
+        except KeyboardInterrupt:
+            worker_pool.close_all()
+            raise
+        worker_pool.quit_all()
+        builder.print_summary()
+
+    if worker_pool and old_sigint is not None:
+        signal.signal(signal.SIGINT, old_sigint)
+
+
 def run_builder(builder, commits, board_selected, display_options, args):
     """Run the builder or show the summary
 
     Args:
         builder (Builder): Builder to use
-        commits (list of Commit): List of commits being built, None if no branch
+        commits (list of Commit): List of commits being built, None if
+            no branch
         board_selected (dict): Dict of selected boards:
             key: target name
             value: Board object
@@ -562,8 +796,9 @@  def run_builder(builder, commits, board_selected, display_options, args):
 
     if not args.ide:
         commit_count = count_build_commits(commits, args.step)
-        tprint(get_action_summary(args.summary, commit_count, board_selected,
-                                  args.threads, args.jobs))
+        tprint(get_action_summary(args.summary, commit_count,
+                                  board_selected, args.threads,
+                                  args.jobs, no_local=args.no_local))
 
     builder.set_display_options(
         display_options, args.filter_dtb_warnings,
@@ -573,9 +808,31 @@  def run_builder(builder, commits, board_selected, display_options, args):
         builder.result_handler.show_summary(
             commits, board_selected, args.step)
     else:
-        fail, warned, excs = builder.build_boards(
-            commits, board_selected, args.keep_outputs, args.verbose,
-            args.fragments)
+        local_boards = board_selected
+        remote_thread = None
+        worker_pool = None
+        extra_count = 0
+        old_sigint = None
+
+        if args.distribute:
+            (local_boards, remote_thread, worker_pool,
+             extra_count, old_sigint) = _start_remote_builds(
+                builder, commits, board_selected, args)
+
+        try:
+            fail, warned, excs = builder.build_boards(
+                commits, local_boards, args.keep_outputs,
+                args.verbose, args.fragments,
+                extra_count=extra_count,
+                delay_summary=bool(remote_thread))
+        except KeyboardInterrupt:
+            if worker_pool:
+                worker_pool.close_all()
+            raise
+
+        _finish_remote_builds(remote_thread, worker_pool,
+                              old_sigint, builder)
+
         if args.build_summary:
             builder.commits = commits
             builder.result_handler.show_summary(
@@ -762,7 +1019,17 @@  def do_buildman(args, toolchains=None, make_func=None, brds=None,
     # Handle --worker: run in worker mode for distributed builds
     if args.worker:
         from buildman import worker  # pylint: disable=C0415
-        return worker.do_worker()
+        return worker.do_worker(args.debug)
+
+    # Handle --kill-workers: kill stale workers and exit
+    if args.kill_workers:
+        from buildman import boss  # pylint: disable=C0415
+
+        machines_config = machine.get_machines_config()
+        if not machines_config:
+            print('No machines configured')
+            return 1
+        return boss.kill_workers(machines_config)
 
     # Handle --machines: probe remote machines and show status
     if args.machines or args.machines_fetch_arch:
@@ -770,6 +1037,14 @@  def do_buildman(args, toolchains=None, make_func=None, brds=None,
             col, fetch=args.machines_fetch_arch,
             buildman_path=args.machines_buildman_path)
 
+    # --use-machines implies --dist
+    if args.use_machines:
+        args.distribute = True
+
+    if args.no_local and not args.distribute:
+        print('--no-local requires --dist')
+        return 1
+
     git_dir = os.path.join(args.git, '.git')
 
     toolchains = get_toolchains(toolchains, col, args.override_toolchain,