diff mbox series

[RFC] gitlab: add a new build_unit job to track build size

Message ID 20250228175441.674384-1-alex.bennee@linaro.org
State New
Headers show
Series [RFC] gitlab: add a new build_unit job to track build size | expand

Commit Message

Alex Bennée Feb. 28, 2025, 5:54 p.m. UTC
We want to reduce the total number of build units in the system to get
on our way to a single binary. It will help to have some numbers so
let's add a job to gitlab to track our progress.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Cc: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Cc: Philippe Mathieu-Daudé <philmd@linaro.org>
Cc: Richard Henderson <richard.henderson@linaro.org>
---
 .gitlab-ci.d/check-units.py    | 95 ++++++++++++++++++++++++++++++++++
 .gitlab-ci.d/static_checks.yml | 22 ++++++++
 2 files changed, 117 insertions(+)
 create mode 100755 .gitlab-ci.d/check-units.py

Comments

Pierrick Bouvier Feb. 28, 2025, 7:05 p.m. UTC | #1
Hi Alex,

On 2/28/25 09:54, Alex Bennée wrote:
> We want to reduce the total number of build units in the system to get
> on our way to a single binary. It will help to have some numbers so
> lets add a job to gitlab to track our progress.
> 

That's a good idea!

> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Cc: Pierrick Bouvier <pierrick.bouvier@linaro.org>
> Cc: Philippe Mathieu-Daudé <philmd@linaro.org>
> Cc: Richard Henderson <richard.henderson@linaro.org>
> ---
>   .gitlab-ci.d/check-units.py    | 95 ++++++++++++++++++++++++++++++++++
>   .gitlab-ci.d/static_checks.yml | 22 ++++++++
>   2 files changed, 117 insertions(+)
>   create mode 100755 .gitlab-ci.d/check-units.py
> 
> diff --git a/.gitlab-ci.d/check-units.py b/.gitlab-ci.d/check-units.py
> new file mode 100755
> index 0000000000..aca63bd481
> --- /dev/null
> +++ b/.gitlab-ci.d/check-units.py
> @@ -0,0 +1,95 @@
> +#!/usr/bin/env python3
> +#
> +# check-units.py: check the number of compilation units and identify
> +#                 those that are rebuilt multiple times
> +#
> +# Copyright (C) 2025 Linaro Ltd.
> +#
> +# SPDX-License-Identifier: GPL-2.0-or-later
> +
> +from os import access, R_OK, path
> +from subprocess import check_output, CalledProcessError
> +from sys import argv, exit
> +import re
> +
> +
> +def extract_build_units(cc_path):
> +    """
> +    Extract the build units and their counds from compile_commands.json file.
> +
> +    Returns:
> +        Hash table of ["unit"] = count
> +    """
> +
> +    # Make jq/shell do the heavy lifting
> +    cmd = f"jq < {cc_path} '.[] | .file' | sort | uniq -c | sort -rn"
> +

If we choose to have a dedicated python script, maybe we can simply:

import json
from collections import Counter
j = json.load(open('build/compile_commands.json', 'r'))
files = [f['file'] for f in j]
occurences = Counter(files)

It's just a suggestion, and the script is fine as it is as well.

> +    try:
> +        # Execute the shell command and capture the output
> +        result = check_output(cmd, shell=True)
> +    except CalledProcessError as exp:
> +        print(f"Error executing {cmd}: {exp}")
> +        exit(1)
> +
> +    lines = result.decode().strip().split('\n')
> +
> +    # Create a dictionary to store the build unit frequencies
> +    build_units = {}
> +
> +    # extract from string of form: ' 65 "../../fpu/softfloat.c"'
> +    ext_pat = re.compile(r'^\s*(\d+)\s+"([^"]+)"')
> +
> +    # strip leading ../
> +    norm_pat = re.compile(r'^((\.\./)+|/+)')
> +
> +    # Process each line of the output
> +    for line in lines:
> +        match = re.match(ext_pat, line)
> +        if match:
> +            count = int(match.group(1))
> +            unit_path = re.sub(norm_pat, '', match.group(2))
> +
> +            # Store the count in the dictionary
> +            build_units[unit_path] = count
> +        else:
> +            print(f"couldn't process {line}")
> +
> +    return build_units
> +
> +
> +def analyse_units(build_units):
> +    """
> +    Analyse the build units and report stats and the top 10 rebuilds
> +    """
> +
> +    print(f"Total source files: {len(build_units.keys())}")
> +    print(f"Total build units: {sum(units.values())}")
> +
> +    # Create a sorted list by number of rebuilds
> +    sorted_build_units = sorted(build_units.items(),
> +                                key=lambda item: item[1],
> +                                reverse=True)
> +
> +    print("Most rebuilt units:")
> +    for unit, count in sorted_build_units[:10]:
> +        print(f"  {unit} built {count} times")
> +
> +    print("Least rebuilt units:")
> +    for unit, count in sorted_build_units[-10:]:
> +        print(f"  {unit} built {count} times")
> +
> +
> +if __name__ == "__main__":
> +    if len(argv) != 2:
> +        script_name = path.basename(argv[0])
> +        print(f"Usage: {script_name} <path_to_compile_commands.json>")
> +        exit(1)
> +
> +    cc_path = argv[1]
> +    if path.isfile(cc_path) and access(cc_path, R_OK):
> +        units = extract_build_units(cc_path)
> +        analyse_units(units)
> +        exit(0)
> +    else:
> +        print(f"{cc_path} doesn't exist or isn't readable")
> +        exit(1)
> diff --git a/.gitlab-ci.d/static_checks.yml b/.gitlab-ci.d/static_checks.yml
> index c0ba453382..c3ed6de453 100644
> --- a/.gitlab-ci.d/static_checks.yml
> +++ b/.gitlab-ci.d/static_checks.yml
> @@ -70,3 +70,25 @@ check-rust-tools-nightly:
>       expire_in: 2 days
>       paths:
>         - rust/target/doc
> +
> +check-build-units:
> +  extends: .base_job_template
> +  stage: build
> +  image: $CI_REGISTRY_IMAGE/qemu/debian:$QEMU_CI_CONTAINER_TAG
> +  needs:
> +    job: amd64-debian-container
> +  before_script:
> +    - source scripts/ci/gitlab-ci-section
> +    - section_start setup "Install Tools"
> +    - apt install --assume-yes --no-install-recommends jq
> +    - section_end setup
> +  script:
> +    - mkdir build
> +    - cd build
> +    - section_start configure "Running configure"
> +    - ../configure
> +    - cd ..
> +    - section_end configure
> +    - section_start analyse "Analyse"
> +    - .gitlab-ci.d/check-units.py build/compile_commands.json
> +    - section_end analyse

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
diff mbox series

Patch

diff --git a/.gitlab-ci.d/check-units.py b/.gitlab-ci.d/check-units.py
new file mode 100755
index 0000000000..aca63bd481
--- /dev/null
+++ b/.gitlab-ci.d/check-units.py
@@ -0,0 +1,95 @@ 
+#!/usr/bin/env python3
+#
+# check-units.py: check the number of compilation units and identify
+#                 those that are rebuilt multiple times
+#
+# Copyright (C) 2025 Linaro Ltd.
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+from os import access, R_OK, path
+from subprocess import check_output, CalledProcessError
+from sys import argv, exit
+import re
+
+
+def extract_build_units(cc_path):
+    """
+    Extract the build units and their counts from compile_commands.json file.
+
+    Returns:
+        Hash table of ["unit"] = count
+    """
+    from shlex import quote
+
+    # Make jq/shell do the heavy lifting; quote the path for the shell
+    cmd = f"jq < {quote(cc_path)} '.[] | .file' | sort | uniq -c | sort -rn"
+
+    try:
+        # Execute the shell command and capture the output
+        result = check_output(cmd, shell=True)
+    except CalledProcessError as exp:
+        print(f"Error executing {cmd}: {exp}")
+        exit(1)
+
+    # splitlines() yields no lines (rather than ['']) for empty output
+    lines = result.decode().splitlines()
+
+    # Create a dictionary to store the build unit frequencies
+    build_units = {}
+
+    # extract from string of form: ' 65 "../../fpu/softfloat.c"'
+    ext_pat = re.compile(r'^\s*(\d+)\s+"([^"]+)"')
+
+    # strip leading ../
+    norm_pat = re.compile(r'^((\.\./)+|/+)')
+
+    for line in lines:
+        match = ext_pat.match(line)
+        if match:
+            count = int(match.group(1))
+            unit_path = norm_pat.sub('', match.group(2))
+            # Paths that normalise to the same unit add up, not overwrite
+            build_units[unit_path] = build_units.get(unit_path, 0) + count
+        else:
+            print(f"couldn't process {line}")
+
+    return build_units
+
+
+def analyse_units(build_units):
+    """
+    Print totals and the 10 most / 10 least rebuilt units of build_units
+    """
+
+    print(f"Total source files: {len(build_units)}")
+    print(f"Total build units: {sum(build_units.values())}")
+
+    # Create a sorted list by number of rebuilds
+    sorted_build_units = sorted(build_units.items(),
+                                key=lambda item: item[1],
+                                reverse=True)
+
+    print("Most rebuilt units:")
+    for unit, count in sorted_build_units[:10]:
+        print(f"  {unit} built {count} times")
+
+    print("Least rebuilt units:")
+    for unit, count in sorted_build_units[-10:]:
+        print(f"  {unit} built {count} times")
+
+
+if __name__ == "__main__":
+    if len(argv) != 2:
+        prog = path.basename(argv[0])
+        print(f"Usage: {prog} <path_to_compile_commands.json>")
+        exit(1)
+
+    cc_path = argv[1]
+    if not (path.isfile(cc_path) and access(cc_path, R_OK)):
+        print(f"{cc_path} doesn't exist or isn't readable")
+        exit(1)
+
+    units = extract_build_units(cc_path)
+    analyse_units(units)
+    exit(0)
diff --git a/.gitlab-ci.d/static_checks.yml b/.gitlab-ci.d/static_checks.yml
index c0ba453382..c3ed6de453 100644
--- a/.gitlab-ci.d/static_checks.yml
+++ b/.gitlab-ci.d/static_checks.yml
@@ -70,3 +70,25 @@  check-rust-tools-nightly:
     expire_in: 2 days
     paths:
       - rust/target/doc
+
+check-build-units:
+  extends: .base_job_template
+  stage: build
+  image: $CI_REGISTRY_IMAGE/qemu/debian:$QEMU_CI_CONTAINER_TAG
+  needs:
+    job: amd64-debian-container
+  before_script:
+    - source scripts/ci/gitlab-ci-section
+    - section_start setup "Install Tools"
+    - apt install --assume-yes --no-install-recommends jq
+    - section_end setup
+  script:
+    - mkdir build
+    - cd build
+    - section_start configure "Running configure"
+    - ../configure
+    - cd ..
+    - section_end configure
+    - section_start analyse "Analyse"
+    - .gitlab-ci.d/check-units.py build/compile_commands.json
+    - section_end analyse