ci/compare: nix stats comparison

Displays stats table in the step-summary if there are no added/removed packages
2025-06-10 19:55:41 +03:00 · 2025-04-01 10:27:06 +02:00 · 2025-04-01 10:27:06 +02:00 · 0d584f7c8f
commit 0d584f7c8f
parent 0d6b61cf5f
3 changed files with 201 additions and 4 deletions
--- a/ci/eval/compare/cmp-stats.py
+++ b/ci/eval/compare/cmp-stats.py
@ -0,0 +1,153 @@
+import json
+import os
+from scipy.stats import ttest_rel
+import pandas as pd
+import numpy as np
+from pathlib import Path
+
+# Define metrics of interest (can be expanded as needed)
+METRIC_PREFIXES = ("nr", "gc")
+
+def flatten_data(json_data: dict) -> dict:
+    """
+    Extracts and flattens metrics from JSON data.
+    This is needed because the JSON data can be nested.
+    For example, the JSON data entry might look like this:
+
+    "gc":{"cycles":13,"heapSize":5404549120,"totalBytes":9545876464}
+
+    Flattened:
+
+    "gc.cycles": 13
+    "gc.heapSize": 5404549120
+    ...
+
+    Args:
+        json_data (dict): JSON data containing metrics.
+    Returns:
+        dict: Flattened metrics with keys as metric names.
+    """
+    flat_metrics = {}
+    for k, v in json_data.items():
+        if isinstance(v, (int, float)):
+            flat_metrics[k] = v
+        elif isinstance(v, dict):
+            for sub_k, sub_v in v.items():
+                flat_metrics[f"{k}.{sub_k}"] = sub_v
+    return flat_metrics
+
+
+
+
+def load_all_metrics(directory: Path) -> dict:
+    """
+    Loads all stats JSON files in the specified directory and extracts metrics.
+
+    Args:
+        directory (Path): Directory containing JSON files.
+    Returns:
+        dict: Dictionary with filenames as keys and extracted metrics as values.
+    """
+    metrics = {}
+    for system_dir in directory.iterdir():
+        assert system_dir.is_dir()
+
+        for chunk_output in system_dir.iterdir():
+                with chunk_output.open() as f:
+                    data = json.load(f)
+                metrics[f"{system_dir.name}/${chunk_output.name}"] = flatten_data(data)
+
+    return metrics
+
+def dataframe_to_markdown(df: pd.DataFrame) -> str:
+    markdown_lines = []
+
+    # Header (get column names and format them)
+    header = '\n| ' + ' | '.join(df.columns) + ' |'
+    markdown_lines.append(header)
+    markdown_lines.append("| - " * (len(df.columns)) + "|")  # Separator line
+
+    # Iterate over rows to build Markdown rows
+    for _, row in df.iterrows():
+        # TODO: define threshold for highlighting
+        highlight = False
+
+        fmt = lambda x: f"**{x}**" if highlight else f"{x}"
+
+        # Check for no change and NaN in p_value/t_stat
+        row_values = []
+        for val in row:
+            if isinstance(val, float) and np.isnan(val):  # For NaN values in p-value or t-stat
+                row_values.append("-")  # Custom symbol for NaN
+            elif isinstance(val, float) and val == 0:  # For no change (mean_diff == 0)
+                row_values.append("-")  # Custom symbol for no change
+            else:
+                row_values.append(fmt(f"{val:.4f}" if isinstance(val, float) else str(val)))
+
+        markdown_lines.append('| ' + ' | '.join(row_values) + ' |')
+
+    return '\n'.join(markdown_lines)
+
+
+def perform_pairwise_tests(before_metrics: dict, after_metrics: dict) -> pd.DataFrame:
+    common_files = sorted(set(before_metrics) & set(after_metrics))
+    all_keys = sorted({ metric_keys for file_metrics in before_metrics.values() for metric_keys in file_metrics.keys() })
+
+    results = []
+
+    for key in all_keys:
+        before_vals, after_vals = [], []
+
+        for fname in common_files:
+            if key in before_metrics[fname] and key in after_metrics[fname]:
+                before_vals.append(before_metrics[fname][key])
+                after_vals.append(after_metrics[fname][key])
+
+        if len(before_vals) >= 2:
+            before_arr = np.array(before_vals)
+            after_arr = np.array(after_vals)
+
+            diff = after_arr - before_arr
+            pct_change = 100 * diff / before_arr
+            t_stat, p_val = ttest_rel(after_arr, before_arr)
+
+            results.append({
+                "metric": key,
+                "mean_before": np.mean(before_arr),
+                "mean_after": np.mean(after_arr),
+                "mean_diff": np.mean(diff),
+                "mean_%_change": np.mean(pct_change),
+                "p_value": p_val,
+                "t_stat": t_stat
+            })
+
+    df = pd.DataFrame(results).sort_values("p_value")
+    return df
+
+
+if __name__ == "__main__":
+    before_dir = os.environ.get("BEFORE_DIR")
+    after_dir = os.environ.get("AFTER_DIR")
+
+    if not before_dir or not after_dir:
+        print("Error: Environment variables 'BEFORE_DIR' and 'AFTER_DIR' must be set.")
+        exit(1)
+
+    before_stats = Path(before_dir) / "stats"
+    after_stats = Path(after_dir) / "stats"
+
+    # This may happen if the pull request target does not include PR#399720 yet.
+    if not before_stats.exists():
+        print("⚠️  Skipping comparison: stats directory is missing in the target commit.")
+        exit(0)
+
+    # This should never happen, but we're exiting gracefully anyways
+    if not after_stats.exists():
+        print("⚠️  Skipping comparison: stats directory missing in current PR evaluation.")
+        exit(0)
+
+    before_metrics = load_all_metrics(before_stats)
+    after_metrics = load_all_metrics(after_stats)
+    df1 = perform_pairwise_tests(before_metrics, after_metrics)
+    markdown_table = dataframe_to_markdown(df1)
+    print(markdown_table)