# nixpkgs/ci/eval/compare/cmp-stats.py

import json
import os
from scipy.stats import ttest_rel
import pandas as pd
import numpy as np
from pathlib import Path

# Define metrics of interest (can be expanded as needed).
# NOTE(review): no function visible in this file reads METRIC_PREFIXES —
# confirm whether prefix-based filtering is still intended or the constant is stale.
METRIC_PREFIXES = ("nr", "gc")

def flatten_data(json_data: dict) -> dict:
    """
    Extracts and flattens numeric metrics from JSON data.
    This is needed because the JSON data can be nested.
    For example, the JSON data entry might look like this:

    "gc":{"cycles":13,"heapSize":5404549120,"totalBytes":9545876464}

    Flattened:

    "gc.cycles": 13
    "gc.heapSize": 5404549120
    ...

    Only one level of nesting is flattened. Values that are not numbers
    (at either level) are dropped, so every value in the result can be
    used directly in arithmetic.

    Args:
        json_data (dict): JSON data containing metrics.
    Returns:
        dict: Flattened metrics with keys as metric names.
    """
    flat_metrics = {}
    for key, value in json_data.items():
        if isinstance(value, (int, float)):
            flat_metrics[key] = value
        elif isinstance(value, dict):
            for sub_key, sub_value in value.items():
                # Apply the same numeric filter as the top level; otherwise
                # strings or deeper dicts would leak into the metrics.
                if isinstance(sub_value, (int, float)):
                    flat_metrics[f"{key}.{sub_key}"] = sub_value
    return flat_metrics


def load_all_metrics(directory: Path) -> dict:
    """
    Loads all stats JSON files in the specified directory and extracts metrics.

    The layout read is ``<directory>/<sub-directory>/<file>``: every entry of
    ``directory`` must itself be a directory, and every entry inside it is
    parsed as a JSON file and passed through ``flatten_data``.

    Args:
        directory (Path): Directory containing one sub-directory per group of
            JSON files.
    Returns:
        dict: Dictionary with "<sub-directory name>/<file name>" as keys and
            the flattened metrics of that file as values.
    Raises:
        NotADirectoryError: If an entry of ``directory`` is not a directory.
    """
    metrics = {}
    for system_dir in directory.iterdir():
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not system_dir.is_dir():
            raise NotADirectoryError(f"Expected a directory of stats files, got: {system_dir}")

        for chunk_output in system_dir.iterdir():
            with chunk_output.open(encoding="utf-8") as f:
                data = json.load(f)
            # Key is "<dir>/<file>"; the former "${...}" left a literal "$" in it.
            metrics[f"{system_dir.name}/{chunk_output.name}"] = flatten_data(data)

    return metrics

def dataframe_to_markdown(df: pd.DataFrame) -> str:
    """
    Renders a DataFrame as a Markdown table.

    The output starts with a newline, followed by the header row, a separator
    row and one row per DataFrame row. Float cells are printed with four
    decimals; float cells that are NaN or exactly zero are shown as "-".
    All other cells are converted with ``str``.

    Args:
        df (pd.DataFrame): Table to render; column names must be strings.
    Returns:
        str: The Markdown table.
    """

    def render_cell(cell, emphasize: bool) -> str:
        # NaN (e.g. undefined p-value/t-stat) and zero (no change) share one placeholder.
        if isinstance(cell, float):
            if np.isnan(cell) or cell == 0:
                return "-"
            text = f"{cell:.4f}"
        else:
            text = str(cell)
        return f"**{text}**" if emphasize else text

    lines = [
        "\n| " + " | ".join(df.columns) + " |",  # header row
        "| - " * len(df.columns) + "|",  # separator row
    ]

    for _, record in df.iterrows():
        # TODO: define threshold for highlighting
        emphasize = False
        cells = [render_cell(cell, emphasize) for cell in record]
        lines.append("| " + " | ".join(cells) + " |")

    return "\n".join(lines)


def perform_pairwise_tests(before_metrics: dict, after_metrics: dict) -> pd.DataFrame:
    """
    Compares each metric before and after with a paired t-test.

    Samples are paired by file: only files present in both inputs are used,
    and for a given metric only files where both sides report that metric.
    Metrics with fewer than two paired samples are skipped. Metric names are
    taken from ``before_metrics``.

    Args:
        before_metrics (dict): Mapping of file name to flattened metrics
            (metric name -> number) for the baseline.
        after_metrics (dict): Same mapping for the changed state.
    Returns:
        pd.DataFrame: One row per tested metric with the columns "metric",
            "mean_before", "mean_after", "mean_diff", "mean_%_change",
            "p_value" and "t_stat", sorted by ascending "p_value". The frame
            is empty (but has these columns) when no metric could be tested.
    """
    columns = [
        "metric",
        "mean_before",
        "mean_after",
        "mean_diff",
        "mean_%_change",
        "p_value",
        "t_stat",
    ]

    common_files = sorted(set(before_metrics) & set(after_metrics))
    all_keys = sorted({key for file_metrics in before_metrics.values() for key in file_metrics})

    results = []

    for key in all_keys:
        pairs = [
            (before_metrics[fname][key], after_metrics[fname][key])
            for fname in common_files
            if key in before_metrics[fname] and key in after_metrics[fname]
        ]

        # A paired t-test needs at least two pairs.
        if len(pairs) < 2:
            continue

        before_arr = np.array([before for before, _ in pairs], dtype=float)
        after_arr = np.array([after for _, after in pairs], dtype=float)

        diff = after_arr - before_arr

        # Percent change is undefined for a zero baseline; average only over
        # the pairs where it is defined instead of dividing by zero.
        nonzero = before_arr != 0
        if nonzero.any():
            mean_pct_change = np.mean(100 * diff[nonzero] / before_arr[nonzero])
        else:
            mean_pct_change = np.nan

        t_stat, p_val = ttest_rel(after_arr, before_arr)

        results.append({
            "metric": key,
            "mean_before": np.mean(before_arr),
            "mean_after": np.mean(after_arr),
            "mean_diff": np.mean(diff),
            "mean_%_change": mean_pct_change,
            "p_value": p_val,
            "t_stat": t_stat,
        })

    # Passing the columns explicitly keeps "p_value" present even when
    # `results` is empty, so sort_values cannot raise KeyError.
    return pd.DataFrame(results, columns=columns).sort_values("p_value")


if __name__ == "__main__":
    # Script entry point: reads the two result directories from the
    # environment, compares the stats found under "<dir>/stats" and prints
    # the comparison as a Markdown table on stdout.
    before_dir = os.environ.get("BEFORE_DIR")
    after_dir = os.environ.get("AFTER_DIR")

    if not before_dir or not after_dir:
        # SystemExit with a string prints it to stderr and exits with status 1,
        # keeping the error out of stdout (which carries the Markdown table)
        # and avoiding the `exit` helper that only exists when `site` is loaded.
        raise SystemExit("Error: Environment variables 'BEFORE_DIR' and 'AFTER_DIR' must be set.")

    before_metrics = load_all_metrics(Path(before_dir) / "stats")
    after_metrics = load_all_metrics(Path(after_dir) / "stats")

    df1 = perform_pairwise_tests(before_metrics, after_metrics)
    markdown_table = dataframe_to_markdown(df1)
    print(markdown_table)
ci/compare: nix stats comparison Displays stats table in the step-summary if there are no added/removed packages 2025-04-01 10:27:06 +02:00			`import json`
			`import os`
			`from scipy.stats import ttest_rel`
			`import pandas as pd`
			`import numpy as np`
			`from pathlib import Path`

			`# Define metrics of interest (can be expanded as needed)`
			`METRIC_PREFIXES = ("nr", "gc")`

			`def flatten_data(json_data: dict) -> dict:`
			`"""`
			`Extracts and flattens metrics from JSON data.`
			`This is needed because the JSON data can be nested.`
			`For example, the JSON data entry might look like this:`

			`"gc":{"cycles":13,"heapSize":5404549120,"totalBytes":9545876464}`

			`Flattened:`

			`"gc.cycles": 13`
			`"gc.heapSize": 5404549120`
			`...`

			`Args:`
			`json_data (dict): JSON data containing metrics.`
			`Returns:`
			`dict: Flattened metrics with keys as metric names.`
			`"""`
			`flat_metrics = {}`
			`for k, v in json_data.items():`
			`if isinstance(v, (int, float)):`
			`flat_metrics[k] = v`
			`elif isinstance(v, dict):`
			`for sub_k, sub_v in v.items():`
			`flat_metrics[f"{k}.{sub_k}"] = sub_v`
			`return flat_metrics`




			`def load_all_metrics(directory: Path) -> dict:`
			`"""`
			`Loads all stats JSON files in the specified directory and extracts metrics.`

			`Args:`
			`directory (Path): Directory containing JSON files.`
			`Returns:`
			`dict: Dictionary with filenames as keys and extracted metrics as values.`
			`"""`
			`metrics = {}`
			`for system_dir in directory.iterdir():`
			`assert system_dir.is_dir()`

			`for chunk_output in system_dir.iterdir():`
			`with chunk_output.open() as f:`
			`data = json.load(f)`
			`metrics[f"{system_dir.name}/${chunk_output.name}"] = flatten_data(data)`

			`return metrics`

			`def dataframe_to_markdown(df: pd.DataFrame) -> str:`
			`markdown_lines = []`

			`# Header (get column names and format them)`
			`header = '\n\| ' + ' \| '.join(df.columns) + ' \|'`
			`markdown_lines.append(header)`
			`markdown_lines.append("\| - " * (len(df.columns)) + "\|") # Separator line`

			`# Iterate over rows to build Markdown rows`
			`for _, row in df.iterrows():`
			`# TODO: define threshold for highlighting`
			`highlight = False`

			`fmt = lambda x: f"{x}" if highlight else f"{x}"`

			`# Check for no change and NaN in p_value/t_stat`
			`row_values = []`
			`for val in row:`
			`if isinstance(val, float) and np.isnan(val): # For NaN values in p-value or t-stat`
			`row_values.append("-") # Custom symbol for NaN`
			`elif isinstance(val, float) and val == 0: # For no change (mean_diff == 0)`
			`row_values.append("-") # Custom symbol for no change`
			`else:`
			`row_values.append(fmt(f"{val:.4f}" if isinstance(val, float) else str(val)))`

			`markdown_lines.append('\| ' + ' \| '.join(row_values) + ' \|')`

			`return '\n'.join(markdown_lines)`


			`def perform_pairwise_tests(before_metrics: dict, after_metrics: dict) -> pd.DataFrame:`
			`common_files = sorted(set(before_metrics) & set(after_metrics))`
			`all_keys = sorted({ metric_keys for file_metrics in before_metrics.values() for metric_keys in file_metrics.keys() })`

			`results = []`

			`for key in all_keys:`
			`before_vals, after_vals = [], []`

			`for fname in common_files:`
			`if key in before_metrics[fname] and key in after_metrics[fname]:`
			`before_vals.append(before_metrics[fname][key])`
			`after_vals.append(after_metrics[fname][key])`

			`if len(before_vals) >= 2:`
			`before_arr = np.array(before_vals)`
			`after_arr = np.array(after_vals)`

			`diff = after_arr - before_arr`
			`pct_change = 100 * diff / before_arr`
			`t_stat, p_val = ttest_rel(after_arr, before_arr)`

			`results.append({`
			`"metric": key,`
			`"mean_before": np.mean(before_arr),`
			`"mean_after": np.mean(after_arr),`
			`"mean_diff": np.mean(diff),`
			`"mean_%_change": np.mean(pct_change),`
			`"p_value": p_val,`
			`"t_stat": t_stat`
			`})`

			`df = pd.DataFrame(results).sort_values("p_value")`
			`return df`


			`if __name__ == "__main__":`
			`before_dir = os.environ.get("BEFORE_DIR")`
			`after_dir = os.environ.get("AFTER_DIR")`

			`if not before_dir or not after_dir:`
			`print("Error: Environment variables 'BEFORE_DIR' and 'AFTER_DIR' must be set.")`
			`exit(1)`

			`before_metrics = load_all_metrics(Path(before_dir) / "stats")`
			`after_metrics = load_all_metrics(Path(after_dir) / "stats")`

			`df1 = perform_pairwise_tests(before_metrics, after_metrics)`
			`markdown_table = dataframe_to_markdown(df1)`
			`print(markdown_table)`