1082 callgrind_path: Path,
1083) -> tuple[list[str], dict[str, int], dict[str, dict[str, int]]]:
1084 event_names: list[str] = []
1085 totals: dict[str, int] = {}
1086 functions: dict[str, dict[str, int]] = {}
1088 ob_table: dict[str, str] = {}
1089 fl_table: dict[str, str] = {}
1090 fn_table: dict[str, str] = {}
1092 lines = callgrind_path.read_text(errors=
"ignore").splitlines()
1093 for raw_line
in lines:
1094 stripped = raw_line.strip()
1095 if stripped.startswith(
"ob="):
1097 elif stripped.startswith(
"fl="):
1099 elif stripped.startswith(
"fn="):
1103 current_file =
"???"
1104 current_function: str |
None =
None
1106 for raw_line
in lines:
1107 line = raw_line.strip()
1111 if line.startswith(
"events:"):
1112 event_names = line.split(
":", 1)[1].strip().split()
1115 if line.startswith(
"summary:"):
1118 int(value)
for value
in line.split(
":", 1)[1].strip().split()
1120 padded_summary = summary_values + [0] * max(
1121 0, len(event_names) - len(summary_values)
1124 event_name: padded_summary[idx]
1125 for idx, event_name
in enumerate(event_names)
1131 if line.startswith(
"ob="):
1135 if line.startswith(
"fl="):
1139 if line.startswith(
"fn="):
1143 if current_function
is None:
1150 if event_costs
is None:
1153 key = current_function
1154 function_events = functions.setdefault(
1155 key, {event_name: 0
for event_name
in event_names}
1157 for event_name, value
in event_costs.items():
1158 function_events[event_name] = function_events.get(event_name, 0) + value
1160 return event_names, totals, functions
1546 perf_a: dict |
None =
None
1547 perf_b: dict |
None =
None
1550 print(f
"Results directory: {result_dir}", flush=
True)
1552 effective_configure_args = list(args.configure_arg)
1553 imported_seed_cache_vars: dict[str, dict[str, str]] = {}
1554 if args.repo
and args.seed_cache:
1556 args.seed_cache, args.seed_cache_var
1558 effective_configure_args = [*seeded_args, *effective_configure_args]
1559 if imported_seed_cache_vars:
1561 "Seeded configure cache vars: "
1562 +
", ".join(imported_seed_cache_vars.keys()),
1567 print(
"Phase 0: prepare source builds from refs", flush=
True)
1573 result_dir=result_dir,
1574 exe_relpath=args.exe_relpath,
1575 workdir_relpath=args.workdir_relpath,
1576 configure_args=effective_configure_args,
1577 build_args=args.build_arg,
1578 build_target=args.build_target,
1579 build_jobs=args.build_jobs,
1586 result_dir=result_dir,
1587 exe_relpath=args.exe_relpath,
1588 workdir_relpath=args.workdir_relpath,
1589 configure_args=effective_configure_args,
1590 build_args=args.build_arg,
1591 build_target=args.build_target,
1592 build_jobs=args.build_jobs,
1598 build_dir=args.build_dir_a,
1600 exe_relpath=args.exe_relpath,
1601 commit=args.commit_a,
1602 source_dir=args.source_dir_a,
1607 build_dir=args.build_dir_b,
1609 exe_relpath=args.exe_relpath,
1610 commit=args.commit_b,
1611 source_dir=args.source_dir_b,
1615 "created_utc": datetime.now(timezone.utc).isoformat(),
1616 "workdir": args.workdir,
1617 "workdir_relpath": args.workdir_relpath,
1618 "cmd_args": args.cmd_args,
1619 "skip_timing": args.skip_timing,
1620 "skip_callgrind": args.skip_callgrind,
1621 "petsc_events": args.petsc_event,
1622 "callgrind_args": DEFAULT_CALLGRIND_ARGS + args.callgrind_arg,
1623 "perf_enabled": args.perf_stat,
1624 "perf_bin": args.perf_bin,
1625 "perf_runs": args.perf_runs,
1626 "perf_events":
ordered_unique([*DEFAULT_PERF_EVENTS, *args.perf_event]),
1628 "base_cpi": args.sim_base_cpi,
1629 "l1_miss_penalty": args.sim_l1_miss_penalty,
1630 "ll_miss_penalty": args.sim_ll_miss_penalty,
1631 "branch_miss_penalty": args.sim_branch_miss_penalty,
1634 "ref_a": args.ref_a,
1635 "ref_b": args.ref_b,
1636 "configure_args": effective_configure_args,
1637 "build_args": args.build_arg,
1638 "build_target": args.build_target,
1639 "build_jobs": args.build_jobs,
1640 "seed_cache": args.seed_cache,
1641 "seed_cache_vars": imported_seed_cache_vars,
1642 "cases": [asdict(case_a), asdict(case_b)],
1644 (result_dir /
"metadata.json").write_text(json.dumps(metadata, indent=2))
1645 (result_dir /
"command.txt").write_text(
1646 " ".join(shlex.quote(part)
for part
in args.cmd_args) +
"\n"
1649 summary: dict[str, object] = {
1650 "result_dir": str(result_dir),
1651 "cases": [asdict(case_a), asdict(case_b)],
1652 "skip_timing": args.skip_timing,
1653 "skip_callgrind": args.skip_callgrind,
1659 "base_cpi": args.sim_base_cpi,
1660 "l1_miss_penalty": args.sim_l1_miss_penalty,
1661 "ll_miss_penalty": args.sim_ll_miss_penalty,
1662 "branch_miss_penalty": args.sim_branch_miss_penalty,
1666 if not args.skip_timing:
1667 print(
"Phase 1: wall timing (no profiling, no -log_view)", flush=
True)
1670 app_args=args.cmd_args,
1671 workdir=args.workdir,
1672 results_dir=result_dir,
1673 timing_runs=args.timing_runs,
1677 app_args=args.cmd_args,
1678 workdir=args.workdir,
1679 results_dir=result_dir,
1680 timing_runs=args.timing_runs,
1682 summary[
"wall_timing"] = {case_a.label: timing_a, case_b.label: timing_b}
1684 write_timing_csv(result_dir / f
"{case_a.label}_wall.csv", case_a.label, timing_a, [])
1685 write_timing_csv(result_dir / f
"{case_b.label}_wall.csv", case_b.label, timing_b, [])
1688 f
"{case_a.label}: mean wall {timing_a['mean_wall_seconds']:.6f} s",
1692 f
"{case_b.label}: mean wall {timing_b['mean_wall_seconds']:.6f} s",
1695 delta_wall = timing_b[
"mean_wall_seconds"] - timing_a[
"mean_wall_seconds"]
1697 100.0 * delta_wall / timing_a[
"mean_wall_seconds"]
1698 if timing_a[
"mean_wall_seconds"]
1702 f
"Wall-time delta ({case_b.label} - {case_a.label}): "
1703 f
"{delta_wall:+.6f} s ({format_delta_percent(delta_wall_pct)}%)",
1706 if args.petsc_event:
1707 print(
"Phase 2: PETSc event timing (-log_view)", flush=
True)
1710 app_args=args.cmd_args,
1711 workdir=args.workdir,
1712 results_dir=result_dir,
1713 timing_runs=args.timing_runs,
1714 petsc_events=args.petsc_event,
1718 app_args=args.cmd_args,
1719 workdir=args.workdir,
1720 results_dir=result_dir,
1721 timing_runs=args.timing_runs,
1722 petsc_events=args.petsc_event,
1724 summary[
"petsc_timing"] = {case_a.label: petsc_a, case_b.label: petsc_b}
1727 result_dir / f
"{case_a.label}_petsc.csv",
1733 result_dir / f
"{case_b.label}_petsc.csv",
1740 f
"{case_a.label}: mean PETSc event-sum wall {petsc_a['mean_wall_seconds']:.6f} s",
1744 f
"{case_b.label}: mean PETSc event-sum wall {petsc_b['mean_wall_seconds']:.6f} s",
1748 f
"{case_a.label}: mean PETSc event sum {petsc_a['mean_petsc_event_sum']:.6f} s",
1752 f
"{case_b.label}: mean PETSc event sum {petsc_b['mean_petsc_event_sum']:.6f} s",
1755 delta_petsc = petsc_b[
"mean_petsc_event_sum"] - petsc_a[
"mean_petsc_event_sum"]
1757 100.0 * delta_petsc / petsc_a[
"mean_petsc_event_sum"]
1758 if petsc_a[
"mean_petsc_event_sum"]
1762 f
"PETSc event-sum delta ({case_b.label} - {case_a.label}): "
1763 f
"{delta_petsc:+.6f} s ({format_delta_percent(delta_petsc_pct)}%)",
1766 for event_name
in args.petsc_event:
1768 petsc_b[
"mean_petsc_events"][event_name]
1769 - petsc_a[
"mean_petsc_events"][event_name]
1772 100.0 * delta_event / petsc_a[
"mean_petsc_events"][event_name]
1773 if petsc_a[
"mean_petsc_events"][event_name]
1777 f
"{event_name} delta ({case_b.label} - {case_a.label}): "
1778 f
"{delta_event:+.6f} s ({format_delta_percent(delta_event_pct)}%)",
1784 print(
"Phase 3: perf stat", flush=
True)
1787 app_args=args.cmd_args,
1788 workdir=args.workdir,
1789 results_dir=result_dir,
1790 perf_runs=args.perf_runs,
1791 perf_bin=args.perf_bin,
1792 perf_events=args.perf_event,
1796 app_args=args.cmd_args,
1797 workdir=args.workdir,
1798 results_dir=result_dir,
1799 perf_runs=args.perf_runs,
1800 perf_bin=args.perf_bin,
1801 perf_events=args.perf_event,
1803 summary[
"perf"] = {case_a.label: perf_a, case_b.label: perf_b}
1804 write_perf_csv(result_dir / f
"{case_a.label}_perf.csv", case_a.label, perf_a)
1805 write_perf_csv(result_dir / f
"{case_b.label}_perf.csv", case_b.label, perf_b)
1807 f
"{case_a.label}: mean perf instructions {perf_a['mean_events']['instructions']:.0f}",
1811 f
"{case_a.label}: mean perf cycles {perf_a['mean_events']['cycles']:.0f}",
1815 f
"{case_a.label}: mean IPC {perf_a.get('mean_ipc', math.nan):.6f}",
1819 f
"{case_b.label}: mean perf instructions {perf_b['mean_events']['instructions']:.0f}",
1823 f
"{case_b.label}: mean perf cycles {perf_b['mean_events']['cycles']:.0f}",
1827 f
"{case_b.label}: mean IPC {perf_b.get('mean_ipc', math.nan):.6f}",
1831 perf_b[
"mean_events"][
"instructions"] - perf_a[
"mean_events"][
"instructions"]
1834 100.0 * delta_instr / perf_a[
"mean_events"][
"instructions"]
1835 if perf_a[
"mean_events"][
"instructions"]
1839 f
"Instructions delta ({case_b.label} - {case_a.label}): "
1840 f
"{delta_instr:+.0f} ({format_delta_percent(delta_instr_pct)}%)",
1844 perf_b[
"mean_events"][
"cycles"] - perf_a[
"mean_events"][
"cycles"]
1846 delta_cycles_pct = (
1847 100.0 * delta_cycles / perf_a[
"mean_events"][
"cycles"]
1848 if perf_a[
"mean_events"][
"cycles"]
1852 f
"Cycles delta ({case_b.label} - {case_a.label}): "
1853 f
"{delta_cycles:+.0f} ({format_delta_percent(delta_cycles_pct)}%)",
1856 if "mean_ipc" in perf_a
and "mean_ipc" in perf_b:
1857 delta_ipc = perf_b[
"mean_ipc"] - perf_a[
"mean_ipc"]
1859 100.0 * delta_ipc / perf_a[
"mean_ipc"]
1860 if perf_a[
"mean_ipc"]
1864 f
"IPC delta ({case_b.label} - {case_a.label}): "
1865 f
"{delta_ipc:+.6f} ({format_delta_percent(delta_ipc_pct)}%)",
1868 perf_plot = result_dir /
"perf_ipc_comparison.svg"
1870 summary[
"perf_plot"] = str(perf_plot)
1871 print(f
"perf plot: {perf_plot}", flush=
True)
1872 except RuntimeError
as exc:
1873 summary[
"perf_error"] = str(exc)
1874 print(f
"perf stat unavailable: {exc}", flush=
True)
1875 if args.require_perf:
1878 if not args.skip_callgrind:
1879 print(
"Phase 4: Callgrind", flush=
True)
1880 callgrind_args = DEFAULT_CALLGRIND_ARGS + args.callgrind_arg
1883 app_args=args.cmd_args,
1884 workdir=args.workdir,
1885 results_dir=result_dir,
1886 callgrind_args=callgrind_args,
1890 app_args=args.cmd_args,
1891 workdir=args.workdir,
1892 results_dir=result_dir,
1893 callgrind_args=callgrind_args,
1895 summary[
"callgrind"] = {case_a.label: cg_a, case_b.label: cg_b}
1899 callgrind_event_names =
ordered_unique([*event_names_a, *event_names_b])
1900 ir_total_a = totals_a.get(
"Ir")
1901 ir_total_b = totals_b.get(
"Ir")
1902 summary[
"callgrind_event_names"] = callgrind_event_names
1903 summary[
"callgrind_totals"] = {case_a.label: totals_a, case_b.label: totals_b}
1904 summary[
"callgrind_totals_ir"] = {
1905 case_a.label: ir_total_a,
1906 case_b.label: ir_total_b,
1908 summary[
"callgrind_simulation_totals"] = {
1915 base_cpi=args.sim_base_cpi,
1916 l1_miss_penalty=args.sim_l1_miss_penalty,
1917 ll_miss_penalty=args.sim_ll_miss_penalty,
1918 branch_miss_penalty=args.sim_branch_miss_penalty,
1927 base_cpi=args.sim_base_cpi,
1928 l1_miss_penalty=args.sim_l1_miss_penalty,
1929 ll_miss_penalty=args.sim_ll_miss_penalty,
1930 branch_miss_penalty=args.sim_branch_miss_penalty,
1934 for label
in (case_a.label, case_b.label):
1935 totals_map = summary[
"callgrind_simulation_totals"][label]
1936 ir_total = summary[
"callgrind_totals_ir"][label]
1937 est_cycles = totals_map[
"estimated_cycles"]
1938 totals_map[
"ir_per_estimated_cycle"] = (
1939 ir_total / est_cycles
if ir_total
and est_cycles
else math.nan
1942 exclude_patterns = [re.compile(p)
for p
in (DEFAULT_EXCLUDE_PATTERNS + args.exclude_pattern)]
1943 include_patterns = [re.compile(p)
for p
in args.include_pattern]
1950 diff_metric_label =
"Ir"
1951 diff_title =
"Top {count} Callgrind differences"
1955 base_cpi=args.sim_base_cpi,
1956 l1_miss_penalty=args.sim_l1_miss_penalty,
1957 ll_miss_penalty=args.sim_ll_miss_penalty,
1958 branch_miss_penalty=args.sim_branch_miss_penalty,
1962 base_cpi=args.sim_base_cpi,
1963 l1_miss_penalty=args.sim_l1_miss_penalty,
1964 ll_miss_penalty=args.sim_ll_miss_penalty,
1965 branch_miss_penalty=args.sim_branch_miss_penalty,
1970 have_simulated_events = all(
1971 name
in callgrind_event_names
1972 for name
in [
"Ir",
"I1mr",
"D1mr",
"D1mw",
"ILmr",
"DLmr",
"DLmw",
"Bcm",
"Bim"]
1974 if have_simulated_events:
1975 diff_rows = diff_rows_est_cycles
1976 diff_metric_label =
"est. cycles (Valgrind sim)"
1977 diff_title =
"Top {count} estimated cycle differences"
1978 summary[
"callgrind_diff_metric"] = {
1979 "kind":
"estimated_cycles_from_valgrind_sim",
1980 "metric_label": diff_metric_label,
1981 "base_cpi": args.sim_base_cpi,
1982 "l1_miss_penalty": args.sim_l1_miss_penalty,
1983 "ll_miss_penalty": args.sim_ll_miss_penalty,
1984 "branch_miss_penalty": args.sim_branch_miss_penalty,
1987 "Per-function Callgrind differences shown as estimated cycles "
1988 "from Valgrind cache/branch simulation.",
1994 and "mean_ipc" in perf_a
1995 and "mean_ipc" in perf_b
1996 and perf_a[
"mean_ipc"]
1997 and perf_b[
"mean_ipc"]
2000 diff_rows, perf_a[
"mean_ipc"], perf_b[
"mean_ipc"]
2002 diff_metric_label =
"est. cycles (Ir / IPC)"
2003 diff_title =
"Top {count} estimated cycle differences"
2004 summary[
"callgrind_diff_metric"] = {
2005 "kind":
"estimated_cycles_from_perf_ipc",
2006 "metric_label": diff_metric_label,
2007 "ipc_a": perf_a[
"mean_ipc"],
2008 "ipc_b": perf_b[
"mean_ipc"],
2011 "Per-function Callgrind differences shown as estimated cycles "
2012 "(function Ir divided by whole-run IPC from perf stat).",
2018 "perf stat did not produce usable IPC, so per-function differences "
2019 "remain in raw Ir.",
2024 "Valgrind simulated cache/branch events were unavailable, so "
2025 "per-function differences remain in raw Ir.",
2028 summary[
"callgrind_diff_metric"] = {
2030 "metric_label": diff_metric_label,
2033 diff_rows.sort(key=
lambda row: row[
"abs_delta"], reverse=
True)
2037 cg_diff_path = result_dir /
"callgrind.cg_diff.out"
2039 cg_diff_proc = subprocess.run(
2040 [
"cg_diff", cg_a[
"callgrind_out"], cg_b[
"callgrind_out"]],
2042 stdout=subprocess.PIPE,
2043 stderr=subprocess.STDOUT,
2046 cg_diff_path.write_text(cg_diff_proc.stdout)
2047 except subprocess.CalledProcessError
as exc:
2048 (result_dir /
"callgrind.cg_diff.log").write_text(exc.stdout
or "")
2051 f
"{case_a.label}: callgrind total Ir {ir_total_a:,}" if ir_total_a
is not None else f
"{case_a.label}: callgrind total Ir unavailable",
2055 f
"{case_b.label}: callgrind total Ir {ir_total_b:,}" if ir_total_b
is not None else f
"{case_b.label}: callgrind total Ir unavailable",
2059 f
"{case_a.label}: simulated L1 misses {total_l1_misses(totals_a):,}",
2063 f
"{case_b.label}: simulated L1 misses {total_l1_misses(totals_b):,}",
2067 f
"{case_a.label}: simulated LL misses {total_ll_misses(totals_a):,}",
2071 f
"{case_b.label}: simulated LL misses {total_ll_misses(totals_b):,}",
2075 f
"{case_a.label}: simulated branch misses {total_branch_misses(totals_a):,}",
2079 f
"{case_b.label}: simulated branch misses {total_branch_misses(totals_b):,}",
2082 est_cycles_a = summary[
"callgrind_simulation_totals"][case_a.label][
"estimated_cycles"]
2083 est_cycles_b = summary[
"callgrind_simulation_totals"][case_b.label][
"estimated_cycles"]
2085 f
"{case_a.label}: estimated cycles {est_cycles_a:.0f}",
2089 f
"{case_b.label}: estimated cycles {est_cycles_b:.0f}",
2092 sim_ipc_a = summary[
"callgrind_simulation_totals"][case_a.label][
"ir_per_estimated_cycle"]
2093 sim_ipc_b = summary[
"callgrind_simulation_totals"][case_b.label][
"ir_per_estimated_cycle"]
2095 f
"{case_a.label}: Ir / est_cycles {sim_ipc_a:.6f}",
2099 f
"{case_b.label}: Ir / est_cycles {sim_ipc_b:.6f}",
2102 if ir_total_a
is not None and ir_total_b
is not None:
2103 delta_total = ir_total_b - ir_total_a
2104 delta_pct = 100.0 * delta_total / ir_total_a
if ir_total_a
else float(
"inf")
2106 f
"Callgrind total delta ({case_b.label} - {case_a.label}): "
2107 f
"{delta_total:+,} Ir ({format_delta_percent(delta_pct)}%)",
2110 delta_est_cycles = est_cycles_b - est_cycles_a
2111 delta_est_cycles_pct = (
2112 100.0 * delta_est_cycles / est_cycles_a
if est_cycles_a
else float(
"inf")
2115 f
"Estimated cycles delta ({case_b.label} - {case_a.label}): "
2116 f
"{delta_est_cycles:+.0f} ({format_delta_percent(delta_est_cycles_pct)}%)",
2119 if not math.isnan(sim_ipc_a)
and not math.isnan(sim_ipc_b):
2120 delta_sim_ipc = sim_ipc_b - sim_ipc_a
2121 delta_sim_ipc_pct = 100.0 * delta_sim_ipc / sim_ipc_a
if sim_ipc_a
else float(
"inf")
2123 f
"Ir / est_cycles delta ({case_b.label} - {case_a.label}): "
2124 f
"{delta_sim_ipc:+.6f} ({format_delta_percent(delta_sim_ipc_pct)}%)",
2130 f
"Top branch differences ({case_b.label} - {case_a.label}) present in both branches:",
2135 metric_label=diff_metric_label,
2137 diff_plot = result_dir /
"callgrind_top_differences.svg"
2143 metric_label=diff_metric_label,
2144 title=diff_title.format(count=len(top_diff_rows)),
2146 summary[
"callgrind_diff_plot"] = str(diff_plot)
2147 print(f
"callgrind diff plot: {diff_plot}", flush=
True)
2149 print(
"Phase 4: Callgrind skipped (--skip-callgrind)", flush=
True)
2151 summary_path = result_dir /
"summary.json"
2152 summary_path.write_text(json.dumps(summary, indent=2))
2153 print(f
"summary: {summary_path}", flush=
True)