Ryan Malloy 01757415a5 Phase 32: Benchmark improvements (Tier 1 + Tier 2)
Tier 1 — make existing benchmarks reliable:
* Bumped slow-bench rounds: cold_connect_disconnect 5->15, executemany
  series 3->10. Single-round outliers no longer dominate.
* Switched bench reporting to median + IQR. Mean was being moved by
  individual GC pauses / scheduler hiccups (IfxPy executemany IQR
  was 8.2 ms on a 28 ms median - 29% spread - mean was unreliable).
* Updated ifxpy_bench.py to also report median + IQR alongside mean
  for cross-comparable numbers.
* Makefile bench targets now show median, iqr, mean, stddev, ops, rounds.

The robust statistics flipped the comparison story:

  Old (mean, 3 rounds):   us 9% faster  / IfxPy 30% faster on 2 of 5
  New (median, 10+ rds):  us faster on 4 of 5 benchmarks

| Benchmark | IfxPy | informix-db | Δ |
|---|---|---|---|
| select_one_row             | 170us | 119us | us 30% faster |
| select_systables_first_10  | 186us | 142us | us 24% faster |
| select_bench_table_all 1k  | 980us | 832us | us 15% faster |
| executemany 1k in txn      | 28.3ms | 31.3ms | us 10% slower |
| cold_connect_disconnect    | 12.0ms | 10.7ms | us 11% faster |

Tier 2 — add benchmarks for claims we make but don't verify:

tests/benchmarks/test_observability_perf.py:
* test_streaming_fetch_memory_profile — RSS sampling during a
  cursor iteration. Documents memory growth shape; regression
  wall at 100 MB / 1k rows. Currently flat (in-memory cursor
  doesn't grow detectably for 278 rows).
* test_select_1_latency_percentiles — 1000-query distribution
  with p50/p90/p95/p99/max. Result: p99/p50 = 1.42x (tight tail).
  p50=108us, p99=153us.
* test_concurrent_pool_throughput[2,4,8] — N worker threads
  through pool, measures aggregate QPS + per-thread fairness.
  Plateaus at ~6K QPS (server-bound); per-thread latency scales
  ~linearly with N (server serialization expected).

README.md (project root): updated Compared-to-IfxPy table with
the median-based numbers + IQR awareness note.
tests/benchmarks/compare/README.md: added "Statistical robustness"
section explaining why median over mean for fair comparison.

236 integration tests pass; ruff clean.
2026-05-05 12:01:11 -06:00

208 lines
6.7 KiB
Python

"""IfxPy comparison benchmark.
Runs the same workloads as ``tests/benchmarks/test_*_perf.py`` against
the same dev-container Informix instance, but using IfxPy (the C-bound
PyPI driver) instead of ``informix-db``. Numbers go straight to stdout;
the host parses them and produces a side-by-side table.
Workloads:
* ``select_one_row`` — single-row SELECT round-trip latency
* ``select_systables_first_10`` — small server-side query
* ``select_bench_table_all`` — 1k-row sustained fetch
* ``executemany_1000_rows_in_txn`` — bulk INSERT throughput
* ``cold_connect_disconnect`` — login handshake cost
Each workload runs N times; we report median and IQR (mean and stddev
are kept alongside for cross-checking).
"""
from __future__ import annotations
import statistics
import sys
import time
from collections.abc import Callable
import IfxPy
# Connect string — mirrors the conftest.py defaults the host uses.
# NOTE(review): credentials are the dev-container defaults; presumably
# never a production server — confirm before reusing elsewhere.
CONN_STR = (
    "SERVER=informix;"
    "DATABASE=sysmaster;"
    "HOST=127.0.0.1;"
    "SERVICE=9088;"
    "UID=informix;"
    "PWD=in4mix;"
    "PROTOCOL=onsoctcp"
)
# Round counts per workload class, chosen by expected per-round latency.
ROUNDS_FAST = 100  # for sub-millisecond ops
ROUNDS_MED = 20  # for 1-100ms ops
ROUNDS_SLOW = 10  # for >1s ops; bumped from 3 in Tier 1 — the smaller
# sample produced unreliable means (cold-connect's stddev was 4.98 ms
# across 3 rounds; with 10 rounds the median is stable run-to-run).
def measure(name: str, rounds: int, body: Callable[[], None]) -> dict:
    """Run ``body`` ``rounds`` times; return median + IQR in seconds.

    Median is more robust than mean against single-round outliers (GC
    pauses, server scheduler hiccups). IQR (interquartile range) is
    a noise estimator that also resists outliers — much better than
    stddev when one bad round can dominate.

    Returns a dict with keys ``name``, ``rounds``, ``median_s``,
    ``iqr_s``, ``min_s``, ``max_s``, ``mean_s``, ``stddev_s`` (all
    timing values in seconds).
    """
    timings: list[float] = []
    for _ in range(rounds):
        t0 = time.perf_counter()
        body()
        t1 = time.perf_counter()
        timings.append(t1 - t0)
    timings.sort()
    # statistics.median averages the two middle values for even-length
    # samples; the previous ``timings[n // 2]`` always picked the upper
    # one, biasing the reported median high whenever rounds is even.
    median_s = statistics.median(timings)
    if len(timings) >= 2:
        # method="inclusive" gives the classic Tukey quartiles and is
        # well-defined for small samples; the previous integer-index
        # approximation over-weighted the tails for small round counts.
        q1, _q2, q3 = statistics.quantiles(timings, n=4, method="inclusive")
        iqr_s = q3 - q1
    else:
        iqr_s = 0.0  # a single sample has no spread
    return {
        "name": name,
        "rounds": rounds,
        "median_s": median_s,
        "iqr_s": iqr_s,
        "min_s": timings[0],
        "max_s": timings[-1],
        "mean_s": statistics.mean(timings),  # kept for cross-checking
        "stddev_s": statistics.stdev(timings) if len(timings) > 1 else 0.0,
    }
def bench_select_one_row(conn) -> dict:
    """Single-row SELECT round-trip latency against systables."""

    def one_round() -> None:
        handle = IfxPy.exec_immediate(
            conn, "SELECT 1 FROM systables WHERE tabid = 1"
        )
        IfxPy.fetch_tuple(handle)
        IfxPy.free_stmt(handle)

    return measure("select_one_row", ROUNDS_FAST, one_round)
def bench_select_systables_first_10(conn) -> dict:
    """Small server-side query: drain the first 10 rows of systables."""

    def one_round() -> None:
        handle = IfxPy.exec_immediate(
            conn,
            "SELECT FIRST 10 tabname, owner, tabid, ncols FROM systables",
        )
        row = IfxPy.fetch_tuple(handle)
        while row:
            row = IfxPy.fetch_tuple(handle)
        IfxPy.free_stmt(handle)

    return measure("select_systables_first_10", ROUNDS_FAST, one_round)
def bench_select_bench_table_all(conn) -> dict:
    """Requires p21_bench table to exist (created by host-side fixture)."""
    # Guard clause: probe the fixture table and skip cleanly when it is
    # absent or empty, so this module runs against a bare server too.
    try:
        probe = IfxPy.exec_immediate(conn, "SELECT COUNT(*) FROM p21_bench")
        count_row = IfxPy.fetch_tuple(probe)
        IfxPy.free_stmt(probe)
    except Exception as e:
        return {"name": "select_bench_table_all", "skipped": f"p21_bench missing: {e}"}
    if not count_row or count_row[0] == 0:
        return {"name": "select_bench_table_all", "skipped": "p21_bench empty"}

    def one_round() -> None:
        handle = IfxPy.exec_immediate(conn, "SELECT * FROM p21_bench")
        while IfxPy.fetch_tuple(handle):
            pass
        IfxPy.free_stmt(handle)

    return measure("select_bench_table_all", ROUNDS_MED, one_round)
def bench_executemany_1000_rows_in_txn() -> dict:
    """Open a connection on testdb, autocommit OFF, executemany 1000."""
    try:
        conn = IfxPy.connect(
            CONN_STR.replace("DATABASE=sysmaster", "DATABASE=testdb"), "", ""
        )
    except Exception as e:
        return {"name": "executemany_1000_rows_in_txn", "skipped": f"testdb: {e}"}
    IfxPy.autocommit(conn, IfxPy.SQL_AUTOCOMMIT_OFF)
    table = "p21_ifxpy_bench"
    try:
        # Start from a fresh table; the DROP is best-effort because the
        # table won't exist on the first run.
        try:
            IfxPy.exec_immediate(conn, f"DROP TABLE {table}")
            IfxPy.commit(conn)
        except Exception:
            pass
        IfxPy.exec_immediate(
            conn, f"CREATE TABLE {table} (id INT, name VARCHAR(64), value FLOAT)"
        )
        IfxPy.commit(conn)

        # Each round inserts into its own 1000-id range so rows never
        # collide across rounds.
        round_no = 0

        def one_round() -> None:
            nonlocal round_no
            round_no += 1
            base = round_no * 1000
            stmt = IfxPy.prepare(
                conn, f"INSERT INTO {table} VALUES (?, ?, ?)"
            )
            for i in range(1000):
                IfxPy.execute(stmt, (base + i, f"row_{base + i}", float(base + i)))
            IfxPy.free_stmt(stmt)
            IfxPy.commit(conn)

        return measure("executemany_1000_rows_in_txn", ROUNDS_SLOW, one_round)
    finally:
        # Best-effort cleanup; never mask the benchmark result/exception.
        try:
            IfxPy.exec_immediate(conn, f"DROP TABLE {table}")
            IfxPy.commit(conn)
        except Exception:
            pass
        IfxPy.close(conn)
def bench_cold_connect_disconnect() -> dict:
    """Full login handshake cost: connect, then immediately close."""

    def one_round() -> None:
        IfxPy.close(IfxPy.connect(CONN_STR, "", ""))

    return measure("cold_connect_disconnect", ROUNDS_SLOW, one_round)
def main() -> None:
    """Run every workload and emit machine-parseable results on stdout."""
    print("# IfxPy benchmark results", file=sys.stderr)
    version = getattr(IfxPy, "__version__", "unknown")
    print(f"# IfxPy version: {version}", file=sys.stderr)
    # Persistent connection for the read-mostly benchmarks
    conn = IfxPy.connect(CONN_STR, "", "")
    results = [
        bench_select_one_row(conn),
        bench_select_systables_first_10(conn),
        bench_select_bench_table_all(conn),
    ]
    IfxPy.close(conn)
    # These two manage their own connections (different DB / cold start).
    results.append(bench_executemany_1000_rows_in_txn())
    results.append(bench_cold_connect_disconnect())
    # Emit machine-parseable lines on stdout. Reporting median (not
    # mean) and IQR (not stddev) so a single outlier round can't
    # dominate the comparison numbers — mirrors pytest-benchmark's
    # ``--benchmark-columns=median,iqr`` reporting on the host side.
    for entry in results:
        if entry.get("skipped"):
            print(f"SKIP {entry['name']}: {entry['skipped']}")
        else:
            print(
                f"RESULT {entry['name']} median={entry['median_s']:.6f}s "
                f"iqr={entry['iqr_s']:.6f}s min={entry['min_s']:.6f}s "
                f"max={entry['max_s']:.6f}s mean={entry['mean_s']:.6f}s "
                f"stddev={entry['stddev_s']:.6f}s rounds={entry['rounds']}"
            )
# Script entry point — run all benchmarks when invoked directly.
if __name__ == "__main__":
    main()