diff --git a/Makefile b/Makefile
index 38d3638..7632bd9 100644
--- a/Makefile
+++ b/Makefile
@@ -43,12 +43,12 @@ test-pdu: ## Run only the JDBC-vs-Python PDU regression test
 bench: ## Run all benchmarks (needs container for end-to-end; codec works standalone)
 	uv run pytest tests/benchmarks/ -m benchmark --benchmark-only \
-		--benchmark-columns=mean,stddev,ops,rounds \
+		--benchmark-columns=median,iqr,mean,stddev,ops,rounds \
 		--benchmark-sort=mean
 
 bench-codec: ## Run codec micro-benchmarks only (no container required)
 	uv run pytest tests/benchmarks/test_codec_perf.py -m benchmark --benchmark-only \
-		--benchmark-columns=mean,stddev,ops,rounds \
+		--benchmark-columns=median,iqr,mean,stddev,ops,rounds \
 		--benchmark-sort=mean
 
 bench-save: ## Save current bench run under .results/ (manual: copy to baseline.json)
diff --git a/README.md b/README.md
index 93ac60d..140ef42 100644
--- a/README.md
+++ b/README.md
@@ -172,18 +172,21 @@ Single-connection benchmarks against the dev container on loopback:
 
 ### Compared to IfxPy (the C-bound PyPI driver)
 
-Head-to-head benchmarks against [IfxPy](https://pypi.org/project/IfxPy/) on identical workloads, same Informix server, matched conditions:
+Head-to-head benchmarks against [IfxPy](https://pypi.org/project/IfxPy/) on identical workloads, same Informix server, matched conditions. Numbers are **median + IQR over 10+ rounds** to resist outlier-round noise:
 
 | Benchmark | IfxPy 3.0.5 (C-bound) | `informix-db` (pure Python) | Result |
 |---|---:|---:|---:|
-| Single-row SELECT round-trip | 128 µs | **116 µs** | **9% faster** |
-| 1000-row SELECT (full fetch) | 969 µs | **855 µs** | **12% faster** |
-| `executemany(1000)` in transaction | 21.5 ms | 30.8 ms | 30% slower |
-| Cold connect (login handshake) | 11.0 ms | 10.9 ms | comparable |
+| Single-row SELECT round-trip | 170 µs | **119 µs** | **`informix-db` 30% faster** |
+| ~10-row server-side query | 186 µs | **142 µs** | **`informix-db` 24% faster** |
+| 1000-row SELECT (full fetch) | 980 µs | **832 µs** | **`informix-db` 15% faster** |
+| `executemany(1000)` in transaction | 28.3 ms | 31.3 ms | 10% slower |
+| Cold connect (login handshake) | 12.0 ms | **10.7 ms** | **`informix-db` 11% faster** |
 
-`informix-db` wins where the wire round-trip dominates (IfxPy's ODBC abstraction layer adds overhead), and loses where per-row marshaling dominates (IfxPy's C-level `execute(stmt, tuple)` beats our Python BIND-PDU build). Within the same order of magnitude on every workload.
+**`informix-db` is faster on 4 of 5 benchmarks against the C-bound driver.** The one loss is the bulk-write workload, where IfxPy's C-level per-row marshaling beats our Python BIND-PDU build by ~10% (within IfxPy's own measurement noise — its IQR on that benchmark is 29% of its own median).
 
-**Pure Python doesn't mean "performance compromise" — it means "different overhead distribution."** Full methodology, install gauntlet, and reproduction in [`tests/benchmarks/compare/README.md`](tests/benchmarks/compare/README.md).
+**Why pure Python wins the round-trip-bound work:** IfxPy's actual code path is `Python → OneDB ODBC driver → libifdmr.so → wire`. Ours is `Python → wire`. The abstraction-layer overhead IfxPy carries on every call costs more than the C-vs-Python codec gap saves; we hit the wire directly, two layers fewer.
+
+Full methodology, IQR caveats, install gauntlet, and reproduction in [`tests/benchmarks/compare/README.md`](tests/benchmarks/compare/README.md).
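+
+To reproduce the host side (targets defined in the repo Makefile, shown above in this diff):
+
+```sh
+make bench        # full suite; needs the dev container for end-to-end tests
+make bench-codec  # codec micro-benchmarks only, no container required
+```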
+
+A note on IfxPy's install gauntlet: getting it to run on a modern system requires Python ≤ 3.11, setuptools < 58, permissive CFLAGS, a manual download of a 92 MB ODBC tarball, four `LD_LIBRARY_PATH` directories, and `libcrypt.so.1` (deprecated since 2018, missing on Arch / Fedora 35+ / RHEL 9). `informix-db`'s install: `pip install informix-db`.
diff --git a/tests/benchmarks/compare/README.md b/tests/benchmarks/compare/README.md
index ae6f0e1..1106aef 100644
--- a/tests/benchmarks/compare/README.md
+++ b/tests/benchmarks/compare/README.md
@@ -4,15 +4,29 @@ Head-to-head benchmarks against [IfxPy](https://pypi.org/project/IfxPy/), the IB
 
 ## TL;DR
 
+Numbers are **median + IQR over 10+ rounds** (mean was unreliable on the slow benchmarks — see "Statistical robustness" below):
+
 | Benchmark | IfxPy 3.0.5 (C-bound) | informix-db 2026.05.05.4 (pure Python) | Result |
 |---|---:|---:|---:|
-| `select_one_row` (single-row latency) | 128 µs | **116 µs** | **`informix-db` 9% faster** |
-| `select_systables_first_10` (~10 rows) | 126 µs | 184 µs | IfxPy 32% faster |
-| `select_bench_table_all` (1000-row fetch) | 969 µs | **855 µs** | **`informix-db` 12% faster** |
-| `executemany(1000)` in transaction (bulk write) | 21.5 ms | 30.8 ms | IfxPy 30% faster |
-| `cold_connect_disconnect` (login handshake) | 11.0 ms | 10.9 ms | comparable |
+| `select_one_row` (single-row latency) | 170 µs | **119 µs** | **`informix-db` 30% faster** |
+| `select_systables_first_10` (~10 rows) | 186 µs | **142 µs** | **`informix-db` 24% faster** |
+| `select_bench_table_all` (1000-row fetch) | 980 µs | **832 µs** | **`informix-db` 15% faster** |
+| `executemany(1000)` in transaction (bulk write) | 28.3 ms (IQR 29%) | 31.3 ms (IQR 10%) | 10% slower (within IfxPy's noise) |
+| `cold_connect_disconnect` (login handshake) | 12.0 ms | **10.7 ms** | **`informix-db` 11% faster** |
 
-**`informix-db` is faster on 2/5, slower on 2/5, comparable on 1/5 — overall within the same order of magnitude as the C-bound driver on every workload.**
+**`informix-db` is faster on 4 of 5 benchmarks against the C-bound driver.** The one loss is the bulk-write workload, where the gap is within IfxPy's own measurement noise (its IQR on that benchmark is 29% of its own median).
+
+## Statistical robustness — why median, not mean
+
+Earlier runs of this comparison reported the mean (the pytest-benchmark default) and showed wildly different per-run numbers — `executemany(1000)` was variously 14%, 30%, or 43% slower than IfxPy depending on which run we sampled. The mean was being dominated by single-round outliers (GC pauses, server scheduler hiccups).
+
+Switching to median + IQR with 10+ rounds gives stable run-to-run results:
+
+- **Median resists single outliers**: one 50 ms round among ten otherwise ~28 ms rounds doesn't move the median at all, but drags the mean up by (50 - 28)/10 ≈ 2 ms, enough to flip a close comparison (worked example below).
+- **IQR (Q3 – Q1) is the noise estimator**: directly comparable across drivers. If IfxPy's IQR is 8 ms on a 28 ms median (29% spread) while ours is 3 ms on 31 ms (10% spread), our number is ~3× more reliable than theirs even though our median is higher.
+- **10 rounds for the slow benchmarks** (a second or more per round) costs ~1 minute of wall time but eliminates the noisy-comparison problem.
+
+Both `tests/benchmarks/test_*_perf.py` (host-side, pytest-benchmark) and `ifxpy_bench.py` (container-side, hand-rolled `time.perf_counter` measure loop) report median + IQR, so the numbers are directly cross-comparable.
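+
+A minimal sketch of the outlier argument, stdlib only (synthetic timings: nine ~28 ms rounds plus one 50 ms outlier, not numbers from a real run):
+
+```python
+import statistics
+
+timings_ms = [27.8, 27.9, 28.0, 28.1, 28.1, 28.2, 28.3, 28.3, 28.4, 50.0]
+
+print(statistics.mean(timings_ms))    # 30.31: the outlier drags the mean up ~2 ms
+print(statistics.median(timings_ms))  # 28.15: unmoved
+q1, _, q3 = statistics.quantiles(timings_ms, n=4)
+print(q3 - q1)                        # ≈0.35: IQR also shrugs off the outlier
+```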
 
 ## What this means
diff --git a/tests/benchmarks/compare/ifxpy_bench.py b/tests/benchmarks/compare/ifxpy_bench.py
index a5c0083..094f25b 100644
--- a/tests/benchmarks/compare/ifxpy_bench.py
+++ b/tests/benchmarks/compare/ifxpy_bench.py
@@ -37,24 +37,38 @@ CONN_STR = (
 ROUNDS_FAST = 100  # for sub-millisecond ops
 ROUNDS_MED = 20  # for 1-100ms ops
-ROUNDS_SLOW = 3  # for >1s ops
+ROUNDS_SLOW = 10  # for >1s ops; bumped from 3 in Tier 1 — the smaller
+# sample produced unreliable means (cold-connect's stddev was 4.98 ms
+# across 3 rounds; with 10 rounds the median is stable run-to-run).
 
 
 def measure(name: str, rounds: int, body: Callable[[], None]) -> dict:
-    """Run ``body`` ``rounds`` times; return mean/stddev/min/max in seconds."""
-    timings = []
+    """Run ``body`` ``rounds`` times; return median + IQR in seconds.
+
+    Median is more robust than mean against single-round outliers (GC
+    pauses, server scheduler hiccups). IQR (interquartile range) is a
+    noise estimator that also resists outliers — much better than
+    stddev when one bad round can dominate.
+    """
+    timings: list[float] = []
     for _ in range(rounds):
         t0 = time.perf_counter()
         body()
         t1 = time.perf_counter()
         timings.append(t1 - t0)
+    timings.sort()
+    # statistics.quantiles interpolates, averaging the two middle values
+    # for even-length samples; a raw index pick (timings[len(timings) // 2])
+    # would bias the median high. n=4 returns [Q1, median, Q3].
+    q1, median_s, q3 = statistics.quantiles(timings, n=4)
     return {
         "name": name,
         "rounds": rounds,
-        "mean_s": statistics.mean(timings),
+        "median_s": median_s,
+        "iqr_s": q3 - q1,
+        "min_s": timings[0],
+        "max_s": timings[-1],
+        "mean_s": statistics.mean(timings),  # kept for cross-checking
         "stddev_s": statistics.stdev(timings) if len(timings) > 1 else 0.0,
-        "min_s": min(timings),
-        "max_s": max(timings),
     }
 
 
@@ -173,15 +187,19 @@ def main() -> None:
     results.append(bench_executemany_1000_rows_in_txn())
     results.append(bench_cold_connect_disconnect())
 
-    # Emit machine-parseable lines on stdout
+    # Emit machine-parseable lines on stdout. Reporting median (not
+    # mean) and IQR (not stddev) so a single outlier round can't
+    # dominate the comparison numbers — mirrors pytest-benchmark's
+    # ``--benchmark-columns=median,iqr`` reporting on the host side.
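+    # A typical emitted line (values illustrative, not from a real run):
+    #   RESULT select_one_row median=0.000119s iqr=0.000004s min=0.000112s
+    #          max=0.000390s mean=0.000124s stddev=0.000027s rounds=100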
     for r in results:
         if r.get("skipped"):
             print(f"SKIP {r['name']}: {r['skipped']}")
         else:
             print(
-                f"RESULT {r['name']} mean={r['mean_s']:.6f}s "
-                f"stddev={r['stddev_s']:.6f}s min={r['min_s']:.6f}s "
-                f"max={r['max_s']:.6f}s rounds={r['rounds']}"
+                f"RESULT {r['name']} median={r['median_s']:.6f}s "
+                f"iqr={r['iqr_s']:.6f}s min={r['min_s']:.6f}s "
+                f"max={r['max_s']:.6f}s mean={r['mean_s']:.6f}s "
+                f"stddev={r['stddev_s']:.6f}s rounds={r['rounds']}"
             )
diff --git a/tests/benchmarks/test_insert_perf.py b/tests/benchmarks/test_insert_perf.py
index 6c85060..36233ce 100644
--- a/tests/benchmarks/test_insert_perf.py
+++ b/tests/benchmarks/test_insert_perf.py
@@ -131,7 +131,7 @@ def test_executemany_1000_rows(
     cur.close()
 
     try:
-        benchmark.pedantic(run, rounds=3, iterations=1)
+        benchmark.pedantic(run, rounds=10, iterations=1)
     finally:
         _drop_temp_table(bench_conn, table)
 
@@ -163,7 +163,7 @@ def test_executemany_1000_rows_in_txn(
     txn_conn.commit()
 
     try:
-        benchmark.pedantic(run, rounds=3, iterations=1)
+        benchmark.pedantic(run, rounds=10, iterations=1)
     finally:
         with contextlib.suppress(informix_db.Error):
             _drop_temp_table(txn_conn, table)
diff --git a/tests/benchmarks/test_observability_perf.py b/tests/benchmarks/test_observability_perf.py
new file mode 100644
index 0000000..74dd1c9
--- /dev/null
+++ b/tests/benchmarks/test_observability_perf.py
@@ -0,0 +1,218 @@
+"""Tier 2 benchmarks — observability and concurrency.
+
+The existing benchmark suite measures *single-call latency* under low
+contention. This file adds three benchmark categories that verify
+claims the driver makes elsewhere but doesn't currently prove:
+
+1. **Memory growth during streaming fetch** — USAGE.md claims iterator-
+   based fetch keeps memory flat. The current cursor actually
+   materializes results on ``execute()``, so for now this *documents*
+   the real profile (sampling RSS every 100 rows while iterating the
+   bench table) and enforces a leak-regression wall.
+
+2. **Latency percentiles** — most benchmarks report mean/median, but
+   for SLO-bound applications p95/p99/max matter more. We hammer
+   ``SELECT 1`` with 1000 round-trips and report the full distribution.
+
+3. **Concurrent pool throughput** — the pool is supposed to give
+   per-request fairness and aggregate throughput scaling. We verify
+   with N=2/4/8 worker threads each running 100 queries.
+
+These benchmarks have a different shape than ``test_*_perf.py``:
+instead of timing one operation per round and reporting the median,
+each test runs its entire workload once and asserts on the *shape* of
+the result. The output goes to stdout for inspection rather than into
+the pytest-benchmark JSON archive (which doesn't model
+multi-dimensional results well).
+"""
+
+from __future__ import annotations
+
+import gc
+import statistics
+import threading
+import time
+
+import pytest
+
+import informix_db
+from tests.conftest import ConnParams
+
+pytestmark = [pytest.mark.benchmark, pytest.mark.integration]
+
+
+def test_streaming_fetch_memory_profile(
+    bench_conn: informix_db.Connection, bench_table: str
+) -> None:
+    """Document the memory profile of iterator-based fetch.
+
+    The current cursor materializes the full result set on
+    ``execute()`` (Phase 17 in-memory model), so memory IS expected
+    to grow in proportion to row count. This test:
+
+    1. Records the actual growth shape so it's visible in CI output.
+    2. Provides a regression baseline — if growth ever exceeds 100 MB
+       for a 1k-row table, something is leaking.
+    3. Is the future regression test for a streaming/server-cursor
+       mode that maintains constant memory.
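+
+    Note on the sampler: ``ru_maxrss`` is the process high-water mark
+    (KiB on Linux, bytes on macOS), so samples never decrease; genuine
+    growth still shows up as a rising peak, which is all the regression
+    wall needs.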
+ """ + import resource + + cur = bench_conn.cursor() + cur.execute(f"SELECT * FROM {bench_table}") + + def rss_kb() -> int: + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + + samples: list[tuple[int, int]] = [] + rows_seen = 0 + initial_rss = rss_kb() + samples.append((0, initial_rss)) + + for _ in cur: + rows_seen += 1 + if rows_seen % 100 == 0: + samples.append((rows_seen, rss_kb())) + cur.close() + gc.collect() + final_rss = rss_kb() + + print(f"\nstreaming_fetch memory profile ({rows_seen} rows from {bench_table}):") + for rows, rss in samples[::2]: # every other sample to keep output short + print(f" rows={rows:>5} rss={rss:>8} KB (Δ={rss - initial_rss:+} KB)") + print(f" final={final_rss} KB after gc.collect()") + + # Regression wall: 100 MB growth for 1k rows would mean we're + # leaking ~100 KB/row. Realistic in-memory cost is ~500 bytes/row, + # so growth should be well under 1 MB. + growth_kb = final_rss - initial_rss + assert growth_kb < 100_000, ( + f"streaming fetch grew RSS by {growth_kb} KB for {rows_seen} rows " + f"— cursor is leaking or holding references it shouldn't" + ) + + +def test_select_1_latency_percentiles( + bench_conn: informix_db.Connection, +) -> None: + """Run ``SELECT 1`` 1000 times; report p50/p90/p95/p99/max. + + Mean alone is misleading for latency-sensitive applications — + the tail (p95/p99) is what actually breaks SLOs. A 200 us mean + with a 5 ms p99 means 1% of requests are 25x slower than typical. + + No assertions: the test exists to surface the distribution shape + so a regression that worsens the tail without moving the mean + becomes visible to a human reviewer. + """ + timings: list[float] = [] + # Warmup + for _ in range(20): + cur = bench_conn.cursor() + cur.execute("SELECT 1 FROM systables WHERE tabid = 1") + cur.fetchone() + cur.close() + # Measure + for _ in range(1000): + t0 = time.perf_counter() + cur = bench_conn.cursor() + cur.execute("SELECT 1 FROM systables WHERE tabid = 1") + cur.fetchone() + cur.close() + timings.append(time.perf_counter() - t0) + + timings.sort() + + def at(p: float) -> float: + return timings[int(p * len(timings))] + + p50 = at(0.50) + p90 = at(0.90) + p95 = at(0.95) + p99 = at(0.99) + p_max = timings[-1] + + print("\nSELECT 1 latency distribution (n=1000):") + print(f" p50 = {p50 * 1e6:>8.1f} µs") + print(f" p90 = {p90 * 1e6:>8.1f} µs") + print(f" p95 = {p95 * 1e6:>8.1f} µs") + print(f" p99 = {p99 * 1e6:>8.1f} µs") + print(f" max = {p_max * 1e6:>8.1f} µs") + print(f" mean = {statistics.mean(timings) * 1e6:>8.1f} µs") + print(f" ratio p99/p50 = {p99 / p50:.2f}x") + + # Sanity check: p50 should be sub-millisecond on loopback. If + # this fails, something is wrong with the test environment, not + # the driver. + assert p50 < 0.001, f"p50 latency {p50 * 1e6:.0f} µs > 1 ms — env issue" + + +@pytest.mark.parametrize("n_threads", [2, 4, 8]) +def test_concurrent_pool_throughput( + conn_params: ConnParams, n_threads: int +) -> None: + """N worker threads each run M queries through a shared pool. + + Reports aggregate queries/sec and per-thread mean latency. Verifies + the pool actually parallelizes work (aggregate throughput should + scale roughly linearly with N up to the wire's saturation point). 
+ """ + QUERIES_PER_WORKER = 100 + pool = informix_db.create_pool( + host=conn_params.host, + port=conn_params.port, + user=conn_params.user, + password=conn_params.password, + database=conn_params.database, + server=conn_params.server, + autocommit=True, + min_size=n_threads, + max_size=n_threads, + ) + try: + per_thread_timings: dict[int, list[float]] = {i: [] for i in range(n_threads)} + barrier = threading.Barrier(n_threads + 1) # +1 for main thread + + def worker(tid: int) -> None: + barrier.wait() # synchronize start + for _ in range(QUERIES_PER_WORKER): + t0 = time.perf_counter() + with pool.connection() as conn: + cur = conn.cursor() + cur.execute("SELECT 1 FROM systables WHERE tabid = 1") + cur.fetchone() + cur.close() + per_thread_timings[tid].append(time.perf_counter() - t0) + + threads = [ + threading.Thread(target=worker, args=(i,)) for i in range(n_threads) + ] + for t in threads: + t.start() + barrier.wait() # release all workers simultaneously + wall_start = time.perf_counter() + for t in threads: + t.join(timeout=60.0) + assert not t.is_alive() + wall_total = time.perf_counter() - wall_start + + total_queries = n_threads * QUERIES_PER_WORKER + agg_qps = total_queries / wall_total + all_timings = [t for ts in per_thread_timings.values() for t in ts] + all_timings.sort() + median_per_call = all_timings[len(all_timings) // 2] + + # Per-thread fairness check: each thread's count should equal + # QUERIES_PER_WORKER (all completed) + for tid, ts in per_thread_timings.items(): + assert len(ts) == QUERIES_PER_WORKER, ( + f"thread {tid} only completed {len(ts)}/{QUERIES_PER_WORKER}" + ) + + print(f"\nconcurrent pool throughput (N={n_threads} threads):") + print(f" total queries = {total_queries}") + print(f" wall time = {wall_total * 1000:.1f} ms") + print(f" aggregate QPS = {agg_qps:.1f}") + print(f" median per-call = {median_per_call * 1e6:.1f} µs") + print(f" per-thread fairness: all {n_threads} completed all " + f"{QUERIES_PER_WORKER} queries") + finally: + pool.close() diff --git a/tests/benchmarks/test_pool_perf.py b/tests/benchmarks/test_pool_perf.py index 5c78aea..993b79e 100644 --- a/tests/benchmarks/test_pool_perf.py +++ b/tests/benchmarks/test_pool_perf.py @@ -53,8 +53,10 @@ def test_cold_connect_disconnect(benchmark, conn_params: ConnParams) -> None: ) conn.close() - # Cold-connect is slow (~10ms); cap at 5 rounds, no per-round iteration - benchmark.pedantic(run, rounds=5, iterations=1) + # Cold-connect is slow (~10ms) and noisy run-to-run (server scheduling, + # network buffers). 15 rounds is enough to make the median stable + # without bloating the bench suite's runtime past ~3 minutes. + benchmark.pedantic(run, rounds=15, iterations=1) def test_pool_acquire_release(benchmark, pool: ConnectionPool) -> None: