From 5c4a7a57f16b819144731c83ecf6e9013e638aa0 Mon Sep 17 00:00:00 2001 From: Ryan Malloy Date: Tue, 5 May 2026 03:22:18 -0600 Subject: [PATCH] Phase 26: Pool rollback-on-release - CRITICAL data-correctness fix (2026.05.05) Fixes the dirty-pool-checkout bug surfaced by Margaret Hamilton's system-wide audit (Critical #1). The bug: ConnectionPool.release() returned connections with open server-side transactions still active. Request A's uncommitted INSERTs would be inherited by Request B reusing the same connection - B's commit would land A's writes permanently; B's rollback would silently lose them. Same shape as psycopg2's pre-2.5 dirty-pool bug. The fix: pool.release() now rolls back any open transaction before returning the connection to the idle list. The rollback runs OUTSIDE the pool lock since it's a wire round-trip - the connection is already off the idle list and counted in _total, so no other thread can grab it during the rollback window. If the rollback itself fails (dead socket, etc.), the connection is evicted rather than recycled. Async path covered automatically: AsyncConnectionPool.release() delegates to the sync pool's release via _to_thread. Margaret Hamilton review pass surfaced two findings, both addressed: * Silent rollback failure: added a WARNING log via logging.getLogger ("informix_db.pool") so evictions are debuggable. First logger in the project. * Async cancellation race: the fix doesn't introduce the asyncio.wait_for race (Critical #2, deferred to Phase 27), but it adds a code path that can trigger it. Documented loudly in pool.release() docstring, aio.py module docstring, and USAGE.md async section. Recommendation: use read_timeout on the connection instead of asyncio.wait_for until Phase 27 lands. 
Two new regression tests in tests/test_pool.py: * test_uncommitted_writes_invisible_to_next_acquirer (the bug) * test_committed_writes_survive_pool_checkout (no over-correction) Verified the regression test catches the bug: stashed the fix, ran the test - it fails with "B sees 1 rows - leaked across pool checkout boundary" - confirming it tests the real failure mode. Total tests: 72 unit + 226 integration + 28 benchmark = 326. Deferred to Phase 27 per Hamilton audit: * Critical #2 (concurrency / per-connection wire lock) * High #3 (async cancellation routes to broken=True) * High #4 (bare except in _raise_sq_err drain) * High #5 (no cursor finalizers - server-side resource leaks) --- CHANGELOG.md | 48 ++++++++++++++ docs/USAGE.md | 16 +++++ pyproject.toml | 2 +- src/informix_db/aio.py | 19 ++++++ src/informix_db/pool.py | 62 +++++++++++++++++- tests/test_pool.py | 141 ++++++++++++++++++++++++++++++++++++++++ uv.lock | 2 +- 7 files changed, 287 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 93ef205..002b89f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,54 @@ All notable changes to `informix-db`. Versioning is [CalVer](https://calver.org/) — `YYYY.MM.DD` for date-based releases, `YYYY.MM.DD.N` for same-day post-releases per PEP 440. +## 2026.05.05 — Pool rollback-on-release (Phase 26): CRITICAL data-correctness fix + +Fixes the dirty-pool-checkout bug surfaced by Margaret Hamilton's system-wide audit. **This is the most important fix in the project's history so far** — it eliminates a class of silent data-correctness failures that affect any application using both the connection pool and non-autocommit transactions. + +### The bug + +Before this fix, when a request released a connection to the pool with an open server-side transaction (no explicit `commit()` or `rollback()`), the connection went back to the idle list with the transaction still active. The next request to acquire that connection would inherit it. 
Specifically: + +- **Request A** does `INSERT` (no commit) → returns 200 to user → context manager calls `pool.release()` → connection rejoins `_idle` *with A's open transaction*. +- **Request B** acquires the same connection → its first DML runs *inside A's transaction*. If B commits, A's writes land **permanently**. If B errors before commit, A's writes are silently rolled back. + +This is the same shape as the bug psycopg2 fixed years ago. Hamilton ranked it Critical #1 in the audit. + +### The fix + +`ConnectionPool.release()` now rolls back any uncommitted transaction *before* the connection rejoins the idle list. If the rollback itself fails (dead socket, etc.), the connection is evicted instead of recycled — half-state never returns to the pool. + +The rollback runs *outside* the pool lock (it's a wire round-trip; we don't want to block other pool operations). The connection is safe to operate on alone because it's not yet in `_idle` and `_total` already counts it as owned. + +### Async path covered automatically + +`AsyncConnectionPool.release()` delegates to the sync pool's release via `_to_thread`, so async users get the fix transparently. + +### Margaret Hamilton review pass + +The fix went through a focused review. Two findings addressed before tagging: + +- **Silent failure on rollback exception**: Hamilton flagged that `except Exception:` swallowed rollback failures invisibly. Added a `WARNING`-level log via `logging.getLogger("informix_db.pool")` so evictions are debuggable. (No existing logging convention in the codebase before now; this is the first.) +- **Async cancellation interaction**: the fix doesn't *introduce* the `asyncio.wait_for` race (that's Critical #2 from the audit, deferred to Phase 27), but it adds a code path that can trigger it. Documented loudly in `pool.release()`'s docstring, in `aio.py`'s module docstring, and in `USAGE.md`'s async section. 
**The recommendation: use `read_timeout` on the connection instead of `asyncio.wait_for` until Phase 27 lands the per-connection wire lock.** + +### Tests + +Two new integration tests in `tests/test_pool.py`: + +- **`test_uncommitted_writes_invisible_to_next_acquirer`** — the regression test. Pool with `max_size=1` forces A and B to share the connection. A inserts without committing, A's context exits, B acquires (gets the same connection), B verifies no rows visible. **Verified** by stashing the fix and confirming the test fails with "B sees 1 rows — leaked across pool checkout boundary" — confirming the test catches the actual bug, not a false positive. +- **`test_committed_writes_survive_pool_checkout`** — counterpart that proves the fix doesn't over-correct (committed writes still persist). + +Total tests: 72 unit + 226 integration + 28 benchmark = **326**. + +### What this fix does NOT cover (deferred to Phase 27) + +- **Concurrency / per-connection wire lock** (Hamilton Critical #2). `Connection._lock` is currently held only by `close()`; every wire-touching method runs unsynchronized. Two threads on the same connection (or async cancellation leaving a worker thread still running) can interleave PDU bytes. The Phase 26 fix doesn't make this worse than baseline, but it adds a second path that can trigger it. +- **No finalizers on cursors** (Hamilton High #5). Mid-fetch raises still leak server-side prepared statements / scrollable cursors. +- **Bare `except: pass` in `_raise_sq_err` drain** (Hamilton High #4). Masks ProtocolError during error decode. +- **Async cancellation evicts cleanly** (Hamilton High #3). Currently `TimeoutError`/`CancelledError` aren't routed to `broken=True` — connection rejoins pool. Phase 26 mitigates the data corruption from this case (rollback runs on release), but the wire-desync race remains until Phase 27. + +These are real and known. Documented in the audit punch list. 
+ ## 2026.05.04.10 — Branch reorder by frequency + invariant tripwires (Phase 25) Third-pass optimization on `parse_tuple_payload`. Previous phases removed redundant work; this one removes *correct-but-wasteful* work: the if/elif chain checked branches in implementation order, not frequency order. Fixed-width types (INT, FLOAT, DATE, BIGINT — by far the most common columns in real queries) sat at the *bottom* of the chain, paying ~7 frozenset/equality misses per column. diff --git a/docs/USAGE.md b/docs/USAGE.md index 5527152..7c88d0a 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -494,6 +494,22 @@ await pool.close() The async API mirrors the sync API one-to-one. Each blocking I/O call is offloaded to a worker thread via `asyncio.to_thread` — the event loop never blocks; concurrent queries across an `asyncio.gather` actually run in parallel up to `max_size`. +### Cancellation caveat — don't wrap `aio` calls with `asyncio.wait_for` + +`asyncio.to_thread` does not interrupt the underlying worker thread when the awaitable is cancelled. If you wrap a query in `asyncio.wait_for(...)` and the timeout fires, the worker thread keeps running on the socket while the connection is being released back to the pool — and the pool's release path runs its own wire I/O (the Phase 26 transaction rollback). Two threads writing to one socket = wire desync = a poisoned connection in the pool. + +**Use connection-level timeouts instead:** + +```python +# Good — socket-level timeout, no to_thread race +conn = await aio.connect(..., read_timeout=30.0) + +# Bad — until Phase 27 lands the per-connection wire lock +await asyncio.wait_for(cur.execute(big_query), timeout=30.0) +``` + +`connect_timeout` and `read_timeout` apply at the socket layer; on a frozen server they raise a clean `OperationalError` and the cursor/connection state stays consistent. 
+ ## TLS ```python diff --git a/pyproject.toml b/pyproject.toml index ba85bd9..ca56819 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "informix-db" -version = "2026.05.04.10" +version = "2026.05.05" description = "Pure-Python driver for IBM Informix IDS — speaks the SQLI wire protocol over raw sockets. No CSDK, no JVM, no native libraries." readme = "README.md" license = { text = "MIT" } diff --git a/src/informix_db/aio.py b/src/informix_db/aio.py index 04b7871..bbfde19 100644 --- a/src/informix_db/aio.py +++ b/src/informix_db/aio.py @@ -15,6 +15,25 @@ event loop yields during each ``await``; a worker thread does the actual socket I/O. Only differs for thousands-of-concurrent-connections workloads, which need native-async (Phase 17 if anyone asks). +.. warning:: + + **Cancellation caveat.** ``asyncio.to_thread`` does not interrupt + the underlying OS thread when the awaitable is cancelled — the + thread keeps running until the sync call returns naturally. If you + wrap a query in ``asyncio.wait_for`` and the timeout fires: + + 1. The async call raises ``TimeoutError``. + 2. The worker thread is still mid-I/O on the socket. + 3. The connection's pool ``release()`` runs (under Phase 26 it + does its own wire I/O for the rollback). + 4. The two threads can interleave bytes on the socket → wire desync. + + Until Phase 27 lands a per-connection wire lock, **avoid wrapping + ``aio`` DB calls with ``asyncio.wait_for``**. Use the connection's + ``connect_timeout`` and ``read_timeout`` parameters instead — those + apply at the socket level (no to_thread race) and produce clean + ``OperationalError`` on timeout. 
+ Usage:: import asyncio diff --git a/src/informix_db/pool.py b/src/informix_db/pool.py index 701d43e..6133c9a 100644 --- a/src/informix_db/pool.py +++ b/src/informix_db/pool.py @@ -40,6 +40,7 @@ Design notes: from __future__ import annotations import contextlib +import logging import threading import time from collections.abc import Iterator @@ -51,6 +52,11 @@ from .exceptions import ( OperationalError, ) +# Module-level logger. By default Python loggers without a configured +# handler emit nothing — applications that want pool diagnostics can +# wire up ``logging.getLogger("informix_db.pool")`` to their handler. +_log = logging.getLogger(__name__) + class PoolClosedError(InterfaceError): """Pool was closed before/during acquire.""" @@ -173,9 +179,63 @@ class ConnectionPool: Pass ``broken=True`` to evict it (e.g., after a connection- related exception). Broken connections are closed and the slot is freed for a new connection. + + **Session reset (Phase 26)**: any uncommitted transaction is + rolled back before the connection rejoins the idle list. + Without this, request A's uncommitted writes would be visible + to (and could be inadvertently committed by) request B who + reuses the same connection. If the rollback itself fails + (dead socket, etc.), the connection is evicted instead of + recycled — half-state is never returned to the pool. Failures + are logged at WARNING level via ``logging.getLogger( + "informix_db.pool")``. + + **Concurrency caveat — async cancellation**: when an awaitable + wrapping a query is cancelled (``asyncio.wait_for`` timeout, + explicit task.cancel(), etc.), the underlying ``to_thread`` + worker that's executing the query is NOT interrupted. It + keeps running while the async pool's release runs concurrently + — and ``release()`` now does its own wire I/O for the rollback. + Two threads writing to one socket will interleave bytes and + desync the wire. 
Until Phase 27 lands a per-connection wire + lock, **don't put ``asyncio.wait_for`` around `informix_db.aio` + DB calls in production**. Use ``connect_timeout`` / + ``read_timeout`` on the connection itself instead — those run + at the socket level and don't have the to_thread race. """ + if broken or self._closed or conn.closed: + with self._lock: + self._total -= 1 + self._safe_close(conn) + self._lock.notify() + return + # Rollback any open transaction outside the pool lock — + # ``Connection.rollback`` does a wire round-trip and we don't + # want to block other pool operations during that. The + # connection isn't yet in ``_idle``, and ``_total`` already + # counts it as "owned by us", so no other thread can grab it + # while we're working. + if conn._in_transaction: + try: + conn.rollback() + except Exception as exc: + # Rollback failed — the wire is in an unknown state. + # Evict the connection rather than recycling unsafe + # state to the next acquirer. Log so the eviction is + # debuggable rather than silent. + _log.warning( + "rollback-on-release failed; evicting connection: %r", + exc, + ) + with self._lock: + self._total -= 1 + self._safe_close(conn) + self._lock.notify() + return with self._lock: - if broken or self._closed or conn.closed: + if self._closed: + # Pool was closed while we were rolling back. Don't + # add to ``_idle`` — close the connection instead. self._total -= 1 self._safe_close(conn) self._lock.notify() diff --git a/tests/test_pool.py b/tests/test_pool.py index 36d9c36..bcb917c 100644 --- a/tests/test_pool.py +++ b/tests/test_pool.py @@ -287,3 +287,144 @@ def test_pool_thread_safe_concurrent_acquires( assert pool.size <= 4 finally: pool.close() + + +# -------- Phase 26: pool rollback-on-release (CRITICAL data-correctness bug) -------- + + +def test_uncommitted_writes_invisible_to_next_acquirer( + logged_db_params: ConnParams, +) -> None: + """Critical regression test for the dirty-pool-checkout bug. 
+ + Pre-Phase-26 behavior: + Request A acquires → INSERTs (no commit) → releases. Server + transaction stays open. Request B acquires the SAME connection + (max_size=1 forces reuse) → its first SELECT sees A's + uncommitted row (because it's running inside A's transaction). + Worse: if B then commits, A's writes land permanently. If B + errors before commit, A's writes silently roll back. + + This is the same shape as psycopg2's pre-2.5 dirty-pool bug. + + Post-Phase-26: pool.release() rolls back any open transaction + before adding the connection to ``_idle``. A's uncommitted + writes are gone before B ever sees the connection. + """ + pool = informix_db.create_pool( + host=logged_db_params.host, + port=logged_db_params.port, + user=logged_db_params.user, + password=logged_db_params.password, + database=logged_db_params.database, + server=logged_db_params.server, + min_size=1, + max_size=1, # forces A and B to share the connection + ) + table = "p26_dirty_pool" + try: + # Setup: fresh table, autocommit so the CREATE lands + with pool.connection() as setup: + cur = setup.cursor() + with __import__("contextlib").suppress(Exception): + cur.execute(f"DROP TABLE {table}") + cur.execute(f"CREATE TABLE {table} (id INT, label VARCHAR(64))") + setup.commit() + + # Request A: insert without committing, then release + a_conn = pool.acquire() + try: + cur = a_conn.cursor() + cur.execute( + f"INSERT INTO {table} VALUES (?, ?)", (1, "A's dirty write") + ) + # Confirm A sees its own write inside its own transaction + cur.execute(f"SELECT COUNT(*) FROM {table}") + assert cur.fetchone() == (1,), "A should see its own write pre-release" + assert a_conn._in_transaction, "A's connection should be in_transaction" + finally: + pool.release(a_conn) # NO commit — this is the critical case + + # Request B: acquire the same connection (max_size=1 guarantees reuse). 
+ # Note: we don't assert on ``_in_transaction`` after acquire — the + # pool's ``_is_alive`` health probe runs SELECT 1 which opens a + # fresh transaction under autocommit=False. The data-correctness + # check (the COUNT below) is the actual ground truth: if Phase 26 + # didn't apply, A's uncommitted row would still be visible because + # B would be running INSIDE A's leftover transaction. + b_conn = pool.acquire() + try: + assert b_conn is a_conn, "max_size=1 must yield the same connection" + cur = b_conn.cursor() + cur.execute(f"SELECT COUNT(*) FROM {table}") + (count,) = cur.fetchone() + assert count == 0, ( + f"B sees {count} rows — A's uncommitted writes leaked across " + "the pool checkout boundary. Phase 26 fix did not apply." + ) + finally: + pool.release(b_conn) + + # Cleanup + with pool.connection() as cleanup: + cur = cleanup.cursor() + with __import__("contextlib").suppress(Exception): + cur.execute(f"DROP TABLE {table}") + cleanup.commit() + finally: + pool.close() + + +def test_committed_writes_survive_pool_checkout( + logged_db_params: ConnParams, +) -> None: + """Counterpart to the previous test: COMMITTED writes must persist. + + This guards against the obvious over-correction — if Phase 26's + rollback also somehow nukes already-committed work (e.g., via a + second BEGIN+ROLLBACK round-trip), the bug fix would itself be + a data-loss bug. This test fails if rollback runs when it + shouldn't. 
+ """ + pool = informix_db.create_pool( + host=logged_db_params.host, + port=logged_db_params.port, + user=logged_db_params.user, + password=logged_db_params.password, + database=logged_db_params.database, + server=logged_db_params.server, + min_size=1, + max_size=1, + ) + table = "p26_committed" + try: + with pool.connection() as setup: + cur = setup.cursor() + with __import__("contextlib").suppress(Exception): + cur.execute(f"DROP TABLE {table}") + cur.execute(f"CREATE TABLE {table} (id INT)") + setup.commit() + + # Request A: insert + commit + release + with pool.connection() as a_conn: + cur = a_conn.cursor() + cur.execute(f"INSERT INTO {table} VALUES (?)", (42,)) + a_conn.commit() + assert not a_conn._in_transaction + + # Request B: should see the committed row + with pool.connection() as b_conn: + cur = b_conn.cursor() + cur.execute(f"SELECT id FROM {table}") + assert cur.fetchone() == (42,), ( + "Committed write disappeared — Phase 26 rollback ran when " + "it shouldn't have." + ) + + with pool.connection() as cleanup: + cur = cleanup.cursor() + with __import__("contextlib").suppress(Exception): + cur.execute(f"DROP TABLE {table}") + cleanup.commit() + finally: + pool.close() diff --git a/uv.lock b/uv.lock index 00eded2..4888ac8 100644 --- a/uv.lock +++ b/uv.lock @@ -34,7 +34,7 @@ wheels = [ [[package]] name = "informix-db" -version = "2026.5.4.9" +version = "2026.5.5" source = { editable = "." } [package.optional-dependencies]