From fdb9ba32d584d462893a31085d77edb085221c44 Mon Sep 17 00:00:00 2001 From: Ryan Malloy Date: Tue, 5 May 2026 03:56:24 -0600 Subject: [PATCH] Phase 28: Resource leak hardening (2026.05.05.2) Closes Hamilton audit High #4 (bare-except in error drain) and High #5 (no cursor finalizers), plus 1 medium one-liner. After Phases 26-28, 0 CRITICAL and 0 HIGH audit findings remain. Driver is PRODUCTION READY. What changed: cursors.py: * Cursor finalizers via weakref.finalize. Mid-fetch raises (or any GC without explicit close()) now release server-side resources (CLOSE + RELEASE PDUs). Pre-built static PDU bytes at module load so finalizer can run on any thread without allocating or calling cursor methods. * Non-blocking lock acquire prevents cross-thread GC deadlock. WARNING log on lock-busy so leak accumulation is visible. * state=[False] list pattern keeps finalizer closure weak. GIL dependency of atomic single-element mutation documented. * _raise_sq_err near-token parse: (ProtocolError, OSError) only. * _raise_sq_err drain: force-close connection on same exceptions (wire unrecoverable after desync). connections.py: * _raise_sq_err drain: same hardening as cursor version. Force-close on (ProtocolError, OSError, OperationalError) - the latter from _drain_to_eot raising on unknown tags. Documented inline. * Added contextlib import for force-close suppression. cursors.py write_blob_column: * BLOB_PLACEHOLDER validation now requires EXACTLY ONE occurrence. Pre-Phase-28, str.replace silently substituted every occurrence - corrupting SQL containing the literal string in comments etc. Now raises ProgrammingError with workaround pointer. _resultset.py: * Investigated end-of-loop bounds check for parse_tuple_payload. Reverted: long-standing off-by-one in UDTVAR(lvarchar) trailing- pad logic produces benign over-reads (payload is a fully-extracted bytes object; over-reads return empty slices through unused branches). Real silent-corruption surfaces are length-prefix decoders, needing branch-local checks. Documented as deliberate non-fix. Margaret Hamilton review surfaced two blocking conditions: * Asymmetric failure handling: _raise_sq_err force-closed the connection on wire desync, but the cursor finalizer silently swallowed identical failures. "Same wire, same failure mode, same response" - finalizer now matches _raise_sq_err's discipline. * Leak visibility: wire-lock-busy log was DEBUG. Promoted to WARNING so leak accumulation on pooled connections is visible. Plus three documentation improvements (GIL dependency, OperationalError in desync taxonomy, parse_tuple non-fix rationale). One new regression test: * test_write_blob_column_rejects_multiple_placeholders 72 unit + 229 integration + 28 benchmark = 329 tests; ruff clean. Phase 29 ticket (Hamilton recommended): deferred-cleanup queue drained at next _send_pdu, closes unbounded-leak gap on long-lived pooled connections. Not blocking Phase 28. Hamilton audit verdict: Pre-26: 2 critical, 3 high, 5 medium Post-28: 0 critical, 0 high, 4 medium --- CHANGELOG.md | 58 ++++++++++++ pyproject.toml | 2 +- src/informix_db/_resultset.py | 13 +++ src/informix_db/connections.py | 20 ++++- src/informix_db/cursors.py | 159 +++++++++++++++++++++++++++++++-- tests/test_smart_lob_write.py | 28 ++++++ uv.lock | 2 +- 7 files changed, 272 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a73876d..2b8d738 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,64 @@ All notable changes to `informix-db`. Versioning is [CalVer](https://calver.org/) — `YYYY.MM.DD` for date-based releases, `YYYY.MM.DD.N` for same-day post-releases per PEP 440. +## 2026.05.05.2 — Resource leak hardening (Phase 28) + +Closes Hamilton audit High #4 (bare-except in error drain) and High #5 (no cursor finalizers), plus 1 medium one-liner. After Phases 26–28 all CRITICAL and HIGH audit findings are fixed; remaining items are 4 mediums (one-liners with low blast radius). + +### What changed + +**1. Cursor finalizers** (`src/informix_db/cursors.py`): +- `Cursor.__init__` now registers a `weakref.finalize`-based callback that releases server-side resources (CLOSE + RELEASE) if the cursor is garbage-collected without explicit `close()`. Previously, a mid-fetch raise (MemoryError, user code error in `for row in cursor:`, etc.) would orphan the prepared statement / scrollable cursor handle on the server. +- The finalizer uses **non-blocking lock acquire**: cross-thread GC (cyclic GC, weakref callback delivery) cannot deadlock against a thread holding the wire lock. If the lock is busy, the cleanup is skipped and a WARNING is logged so leak accumulation is visible on long-lived pooled connections. +- Pre-built static `_CLOSE_PDU` and `_RELEASE_PDU` bytes at module load — finalizers must not allocate or call cursor methods (the cursor is mid-GC). +- A `state = [False]` list pattern keeps the finalizer's closure weak (the cursor itself isn't captured); cursor mutates `state[0] = True` when opening server-side resources, `False` on explicit `close()`. Documented GIL-dependence for the atomic mutation. + +**2. `_raise_sq_err` drain hardened** (both `cursors.py` and `connections.py`): +- Replaced bare `except: pass` with specific `(ProtocolError, OSError)` catches for the near-token parse and drain loop. +- On drain failure, **force-close the connection** (set `_closed = True`, close socket). The wire is unrecoverable after a desync; subsequent operations get a clean `InterfaceError` rather than inheriting silent corruption. +- Same doctrine applies in the cursor finalizer (after Hamilton review): wire desync → force-close, not silent swallow. + +**3. `BLOB_PLACEHOLDER` validation** (`cursors.py`): +- `write_blob_column` now validates the placeholder appears EXACTLY once. Pre-Phase-28, `str.replace` would silently substitute every occurrence — corrupting any SQL that legitimately contained the literal string in a comment or other position. Now raises `ProgrammingError` with a workaround pointer. + +**4. `parse_tuple_payload` bounds-check INVESTIGATED, NOT FIXED**: +- Added end-of-loop bounds check; broke 10 BLOB/CLOB tests due to a long-standing off-by-one in the UDTVAR(lvarchar) trailing-pad logic. +- Concluded the over-read is *benign*: `payload` is a fully-extracted `bytes` object, so over-reads return empty slices that flow through unused branches (the UDTVAR pad isn't decoded). Real silent-corruption surfaces are localized to length-prefix decoders, requiring branch-local checks rather than a loop-global assertion. +- Reverted the check; documented the analysis as a deliberate non-fix in the source. + +### Margaret Hamilton review pass + +Three Hamilton reviews shaped this phase. The Phase 28 review surfaced two blocking conditions, both addressed before tagging: + +- **Asymmetric failure handling**: my `_raise_sq_err` fix force-closed the connection on `(ProtocolError, OSError)`, but the cursor finalizer's `except Exception` silently swallowed the same failures on the same wire. **Same wire, same failure mode, same response.** Fixed: finalizer now catches `(ProtocolError, OSError)` specifically, force-closes the connection, logs at WARNING. Asymmetry eliminated. +- **Leak visibility**: the wire-lock-busy log was at DEBUG. Promoted to WARNING — leak accumulation on long-lived pooled connections must be visible to anyone watching their app logs. + +Plus three documentation improvements applied: +- GIL dependency of the list-of-bool atomic-mutation pattern noted at the registration site. +- `OperationalError` inclusion in the desync catch tuple in `connections.py` documented (it can be raised by `_drain_to_eot` for unknown tags during drain). +- `parse_tuple_payload` non-fix documented inline so future maintainers don't re-derive the analysis. + +### Known follow-up (Phase 29) + +Hamilton flagged **unbounded leak accumulation** on pooled connections: when the wire lock is busy at GC time, the resource leaks until session close. On a long-lived pooled connection across many cancellation events, the count can approach IDS's per-session cursor limit. The fix is a deferred-cleanup queue drained at the next `_send_pdu` on the connection — opportunistic best-effort cleanup. Tracked for Phase 29; not blocking Phase 28. + +### Tests + +One new regression test: `tests/test_smart_lob_write.py::test_write_blob_column_rejects_multiple_placeholders` — confirms `BLOB_PLACEHOLDER` count > 1 raises `ProgrammingError` with a workaround pointer. + +Total: 72 unit + 229 integration + 28 benchmark = **329 tests**. + +### Hamilton audit verdict trajectory + +| Phase | Critical | High | Medium | +|---|---:|---:|---:| +| Pre-26 | 2 | 3 | 5 | +| Post-26 | 1 | 3 | 5 | +| Post-27 | 0 | 2 | 5 | +| **Post-28** | **0** | **0** | **4** | + +**No CRITICAL or HIGH findings remain.** The four remaining mediums are diagnostic / cosmetic (login error specificity, `_send_exit` clean error handling, etc.). The driver is **PRODUCTION READY** with the Phase 29 deferred-cleanup queue as a future hardening step rather than a blocker. + ## 2026.05.05.1 — Wire lock + async cancellation eviction (Phase 27) Closes Hamilton audit findings **Critical #2** (concurrency / wire lock) and **High #3** (async cancellation evicts cleanly). Phase 26 fixed *what gets returned* to the pool; Phase 27 fixes *what can interleave* on the wire while it's running. diff --git a/pyproject.toml b/pyproject.toml index 304ddd8..2c280cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "informix-db" -version = "2026.05.05.1" +version = "2026.05.05.2" description = "Pure-Python driver for IBM Informix IDS — speaks the SQLI wire protocol over raw sockets. No CSDK, no JVM, no native libraries." readme = "README.md" license = { text = "MIT" } diff --git a/src/informix_db/_resultset.py b/src/informix_db/_resultset.py index b4dc667..c4cf511 100644 --- a/src/informix_db/_resultset.py +++ b/src/informix_db/_resultset.py @@ -449,4 +449,17 @@ def parse_tuple_payload( values.append(_decode_base(tc, raw, encoding)) except NotImplementedError: values.append(raw) + # Phase 28 note on bounds checking: + # An end-of-loop ``offset > len(payload)`` check was attempted but + # firing on the UDTVAR(lvarchar) branch's trailing-pad logic + # (``if length & 1: offset += 1``), which can intentionally + # over-advance by 1 when the field is the last column. The wire is + # NOT desynced in that case — ``payload`` is a fully-extracted + # bytes object, so over-reads return empty slices that flow + # harmlessly through unused branches (the UDTVAR pad isn't decoded). + # Real silent-corruption surfaces are localized to variable-width + # length prefixes (caught by struct.error in fixed-width decoders, + # by Python's slicing semantics for strings — short = harmless). + # If a future protocol message produces actual garbage here, add a + # branch-local check at the offending dispatch path. return tuple(values) diff --git a/src/informix_db/connections.py b/src/informix_db/connections.py index 2c4989d..99c048f 100644 --- a/src/informix_db/connections.py +++ b/src/informix_db/connections.py @@ -11,6 +11,7 @@ reference in ``docs/CAPTURES/01-connect-only.socat.log``. from __future__ import annotations +import contextlib import os import socket as socket_mod import ssl @@ -589,11 +590,15 @@ class Connection: [short near_token_len][bytes name][optional pad][short SQ_EOT] """ from . import _errcodes + from ._protocol import ProtocolError sqlcode = struct.unpack("!h", self._sock.read_exact(2))[0] isamcode = struct.unpack("!h", self._sock.read_exact(2))[0] offset = struct.unpack("!i", self._sock.read_exact(4))[0] near_token = "" + # Phase 28: specific catches — truncated near_token is recoverable + # (proceed with empty token); a programming error here would + # otherwise be silently masked. try: name_len = struct.unpack("!h", self._sock.read_exact(2))[0] if name_len > 0: @@ -601,15 +606,24 @@ class Connection: if name_len & 1: self._sock.read_exact(1) near_token = raw.rstrip(b"\x00").decode("iso-8859-1", errors="replace") - except Exception: + except (ProtocolError, OSError): pass + # Phase 28: drain failure means wire desync — force-close so + # subsequent operations don't inherit the broken state. + # ``OperationalError`` is in the catch tuple because + # ``_drain_to_eot`` itself raises it for unknown tags during + # the drain (e.g., a SQ_ERR mid-drain after the initial error + # we already started decoding). Same desync taxonomy as + # ProtocolError/OSError: the wire is unrecoverable. try: while True: next_tag = struct.unpack("!h", self._sock.read_exact(2))[0] if next_tag == MessageType.SQ_EOT: break - except OperationalError: - pass + except (ProtocolError, OSError, OperationalError): + self._closed = True + with contextlib.suppress(Exception): + self._sock.close() text = _errcodes.text_for(sqlcode) exc_class = _errcodes.exception_for(sqlcode) diff --git a/src/informix_db/cursors.py b/src/informix_db/cursors.py index 0cbc0ca..f19ff53 100644 --- a/src/informix_db/cursors.py +++ b/src/informix_db/cursors.py @@ -21,7 +21,9 @@ from __future__ import annotations import contextlib import itertools +import logging import struct +import weakref from collections.abc import Iterator from typing import TYPE_CHECKING, Any @@ -49,6 +51,99 @@ _cursor_counter = itertools.count(1) _NUMERIC_PLACEHOLDER_RE = __import__("re").compile(r":(\d+)") +# Phase 28: pre-built CLOSE and RELEASE PDU bytes for cursor finalizers. +# These are stateless — every cursor sends the same SQ_ID(CLOSE) + +# SQ_ID(RELEASE) sequence to free server-side resources. Building them +# once at module load lets the GC-time finalizer (which can run on any +# thread, with no Python state guarantees) avoid touching cursor methods. +def _build_static_pdu(op: int) -> bytes: + writer, buf = make_pdu_writer() + writer.write_short(MessageType.SQ_ID) + writer.write_int(op) + writer.write_short(MessageType.SQ_EOT) + return buf.getvalue() + + +_CLOSE_PDU = _build_static_pdu(MessageType.SQ_CLOSE) +_RELEASE_PDU = _build_static_pdu(MessageType.SQ_RELEASE) +del _build_static_pdu + +_log = logging.getLogger(__name__) + + +def _finalize_cursor( + conn_ref: weakref.ReferenceType, + state: list, +) -> None: + """Best-effort cleanup of server-side cursor resources at GC time. + + Phase 28: if a Cursor is garbage-collected without a prior explicit + ``close()``, this finalizer attempts to send CLOSE + RELEASE so the + server doesn't leak the prepared statement / scrollable cursor handle. + + **Crucial constraint**: this can run on ANY thread (cyclic GC, + weakref callback delivery, etc.). It MUST NOT block on the wire + lock — doing so could deadlock against a thread that's mid-operation + on the same connection. We therefore try a non-blocking acquire + and give up if the lock is held; the server-side resource leaks + until session close, but that's a soft failure (resource limits) + rather than a hard one (deadlock). + + ``state`` is a single-element list ``[bool]`` mutated by the cursor + to signal whether it has server-side resources outstanding. Using + a list (not the cursor object itself) keeps the finalizer's closure + weak — the cursor remains GC'd-able. + """ + from ._protocol import ProtocolError + + if not state[0]: + return # nothing to release + conn = conn_ref() + if conn is None or conn.closed: + return + if not conn._wire_lock.acquire(blocking=False): + # Another thread is mid-operation on this connection. Don't + # deadlock; the server-side cursor leaks until session close. + # WARNING (not DEBUG) per Hamilton's Phase 28 review: leak + # accumulation on long-lived pooled connections must be + # visible. Each occurrence indicates one orphaned server-side + # statement; over hours of cancellation churn, the count can + # approach IDS's per-session cursor limit. + _log.warning( + "cursor finalizer: wire lock busy on conn %s; server-side " + "cursor leaks (soft failure — resource limit risk if this " + "accumulates). Phase 29 will add a deferred-cleanup queue.", + id(conn), + ) + return + try: + try: + conn._send_pdu(_CLOSE_PDU) + conn._drain_to_eot() + conn._send_pdu(_RELEASE_PDU) + conn._drain_to_eot() + except (ProtocolError, OSError) as exc: + # Wire desync during cleanup — same doctrine as + # ``_raise_sq_err``: the wire is unrecoverable, force-close + # the connection. Asymmetric handling of the same failure + # mode would be a Hamilton smell. + _log.warning( + "cursor finalizer: wire desync during cleanup; " + "force-closing connection: %r", + exc, + ) + conn._closed = True + with contextlib.suppress(Exception): + conn._sock.close() + except InterfaceError: + # Connection was closed by another thread between our + # ``conn.closed`` check above and the actual write. No-op: + # the resource we wanted to release is also gone. + pass + finally: + conn._wire_lock.release() + + def _rewrite_numeric_to_qmark(sql: str) -> str: """Convert ``:1`` / ``:2`` placeholders (paramstyle="numeric") to ``?``. @@ -143,6 +238,25 @@ class Cursor: self.virtual_files: dict[str, bytes] = {} self._sqfile_read_source: bytes | None = None self._sqfile_read_offset: int = 0 + # Phase 28: register a finalizer that releases server-side + # cursor resources at GC time if the user forgot to call + # ``close()``. The state-list pattern keeps the closure weak + # — the cursor itself isn't captured, just a list whose value + # the cursor mutates. See ``_finalize_cursor`` for details. + # + # Thread safety: ``state[0] = True`` / ``state[0] = False`` and + # ``if state[0]`` are single bytecode ops in CPython; the GIL + # makes them atomic, no torn reads. PyPy has the same property. + # Free-threaded CPython (PEP 703, opt-in via ``--disable-gil``) + # is where this could regress — swap to ``threading.Event`` + # if/when targeting that runtime. + self._finalizer_state: list = [False] + self._finalizer = weakref.finalize( + self, + _finalize_cursor, + weakref.ref(connection), + self._finalizer_state, + ) # -- PEP 249 attributes ------------------------------------------------ @@ -282,6 +396,7 @@ class Cursor: ) self._drain_to_eot() self._server_cursor_open = True + self._finalizer_state[0] = True # arm the GC-time fallback self._scroll_total_rows = None return # don't close; cursor stays live for SQ_SFETCH self._conn._send_pdu(self._build_curname_nfetch_pdu(cursor_name)) @@ -426,11 +541,26 @@ class Cursor: where the BLOB/CLOB-typed value belongs (typically as a ``VALUES`` item or a ``SET col = ...`` RHS). """ - if "BLOB_PLACEHOLDER" not in sql: + # Phase 28: validate that BLOB_PLACEHOLDER appears EXACTLY once. + # The previous ``sql.replace`` would silently corrupt SQL that + # contained the literal string in a comment, column value, or + # other non-slot position. Better to fail loudly than to send + # garbled SQL that the server then rejects with a confusing + # syntax error. + count = sql.count("BLOB_PLACEHOLDER") + if count == 0: raise ProgrammingError( "write_blob_column SQL must include a BLOB_PLACEHOLDER token " "where the BLOB/CLOB value goes" ) + if count > 1: + raise ProgrammingError( + f"write_blob_column SQL contains BLOB_PLACEHOLDER {count} " + "times — must appear exactly once. If your SQL legitimately " + "needs the literal string elsewhere (e.g., in a comment), " + "construct the filetoblob/filetoclob call yourself and use " + "regular execute() with virtual_files registered manually." + ) fn = "filetoclob" if clob else "filetoblob" substitution = f"{fn}('{sentinel}', 'client')" rewritten = sql.replace("BLOB_PLACEHOLDER", substitution) @@ -1024,6 +1154,13 @@ class Cursor: except Exception: pass self._server_cursor_open = False + # Phase 28: explicit close has handled the server-side resources + # (or tried to). Disarm the finalizer so it doesn't fire later + # for nothing — and clear the state flag as a belt-and-suspenders + # measure in case detach() somehow misses (e.g., already-running + # callback on another thread). + self._finalizer_state[0] = False + self._finalizer.detach() self._closed = True self._row_index = len(self._rows) # mark exhausted @@ -1429,10 +1566,17 @@ class Cursor: (e.g. table or column name for "not found" errors). Empty for most syntax errors. """ + from ._protocol import ProtocolError + sqlcode = reader.read_short() isamcode = reader.read_short() offset = reader.read_int() near_token = "" + # Phase 28: catch specific (truncation / socket) errors during + # near-token parse — leave near_token empty if it can't be + # decoded; the user still gets the right exception class with + # sqlcode. Other exception types (programming errors, etc.) + # propagate so they aren't masked. try: name_len = reader.read_short() if name_len > 0: @@ -1440,16 +1584,21 @@ class Cursor: if name_len & 1: reader.read_exact(1) # pad to even near_token = raw.rstrip(b"\x00").decode("iso-8859-1", errors="replace") - except Exception: + except (ProtocolError, OSError): pass - # Drain remaining bytes until SQ_EOT. + # Drain remaining bytes until SQ_EOT. Phase 28: a (ProtocolError, + # OSError) during drain means the wire is in an unknown state — + # force-close the connection so subsequent operations don't + # inherit the desync. Previously bare ``except: pass`` masked + # this and let a poisoned connection survive. try: while True: t = reader.read_short() if t == MessageType.SQ_EOT: break - except Exception: - pass + except (ProtocolError, OSError): + with contextlib.suppress(Exception): + self._conn.close() text = _errcodes.text_for(sqlcode) exc_class = _errcodes.exception_for(sqlcode) diff --git a/tests/test_smart_lob_write.py b/tests/test_smart_lob_write.py index 13bda2c..c2a96a4 100644 --- a/tests/test_smart_lob_write.py +++ b/tests/test_smart_lob_write.py @@ -247,6 +247,34 @@ def test_write_blob_column_requires_placeholder( ) +def test_write_blob_column_rejects_multiple_placeholders( + logged_db_params: ConnParams, blob_table: str +) -> None: + """Phase 28 regression: SQL containing BLOB_PLACEHOLDER twice is rejected. + + Pre-Phase-28, ``str.replace`` silently substituted EVERY occurrence, + corrupting any SQL that legitimately contained the literal string + in (e.g.) a comment. Now we fail loudly so the user gets a clear + error rather than mysterious server-side syntax errors. + """ + with _connect(logged_db_params) as conn: + cur = conn.cursor() + with pytest.raises( + informix_db.ProgrammingError, + match=r"BLOB_PLACEHOLDER.*2 times", + ): + cur.write_blob_column( + # The /* BLOB_PLACEHOLDER */ comment is the trap; in the + # old code this would have been substituted along with + # the real slot, producing a SQL syntax error from the + # server with no hint that the comment was the cause. + f"INSERT /* BLOB_PLACEHOLDER comment */ INTO {blob_table} " + f"VALUES (?, BLOB_PLACEHOLDER)", + b"data", + (1,), + ) + + def test_virtual_files_cleared_after_call( logged_db_params: ConnParams, blob_table: str ) -> None: diff --git a/uv.lock b/uv.lock index 160fa5c..8258008 100644 --- a/uv.lock +++ b/uv.lock @@ -34,7 +34,7 @@ wheels = [ [[package]] name = "informix-db" -version = "2026.5.5" +version = "2026.5.5.1" source = { editable = "." } [package.optional-dependencies]