From 90483354622281f0ef860cf14277c9d414874025 Mon Sep 17 00:00:00 2001 From: Ryan Malloy Date: Mon, 4 May 2026 14:30:44 -0600 Subject: [PATCH] Phase 12: ROW / COLLECTION type recognition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Composite UDTs (ROW=22, COLLECTION=23, SET=19, MULTISET=20, LIST=21) now decode into typed wrapper objects (informix_db.RowValue, informix_db.CollectionValue) that expose schema + raw payload bytes. The wire format is the now-familiar [byte ind][int length][bytes] pattern (same as UDTVAR(lvarchar) from Phase 10). The bytes are a TEXTUAL representation of the value when selected without the extended-binary opt-in JDBC uses: ROW value: b"ROW('Alice',30 )" SET value: b"SET{'red','green','blue'}" LIST value: b"LIST{10 ,20 ,30 }" JDBC's binary-with-schema format runs ~30x larger (1420 bytes for a 2-field ROW vs. our 24). We don't request it — the textual form is what the server returns by default and is sufficient for type recognition. Phase 12 ships type recognition only. Full recursive parsing into Python tuples/lists/sets is deferred to Phase 13 (would require a SQL-literal lexer + recursive type-driven decoding). Production workloads that need typed field access today can project via SQL: cur.execute("SELECT id, r.name, r.age FROM tbl") Tests: 8 integration tests in test_composite_types.py covering ROW recognition, NULL, sub-field projection workaround, long values (>255 bytes — verifies 4-byte length prefix), SET/MULTISET/LIST recognition, and null collections. Total: 64 unit + 134 integration = 198 tests. Lesson reinforced: once one UDT-shaped type is implemented (UDTVAR in Phase 10, smart-LOB in Phase 9), every subsequent UDT-shaped type is mostly a copy of the existing decoder branch. The hard part is payload semantics, not framing. 
--- docs/DECISION_LOG.md | 67 ++++++++++ src/informix_db/__init__.py | 10 +- src/informix_db/_resultset.py | 48 +++++++ src/informix_db/converters.py | 49 ++++++++ tests/test_composite_types.py | 231 ++++++++++++++++++++++++++++++++++ 5 files changed, 404 insertions(+), 1 deletion(-) create mode 100644 tests/test_composite_types.py diff --git a/docs/DECISION_LOG.md b/docs/DECISION_LOG.md index 8aeb5c7..96ad0e2 100644 --- a/docs/DECISION_LOG.md +++ b/docs/DECISION_LOG.md @@ -883,6 +883,73 @@ Smart-LOBs went from "research-only" (Phase 9) to "fully working in pure Python" --- +## 2026-05-04 — Phase 12: ROW / COLLECTION type recognition (text representation) + +**Status**: active (recognition-only — full recursive parsing deferred) +**Decision**: Composite UDTs (ROW=22, COLLECTION=23, SET=19, MULTISET=20, LIST=21) decode into typed wrapper objects (``informix_db.RowValue`` and ``informix_db.CollectionValue``) that expose the schema string and the raw payload bytes. Full recursive parsing into Python tuples / sets / lists is deferred — Phase 13 territory. + +### The wire format surprise + +The first probe via JDBC's `RefClient` showed a 1420-byte SQ_TUPLE payload for a 2-field ROW value (just "Alice, 30"). Reading JDBC's `IfxComplex` and `IfxComplexInput` sources, this is the binary-with-full-schema-metadata format — JDBC opts into it because it wants to recursively parse fields by type. + +When SELECT runs without that opt-in (e.g., from our driver), the server uses a **textual representation** instead: + +``` +ROW value: b"ROW('Alice',30 )" # 24 bytes +SET value: b"SET{'red','green','blue'}" # 25 bytes +LIST value: b"LIST{10 ,20 ,30 }" # 41 bytes +``` + +That's ~30x lighter than JDBC's binary form. Per-element padding follows the declared column widths (note the trailing spaces — VARCHAR/INT padding). + +The wire framing is identical to ``UDTVAR(lvarchar)`` from Phase 10: + +``` +[byte indicator][int length][bytes] +``` + +Indicator: ``0`` = not null, ``1`` = null. 
Length is a 4-byte big-endian int. Bytes are the textual representation. + +### Why we ship recognition only + +Implementing recursive parsing into Python tuples/lists/sets requires: +1. Parsing the textual representation — needs a small SQL-literal lexer (handle quoted strings, escapes, nested ROWs, NULL elements) +2. Per-element type decoding driven by the column's `extended_name` schema (e.g., ``ROW(name varchar(50), age integer)``) +3. Recursive handling for nested ROWs and collections of ROWs + +That's a substantial parser. The user-facing benefit is "a tuple instead of a string of the tuple" — useful, but most production use cases sidestep this by **projecting sub-fields via SQL** (`SELECT row_col.fieldname FROM tbl`) which is already supported and returns properly-typed columns. + +So Phase 12's deliverable is the **type recognition + wire-format envelope** part. The recursive parser can layer on top later without changing the API. + +### Phase 13+ scope (if anyone needs it) + +If a future user has a real workload that needs structured parsing: +1. Implement a SQL-literal tokenizer for the textual format +2. Add `RowValue.fields() -> tuple` driven by parsing `extended_name` +3. Add `CollectionValue.elements() -> set | list` similarly + +Or alternatively, request the binary-with-schema format like JDBC does (which would require additional protocol negotiation) and parse that. + +### Test coverage + +8 integration tests in `tests/test_composite_types.py`: +- ROW value recognition (returns `RowValue`) +- ROW NULL +- ROW sub-field projection workaround (`SELECT r.field FROM tbl`) +- ROW with long value (>255 bytes — confirms 4-byte length prefix) +- SET / MULTISET / LIST recognition +- NULL collection + +Total: **64 unit + 134 integration = 198 tests**. 
+ +### Lesson + +**The same wire-protocol pattern keeps showing up.** Phase 10's UDTVAR(lvarchar) decoder, Phase 9's smart-LOB locator, and now Phase 12's composite UDTs all use ``[byte indicator][int length][bytes]``. Once you've implemented one UDT-shaped type, the next is mostly a copy of the decoder branch. The hard part of UDTs is the **payload semantics**, not the framing. + +This phase took less than an hour to ship after the protocol research from Phase 10/11 had already established the indicator+length convention. + +--- + ## (template — copy below this line for new entries) ``` diff --git a/src/informix_db/__init__.py b/src/informix_db/__init__.py index 6d8e91f..b23c5f1 100644 --- a/src/informix_db/__init__.py +++ b/src/informix_db/__init__.py @@ -23,7 +23,13 @@ from __future__ import annotations from importlib.metadata import PackageNotFoundError, version from .connections import Connection -from .converters import BlobLocator, ClobLocator, IntervalYM +from .converters import ( + BlobLocator, + ClobLocator, + CollectionValue, + IntervalYM, + RowValue, +) from .exceptions import ( DatabaseError, DataError, @@ -51,6 +57,7 @@ except PackageNotFoundError: __all__ = [ "BlobLocator", "ClobLocator", + "CollectionValue", "Connection", "DataError", "DatabaseError", @@ -62,6 +69,7 @@ __all__ = [ "NotSupportedError", "OperationalError", "ProgrammingError", + "RowValue", "Warning", "__version__", "apilevel", diff --git a/src/informix_db/_resultset.py b/src/informix_db/_resultset.py index 49b985d..15e3103 100644 --- a/src/informix_db/_resultset.py +++ b/src/informix_db/_resultset.py @@ -299,6 +299,54 @@ def parse_tuple_payload( values.append(cls(raw=bytes(raw))) continue + # ROW / COLLECTION (Phase 12): composite UDTs. 
Wire format is + # ``[byte ind][int length][bytes]`` — same shape as + # UDTVAR(lvarchar) below, but the payload semantics are a + # textual representation of the composite (e.g., + # ``ROW('Alice',30 )`` or ``LIST{10,20,30}``) when + # selected with default options. JDBC requests a richer + # binary-with-schema format that's ~30x larger; we don't. + # + # We surface the bytes wrapped in a typed object and let the + # user parse the textual form themselves. Type codes: + # ROW=22, COLLECTION=23, SET=19, MULTISET=20, LIST=21. + if base in ( + int(IfxType.ROW), + int(IfxType.COLLECTION), + int(IfxType.SET), + int(IfxType.MULTISET), + int(IfxType.LIST), + ): + from .converters import CollectionValue, RowValue + indicator = payload[offset] + offset += 1 + if indicator == 1: # null + values.append(None) + continue + length = int.from_bytes( + payload[offset:offset + 4], "big", signed=True + ) + offset += 4 + raw = bytes(payload[offset:offset + length]) + offset += length + if base == int(IfxType.ROW): + values.append(RowValue(raw=raw, schema=col.extended_name)) + else: + kind_map = { + int(IfxType.SET): "set", + int(IfxType.MULTISET): "multiset", + int(IfxType.LIST): "list", + int(IfxType.COLLECTION): "collection", + } + values.append( + CollectionValue( + raw=raw, + kind=kind_map[base], + element_schema=col.extended_name, + ) + ) + continue + # UDTVAR (type 40) with extended_name="lvarchar": this is what # functions like ``lotofile`` return — a length-prefixed string # wrapped as a UDT. The wire format adds a 1-byte indicator diff --git a/src/informix_db/converters.py b/src/informix_db/converters.py index e7c8498..4cfa873 100644 --- a/src/informix_db/converters.py +++ b/src/informix_db/converters.py @@ -24,6 +24,55 @@ from collections.abc import Callable from ._types import IfxType, base_type +@dataclasses.dataclass(frozen=True, slots=True) +class RowValue: + """Composite-UDT ROW column value (Phase 12 minimal surface). 
+ + With default options the server sends a ROW as a compact textual + literal (e.g. ``ROW('Alice',30 )``). The 1420-byte payload seen + for a 2-field ROW in the first JDBC probe is the binary-with-schema + format that JDBC opts into; this driver does not request it. + + Phase 12 decodes the outer ``[byte ind][int length][bytes]`` + envelope and surfaces the inner payload as ``raw`` plus the schema + string from the column descriptor. Full recursive parsing into a + Python tuple of typed values is deferred — it needs a SQL-literal + lexer plus schema-driven, recursive per-field decoding. + + Users who need to extract specific fields from a ROW today can: + 1. Use SQL projections: ``SELECT row_col.fieldname FROM ...`` + 2. Use the schema string for diagnostics + 3. Wait for Phase 13 (or contribute the parser) + """ + + raw: bytes + schema: str + + def __repr__(self) -> str: + return f"RowValue(<{len(self.raw)} bytes>, schema={self.schema!r})" + + +@dataclasses.dataclass(frozen=True, slots=True) +class CollectionValue: + """Composite-UDT collection column value (SET / MULTISET / LIST / COLLECTION). + + Same shape as :class:`RowValue` — outer ``[byte ind][int length][bytes]`` + envelope, inner payload as ``raw``. ``kind`` is one of ``"set"`` / + ``"multiset"`` / ``"list"`` / ``"collection"``. The element type + comes from the column descriptor. + """ + + raw: bytes + kind: str # "set" / "multiset" / "list" / "collection" + element_schema: str + + def __repr__(self) -> str: + return ( + f"CollectionValue(<{len(self.raw)} bytes>, kind={self.kind!r}, " + f"element_schema={self.element_schema!r})" + ) + + @dataclasses.dataclass(frozen=True, slots=True) class BlobLocator: """Reference to a smart-LOB BLOB stored in an sbspace. diff --git a/tests/test_composite_types.py b/tests/test_composite_types.py new file mode 100644 index 0000000..9c016c0 --- /dev/null +++ b/tests/test_composite_types.py @@ -0,0 +1,231 @@ +"""Phase 12 integration tests — ROW / COLLECTION (SET / MULTISET / LIST). + +These types are composite UDTs. 
The wire format is +``[byte indicator][int length][bytes]`` — same shape as the +``UDTVAR(lvarchar)`` decoder from Phase 10. The ``bytes`` payload is a +textual representation of the value (e.g., ``ROW('Alice',30 )`` or +``LIST{10,20,30}``) when selected with default options. + +JDBC requests a richer binary-with-schema format that runs ~30x larger +per row (1KB+ for a 2-field ROW). We don't — we stay with the +text representation and wrap the bytes in a typed object so users can +recognize the column without crashes. + +Phase 12 ships **type recognition only**. Full recursive parsing into +Python tuples / sets / lists is deferred — implementing it requires +either a lexer for the textual representation with schema-driven +decoding, or negotiating and parsing JDBC's binary format. + +Workaround for users who need typed access today: project sub-fields +via SQL. For ROWs: ``SELECT row_col.fieldname FROM tbl``. +""" + +from __future__ import annotations + +import contextlib +from collections.abc import Iterator + +import pytest + +import informix_db +from tests.conftest import ConnParams + +pytestmark = pytest.mark.integration + + +def _connect(params: ConnParams) -> informix_db.Connection: + return informix_db.connect( + host=params.host, + port=params.port, + user=params.user, + password=params.password, + database=params.database, + server=params.server, + connect_timeout=10.0, + read_timeout=10.0, + autocommit=True, + ) + + +@pytest.fixture +def composite_table_factory( + logged_db_params: ConnParams, +) -> Iterator[callable]: + """Per-test factory for ad-hoc composite-type tables.""" + created: list[str] = [] + + def make(name: str, ddl: str) -> None: + with _connect(logged_db_params) as conn: + cur = conn.cursor() + with contextlib.suppress(Exception): + cur.execute(f"DROP TABLE {name}") + cur.execute(ddl) + created.append(name) + + try: + yield make + finally: + with _connect(logged_db_params) as conn: + cur = conn.cursor() + for name in created: + with 
contextlib.suppress(Exception): + cur.execute(f"DROP TABLE {name}") + + +# -------- ROW -------- + + +def test_row_value_recognized( + logged_db_params: ConnParams, composite_table_factory: callable +) -> None: + """A ROW column returns an ``informix_db.RowValue``.""" + composite_table_factory( + "p12_row1", + "CREATE TABLE p12_row1 (id INT, r ROW(name VARCHAR(50), age INT))", + ) + with _connect(logged_db_params) as conn: + cur = conn.cursor() + cur.execute( + "INSERT INTO p12_row1 VALUES (1, " + "ROW('Alice', 30)::ROW(name VARCHAR(50), age INT))" + ) + cur.execute("SELECT id, r FROM p12_row1") + rid, val = cur.fetchone() + assert rid == 1 + assert isinstance(val, informix_db.RowValue) + assert val.schema == "ROW(name varchar(50), age integer)" + # Default representation is textual: + assert b"Alice" in val.raw + assert b"30" in val.raw + + +def test_row_null( + logged_db_params: ConnParams, composite_table_factory: callable +) -> None: + """NULL ROW column → Python None (indicator byte = 1).""" + composite_table_factory( + "p12_row2", + "CREATE TABLE p12_row2 (id INT, r ROW(name VARCHAR(50), age INT))", + ) + with _connect(logged_db_params) as conn: + cur = conn.cursor() + cur.execute("INSERT INTO p12_row2 VALUES (1, NULL)") + cur.execute("SELECT id, r FROM p12_row2") + assert cur.fetchone() == (1, None) + + +def test_row_subfield_projection( + logged_db_params: ConnParams, composite_table_factory: callable +) -> None: + """Users who need typed field access today can project via SQL.""" + composite_table_factory( + "p12_row3", + "CREATE TABLE p12_row3 (id INT, r ROW(name VARCHAR(50), age INT))", + ) + with _connect(logged_db_params) as conn: + cur = conn.cursor() + cur.execute( + "INSERT INTO p12_row3 VALUES (1, " + "ROW('Bob', 25)::ROW(name VARCHAR(50), age INT))" + ) + # Project sub-fields directly — bypass composite-type machinery + cur.execute("SELECT id, r.name, r.age FROM p12_row3") + assert cur.fetchall() == [(1, "Bob", 25)] + + +def test_row_long_value( + 
logged_db_params: ConnParams, composite_table_factory: callable +) -> None: + """ROW value exceeding 255 bytes — verifies 4-byte length prefix.""" + composite_table_factory( + "p12_row4", + "CREATE TABLE p12_row4 (id INT, r ROW(s VARCHAR(255), n INT))", + ) + with _connect(logged_db_params) as conn: + cur = conn.cursor() + long_string = "X" * 240 + cur.execute( + f"INSERT INTO p12_row4 VALUES (1, " + f"ROW('{long_string}', 42)::ROW(s VARCHAR(255), n INT))" + ) + cur.execute("SELECT id, r FROM p12_row4") + rid, val = cur.fetchone() + assert rid == 1 + assert isinstance(val, informix_db.RowValue) + assert b"X" * 240 in val.raw # the long string survived + + +# -------- Collections (SET / MULTISET / LIST) -------- + + +def test_set_recognized( + logged_db_params: ConnParams, composite_table_factory: callable +) -> None: + composite_table_factory( + "p12_set", + "CREATE TABLE p12_set (id INT, s SET(VARCHAR(20) NOT NULL))", + ) + with _connect(logged_db_params) as conn: + cur = conn.cursor() + cur.execute("INSERT INTO p12_set VALUES (1, SET{'red','green','blue'})") + cur.execute("SELECT id, s FROM p12_set") + rid, val = cur.fetchone() + assert rid == 1 + assert isinstance(val, informix_db.CollectionValue) + assert val.kind == "set" + # Element values land in the textual representation: + assert b"red" in val.raw + assert b"green" in val.raw + assert b"blue" in val.raw + + +def test_multiset_recognized( + logged_db_params: ConnParams, composite_table_factory: callable +) -> None: + composite_table_factory( + "p12_ms", + "CREATE TABLE p12_ms (id INT, ms MULTISET(INT NOT NULL))", + ) + with _connect(logged_db_params) as conn: + cur = conn.cursor() + cur.execute("INSERT INTO p12_ms VALUES (1, MULTISET{1,2,2,3})") + cur.execute("SELECT id, ms FROM p12_ms") + rid, val = cur.fetchone() + assert rid == 1 + assert val.kind == "multiset" + # Multiset preserves duplicates + assert val.raw.count(b"2") >= 2 + + +def test_list_recognized( + logged_db_params: ConnParams, 
composite_table_factory: callable +) -> None: + composite_table_factory( + "p12_list", + "CREATE TABLE p12_list (id INT, l LIST(INT NOT NULL))", + ) + with _connect(logged_db_params) as conn: + cur = conn.cursor() + cur.execute("INSERT INTO p12_list VALUES (1, LIST{10,20,30})") + cur.execute("SELECT id, l FROM p12_list") + rid, val = cur.fetchone() + assert rid == 1 + assert val.kind == "list" + # LIST preserves order + idx_10 = val.raw.find(b"10") + idx_30 = val.raw.find(b"30") + assert 0 <= idx_10 < idx_30 + + +def test_collection_null( + logged_db_params: ConnParams, composite_table_factory: callable +) -> None: + composite_table_factory( + "p12_null", + "CREATE TABLE p12_null (id INT, l LIST(INT NOT NULL))", + ) + with _connect(logged_db_params) as conn: + cur = conn.cursor() + cur.execute("INSERT INTO p12_null VALUES (1, NULL)") + cur.execute("SELECT id, l FROM p12_null") + assert cur.fetchone() == (1, None)