Per-row decode is hit on every row of every SELECT. The original code had three forms of waste in the inner loop:

1. Redundant `base_type()` call. `ColumnInfo.type_code` is already base-typed by `parse_describe` at construction; calling `base_type()` again per column per row was pure waste. Single largest savings.
2. IntFlag->int conversions inline (~10x per iteration). Lifted to module-level `_TC_X` constants.
3. Lazy imports inside the loop body (`_decode_datetime`, `_decode_interval`, `BlobLocator`, `ClobLocator`, `RowValue`, `CollectionValue`). Moved to the top of the module.

Plus three precomputed frozensets (`_LENGTH_PREFIXED_SHORT_TYPES`, `_COMPOSITE_UDT_TYPES`, `_NUMERIC_TYPES`) replace inline tuple-membership checks, and `_COLLECTION_KIND_MAP` is now a `MappingProxyType` (actually frozen).

Performance:

* parse_tuple_5cols: 2796 ns -> 2030 ns (-27%)
* select_bench_table_all (1k rows): 1477 us -> 1198 us (-19%)
* Codec micro-bench, cold connect, executemany: unchanged

Real-world fetch ceiling on a single connection: 350K rows/sec -> 490K rows/sec.

Margaret Hamilton review surfaced four cleanup items, all addressed before tagging:

* H1: `cursor._dereference_blob_columns` had the same redundant `base_type()` call - stripped for consistency.
* M1: documented the load-bearing invariant at `parse_describe` (the single producer site) so future contributors have a grep target.
* M2: `_COLLECTION_KIND_MAP` wrapped in `MappingProxyType`.
* L1: stale line-number comment fixed to point at the INVARIANT comment instead.

baseline.json refreshed; all 224 integration tests pass; ruff clean.
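For reference, a minimal sketch of the hoisting pattern (stand-in enum and names; the real constants live in the module below):

```python
from enum import IntFlag

class Demo(IntFlag):  # stand-in for the driver's IfxType
    VARCHAR = 13

_TC_VARCHAR = int(Demo.VARCHAR)  # IntFlag -> int paid once, at import time

def count_varchars(type_codes: list[int]) -> int:
    n = 0
    for tc in type_codes:
        if tc == _TC_VARCHAR:  # plain int == int in the hot loop
            n += 1
    return n
```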
"""SQ_DESCRIBE column descriptor parser and SQ_TUPLE row decoder.
|
|
|
|
Per IfxSqli.receiveDescribe (line 2175+) for ``isUSVER`` modern servers.
|
|
The per-field block layout is:
|
|
|
|
fieldIndex (int 4)
|
|
columnStartPos (int 4 — USVER)
|
|
columnType (short 2 — base IDS type code with high-bit flags)
|
|
columnExtendedId (int 4 — USVER, for UDT/extended types)
|
|
ownerName (readChar = [short len][bytes][pad if odd])
|
|
extendedName (readChar)
|
|
reference (short 2)
|
|
alignment (short 2)
|
|
sourceType (int 4)
|
|
encodedLength (int 4)
|
|
|
|
After all fields: the string table (a length-prefixed block of nul-separated
|
|
column names), read via readPadded.
|
|
"""

from __future__ import annotations

from dataclasses import dataclass
from types import MappingProxyType

from ._protocol import IfxStreamReader
from ._types import IfxType, base_type, is_nullable
from .converters import (
    FIXED_WIDTHS,
    BlobLocator,
    ClobLocator,
    CollectionValue,
    RowValue,
    _decode_datetime,
    _decode_interval,
    decode,
)

# Module-level type-code constants — lifted out of the hot loop in
# parse_tuple_payload so we don't pay the IntFlag→int conversion per
# column per row.
_TC_CHAR = int(IfxType.CHAR)
_TC_VARCHAR = int(IfxType.VARCHAR)
_TC_NCHAR = int(IfxType.NCHAR)
_TC_NVCHAR = int(IfxType.NVCHAR)
_TC_LVARCHAR = int(IfxType.LVARCHAR)
_TC_DECIMAL = int(IfxType.DECIMAL)
_TC_MONEY = int(IfxType.MONEY)
_TC_DATETIME = int(IfxType.DATETIME)
_TC_INTERVAL = int(IfxType.INTERVAL)
_TC_UDTFIXED = int(IfxType.UDTFIXED)
_TC_UDTVAR = int(IfxType.UDTVAR)
_TC_ROW = int(IfxType.ROW)
_TC_COLLECTION = int(IfxType.COLLECTION)
_TC_SET = int(IfxType.SET)
_TC_MULTISET = int(IfxType.MULTISET)
_TC_LIST = int(IfxType.LIST)

_COLLECTION_KIND_MAP = MappingProxyType({
    _TC_SET: "set",
    _TC_MULTISET: "multiset",
    _TC_LIST: "list",
    _TC_COLLECTION: "collection",
})
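# Note: MappingProxyType is a read-only view; item assignment on it raises
# TypeError, so the kind map cannot be mutated by accident.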


@dataclass
class ColumnInfo:
    """One column in a SQ_DESCRIBE response."""

    name: str
    type_code: int  # base IDS type code (high-bit flags stripped)
    raw_type_code: int  # raw type-code short with flags intact
    encoded_length: int
    column_start_pos: int = 0
    extended_id: int = 0
    owner_name: str = ""
    extended_name: str = ""

    @property
    def null_ok(self) -> bool:
        return is_nullable(self.raw_type_code)

    def to_description_tuple(self) -> tuple:
        """The PEP 249 cursor.description 7-tuple."""
        return (
            self.name,
            self.type_code,
            self.encoded_length,  # display_size
            self.encoded_length,  # internal_size
            0,  # precision (Phase 6+ derives from type)
            0,  # scale
            self.null_ok,
        )


def _read_char(reader: IfxStreamReader, encoding: str = "iso-8859-1") -> str:
    """Read JDBC's ``readChar`` format: [short len][bytes][pad if odd-len]."""
    length = reader.read_short()
    if length <= 0:
        return ""
    data = reader.read_exact(length)
    if length & 1:
        reader.read_exact(1)  # pad byte
    return data.decode(encoding)


def parse_describe(reader: IfxStreamReader) -> tuple[list[ColumnInfo], dict]:
    """Parse a SQ_DESCRIBE response (the SQ_DESCRIBE tag is already consumed).

    Returns ``(columns, metadata)``.
    """
    statement_type = reader.read_short()
    statement_id = reader.read_short()
    estimated_cost = reader.read_int()
    tuple_size = reader.read_short()
    nfields = reader.read_short()
    string_table_size = reader.read_int()  # 4-byte on modern servers

    metadata = {
        "statement_type": statement_type,
        "statement_id": statement_id,
        "estimated_cost": estimated_cost,
        "tuple_size": tuple_size,
        "nfields": nfields,
        "string_table_size": string_table_size,
    }

    if nfields <= 0:
        return [], metadata

    # Pass 1: per-field descriptor block (no name yet — names come from
    # the string table).
    raw_fields: list[dict] = []
    for _ in range(nfields):
        field_index = reader.read_int()
        column_start_pos = reader.read_int()
        column_type = reader.read_short()
        column_extended_id = reader.read_int()
        owner_name = _read_char(reader)
        extended_name = _read_char(reader)
        reference = reader.read_short()  # noqa: F841 (Phase 6+)
        alignment = reader.read_short()  # noqa: F841
        source_type = reader.read_int()  # noqa: F841
        encoded_length = reader.read_int()
        raw_fields.append(
            {
                "field_index": field_index,
                "column_start_pos": column_start_pos,
                "type_code": column_type,
                "extended_id": column_extended_id,
                "owner_name": owner_name,
                "extended_name": extended_name,
                "encoded_length": encoded_length,
            }
        )

    # Pass 2: string table — nul-separated column names. readPadded.
    string_table = b""
    if string_table_size > 0:
        string_table = reader.read_exact(string_table_size)
        if string_table_size & 1:
            reader.read_exact(1)  # pad

    # Split string table on nul to get the column-name list. The fieldIndex
    # values point into this table for each column's name.
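    # Worked example: a table b"id\x00name\x00" yields the offset map
    # {0: "id", 3: "name"}; a fieldIndex of 3 names the second column.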
    raw_names = string_table.split(b"\x00")
    name_lookup = {0: ""}
    cursor = 0
    for piece in raw_names:
        if piece:
            name_lookup[cursor] = piece.decode("iso-8859-1")
        cursor += len(piece) + 1  # +1 for the nul we split on

    columns: list[ColumnInfo] = []
    for fd in raw_fields:
        # fieldIndex is the byte offset where the column's name starts.
        name = name_lookup.get(fd["field_index"])
        if name is None:
            # Walk the string table to find the name at this offset.
            tail = string_table[fd["field_index"]:].split(b"\x00", 1)[0]
            name = tail.decode("iso-8859-1") if tail else f"col{len(columns)}"
        # INVARIANT: ColumnInfo.type_code is always base-typed (high-bit
        # flags stripped). This is the single producer site — every reader
        # (parse_tuple_payload, cursor._dereference_blob_columns, etc.)
        # depends on this and skips redundant base_type() calls. If you
        # ever construct ColumnInfo elsewhere, base_type() the input.
        columns.append(
            ColumnInfo(
                name=name or f"col{len(columns)}",
                type_code=base_type(fd["type_code"]),
                raw_type_code=fd["type_code"],
                encoded_length=fd["encoded_length"],
                column_start_pos=fd["column_start_pos"],
                extended_id=fd["extended_id"],
                owner_name=fd["owner_name"],
                extended_name=fd["extended_name"],
            )
        )
    return columns, metadata


# IDS string type codes that get special length handling in the tuple
# payload. Per ``IfxSqli`` row-data extraction (receiveFastPath case
# 13/15/16): in tuple data, VARCHAR/NCHAR/NVCHAR carry a single-byte
# length prefix (see the loop below). CHAR is grouped here for dispatch
# but is fixed-width per encoded_length and is special-cased inside the
# branch. LVARCHAR uses a 4-byte length prefix and is handled separately.
_LENGTH_PREFIXED_SHORT_TYPES = frozenset({
    _TC_CHAR,
    _TC_VARCHAR,
    _TC_NCHAR,
    _TC_NVCHAR,
})

_COMPOSITE_UDT_TYPES = frozenset({
    _TC_ROW,
    _TC_COLLECTION,
    _TC_SET,
    _TC_MULTISET,
    _TC_LIST,
})

_NUMERIC_TYPES = frozenset({_TC_DECIMAL, _TC_MONEY})


def parse_tuple_payload(
    reader: IfxStreamReader,
    columns: list[ColumnInfo],
    encoding: str = "iso-8859-1",
) -> tuple:
    """Parse a SQ_TUPLE payload (the SQ_TUPLE tag is already consumed).

    Per ``IfxSqli.receiveTuple``:
    ``[short warn][int size][bytes payload]``

    The payload contains column values back-to-back. For each column, the
    on-wire encoding depends on the type:

    * Fixed-width types (INT, FLOAT, DATE, BIGINT, etc.): exact byte count
      from ``FIXED_WIDTHS``.
    * Length-prefixed strings (VARCHAR, NCHAR, NVCHAR): ``[byte len][bytes]``.
      CHAR is fixed-width per ``encoded_length``.
    * LVARCHAR: ``[int len][bytes][pad if odd]``.
    * DECIMAL/MONEY, DATETIME, INTERVAL: width derived from the qualifier
      packed into ``encoded_length`` (see the inline comments below).
    * Smart-LOBs and composite UDTs: surfaced as locator / typed wrapper
      objects (see the per-type branches below).

    ``encoding`` is forwarded to ``decode()`` for string columns. Caller
    (typically the cursor) should pass the connection's ``encoding`` so
    user-data text honors CLIENT_LOCALE.
    """
    reader.read_short()  # warn (Phase 5 surfaces)
    size = reader.read_int()
    payload = reader.read_exact(size)
    # SQ_TUPLE payload is padded to even-byte alignment on the wire.
    # Discovered empirically: an 11-byte "syscolumns" VARCHAR payload had
    # a trailing 0x00 between it and the next SQ_TUPLE tag. Consuming
    # this pad keeps the next read aligned.
    # (See docs/CAPTURES/15-py-varchar-fixed.socat.log analysis.)
    if size & 1:
        reader.read_exact(1)

    values: list[object] = []
    offset = 0
    # Note: ``col.type_code`` is *already* base-typed by ``parse_describe``
    # (see INVARIANT comment there), so we don't re-strip high-bit flags
    # here. The original code called ``base_type(col.type_code)`` per
    # column per row — pure waste. Skipping it is the single largest
    # savings in this loop.
    for col in columns:
        tc = col.type_code

        if tc in _LENGTH_PREFIXED_SHORT_TYPES:
            # In tuple data, VARCHAR/NCHAR/NVCHAR use a SINGLE-BYTE
            # length prefix (max 255 — IDS VARCHAR's hard limit), not
            # a short. Empirically verified against the SQ_TUPLE bytes
            # for ``SELECT tabname FROM systables`` in
            # docs/CAPTURES/13-py-varchar.socat.log:
            #   payload = 09 73 79 73 74 61 62 6c 65 73
            #           = [byte 9]["systables"]
            # CHAR is fixed-width per encoded_length — handled below.
            if tc == _TC_CHAR:
                width = col.encoded_length
                raw = payload[offset:offset + width]
                offset += width
            else:
                length = payload[offset]
                offset += 1
                raw = payload[offset:offset + length]
                offset += length
            values.append(decode(tc, raw, encoding))
            continue

        if tc == _TC_LVARCHAR:
            # [int length][bytes][pad if odd]
            length = int.from_bytes(payload[offset:offset + 4], "big", signed=True)
            offset += 4
            raw = payload[offset:offset + length]
            offset += length
            if length & 1:
                offset += 1
            values.append(decode(tc, raw, encoding))
            continue

        # DECIMAL/MONEY: width = ceil(precision/2) + 1, where precision is
        # the high byte of encoded_length (packed as (precision << 8) | scale).
        # Per IfxRowColumn.loadColumnData and IfxToJavaDecimal byte sizing.
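        # Worked example (illustrative): DECIMAL(16,2) packs
        # encoded_length = 0x1002, so precision = 16 and
        # width = (16 + 1) // 2 + 1 = 9 bytes.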
        if tc in _NUMERIC_TYPES:
            precision = (col.encoded_length >> 8) & 0xFF
            width = (precision + 1) // 2 + 1
            raw = payload[offset:offset + width]
            offset += width
            try:
                values.append(decode(tc, raw))
            except NotImplementedError:
                values.append(raw)
            continue

        # DATETIME: width = ceil(digit_count/2) + 1, where digit_count is the
        # high byte of encoded_length (packed as (digit_count << 8) |
        # (start_TU << 4) | end_TU). The decoder needs the qualifier too,
        # so we call it directly here rather than via the dispatch.
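        # Illustrative arithmetic (digit count hypothetical): digit_count 14
        # gives width = (14 + 1) // 2 + 1 = 8 bytes read from the payload.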
        if tc == _TC_DATETIME:
            digit_count = (col.encoded_length >> 8) & 0xFF
            width = (digit_count + 1) // 2 + 1
            raw = payload[offset:offset + width]
            offset += width
            values.append(_decode_datetime(raw, col.encoded_length))
            continue

        # INTERVAL: same width formula as DATETIME — high byte of
        # encoded_length holds the total digit count across all fields,
        # and the wire bytes are ``[head][digit pairs]`` (one head byte
        # plus ceil(digit_count/2) digit pairs). Like DATETIME, the
        # qualifier is needed at decode time, so we bypass the generic
        # dispatch.
        if tc == _TC_INTERVAL:
            digit_count = (col.encoded_length >> 8) & 0xFF
            width = (digit_count + 1) // 2 + 1
            raw = payload[offset:offset + width]
            offset += width
            values.append(_decode_interval(raw, col.encoded_length))
            continue

        # BLOB / CLOB (smart-LOBs): the SQ_DESCRIBE response presents
        # these as UDTFIXED (type 41) with extended_id 10 (BLOB) or 11
        # (CLOB) and encoded_length = 72 (locator size). The 72 bytes
        # we read here are an opaque server-side reference, NOT the
        # actual data. Phase 10 lets users fetch via lotofile + SQ_FILE.
        if tc == _TC_UDTFIXED and col.extended_id in (10, 11):
            width = col.encoded_length
            raw = payload[offset:offset + width]
            offset += width
            cls = BlobLocator if col.extended_id == 10 else ClobLocator
            values.append(cls(raw=bytes(raw)))
            continue

        # ROW / COLLECTION (Phase 12): composite UDTs. Wire format is
        # ``[byte ind][int length][bytes]`` — same shape as
        # UDTVAR(lvarchar) below, but the payload semantics are a
        # textual representation of the composite (e.g.,
        # ``ROW('Alice',30 )`` or ``LIST{10,20,30}``) when
        # selected with default options. JDBC requests a richer
        # binary-with-schema format that's ~30x larger; we don't.
        #
        # We surface the bytes wrapped in a typed object and let the
        # user parse the textual form themselves. Type codes:
        # ROW=22, COLLECTION=23, SET=19, MULTISET=20, LIST=21.
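        # Illustrative wire bytes for a LIST column (assumed values):
        #   00                  indicator (0 = not null)
        #   00 00 00 0e         length 14
        #   4c 49 53 54 7b ...  b"LIST{10,20,30}"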
        if tc in _COMPOSITE_UDT_TYPES:
            indicator = payload[offset]
            offset += 1
            if indicator == 1:  # null
                values.append(None)
                continue
            length = int.from_bytes(
                payload[offset:offset + 4], "big", signed=True
            )
            offset += 4
            raw = bytes(payload[offset:offset + length])
            offset += length
            if tc == _TC_ROW:
                values.append(RowValue(raw=raw, schema=col.extended_name))
            else:
                values.append(
                    CollectionValue(
                        raw=raw,
                        kind=_COLLECTION_KIND_MAP[tc],
                        element_schema=col.extended_name,
                    )
                )
            continue

        # UDTVAR (type 40) with extended_name="lvarchar": this is what
        # functions like ``lotofile`` return — a length-prefixed string
        # wrapped as a UDT. The wire format adds a 1-byte indicator
        # prefix BEFORE the LVARCHAR ``[int len][bytes]``. Empirically
        # verified against ``SELECT lotofile(...)`` row data — the
        # leading ``00`` is the null indicator (0=not null, 1=null per
        # UDT convention).
        if tc == _TC_UDTVAR and col.extended_name == "lvarchar":
            indicator = payload[offset]
            offset += 1
            if indicator == 1:
                values.append(None)
                continue
            length = int.from_bytes(
                payload[offset:offset + 4], "big", signed=True
            )
            offset += 4
            raw = payload[offset:offset + length]
            offset += length
            if length & 1:
                offset += 1
            values.append(raw.decode(encoding))
            continue

        # Fixed-width types
        width = FIXED_WIDTHS.get(tc)
        if width is None:
            # No dedicated branch above and no fixed width — fall back
            # to encoded_length and surface raw bytes.
            width = col.encoded_length
        raw = payload[offset:offset + width]
        offset += width
        try:
            values.append(decode(tc, raw, encoding))
        except NotImplementedError:
            values.append(raw)
    return tuple(values)