informix-db/src/informix_db/_resultset.py
Ryan Malloy bea1a1cd0c Phase 20: UTF-8/multibyte locale support (2026.05.04.4)
Thread CLIENT_LOCALE through to user-data string codecs. Driver previously
hardcoded iso-8859-1 for all string conversions, which broke any locale
outside Western European code points.

* Connection.encoding property derived from client_locale via
  _python_encoding_from_locale (en_US.utf8 -> utf-8, en_US.8859-1 ->
  iso-8859-1, etc.)
* encode_param / decode / parse_tuple_payload accept an encoding
  parameter; cursor and fast-path call sites forward conn.encoding
* Smart-LOB CLOB encode/decode and TEXT decode honor connection encoding
* DataError raised for non-representable chars; cursor releases the
  prepared statement before propagating so connection state stays clean

Boundary discipline: protocol-level strings (cursor names, function
signatures, SQ_FILE fnames, error near-tokens, SQL text) stay
iso-8859-1 (always ASCII, never user-controlled).

9 new integration tests in tests/test_unicode.py covering ASCII
round-trip, Latin-1 high-bit, full byte range, locale-mapping,
encoding property, UTF-8 negotiation, multibyte (skipped without
IFX_UTF8_DATABASE), DataError on non-representable, CLOB round-trip.

Total: 69 unit + 212 integration = 281 tests.
2026-05-04 17:13:19 -06:00

392 lines
15 KiB
Python

"""SQ_DESCRIBE column descriptor parser and SQ_TUPLE row decoder.
Per IfxSqli.receiveDescribe (line 2175+) for ``isUSVER`` modern servers.
The per-field block layout is:
fieldIndex (int 4)
columnStartPos (int 4 — USVER)
columnType (short 2 — base IDS type code with high-bit flags)
columnExtendedId (int 4 — USVER, for UDT/extended types)
ownerName (readChar = [short len][bytes][pad if odd])
extendedName (readChar)
reference (short 2)
alignment (short 2)
sourceType (int 4)
encodedLength (int 4)
After all fields: the string table (a length-prefixed block of nul-separated
column names), read via readPadded.
"""
from __future__ import annotations
from dataclasses import dataclass
from ._protocol import IfxStreamReader
from ._types import base_type, is_nullable
from .converters import FIXED_WIDTHS, decode
@dataclass
class ColumnInfo:
    """Descriptor for a single column of a SQ_DESCRIBE response."""

    name: str
    type_code: int  # base IDS type code (high-bit flags stripped)
    raw_type_code: int  # raw type-code short with flags intact
    encoded_length: int
    column_start_pos: int = 0
    extended_id: int = 0
    owner_name: str = ""
    extended_name: str = ""

    @property
    def null_ok(self) -> bool:
        """True when the raw type code carries the nullable flag."""
        return is_nullable(self.raw_type_code)

    def to_description_tuple(self) -> tuple:
        """Build the PEP 249 ``cursor.description`` 7-tuple."""
        # display_size and internal_size both come straight from the
        # wire length; precision and scale stay 0 until a later phase
        # derives them from the type qualifier.
        size = self.encoded_length
        return (self.name, self.type_code, size, size, 0, 0, self.null_ok)
def _read_char(reader: IfxStreamReader, encoding: str = "iso-8859-1") -> str:
"""Read JDBC's ``readChar`` format: [short len][bytes][pad if odd-len]."""
length = reader.read_short()
if length < 0:
return ""
if length == 0:
return ""
data = reader.read_exact(length)
if length & 1:
reader.read_exact(1) # pad byte
return data.decode(encoding)
def parse_describe(reader: IfxStreamReader) -> tuple[list[ColumnInfo], dict]:
    """Parse a SQ_DESCRIBE response (the SQ_DESCRIBE tag is already consumed).

    Wire layout: a fixed header, then ``nfields`` per-field descriptor
    blocks (which do NOT carry the column name), then a string table of
    nul-separated names that the descriptors index into by byte offset.

    Returns ``(columns, metadata)`` where ``metadata`` holds the raw
    header fields (statement id/type, estimated cost, tuple size, ...).
    """
    statement_type = reader.read_short()
    statement_id = reader.read_short()
    estimated_cost = reader.read_int()
    tuple_size = reader.read_short()
    nfields = reader.read_short()
    string_table_size = reader.read_int()  # 4-byte on modern servers
    metadata = {
        "statement_type": statement_type,
        "statement_id": statement_id,
        "estimated_cost": estimated_cost,
        "tuple_size": tuple_size,
        "nfields": nfields,
        "string_table_size": string_table_size,
    }
    if nfields <= 0:
        # Non-row-returning statements describe zero columns; the header
        # metadata is still returned for the caller.
        return [], metadata
    # Pass 1: per-field descriptor block (no name yet — names come from
    # the string table).
    raw_fields: list[dict] = []
    for _ in range(nfields):
        field_index = reader.read_int()
        column_start_pos = reader.read_int()
        column_type = reader.read_short()
        column_extended_id = reader.read_int()
        # Owner/extended names are protocol-level identifiers, not user
        # data, so _read_char's iso-8859-1 default is deliberate here.
        owner_name = _read_char(reader)
        extended_name = _read_char(reader)
        reference = reader.read_short()  # noqa: F841 (Phase 6+)
        alignment = reader.read_short()  # noqa: F841
        source_type = reader.read_int()  # noqa: F841
        encoded_length = reader.read_int()
        raw_fields.append(
            {
                "field_index": field_index,
                "column_start_pos": column_start_pos,
                "type_code": column_type,
                "extended_id": column_extended_id,
                "owner_name": owner_name,
                "extended_name": extended_name,
                "encoded_length": encoded_length,
            }
        )
    # Pass 2: string table — nul-separated column names. readPadded.
    string_table = b""
    if string_table_size > 0:
        string_table = reader.read_exact(string_table_size)
        if string_table_size & 1:
            reader.read_exact(1)  # pad to even alignment
    # Split string table on nul to get the column-name list. The fieldIndex
    # values point into this table for each column's name, so record the
    # starting byte offset of every piece.
    raw_names = string_table.split(b"\x00")
    name_lookup = {0: ""}
    cursor = 0
    for piece in raw_names:
        if piece:
            name_lookup[cursor] = piece.decode("iso-8859-1")
        cursor += len(piece) + 1  # +1 for the nul we split on
    columns: list[ColumnInfo] = []
    for fd in raw_fields:
        # fieldIndex is the byte offset where the column's name starts.
        name = name_lookup.get(fd["field_index"])
        if name is None:
            # Offset lands mid-piece rather than on a piece boundary:
            # walk the string table directly to find the name there.
            tail = string_table[fd["field_index"] :].split(b"\x00", 1)[0]
            name = tail.decode("iso-8859-1") if tail else f"col{len(columns)}"
        columns.append(
            ColumnInfo(
                name=name or f"col{len(columns)}",
                type_code=base_type(fd["type_code"]),
                raw_type_code=fd["type_code"],
                encoded_length=fd["encoded_length"],
                column_start_pos=fd["column_start_pos"],
                extended_id=fd["extended_id"],
                owner_name=fd["owner_name"],
                extended_name=fd["extended_name"],
            )
        )
    return columns, metadata
# IDS string type codes that get special (non-FIXED_WIDTHS) handling in the
# tuple payload. NOTE(review): despite the name, in SQ_TUPLE row data
# ``parse_tuple_payload`` reads a SINGLE-BYTE length prefix for
# VARCHAR/NCHAR/NVCHAR and treats CHAR(N) as fixed-width at
# ``encoded_length`` (empirically verified — see the comments in that
# function). LVARCHAR uses a 4-byte length prefix and is handled
# separately.
from ._types import IfxType  # noqa: E402

_LENGTH_PREFIXED_SHORT_TYPES = frozenset({
    int(IfxType.CHAR),
    int(IfxType.VARCHAR),
    int(IfxType.NCHAR),
    int(IfxType.NVCHAR),
})
def parse_tuple_payload(
    reader: IfxStreamReader,
    columns: list[ColumnInfo],
    encoding: str = "iso-8859-1",
) -> tuple:
    """Parse a SQ_TUPLE payload (the SQ_TUPLE tag is already consumed).

    Per ``IfxSqli.receiveTuple``:
        ``[short warn][int size][bytes payload]``

    The payload contains column values back-to-back. For each column, the
    on-wire encoding depends on the type:

    * Fixed-width types (INT, FLOAT, DATE, BIGINT, etc.): exact byte count
      from ``FIXED_WIDTHS``.
    * VARCHAR/NCHAR/NVCHAR: ``[1-byte len][bytes]`` (a single-byte prefix,
      NOT the short-prefixed readChar shape — see the empirical note in
      the branch below).
    * CHAR(N): fixed-width at ``encoded_length``.
    * LVARCHAR: 4-byte length prefix, padded to even alignment.
    * Other variable-width types (DECIMAL, DATETIME, INTERVAL, BLOBs):
      Phase 6+ — currently surfaces raw bytes from ``encoded_length``.

    ``encoding`` is forwarded to ``decode()`` for string columns. Caller
    (typically the cursor) should pass the connection's
    ``encoding`` so user-data text honors CLIENT_LOCALE.
    """
    reader.read_short()  # warn (Phase 5 surfaces)
    size = reader.read_int()
    payload = reader.read_exact(size)
    # SQ_TUPLE payload is padded to even-byte alignment on the wire.
    # Discovered empirically: an 11-byte "syscolumns" VARCHAR payload had
    # a trailing 0x00 between it and the next SQ_TUPLE tag. Consuming
    # this pad keeps the next read aligned.
    # (See docs/CAPTURES/15-py-varchar-fixed.socat.log analysis.)
    if size & 1:
        reader.read_exact(1)
    values: list[object] = []
    offset = 0  # read position inside `payload`
    for col in columns:
        base = base_type(col.type_code)
        if base in _LENGTH_PREFIXED_SHORT_TYPES:
            # In tuple data, VARCHAR/NCHAR/NVCHAR use a SINGLE-BYTE
            # length prefix (max 255 — IDS VARCHAR's hard limit), not
            # a short. Empirically verified against the SQ_TUPLE bytes
            # for ``SELECT tabname FROM systables`` in
            # docs/CAPTURES/13-py-varchar.socat.log:
            #     payload = 09 73 79 73 74 61 62 6c 65 73
            #             = [byte 9]["systables"]
            # CHAR is fixed-width per encoded_length — handled below.
            if base == int(IfxType.CHAR):
                # CHAR(N) is fixed-width; uses encoded_length straight
                width = col.encoded_length
                raw = payload[offset:offset + width]
                offset += width
            else:
                length = payload[offset]
                offset += 1
                raw = payload[offset:offset + length]
                offset += length
            # User-data text: honor the connection's CLIENT_LOCALE codec.
            values.append(decode(col.type_code, raw, encoding))
            continue
        if base == int(IfxType.LVARCHAR):
            # [int length][bytes][pad if odd]
            length = int.from_bytes(payload[offset:offset + 4], "big", signed=True)
            offset += 4
            raw = payload[offset:offset + length]
            offset += length
            if length & 1:
                offset += 1
            values.append(decode(col.type_code, raw, encoding))
            continue
        # DECIMAL/MONEY: width = ceil(precision/2) + 1, where precision is
        # the high byte of encoded_length (packed as (precision << 8) | scale).
        # Per IfxRowColumn.loadColumnData and IfxToJavaDecimal byte sizing.
        # Numeric, not text — no encoding is forwarded to decode().
        if base in (int(IfxType.DECIMAL), int(IfxType.MONEY)):
            precision = (col.encoded_length >> 8) & 0xFF
            width = (precision + 1) // 2 + 1
            raw = payload[offset:offset + width]
            offset += width
            try:
                values.append(decode(col.type_code, raw))
            except NotImplementedError:
                # Decoder not implemented yet: surface the raw bytes.
                values.append(raw)
            continue
        # DATETIME: width = ceil(digit_count/2) + 1, where digit_count is the
        # high byte of encoded_length (packed as (digit_count << 8) |
        # (start_TU << 4) | end_TU). The decoder needs the qualifier too,
        # so we call it directly here rather than via the dispatch.
        if base == int(IfxType.DATETIME):
            digit_count = (col.encoded_length >> 8) & 0xFF
            width = (digit_count + 1) // 2 + 1
            raw = payload[offset:offset + width]
            offset += width
            from .converters import _decode_datetime
            values.append(_decode_datetime(raw, col.encoded_length))
            continue
        # INTERVAL: same width formula as DATETIME — high byte of
        # encoded_length holds the total digit count across all fields,
        # and the wire bytes are ``[head][digit pairs]`` (one head byte
        # plus ceil(digit_count/2) digit pairs). Like DATETIME, the
        # qualifier is needed at decode time, so we bypass the generic
        # dispatch.
        if base == int(IfxType.INTERVAL):
            digit_count = (col.encoded_length >> 8) & 0xFF
            width = (digit_count + 1) // 2 + 1
            raw = payload[offset:offset + width]
            offset += width
            from .converters import _decode_interval
            values.append(_decode_interval(raw, col.encoded_length))
            continue
        # BLOB / CLOB (smart-LOBs): the SQ_DESCRIBE response presents
        # these as UDTFIXED (type 41) with extended_id 10 (BLOB) or 11
        # (CLOB) and encoded_length = 72 (locator size). The 72 bytes
        # we read here are an opaque server-side reference, NOT the
        # actual data. Phase 10 lets users fetch via lotofile + SQ_FILE.
        if base == int(IfxType.UDTFIXED) and col.extended_id in (10, 11):
            from .converters import BlobLocator, ClobLocator
            width = col.encoded_length
            raw = payload[offset:offset + width]
            offset += width
            cls = BlobLocator if col.extended_id == 10 else ClobLocator
            values.append(cls(raw=bytes(raw)))
            continue
        # ROW / COLLECTION (Phase 12): composite UDTs. Wire format is
        # ``[byte ind][int length][bytes]`` — same shape as
        # UDTVAR(lvarchar) below, but the payload semantics are a
        # textual representation of the composite (e.g.,
        # ``ROW('Alice',30 )`` or ``LIST{10,20,30}``) when
        # selected with default options. JDBC requests a richer
        # binary-with-schema format that's ~30x larger; we don't.
        #
        # We surface the bytes wrapped in a typed object and let the
        # user parse the textual form themselves. Type codes:
        # ROW=22, COLLECTION=23, SET=19, MULTISET=20, LIST=21.
        if base in (
            int(IfxType.ROW),
            int(IfxType.COLLECTION),
            int(IfxType.SET),
            int(IfxType.MULTISET),
            int(IfxType.LIST),
        ):
            from .converters import CollectionValue, RowValue
            indicator = payload[offset]
            offset += 1
            if indicator == 1:  # null (1=null per UDT convention)
                values.append(None)
                continue
            length = int.from_bytes(
                payload[offset:offset + 4], "big", signed=True
            )
            offset += 4
            raw = bytes(payload[offset:offset + length])
            offset += length
            if base == int(IfxType.ROW):
                values.append(RowValue(raw=raw, schema=col.extended_name))
            else:
                kind_map = {
                    int(IfxType.SET): "set",
                    int(IfxType.MULTISET): "multiset",
                    int(IfxType.LIST): "list",
                    int(IfxType.COLLECTION): "collection",
                }
                values.append(
                    CollectionValue(
                        raw=raw,
                        kind=kind_map[base],
                        element_schema=col.extended_name,
                    )
                )
            continue
        # UDTVAR (type 40) with extended_name="lvarchar": this is what
        # functions like ``lotofile`` return — a length-prefixed string
        # wrapped as a UDT. The wire format adds a 1-byte indicator
        # prefix BEFORE the LVARCHAR ``[int len][bytes]``. Empirically
        # verified against ``SELECT lotofile(...)`` row data — the
        # leading ``00`` is null indicator (0=not null, 1=null per UDT
        # convention).
        if base == int(IfxType.UDTVAR) and col.extended_name == "lvarchar":
            indicator = payload[offset]
            offset += 1
            if indicator == 1:
                values.append(None)
                continue
            length = int.from_bytes(
                payload[offset:offset + 4], "big", signed=True
            )
            offset += 4
            raw = payload[offset:offset + length]
            offset += length
            if length & 1:
                offset += 1  # even-alignment pad after the string bytes
            values.append(raw.decode(encoding))
            continue
        # Fixed-width types
        width = FIXED_WIDTHS.get(base)
        if width is None:
            # Phase 6+ types (DATETIME, INTERVAL, BLOBs) — fall back
            # to encoded_length and surface raw bytes.
            width = col.encoded_length
        raw = payload[offset:offset + width]
        offset += width
        try:
            values.append(decode(col.type_code, raw, encoding))
        except NotImplementedError:
            values.append(raw)
    return tuple(values)