informix-db/src/informix_db/_resultset.py

"""SQ_DESCRIBE column descriptor parser and SQ_TUPLE row decoder.

Per IfxSqli.receiveDescribe (line 2175+) for ``isUSVER`` modern servers.
The per-field block layout is:

  fieldIndex          (int 4)
  columnStartPos      (int 4 — USVER)
  columnType          (short 2 — base IDS type code with high-bit flags)
  columnExtendedId    (int 4 — USVER, for UDT/extended types)
  ownerName           (readChar = [short len][bytes][pad if odd])
  extendedName        (readChar)
  reference           (short 2)
  alignment           (short 2)
  sourceType          (int 4)
  encodedLength       (int 4)

After all fields: the string table (a length-prefixed block of nul-separated
column names), read via readPadded.
"""

from __future__ import annotations

from dataclasses import dataclass
from types import MappingProxyType

from ._protocol import IfxStreamReader
from ._types import IfxType, base_type, is_nullable
from .converters import (
    FIXED_WIDTHS,
    BlobLocator,
    ClobLocator,
    CollectionValue,
    RowValue,
    _decode_datetime,
    _decode_interval,
    decode,
)

# Module-level type-code constants — lifted out of the hot loop in
# parse_tuple_payload so we don't pay the IntFlag→int conversion per
# column per row. Each one is the plain ``int`` value of the matching
# ``IfxType`` member and is compared against ``ColumnInfo.type_code``.
_TC_CHAR = int(IfxType.CHAR)
_TC_VARCHAR = int(IfxType.VARCHAR)
_TC_NCHAR = int(IfxType.NCHAR)
_TC_NVCHAR = int(IfxType.NVCHAR)
_TC_LVARCHAR = int(IfxType.LVARCHAR)
_TC_DECIMAL = int(IfxType.DECIMAL)
_TC_MONEY = int(IfxType.MONEY)
_TC_DATETIME = int(IfxType.DATETIME)
_TC_INTERVAL = int(IfxType.INTERVAL)
_TC_UDTFIXED = int(IfxType.UDTFIXED)
_TC_UDTVAR = int(IfxType.UDTVAR)
_TC_ROW = int(IfxType.ROW)
_TC_COLLECTION = int(IfxType.COLLECTION)
_TC_SET = int(IfxType.SET)
_TC_MULTISET = int(IfxType.MULTISET)
_TC_LIST = int(IfxType.LIST)

# Read-only map from a collection type code to the ``kind`` label that
# parse_tuple_payload passes to ``CollectionValue``. ROW is deliberately
# absent: ROW columns are surfaced as ``RowValue`` instead.
_COLLECTION_KIND_MAP = MappingProxyType({
    _TC_SET: "set",
    _TC_MULTISET: "multiset",
    _TC_LIST: "list",
    _TC_COLLECTION: "collection",
})


@dataclass
class ColumnInfo:
    """Descriptor for a single result-set column from a SQ_DESCRIBE reply."""

    # Column name.
    name: str
    # Base IDS type code, i.e. with the high-bit flags already stripped.
    type_code: int
    # The type-code short as received, flags still present.
    raw_type_code: int
    # ``encodedLength`` from the describe block; for some types this is a
    # packed qualifier rather than a plain byte count.
    encoded_length: int
    # The remaining describe fields are optional and default to "absent".
    column_start_pos: int = 0
    extended_id: int = 0
    owner_name: str = ""
    extended_name: str = ""

    @property
    def null_ok(self) -> bool:
        """Whether ``is_nullable`` reports the raw type code as nullable."""
        return is_nullable(self.raw_type_code)

    def to_description_tuple(self) -> tuple:
        """Return this column as a PEP 249 ``cursor.description`` 7-tuple.

        The layout is ``(name, type_code, display_size, internal_size,
        precision, scale, null_ok)``. Both size slots carry
        ``encoded_length``; precision and scale are always reported as 0
        (deriving them from the type is Phase 6+ work).
        """
        size = self.encoded_length
        precision = scale = 0
        return (
            self.name,
            self.type_code,
            size,
            size,
            precision,
            scale,
            self.null_ok,
        )


def _read_char(reader: IfxStreamReader, encoding: str = "iso-8859-1") -> str:
    """Read JDBC's ``readChar`` format: [short len][bytes][pad if odd-len]."""
    length = reader.read_short()
    if length < 0:
        return ""
    if length == 0:
        return ""
    data = reader.read_exact(length)
    if length & 1:
        reader.read_exact(1)  # pad byte
    return data.decode(encoding)


def parse_describe(reader: IfxStreamReader) -> tuple[list[ColumnInfo], dict]:
    """Parse a SQ_DESCRIBE response whose SQ_DESCRIBE tag is already consumed.

    Returns ``(columns, metadata)``. ``metadata`` holds the six header
    values (``statement_type``, ``statement_id``, ``estimated_cost``,
    ``tuple_size``, ``nfields``, ``string_table_size``). ``columns`` has one
    ``ColumnInfo`` per described field, in wire order; a column whose name
    cannot be resolved from the string table is named ``col<N>`` after its
    position.

    When ``nfields`` is zero or negative the function returns right after
    the header, without reading any string-table bytes.
    """
    # Header, in wire order. A dict display evaluates its values top to
    # bottom, so the reads below happen in exactly this sequence.
    metadata = {
        "statement_type": reader.read_short(),
        "statement_id": reader.read_short(),
        "estimated_cost": reader.read_int(),
        "tuple_size": reader.read_short(),
        "nfields": reader.read_short(),
        "string_table_size": reader.read_int(),  # 4-byte on modern servers
    }

    field_count = metadata["nfields"]
    if field_count <= 0:
        return [], metadata

    # Pass 1: the fixed per-field descriptor blocks. Names are not part of
    # these blocks; each block only carries an offset into the string table.
    descriptors = []
    for _ in range(field_count):
        name_offset = reader.read_int()
        start_pos = reader.read_int()
        wire_type = reader.read_short()
        ext_id = reader.read_int()
        owner = _read_char(reader)
        ext_name = _read_char(reader)
        # reference, alignment and sourceType are consumed only to keep the
        # stream aligned; nothing uses them yet (Phase 6+).
        reader.read_short()
        reader.read_short()
        reader.read_int()
        enc_len = reader.read_int()
        descriptors.append(
            (name_offset, start_pos, wire_type, ext_id, owner, ext_name, enc_len)
        )

    # Pass 2: the string table of nul-separated column names, followed by
    # one pad byte when its size is odd (readPadded).
    table_size = metadata["string_table_size"]
    if table_size > 0:
        table = reader.read_exact(table_size)
        if table_size % 2:
            reader.read_exact(1)
    else:
        table = b""

    columns: list[ColumnInfo] = []
    for position, descriptor in enumerate(descriptors):
        name_offset, start_pos, wire_type, ext_id, owner, ext_name, enc_len = descriptor
        # fieldIndex is the byte offset where this column's name starts;
        # the name runs up to the next nul, or to the end of the table.
        terminator = table.find(b"\x00", name_offset)
        if terminator < 0:
            name_bytes = table[name_offset:]
        else:
            name_bytes = table[name_offset:terminator]
        # INVARIANT: ColumnInfo.type_code is always base-typed (high-bit
        # flags stripped). This is the single producer site, and readers
        # such as parse_tuple_payload rely on it to skip their own
        # base_type() call. Any other ColumnInfo construction site must
        # apply base_type() to its input as well.
        columns.append(
            ColumnInfo(
                name=name_bytes.decode("iso-8859-1") or f"col{position}",
                type_code=base_type(wire_type),
                raw_type_code=wire_type,
                encoded_length=enc_len,
                column_start_pos=start_pos,
                extended_id=ext_id,
                owner_name=owner,
                extended_name=ext_name,
            )
        )
    return columns, metadata


# String-like type codes handled by the first branch of parse_tuple_payload.
# Despite the ``_SHORT`` in the name, none of them is read with a 2-byte
# length prefix inside the tuple payload: CHAR is sliced as a fixed-width
# field of ``encoded_length`` bytes, while VARCHAR, NCHAR and NVCHAR are
# read as ``[byte length][bytes]`` with no pad byte. LVARCHAR is not in
# this set; parse_tuple_payload handles it separately with a 4-byte length
# prefix.
_LENGTH_PREFIXED_SHORT_TYPES = frozenset({
    _TC_CHAR,
    _TC_VARCHAR,
    _TC_NCHAR,
    _TC_NVCHAR,
})

# Composite types that parse_tuple_payload reads as
# ``[byte indicator][int length][bytes]`` and wraps in RowValue (ROW) or
# CollectionValue (all the others).
_COMPOSITE_UDT_TYPES = frozenset({
    _TC_ROW,
    _TC_COLLECTION,
    _TC_SET,
    _TC_MULTISET,
    _TC_LIST,
})

# Packed-decimal types: both take their on-wire width from the precision
# byte of ``encoded_length`` in parse_tuple_payload.
_NUMERIC_TYPES = frozenset({_TC_DECIMAL, _TC_MONEY})


def parse_tuple_payload(
    reader: IfxStreamReader,
    columns: list[ColumnInfo],
    encoding: str = "iso-8859-1",
) -> tuple:
    """Decode one SQ_TUPLE message (tag already consumed) into a row tuple.

    Framing, per ``IfxSqli.receiveTuple``, is
    ``[short warn][int size][bytes payload]`` followed by one pad byte when
    ``size`` is odd. The warn short is read and discarded.

    Column values sit back-to-back in the payload, in ``columns`` order.
    The layout of each value is chosen from the column's ``type_code``:

    * CHAR: ``encoded_length`` bytes, no prefix.
    * VARCHAR / NCHAR / NVCHAR: ``[byte len][bytes]``.
    * LVARCHAR: ``[int len][bytes][pad if odd]``.
    * DECIMAL / MONEY: ``ceil(precision / 2) + 1`` bytes, where precision
      is the high byte of ``encoded_length``.
    * DATETIME / INTERVAL: ``ceil(digit_count / 2) + 1`` bytes, where
      digit_count is the high byte of ``encoded_length``; decoded with the
      whole ``encoded_length`` as qualifier.
    * UDTFIXED with ``extended_id`` 10 / 11 (BLOB / CLOB):
      ``encoded_length`` bytes wrapped in ``BlobLocator`` / ``ClobLocator``.
    * ROW / COLLECTION / SET / MULTISET / LIST:
      ``[byte ind][int len][bytes]``; indicator 1 means NULL, otherwise the
      bytes are wrapped in ``RowValue`` / ``CollectionValue``.
    * UDTVAR whose ``extended_name`` is ``"lvarchar"``:
      ``[byte ind][int len][bytes][pad if odd]``, decoded to ``str``.
    * Anything else: width from ``FIXED_WIDTHS``, falling back to
      ``encoded_length``.

    Where a value goes through ``decode()`` inside a ``try`` (DECIMAL /
    MONEY and the final fallback), a ``NotImplementedError`` makes the raw
    bytes the column value instead.

    ``encoding`` is used for text columns. The caller (typically the
    cursor) should pass the connection's ``encoding`` so user-data text
    honors CLIENT_LOCALE.
    """
    reader.read_short()  # warn (Phase 5 surfaces)
    payload_size = reader.read_int()
    payload = reader.read_exact(payload_size)
    # The payload is padded to even-byte alignment on the wire. Found
    # empirically: an 11-byte "syscolumns" VARCHAR payload was followed by
    # a 0x00 before the next SQ_TUPLE tag (see
    # docs/CAPTURES/15-py-varchar-fixed.socat.log). Eating the pad here
    # keeps the next read aligned.
    if payload_size % 2:
        reader.read_exact(1)

    row: list[object] = []
    pos = 0
    # ``col.type_code`` is already base-typed by ``parse_describe`` (see the
    # INVARIANT comment there), so no per-column base_type() call is made
    # in this hot loop.
    for col in columns:
        tc = col.type_code

        if tc in _LENGTH_PREFIXED_SHORT_TYPES:
            if tc == _TC_CHAR:
                # CHAR is fixed-width: exactly encoded_length bytes.
                end = pos + col.encoded_length
            else:
                # VARCHAR / NCHAR / NVCHAR carry a SINGLE-BYTE length
                # prefix in tuple data, not a short. Verified for VARCHAR
                # against ``SELECT tabname FROM systables`` in
                # docs/CAPTURES/13-py-varchar.socat.log:
                #     payload = 09 73 79 73 74 61 62 6c 65 73
                #             = [byte 9]["systables"]
                # NOTE(review): that capture only covers VARCHAR. NCHAR is
                # declared fixed-length in SQL like CHAR, so confirm with a
                # capture that it really carries a length byte here.
                body_start = pos + 1
                end = body_start + payload[pos]
                pos = body_start
            item = decode(tc, payload[pos:end], encoding)
            pos = end

        elif tc == _TC_LVARCHAR:
            # [int length][bytes][pad if odd]
            body_start = pos + 4
            length = int.from_bytes(payload[pos:body_start], "big", signed=True)
            body_end = body_start + length
            item = decode(tc, payload[body_start:body_end], encoding)
            pos = body_end + (length & 1)

        elif tc in _NUMERIC_TYPES:
            # encoded_length is packed as (precision << 8) | scale; the
            # value occupies ceil(precision / 2) + 1 bytes. Per
            # IfxRowColumn.loadColumnData and IfxToJavaDecimal byte sizing.
            precision = (col.encoded_length >> 8) & 0xFF
            end = pos + (precision + 1) // 2 + 1
            chunk = payload[pos:end]
            pos = end
            try:
                item = decode(tc, chunk)
            except NotImplementedError:
                item = chunk

        elif tc in (_TC_DATETIME, _TC_INTERVAL):
            # Both share one width formula: the high byte of encoded_length
            # is the total digit count, and the wire bytes are one head
            # byte plus ceil(digit_count / 2) digit pairs. The decoders
            # need the full qualifier, so they are called directly instead
            # of going through the generic ``decode`` dispatch.
            qualifier = col.encoded_length
            digit_count = (qualifier >> 8) & 0xFF
            end = pos + (digit_count + 1) // 2 + 1
            chunk = payload[pos:end]
            pos = end
            if tc == _TC_DATETIME:
                item = _decode_datetime(chunk, qualifier)
            else:
                item = _decode_interval(chunk, qualifier)

        elif tc == _TC_UDTFIXED and col.extended_id in (10, 11):
            # Smart-LOBs: SQ_DESCRIBE presents BLOB / CLOB as UDTFIXED
            # (type 41) with extended_id 10 / 11 and encoded_length = 72.
            # Those bytes are an opaque server-side locator, NOT the data;
            # Phase 10 fetches the content via lotofile + SQ_FILE.
            end = pos + col.encoded_length
            locator_cls = BlobLocator if col.extended_id == 10 else ClobLocator
            item = locator_cls(raw=bytes(payload[pos:end]))
            pos = end

        elif tc in _COMPOSITE_UDT_TYPES:
            # ROW / COLLECTION (Phase 12), wire format
            # ``[byte ind][int length][bytes]``. With default options the
            # bytes are a textual rendering of the composite (e.g.
            # ``ROW('Alice',30         )`` or ``LIST{10,20,30}``); they are
            # handed back wrapped in a typed object for the user to parse.
            # Type codes: ROW=22, COLLECTION=23, SET=19, MULTISET=20,
            # LIST=21. Unlike the lvarchar-UDT branch, no pad byte is
            # skipped after an odd-length body.
            null_flag = payload[pos]
            pos += 1
            if null_flag == 1:
                item = None
            else:
                body_start = pos + 4
                length = int.from_bytes(
                    payload[pos:body_start], "big", signed=True
                )
                body_end = body_start + length
                body = bytes(payload[body_start:body_end])
                pos = body_end
                if tc == _TC_ROW:
                    item = RowValue(raw=body, schema=col.extended_name)
                else:
                    item = CollectionValue(
                        raw=body,
                        kind=_COLLECTION_KIND_MAP[tc],
                        element_schema=col.extended_name,
                    )

        elif tc == _TC_UDTVAR and col.extended_name == "lvarchar":
            # What functions like ``lotofile`` return: an LVARCHAR wrapped
            # as a UDT (type 40). A 1-byte indicator (0 = not null,
            # 1 = null) precedes the ``[int len][bytes]`` body; verified
            # empirically against ``SELECT lotofile(...)`` row data.
            null_flag = payload[pos]
            pos += 1
            if null_flag == 1:
                item = None
            else:
                body_start = pos + 4
                length = int.from_bytes(
                    payload[pos:body_start], "big", signed=True
                )
                body_end = body_start + length
                text_bytes = payload[body_start:body_end]
                pos = body_end + (length & 1)
                item = text_bytes.decode(encoding)

        else:
            # Fixed-width types. A type code with no FIXED_WIDTHS entry
            # falls back to encoded_length as its width.
            width = FIXED_WIDTHS.get(tc)
            if width is None:
                width = col.encoded_length
            end = pos + width
            chunk = payload[pos:end]
            pos = end
            try:
                item = decode(tc, chunk, encoding)
            except NotImplementedError:
                item = chunk

        row.append(item)
    return tuple(row)