From bea1a1cd0c6dcb9a4e39f2a9140000877b2bd458 Mon Sep 17 00:00:00 2001 From: Ryan Malloy Date: Mon, 4 May 2026 17:13:19 -0600 Subject: [PATCH] Phase 20: UTF-8/multibyte locale support (2026.05.04.4) Thread CLIENT_LOCALE through to user-data string codecs. Driver previously hardcoded iso-8859-1 for all string conversions, which broke any locale outside Western European code points. * Connection.encoding property derived from client_locale via _python_encoding_from_locale (en_US.utf8 -> utf-8, en_US.8859-1 -> iso-8859-1, etc.) * encode_param / decode / parse_tuple_payload accept an encoding parameter; cursor and fast-path call sites forward conn.encoding * Smart-LOB CLOB encode/decode and TEXT decode honor connection encoding * DataError raised for non-representable chars; cursor releases the prepared statement before propagating so connection state stays clean Boundary discipline: protocol-level strings (cursor names, function signatures, SQ_FILE fnames, error near-tokens, SQL text) stay iso-8859-1 (always ASCII, never user-controlled). 9 new integration tests in tests/test_unicode.py covering ASCII round-trip, Latin-1 high-bit, full byte range, locale-mapping, encoding property, UTF-8 negotiation, multibyte (skipped without IFX_UTF8_DATABASE), DataError on non-representable, CLOB round-trip. Total: 69 unit + 212 integration = 281 tests. --- CHANGELOG.md | 47 +++++++ pyproject.toml | 2 +- src/informix_db/_fastpath.py | 3 +- src/informix_db/_resultset.py | 13 +- src/informix_db/connections.py | 43 +++++- src/informix_db/converters.py | 58 ++++++-- src/informix_db/cursors.py | 39 +++++- tests/test_unicode.py | 243 +++++++++++++++++++++++++++++++++ uv.lock | 2 +- 9 files changed, 427 insertions(+), 23 deletions(-) create mode 100644 tests/test_unicode.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 434ee92..ba995ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,53 @@ All notable changes to `informix-db`. 
Versioning is [CalVer](https://calver.org/) — `YYYY.MM.DD` for date-based releases, `YYYY.MM.DD.N` for same-day post-releases per PEP 440. +## 2026.05.04.4 — UTF-8 / multibyte locale support + +Threads the connection's `CLIENT_LOCALE` through to user-data string codecs so multibyte locales (UTF-8, etc.) round-trip correctly. The driver previously hardcoded `iso-8859-1` for every string conversion — fine for Western European text, broken-by-design for CJK, Cyrillic, Arabic, emoji. + +### Added + +- **`Connection.encoding`** property — reports the Python codec name derived from `CLIENT_LOCALE` (e.g., `iso-8859-1`, `utf-8`, `iso-8859-15`). Default for a connection without `client_locale=` is `iso-8859-1` (compatible with the legacy default). + +- **`informix_db.connections._python_encoding_from_locale(locale: str)`** — maps Informix locale strings (`en_US.utf8`, `en_US.8859-1`, `en_US.819`) to Python codec names. Falls back to `iso-8859-1` for unknown / unsuffixed forms. + +### Changed + +- **`encode_param(value, encoding=...)`** and `_encode_str(value, encoding=...)` honor the connection's encoding instead of hardcoded `iso-8859-1`. Cursor's `_emit_bind_params` forwards `self._conn.encoding` per parameter. + +- **`decode(type_code, raw, encoding=...)`** and `parse_tuple_payload(reader, columns, encoding=...)` thread the encoding to string column decoders (CHAR, VARCHAR, NCHAR, NVCHAR, LVARCHAR). Cursor's `_read_fetch_response` forwards `self._conn.encoding`. + +- **Smart-LOB CLOB encode/decode** (`write_blob_column`, simple-LOB TEXT fetch) honor `self._conn.encoding`. + +- **Fast-path RPC** (`Connection.fast_path_call`) honors `self._encoding` for its bound parameters. + +### Boundary discipline + +Protocol-level strings stay `iso-8859-1` (always ASCII, never user-controlled): cursor names, function signatures, server-fabricated SQ_FILE virtual filenames, error "near tokens", SQL keywords/identifiers. 
Only user-data strings (column values, parameter binds) follow `CLIENT_LOCALE`. + +### Error handling + +A string parameter that the connection's codec cannot represent (e.g., `"你好"` on an `8859-1` connection) now raises `informix_db.DataError` instead of letting Python's `UnicodeEncodeError` leak. The cursor releases the prepared statement before propagating, so the connection survives cleanly for the next query. + +### Tests + +9 new integration tests in `tests/test_unicode.py`: +- ASCII round-trip (regression) +- Latin-1 high-bit chars round-trip on default locale +- Full byte range 0x20-0xFE round-trip via VARCHAR +- Locale → Python codec mapping for common forms +- `Connection.encoding` exposes the resolved codec +- UTF-8 locale negotiation (server transcodes for ASCII even with 8859-1 DB) +- UTF-8 multibyte round-trip (skipped without `IFX_UTF8_DATABASE` env var pointing to a UTF-8 database) +- Non-representable char raises `DataError` cleanly; connection survives +- CLOB column round-trips Latin-1 text honoring connection encoding + +Total: **69 unit + 212 integration = 281 tests**. + +### Limitations + +- Multibyte UTF-8 storage requires both `client_locale='en_US.utf8'` AND a database whose `DB_LOCALE` is UTF-8. The dev container's `testdb` is `8859-1`, so storing CJK chars there will continue to fail server-side regardless of the client codec. The `test_utf8_multibyte_round_trip` test is gated on the `IFX_UTF8_DATABASE` env var pointing to a UTF-8 database. + ## 2026.05.04.3 — Resilience tests (fault injection) ### Added diff --git a/pyproject.toml b/pyproject.toml index 5a08e35..e9500fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "informix-db" -version = "2026.05.04.3" +version = "2026.05.04.4" description = "Pure-Python driver for IBM Informix IDS — speaks the SQLI wire protocol over raw sockets. No CSDK, no JVM, no native libraries."
readme = "README.md" license = { text = "MIT" } diff --git a/src/informix_db/_fastpath.py b/src/informix_db/_fastpath.py index 79486c0..b384ff8 100644 --- a/src/informix_db/_fastpath.py +++ b/src/informix_db/_fastpath.py @@ -77,6 +77,7 @@ def build_exfp_routine_pdu( db_name: str, handle: int, params: tuple, + encoding: str = "iso-8859-1", ) -> bytes: """Build a ``SQ_EXFPROUTINE`` request PDU. @@ -106,7 +107,7 @@ def build_exfp_routine_pdu( if value is None: out.extend(struct.pack("!hhh", 0, -1, 0)) continue - ifx_type, prec, raw = encode_param(value) + ifx_type, prec, raw = encode_param(value, encoding=encoding) out.extend(struct.pack("!hhh", ifx_type, 0, prec)) out.extend(raw) if len(raw) & 1: diff --git a/src/informix_db/_resultset.py b/src/informix_db/_resultset.py index 15e3103..5348dfc 100644 --- a/src/informix_db/_resultset.py +++ b/src/informix_db/_resultset.py @@ -177,6 +177,7 @@ _LENGTH_PREFIXED_SHORT_TYPES = frozenset({ def parse_tuple_payload( reader: IfxStreamReader, columns: list[ColumnInfo], + encoding: str = "iso-8859-1", ) -> tuple: """Parse a SQ_TUPLE payload (the SQ_TUPLE tag is already consumed). @@ -193,6 +194,10 @@ def parse_tuple_payload( * LVARCHAR: 4-byte length prefix instead of 2. * Other variable-width types (DECIMAL, DATETIME, INTERVAL, BLOBs): Phase 6+ — currently surfaces raw bytes from ``encoded_length``. + + ``encoding`` is forwarded to ``decode()`` for string columns. Caller + (typically the cursor) should pass the connection's + ``encoding`` so user-data text honors CLIENT_LOCALE.
""" reader.read_short() # warn (Phase 5 surfaces) size = reader.read_int() @@ -229,7 +234,7 @@ def parse_tuple_payload( offset += 1 raw = payload[offset:offset + length] offset += length - values.append(decode(col.type_code, raw)) + values.append(decode(col.type_code, raw, encoding)) continue if base == int(IfxType.LVARCHAR): @@ -240,7 +245,7 @@ def parse_tuple_payload( offset += length if length & 1: offset += 1 - values.append(decode(col.type_code, raw)) + values.append(decode(col.type_code, raw, encoding)) continue # DECIMAL/MONEY: width = ceil(precision/2) + 1, where precision is @@ -368,7 +373,7 @@ def parse_tuple_payload( offset += length if length & 1: offset += 1 - values.append(raw.decode("iso-8859-1")) + values.append(raw.decode(encoding)) continue # Fixed-width types @@ -380,7 +385,7 @@ def parse_tuple_payload( raw = payload[offset:offset + width] offset += width try: - values.append(decode(col.type_code, raw)) + values.append(decode(col.type_code, raw, encoding)) except NotImplementedError: values.append(raw) return tuple(values) diff --git a/src/informix_db/connections.py b/src/informix_db/connections.py index 271be07..3b89e6c 100644 --- a/src/informix_db/connections.py +++ b/src/informix_db/connections.py @@ -53,6 +53,34 @@ _DEFAULT_CAP_1 = 0x0000013C _DEFAULT_CAP_2 = 0 _DEFAULT_CAP_3 = 0 +# Phase 20: client_locale → Python encoding name. Used by user-data +# string codecs (CHAR/VARCHAR/LVARCHAR/CLOB/TEXT). Protocol-level +# strings (cursor names, signatures, error tokens) stay iso-8859-1. +_LOCALE_ENCODING_MAP = { + "8859-1": "iso-8859-1", + "819": "iso-8859-1", + "8859-15": "iso-8859-15", + "923": "iso-8859-15", + "utf8": "utf-8", + "utf-8": "utf-8", + "utf16": "utf-16", + "ucs2": "utf-16", +} + + +def _python_encoding_from_locale(locale: str) -> str: + """Map an Informix CLIENT_LOCALE string to the matching Python codec. + + The CLIENT_LOCALE format is ``<language>_<territory>.<codeset>`` — we + only care about the codeset suffix.
Unknown / no-suffix locales + fall back to ``iso-8859-1`` (the Informix default). + """ + if "." not in locale: + return "iso-8859-1" + suffix = locale.split(".", 1)[1].lower() + return _LOCALE_ENCODING_MAP.get(suffix, "iso-8859-1") + + # Default environment variables sent in the login PDU (SQ_ASCENV section). # These match what the JDBC driver sends for a vanilla en_US.8859-1 # connection. Anything missing makes the server fall back to defaults. @@ -96,6 +124,7 @@ class Connection: self._database = database self._server = server self._client_locale = client_locale + self._encoding = _python_encoding_from_locale(client_locale) self._autocommit = autocommit self._closed = False self._lock = threading.Lock() @@ -154,6 +183,16 @@ class Connection: def closed(self) -> bool: return self._closed + @property + def encoding(self) -> str: + """Python codec name for user-data strings (CHAR/VARCHAR/CLOB/TEXT). + + Derived from ``client_locale`` at connect time. Defaults to + ``"iso-8859-1"`` for the Informix default locale; ``"utf-8"`` + when ``client_locale="en_US.utf8"`` (or similar). + """ + return self._encoding + def cursor(self, *, scrollable: bool = False) -> Cursor: """Return a new Cursor for executing SQL on this connection. 
@@ -293,7 +332,9 @@ class Connection: # Now execute via SQ_EXFPROUTINE self._sock.write_all( - build_exfp_routine_pdu(db_name, handle, params) + build_exfp_routine_pdu( + db_name, handle, params, encoding=self._encoding + ) ) reader = _SocketReader(self._sock) tag = reader.read_short() diff --git a/src/informix_db/converters.py b/src/informix_db/converters.py index 4cfa873..7f1ede2 100644 --- a/src/informix_db/converters.py +++ b/src/informix_db/converters.py @@ -194,12 +194,12 @@ def _decode_float(raw: bytes) -> float | None: return struct.unpack("!d", raw)[0] -def _decode_char(raw: bytes) -> str: +def _decode_char(raw: bytes, encoding: str = "iso-8859-1") -> str: """Strip trailing spaces (CHAR is space-padded to declared length).""" - return raw.rstrip(b" \x00").decode("iso-8859-1") + return raw.rstrip(b" \x00").decode(encoding) -def _decode_varchar(raw: bytes) -> str | None: +def _decode_varchar(raw: bytes, encoding: str = "iso-8859-1") -> str | None: """VARCHAR — variable-length string. NULL is the special sentinel ``\\x00`` (single nul byte). The row decoder peels off the length prefix and passes the content here. Note: VARCHAR cannot contain embedded nuls anyway, so @@ -207,7 +207,7 @@ def _decode_varchar(raw: bytes) -> str | None: """ if raw == b"\x00": return None - return raw.rstrip(b"\x00").decode("iso-8859-1") + return raw.rstrip(b"\x00").decode(encoding) def _decode_bool(raw: bytes) -> bool: @@ -534,11 +534,25 @@ DECODERS: dict[int, DecoderFn] = { } -def decode(type_code: int, raw: bytes) -> object: +_STRING_DECODER_TYPES = frozenset({ + int(IfxType.CHAR), + int(IfxType.VARCHAR), + int(IfxType.NCHAR), + int(IfxType.NVCHAR), + int(IfxType.LVARCHAR), +}) + + +def decode(type_code: int, raw: bytes, encoding: str = "iso-8859-1") -> object: """Decode ``raw`` bytes for the given IDS type code into a Python value. The high-bit flags (NOTNULLABLE etc.) are stripped before lookup. Raises ``KeyError`` for unsupported types — Phase 6+ adds the rest. 
+ + ``encoding`` is honored for string types (CHAR/VARCHAR/NCHAR/NVCHAR/ + LVARCHAR) and ignored otherwise — only those five types carry + user text. Pass the connection's ``encoding`` (derived from + CLIENT_LOCALE) so multibyte locales round-trip correctly. """ base = base_type(type_code) decoder = DECODERS.get(base) @@ -548,6 +562,8 @@ def decode(type_code: int, raw: bytes) -> object: f"(Phase 2 MVP supports: SMALLINT, INT, BIGINT, REAL, FLOAT, " f"CHAR, VARCHAR, BOOL, DATE)" ) + if base in _STRING_DECODER_TYPES: + return decoder(raw, encoding) return decoder(raw) @@ -579,14 +595,32 @@ def _encode_bigint(value: int) -> EncodedParam: return (52, 0x1300, value.to_bytes(8, "big", signed=True)) -def _encode_str(value: str) -> EncodedParam: +def _encode_str(value: str, encoding: str = "iso-8859-1") -> EncodedParam: """Encode a Python str as Informix CHAR (type=0, length-prefixed). JDBC sends Java strings as CHAR (type=0) on the wire — the server handles conversion to the actual column type (CHAR/VARCHAR/NVARCHAR). Format: ``[short length][bytes]`` (writePadded adds even-byte pad). + + ``encoding`` honors the connection's ``CLIENT_LOCALE``: pass + ``"utf-8"`` for ``en_US.utf8`` connections so multi-byte chars + round-trip rather than crashing on UnicodeEncodeError. + + A character outside the configured codec raises :class:`DataError` + rather than letting Python's :class:`UnicodeEncodeError` bubble up — + this matches PEP 249's category for "value can't fit the column" + and lets clean exception-handling work (``except informix_db.Error``). """ - encoded = value.encode("iso-8859-1") + from .exceptions import DataError + try: + encoded = value.encode(encoding) + except UnicodeEncodeError as exc: + raise DataError( + f"cannot encode parameter under client_locale codec " + f"{encoding!r}: {exc.reason} at position {exc.start}-{exc.end}. " + f"Connect with a wider locale (e.g., 'en_US.utf8') if your " + f"data contains characters outside this codec."
+ ) from exc raw = len(encoded).to_bytes(2, "big") + encoded return (0, 0, raw) @@ -883,11 +917,17 @@ def _encode_decimal(value: decimal.Decimal) -> EncodedParam: return (5, prec_short, raw) -def encode_param(value: object) -> EncodedParam: +def encode_param( + value: object, encoding: str = "iso-8859-1" +) -> EncodedParam: """Pick an encoder based on the Python value's type. Returns ``(ifx_type, precision_short, raw_bytes)`` for the parameter. Returns ``(0, 0, b"")`` and the caller must use indicator=-1 for None. + + ``encoding``: Python codec name for ``str`` values. Should match + the connection's ``CLIENT_LOCALE``. Caller (typically the cursor) + forwards ``conn.encoding``. """ if value is None: return (0, 0, b"") @@ -901,7 +941,7 @@ def encode_param(value: object) -> EncodedParam: if isinstance(value, float): return _encode_float(value) if isinstance(value, str): - return _encode_str(value) + return _encode_str(value, encoding=encoding) # NB: datetime.datetime is a subclass of datetime.date — must check # datetime BEFORE date. if isinstance(value, datetime.datetime): diff --git a/src/informix_db/cursors.py b/src/informix_db/cursors.py index ef3b4c4..f6422d2 100644 --- a/src/informix_db/cursors.py +++ b/src/informix_db/cursors.py @@ -19,6 +19,7 @@ in Phase 4. from __future__ import annotations +import contextlib import itertools import struct from collections.abc import Iterator @@ -229,10 +230,22 @@ class Cursor: prepared statement; binding happens before opening the cursor. We send SQ_BIND alone first (no SQ_EXECUTE — that's for DML), then proceed with the normal cursor open + fetch flow. + + Mirrors :meth:`_execute_dml_with_params` cleanup: a client-side + failure during bind-build (e.g., a DataError for a string that + can't fit the connection's codec) releases the prepared + statement before propagating. """ # Send SQ_BIND alone (without SQ_EXECUTE chained — for SELECT, # opening the cursor is what executes the prepared query). 
- self._conn._send_pdu(self._build_bind_only_pdu(params)) + try: + pdu = self._build_bind_only_pdu(params) + except Exception: + with contextlib.suppress(Exception): + self._conn._send_pdu(self._build_release_pdu()) + self._drain_to_eot() + raise + self._conn._send_pdu(pdu) self._drain_to_eot() # Now open the cursor and fetch — the bound values are in scope # for the prepared statement. @@ -311,7 +324,7 @@ class Cursor: continue blob_bytes = self._fetch_blob(bytes(descriptor)) if type_code == int(IfxType.TEXT): - row_list[idx] = blob_bytes.decode("iso-8859-1") + row_list[idx] = blob_bytes.decode(self._conn.encoding) else: row_list[idx] = blob_bytes new_rows.append(tuple(row_list)) @@ -630,8 +643,20 @@ class Cursor: Per JDBC's sendExecute path for prepared statements (line 1108 of IfxSqli): build a single PDU containing SQ_BIND with all parameter values followed by SQ_EXECUTE. + + If parameter encoding raises (e.g., :class:`DataError` for a + non-representable string), the prepared statement is still + allocated on the server. Send the SQ_RELEASE before propagating + — otherwise the next ``execute()`` finds a half-state connection. """ - self._conn._send_pdu(self._build_bind_execute_pdu(params)) + try: + pdu = self._build_bind_execute_pdu(params) + except Exception: + with contextlib.suppress(Exception): + self._conn._send_pdu(self._build_release_pdu()) + self._drain_to_eot() + raise + self._conn._send_pdu(pdu) self._drain_to_eot() self._conn._send_pdu(self._build_release_pdu()) self._drain_to_eot() @@ -1036,7 +1061,7 @@ class Cursor: writer.write_short(-1) writer.write_short(0) continue - ifx_type, prec, raw = encode_param(value) + ifx_type, prec, raw = encode_param(value, encoding=self._conn.encoding) writer.write_short(ifx_type) writer.write_short(0) # indicator = 0 (non-null) writer.write_short(prec) @@ -1047,7 +1072,7 @@ class Cursor: # ``bytes`` and ``bytearray`` flow through here; ``str`` # for TEXT is converted to bytes per ``CLIENT_LOCALE``. 
payload = ( - value.encode("iso-8859-1") + value.encode(self._conn.encoding) if isinstance(value, str) else bytes(value) ) @@ -1296,7 +1321,9 @@ class Cursor: if tag == MessageType.SQ_EOT: return elif tag == MessageType.SQ_TUPLE: - row = parse_tuple_payload(reader, self._columns) + row = parse_tuple_payload( + reader, self._columns, encoding=self._conn.encoding + ) self._rows.append(row) elif tag == MessageType.SQ_DONE: self._consume_done(reader) diff --git a/tests/test_unicode.py b/tests/test_unicode.py new file mode 100644 index 0000000..002dc31 --- /dev/null +++ b/tests/test_unicode.py @@ -0,0 +1,243 @@ +"""Phase 20 integration tests — locale + multi-byte string handling. + +The driver historically hardcoded ``iso-8859-1`` everywhere, which was +"the default and probably fine" but made multi-byte locales (UTF-8, +UCS-2) broken-by-design. This phase: + +1. Threads the connection's ``client_locale`` through to the user-data + string codecs (CHAR / VARCHAR / NCHAR / NVCHAR / LVARCHAR / CLOB / TEXT). +2. Maps locale strings → Python encoding names via + :func:`informix_db.connections._python_encoding_from_locale`. +3. Verifies round-trip integrity at multiple locale settings. + +Protocol-level strings (cursor names, function signatures, error +"near tokens") stay iso-8859-1 — those are always ASCII and never +contain user-controlled bytes. + +Caveat: many test scenarios depend on the *database's* DB_LOCALE, +which is set at CREATE DATABASE time. The dev container's testdb +was created with the default 8859-1 locale — so chars outside 8859-1 +will fail server-side regardless of CLIENT_LOCALE. Tests for +multibyte UTF-8 storage are skipped unless a UTF-8 database is +available (env var IFX_UTF8_DATABASE).
+""" + +from __future__ import annotations + +import contextlib +import os +from collections.abc import Iterator + +import pytest + +import informix_db +from tests.conftest import ConnParams + +pytestmark = pytest.mark.integration + + +def _connect(params: ConnParams, **overrides) -> informix_db.Connection: + kwargs = { + "host": params.host, + "port": params.port, + "user": params.user, + "password": params.password, + "database": params.database, + "server": params.server, + "autocommit": True, + } + kwargs.update(overrides) + return informix_db.connect(**kwargs) + + +# -------- ISO-8859-1 (default) — chars 0..255 round-trip -------- + + +def test_ascii_round_trip(conn_params: ConnParams) -> None: + """Pure ASCII works (regression test).""" + with _connect(conn_params) as conn: + cur = conn.cursor() + cur.execute("CREATE TEMP TABLE p20_ascii (s VARCHAR(50))") + cur.execute("INSERT INTO p20_ascii VALUES (?)", ("hello world",)) + cur.execute("SELECT s FROM p20_ascii") + assert cur.fetchone() == ("hello world",) + + +def test_iso8859_high_bit_round_trip(conn_params: ConnParams) -> None: + """Latin-1 high-bit chars (128-255) round-trip on default locale.""" + samples = [ + "café", # é = 0xE9 + "résumé", # é = 0xE9 + "naïve", # ï = 0xEF + "Zürich", # ü = 0xFC + "señorita", # ñ = 0xF1 + "©™®", # ™ is not in 8859-1, so this sample is filtered out below + ] + with _connect(conn_params) as conn: + cur = conn.cursor() + cur.execute("CREATE TEMP TABLE p20_latin (id INT, s VARCHAR(50))") + # Keep only samples whose chars are ALL in 8859-1 + latin_safe = [s for s in samples if all(ord(c) <= 0xFF for c in s)] + for i, s in enumerate(latin_safe): + cur.execute("INSERT INTO p20_latin VALUES (?, ?)", (i, s)) + cur.execute("SELECT id, s FROM p20_latin ORDER BY id") + rows = cur.fetchall() + assert [r[1] for r in rows] == latin_safe + + +def test_iso8859_full_byte_range(conn_params: ConnParams) -> None: + """Each byte 0x20..0xFE round-trips through VARCHAR.
+ + 0x00 is NUL (string terminator on the wire) and not allowed in + VARCHAR. 0x1F and below are control chars; some servers reject. + 0xFF is sometimes treated specially in length-prefixed encodings. + Using 0x20..0xFE keeps us in safe territory. + """ + chars = bytes(range(0x20, 0xFF)).decode("iso-8859-1") + assert len(chars) == 0xFF - 0x20 + + with _connect(conn_params) as conn: + cur = conn.cursor() + cur.execute("CREATE TEMP TABLE p20_full (s VARCHAR(255))") + cur.execute("INSERT INTO p20_full VALUES (?)", (chars,)) + cur.execute("SELECT s FROM p20_full") + (got,) = cur.fetchone() + assert got == chars + + +# -------- Locale mapping -------- + + +def test_locale_maps_to_python_encoding() -> None: + """The locale → Python-encoding mapping handles common forms.""" + from informix_db.connections import _python_encoding_from_locale + + assert _python_encoding_from_locale("en_US.8859-1") == "iso-8859-1" + assert _python_encoding_from_locale("en_US.819") == "iso-8859-1" + assert _python_encoding_from_locale("en_US.utf8") == "utf-8" + assert _python_encoding_from_locale("en_US.UTF-8") == "utf-8" + # Unknown / no codeset suffix: fall back to safe default + assert _python_encoding_from_locale("en_US") == "iso-8859-1" + assert _python_encoding_from_locale("") == "iso-8859-1" + + +def test_connection_exposes_python_encoding(conn_params: ConnParams) -> None: + """``conn.encoding`` reports the Python-side encoding for user data.""" + with _connect(conn_params) as conn: + assert conn.encoding == "iso-8859-1" + with _connect(conn_params, client_locale="en_US.utf8") as conn: + assert conn.encoding == "utf-8" + + +# -------- UTF-8 connections (require UTF-8 DB to fully validate) -------- + + +def test_utf8_locale_negotiation_works(conn_params: ConnParams) -> None: + """Connecting with ``client_locale='en_US.utf8'`` doesn't crash. + + The server handles transcoding when CLIENT_LOCALE differs from + DB_LOCALE for code points representable in both. ASCII obviously is. 
+ """ + with _connect(conn_params, client_locale="en_US.utf8") as conn: + cur = conn.cursor() + cur.execute("SELECT FIRST 1 tabname FROM systables") + row = cur.fetchone() + assert isinstance(row[0], str) + assert row[0] == "systables" + + +@pytest.fixture +def utf8_db_params(conn_params: ConnParams) -> Iterator[ConnParams]: + """Provide a UTF-8 DB connection if one's available; skip otherwise.""" + db_name = os.environ.get("IFX_UTF8_DATABASE") + if not db_name: + pytest.skip( + "UTF-8 database not available; set IFX_UTF8_DATABASE env var " + "to enable. Create with: CREATE DATABASE my_utf8db WITH LOG IN " + "rootdbs (after setting DB_LOCALE=en_US.utf8 in the env)." + ) + yield conn_params._replace(database=db_name) + + +def test_utf8_multibyte_round_trip(utf8_db_params: ConnParams) -> None: + """Multi-byte UTF-8 chars round-trip when both locale + DB are UTF-8.""" + samples = [ + "你好世界", # CJK + "مرحبا", # Arabic (RTL) + "ñoño 🎉", # Latin + emoji (4-byte UTF-8) + "Здравствуй", # Cyrillic + ] + with _connect(utf8_db_params, client_locale="en_US.utf8") as conn: + cur = conn.cursor() + cur.execute( + "CREATE TEMP TABLE p20_utf8 (id INT, s NVARCHAR(100))" + ) + for i, s in enumerate(samples): + cur.execute("INSERT INTO p20_utf8 VALUES (?, ?)", (i, s)) + cur.execute("SELECT id, s FROM p20_utf8 ORDER BY id") + rows = cur.fetchall() + assert [r[1] for r in rows] == samples + + +# -------- Negative tests: non-representable chars on 8859-1 DB -------- + + +def test_chinese_into_8859_1_db_raises_or_lossy( + conn_params: ConnParams, +) -> None: + """Storing CJK chars in an 8859-1 DB either raises cleanly or lossy-substitutes. + + On the default 8859-1 connection the driver now raises ``DataError`` + client-side. If the value does reach the server, some versions raise + -1820 ('character not in target codeset'); others silently replace + with '?'. Any outcome is acceptable — the test asserts the connection survives.
+ """ + with _connect(conn_params) as conn: + cur = conn.cursor() + cur.execute("CREATE TEMP TABLE p20_neg (s VARCHAR(50))") + with contextlib.suppress(informix_db.Error): + cur.execute("INSERT INTO p20_neg VALUES (?)", ("你好",)) + + # Connection survives whatever happened + cur.execute("SELECT 1 FROM systables WHERE tabid = 1") + assert cur.fetchone() == (1,) + + +# -------- Smart-LOB CLOB with locale -------- + + +def test_clob_round_trip_8859_1(conn_params: ConnParams) -> None: + """CLOB columns round-trip Latin-1 text through the SQ_FILE protocol.""" + text = "Lorem ipsum dolor sit amet, café résumé naïve" + text_bytes = text.encode("iso-8859-1") + + # Need a logged DB for CLOB + logged_params = conn_params._replace(database="testdb") + try: + conn = _connect(logged_params) + except informix_db.Error as e: + pytest.skip(f"logged DB unavailable: {e!r}") + try: + cur = conn.cursor() + with contextlib.suppress(Exception): + cur.execute("DROP TABLE p20_clob") + try: + cur.execute("CREATE TABLE p20_clob (id INT, txt CLOB)") + except informix_db.Error as e: + pytest.skip(f"sbspace unavailable: {e!r}") + try: + cur.write_blob_column( + "INSERT INTO p20_clob VALUES (?, BLOB_PLACEHOLDER)", + text_bytes, + (1,), + clob=True, + ) + got = cur.read_blob_column( + "SELECT txt FROM p20_clob WHERE id = ?", (1,) + ) + assert got == text_bytes + finally: + with contextlib.suppress(Exception): + cur.execute("DROP TABLE p20_clob") + finally: + conn.close() diff --git a/uv.lock b/uv.lock index 98f947a..23bbfc9 100644 --- a/uv.lock +++ b/uv.lock @@ -34,7 +34,7 @@ wheels = [ [[package]] name = "informix-db" -version = "2026.5.4.2" +version = "2026.5.4.3" source = { editable = "." } [package.optional-dependencies]