informix-db/tests/test_unicode.py
Ryan Malloy bea1a1cd0c Phase 20: UTF-8/multibyte locale support (2026.05.04.4)
Thread CLIENT_LOCALE through to user-data string codecs. Driver previously
hardcoded iso-8859-1 for all string conversions, which broke any locale
outside Western European code points.

* Connection.encoding property derived from client_locale via
  _python_encoding_from_locale (en_US.utf8 -> utf-8, en_US.8859-1 ->
  iso-8859-1, etc.)
* encode_param / decode / parse_tuple_payload accept an encoding
  parameter; cursor and fast-path call sites forward conn.encoding
* Smart-LOB CLOB encode/decode and TEXT decode honor connection encoding
* DataError raised for non-representable chars; cursor releases the
  prepared statement before propagating so connection state stays clean

Boundary discipline: protocol-level strings (cursor names, function
signatures, SQ_FILE fnames, error near-tokens, SQL text) stay
iso-8859-1 (always ASCII, never user-controlled).

9 new integration tests in tests/test_unicode.py covering ASCII
round-trip, Latin-1 high-bit, full byte range, locale-mapping,
encoding property, UTF-8 negotiation, multibyte (skipped without
IFX_UTF8_DATABASE), DataError on non-representable, CLOB round-trip.

Total: 69 unit + 212 integration = 281 tests.
2026-05-04 17:13:19 -06:00

244 lines
9.0 KiB
Python

"""Phase 20 integration tests — locale + multi-byte string handling.
The driver historically hardcoded ``iso-8859-1`` everywhere, which was
"the default and probably fine" but made multi-byte locales (UTF-8,
UCS-2) broken-by-design. This phase:
1. Threads the connection's ``client_locale`` through to the user-data
string codecs (CHAR / VARCHAR / NVCHAR / LVARCHAR / CLOB / TEXT).
2. Maps locale strings → Python encoding names via
:func:`informix_db._python_encoding_from_locale`.
3. Verifies round-trip integrity at multiple locale settings.
Protocol-level strings (cursor names, function signatures, error
"near tokens") stay iso-8859-1 — those are always ASCII and never
contain user-controlled bytes.
Caveat: many test scenarios depend on the *database's* DB_LOCALE,
which is set at CREATE DATABASE time. The dev container's testdb
was created with the default 8859-1 locale — so chars outside 8859-1
will fail server-side regardless of CLIENT_LOCALE. Tests for
multibyte UTF-8 storage are skipped unless a UTF-8 database is
available (env var IFX_UTF8_DATABASE).
"""
from __future__ import annotations
import contextlib
import os
from collections.abc import Iterator
import pytest
import informix_db
from tests.conftest import ConnParams
pytestmark = pytest.mark.integration
def _connect(params: ConnParams, **overrides) -> informix_db.Connection:
    """Open a connection built from *params*, with keyword overrides.

    Autocommit defaults to True so temp-table DDL in the tests does not
    require explicit transaction management.
    """
    settings = {
        "host": params.host,
        "port": params.port,
        "user": params.user,
        "password": params.password,
        "database": params.database,
        "server": params.server,
        "autocommit": True,
        **overrides,  # caller-supplied keys win (e.g. client_locale)
    }
    return informix_db.connect(**settings)
# -------- ISO-8859-1 (default) — chars 0..255 round-trip --------
def test_ascii_round_trip(conn_params: ConnParams) -> None:
    """Pure ASCII works (regression test)."""
    payload = "hello world"
    with _connect(conn_params) as conn:
        cur = conn.cursor()
        cur.execute("CREATE TEMP TABLE p20_ascii (s VARCHAR(50))")
        cur.execute("INSERT INTO p20_ascii VALUES (?)", (payload,))
        cur.execute("SELECT s FROM p20_ascii")
        assert cur.fetchone() == (payload,)
def test_iso8859_high_bit_round_trip(conn_params: ConnParams) -> None:
    """Latin-1 high-bit chars (128-255) round-trip on default locale."""
    samples = [
        "café",  # é = 0xE9
        "résumé",  # é = 0xE9
        "naïve",  # ï = 0xEF
        "Zürich",  # ü = 0xFC
        "señorita",  # ñ = 0xF1
        "©™®",  # 0xA9, trademark not in 8859-1, replaced
    ]
    # Keep only the strings whose every code point fits in 8859-1;
    # "©™®" is dropped because ™ is U+2122.
    latin_safe = [s for s in samples if max(map(ord, s)) <= 0xFF]
    with _connect(conn_params) as conn:
        cur = conn.cursor()
        cur.execute("CREATE TEMP TABLE p20_latin (id INT, s VARCHAR(50))")
        for idx, text in enumerate(latin_safe):
            cur.execute("INSERT INTO p20_latin VALUES (?, ?)", (idx, text))
        cur.execute("SELECT id, s FROM p20_latin ORDER BY id")
        assert [row[1] for row in cur.fetchall()] == latin_safe
def test_iso8859_full_byte_range(conn_params: ConnParams) -> None:
    """Each byte 0x20..0xFE round-trips through VARCHAR.
    0x00 is NUL (string terminator on the wire) and not allowed in
    VARCHAR. 0x1F and below are control chars; some servers reject.
    0xFF is sometimes treated specially in length-prefixed encodings.
    Using 0x20..0xFE keeps us in safe territory.
    """
    # 8859-1 decode is the identity mapping byte -> code point.
    chars = bytes(range(0x20, 0xFF)).decode("iso-8859-1")
    assert len(chars) == 0xFF - 0x20
    with _connect(conn_params) as conn:
        cur = conn.cursor()
        cur.execute("CREATE TEMP TABLE p20_full (s VARCHAR(255))")
        cur.execute("INSERT INTO p20_full VALUES (?)", (chars,))
        cur.execute("SELECT s FROM p20_full")
        (got,) = cur.fetchone()
        assert got == chars
# -------- Locale mapping --------
def test_locale_maps_to_python_encoding() -> None:
    """The locale → Python-encoding mapping handles common forms."""
    from informix_db.connections import _python_encoding_from_locale

    expected_by_locale = {
        "en_US.8859-1": "iso-8859-1",
        "en_US.819": "iso-8859-1",
        "en_US.utf8": "utf-8",
        "en_US.UTF-8": "utf-8",
        # Unknown / no codeset suffix: fall back to safe default
        "en_US": "iso-8859-1",
        "": "iso-8859-1",
    }
    for locale, encoding in expected_by_locale.items():
        assert _python_encoding_from_locale(locale) == encoding
def test_connection_exposes_python_encoding(conn_params: ConnParams) -> None:
    """``conn.encoding`` reports the Python-side encoding for user data."""
    # Default locale -> Latin-1; explicit UTF-8 locale -> utf-8.
    with _connect(conn_params) as conn:
        assert conn.encoding == "iso-8859-1"
    with _connect(conn_params, client_locale="en_US.utf8") as conn:
        assert conn.encoding == "utf-8"
# -------- UTF-8 connections (require UTF-8 DB to fully validate) --------
def test_utf8_locale_negotiation_works(conn_params: ConnParams) -> None:
    """Connecting with ``client_locale='en_US.utf8'`` doesn't crash.
    The server handles transcoding when CLIENT_LOCALE differs from
    DB_LOCALE for code points representable in both. ASCII obviously is.
    """
    with _connect(conn_params, client_locale="en_US.utf8") as conn:
        cur = conn.cursor()
        cur.execute("SELECT FIRST 1 tabname FROM systables")
        (name,) = cur.fetchone()
        assert isinstance(name, str)
        assert name == "systables"
@pytest.fixture
def utf8_db_params(conn_params: ConnParams) -> Iterator[ConnParams]:
    """Provide a UTF-8 DB connection if one's available; skip otherwise."""
    db_name = os.environ.get("IFX_UTF8_DATABASE")
    if db_name:
        yield conn_params._replace(database=db_name)
        return
    pytest.skip(
        "UTF-8 database not available; set IFX_UTF8_DATABASE env var "
        "to enable. Create with: CREATE DATABASE my_utf8db WITH LOG IN "
        "rootdbs (after setting DB_LOCALE=en_US.utf8 in the env)."
    )
def test_utf8_multibyte_round_trip(utf8_db_params: ConnParams) -> None:
    """Multi-byte UTF-8 chars round-trip when both locale + DB are UTF-8."""
    samples = [
        "你好世界",  # CJK
        "مرحبا",  # Arabic (RTL)
        "ñoño 🎉",  # Latin + emoji (4-byte UTF-8)
        "Здравствуй",  # Cyrillic
    ]
    with _connect(utf8_db_params, client_locale="en_US.utf8") as conn:
        cur = conn.cursor()
        # NVARCHAR so collation follows DB_LOCALE.
        cur.execute(
            "CREATE TEMP TABLE p20_utf8 (id INT, s NVARCHAR(100))"
        )
        for row_id, text in enumerate(samples):
            cur.execute("INSERT INTO p20_utf8 VALUES (?, ?)", (row_id, text))
        cur.execute("SELECT id, s FROM p20_utf8 ORDER BY id")
        assert [row[1] for row in cur.fetchall()] == samples
# -------- Negative tests: non-representable chars on 8859-1 DB --------
def test_chinese_into_8859_1_db_raises_or_lossy(
    conn_params: ConnParams,
) -> None:
    """Storing CJK chars in an 8859-1 DB either raises cleanly or lossy-substitutes.
    The exact behavior depends on the server's transcoding: some
    versions raise -1820 ('character not in target codeset'); others
    silently replace with '?'. Either is acceptable — the test asserts
    the connection survives.
    """
    with _connect(conn_params) as conn:
        cur = conn.cursor()
        cur.execute("CREATE TEMP TABLE p20_neg (s VARCHAR(50))")
        try:
            cur.execute("INSERT INTO p20_neg VALUES (?)", ("你好",))
        except informix_db.Error:
            pass  # clean raise is one of the two acceptable outcomes
        # Connection survives whatever happened
        cur.execute("SELECT 1 FROM systables WHERE tabid = 1")
        assert cur.fetchone() == (1,)
# -------- Smart-LOB CLOB with locale --------
def test_clob_round_trip_8859_1(conn_params: ConnParams) -> None:
    """CLOB columns round-trip Latin-1 text through the SQ_FILE protocol."""
    text = "Lorem ipsum dolor sit amet, café résumé naïve"
    text_bytes = text.encode("iso-8859-1")
    # Need a logged DB for CLOB
    logged_params = conn_params._replace(database="testdb")
    try:
        conn = _connect(logged_params)
    except informix_db.Error as e:
        pytest.skip(f"logged DB unavailable: {e!r}")
    # closing() guarantees conn.close() on every exit path, including
    # pytest.skip raised below.
    with contextlib.closing(conn):
        cur = conn.cursor()
        # Permanent table (CLOB can't live in a temp table); clear leftovers.
        with contextlib.suppress(Exception):
            cur.execute("DROP TABLE p20_clob")
        try:
            cur.execute("CREATE TABLE p20_clob (id INT, txt CLOB)")
        except informix_db.Error as e:
            pytest.skip(f"sbspace unavailable: {e!r}")
        try:
            cur.write_blob_column(
                "INSERT INTO p20_clob VALUES (?, BLOB_PLACEHOLDER)",
                text_bytes,
                (1,),
                clob=True,
            )
            got = cur.read_blob_column(
                "SELECT txt FROM p20_clob WHERE id = ?", (1,)
            )
            assert got == text_bytes
        finally:
            # Best-effort cleanup so reruns start from a clean slate.
            with contextlib.suppress(Exception):
                cur.execute("DROP TABLE p20_clob")