"""Phase 20 integration tests — locale + multi-byte string handling. The driver historically hardcoded ``iso-8859-1`` everywhere, which was "the default and probably fine" but made multi-byte locales (UTF-8, UCS-2) broken-by-design. This phase: 1. Threads the connection's ``client_locale`` through to the user-data string codecs (CHAR / VARCHAR / NVCHAR / LVARCHAR / CLOB / TEXT). 2. Maps locale strings → Python encoding names via :func:`informix_db._python_encoding_from_locale`. 3. Verifies round-trip integrity at multiple locale settings. Protocol-level strings (cursor names, function signatures, error "near tokens") stay iso-8859-1 — those are always ASCII and never contain user-controlled bytes. Caveat: many test scenarios depend on the *database's* DB_LOCALE, which is set at CREATE DATABASE time. The dev container's testdb was created with the default 8859-1 locale — so chars outside 8859-1 will fail server-side regardless of CLIENT_LOCALE. Tests for multibyte UTF-8 storage are skipped unless a UTF-8 database is available (env var IFX_UTF8_DATABASE). """ from __future__ import annotations import contextlib import os from collections.abc import Iterator import pytest import informix_db from tests.conftest import ConnParams pytestmark = pytest.mark.integration def _connect(params: ConnParams, **overrides) -> informix_db.Connection: kwargs = { "host": params.host, "port": params.port, "user": params.user, "password": params.password, "database": params.database, "server": params.server, "autocommit": True, } kwargs.update(overrides) return informix_db.connect(**kwargs) # -------- ISO-8859-1 (default) — chars 0..255 round-trip -------- def test_ascii_round_trip(conn_params: ConnParams) -> None: """Pure ASCII works (regression test).""" with _connect(conn_params) as conn: cur = conn.cursor() cur.execute("CREATE TEMP TABLE p20_ascii (s VARCHAR(50))") cur.execute("INSERT INTO p20_ascii VALUES (?)", ("hello world",)) cur.execute("SELECT s FROM p20_ascii") assert cur.fetchone() == ("hello world",) def test_iso8859_high_bit_round_trip(conn_params: ConnParams) -> None: """Latin-1 high-bit chars (128-255) round-trip on default locale.""" samples = [ "café", # é = 0xE9 "résumé", # é = 0xE9 "naïve", # ï = 0xEF "Zürich", # ü = 0xFC "señorita", # ñ = 0xF1 "©™®", # 0xA9, trademark not in 8859-1, replaced ] with _connect(conn_params) as conn: cur = conn.cursor() cur.execute("CREATE TEMP TABLE p20_latin (id INT, s VARCHAR(50))") # Filter to chars that ARE in 8859-1 latin_safe = [s for s in samples if all(ord(c) <= 0xFF for c in s)] for i, s in enumerate(latin_safe): cur.execute("INSERT INTO p20_latin VALUES (?, ?)", (i, s)) cur.execute("SELECT id, s FROM p20_latin ORDER BY id") rows = cur.fetchall() assert [r[1] for r in rows] == latin_safe def test_iso8859_full_byte_range(conn_params: ConnParams) -> None: """Each byte 0x20..0xFE round-trips through VARCHAR. 0x00 is NUL (string terminator on the wire) and not allowed in VARCHAR. 0x1F and below are control chars; some servers reject. 0xFF is sometimes treated specially in length-prefixed encodings. Using 0x20..0xFE keeps us in safe territory. 
""" chars = bytes(range(0x20, 0xFF)).decode("iso-8859-1") assert len(chars) == 0xFF - 0x20 with _connect(conn_params) as conn: cur = conn.cursor() cur.execute("CREATE TEMP TABLE p20_full (s VARCHAR(255))") cur.execute("INSERT INTO p20_full VALUES (?)", (chars,)) cur.execute("SELECT s FROM p20_full") (got,) = cur.fetchone() assert got == chars # -------- Locale mapping -------- def test_locale_maps_to_python_encoding() -> None: """The locale → Python-encoding mapping handles common forms.""" from informix_db.connections import _python_encoding_from_locale assert _python_encoding_from_locale("en_US.8859-1") == "iso-8859-1" assert _python_encoding_from_locale("en_US.819") == "iso-8859-1" assert _python_encoding_from_locale("en_US.utf8") == "utf-8" assert _python_encoding_from_locale("en_US.UTF-8") == "utf-8" # Unknown / no codeset suffix: fall back to safe default assert _python_encoding_from_locale("en_US") == "iso-8859-1" assert _python_encoding_from_locale("") == "iso-8859-1" def test_connection_exposes_python_encoding(conn_params: ConnParams) -> None: """``conn.encoding`` reports the Python-side encoding for user data.""" with _connect(conn_params) as conn: assert conn.encoding == "iso-8859-1" with _connect(conn_params, client_locale="en_US.utf8") as conn: assert conn.encoding == "utf-8" # -------- UTF-8 connections (require UTF-8 DB to fully validate) -------- def test_utf8_locale_negotiation_works(conn_params: ConnParams) -> None: """Connecting with ``client_locale='en_US.utf8'`` doesn't crash. The server handles transcoding when CLIENT_LOCALE differs from DB_LOCALE for code points representable in both. ASCII obviously is. """ with _connect(conn_params, client_locale="en_US.utf8") as conn: cur = conn.cursor() cur.execute("SELECT FIRST 1 tabname FROM systables") row = cur.fetchone() assert isinstance(row[0], str) assert row[0] == "systables" @pytest.fixture def utf8_db_params(conn_params: ConnParams) -> Iterator[ConnParams]: """Provide a UTF-8 DB connection if one's available; skip otherwise.""" db_name = os.environ.get("IFX_UTF8_DATABASE") if not db_name: pytest.skip( "UTF-8 database not available; set IFX_UTF8_DATABASE env var " "to enable. Create with: CREATE DATABASE my_utf8db WITH LOG IN " "rootdbs (after setting DB_LOCALE=en_US.utf8 in the env)." ) yield conn_params._replace(database=db_name) def test_utf8_multibyte_round_trip(utf8_db_params: ConnParams) -> None: """Multi-byte UTF-8 chars round-trip when both locale + DB are UTF-8.""" samples = [ "你好世界", # CJK "مرحبا", # Arabic (RTL) "ñoño 🎉", # Latin + emoji (4-byte UTF-8) "Здравствуй", # Cyrillic ] with _connect(utf8_db_params, client_locale="en_US.utf8") as conn: cur = conn.cursor() cur.execute( "CREATE TEMP TABLE p20_utf8 (id INT, s NVARCHAR(100))" ) for i, s in enumerate(samples): cur.execute("INSERT INTO p20_utf8 VALUES (?, ?)", (i, s)) cur.execute("SELECT id, s FROM p20_utf8 ORDER BY id") rows = cur.fetchall() assert [r[1] for r in rows] == samples # -------- Negative tests: non-representable chars on 8859-1 DB -------- def test_chinese_into_8859_1_db_raises_or_lossy( conn_params: ConnParams, ) -> None: """Storing CJK chars in an 8859-1 DB either raises cleanly or lossy-substitutes. The exact behavior depends on the server's transcoding: some versions raise -1820 ('character not in target codeset'); others silently replace with '?'. Either is acceptable — the test asserts the connection survives. 
""" with _connect(conn_params) as conn: cur = conn.cursor() cur.execute("CREATE TEMP TABLE p20_neg (s VARCHAR(50))") with contextlib.suppress(informix_db.Error): cur.execute("INSERT INTO p20_neg VALUES (?)", ("你好",)) # Connection survives whatever happened cur.execute("SELECT 1 FROM systables WHERE tabid = 1") assert cur.fetchone() == (1,) # -------- Smart-LOB CLOB with locale -------- def test_clob_round_trip_8859_1(conn_params: ConnParams) -> None: """CLOB columns round-trip Latin-1 text through the SQ_FILE protocol.""" text = "Lorem ipsum dolor sit amet, café résumé naïve" text_bytes = text.encode("iso-8859-1") # Need a logged DB for CLOB logged_params = conn_params._replace(database="testdb") try: conn = _connect(logged_params) except informix_db.Error as e: pytest.skip(f"logged DB unavailable: {e!r}") try: cur = conn.cursor() with contextlib.suppress(Exception): cur.execute("DROP TABLE p20_clob") try: cur.execute("CREATE TABLE p20_clob (id INT, txt CLOB)") except informix_db.Error as e: pytest.skip(f"sbspace unavailable: {e!r}") try: cur.write_blob_column( "INSERT INTO p20_clob VALUES (?, BLOB_PLACEHOLDER)", text_bytes, (1,), clob=True, ) got = cur.read_blob_column( "SELECT txt FROM p20_clob WHERE id = ?", (1,) ) assert got == text_bytes finally: with contextlib.suppress(Exception): cur.execute("DROP TABLE p20_clob") finally: conn.close()