informix-db/tests/test_unicode.py
Ryan Malloy bea1a1cd0c Phase 20: UTF-8/multibyte locale support (2026.05.04.4)
Thread CLIENT_LOCALE through to user-data string codecs. Driver previously
hardcoded iso-8859-1 for all string conversions, which broke any locale
outside Western European code points.

* Connection.encoding property derived from client_locale via
  _python_encoding_from_locale (en_US.utf8 -> utf-8, en_US.8859-1 ->
  iso-8859-1, etc.)
* encode_param / decode / parse_tuple_payload accept an encoding
  parameter; cursor and fast-path call sites forward conn.encoding
* Smart-LOB CLOB encode/decode and TEXT decode honor connection encoding
* DataError raised for non-representable chars; cursor releases the
  prepared statement before propagating so connection state stays clean

Boundary discipline: protocol-level strings (cursor names, function
signatures, SQ_FILE fnames, error near-tokens, SQL text) stay
iso-8859-1 (always ASCII, never user-controlled).

9 new integration tests in tests/test_unicode.py covering ASCII
round-trip, Latin-1 high-bit, full byte range, locale-mapping,
encoding property, UTF-8 negotiation, multibyte (skipped without
IFX_UTF8_DATABASE), DataError on non-representable, CLOB round-trip.

Total: 69 unit + 212 integration = 281 tests.
2026-05-04 17:13:19 -06:00

244 lines
9.0 KiB
Python

"""Phase 20 integration tests — locale + multi-byte string handling.
The driver historically hardcoded ``iso-8859-1`` everywhere, which was
"the default and probably fine" but made multi-byte locales (UTF-8,
UCS-2) broken-by-design. This phase:
1. Threads the connection's ``client_locale`` through to the user-data
string codecs (CHAR / VARCHAR / NVCHAR / LVARCHAR / CLOB / TEXT).
2. Maps locale strings → Python encoding names via
:func:`informix_db._python_encoding_from_locale`.
3. Verifies round-trip integrity at multiple locale settings.
Protocol-level strings (cursor names, function signatures, error
"near tokens") stay iso-8859-1 — those are always ASCII and never
contain user-controlled bytes.
Caveat: many test scenarios depend on the *database's* DB_LOCALE,
which is set at CREATE DATABASE time. The dev container's testdb
was created with the default 8859-1 locale — so chars outside 8859-1
will fail server-side regardless of CLIENT_LOCALE. Tests for
multibyte UTF-8 storage are skipped unless a UTF-8 database is
available (env var IFX_UTF8_DATABASE).
"""
from __future__ import annotations
import contextlib
import os
from collections.abc import Iterator
import pytest
import informix_db
from tests.conftest import ConnParams
pytestmark = pytest.mark.integration
def _connect(params: ConnParams, **overrides) -> informix_db.Connection:
    """Open a connection built from *params*, with keyword overrides.

    Autocommit defaults to True so temp-table DDL in the tests does not
    require explicit transaction management.
    """
    settings = {
        "host": params.host,
        "port": params.port,
        "user": params.user,
        "password": params.password,
        "database": params.database,
        "server": params.server,
        "autocommit": True,
        **overrides,  # caller-supplied keys win (e.g. client_locale)
    }
    return informix_db.connect(**settings)
# -------- ISO-8859-1 (default) — chars 0..255 round-trip --------
def test_ascii_round_trip(conn_params: ConnParams) -> None:
    """Pure ASCII works (regression test)."""
    payload = "hello world"
    with _connect(conn_params) as conn:
        cur = conn.cursor()
        cur.execute("CREATE TEMP TABLE p20_ascii (s VARCHAR(50))")
        cur.execute("INSERT INTO p20_ascii VALUES (?)", (payload,))
        cur.execute("SELECT s FROM p20_ascii")
        assert cur.fetchone() == (payload,)
def test_iso8859_high_bit_round_trip(conn_params: ConnParams) -> None:
    """Latin-1 high-bit chars (128-255) round-trip on default locale."""
    samples = [
        "café",  # é = 0xE9
        "résumé",  # é = 0xE9
        "naïve",  # ï = 0xEF
        "Zürich",  # ü = 0xFC
        "señorita",  # ñ = 0xF1
        "©™®",  # 0xA9, trademark not in 8859-1, replaced
    ]
    # Keep only the strings whose every code point fits in 8859-1;
    # "©™®" is dropped because ™ is U+2122.
    latin_safe = [s for s in samples if max(map(ord, s)) <= 0xFF]
    with _connect(conn_params) as conn:
        cur = conn.cursor()
        cur.execute("CREATE TEMP TABLE p20_latin (id INT, s VARCHAR(50))")
        for idx, text in enumerate(latin_safe):
            cur.execute("INSERT INTO p20_latin VALUES (?, ?)", (idx, text))
        cur.execute("SELECT id, s FROM p20_latin ORDER BY id")
        assert [row[1] for row in cur.fetchall()] == latin_safe
def test_iso8859_full_byte_range(conn_params: ConnParams) -> None:
    """Each byte 0x20..0xFE round-trips through VARCHAR.
    0x00 is NUL (string terminator on the wire) and not allowed in
    VARCHAR. 0x1F and below are control chars; some servers reject.
    0xFF is sometimes treated specially in length-prefixed encodings.
    Using 0x20..0xFE keeps us in safe territory.
    """
    # 8859-1 decode is the identity mapping byte -> code point.
    chars = bytes(range(0x20, 0xFF)).decode("iso-8859-1")
    assert len(chars) == 0xFF - 0x20
    with _connect(conn_params) as conn:
        cur = conn.cursor()
        cur.execute("CREATE TEMP TABLE p20_full (s VARCHAR(255))")
        cur.execute("INSERT INTO p20_full VALUES (?)", (chars,))
        cur.execute("SELECT s FROM p20_full")
        (got,) = cur.fetchone()
        assert got == chars
# -------- Locale mapping --------
def test_locale_maps_to_python_encoding() -> None:
    """The locale → Python-encoding mapping handles common forms."""
    from informix_db.connections import _python_encoding_from_locale

    expected_by_locale = {
        "en_US.8859-1": "iso-8859-1",
        "en_US.819": "iso-8859-1",
        "en_US.utf8": "utf-8",
        "en_US.UTF-8": "utf-8",
        # Unknown / no codeset suffix: fall back to safe default
        "en_US": "iso-8859-1",
        "": "iso-8859-1",
    }
    for locale, encoding in expected_by_locale.items():
        assert _python_encoding_from_locale(locale) == encoding
def test_connection_exposes_python_encoding(conn_params: ConnParams) -> None:
    """``conn.encoding`` reports the Python-side encoding for user data."""
    # Default locale -> Latin-1; explicit UTF-8 locale -> utf-8.
    with _connect(conn_params) as conn:
        assert conn.encoding == "iso-8859-1"
    with _connect(conn_params, client_locale="en_US.utf8") as conn:
        assert conn.encoding == "utf-8"
# -------- UTF-8 connections (require UTF-8 DB to fully validate) --------
def test_utf8_locale_negotiation_works(conn_params: ConnParams) -> None:
    """Connecting with ``client_locale='en_US.utf8'`` doesn't crash.
    The server handles transcoding when CLIENT_LOCALE differs from
    DB_LOCALE for code points representable in both. ASCII obviously is.
    """
    with _connect(conn_params, client_locale="en_US.utf8") as conn:
        cur = conn.cursor()
        cur.execute("SELECT FIRST 1 tabname FROM systables")
        (name,) = cur.fetchone()
        assert isinstance(name, str)
        assert name == "systables"
@pytest.fixture
def utf8_db_params(conn_params: ConnParams) -> Iterator[ConnParams]:
    """Provide a UTF-8 DB connection if one's available; skip otherwise."""
    db_name = os.environ.get("IFX_UTF8_DATABASE")
    if db_name:
        yield conn_params._replace(database=db_name)
        return
    pytest.skip(
        "UTF-8 database not available; set IFX_UTF8_DATABASE env var "
        "to enable. Create with: CREATE DATABASE my_utf8db WITH LOG IN "
        "rootdbs (after setting DB_LOCALE=en_US.utf8 in the env)."
    )
def test_utf8_multibyte_round_trip(utf8_db_params: ConnParams) -> None:
    """Multi-byte UTF-8 chars round-trip when both locale + DB are UTF-8."""
    samples = [
        "你好世界",  # CJK
        "مرحبا",  # Arabic (RTL)
        "ñoño 🎉",  # Latin + emoji (4-byte UTF-8)
        "Здравствуй",  # Cyrillic
    ]
    with _connect(utf8_db_params, client_locale="en_US.utf8") as conn:
        cur = conn.cursor()
        # NVARCHAR so collation follows DB_LOCALE.
        cur.execute(
            "CREATE TEMP TABLE p20_utf8 (id INT, s NVARCHAR(100))"
        )
        for row_id, text in enumerate(samples):
            cur.execute("INSERT INTO p20_utf8 VALUES (?, ?)", (row_id, text))
        cur.execute("SELECT id, s FROM p20_utf8 ORDER BY id")
        assert [row[1] for row in cur.fetchall()] == samples
# -------- Negative tests: non-representable chars on 8859-1 DB --------
def test_chinese_into_8859_1_db_raises_or_lossy(
    conn_params: ConnParams,
) -> None:
    """Storing CJK chars in an 8859-1 DB either raises cleanly or lossy-substitutes.
    The exact behavior depends on the server's transcoding: some
    versions raise -1820 ('character not in target codeset'); others
    silently replace with '?'. Either is acceptable — the test asserts
    the connection survives.
    """
    with _connect(conn_params) as conn:
        cur = conn.cursor()
        cur.execute("CREATE TEMP TABLE p20_neg (s VARCHAR(50))")
        try:
            cur.execute("INSERT INTO p20_neg VALUES (?)", ("你好",))
        except informix_db.Error:
            pass  # clean raise is one of the two acceptable outcomes
        # Connection survives whatever happened
        cur.execute("SELECT 1 FROM systables WHERE tabid = 1")
        assert cur.fetchone() == (1,)
# -------- Smart-LOB CLOB with locale --------
def test_clob_round_trip_8859_1(conn_params: ConnParams) -> None:
    """CLOB columns round-trip Latin-1 text through the SQ_FILE protocol."""
    text = "Lorem ipsum dolor sit amet, café résumé naïve"
    text_bytes = text.encode("iso-8859-1")
    # Need a logged DB for CLOB
    logged_params = conn_params._replace(database="testdb")
    try:
        conn = _connect(logged_params)
    except informix_db.Error as e:
        pytest.skip(f"logged DB unavailable: {e!r}")
    # closing() guarantees conn.close() on every exit path, including
    # pytest.skip raised below.
    with contextlib.closing(conn):
        cur = conn.cursor()
        # Permanent table (CLOB can't live in a temp table); clear leftovers.
        with contextlib.suppress(Exception):
            cur.execute("DROP TABLE p20_clob")
        try:
            cur.execute("CREATE TABLE p20_clob (id INT, txt CLOB)")
        except informix_db.Error as e:
            pytest.skip(f"sbspace unavailable: {e!r}")
        try:
            cur.write_blob_column(
                "INSERT INTO p20_clob VALUES (?, BLOB_PLACEHOLDER)",
                text_bytes,
                (1,),
                clob=True,
            )
            got = cur.read_blob_column(
                "SELECT txt FROM p20_clob WHERE id = ?", (1,)
            )
            assert got == text_bytes
        finally:
            # Best-effort cleanup so reruns start from a clean slate.
            with contextlib.suppress(Exception):
                cur.execute("DROP TABLE p20_clob")