From 2b461e1a7548dbdcc784c8f4e7abece662d65aa7 Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Thu, 4 Jun 2026 11:02:02 +0100 Subject: [PATCH 1/6] add/remove language codes and refuse to publish books with invalid language codes --- backend/pyproject.toml | 1 + backend/src/cms_backend/mill/__init__.py | 18 ++++ backend/src/cms_backend/mill/context.py | 30 ++++++- .../src/cms_backend/mill/processors/book.py | 9 ++ .../processors/test_zimfarm_notification.py | 88 +++++++++++++++++++ 5 files changed, 145 insertions(+), 1 deletion(-) diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 97f33087..8c805217 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -26,6 +26,7 @@ dependencies = [ "humanfriendly == 10.0", "Werkzeug == 3.1.5", "xxhash == 3.7.0", + "pycountry == 26.2.16", ] dynamic = ["version"] diff --git a/backend/src/cms_backend/mill/__init__.py b/backend/src/cms_backend/mill/__init__.py index e69de29b..808b84b2 100644 --- a/backend/src/cms_backend/mill/__init__.py +++ b/backend/src/cms_backend/mill/__init__.py @@ -0,0 +1,18 @@ +import pycountry + +from cms_backend import logger +from cms_backend.mill.context import Context as MillContext + + +def update_language_codes(): + for code in MillContext.disallowed_language_codes: + try: + pycountry.languages.remove_entry(alpha_3=code) + except Exception as exc: + logger.warning(f"failed to remove language code '{code}': {exc}") + + for code in MillContext.custom_language_codes: + pycountry.languages.add_entry(alpha_3=code) + + +update_language_codes() diff --git a/backend/src/cms_backend/mill/context.py b/backend/src/cms_backend/mill/context.py index 866a1142..acd85c26 100644 --- a/backend/src/cms_backend/mill/context.py +++ b/backend/src/cms_backend/mill/context.py @@ -1,13 +1,33 @@ import os from dataclasses import dataclass from datetime import timedelta -from typing import TypeVar +from typing import ClassVar, TypeVar +import pycountry from humanfriendly import parse_timespan T = 
TypeVar("T") +def _parse_custom_language_codes(language_code: str | None) -> list[str]: + """Transform the env language codes (comma-seperated) into a list.""" + if language_code is None: + return [] + + codes = language_code.split(",") + for code in codes: + if len(code) != 3: # noqa: PLR2004 + raise ValueError(f"Custom code '{code}' must be 3 characters long.") + return codes + + +def _validate_language_codes(language_codes: list[str]) -> list[str]: + for code in language_codes: + if pycountry.languages.get(alpha_3=code) is None: + raise ValueError(f"Code '{code}' is not a valid ISO 639-3 code.") + return language_codes + + @dataclass(kw_only=True) class Context: """Class holding every contextual / configuration bits which can be moved @@ -37,3 +57,11 @@ class Context: os.getenv("PROCESS_RETENTION_RULES_INTERVAL", default="1d") ) ) + + # Comma-seperated list of custom iso639-3 language codes + custom_language_codes: ClassVar[list[str]] = _parse_custom_language_codes( + os.getenv("CUSTOM_LANGUAGE_CODES") + ) + disallowed_language_codes: ClassVar[list[str]] = _validate_language_codes( + _parse_custom_language_codes(os.getenv("DISALLOWED_LANGUAGE_CODES")) + ) diff --git a/backend/src/cms_backend/mill/processors/book.py b/backend/src/cms_backend/mill/processors/book.py index 826cbacc..1df55704 100644 --- a/backend/src/cms_backend/mill/processors/book.py +++ b/backend/src/cms_backend/mill/processors/book.py @@ -1,3 +1,4 @@ +import pycountry from sqlalchemy.orm import Session as ORMSession from cms_backend import logger @@ -46,6 +47,14 @@ def check_book_zim_spec(book: Book) -> bool: book.has_error = True return False + language_code = book.zim_metadata["Language"] + if pycountry.languages.get(alpha_3=language_code) is None: + book.events.append( + f"{getnow()}: book has unknown language code {language_code}" + ) + book.has_error = True + return False + book.events.append(f"{getnow()}: book passed ZIM specification checks") return True diff --git 
a/backend/tests/mill/processors/test_zimfarm_notification.py b/backend/tests/mill/processors/test_zimfarm_notification.py index cfd4e42e..4dc3fbdb 100644 --- a/backend/tests/mill/processors/test_zimfarm_notification.py +++ b/backend/tests/mill/processors/test_zimfarm_notification.py @@ -9,6 +9,7 @@ from pathlib import Path from uuid import UUID +from pytest import MonkeyPatch from sqlalchemy.orm import Session as OrmSession from cms_backend.context import Context @@ -20,6 +21,7 @@ Warehouse, ZimfarmNotification, ) +from cms_backend.mill import update_language_codes from cms_backend.mill.processors.zimfarm_notification import process_notification VALID_NOTIFICATION_CONTENT = { @@ -241,6 +243,92 @@ def test_missing_metadata_sets_error_flag( assert book.needs_file_operation is False +class TestValidNotificationWithDifferentLanguageMetadata: + """Test valid notifications with different variations of supported languages""" + + def test_unknown_language_code_sets_error_flag( + self, + dbsession: OrmSession, + warehouse: Warehouse, # noqa: ARG002 + create_zimfarm_notification: Callable[..., ZimfarmNotification], + ): + """Valid notification but unknown langauge code → book marked with error.""" + content = VALID_NOTIFICATION_CONTENT.copy() + content["metadata"]["Language"] = "xyz" + notification = create_zimfarm_notification(content=content) + dbsession.flush() + + process_notification(dbsession, notification) + + assert notification.status == "processed" + + book = dbsession.query(Book).filter_by(id=notification.id).first() + assert book is not None + assert book.location_kind == "quarantine" + assert book.has_error is True + assert any("book has unknown language code" in event for event in book.events) + assert book.needs_processing is False + assert book.needs_file_operation is False + + def test_invalid_supported_language_code( + self, + dbsession: OrmSession, + monkeypatch: MonkeyPatch, + warehouse: Warehouse, # noqa: ARG002 + create_zimfarm_notification: 
Callable[..., ZimfarmNotification], + ): + """Test that invalid language code that is supported passes checks""" + content = VALID_NOTIFICATION_CONTENT.copy() + content["metadata"]["Language"] = "xyz" + monkeypatch.setattr( + "cms_backend.mill.context.Context.custom_language_codes", ["xyz"] + ) + update_language_codes() + notification = create_zimfarm_notification(content=content) + dbsession.flush() + + process_notification(dbsession, notification) + + assert notification.status == "processed" + + book = dbsession.query(Book).filter_by(id=notification.id).first() + assert book is not None + assert book.location_kind == "quarantine" + assert book.has_error is False + assert any( + "book has unknown language code" not in event for event in book.events + ) + + def test_valid_language_code_but_dislallowed_sets_error_flag( + self, + dbsession: OrmSession, + monkeypatch: MonkeyPatch, + warehouse: Warehouse, # noqa: ARG002 + create_zimfarm_notification: Callable[..., ZimfarmNotification], + ): + """Valid language code that is disallowed → book marked with error.""" + content = VALID_NOTIFICATION_CONTENT.copy() + content["metadata"]["Language"] = "fra" + monkeypatch.setattr( + "cms_backend.mill.context.Context.disallowed_language_codes", ["fra"] + ) + update_language_codes() + notification = create_zimfarm_notification(content=content) + dbsession.flush() + + process_notification(dbsession, notification) + + assert notification.status == "processed" + + book = dbsession.query(Book).filter_by(id=notification.id).first() + assert book is not None + assert book.location_kind == "quarantine" + assert book.has_error is True + assert any("book has unknown language code" in event for event in book.events) + assert book.needs_processing is False + assert book.needs_file_operation is False + + class TestValidNotificationWithMatchingTitleUnstableMaturity: """Test valid notifications that match an existing title with unstable maturity. 
From edb3f168a2057c970d31dae1bb9246c970e4bc09 Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Thu, 4 Jun 2026 11:49:13 +0100 Subject: [PATCH 2/6] restore original languages in fixtures --- backend/src/cms_backend/mill/__init__.py | 4 +-- backend/src/cms_backend/mill/context.py | 2 +- .../src/cms_backend/mill/processors/book.py | 2 +- .../processors/test_zimfarm_notification.py | 27 ++++++++++++++++++- 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/backend/src/cms_backend/mill/__init__.py b/backend/src/cms_backend/mill/__init__.py index 808b84b2..6f734a87 100644 --- a/backend/src/cms_backend/mill/__init__.py +++ b/backend/src/cms_backend/mill/__init__.py @@ -7,12 +7,12 @@ def update_language_codes(): for code in MillContext.disallowed_language_codes: try: - pycountry.languages.remove_entry(alpha_3=code) + pycountry.languages.remove_entry(alpha_3=code) # pyright: ignore[reportUnknownMemberType] except Exception as exc: logger.warning(f"failed to remove language code '{code}': {exc}") for code in MillContext.custom_language_codes: - pycountry.languages.add_entry(alpha_3=code) + pycountry.languages.add_entry(alpha_3=code) # pyright: ignore[reportUnknownMemberType] update_language_codes() diff --git a/backend/src/cms_backend/mill/context.py b/backend/src/cms_backend/mill/context.py index acd85c26..c2588c1b 100644 --- a/backend/src/cms_backend/mill/context.py +++ b/backend/src/cms_backend/mill/context.py @@ -23,7 +23,7 @@ def _parse_custom_language_codes(language_code: str | None) -> list[str]: def _validate_language_codes(language_codes: list[str]) -> list[str]: for code in language_codes: - if pycountry.languages.get(alpha_3=code) is None: + if pycountry.languages.get(alpha_3=code) is None: # pyright: ignore[reportUnknownMemberType] raise ValueError(f"Code '{code}' is not a valid ISO 639-3 code.") return language_codes diff --git a/backend/src/cms_backend/mill/processors/book.py b/backend/src/cms_backend/mill/processors/book.py index 
1df55704..37570da5 100644 --- a/backend/src/cms_backend/mill/processors/book.py +++ b/backend/src/cms_backend/mill/processors/book.py @@ -48,7 +48,7 @@ def check_book_zim_spec(book: Book) -> bool: return False language_code = book.zim_metadata["Language"] - if pycountry.languages.get(alpha_3=language_code) is None: + if pycountry.languages.get(alpha_3=language_code) is None: # pyright: ignore[reportUnknownMemberType] book.events.append( f"{getnow()}: book has unknown language code {language_code}" ) diff --git a/backend/tests/mill/processors/test_zimfarm_notification.py b/backend/tests/mill/processors/test_zimfarm_notification.py index 4dc3fbdb..7acf49e2 100644 --- a/backend/tests/mill/processors/test_zimfarm_notification.py +++ b/backend/tests/mill/processors/test_zimfarm_notification.py @@ -7,8 +7,11 @@ from collections.abc import Callable from pathlib import Path +from typing import Any from uuid import UUID +import pycountry +import pytest from pytest import MonkeyPatch from sqlalchemy.orm import Session as OrmSession @@ -24,7 +27,7 @@ from cms_backend.mill import update_language_codes from cms_backend.mill.processors.zimfarm_notification import process_notification -VALID_NOTIFICATION_CONTENT = { +VALID_NOTIFICATION_CONTENT: dict[str, Any] = { "article_count": 1000, "media_count": 500, "size": 1000000, @@ -48,6 +51,28 @@ } +@pytest.fixture(autouse=True) +def restore_language_codes(): + """Fixture to restore pycountry language codes after test modifications.""" + original_entries = list(pycountry.languages) # pyright: ignore[reportUnknownVariableType] + yield + current_entries = list(pycountry.languages) # pyright: ignore[reportUnknownVariableType] + for entry in current_entries: # pyright: ignore[reportUnknownVariableType] + try: + pycountry.languages.remove_entry(alpha_3=entry.alpha_3) # pyright: ignore[reportUnknownMemberType,reportUnknownArgumentType] + except Exception: # noqa: S110 + pass + + for entry in original_entries: # pyright: 
ignore[reportUnknownVariableType] + try: + pycountry.languages.add_entry( # pyright: ignore[reportUnknownMemberType] + alpha_3=entry.alpha_3, # pyright: ignore[reportUnknownMemberType,reportUnknownArgumentType] + name=entry.name, # pyright: ignore[reportUnknownMemberType,reportUnknownArgumentType] + ) + except Exception: # noqa: S110 + pass + + class TestBadNotifications: """Test notifications that fail validation and are marked as bad_notification.""" From f25f393b5530ead731da0e4f203e1a9de312cd37 Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Thu, 4 Jun 2026 13:42:55 +0100 Subject: [PATCH 3/6] add invalid language code to issues and force books to staging --- backend/src/cms_backend/__init__.py | 16 ++ backend/src/cms_backend/api/routes/fields.py | 23 ++ backend/src/cms_backend/context.py | 29 ++- backend/src/cms_backend/db/book.py | 9 + backend/src/cms_backend/mill/__init__.py | 18 -- backend/src/cms_backend/mill/context.py | 30 +-- .../src/cms_backend/mill/processors/book.py | 9 - backend/src/cms_backend/schemas/models.py | 4 +- .../processors/test_zimfarm_notification.py | 243 +++++++++++------- 9 files changed, 235 insertions(+), 146 deletions(-) diff --git a/backend/src/cms_backend/__init__.py b/backend/src/cms_backend/__init__.py index ea15cc2a..d32635aa 100644 --- a/backend/src/cms_backend/__init__.py +++ b/backend/src/cms_backend/__init__.py @@ -1,5 +1,7 @@ import logging +import pycountry + from cms_backend.context import Context logger = logging.getLogger("backend") @@ -9,3 +11,17 @@ handler = logging.StreamHandler() handler.setFormatter(logging.Formatter("[%(asctime)s: %(levelname)s] %(message)s")) logger.addHandler(handler) + + +def update_language_codes(): + for code in Context.disallowed_language_codes: + try: + pycountry.languages.remove_entry(alpha_3=code) # pyright: ignore[reportUnknownMemberType] + except Exception as exc: + logger.warning(f"failed to remove language code '{code}': {exc}") + + for code in Context.custom_language_codes: + 
pycountry.languages.add_entry(alpha_3=code) # pyright: ignore[reportUnknownMemberType] + + +update_language_codes() diff --git a/backend/src/cms_backend/api/routes/fields.py b/backend/src/cms_backend/api/routes/fields.py index cf879abd..6c5dbaca 100644 --- a/backend/src/cms_backend/api/routes/fields.py +++ b/backend/src/cms_backend/api/routes/fields.py @@ -1,6 +1,7 @@ import base64 from typing import Annotated, Any +import pycountry from pydantic import ( AfterValidator, Field, @@ -54,3 +55,25 @@ def validate_base64(value: str) -> str: LimitFieldMax200 = Annotated[int, Field(ge=1, le=200), WrapValidator(skip_validation)] Base64Str = Annotated[NotEmptyString, AfterValidator(validate_base64)] + + +def validate_language_code(value: str | None, info: ValidationInfo) -> str | None: + """Validate that string is a valid ISO-639-3 language code""" + if value is None: + return value + context = info.context + if context and context.get("skip_validation"): + return value + if pycountry.languages.get(alpha_3=value): # pyright: ignore[reportUnknownMemberType] + return value + raise ValueError( + f"Language code '{value}' is not a supported ISO-639-3 language code" + ) + + +LangCode = Annotated[ + str, + Field(min_length=3, max_length=3), + WrapValidator(skip_validation), + AfterValidator(validate_language_code), +] diff --git a/backend/src/cms_backend/context.py b/backend/src/cms_backend/context.py index a98455ac..670cdfa0 100644 --- a/backend/src/cms_backend/context.py +++ b/backend/src/cms_backend/context.py @@ -3,9 +3,10 @@ from dataclasses import field from datetime import timedelta from pathlib import Path -from typing import Any, TypeVar +from typing import Any, ClassVar, TypeVar from uuid import UUID +import pycountry from humanfriendly import parse_timespan T = TypeVar("T") @@ -23,6 +24,25 @@ def get_mandatory_env(key: str) -> str: return value +def _parse_custom_language_codes(language_code: str | None) -> list[str]: + """Transform the env language codes
(comma-separated) into a list.""" + if language_code is None: + return [] + + codes = language_code.split(",") + for code in codes: + if len(code) != 3: # noqa: PLR2004 + raise ValueError(f"Custom code '{code}' must be 3 characters long.") + return codes + + +def _validate_language_codes(language_codes: list[str]) -> list[str]: + for code in language_codes: + if pycountry.languages.get(alpha_3=code) is None: # pyright: ignore[reportUnknownMemberType] + raise ValueError(f"Code '{code}' is not a valid ISO 639-3 code.") + return language_codes + + @dataclasses.dataclass(kw_only=True) class Context: """Class holding every contextual / configuration bits which can be moved @@ -66,3 +86,10 @@ class Context: quarantine_base_path: Path = field( default=Path(os.getenv("QUARANTINE_BASE_PATH", "")) ) + # Comma-separated list of custom iso639-3 language codes + custom_language_codes: ClassVar[list[str]] = _parse_custom_language_codes( + os.getenv("CUSTOM_LANGUAGE_CODES") + ) + disallowed_language_codes: ClassVar[list[str]] = _validate_language_codes( + _parse_custom_language_codes(os.getenv("DISALLOWED_LANGUAGE_CODES")) + ) diff --git a/backend/src/cms_backend/db/book.py b/backend/src/cms_backend/db/book.py index d36c37d8..acbd5f68 100644 --- a/backend/src/cms_backend/db/book.py +++ b/backend/src/cms_backend/db/book.py @@ -1,6 +1,7 @@ from typing import Any, Literal from uuid import UUID +import pycountry from sqlalchemy import select, update from sqlalchemy.orm import Session as OrmSession from sqlalchemy.orm import selectinload @@ -539,6 +540,14 @@ def update_book_issues(session: OrmSession, book: Book, *, update_events: bool = issues: list[str] = [] + language_code = book.zim_metadata["Language"] + if pycountry.languages.get(alpha_3=language_code) is None: # pyright: ignore[reportUnknownMemberType] + issues.append("invalid language code") + if update_events: + book.events.append( + f"{getnow()}: book has unknown language code {language_code}" + ) + different_metadata_keys =
get_differing_metadata_keys(book) if different_metadata_keys: issues.append("metadata mismatch") diff --git a/backend/src/cms_backend/mill/__init__.py b/backend/src/cms_backend/mill/__init__.py index 6f734a87..e69de29b 100644 --- a/backend/src/cms_backend/mill/__init__.py +++ b/backend/src/cms_backend/mill/__init__.py @@ -1,18 +0,0 @@ -import pycountry - -from cms_backend import logger -from cms_backend.mill.context import Context as MillContext - - -def update_language_codes(): - for code in MillContext.disallowed_language_codes: - try: - pycountry.languages.remove_entry(alpha_3=code) # pyright: ignore[reportUnknownMemberType] - except Exception as exc: - logger.warning(f"failed to remove language code '{code}': {exc}") - - for code in MillContext.custom_language_codes: - pycountry.languages.add_entry(alpha_3=code) # pyright: ignore[reportUnknownMemberType] - - -update_language_codes() diff --git a/backend/src/cms_backend/mill/context.py b/backend/src/cms_backend/mill/context.py index c2588c1b..866a1142 100644 --- a/backend/src/cms_backend/mill/context.py +++ b/backend/src/cms_backend/mill/context.py @@ -1,33 +1,13 @@ import os from dataclasses import dataclass from datetime import timedelta -from typing import ClassVar, TypeVar +from typing import TypeVar -import pycountry from humanfriendly import parse_timespan T = TypeVar("T") -def _parse_custom_language_codes(language_code: str | None) -> list[str]: - """Transform the env language codes (comma-seperated) into a list.""" - if language_code is None: - return [] - - codes = language_code.split(",") - for code in codes: - if len(code) != 3: # noqa: PLR2004 - raise ValueError(f"Custom code '{code}' must be 3 characters long.") - return codes - - -def _validate_language_codes(language_codes: list[str]) -> list[str]: - for code in language_codes: - if pycountry.languages.get(alpha_3=code) is None: # pyright: ignore[reportUnknownMemberType] - raise ValueError(f"Code '{code}' is not a valid ISO 639-3 code.") - return 
language_codes - - @dataclass(kw_only=True) class Context: """Class holding every contextual / configuration bits which can be moved @@ -57,11 +37,3 @@ class Context: os.getenv("PROCESS_RETENTION_RULES_INTERVAL", default="1d") ) ) - - # Comma-seperated list of custom iso639-3 language codes - custom_language_codes: ClassVar[list[str]] = _parse_custom_language_codes( - os.getenv("CUSTOM_LANGUAGE_CODES") - ) - disallowed_language_codes: ClassVar[list[str]] = _validate_language_codes( - _parse_custom_language_codes(os.getenv("DISALLOWED_LANGUAGE_CODES")) - ) diff --git a/backend/src/cms_backend/mill/processors/book.py b/backend/src/cms_backend/mill/processors/book.py index 37570da5..826cbacc 100644 --- a/backend/src/cms_backend/mill/processors/book.py +++ b/backend/src/cms_backend/mill/processors/book.py @@ -1,4 +1,3 @@ -import pycountry from sqlalchemy.orm import Session as ORMSession from cms_backend import logger @@ -47,14 +46,6 @@ def check_book_zim_spec(book: Book) -> bool: book.has_error = True return False - language_code = book.zim_metadata["Language"] - if pycountry.languages.get(alpha_3=language_code) is None: # pyright: ignore[reportUnknownMemberType] - book.events.append( - f"{getnow()}: book has unknown language code {language_code}" - ) - book.has_error = True - return False - book.events.append(f"{getnow()}: book passed ZIM specification checks") return True diff --git a/backend/src/cms_backend/schemas/models.py b/backend/src/cms_backend/schemas/models.py index 74edf50f..630a270d 100644 --- a/backend/src/cms_backend/schemas/models.py +++ b/backend/src/cms_backend/schemas/models.py @@ -5,7 +5,7 @@ from pydantic import AnyUrl, model_validator -from cms_backend.api.routes.fields import Base64Str, NotEmptyString +from cms_backend.api.routes.fields import Base64Str, LangCode, NotEmptyString from cms_backend.roles import RoleEnum from cms_backend.schemas import BaseModel from cms_backend.schemas.orms import BaseTitleCollectionSchema @@ -62,7 +62,7 @@ class 
BaseTitleCreateUpdateSchema(BaseModel): creator: NotEmptyString | None = None description: NotEmptyString | None = None publisher: NotEmptyString | None = None - language: NotEmptyString | None = None + language: LangCode | None = None illustration_48x48_at_1: Base64Str | None = None flavours: list[str] | None = None archived: bool | None = None diff --git a/backend/tests/mill/processors/test_zimfarm_notification.py b/backend/tests/mill/processors/test_zimfarm_notification.py index 7acf49e2..224280ff 100644 --- a/backend/tests/mill/processors/test_zimfarm_notification.py +++ b/backend/tests/mill/processors/test_zimfarm_notification.py @@ -15,6 +15,7 @@ from pytest import MonkeyPatch from sqlalchemy.orm import Session as OrmSession +from cms_backend import update_language_codes from cms_backend.context import Context from cms_backend.db.models import ( Book, @@ -24,7 +25,6 @@ Warehouse, ZimfarmNotification, ) -from cms_backend.mill import update_language_codes from cms_backend.mill.processors.zimfarm_notification import process_notification VALID_NOTIFICATION_CONTENT: dict[str, Any] = { @@ -268,92 +268,6 @@ def test_missing_metadata_sets_error_flag( assert book.needs_file_operation is False -class TestValidNotificationWithDifferentLanguageMetadata: - """Test valid notifications with different variations of supported languages""" - - def test_unknown_language_code_sets_error_flag( - self, - dbsession: OrmSession, - warehouse: Warehouse, # noqa: ARG002 - create_zimfarm_notification: Callable[..., ZimfarmNotification], - ): - """Valid notification but unknown langauge code → book marked with error.""" - content = VALID_NOTIFICATION_CONTENT.copy() - content["metadata"]["Language"] = "xyz" - notification = create_zimfarm_notification(content=content) - dbsession.flush() - - process_notification(dbsession, notification) - - assert notification.status == "processed" - - book = dbsession.query(Book).filter_by(id=notification.id).first() - assert book is not None - assert 
book.location_kind == "quarantine" - assert book.has_error is True - assert any("book has unknown language code" in event for event in book.events) - assert book.needs_processing is False - assert book.needs_file_operation is False - - def test_invalid_supported_language_code( - self, - dbsession: OrmSession, - monkeypatch: MonkeyPatch, - warehouse: Warehouse, # noqa: ARG002 - create_zimfarm_notification: Callable[..., ZimfarmNotification], - ): - """Test that invalid language code that is supported passes checks""" - content = VALID_NOTIFICATION_CONTENT.copy() - content["metadata"]["Language"] = "xyz" - monkeypatch.setattr( - "cms_backend.mill.context.Context.custom_language_codes", ["xyz"] - ) - update_language_codes() - notification = create_zimfarm_notification(content=content) - dbsession.flush() - - process_notification(dbsession, notification) - - assert notification.status == "processed" - - book = dbsession.query(Book).filter_by(id=notification.id).first() - assert book is not None - assert book.location_kind == "quarantine" - assert book.has_error is False - assert any( - "book has unknown language code" not in event for event in book.events - ) - - def test_valid_language_code_but_dislallowed_sets_error_flag( - self, - dbsession: OrmSession, - monkeypatch: MonkeyPatch, - warehouse: Warehouse, # noqa: ARG002 - create_zimfarm_notification: Callable[..., ZimfarmNotification], - ): - """Valid language code that is disallowed → book marked with error.""" - content = VALID_NOTIFICATION_CONTENT.copy() - content["metadata"]["Language"] = "fra" - monkeypatch.setattr( - "cms_backend.mill.context.Context.disallowed_language_codes", ["fra"] - ) - update_language_codes() - notification = create_zimfarm_notification(content=content) - dbsession.flush() - - process_notification(dbsession, notification) - - assert notification.status == "processed" - - book = dbsession.query(Book).filter_by(id=notification.id).first() - assert book is not None - assert 
book.location_kind == "quarantine" - assert book.has_error is True - assert any("book has unknown language code" in event for event in book.events) - assert book.needs_processing is False - assert book.needs_file_operation is False - - class TestValidNotificationWithMatchingTitleUnstableMaturity: """Test valid notifications that match an existing title with unstable maturity. @@ -747,6 +661,161 @@ def test_moves_book_to_staging_due_to_diffrent_flavour_from_title( assert book.needs_file_operation is True assert book.needs_processing is False + def test_moves_book_to_staging_due_to_invalid_language( + self, + dbsession: OrmSession, + warehouse: Warehouse, # noqa: ARG002 + create_zimfarm_notification: Callable[..., ZimfarmNotification], + create_title: Callable[..., Title], + create_collection: Callable[..., Collection], + create_warehouse: Callable[..., Warehouse], + ): + """ + Test that book goes to staging because it has an invalid language code + """ + + title = create_title(name="test_en_all") + title.maturity = "stable" + + prod = create_warehouse( + name="prod", warehouse_id=UUID("00000000-0000-0000-0000-000000000003") + ) + collection = create_collection(warehouse=prod) + + ct = CollectionTitle(path=Path("wikipedia")) + ct.title = title + ct.collection = collection + dbsession.add(ct) + dbsession.flush() + + content = VALID_NOTIFICATION_CONTENT.copy() + content["folder_name"] = "" + content["metadata"]["Language"] = "xyz" + + notification = create_zimfarm_notification(content=content) + dbsession.flush() + + process_notification(dbsession, notification) + + assert notification.status == "processed" + + book = dbsession.query(Book).filter_by(id=notification.id).first() + assert book is not None + assert book.title_id == title.id + assert book.location_kind == "staging" + assert len(book.issues) == 1 + assert set(book.issues) == {"invalid language code"} + assert book.has_error is False + assert book.needs_file_operation is True + assert book.needs_processing is 
False + + def test_moves_book_to_prod_due_to_invalid_language_code_being_supported( + self, + dbsession: OrmSession, + warehouse: Warehouse, # noqa: ARG002 + monkeypatch: MonkeyPatch, + create_zimfarm_notification: Callable[..., ZimfarmNotification], + create_title: Callable[..., Title], + create_collection: Callable[..., Collection], + create_warehouse: Callable[..., Warehouse], + ): + """ + Test that book goes to prod even though its language code is invalid + but supported + """ + + title = create_title(name="test_en_all") + title.maturity = "stable" + + prod = create_warehouse( + name="prod", warehouse_id=UUID("00000000-0000-0000-0000-000000000003") + ) + collection = create_collection(warehouse=prod) + + ct = CollectionTitle(path=Path("wikipedia")) + ct.title = title + ct.collection = collection + dbsession.add(ct) + dbsession.flush() + + content = VALID_NOTIFICATION_CONTENT.copy() + content["folder_name"] = "" + content["metadata"]["Language"] = "xyz" + monkeypatch.setattr( + "cms_backend.context.Context.custom_language_codes", ["xyz"] + ) + update_language_codes() + + notification = create_zimfarm_notification(content=content) + dbsession.flush() + + process_notification(dbsession, notification) + + assert notification.status == "processed" + + book = dbsession.query(Book).filter_by(id=notification.id).first() + assert book is not None + assert book.title_id == title.id + assert book.location_kind == "prod" + assert len(book.issues) == 0 + assert book.has_error is False + assert book.needs_file_operation is True + assert book.needs_processing is False + + def test_moves_book_to_staging_due_to_valid_language_code_being_disallowed( + self, + dbsession: OrmSession, + warehouse: Warehouse, # noqa: ARG002 + monkeypatch: MonkeyPatch, + create_zimfarm_notification: Callable[..., ZimfarmNotification], + create_title: Callable[..., Title], + create_collection: Callable[..., Collection], + create_warehouse: Callable[..., Warehouse], + ): + """ + Test that book goes
to staging because its language code is disallowed + even though it's valid + """ + + title = create_title(name="test_en_all") + title.maturity = "stable" + + prod = create_warehouse( + name="prod", warehouse_id=UUID("00000000-0000-0000-0000-000000000003") + ) + collection = create_collection(warehouse=prod) + + ct = CollectionTitle(path=Path("wikipedia")) + ct.title = title + ct.collection = collection + dbsession.add(ct) + dbsession.flush() + + content = VALID_NOTIFICATION_CONTENT.copy() + content["folder_name"] = "" + content["metadata"]["Language"] = "fra" + monkeypatch.setattr( + "cms_backend.context.Context.disallowed_language_codes", ["fra"] + ) + update_language_codes() + + notification = create_zimfarm_notification(content=content) + dbsession.flush() + + process_notification(dbsession, notification) + + assert notification.status == "processed" + + book = dbsession.query(Book).filter_by(id=notification.id).first() + assert book is not None + assert book.title_id == title.id + assert book.location_kind == "staging" + assert len(book.issues) == 1 + assert set(book.issues) == {"invalid language code"} + assert book.has_error is False + assert book.needs_file_operation is True + assert book.needs_processing is False + class TestValidNotificationOnArchivedTitle: """Test valid notifications that are associated to an archived title.""" From c6f23b8f289549fbd22757a779fdc9604a521ed1 Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Fri, 5 Jun 2026 10:06:14 +0100 Subject: [PATCH 4/6] validate each language code in zim metadata --- backend/src/cms_backend/db/book.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/backend/src/cms_backend/db/book.py b/backend/src/cms_backend/db/book.py index acbd5f68..18314212 100644 --- a/backend/src/cms_backend/db/book.py +++ b/backend/src/cms_backend/db/book.py @@ -540,12 +540,16 @@ def update_book_issues(session: OrmSession, book: Book, *, update_events: bool = issues: list[str] = [] -
language_code = book.zim_metadata["Language"] - if pycountry.languages.get(alpha_3=language_code) is None: # pyright: ignore[reportUnknownMemberType] + unknown_languages: list[str] = [] + for language_code in book.zim_metadata["Language"].split(","): + if pycountry.languages.get(alpha_3=language_code) is None: # pyright: ignore[reportUnknownMemberType] + unknown_languages.append(language_code) + + if unknown_languages: issues.append("invalid language code") if update_events: book.events.append( - f"{getnow()}: book has unknown language code {language_code}" + f"{getnow()}: book has unknown language code(s) {unknown_languages}" ) different_metadata_keys = get_differing_metadata_keys(book) From 0744c401394bd20baf1916407fe03cd911abc158 Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Fri, 5 Jun 2026 15:14:54 +0100 Subject: [PATCH 5/6] validate csv lang codes --- backend/src/cms_backend/api/routes/fields.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/backend/src/cms_backend/api/routes/fields.py b/backend/src/cms_backend/api/routes/fields.py index 6c5dbaca..1f4c45e4 100644 --- a/backend/src/cms_backend/api/routes/fields.py +++ b/backend/src/cms_backend/api/routes/fields.py @@ -64,6 +64,7 @@ def validate_language_code(value: str | None, info: ValidationInfo) -> str | Non context = info.context if context and context.get("skip_validation"): return value + if pycountry.languages.get(alpha_3=value): # pyright: ignore[reportUnknownMemberType] return value raise ValueError( @@ -71,9 +72,20 @@ def validate_language_code(value: str | None, info: ValidationInfo) -> str | Non ) +def validate_comma_separated_lang_code( + value: str | None, info: ValidationInfo +) -> str | None: + """Validate that string is a comma separated list of ISO-639-3 language codes""" + if value is None: + return value + for lang_code in value.split(","): + validate_language_code(lang_code, info) + return value + + LangCode = Annotated[ str, Field(min_length=3,
max_length=3), WrapValidator(skip_validation), - AfterValidator(validate_language_code), + AfterValidator(validate_comma_separated_lang_code), ] From e26d165cd568d18e8cb0883df87b66aa65d1d032 Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Fri, 5 Jun 2026 15:15:29 +0100 Subject: [PATCH 6/6] remove length restriction on langcode --- backend/src/cms_backend/api/routes/fields.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/src/cms_backend/api/routes/fields.py b/backend/src/cms_backend/api/routes/fields.py index 1f4c45e4..54d6ffe4 100644 --- a/backend/src/cms_backend/api/routes/fields.py +++ b/backend/src/cms_backend/api/routes/fields.py @@ -85,7 +85,6 @@ def validate_comma_separated_lang_code( LangCode = Annotated[ str, - Field(min_length=3, max_length=3), WrapValidator(skip_validation), AfterValidator(validate_comma_separated_lang_code), ]