Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ dependencies = [
"humanfriendly == 10.0",
"Werkzeug == 3.1.5",
"xxhash == 3.7.0",
"pycountry == 26.2.16",
]
dynamic = ["version"]

Expand Down
16 changes: 16 additions & 0 deletions backend/src/cms_backend/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import logging

import pycountry

from cms_backend.context import Context

logger = logging.getLogger("backend")
Expand All @@ -9,3 +11,17 @@
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("[%(asctime)s: %(levelname)s] %(message)s"))
logger.addHandler(handler)


def update_language_codes():
    """Apply the configured language-code overrides to ``pycountry.languages``.

    Codes listed in ``Context.disallowed_language_codes`` are removed first,
    then codes listed in ``Context.custom_language_codes`` are registered.
    A custom code that is already registered is left untouched, so calling
    this function several times does not register the same code twice.
    """
    for code in Context.disallowed_language_codes:
        try:
            pycountry.languages.remove_entry(alpha_3=code)  # pyright: ignore[reportUnknownMemberType]
        except Exception as exc:
            # Best effort: a code that cannot be removed is reported but must
            # not prevent the remaining codes from being processed.
            logger.warning(f"failed to remove language code '{code}': {exc}")

    for code in Context.custom_language_codes:
        # Skip codes that are already known so an existing entry is not
        # replaced by a bare one and repeated calls do not add duplicates.
        if pycountry.languages.get(alpha_3=code) is not None:  # pyright: ignore[reportUnknownMemberType]
            continue
        pycountry.languages.add_entry(alpha_3=code)  # pyright: ignore[reportUnknownMemberType]


# Apply the configured language-code overrides when this package is imported.
update_language_codes()
34 changes: 34 additions & 0 deletions backend/src/cms_backend/api/routes/fields.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import base64
from typing import Annotated, Any

import pycountry
from pydantic import (
AfterValidator,
Field,
Expand Down Expand Up @@ -54,3 +55,36 @@ def validate_base64(value: str) -> str:
LimitFieldMax200 = Annotated[int, Field(ge=1, le=200), WrapValidator(skip_validation)]

Base64Str = Annotated[NotEmptyString, AfterValidator(validate_base64)]


def validate_language_code(value: str | None, info: ValidationInfo) -> str | None:
    """Check that a value is a language code known to pycountry (ISO 639-3).

    ``None`` is passed through untouched, and so is any value when the
    validation context carries a truthy ``skip_validation`` entry.

    Raises:
        ValueError: if no language is registered under this alpha-3 code.
    """
    if value is None:
        return None

    # A missing or empty context means validation has not been disabled.
    validation_context = info.context or {}
    if validation_context.get("skip_validation"):
        return value

    language = pycountry.languages.get(alpha_3=value)  # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType]
    if not language:
        raise ValueError(
            f"Language code '{value}' is not a supported ISO-639-3 language code"
        )
    return value


def validate_comma_separated_lang_code(
    value: str | None, info: ValidationInfo
) -> str | None:
    """Check every item of a comma separated list of ISO 639-3 language codes.

    ``None`` is returned as is. Otherwise the string is split on commas and
    each item is handed to ``validate_language_code``, which raises
    ``ValueError`` for the first item it rejects. The input string itself is
    returned unchanged.
    """
    if value is not None:
        for item in value.split(","):
            validate_language_code(item, info)
    return value


# String field holding a comma separated list of language codes; each code is
# checked by validate_comma_separated_lang_code.
LangCode = Annotated[
str,
WrapValidator(skip_validation),
AfterValidator(validate_comma_separated_lang_code),
]
29 changes: 28 additions & 1 deletion backend/src/cms_backend/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
from dataclasses import field
from datetime import timedelta
from pathlib import Path
from typing import Any, TypeVar
from typing import Any, ClassVar, TypeVar
from uuid import UUID

import pycountry
from humanfriendly import parse_timespan

T = TypeVar("T")
Expand All @@ -23,6 +24,25 @@ def get_mandatory_env(key: str) -> str:
return value


def _parse_custom_language_codes(language_code: str | None) -> list[str]:
"""Transform the env language codes (comma-seperated) into a list."""
if language_code is None:
return []

codes = language_code.split(",")
for code in codes:
if len(code) != 3: # noqa: PLR2004
raise ValueError(f"Custom code '{code}' must be 3 characters long.")
return codes


def _validate_language_codes(language_codes: list[str]) -> list[str]:
    """Ensure every given code is known to pycountry, then hand the list back.

    Raises:
        ValueError: for the first code with no matching alpha-3 language entry.
    """
    unknown = next(
        (
            candidate
            for candidate in language_codes
            if pycountry.languages.get(alpha_3=candidate) is None  # pyright: ignore[reportUnknownMemberType]
        ),
        None,
    )
    if unknown is not None:
        raise ValueError(f"Code '{unknown}' is not a valid ISO 639-3 code.")
    return language_codes


@dataclasses.dataclass(kw_only=True)
class Context:
"""Class holding every contextual / configuration bits which can be moved
Expand Down Expand Up @@ -66,3 +86,10 @@ class Context:
quarantine_base_path: Path = field(
default=Path(os.getenv("QUARANTINE_BASE_PATH", ""))
)
# Comma-separated list of custom ISO 639-3 language codes
custom_language_codes: ClassVar[list[str]] = _parse_custom_language_codes(
os.getenv("CUSTOM_LANGUAGE_CODES")
)
disallowed_language_codes: ClassVar[list[str]] = _validate_language_codes(
_parse_custom_language_codes(os.getenv("DISALLOWED_LANGUAGE_CODES"))
)
13 changes: 13 additions & 0 deletions backend/src/cms_backend/db/book.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Any, Literal
from uuid import UUID

import pycountry
from sqlalchemy import select, update
from sqlalchemy.orm import Session as OrmSession
from sqlalchemy.orm import selectinload
Expand Down Expand Up @@ -539,6 +540,18 @@ def update_book_issues(session: OrmSession, book: Book, *, update_events: bool =

issues: list[str] = []

unknown_languages: list[str] = []
for language_code in book.zim_metadata["Language"].split(","):
if pycountry.languages.get(alpha_3=language_code) is None: # pyright: ignore[reportUnknownMemberType]
unknown_languages.append(language_code)

if unknown_languages:
issues.append("invalid language code")
if update_events:
book.events.append(
f"{getnow()}: book has unknown language code(s) {unknown_languages}"
)

different_metadata_keys = get_differing_metadata_keys(book)
if different_metadata_keys:
issues.append("metadata mismatch")
Expand Down
4 changes: 2 additions & 2 deletions backend/src/cms_backend/schemas/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from pydantic import AnyUrl, model_validator

from cms_backend.api.routes.fields import Base64Str, NotEmptyString
from cms_backend.api.routes.fields import Base64Str, LangCode, NotEmptyString
from cms_backend.roles import RoleEnum
from cms_backend.schemas import BaseModel
from cms_backend.schemas.orms import BaseTitleCollectionSchema
Expand Down Expand Up @@ -62,7 +62,7 @@ class BaseTitleCreateUpdateSchema(BaseModel):
creator: NotEmptyString | None = None
description: NotEmptyString | None = None
publisher: NotEmptyString | None = None
language: NotEmptyString | None = None
language: LangCode | None = None
illustration_48x48_at_1: Base64Str | None = None
flavours: list[str] | None = None
archived: bool | None = None
Expand Down
184 changes: 183 additions & 1 deletion backend/tests/mill/processors/test_zimfarm_notification.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,15 @@

from collections.abc import Callable
from pathlib import Path
from typing import Any
from uuid import UUID

import pycountry
import pytest
from pytest import MonkeyPatch
from sqlalchemy.orm import Session as OrmSession

from cms_backend import update_language_codes
from cms_backend.context import Context
from cms_backend.db.models import (
Book,
Expand All @@ -22,7 +27,7 @@
)
from cms_backend.mill.processors.zimfarm_notification import process_notification

VALID_NOTIFICATION_CONTENT = {
VALID_NOTIFICATION_CONTENT: dict[str, Any] = {
"article_count": 1000,
"media_count": 500,
"size": 1000000,
Expand All @@ -46,6 +51,28 @@
}


@pytest.fixture(autouse=True)
def restore_language_codes():
    """Fixture to restore pycountry language codes after test modifications.

    Every language entry is snapshotted with all of its fields before the
    test runs. After the test, whatever is registered is dropped and the
    snapshot is registered again, so later tests see complete entries rather
    than entries reduced to their ``alpha_3`` and ``name`` fields.
    """
    # dict(entry) keeps every field of the entry, not only alpha_3 and name.
    original_entries: list[dict[str, str]] = [
        dict(entry)  # pyright: ignore[reportUnknownArgumentType]
        for entry in pycountry.languages  # pyright: ignore[reportUnknownVariableType]
    ]
    yield
    for entry in list(pycountry.languages):  # pyright: ignore[reportUnknownVariableType]
        # Skip codes that are no longer registered (e.g. a duplicate of an
        # entry removed earlier in this loop) instead of swallowing every
        # exception raised by the removal.
        if pycountry.languages.get(alpha_3=entry.alpha_3) is None:  # pyright: ignore[reportUnknownMemberType,reportUnknownArgumentType]
            continue
        pycountry.languages.remove_entry(alpha_3=entry.alpha_3)  # pyright: ignore[reportUnknownMemberType,reportUnknownArgumentType]

    # Failures are not silenced here: a broken restore would otherwise corrupt
    # the language database for every following test without any hint.
    for fields in original_entries:
        pycountry.languages.add_entry(**fields)  # pyright: ignore[reportUnknownMemberType]

Check notice on line 73 in backend/tests/mill/processors/test_zimfarm_notification.py

View check run for this annotation

codefactor.io / CodeFactor

backend/tests/mill/processors/test_zimfarm_notification.py#L72-L73

Try, Except, Pass detected. (B110)


class TestBadNotifications:
"""Test notifications that fail validation and are marked as bad_notification."""

Expand Down Expand Up @@ -634,6 +661,161 @@
assert book.needs_file_operation is True
assert book.needs_processing is False

def test_moves_book_to_staging_due_to_invalid_language(
    self,
    dbsession: OrmSession,
    warehouse: Warehouse,  # noqa: ARG002
    create_zimfarm_notification: Callable[..., ZimfarmNotification],
    create_title: Callable[..., Title],
    create_collection: Callable[..., Collection],
    create_warehouse: Callable[..., Warehouse],
):
    """
    Test that book goes to staging because it has an invalid language code
    """

    title = create_title(name="test_en_all")
    title.maturity = "stable"

    prod = create_warehouse(
        name="prod", warehouse_id=UUID("00000000-0000-0000-0000-000000000003")
    )
    collection = create_collection(warehouse=prod)

    ct = CollectionTitle(path=Path("wikipedia"))
    ct.title = title
    ct.collection = collection
    dbsession.add(ct)
    dbsession.flush()

    # Build a fresh metadata dict: a shallow copy would share the nested
    # "metadata" dict with VALID_NOTIFICATION_CONTENT, so setting the language
    # on it would leak into every test that runs afterwards.
    content: dict[str, Any] = {
        **VALID_NOTIFICATION_CONTENT,
        "folder_name": "",
        "metadata": {**VALID_NOTIFICATION_CONTENT["metadata"], "Language": "xyz"},
    }

    notification = create_zimfarm_notification(content=content)
    dbsession.flush()

    process_notification(dbsession, notification)

    assert notification.status == "processed"

    book = dbsession.query(Book).filter_by(id=notification.id).first()
    assert book is not None
    assert book.title_id == title.id
    assert book.location_kind == "staging"
    assert len(book.issues) == 1
    assert set(book.issues) == {"invalid language code"}
    assert book.has_error is False
    assert book.needs_file_operation is True
    assert book.needs_processing is False

def test_moves_book_to_prod_due_to_invalid_language_code_being_supported(
    self,
    dbsession: OrmSession,
    warehouse: Warehouse,  # noqa: ARG002
    monkeypatch: MonkeyPatch,
    create_zimfarm_notification: Callable[..., ZimfarmNotification],
    create_title: Callable[..., Title],
    create_collection: Callable[..., Collection],
    create_warehouse: Callable[..., Warehouse],
):
    """
    Test that book goes to prod even though its language code is invalid
    but supported
    """

    title = create_title(name="test_en_all")
    title.maturity = "stable"

    prod = create_warehouse(
        name="prod", warehouse_id=UUID("00000000-0000-0000-0000-000000000003")
    )
    collection = create_collection(warehouse=prod)

    ct = CollectionTitle(path=Path("wikipedia"))
    ct.title = title
    ct.collection = collection
    dbsession.add(ct)
    dbsession.flush()

    # Build a fresh metadata dict: a shallow copy would share the nested
    # "metadata" dict with VALID_NOTIFICATION_CONTENT, so setting the language
    # on it would leak into every test that runs afterwards.
    content: dict[str, Any] = {
        **VALID_NOTIFICATION_CONTENT,
        "folder_name": "",
        "metadata": {**VALID_NOTIFICATION_CONTENT["metadata"], "Language": "xyz"},
    }
    # Register "xyz" as a custom code, then apply the configuration.
    monkeypatch.setattr(
        "cms_backend.context.Context.custom_language_codes", ["xyz"]
    )
    update_language_codes()

    notification = create_zimfarm_notification(content=content)
    dbsession.flush()

    process_notification(dbsession, notification)

    assert notification.status == "processed"

    book = dbsession.query(Book).filter_by(id=notification.id).first()
    assert book is not None
    assert book.title_id == title.id
    assert book.location_kind == "prod"
    assert len(book.issues) == 0
    assert book.has_error is False
    assert book.needs_file_operation is True
    assert book.needs_processing is False

def test_moves_book_to_staging_due_to_valid_language_code_being_disallowed(
    self,
    dbsession: OrmSession,
    warehouse: Warehouse,  # noqa: ARG002
    monkeypatch: MonkeyPatch,
    create_zimfarm_notification: Callable[..., ZimfarmNotification],
    create_title: Callable[..., Title],
    create_collection: Callable[..., Collection],
    create_warehouse: Callable[..., Warehouse],
):
    """
    Test that book goes to staging because its language code is disallowed
    even though it's valid
    """

    title = create_title(name="test_en_all")
    title.maturity = "stable"

    prod = create_warehouse(
        name="prod", warehouse_id=UUID("00000000-0000-0000-0000-000000000003")
    )
    collection = create_collection(warehouse=prod)

    ct = CollectionTitle(path=Path("wikipedia"))
    ct.title = title
    ct.collection = collection
    dbsession.add(ct)
    dbsession.flush()

    # Build a fresh metadata dict: a shallow copy would share the nested
    # "metadata" dict with VALID_NOTIFICATION_CONTENT, so setting the language
    # on it would leak into every test that runs afterwards.
    content: dict[str, Any] = {
        **VALID_NOTIFICATION_CONTENT,
        "folder_name": "",
        "metadata": {**VALID_NOTIFICATION_CONTENT["metadata"], "Language": "fra"},
    }
    # Disallow "fra", then apply the configuration.
    monkeypatch.setattr(
        "cms_backend.context.Context.disallowed_language_codes", ["fra"]
    )
    update_language_codes()

    notification = create_zimfarm_notification(content=content)
    dbsession.flush()

    process_notification(dbsession, notification)

    assert notification.status == "processed"

    book = dbsession.query(Book).filter_by(id=notification.id).first()
    assert book is not None
    assert book.title_id == title.id
    assert book.location_kind == "staging"
    assert len(book.issues) == 1
    assert set(book.issues) == {"invalid language code"}
    assert book.has_error is False
    assert book.needs_file_operation is True
    assert book.needs_processing is False


class TestValidNotificationOnArchivedTitle:
"""Test valid notifications that are associated to an archived title."""
Expand Down
Loading