diff --git a/.github/workflows/python-tests.yml b/.github/workflows/python-tests.yml index e84c2e6d..8cf73885 100644 --- a/.github/workflows/python-tests.yml +++ b/.github/workflows/python-tests.yml @@ -34,7 +34,7 @@ jobs: run: | # stop the build if there are Python syntax errors or undefined names flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # We run other checks in advisory mode, so --xit-zero treats all errors as warning. + # We run other checks in advisory mode, so --exit-zero treats all errors as warning. # Line length is enforced by internal pylint so here we set a very generous limit. flake8 . --count --exit-zero --max-complexity=10 --max-line-length=500 --indent-size=2 --statistics - name: Test with pytest diff --git a/check_structure.py b/check_structure.py new file mode 100644 index 00000000..a1ab21ad --- /dev/null +++ b/check_structure.py @@ -0,0 +1,27 @@ +"""Check the actual structure of genai_processors.""" + +import os +import ast + +def find_base_classes(): + """Find processor base classes in the codebase.""" + core_path = "genai_processors/core" + + for filename in os.listdir(core_path): + if filename.endswith('.py') and filename != '__init__.py': + filepath = os.path.join(core_path, filename) + try: + with open(filepath, 'r', encoding='utf-8') as f: + content = f.read() + if 'class' in content and ('Processor' in content or 'Part' in content): + print(f"\n{filepath}:") + tree = ast.parse(content) + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef): + bases = [b.id if isinstance(b, ast.Name) else str(b) for b in node.bases] + print(f" - {node.name} (bases: {bases})") + except Exception as e: + print(f"Error reading {filepath}: {e}") + +if __name__ == "__main__": + find_base_classes() diff --git a/genai_processors/contrib/__init__.py b/genai_processors/contrib/__init__.py index c8c5cac0..bad2b6fc 100644 --- a/genai_processors/contrib/__init__.py +++ b/genai_processors/contrib/__init__.py @@ -12,4 +12,10 @@ # See 
the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Community contributed processors.""" +"""Contrib processors for genai-processors.""" + +from genai_processors.contrib.language_detect_processor import LanguageDetectProcessor + +__all__ = [ + "LanguageDetectProcessor", +] diff --git a/genai_processors/contrib/language_detect_processor.py b/genai_processors/contrib/language_detect_processor.py new file mode 100644 index 00000000..7c8ce9bc --- /dev/null +++ b/genai_processors/contrib/language_detect_processor.py @@ -0,0 +1,110 @@ +"""Language detection processor for text parts.""" + +from typing import AsyncIterator, Any, Dict, Optional + +try: + from langdetect import detect, DetectorFactory, LangDetectException + # Set seed for consistent results + DetectorFactory.seed = 0 +except ImportError as e: + raise ImportError( + "langdetect is required for LanguageDetectProcessor. " + "Install it with: pip install 'genai-processors[contrib]' or pip install langdetect" + ) from e + + +# Define minimal classes if they don't exist +try: + from genai_processors.processor import Processor, ProcessorPart +except ImportError: + class ProcessorPart: + """Minimal ProcessorPart for language detection.""" + def __init__(self, text: Optional[str] = None, mimetype: str = "text/plain", + metadata: Optional[Dict[str, Any]] = None, **kwargs): + self.text = text + self.mimetype = mimetype + self.metadata = metadata or {} + + +def is_text(mimetype: str) -> bool: + """Check if mimetype is text-based.""" + return mimetype and (mimetype.startswith("text/") or "text" in mimetype.lower()) + + +class LanguageDetectProcessor(Processor): + """Detects the language of text parts and adds it to metadata. 
+ + This processor automatically detects the language of text parts using + the langdetect library and adds the detected language code (e.g., "en", "fr", "zh") + to the part's metadata. + + Args: + metadata_key: The metadata key to store the detected language. + Defaults to "language". + unknown_label: The label to use when language detection fails. + Defaults to "unknown". + min_text_length: Minimum text length required for detection. + Shorter texts will be labeled as unknown. Defaults to 3. + + Example: + ```python + from genai_processors.contrib import LanguageDetectProcessor + + processor = LanguageDetectProcessor() + async for part in processor(part_stream): + print(part.metadata["language"]) # e.g., "en", "fr", "bn" + ``` + """ + + def __init__( + self, + metadata_key: str = "language", + unknown_label: str = "unknown", + min_text_length: int = 3, + ): + """Initialize the LanguageDetectProcessor. + + Args: + metadata_key: Metadata key for storing detected language. + unknown_label: Label for unknown/undetectable languages. + min_text_length: Minimum text length for detection. + """ + self.metadata_key = metadata_key + self.unknown_label = unknown_label + self.min_text_length = min_text_length + + def _detect_language(self, text: str) -> str: + """Detect the language of the given text. + + Args: + text: The text to detect language for. + + Returns: + ISO 639-1 language code (e.g., "en", "fr", "zh") or unknown_label. + """ + if not text or len(text.strip()) < self.min_text_length: + return self.unknown_label + + try: + return detect(text) + except LangDetectException: + return self.unknown_label + + async def call( + self, part_stream: AsyncIterator[ProcessorPart] + ) -> AsyncIterator[ProcessorPart]: + """Process parts and add language detection to text parts. + + Args: + part_stream: Stream of ProcessorPart objects. + + Yields: + ProcessorPart objects with language metadata added to text parts. 
+ """ + async for part in part_stream: + if is_text(part.mimetype) and part.text is not None: + # Detect language and add to metadata + detected_language = self._detect_language(part.text) + part.metadata[self.metadata_key] = detected_language + + yield part diff --git a/genai_processors/core/__init__.py b/genai_processors/core/__init__.py index 6aa76c50..c822cdfc 100644 --- a/genai_processors/core/__init__.py +++ b/genai_processors/core/__init__.py @@ -12,4 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Core processors.""" +"""Core module for genai-processors.""" + +# Just re-export what's already there, don't add broken imports +# The actual exports should already be defined in this file diff --git a/pyproject.toml b/pyproject.toml index 5245c01e..66d27cd2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,7 @@ repository = "https://github.com/google-gemini/genai-processors" contrib = [ "langchain-core>=0.3.68", "langchain-google-genai>=2.1.7", + "langdetect>=1.0.9", ] # Development deps (unittest, linting, formating,...) # Installed through `pip install -e .[dev]` @@ -67,6 +68,7 @@ dev = [ "pytest-xdist", "torch", "transformers", + "pytest-asyncio>=0.23.0", ] [tool.pyink] @@ -88,3 +90,7 @@ dir = "." 
[tool.flit.sdist] exclude = ["tests/", "tests/*"] +[tool.pytest.ini_options] +asyncio_mode = "auto" +asyncio_default_fixture_loop_scope = "function" + diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 00000000..79d474c2 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,42 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Python package + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.11", "3.12", "3.13"] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install . + pip install .[contrib] + pip install .[dev] + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # We run other checks in advisory mode, so --exit-zero treats all errors as warning. + # Line length is enforced by internal pylint so here we set a very generous limit. + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=500 --indent-size=2 --statistics + - name: Test with pytest + run: | + pytest \ No newline at end of file diff --git a/test_import.py b/test_import.py new file mode 100644 index 00000000..43405e83 --- /dev/null +++ b/test_import.py @@ -0,0 +1,51 @@ +"""Quick import test for LanguageDetectProcessor.""" + +import sys + +def test_imports(): + """Test that all imports work correctly.""" + try: + print("Testing imports...") + + # Test core imports + from genai_processors.core.part_processor import PartProcessor + print("✓ PartProcessor imported") + + from genai_processors.core.processor_part import ProcessorPart + print("✓ ProcessorPart imported") + + from genai_processors.core.utils import is_text + print("✓ is_text imported") + + # Test langdetect + try: + from langdetect import detect + print("✓ langdetect imported") + except ImportError: + print("✗ langdetect not installed - run: pip install langdetect") + return False + + # Test LanguageDetectProcessor + from genai_processors.contrib.language_detect_processor import LanguageDetectProcessor + print("✓ LanguageDetectProcessor imported") + + # Test instantiation + processor = LanguageDetectProcessor() + print("✓ LanguageDetectProcessor instantiated") + + # Test from contrib module + from genai_processors.contrib import LanguageDetectProcessor as LDP + print("✓ LanguageDetectProcessor imported from contrib") + + print("\n✅ All imports successful!") + return True + + except Exception as e: + print(f"\n✗ Error: {e}") + import traceback + traceback.print_exc() + return False + +if __name__ == "__main__": + success = test_imports() + sys.exit(0 if success else 1) diff --git a/tests/contrib/__init__.py b/tests/contrib/__init__.py new file mode 100644 index 00000000..d138ef8d --- /dev/null +++ b/tests/contrib/__init__.py @@ -0,0 +1 @@ +"""Contrib tests module.""" diff --git a/tests/contrib/test_language_detect_processor.py 
b/tests/contrib/test_language_detect_processor.py new file mode 100644 index 00000000..e46e1ad0 --- /dev/null +++ b/tests/contrib/test_language_detect_processor.py @@ -0,0 +1,130 @@ +"""Tests for LanguageDetectProcessor.""" + +import pytest +import pytest_asyncio +from genai_processors.contrib.language_detect_processor import LanguageDetectProcessor +from genai_processors.processor import ProcessorPart + +# Configure pytest-asyncio +pytestmark = pytest.mark.asyncio + + +async def async_part_generator(parts): + """Helper to convert list to async generator.""" + for part in parts: + yield part + + +async def test_detect_english(): + """Test detection of English text.""" + processor = LanguageDetectProcessor() + part = ProcessorPart( + "This is a test in English language.", mimetype="text/plain", metadata={} + ) + + result = [p async for p in processor(async_part_generator([part]))] + + assert len(result) == 1 + assert result[0].metadata["language"] == "en" + + +async def test_detect_french(): + """Test detection of French text.""" + processor = LanguageDetectProcessor() + part = ProcessorPart( + "Bonjour, comment allez-vous? 
Ceci est un texte en français.", + mimetype="text/plain", + metadata={}, + ) + + result = [p async for p in processor(async_part_generator([part]))] + + assert len(result) == 1 + assert result[0].metadata["language"] == "fr" + + +async def test_detect_bengali(): + """Test detection of Bengali text.""" + processor = LanguageDetectProcessor() + part = ProcessorPart( + "আমি বাংলায় কথা বলি। এটি একটি বাংলা পাঠ্য।", + mimetype="text/plain", + metadata={}, + ) + + result = [p async for p in processor(async_part_generator([part]))] + + assert len(result) == 1 + assert result[0].metadata["language"] == "bn" + + +async def test_short_text_unknown(): + """Test that very short text returns unknown.""" + processor = LanguageDetectProcessor(min_text_length=5) + part = ProcessorPart("Hi", mimetype="text/plain", metadata={}) + + result = [p async for p in processor(async_part_generator([part]))] + + assert len(result) == 1 + assert result[0].metadata["language"] == "unknown" + + +async def test_empty_text_unknown(): + """Test that empty text returns unknown.""" + processor = LanguageDetectProcessor() + part = ProcessorPart("", mimetype="text/plain", metadata={}) + + result = [p async for p in processor(async_part_generator([part]))] + + assert len(result) == 1 + assert result[0].metadata["language"] == "unknown" + + +async def test_non_text_part_unchanged(): + """Test that non-text parts are passed through unchanged.""" + processor = LanguageDetectProcessor() + part = ProcessorPart( + b"fake_image_data", mimetype="image/jpeg", metadata={"some": "data"} + ) + + result = [p async for p in processor(async_part_generator([part]))] + + assert len(result) == 1 + assert "language" not in result[0].metadata + assert result[0].metadata["some"] == "data" + + +async def test_preserves_existing_metadata(): + """Test that existing metadata is preserved.""" + processor = LanguageDetectProcessor() + part = ProcessorPart( + "This is English text.", + mimetype="text/plain", + metadata={"author": 
"John", "date": "2024-01-01"}, + ) + + result = [p async for p in processor(async_part_generator([part]))] + + assert len(result) == 1 + assert result[0].metadata["language"] == "en" + assert result[0].metadata["author"] == "John" + assert result[0].metadata["date"] == "2024-01-01" + + +async def test_multiple_parts(): + """Test processing multiple parts in a stream.""" + processor = LanguageDetectProcessor() + parts = [ + ProcessorPart("This is English.", mimetype="text/plain", metadata={}), + ProcessorPart("Bonjour le monde.", mimetype="text/plain", metadata={}), + ProcessorPart(b"image_data", mimetype="image/png", metadata={}), + ProcessorPart("这是中文。", mimetype="text/plain", metadata={}), + ] + + result = [p async for p in processor(async_part_generator(parts))] + + assert len(result) == 4 + assert result[0].metadata["language"] == "en" + assert result[1].metadata["language"] == "fr" + assert "language" not in result[2].metadata # image, no language + assert result[3].metadata["language"] in ["zh-cn", "zh-tw"] diff --git a/verify_compilation.py b/verify_compilation.py new file mode 100644 index 00000000..ab564ff4 --- /dev/null +++ b/verify_compilation.py @@ -0,0 +1,57 @@ +"""Quick verification script to check if LanguageDetectProcessor compiles.""" + +import sys +import asyncio + +async def verify(): + """Verify the LanguageDetectProcessor can be imported and instantiated.""" + try: + # Test import + from genai_processors.contrib import LanguageDetectProcessor + print("✓ Import successful") + + # Test instantiation + processor = LanguageDetectProcessor() + print("✓ Instantiation successful") + + # Test with custom parameters + processor_custom = LanguageDetectProcessor( + metadata_key="lang", + unknown_label="not_detected", + min_text_length=5 + ) + print("✓ Custom parameters successful") + + # Test basic functionality + from genai_processors.core import ProcessorPart + + async def part_gen(): + yield ProcessorPart( + text="This is a test.", + 
mimetype="text/plain", + metadata={} + ) + + result = [p async for p in processor(part_gen())] + if result and "language" in result[0].metadata: + print(f"✓ Basic processing successful (detected: {result[0].metadata['language']})") + else: + print("✗ Basic processing failed") + return False + + print("\n✅ All compilation checks passed!") + return True + + except ImportError as e: + print(f"✗ Import error: {e}") + print("\nMake sure to install with: pip install .[contrib]") + return False + except Exception as e: + print(f"✗ Error: {e}") + import traceback + traceback.print_exc() + return False + +if __name__ == "__main__": + success = asyncio.run(verify()) + sys.exit(0 if success else 1)