diff --git a/.github/workflows/cibuildwheel.yml b/.github/workflows/cibuildwheel.yml index 5f8ba9285ac..5f731a31595 100644 --- a/.github/workflows/cibuildwheel.yml +++ b/.github/workflows/cibuildwheel.yml @@ -7,9 +7,12 @@ on: # ** matches 'zero or more of any character' - 'release-v[0-9]+.[0-9]+.[0-9]+**' - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**' + +permissions: {} + jobs: build_wheels: - uses: explosion/gha-cibuildwheel/.github/workflows/cibuildwheel.yml@main + uses: explosion/gha-cibuildwheel/.github/workflows/cibuildwheel.yml@2c98f757f13d112cf73fcf4b627249f1fffb5aae # main permissions: contents: write actions: read diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml index 78a27cfa3ba..979385ccb90 100644 --- a/.github/workflows/explosionbot.yml +++ b/.github/workflows/explosionbot.yml @@ -6,6 +6,8 @@ on: - created - edited +permissions: {} + jobs: explosion-bot: if: github.repository_owner == 'explosion' @@ -15,13 +17,15 @@ jobs: env: GITHUB_CONTEXT: ${{ toJson(github) }} run: echo "$GITHUB_CONTEXT" - - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 - name: Install and run explosion-bot run: | - pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot + git config --global url."https://x-access-token:${EXPLOSIONBOT_TOKEN}@github.com/".insteadOf "https://github.com/" + pip install git+https://github.com/explosion/explosion-bot python -m explosionbot env: + EXPLOSIONBOT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }} INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }} INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }} ENABLED_COMMANDS: "test_gpu,test_slow,test_slow_gpu" diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml index 6c7d7d5a6f8..264707485e7 100644 --- a/.github/workflows/issue-manager.yml +++ b/.github/workflows/issue-manager.yml @@ -11,12 +11,16 @@ on: types: - labeled +permissions: {} + jobs: issue-manager: + permissions: + issues: write if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - - uses: tiangolo/issue-manager@0.4.0 + - uses: tiangolo/issue-manager@4d1b7e05935a404dc8337d30bd23be46be8bb8e5 # 0.4.0 with: token: ${{ secrets.GITHUB_TOKEN }} config: > diff --git a/.github/workflows/lock.yml b/.github/workflows/lock.yml index 2bbdd64c771..8fcf3028476 100644 --- a/.github/workflows/lock.yml +++ b/.github/workflows/lock.yml @@ -16,7 +16,7 @@ jobs: if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - - uses: dessant/lock-threads@v5 + - uses: dessant/lock-threads@1bf7ec25051fe7c00bdd17e6a7cf3d7bfb7dc771 # v5 with: process-only: 'issues' issue-inactive-days: '30' diff --git a/.github/workflows/publish_pypi.yml b/.github/workflows/publish_pypi.yml index 9f432874cc2..fcc6f2a9999 100644 --- a/.github/workflows/publish_pypi.yml +++ b/.github/workflows/publish_pypi.yml @@ -8,6 +8,8 @@ on: types: - published +permissions: {} + jobs: upload_pypi: runs-on: ubuntu-latest @@ -21,7 +23,7 @@ jobs: # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this) # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') steps: - - uses: robinraju/release-downloader@v1 + - uses: robinraju/release-downloader@daf26c55d821e836577a15f77d86ddc078948b05 # v1 with: tag: ${{ github.event.release.tag_name }} fileName: '*' diff --git a/.github/workflows/spacy_universe_alert.yml b/.github/workflows/spacy_universe_alert.yml index 01731ffe0d7..ec0230699be 100644 --- a/.github/workflows/spacy_universe_alert.yml +++ b/.github/workflows/spacy_universe_alert.yml @@ -5,21 +5,16 @@ on: paths: - "website/meta/universe.json" +permissions: {} + jobs: build: if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - - name: Dump GitHub context - env: - GITHUB_CONTEXT: ${{ toJson(github) }} - PR_NUMBER: ${{github.event.number}} - run: | - echo "$GITHUB_CONTEXT" - - - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: python-version: '3.10' - name: Install Bernadette app dependency and send an alert diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index bb4eb278131..b20dba12f04 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -19,6 +19,8 @@ on: - "*.mdx" - "website/**" +permissions: {} + jobs: validate: name: Validate @@ -26,49 +28,38 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 - name: Configure Python version - uses: actions/setup-python@v4 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: python-version: "3.10" - - name: black - run: | - python -m pip install black -c requirements.txt - python -m black spacy --check - - name: isort + - name: ruff format run: | - python -m pip install isort -c requirements.txt - python -m isort spacy --check - - name: flake8 + python -m pip install ruff -c requirements.txt + python -m ruff format spacy --check + - name: ruff isort run: | - python -m pip install flake8==5.0.4 - python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics - # Unfortunately cython-lint isn't working after the shift to Cython 3. - #- name: cython-lint - # run: | - # python -m pip install cython-lint -c requirements.txt - # # E501: line too log, W291: trailing whitespace, E266: too many leading '#' for block comment - # cython-lint spacy --ignore E501,W291,E266 + python -m ruff check spacy --select I tests: name: Test needs: Validate strategy: - fail-fast: true + fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macos-latest] - python_version: ["3.10", "3.11", "3.12", "3.13"] + python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"] runs-on: ${{ matrix.os }} steps: - name: Check out repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 - name: Configure Python version - uses: actions/setup-python@v4 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: python-version: ${{ matrix.python_version }} @@ -104,7 +95,7 @@ jobs: shell: bash - name: Test import - run: python -W error -c "import spacy" + run: python -W error -W 'ignore:Core Pydantic V1:UserWarning:pydantic' -c "import spacy" - name: "Test download CLI" run: | @@ -165,7 +156,7 @@ jobs: - name: "Run CPU tests" run: | - python -m pytest --pyargs spacy -W error + python -m pytest --pyargs spacy -W error -W 'ignore:Core Pydantic V1:UserWarning:pydantic' if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')" - name: "Run CPU tests with thinc-apple-ops" diff --git a/.github/workflows/universe_validation.yml b/.github/workflows/universe_validation.yml index ce7df49dbae..e97850cd4b0 100644 --- a/.github/workflows/universe_validation.yml +++ b/.github/workflows/universe_validation.yml @@ -13,6 +13,8 @@ on: paths: - "website/meta/universe.json" +permissions: {} + jobs: validate: name: Validate @@ -20,10 +22,10 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 - name: Configure Python version - uses: actions/setup-python@v4 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: python-version: "3.7" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e2c5e98fd97..7d57c3a0c56 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,13 +1,7 @@ repos: -- repo: https://github.com/ambv/black - rev: 22.3.0 +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.9.0 hooks: - - id: black - language_version: python3.7 - additional_dependencies: ['click==8.0.4'] -- repo: https://github.com/pycqa/flake8 - rev: 5.0.4 - hooks: - - id: flake8 - args: - - "--config=setup.cfg" + - id: ruff + args: ['--fix'] + - id: ruff-format diff --git a/MANIFEST.in b/MANIFEST.in index 1caf758464f..36465ea94a0 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml *.hh +recursive-include spacy_cli *.json include LICENSE include README.md include pyproject.toml diff --git a/lint.sh b/lint.sh new file mode 100755 index 00000000000..0ec0bda3f6b --- /dev/null +++ b/lint.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +# Local lint script matching the CI Validate job + mypy type checks. +# Fixes formatting and import sorting in-place, then re-verifies in +# check mode to catch any conflicts between the two, and runs mypy. +set -euo pipefail + +err=0 + +echo "==> ruff format (auto-fixing)" +python -m ruff format spacy + +echo "==> ruff isort (auto-fixing)" +python -m ruff check spacy --select I --fix + +echo "==> ruff format (verify)" +if ! python -m ruff format spacy --check; then + echo "FAIL: isort fix broke formatting" + err=1 +fi + +echo "==> ruff isort (verify)" +if ! python -m ruff check spacy --select I; then + echo "FAIL: format fix broke import sorting" + err=1 +fi + +echo "==> mypy" +if ! python -m mypy spacy; then + err=1 +fi + +if [ "$err" -ne 0 ]; then + echo "FAIL: see errors above" + exit 1 +fi + +echo "OK: all checks passed" diff --git a/pyproject.toml b/pyproject.toml index 64b71429e6e..395c2f7a108 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.3.4,<8.4.0", + "thinc>=8.3.12,<8.4.0", "numpy>=2.0.0,<3.0.0" ] build-backend = "setuptools.build_meta" @@ -62,5 +62,13 @@ repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest [tool.cibuildwheel.pyodide] -[tool.isort] -profile = "black" +[tool.ruff] +line-length = 88 + +[tool.ruff.lint] +select = ["E", "F", "W", "C", "B", "B9"] +ignore = ["E203", "E266", "E501", "E731", "E741", "F541"] + +[tool.ruff.lint.isort] +combine-as-imports = true +split-on-trailing-comma = true diff --git a/requirements.txt b/requirements.txt index 6e79ed526bd..50c6382bea3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,19 +3,19 @@ spacy-legacy>=3.0.11,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.3.4,<8.4.0 -ml_datasets>=0.2.0,<0.3.0 +thinc>=8.3.12,<8.4.0 +ml_datasets>=0.2.1,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.9.1,<1.2.0 -srsly>=2.4.3,<3.0.0 +srsly>=2.5.3,<3.0.0 catalogue>=2.0.6,<2.1.0 -typer-slim>=0.3.0,<1.0.0 -weasel>=0.4.2,<0.5.0 +typer>=0.3.0,<1.0.0 +weasel>=1.0.0,<2.0.0 # Third party dependencies numpy>=2.0.0,<3.0.0 requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 -pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 +pydantic>=2.0.0,<3.0.0 jinja2 # Official Python utilities setuptools @@ -26,13 +26,12 @@ cython>=3.0,<4.0 pytest>=5.2.0,!=7.1.0 pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 -flake8>=3.8.0,<6.0.0 hypothesis>=3.27.0,<7.0.0 mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8" types-mock>=0.1.1 types-setuptools>=57.0.0 types-requests types-setuptools>=57.0.0 -black>=25.0.0 +ruff>=0.9.0 cython-lint>=0.15.0 -isort>=5.0,<6.0 +confection>=1.1.0,<2.0.0 diff --git a/setup.cfg b/setup.cfg index c4928af9224..83147ad0d48 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,6 +22,7 @@ classifiers = Programming Language :: Python :: 3.11 Programming Language :: Python :: 3.12 Programming Language :: Python :: 3.13 + Programming Language :: Python :: 3.14 Topic :: Scientific/Engineering project_urls = Release notes = https://github.com/explosion/spaCy/releases @@ -41,7 +42,7 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.3.4,<8.4.0 + thinc>=8.3.12,<8.4.0 install_requires = # Our libraries spacy-legacy>=3.0.11,<3.1.0 @@ -49,18 +50,19 @@ install_requires = murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.3.4,<8.4.0 + thinc>=8.3.12,<8.4.0 wasabi>=0.9.1,<1.2.0 - srsly>=2.4.3,<3.0.0 + srsly>=2.5.3,<3.0.0 catalogue>=2.0.6,<2.1.0 - weasel>=0.4.2,<0.5.0 + weasel>=1.0.0,<2.0.0 + confection>=1.1.0,<2.0.0 # Third-party dependencies - typer-slim>=0.3.0,<1.0.0 + typer>=0.3.0,<1.0.0 tqdm>=4.38.0,<5.0.0 numpy>=1.15.0; python_version < "3.9" numpy>=1.19.0; python_version >= "3.9" requests>=2.13.0,<3.0.0 - pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 + pydantic>=2.0.0,<3.0.0 jinja2 # Official Python utilities setuptools @@ -68,7 +70,7 @@ install_requires = [options.entry_points] console_scripts = - spacy = spacy.cli:setup_cli + spacy = spacy_cli.main:main [options.extras_require] lookups = @@ -130,20 +132,13 @@ universal = false [sdist] formats = gztar -[flake8] -ignore = E203, E266, E501, E731, W503, E741, F541 -max-line-length = 80 -select = B,C,E,F,W,T4,B9 -exclude = - .env, - .git, - __pycache__, - _tokenizer_exceptions_list.py, - [tool:pytest] markers = slow: mark a test as slow issue: reference specific issue +filterwarnings = + error + ignore:Core Pydantic V1:UserWarning:pydantic [mypy] ignore_missing_imports = True diff --git a/setup.py b/setup.py index 33178662df4..e18e98b9249 100755 --- a/setup.py +++ b/setup.py @@ -82,9 +82,9 @@ } # Files to copy into the package that are otherwise not included COPY_FILES = { - ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package", - ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package", - ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package", + ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package" / "test.cfg", + ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package" / "test.toml", + ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package" / "test.txt", } @@ -158,10 +158,10 @@ def _minimal_ext_cmd(cmd): def clean(path): - for path in path.glob("**/*"): - if path.is_file() and path.suffix in (".so", ".cpp", ".html"): - print(f"Deleting {path.name}") - path.unlink() + for child in path.glob("**/*"): + if child.is_file() and child.suffix in (".so", ".cpp", ".html"): + print(f"Deleting {child.name}") + child.unlink() def setup_package(): @@ -173,10 +173,10 @@ def setup_package(): about = {} exec(f.read(), about) - for copy_file, target_dir in COPY_FILES.items(): + for copy_file, target_file in COPY_FILES.items(): if copy_file.exists(): - shutil.copy(str(copy_file), str(target_dir)) - print(f"Copied {copy_file} -> {target_dir}") + shutil.copyfile(str(copy_file), str(target_file)) + print(f"Copied {copy_file} -> {target_file}") include_dirs = [ numpy.get_include(), @@ -213,7 +213,7 @@ def setup_package(): version=about["__version__"], ext_modules=ext_modules, cmdclass={"build_ext": build_ext_subclass}, - package_data={"": ["*.pyx", "*.pxd", "*.pxi"]}, + package_data={"": ["*.pyx", "*.pxd", "*.pxi"], "spacy_cli": ["*.json"]}, ) diff --git a/spacy/__init__.py b/spacy/__init__.py index 8bb8b49498e..5b3ff25c872 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -10,17 +10,39 @@ # These are imported as part of the API from thinc.api import Config, prefer_gpu, require_cpu, require_gpu # noqa: F401 -from . import pipeline # noqa: F401 -from . import util +from . import ( + pipeline, # noqa: F401 + util, +) from .about import __version__ # noqa: F401 from .cli.info import info # noqa: F401 from .errors import Errors from .glossary import explain # noqa: F401 from .language import Language from .registrations import REGISTRY_POPULATED, populate_registry + +# Rebuild pydantic v2 schemas that use forward references to Language/Vocab +from .schemas import ( # noqa: F401 + ConfigSchema, + ConfigSchemaInit, + ConfigSchemaNlp, + ConfigSchemaPretrain, + ConfigSchemaTraining, +) +from .training import Example # noqa: F401 from .util import logger, registry # noqa: F401 from .vocab import Vocab +_rebuild_ns = {"Language": Language, "Vocab": Vocab, "Example": Example} +for _schema in ( + ConfigSchemaTraining, + ConfigSchemaNlp, + ConfigSchemaPretrain, + ConfigSchemaInit, + ConfigSchema, +): + _schema.model_rebuild(_types_namespace=_rebuild_ns) # type: ignore[attr-defined] + if sys.maxunicode == 65535: raise SystemError(Errors.E130) diff --git a/spacy/about.py b/spacy/about.py index a93d91532b6..df33ff96bfe 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,5 +1,5 @@ # fmt: off __title__ = "spacy" -__version__ = "3.8.11" +__version__ = "3.8.12" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 3095778fe22..f176a2eabad 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -1,40 +1,96 @@ +import sys +import types +from importlib import import_module +from typing import Iterable + +from typer.main import get_command from wasabi import msg -# Needed for testing -from . import download as download_module # noqa: F401 -from ._util import app, setup_cli # noqa: F401 -from .apply import apply # noqa: F401 -from .assemble import assemble_cli # noqa: F401 - -# These are the actual functions, NOT the wrapped CLI commands. The CLI commands -# are registered automatically and won't have to be imported here. -from .benchmark_speed import benchmark_speed_cli # noqa: F401 -from .convert import convert # noqa: F401 -from .debug_config import debug_config # noqa: F401 -from .debug_data import debug_data # noqa: F401 -from .debug_diff import debug_diff # noqa: F401 -from .debug_model import debug_model # noqa: F401 -from .download import download # noqa: F401 -from .evaluate import evaluate # noqa: F401 -from .find_function import find_function # noqa: F401 -from .find_threshold import find_threshold # noqa: F401 -from .info import info # noqa: F401 -from .init_config import fill_config, init_config # noqa: F401 -from .init_pipeline import init_pipeline_cli # noqa: F401 -from .package import package # noqa: F401 -from .pretrain import pretrain # noqa: F401 -from .profile import profile # noqa: F401 -from .project.assets import project_assets # type: ignore[attr-defined] # noqa: F401 -from .project.clone import project_clone # type: ignore[attr-defined] # noqa: F401 -from .project.document import ( # type: ignore[attr-defined] # noqa: F401 - project_document, +from ..util import registry +from ._dispatch import ( + GROUP_MODULES, + PUBLIC_ATTRS, + SUBCOMMAND_MODULES, + TOP_LEVEL_MODULES, + iter_builtin_modules, ) -from .project.dvc import project_update_dvc # type: ignore[attr-defined] # noqa: F401 -from .project.pull import project_pull # type: ignore[attr-defined] # noqa: F401 -from .project.push import project_push # type: ignore[attr-defined] # noqa: F401 -from .project.run import project_run # type: ignore[attr-defined] # noqa: F401 -from .train import train_cli # type: ignore[attr-defined] # noqa: F401 -from .validate import validate # type: ignore[attr-defined] # noqa: F401 +from ._util import COMMAND, add_project_cli, app + +HELP_OPTIONS = {"--help", "-h"} +ROOT_OPTIONS = HELP_OPTIONS | {"--install-completion", "--show-completion"} + +__all__ = [ + "app", + "load_all_commands", + "load_for_argv", + "setup_cli", + *sorted(PUBLIC_ATTRS), +] + + +def _import_modules(module_names: Iterable[str]) -> None: + for module_name in module_names: + import_module(module_name) + + +def load_all_commands() -> None: + _import_modules(iter_builtin_modules()) + add_project_cli() + + +def load_for_argv(argv: Iterable[str]) -> None: + args = list(argv) + if not args or args[0] in ROOT_OPTIONS or args[0].startswith("-"): + load_all_commands() + return + command = args[0] + if command == "project": + add_project_cli() + return + if command in GROUP_MODULES: + subcommand = args[1] if len(args) > 1 and not args[1].startswith("-") else None + if subcommand is not None and (command, subcommand) in SUBCOMMAND_MODULES: + _import_modules(SUBCOMMAND_MODULES[(command, subcommand)]) + return + _import_modules(GROUP_MODULES[command]) + return + if command in TOP_LEVEL_MODULES: + _import_modules(TOP_LEVEL_MODULES[command]) + + +def setup_cli() -> None: + # Make sure entry-point CLI integrations are imported before command dispatch. + registry.cli.get_all() + load_for_argv(sys.argv[1:]) + command = get_command(app) + command(prog_name=COMMAND) + + +def __getattr__(name: str): + if name not in PUBLIC_ATTRS: + raise AttributeError(f"module 'spacy.cli' has no attribute {name!r}") + module_name, attr_name = PUBLIC_ATTRS[name] + module = import_module(module_name) + value = module if attr_name is None else getattr(module, attr_name) + globals()[name] = value + return value + + +def __dir__(): + return sorted(set(globals()) | set(PUBLIC_ATTRS)) + + +class _CLIModule(types.ModuleType): + def __setattr__(self, name, value): + if isinstance(value, types.ModuleType) and name in PUBLIC_ATTRS: + _, attr_name = PUBLIC_ATTRS[name] + if attr_name is not None: + super().__setattr__(name, getattr(value, attr_name)) + return + super().__setattr__(name, value) + + +sys.modules[__name__].__class__ = _CLIModule @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) diff --git a/spacy/cli/_dispatch.py b/spacy/cli/_dispatch.py new file mode 100644 index 00000000000..5ee4f654d39 --- /dev/null +++ b/spacy/cli/_dispatch.py @@ -0,0 +1,104 @@ +from typing import Dict, Iterable, Optional, Tuple + +CommandPath = Tuple[str, ...] + + +TOP_LEVEL_MODULES: Dict[str, Tuple[str, ...]] = { + "apply": ("spacy.cli.apply",), + "assemble": ("spacy.cli.assemble",), + "convert": ("spacy.cli.convert",), + "debug-data": ("spacy.cli.debug_data",), + "download": ("spacy.cli.download",), + "evaluate": ("spacy.cli.evaluate",), + "find-function": ("spacy.cli.find_function",), + "find-threshold": ("spacy.cli.find_threshold",), + "info": ("spacy.cli.info",), + "package": ("spacy.cli.package",), + "pretrain": ("spacy.cli.pretrain",), + "profile": ("spacy.cli.profile",), + "train": ("spacy.cli.train",), + "validate": ("spacy.cli.validate",), +} + + +GROUP_MODULES: Dict[str, Tuple[str, ...]] = { + "benchmark": ( + "spacy.cli.benchmark_speed", + "spacy.cli.evaluate", + ), + "debug": ( + "spacy.cli.debug_config", + "spacy.cli.debug_data", + "spacy.cli.debug_diff", + "spacy.cli.debug_model", + "spacy.cli.profile", + ), + "init": ( + "spacy.cli.init_config", + "spacy.cli.init_pipeline", + ), +} + + +SUBCOMMAND_MODULES: Dict[CommandPath, Tuple[str, ...]] = { + ("benchmark", "accuracy"): ("spacy.cli.evaluate",), + ("benchmark", "speed"): ("spacy.cli.benchmark_speed",), + ("debug", "config"): ("spacy.cli.debug_config",), + ("debug", "data"): ("spacy.cli.debug_data",), + ("debug", "diff-config"): ("spacy.cli.debug_diff",), + ("debug", "model"): ("spacy.cli.debug_model",), + ("debug", "profile"): ("spacy.cli.profile",), + ("init", "config"): ("spacy.cli.init_config",), + ("init", "fill-config"): ("spacy.cli.init_config",), + ("init", "labels"): ("spacy.cli.init_pipeline",), + ("init", "nlp"): ("spacy.cli.init_pipeline",), + ("init", "vectors"): ("spacy.cli.init_pipeline",), +} + + +PUBLIC_ATTRS: Dict[str, Tuple[str, Optional[str]]] = { + "app": ("spacy.cli._util", "app"), + "apply": ("spacy.cli.apply", "apply"), + "assemble_cli": ("spacy.cli.assemble", "assemble_cli"), + "benchmark_speed_cli": ("spacy.cli.benchmark_speed", "benchmark_speed_cli"), + "convert": ("spacy.cli.convert", "convert"), + "debug_config": ("spacy.cli.debug_config", "debug_config"), + "debug_data": ("spacy.cli.debug_data", "debug_data"), + "debug_diff": ("spacy.cli.debug_diff", "debug_diff"), + "debug_model": ("spacy.cli.debug_model", "debug_model"), + "download": ("spacy.cli.download", "download"), + "download_module": ("spacy.cli.download", None), + "evaluate": ("spacy.cli.evaluate", "evaluate"), + "fill_config": ("spacy.cli.init_config", "fill_config"), + "find_function": ("spacy.cli.find_function", "find_function"), + "find_threshold": ("spacy.cli.find_threshold", "find_threshold"), + "info": ("spacy.cli.info", "info"), + "init_config": ("spacy.cli.init_config", "init_config"), + "init_pipeline_cli": ("spacy.cli.init_pipeline", "init_pipeline_cli"), + "package": ("spacy.cli.package", "package"), + "pretrain": ("spacy.cli.pretrain", "pretrain"), + "profile": ("spacy.cli.profile", "profile"), + "project_assets": ("spacy.cli.project.assets", "project_assets"), + "project_clone": ("spacy.cli.project.clone", "project_clone"), + "project_document": ("spacy.cli.project.document", "project_document"), + "project_pull": ("spacy.cli.project.pull", "project_pull"), + "project_push": ("spacy.cli.project.push", "project_push"), + "project_run": ("spacy.cli.project.run", "project_run"), + "project_update_dvc": ("spacy.cli.project.dvc", "project_update_dvc"), + "train_cli": ("spacy.cli.train", "train_cli"), + "validate": ("spacy.cli.validate", "validate"), +} + + +def iter_builtin_modules() -> Iterable[str]: + seen = set() + for modules in TOP_LEVEL_MODULES.values(): + for module in modules: + if module not in seen: + seen.add(module) + yield module + for modules in GROUP_MODULES.values(): + for module in modules: + if module not in seen: + seen.add(module) + yield module diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 309b6b1e79a..35f899b2cf8 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -1,15 +1,11 @@ -import hashlib import os -import shutil import sys from configparser import InterpolationError from contextlib import contextmanager from pathlib import Path from typing import ( - TYPE_CHECKING, Any, Dict, - Iterable, List, Optional, Tuple, @@ -21,23 +17,15 @@ import typer from click import NoSuchOption from click.shell_completion import split_arg_string -from thinc.api import Config, ConfigValidationError, require_gpu +from thinc.api import ConfigValidationError, require_gpu from thinc.util import gpu_is_available -from typer.main import get_command from wasabi import Printer, msg -from weasel import app as project_cli -from .. import about from ..compat import Literal -from ..schemas import validate from ..util import ( ENV_VARS, - SimpleFrozenDict, import_file, - is_compatible_version, logger, - make_tempdir, - registry, run_command, ) @@ -68,23 +56,25 @@ Arg = typer.Argument Opt = typer.Option -app = typer.Typer(name=NAME, help=HELP) +app = typer.Typer(name=NAME, help=HELP, rich_markup_mode=None) benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True) debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True) init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True) +_PROJECT_CLI_ADDED = False -app.add_typer(project_cli, name="project", help=PROJECT_HELP, no_args_is_help=True) app.add_typer(debug_cli) app.add_typer(benchmark_cli) app.add_typer(init_cli) -def setup_cli() -> None: - # Make sure the entry-point for CLI runs, so that they get imported. - registry.cli.get_all() - # Ensure that the help messages always display the correct prompt - command = get_command(app) - command(prog_name=COMMAND) +def add_project_cli() -> None: + global _PROJECT_CLI_ADDED + if _PROJECT_CLI_ADDED: + return + from weasel import app as project_cli + + app.add_typer(project_cli, name="project", help=PROJECT_HELP, no_args_is_help=True) + _PROJECT_CLI_ADDED = True def parse_config_overrides( @@ -215,8 +205,8 @@ def get_git_version( """ try: ret = run_command("git --version", capture=True) - except: - raise RuntimeError(error) + except Exception as err: + raise RuntimeError(error) from err stdout = ret.stdout.strip() if not stdout or not stdout.startswith("git version"): return 0, 0 diff --git a/spacy/cli/apply.py b/spacy/cli/apply.py index ffd8105060a..7671026f488 100644 --- a/spacy/cli/apply.py +++ b/spacy/cli/apply.py @@ -22,7 +22,7 @@ out_help = "Path to save the resulting .spacy file" code_help = ( - "Path to Python file with additional " "code (registered functions) to be imported" + "Path to Python file with additional code (registered functions) to be imported" ) gold_help = "Use gold preprocessing provided in the .spacy files" force_msg = ( @@ -72,11 +72,15 @@ def apply_cli( data_path: Path = Arg(..., help=path_help, exists=True), output_file: Path = Arg(..., help=out_help, dir_okay=False), code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help), - text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"), - force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"), + text_key: str = Opt( + "text", "--text-key", "-tk", help="Key containing text string for JSONL" + ), + force_overwrite: bool = Opt( + False, "--force", "-F", help="Force overwriting the output file" + ), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."), batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."), - n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use.") + n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use."), ): """ Apply a trained pipeline to documents to get predictions. @@ -114,8 +118,7 @@ def apply( if len(paths) == 0: docbin.to_disk(output_file) msg.warn( - "Did not find data to process," - f" {data_path} seems to be an empty directory." + f"Did not find data to process, {data_path} seems to be an empty directory." ) return nlp = load_model(model) diff --git a/spacy/cli/assemble.py b/spacy/cli/assemble.py index f74bbacb555..bc97a9d594f 100644 --- a/spacy/cli/assemble.py +++ b/spacy/cli/assemble.py @@ -24,10 +24,25 @@ def assemble_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - output_path: Path = Arg(..., help="Output directory to store assembled pipeline in"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), + config_path: Path = Arg( + ..., help="Path to config file", exists=True, allow_dash=True + ), + output_path: Path = Arg( + ..., help="Output directory to store assembled pipeline in" + ), + code_path: Optional[Path] = Opt( + None, + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), + verbose: bool = Opt( + False, + "--verbose", + "-V", + "-VV", + help="Display more information for debugging purposes", + ), # fmt: on ): """ diff --git a/spacy/cli/benchmark_speed.py b/spacy/cli/benchmark_speed.py index 4dd10049cda..052e7d43416 100644 --- a/spacy/cli/benchmark_speed.py +++ b/spacy/cli/benchmark_speed.py @@ -24,13 +24,29 @@ def benchmark_speed_cli( # fmt: off ctx: typer.Context, model: str = Arg(..., help="Model name or path"), - data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True), - batch_size: Optional[int] = Opt(None, "--batch-size", "-b", min=1, help="Override the pipeline batch size"), + data_path: Path = Arg( + ..., help="Location of binary evaluation data in .spacy format", exists=True + ), + batch_size: Optional[int] = Opt( + None, "--batch-size", "-b", min=1, help="Override the pipeline batch size" + ), no_shuffle: bool = Opt(False, "--no-shuffle", help="Do not shuffle benchmark data"), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), - n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,), - warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + n_batches: int = Opt( + 50, + "--batches", + help="Minimum number of batches to benchmark", + min=30, + ), + warmup_epochs: int = Opt( + 3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup" + ), + code_path: Optional[Path] = Opt( + None, + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), # fmt: on ): """ @@ -151,7 +167,7 @@ def print_mean_with_ci(sample: numpy.ndarray): low = bootstrap_means[int(len(bootstrap_means) * 0.025)] high = bootstrap_means[int(len(bootstrap_means) * 0.975)] - print(f"Mean: {mean:.1f} words/s (95% CI: {low-mean:.1f} +{high-mean:.1f})") + print(f"Mean: {mean:.1f} words/s (95% CI: {low - mean:.1f} +{high - mean:.1f})") def print_outliers(sample: numpy.ndarray): diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index a66a68133b3..140999207f3 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -48,17 +48,47 @@ class FileTypes(str, Enum): def convert_cli( # fmt: off input_path: str = Arg(..., help="Input file or directory", exists=True), - output_dir: Path = Arg("-", help="Output directory. '-' for stdout.", allow_dash=True, exists=True), - file_type: FileTypes = Opt("spacy", "--file-type", "-t", help="Type of data to produce"), - n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"), - seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"), - model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"), - morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"), - merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"), - converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"), - ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True), - lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"), - concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"), + output_dir: Path = Arg( + "-", help="Output directory. '-' for stdout.", allow_dash=True, exists=True + ), + file_type: FileTypes = Opt( + "spacy", "--file-type", "-t", help="Type of data to produce" + ), + n_sents: int = Opt( + 1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)" + ), + seg_sents: bool = Opt( + False, "--seg-sents", "-s", help="Segment sentences (for -c ner)" + ), + model: Optional[str] = Opt( + None, + "--model", + "--base", + "-b", + help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)", + ), + morphology: bool = Opt( + False, "--morphology", "-m", help="Enable appending morphology to tags" + ), + merge_subtokens: bool = Opt( + False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens" + ), + converter: str = Opt( + AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}" + ), + ner_map: Optional[Path] = Opt( + None, + "--ner-map", + "-nm", + help="NER tag mapping (as JSON-encoded dict of entity types)", + exists=True, + ), + lang: Optional[str] = Opt( + None, "--lang", "-l", help="Language (if tokenizer required)" + ), + concatenate: bool = Opt( + None, "--concatenate", "-C", help="Concatenate output to a single file" + ), # fmt: on ): """ diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py index 0e5382cd956..4876b6ff9e1 100644 --- a/spacy/cli/debug_config.py +++ b/spacy/cli/debug_config.py @@ -26,10 +26,28 @@ def debug_config_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"), - show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.") + config_path: Path = Arg( + ..., help="Path to config file", exists=True, allow_dash=True + ), + code_path: Optional[Path] = Opt( + None, + "--code-path", + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), + show_funcs: bool = Opt( + False, + "--show-functions", + "-F", + help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)", + ), + show_vars: bool = Opt( + False, + "--show-variables", + "-V", + help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.", + ), # fmt: on ): """Debug a config file and show validation errors. The command will @@ -64,10 +82,10 @@ def debug_config( config = nlp.config.interpolate() msg.divider("Config validation for [initialize]") with show_validation_error(config_path): - T = registry.resolve(config["initialize"], schema=ConfigSchemaInit) + T = registry.resolve(config["initialize"], schema=ConfigSchemaInit) # type: ignore[arg-type] msg.divider("Config validation for [training]") with show_validation_error(config_path): - T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) # type: ignore[arg-type] dot_names = [T["train_corpus"], T["dev_corpus"]] util.resolve_dot_names(config, dot_names) msg.good("Config is valid") diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 1c9c0e0ea3a..6ba18e7f224 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -71,11 +71,28 @@ def debug_data_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"), - verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"), - no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"), + config_path: Path = Arg( + ..., help="Path to config file", exists=True, allow_dash=True + ), + code_path: Optional[Path] = Opt( + None, + "--code-path", + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), + ignore_warnings: bool = Opt( + False, + "--ignore-warnings", + "-IW", + help="Ignore warnings, only show stats and errors", + ), + verbose: bool = Opt( + False, "--verbose", "-V", help="Print additional information and explanations" + ), + no_format: bool = Opt( + False, "--no-format", "-NF", help="Don't pretty-print the results" + ), # fmt: on ): """ @@ -120,7 +137,7 @@ def debug_data( cfg = util.load_config(config_path, overrides=config_overrides) nlp = util.load_model_from_config(cfg) config = nlp.config.interpolate() - T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) # type: ignore[arg-type] # Use original config here, not resolved version sourced_components = get_sourced_components(cfg) frozen_components = T["frozen_components"] @@ -708,7 +725,7 @@ def debug_data( if len(dev_not_train) != 0: pct = len(dev_not_train) / len(trees_dev) msg.info( - f"{len(dev_not_train)} lemmatizer trees ({pct*100:.1f}% of dev trees)" + f"{len(dev_not_train)} lemmatizer trees ({pct * 100:.1f}% of dev trees)" " were found exclusively in the dev data." ) else: diff --git a/spacy/cli/debug_diff.py b/spacy/cli/debug_diff.py index c53b0acab50..71d8826bc5e 100644 --- a/spacy/cli/debug_diff.py +++ b/spacy/cli/debug_diff.py @@ -2,11 +2,10 @@ from typing import Optional import typer -from thinc.api import Config from wasabi import MarkdownRenderer, Printer, diff_strings from ..util import load_config -from ._util import Arg, Opt, debug_cli, parse_config_overrides, show_validation_error +from ._util import Arg, Opt, debug_cli, show_validation_error from .init_config import Optimizations, init_config @@ -17,12 +16,36 @@ def debug_diff_cli( # fmt: off ctx: typer.Context, - config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - compare_to: Optional[Path] = Opt(None, help="Path to a config file to diff against, or `None` to compare against default settings", exists=True, allow_dash=True), - optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config."), - gpu: bool = Opt(False, "--gpu", "-G", help="Whether the original config can run on a GPU. Only relevant when comparing against the default config."), - pretraining: bool = Opt(False, "--pretraining", "--pt", help="Whether to compare on a config with pretraining involved. Only relevant when comparing against the default config."), - markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues") + config_path: Path = Arg( + ..., help="Path to config file", exists=True, allow_dash=True + ), + compare_to: Optional[Path] = Opt( + None, + help="Path to a config file to diff against, or `None` to compare against default settings", + exists=True, + allow_dash=True, + ), + optimize: Optimizations = Opt( + Optimizations.efficiency.value, + "--optimize", + "-o", + help="Whether the user config was optimized for efficiency or accuracy. Only relevant when comparing against the default config.", + ), + gpu: bool = Opt( + False, + "--gpu", + "-G", + help="Whether the original config can run on a GPU. Only relevant when comparing against the default config.", + ), + pretraining: bool = Opt( + False, + "--pretraining", + "--pt", + help="Whether to compare on a config with pretraining involved. Only relevant when comparing against the default config.", + ), + markdown: bool = Opt( + False, "--markdown", "-md", help="Generate Markdown for GitHub issues" + ), # fmt: on ): """Show a diff of a config file with respect to spaCy's defaults or another config file. If diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 3c667e42a2b..dc0de3e1489 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -36,18 +36,26 @@ def debug_model_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - component: str = Arg(..., help="Name of the pipeline component of which the model should be analysed"), - layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"), + config_path: Path = Arg( + ..., help="Path to config file", exists=True, allow_dash=True + ), + component: str = Arg( + ..., help="Name of the pipeline component of which the model should be analysed" + ), + layers: str = Opt( + "", "--layers", "-l", help="Comma-separated names of layer IDs to print" + ), dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"), parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"), gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"), attributes: bool = Opt(False, "--attributes", "-ATTR", help="Show attributes"), P0: bool = Opt(False, "--print-step0", "-P0", help="Print model before training"), - P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"), + P1: bool = Opt( + False, "--print-step1", "-P1", help="Print model after initialization" + ), P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"), P3: bool = Opt(False, "--print-step3", "-P3", help="Print final predictions"), - use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") + use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), # fmt: on ): """ @@ -81,7 +89,7 @@ def debug_model_cli( with show_validation_error(config_path): nlp = util.load_model_from_config(raw_config) config = nlp.config.interpolate() - T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) # type: ignore[arg-type] seed = T["seed"] if seed is not None: msg.info(f"Fixing random seed: {seed}") diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 8104fd2d285..8a1110dcef3 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -1,3 +1,4 @@ +import shutil import sys from typing import Optional, Sequence from urllib.parse import urljoin @@ -27,9 +28,16 @@ def download_cli( # fmt: off ctx: typer.Context, model: str = Arg(..., help="Name of pipeline package to download"), - direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"), - sdist: bool = Opt(False, "--sdist", "-S", help="Download sdist (.tar.gz) archive instead of pre-built binary wheel"), - url: str = Opt(None, "--url", "-U", help="Download from given url") + direct: bool = Opt( + False, "--direct", "-d", "-D", help="Force direct download of name + version" + ), + sdist: bool = Opt( + False, + "--sdist", + "-S", + help="Download sdist (.tar.gz) archive instead of pre-built binary wheel", + ), + url: str = Opt(None, "--url", "-U", help="Download from given url"), # fmt: on ): """ @@ -176,5 +184,19 @@ def download_model( if not download_url.startswith(about.__download_url__): raise ValueError(f"Download from {filename} rejected. Was it a relative path?") pip_args = list(user_pip_args) if user_pip_args is not None else [] - cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url] + cmd = _get_pip_install_cmd() + pip_args + [download_url] run_command(cmd) + + +def _get_pip_install_cmd() -> list: + if shutil.which("pip"): + return [sys.executable, "-m", "pip", "install"] + elif shutil.which("uv"): + return ["uv", "pip", "install"] + else: + msg.fail( + "No package installer found", + "spaCy requires either pip or uv to download models. " + "Please install one of them and try again.", + exits=1, + ) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 2276ca6b0d4..9704ea44413 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -1,13 +1,12 @@ import re from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional import srsly from thinc.api import fix_random_seed from wasabi import Printer from .. import displacy, util -from ..scorer import Scorer from ..tokens import Doc from ..training import Corpus from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu @@ -20,15 +19,42 @@ def evaluate_cli( # fmt: off model: str = Arg(..., help="Model name or path"), - data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True), - output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + data_path: Path = Arg( + ..., help="Location of binary evaluation data in .spacy format", exists=True + ), + output: Optional[Path] = Opt( + None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False + ), + code_path: Optional[Path] = Opt( + None, + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), - gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), - displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), - displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), - per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."), - spans_key: str = Opt("sc", "--spans-key", "-sk", help="Spans key to use when evaluating Doc.spans"), + gold_preproc: bool = Opt( + False, "--gold-preproc", "-G", help="Use gold preprocessing" + ), + displacy_path: Optional[Path] = Opt( + None, + "--displacy-path", + "-dp", + help="Directory to output rendered parses as HTML", + exists=True, + file_okay=False, + ), + displacy_limit: int = Opt( + 25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML" + ), + per_component: bool = Opt( + False, + "--per-component", + "-P", + help="Return scores per component, only applicable when an output JSON file is specified.", + ), + spans_key: str = Opt( + "sc", "--spans-key", "-sk", help="Spans key to use when evaluating Doc.spans" + ), # fmt: on ): """ @@ -123,7 +149,7 @@ def evaluate( if key == "speed": results[metric] = f"{scores[key]:.0f}" else: - results[metric] = f"{scores[key]*100:.2f}" + results[metric] = f"{scores[key] * 100:.2f}" else: results[metric] = "-" data[re.sub(r"[\s/]", "_", key.lower())] = scores[key] diff --git a/spacy/cli/find_function.py b/spacy/cli/find_function.py index f99ce2adc9f..3b3b333337b 100644 --- a/spacy/cli/find_function.py +++ b/spacy/cli/find_function.py @@ -11,7 +11,9 @@ def find_function_cli( # fmt: off func_name: str = Arg(..., help="Name of the registered function."), - registry_name: Optional[str] = Opt(None, "--registry", "-r", help="Name of the catalogue registry."), + registry_name: Optional[str] = Opt( + None, "--registry", "-r", help="Name of the catalogue registry." + ), # fmt: on ): """ diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index ff7af32e6f6..1873f476fcd 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -27,15 +27,39 @@ def find_threshold_cli( # fmt: off model: str = Arg(..., help="Model name or path"), - data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True), + data_path: Path = Arg( + ..., help="Location of binary evaluation data in .spacy format", exists=True + ), pipe_name: str = Arg(..., help="Name of pipe to examine thresholds for"), - threshold_key: str = Arg(..., help="Key of threshold attribute in component's configuration"), + threshold_key: str = Arg( + ..., help="Key of threshold attribute in component's configuration" + ), scores_key: str = Arg(..., help="Metric to optimize"), - n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"), - gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"), - verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), + n_trials: int = Opt( + _DEFAULTS["n_trials"], + "--n_trials", + "-n", + help="Number of trials to determine optimal thresholds", + ), + code_path: Optional[Path] = Opt( + None, + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), + use_gpu: int = Opt( + _DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU" + ), + gold_preproc: bool = Opt( + _DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing" + ), + verbose: bool = Opt( + False, + "--verbose", + "-V", + "-VV", + help="Display more information for debugging purposes", + ), # fmt: on ): """ @@ -183,10 +207,10 @@ def filter_config( ), ) if hasattr(pipe, "cfg"): - setattr( - nlp.get_pipe(pipe_name), - "cfg", - set_nested_item(getattr(pipe, "cfg"), config_keys, threshold), + nlp.get_pipe(pipe_name).cfg = set_nested_item( # type: ignore[attr-defined] + pipe.cfg, + config_keys, + threshold, # type: ignore[attr-defined] ) eval_scores = nlp.evaluate(dev_dataset) diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 8bfc6b54f15..ed2394c564e 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -16,10 +16,24 @@ def info_cli( # fmt: off model: Optional[str] = Arg(None, help="Optional loadable spaCy pipeline"), - markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"), - silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"), - exclude: str = Opt("labels", "--exclude", "-e", help="Comma-separated keys to exclude from the print-out"), - url: bool = Opt(False, "--url", "-u", help="Print the URL to download the most recent compatible version of the pipeline"), + markdown: bool = Opt( + False, "--markdown", "-md", help="Generate Markdown for GitHub issues" + ), + silent: bool = Opt( + False, "--silent", "-s", "-S", help="Don't print anything (just return)" + ), + exclude: str = Opt( + "labels", + "--exclude", + "-e", + help="Comma-separated keys to exclude from the print-out", + ), + url: bool = Opt( + False, + "--url", + "-u", + help="Print the URL to download the most recent compatible version of the pipeline", + ), # fmt: on ): """ diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index a7fb2b5b81f..c7081040280 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -49,13 +49,44 @@ class InitValues: @init_cli.command("config") def init_config_cli( # fmt: off - output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True), - lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"), - pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"), - optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."), - gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."), - pretraining: bool = Opt(InitValues.pretraining, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"), - force_overwrite: bool = Opt(InitValues.force_overwrite, "--force", "-F", help="Force overwriting the output file"), + output_file: Path = Arg( + ..., + help="File to save the config to or - for stdout (will only output config and no additional logging info)", + allow_dash=True, + ), + lang: str = Opt( + InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use" + ), + pipeline: str = Opt( + ",".join(InitValues.pipeline), + "--pipeline", + "-p", + help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')", + ), + optimize: Optimizations = Opt( + InitValues.optimize, + "--optimize", + "-o", + help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters.", + ), + gpu: bool = Opt( + InitValues.gpu, + "--gpu", + "-G", + help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters.", + ), + pretraining: bool = Opt( + InitValues.pretraining, + "--pretraining", + "-pt", + help="Include config for pretraining (with 'spacy pretrain')", + ), + force_overwrite: bool = Opt( + InitValues.force_overwrite, + "--force", + "-F", + help="Force overwriting the output file", + ), # fmt: on ): """ @@ -88,11 +119,28 @@ def init_config_cli( @init_cli.command("fill-config") def init_fill_config_cli( # fmt: off - base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False), - output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True), - pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"), - diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"), - code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + base_path: Path = Arg( + ..., help="Path to base config to fill", exists=True, dir_okay=False + ), + output_file: Path = Arg( + "-", help="Path to output .cfg file (or - for stdout)", allow_dash=True + ), + pretraining: bool = Opt( + False, + "--pretraining", + "-pt", + help="Include config for pretraining (with 'spacy pretrain')", + ), + diff: bool = Opt( + False, "--diff", "-D", help="Print a visual diff highlighting the changes" + ), + code_path: Optional[Path] = Opt( + None, + "--code-path", + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), # fmt: on ): """ @@ -168,7 +216,7 @@ def init_config( # Filter out duplicates since tok2vec and transformer are added by template pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")] defaults = RECOMMENDATIONS["__default__"] - reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, defaults)).dict() + reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, defaults)).model_dump() variables = { "lang": lang, "components": pipeline, diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index 21eea8edf2f..1c0ff526235 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -26,13 +26,42 @@ def init_vectors_cli( lang: str = Arg(..., help="The language of the nlp object to create"), vectors_loc: Path = Arg(..., help="Vectors file in Word2Vec format", exists=True), output_dir: Path = Arg(..., help="Pipeline output directory"), - prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"), - truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), + prune: int = Opt( + -1, "--prune", "-p", help="Optional number of vectors to prune to" + ), + truncate: int = Opt( + 0, + "--truncate", + "-t", + help="Optional number of vectors to truncate to when reading in vectors file", + ), mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"), - name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), - verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), - jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True), - attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"), + name: Optional[str] = Opt( + None, + "--name", + "-n", + help="Optional name for the word vectors, e.g. en_core_web_lg.vectors", + ), + verbose: bool = Opt( + False, + "--verbose", + "-V", + "-VV", + help="Display more information for debugging purposes", + ), + jsonl_loc: Optional[Path] = Opt( + None, + "--lexemes-jsonl", + "-j", + help="Location of JSONL-formatted attributes file", + hidden=True, + ), + attr: str = Opt( + "ORTH", + "--attr", + "-a", + help="Optional token attribute to use for vectors, e.g. LOWER or NORM", + ), # fmt: on ): """Convert word vectors for use with spaCy. Will export an nlp object that @@ -81,11 +110,24 @@ def update_lexemes(nlp: Language, jsonl_loc: Path) -> None: def init_pipeline_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), + config_path: Path = Arg( + ..., help="Path to config file", exists=True, allow_dash=True + ), output_path: Path = Arg(..., help="Output directory for the prepared data"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), - use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") + code_path: Optional[Path] = Opt( + None, + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), + verbose: bool = Opt( + False, + "--verbose", + "-V", + "-VV", + help="Display more information for debugging purposes", + ), + use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), # fmt: on ): if verbose: @@ -108,11 +150,24 @@ def init_pipeline_cli( def init_labels_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), + config_path: Path = Arg( + ..., help="Path to config file", exists=True, allow_dash=True + ), output_path: Path = Arg(..., help="Output directory for the labels"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), - use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") + code_path: Optional[Path] = Opt( + None, + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), + verbose: bool = Opt( + False, + "--verbose", + "-V", + "-VV", + help="Display more information for debugging purposes", + ), + use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), # fmt: on ): """Generate JSON files for the labels in the data. This helps speed up the diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 67b1d318651..9291aae2827 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -21,16 +21,56 @@ @app.command("package") def package_cli( # fmt: off - input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False), - output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False), - code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"), - meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False), - create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"), - name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"), - version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"), - build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."), - force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"), - require_parent: bool = Opt(True, "--require-parent/--no-require-parent", "-R", "-R", help="Include the parent package (e.g. spacy) in the requirements"), + input_dir: Path = Arg( + ..., help="Directory with pipeline data", exists=True, file_okay=False + ), + output_dir: Path = Arg( + ..., help="Output parent directory", exists=True, file_okay=False + ), + code_paths: str = Opt( + "", + "--code", + "-c", + help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package", + ), + meta_path: Optional[Path] = Opt( + None, + "--meta-path", + "--meta", + "-m", + help="Path to meta.json", + exists=True, + dir_okay=False, + ), + create_meta: bool = Opt( + False, "--create-meta", "-C", help="Create meta.json, even if one exists" + ), + name: Optional[str] = Opt( + None, "--name", "-n", help="Package name to override meta" + ), + version: Optional[str] = Opt( + None, "--version", "-v", help="Package version to override meta" + ), + build: str = Opt( + "sdist", + "--build", + "-b", + help="Comma-separated formats to build: sdist and/or wheel, or none.", + ), + force: bool = Opt( + False, + "--force", + "-f", + "-F", + help="Force overwriting existing data in output directory", + ), + require_parent: bool = Opt( + True, + "--require-parent/--no-require-parent", + "-R", + "-R", + help="Include the parent package (e.g. spacy) in the requirements", + ), # fmt: on ): """ @@ -410,7 +450,7 @@ def generate_readme(meta: Dict[str, Any]) -> str: pipeline = ", ".join([md.code(p) for p in meta.get("pipeline", [])]) components = ", ".join([md.code(p) for p in meta.get("components", [])]) vecs = meta.get("vectors", {}) - vectors = f"{vecs.get('keys', 0)} keys, {vecs.get('vectors', 0)} unique vectors ({ vecs.get('width', 0)} dimensions)" + vectors = f"{vecs.get('keys', 0)} keys, {vecs.get('vectors', 0)} unique vectors ({vecs.get('width', 0)} dimensions)" author = meta.get("author") or "n/a" notes = meta.get("notes", "") license_name = meta.get("license") @@ -469,7 +509,7 @@ def _format_accuracy(data: Dict[str, Any], exclude: List[str] = ["speed"]) -> st md = MarkdownRenderer() scalars = [(k, v) for k, v in data.items() if isinstance(v, (int, float))] scores = [ - (md.code(acc.upper()), f"{score*100:.2f}") + (md.code(acc.upper()), f"{score * 100:.2f}") for acc, score in scalars if acc not in exclude ] @@ -488,9 +528,7 @@ def _format_label_scheme(data: Dict[str, Any]) -> str: if not labels: continue col1 = md.bold(md.code(pipe)) - col2 = ", ".join( - [md.code(str(label).replace("|", "\\|")) for label in labels] - ) # noqa: W605 + col2 = ", ".join([md.code(str(label).replace("|", "\\|")) for label in labels]) # noqa: W605 label_data.append((col1, col2)) n_labels += len(labels) n_pipes += 1 diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 446c40510df..daea861a952 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -25,13 +25,32 @@ def pretrain_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True), + config_path: Path = Arg( + ..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True + ), output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), - epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."), + code_path: Optional[Path] = Opt( + None, + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), + resume_path: Optional[Path] = Opt( + None, + "--resume-path", + "-r", + help="Path to pretrained weights from which to resume pretraining", + ), + epoch_resume: Optional[int] = Opt( + None, + "--epoch-resume", + "-er", + help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files.", + ), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), - skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"), + skip_last: bool = Opt( + False, "--skip-last", "-L", help="Skip saving model-last.bin" + ), # fmt: on ): """ diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index e5b8f11939f..03f7127149e 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -21,8 +21,15 @@ def profile_cli( # fmt: off ctx: typer.Context, # This is only used to read current calling context model: str = Arg(..., help="Trained pipeline to load"), - inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True), - n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"), + inputs: Optional[Path] = Arg( + None, + help="Location of input file. '-' for stdin.", + exists=True, + allow_dash=True, + ), + n_texts: int = Opt( + 10000, "--n-texts", "-n", help="Maximum number of texts to use if available" + ), # fmt: on ): """ diff --git a/spacy/cli/train.py b/spacy/cli/train.py index c72e13b2681..379268286ee 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -26,11 +26,30 @@ def train_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True), - output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"), - code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), - use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") + config_path: Path = Arg( + ..., help="Path to config file", exists=True, allow_dash=True + ), + output_path: Optional[Path] = Opt( + None, + "--output", + "--output-path", + "-o", + help="Output directory to store trained pipeline in", + ), + code_path: Optional[Path] = Opt( + None, + "--code", + "-c", + help="Path to Python file with additional code (registered functions) to be imported", + ), + verbose: bool = Opt( + False, + "--verbose", + "-V", + "-VV", + help="Display more information for debugging purposes", + ), + use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), # fmt: on ): """ diff --git a/spacy/compat.py b/spacy/compat.py index a9e7d5a20b9..828ed1ba62e 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -35,7 +35,9 @@ try: # Python 3.8+ import importlib.metadata as importlib_metadata except ImportError: - from catalogue import _importlib_metadata as importlib_metadata # type: ignore[no-redef] # noqa: F401 + from catalogue import ( # type: ignore[no-redef] + _importlib_metadata as importlib_metadata, # noqa: F401 + ) from thinc.api import Optimizer # noqa: F401 diff --git a/spacy/lang/af/stop_words.py b/spacy/lang/af/stop_words.py index 337afb57f8c..4b5a04a5eca 100644 --- a/spacy/lang/af/stop_words.py +++ b/spacy/lang/af/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/stopwords-iso/stopwords-af -STOP_WORDS = set(""" +STOP_WORDS = set( + """ 'n aan af @@ -52,4 +53,5 @@ was wat ʼn -""".split()) +""".split() +) diff --git a/spacy/lang/am/lex_attrs.py b/spacy/lang/am/lex_attrs.py index 9e111b8d5eb..c7b2aab35bf 100644 --- a/spacy/lang/am/lex_attrs.py +++ b/spacy/lang/am/lex_attrs.py @@ -60,7 +60,7 @@ "አስራ ስምንተኛ", "አስራ ዘጠነኛ", "ሃያኛ", - "ሰላሳኛ" "አርባኛ", + "ሰላሳኛአርባኛ", "አምሳኛ", "ስድሳኛ", "ሰባኛ", diff --git a/spacy/lang/am/stop_words.py b/spacy/lang/am/stop_words.py index 8a04c555f74..5487ada5aeb 100644 --- a/spacy/lang/am/stop_words.py +++ b/spacy/lang/am/stop_words.py @@ -1,7 +1,8 @@ # Stop words by Teshome Kassie http://etd.aau.edu.et/bitstream/handle/123456789/3315/Teshome%20Kassie.pdf?sequence=1&isAllowed=y # Stop words by Tihitina Petros http://etd.aau.edu.et/bitstream/handle/123456789/3384/Tihitina%20Petros.pdf?sequence=1&isAllowed=y -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ግን አንቺ አንተ እናንተ ያንተ ያንቺ የናንተ ራስህን ራስሽን ራሳችሁን ሁሉ ኋላ በሰሞኑ አሉ በኋላ ሁኔታ በኩል አስታውቀዋል ሆነ በውስጥ አስታውሰዋል ሆኑ ባጣም እስካሁን ሆኖም በተለይ አሳሰበ ሁል በተመለከተ @@ -28,4 +29,5 @@ በዚህም መሆን ምንጊዜም እነዚህም በዚህና ያለ ስም ሲኖር ከዚህም መሆኑን በሁኔታው የማያንስ እነዚህኑ ማንም ከነዚሁ ያላቸውን እጅግ ሲሆኑ ለሆኑ ሊሆን ለማናቸውም -""".split()) +""".split() +) diff --git a/spacy/lang/ar/lex_attrs.py b/spacy/lang/ar/lex_attrs.py index 6e943d064ee..54ad7a8c363 100644 --- a/spacy/lang/ar/lex_attrs.py +++ b/spacy/lang/ar/lex_attrs.py @@ -1,6 +1,7 @@ from ...attrs import LIKE_NUM -_num_words = set(""" +_num_words = set( + """ صفر واحد إثنان @@ -50,9 +51,11 @@ مليون مليار مليارات -""".split()) +""".split() +) -_ordinal_words = set(""" +_ordinal_words = set( + """ اول أول حاد @@ -67,7 +70,8 @@ ثامن تاسع عاشر -""".split()) +""".split() +) def like_num(text): diff --git a/spacy/lang/ar/stop_words.py b/spacy/lang/ar/stop_words.py index 65c8992cbd6..f4da54dda29 100644 --- a/spacy/lang/ar/stop_words.py +++ b/spacy/lang/ar/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ من نحو لعل @@ -385,4 +386,5 @@ وإن ولو يا -""".split()) +""".split() +) diff --git a/spacy/lang/az/stop_words.py b/spacy/lang/az/stop_words.py index 8beffa998da..2114939ba11 100644 --- a/spacy/lang/az/stop_words.py +++ b/spacy/lang/az/stop_words.py @@ -1,5 +1,6 @@ # Source: https://github.com/eliasdabbas/advertools/blob/master/advertools/stopwords.py -STOP_WORDS = set(""" +STOP_WORDS = set( + """ amma arasında artıq @@ -140,4 +141,5 @@ əlbəttə ən əslində -""".split()) +""".split() +) diff --git a/spacy/lang/bg/stop_words.py b/spacy/lang/bg/stop_words.py index 7d3e756054d..061850da594 100644 --- a/spacy/lang/bg/stop_words.py +++ b/spacy/lang/bg/stop_words.py @@ -4,7 +4,8 @@ https://postvai.com/books/stop-dumi.pdf - Additions to the original list in order to improve it. """ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ а автентичен аз ако ала бе без беше би бивш бивша бившо бивши бил била били било благодаря близо бъдат @@ -75,4 +76,5 @@ юмрук я як -""".split()) +""".split() +) diff --git a/spacy/lang/bn/stop_words.py b/spacy/lang/bn/stop_words.py index 5aec18b7f5b..bf38e32545e 100644 --- a/spacy/lang/bn/stop_words.py +++ b/spacy/lang/bn/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ অতএব অথচ অথবা অনুযায়ী অনেক অনেকে অনেকেই অন্তত অবধি অবশ্য অর্থাৎ অন্য অনুযায়ী অর্ধভাগে আগামী আগে আগেই আছে আজ আদ্যভাগে আপনার আপনি আবার আমরা আমাকে আমাদের আমার আমি আর আরও ইত্যাদি ইহা @@ -37,4 +38,5 @@ সাধারণ সামনে সঙ্গে সঙ্গেও সব সবার সমস্ত সম্প্রতি সময় সহ সহিত সাথে সুতরাং সে সেই সেখান সেখানে সেটা সেটাই সেটাও সেটি স্পষ্ট স্বয়ং হইতে হইবে হইয়া হওয়া হওয়ায় হওয়ার হচ্ছে হত হতে হতেই হন হবে হবেন হয় হয়তো হয়নি হয়ে হয়েই হয়েছিল হয়েছে হাজার হয়েছেন হল হলে হলেই হলেও হলো হিসাবে হিসেবে হৈলে হোক হয় হয়ে হয়েছে হৈতে হইয়া হয়েছিল হয়েছেন হয়নি হয়েই হয়তো হওয়া হওয়ার হওয়ায় -""".split()) +""".split() +) diff --git a/spacy/lang/bo/stop_words.py b/spacy/lang/bo/stop_words.py index 158e148b00b..407242c849b 100644 --- a/spacy/lang/bo/stop_words.py +++ b/spacy/lang/bo/stop_words.py @@ -1,6 +1,7 @@ # Source: https://zenodo.org/records/10148636 -STOP_WORDS = set(""" +STOP_WORDS = set( + """ འི་ ། དུ་ @@ -193,4 +194,5 @@ གིང་ ཚ་ ཀྱང -""".split()) +""".split() +) diff --git a/spacy/lang/ca/stop_words.py b/spacy/lang/ca/stop_words.py index 90cce5de885..1a87b2f9dbe 100644 --- a/spacy/lang/ca/stop_words.py +++ b/spacy/lang/ca/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a abans ací ah així això al aleshores algun alguna algunes alguns alhora allà allí allò als altra altre altres amb ambdues ambdós anar ans apa aquell aquella aquelles aquells aquest aquesta aquestes aquests aquí @@ -47,4 +48,5 @@ va vaig vam van vas veu vosaltres vostra vostre vostres -""".split()) +""".split() +) diff --git a/spacy/lang/cs/stop_words.py b/spacy/lang/cs/stop_words.py index 35db9fedc86..f61f424f6f4 100644 --- a/spacy/lang/cs/stop_words.py +++ b/spacy/lang/cs/stop_words.py @@ -1,7 +1,8 @@ # Source: https://github.com/Alir3z4/stop-words # Source: https://github.com/stopwords-iso/stopwords-cs/blob/master/stopwords-cs.txt -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a aby ahoj @@ -360,4 +361,5 @@ zatímco ze že -""".split()) +""".split() +) diff --git a/spacy/lang/da/stop_words.py b/spacy/lang/da/stop_words.py index 0e71dfde739..05b2084dde3 100644 --- a/spacy/lang/da/stop_words.py +++ b/spacy/lang/da/stop_words.py @@ -1,6 +1,7 @@ # Source: Handpicked by Jens Dahl Møllerhøj. -STOP_WORDS = set(""" +STOP_WORDS = set( + """ af aldrig alene alle allerede alligevel alt altid anden andet andre at bag begge blandt blev blive bliver burde bør @@ -40,4 +41,5 @@ var ved vi via vil ville vore vores vær være været øvrigt -""".split()) +""".split() +) diff --git a/spacy/lang/de/stop_words.py b/spacy/lang/de/stop_words.py index 5fbd7428757..f52687eb9b3 100644 --- a/spacy/lang/de/stop_words.py +++ b/spacy/lang/de/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ á a ab aber ach acht achte achten achter achtes ag alle allein allem allen aller allerdings alles allgemeinen als also am an andere anderen anderem andern anders auch auf aus ausser außer ausserdem außerdem @@ -73,4 +74,5 @@ zehn zehnte zehnten zehnter zehntes zeit zu zuerst zugleich zum zunächst zur zurück zusammen zwanzig zwar zwei zweite zweiten zweiter zweites zwischen -""".split()) +""".split() +) diff --git a/spacy/lang/dsb/stop_words.py b/spacy/lang/dsb/stop_words.py index 90735a6236a..376e04aa6e5 100644 --- a/spacy/lang/dsb/stop_words.py +++ b/spacy/lang/dsb/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a abo aby ako ale až daniž dokulaž @@ -10,4 +11,5 @@ pak pótom teke togodla -""".split()) +""".split() +) diff --git a/spacy/lang/el/stop_words.py b/spacy/lang/el/stop_words.py index b5c1c36c41f..7c436219fa9 100644 --- a/spacy/lang/el/stop_words.py +++ b/spacy/lang/el/stop_words.py @@ -1,6 +1,7 @@ # Stop words # Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0 -STOP_WORDS = set(""" +STOP_WORDS = set( + """ αδιάκοπα αι ακόμα ακόμη ακριβώς άλλα αλλά αλλαχού άλλες άλλη άλλην άλλης αλλιώς αλλιώτικα άλλο άλλοι αλλοιώς αλλοιώτικα άλλον άλλος άλλοτε αλλού άλλους άλλων άμα άμεσα αμέσως αν ανά ανάμεσα αναμεταξύ άνευ αντί αντίπερα αντίς @@ -82,4 +83,5 @@ χωρίς χωριστά ω ως ωσάν ωσότου ώσπου ώστε ωστόσο ωχ -""".split()) +""".split() +) diff --git a/spacy/lang/el/tokenizer_exceptions.py b/spacy/lang/el/tokenizer_exceptions.py index 41317ba9770..d88d4837e2a 100644 --- a/spacy/lang/el/tokenizer_exceptions.py +++ b/spacy/lang/el/tokenizer_exceptions.py @@ -128,7 +128,6 @@ _exc.update(_other_exc) for h in range(1, 12 + 1): - for period in ["π.μ.", "πμ"]: _exc[f"{h}{period}"] = [ {ORTH: f"{h}"}, diff --git a/spacy/lang/en/stop_words.py b/spacy/lang/en/stop_words.py index cbce281b491..1ca5cbc1670 100644 --- a/spacy/lang/en/stop_words.py +++ b/spacy/lang/en/stop_words.py @@ -1,5 +1,6 @@ # Stop words -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a about above across after afterwards again against all almost alone along already also although always am among amongst amount an and another any anyhow anyone anything anyway anywhere are around as at @@ -61,7 +62,8 @@ whither who whoever whole whom whose why will with within without would yet you your yours yourself yourselves -""".split()) +""".split() +) contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"] STOP_WORDS.update(contractions) diff --git a/spacy/lang/es/lemmatizer.py b/spacy/lang/es/lemmatizer.py index ee5d38e8466..3102f3b9bc4 100644 --- a/spacy/lang/es/lemmatizer.py +++ b/spacy/lang/es/lemmatizer.py @@ -415,7 +415,10 @@ def lemmatize_verb_pron( else: rule = self.select_rule("verb", features) verb_lemma = self.lemmatize_verb( - verb, features - {"PronType=Prs"}, rule, index # type: ignore[operator] + verb, + features - {"PronType=Prs"}, # type: ignore[operator] + rule, + index, # type: ignore[operator] )[0] pron_lemmas = [] for pron in prons: diff --git a/spacy/lang/es/stop_words.py b/spacy/lang/es/stop_words.py index 5099359e843..6d28854810a 100644 --- a/spacy/lang/es/stop_words.py +++ b/spacy/lang/es/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a acuerdo adelante ademas además afirmó agregó ahi ahora ahí al algo alguna algunas alguno algunos algún alli allí alrededor ambos ante anterior antes apenas aproximadamente aquel aquella aquellas aquello aquellos aqui aquél @@ -75,4 +76,5 @@ vosotras vosotros voy vuestra vuestras vuestro vuestros y ya yo -""".split()) +""".split() +) diff --git a/spacy/lang/et/stop_words.py b/spacy/lang/et/stop_words.py index 248bcb61f08..e1da1f14d5e 100644 --- a/spacy/lang/et/stop_words.py +++ b/spacy/lang/et/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/stopwords-iso/stopwords-et -STOP_WORDS = set(""" +STOP_WORDS = set( + """ aga ei et @@ -36,4 +37,5 @@ ta te ära -""".split()) +""".split() +) diff --git a/spacy/lang/eu/stop_words.py b/spacy/lang/eu/stop_words.py index 4a6661e7d20..d213b5b81a5 100644 --- a/spacy/lang/eu/stop_words.py +++ b/spacy/lang/eu/stop_words.py @@ -1,7 +1,8 @@ # Source: https://github.com/stopwords-iso/stopwords-eu # https://www.ranks.nl/stopwords/basque # https://www.mustgo.com/worldlanguages/basque/ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ al anitz arabera @@ -100,4 +101,5 @@ zuek zuen zuten -""".split()) +""".split() +) diff --git a/spacy/lang/fa/generate_verbs_exc.py b/spacy/lang/fa/generate_verbs_exc.py index a6d79a386df..7ef82c3e8a9 100644 --- a/spacy/lang/fa/generate_verbs_exc.py +++ b/spacy/lang/fa/generate_verbs_exc.py @@ -611,8 +611,8 @@ present_ends = ["م", "ی", "د", "یم", "ید", "ند"] # special case of '#هست': -VERBS_EXC.update({conj: "هست" for conj in ["هست" + end for end in simple_ends]}) -VERBS_EXC.update({conj: "هست" for conj in ["نیست" + end for end in simple_ends]}) +VERBS_EXC.update(dict.fromkeys(["هست" + end for end in simple_ends], "هست")) +VERBS_EXC.update(dict.fromkeys(["نیست" + end for end in simple_ends], "هست")) for verb_root in verb_roots: conjugations = [] @@ -648,4 +648,4 @@ ) ) - VERBS_EXC.update({conj: (past,) if past else present for conj in conjugations}) + VERBS_EXC.update(dict.fromkeys(conjugations, (past,) if past else present)) diff --git a/spacy/lang/fa/lex_attrs.py b/spacy/lang/fa/lex_attrs.py index 9b0ff546e0d..065e81bd6af 100644 --- a/spacy/lang/fa/lex_attrs.py +++ b/spacy/lang/fa/lex_attrs.py @@ -5,7 +5,8 @@ YE_NUN = "ین" -_num_words = set(""" +_num_words = set( + """ صفر یک دو @@ -62,12 +63,15 @@ کوادریلیون کادریلیارد کوینتیلیون -""".split()) +""".split() +) -_ordinal_words = set(""" +_ordinal_words = set( + """ اول سوم -سی‌ام""".split()) +سی‌ام""".split() +) _ordinal_words.update({num + MIM for num in _num_words}) _ordinal_words.update({num + ZWNJ_O_MIM for num in _num_words}) diff --git a/spacy/lang/fa/stop_words.py b/spacy/lang/fa/stop_words.py index 93738c89263..f462f2e7a5d 100644 --- a/spacy/lang/fa/stop_words.py +++ b/spacy/lang/fa/stop_words.py @@ -1,5 +1,6 @@ # Stop words from HAZM package -STOP_WORDS = set(""" +STOP_WORDS = set( + """ و در به @@ -388,4 +389,5 @@ لذا زاده گردد -اینجا""".split()) +اینجا""".split() +) diff --git a/spacy/lang/fi/stop_words.py b/spacy/lang/fi/stop_words.py index 742cacc2689..8e8dcfa565d 100644 --- a/spacy/lang/fi/stop_words.py +++ b/spacy/lang/fi/stop_words.py @@ -1,6 +1,7 @@ # Source https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt # Reformatted with some minor corrections -STOP_WORDS = set(""" +STOP_WORDS = set( + """ aiemmin aika aikaa aikaan aikaisemmin aikaisin aikana aikoina aikoo aikovat aina ainakaan ainakin ainoa ainoat aiomme aion aiotte aivan ajan alas alemmas alkuisin alkuun alla alle aloitamme aloitan aloitat aloitatte aloitattivat @@ -105,4 +106,5 @@ ympäri älköön älä -""".split()) +""".split() +) diff --git a/spacy/lang/fr/lex_attrs.py b/spacy/lang/fr/lex_attrs.py index 8a9dfb82a8b..9cf508a07b9 100644 --- a/spacy/lang/fr/lex_attrs.py +++ b/spacy/lang/fr/lex_attrs.py @@ -1,20 +1,24 @@ from ...attrs import LIKE_NUM -_num_words = set(""" +_num_words = set( + """ zero un une deux trois quatre cinq six sept huit neuf dix onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf vingt trente quarante cinquante soixante soixante-dix septante quatre-vingt huitante quatre-vingt-dix nonante cent mille mil million milliard billion quadrillion quintillion sextillion septillion octillion nonillion decillion -""".split()) +""".split() +) -_ordinal_words = set(""" +_ordinal_words = set( + """ premier première deuxième second seconde troisième quatrième cinquième sixième septième huitième neuvième dixième onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neuvième vingtième trentième quarantième cinquantième soixantième soixante-dixième septantième quatre-vingtième huitantième quatre-vingt-dixième nonantième centième millième millionnième milliardième billionnième quadrillionnième quintillionnième sextillionnième septillionnième octillionnième nonillionnième decillionnième -""".split()) +""".split() +) def like_num(text): diff --git a/spacy/lang/fr/stop_words.py b/spacy/lang/fr/stop_words.py index 85ffe47baef..b32ee3d7173 100644 --- a/spacy/lang/fr/stop_words.py +++ b/spacy/lang/fr/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a à â abord afin ah ai aie ainsi ait allaient allons alors anterieur anterieure anterieures antérieur antérieure antérieures apres après as assez attendu au @@ -79,4 +80,5 @@ y -""".split()) +""".split() +) diff --git a/spacy/lang/ga/lemmatizer.py b/spacy/lang/ga/lemmatizer.py index c9fbfbc193a..cffcf1d3c49 100644 --- a/spacy/lang/ga/lemmatizer.py +++ b/spacy/lang/ga/lemmatizer.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Tuple +from typing import List, Tuple from ...pipeline import Lemmatizer from ...tokens import Token diff --git a/spacy/lang/ga/stop_words.py b/spacy/lang/ga/stop_words.py index e32ad6431f6..4ef052ca58a 100644 --- a/spacy/lang/ga/stop_words.py +++ b/spacy/lang/ga/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a ach ag agus an aon ar arna as ba beirt bhúr @@ -38,4 +39,5 @@ í ó ón óna ónár -""".split()) +""".split() +) diff --git a/spacy/lang/gd/stop_words.py b/spacy/lang/gd/stop_words.py index 6f2c2856bec..d5132c35e31 100644 --- a/spacy/lang/gd/stop_words.py +++ b/spacy/lang/gd/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ 'ad 'ar 'd # iad @@ -381,4 +382,5 @@ ì ò ó -""".split("\n")) +""".split("\n") +) diff --git a/spacy/lang/grc/stop_words.py b/spacy/lang/grc/stop_words.py index 51f5e9d9dac..cbb766a8ce1 100644 --- a/spacy/lang/grc/stop_words.py +++ b/spacy/lang/grc/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ αὐτῷ αὐτοῦ αὐτῆς αὐτόν αὐτὸν αὐτῶν αὐτὸς αὐτὸ αὐτό αὐτός αὐτὴν αὐτοῖς αὐτοὺς αὔτ' αὐτὰ αὐτῇ αὐτὴ αὐτὼ αὑταὶ καὐτὸς αὐτά αὑτός αὐτοῖσι αὐτοῖσιν αὑτὸς αὐτήν αὐτοῖσί αὐτοί αὐτοὶ αὐτοῖο αὐτάων αὐτὰς αὐτέων αὐτώ αὐτάς αὐτούς αὐτή αὐταί αὐταὶ αὐτῇσιν τὠυτῷ τὠυτὸ ταὐτὰ ταύτῃ αὐτῇσι αὐτῇς αὐταῖς αὐτᾶς αὐτὰν ταὐτὸν @@ -56,4 +57,5 @@ ὣς ὡς ὥς ὧς ὥστ' ὥστε ὥσθ' ὤ ὢ - """.split()) + """.split() +) diff --git a/spacy/lang/gu/stop_words.py b/spacy/lang/gu/stop_words.py index 1d11a3ebd96..2c859681b05 100644 --- a/spacy/lang/gu/stop_words.py +++ b/spacy/lang/gu/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ એમ આ એ @@ -83,4 +84,5 @@ દર એટલો પરંતુ -""".split()) +""".split() +) diff --git a/spacy/lang/he/stop_words.py b/spacy/lang/he/stop_words.py index ea486722475..23bb5176de9 100644 --- a/spacy/lang/he/stop_words.py +++ b/spacy/lang/he/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ אני את אתה @@ -217,4 +218,5 @@ אחרות אשר או -""".split()) +""".split() +) diff --git a/spacy/lang/hi/stop_words.py b/spacy/lang/hi/stop_words.py index 9bc57bd3136..475b07da152 100644 --- a/spacy/lang/hi/stop_words.py +++ b/spacy/lang/hi/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6 -STOP_WORDS = set(""" +STOP_WORDS = set( + """ अंदर अत अदि @@ -234,4 +235,5 @@ होते होना होने -""".split()) +""".split() +) diff --git a/spacy/lang/hr/stop_words.py b/spacy/lang/hr/stop_words.py index 769ebe4db53..dd10f792d01 100644 --- a/spacy/lang/hr/stop_words.py +++ b/spacy/lang/hr/stop_words.py @@ -1,5 +1,6 @@ # Source: https://github.com/stopwords-iso/stopwords-hr -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a ah aha @@ -339,4 +340,5 @@ željeo zimus zum -""".split()) +""".split() +) diff --git a/spacy/lang/hsb/stop_words.py b/spacy/lang/hsb/stop_words.py index 86021f555c1..e6fedaf4c92 100644 --- a/spacy/lang/hsb/stop_words.py +++ b/spacy/lang/hsb/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a abo ale ani dokelž @@ -14,4 +15,5 @@ tež tohodla zo zoby -""".split()) +""".split() +) diff --git a/spacy/lang/ht/lemmatizer.py b/spacy/lang/ht/lemmatizer.py index 52bf23d2390..7687865c300 100644 --- a/spacy/lang/ht/lemmatizer.py +++ b/spacy/lang/ht/lemmatizer.py @@ -1,6 +1,5 @@ from typing import List, Tuple -from ...lookups import Lookups from ...pipeline import Lemmatizer from ...tokens import Token diff --git a/spacy/lang/ht/lex_attrs.py b/spacy/lang/ht/lex_attrs.py index 27a535dd746..ab1a39a8234 100644 --- a/spacy/lang/ht/lex_attrs.py +++ b/spacy/lang/ht/lex_attrs.py @@ -1,20 +1,24 @@ from ...attrs import LIKE_NUM, NORM # Cardinal numbers in Creole -_num_words = set(""" +_num_words = set( + """ zewo youn en de twa kat senk sis sèt uit nèf dis onz douz trèz katoz kenz sèz disèt dizwit diznèf vent trant karant sinkant swasant swasann-dis san mil milyon milya -""".split()) +""".split() +) # Ordinal numbers in Creole (some are French-influenced, some simplified) -_ordinal_words = set(""" +_ordinal_words = set( + """ premye dezyèm twazyèm katryèm senkyèm sizyèm sètvyèm uitvyèm nèvyèm dizyèm onzèm douzyèm trèzyèm katozyèm kenzèm sèzyèm disetyèm dizwityèm diznèvyèm ventyèm trantyèm karantyèm sinkantyèm swasantyèm swasann-disyèm santyèm milyèm milyonnyèm milyadyèm -""".split()) +""".split() +) NORM_MAP = { "'m": "mwen", diff --git a/spacy/lang/ht/stop_words.py b/spacy/lang/ht/stop_words.py index fd85c2a197f..50998e0e5ff 100644 --- a/spacy/lang/ht/stop_words.py +++ b/spacy/lang/ht/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a ak an ankò ant apre ap atò avan avanlè byen bò byenke @@ -38,7 +39,8 @@ men mèsi oswa osinon -""".split()) +""".split() +) # Add common contractions, with and without apostrophe variants contractions = ["m'", "n'", "w'", "y'", "l'", "t'", "k'"] diff --git a/spacy/lang/hu/stop_words.py b/spacy/lang/hu/stop_words.py index 1841557073a..e39a26d35ae 100644 --- a/spacy/lang/hu/stop_words.py +++ b/spacy/lang/hu/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a abban ahhoz ahogy ahol aki akik akkor akár alatt amely amelyek amelyekben amelyeket amelyet amelynek ami amikor amit amolyan amíg annak arra arról az azok azon azonban azt aztán azután azzal azért @@ -57,4 +58,5 @@ úgy új újabb újra ő őket -""".split()) +""".split() +) diff --git a/spacy/lang/hy/stop_words.py b/spacy/lang/hy/stop_words.py index 1bfd09a4b29..46d0f6b511c 100644 --- a/spacy/lang/hy/stop_words.py +++ b/spacy/lang/hy/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ նա ողջը այստեղ @@ -102,4 +103,5 @@ այս մեջ թ -""".split()) +""".split() +) diff --git a/spacy/lang/id/_tokenizer_exceptions_list.py b/spacy/lang/id/_tokenizer_exceptions_list.py index 11220a61e5b..a0b35fa1a2b 100644 --- a/spacy/lang/id/_tokenizer_exceptions_list.py +++ b/spacy/lang/id/_tokenizer_exceptions_list.py @@ -1,4 +1,5 @@ -ID_BASE_EXCEPTIONS = set(""" +ID_BASE_EXCEPTIONS = set( + """ aba-aba abah-abah abal-abal @@ -3897,4 +3898,5 @@ yo-yo zam-zam zig-zag -""".split()) +""".split() +) diff --git a/spacy/lang/id/stop_words.py b/spacy/lang/id/stop_words.py index fc85f83679a..b1bfaea796e 100644 --- a/spacy/lang/id/stop_words.py +++ b/spacy/lang/id/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya aku akulah amat amatlah anda andalah antar antara antaranya apa apaan apabila apakah apalagi apatah artinya asal asalkan atas atau ataukah ataupun awal @@ -113,4 +114,5 @@ waduh wah wahai waktu waktunya walau walaupun wong yaitu yakin yakni yang -""".split()) +""".split() +) diff --git a/spacy/lang/id/tokenizer_exceptions.py b/spacy/lang/id/tokenizer_exceptions.py index 8dea4e97fd1..8e206262c10 100644 --- a/spacy/lang/id/tokenizer_exceptions.py +++ b/spacy/lang/id/tokenizer_exceptions.py @@ -156,7 +156,7 @@ "S.T.", "S.T.Han", "S.Th.", - "S.Th.I" "S.TI.", + "S.Th.IS.TI.", "S.T.P.", "S.TrK", "S.Tekp.", @@ -210,7 +210,7 @@ "hlm.", "i/o", "n.b.", - "p.p." "pjs.", + "p.p.pjs.", "s.d.", "tel.", "u.p.", diff --git a/spacy/lang/is/stop_words.py b/spacy/lang/is/stop_words.py index 79f84ee6000..917fb6df444 100644 --- a/spacy/lang/is/stop_words.py +++ b/spacy/lang/is/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/Xangis/extra-stopwords -STOP_WORDS = set(""" +STOP_WORDS = set( + """ afhverju aftan aftur @@ -153,4 +154,5 @@ því þær ætti -""".split()) +""".split() +) diff --git a/spacy/lang/it/stop_words.py b/spacy/lang/it/stop_words.py index 2a37236a9b9..42adc7904c8 100644 --- a/spacy/lang/it/stop_words.py +++ b/spacy/lang/it/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl agli ahime ahimè ai al alcuna alcuni alcuno all alla alle allo allora altri altrimenti altro altrove altrui anche ancora anni anno ansa anticipo assai @@ -78,4 +79,5 @@ v' va vale vari varia varie vario verso vi via vicino visto vita voi volta volte vostra vostre vostri vostro -""".split()) +""".split() +) diff --git a/spacy/lang/ja/stop_words.py b/spacy/lang/ja/stop_words.py index 661b5183594..98560d7e28b 100644 --- a/spacy/lang/ja/stop_words.py +++ b/spacy/lang/ja/stop_words.py @@ -2,7 +2,8 @@ # filtering out everything that wasn't hiragana. ー (one) was also added. # Considered keeping some non-hiragana words but too many place names were # present. -STOP_WORDS = set(""" +STOP_WORDS = set( + """ あ あっ あまり あり ある あるいは あれ い いい いう いく いずれ いっ いつ いる いわ うち @@ -43,4 +44,5 @@ を ん 一 -""".split()) +""".split() +) diff --git a/spacy/lang/kmr/stop_words.py b/spacy/lang/kmr/stop_words.py index 93e6ea27f0c..aee33c2b748 100644 --- a/spacy/lang/kmr/stop_words.py +++ b/spacy/lang/kmr/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ û li bi @@ -39,4 +40,5 @@ hemû kes tişt -""".split()) +""".split() +) diff --git a/spacy/lang/kn/stop_words.py b/spacy/lang/kn/stop_words.py index 528e5e3a8a8..dba9740af91 100644 --- a/spacy/lang/kn/stop_words.py +++ b/spacy/lang/kn/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ಹಲವು ಮೂಲಕ ಹಾಗೂ @@ -81,4 +82,5 @@ ಎಂದು ನನ್ನ ಮೇಲೆ -""".split()) +""".split() +) diff --git a/spacy/lang/ko/stop_words.py b/spacy/lang/ko/stop_words.py index d4cdbc7a112..3eba9fc8299 100644 --- a/spacy/lang/ko/stop_words.py +++ b/spacy/lang/ko/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ 이 있 하 @@ -62,4 +63,5 @@ 원 잘 놓 -""".split()) +""".split() +) diff --git a/spacy/lang/ky/stop_words.py b/spacy/lang/ky/stop_words.py index fb8e2c84b95..ea40bdfa222 100644 --- a/spacy/lang/ky/stop_words.py +++ b/spacy/lang/ky/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ага адам айтты айтымында айтып ал алар алардын алган алуу алып анда андан аны анын ар @@ -37,4 +38,5 @@ үч үчүн өз -""".split()) +""".split() +) diff --git a/spacy/lang/la/stop_words.py b/spacy/lang/la/stop_words.py index 47abf7384f4..8b590bb67b3 100644 --- a/spacy/lang/la/stop_words.py +++ b/spacy/lang/la/stop_words.py @@ -1,6 +1,7 @@ # Corrected Perseus list, cf. https://wiki.digitalclassicist.org/Stopwords_for_Greek_and_Latin -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ab ac ad adhuc aliqui aliquis an ante apud at atque aut autem cum cur @@ -32,4 +33,5 @@ ubi uel uero vel vero -""".split()) +""".split() +) diff --git a/spacy/lang/lb/lex_attrs.py b/spacy/lang/lb/lex_attrs.py index bbef72b9bb3..11923137418 100644 --- a/spacy/lang/lb/lex_attrs.py +++ b/spacy/lang/lb/lex_attrs.py @@ -1,18 +1,22 @@ from ...attrs import LIKE_NUM -_num_words = set(""" +_num_words = set( + """ null eent zwee dräi véier fënnef sechs ziwen aacht néng zéng eelef zwielef dräizéng véierzéng foffzéng siechzéng siwwenzéng uechtzeng uechzeng nonnzéng nongzéng zwanzeg drësseg véierzeg foffzeg sechzeg siechzeg siwenzeg achtzeg achzeg uechtzeg uechzeg nonnzeg honnert dausend millioun milliard billioun billiard trillioun triliard -""".split()) +""".split() +) -_ordinal_words = set(""" +_ordinal_words = set( + """ éischten zweeten drëtten véierten fënneften sechsten siwenten aachten néngten zéngten eeleften zwieleften dräizéngten véierzéngten foffzéngten siechzéngten uechtzéngen uechzéngten nonnzéngten nongzéngten zwanzegsten drëssegsten véierzegsten foffzegsten siechzegsten siwenzegsten uechzegsten nonnzegsten honnertsten dausendsten milliounsten milliardsten billiounsten billiardsten trilliounsten trilliardsten -""".split()) +""".split() +) def like_num(text): diff --git a/spacy/lang/lb/stop_words.py b/spacy/lang/lb/stop_words.py index 386ce1222af..8f22ea6e694 100644 --- a/spacy/lang/lb/stop_words.py +++ b/spacy/lang/lb/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a à äis @@ -206,4 +207,5 @@ zu zum zwar -""".split()) +""".split() +) diff --git a/spacy/lang/lg/stop_words.py b/spacy/lang/lg/stop_words.py index a9f99cbf40f..7bad59344fb 100644 --- a/spacy/lang/lg/stop_words.py +++ b/spacy/lang/lg/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ abadde abalala abamu abangi abava ajja ali alina ani anti ateekeddwa atewamu atya awamu aweebwa ayinza ba baali babadde babalina bajja bajjanewankubade bali balina bandi bangi bano bateekeddwa baweebwa bayina bebombi beera bibye @@ -14,4 +15,5 @@ tetuteekeddwa tewali teyalina teyayina tolina tu tuyina tulina tuyina twafuna twetaaga wa wabula wabweru wadde waggulunnina wakati waliwobangi waliyo wandi wange wano wansi weebwa yabadde yaffe ye yenna yennyini yina yonna ziba zijja zonna -""".split()) +""".split() +) diff --git a/spacy/lang/lij/stop_words.py b/spacy/lang/lij/stop_words.py index 37eb163ffe7..1d6f09d27ca 100644 --- a/spacy/lang/lij/stop_words.py +++ b/spacy/lang/lij/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei bella belle belli bello ben @@ -34,4 +35,5 @@ un uña unn' unna za zu -""".split()) +""".split() +) diff --git a/spacy/lang/lv/stop_words.py b/spacy/lang/lv/stop_words.py index 4ed61996ac1..2685c243083 100644 --- a/spacy/lang/lv/stop_words.py +++ b/spacy/lang/lv/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/stopwords-iso/stopwords-lv -STOP_WORDS = set(""" +STOP_WORDS = set( + """ aiz ap apakš @@ -162,4 +163,5 @@ zem ārpus šaipus -""".split()) +""".split() +) diff --git a/spacy/lang/mk/stop_words.py b/spacy/lang/mk/stop_words.py index 90a27179852..312a456c5db 100644 --- a/spacy/lang/mk/stop_words.py +++ b/spacy/lang/mk/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ а абре aв @@ -810,4 +811,5 @@ џагара-магара џанам џив-џив - """.split()) + """.split() +) diff --git a/spacy/lang/ml/stop_words.py b/spacy/lang/ml/stop_words.py index 64b9acc1025..441e9358699 100644 --- a/spacy/lang/ml/stop_words.py +++ b/spacy/lang/ml/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ അത് ഇത് ആയിരുന്നു @@ -8,4 +9,5 @@ അന്ന് ഇന്ന് ആണ് -""".split()) +""".split() +) diff --git a/spacy/lang/mr/stop_words.py b/spacy/lang/mr/stop_words.py index 3c9c6208916..9b0cee951ab 100644 --- a/spacy/lang/mr/stop_words.py +++ b/spacy/lang/mr/stop_words.py @@ -1,5 +1,6 @@ # Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/edit/master/dist/mr.json -STOP_WORDS = set(""" +STOP_WORDS = set( + """ न अतरी तो @@ -187,4 +188,5 @@ होता होती होते -""".split()) +""".split() +) diff --git a/spacy/lang/ms/_tokenizer_exceptions_list.py b/spacy/lang/ms/_tokenizer_exceptions_list.py index e579e316ae9..fba1dd70f94 100644 --- a/spacy/lang/ms/_tokenizer_exceptions_list.py +++ b/spacy/lang/ms/_tokenizer_exceptions_list.py @@ -1,6 +1,7 @@ # from https://prpm.dbp.gov.my/cari1?keyword= # dbp https://en.wikipedia.org/wiki/Dewan_Bahasa_dan_Pustaka -MS_BASE_EXCEPTIONS = set(""" +MS_BASE_EXCEPTIONS = set( + """ aba-aba abah-abah abar-abar @@ -1938,4 +1939,5 @@ water-cooled world-class yang-yang -""".split()) +""".split() +) diff --git a/spacy/lang/ms/examples.py b/spacy/lang/ms/examples.py index 1af439d4a5b..236e0c0f660 100644 --- a/spacy/lang/ms/examples.py +++ b/spacy/lang/ms/examples.py @@ -10,7 +10,7 @@ "Berapa banyak pelajar yang akan menghadiri majlis perpisahan sekolah?", "Pengeluaran makanan berasal dari beberapa lokasi termasuk Cameron Highlands, Johor Bahru, dan Kuching.", "Syarikat XYZ telah menghasilkan 20,000 unit produk baharu dalam setahun terakhir", - "Kuala Lumpur merupakan ibu negara Malaysia." "Kau berada di mana semalam?", + "Kuala Lumpur merupakan ibu negara Malaysia.Kau berada di mana semalam?", "Siapa yang akan memimpin projek itu?", "Siapa perdana menteri Malaysia sekarang?", ] diff --git a/spacy/lang/ms/stop_words.py b/spacy/lang/ms/stop_words.py index fc85f83679a..b1bfaea796e 100644 --- a/spacy/lang/ms/stop_words.py +++ b/spacy/lang/ms/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya aku akulah amat amatlah anda andalah antar antara antaranya apa apaan apabila apakah apalagi apatah artinya asal asalkan atas atau ataukah ataupun awal @@ -113,4 +114,5 @@ waduh wah wahai waktu waktunya walau walaupun wong yaitu yakin yakni yang -""".split()) +""".split() +) diff --git a/spacy/lang/nb/stop_words.py b/spacy/lang/nb/stop_words.py index bc1c54a4af3..d9ed414efdf 100644 --- a/spacy/lang/nb/stop_words.py +++ b/spacy/lang/nb/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ alle allerede alt and andre annen annet at av bak bare bedre beste blant ble bli blir blitt bris by både @@ -45,4 +46,5 @@ å år ønsker -""".split()) +""".split() +) diff --git a/spacy/lang/ne/stop_words.py b/spacy/lang/ne/stop_words.py index 95d7a375821..8470297b9f0 100644 --- a/spacy/lang/ne/stop_words.py +++ b/spacy/lang/ne/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/sanjaalcorps/NepaliStopWords/blob/master/NepaliStopWords.txt -STOP_WORDS = set(""" +STOP_WORDS = set( + """ अक्सर अगाडि अगाडी @@ -489,4 +490,5 @@ होइन होकि होला -""".split()) +""".split() +) diff --git a/spacy/lang/nl/lex_attrs.py b/spacy/lang/nl/lex_attrs.py index 1b8602831ae..488224c2f20 100644 --- a/spacy/lang/nl/lex_attrs.py +++ b/spacy/lang/nl/lex_attrs.py @@ -1,17 +1,21 @@ from ...attrs import LIKE_NUM -_num_words = set(""" +_num_words = set( + """ nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd duizend miljoen miljard biljoen biljard triljoen triljard -""".split()) +""".split() +) -_ordinal_words = set(""" +_ordinal_words = set( + """ eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste miljardste biljoenste biljardste triljoenste triljardste -""".split()) +""".split() +) def like_num(text): diff --git a/spacy/lang/nl/stop_words.py b/spacy/lang/nl/stop_words.py index a88c2905199..cd4fdefdf58 100644 --- a/spacy/lang/nl/stop_words.py +++ b/spacy/lang/nl/stop_words.py @@ -13,7 +13,8 @@ # should have a Dutch counterpart here. -STOP_WORDS = set(""" +STOP_WORDS = set( + """ aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaande aangezien achter achterna afgelopen aldus alhoewel anderzijds @@ -67,4 +68,5 @@ zal ze zei zelf zich zij zijn zo zonder zou zeer zeker zekere zelfde zelfs zichzelf zijnde zijne zo’n zoals zodra zouden zoveel zowat zulk zulke zulks zullen zult -""".split()) +""".split() +) diff --git a/spacy/lang/pl/stop_words.py b/spacy/lang/pl/stop_words.py index 4418deedc0b..075aec39167 100644 --- a/spacy/lang/pl/stop_words.py +++ b/spacy/lang/pl/stop_words.py @@ -1,6 +1,7 @@ # sources: https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt and https://github.com/stopwords-iso/stopwords-pl -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a aby ach acz aczkolwiek aj albo ale alez ależ ani az aż @@ -73,4 +74,5 @@ z za zaden zadna zadne zadnych zapewne zawsze zaś ze zeby znow znowu znów zostal został -żaden żadna żadne żadnych że żeby""".split()) +żaden żadna żadne żadnych że żeby""".split() +) diff --git a/spacy/lang/pt/punctuation.py b/spacy/lang/pt/punctuation.py index b2d63cb3d63..60bd50da1eb 100644 --- a/spacy/lang/pt/punctuation.py +++ b/spacy/lang/pt/punctuation.py @@ -1,6 +1,8 @@ -from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES -from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES -from ..punctuation import TOKENIZER_SUFFIXES as BASE_TOKENIZER_SUFFIXES +from ..punctuation import ( + TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES, + TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES, + TOKENIZER_SUFFIXES as BASE_TOKENIZER_SUFFIXES, +) _prefixes = [r"\w{1,3}\$"] + BASE_TOKENIZER_PREFIXES diff --git a/spacy/lang/pt/stop_words.py b/spacy/lang/pt/stop_words.py index 722aef80236..ce3c86ff570 100644 --- a/spacy/lang/pt/stop_words.py +++ b/spacy/lang/pt/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a à às área acerca ademais adeus agora ainda algo algumas alguns ali além ambas ambos antes ao aos apenas apoia apoio apontar após aquela aquelas aquele aqueles aqui aquilo as assim através atrás até aí @@ -61,4 +62,5 @@ vossas vosso vossos vários vão vêm vós zero -""".split()) +""".split() +) diff --git a/spacy/lang/ro/lex_attrs.py b/spacy/lang/ro/lex_attrs.py index a5880fc2fac..736aa911ac6 100644 --- a/spacy/lang/ro/lex_attrs.py +++ b/spacy/lang/ro/lex_attrs.py @@ -1,13 +1,16 @@ from ...attrs import LIKE_NUM -_num_words = set(""" +_num_words = set( + """ zero unu doi două trei patru cinci șase șapte opt nouă zece unsprezece doisprezece douăsprezece treisprezece patrusprezece cincisprezece șaisprezece șaptesprezece optsprezece nouăsprezece douăzeci treizeci patruzeci cincizeci șaizeci șaptezeci optzeci nouăzeci sută mie milion miliard bilion trilion cvadrilion catralion cvintilion sextilion septilion enșpemii -""".split()) +""".split() +) -_ordinal_words = set(""" +_ordinal_words = set( + """ primul doilea treilea patrulea cincilea șaselea șaptelea optulea nouălea zecelea prima doua treia patra cincia șasea șaptea opta noua zecea unsprezecelea doisprezecelea treisprezecelea patrusprezecelea cincisprezecelea șaisprezecelea șaptesprezecelea optsprezecelea nouăsprezecelea @@ -15,7 +18,8 @@ douăzecilea treizecilea patruzecilea cincizecilea șaizecilea șaptezecilea optzecilea nouăzecilea sutălea douăzecea treizecea patruzecea cincizecea șaizecea șaptezecea optzecea nouăzecea suta miilea mielea mia milionulea milioana miliardulea miliardelea miliarda enșpemia -""".split()) +""".split() +) def like_num(text): diff --git a/spacy/lang/ro/stop_words.py b/spacy/lang/ro/stop_words.py index c7c0801f171..d68a81c4569 100644 --- a/spacy/lang/ro/stop_words.py +++ b/spacy/lang/ro/stop_words.py @@ -1,5 +1,6 @@ # Source: https://github.com/stopwords-iso/stopwords-ro -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a abia acea @@ -494,4 +495,5 @@ știu ți ție -""".split()) +""".split() +) diff --git a/spacy/lang/ru/lex_attrs.py b/spacy/lang/ru/lex_attrs.py index 63b1cead810..e0b35bdc07f 100644 --- a/spacy/lang/ru/lex_attrs.py +++ b/spacy/lang/ru/lex_attrs.py @@ -1,6 +1,8 @@ from ...attrs import LIKE_NUM -_num_words = list(set(""" +_num_words = list( + set( + """ ноль ноля нолю нолём ноле нулевой нулевого нулевому нулевым нулевом нулевая нулевую нулевое нулевые нулевых нулевыми четверть четверти четвертью четвертей четвертям четвертями четвертях @@ -201,7 +203,9 @@ квинтиллиону квинтиллионов квинтлн i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx xxi xxii xxiii xxiv xxv xxvi xxvii xxvii xxix -""".split())) +""".split() + ) +) def like_num(text): diff --git a/spacy/lang/ru/stop_words.py b/spacy/lang/ru/stop_words.py index 3040adb52b1..d6ea6b42af9 100644 --- a/spacy/lang/ru/stop_words.py +++ b/spacy/lang/ru/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ а авось ага агу аж ай али алло ау ах ая б будем будет будете будешь буду будут будучи будь будьте бы был была были было @@ -106,4 +107,5 @@ ю я явно явных яко якобы якоже -""".split()) +""".split() +) diff --git a/spacy/lang/sa/stop_words.py b/spacy/lang/sa/stop_words.py index eaf0ffaa2c9..30302a14dcb 100644 --- a/spacy/lang/sa/stop_words.py +++ b/spacy/lang/sa/stop_words.py @@ -1,6 +1,7 @@ # Source: https://gist.github.com/Akhilesh28/fe8b8e180f64b72e64751bc31cb6d323 -STOP_WORDS = set(""" +STOP_WORDS = set( + """ अहम् आवाम् वयम् @@ -510,4 +511,5 @@ ह हन्त हि -""".split()) +""".split() +) diff --git a/spacy/lang/si/stop_words.py b/spacy/lang/si/stop_words.py index acae5763b52..7d29bc1b4d8 100644 --- a/spacy/lang/si/stop_words.py +++ b/spacy/lang/si/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ සහ සමග සමඟ @@ -190,4 +191,5 @@ ලෙස පරිදි එහෙත් -""".split()) +""".split() +) diff --git a/spacy/lang/sk/stop_words.py b/spacy/lang/sk/stop_words.py index 6ef4818c3a2..017e7beef39 100644 --- a/spacy/lang/sk/stop_words.py +++ b/spacy/lang/sk/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/Ardevop-sk/stopwords-sk -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a aby aj @@ -419,4 +420,5 @@ ňou ňu že -""".split()) +""".split() +) diff --git a/spacy/lang/sl/lex_attrs.py b/spacy/lang/sl/lex_attrs.py index 6d6b40b4546..3c1493050a1 100644 --- a/spacy/lang/sl/lex_attrs.py +++ b/spacy/lang/sl/lex_attrs.py @@ -2,7 +2,8 @@ from ...attrs import IS_CURRENCY, LIKE_NUM -_num_words = set(""" +_num_words = set( + """ nula ničla nič ena dva tri štiri pet šest sedem osem devet deset enajst dvanajst trinajst štirinajst petnajst šestnajst sedemnajst osemnajst devetnajst dvajset trideset štirideset @@ -17,9 +18,11 @@ šestnajstih šestnajstim šestnajstimi petnajstih petnajstim petnajstimi sedemnajstih sedemnajstim sedemnajstimi osemnajstih osemnajstim osemnajstimi devetnajstih devetnajstim devetnajstimi dvajsetih dvajsetim dvajsetimi - """.split()) + """.split() +) -_ordinal_words = set(""" +_ordinal_words = set( + """ prvi drugi tretji četrti peti šesti sedmi osmi deveti deseti enajsti dvanajsti trinajsti štirinajsti petnajsti šestnajsti sedemnajsti osemnajsti devetnajsti @@ -89,9 +92,11 @@ osemnajstimi devetnajstimi dvajsetimi tridesetimi štiridesetimi petdesetimi šestdesetimi sedemdesetimi osemdesetimi devetdesetimi stotimi tisočimi milijontimi bilijontimi trilijontimi kvadrilijontimi neštetimi - """.split()) + """.split() +) -_currency_words = set(""" +_currency_words = set( + """ evro evra evru evrom evrov evroma evrih evrom evre evri evr eur cent centa centu cenom centov centoma centih centom cente centi dolar dolarja dolarji dolarju dolarjem dolarjev dolarjema dolarjih dolarje usd @@ -104,7 +109,8 @@ jen jena jeni jenu jenom jenov jenoma jenih jene kuna kuni kune kuno kun kunama kunah kunam kunami marka marki marke markama markah markami - """.split()) + """.split() +) def like_num(text): diff --git a/spacy/lang/sl/punctuation.py b/spacy/lang/sl/punctuation.py index dadb54d315c..3be83eba382 100644 --- a/spacy/lang/sl/punctuation.py +++ b/spacy/lang/sl/punctuation.py @@ -5,14 +5,12 @@ CONCAT_QUOTES, CURRENCY, HYPHENS, - LIST_CURRENCY, LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES, PUNCT, UNITS, - merge_chars, ) from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES diff --git a/spacy/lang/sl/stop_words.py b/spacy/lang/sl/stop_words.py index a81c00db269..8491efcb580 100644 --- a/spacy/lang/sl/stop_words.py +++ b/spacy/lang/sl/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/stopwords-iso/stopwords-sl -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a ali b bi bil bila bile bili bilo biti blizu bo bodo bojo bolj bom bomo @@ -79,4 +80,5 @@ z za zadaj zadnji zakaj zaprta zaprti zaprto zdaj zelo zunaj ž že -""".split()) +""".split() +) diff --git a/spacy/lang/sq/stop_words.py b/spacy/lang/sq/stop_words.py index bf1c7a7039c..f2b1a4f4a7b 100644 --- a/spacy/lang/sq/stop_words.py +++ b/spacy/lang/sq/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/andrixh/index-albanian -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a afert ai @@ -224,4 +225,5 @@ vjen yne zakonisht -""".split()) +""".split() +) diff --git a/spacy/lang/sr/stop_words.py b/spacy/lang/sr/stop_words.py index 758964a5853..5df5509d2c4 100644 --- a/spacy/lang/sr/stop_words.py +++ b/spacy/lang/sr/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ а авај ако @@ -388,4 +389,5 @@ ћете ћеш ћу -""".split()) +""".split() +) diff --git a/spacy/lang/sv/stop_words.py b/spacy/lang/sv/stop_words.py index 08251bcff32..2422b2a9e5a 100644 --- a/spacy/lang/sv/stop_words.py +++ b/spacy/lang/sv/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ aderton adertonde adjö aldrig alla allas allt alltid alltså än andra andras annan annat ännu artonde arton åtminstone att åtta åttio åttionde åttonde av även @@ -61,4 +62,5 @@ vad vänster vänstra var vår vara våra varför varifrån varit varken värre varsågod vart vårt vem vems verkligen vi vid vidare viktig viktigare viktigast viktigt vilka vilken vilket vill -""".split()) +""".split() +) diff --git a/spacy/lang/ta/stop_words.py b/spacy/lang/ta/stop_words.py index d6ef21f3b0a..abbff949d79 100644 --- a/spacy/lang/ta/stop_words.py +++ b/spacy/lang/ta/stop_words.py @@ -1,6 +1,7 @@ # Stop words -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ஒரு என்று மற்றும் @@ -126,4 +127,5 @@ வரையில் சற்று எனக் -""".split()) +""".split() +) diff --git a/spacy/lang/te/stop_words.py b/spacy/lang/te/stop_words.py index d2834260898..b18dab697da 100644 --- a/spacy/lang/te/stop_words.py +++ b/spacy/lang/te/stop_words.py @@ -1,6 +1,7 @@ # Source: https://github.com/Xangis/extra-stopwords (MIT License) -STOP_WORDS = set(""" +STOP_WORDS = set( + """ అందరూ అందుబాటులో అడగండి @@ -51,4 +52,5 @@ వేరుగా వ్యతిరేకంగా సంబంధం -""".split()) +""".split() +) diff --git a/spacy/lang/th/stop_words.py b/spacy/lang/th/stop_words.py index 3dd6e56525b..2823281ce95 100644 --- a/spacy/lang/th/stop_words.py +++ b/spacy/lang/th/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ทั้งนี้ ดัง ขอ รวม หลังจาก เป็น หลัง หรือ ๆ เกี่ยวกับ ซึ่งได้แก่ ด้วยเพราะ ด้วยว่า ด้วยเหตุเพราะ ด้วยเหตุว่า สุดๆ เสร็จแล้ว เช่น เข้า ถ้า ถูก ถึง ต่างๆ ใคร เปิดเผย ครา รือ ตาม ใน ได้แก่ ได้แต่ ได้ที่ ตลอดถึง นอกจากว่า นอกนั้น จริง อย่างดี ส่วน เพียงเพื่อ เดียว จัด ทั้งที ทั้งคน ทั้งตัว ไกลๆ @@ -70,4 +71,5 @@ แห่งนี้ แห่งโน้น แห่งไหน แหละ ให้แก่ ใหญ่ ใหญ่โต อย่างมาก อย่างยิ่ง อย่างไรก็ อย่างไรก็ได้ อย่างไรเสีย อย่างละ อย่างหนึ่ง อย่างๆ อัน อันจะ อันได้แก่ อันที่ อันที่จริง อันที่จะ อันเนื่องมาจาก อันละ อันๆ อาจจะ อาจเป็น อาจเป็นด้วย อื่น อื่นๆ เอ็ง เอา ฯ ฯล ฯลฯ 555 กำ ขอโทษ เยี่ยม นี่คือ -""".split()) +""".split() +) diff --git a/spacy/lang/ti/stop_words.py b/spacy/lang/ti/stop_words.py index e0aaf47d3fe..9bd7122007a 100644 --- a/spacy/lang/ti/stop_words.py +++ b/spacy/lang/ti/stop_words.py @@ -1,7 +1,8 @@ # Stop words from Tigrinya Wordcount: https://github.com/fgaim/Tigrinya-WordCount/blob/main/ti_stop_words.txt # Stop words -STOP_WORDS = set(""" +STOP_WORDS = set( + """ 'ምበር 'ሞ 'ቲ 'ታ 'ኳ 'ውን 'ዚ 'የ 'ዩ 'ያ 'ዮም 'ዮን ልዕሊ ሒዙ ሒዛ ሕጂ መበል መን መንጎ መጠን ማለት ምስ ምባል ምእንቲ ምኽንያቱ ምኽንያት ምዃኑ ምዃንና ምዃኖም @@ -22,4 +23,5 @@ ዝነበረ ዝነበረት ዝነበሩ ዝካየድ ዝኸውን ዝኽእል ዝኾነ ዝዀነ የለን ይቕረብ ይብል ይኸውን ይኹን ይኽእል ደኣ ድሕሪ ድማ ገለ ገሊጹ ገና ገይሩ ግና ግን ጥራይ -""".split()) +""".split() +) diff --git a/spacy/lang/tl/stop_words.py b/spacy/lang/tl/stop_words.py index a7bf541990a..2560cdaed6a 100644 --- a/spacy/lang/tl/stop_words.py +++ b/spacy/lang/tl/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set(""" +STOP_WORDS = set( + """ akin aking ako @@ -146,4 +147,5 @@ tungkol una walang -""".split()) +""".split() +) diff --git a/spacy/lang/tn/stop_words.py b/spacy/lang/tn/stop_words.py index a63a455f754..f614771dd11 100644 --- a/spacy/lang/tn/stop_words.py +++ b/spacy/lang/tn/stop_words.py @@ -1,5 +1,6 @@ # Stop words -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ke gareng ga selekanyo tlhwatlhwa yo mongwe se sengwe fa go le jalo gongwe ba na mo tikologong jaaka kwa morago nna gonne ka sa pele nako teng @@ -15,4 +16,5 @@ bonala e tshwanang bogolo tsenya tsweetswee karolo sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa tlhano lesometlhano botlalo lekgolo -""".split()) +""".split() +) diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index e80423e5150..b7d91d86f0d 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -34,11 +34,11 @@ # host & domain names # mods: match is case-sensitive, so include [A-Z] r"(?:" # noqa: E131 - r"(?:" # noqa: E131 - r"[A-Za-z0-9\u00a1-\uffff]" # noqa: E131 - r"[A-Za-z0-9\u00a1-\uffff_-]{0,62}" - r")?" - r"[A-Za-z0-9\u00a1-\uffff]\." + r"(?:" # noqa: E131 + r"[A-Za-z0-9\u00a1-\uffff]" # noqa: E131 + r"[A-Za-z0-9\u00a1-\uffff_-]{0,62}" + r")?" + r"[A-Za-z0-9\u00a1-\uffff]\." r")+" # TLD identifier # mods: use ALPHA_LOWER instead of a wider range so that this doesn't match @@ -111,7 +111,8 @@ BASE_EXCEPTIONS[orth] = [{ORTH: orth}] -emoticons = set(r""" +emoticons = set( + r""" :) :-) :)) @@ -242,7 +243,8 @@ ¯\(ツ)/¯ (╯°□°)╯︵┻━┻ ><(((*> -""".split()) +""".split() +) for orth in emoticons: diff --git a/spacy/lang/tr/stop_words.py b/spacy/lang/tr/stop_words.py index 5323cf32d9c..85dcff6a53a 100644 --- a/spacy/lang/tr/stop_words.py +++ b/spacy/lang/tr/stop_words.py @@ -1,5 +1,6 @@ # Source: https://github.com/stopwords-iso/stopwords-tr -STOP_WORDS = set(""" +STOP_WORDS = set( + """ acaba acep adamakıllı @@ -552,4 +553,5 @@ zaten zati zira -""".split()) +""".split() +) diff --git a/spacy/lang/tt/stop_words.py b/spacy/lang/tt/stop_words.py index 8f146d9150a..44169b757e5 100644 --- a/spacy/lang/tt/stop_words.py +++ b/spacy/lang/tt/stop_words.py @@ -1,6 +1,7 @@ # Tatar stopwords are from https://github.com/aliiae/stopwords-tt -STOP_WORDS = set("""алай алайса алар аларга аларда алардан аларны аларның аларча +STOP_WORDS = set( + """алай алайса алар аларга аларда алардан аларны аларның аларча алары аларын аларынга аларында аларыннан аларының алтмыш алтмышынчы алтмышынчыга алтмышынчыда алтмышынчыдан алтмышынчылар алтмышынчыларга алтмышынчыларда алтмышынчылардан алтмышынчыларны алтмышынчыларның алтмышынчыны алтмышынчының @@ -168,4 +169,5 @@ өстәп өч өчен өченче өченчегә өченчедә өченчедән өченчеләр өченчеләргә өченчеләрдә өченчеләрдән өченчеләрне өченчеләрнең өченчене өченченең өчләп -өчәрләп""".split()) +өчәрләп""".split() +) diff --git a/spacy/lang/uk/stop_words.py b/spacy/lang/uk/stop_words.py index 517c300070a..b11d7a044a3 100644 --- a/spacy/lang/uk/stop_words.py +++ b/spacy/lang/uk/stop_words.py @@ -1,4 +1,5 @@ -STOP_WORDS = set("""а +STOP_WORDS = set( + """а або адже аж @@ -464,4 +465,5 @@ якій якого якої -якщо""".split()) +якщо""".split() +) diff --git a/spacy/lang/ur/lex_attrs.py b/spacy/lang/ur/lex_attrs.py index 916a47bfd19..e590ed3e303 100644 --- a/spacy/lang/ur/lex_attrs.py +++ b/spacy/lang/ur/lex_attrs.py @@ -5,8 +5,7 @@ # https://en.wikibooks.org/wiki/Urdu/Vocabulary/Numbers # https://www.urdu-english.com/lessons/beginner/numbers -_num_words = ( - """ایک دو تین چار پانچ چھ سات آٹھ نو دس گیارہ بارہ تیرہ چودہ پندرہ سولہ سترہ +_num_words = """ایک دو تین چار پانچ چھ سات آٹھ نو دس گیارہ بارہ تیرہ چودہ پندرہ سولہ سترہ اٹهارا انیس بیس اکیس بائیس تئیس چوبیس پچیس چھببیس ستایس اٹھائس انتيس تیس اکتیس بتیس تینتیس چونتیس پینتیس چھتیس سینتیس ارتیس انتالیس چالیس اکتالیس بیالیس تیتالیس @@ -18,7 +17,6 @@ سٹیاسی اٹھیاسی نواسی نوے اکانوے بانوے ترانوے چورانوے پچانوے چھیانوے ستانوے اٹھانوے ننانوے سو """.split() -) # source https://www.google.com/intl/ur/inputtools/try/ diff --git a/spacy/lang/ur/stop_words.py b/spacy/lang/ur/stop_words.py index 00f0dd2d6b4..abfa3649713 100644 --- a/spacy/lang/ur/stop_words.py +++ b/spacy/lang/ur/stop_words.py @@ -1,5 +1,6 @@ # Source: collected from different resource on internet -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ثھی خو گی @@ -508,4 +509,5 @@ ہورہی ثبعث ضت -""".split()) +""".split() +) diff --git a/spacy/lang/vi/stop_words.py b/spacy/lang/vi/stop_words.py index 9163e10938e..3481701d5ea 100644 --- a/spacy/lang/vi/stop_words.py +++ b/spacy/lang/vi/stop_words.py @@ -1,5 +1,6 @@ # Source: https://github.com/stopwords/vietnamese-stopwords -STOP_WORDS = set(""" +STOP_WORDS = set( + """ a_lô a_ha ai @@ -1942,4 +1943,5 @@ ừ_ào ừ_ừ ử -""".split("\n")) +""".split("\n") +) diff --git a/spacy/lang/zh/stop_words.py b/spacy/lang/zh/stop_words.py index d54fe689504..42ae4a1de04 100644 --- a/spacy/lang/zh/stop_words.py +++ b/spacy/lang/zh/stop_words.py @@ -1,6 +1,7 @@ # stop words as whitespace-separated list # Chinese stop words,maybe not enough -STOP_WORDS = set(""" +STOP_WORDS = set( + """ ! " # @@ -1894,4 +1895,5 @@ ~± ~+ ¥ -""".split()) +""".split() +) diff --git a/spacy/language.py b/spacy/language.py index dcf436c65fe..8e91018254e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1323,7 +1323,7 @@ def get_examples(): # Make sure the config is interpolated so we can resolve subsections config = self.config.interpolate() # These are the settings provided in the [initialize] block in the config - I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) + I = registry.resolve(config["initialize"], schema=ConfigSchemaInit) # type: ignore[arg-type] before_init = I["before_init"] if before_init is not None: before_init(self) @@ -1353,7 +1353,7 @@ def get_examples(): proc.initialize(get_examples, nlp=self, **p_settings) pretrain_cfg = config.get("pretraining") if pretrain_cfg: - P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain) + P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain) # type: ignore[arg-type] init_tok2vec(self, P, I) self._link_components() self._optimizer = sgd @@ -1589,9 +1589,7 @@ def pipe( # noqa: F811 if batch_size is None: batch_size = self.batch_size - pipes = ( - [] - ) # contains functools.partial objects to easily create multiprocess worker. + pipes = [] # contains functools.partial objects to easily create multiprocess worker. for name, proc in self.pipeline: if name in disable: continue @@ -1626,7 +1624,11 @@ def _has_gpu_model(self, disable: Iterable[str]): if name in disable or not is_trainable: continue - if hasattr(proc, "model") and hasattr(proc.model, "ops") and isinstance(proc.model.ops, CupyOps): # type: ignore + if ( + hasattr(proc, "model") + and hasattr(proc.model, "ops") + and isinstance(proc.model.ops, CupyOps) + ): # type: ignore return True return False @@ -1821,7 +1823,7 @@ def from_config( orig_pretraining = config.pop("pretraining", None) config["components"] = {} if auto_fill: - filled = registry.fill(config, validate=validate, schema=ConfigSchema) + filled = registry.fill(config, validate=validate, schema=ConfigSchema) # type: ignore[arg-type] else: filled = config filled["components"] = orig_pipeline @@ -1830,7 +1832,9 @@ def from_config( filled["pretraining"] = orig_pretraining config["pretraining"] = orig_pretraining resolved_nlp = registry.resolve( - filled["nlp"], validate=validate, schema=ConfigSchemaNlp + filled["nlp"], + validate=validate, + schema=ConfigSchemaNlp, # type: ignore[arg-type] ) create_tokenizer = resolved_nlp["tokenizer"] create_vectors = resolved_nlp["vectors"] diff --git a/spacy/matcher/dependencymatcher.pyi b/spacy/matcher/dependencymatcher.pyi index d84a30a58b0..3d744dfce4b 100644 --- a/spacy/matcher/dependencymatcher.pyi +++ b/spacy/matcher/dependencymatcher.pyi @@ -48,10 +48,12 @@ class DependencyMatcher: *, on_match: Optional[ Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any] - ] = ... + ] = ..., ) -> None: ... def has_key(self, key: Union[str, int]) -> bool: ... - def get(self, key: Union[str, int], default: Optional[Any] = ...) -> Tuple[ + def get( + self, key: Union[str, int], default: Optional[Any] = ... + ) -> Tuple[ Optional[ Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any] ], diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi index c33b534cbd2..e474d250d22 100644 --- a/spacy/matcher/matcher.pyi +++ b/spacy/matcher/matcher.pyi @@ -33,7 +33,7 @@ class Matcher: on_match: Optional[ Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any] ] = ..., - greedy: Optional[str] = ... + greedy: Optional[str] = ..., ) -> None: ... def remove(self, key: str) -> None: ... def has_key(self, key: Union[str, int]) -> bool: ... @@ -56,7 +56,7 @@ class Matcher: *, as_spans: Literal[False] = ..., allow_missing: bool = ..., - with_alignments: bool = ... + with_alignments: bool = ..., ) -> List[Tuple[int, int, int]]: ... @overload def __call__( @@ -65,6 +65,6 @@ class Matcher: *, as_spans: Literal[True], allow_missing: bool = ..., - with_alignments: bool = ... + with_alignments: bool = ..., ) -> List[Span]: ... def _normalize_key(self, key: Any) -> Any: ... diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi index 27f6ba373fc..0f56699d63f 100644 --- a/spacy/matcher/phrasematcher.pyi +++ b/spacy/matcher/phrasematcher.pyi @@ -1,4 +1,4 @@ -from typing import Any, Callable, Dict, List, Optional, Tuple, Union, overload +from typing import Any, Callable, List, Optional, Tuple, Union, overload from ..compat import Literal from ..tokens import Doc, Span diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index ccc830e35c1..a71f85f6e63 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -57,7 +57,7 @@ cdef class PhraseMatcher: attr = "ORTH" if attr == "IS_SENT_START": attr = "SENT_START" - if attr.lower() not in TokenPattern().dict(): + if attr.lower() not in TokenPattern().model_dump(): raise ValueError(Errors.E152.format(attr=attr)) self.attr = IDS.get(attr) diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py index fde73f35b5b..8cc4d25743e 100644 --- a/spacy/ml/_character_embed.py +++ b/spacy/ml/_character_embed.py @@ -4,7 +4,6 @@ from thinc.types import Floats2d from ..tokens import Doc -from ..util import registry def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]: diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py index cdcac0c3812..464c32594dc 100644 --- a/spacy/ml/_precomputable_affine.py +++ b/spacy/ml/_precomputable_affine.py @@ -1,7 +1,5 @@ from thinc.api import Model, normal_init -from ..util import registry - def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1): model = Model( diff --git a/spacy/ml/callbacks.py b/spacy/ml/callbacks.py index fefb170ba21..d9976cea80a 100644 --- a/spacy/ml/callbacks.py +++ b/spacy/ml/callbacks.py @@ -2,14 +2,12 @@ import inspect import types import warnings -from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Set, Type +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Set from thinc.layers import with_nvtx_range -from thinc.model import Model, wrap_model_recursive from thinc.util import use_nvtx_range from ..errors import Warnings -from ..util import registry if TYPE_CHECKING: # This lets us add type hints for mypy etc. without causing circular imports diff --git a/spacy/ml/extract_ngrams.py b/spacy/ml/extract_ngrams.py index d571973122e..9f54b48899e 100644 --- a/spacy/ml/extract_ngrams.py +++ b/spacy/ml/extract_ngrams.py @@ -1,7 +1,6 @@ from thinc.api import Model from ..attrs import LOWER -from ..util import registry def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model: diff --git a/spacy/ml/extract_spans.py b/spacy/ml/extract_spans.py index d3456b705a6..925bfd45c31 100644 --- a/spacy/ml/extract_spans.py +++ b/spacy/ml/extract_spans.py @@ -3,8 +3,6 @@ from thinc.api import Model, to_numpy from thinc.types import Ints1d, Ragged -from ..util import registry - def extract_spans() -> Model[Tuple[Ragged, Ragged], Ragged]: """Extract spans from a sequence of source arrays, as specified by an array diff --git a/spacy/ml/featureextractor.py b/spacy/ml/featureextractor.py index fb4e3c39aea..ad376e15f25 100644 --- a/spacy/ml/featureextractor.py +++ b/spacy/ml/featureextractor.py @@ -1,6 +1,6 @@ from typing import Callable, List, Tuple, Union -from thinc.api import Model, registry +from thinc.api import Model from thinc.types import Ints2d from ..tokens import Doc diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 8b12720db20..05ad9a27287 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -23,7 +23,6 @@ get_candidates_batch, ) from ...tokens import Doc, Span -from ...util import registry from ...vocab import Vocab from ..extract_spans import extract_spans @@ -122,7 +121,7 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: return get_candidates -def create_candidates_batch() -> ( - Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]] -): +def create_candidates_batch() -> Callable[ + [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] +]: return get_candidates_batch diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 7c68fe48126..9beecf878ad 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -1,5 +1,5 @@ from functools import partial -from typing import TYPE_CHECKING, Any, Callable, Iterable, List, Optional, Tuple, cast +from typing import TYPE_CHECKING, Callable, Iterable, List, Optional, Tuple, cast import numpy from thinc.api import ( @@ -21,7 +21,7 @@ from ...attrs import ID, ORTH from ...errors import Errors -from ...util import OOV_RANK, registry +from ...util import OOV_RANK from ...vectors import Mode as VectorsMode if TYPE_CHECKING: @@ -199,7 +199,7 @@ def mlm_initialize(model: Model, X=None, Y=None): layers=[wrapped_model], init=mlm_initialize, refs={"wrapped": wrapped_model}, - dims={dim: None for dim in wrapped_model.dim_names}, + dims=dict.fromkeys(wrapped_model.dim_names), ) mlm_model.set_ref("wrapped", wrapped_model) return mlm_model diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 9ff0ac8ba3c..20b8f6d6e80 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -1,4 +1,4 @@ -from typing import List, Optional, cast +from typing import List, Optional from thinc.api import Linear, Model, chain, list2array, use_ops, zero_init from thinc.types import Floats2d @@ -6,7 +6,6 @@ from ...compat import Literal from ...errors import Errors from ...tokens import Doc -from ...util import registry from .._precomputable_affine import PrecomputableAffine from ..tb_framework import TransitionModel diff --git a/spacy/ml/models/span_finder.py b/spacy/ml/models/span_finder.py index 8081ed92b70..226b736c7eb 100644 --- a/spacy/ml/models/span_finder.py +++ b/spacy/ml/models/span_finder.py @@ -4,7 +4,6 @@ from thinc.types import Floats1d, Floats2d from ...tokens import Doc -from ...util import registry InT = List[Doc] OutT = Floats2d diff --git a/spacy/ml/models/spancat.py b/spacy/ml/models/spancat.py index 91dfb41ed7f..697d1df4d35 100644 --- a/spacy/ml/models/spancat.py +++ b/spacy/ml/models/spancat.py @@ -18,7 +18,6 @@ from thinc.types import Floats2d, Ragged from ...tokens import Doc -from ...util import registry from ..extract_spans import extract_spans diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py index aec4276dbd8..d3b090de005 100644 --- a/spacy/ml/models/tagger.py +++ b/spacy/ml/models/tagger.py @@ -4,7 +4,6 @@ from thinc.types import Floats2d from ...tokens import Doc -from ...util import registry def build_tagger_model( diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 49c0dd7077c..8194ab3101e 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -36,7 +36,6 @@ from ...attrs import ORTH from ...errors import Errors from ...tokens import Doc -from ...util import registry from ..extract_ngrams import extract_ngrams from ..staticvectors import StaticVectors from .tok2vec import get_tok2vec_width diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index b2b803b6ed0..ade84274475 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -17,14 +17,13 @@ with_array, with_padded, ) -from thinc.types import Floats2d, Ints1d, Ints2d, Ragged +from thinc.types import Floats2d, Ints2d, Ragged from ...attrs import intify_attr from ...errors import Errors from ...ml import _character_embed from ...pipeline.tok2vec import Tok2VecListener from ...tokens import Doc -from ...util import registry from ..featureextractor import FeatureExtractor from ..staticvectors import StaticVectors diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index 122ef379544..d90acdaf008 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -1,7 +1,7 @@ import warnings -from typing import Callable, List, Optional, Sequence, Tuple, cast +from typing import Callable, List, Optional, Tuple, cast -from thinc.api import Model, Ops, registry +from thinc.api import Model, Ops from thinc.initializers import glorot_uniform_init from thinc.types import Floats1d, Floats2d, Ints1d, Ragged from thinc.util import partial @@ -19,7 +19,7 @@ def StaticVectors( *, dropout: Optional[float] = None, init_W: Callable = glorot_uniform_init, - key_attr: str = "ORTH" + key_attr: str = "ORTH", ) -> Model[List[Doc], Ragged]: """Embed Doc objects with their vocab's vectors table, applying a learned linear projection to control the dimensionality. If a dropout rate is diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 16c894f6c5c..e538b9e88c0 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -1,6 +1,5 @@ from thinc.api import Model, noop -from ..util import registry from .parser_model import ParserStepModel diff --git a/spacy/pipe_analysis.py b/spacy/pipe_analysis.py index d26884487d3..b564b466e50 100644 --- a/spacy/pipe_analysis.py +++ b/spacy/pipe_analysis.py @@ -23,7 +23,7 @@ def validate_attrs(values: Iterable[str]) -> Iterable[str]: values (Iterable[str]): The string attributes to check, e.g. `["token.pos"]`. RETURNS (Iterable[str]): The checked attributes. """ - data = dot_to_dict({value: True for value in values}) + data = dot_to_dict(dict.fromkeys(values, True)) objs = {"doc": Doc, "token": Token, "span": Span} for obj_key, attrs in data.items(): if obj_key == "span": @@ -100,7 +100,7 @@ def analyze_pipes( all_attrs.update(meta.requires) result["summary"][name] = {key: getattr(meta, key, None) for key in keys} prev_pipes = nlp.pipeline[:i] - requires = {annot: False for annot in meta.requires} + requires = dict.fromkeys(meta.requires, False) if requires: for prev_name, prev_pipe in prev_pipes: prev_meta = nlp.get_pipe_meta(prev_name) diff --git a/spacy/pipeline/_edit_tree_internals/schemas.py b/spacy/pipeline/_edit_tree_internals/schemas.py index 89f2861ceac..ef7a076b6cd 100644 --- a/spacy/pipeline/_edit_tree_internals/schemas.py +++ b/spacy/pipeline/_edit_tree_internals/schemas.py @@ -1,12 +1,16 @@ from collections import defaultdict from typing import Any, Dict, List, Union -try: - from pydantic.v1 import BaseModel, Field, ValidationError - from pydantic.v1.types import StrictBool, StrictInt, StrictStr -except ImportError: - from pydantic import BaseModel, Field, ValidationError # type: ignore - from pydantic.types import StrictBool, StrictInt, StrictStr # type: ignore +from pydantic import ( + BaseModel, + ConfigDict, + Field, + RootModel, + StrictBool, + StrictInt, + StrictStr, + ValidationError, +) class MatchNodeSchema(BaseModel): @@ -15,20 +19,18 @@ class MatchNodeSchema(BaseModel): prefix_tree: StrictInt = Field(..., title="Prefix tree") suffix_tree: StrictInt = Field(..., title="Suffix tree") - class Config: - extra = "forbid" + model_config = ConfigDict(extra="forbid") class SubstNodeSchema(BaseModel): orig: Union[int, StrictStr] = Field(..., title="Original substring") subst: Union[int, StrictStr] = Field(..., title="Replacement substring") - class Config: - extra = "forbid" + model_config = ConfigDict(extra="forbid") -class EditTreeSchema(BaseModel): - __root__: Union[MatchNodeSchema, SubstNodeSchema] +class EditTreeSchema(RootModel[Union[MatchNodeSchema, SubstNodeSchema]]): + pass def validate_edit_tree(obj: Dict[str, Any]) -> List[str]: @@ -38,7 +40,7 @@ def validate_edit_tree(obj: Dict[str, Any]) -> List[str]: RETURNS (List[str]): A list of error messages, if available. """ try: - EditTreeSchema.parse_obj(obj) + EditTreeSchema.model_validate(obj) return [] except ValidationError as e: errors = e.errors() diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index cc1e2e37a64..d4f96ec014b 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -1,5 +1,4 @@ import importlib -import sys from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union @@ -14,12 +13,14 @@ from ..tokens import Doc, Span from ..tokens._retokenize import normalize_token_attrs, set_token_attrs from ..training import Example -from ..util import SimpleFrozenList, registry +from ..util import SimpleFrozenList from ..vocab import Vocab from .pipe import Pipe MatcherPatternType = List[Dict[Union[int, str], Any]] -AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]] +AttributeRulerPatternType = Dict[ + str, Union[List[MatcherPatternType], MatcherPatternType, Dict, int] +] TagMapType = Dict[str, Dict[Union[int, str], Union[int, str]]] MorphRulesType = Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]] @@ -137,7 +138,8 @@ def match(self, doc: Doc): matches = self.matcher(doc, allow_missing=True, as_spans=False) # Sort by the attribute ID, so that later rules have precedence matches = [ - (int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches # type: ignore + (int(self.vocab.strings[m_id]), m_id, s, e) + for m_id, s, e in matches # type: ignore ] matches.sort() return matches diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py index 0941b43c1ce..77f033b1c48 100644 --- a/spacy/pipeline/edit_tree_lemmatizer.py +++ b/spacy/pipeline/edit_tree_lemmatizer.py @@ -1,5 +1,4 @@ import importlib -import sys from collections import Counter from itertools import islice from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, cast diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 6a1ed11dfc5..4b23fee6249 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -1,6 +1,5 @@ import importlib import random -import sys from itertools import islice from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Union @@ -16,9 +15,8 @@ from ..scorer import Scorer from ..tokens import Doc, Span from ..training import Example, validate_examples, validate_get_examples -from ..util import SimpleFrozenList, registry +from ..util import SimpleFrozenList from ..vocab import Vocab -from .legacy.entity_linker import EntityLinker_v1 from .pipe import deserialize_config from .trainable_pipe import TrainablePipe diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 2b8c9830720..0728c3f0006 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -1,5 +1,4 @@ import importlib -import sys import warnings from collections import defaultdict from pathlib import Path @@ -14,7 +13,7 @@ from ..scorer import get_ner_prf from ..tokens import Doc, Span from ..training import Example -from ..util import SimpleFrozenList, ensure_path, from_disk, registry, to_disk +from ..util import SimpleFrozenList, ensure_path, from_disk, to_disk from .pipe import Pipe DEFAULT_ENT_ID_SEP = "||" diff --git a/spacy/pipeline/factories.py b/spacy/pipeline/factories.py index f796f2dc8a5..8c71067b32e 100644 --- a/spacy/pipeline/factories.py +++ b/spacy/pipeline/factories.py @@ -14,9 +14,10 @@ ) # Import factory default configurations -from ..pipeline.entity_linker import DEFAULT_NEL_MODEL, EntityLinker, EntityLinker_v1 +from ..pipeline.entity_linker import DEFAULT_NEL_MODEL, EntityLinker from ..pipeline.entityruler import DEFAULT_ENT_ID_SEP, EntityRuler from ..pipeline.functions import DocCleaner, TokenSplitter +from ..pipeline.legacy import EntityLinker_v1 from ..pipeline.lemmatizer import Lemmatizer from ..pipeline.morphologizer import DEFAULT_MORPH_MODEL, Morphologizer from ..pipeline.multitask import DEFAULT_MT_MODEL, MultitaskObjective @@ -24,8 +25,8 @@ from ..pipeline.sentencizer import Sentencizer from ..pipeline.senter import DEFAULT_SENTER_MODEL, SentenceRecognizer from ..pipeline.span_finder import DEFAULT_SPAN_FINDER_MODEL, SpanFinder -from ..pipeline.span_ruler import DEFAULT_SPANS_KEY as SPAN_RULER_DEFAULT_SPANS_KEY from ..pipeline.span_ruler import ( + DEFAULT_SPANS_KEY as SPAN_RULER_DEFAULT_SPANS_KEY, SpanRuler, prioritize_existing_ents_filter, prioritize_new_ents_filter, diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index e4a3d6d1d5b..b2aa8b708c8 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -1,5 +1,4 @@ import importlib -import sys import warnings from typing import Any, Dict diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index e8d467ef8db..f518e1072ac 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -1,5 +1,4 @@ import importlib -import sys import warnings from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union @@ -13,7 +12,7 @@ from ..scorer import Scorer from ..tokens import Doc, Token from ..training import Example -from ..util import SimpleFrozenList, logger, registry +from ..util import SimpleFrozenList, logger from ..vocab import Vocab from .pipe import Pipe diff --git a/spacy/pipeline/pipe.pyi b/spacy/pipeline/pipe.pyi index 9a1c11cefea..55cfd1fec95 100644 --- a/spacy/pipeline/pipe.pyi +++ b/spacy/pipeline/pipe.pyi @@ -7,7 +7,6 @@ from typing import ( Iterator, List, NoReturn, - Optional, Tuple, Union, ) diff --git a/spacy/pipeline/span_finder.py b/spacy/pipeline/span_finder.py index 26c9efb6a9d..7ee19de04b0 100644 --- a/spacy/pipeline/span_finder.py +++ b/spacy/pipeline/span_finder.py @@ -1,5 +1,4 @@ import importlib -import sys from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple from thinc.api import Config, Model, Optimizer, set_dropout_rate @@ -10,7 +9,6 @@ from ..scorer import Scorer from ..tokens import Doc, Span from ..training import Example -from ..util import registry from .spancat import DEFAULT_SPANS_KEY from .trainable_pipe import TrainablePipe diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py index 98287ba1d22..703eda61561 100644 --- a/spacy/pipeline/span_ruler.py +++ b/spacy/pipeline/span_ruler.py @@ -1,5 +1,4 @@ import importlib -import sys import warnings from functools import partial from pathlib import Path @@ -27,7 +26,7 @@ from ..scorer import Scorer from ..tokens import Doc, Span from ..training import Example -from ..util import SimpleFrozenList, ensure_path, registry +from ..util import SimpleFrozenList, ensure_path from .pipe import Pipe PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 805a0538f01..9b945df35b5 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -1,5 +1,4 @@ import importlib -import sys from dataclasses import dataclass from functools import partial from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast @@ -14,7 +13,6 @@ from ..scorer import Scorer from ..tokens import Doc, Span, SpanGroup from ..training import Example, validate_examples -from ..util import registry from ..vocab import Vocab from .trainable_pipe import TrainablePipe diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 36b569edc63..7b03c7e81d4 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -1,18 +1,15 @@ import importlib -import sys from itertools import islice from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple import numpy -from thinc.api import Config, Model, Optimizer, get_array_module, set_dropout_rate -from thinc.types import Floats2d +from thinc.api import Config, Model, Optimizer, set_dropout_rate from ..errors import Errors from ..language import Language from ..scorer import Scorer from ..tokens import Doc from ..training import Example, validate_examples, validate_get_examples -from ..util import registry from ..vocab import Vocab from .trainable_pipe import TrainablePipe diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index 32845490d4e..cc094bf6197 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -1,17 +1,13 @@ import importlib -import sys from itertools import islice -from typing import Any, Callable, Dict, Iterable, List, Optional +from typing import Any, Callable, Dict, Iterable, Optional from thinc.api import Config, Model -from thinc.types import Floats2d from ..errors import Errors from ..language import Language from ..scorer import Scorer -from ..tokens import Doc from ..training import Example, validate_get_examples -from ..util import registry from ..vocab import Vocab from .textcat import TextCategorizer diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index ce0296bf5f3..4e2e5af846f 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -1,5 +1,4 @@ import importlib -import sys from itertools import islice from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence diff --git a/spacy/schemas.py b/spacy/schemas.py index fa987b90f19..359c3fd0f83 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -1,5 +1,4 @@ import inspect -import re from collections import defaultdict from enum import Enum from typing import ( @@ -16,34 +15,19 @@ Union, ) -try: - from pydantic.v1 import ( - BaseModel, - ConstrainedStr, - Field, - StrictBool, - StrictFloat, - StrictInt, - StrictStr, - ValidationError, - create_model, - validator, - ) - from pydantic.v1.main import ModelMetaclass -except ImportError: - from pydantic import ( # type: ignore - BaseModel, - ConstrainedStr, - Field, - StrictBool, - StrictFloat, - StrictInt, - StrictStr, - ValidationError, - create_model, - validator, - ) - from pydantic.main import ModelMetaclass # type: ignore +from pydantic import ( + BaseModel, + ConfigDict, + Field, + StrictBool, + StrictFloat, + StrictInt, + StrictStr, + ValidationError, + constr, + create_model, + field_validator, +) from thinc.api import ConfigValidationError, Model, Optimizer from thinc.config import Promise @@ -89,14 +73,9 @@ def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]: # Initialization -class ArgSchemaConfig: - extra = "forbid" - arbitrary_types_allowed = True +ArgSchemaConfig = ConfigDict(extra="forbid", arbitrary_types_allowed=True) - -class ArgSchemaConfigExtra: - extra = "forbid" - arbitrary_types_allowed = True +ArgSchemaConfigExtra = ConfigDict(extra="forbid", arbitrary_types_allowed=True) def get_arg_model( @@ -105,7 +84,7 @@ def get_arg_model( exclude: Iterable[str] = tuple(), name: str = "ArgModel", strict: bool = True, -) -> ModelMetaclass: +) -> type[BaseModel]: """Generate a pydantic model for function arguments. func (Callable): The function to generate the schema for. @@ -113,7 +92,7 @@ def get_arg_model( name (str): Name of created model class. strict (bool): Don't allow extra arguments if no variable keyword arguments are allowed on the function. - RETURNS (ModelMetaclass): A pydantic model. + RETURNS (type[BaseModel]): A pydantic model. """ sig_args = {} try: @@ -167,7 +146,7 @@ def validate_init_settings( """ schema = get_arg_model(func, exclude=exclude, name="InitArgModel") try: - return schema(**settings).dict() + return schema.model_validate(settings).model_dump() except ValidationError as e: block = "initialize" if not section else f"initialize.{section}" title = f"Error validating initialization settings in [{block}]" @@ -228,11 +207,10 @@ class TokenPatternString(BaseModel): None, alias="fuzzy9" ) - class Config: - extra = "forbid" - allow_population_by_field_name = True # allow alias and field name + model_config = ConfigDict(extra="forbid", populate_by_name=True) - @validator("*", pre=True, each_item=True, allow_reuse=True) + @field_validator("*", mode="before") + @classmethod def raise_for_none(cls, v): if v is None: raise ValueError("None / null is not allowed") @@ -253,11 +231,10 @@ class TokenPatternNumber(BaseModel): GT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">") LT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<") - class Config: - extra = "forbid" - allow_population_by_field_name = True # allow alias and field name + model_config = ConfigDict(extra="forbid", populate_by_name=True) - @validator("*", pre=True, each_item=True, allow_reuse=True) + @field_validator("*", mode="before") + @classmethod def raise_for_none(cls, v): if v is None: raise ValueError("None / null is not allowed") @@ -271,11 +248,10 @@ class TokenPatternOperatorSimple(str, Enum): exclamation: StrictStr = StrictStr("!") -class TokenPatternOperatorMinMax(ConstrainedStr): - regex = re.compile(r"^({\d+}|{\d+,\d*}|{\d*,\d+})$") +TokenPatternOperatorMinMax = constr(pattern=r"^(\{\d+\}|\{\d+,\d*\}|\{\d*,\d+\})$") -TokenPatternOperator = Union[TokenPatternOperatorSimple, TokenPatternOperatorMinMax] +TokenPatternOperator = Union[TokenPatternOperatorSimple, TokenPatternOperatorMinMax] # type: ignore[valid-type] StringValue = Union[TokenPatternString, StrictStr] NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat] UnderscoreValue = Union[ @@ -323,12 +299,14 @@ class TokenPattern(BaseModel): op: Optional[TokenPatternOperator] = None underscore: Optional[Dict[StrictStr, UnderscoreValue]] = Field(None, alias="_") - class Config: - extra = "forbid" - allow_population_by_field_name = True - alias_generator = lambda value: value.upper() + model_config = ConfigDict( + extra="forbid", + populate_by_name=True, + alias_generator=lambda value: value.upper(), + ) - @validator("*", pre=True, allow_reuse=True) + @field_validator("*", mode="before") + @classmethod def raise_for_none(cls, v): if v is None: raise ValueError("None / null is not allowed") @@ -336,10 +314,9 @@ def raise_for_none(cls, v): class TokenPatternSchema(BaseModel): - pattern: List[TokenPattern] = Field(..., min_items=1) + pattern: List[TokenPattern] = Field(..., min_length=1) - class Config: - extra = "forbid" + model_config = ConfigDict(extra="forbid") # Model meta @@ -397,9 +374,7 @@ class ConfigSchemaTraining(BaseModel): before_update: Optional[Callable[["Language", Dict[str, Any]], None]] = Field(..., title="Optional callback that is invoked at the start of each training step") # fmt: on - class Config: - extra = "forbid" - arbitrary_types_allowed = True + model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) class ConfigSchemaNlp(BaseModel): @@ -415,14 +390,11 @@ class ConfigSchemaNlp(BaseModel): vectors: Callable = Field(..., title="Vectors implementation") # fmt: on - class Config: - extra = "forbid" - arbitrary_types_allowed = True + model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) class ConfigSchemaPretrainEmpty(BaseModel): - class Config: - extra = "forbid" + model_config = ConfigDict(extra="forbid") class ConfigSchemaPretrain(BaseModel): @@ -439,9 +411,7 @@ class ConfigSchemaPretrain(BaseModel): objective: Callable[["Vocab", Model], Model] = Field(..., title="A function that creates the pretraining objective.") # fmt: on - class Config: - extra = "forbid" - arbitrary_types_allowed = True + model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) class ConfigSchemaInit(BaseModel): @@ -450,15 +420,13 @@ class ConfigSchemaInit(BaseModel): lookups: Optional[Lookups] = Field(..., title="Vocabulary lookups, e.g. lexeme normalization") vectors: Optional[StrictStr] = Field(..., title="Path to vectors") init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights") - tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize") - components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component") + tokenizer: Dict[StrictStr, Any] = Field(..., title="Arguments to be passed into Tokenizer.initialize") + components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., title="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component") before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization") after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization") # fmt: on - class Config: - extra = "forbid" - arbitrary_types_allowed = True + model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True) class ConfigSchema(BaseModel): @@ -469,9 +437,7 @@ class ConfigSchema(BaseModel): corpora: Dict[str, Reader] initialize: ConfigSchemaInit - class Config: - extra = "allow" - arbitrary_types_allowed = True + model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True) CONFIG_SCHEMAS = { diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 65e851cae4e..a65cdb6fc62 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -205,11 +205,13 @@ cdef class StringStore: if mem is None: mem = Pool() self.mem = mem - yield mem - for key in self._transient_keys: - map_clear(self._map.c_map, key) - self._transient_keys.clear() - self.mem = self._non_temp_mem + try: + yield mem + finally: + for key in self._transient_keys: + map_clear(self._map.c_map, key) + self._transient_keys.clear() + self.mem = self._non_temp_mem def add(self, string: str, allow_transient: Optional[bool] = None) -> int: """Add a string to the StringStore. diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index d72c916efb0..ef098ec1a9f 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -60,12 +60,12 @@ def test_issue1757(): """Test comparison against None doesn't cause segfault.""" doc = Doc(Vocab(), words=["a", "b", "c"]) assert not doc[0] < None - assert not doc[0] is None + assert doc[0] is not None assert doc[0] >= None assert not doc[:2] < None - assert not doc[:2] is None + assert doc[:2] is not None assert doc[:2] >= None - assert not doc.vocab["a"] is None + assert doc.vocab["a"] is not None assert not doc.vocab["a"] < None diff --git a/spacy/tests/lang/bg/test_tokenizer.py b/spacy/tests/lang/bg/test_tokenizer.py index 2e2c45001ef..b16ef12d880 100644 --- a/spacy/tests/lang/bg/test_tokenizer.py +++ b/spacy/tests/lang/bg/test_tokenizer.py @@ -1,6 +1,3 @@ -import pytest - - def test_bg_tokenizer_handles_final_diacritics(bg_tokenizer): text = "Ня̀маше яйца̀. Ня̀маше яйца̀." tokens = bg_tokenizer(text) diff --git a/spacy/tests/lang/es/test_noun_chunks.py b/spacy/tests/lang/es/test_noun_chunks.py index 8e5fe83540c..50d49fcc28e 100644 --- a/spacy/tests/lang/es/test_noun_chunks.py +++ b/spacy/tests/lang/es/test_noun_chunks.py @@ -48,13 +48,13 @@ [(0,4)] ), # Tengo un gato y un perro -> un gato, un perro - ( + ( ["Tengo", "un", "gato", "y", "un", "perro"], [0, 2, 0, 5, 5, 0], ["ROOT", "det", "obj", "cc", "det", "conj"], ["VERB", "DET", "NOUN", "CCONJ", "DET", "NOUN"], [(1,3), (4,6)] - + ), # Dom Pedro II -> Dom Pedro II ( @@ -101,11 +101,11 @@ [1, 1, 3, 1, 5, 1], ['det', 'ROOT', 'case', 'nmod', 'case', 'nmod'], ['DET', 'NOUN', 'ADP', 'PROPN', 'ADP', 'NOUN'], - [(0,2), (3,4), (5,6)] - + [(0,2), (3,4), (5,6)] + ), # El gato regordete de Susana y su amigo -> el gato regordete, Susana, su amigo - ( + ( ['El', 'gato', 'regordete', 'de', 'Susana', 'y', 'su', 'amigo'], [1, 1, 1, 4, 1, 7, 7, 1], ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'conj'], diff --git a/spacy/tests/lang/et/test_tokenizer.py b/spacy/tests/lang/et/test_tokenizer.py index f0f8079cae8..8bee2288033 100644 --- a/spacy/tests/lang/et/test_tokenizer.py +++ b/spacy/tests/lang/et/test_tokenizer.py @@ -2,8 +2,7 @@ ET_BASIC_TOKENIZATION_TESTS = [ ( - "Kedagi ei või piinata ega ebainimlikult või alandavalt kohelda " - "ega karistada.", + "Kedagi ei või piinata ega ebainimlikult või alandavalt kohelda ega karistada.", [ "Kedagi", "ei", diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py index 436e07b29d0..d413f1f2211 100644 --- a/spacy/tests/lang/fr/test_noun_chunks.py +++ b/spacy/tests/lang/fr/test_noun_chunks.py @@ -35,7 +35,7 @@ [(0, 2)], ), # det + adj + noun - # Le vieux Londres -> Le vieux Londres + # Le vieux Londres -> Le vieux Londres ( ['Les', 'vieux', 'Londres'], [2, 2, 2], @@ -144,13 +144,13 @@ ), # Two NPs conjuncted # Il a un chien et un chat -> Il, un chien, un chat - ( + ( ['Il', 'a', 'un', 'chien', 'et', 'un', 'chat'], [1, 1, 3, 1, 6, 6, 3], ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'], ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], [(0,1), (2,4), (5,7)] - + ), # Two NPs together # l'écrivain brésilien Aníbal Machado -> l'écrivain brésilien, Aníbal Machado @@ -195,12 +195,12 @@ [0, 2, 0, 4, 2], ['ROOT', 'case', 'nmod', 'case', 'nmod'], ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], - [(0,1), (2,3), (4,5)] - + [(0,1), (2,3), (4,5)] + ), # Several NPs # Le gros chat de Susana et son amie -> Le gros chat, Susana, son amie - ( + ( ['Le', 'gros', 'chat', 'de', 'Susana', 'et', 'son', 'amie'], [2, 2, 2, 4, 2, 7, 7, 2], ['det', 'amod', 'ROOT', 'case', 'nmod', 'cc', 'det', 'conj'], diff --git a/spacy/tests/lang/it/test_noun_chunks.py b/spacy/tests/lang/it/test_noun_chunks.py index 7f6659ee7bd..5fd39ab01b9 100644 --- a/spacy/tests/lang/it/test_noun_chunks.py +++ b/spacy/tests/lang/it/test_noun_chunks.py @@ -62,7 +62,7 @@ [(0,3)], ), # noun + adj plural - # mucche bianche + # mucche bianche ( ["mucche", "bianche"], [0, 0], @@ -117,13 +117,13 @@ ), # Two NPs conjuncted # Ho un cane e un gatto -> un cane, un gatto - ( + ( ['Ho', 'un', 'cane', 'e', 'un', 'gatto'], [0, 2, 0, 5, 5, 0], ['ROOT', 'det', 'obj', 'cc', 'det', 'conj'], ['VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], [(1,3), (4,6)] - + ), # Two NPs together # lo scrittore brasiliano Aníbal Machado -> lo scrittore brasiliano, Aníbal Machado @@ -177,12 +177,12 @@ [0, 2, 0, 4, 2], ['ROOT', 'case', 'nmod', 'case', 'nmod'], ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], - [(0,1), (2,3), (4,5)] - + [(0,1), (2,3), (4,5)] + ), # Several NPs # Il gatto grasso di Susana e la sua amica -> Il gatto grasso, Susana, sua amica - ( + ( ['Il', 'gatto', 'grasso', 'di', 'Susana', 'e', 'la', 'sua', 'amica'], [1, 1, 1, 4, 1, 8, 8, 8, 1], ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'det:poss', 'conj'], diff --git a/spacy/tests/lang/la/test_exception.py b/spacy/tests/lang/la/test_exception.py index 966ae22cfec..9a6e6a422c5 100644 --- a/spacy/tests/lang/la/test_exception.py +++ b/spacy/tests/lang/la/test_exception.py @@ -1,6 +1,3 @@ -import pytest - - def test_la_tokenizer_handles_exc_in_text(la_tokenizer): text = "scio te omnia facturum, ut nobiscum quam primum sis" tokens = la_tokenizer(text) diff --git a/spacy/tests/lang/pt/test_noun_chunks.py b/spacy/tests/lang/pt/test_noun_chunks.py index eee96d593b1..5dd7bfd3b82 100644 --- a/spacy/tests/lang/pt/test_noun_chunks.py +++ b/spacy/tests/lang/pt/test_noun_chunks.py @@ -126,13 +126,13 @@ ), # Two NPs conjuncted # Eu tenho um cachorro e um gato -> Eu, um cacharo, um gato - ( + ( ["Eu", "tenho", "um", "cachorro", "e", "um", "gato"], [1, 1, 3, 1, 6, 6, 3], ['nsubj', 'ROOT', 'det', 'obj', 'cc', 'det', 'conj'], ['PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'DET', 'NOUN'], [(0,1), (2,4), (5,7)] - + ), # Two NPs together # o escritor brasileiro Aníbal Machado -> o escritor brasileiro, Aníbal Machado @@ -186,12 +186,12 @@ [0, 2, 0, 4, 2], ['ROOT', 'case', 'nmod', 'case', 'nmod'], ['NOUN', 'ADP', 'NOUN', 'ADP', 'PROPN'], - [(0,1), (2,3), (4,5)] - + [(0,1), (2,3), (4,5)] + ), # Several NPs # O gato gordo da Susana e seu amigo -> O gato gordo, Susana, seu amigo - ( + ( ['O', 'gato', 'gordo', 'da', 'Susana', 'e', 'seu', 'amigo'], [1, 1, 1, 4, 1, 7, 7, 1], ['det', 'ROOT', 'amod', 'case', 'nmod', 'cc', 'det', 'conj'], diff --git a/spacy/tests/lang/sl/test_text.py b/spacy/tests/lang/sl/test_text.py index a2a93207729..4781bebcdcb 100644 --- a/spacy/tests/lang/sl/test_text.py +++ b/spacy/tests/lang/sl/test_text.py @@ -1,6 +1,3 @@ -import pytest - - def test_long_text(sl_tokenizer): # Excerpt: European Convention on Human Rights text = """ diff --git a/spacy/tests/lang/sq/test_text.py b/spacy/tests/lang/sq/test_text.py index 44eedaa5487..24d60afdf20 100644 --- a/spacy/tests/lang/sq/test_text.py +++ b/spacy/tests/lang/sq/test_text.py @@ -1,6 +1,3 @@ -import pytest - - def test_long_text(sq_tokenizer): # Excerpt: European Convention on Human Rights text = """ diff --git a/spacy/tests/lang/xx/test_text.py b/spacy/tests/lang/xx/test_text.py index 477f0ebe271..a4eafdcb98e 100644 --- a/spacy/tests/lang/xx/test_text.py +++ b/spacy/tests/lang/xx/test_text.py @@ -1,6 +1,3 @@ -import pytest - - def test_long_text(xx_tokenizer): # Excerpt: Text in Skolt Sami taken from https://www.samediggi.fi text = """ diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py index cdba5e39709..cb9b4ec539a 100644 --- a/spacy/tests/lang/zh/test_tokenizer.py +++ b/spacy/tests/lang/zh/test_tokenizer.py @@ -1,5 +1,5 @@ import pytest -from thinc.api import ConfigValidationError +from confection import ConfigValidationError from spacy.lang.zh import Chinese, _get_pkuseg_trie_data diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index 45f9f4ee718..e0dc7d5a1dd 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -10,30 +10,46 @@ # Bad patterns flagged in all cases ([{"XX": "foo"}], 1, 1), ([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2, 1), - ([{"IS_PUNCT": True, "OP": "$"}], 1, 1), + ( + [{"IS_PUNCT": True, "OP": "$"}], + 2, + 1, + ), # v2: union reports 2 errors (enum + pattern) ([{"_": "foo"}], 1, 1), ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1), ([{"ENT_IOB": "foo"}], 1, 1), ([1, 2, 3], 3, 1), - ([{"TEXT": "foo", "OP": "{,}"}], 1, 1), - ([{"TEXT": "foo", "OP": "{,4}4"}], 1, 1), - ([{"TEXT": "foo", "OP": "{a,3}"}], 1, 1), - ([{"TEXT": "foo", "OP": "{a}"}], 1, 1), - ([{"TEXT": "foo", "OP": "{,a}"}], 1, 1), - ([{"TEXT": "foo", "OP": "{1,2,3}"}], 1, 1), - ([{"TEXT": "foo", "OP": "{1, 3}"}], 1, 1), - ([{"TEXT": "foo", "OP": "{-2}"}], 1, 1), + ([{"TEXT": "foo", "OP": "{,}"}], 2, 1), # v2: union reports 2 errors + ([{"TEXT": "foo", "OP": "{,4}4"}], 2, 1), # v2: union reports 2 errors + ([{"TEXT": "foo", "OP": "{a,3}"}], 2, 1), # v2: union reports 2 errors + ([{"TEXT": "foo", "OP": "{a}"}], 2, 1), # v2: union reports 2 errors + ([{"TEXT": "foo", "OP": "{,a}"}], 2, 1), # v2: union reports 2 errors + ([{"TEXT": "foo", "OP": "{1,2,3}"}], 2, 1), # v2: union reports 2 errors + ([{"TEXT": "foo", "OP": "{1, 3}"}], 2, 1), # v2: union reports 2 errors + ([{"TEXT": "foo", "OP": "{-2}"}], 2, 1), # v2: union reports 2 errors # Bad patterns flagged outside of Matcher - ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0) + ( + [{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], + 7, + 0, + ), # v2: more detailed union errors # Bad patterns not flagged with minimal checks - ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2, 0), - ([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 4, 0), # prev: (2, 0) - ([{"LENGTH": {"VALUE": 5}}], 2, 0), # prev: (1, 0) - ([{"TEXT": {"VALUE": "foo"}}], 2, 0), # prev: (1, 0) + ( + [{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], + 5, + 0, + ), # v2: more detailed union errors + ( + [{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], + 5, + 0, + ), # v2: more detailed union errors + ([{"LENGTH": {"VALUE": 5}}], 3, 0), # v2: more detailed union errors + ([{"TEXT": {"VALUE": "foo"}}], 2, 0), ([{"IS_DIGIT": -1}], 1, 0), - ([{"ORTH": -1}], 1, 0), - ([{"ENT_ID": -1}], 1, 0), - ([{"ENT_KB_ID": -1}], 1, 0), + ([{"ORTH": -1}], 2, 0), # v2: union reports 2 errors + ([{"ENT_ID": -1}], 2, 0), # v2: union reports 2 errors + ([{"ENT_KB_ID": -1}], 2, 0), # v2: union reports 2 errors # Good patterns ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0, 0), ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0, 0), diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index ff07c5b454a..f4c6f056aa2 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -15,6 +15,7 @@ def test_build_dependencies(): "cython-lint", "black", "isort", + "ruff", "mypy", "types-dataclasses", "types-mock", @@ -37,7 +38,7 @@ def test_build_dependencies(): req_dict = {} root_dir = Path(__file__).parent - req_file = root_dir / "requirements.txt" + req_file = root_dir / "test.txt" with req_file.open() as f: lines = f.readlines() for line in lines: @@ -48,7 +49,7 @@ def test_build_dependencies(): req_dict[lib] = v # check setup.cfg and compare to requirements.txt # also fails when there are missing or additional libs - setup_file = root_dir / "setup.cfg" + setup_file = root_dir / "test.cfg" with setup_file.open() as f: lines = f.readlines() @@ -59,9 +60,9 @@ def test_build_dependencies(): lib, v = _parse_req(line) if lib and not lib.startswith("cupy") and lib not in libs_ignore_setup: req_v = req_dict.get(lib, None) - assert ( - req_v is not None - ), "{} in setup.cfg but not in requirements.txt".format(lib) + assert req_v is not None, ( + "{} in setup.cfg but not in requirements.txt".format(lib) + ) assert (lib + v) == (lib + req_v), ( "{} has different version in setup.cfg and in requirements.txt: " "{} and {} respectively".format(lib, v, req_v) @@ -73,7 +74,7 @@ def test_build_dependencies(): # check pyproject.toml and compare the versions of the libs to requirements.txt # does not fail when there are missing or additional libs - toml_file = root_dir / "pyproject.toml" + toml_file = root_dir / "test.toml" with toml_file.open() as f: lines = f.readlines() for line in lines: diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 1b6f49f4cde..74dd026e716 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,9 +1,9 @@ -from typing import Any, Callable, Dict, Iterable, Tuple +from typing import Any, Callable, Dict, Iterable import pytest from numpy.testing import assert_equal -from spacy import Language, registry, util +from spacy import registry, util from spacy.attrs import ENT_KB_ID from spacy.compat import pickle from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase, get_candidates @@ -496,15 +496,15 @@ def get_lowercased_candidates_batch(kb, spans): return [get_lowercased_candidates(kb, span) for span in spans] @registry.misc("spacy.LowercaseCandidateGenerator.v1") - def create_candidates() -> ( - Callable[[InMemoryLookupKB, "Span"], Iterable[Candidate]] - ): + def create_candidates() -> Callable[ + [InMemoryLookupKB, "Span"], Iterable[Candidate] + ]: return get_lowercased_candidates @registry.misc("spacy.LowercaseCandidateBatchGenerator.v1") - def create_candidates_batch() -> ( - Callable[[InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]]] - ): + def create_candidates_batch() -> Callable[ + [InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]] + ]: return get_lowercased_candidates_batch # replace the pipe with a new one with with a different candidate generator diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py index 9854b391e60..71b12227f2c 100644 --- a/spacy/tests/pipeline/test_initialize.py +++ b/spacy/tests/pipeline/test_initialize.py @@ -1,10 +1,5 @@ import pytest - -try: - from pydantic.v1 import StrictBool -except ImportError: - from pydantic import StrictBool # type: ignore - +from pydantic import StrictBool from thinc.api import ConfigValidationError from spacy.lang.en import English @@ -51,7 +46,7 @@ def initialize( errors = e.value.errors assert len(errors) == 1 assert errors[0]["loc"] == ("custom1",) - assert errors[0]["type"] == "value_error.missing" + assert errors[0]["type"] == "missing" init_cfg = { "tokenizer": {"custom": 1}, "components": {name: {"custom1": "x", "custom2": 1}}, @@ -63,7 +58,7 @@ def initialize( errors = e.value.errors assert len(errors) == 1 assert errors[0]["loc"] == ("custom2",) - assert errors[0]["type"] == "value_error.strictbool" + assert errors[0]["type"] == "bool_type" init_cfg = { "tokenizer": {"custom": 1}, "components": {name: {"custom1": "x"}}, diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index b355379bfd0..a8a6c7d136a 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -1,10 +1,5 @@ import pytest - -try: - from pydantic.v1 import StrictInt, StrictStr -except ImportError: - from pydantic import StrictInt, StrictStr # type: ignore - +from pydantic import StrictInt, StrictStr from thinc.api import ConfigValidationError, Linear, Model import spacy diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py index 9b1ddd53012..826086fc7fe 100644 --- a/spacy/tests/pipeline/test_sentencizer.py +++ b/spacy/tests/pipeline/test_sentencizer.py @@ -135,14 +135,38 @@ def test_sentencizer_serialize_bytes(en_vocab): # fmt: off "lang,text", [ - ('bn', 'বাংলা ভাষা (বাঙলা, বাঙ্গলা, তথা বাঙ্গালা নামগুলোতেও পরিচিত) একটি ইন্দো-আর্য ভাষা, যা দক্ষিণ এশিয়ার বাঙালি জাতির প্রধান কথ্য ও লেখ্য ভাষা। মাতৃভাষীর সংখ্যায় বাংলা ইন্দো-ইউরোপীয় ভাষা পরিবারের চতুর্থ ও বিশ্বের ষষ্ঠ বৃহত্তম ভাষা।[৫] মোট ব্যবহারকারীর সংখ্যা অনুসারে বাংলা বিশ্বের সপ্তম বৃহত্তম ভাষা। বাংলা সার্বভৌম ভাষাভিত্তিক জাতিরাষ্ট্র বাংলাদেশের একমাত্র রাষ্ট্রভাষা তথা সরকারি ভাষা[৬] এবং ভারতের পশ্চিমবঙ্গ, ত্রিপুরা, আসামের বরাক উপত্যকার সরকারি ভাষা। বঙ্গোপসাগরে অবস্থিত আন্দামান দ্বীপপুঞ্জের প্রধান কথ্য ভাষা বাংলা। এছাড়া ভারতের ঝাড়খণ্ড, বিহার, মেঘালয়, মিজোরাম, উড়িষ্যা রাজ্যগুলোতে উল্লেখযোগ্য পরিমাণে বাংলাভাষী জনগণ রয়েছে। ভারতে হিন্দির পরেই সর্বাধিক প্রচলিত ভাষা বাংলা।[৭][৮] এছাড়াও মধ্য প্রাচ্য, আমেরিকা ও ইউরোপে উল্লেখযোগ্য পরিমাণে বাংলাভাষী অভিবাসী রয়েছে।[৯] সারা বিশ্বে সব মিলিয়ে ২৬ কোটির অধিক লোক দৈনন্দিন জীবনে বাংলা ব্যবহার করে।[২] বাংলাদেশের জাতীয় সঙ্গীত এবং ভারতের জাতীয় সঙ্গীত ও স্তোত্র বাংলাতে রচিত।'), - ('de', 'Die deutsche Sprache bzw. Deutsch ([dɔʏ̯t͡ʃ]; abgekürzt dt. oder dtsch.) ist eine westgermanische Sprache. Ihr Sprachraum umfasst Deutschland, Österreich, die Deutschschweiz, Liechtenstein, Luxemburg, Ostbelgien, Südtirol, das Elsass und Lothringen sowie Nordschleswig. Außerdem ist sie eine Minderheitensprache in einigen europäischen und außereuropäischen Ländern, z. B. in Rumänien und Südafrika, sowie Nationalsprache im afrikanischen Namibia.'), - ('hi', 'हिन्दी विश्व की एक प्रमुख भाषा है एवं भारत की राजभाषा है। केन्द्रीय स्तर पर भारत में दूसरी आधिकारिक भाषा अंग्रेजी है। यह हिंदुस्तानी भाषा की एक मानकीकृत रूप है जिसमें संस्कृत के तत्सम तथा तद्भव शब्दों का प्रयोग अधिक है और अरबी-फ़ारसी शब्द कम हैं। हिंदी संवैधानिक रूप से भारत की राजभाषा और भारत की सबसे अधिक बोली और समझी जाने वाली भाषा है। हालाँकि, हिन्दी भारत की राष्ट्रभाषा नहीं है,[3] क्योंकि भारत के संविधान में कोई भी भाषा को ऐसा दर्जा नहीं दिया गया था।[4][5] चीनी के बाद यह विश्व में सबसे अधिक बोली जाने वाली भाषा भी है। विश्व आर्थिक मंच की गणना के अनुसार यह विश्व की दस शक्तिशाली भाषाओं में से एक है।[6]'), - ('kn', 'ದ್ರಾವಿಡ ಭಾಷೆಗಳಲ್ಲಿ ಪ್ರಾಮುಖ್ಯವುಳ್ಳ ಭಾಷೆಯೂ ಭಾರತದ ಪುರಾತನವಾದ ಭಾಷೆಗಳಲ್ಲಿ ಒಂದೂ ಆಗಿರುವ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಅದರ ವಿವಿಧ ರೂಪಗಳಲ್ಲಿ ಸುಮಾರು ೪೫ ದಶಲಕ್ಷ ಜನರು ಆಡು ನುಡಿಯಾಗಿ ಬಳಸುತ್ತಲಿದ್ದಾರೆ. ಕನ್ನಡ ಕರ್ನಾಟಕ ರಾಜ್ಯದ ಆಡಳಿತ ಭಾಷೆ.[೧೧] ಜಗತ್ತಿನಲ್ಲಿ ಅತ್ಯಂತ ಹೆಚ್ಚು ಮಂದಿ ಮಾತನಾಡುವ ಭಾಷೆಯೆಂಬ ನೆಲೆಯಲ್ಲಿ ಇಪ್ಪತೊಂಬತ್ತನೆಯ ಸ್ಥಾನ ಕನ್ನಡಕ್ಕಿದೆ. ೨೦೧೧ರ ಜನಗಣತಿಯ ಪ್ರಕಾರ ಜಗತ್ತಿನಲ್ಲಿ ೬.೪ ಕೋಟಿ ಜನಗಳು ಕನ್ನಡ ಮಾತನಾಡುತ್ತಾರೆ ಎಂದು ತಿಳಿದುಬಂದಿದೆ. ಇವರಲ್ಲಿ ೫.೫ ಕೋಟಿ ಜನಗಳ ಮಾತೃಭಾಷೆ ಕನ್ನಡವಾಗಿದೆ. ಬ್ರಾಹ್ಮಿ ಲಿಪಿಯಿಂದ ರೂಪುಗೊಂಡ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಉಪಯೋಗಿಸಿ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಬರೆಯಲಾಗುತ್ತದೆ. ಕನ್ನಡ ಬರಹದ ಮಾದರಿಗಳಿಗೆ ಸಾವಿರದ ಐನೂರು ವರುಷಗಳ ಚರಿತ್ರೆಯಿದೆ. ಕ್ರಿ.ಶ. ಆರನೆಯ ಶತಮಾನದ ಪಶ್ಚಿಮ ಗಂಗ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ [೧೨] ಮತ್ತು ಒಂಬತ್ತನೆಯ ಶತಮಾನದ ರಾಷ್ಟ್ರಕೂಟ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ ಹಳಗನ್ನಡ ಸಾಹಿತ್ಯ ಅತ್ಯಂತ ಹೆಚ್ಚಿನ ರಾಜಾಶ್ರಯ ಪಡೆಯಿತು.[೧೩][೧೪] ಅದಲ್ಲದೆ ಸಾವಿರ ವರುಷಗಳ ಸಾಹಿತ್ಯ ಪರಂಪರೆ ಕನ್ನಡಕ್ಕಿದೆ.[೧೫]ವಿನೋಬಾ ಭಾವೆ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಲಿಪಿಗಳ ರಾಣಿಯೆಂದು ಹೊಗಳಿದ್ದಾರೆ.[ಸೂಕ್ತ ಉಲ್ಲೇಖನ ಬೇಕು]'), - ('si', 'ශ්‍රී ලංකාවේ ප්‍රධාන ජාතිය වන සිංහල ජනයාගේ මව් බස සිංහල වෙයි. අද වන විට මිලියන 20 කට අධික සිංහල සහ මිලියන 3කට අධික සිංහල නොවන ජනගහනයක් සිංහල භාෂාව භාවිත කරති. සිංහල‍ ඉන්දු-යුරෝපීය භාෂාවල උප ගණයක් වන ඉන්දු-ආර්ය භාෂා ගණයට අයිති වන අතර මාල දිවයින භාවිත කරන දිවෙහි භාෂාව සිංහලයෙන් පැවත එන්නකි. සිංහල ශ්‍රී ලංකාවේ නිල භාෂාවයි .'), - ('ta', 'தமிழ் மொழி (Tamil language) தமிழர்களினதும், தமிழ் பேசும் பலரதும் தாய்மொழி ஆகும். தமிழ் திராவிட மொழிக் குடும்பத்தின் முதன்மையான மொழிகளில் ஒன்றும் செம்மொழியும் ஆகும். இந்தியா, இலங்கை, மலேசியா, சிங்கப்பூர் ஆகிய நாடுகளில் அதிக அளவிலும், ஐக்கிய அரபு அமீரகம், தென்னாப்பிரிக்கா, மொரிசியசு, பிஜி, ரீயூனியன், டிரினிடாட் போன்ற நாடுகளில் சிறிய அளவிலும் தமிழ் பேசப்படுகிறது. 1997ஆம் ஆண்டுப் புள்ளி விவரப்படி உலகம் முழுவதிலும் 8 கோடி (80 மில்லியன்) மக்களால் பேசப்படும் தமிழ்[13], ஒரு மொழியைத் தாய்மொழியாகக் கொண்டு பேசும் மக்களின் எண்ணிக்கை அடிப்படையில் பதினெட்டாவது இடத்தில் உள்ளது.[14] இணையத்தில் அதிகம் பயன்படுத்தப்படும் இந்திய மொழிகளில் தமிழ் முதன்மையாக உள்ளதாக 2017 ஆவது ஆண்டில் நடைபெற்ற கூகுள் கணக்கெடுப்பில் தெரிய வந்தது.[15]'), - ('te', 'ఆంధ్ర ప్రదేశ్, తెలంగాణ రాష్ట్రాల అధికార భాష తెలుగు. భారత దేశంలో తెలుగు మాతృభాషగా మాట్లాడే 8.7 కోట్ల (2001) జనాభాతో [1] ప్రాంతీయ భాషలలో మొదటి స్థానంలో ఉంది. ప్రపంచంలోని ప్రజలు అత్యధికముగా మాట్లాడే భాషలలో 15 స్థానములోనూ, భారత దేశములో హిందీ, తర్వాత స్థానములోనూ నిలుస్తుంది. పాతవైన ప్రపంచ భాష గణాంకాల (ఎథ్నోలాగ్) ప్రకారం ప్రపంచవ్యాప్తంగా 7.4 కోట్లు మందికి మాతృభాషగా ఉంది.[2] మొదటి భాషగా మాట్లాడతారు. అతి ప్రాచీన దేశ భాషలలో సంస్కృతము తమిళముతో బాటు తెలుగు భాషను 2008 అక్టోబరు 31న భారత ప్రభుత్వము గుర్తించింది.'), - ('ur', 'اُردُو لشکری زبان[8] (یا جدید معیاری اردو) برصغیر کی معیاری زبانوں میں سے ایک ہے۔ یہ پاکستان کی قومی اور رابطہ عامہ کی زبان ہے، جبکہ بھارت کی چھے ریاستوں کی دفتری زبان کا درجہ رکھتی ہے۔ آئین ہند کے مطابق اسے 22 دفتری شناخت زبانوں میں شامل کیا جاچکا ہے۔ 2001ء کی مردم شماری کے مطابق اردو کو بطور مادری زبان بھارت میں 5.01% فیصد لوگ بولتے ہیں اور اس لحاظ سے یہ بھارت کی چھٹی بڑی زبان ہے جبکہ پاکستان میں اسے بطور مادری زبان 7.59% فیصد لوگ استعمال کرتے ہیں، یہ پاکستان کی پانچویں بڑی زبان ہے۔ اردو تاریخی طور پر ہندوستان کی مسلم آبادی سے جڑی ہے۔[حوالہ درکار] بعض ذخیرہ الفاظ کے علاوہ یہ زبان معیاری ہندی سے قابل فہم ہے جو اس خطے کی ہندوؤں سے منسوب ہے۔[حوالہ درکار] زبانِ اردو کو پہچان و ترقی اس وقت ملی جب برطانوی دور میں انگریز حکمرانوں نے اسے فارسی کی بجائے انگریزی کے ساتھ شمالی ہندوستان کے علاقوں اور جموں و کشمیر میں اسے سنہ 1846ء اور پنجاب میں سنہ 1849ء میں بطور دفتری زبان نافذ کیا۔ اس کے علاوہ خلیجی، یورپی، ایشیائی اور امریکی علاقوں میں اردو بولنے والوں کی ایک بڑی تعداد آباد ہے جو بنیادی طور پر جنوبی ایشیاء سے کوچ کرنے والے اہلِ اردو ہیں۔ 1999ء کے اعداد وشمار کے مطابق اردو زبان کے مجموعی متکلمین کی تعداد دس کروڑ ساٹھ لاکھ کے لگ بھگ تھی۔ اس لحاظ سے یہ دنیا کی نویں بڑی زبان ہے۔'), + ( + "bn", + "বাংলা ভাষা (বাঙলা, বাঙ্গলা, তথা বাঙ্গালা নামগুলোতেও পরিচিত) একটি ইন্দো-আর্য ভাষা, যা দক্ষিণ এশিয়ার বাঙালি জাতির প্রধান কথ্য ও লেখ্য ভাষা। মাতৃভাষীর সংখ্যায় বাংলা ইন্দো-ইউরোপীয় ভাষা পরিবারের চতুর্থ ও বিশ্বের ষষ্ঠ বৃহত্তম ভাষা।[৫] মোট ব্যবহারকারীর সংখ্যা অনুসারে বাংলা বিশ্বের সপ্তম বৃহত্তম ভাষা। বাংলা সার্বভৌম ভাষাভিত্তিক জাতিরাষ্ট্র বাংলাদেশের একমাত্র রাষ্ট্রভাষা তথা সরকারি ভাষা[৬] এবং ভারতের পশ্চিমবঙ্গ, ত্রিপুরা, আসামের বরাক উপত্যকার সরকারি ভাষা। বঙ্গোপসাগরে অবস্থিত আন্দামান দ্বীপপুঞ্জের প্রধান কথ্য ভাষা বাংলা। এছাড়া ভারতের ঝাড়খণ্ড, বিহার, মেঘালয়, মিজোরাম, উড়িষ্যা রাজ্যগুলোতে উল্লেখযোগ্য পরিমাণে বাংলাভাষী জনগণ রয়েছে। ভারতে হিন্দির পরেই সর্বাধিক প্রচলিত ভাষা বাংলা।[৭][৮] এছাড়াও মধ্য প্রাচ্য, আমেরিকা ও ইউরোপে উল্লেখযোগ্য পরিমাণে বাংলাভাষী অভিবাসী রয়েছে।[৯] সারা বিশ্বে সব মিলিয়ে ২৬ কোটির অধিক লোক দৈনন্দিন জীবনে বাংলা ব্যবহার করে।[২] বাংলাদেশের জাতীয় সঙ্গীত এবং ভারতের জাতীয় সঙ্গীত ও স্তোত্র বাংলাতে রচিত।", + ), + ( + "de", + "Die deutsche Sprache bzw. Deutsch ([dɔʏ̯t͡ʃ]; abgekürzt dt. oder dtsch.) ist eine westgermanische Sprache. Ihr Sprachraum umfasst Deutschland, Österreich, die Deutschschweiz, Liechtenstein, Luxemburg, Ostbelgien, Südtirol, das Elsass und Lothringen sowie Nordschleswig. Außerdem ist sie eine Minderheitensprache in einigen europäischen und außereuropäischen Ländern, z. B. in Rumänien und Südafrika, sowie Nationalsprache im afrikanischen Namibia.", + ), + ( + "hi", + "हिन्दी विश्व की एक प्रमुख भाषा है एवं भारत की राजभाषा है। केन्द्रीय स्तर पर भारत में दूसरी आधिकारिक भाषा अंग्रेजी है। यह हिंदुस्तानी भाषा की एक मानकीकृत रूप है जिसमें संस्कृत के तत्सम तथा तद्भव शब्दों का प्रयोग अधिक है और अरबी-फ़ारसी शब्द कम हैं। हिंदी संवैधानिक रूप से भारत की राजभाषा और भारत की सबसे अधिक बोली और समझी जाने वाली भाषा है। हालाँकि, हिन्दी भारत की राष्ट्रभाषा नहीं है,[3] क्योंकि भारत के संविधान में कोई भी भाषा को ऐसा दर्जा नहीं दिया गया था।[4][5] चीनी के बाद यह विश्व में सबसे अधिक बोली जाने वाली भाषा भी है। विश्व आर्थिक मंच की गणना के अनुसार यह विश्व की दस शक्तिशाली भाषाओं में से एक है।[6]", + ), + ( + "kn", + "ದ್ರಾವಿಡ ಭಾಷೆಗಳಲ್ಲಿ ಪ್ರಾಮುಖ್ಯವುಳ್ಳ ಭಾಷೆಯೂ ಭಾರತದ ಪುರಾತನವಾದ ಭಾಷೆಗಳಲ್ಲಿ ಒಂದೂ ಆಗಿರುವ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಅದರ ವಿವಿಧ ರೂಪಗಳಲ್ಲಿ ಸುಮಾರು ೪೫ ದಶಲಕ್ಷ ಜನರು ಆಡು ನುಡಿಯಾಗಿ ಬಳಸುತ್ತಲಿದ್ದಾರೆ. ಕನ್ನಡ ಕರ್ನಾಟಕ ರಾಜ್ಯದ ಆಡಳಿತ ಭಾಷೆ.[೧೧] ಜಗತ್ತಿನಲ್ಲಿ ಅತ್ಯಂತ ಹೆಚ್ಚು ಮಂದಿ ಮಾತನಾಡುವ ಭಾಷೆಯೆಂಬ ನೆಲೆಯಲ್ಲಿ ಇಪ್ಪತೊಂಬತ್ತನೆಯ ಸ್ಥಾನ ಕನ್ನಡಕ್ಕಿದೆ. ೨೦೧೧ರ ಜನಗಣತಿಯ ಪ್ರಕಾರ ಜಗತ್ತಿನಲ್ಲಿ ೬.೪ ಕೋಟಿ ಜನಗಳು ಕನ್ನಡ ಮಾತನಾಡುತ್ತಾರೆ ಎಂದು ತಿಳಿದುಬಂದಿದೆ. ಇವರಲ್ಲಿ ೫.೫ ಕೋಟಿ ಜನಗಳ ಮಾತೃಭಾಷೆ ಕನ್ನಡವಾಗಿದೆ. ಬ್ರಾಹ್ಮಿ ಲಿಪಿಯಿಂದ ರೂಪುಗೊಂಡ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಉಪಯೋಗಿಸಿ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಬರೆಯಲಾಗುತ್ತದೆ. ಕನ್ನಡ ಬರಹದ ಮಾದರಿಗಳಿಗೆ ಸಾವಿರದ ಐನೂರು ವರುಷಗಳ ಚರಿತ್ರೆಯಿದೆ. ಕ್ರಿ.ಶ. ಆರನೆಯ ಶತಮಾನದ ಪಶ್ಚಿಮ ಗಂಗ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ [೧೨] ಮತ್ತು ಒಂಬತ್ತನೆಯ ಶತಮಾನದ ರಾಷ್ಟ್ರಕೂಟ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ ಹಳಗನ್ನಡ ಸಾಹಿತ್ಯ ಅತ್ಯಂತ ಹೆಚ್ಚಿನ ರಾಜಾಶ್ರಯ ಪಡೆಯಿತು.[೧೩][೧೪] ಅದಲ್ಲದೆ ಸಾವಿರ ವರುಷಗಳ ಸಾಹಿತ್ಯ ಪರಂಪರೆ ಕನ್ನಡಕ್ಕಿದೆ.[೧೫]ವಿನೋಬಾ ಭಾವೆ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಲಿಪಿಗಳ ರಾಣಿಯೆಂದು ಹೊಗಳಿದ್ದಾರೆ.[ಸೂಕ್ತ ಉಲ್ಲೇಖನ ಬೇಕು]", + ), + ( + "si", + "ශ්‍රී ලංකාවේ ප්‍රධාන ජාතිය වන සිංහල ජනයාගේ මව් බස සිංහල වෙයි. අද වන විට මිලියන 20 කට අධික සිංහල සහ මිලියන 3කට අධික සිංහල නොවන ජනගහනයක් සිංහල භාෂාව භාවිත කරති. සිංහල‍ ඉන්දු-යුරෝපීය භාෂාවල උප ගණයක් වන ඉන්දු-ආර්ය භාෂා ගණයට අයිති වන අතර මාල දිවයින භාවිත කරන දිවෙහි භාෂාව සිංහලයෙන් පැවත එන්නකි. සිංහල ශ්‍රී ලංකාවේ නිල භාෂාවයි .", + ), + ( + "ta", + "தமிழ் மொழி (Tamil language) தமிழர்களினதும், தமிழ் பேசும் பலரதும் தாய்மொழி ஆகும். தமிழ் திராவிட மொழிக் குடும்பத்தின் முதன்மையான மொழிகளில் ஒன்றும் செம்மொழியும் ஆகும். இந்தியா, இலங்கை, மலேசியா, சிங்கப்பூர் ஆகிய நாடுகளில் அதிக அளவிலும், ஐக்கிய அரபு அமீரகம், தென்னாப்பிரிக்கா, மொரிசியசு, பிஜி, ரீயூனியன், டிரினிடாட் போன்ற நாடுகளில் சிறிய அளவிலும் தமிழ் பேசப்படுகிறது. 1997ஆம் ஆண்டுப் புள்ளி விவரப்படி உலகம் முழுவதிலும் 8 கோடி (80 மில்லியன்) மக்களால் பேசப்படும் தமிழ்[13], ஒரு மொழியைத் தாய்மொழியாகக் கொண்டு பேசும் மக்களின் எண்ணிக்கை அடிப்படையில் பதினெட்டாவது இடத்தில் உள்ளது.[14] இணையத்தில் அதிகம் பயன்படுத்தப்படும் இந்திய மொழிகளில் தமிழ் முதன்மையாக உள்ளதாக 2017 ஆவது ஆண்டில் நடைபெற்ற கூகுள் கணக்கெடுப்பில் தெரிய வந்தது.[15]", + ), + ( + "te", + "ఆంధ్ర ప్రదేశ్, తెలంగాణ రాష్ట్రాల అధికార భాష తెలుగు. భారత దేశంలో తెలుగు మాతృభాషగా మాట్లాడే 8.7 కోట్ల (2001) జనాభాతో [1] ప్రాంతీయ భాషలలో మొదటి స్థానంలో ఉంది. ప్రపంచంలోని ప్రజలు అత్యధికముగా మాట్లాడే భాషలలో 15 స్థానములోనూ, భారత దేశములో హిందీ, తర్వాత స్థానములోనూ నిలుస్తుంది. పాతవైన ప్రపంచ భాష గణాంకాల (ఎథ్నోలాగ్) ప్రకారం ప్రపంచవ్యాప్తంగా 7.4 కోట్లు మందికి మాతృభాషగా ఉంది.[2] మొదటి భాషగా మాట్లాడతారు. అతి ప్రాచీన దేశ భాషలలో సంస్కృతము తమిళముతో బాటు తెలుగు భాషను 2008 అక్టోబరు 31న భారత ప్రభుత్వము గుర్తించింది.", + ), + ( + "ur", + "اُردُو لشکری زبان[8] (یا جدید معیاری اردو) برصغیر کی معیاری زبانوں میں سے ایک ہے۔ یہ پاکستان کی قومی اور رابطہ عامہ کی زبان ہے، جبکہ بھارت کی چھے ریاستوں کی دفتری زبان کا درجہ رکھتی ہے۔ آئین ہند کے مطابق اسے 22 دفتری شناخت زبانوں میں شامل کیا جاچکا ہے۔ 2001ء کی مردم شماری کے مطابق اردو کو بطور مادری زبان بھارت میں 5.01% فیصد لوگ بولتے ہیں اور اس لحاظ سے یہ بھارت کی چھٹی بڑی زبان ہے جبکہ پاکستان میں اسے بطور مادری زبان 7.59% فیصد لوگ استعمال کرتے ہیں، یہ پاکستان کی پانچویں بڑی زبان ہے۔ اردو تاریخی طور پر ہندوستان کی مسلم آبادی سے جڑی ہے۔[حوالہ درکار] بعض ذخیرہ الفاظ کے علاوہ یہ زبان معیاری ہندی سے قابل فہم ہے جو اس خطے کی ہندوؤں سے منسوب ہے۔[حوالہ درکار] زبانِ اردو کو پہچان و ترقی اس وقت ملی جب برطانوی دور میں انگریز حکمرانوں نے اسے فارسی کی بجائے انگریزی کے ساتھ شمالی ہندوستان کے علاقوں اور جموں و کشمیر میں اسے سنہ 1846ء اور پنجاب میں سنہ 1849ء میں بطور دفتری زبان نافذ کیا۔ اس کے علاوہ خلیجی، یورپی، ایشیائی اور امریکی علاقوں میں اردو بولنے والوں کی ایک بڑی تعداد آباد ہے جو بنیادی طور پر جنوبی ایشیاء سے کوچ کرنے والے اہلِ اردو ہیں۔ 1999ء کے اعداد وشمار کے مطابق اردو زبان کے مجموعی متکلمین کی تعداد دس کروڑ ساٹھ لاکھ کے لگ بھگ تھی۔ اس لحاظ سے یہ دنیا کی نویں بڑی زبان ہے۔", + ), ], # fmt: on ) diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 4310e41ab47..e7499404f63 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -24,12 +24,12 @@ ) from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer +from spacy.tests.tok2vec import build_lazy_init_tok2vec as _ # noqa: F401 from spacy.tokens import Doc, DocBin from spacy.training import Example from spacy.training.initialize import init_nlp # Ensure that the architecture gets added to the registry. -from ..tok2vec import build_lazy_init_tok2vec as _ from ..util import make_tempdir TRAIN_DATA_SINGLE_LABEL = [ diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 998f0472c7e..ddd9a990c65 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -65,10 +65,30 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): "embed_arch,embed_config", # fmt: off [ - ("spacy.MultiHashEmbed.v1", {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}), - ("spacy.MultiHashEmbed.v1", {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}), - ("spacy.CharacterEmbed.v1", {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}), - ("spacy.CharacterEmbed.v1", {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}), + ( + "spacy.MultiHashEmbed.v1", + { + "rows": [100, 100], + "attrs": ["SHAPE", "LOWER"], + "include_static_vectors": False, + }, + ), + ( + "spacy.MultiHashEmbed.v1", + { + "rows": [100, 20], + "attrs": ["ORTH", "PREFIX"], + "include_static_vectors": False, + }, + ), + ( + "spacy.CharacterEmbed.v1", + {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}, + ), + ( + "spacy.CharacterEmbed.v1", + {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}, + ), ], # fmt: on ) @@ -76,10 +96,26 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): "tok2vec_arch,encode_arch,encode_config", # fmt: off [ - ("spacy.Tok2Vec.v1", "spacy.MaxoutWindowEncoder.v1", {"window_size": 1, "maxout_pieces": 3, "depth": 2}), - ("spacy.Tok2Vec.v2", "spacy.MaxoutWindowEncoder.v2", {"window_size": 1, "maxout_pieces": 3, "depth": 2}), - ("spacy.Tok2Vec.v1", "spacy.MishWindowEncoder.v1", {"window_size": 1, "depth": 6}), - ("spacy.Tok2Vec.v2", "spacy.MishWindowEncoder.v2", {"window_size": 1, "depth": 6}), + ( + "spacy.Tok2Vec.v1", + "spacy.MaxoutWindowEncoder.v1", + {"window_size": 1, "maxout_pieces": 3, "depth": 2}, + ), + ( + "spacy.Tok2Vec.v2", + "spacy.MaxoutWindowEncoder.v2", + {"window_size": 1, "maxout_pieces": 3, "depth": 2}, + ), + ( + "spacy.Tok2Vec.v1", + "spacy.MishWindowEncoder.v1", + {"window_size": 1, "depth": 6}, + ), + ( + "spacy.Tok2Vec.v2", + "spacy.MishWindowEncoder.v2", + {"window_size": 1, "depth": 6}, + ), ], # fmt: on ) @@ -164,9 +200,9 @@ def test_init_tok2vec(): @pytest.mark.parametrize("with_vectors", (False, True)) def test_tok2vec_listener(with_vectors): orig_config = Config().from_str(cfg_string) - orig_config["components"]["tok2vec"]["model"]["embed"][ - "include_static_vectors" - ] = with_vectors + orig_config["components"]["tok2vec"]["model"]["embed"]["include_static_vectors"] = ( + with_vectors + ) nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True) if with_vectors: diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 43d5f62837a..4bac40f0b89 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -153,21 +153,171 @@ def test_issue12566(factory: str, output_file: str): "Briana McNaira - Cultural Chaos .", "tokens": [ # fmt: off - {"id": 0, "start": 0, "end": 8, "tag": "ADV", "pos": "ADV", "morph": "Degree=Pos", "lemma": "niedawno", "dep": "advmod", "head": 1, }, - {"id": 1, "start": 9, "end": 15, "tag": "PRAET", "pos": "VERB", "morph": "Animacy=Hum|Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act", "lemma": "czytać", "dep": "ROOT", "head": 1, }, - {"id": 2, "start": 16, "end": 18, "tag": "AGLT", "pos": "NOUN", "morph": "Animacy=Inan|Case=Ins|Gender=Masc|Number=Sing", "lemma": "em", "dep": "iobj", "head": 1, }, - {"id": 3, "start": 19, "end": 23, "tag": "ADJ", "pos": "ADJ", "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing", "lemma": "nowy", "dep": "amod", "head": 4, }, - {"id": 4, "start": 24, "end": 31, "tag": "SUBST", "pos": "NOUN", "morph": "Case=Acc|Gender=Fem|Number=Sing", "lemma": "książka", "dep": "obj", "head": 1, }, - {"id": 5, "start": 32, "end": 43, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Nhum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "znakomit", "dep": "acl", "head": 4, }, - {"id": 6, "start": 44, "end": 54, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Hum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "szkockiy", "dep": "amod", "head": 7, }, - {"id": 7, "start": 55, "end": 66, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "medioznawca", "dep": "iobj", "head": 5, }, - {"id": 8, "start": 67, "end": 68, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Comm", "lemma": ",", "dep": "punct", "head": 9, }, - {"id": 9, "start": 69, "end": 75, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "Brian", "dep": "nmod", "head": 4, }, - {"id": 10, "start": 76, "end": 83, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "McNair", "dep": "flat", "head": 9, }, - {"id": 11, "start": 84, "end": 85, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Dash", "lemma": "-", "dep": "punct", "head": 12, }, - {"id": 12, "start": 86, "end": 94, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Cultural", "dep": "conj", "head": 4, }, - {"id": 13, "start": 95, "end": 100, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Chaos", "dep": "flat", "head": 12, }, - {"id": 14, "start": 101, "end": 102, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Peri", "lemma": ".", "dep": "punct", "head": 1, }, + { + "id": 0, + "start": 0, + "end": 8, + "tag": "ADV", + "pos": "ADV", + "morph": "Degree=Pos", + "lemma": "niedawno", + "dep": "advmod", + "head": 1, + }, + { + "id": 1, + "start": 9, + "end": 15, + "tag": "PRAET", + "pos": "VERB", + "morph": "Animacy=Hum|Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act", + "lemma": "czytać", + "dep": "ROOT", + "head": 1, + }, + { + "id": 2, + "start": 16, + "end": 18, + "tag": "AGLT", + "pos": "NOUN", + "morph": "Animacy=Inan|Case=Ins|Gender=Masc|Number=Sing", + "lemma": "em", + "dep": "iobj", + "head": 1, + }, + { + "id": 3, + "start": 19, + "end": 23, + "tag": "ADJ", + "pos": "ADJ", + "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing", + "lemma": "nowy", + "dep": "amod", + "head": 4, + }, + { + "id": 4, + "start": 24, + "end": 31, + "tag": "SUBST", + "pos": "NOUN", + "morph": "Case=Acc|Gender=Fem|Number=Sing", + "lemma": "książka", + "dep": "obj", + "head": 1, + }, + { + "id": 5, + "start": 32, + "end": 43, + "tag": "ADJ", + "pos": "ADJ", + "morph": "Animacy=Nhum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", + "lemma": "znakomit", + "dep": "acl", + "head": 4, + }, + { + "id": 6, + "start": 44, + "end": 54, + "tag": "ADJ", + "pos": "ADJ", + "morph": "Animacy=Hum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", + "lemma": "szkockiy", + "dep": "amod", + "head": 7, + }, + { + "id": 7, + "start": 55, + "end": 66, + "tag": "SUBST", + "pos": "NOUN", + "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", + "lemma": "medioznawca", + "dep": "iobj", + "head": 5, + }, + { + "id": 8, + "start": 67, + "end": 68, + "tag": "INTERP", + "pos": "PUNCT", + "morph": "PunctType=Comm", + "lemma": ",", + "dep": "punct", + "head": 9, + }, + { + "id": 9, + "start": 69, + "end": 75, + "tag": "SUBST", + "pos": "PROPN", + "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", + "lemma": "Brian", + "dep": "nmod", + "head": 4, + }, + { + "id": 10, + "start": 76, + "end": 83, + "tag": "SUBST", + "pos": "PROPN", + "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", + "lemma": "McNair", + "dep": "flat", + "head": 9, + }, + { + "id": 11, + "start": 84, + "end": 85, + "tag": "INTERP", + "pos": "PUNCT", + "morph": "PunctType=Dash", + "lemma": "-", + "dep": "punct", + "head": 12, + }, + { + "id": 12, + "start": 86, + "end": 94, + "tag": "SUBST", + "pos": "PROPN", + "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", + "lemma": "Cultural", + "dep": "conj", + "head": 4, + }, + { + "id": 13, + "start": 95, + "end": 100, + "tag": "SUBST", + "pos": "NOUN", + "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", + "lemma": "Chaos", + "dep": "flat", + "head": 12, + }, + { + "id": 14, + "start": 101, + "end": 102, + "tag": "INTERP", + "pos": "PUNCT", + "morph": "PunctType=Peri", + "lemma": ".", + "dep": "punct", + "head": 1, + }, # fmt: on ], } @@ -420,8 +570,14 @@ def test_cli_converters_conll_ner_to_docs(): (["--x.foo=bar"], {"x.foo": "bar"}), (["--x.foo", "--x.bar", "baz"], {"x.foo": True, "x.bar": "baz"}), (["--x.foo", "--x.bar=baz"], {"x.foo": True, "x.bar": "baz"}), - (["--x.foo", "10.1", "--x.bar", "--x.baz", "false"], {"x.foo": 10.1, "x.bar": True, "x.baz": False}), - (["--x.foo", "10.1", "--x.bar", "--x.baz=false"], {"x.foo": 10.1, "x.bar": True, "x.baz": False}) + ( + ["--x.foo", "10.1", "--x.bar", "--x.baz", "false"], + {"x.foo": 10.1, "x.bar": True, "x.baz": False}, + ), + ( + ["--x.foo", "10.1", "--x.bar", "--x.baz=false"], + {"x.foo": 10.1, "x.bar": True, "x.baz": False}, + ), # fmt: on ], ) @@ -499,11 +655,11 @@ def test_model_recommendations(): # fmt: off "parser,textcat,tagger", " parser, textcat ,tagger ", - 'parser,textcat,tagger', - ' parser, textcat ,tagger ', + "parser,textcat,tagger", + " parser, textcat ,tagger ", ' "parser"," textcat " ,"tagger "', " 'parser',' textcat ' ,'tagger '", - '[parser,textcat,tagger]', + "[parser,textcat,tagger]", '["parser","textcat","tagger"]', '[" parser" ,"textcat ", " tagger " ]', "[parser,textcat,tagger]", @@ -522,7 +678,7 @@ def test_string_to_list(value): [ # fmt: off "1,2,3", - '[1,2,3]', + "[1,2,3]", '["1","2","3"]', '[" 1" ,"2 ", " 3 " ]', "[' 1' , '2', ' 3 ' ]", @@ -1073,6 +1229,9 @@ def test_download_rejects_relative_urls(monkeypatch): relative path in the filename""" monkeypatch.setattr(download_module, "run_command", lambda cmd: None) + monkeypatch.setattr( + download_module, "_get_pip_install_cmd", lambda: ["pip", "install"] + ) # Check that normal download works download_module.download("en_core_web_sm-3.7.1", direct=True) diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 1789d60ea4c..c72e26c3444 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -1,16 +1,18 @@ import os -import sys from pathlib import Path import pytest import srsly from typer.testing import CliRunner +from spacy.cli import load_all_commands from spacy.cli._util import app, get_git_version from spacy.tokens import Doc, DocBin, Span from .util import make_tempdir, normalize_whitespace +load_all_commands() + def has_git(): try: @@ -285,30 +287,30 @@ def test_find_function_invalid(): example_ents = ["O", "O", "I-ANIMAL"] example_spans = [(2, 3, "ANIMAL")] -TRAIN_EXAMPLE_1 = dict( - words=example_words_1, - lemmas=example_lemmas_1, - tags=example_tags, - morphs=example_morphs, - deps=example_deps, - heads=[1, 1, 1], - pos=example_pos, - ents=example_ents, - spans=example_spans, - cats={"CAT": 1.0, "DOG": 0.0}, -) -TRAIN_EXAMPLE_2 = dict( - words=example_words_2, - lemmas=example_lemmas_2, - tags=example_tags, - morphs=example_morphs, - deps=example_deps, - heads=[1, 1, 1], - pos=example_pos, - ents=example_ents, - spans=example_spans, - cats={"CAT": 0.0, "DOG": 1.0}, -) +TRAIN_EXAMPLE_1 = { + "words": example_words_1, + "lemmas": example_lemmas_1, + "tags": example_tags, + "morphs": example_morphs, + "deps": example_deps, + "heads": [1, 1, 1], + "pos": example_pos, + "ents": example_ents, + "spans": example_spans, + "cats": {"CAT": 1.0, "DOG": 0.0}, +} +TRAIN_EXAMPLE_2 = { + "words": example_words_2, + "lemmas": example_lemmas_2, + "tags": example_tags, + "morphs": example_morphs, + "deps": example_deps, + "heads": [1, 1, 1], + "pos": example_pos, + "ents": example_ents, + "spans": example_spans, + "cats": {"CAT": 0.0, "DOG": 1.0}, +} @pytest.mark.slow diff --git a/spacy/tests/test_cli_launcher.py b/spacy/tests/test_cli_launcher.py new file mode 100644 index 00000000000..c9af62a509d --- /dev/null +++ b/spacy/tests/test_cli_launcher.py @@ -0,0 +1,123 @@ +import importlib +import subprocess +import sys + +import pytest + +from spacy_cli.static import load_manifest + +launcher_module = importlib.import_module("spacy_cli.main") + + +def _run_python(code: str) -> str: + result = subprocess.run( + [sys.executable, "-c", code], + check=True, + capture_output=True, + text=True, + ) + return result.stdout.strip() + + +def test_cli_package_import_is_lazy(): + output = _run_python( + "import sys; import spacy.cli; " + "print('spacy.cli.train' in sys.modules); print('weasel' in sys.modules)" + ) + assert output.splitlines() == ["False", "False"] + + +def test_load_for_argv_imports_only_requested_command(): + output = _run_python( + "import sys; from spacy.cli import load_for_argv; " + "load_for_argv(['train', '--help']); " + "print('spacy.cli.train' in sys.modules); print('weasel' in sys.modules)" + ) + assert output.splitlines() == ["True", "False"] + + +def test_load_for_argv_imports_project_on_demand(): + output = _run_python( + "import sys; from spacy.cli import load_for_argv; " + "load_for_argv(['project', '--help']); print('weasel' in sys.modules)" + ) + assert output == "True" + + +def test_manifest_is_current(): + # Run in a subprocess to avoid command registration order being affected + # by other test modules importing CLI submodules (which register commands + # as a side effect of import). + result = subprocess.run( + [ + sys.executable, + "-c", + "import json; " + "from spacy_cli.build_manifest import build_manifest; " + "from spacy_cli.static import load_manifest; " + "b, l = build_manifest(), load_manifest(); " + "diffs = {}; " + "[diffs.update({f'{k}.{sk}': (repr(b[k][sk])[:120], repr(l[k][sk])[:120])}) " + "for k in b if isinstance(b[k], dict) and b[k] != l[k] " + "for sk in b[k] if b[k].get(sk) != l[k].get(sk)]; " + "[diffs.update({k: (repr(b[k])[:120], repr(l[k])[:120])}) " + "for k in b if not isinstance(b[k], dict) and b[k] != l[k]]; " + "assert b == l, json.dumps(diffs, indent=2)", + ], + capture_output=True, + text=True, + ) + assert result.returncode == 0, result.stderr + + +def test_launcher_root_help_uses_static(capsys, monkeypatch): + monkeypatch.setattr( + launcher_module, "_run_live", lambda: (_ for _ in ()).throw(AssertionError) + ) + with pytest.raises(SystemExit) as exc: + launcher_module.main(["--help"]) + assert exc.value.code == 0 + assert capsys.readouterr().out == load_manifest()["root_help"] + + +def test_launcher_command_help_uses_static(capsys, monkeypatch): + monkeypatch.setattr( + launcher_module, "_run_live", lambda: (_ for _ in ()).throw(AssertionError) + ) + with pytest.raises(SystemExit) as exc: + launcher_module.main(["train", "--help"]) + assert exc.value.code == 0 + assert capsys.readouterr().out == load_manifest()["command_help"]["train"] + + +def test_launcher_unknown_command_uses_static_error(capsys, monkeypatch): + monkeypatch.setattr( + launcher_module, "_run_live", lambda: (_ for _ in ()).throw(AssertionError) + ) + with pytest.raises(SystemExit) as exc: + launcher_module.main(["definitely-not-a-command"]) + assert exc.value.code == 2 + assert "No such command 'definitely-not-a-command'" in capsys.readouterr().out + + +def test_launcher_non_help_command_falls_back_to_live(monkeypatch): + called = [] + + def fake_run_live(): + called.append(True) + + monkeypatch.setattr(launcher_module, "_run_live", fake_run_live) + launcher_module.main(["train", "config.cfg"]) + assert called == [True] + + +def test_launcher_root_help_falls_back_with_plugins(monkeypatch): + called = [] + + def fake_run_live(): + called.append(True) + + monkeypatch.setattr(launcher_module, "_run_live", fake_run_live) + monkeypatch.setattr(launcher_module, "get_plugin_command_names", lambda: {"custom"}) + launcher_module.main(["--help"]) + assert called == [True] diff --git a/spacy/tests/test_factory_imports.py b/spacy/tests/test_factory_imports.py index a975af0bbd2..7a1b4a769a8 100644 --- a/spacy/tests/test_factory_imports.py +++ b/spacy/tests/test_factory_imports.py @@ -67,16 +67,16 @@ def test_factory_import_compatibility(factory_name, original_module, compat_modu # Import from the original module (registrations.py) original_module_obj = importlib.import_module(original_module) original_factory = getattr(original_module_obj, factory_name) - assert ( - original_factory is not None - ), f"Could not import {factory_name} from {original_module}" + assert original_factory is not None, ( + f"Could not import {factory_name} from {original_module}" + ) # Import from the compatibility module (component file) compat_module_obj = importlib.import_module(compat_module) compat_factory = getattr(compat_module_obj, factory_name) - assert ( - compat_factory is not None - ), f"Could not import {factory_name} from {compat_module}" + assert compat_factory is not None, ( + f"Could not import {factory_name} from {compat_module}" + ) # Test that they're the same function (identity) assert original_factory is compat_factory, ( diff --git a/spacy/tests/test_factory_registrations.py b/spacy/tests/test_factory_registrations.py index 8e93f54f0b0..eb69265e3f3 100644 --- a/spacy/tests/test_factory_registrations.py +++ b/spacy/tests/test_factory_registrations.py @@ -1,17 +1,14 @@ -import inspect import json from pathlib import Path import pytest -from spacy.language import Language from spacy.util import registry # Path to the reference factory registrations, relative to this file REFERENCE_FILE = Path(__file__).parent / "factory_registrations.json" # Monkey patch the util.is_same_func to handle Cython functions -import inspect from spacy import util @@ -82,9 +79,9 @@ def test_factory_registrations_preserved(reference_factory_registrations): missing_registrations = set(reference_factory_registrations.keys()) - set( current_registrations.keys() ) - assert ( - not missing_registrations - ), f"Missing factory registrations: {', '.join(sorted(missing_registrations))}" + assert not missing_registrations, ( + f"Missing factory registrations: {', '.join(sorted(missing_registrations))}" + ) # Check for new registrations (not an error, but informative) new_registrations = set(current_registrations.keys()) - set( diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index d2a41ff0fed..309c57b0926 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -3,12 +3,7 @@ from pathlib import Path import pytest - -try: - from pydantic.v1 import ValidationError -except ImportError: - from pydantic import ValidationError # type: ignore - +from pydantic import ValidationError from thinc.api import ( Config, ConfigValidationError, diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index 5228b4544fd..706203ffd63 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -95,7 +95,7 @@ def test_multi_hash_embed(): hash_embeds = [node for node in embed.walk() if node.name == "hashembed"] assert len(hash_embeds) == 3 # Check they look at different columns. - assert list(sorted(he.attrs["column"] for he in hash_embeds)) == [0, 1, 2] + assert sorted(he.attrs["column"] for he in hash_embeds) == [0, 1, 2] # Check they use different seeds assert len(set(he.attrs["seed"] for he in hash_embeds)) == 3 # Check they all have the same number of rows diff --git a/spacy/tests/test_registry_population.py b/spacy/tests/test_registry_population.py index 592e74dd20a..e72f3d9f8e8 100644 --- a/spacy/tests/test_registry_population.py +++ b/spacy/tests/test_registry_population.py @@ -1,5 +1,4 @@ import json -import os from pathlib import Path import pytest @@ -50,6 +49,6 @@ def test_registry_entries(reference_registry): # Check for missing entries - these would indicate our new registry population # mechanism is missing something missing_entries = expected_set - current_set - assert ( - not missing_entries - ), f"Registry '{registry_name}' missing entries: {', '.join(missing_entries)}" + assert not missing_entries, ( + f"Registry '{registry_name}' missing entries: {', '.join(missing_entries)}" + ) diff --git a/spacy/tests/training/test_corpus.py b/spacy/tests/training/test_corpus.py index e7cae989384..ded6a53833c 100644 --- a/spacy/tests/training/test_corpus.py +++ b/spacy/tests/training/test_corpus.py @@ -1,7 +1,6 @@ -import tempfile from contextlib import contextmanager from pathlib import Path -from typing import IO, Generator, Iterable, List, TextIO, Tuple +from typing import Generator, Iterable, List, Tuple import pytest diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index 156e3391aa2..3c01055b552 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -2,7 +2,6 @@ import pytest from spacy.attrs import IS_ALPHA, IS_DIGIT -from spacy.lookups import Lookups from spacy.tokens import Doc from spacy.util import OOV_RANK from spacy.vocab import Vocab diff --git a/spacy/tests/vocab_vectors/test_memory_zone.py b/spacy/tests/vocab_vectors/test_memory_zone.py index 910d2664eb4..f718afa2f6e 100644 --- a/spacy/tests/vocab_vectors/test_memory_zone.py +++ b/spacy/tests/vocab_vectors/test_memory_zone.py @@ -34,3 +34,26 @@ def test_memory_zone_redundant_insertion(): _ = vocab["dog"] assert "dog" in vocab assert "horse" not in vocab + + +def test_memory_zone_exception_cleanup(): + """Test that if an exception occurs inside a memory zone, the vocab + is properly cleaned up and remains usable afterward.""" + vocab = Vocab() + _ = vocab["dog"] + assert "dog" in vocab + try: + with vocab.memory_zone(): + _ = vocab["horse"] + raise ValueError("simulated error") + except ValueError: + pass + # Vocab should not be stuck in memory zone state + assert not vocab.in_memory_zone + # Pre-existing words should still work + assert "dog" in vocab + # Transient word from failed zone should be cleaned up + assert "horse" not in vocab + # Vocab should be fully usable for new operations + lex = vocab["cat"] + assert lex.text == "cat" diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 873d85835f0..51f5740c25c 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -13,8 +13,7 @@ from ..util import SimpleFrozenList, ensure_path from ..vocab import Vocab from ._dict_proxies import SpanGroups -from .doc import DOCBIN_ALL_ATTRS as ALL_ATTRS -from .doc import Doc +from .doc import DOCBIN_ALL_ATTRS as ALL_ATTRS, Doc class DocBin: @@ -207,7 +206,7 @@ def to_bytes(self) -> bytes: "tokens": tokens.tobytes("C"), "spaces": spaces.tobytes("C"), "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"), - "strings": list(sorted(self.strings)), + "strings": sorted(self.strings), "cats": self.cats, "flags": self.flags, "span_groups": self.span_groups, diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index d92f04d0564..b8b26ce8b0d 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -57,7 +57,9 @@ class Doc: force: bool = ..., ) -> None: ... @classmethod - def get_extension(cls, name: str) -> Tuple[ + def get_extension( + cls, name: str + ) -> Tuple[ Optional[Any], Optional[DocMethod], Optional[Callable[[Doc], Any]], @@ -66,7 +68,9 @@ class Doc: @classmethod def has_extension(cls, name: str) -> bool: ... @classmethod - def remove_extension(cls, name: str) -> Tuple[ + def remove_extension( + cls, name: str + ) -> Tuple[ Optional[Any], Optional[DocMethod], Optional[Callable[[Doc], Any]], @@ -144,7 +148,7 @@ class Doc: blocked: Optional[List[Span]] = ..., missing: Optional[List[Span]] = ..., outside: Optional[List[Span]] = ..., - default: str = ... + default: str = ..., ) -> None: ... @property def noun_chunks(self) -> Iterator[Span]: ... diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi index 070aaffb3a8..b982eb810b8 100644 --- a/spacy/tokens/span.pyi +++ b/spacy/tokens/span.pyi @@ -23,7 +23,9 @@ class Span: force: bool = ..., ) -> None: ... @classmethod - def get_extension(cls, name: str) -> Tuple[ + def get_extension( + cls, name: str + ) -> Tuple[ Optional[Any], Optional[SpanMethod], Optional[Callable[[Span], Any]], @@ -32,7 +34,9 @@ class Span: @classmethod def has_extension(cls, name: str) -> bool: ... @classmethod - def remove_extension(cls, name: str) -> Tuple[ + def remove_extension( + cls, name: str + ) -> Tuple[ Optional[Any], Optional[SpanMethod], Optional[Callable[[Span], Any]], diff --git a/spacy/tokens/span_group.pyi b/spacy/tokens/span_group.pyi index d063bb59533..3bd2b6788fb 100644 --- a/spacy/tokens/span_group.pyi +++ b/spacy/tokens/span_group.pyi @@ -12,7 +12,7 @@ class SpanGroup: *, name: str = ..., attrs: Dict[str, Any] = ..., - spans: Iterable[Span] = ... + spans: Iterable[Span] = ..., ) -> None: ... def __repr__(self) -> str: ... @property diff --git a/spacy/tokens/token.pyi b/spacy/tokens/token.pyi index 7e56ae3bccd..435ace52707 100644 --- a/spacy/tokens/token.pyi +++ b/spacy/tokens/token.pyi @@ -27,7 +27,9 @@ class Token: force: bool = ..., ) -> None: ... @classmethod - def get_extension(cls, name: str) -> Tuple[ + def get_extension( + cls, name: str + ) -> Tuple[ Optional[Any], Optional[TokenMethod], Optional[Callable[[Token], Any]], @@ -36,7 +38,9 @@ class Token: @classmethod def has_extension(cls, name: str) -> bool: ... @classmethod - def remove_extension(cls, name: str) -> Tuple[ + def remove_extension( + cls, name: str + ) -> Tuple[ Optional[Any], Optional[TokenMethod], Optional[Callable[[Token], Any]], diff --git a/spacy/training/augment.py b/spacy/training/augment.py index da5ae3d087a..ba4368acef1 100644 --- a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -3,7 +3,6 @@ from functools import partial from typing import TYPE_CHECKING, Callable, Dict, Iterator, List, Optional, Tuple -from ..util import registry from .example import Example from .iob_utils import _doc_to_biluo_tags_with_partial, split_bilu_label diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py index 4a1dfa94515..40e437dcc8c 100644 --- a/spacy/training/batchers.py +++ b/spacy/training/batchers.py @@ -12,7 +12,7 @@ Union, ) -from ..util import minibatch, registry +from ..util import minibatch Sizing = Union[Sequence[int], int] ItemT = TypeVar("ItemT") @@ -24,7 +24,7 @@ def configure_minibatch_by_padded_size( size: Sizing, buffer: int, discard_oversize: bool, - get_length: Optional[Callable[[ItemT], int]] = None + get_length: Optional[Callable[[ItemT], int]] = None, ) -> BatcherT: """Create a batcher that uses the `batch_by_padded_size` strategy. @@ -49,7 +49,7 @@ def configure_minibatch_by_padded_size( size=size, buffer=buffer, discard_oversize=discard_oversize, - **optionals + **optionals, ) @@ -58,7 +58,7 @@ def configure_minibatch_by_words( size: Sizing, tolerance: float, discard_oversize: bool, - get_length: Optional[Callable[[ItemT], int]] = None + get_length: Optional[Callable[[ItemT], int]] = None, ) -> BatcherT: """Create a batcher that uses the "minibatch by words" strategy. @@ -76,7 +76,7 @@ def configure_minibatch_by_words( size=size, tolerance=tolerance, discard_oversize=discard_oversize, - **optionals + **optionals, ) @@ -232,6 +232,6 @@ def _batch_by_length( batches.append(batch) # Check lengths match assert sum(len(b) for b in batches) == len(seqs) - batches = [list(sorted(batch)) for batch in batches] + batches = [sorted(batch) for batch in batches] batches.reverse() return batches diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py index 714deea6dcd..19382757a95 100644 --- a/spacy/training/callbacks.py +++ b/spacy/training/callbacks.py @@ -1,7 +1,7 @@ from typing import TYPE_CHECKING, Callable, Optional from ..errors import Errors -from ..util import load_model, logger, registry +from ..util import load_model, logger if TYPE_CHECKING: from ..language import Language diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py index b19d1791b27..e66a8a8dfed 100644 --- a/spacy/training/converters/conll_ner_to_docs.py +++ b/spacy/training/converters/conll_ner_to_docs.py @@ -74,8 +74,7 @@ def conll_ner_to_docs( # provide warnings for problematic data if "\n\n" not in input_data: msg.warn( - "No sentence boundaries found. Use `-s` to automatically segment " - "sentences." + "No sentence boundaries found. Use `-s` to automatically segment sentences." ) if doc_delimiter not in input_data: msg.warn( diff --git a/spacy/training/converters/conllu_to_docs.py b/spacy/training/converters/conllu_to_docs.py index bda5c88c3d4..3a60c4e024b 100644 --- a/spacy/training/converters/conllu_to_docs.py +++ b/spacy/training/converters/conllu_to_docs.py @@ -15,7 +15,7 @@ def conllu_to_docs( ner_map=None, merge_subtokens=False, no_print=False, - **_ + **_, ): """ Convert conllu files into JSON format for use with train cli. diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index 5cc2733a540..30e32911e6b 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -50,9 +50,10 @@ def create_jsonl_reader( @util.registry.readers("spacy.read_labels.v1") -def read_labels(path: Path, *, require: bool = False): +def read_labels(path: Union[str, Path], *, require: bool = False): # I decided not to give this a generic name, because I don't want people to # use it for arbitrary stuff, as I want this require arg with default False. + path = Path(path) if not require and not path.exists(): return None return srsly.read_json(path) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 0621702214c..164a0867494 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -25,8 +25,7 @@ registry, resolve_dot_names, ) -from ..vectors import Mode as VectorsMode -from ..vectors import Vectors +from ..vectors import Mode as VectorsMode, Vectors from .pretrain import get_tok2vec_ref if TYPE_CHECKING: @@ -51,7 +50,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": logger.info("Set up nlp object from config") config = nlp.config.interpolate() # Resolve all training-relevant sections using the filled nlp config - T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) # type: ignore[arg-type] dot_names = [T["train_corpus"], T["dev_corpus"]] if not isinstance(T["train_corpus"], str): raise ConfigValidationError( diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index 488ca4a7136..7f200545ca0 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -8,7 +8,6 @@ from .. import util from ..errors import Errors -from ..util import registry if TYPE_CHECKING: from ..language import Language # noqa: F401 @@ -176,7 +175,7 @@ def log_step(info: Optional[Dict[str, Any]]) -> None: initial = info["step"] else: total = eval_frequency - desc = f"Epoch {info['epoch']+1}" + desc = f"Epoch {info['epoch'] + 1}" initial = 0 # Set disable=None, so that it disables on non-TTY progress = tqdm.tqdm( diff --git a/spacy/training/loop.py b/spacy/training/loop.py index 6f5099858f1..d6f1ad7d608 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -62,7 +62,7 @@ def train( allocator = config["training"]["gpu_allocator"] if use_gpu >= 0 and allocator: set_gpu_allocator(allocator) - T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) # type: ignore[arg-type] dot_names = [T["train_corpus"], T["dev_corpus"]] train_corpus, dev_corpus = resolve_dot_names(config, dot_names) optimizer = T["optimizer"] diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py index 14a813a0993..32eada4d749 100644 --- a/spacy/training/pretrain.py +++ b/spacy/training/pretrain.py @@ -42,7 +42,7 @@ def pretrain( config["initialize"]["init_tok2vec"] = None nlp = load_model_from_config(config) _config = nlp.config.interpolate() - P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain) + P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain) # type: ignore[arg-type] corpus = dot_to_object(_config, P["corpus"]) corpus = registry.resolve({"corpus": corpus})["corpus"] batcher = P["batcher"] diff --git a/spacy/ty.py b/spacy/ty.py index b37f2e18a1f..c18ce284dc0 100644 --- a/spacy/ty.py +++ b/spacy/ty.py @@ -29,7 +29,7 @@ def update( *, drop: float = 0.0, sgd: Optional[Optimizer] = None, - losses: Optional[Dict[str, float]] = None + losses: Optional[Dict[str, float]] = None, ) -> Dict[str, float]: ... def finish_update(self, sgd: Optimizer) -> None: ... @@ -41,7 +41,7 @@ def initialize( self, get_examples: Callable[[], Iterable["Example"]], nlp: "Language", - **kwargs: Any + **kwargs: Any, ): ... diff --git a/spacy/util.py b/spacy/util.py index ad5a7e0bada..14d7b539994 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -557,7 +557,9 @@ def load_model_from_package( RETURNS (Language): The loaded nlp object. """ cls = importlib.import_module(name) - return cls.load(vocab=vocab, disable=disable, enable=enable, exclude=exclude, config=config) # type: ignore[attr-defined] + return cls.load( + vocab=vocab, disable=disable, enable=enable, exclude=exclude, config=config + ) # type: ignore[attr-defined] def load_model_from_path( diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi index ee7636f02c8..906a4c0d978 100644 --- a/spacy/vocab.pyi +++ b/spacy/vocab.pyi @@ -5,7 +5,6 @@ from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Unio from cymem.cymem import Pool from thinc.types import Floats1d, FloatsXd -from . import Language from .lexeme import Lexeme from .lookups import Lookups from .morphology import Morphology diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 11043c17ae7..4bf80c85d8e 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -150,9 +150,11 @@ cdef class Vocab: if hasattr(self._vectors, "memory_zone"): contexts.append(stack.enter_context(self._vectors.memory_zone(mem))) self.mem = mem - yield mem - self._clear_transient_orths() - self.mem = self._non_temp_mem + try: + yield mem + finally: + self._clear_transient_orths() + self.mem = self._non_temp_mem def add_flag(self, flag_getter, int flag_id=-1): """Set a new boolean flag to words in the vocabulary. diff --git a/spacy_cli/__init__.py b/spacy_cli/__init__.py new file mode 100644 index 00000000000..a2cb1f66b78 --- /dev/null +++ b/spacy_cli/__init__.py @@ -0,0 +1 @@ +"""Lightweight launcher package for the spaCy console script.""" diff --git a/spacy_cli/build_manifest.py b/spacy_cli/build_manifest.py new file mode 100644 index 00000000000..71982d82d77 --- /dev/null +++ b/spacy_cli/build_manifest.py @@ -0,0 +1,99 @@ +import json +from pathlib import Path +from typing import Dict, Iterable, List + +from typer.main import get_command +from typer.testing import CliRunner + +from spacy.cli import load_all_commands +from spacy.cli._util import COMMAND, app + +from .static import MANIFEST_FILE, UNKNOWN_COMMAND_TOKEN, UNKNOWN_SUBCOMMAND_TOKEN + +DEFAULT_ENV = {"COLUMNS": "100", "LINES": "40", "TERM": "xterm-256color"} + + +def _invoke(runner: CliRunner, cli, args: Iterable[str]): + return runner.invoke(cli, list(args), prog_name=COMMAND, env=DEFAULT_ENV) + + +def _get_help(runner: CliRunner, cli, args: Iterable[str]) -> str: + result = _invoke(runner, cli, [*list(args), "--help"]) + if result.exit_code != 0: + err = f"Could not render help for: {' '.join(args) or ''}" + raise RuntimeError(err) + return result.stdout + + +def _maybe_get_help(runner: CliRunner, cli, args: Iterable[str]): + result = _invoke(runner, cli, [*list(args), "--help"]) + if result.exit_code != 0: + return None + return result.stdout + + +def build_manifest() -> Dict[str, object]: + load_all_commands() + cli = get_command(app) + runner = CliRunner() + known_top_level: List[str] = sorted(cli.commands.keys()) + known_groups: Dict[str, List[str]] = {} + hidden_top_level: List[str] = [] + hidden_group_commands: Dict[str, List[str]] = {} + group_help: Dict[str, str] = {} + command_help: Dict[str, str] = {} + unknown_subcommand: Dict[str, str] = {} + + for name, command in cli.commands.items(): + if getattr(command, "hidden", False): + hidden_top_level.append(name) + if hasattr(command, "commands"): + subcommands = sorted(command.commands.keys()) + known_groups[name] = subcommands + hidden_group_commands[name] = sorted( + sub_name + for sub_name, sub_cmd in command.commands.items() + if getattr(sub_cmd, "hidden", False) + ) + group_help[name] = _get_help(runner, app, [name]) + unknown_subcommand[name] = _invoke( + runner, app, [name, UNKNOWN_SUBCOMMAND_TOKEN] + ).output + for sub_name in subcommands: + help_text = _maybe_get_help(runner, app, [name, sub_name]) + if help_text is not None: + command_help[f"{name} {sub_name}"] = help_text + else: + command_help[name] = _get_help(runner, app, [name]) + + return { + "command": COMMAND, + "known_top_level": known_top_level, + "known_groups": known_groups, + "hidden_top_level": hidden_top_level, + "hidden_group_commands": hidden_group_commands, + "root_help": _get_help(runner, app, []), + "group_help": group_help, + "command_help": command_help, + "errors": { + "missing_command": _invoke(runner, app, []).output, + "unknown_command": _invoke(runner, app, [UNKNOWN_COMMAND_TOKEN]).output, + "unknown_subcommand": unknown_subcommand, + }, + } + + +def write_manifest(path: Path) -> Path: + data = build_manifest() + path.write_text( + json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True) + "\n" + ) + return path + + +def main() -> None: + write_manifest(Path(__file__).with_name(MANIFEST_FILE)) + + +if __name__ == "__main__": + main() diff --git a/spacy_cli/cli_manifest.json b/spacy_cli/cli_manifest.json new file mode 100644 index 00000000000..e756c058eb3 --- /dev/null +++ b/spacy_cli/cli_manifest.json @@ -0,0 +1,118 @@ +{ + "command": "python -m spacy", + "command_help": { + "apply": "Usage: python -m spacy apply [OPTIONS] MODEL DATA_PATH OUTPUT_FILE\n\n Apply a trained pipeline to documents to get predictions. Expects a loadable\n spaCy pipeline and path to the data, which can be a directory or a file. The\n data files can be provided in multiple formats: 1. .spacy files 2.\n .jsonl files with a specified \"field\" to read the text from. 3. Files with\n any other extension are assumed to be containing a single document.\n DOCS: https://spacy.io/api/cli#apply\n\nArguments:\n MODEL Model name or path [required]\n DATA_PATH Location of the documents to predict on. Can be a single file in\n .spacy format or a .jsonl file. Files with other extensions are\n treated as single plain text documents. If a directory is\n provided it is traversed recursively to grab all files to be\n processed. The files can be a mixture of .spacy, .jsonl and text\n files. If .jsonl is provided the specified field is going to be\n grabbed (\"text\" by default). [required]\n OUTPUT_FILE Path to save the resulting .spacy file [required]\n\nOptions:\n -c, --code PATH Path to Python file with additional code (registered\n functions) to be imported\n -tk, --text-key TEXT Key containing text string for JSONL [default:\n text]\n -F, --force Force overwriting the output file\n -g, --gpu-id INTEGER GPU ID or -1 for CPU. [default: -1]\n -b, --batch-size INTEGER Batch size. [default: 1]\n -n, --n-process INTEGER number of processors to use. [default: 1]\n --help Show this message and exit.\n", + "assemble": "Usage: python -m spacy assemble [OPTIONS] CONFIG_PATH OUTPUT_PATH\n\n Assemble a spaCy pipeline from a config file. The config file includes all\n settings for initializing the pipeline. To override settings in the config,\n e.g. settings that point to local paths or that you want to experiment with,\n you can override them as command line options. The --code argument lets you\n pass in a Python file that can be used to register custom functions that are\n referenced in the config.\n\n DOCS: https://spacy.io/api/cli#assemble\n\nArguments:\n CONFIG_PATH Path to config file [required]\n OUTPUT_PATH Output directory to store assembled pipeline in [required]\n\nOptions:\n -c, --code PATH Path to Python file with additional code (registered\n functions) to be imported\n -V, -VV, --verbose Display more information for debugging purposes\n --help Show this message and exit.\n", + "benchmark accuracy": "Usage: python -m spacy benchmark accuracy [OPTIONS] MODEL DATA_PATH\n\n Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation\n data in the binary .spacy format. The --gold-preproc option sets up the\n evaluation examples with gold-standard sentences and tokens for the\n predictions. Gold preprocessing helps the annotations align to the\n tokenization, and may result in sequences of more consistent length. However,\n it may reduce runtime accuracy due to train/test skew. To render a sample of\n dependency parses in a HTML file, set as output directory as the displacy_path\n argument.\n\n DOCS: https://spacy.io/api/cli#benchmark-accuracy\n\nArguments:\n MODEL Model name or path [required]\n DATA_PATH Location of binary evaluation data in .spacy format [required]\n\nOptions:\n -o, --output FILE Output JSON file for metrics\n -c, --code PATH Path to Python file with additional code\n (registered functions) to be imported\n -g, --gpu-id INTEGER GPU ID or -1 for CPU [default: -1]\n -G, --gold-preproc Use gold preprocessing\n -dp, --displacy-path DIRECTORY Directory to output rendered parses as HTML\n -dl, --displacy-limit INTEGER Limit of parses to render as HTML [default:\n 25]\n -P, --per-component Return scores per component, only applicable\n when an output JSON file is specified.\n -sk, --spans-key TEXT Spans key to use when evaluating Doc.spans\n [default: sc]\n --help Show this message and exit.\n", + "benchmark speed": "Usage: python -m spacy benchmark speed [OPTIONS] MODEL DATA_PATH\n\n Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark data in\n the binary .spacy format.\n\nArguments:\n MODEL Model name or path [required]\n DATA_PATH Location of binary evaluation data in .spacy format [required]\n\nOptions:\n -b, --batch-size INTEGER RANGE Override the pipeline batch size [x>=1]\n --no-shuffle Do not shuffle benchmark data\n -g, --gpu-id INTEGER GPU ID or -1 for CPU [default: -1]\n --batches INTEGER RANGE Minimum number of batches to benchmark\n [default: 50; x>=30]\n -w, --warmup INTEGER RANGE Number of iterations over the data for warmup\n [default: 3; x>=0]\n -c, --code PATH Path to Python file with additional code\n (registered functions) to be imported\n --help Show this message and exit.\n", + "convert": "Usage: python -m spacy convert [OPTIONS] INPUT_PATH [OUTPUT_DIR]\n\n Convert files into json or DocBin format for training. The resulting .spacy\n file can be used with the train command and other experiment management\n functions.\n\n If no output_dir is specified and the output format is JSON, the data is\n written to stdout, so you can pipe them forward to a JSON file: $ spacy\n convert some_file.conllu --file-type json > some_file.json\n\n DOCS: https://spacy.io/api/cli#convert\n\nArguments:\n INPUT_PATH Input file or directory [required]\n [OUTPUT_DIR] Output directory. '-' for stdout. [default: -]\n\nOptions:\n -t, --file-type [json|spacy] Type of data to produce [default: spacy]\n -n, --n-sents INTEGER Number of sentences per doc (0 to disable)\n [default: 1]\n -s, --seg-sents Segment sentences (for -c ner)\n -b, --model, --base TEXT Trained spaCy pipeline for sentence segmentation\n to use as base (for --seg-sents)\n -m, --morphology Enable appending morphology to tags\n -T, --merge-subtokens Merge CoNLL-U subtokens\n -c, --converter TEXT Converter: ('conllubio', 'conllu', 'conll',\n 'ner', 'iob', 'json') [default: auto]\n -nm, --ner-map PATH NER tag mapping (as JSON-encoded dict of entity\n types)\n -l, --lang TEXT Language (if tokenizer required)\n -C, --concatenate Concatenate output to a single file\n --help Show this message and exit.\n", + "debug config": "Usage: python -m spacy debug config [OPTIONS] CONFIG_PATH\n\n Debug a config file and show validation errors. The command will create all\n objects in the tree and validate them. Note that some config validation errors\n are blocking and will prevent the rest of the config from being resolved. This\n means that you may not see all validation errors at once and some issues are\n only shown once previous errors have been fixed. Similar as with the 'train'\n command, you can override settings from the config as command line options.\n For instance, --training.batch_size 128 overrides the value of \"batch_size\" in\n the block \"[training]\".\n\n DOCS: https://spacy.io/api/cli#debug-config\n\nArguments:\n CONFIG_PATH Path to config file [required]\n\nOptions:\n -c, --code-path, --code PATH Path to Python file with additional code\n (registered functions) to be imported\n -F, --show-functions Show an overview of all registered functions\n used in the config and where they come from\n (modules, files etc.)\n -V, --show-variables Show an overview of all variables referenced in\n the config and their values. This will also\n reflect variables overwritten on the CLI.\n --help Show this message and exit.\n", + "debug data": "Usage: python -m spacy debug data [OPTIONS] CONFIG_PATH\n\n Analyze, debug and validate your training and development data. Outputs useful\n stats, and can help you find problems like invalid entity annotations, cyclic\n dependencies, low data labels and more.\n\n DOCS: https://spacy.io/api/cli#debug-data\n\nArguments:\n CONFIG_PATH Path to config file [required]\n\nOptions:\n -c, --code-path, --code PATH Path to Python file with additional code\n (registered functions) to be imported\n -IW, --ignore-warnings Ignore warnings, only show stats and errors\n -V, --verbose Print additional information and explanations\n -NF, --no-format Don't pretty-print the results\n --help Show this message and exit.\n", + "debug diff-config": "Usage: python -m spacy debug diff-config [OPTIONS] CONFIG_PATH\n\n Show a diff of a config file with respect to spaCy's defaults or another\n config file. If additional settings were used in the creation of the config\n file, then you must supply these as extra parameters to the command when\n comparing to the default settings. The generated diff can also be used when\n posting to the discussion forum to provide more information for the\n maintainers.\n\n The `optimize`, `gpu`, and `pretraining` options are only relevant when\n comparing against the default configuration (or specifically when `compare_to`\n is None).\n\n DOCS: https://spacy.io/api/cli#debug-diff\n\nArguments:\n CONFIG_PATH Path to config file [required]\n\nOptions:\n --compare-to PATH Path to a config file to diff against, or\n `None` to compare against default settings\n -o, --optimize [efficiency|accuracy]\n Whether the user config was optimized for\n efficiency or accuracy. Only relevant when\n comparing against the default config.\n [default: efficiency]\n -G, --gpu Whether the original config can run on a GPU.\n Only relevant when comparing against the\n default config.\n --pretraining, --pt Whether to compare on a config with\n pretraining involved. Only relevant when\n comparing against the default config.\n -md, --markdown Generate Markdown for GitHub issues\n --help Show this message and exit.\n", + "debug model": "Usage: python -m spacy debug model [OPTIONS] CONFIG_PATH COMPONENT\n\n Analyze a Thinc model implementation. Includes checks for internal structure\n and activations during training.\n\n DOCS: https://spacy.io/api/cli#debug-model\n\nArguments:\n CONFIG_PATH Path to config file [required]\n COMPONENT Name of the pipeline component of which the model should be\n analysed [required]\n\nOptions:\n -l, --layers TEXT Comma-separated names of layer IDs to print\n -DIM, --dimensions Show dimensions\n -PAR, --parameters Show parameters\n -GRAD, --gradients Show gradients\n -ATTR, --attributes Show attributes\n -P0, --print-step0 Print model before training\n -P1, --print-step1 Print model after initialization\n -P2, --print-step2 Print model after training\n -P3, --print-step3 Print final predictions\n -g, --gpu-id INTEGER GPU ID or -1 for CPU [default: -1]\n --help Show this message and exit.\n", + "debug profile": "Usage: python -m spacy debug profile [OPTIONS] MODEL [INPUTS]\n\n Profile which functions take the most time in a spaCy pipeline. Input should\n be formatted as one JSON object per line with a key \"text\". It can either be\n provided as a JSONL file, or be read from sys.sytdin. If no input file is\n specified, the IMDB dataset is loaded via Thinc.\n\n DOCS: https://spacy.io/api/cli#debug-profile\n\nArguments:\n MODEL Trained pipeline to load [required]\n [INPUTS] Location of input file. '-' for stdin.\n\nOptions:\n -n, --n-texts INTEGER Maximum number of texts to use if available [default:\n 10000]\n --help Show this message and exit.\n", + "debug-data": "Usage: python -m spacy debug-data [OPTIONS] CONFIG_PATH\n\n Analyze, debug and validate your training and development data. Outputs useful\n stats, and can help you find problems like invalid entity annotations, cyclic\n dependencies, low data labels and more.\n\n DOCS: https://spacy.io/api/cli#debug-data\n\nArguments:\n CONFIG_PATH Path to config file [required]\n\nOptions:\n -c, --code-path, --code PATH Path to Python file with additional code\n (registered functions) to be imported\n -IW, --ignore-warnings Ignore warnings, only show stats and errors\n -V, --verbose Print additional information and explanations\n -NF, --no-format Don't pretty-print the results\n --help Show this message and exit.\n", + "download": "Usage: python -m spacy download [OPTIONS] MODEL\n\n Download compatible trained pipeline from the default download path using pip.\n If --direct flag is set, the command expects the full package name with\n version. For direct downloads, the compatibility check will be skipped. All\n additional arguments provided to this command will be passed to `pip install`\n on package installation.\n\n DOCS: https://spacy.io/api/cli#download AVAILABLE PACKAGES:\n https://spacy.io/models\n\nArguments:\n MODEL Name of pipeline package to download [required]\n\nOptions:\n -d, -D, --direct Force direct download of name + version\n -S, --sdist Download sdist (.tar.gz) archive instead of pre-built binary\n wheel\n -U, --url TEXT Download from given url\n --help Show this message and exit.\n", + "evaluate": "Usage: python -m spacy evaluate [OPTIONS] MODEL DATA_PATH\n\n Evaluate a trained pipeline. Expects a loadable spaCy pipeline and evaluation\n data in the binary .spacy format. The --gold-preproc option sets up the\n evaluation examples with gold-standard sentences and tokens for the\n predictions. Gold preprocessing helps the annotations align to the\n tokenization, and may result in sequences of more consistent length. However,\n it may reduce runtime accuracy due to train/test skew. To render a sample of\n dependency parses in a HTML file, set as output directory as the displacy_path\n argument.\n\n DOCS: https://spacy.io/api/cli#benchmark-accuracy\n\nArguments:\n MODEL Model name or path [required]\n DATA_PATH Location of binary evaluation data in .spacy format [required]\n\nOptions:\n -o, --output FILE Output JSON file for metrics\n -c, --code PATH Path to Python file with additional code\n (registered functions) to be imported\n -g, --gpu-id INTEGER GPU ID or -1 for CPU [default: -1]\n -G, --gold-preproc Use gold preprocessing\n -dp, --displacy-path DIRECTORY Directory to output rendered parses as HTML\n -dl, --displacy-limit INTEGER Limit of parses to render as HTML [default:\n 25]\n -P, --per-component Return scores per component, only applicable\n when an output JSON file is specified.\n -sk, --spans-key TEXT Spans key to use when evaluating Doc.spans\n [default: sc]\n --help Show this message and exit.\n", + "find-function": "Usage: python -m spacy find-function [OPTIONS] FUNC_NAME\n\n Find the module, path and line number to the file the registered function is\n defined in, if available.\n\n func_name (str): Name of the registered function. registry_name\n (Optional[str]): Name of the catalogue registry.\n\n DOCS: https://spacy.io/api/cli#find-function\n\nArguments:\n FUNC_NAME Name of the registered function. [required]\n\nOptions:\n -r, --registry TEXT Name of the catalogue registry.\n --help Show this message and exit.\n", + "find-threshold": "Usage: python -m spacy find-threshold [OPTIONS] MODEL DATA_PATH PIPE_NAME\n THRESHOLD_KEY SCORES_KEY\n\n Runs prediction trials for a trained model with varying thresholds to maximize\n the specified metric. The search space for the threshold is traversed linearly\n from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`\n (the corresponding API call to `spacy.cli.find_threshold.find_threshold()`\n returns all results).\n\n This is applicable only for components whose predictions are influenced by\n thresholds - e.g. `textcat_multilabel` and `spancat`, but not `textcat`. Note\n that the full path to the corresponding threshold attribute in the config has\n to be provided.\n\n DOCS: https://spacy.io/api/cli#find-threshold\n\nArguments:\n MODEL Model name or path [required]\n DATA_PATH Location of binary evaluation data in .spacy format [required]\n PIPE_NAME Name of pipe to examine thresholds for [required]\n THRESHOLD_KEY Key of threshold attribute in component's configuration\n [required]\n SCORES_KEY Metric to optimize [required]\n\nOptions:\n -n, --n_trials INTEGER Number of trials to determine optimal thresholds\n [default: 11]\n -c, --code PATH Path to Python file with additional code (registered\n functions) to be imported\n -g, --gpu-id INTEGER GPU ID or -1 for CPU [default: -1]\n -G, --gold-preproc Use gold preprocessing\n -V, -VV, --verbose Display more information for debugging purposes\n --help Show this message and exit.\n", + "info": "Usage: python -m spacy info [OPTIONS] [MODEL]\n\n Print info about spaCy installation. If a pipeline is specified as an\n argument, print its meta information. Flag --markdown prints details in\n Markdown for easy copy-pasting to GitHub issues.\n\n Flag --url prints only the download URL of the most recent compatible version\n of the pipeline.\n\n DOCS: https://spacy.io/api/cli#info\n\nArguments:\n [MODEL] Optional loadable spaCy pipeline\n\nOptions:\n -md, --markdown Generate Markdown for GitHub issues\n -s, -S, --silent Don't print anything (just return)\n -e, --exclude TEXT Comma-separated keys to exclude from the print-out\n [default: labels]\n -u, --url Print the URL to download the most recent compatible\n version of the pipeline\n --help Show this message and exit.\n", + "init config": "Usage: python -m spacy init config [OPTIONS] OUTPUT_FILE\n\n Generate a starter config file for training. Based on your requirements\n specified via the CLI arguments, this command generates a config with the\n optimal settings for your use case. This includes the choice of architecture,\n pretrained weights and related hyperparameters.\n\n DOCS: https://spacy.io/api/cli#init-config\n\nArguments:\n OUTPUT_FILE File to save the config to or - for stdout (will only output\n config and no additional logging info) [required]\n\nOptions:\n -l, --lang TEXT Two-letter code of the language to use\n [default: en]\n -p, --pipeline TEXT Comma-separated names of trainable pipeline\n components to include (without 'tok2vec' or\n 'transformer') [default: tagger,parser,ner]\n -o, --optimize [efficiency|accuracy]\n Whether to optimize for efficiency (faster\n inference, smaller model, lower memory\n consumption) or higher accuracy (potentially\n larger and slower model). This will impact the\n choice of architecture, pretrained weights and\n related hyperparameters. [default:\n efficiency]\n -G, --gpu Whether the model can run on GPU. This will\n impact the choice of architecture, pretrained\n weights and related hyperparameters.\n -pt, --pretraining Include config for pretraining (with 'spacy\n pretrain')\n -F, --force Force overwriting the output file\n --help Show this message and exit.\n", + "init fill-config": "Usage: python -m spacy init fill-config [OPTIONS] BASE_PATH [OUTPUT_FILE]\n\n Fill partial config file with default values. Will add all missing settings\n from the default config and will create all objects, check the registered\n functions for their default values and update the base config. This command\n can be used with a config generated via the training quickstart widget:\n https://spacy.io/usage/training#quickstart\n\n DOCS: https://spacy.io/api/cli#init-fill-config\n\nArguments:\n BASE_PATH Path to base config to fill [required]\n [OUTPUT_FILE] Path to output .cfg file (or - for stdout) [default: -]\n\nOptions:\n -pt, --pretraining Include config for pretraining (with 'spacy\n pretrain')\n -D, --diff Print a visual diff highlighting the changes\n -c, --code-path, --code PATH Path to Python file with additional code\n (registered functions) to be imported\n --help Show this message and exit.\n", + "init labels": "Usage: python -m spacy init labels [OPTIONS] CONFIG_PATH OUTPUT_PATH\n\n Generate JSON files for the labels in the data. This helps speed up the\n training process, since spaCy won't have to preprocess the data to extract the\n labels.\n\nArguments:\n CONFIG_PATH Path to config file [required]\n OUTPUT_PATH Output directory for the labels [required]\n\nOptions:\n -c, --code PATH Path to Python file with additional code (registered\n functions) to be imported\n -V, -VV, --verbose Display more information for debugging purposes\n -g, --gpu-id INTEGER GPU ID or -1 for CPU [default: -1]\n --help Show this message and exit.\n", + "init nlp": "Usage: python -m spacy init nlp [OPTIONS] CONFIG_PATH OUTPUT_PATH\n\nArguments:\n CONFIG_PATH Path to config file [required]\n OUTPUT_PATH Output directory for the prepared data [required]\n\nOptions:\n -c, --code PATH Path to Python file with additional code (registered\n functions) to be imported\n -V, -VV, --verbose Display more information for debugging purposes\n -g, --gpu-id INTEGER GPU ID or -1 for CPU [default: -1]\n --help Show this message and exit.\n", + "init vectors": "Usage: python -m spacy init vectors [OPTIONS] LANG VECTORS_LOC OUTPUT_DIR\n\n Convert word vectors for use with spaCy. Will export an nlp object that you\n can use in the [initialize] block of your config to initialize a model with\n vectors.\n\nArguments:\n LANG The language of the nlp object to create [required]\n VECTORS_LOC Vectors file in Word2Vec format [required]\n OUTPUT_DIR Pipeline output directory [required]\n\nOptions:\n -p, --prune INTEGER Optional number of vectors to prune to [default: -1]\n -t, --truncate INTEGER Optional number of vectors to truncate to when reading\n in vectors file [default: 0]\n -m, --mode TEXT Vectors mode: default or floret [default: default]\n -n, --name TEXT Optional name for the word vectors, e.g.\n en_core_web_lg.vectors\n -V, -VV, --verbose Display more information for debugging purposes\n -a, --attr TEXT Optional token attribute to use for vectors, e.g.\n LOWER or NORM [default: ORTH]\n --help Show this message and exit.\n", + "link": "Usage: python -m spacy link [OPTIONS] ARGS KWARGS\n\n As of spaCy v3.0, symlinks like \"en\" are not supported anymore. You can load\n trained pipeline packages using their full names or from a directory path.\n (DEPRECATED)\n\nArguments:\n ARGS [required]\n KWARGS [required]\n\nOptions:\n --help Show this message and exit.\n", + "package": "Usage: python -m spacy package [OPTIONS] INPUT_DIR OUTPUT_DIR\n\n Generate an installable Python package for a pipeline. Includes binary data,\n meta and required installation files. A new directory will be created in the\n specified output directory, and the data will be copied over. If --create-meta\n is set and a meta.json already exists in the output directory, the existing\n values will be used as the defaults in the command-line prompt. After\n packaging, \"python -m build --sdist\" is run in the package directory, which\n will create a .tar.gz archive that can be installed via \"pip install\".\n\n If additional code files are provided (e.g. Python files containing custom\n registered functions like pipeline components), they are copied into the\n package and imported in the __init__.py.\n\n DOCS: https://spacy.io/api/cli#package\n\nArguments:\n INPUT_DIR Directory with pipeline data [required]\n OUTPUT_DIR Output parent directory [required]\n\nOptions:\n -c, --code TEXT Comma-separated paths to Python file with\n additional code (registered functions) to be\n included in the package\n -m, --meta-path, --meta FILE Path to meta.json\n -C, --create-meta Create meta.json, even if one exists\n -n, --name TEXT Package name to override meta\n -v, --version TEXT Package version to override meta\n -b, --build TEXT Comma-separated formats to build: sdist and/or\n wheel, or none. [default: sdist]\n -f, -F, --force Force overwriting existing data in output\n directory\n -R, -R, --require-parent / --no-require-parent\n Include the parent package (e.g. spacy) in the\n requirements [default: require-parent]\n --help Show this message and exit.\n", + "pretrain": "Usage: python -m spacy pretrain [OPTIONS] CONFIG_PATH OUTPUT_DIR\n\n Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using\n an approximate language-modelling objective. Two objective types are\n available, vector-based and character-based.\n\n In the vector-based objective, we load word vectors that have been trained\n using a word2vec-style distributional similarity algorithm, and train a\n component like a CNN, BiLSTM, etc to predict vectors which match the\n pretrained ones. The weights are saved to a directory after each epoch. You\n can then pass a path to one of these pretrained weights files to the 'spacy\n train' command.\n\n This technique may be especially helpful if you have little labelled data.\n However, it's still quite experimental, so your mileage may vary.\n\n To load the weights back in during 'spacy train', you need to ensure all\n settings are the same between pretraining and training. Ideally, this is done\n by using the same config file for both commands.\n\n DOCS: https://spacy.io/api/cli#pretrain\n\nArguments:\n CONFIG_PATH Path to config file [required]\n OUTPUT_DIR Directory to write weights to on each epoch [required]\n\nOptions:\n -c, --code PATH Path to Python file with additional code\n (registered functions) to be imported\n -r, --resume-path PATH Path to pretrained weights from which to resume\n pretraining\n -er, --epoch-resume INTEGER The epoch to resume counting from when using\n --resume-path. Prevents unintended overwriting of\n existing weight files.\n -g, --gpu-id INTEGER GPU ID or -1 for CPU [default: -1]\n -L, --skip-last Skip saving model-last.bin\n --help Show this message and exit.\n", + "profile": "Usage: python -m spacy profile [OPTIONS] MODEL [INPUTS]\n\n Profile which functions take the most time in a spaCy pipeline. Input should\n be formatted as one JSON object per line with a key \"text\". It can either be\n provided as a JSONL file, or be read from sys.sytdin. If no input file is\n specified, the IMDB dataset is loaded via Thinc.\n\n DOCS: https://spacy.io/api/cli#debug-profile\n\nArguments:\n MODEL Trained pipeline to load [required]\n [INPUTS] Location of input file. '-' for stdin.\n\nOptions:\n -n, --n-texts INTEGER Maximum number of texts to use if available [default:\n 10000]\n --help Show this message and exit.\n", + "project assets": "Usage: python -m spacy project assets [OPTIONS] [PROJECT_DIR]\n\n Fetch project assets like datasets and pretrained weights. Assets are defined\n in the \"assets\" section of the project.yml. If a checksum is provided in the\n project.yml, the file is only downloaded if no local file with the same\n checksum exists.\n\n DOCS: https://github.com/explosion/weasel/tree/main/docs/tutorial/directory-\n and-assets.md\n\nArguments:\n [PROJECT_DIR] Path to cloned project. Defaults to current working directory.\n [default: /Users/matt/repos/spacy-monorepo/spacy]\n\nOptions:\n -S, --sparse Use sparse checkout for assets provided via Git, to only check\n out and clone the files needed. Requires Git v22.2+.\n -e, --extra Download all assets, including those marked as 'extra'.\n --help Show this message and exit.\n", + "project clone": "Usage: python -m spacy project clone [OPTIONS] NAME [DEST]\n\n Clone a project template from a repository. Calls into \"git\" and will only\n download the files from the given subdirectory. The GitHub repo defaults to\n the official Weasel template repo, but can be customized (including using a\n private repo).\n\n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#clipboard-\n clone\n\nArguments:\n NAME The name of the template to clone [required]\n [DEST] Where to clone the project. Defaults to current working directory\n\nOptions:\n -r, --repo TEXT The repository to clone from [default:\n https://github.com/explosion/projects]\n -b, --branch TEXT The branch to clone from. If not provided, will attempt\n main, master\n -S, --sparse Use sparse Git checkout to only check out and clone the\n files needed. Requires Git v22.2+.\n --help Show this message and exit.\n", + "project document": "Usage: python -m spacy project document [OPTIONS] [PROJECT_DIR]\n\n Auto-generate a README.md for a project. If the content is saved to a file,\n hidden markers are added so you can add custom content before or after the\n auto-generated section and only the auto-generated docs will be replaced when\n you re-run the command.\n\n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#closed_book-\n document\n\nArguments:\n [PROJECT_DIR] Path to cloned project. Defaults to current working directory.\n [default: /Users/matt/repos/spacy-monorepo/spacy]\n\nOptions:\n -o, --output PATH Path to output Markdown file for output. Defaults to - for\n standard output [default: -]\n -NE, --no-emoji Don't use emoji\n --help Show this message and exit.\n", + "project dvc": "Usage: python -m spacy project dvc [OPTIONS] [PROJECT_DIR] [WORKFLOW]\n\n Auto-generate Data Version Control (DVC) config. A DVC project can only define\n one pipeline, so you need to specify one workflow defined in the project.yml.\n If no workflow is specified, the first defined workflow is used. The DVC\n config will only be updated if the project.yml changed.\n\n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#repeat-dvc\n\nArguments:\n [PROJECT_DIR] Location of project directory. Defaults to current working\n directory. [default: /Users/matt/repos/spacy-monorepo/spacy]\n [WORKFLOW] Name of workflow defined in project.yml. Defaults to first\n workflow if not set.\n\nOptions:\n -V, --verbose Print more info\n -q, --quiet Print less info\n -F, --force Force update DVC config\n --help Show this message and exit.\n", + "project pull": "Usage: python -m spacy project pull [OPTIONS] [REMOTE] [PROJECT_DIR]\n\n Retrieve available precomputed outputs from a remote storage. You can alias\n remotes in your project.yml by mapping them to storage paths. A storage can be\n anything that the smart_open library can upload to, e.g. AWS, Google Cloud\n Storage, SSH, local directories etc.\n\n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_down-\n push\n\nArguments:\n [REMOTE] Name or path of remote storage [default: default]\n [PROJECT_DIR] Location of project directory. Defaults to current working\n directory. [default: /Users/matt/repos/spacy-monorepo/spacy]\n\nOptions:\n --help Show this message and exit.\n", + "project push": "Usage: python -m spacy project push [OPTIONS] [REMOTE] [PROJECT_DIR]\n\n Persist outputs to a remote storage. You can alias remotes in your project.yml\n by mapping them to storage paths. A storage can be anything that the\n smart_open library can upload to, e.g. AWS, Google Cloud Storage, SSH, local\n directories etc.\n\n DOCS: https://github.com/explosion/weasel/tree/main/docs/cli.md#arrow_up-push\n\nArguments:\n [REMOTE] Name or path of remote storage [default: default]\n [PROJECT_DIR] Location of project directory. Defaults to current working\n directory. [default: /Users/matt/repos/spacy-monorepo/spacy]\n\nOptions:\n --help Show this message and exit.\n", + "train": "Usage: python -m spacy train [OPTIONS] CONFIG_PATH\n\n Train or update a spaCy pipeline. Requires data in spaCy's binary format. To\n convert data from other formats, use the `spacy convert` command. The config\n file includes all settings and hyperparameters used during training. To\n override settings in the config, e.g. settings that point to local paths or\n that you want to experiment with, you can override them as command line\n options. For instance, --training.batch_size 128 overrides the value of\n \"batch_size\" in the block \"[training]\". The --code argument lets you pass in a\n Python file that's imported before training. It can be used to register custom\n functions and architectures that can then be referenced in the config.\n\n DOCS: https://spacy.io/api/cli#train\n\nArguments:\n CONFIG_PATH Path to config file [required]\n\nOptions:\n -o, --output, --output-path PATH\n Output directory to store trained pipeline in\n -c, --code PATH Path to Python file with additional code\n (registered functions) to be imported\n -V, -VV, --verbose Display more information for debugging\n purposes\n -g, --gpu-id INTEGER GPU ID or -1 for CPU [default: -1]\n --help Show this message and exit.\n", + "validate": "Usage: python -m spacy validate [OPTIONS]\n\n Validate the currently installed pipeline packages and spaCy version. Checks\n if the installed packages are compatible and shows upgrade instructions if\n available. Should be run after `pip install -U spacy`.\n\n DOCS: https://spacy.io/api/cli#validate\n\nOptions:\n --help Show this message and exit.\n" + }, + "errors": { + "missing_command": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy --help' for help.\n\nError: Missing command.\n", + "unknown_command": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy --help' for help.\n\nError: No such command '__SPACY_UNKNOWN_COMMAND__'.\n", + "unknown_subcommand": { + "benchmark": "Usage: python -m spacy benchmark [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy benchmark --help' for help.\n\nError: No such command '__SPACY_UNKNOWN_SUBCOMMAND__'.\n", + "debug": "Usage: python -m spacy debug [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy debug --help' for help.\n\nError: No such command '__SPACY_UNKNOWN_SUBCOMMAND__'.\n", + "init": "Usage: python -m spacy init [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy init --help' for help.\n\nError: No such command '__SPACY_UNKNOWN_SUBCOMMAND__'.\n", + "project": "Usage: python -m spacy project [OPTIONS] COMMAND [ARGS]...\nTry 'python -m spacy project --help' for help.\n\nError: No such command '__SPACY_UNKNOWN_SUBCOMMAND__'.\n" + } + }, + "group_help": { + "benchmark": "Usage: python -m spacy benchmark [OPTIONS] COMMAND [ARGS]...\n\n Commands for benchmarking pipelines.\n\nOptions:\n --help Show this message and exit.\n\nCommands:\n accuracy Evaluate a trained pipeline.\n speed Benchmark a pipeline.\n", + "debug": "Usage: python -m spacy debug [OPTIONS] COMMAND [ARGS]...\n\n Suite of helpful commands for debugging and profiling. Includes commands to\n check and validate your config files, training and evaluation data, and custom\n model implementations.\n\nOptions:\n --help Show this message and exit.\n\nCommands:\n data Analyze, debug and validate your training and development data.\n profile Profile which functions take the most time in a spaCy pipeline.\n config Debug a config file and show validation errors.\n diff-config Show a diff of a config file with respect to spaCy's...\n model Analyze a Thinc model implementation.\n", + "init": "Usage: python -m spacy init [OPTIONS] COMMAND [ARGS]...\n\n Commands for initializing configs and pipeline packages.\n\nOptions:\n --help Show this message and exit.\n\nCommands:\n config Generate a starter config file for training.\n fill-config Fill partial config file with default values.\n vectors Convert word vectors for use with spaCy.\n labels Generate JSON files for the labels in the data.\n", + "project": "Usage: python -m spacy project [OPTIONS] COMMAND [ARGS]...\n\n Command-line interface for spaCy projects and templates. You'd typically start\n by cloning a project template to a local directory and fetching its assets\n like datasets etc. See the project's project.yml for the available commands.\n\nOptions:\n --help Show this message and exit.\n\nCommands:\n assets Fetch project assets like datasets and pretrained weights.\n clone Clone a project template from a repository.\n document Auto-generate a README.md for a project.\n dvc Auto-generate Data Version Control (DVC) config.\n run Run a named command or workflow defined in the project.yml.\n pull Retrieve available precomputed outputs from a remote storage.\n push Persist outputs to a remote storage.\n" + }, + "hidden_group_commands": { + "benchmark": [], + "debug": [], + "init": [ + "nlp" + ], + "project": [] + }, + "hidden_top_level": [ + "link", + "debug-data", + "profile" + ], + "known_groups": { + "benchmark": [ + "accuracy", + "speed" + ], + "debug": [ + "config", + "data", + "diff-config", + "model", + "profile" + ], + "init": [ + "config", + "fill-config", + "labels", + "nlp", + "vectors" + ], + "project": [ + "assets", + "clone", + "document", + "dvc", + "pull", + "push", + "run" + ] + }, + "known_top_level": [ + "apply", + "assemble", + "benchmark", + "convert", + "debug", + "debug-data", + "download", + "evaluate", + "find-function", + "find-threshold", + "info", + "init", + "link", + "package", + "pretrain", + "profile", + "project", + "train", + "validate" + ], + "root_help": "Usage: python -m spacy [OPTIONS] COMMAND [ARGS]...\n\n spaCy Command-line Interface\n\n DOCS: https://spacy.io/api/cli\n\nOptions:\n --install-completion Install completion for the current shell.\n --show-completion Show completion for the current shell, to copy it or\n customize the installation.\n --help Show this message and exit.\n\nCommands:\n download Download compatible trained pipeline from the default...\n info Print info about spaCy installation.\n apply Apply a trained pipeline to documents to get predictions.\n assemble Assemble a spaCy pipeline from a config file.\n convert Convert files into json or DocBin format for training.\n evaluate Evaluate a trained pipeline.\n find-function Find the module, path and line number to the file the...\n find-threshold Runs prediction trials for a trained model with varying...\n package Generate an installable Python package for a pipeline.\n pretrain Pre-train the 'token-to-vector' (tok2vec) layer of...\n train Train or update a spaCy pipeline.\n validate Validate the currently installed pipeline packages and...\n debug Suite of helpful commands for debugging and profiling.\n benchmark Commands for benchmarking pipelines.\n init Commands for initializing configs and pipeline packages.\n project Command-line interface for spaCy projects and templates.\n" +} diff --git a/spacy_cli/main.py b/spacy_cli/main.py new file mode 100644 index 00000000000..f8e6cabe808 --- /dev/null +++ b/spacy_cli/main.py @@ -0,0 +1,73 @@ +import sys +from typing import Iterable, Optional + +from .static import HELP_OPTIONS, UNKNOWN_COMMAND_TOKEN, UNKNOWN_SUBCOMMAND_TOKEN +from .static import get_plugin_command_names, load_manifest + + +def _write_output(text: str) -> None: + sys.stdout.write(text) + if not text.endswith("\n"): + sys.stdout.write("\n") + + +def _run_live() -> None: + from spacy.cli import setup_cli + + setup_cli() + + +def _try_static(argv: Iterable[str]): + args = list(argv) + manifest = load_manifest() + plugin_command_names = get_plugin_command_names() + known_groups = manifest["known_groups"] + known_top_level = set(manifest["known_top_level"]) + if not args: + return manifest["errors"]["missing_command"], 2 + first = args[0] + if first in HELP_OPTIONS: + if plugin_command_names: + return None + return manifest["root_help"], 0 + if first.startswith("-"): + return None + if first not in known_top_level: + if first in plugin_command_names: + return None + template = manifest["errors"]["unknown_command"] + return template.replace(UNKNOWN_COMMAND_TOKEN, first), 2 + if first in known_groups: + return _try_static_group(args, first, manifest, known_groups, plugin_command_names) + if any(arg in HELP_OPTIONS for arg in args[1:]): + return manifest["command_help"][first], 0 + return None + + +def _try_static_group(args, first, manifest, known_groups, plugin_command_names): + if len(args) == 1 or args[1] in HELP_OPTIONS: + if plugin_command_names: + return None + return manifest["group_help"][first], 0 + second = args[1] + if second not in known_groups[first]: + if plugin_command_names: + return None + template = manifest["errors"]["unknown_subcommand"][first] + return template.replace(UNKNOWN_SUBCOMMAND_TOKEN, second), 2 + if any(arg in HELP_OPTIONS for arg in args[2:]): + return manifest["command_help"][f"{first} {second}"], 0 + return None + + +def main(argv: Optional[Iterable[str]] = None) -> None: + args = sys.argv[1:] if argv is None else list(argv) + try: + static_result = _try_static(args) + except Exception: + return _run_live() + if static_result is None: + return _run_live() + text, code = static_result + _write_output(text) + raise SystemExit(code) diff --git a/spacy_cli/static.py b/spacy_cli/static.py new file mode 100644 index 00000000000..51594ceef9a --- /dev/null +++ b/spacy_cli/static.py @@ -0,0 +1,24 @@ +import json +from functools import lru_cache +from importlib.metadata import entry_points +from importlib.resources import files +from typing import Any, Dict, Set + + +HELP_OPTIONS = {"--help", "-h"} +PLUGIN_ENTRY_POINT_GROUP = "spacy_cli" +MANIFEST_FILE = "cli_manifest.json" +UNKNOWN_COMMAND_TOKEN = "__SPACY_UNKNOWN_COMMAND__" +UNKNOWN_SUBCOMMAND_TOKEN = "__SPACY_UNKNOWN_SUBCOMMAND__" + + +@lru_cache(maxsize=1) +def load_manifest() -> Dict[str, Any]: + data = files("spacy_cli").joinpath(MANIFEST_FILE).read_text(encoding="utf8") + return json.loads(data) + + +def get_plugin_command_names() -> Set[str]: + return { + entry_point.name for entry_point in entry_points(group=PLUGIN_ENTRY_POINT_GROUP) + }