diff --git a/data/licenses/CDLA-Permissive-2.0.json b/data/licenses/CDLA-Permissive-2.0.json index bd1f738..9319ad7 100644 --- a/data/licenses/CDLA-Permissive-2.0.json +++ b/data/licenses/CDLA-Permissive-2.0.json @@ -23,11 +23,62 @@ "licenseTextHtml": "\n
\n

Community Data License Agreement - Permissive - Version 2.0

\n\n
\n\n

This is the Community Data License Agreement - Permissive, Version 2.0 (the "agreement"). Data Provider(s) and Data Recipient(s) agree as follows:

\n\n\n " }, "categorized": false, - "permissions": [], - "conditions": [], - "limitations": [], + "permissions": [ + "commercial-use", + "modifications", + "distribution", + "private-use", + "data-use", + "create-adaptations" + ], + "conditions": [ + "license-linking" + ], + "limitations": [ + "liability", + "warranty" + ], "tags": [ + "copyleft:none", + "domain:data", + "domain:software", + "license:open-data-commons", "license:open-source", - "domain:software" - ] + "notes:custom-terms" + ], + "reasons": { + "permissions": { + "commercial-use": [ + "[inferred] 1.1 grants use, modify, and share rights with no noncommercial or field-of-use restriction" + ], + "modifications": [ + "[verbatim] 1.1: \"A Data Recipient may use, modify, and share the Data made available\"" + ], + "distribution": [ + "[verbatim] 2.1: \"A Data Recipient may share Data, with or without modifications\"" + ], + "private-use": [ + "[inferred] 1.1 grants general rights to use, modify, and share Data, with no restriction excluding private use" + ], + "data-use": [ + "[verbatim] 1.1: \"A Data Recipient may use... 
the Data made available by Data Provider(s)\"" + ], + "create-adaptations": [ + "[verbatim] 2.1: \"A Data Recipient may share Data, with or without modifications\"" + ] + }, + "conditions": { + "license-linking": [ + "[verbatim] 2.1: share Data only if the recipient \"makes available the text of this agreement with the shared Data\"" + ] + }, + "limitations": { + "liability": [ + "[verbatim] 4.1: \"NO DATA PROVIDER SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES\"" + ], + "warranty": [ + "[verbatim] 4.1: \"THE DATA IS PROVIDED ON AN \\\"AS IS\\\" BASIS, WITHOUT REPRESENTATIONS, WARRANTIES OR CONDITIONS OF ANY KIND\"" + ] + } + } } \ No newline at end of file diff --git a/docs/classification/SYSTEM_PROMPT.md b/docs/classification/SYSTEM_PROMPT.md deleted file mode 100644 index d018e9e..0000000 --- a/docs/classification/SYSTEM_PROMPT.md +++ /dev/null @@ -1,181 +0,0 @@ -# SYSTEM PROMPT — License Classifier (Permissions / Conditions / Limitations / Tags + Nested Reasons) - -You are a strict, conservative license classifier for an internal mobility/data project. -Given license text (and optional metadata), decide which standardized rules apply and provide concise evidence per selected rule, grouped by category. - -Output must be a single JSON object (no prose, no markdown) with EXACT top-level keys: -{ - "permissions": [...], - "conditions": [...], - "limitations": [...], - "tags": [...], - "reasons": { - "permissions": {"": [""]}, - "conditions": {"": [""]}, - "limitations": {"": [""]} - } -} -Order unimportant. No additional keys. - -## Allowed values -- permission names: {allowed_permissions} -- condition names: {allowed_conditions} -- limitation names: {allowed_limitations} -- tag names (optional): {allowed_tags} -Use ONLY these lists. Omit anything uncertain. 
- -## Evidence (reasons) -For every selected rule in permissions / conditions / limitations: -- Provide ≥1 evidence string under the corresponding nested category. -- Each evidence string ≤160 characters **excluding** the prefix. -- Every string MUST begin with one of: - - `[verbatim]` — directly quotes or paraphrases a specific clause or section in the license text. Use this whenever the text contains a direct clause supporting the rule. - - `[inferred]` — use ONLY when no verbatim clause directly supports the rule; the rule is implied by absence of restriction or by the structure of a well-known license family. Do NOT add `[inferred]` alongside `[verbatim]` for the same rule — if verbatim evidence is sufficient, stop there. - - For well-known license families, the absence of a restriction element is valid grounds for inference. For example: in Creative Commons licenses, the absence of the NC (NonCommercial) element means `commercial-use` is permitted and must be inferred even when the text has no explicit "commercial use allowed" statement. -- Multiple evidence strings allowed only if they come from **distinct clauses** that each independently support the rule. Do not repeat the same reasoning with different prefixes. -- Do NOT provide reasons for unselected rules. -- Do NOT fabricate external sources; rely only on provided text or explicit metadata claims (e.g., “This license reproduces ODbL 1.0 in full.”). - -If text claims it reproduces a known license verbatim, treat embedded standard text as authoritative. "Based on" or "inspired by" is not sufficient for inheritance; classify only what appears. - -## Existing classification -If prior classification is provided, you may use it as a starting hypothesis, but must remove any rule not justified by the text and must add missing justified rules. - -## Tags -Tags are optional semantic flags. 
Include a tag only if clearly supported (e.g., explicit scope like data/software/content; explicit copyleft strength; attribution requirement). Omit tags lacking strong textual or metadata support. - -## Ambiguity / failure -If no rules can be confidently classified, return all empty arrays and: -"reasons": {"permissions": {}, "conditions": {}, "limitations": {}} -Tags array should also be empty in that case. - -## Prohibited -- No guessing unsupported rules or tags. -- No markdown, comments, extra keys, or explanatory prose. -- No invented evidence or external URLs beyond those explicitly present. -- **Never mix `[verbatim]` and `[inferred]` for the same rule.** If you have a verbatim clause, use only `[verbatim]`. Reserve `[inferred]` exclusively for rules where no direct verbatim clause exists. - -## Reference examples (few-shot) -Use the following pre-classified licenses as calibration references for format and evidence style. -Always base your output on the license text in the current prompt — not by analogy with these examples. - -### Example 1 — Permissive software (MIT-style) -Input: License grants "to any person… to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies"; requires copyright notice preserved; disclaims all warranties; limits liability. 
-```json -{ - "permissions": ["commercial-use", "modifications", "distribution", "private-use"], - "conditions": ["include-copyright"], - "limitations": ["warranty", "liability"], - "tags": ["copyleft:none", "domain:software", "license:open-source"], - "reasons": { - "permissions": { - "commercial-use": ["[verbatim] \"to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software\""], - "modifications": ["[verbatim] \"to use, copy, modify, merge, publish...\""], - "distribution": ["[verbatim] \"to distribute, sublicense, and/or sell copies of the Software\""], - "private-use": ["[inferred] No private-use restriction; broad grant \'to any person\' covers all use types without exception"] - }, - "conditions": { - "include-copyright": ["[verbatim] \"The above copyright notice and this permission notice shall be included in all copies or substantial portions\""] - }, - "limitations": { - "warranty": ["[verbatim] \"THE SOFTWARE IS PROVIDED \'AS IS\', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED\""], - "liability": ["[verbatim] \"IN NO EVENT SHALL THE AUTHORS... BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY\""] - } - } -} -``` - -### Example 2 — PDDL-1.0 (public domain, data) -Input: Public Domain Dedication and Licence. "…there are no restrictions or requirements placed on the recipient by this document. Recipients may use this work commercially, freely share, modify, and use this work for any purpose…" -```json -{ - "permissions": ["commercial-use", "modifications", "distribution", "private-use", "data-use", "create-adaptations"], - "conditions": [], - "limitations": ["trademark-use", "patent-use", "liability", "warranty", "database-rights-disclaimed"], - "tags": ["domain:data", "family:ODC", "license:open-data-commons", "license:public-domain"], - "reasons": { - "permissions": { - "commercial-use": ["[verbatim] \"there are no restrictions or requirements placed on the recipient... 
Recipients may use this work commercially\""], - "modifications": ["[verbatim] \"freely share, modify, and use this work for any purpose and without any restrictions\""], - "distribution": ["[verbatim] \"freely share, modify, and use this work for any purpose and without any restrictions\""], - "private-use": ["[verbatim] \"there are no restrictions or requirements placed on the recipient by this document\""], - "data-use": ["[verbatim] \"this licence is intended for use on databases or their contents (\'data\'), either together or individually\""], - "create-adaptations": ["[verbatim] \"share their changes and additions or keep them secret\""] - }, - "conditions": {}, - "limitations": { - "trademark-use": ["[verbatim] \"This Document does not cover any trade marks associated with the Database.\""], - "patent-use": ["[verbatim] \"This Document does not cover any patents over the Data or the Database.\""], - "liability": ["[verbatim] \"the Rightsholder is not liable for, and expressly excludes, all liability for loss or damage...\""], - "warranty": ["[verbatim] \"The Work is provided by the Rightsholder \'as is\' and without any warranty of any kind\""], - "database-rights-disclaimed": ["[verbatim] \"dedicates the Work to the public domain... relinquishes all rights in Copyright and Database Rights\""] - } - } -} -``` - -### Example 3 — ODbL-1.0 (strong copyleft, data) -Input: Open Database License. Grants extraction, re-utilisation, distribution, derivative databases commercially; requires attribution, share-alike, disclose source; excludes trademarks, patents, warranty, liability. 
-```json -{ - "permissions": ["commercial-use", "distribution", "modifications", "private-use", "data-use", "create-adaptations"], - "conditions": ["include-copyright", "attribution", "same-license", "disclose-source"], - "limitations": ["liability", "warranty", "trademark-use", "license-incompatibility", "database-rights-disclaimed"], - "tags": ["copyleft:strong", "domain:data", "family:ODC", "license:open-data-commons", "notes:attribution-required", "notes:share-alike", "spdx:fsf-free"], - "reasons": { - "permissions": { - "commercial-use": ["[verbatim] 3.1: \'These rights explicitly include commercial use, and do not exclude any field of endeavour.\'"], - "distribution": ["[verbatim] 3.1(e): \'Distribution, communication, display, lending, making available, or performance to the public...\'"], - "modifications": ["[verbatim] 3.1(b): \'Creation of Derivative Databases;\'"], - "private-use": ["[verbatim] 6.1(a): \'Extraction of Contents from non-electronic Databases for private purposes...\'"], - "data-use": ["[verbatim] 3.1(a): \'Extraction and Re-utilisation of the whole or a Substantial part of the Contents;\'"], - "create-adaptations": ["[verbatim] 3.1(b): \'Creation of Derivative Databases;\'"] - }, - "conditions": { - "include-copyright": ["[verbatim] 4.2(c): \'Keep intact any copyright or Database Right notices and notices that refer to this License.\'"], - "attribution": ["[verbatim] 4.3: \'You must include a notice... to make any Person... aware that Content was obtained from the Database...\'"], - "same-license": ["[verbatim] 4.4(a): \'Any Derivative Database that You Publicly Use must be only under the terms of: i. This License...\'"], - "disclose-source": ["[verbatim] 4.6: \'You must also offer... a copy in a machine readable form of: a. The entire Derivative Database; or b. 
A file...\'"] - }, - "limitations": { - "liability": ["[verbatim] 8.1: \'Licensor is not liable for, and expressly excludes, all liability for loss or damage however and whenever caused...\'"], - "warranty": ["[verbatim] 7.1: \'The Database is licensed by the Licensor \"as is\" and without any warranty of any kind...\'"], - "trademark-use": ["[verbatim] 2.3(c): \'This License does not cover any trademarks associated with the Database.\'"], - "license-incompatibility": ["[verbatim] 4.4(d): \'You must not add Contents to Derivative Databases... that are incompatible with the rights granted under this License.\'"], - "database-rights-disclaimed": ["[verbatim] 2.2(b): \'Database Rights only extend to the Extraction and Re-utilisation of the whole or a Substantial part of the Contents.\'"] - } - } -} -``` - -### Example 4 — CC BY-ND (Creative Commons Attribution-NoDerivs) -Input: Creative Commons Attribution-NoDerivs 2.0. Grants worldwide, royalty-free license to reproduce and distribute the Work. Prohibits creation of Derivative Works. No NonCommercial restriction. Requires attribution and copyright notice. Disclaims warranties and limits liability. -```json -{ - "permissions": ["commercial-use", "distribution", "private-use"], - "conditions": ["include-copyright", "attribution", "license-linking"], - "limitations": ["trademark-use", "liability", "warranty"], - "tags": ["domain:content", "family:CC", "license:creative-commons", "notes:attribution-required", "notes:no-derivatives"], - "reasons": { - "permissions": { - "commercial-use": [ - "[inferred] No NC (NonCommercial) element present; CC BY-ND broad royalty-free grant with no commercial restriction permits commercial use" - ], - "distribution": ["[verbatim] Sec. 3.b grants the right \"to distribute copies or phonorecords of... 
the Work\""], - "private-use": ["[inferred] No clause restricts private use; broad royalty-free grant covers all use types without exception"] - }, - "conditions": { - "include-copyright": ["[verbatim] Sec. 4.b: \"You must keep intact all copyright notices for the Work\""], - "attribution": ["[verbatim] Sec. 4.b: \"give the Original Author credit reasonable to the medium or means You are utilizing\""], - "license-linking": ["[verbatim] Sec. 4.a: \"You must include a copy of, or the Uniform Resource Identifier for, this License with every copy\""] - }, - "limitations": { - "trademark-use": ["[verbatim] License states neither party may use Creative Commons trademarks without prior written consent"], - "liability": ["[verbatim] Sec. 6: \"IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY... DAMAGES\""], - "warranty": ["[verbatim] Sec. 5: \"LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND\""] - } - } -} -``` - -Adhere strictly to these instructions. Return ONLY the JSON object. 
diff --git a/pyproject.toml b/pyproject.toml index 3f3599c..3d2864e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,3 +45,10 @@ Repository = "https://github.com/MobilityData/licenses-catalog" [tool.setuptools] package-dir = {"" = "src"} packages = ["licensing", "licensing.classify"] + +[tool.setuptools.package-data] +"licensing.classify" = [ + "data/*.json", + "data/*.md", + "data/examples/*.json", +] diff --git a/scripts/README.md b/scripts/README.md index 662fc53..b5a3d7a 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -129,4 +129,79 @@ Each license JSON file is updated with a `tags` field, which contains a list of Run the script from the repository root: ```bash -python licenses_tags.py \ No newline at end of file +python licenses_tags.py + +--- + +## License Classifier + +### `classify_license.sh` + +Classifies a license using an LLM (default: `gpt-5.4`) and writes standardised +`permissions`, `conditions`, `limitations`, `tags`, and `reasons` fields into the +license JSON file. + +The classifier automatically injects **few-shot examples** from already-classified +licenses in `data/licenses/` (those with a non-empty `reasons` block) to guide the +LLM. The number of injected examples is controlled by `--max-examples`. 
+ +### Usage + +```bash +./classify_license.sh <input> [OPTIONS] +``` + +#### Options + +| Flag | Description | +|---|---| +| `<input>` | Path to a merged SPDX JSON file or a plain license text file | +| `--output [PATH]` | Write results to PATH; omit PATH to update the input file in-place; omit flag entirely to print to stdout | +| `--dry-run` | Print classification JSON to stdout without writing any files | +| `--spdx-id ID` | Override the SPDX ID (useful for plain-text inputs) | +| `--model MODEL` | LLM model name (default: `gpt-5.4`) | +| `--max-examples N` | Maximum few-shot examples to inject from already-classified licenses (default: `5`; set to `0` to disable) | +| `--skip-tags` | Skip heuristic tag inference; only LLM-assigned tags are included | +| `--disable-llm` | Disable LLM calls and return an empty classification (useful for testing) | +| `--credentials-file PATH` | Path to a dcredentials file containing `OPENAI_API_KEY` | +| `--system-prompt PATH` | Path to the system prompt markdown file (default: bundled `src/licensing/classify/data/SYSTEM_PROMPT.md`) | +| `--user-prompt PATH` | Path to the user prompt markdown file (default: bundled `src/licensing/classify/data/USER_PROMPT.md`) | + +#### Examples + +```bash +# Classify a license and print to stdout +./classify_license.sh ./data/licenses/MIT.json + +# Classify and update in-place +./classify_license.sh ./data/licenses/MIT.json --output + +# Classify with more few-shot examples for better accuracy +./classify_license.sh ./data/licenses/MIT.json --max-examples 10 + +# Classify with no few-shot examples +./classify_license.sh ./data/licenses/MIT.json --max-examples 0 + +# Preview without writing +./classify_license.sh ./data/licenses/MIT.json --dry-run +``` + +### Few-shot example injection + +Each time the classifier runs, it loads the slim examples bundled with the package +(`src/licensing/classify/data/examples/`, generated by `scripts/sync_package_data.py` from licenses in `data/licenses/` that have a non-empty `reasons` block) and injects up to `--max-examples` of them as +calibration examples into the system prompt. 
The license being classified is +excluded from the pool to avoid self-reference. + +This means the classifier improves automatically as more licenses are classified: +the growing pool of examples helps the LLM handle edge cases (e.g. distinguishing +licenses that explicitly restrict commercial use from those that merely omit the +word "commercial"). + +### Environment variables + +| Variable | Description | +|---|---| +| `OPENAI_API_KEY` | API key for OpenAI LLM calls | +| `DCREDENTIALS_FILE` | Path to a dcredentials file containing `OPENAI_API_KEY` | +| `DISABLE_LLM` | Set to `1` to skip LLM calls (equivalent to `--disable-llm`) | \ No newline at end of file diff --git a/scripts/classify_license.sh b/scripts/classify_license.sh index 8fe63da..30b6d13 100755 --- a/scripts/classify_license.sh +++ b/scripts/classify_license.sh @@ -48,11 +48,11 @@ set -euo pipefail # # --system-prompt PATH # Path to the system prompt markdown file. -# Default: docs/classification/SYSTEM_PROMPT.md +# Default: bundled package data (src/licensing/classify/data/SYSTEM_PROMPT.md) # # --user-prompt PATH # Path to the user prompt markdown file. -# Default: docs/classification/USER_PROMPT.md +# Default: bundled package data (src/licensing/classify/data/USER_PROMPT.md) # # --model MODEL_NAME # LLM model name to use. @@ -63,6 +63,11 @@ set -euo pipefail # Overrides DCREDENTIALS_FILE env var and the default ~/.dcredentials # lookup. # +# --max-examples N +# Maximum number of few-shot examples to inject from already-classified +# licenses in data/licenses/ (those with non-empty reasons). Defaults to +# 5. Set to 0 to disable few-shot injection entirely. +# # --skip-tags # Skip heuristic tag inference. Only tags returned by the LLM are # included in the output. 
By default, heuristic tags are merged with @@ -89,7 +94,7 @@ set -euo pipefail # ./classify_license.sh ./data/licenses/MIT.json --dry-run # # # Use a custom model and credentials file -# ./classify_license.sh ./data/licenses/MIT.json --model gpt-4o --credentials-file ~/creds +# ./classify_license.sh ./data/licenses/MIT.json --model gpt-5.4 --credentials-file ~/creds # # Environment Variables: # OPENAI_API_KEY - API key for OpenAI LLM calls. diff --git a/scripts/sync_package_data.py b/scripts/sync_package_data.py new file mode 100644 index 0000000..e0a5753 --- /dev/null +++ b/scripts/sync_package_data.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +"""Sync bundled package data in src/licensing/classify/data/. + +Run this script after: + - Classifying a new license (to add it to the few-shot examples bundle) + +Note: rules.json and tags.json are symlinks to data/rules.json and data/tags.json — +no sync needed for those. SYSTEM_PROMPT.md and USER_PROMPT.md live in +src/licensing/classify/data/ as canonical sources — edit them there directly. + +Usage: + python scripts/sync_package_data.py [--dry-run] + +What it does: + 1. Ensures rules.json / tags.json symlinks exist in src/licensing/classify/data/ + 2. For every license in data/licenses/ that has a non-empty reasons block, + writes a slim version (trimmed spdx block, permissions, conditions, limitations, tags, + reasons) to src/licensing/classify/data/examples/<licenseId>.json + +Slim versions strip the large spdx.licenseText / licenseTextHtml / standardLicenseTemplate +fields so the bundled package stays small (~47 KB vs ~581 KB for the full files). 
+""" + +import argparse +import json +import os +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +DATA_DIR = REPO_ROOT / "data" +PKG_DATA_DIR = REPO_ROOT / "src" / "licensing" / "classify" / "data" +PKG_EXAMPLES_DIR = PKG_DATA_DIR / "examples" + +_EMPTY_REASONS: dict = {"permissions": {}, "conditions": {}, "limitations": {}} + +_SPDX_STRIP_KEYS = { + "licenseText", + "standardLicenseTemplate", + "licenseTextHtml", + "crossRef", + "seeAlso", +} + + +def _slim_example(data: dict) -> dict: + """Return a minimal copy of a license JSON suitable for few-shot injection.""" + spdx_block = data.get("spdx", {}) + slim_spdx = {k: v for k, v in spdx_block.items() if k not in _SPDX_STRIP_KEYS} + return { + "spdx": slim_spdx, + "permissions": data.get("permissions") or [], + "conditions": data.get("conditions") or [], + "limitations": data.get("limitations") or [], + "tags": data.get("tags") or [], + "reasons": data.get("reasons") or {}, + } + + +def sync(dry_run: bool = False) -> None: + """Sync all bundled package data.""" + + def write(dest: Path, content: str) -> None: + if dry_run: + print(f" [dry-run] would write {dest.relative_to(REPO_ROOT)}") + else: + dest.parent.mkdir(parents=True, exist_ok=True) + dest.write_text(content, encoding="utf-8") + + # --- ensure symlinks for rules.json / tags.json --- + symlinks = [ + (DATA_DIR / "rules.json", PKG_DATA_DIR / "rules.json"), + (DATA_DIR / "tags.json", PKG_DATA_DIR / "tags.json"), + ] + for target, link in symlinks: + rel_target = Path(os.path.relpath(target, link.parent)) + if link.is_symlink() and Path(os.readlink(link)) == rel_target: + print(f" symlink ok {link.relative_to(REPO_ROOT)}") + elif dry_run: + print(f" [dry-run] would symlink {link.relative_to(REPO_ROOT)} -> {rel_target}") + else: + link.parent.mkdir(parents=True, exist_ok=True) + if link.exists() or link.is_symlink(): + link.unlink() + link.symlink_to(rel_target) + print(f" symlinked {link.relative_to(REPO_ROOT)} -> 
{rel_target}") + + # --- classified examples --- + licenses_dir = DATA_DIR / "licenses" + if not licenses_dir.exists(): + print(f"WARNING: licenses dir not found: {licenses_dir}", file=sys.stderr) + return + + added = skipped = 0 + for path in sorted(licenses_dir.glob("*.json")): + try: + data = json.loads(path.read_text(encoding="utf-8")) + except Exception as exc: + print(f"WARNING: could not parse {path.name}: {exc}", file=sys.stderr) + continue + reasons = data.get("reasons", {}) + if not reasons or reasons == _EMPTY_REASONS: + continue + slim = _slim_example(data) + license_id = slim["spdx"].get("licenseId") or path.stem + dest = PKG_EXAMPLES_DIR / f"{license_id}.json" + new_content = json.dumps(slim, indent=2, ensure_ascii=False) + if dest.exists() and dest.read_text(encoding="utf-8") == new_content: + skipped += 1 + continue + write(dest, new_content) + print(f" {'would write' if dry_run else 'wrote'} example {dest.relative_to(REPO_ROOT)}") + added += 1 + + print(f"\nDone: {added} example(s) {'would be ' if dry_run else ''}written, {skipped} unchanged.") + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("--dry-run", action="store_true", help="Print what would be written without writing anything.") + args = parser.parse_args() + sync(dry_run=args.dry_run) + + +if __name__ == "__main__": + main() diff --git a/src/licensing/classify/classify_license.py b/src/licensing/classify/classify_license.py index 6fac063..0fa1058 100644 --- a/src/licensing/classify/classify_license.py +++ b/src/licensing/classify/classify_license.py @@ -22,6 +22,7 @@ import argparse import hashlib +import importlib.resources import json import os import re @@ -33,15 +34,22 @@ from licensing.classify.license_tags import TagRegistry, build_tags -BASE_DIR = Path(__file__).resolve().parents[3] -RULES_PATH = (BASE_DIR / "data" / "rules.json").resolve() -TAGS_PATH = (BASE_DIR / "data" / 
"tags.json").resolve() +def _pkg_data(relative: str) -> importlib.resources.abc.Traversable: + """Return a Traversable reference to a file in the bundled package data.""" + return importlib.resources.files("licensing.classify").joinpath("data").joinpath(relative) -def load_rules(path: Path = RULES_PATH) -> dict[str, list]: - if not path.exists(): - raise FileNotFoundError(f"Rules file not found: {path}") - data = json.loads(path.read_text(encoding="utf-8")) +def load_rules(path: Path | None = None) -> dict[str, list]: + """Load permission/condition/limitation rule names. + + Uses the bundled ``data/rules.json`` by default. Pass *path* to override + (e.g. when running from the catalog repository). + """ + if path is None: + text = _pkg_data("rules.json").read_text(encoding="utf-8") + else: + text = Path(path).read_text(encoding="utf-8") + data = json.loads(text) def extract(names_list: list) -> list: return [item.get("name") for item in names_list if isinstance(item, dict) and item.get("name")] @@ -53,10 +61,16 @@ def extract(names_list: list) -> list: } -def load_tags(path: Path = TAGS_PATH) -> list: - if not path.exists(): - raise FileNotFoundError(f"Tags file not found: {path}") - data = json.loads(path.read_text(encoding="utf-8")) +def load_tags(path: Path | None = None) -> list: + """Load valid tag names. + + Uses the bundled ``data/tags.json`` by default. Pass *path* to override. 
+ """ + if path is None: + text = _pkg_data("tags.json").read_text(encoding="utf-8") + else: + text = Path(path).read_text(encoding="utf-8") + data = json.loads(text) flat: list[str] = [] for category, entries in data.items(): if isinstance(entries, dict): @@ -70,9 +84,6 @@ def load_tags(path: Path = TAGS_PATH) -> list: RULE_NAMES = load_rules() TAG_NAMES = load_tags() -DEFAULT_SYSTEM_PROMPT_PATH = BASE_DIR / "docs" / "classification" / "SYSTEM_PROMPT.md" -DEFAULT_USER_PROMPT_PATH = BASE_DIR / "docs" / "classification" / "USER_PROMPT.md" - def load_api_key_from_dcredentials() -> str | None: """Optionally load OPENAI_API_KEY from a local dcredentials file.""" @@ -190,6 +201,110 @@ def load_non_spdx_from_url(url: str, cache_dir: Path, force_download: bool = Fal return text, metadata +DEFAULT_MAX_EXAMPLES = 5 + +_EMPTY_REASONS: dict[str, Any] = {"permissions": {}, "conditions": {}, "limitations": {}} + + +def _select_diverse_examples(examples: list[dict[str, Any]], n: int) -> list[dict[str, Any]]: + """Select up to *n* examples using a greedy max-coverage strategy. + + Greedily picks the example that adds the most *new* rules (permissions + + conditions + limitations) to the already-covered set, ensuring the selected + subset spans as many distinct classification outcomes as possible. 
+ """ + if len(examples) <= n: + return examples + + def profile(ex: dict) -> frozenset: + return ( + frozenset(ex.get("permissions") or []) + | frozenset(ex.get("conditions") or []) + | frozenset(ex.get("limitations") or []) + ) + + selected: list[dict[str, Any]] = [] + remaining = list(examples) + covered: frozenset = frozenset() + + while len(selected) < n and remaining: + best = max(remaining, key=lambda ex: len(profile(ex) - covered)) + selected.append(best) + covered = covered | profile(best) + remaining.remove(best) + + return selected + + +def load_classified_examples( + exclude_id: str | None = None, + max_examples: int = DEFAULT_MAX_EXAMPLES, + licenses_dir: Path | None = None, +) -> list[dict[str, Any]]: + """Return already-classified licenses (with reasons) for use as few-shot examples. + + When *licenses_dir* is ``None`` (the default), loads the slim examples + bundled with the package (``src/licensing/classify/data/examples/``). + Pass a directory path to override — useful when running from the catalog + repository to use freshly-classified licenses instead. + + Up to *max_examples* entries are returned, selected using a greedy + max-coverage strategy that maximises diversity across permission profiles. 
+ """ + if max_examples is not None and max_examples <= 0: + return [] + + if licenses_dir is None: + pkg_examples = _pkg_data("examples") + candidates = sorted(pkg_examples.iterdir(), key=lambda t: t.name) + read_item = lambda item: json.loads(item.read_text(encoding="utf-8")) + else: + candidates = sorted(Path(licenses_dir).glob("*.json")) + read_item = lambda item: json.loads(item.read_text(encoding="utf-8")) + + all_examples: list[dict[str, Any]] = [] + for item in candidates: + try: + data = read_item(item) + except Exception: + continue + reasons = data.get("reasons", {}) + if not reasons or reasons == _EMPTY_REASONS: + continue + license_id = data.get("spdx", {}).get("licenseId") or getattr(item, "stem", None) or str(item).rsplit("/", 1)[-1].replace(".json", "") + if exclude_id and license_id == exclude_id: + continue + all_examples.append({ + "license_id": license_id, + "permissions": data.get("permissions") or [], + "conditions": data.get("conditions") or [], + "limitations": data.get("limitations") or [], + "reasons": reasons, + }) + + return _select_diverse_examples(all_examples, max_examples) if max_examples is not None else all_examples + + +def format_few_shot_block(examples: list[dict[str, Any]]) -> str: + """Render *examples* as a compact markdown block for prompt injection.""" + if not examples: + return "(No worked examples available yet.)" + blocks: list[str] = [] + for ex in examples: + lines: list[str] = [f"### Example: {ex['license_id']}"] + for category in ("permissions", "conditions", "limitations"): + items: list[str] = ex.get(category) or [] + lines.append(f"{category.capitalize()}: {', '.join(items) if items else 'none'}") + lines.append("Reasons:") + reasons: dict[str, Any] = ex.get("reasons") or {} + for category in ("permissions", "conditions", "limitations"): + for rule, evidence_list in (reasons.get(category) or {}).items(): + if evidence_list: + lines.append(f" [{category}] {rule}: {evidence_list[0][:160]}") + 
blocks.append("\n".join(lines)) + return "\n\n".join(blocks) + + _PLACEHOLDER_PATTERN = re.compile(r"\{([A-Za-z0-9_]+)}") @@ -212,19 +327,26 @@ def _allowed_mapping() -> dict[str, Any]: } -def load_system_prompt(path: str | Path = DEFAULT_SYSTEM_PROMPT_PATH, mapping: dict[str, Any] | None = None) -> str: - p = Path(path) - if not p.is_absolute(): - p = (BASE_DIR / p).resolve() - if not p.exists(): - raise FileNotFoundError(f"System prompt file not found: {p}") - text = p.read_text(encoding="utf-8") +def load_system_prompt(path: str | Path | None = None, mapping: dict[str, Any] | None = None) -> str: + """Load and render the system prompt. + + Uses the bundled ``data/SYSTEM_PROMPT.md`` when *path* is ``None``. + """ + if path is None: + text = _pkg_data("SYSTEM_PROMPT.md").read_text(encoding="utf-8") + else: + p = Path(path) + if not p.is_absolute(): + p = (Path.cwd() / p).resolve() + if not p.exists(): + raise FileNotFoundError(f"System prompt file not found: {p}") + text = p.read_text(encoding="utf-8") merged_mapping = {**_allowed_mapping(), **(mapping or {})} return _sub_placeholders(text, merged_mapping) def build_user_prompt( - template_path: Path, + template_path: Path | None, license_id: str, spdx_id: str | None, source: str, @@ -232,7 +354,14 @@ def build_user_prompt( existing_classification: dict[str, Any] | None, license_text: str, ) -> str: - template = template_path.read_text(encoding="utf-8") + """Build the user prompt from a template. + + Uses the bundled ``data/USER_PROMPT.md`` when *template_path* is ``None``. 
+ """ + if template_path is None: + template = _pkg_data("USER_PROMPT.md").read_text(encoding="utf-8") + else: + template = Path(template_path).read_text(encoding="utf-8") mapping: dict[str, Any] = { "LICENSE_ID": license_id, "SPDX_ID_OR_EMPTY": spdx_id or "", @@ -394,8 +523,10 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: parser.add_argument("--credentials-file", help="Path to a dcredentials file with OPENAI_API_KEY.") parser.add_argument("--skip-tags", action="store_true", help="Skip heuristic tag inference; only LLM-assigned tags are included.") parser.add_argument("--disable-llm", action="store_true", help="Disable the LLM and return empty classification.") - parser.add_argument("--system-prompt", default=str(DEFAULT_SYSTEM_PROMPT_PATH), help="Path to system prompt file.") - parser.add_argument("--user-prompt", default=str(DEFAULT_USER_PROMPT_PATH), help="Path to user prompt template.") + parser.add_argument("--system-prompt", default=None, + help="Path to system prompt file (default: bundled package data).") + parser.add_argument("--user-prompt", default=None, + help="Path to user prompt template (default: bundled package data).") parser.add_argument("--model", default="gpt-5.4", help="LLM model name to use (default: gpt-5.4).") parser.add_argument( "--output", @@ -409,6 +540,8 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: "If not provided at all, prints to stdout." 
 ), ) + parser.add_argument("--max-examples", type=int, default=DEFAULT_MAX_EXAMPLES, + help=f"Maximum number of few-shot examples to inject from already-classified licenses (default: {DEFAULT_MAX_EXAMPLES}; set to 0 to disable).") return parser.parse_args(argv) @@ -436,9 +569,13 @@ def main(argv: list[str] | None = None) -> None: existing_classification = None raw_json = None source = "file-text" - system_prompt = load_system_prompt(args.system_prompt) + system_prompt = load_system_prompt(args.system_prompt, mapping={ + "few_shot_examples": format_few_shot_block( + load_classified_examples(exclude_id=spdx_id, max_examples=args.max_examples) + ), + }) user_prompt = build_user_prompt( - Path(args.user_prompt), + Path(args.user_prompt) if args.user_prompt else None, license_id=license_id, spdx_id=spdx_id, source=source, diff --git a/src/licensing/classify/data/SYSTEM_PROMPT.md b/src/licensing/classify/data/SYSTEM_PROMPT.md new file mode 100644 index 0000000..ff2b202 --- /dev/null +++ b/src/licensing/classify/data/SYSTEM_PROMPT.md @@ -0,0 +1,72 @@ +# SYSTEM PROMPT — License Classifier (Permissions / Conditions / Limitations / Tags + Nested Reasons) + +You are a strict, conservative license classifier for an internal mobility/data project. +Given license text (and optional metadata), decide which standardized rules apply and provide concise evidence per selected rule, grouped by category. + +Output must be a single JSON object (no prose, no markdown) with EXACT top-level keys: +{ + "permissions": [...], + "conditions": [...], + "limitations": [...], + "tags": [...], + "reasons": { + "permissions": {"<rule>": ["<evidence>"]}, + "conditions": {"<rule>": ["<evidence>"]}, + "limitations": {"<rule>": ["<evidence>"]} + } +} +Order unimportant. No additional keys. + +## Allowed values +- permission names: {allowed_permissions} +- condition names: {allowed_conditions} +- limitation names: {allowed_limitations} +- tag names (optional): {allowed_tags} +Use ONLY these lists. Omit anything uncertain. 
+ +## Evidence (reasons) +For every selected rule in permissions / conditions / limitations: +- Provide ≥1 evidence string under the corresponding nested category. +- Each evidence string ≤160 characters **excluding** the prefix. +- Every string MUST begin with one of: + - `[verbatim]` — directly quotes or paraphrases a specific clause or section in the license text. Use this whenever the text contains a direct clause supporting the rule. + - `[inferred]` — use ONLY when no verbatim clause directly supports the rule; the rule is implied by absence of restriction or by the structure of a well-known license family. Do NOT add `[inferred]` alongside `[verbatim]` for the same rule — if verbatim evidence is sufficient, stop there. + - For well-known license families, the absence of a restriction element is valid grounds for inference. For example: in Creative Commons licenses, the absence of the NC (NonCommercial) element means `commercial-use` is permitted and must be inferred even when the text has no explicit "commercial use allowed" statement. +- Multiple evidence strings allowed only if they come from **distinct clauses** that each independently support the rule. Do not repeat the same reasoning with different prefixes. +- Do NOT provide reasons for unselected rules. +- Do NOT fabricate external sources; rely only on provided text or explicit metadata claims (e.g., “This license reproduces ODbL 1.0 in full.”). + +If text claims it reproduces a known license verbatim, treat embedded standard text as authoritative. "Based on" or "inspired by" is not sufficient for inheritance; classify only what appears. + +## Existing classification +If prior classification is provided, you may use it as a starting hypothesis, but must remove any rule not justified by the text and must add missing justified rules. + +## Tags +Tags are optional semantic flags. 
 Include a tag only if clearly supported (e.g., explicit scope like data/software/content; explicit copyleft strength; attribution requirement). Omit tags lacking strong textual or metadata support. + +## Permission inference rules + +### commercial-use and private-use are companion permissions +`commercial-use` and `private-use` derive from the same "use" grant and must be treated consistently: +- If a license grants **unrestricted use** (no non-commercial carve-out, no "personal use only" clause, no explicit exclusion of commercial contexts), include **both** `commercial-use` and `private-use`. +- Do NOT omit `commercial-use` solely because the word "commercial" does not appear — the **absence of a commercial restriction is itself the permission**. +- Only omit `commercial-use` when the license explicitly restricts or excludes commercial use (e.g., "non-commercial use only", "may not be used for commercial purposes"). + +## Ambiguity / failure +If no rules can be confidently classified, return all empty arrays and: +"reasons": {"permissions": {}, "conditions": {}, "limitations": {}} +The tags array should also be empty in that case. + +## Prohibited +- No guessing unsupported rules or tags. +- No markdown, comments, extra keys, or explanatory prose. +- No invented evidence or external URLs beyond those explicitly present. +- **Never mix `[verbatim]` and `[inferred]` for the same rule.** If you have a verbatim clause, use only `[verbatim]`. Reserve `[inferred]` exclusively for rules where no direct verbatim clause exists. + +## Reference examples (few-shot) +Use the following pre-classified licenses as calibration references for format and evidence style. +Always base your output on the license text in the current prompt — not on analogy with these examples. + +{few_shot_examples} + +Adhere strictly to these instructions. Return ONLY the JSON object. 
diff --git a/docs/classification/USER_PROMPT.md b/src/licensing/classify/data/USER_PROMPT.md similarity index 100% rename from docs/classification/USER_PROMPT.md rename to src/licensing/classify/data/USER_PROMPT.md diff --git a/src/licensing/classify/data/examples/CC-BY-2.0.json b/src/licensing/classify/data/examples/CC-BY-2.0.json new file mode 100644 index 0000000..7cfda20 --- /dev/null +++ b/src/licensing/classify/data/examples/CC-BY-2.0.json @@ -0,0 +1,80 @@ +{ + "spdx": { + "isDeprecatedLicenseId": false, + "name": "Creative Commons Attribution 2.0 Generic", + "licenseId": "CC-BY-2.0", + "isOsiApproved": false + }, + "permissions": [ + "commercial-use", + "modifications", + "distribution", + "private-use", + "create-adaptations" + ], + "conditions": [ + "include-copyright", + "attribution", + "mark-changes", + "license-linking" + ], + "limitations": [ + "trademark-use", + "liability", + "warranty" + ], + "tags": [ + "copyleft:none", + "domain:content", + "family:CC", + "license:creative-commons", + "notes:attribution-required" + ], + "reasons": { + "permissions": { + "commercial-use": [ + "[inferred] Worldwide, royalty-free rights to reproduce, create derivatives, and distribute with no field-of-use restriction permit commercial use" + ], + "modifications": [ + "[verbatim] 3.b: \"to create and reproduce Derivative Works;\"", + "[verbatim] \"The above rights include the right to make such modifications as are technically necessary\"" + ], + "distribution": [ + "[verbatim] 3.c: \"to distribute copies or phonorecords of... the Work including as incorporated in Collective Works;\"", + "[verbatim] 3.d: \"to distribute copies or phonorecords of... 
Derivative Works.\"" + ], + "private-use": [ + "[inferred] Broad rights to reproduce and create derivative works, with no private-use restriction, permit private use" + ], + "create-adaptations": [ + "[verbatim] 1.b: \"Derivative Work\" includes works \"recast, transformed, or adapted\"", + "[verbatim] 3.b: \"to create and reproduce Derivative Works;\"" + ] + }, + "conditions": { + "include-copyright": [ + "[verbatim] 4.b: \"You must keep intact all copyright notices for the Work\"" + ], + "attribution": [ + "[verbatim] 4.b: \"give the Original Author credit reasonable to the medium or means You are utilizing\"" + ], + "mark-changes": [ + "[verbatim] 4.b: for a Derivative Work, include \"a credit identifying the use of the Work in the Derivative Work\"" + ], + "license-linking": [ + "[verbatim] 4.a: \"You must include a copy of, or the Uniform Resource Identifier for, this License with every copy\"" + ] + }, + "limitations": { + "trademark-use": [ + "[verbatim] \"neither party will use the trademark \\\"Creative Commons\\\"... without the prior written consent\"" + ], + "liability": [ + "[verbatim] 6: \"IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY... 
DAMAGES\"" + ], + "warranty": [ + "[verbatim] 5: \"LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND\"" + ] + } + } +} \ No newline at end of file diff --git a/src/licensing/classify/data/examples/CC-BY-3.0.json b/src/licensing/classify/data/examples/CC-BY-3.0.json new file mode 100644 index 0000000..97bf907 --- /dev/null +++ b/src/licensing/classify/data/examples/CC-BY-3.0.json @@ -0,0 +1,84 @@ +{ + "spdx": { + "isDeprecatedLicenseId": false, + "name": "Creative Commons Attribution 3.0 Unported", + "licenseId": "CC-BY-3.0", + "isOsiApproved": false + }, + "permissions": [ + "commercial-use", + "modifications", + "distribution", + "private-use", + "create-adaptations" + ], + "conditions": [ + "include-copyright", + "attribution", + "mark-changes", + "non-endorsement", + "license-linking" + ], + "limitations": [ + "trademark-use", + "liability", + "warranty" + ], + "tags": [ + "copyleft:none", + "domain:content", + "family:CC", + "license:creative-commons", + "notes:attribution-required" + ], + "reasons": { + "permissions": { + "commercial-use": [ + "[verbatim] Section 3 grants a \"worldwide, royalty-free, non-exclusive, perpetual\" license to exercise listed rights", + "[inferred] No field-of-use or commercial restriction appears in the grant or restrictions" + ], + "modifications": [ + "[verbatim] Section 3(b): \"to create and Reproduce Adaptations\"", + "[verbatim] Adaptation includes \"translation, adaptation, derivative work... 
or other alterations\"" + ], + "distribution": [ + "[verbatim] Section 3(c): \"to Distribute and Publicly Perform the Work\"", + "[verbatim] Section 3(d): \"to Distribute and Publicly Perform Adaptations\"" + ], + "private-use": [ + "[inferred] Broad reproduction and adaptation rights are granted with no private-use prohibition" + ], + "create-adaptations": [ + "[verbatim] Section 3(b): \"to create and Reproduce Adaptations\"" + ] + }, + "conditions": { + "include-copyright": [ + "[verbatim] Section 4(b): \"keep intact all copyright notices for the Work\"" + ], + "attribution": [ + "[verbatim] Section 4(b): provide the Original Author name, title, specified URI, and adaptation credit" + ], + "mark-changes": [ + "[verbatim] Section 3(b): Adaptations must \"clearly label, demarcate or otherwise identify that changes were made\"" + ], + "non-endorsement": [ + "[verbatim] Section 4(b): You may not \"assert or imply any connection with, sponsorship or endorsement\"" + ], + "license-linking": [ + "[verbatim] Section 4(a): include \"a copy of, or the Uniform Resource Identifier (URI) for, this License\"" + ] + }, + "limitations": { + "trademark-use": [ + "[verbatim] Notice: no use of the trademark \"Creative Commons\" or related marks without prior written consent" + ], + "liability": [ + "[verbatim] Section 6: \"IN NO EVENT WILL LICENSOR BE LIABLE... FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL... 
DAMAGES\"" + ], + "warranty": [ + "[verbatim] Section 5: \"LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND\"" + ] + } + } +} \ No newline at end of file diff --git a/src/licensing/classify/data/examples/CC-BY-4.0.json b/src/licensing/classify/data/examples/CC-BY-4.0.json new file mode 100644 index 0000000..2edc3d5 --- /dev/null +++ b/src/licensing/classify/data/examples/CC-BY-4.0.json @@ -0,0 +1,92 @@ +{ + "spdx": { + "isDeprecatedLicenseId": false, + "isFsfLibre": true, + "name": "Creative Commons Attribution 4.0 International", + "licenseId": "CC-BY-4.0", + "isOsiApproved": false + }, + "permissions": [ + "commercial-use", + "modifications", + "distribution", + "private-use", + "data-use", + "create-adaptations" + ], + "conditions": [ + "include-copyright", + "document-changes", + "attribution", + "non-endorsement", + "license-linking" + ], + "limitations": [ + "trademark-use", + "patent-use", + "liability", + "warranty" + ], + "tags": [ + "copyleft:none", + "domain:content", + "domain:data", + "family:CC", + "license:creative-commons", + "notes:attribution-required", + "spdx:fsf-free" + ], + "reasons": { + "permissions": { + "commercial-use": [ + "[inferred] Section 2(a)(1) grants broad rights to reproduce, Share, and produce Adapted Material with no field-of-use restriction" + ], + "modifications": [ + "[verbatim] Section 2(a)(1)(B): \"produce, reproduce, and Share Adapted Material\"" + ], + "distribution": [ + "[verbatim] Section 2(a)(1)(A): \"reproduce and Share the Licensed Material, in whole or in part\"" + ], + "private-use": [ + "[inferred] Broad license to exercise Licensed Rights contains no restriction excluding private use" + ], + "data-use": [ + "[verbatim] Section 4(a): \"grants You the right to extract, reuse, reproduce, and Share... 
contents of the database\"" + ], + "create-adaptations": [ + "[verbatim] Section 2(a)(1)(B): \"produce, reproduce, and Share Adapted Material\"" + ] + }, + "conditions": { + "include-copyright": [ + "[verbatim] Section 3(a)(1)(A)(ii): retain \"a copyright notice\"" + ], + "document-changes": [ + "[verbatim] Section 3(a)(1)(B): \"indicate if You modified the Licensed Material and retain an indication of any previous modifications\"" + ], + "attribution": [ + "[verbatim] Section 3(a)(1)(A)(i): retain \"identification of the creator(s)... and any others designated to receive attribution\"" + ], + "non-endorsement": [ + "[verbatim] Section 2(a)(6): no permission to imply You are \"sponsored, endorsed, or granted official status\"" + ], + "license-linking": [ + "[verbatim] Section 3(a)(1)(C): include \"the text of, or the URI or hyperlink to, this Public License\"" + ] + }, + "limitations": { + "trademark-use": [ + "[verbatim] Section 2(b)(2): \"Patent and trademark rights are not licensed under this Public License.\"" + ], + "patent-use": [ + "[verbatim] Section 2(b)(2): \"Patent and trademark rights are not licensed under this Public License.\"" + ], + "liability": [ + "[verbatim] Section 5(b): \"in no event will the Licensor be liable to You... for any... damages\"" + ], + "warranty": [ + "[verbatim] Section 5(a): Licensor offers the material \"as-is and as-available\" and \"makes no... 
warranties\"" + ] + } + } +} \ No newline at end of file diff --git a/src/licensing/classify/data/examples/CC-BY-NC-4.0.json b/src/licensing/classify/data/examples/CC-BY-NC-4.0.json new file mode 100644 index 0000000..b209029 --- /dev/null +++ b/src/licensing/classify/data/examples/CC-BY-NC-4.0.json @@ -0,0 +1,91 @@ +{ + "spdx": { + "isDeprecatedLicenseId": false, + "isFsfLibre": false, + "name": "Creative Commons Attribution Non Commercial 4.0 International", + "licenseId": "CC-BY-NC-4.0", + "isOsiApproved": false + }, + "permissions": [ + "modifications", + "distribution", + "private-use", + "data-use", + "create-adaptations" + ], + "conditions": [ + "include-copyright", + "attribution", + "mark-changes", + "non-endorsement", + "license-linking" + ], + "limitations": [ + "trademark-use", + "patent-use", + "liability", + "warranty", + "license-incompatibility" + ], + "tags": [ + "copyleft:none", + "domain:content", + "domain:data", + "family:CC", + "license:creative-commons", + "notes:attribution-required" + ], + "reasons": { + "permissions": { + "modifications": [ + "[verbatim] Section 2(a)(1)(B): \"produce, reproduce, and Share Adapted Material for NonCommercial purposes only\"" + ], + "distribution": [ + "[verbatim] Section 2(a)(1)(A): \"reproduce and Share the Licensed Material, in whole or in part\"" + ], + "private-use": [ + "[inferred] Section 2(a)(1) grants Licensed Rights broadly; no clause forbids private use, though only NonCommercial uses are licensed" + ], + "data-use": [ + "[verbatim] Section 4(a): \"right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database\"" + ], + "create-adaptations": [ + "[verbatim] Section 2(a)(1)(B): \"produce, reproduce, and Share Adapted Material for NonCommercial purposes only\"" + ] + }, + "conditions": { + "include-copyright": [ + "[verbatim] Section 3(a)(1)(A)(ii): if You Share, You must retain \"a copyright notice\"" + ], + "attribution": [ + "[verbatim] Section 
3(a)(1)(A)(i): retain \"identification of the creator(s)... and any others designated to receive attribution\"" + ], + "mark-changes": [ + "[verbatim] Section 3(a)(1)(B): \"indicate if You modified the Licensed Material and retain an indication of any previous modifications\"" + ], + "non-endorsement": [ + "[verbatim] Section 2(a)(6): no permission to assert or imply You are \"sponsored, endorsed, or granted official status\"" + ], + "license-linking": [ + "[verbatim] Section 3(a)(1)(C): \"include the text of, or the URI or hyperlink to, this Public License\"" + ] + }, + "limitations": { + "trademark-use": [ + "[verbatim] Section 2(b)(2): \"Patent and trademark rights are not licensed under this Public License.\"" + ], + "patent-use": [ + "[verbatim] Section 2(b)(2): \"Patent and trademark rights are not licensed under this Public License.\"" + ], + "liability": [ + "[verbatim] Section 5(b): \"in no event will the Licensor be liable to You on any legal theory... for any... damages\"" + ], + "warranty": [ + "[verbatim] Section 5(a): Licensor offers the material \"as-is and as-available\" and \"makes no representations or warranties\"" + ], + "license-incompatibility": [ + "[verbatim] Section 2(a)(5)(B): You may not impose terms \"if doing so restricts exercise of the Licensed Rights\"" + ] + } + } +} \ No newline at end of file diff --git a/src/licensing/classify/data/examples/CC-BY-NC-SA-4.0.json b/src/licensing/classify/data/examples/CC-BY-NC-SA-4.0.json new file mode 100644 index 0000000..47056ca --- /dev/null +++ b/src/licensing/classify/data/examples/CC-BY-NC-SA-4.0.json @@ -0,0 +1,93 @@ +{ + "spdx": { + "isDeprecatedLicenseId": false, + "name": "Creative Commons Attribution Non Commercial Share Alike 4.0 International", + "licenseId": "CC-BY-NC-SA-4.0", + "isOsiApproved": false + }, + "permissions": [ + "modifications", + "distribution", + "private-use", + "data-use", + "create-adaptations" + ], + "conditions": [ + "include-copyright", + "attribution", + 
"mark-changes", + "share-alike", + "non-endorsement", + "license-linking" + ], + "limitations": [ + "trademark-use", + "patent-use", + "liability", + "warranty" + ], + "tags": [ + "copyleft:strong", + "domain:content", + "domain:data", + "domain:mixed", + "family:CC", + "license:creative-commons", + "notes:attribution-required", + "notes:share-alike" + ], + "reasons": { + "permissions": { + "modifications": [ + "[verbatim] Section 2(a)(1)(B): \"produce, reproduce, and Share Adapted Material for NonCommercial purposes only.\"" + ], + "distribution": [ + "[verbatim] Section 2(a)(1)(A): \"reproduce and Share the Licensed Material, in whole or in part\"" + ], + "private-use": [ + "[inferred] No clause forbids private use; granted rights cover reproduction/use, with restrictions aimed at NonCommercial use and sharing" + ], + "data-use": [ + "[verbatim] Section 4(a): right to \"extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database\"" + ], + "create-adaptations": [ + "[verbatim] Section 2(a)(1)(B): \"produce, reproduce, and Share Adapted Material for NonCommercial purposes only.\"" + ] + }, + "conditions": { + "include-copyright": [ + "[verbatim] Section 3(a)(1)(A)(ii): You must retain \"a copyright notice\"" + ], + "attribution": [ + "[verbatim] Section 3(a)(1)(A)(i): retain \"identification of the creator(s)... 
and any others designated to receive attribution\"" + ], + "mark-changes": [ + "[verbatim] Section 3(a)(1)(B): \"indicate if You modified the Licensed Material and retain an indication of any previous modifications\"" + ], + "share-alike": [ + "[verbatim] Section 3(b)(1): Adapter's License \"must be a Creative Commons license with the same License Elements\"" + ], + "non-endorsement": [ + "[verbatim] Section 2(a)(6): no permission to imply You are \"sponsored, endorsed, or granted official status\" by the Licensor" + ], + "license-linking": [ + "[verbatim] Section 3(a)(1)(C): include \"the text of, or the URI or hyperlink to, this Public License\"", + "[verbatim] Section 3(b)(2): include \"the text of, or the URI or hyperlink to, the Adapter's License You apply\"" + ] + }, + "limitations": { + "trademark-use": [ + "[verbatim] Section 2(b)(2): \"Patent and trademark rights are not licensed under this Public License.\"" + ], + "patent-use": [ + "[verbatim] Section 2(b)(2): \"Patent and trademark rights are not licensed under this Public License.\"" + ], + "liability": [ + "[verbatim] Section 5(b): \"in no event will the Licensor be liable to You on any legal theory... for any... 
damages\"" + ], + "warranty": [ + "[verbatim] Section 5(a): Licensor offers the material \"as-is and as-available\" and \"makes no representations or warranties\"" + ] + } + } +} \ No newline at end of file diff --git a/src/licensing/classify/data/examples/CC-BY-ND-2.0.json b/src/licensing/classify/data/examples/CC-BY-ND-2.0.json new file mode 100644 index 0000000..8d0ca68 --- /dev/null +++ b/src/licensing/classify/data/examples/CC-BY-ND-2.0.json @@ -0,0 +1,67 @@ +{ + "spdx": { + "isDeprecatedLicenseId": false, + "isFsfLibre": false, + "name": "Creative Commons Attribution No Derivatives 2.0 Generic", + "licenseId": "CC-BY-ND-2.0", + "isOsiApproved": false + }, + "permissions": [ + "commercial-use", + "distribution", + "private-use" + ], + "conditions": [ + "include-copyright", + "attribution", + "license-linking" + ], + "limitations": [ + "trademark-use", + "liability", + "warranty" + ], + "tags": [ + "domain:content", + "family:CC", + "license:creative-commons", + "notes:attribution-required", + "notes:no-derivatives" + ], + "reasons": { + "permissions": { + "commercial-use": [ + "[verbatim] 3 grants a \"worldwide, royalty-free, non-exclusive, perpetual\" license to exercise rights in the Work", + "[inferred] No NonCommercial restriction appears; absence of NC in this CC license permits commercial use" + ], + "distribution": [ + "[verbatim] 3.b grants the right \"to distribute copies or phonorecords of... 
the Work\"" + ], + "private-use": [ + "[inferred] No clause restricts private use; broad rights to reproduce and exercise rights in the Work cover private use" + ] + }, + "conditions": { + "include-copyright": [ + "[verbatim] 4.b: \"You must keep intact all copyright notices for the Work\"" + ], + "attribution": [ + "[verbatim] 4.b: \"give the Original Author credit reasonable to the medium or means You are utilizing\"" + ], + "license-linking": [ + "[verbatim] 4.a: \"You must include a copy of, or the Uniform Resource Identifier for, this License with every copy\"" + ] + }, + "limitations": { + "trademark-use": [ + "[verbatim] \"neither party will use the trademark \\\"Creative Commons\\\" or any related trademark or logo... without prior written consent\"" + ], + "liability": [ + "[verbatim] 6: \"IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY... DAMAGES\"" + ], + "warranty": [ + "[verbatim] 5: \"LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND\"" + ] + } + } +} \ No newline at end of file diff --git a/src/licensing/classify/data/examples/CC-BY-SA-3.0.json b/src/licensing/classify/data/examples/CC-BY-SA-3.0.json new file mode 100644 index 0000000..9cd1495 --- /dev/null +++ b/src/licensing/classify/data/examples/CC-BY-SA-3.0.json @@ -0,0 +1,89 @@ +{ + "spdx": { + "isDeprecatedLicenseId": false, + "name": "Creative Commons Attribution Share Alike 3.0 Unported", + "licenseId": "CC-BY-SA-3.0", + "isOsiApproved": false + }, + "permissions": [ + "commercial-use", + "modifications", + "distribution", + "private-use", + "create-adaptations" + ], + "conditions": [ + "include-copyright", + "attribution", + "mark-changes", + "share-alike", + "non-endorsement", + "license-linking" + ], + "limitations": [ + "trademark-use", + "liability", + "warranty" + ], + "tags": [ + "domain:content", + "family:CC", + "license:creative-commons", + "notes:attribution-required", + "notes:share-alike" + ], + "reasons": { + 
"permissions": { + "commercial-use": [ + "[inferred] No NC element or field-of-use restriction appears; \"Distribute\" includes transfer \"through sale or other transfer of ownership\"" + ], + "modifications": [ + "[verbatim] Section 3(b): \"to create and Reproduce Adaptations\"", + "[verbatim] \"The above rights include the right to make such modifications as are technically necessary\"" + ], + "distribution": [ + "[verbatim] Section 3(c): \"to Distribute and Publicly Perform the Work including as incorporated in Collections\"", + "[verbatim] Section 3(d): \"to Distribute and Publicly Perform Adaptations\"" + ], + "private-use": [ + "[inferred] Broad reproduction and adaptation rights are granted, and no clause restricts private use" + ], + "create-adaptations": [ + "[verbatim] Section 3(b): \"to create and Reproduce Adaptations\"" + ] + }, + "conditions": { + "include-copyright": [ + "[verbatim] Section 4(c): \"keep intact all copyright notices for the Work\"" + ], + "attribution": [ + "[verbatim] Section 4(c): provide the Original Author name, title, and URI if supplied", + "[verbatim] Section 4(c): include attribution parties designated by Licensor, if any" + ], + "mark-changes": [ + "[verbatim] Section 3(b): Adaptations must \"clearly label, demarcate or otherwise identify that changes were made\"" + ], + "share-alike": [ + "[verbatim] Section 4(b): You may distribute an Adaptation \"only under\" this License, later same-elements CC, or compatible license" + ], + "non-endorsement": [ + "[verbatim] Section 4(c): You may not \"assert or imply any connection with, sponsorship or endorsement\" without permission" + ], + "license-linking": [ + "[verbatim] Section 4(a): include \"a copy of, or the Uniform Resource Identifier (URI) for, this License\"", + "[verbatim] Section 4(b)(I): include \"a copy of, or the URI for, the Applicable License\" with each Adaptation" + ] + }, + "limitations": { + "trademark-use": [ + "[verbatim] CC Notice: no use of the trademark 
\"Creative Commons\" or related marks without prior written consent" + ], + "liability": [ + "[verbatim] Section 6: \"IN NO EVENT WILL LICENSOR BE LIABLE TO YOU... FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL\" damages" + ], + "warranty": [ + "[verbatim] Section 5: \"LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND\"" + ] + } + } +} \ No newline at end of file diff --git a/src/licensing/classify/data/examples/CC0-1.0.json b/src/licensing/classify/data/examples/CC0-1.0.json new file mode 100644 index 0000000..fb02c5e --- /dev/null +++ b/src/licensing/classify/data/examples/CC0-1.0.json @@ -0,0 +1,76 @@ +{ + "spdx": { + "isDeprecatedLicenseId": false, + "isFsfLibre": true, + "name": "Creative Commons Zero v1.0 Universal", + "licenseId": "CC0-1.0", + "isOsiApproved": false + }, + "permissions": [ + "commercial-use", + "modifications", + "distribution", + "private-use", + "data-use", + "create-adaptations" + ], + "conditions": [], + "limitations": [ + "trademark-use", + "patent-use", + "warranty", + "liability" + ], + "tags": [ + "copyleft:none", + "domain:content", + "domain:data", + "family:CC", + "license:creative-commons", + "license:public-domain", + "spdx:fsf-free" + ], + "reasons": { + "permissions": { + "commercial-use": [ + "[verbatim] 2: rights are waived \"for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes\"", + "[verbatim] Statement of Purpose: public may reuse and redistribute \"for any purposes, including without limitation commercial purposes\"" + ], + "modifications": [ + "[verbatim] Statement of Purpose: the public may \"build upon, modify, incorporate in other works\"", + "[verbatim] 1(i): rights include \"the right to reproduce, adapt, distribute... 
and translate a Work\"" + ], + "distribution": [ + "[verbatim] Statement of Purpose: the public may \"reuse and redistribute as freely as possible in any form whatsoever\"", + "[verbatim] 1(i): rights include \"the right to reproduce, adapt, distribute, perform, display, communicate\"" + ], + "private-use": [ + "[verbatim] 2: rights are waived \"for any purpose whatsoever\"", + "[inferred] \"Any purpose whatsoever\" includes private use; no clause excludes personal or internal use" + ], + "data-use": [ + "[verbatim] 1(v): rights include \"the extraction, dissemination, use and reuse of data in a Work\"", + "[verbatim] 1(vi): Copyright and Related Rights include \"database rights\"" + ], + "create-adaptations": [ + "[verbatim] Statement of Purpose: the public may \"build upon, modify, incorporate in other works\"", + "[verbatim] 1(i): rights include \"the right to reproduce, adapt... and translate a Work\"" + ] + }, + "conditions": {}, + "limitations": { + "trademark-use": [ + "[verbatim] 4(a): \"No trademark or patent rights held by Affirmer are waived, abandoned, surrendered, licensed or otherwise affected\"" + ], + "patent-use": [ + "[verbatim] 4(a): \"No trademark or patent rights held by Affirmer are waived, abandoned, surrendered, licensed or otherwise affected\"" + ], + "warranty": [ + "[verbatim] 4(b): \"Affirmer offers the Work as-is and makes no representations or warranties of any kind concerning the Work\"" + ], + "liability": [ + "[verbatim] Intro: \"Creative Commons... 
DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM THE USE OF THIS DOCUMENT\"" + ] + } + } +} \ No newline at end of file diff --git a/src/licensing/classify/data/examples/CDLA-Permissive-2.0.json b/src/licensing/classify/data/examples/CDLA-Permissive-2.0.json new file mode 100644 index 0000000..e2840ff --- /dev/null +++ b/src/licensing/classify/data/examples/CDLA-Permissive-2.0.json @@ -0,0 +1,65 @@ +{ + "spdx": { + "isDeprecatedLicenseId": false, + "name": "Community Data License Agreement Permissive 2.0", + "licenseId": "CDLA-Permissive-2.0", + "isOsiApproved": false + }, + "permissions": [ + "commercial-use", + "modifications", + "distribution", + "private-use", + "data-use", + "create-adaptations" + ], + "conditions": [ + "license-linking" + ], + "limitations": [ + "liability", + "warranty" + ], + "tags": [ + "copyleft:none", + "domain:data", + "domain:software", + "license:open-source", + "notes:custom-terms" + ], + "reasons": { + "permissions": { + "commercial-use": [ + "[inferred] 1.1 grants rights to use, modify, and share Data with no noncommercial or field-of-use restriction" + ], + "modifications": [ + "[verbatim] 1.1: \"A Data Recipient may use, modify, and share the Data made available\"" + ], + "distribution": [ + "[verbatim] 2.1: \"A Data Recipient may share Data, with or without modifications\"" + ], + "private-use": [ + "[inferred] 1.1 grants general rights to use, modify, and share Data, with no restriction excluding private use" + ], + "data-use": [ + "[verbatim] 1.1: \"A Data Recipient may use... 
the Data made available by Data Provider(s)\"" + ], + "create-adaptations": [ + "[verbatim] 2.1: \"A Data Recipient may share Data, with or without modifications\"" + ] + }, + "conditions": { + "license-linking": [ + "[verbatim] 2.1: share Data only if the recipient \"makes available the text of this agreement with the shared Data\"" + ] + }, + "limitations": { + "liability": [ + "[verbatim] 4.1: \"NO DATA PROVIDER SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES\"" + ], + "warranty": [ + "[verbatim] 4.1: \"THE DATA IS PROVIDED ON AN \\\"AS IS\\\" BASIS, WITHOUT REPRESENTATIONS, WARRANTIES OR CONDITIONS OF ANY KIND\"" + ] + } + } +} \ No newline at end of file diff --git a/src/licensing/classify/data/examples/etalab-2.0.json b/src/licensing/classify/data/examples/etalab-2.0.json new file mode 100644 index 0000000..7b2bf3d --- /dev/null +++ b/src/licensing/classify/data/examples/etalab-2.0.json @@ -0,0 +1,78 @@ +{ + "spdx": { + "isDeprecatedLicenseId": false, + "name": "Etalab Open License 2.0", + "licenseComments": "English translation can be found here: https://www.etalab.gouv.fr/wp-content/uploads/2018/11/open-licence.pdf", + "comment": "English translation can be found here: https://www.etalab.gouv.fr/wp-content/uploads/2018/11/open-licence.pdf", + "licenseId": "etalab-2.0", + "isOsiApproved": false + }, + "permissions": [ + "commercial-use", + "modifications", + "distribution", + "private-use", + "data-use", + "create-adaptations" + ], + "conditions": [ + "attribution", + "non-endorsement" + ], + "limitations": [ + "liability", + "warranty" + ], + "tags": [ + "copyleft:none", + "copyleft:permissive", + "domain:content", + "domain:data", + "license:government-open-license", + "notes:attribution-required", + "notes:government-open-license", + "region:FR" + ], + "reasons": { + "permissions": { + "commercial-use": [ + "[verbatim] \"à des fins commerciales ou non\"", + "[verbatim] \"de l’exploiter à titre 
commercial\"" + ], + "modifications": [ + "[verbatim] \"de l’adapter, la modifier, l’extraire et la transformer\"" + ], + "distribution": [ + "[verbatim] \"de la communiquer, la diffuser, la redistribuer, la publier et la transmettre\"" + ], + "private-use": [ + "[inferred] Broad right of libre \"Réutilisation\" for commercial or non-commercial purposes includes private use" + ], + "data-use": [ + "[verbatim] \"libre « Réutilisation » de l’« Information »\"", + "[verbatim] \"informations publiques\"", + "[verbatim] \"droit sui generis des producteurs de bases de données\"" + ], + "create-adaptations": [ + "[verbatim] \"pour créer des « Informations dérivées », des produits ou des services\"" + ] + }, + "conditions": { + "attribution": [ + "[verbatim] \"mentionner la paternité de l’« Information » : sa source... et la date de dernière mise à jour\"" + ], + "non-endorsement": [ + "[verbatim] \"ne doit pas suggérer une quelconque reconnaissance ou caution par le « Concédant »\"" + ] + }, + "limitations": { + "liability": [ + "[verbatim] \"Il ne peut être tenu pour responsable de toute perte, préjudice ou dommage\"" + ], + "warranty": [ + "[verbatim] \"sans autre garantie expresse ou tacite\"", + "[verbatim] \"L’absence de défauts ou d’erreurs... 
n’est pas garantie\"" + ] + } + } +} \ No newline at end of file diff --git a/src/licensing/classify/data/rules.json b/src/licensing/classify/data/rules.json new file mode 120000 index 0000000..abb1252 --- /dev/null +++ b/src/licensing/classify/data/rules.json @@ -0,0 +1 @@ +../../../../data/rules.json \ No newline at end of file diff --git a/src/licensing/classify/data/tags.json b/src/licensing/classify/data/tags.json new file mode 120000 index 0000000..fcb0b14 --- /dev/null +++ b/src/licensing/classify/data/tags.json @@ -0,0 +1 @@ +../../../../data/tags.json \ No newline at end of file diff --git a/tests/test_classify_license.py b/tests/test_classify_license.py index ff29c3f..dbb5dc6 100644 --- a/tests/test_classify_license.py +++ b/tests/test_classify_license.py @@ -12,9 +12,13 @@ from licensing.classify.classify_license import ( _extract_json_obj, + _select_diverse_examples, + format_few_shot_block, + load_classified_examples, load_non_spdx_from_file, load_rules, load_spdx_license, + load_system_prompt, load_tags, normalize_classification, ) @@ -402,3 +406,243 @@ def test_disable_llm_flag_returns_empty_classification(self, spdx_json_file): assert data["permissions"] == [] assert data["conditions"] == [] assert data["limitations"] == [] + + +# --------------------------------------------------------------------------- +# load_classified_examples +# --------------------------------------------------------------------------- + +def _make_license_file(tmp_path, name, permissions, conditions, limitations, reasons=None): + """Helper to write a minimal license JSON with optional reasons.""" + data = { + "spdx": {"licenseId": name, "licenseText": f"License text for {name}"}, + "permissions": permissions, + "conditions": conditions, + "limitations": limitations, + "reasons": reasons or {}, + } + path = tmp_path / f"{name}.json" + path.write_text(json.dumps(data), encoding="utf-8") + return path + + +SAMPLE_REASONS = { + "permissions": {"commercial-use": ["[verbatim] §1: 
use for any purpose"]}, + "conditions": {"include-copyright": ["[verbatim] §2: retain copyright notice"]}, + "limitations": {"warranty": ["[verbatim] §3: provided as-is"]}, +} + + +class TestLoadClassifiedExamples: + def test_returns_examples_with_reasons(self, tmp_path): + _make_license_file(tmp_path, "MIT", ["commercial-use"], ["include-copyright"], ["warranty"], SAMPLE_REASONS) + results = load_classified_examples(licenses_dir=tmp_path) + assert len(results) == 1 + assert results[0]["license_id"] == "MIT" + + def test_skips_files_without_reasons(self, tmp_path): + _make_license_file(tmp_path, "NO-REASONS", ["commercial-use"], [], [], reasons={}) + results = load_classified_examples(licenses_dir=tmp_path) + assert results == [] + + def test_skips_files_with_empty_reasons_dict(self, tmp_path): + _make_license_file(tmp_path, "EMPTY", [], [], [], reasons={"permissions": {}, "conditions": {}, "limitations": {}}) + results = load_classified_examples(licenses_dir=tmp_path) + assert results == [] + + def test_excludes_current_license(self, tmp_path): + _make_license_file(tmp_path, "MIT", ["commercial-use"], [], [], SAMPLE_REASONS) + _make_license_file(tmp_path, "Apache-2.0", ["commercial-use"], [], [], SAMPLE_REASONS) + results = load_classified_examples(exclude_id="MIT", licenses_dir=tmp_path) + ids = [r["license_id"] for r in results] + assert "MIT" not in ids + assert "Apache-2.0" in ids + + def test_respects_max_examples(self, tmp_path): + for name in ["AAA", "BBB", "CCC", "DDD"]: + _make_license_file(tmp_path, name, ["commercial-use"], [], [], SAMPLE_REASONS) + results = load_classified_examples(max_examples=2, licenses_dir=tmp_path) + assert len(results) == 2 + + def test_max_examples_zero_returns_empty(self, tmp_path): + _make_license_file(tmp_path, "MIT", ["commercial-use"], [], [], SAMPLE_REASONS) + results = load_classified_examples(max_examples=0, licenses_dir=tmp_path) + assert results == [] + + def test_empty_dir_returns_empty(self, tmp_path): + results = 
load_classified_examples(licenses_dir=tmp_path) + assert results == [] + + def test_result_contains_expected_fields(self, tmp_path): + _make_license_file(tmp_path, "MIT", ["commercial-use"], ["include-copyright"], ["warranty"], SAMPLE_REASONS) + result = load_classified_examples(licenses_dir=tmp_path)[0] + assert result["license_id"] == "MIT" + assert result["permissions"] == ["commercial-use"] + assert result["conditions"] == ["include-copyright"] + assert result["limitations"] == ["warranty"] + assert "reasons" in result + + def test_skips_malformed_json(self, tmp_path): + (tmp_path / "bad.json").write_text("not json", encoding="utf-8") + _make_license_file(tmp_path, "MIT", ["commercial-use"], [], [], SAMPLE_REASONS) + results = load_classified_examples(licenses_dir=tmp_path) + assert len(results) == 1 + + +# --------------------------------------------------------------------------- +# format_few_shot_block +# --------------------------------------------------------------------------- + +class TestFormatFewShotBlock: + def test_empty_returns_placeholder(self): + result = format_few_shot_block([]) + assert "No worked examples" in result + + def test_renders_license_id(self): + examples = [{"license_id": "MIT", "permissions": ["commercial-use"], "conditions": [], "limitations": [], "reasons": {}}] + result = format_few_shot_block(examples) + assert "### Example: MIT" in result + + def test_renders_permissions_line(self): + examples = [{"license_id": "X", "permissions": ["commercial-use", "modifications"], "conditions": [], "limitations": [], "reasons": {}}] + result = format_few_shot_block(examples) + assert "commercial-use, modifications" in result + + def test_none_permissions_renders_none(self): + examples = [{"license_id": "X", "permissions": [], "conditions": [], "limitations": [], "reasons": {}}] + result = format_few_shot_block(examples) + assert "Permissions: none" in result + + def test_renders_reason_evidence(self): + reasons = {"permissions": 
{"commercial-use": ["[verbatim] §1: any use"]}, "conditions": {}, "limitations": {}} + examples = [{"license_id": "X", "permissions": ["commercial-use"], "conditions": [], "limitations": [], "reasons": reasons}] + result = format_few_shot_block(examples) + assert "[permissions] commercial-use" in result + assert "[verbatim] §1: any use" in result + + def test_evidence_truncated_to_160_chars(self): + long_ev = "A" * 200 + reasons = {"permissions": {"commercial-use": [long_ev]}, "conditions": {}, "limitations": {}} + examples = [{"license_id": "X", "permissions": ["commercial-use"], "conditions": [], "limitations": [], "reasons": reasons}] + result = format_few_shot_block(examples) + assert "A" * 161 not in result + assert "A" * 160 in result + + def test_multiple_examples_separated(self): + ex = {"license_id": "X", "permissions": [], "conditions": [], "limitations": [], "reasons": {}} + result = format_few_shot_block([ex, {**ex, "license_id": "Y"}]) + assert "### Example: X" in result + assert "### Example: Y" in result + + +# --------------------------------------------------------------------------- +# main() — few-shot integration +# --------------------------------------------------------------------------- + +class TestMainFewShot: + def test_max_examples_zero_accepted(self, spdx_json_file): + output = run_main([str(spdx_json_file), "--max-examples", "0"]) + data = json.loads(output) + assert "permissions" in data + + def test_max_examples_default_applied(self, spdx_json_file): + """main() should not raise when --max-examples uses its default.""" + output = run_main([str(spdx_json_file)]) + data = json.loads(output) + assert "permissions" in data + + +# --------------------------------------------------------------------------- +# _select_diverse_examples +# --------------------------------------------------------------------------- + +class TestSelectDiverseExamples: + def _make(self, license_id, permissions, conditions=None, limitations=None): + return { + 
"license_id": license_id, + "permissions": permissions, + "conditions": conditions or [], + "limitations": limitations or [], + "reasons": {}, + } + + def test_returns_all_when_below_limit(self): + exs = [self._make("A", ["p1"]), self._make("B", ["p2"])] + assert _select_diverse_examples(exs, 5) == exs + + def test_returns_exact_when_equal(self): + exs = [self._make("A", ["p1"]), self._make("B", ["p2"])] + assert len(_select_diverse_examples(exs, 2)) == 2 + + def test_selects_n_from_larger_pool(self): + exs = [self._make(str(i), [f"p{i}"]) for i in range(10)] + result = _select_diverse_examples(exs, 3) + assert len(result) == 3 + + def test_prefers_complementary_profiles(self): + # A and C share p1; B adds p2; D adds p3 — best pair is B+D or A+B or A+D + a = self._make("A", ["p1"]) + b = self._make("B", ["p1", "p2"]) + c = self._make("C", ["p1"]) + d = self._make("D", ["p3"]) + result = _select_diverse_examples([a, b, c, d], 2) + ids = {r["license_id"] for r in result} + # B has most rules; D adds the most new coverage after B + assert "B" in ids + assert "D" in ids + + def test_no_duplicates_in_result(self): + exs = [self._make(str(i), ["p1", "p2"]) for i in range(6)] + result = _select_diverse_examples(exs, 3) + ids = [r["license_id"] for r in result] + assert len(ids) == len(set(ids)) + + +# --------------------------------------------------------------------------- +# Bundled package data defaults +# --------------------------------------------------------------------------- + +class TestBundledDefaults: + def test_load_rules_no_path_uses_bundled(self): + rules = load_rules() + assert "commercial-use" in rules["permissions"] + assert "include-copyright" in rules["conditions"] + assert "liability" in rules["limitations"] + + def test_load_tags_no_path_uses_bundled(self): + tags = load_tags() + assert any(t.startswith("license:") for t in tags) + assert any(t.startswith("domain:") for t in tags) + + def test_load_system_prompt_no_path_uses_bundled(self): + prompt 
= load_system_prompt() + assert "permissions" in prompt + # placeholder exists but is only substituted when main() passes the mapping + assert "few_shot_examples" in prompt + + def test_load_system_prompt_substitutes_mapping(self): + prompt = load_system_prompt(mapping={"few_shot_examples": "INJECTED"}) + assert "INJECTED" in prompt + assert "{few_shot_examples}" not in prompt + + def test_load_classified_examples_no_dir_uses_bundled(self): + examples = load_classified_examples() + assert len(examples) > 0 + for ex in examples: + assert "license_id" in ex + assert "permissions" in ex + assert "reasons" in ex + + def test_bundled_examples_are_diverse(self): + examples = load_classified_examples(max_examples=5) + # Diversity: the selection must mix examples with commercial-use and without it (e.g. NC licenses) + has_commercial = [e for e in examples if "commercial-use" in e["permissions"]] + missing_commercial = [e for e in examples if "commercial-use" not in e["permissions"]] + assert len(has_commercial) > 0 + assert len(missing_commercial) > 0 + + def test_bundled_examples_exclude_current(self): + examples = load_classified_examples(exclude_id="CC-BY-NC-4.0") + ids = [e["license_id"] for e in examples] + assert "CC-BY-NC-4.0" not in ids +