Skip to content

Commit fb3b656

Browse files
authored
feat(medcat): CU-869azeyvz Add scripts download CLI (#206)
* CU-869azeyvz: Add download scripts * CU-869azeyvz: Add medcat-scripts tag along with a lib release * CU-869azeyvz: Add fetch depth to checkout * CU-869azeyvz: Add logging to download scripts * CU-869azeyvz: Unify 2 usage messages * CU-869azeyvz: Linting changes * CU-869azeyvz: Avoid creating implicit medcat-scripts folder. Instead, add everything directly to the folder specified
1 parent 92021a3 commit fb3b656

File tree

3 files changed

+136
-0
lines changed

3 files changed

+136
-0
lines changed

.github/workflows/medcat-v2_release.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ jobs:
2424
steps:
2525
- name: Checkout repository
2626
uses: actions/checkout@v5
27+
with:
28+
fetch-depth: 0
2729

2830
- name: Extract version tag and checkout release branch
2931
id: extract
@@ -198,3 +200,11 @@ jobs:
198200
uses: pypa/gh-action-pypi-publish@release/v1
199201
with:
200202
packages-dir: medcat-v2/dist
203+
204+
- name: Create tag for medcat-scripts
  # The tag prefix must be spelled "medcat-scripts/v" exactly: the
  # download-scripts CLI looks tags up by that prefix.
  run: |
    git tag medcat-scripts/v${{ needs.build.outputs.version_only }}
    git push origin medcat-scripts/v${{ needs.build.outputs.version_only }}
  env:
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
210+

medcat-v2/medcat/__main__.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import sys
2+
3+
4+
_DL_SCRIPTS_USAGE = (
    "Usage: python -m medcat download-scripts [DEST] [log_level]")


def main(*args: str):
    """Dispatch the ``python -m medcat`` command line.

    Only the ``download-scripts`` sub-command is supported. Missing or
    unknown sub-commands, as well as an unknown log level name, print the
    usage message to stderr and exit with status 1.

    Args:
        *args (str): Command line arguments without the program name,
            i.e. ``download-scripts [DEST] [log_level]``. DEST defaults
            to the current directory.
    """
    if not args or args[0] != "download-scripts":
        print(_DL_SCRIPTS_USAGE, file=sys.stderr)
        sys.exit(1)

    dest = args[1] if len(args) > 1 else "."
    kwargs = {}
    if len(args) > 2:
        import logging
        log_level = args[2].upper()
        # getLevelName() maps a known level *name* to its numeric value and
        # returns a "Level <name>" string for anything it does not know.
        # Checking here avoids a ValueError traceback from setLevel later.
        if not isinstance(logging.getLevelName(log_level), int):
            print(f"Unknown log level: {args[2]}", file=sys.stderr)
            print(_DL_SCRIPTS_USAGE, file=sys.stderr)
            sys.exit(1)
        kwargs["log_level"] = log_level

    # Imported lazily so that argument errors are reported without loading
    # the downloader; aliased so it does not shadow this function's name.
    from medcat.utils.download_scripts import main as download_scripts
    download_scripts(dest, **kwargs)


if __name__ == "__main__":
    main(*sys.argv[1:])
medcat-v2/medcat/utils/download_scripts.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
"""This module is designed to identify and download the medcat-scripts.
2+
3+
It will take the current setup (i.e. the medcat version) into account and
4+
subsequently identify and download the medcat-scripts based on the most
5+
recent applicable tag. So if you've got medcat==2.2.0, it might grab
6+
medcat-scripts/v2.2.3 for instance.
7+
"""
8+
import importlib.metadata
9+
import tempfile
10+
import zipfile
11+
from pathlib import Path
12+
import requests
13+
import logging
14+
15+
16+
# Module-level logger for the download helpers.
logger = logging.getLogger(__name__)


# Repository that hosts the scripts, as "<owner>/<name>".
GITHUB_REPO = "CogStack/cogstack-nlp"
# Folder inside the repository archive that holds the scripts.
SCRIPTS_PATH = "medcat-scripts/"
# GitHub zipball endpoint; fill in ``{tag}`` with ``str.format``.
DOWNLOAD_URL_TEMPLATE = (
    "https://api.github.com/repos/" + GITHUB_REPO + "/zipball/{tag}"
)
24+
25+
26+
def _get_medcat_version() -> str:
    """Look up the installed ``medcat`` distribution's 'major.minor' version.

    Returns:
        str: The first two dot-separated components of the installed
            version joined by a dot, e.g. "2.2".
    """
    components = importlib.metadata.version("medcat").split(".")
    # Unpacking (rather than indexing) keeps the ValueError for versions
    # that have fewer than two components.
    major, minor = components[:2]
    return ".".join((major, minor))
31+
32+
33+
def _tag_version_key(tag_name: str) -> tuple[int, ...]:
    """Turn 'medcat-scripts/v2.2.10' (or 'v2.2.10') into (2, 2, 10)."""
    version = tag_name.rsplit("/", 1)[-1].removeprefix("v")
    key = []
    for piece in version.split("."):
        # Keep only the leading digits so that e.g. '3rc1' compares as 3.
        digits = piece[:len(piece) - len(piece.lstrip("0123456789"))]
        # Pieces without a leading number sort below any real number.
        key.append(int(digits) if digits else -1)
    return tuple(key)


def _find_latest_scripts_tag(major_minor: str) -> str:
    """Query for the newest medcat-scripts tag matching 'v{major_minor}.*'.

    Both 'medcat-scripts/v{major_minor}.*' and plain 'v{major_minor}.*'
    tags are considered. The tag with the highest version number wins, so
    the result does not depend on the order the API lists tags in.

    Args:
        major_minor (str): The 'major.minor' MedCAT version, e.g. "2.2".

    Raises:
        requests.HTTPError: If the GitHub API responds with an error
            status (e.g. when rate limited).
        RuntimeError: If no tag matches the given version.

    Returns:
        str: The name of the best matching tag.
    """
    prefixes = (f"medcat-scripts/v{major_minor}.", f"v{major_minor}.")
    url = f"https://api.github.com/repos/{GITHUB_REPO}/tags"
    params = {"per_page": 100}

    matching: list[str] = []
    while url:
        response = requests.get(url, params=params, timeout=15)
        # Fail loudly on error payloads instead of iterating over them.
        response.raise_for_status()
        matching.extend(
            t["name"]
            for t in response.json()
            if t["name"].startswith(prefixes)
        )
        # Follow pagination; the "next" link already carries the query.
        url = response.links.get("next", {}).get("url")
        params = None

    if not matching:
        raise RuntimeError(
            f"No medcat-scripts tags found for MedCAT {major_minor}.x")

    return max(matching, key=_tag_version_key)
50+
51+
52+
def fetch_scripts(destination: str | Path = ".") -> Path:
    """Download the latest compatible medcat-scripts into a folder.

    The contents of the repository's medcat-scripts folder are written
    directly into the destination (no extra 'medcat-scripts' sub-folder
    is created).

    Args:
        destination (str | Path): The destination path. Defaults to ".".

    Returns:
        Path: The path of the scripts.
    """
    dest = Path(destination).expanduser().resolve()
    dest.mkdir(parents=True, exist_ok=True)

    version = _get_medcat_version()
    tag = _find_latest_scripts_tag(version)

    logger.info("Fetching scripts for MedCAT %s → tag %s",
                version, tag)

    zip_url = DOWNLOAD_URL_TEMPLATE.format(tag=tag)
    scripts_dir = SCRIPTS_PATH.strip("/")

    # The temporary directory (and the archive in it) is removed on exit,
    # even if the download or the extraction fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        zip_path = Path(tmp_dir) / "medcat-scripts.zip"

        # Download the GitHub auto-generated zipball
        with requests.get(zip_url, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(zip_path, "wb") as tmp:
                for chunk in r.iter_content(chunk_size=8192):
                    tmp.write(chunk)

        # Extract only medcat-scripts/ from the archive
        with zipfile.ZipFile(zip_path) as zf:
            for info in zf.infolist():
                parts = [p for p in info.filename.split("/") if p]
                # parts[0] is the repo-hash prefix; the scripts folder
                # must come directly after it.
                if len(parts) < 2 or parts[1] != scripts_dir:
                    continue
                relative = parts[2:]
                if ".." in relative:
                    # Never write outside of the destination folder.
                    logger.warning(
                        "Skipping unsafe archive entry: %s", info.filename)
                    continue
                target = dest.joinpath(*relative)
                if info.is_dir():
                    target.mkdir(parents=True, exist_ok=True)
                else:
                    # The archive may not list parent folders explicitly.
                    target.parent.mkdir(parents=True, exist_ok=True)
                    target.write_bytes(zf.read(info))

    logger.info("Scripts extracted to: %s", dest)
    return dest
94+
95+
96+
def main(destination: str = ".",
         log_level: int | str = logging.INFO):
    """Configure this module's logger and download the scripts.

    Args:
        destination (str): Folder to download into. Defaults to ".".
        log_level (int | str): Level set on the module logger.
            Defaults to logging.INFO.
    """
    logger.setLevel(log_level)
    already_configured = bool(logger.handlers)
    if not already_configured:
        # Attach a stream handler only once so that repeated calls do
        # not duplicate the output.
        logger.addHandler(logging.StreamHandler())
    fetch_scripts(destination)

0 commit comments

Comments
 (0)