-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathlanguage_registry_to_json.py
More file actions
105 lines (85 loc) · 3.06 KB
/
language_registry_to_json.py
File metadata and controls
105 lines (85 loc) · 3.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
language_registry_to_json.py
Reads the IANA language subtag registry file and extracts language subtag
information, writing it to a JSON file.
The output JSON maps lowercase subtag strings to lists of description strings,
for example:
{
"en": ["English"],
"fr": ["French"],
"zh": ["Chinese"],
...
}
(To shrink the file, the JSON is compacted and not human-readable like above.)
Usage:
python language_registry_to_json.py
python language_registry_to_json.py --registry language-subtag-registry
python language_registry_to_json.py --registry language-subtag-registry --output language-subtag-registry.json
"""
import argparse
import json
import os
import sys
def parse_registry(registry_path):
    """
    Parse the IANA language subtag registry file.

    The file is a sequence of records separated by "%%"; each record is a
    list of "Name: value" fields.  A long field value may be folded onto
    following lines that begin with whitespace; such continuation lines are
    joined back onto the field they belong to (separated by a single space)
    before the field is interpreted.

    Args:
        registry_path: path to the registry file, read as UTF-8.

    Returns:
        A dict mapping lowercase subtag -> list of description strings (in
        file order).  Only records with Type: language are included.

    Exits the process via sys.exit() with an error message if opening or
    reading the file raises OSError.
    """
    try:
        with open(registry_path, "r", encoding="utf-8") as f:
            content = f.read()
    except OSError as e:
        sys.exit(f"error: cannot open registry file {registry_path}: {e}")

    lang_map = {}
    for record in content.split("%%"):
        record = record.strip()
        if not record:
            continue

        # First pass: unfold the record into [name, value] pairs so that a
        # folded value is seen whole, and so that text on a continuation
        # line is never mistaken for a field of its own.
        fields = []
        current = None
        for line in record.split("\n"):
            if line[:1] in (" ", "\t"):
                # Continuation of the previous field's value.
                extra = line.strip()
                if current is not None and extra:
                    current[1] = f"{current[1]} {extra}".strip()
                continue
            name, sep, value = line.partition(":")
            if not sep:
                # Not a field line; anything folded under it is ignored too.
                current = None
                continue
            current = [name.strip(), value.strip()]
            fields.append(current)

        # Second pass: pick out the fields this script cares about.
        record_type = ""
        subtag = ""
        descriptions = []
        for name, value in fields:
            if name == "Type":
                record_type = value
            elif name == "Subtag":
                subtag = value
            elif name == "Description":
                descriptions.append(value)

        if record_type == "language" and subtag:
            lang_map[subtag.lower()] = descriptions
    return lang_map
def main():
    """
    Command-line entry point: convert the registry file to compact JSON.

    Accepts --registry (input path) and --output (JSON path); both default
    to files named "language-subtag-registry" and
    "language-subtag-registry.json" next to this script.  Prints the number
    of extracted subtags and the output path on success.

    Exits the process via sys.exit() with an error message if the output
    file cannot be opened or written (OSError), matching how an unreadable
    registry file is reported.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))
    default_registry = os.path.join(script_dir, "language-subtag-registry")
    default_output = os.path.join(script_dir, "language-subtag-registry.json")

    parser = argparse.ArgumentParser(
        description="Convert IANA language subtag registry to JSON"
    )
    parser.add_argument(
        "--registry",
        default=default_registry,
        help="path to language-subtag-registry file (default: %(default)s)",
    )
    parser.add_argument(
        "--output",
        default=default_output,
        help="path to output JSON file (default: %(default)s)",
    )
    args = parser.parse_args()

    lang_map = parse_registry(args.registry)
    print(f"extracted {len(lang_map)} language subtags")

    try:
        with open(args.output, "w", encoding="utf-8") as f:
            # We can't force to ASCII - some language names contain non-ASCII.
            # Change the separators to shrink file (~8% at time of writing).
            json.dump(lang_map, f, ensure_ascii=False, separators=(',', ':'))
            f.write("\n")
    except OSError as e:
        # Report an unwritable output path as a clean one-line error rather
        # than an uncaught traceback.
        sys.exit(f"error: cannot write output file {args.output}: {e}")
    print(f"wrote {args.output}")
# Run the converter only when this file is executed as a script (not when it
# is imported); main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())