Skip to content

Commit 75324b8

Browse files
committed Sep 24, 2019
initial commit
0 parents  commit 75324b8

5 files changed

+919
-0
lines changed
 

‎.vscode/settings.json

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"python.pythonPath": "/home/pvh/anaconda3/envs/parse_accessions/bin/python"
3+
}

‎README.md

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
This project contains two sets of code:
2+
3+
Firstly, a scraper for the accession rules described here: https://www.ncbi.nlm.nih.gov/Sequin/acc.html, called
4+
`scrape_accession_rules.py`. This can be run as
5+
6+
```
7+
scrape_accession_rules.py ncbi_accession_rules.json
8+
```
9+
10+
to scrape the rules at that website and save them in JSON format. The file generated by this parser is
11+
used in the second script, `parse_ncbi_accession.py` which can be used as a Python module but also
12+
as a command line tool e.g. `parse_ncbi_accession.py ncbi_accession_rules.json AE014297`.

‎ncbi_accession_rules.json

+760
Large diffs are not rendered by default.

‎parse_ncbi_accession.py

+89
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
#!/usr/bin/env python
2+
3+
import json
4+
import re
5+
6+
class RuleMatcher(object):
    """Base class for accession-prefix rule matchers.

    Subclasses must override matches(); only RangeMatcher is instantiated in
    this module, plain string prefixes are compared directly by the caller.
    """

    def matches(self, accession):
        """Return True if *accession* is covered by this rule.

        BUG FIX: the original body was `pass`, silently returning None for a
        subclass that forgot to override; raise instead so the mistake is loud.
        (Also renamed the non-idiomatic `this` parameter to `self`.)
        """
        raise NotImplementedError
9+
10+
class RangeMatcher(RuleMatcher):
    """Matches accessions whose letter prefix falls in a range like AAAA-AZZZ.

    *prefix* is the literal common prefix of the range's start and end;
    *suffix_length* is the number of remaining capital-letter positions.
    """

    def __init__(self, prefix, suffix_length):
        # BUG FIX: the original pattern was the raw literal
        # r'{prefix}[A-Z]{suffix_length}' — never interpolated, so it matched
        # nothing useful. Build the pattern from the actual values; the regex
        # repetition braces must be escaped as {{ }}.
        self.re = re.compile('{}[A-Z]{{{}}}'.format(prefix, suffix_length))

    def matches(self, accession):
        """Return True if *accession* starts with the range's letter prefix.

        BUG FIX: the original declared the first parameter as `this` but read
        `self.re`, which raised NameError on every call.
        """
        return self.re.match(accession) is not None
17+
18+
def make_range_matcher(spec):
    """Build a RangeMatcher from a range specifier such as 'AAAA-AZZZ'.

    The start and end of the range are separated by '-' or '_'. The common
    leading characters become the literal prefix; the remaining positions are
    matched as capital letters.

    Returns a (total_length, RangeMatcher) tuple, where total_length is the
    number of letters in a matching accession prefix.

    Raises ValueError if the specifier has no separator, does not end with
    'Z', or has no varying suffix after the common prefix.
    """
    if '-' in spec:
        (start, end) = spec.split('-')
    elif '_' in spec:
        (start, end) = spec.split('_')
    else:
        raise ValueError('require specification with - or _ in it, got: ' + spec)
    # BUG FIX: these checks were `assert`s, which are stripped under
    # `python -O`; raise ValueError so malformed specifiers are always rejected.
    if not end.endswith('Z'):
        raise ValueError(f'Range specifier has unknown format: {spec}')
    # Count the common leading characters of start and end; zip also protects
    # against an end string shorter than start (the original indexed end[i]).
    prefix_length = 0
    for start_char, end_char in zip(start, end):
        if start_char != end_char:
            break
        prefix_length += 1
    if prefix_length >= len(start):
        raise ValueError(f'Prefix not found in range specifier {spec}')
    suffix_length = len(start) - prefix_length
    return (prefix_length + suffix_length, RangeMatcher(start[:prefix_length], suffix_length))
33+
34+
35+
def build_accession_parser(rules_file):
    """Load accession rules from a JSON file object and index them by prefix length.

    Each rule in the file is [prefix_list, database, type_description]. Plain
    prefixes are stored as (prefix, database, type_description) tuples; range
    specifiers (containing '-' or '_') are compiled into RangeMatcher objects
    via make_range_matcher.

    Returns a dict mapping prefix length -> list of rule tuples.
    """
    rules_data = json.load(rules_file)
    rules_by_prefix_len = {}
    for prefix_list, database, type_description in rules_data:
        for prefix in prefix_list:
            if '-' in prefix or '_' in prefix:
                # Range specifier, e.g. 'AAAA-AZZZ': compile to a matcher and
                # key on the full matched length it reports.
                (prefix_length, matcher) = make_range_matcher(prefix)
                entry = (matcher, database, type_description)
            else:
                prefix_length = len(prefix)
                entry = (prefix, database, type_description)
            # setdefault collapses the duplicated "create list if missing"
            # logic the original repeated in both branches.
            rules_by_prefix_len.setdefault(prefix_length, []).append(entry)
    return rules_by_prefix_len
51+
52+
letter_re = re.compile('[A-Z]+')

def match_accession(accession, rules_by_prefix_len):
    """Look up the (database, type_description) for an accession number.

    RefSeq accessions (third character '_') are handled specially; otherwise
    the leading run of capital letters is matched against the rules whose
    prefix length equals that run's length.

    Returns a (database, type_description) tuple, or None if no rule matches.
    Raises ValueError if the accession does not start with 1-6 capital letters,
    and KeyError if no rules exist for the observed prefix length.
    """
    # BUG FIX: use a slice instead of accession[2] so accessions shorter than
    # three characters no longer raise IndexError.
    if accession[2:3] == '_':
        return ('NCBI', 'RefSeq')
    letter_match = letter_re.match(accession)
    if letter_match is None:
        raise ValueError('an accession number must start with at least one capital letter, this does not: ' + accession)
    letter_prefix = letter_match.group(0)
    letter_match_length = len(letter_prefix)
    # Classify by the number of leading letters per the NCBI accession format.
    # (The unreachable `length == 0` branch was removed: the '+' in the regex
    # guarantees at least one letter when a match object exists.)
    # NOTE(review): accession_type is computed but not currently returned;
    # kept for the length validation and future use.
    if letter_match_length < 3:
        accession_type = 'nucleotide'
    elif letter_match_length < 4:
        accession_type = 'protein'
    elif letter_match_length == 4 or letter_match_length == 6:
        accession_type = 'WGS'
    elif letter_match_length == 5:
        accession_type = 'MGA'
    else:
        # BUG FIX: the original message appended the (still empty)
        # accession_type instead of the offending accession.
        raise ValueError('an accession number must start with less than 7 capital letters, this does not: ' + accession)

    rules = rules_by_prefix_len[letter_match_length]
    for (matcher, database, type_description) in rules:
        if (isinstance(matcher, RuleMatcher) and matcher.matches(accession)) or letter_prefix == matcher:
            return (database, type_description)
    # Explicitly signal "no rule matched" (the original fell off the end).
    return None
81+
82+
if __name__ == "__main__":
    import argparse

    # Command-line entry point: look up a single accession against a saved
    # rules file and print the (database, type_description) result.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('rules_data_file', type=argparse.FileType())
    arg_parser.add_argument('accession')
    options = arg_parser.parse_args()

    prefix_rules = build_accession_parser(options.rules_data_file)
    print(match_accession(options.accession, prefix_rules))

‎scrape_accession_rules.py

+55
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#!/usr/bin/env python
2+
3+
import argparse
4+
import json
5+
from typing import List, TextIO
6+
7+
from bs4 import BeautifulSoup
8+
import requests
9+
10+
11+
# def iterate_id_range(spec: str) -> List[str]:
12+
# (start, end) = spec.split('-')
13+
# assert end.endswith('Z'), f'Range specifier has unknown format: {spec}'
14+
# for i in range(len(start)):
15+
# if start[i] != end[i]:
16+
# break
17+
# prefix_length = i
18+
# assert prefix_length < len(start), f'Prefix not found in range specifier {spec}'
19+
# suffix_length = len(start) - prefix_length
20+
21+
def parse_rules(text: str) -> List[List[str]]:
    """parse_rules(text: str):
    parse the rules from the NCBI webpage.

    Scans every table with cellpadding='3', skipping the header row, and
    returns a list of [prefix_list, database, type_description] entries where
    prefix_list is a list of one or more prefix strings.
    """
    soup = BeautifulSoup(text, 'html.parser')
    rules = []
    for table in soup.find_all('table', cellpadding='3'):
        for row in table.find_all('tr'):
            (prefix, database, type_description, _) = row.text.split('\n')
            if prefix.strip() == 'Prefix':
                # header row — no data here
                continue
            # A cell may list several comma-separated prefixes.
            prefixes = [p.strip() for p in prefix.split(',')] if ',' in prefix else [prefix]
            rules.append([prefixes, database, type_description])
    return rules
37+
38+
39+
def fetch(url='https://www.ncbi.nlm.nih.gov/Sequin/acc.html') -> str:
    """Download *url* and return the response body as text.

    Raises requests.exceptions.RequestException when the response status is
    anything other than OK.
    """
    response = requests.get(url)
    if response.status_code != requests.codes['ok']:
        raise requests.exceptions.RequestException(f'Failed to download {url}', response=response)
    return response.text
45+
46+
47+
def save_data(data: List[List[str]], output: TextIO) -> None:
    """Write *data* to *output* as indented JSON, then close the stream."""
    serialized = json.dumps(data, indent=2)
    output.write(serialized)
    output.close()
50+
51+
if __name__ == "__main__":
    # Scrape the NCBI accession rules page and dump the rules as JSON.
    arg_parser = argparse.ArgumentParser(description='Save rules from NCBI website')
    arg_parser.add_argument('output_file', type=argparse.FileType('w'))
    options = arg_parser.parse_args()
    save_data(parse_rules(fetch()), options.output_file)

0 commit comments

Comments
 (0)
Please sign in to comment.