Commit 2fa24a0

Add documentation, fix some pylint errors
1 parent 75324b8 commit 2fa24a0

5 files changed (+134 -32 lines)

README.md (+5 -2)
@@ -4,9 +4,12 @@ Firstly, parser for the accession format described here: https://www.ncbi.nlm.ni
 `scrape_accession_rules.py`. This can be run as
 
 ```
-scrape_accession_rules.py ncbi_accession_rules.json
+scrape_accession_rules.py accession_rules.json
 ```
 
 to scrape the rules at that website and save them in JSON format. The file generated by this parser is
 used in the second script, `parse_ncbi_accession.py` which can be used as a Python module but also
-as a command line tool e.g. `parse_ncbi_accession.py ncbi_accession_rules.json AE014297`.
+as a command line tool e.g. `parse_ncbi_accession.py accession_rules.json AE014297`.
+
+The `parse_ncbi_accession.py` script aims to work with all Python versions. The `scrape_accession_rules.py`
+script requires at least Python 3.6 and the modules specified in `conda_requirements.txt`.
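
A minimal sketch of the module usage the README mentions, assuming only the function names visible in this commit's diffs (the rules file is the one produced by `scrape_accession_rules.py`):

```python
# Hypothetical usage sketch based on the signatures in parse_ncbi_accession.py below.
from parse_ncbi_accession import build_accession_parser, match_accession

# build_accession_parser() expects a file object open for reading.
with open('accession_rules.json') as rules_file:
    rules_by_prefix_len = build_accession_parser(rules_file)

# AE014297 is the example accession from the README; after this commit the
# result is a (database, accession_type, type_description) tuple.
print(match_accession('AE014297', rules_by_prefix_len))
```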
File renamed without changes.

conda_requirements.txt (+5)
@@ -0,0 +1,5 @@
+# This file may be used to create an environment using:
+# $ conda create --name <env> --file <this file>
+# platform: linux-64
+beautifulsoup4>=4.8.0
+requests>=2.22.0

parse_ncbi_accession.py (+71 -13)
@@ -1,29 +1,70 @@
 #!/usr/bin/env python
 
+"""Functions for parsing accessions and returning their source database and data type."""
+# Copyright (c) 2019, Peter van Heusden <[email protected]>
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of the <organization> nor the
+#       names of its contributors may be used to endorse or promote products
+#       derived from this software without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL Peter van Heusden BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from __future__ import print_function
 import json
 import re
 
 class RuleMatcher(object):
-    def matches(this, accession):
+    """Superclass for matcher objects that have a matches() method to return whether they match an accession."""
+
+    def matches(self, accession):
+        """matches(self, accession)
+
+        Returns true if accession matches this rule."""
         pass
 
 class RangeMatcher(RuleMatcher):
+    """RuleMatcher subclass that can match an accession whose prefix is in a range of letter prefixes."""
 
     def __init__(self, prefix, suffix_length):
-        self.re = re.compile(r'{prefix}[A-Z]{suffix_length}')
-
-    def matches(this, accession):
+        # this is not a greedy match so we need to specify that it is followed by a number
+        self.re = re.compile(f'{prefix}[A-Z]{{{suffix_length}}}' + r'\d')
+
+
+    def matches(self, accession):
         return self.re.match(accession) is not None
 
 def make_range_matcher(spec):
+    """make_range_matcher(spec)
+
+    spec - string (e.g. 'BAAA-BZZZ')
+
+    Turns a range specification into an object that can match an accession that falls
+    within that range. Returns a RangeMatcher() object."""
     if '-' in spec:
         (start, end) = spec.split('-')
     elif '_' in spec:
         (start, end) = spec.split('_')
     else:
         raise ValueError('require specification with - or _ in it, got: ' + spec)
     assert end.endswith('Z'), f'Range specifier has unknown format: {spec}'
-    for i in range(len(start)):
+    for i, _ in enumerate(start):
         if start[i] != end[i]:
             break
     prefix_length = i
@@ -33,6 +74,14 @@ def make_range_matcher(spec):
 
 
 def build_accession_parser(rules_file):
+    """build_accession_parser(rules_file)
+
+    rules_file - file object open for reading
+
+    Builds a rule parser usable by match_accession() in this module from the accession rules
+    described in rules_file. These rules can be downloaded by the scrape_accession_rules
+    script."""
+
     rules_data = json.load(rules_file)
     rules_by_prefix_len = {}
     for prefix_list, database, type_description in rules_data:
@@ -49,11 +98,17 @@ def build_accession_parser(rules_file):
         rules_by_prefix_len[prefix_length].append((prefix, database, type_description))
     return rules_by_prefix_len
 
-letter_re = re.compile('[A-Z]+')
+# LETTER_RE is a greedy match so we do not need to specify that the letter prefix is followed by a number
+LETTER_RE = re.compile('[A-Z]+')
 def match_accession(accession, rules_by_prefix_len):
-    if accession[2] == '_':
-        return ('NCBI', 'RefSeq')
-    letter_match = letter_re.match(accession)
+    """match_accession(accession, rules_by_prefix_len)
+
+    Returns the tuple (database, accession_type, type_description) for accession using the rules specified by
+    rules_by_prefix_len. The database is one of 'GenBank', 'NCBI', 'GenBank and DDBJ', 'DDBJ' and 'EMBL',
+    accession_type is one of 'nucleotide', 'protein', 'WGS' and 'MGA', and type_description is the description
+    of the type of data associated with that accession."""
+
+    letter_match = LETTER_RE.match(accession)
     if letter_match is None:
         raise ValueError('an accession number must start with at least one capital letter, this does not: ' + accession)
     letter_prefix = letter_match.group(0)
@@ -66,18 +121,21 @@ def match_accession(accession, rules_by_prefix_len):
         accession_type = 'nucleotide'
     elif letter_match_length < 4:
         accession_type = 'protein'
-    elif letter_match_length == 4 or letter_match_length == 6:
+    elif letter_match_length in (4, 6):
         accession_type = 'WGS'
     elif letter_match_length == 5:
         accession_type = 'MGA'
     else:
         raise ValueError('an accession number must start with fewer than 7 capital letters, this does not: ' + accession)
-
+
+    if accession[2] == '_':
+        return ('NCBI', accession_type, 'RefSeq')
+
     rules = rules_by_prefix_len[letter_match_length]
     for rule in rules:
         (matcher, database, type_description) = rule
         if (isinstance(matcher, RuleMatcher) and matcher.matches(accession)) or letter_prefix == matcher:
-            return (database, type_description)
+            return (database, accession_type, type_description)
 
 if __name__ == "__main__":
     import argparse
@@ -86,4 +144,4 @@ def match_accession(accession, rules_by_prefix_len):
     parser.add_argument('accession')
     args = parser.parse_args()
     rules_by_prefix_len = build_accession_parser(args.rules_data_file)
-    print(match_accession(args.accession, rules_by_prefix_len))
\ No newline at end of file
+    print(match_accession(args.accession, rules_by_prefix_len))
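
The `RangeMatcher` change above builds its pattern from a common prefix and a suffix length. A small sketch of that construction, using the `'BAAA-BZZZ'` example from the `make_range_matcher` docstring (the accessions here are made up for illustration):

```python
import re

# For the range spec 'BAAA-BZZZ' the shared prefix is 'B' and the suffix length is 3,
# so the pattern must match 'B', exactly three more capital letters, then a digit.
prefix, suffix_length = 'B', 3
pattern = re.compile(f'{prefix}[A-Z]{{{suffix_length}}}' + r'\d')

print(pattern.match('BAAB01000001') is not None)  # True: letter prefix in range, digits follow
print(pattern.match('BAAB') is not None)          # False: no digit after the four letters
```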

scrape_accession_rules.py (+53 -17)
@@ -1,4 +1,37 @@
 #!/usr/bin/env python
+"""Script for parsing the accession description page and saving it to JSON.
+
+Usage: scrape_accession_rules.py <output file>
+
+e.g.
+
+scrape_accession_rules.py accession_rules.json
+"""
+
+# Copyright (c) 2019, Peter van Heusden <[email protected]>
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of the <organization> nor the
+#       names of its contributors may be used to endorse or promote products
+#       derived from this software without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL Peter van Heusden BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import argparse
 import json
@@ -8,19 +41,16 @@
 import requests
 
 
-# def iterate_id_range(spec: str) -> List[str]:
-#     (start, end) = spec.split('-')
-#     assert end.endswith('Z'), f'Range specifier has unknown format: {spec}'
-#     for i in range(len(start)):
-#         if start[i] != end[i]:
-#             break
-#     prefix_length = i
-#     assert prefix_length < len(start), f'Prefix not found in range specifier {spec}'
-#     suffix_length = len(start) - prefix_length
+def parse_rules(text: str) -> List[List]:
+    """parse_rules(text: str) -> List[List]
+
+    Parses the rules from the NCBI webpage, returning a list of lists, with each inner list having the
+    elements:
 
-def parse_rules(text: str) -> List[List[str]]:
-    """parse_rules(text: str):
-    parse the rules from the NCBI webpage"""
+    prefix - a list of accession prefixes that this rule applies to
+    database - name of the database or databases associated with this rule
+    type_description - description of the type of data associated with this rule
+    """
     soup = BeautifulSoup(text, 'html.parser')
     data = []
     for table in soup.find_all('table', cellpadding='3'):
for table in soup.find_all('table', cellpadding='3'):
@@ -36,15 +66,21 @@ def parse_rules(text: str) -> List[List[str]]:
     return data
 
 
-def fetch(url='https://www.ncbi.nlm.nih.gov/Sequin/acc.html') -> str:
+def fetch(url: str = 'https://www.ncbi.nlm.nih.gov/Sequin/acc.html') -> str:
+    """fetch(url: str) -> str
+
+    Fetches the accession type description page (by default from https://www.ncbi.nlm.nih.gov/Sequin/acc.html)"""
     response = requests.get(url)
-    if response.status_code == requests.codes['ok']:
-        return response.text
-    else:
+    if response.status_code != requests.codes['ok']:
         raise requests.exceptions.RequestException(f'Failed to download {url}', response=response)
+    return response.text
+
 
+def save_data(data: List[List], output: TextIO) -> None:
+    """save_data(data: List[List], output: TextIO)
 
-def save_data(data: List[List[str]], output: TextIO) -> None:
+    Saves the data from parsing the accession description page to a JSON format file. output must be a
+    file open for writing."""
     json.dump(data, output, indent=2)
     output.close()
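
Taken together, `fetch()`, `parse_rules()` and `save_data()` form a small scrape-parse-save pipeline. A hedged sketch of how they fit together; the script's actual `__main__` block is outside the hunks shown here, so this wiring is an assumption:

```python
# Assumed wiring of the three functions changed above (the real __main__
# block is not part of this diff).
from scrape_accession_rules import fetch, parse_rules, save_data

rules = parse_rules(fetch())  # download the NCBI page and extract the rules
with open('accession_rules.json', 'w') as output:
    save_data(rules, output)  # save_data() writes JSON and closes the file itself
```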
