1
1
#!/usr/bin/env python
2
2
3
+ """Functions for parsing accessions and returning their source database and data type."""
4
+ # Copyright (c) 2019, Peter van Heusden <[email protected] >
5
+ # All rights reserved.
6
+
7
+ # Redistribution and use in source and binary forms, with or without
8
+ # modification, are permitted provided that the following conditions are met:
9
+ # * Redistributions of source code must retain the above copyright
10
+ # notice, this list of conditions and the following disclaimer.
11
+ # * Redistributions in binary form must reproduce the above copyright
12
+ # notice, this list of conditions and the following disclaimer in the
13
+ # documentation and/or other materials provided with the distribution.
14
+ # * Neither the name of the <organization> nor the
15
+ # names of its contributors may be used to endorse or promote products
16
+ # derived from this software without specific prior written permission.
17
+
18
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19
+ # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
+ # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21
+ # DISCLAIMED. IN NO EVENT SHALL Peter van Heusden BE LIABLE FOR ANY
22
+ # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24
+ # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
25
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
+
29
+ from __future__ import print_function
3
30
import json
4
31
import re
5
32
6
33
class RuleMatcher(object):
    """Superclass for matcher objects whose matches() method reports whether they match an accession."""

    def matches(self, accession):
        """matches(self, accession)

        accession - accession string to test

        Returns true if accession matches this rule. This base implementation
        matches nothing and returns None; subclasses override it."""
        return None


class RangeMatcher(RuleMatcher):
    """RuleMatcher subclass that can match an accession whose prefix is in a range of letter prefixes."""

    def __init__(self, prefix, suffix_length):
        """__init__(self, prefix, suffix_length)

        prefix - the literal leading letters shared by every accession in the range
        suffix_length - how many further capital letters must follow prefix

        Compiles the regular expression used by matches()."""
        # The quantifier braces are doubled so that they survive f-string
        # formatting: the pattern must read e.g. 'B[A-Z]{3}', not 'B[A-Z]3'
        # (which would demand a literal digit 3 after a single letter).
        # A {n} quantifier is not greedy, so the trailing \d is what pins the
        # letter prefix to exactly len(prefix) + suffix_length letters.
        # re.escape() keeps prefix literal even if it ever holds a regex
        # metacharacter.
        self.re = re.compile(f'{re.escape(prefix)}[A-Z]{{{suffix_length}}}' + r'\d')

    def matches(self, accession):
        """matches(self, accession)

        accession - accession string to test

        Returns True if accession starts with prefix, followed by exactly
        suffix_length capital letters and then a digit; False otherwise."""
        return self.re.match(accession) is not None
17
52
18
53
def make_range_matcher (spec ):
54
+ """make_range_matcher(spec)
55
+
56
+ spec - string (e.g. 'BAAA-BZZZ')
57
+
58
+ Turns a range specification into an object that can match an accession that falls
59
+ within that range. Returns a RangeMatcher() object"""
19
60
if '-' in spec :
20
61
(start , end ) = spec .split ('-' )
21
62
elif '_' in spec :
22
63
(start , end ) = spec .split ('_' )
23
64
else :
24
65
raise ValueError ('require specification with - or _ in it, got: ' + spec )
25
66
assert end .endswith ('Z' ), f'Range specifier has unknown format: { spec } '
26
- for i in range ( len ( start ) ):
67
+ for i , _ in enumerate ( start ):
27
68
if start [i ] != end [i ]:
28
69
break
29
70
prefix_length = i
@@ -33,6 +74,14 @@ def make_range_matcher(spec):
33
74
34
75
35
76
def build_accession_parser (rules_file ):
77
+ """build_accession_parser(rules_file)
78
+
79
+ rules_file - file object open for reading
80
+
81
+ Builds a rule parser usable by match_accession() in this module from the accession rules
82
+ described in rules_file. These rules can be downloaded by the scrape_accession_rules
83
+ script."""
84
+
36
85
rules_data = json .load (rules_file )
37
86
rules_by_prefix_len = {}
38
87
for prefix_list , database , type_description in rules_data :
@@ -49,11 +98,17 @@ def build_accession_parser(rules_file):
49
98
rules_by_prefix_len [prefix_length ].append ((prefix , database , type_description ))
50
99
return rules_by_prefix_len
51
100
52
- letter_re = re .compile ('[A-Z]+' )
101
+ # LETTER_RE is a greedy match, so we do not need to specify that the letter prefix is followed by a number
102
+ LETTER_RE = re .compile ('[A-Z]+' )
53
103
def match_accession (accession , rules_by_prefix_len ):
54
- if accession [2 ] == '_' :
55
- return ('NCBI' , 'RefSeq' )
56
- letter_match = letter_re .match (accession )
104
+ """match_accession(accession, rules_by_prefix_len)
105
+
106
+ Returns the tuple (database, accession_type, type_description) for accession using the rules specified by
107
+ rules_by_prefix_len. The database is one of 'GenBank', 'NCBI', 'GenBank and DDBJ' , 'DDBJ', and 'EMBL',
108
+ accession_type is one of 'nucleotide', 'protein', 'WGS' and 'MGA' and the type_description is the description
109
+ of the type of data associated with that accession."""
110
+
111
+ letter_match = LETTER_RE .match (accession )
57
112
if letter_match is None :
58
113
raise ValueError ('an accession number must start with at least one capital letter, this does not: ' + accession )
59
114
letter_prefix = letter_match .group (0 )
@@ -66,18 +121,21 @@ def match_accession(accession, rules_by_prefix_len):
66
121
accession_type = 'nucleotide'
67
122
elif letter_match_length < 4 :
68
123
accession_type = 'protein'
69
- elif letter_match_length == 4 or letter_match_length == 6 :
124
+ elif letter_match_length in ( 4 , 6 ) :
70
125
accession_type = 'WGS'
71
126
elif letter_match_length == 5 :
72
127
accession_type = 'MGA'
73
128
else :
74
129
raise ValueError ('an accession number must start with less than 7 capital letters, this does not: ' + accession_type )
75
-
130
+
131
+ if accession [2 ] == '_' :
132
+ return ('NCBI' , accession_type , 'RefSeq' )
133
+
76
134
rules = rules_by_prefix_len [letter_match_length ]
77
135
for rule in rules :
78
136
(matcher , database , type_description ) = rule
79
137
if (isinstance (matcher , RuleMatcher ) and matcher .matches (accession )) or letter_prefix == matcher :
80
- return (database , type_description )
138
+ return (database , accession_type , type_description )
81
139
82
140
if __name__ == "__main__" :
83
141
import argparse
@@ -86,4 +144,4 @@ def match_accession(accession, rules_by_prefix_len):
86
144
parser .add_argument ('accession' )
87
145
args = parser .parse_args ()
88
146
rules_by_prefix_len = build_accession_parser (args .rules_data_file )
89
- print (match_accession (args .accession , rules_by_prefix_len ))
147
+ print (match_accession (args .accession , rules_by_prefix_len ))
0 commit comments