1
+ #!/usr/bin/env python
2
+
3
+ import json
4
+ import re
5
+
6
class RuleMatcher(object):
    """Base class for rule objects that decide whether an accession matches.

    Subclasses override :meth:`matches`. Code elsewhere in this module uses
    ``isinstance(..., RuleMatcher)`` to tell matcher objects apart from plain
    string prefixes.
    """

    def matches(self, accession):
        """Return whether *accession* satisfies this rule.

        The base implementation matches nothing: it returns ``None``, which
        callers treat as "no match".
        """
        return None
9
+
10
class RangeMatcher(RuleMatcher):
    """Match accessions that start with a fixed prefix plus capital letters.

    The compiled pattern is ``prefix`` followed by exactly ``suffix_length``
    characters from ``A``-``Z``, anchored at the start of the accession.
    """

    def __init__(self, prefix, suffix_length):
        """Compile the pattern for *prefix* and *suffix_length*.

        Args:
            prefix: literal leading text shared by every accession in the range.
            suffix_length: number of capital letters that follow the prefix.
        """
        # The doubled braces emit a literal ``{n}`` regex quantifier; a single
        # pair would be consumed by the f-string and drop the repetition.
        # re.escape keeps the prefix literal even if it contains regex
        # metacharacters.
        self.re = re.compile(f'{re.escape(prefix)}[A-Z]{{{suffix_length}}}')

    def matches(self, accession):
        """Return True when *accession* starts with the prefix and suffix letters."""
        return self.re.match(accession) is not None
17
+
18
def make_range_matcher(spec):
    """Build a matcher for a prefix range specifier such as ``AAA-AZZ``.

    The specifier is split on ``-`` (or ``_`` when no ``-`` is present) into a
    start and an end. The characters the two share from the left form the
    fixed prefix; the remaining positions of the start are treated as
    arbitrary capital letters.

    Args:
        spec: range specifier string containing ``-`` or ``_``.

    Returns:
        A tuple ``(length, matcher)`` where ``length`` is the number of
        characters in the start of the range (prefix plus suffix) and
        ``matcher`` is a ``RangeMatcher`` for that prefix and suffix length.

    Raises:
        ValueError: if the specifier has no separator, does not split into
            exactly two parts, has an end that does not finish with ``Z``, or
            has no position at which start and end differ.
    """
    if '-' in spec:
        (start, end) = spec.split('-')
    elif '_' in spec:
        (start, end) = spec.split('_')
    else:
        raise ValueError('require specification with - or _ in it, got: ' + spec)
    # Explicit raises instead of assert: asserts are stripped under ``-O``.
    if not end.endswith('Z'):
        raise ValueError(f'Range specifier has unknown format: {spec}')
    # Index of the first position where start and end differ. Falling back to
    # len(start) marks "no difference found" so it can be rejected below
    # rather than silently using the last index.
    prefix_length = next(
        (i for i, (lo, hi) in enumerate(zip(start, end)) if lo != hi),
        len(start),
    )
    if prefix_length >= len(start):
        raise ValueError(f'Prefix not found in range specifier {spec}')
    suffix_length = len(start) - prefix_length
    return (prefix_length + suffix_length, RangeMatcher(start[:prefix_length], suffix_length))
33
+
34
+
35
def build_accession_parser(rules_file):
    """Load accession rules from JSON and group them by prefix length.

    Args:
        rules_file: open file-like object containing a JSON list of
            ``[prefix_list, database, type_description]`` entries.

    Returns:
        A dict mapping a prefix length to a list of
        ``(matcher, database, type_description)`` tuples, in file order.
        ``matcher`` is the prefix string itself for plain prefixes, or the
        matcher returned by ``make_range_matcher`` for prefixes containing
        ``-`` or ``_``.
    """
    rules_data = json.load(rules_file)
    rules_by_prefix_len = {}
    for prefix_list, database, type_description in rules_data:
        for prefix in prefix_list:
            if '-' in prefix or '_' in prefix:
                # Range specifier: the helper reports the length to file it under.
                (prefix_length, matcher) = make_range_matcher(prefix)
            else:
                (prefix_length, matcher) = (len(prefix), prefix)
            rules_by_prefix_len.setdefault(prefix_length, []).append(
                (matcher, database, type_description)
            )
    return rules_by_prefix_len
51
+
52
# Run of capital letters at the start of an accession.
letter_re = re.compile('[A-Z]+')


def match_accession(accession, rules_by_prefix_len):
    """Look up the database and type description for an accession.

    Args:
        accession: accession string to classify.
        rules_by_prefix_len: mapping from letter-prefix length to a list of
            ``(matcher, database, type_description)`` tuples, as returned by
            ``build_accession_parser``. ``matcher`` is either a plain prefix
            string or a ``RuleMatcher`` instance.

    Returns:
        ``('NCBI', 'RefSeq')`` when the third character is an underscore;
        otherwise the ``(database, type_description)`` of the first matching
        rule, or ``None`` when no rule matches.

    Raises:
        ValueError: if the accession does not start with a capital letter, or
            starts with more than six capital letters.
    """
    # Slice instead of index so accessions shorter than three characters fall
    # through to the normal checks rather than raising IndexError.
    if accession[2:3] == '_':
        return ('NCBI', 'RefSeq')
    letter_match = letter_re.match(accession)
    if letter_match is None:
        raise ValueError('an accession number must start with at least one capital letter, this does not: ' + accession)
    letter_prefix = letter_match.group(0)
    letter_match_length = len(letter_prefix)
    # Prefix lengths 1-6 are accepted (1-2 nucleotide, 3 protein, 4 or 6 WGS,
    # 5 MGA); the reported type comes from the matching rule, so only the
    # upper bound needs enforcing here.
    if letter_match_length > 6:
        raise ValueError('an accession number must start with less than 7 capital letters, this does not: ' + accession)

    # A length with no rules is treated the same as "no rule matched".
    for matcher, database, type_description in rules_by_prefix_len.get(letter_match_length, ()):
        if letter_prefix == matcher or (isinstance(matcher, RuleMatcher) and matcher.matches(accession)):
            return (database, type_description)
    return None
81
+
82
if __name__ == "__main__":
    # Command-line entry point: classify one accession using a JSON rules file.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('rules_data_file', type=argparse.FileType())
    parser.add_argument('accession')
    args = parser.parse_args()
    # FileType opens the file while parsing arguments; close it as soon as
    # the rules have been loaded.
    with args.rules_data_file as rules_file:
        rules_by_prefix_len = build_accession_parser(rules_file)
    print(match_accession(args.accession, rules_by_prefix_len))
0 commit comments