Skip to content

Commit 42f6316

Browse files
committed
Add logic for SRA accessions
1 parent 9fff74c commit 42f6316

File tree

3 files changed

+178
-2
lines changed

3 files changed

+178
-2
lines changed

README.md

+6-2
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ This project contains two sets of code:
22

33
Firstly, a parser for the accession format described here: https://www.ncbi.nlm.nih.gov/Sequin/acc.html called
44
`scrape_accession_rules.py` and the RefSeq sections described in
5-
https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly.
5+
https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly and
6+
the SRA accessions described in https://www.ncbi.nlm.nih.gov/books/NBK56913/#search.what_do_the_different_sra_accessi.
67
This can be run as
78

89
```
@@ -14,4 +15,7 @@ used in the second script, `parse_accession.py` which can be used as a Python mo
1415
as a command line tool e.g. `parse_accession.py accession_rules.json AE014297`.
1516

1617
The `parse_accession.py` script aims to work with all Python versions. The `scrape_accession_rules.py`
17-
script requires at least Python 3.6 and the modules specified in `conda_requirements.txt`.
18+
script requires at least Python 3.6 and the modules specified in `conda_requirements.txt`.
19+
20+
This code is licensed under the BSD 3-Clause license. Please feel free to use and modify the code, but
21+
it is released without warranty. See the `.py` files for details.

accession_rules.json

+144
Original file line numberDiff line numberDiff line change
@@ -940,5 +940,149 @@
940940
"NCBI",
941941
"Protein",
942942
"Non-redundant across multiple strains and species"
943+
],
944+
[
945+
[
946+
"SRA"
947+
],
948+
"NCBI",
949+
"SRA",
950+
"SRA submission accession: The submission accession represents a virtual container that holds the objects represented by the other five accessions and is used to track the submission in the archive."
951+
],
952+
[
953+
[
954+
"SRP"
955+
],
956+
"NCBI",
957+
"SRA",
958+
"SRA study accession: A Study is an object that contains the project metadata describing a sequencing study or project. Imported from BioProject."
959+
],
960+
[
961+
[
962+
"SRX"
963+
],
964+
"NCBI",
965+
"SRA",
966+
"SRA experiment accession: An Experiment is an object that contains the metadata describing the library, platform selection, and processing parameters involved in a particular sequencing experiment."
967+
],
968+
[
969+
[
970+
"SRR"
971+
],
972+
"NCBI",
973+
"SRA",
974+
"SRA run accession: A Run is an object that contains actual sequencing data for a particular sequencing experiment. Experiments may contain many Runs depending on the number of sequencing instrument runs that were needed."
975+
],
976+
[
977+
[
978+
"SRS"
979+
],
980+
"NCBI",
981+
"SRA",
982+
"SRA sample accession: A Sample is an object that contains the metadata describing the physical sample upon which a sequencing experiment was performed. Imported from BioSample."
983+
],
984+
[
985+
[
986+
"SRZ"
987+
],
988+
"NCBI",
989+
"SRA",
990+
"SRA analysis accession: An analysis is an object that contains a sequence data analysis BAM file and the metadata describing the sequence analysis."
991+
],
992+
[
993+
[
994+
"ERA"
995+
],
996+
"EMBL",
997+
"SRA",
998+
"ERA submission accession: The submission accession represents a virtual container that holds the objects represented by the other five accessions and is used to track the submission in the archive."
999+
],
1000+
[
1001+
[
1002+
"ERP"
1003+
],
1004+
"EMBL",
1005+
"SRA",
1006+
"ERA study accession: A Study is an object that contains the project metadata describing a sequencing study or project. Imported from BioProject."
1007+
],
1008+
[
1009+
[
1010+
"ERX"
1011+
],
1012+
"EMBL",
1013+
"SRA",
1014+
"ERA experiment accession: An Experiment is an object that contains the metadata describing the library, platform selection, and processing parameters involved in a particular sequencing experiment."
1015+
],
1016+
[
1017+
[
1018+
"ERR"
1019+
],
1020+
"EMBL",
1021+
"SRA",
1022+
"ERA run accession: A Run is an object that contains actual sequencing data for a particular sequencing experiment. Experiments may contain many Runs depending on the number of sequencing instrument runs that were needed."
1023+
],
1024+
[
1025+
[
1026+
"ERS"
1027+
],
1028+
"EMBL",
1029+
"SRA",
1030+
"ERA sample accession: A Sample is an object that contains the metadata describing the physical sample upon which a sequencing experiment was performed. Imported from BioSample."
1031+
],
1032+
[
1033+
[
1034+
"ERZ"
1035+
],
1036+
"EMBL",
1037+
"SRA",
1038+
"ERA analysis accession: An analysis is an object that contains a sequence data analysis BAM file and the metadata describing the sequence analysis."
1039+
],
1040+
[
1041+
[
1042+
"DRA"
1043+
],
1044+
"DDBJ",
1045+
"SRA",
1046+
"DRA submission accession: The submission accession represents a virtual container that holds the objects represented by the other five accessions and is used to track the submission in the archive."
1047+
],
1048+
[
1049+
[
1050+
"DRP"
1051+
],
1052+
"DDBJ",
1053+
"SRA",
1054+
"DRA study accession: A Study is an object that contains the project metadata describing a sequencing study or project. Imported from BioProject."
1055+
],
1056+
[
1057+
[
1058+
"DRX"
1059+
],
1060+
"DDBJ",
1061+
"SRA",
1062+
"DRA experiment accession: An Experiment is an object that contains the metadata describing the library, platform selection, and processing parameters involved in a particular sequencing experiment."
1063+
],
1064+
[
1065+
[
1066+
"DRR"
1067+
],
1068+
"DDBJ",
1069+
"SRA",
1070+
"DRA run accession: A Run is an object that contains actual sequencing data for a particular sequencing experiment. Experiments may contain many Runs depending on the number of sequencing instrument runs that were needed."
1071+
],
1072+
[
1073+
[
1074+
"DRS"
1075+
],
1076+
"DDBJ",
1077+
"SRA",
1078+
"DRA sample accession: A Sample is an object that contains the metadata describing the physical sample upon which a sequencing experiment was performed. Imported from BioSample."
1079+
],
1080+
[
1081+
[
1082+
"DRZ"
1083+
],
1084+
"DDBJ",
1085+
"SRA",
1086+
"DRA analysis accession: An analysis is an object that contains a sequence data analysis BAM file and the metadata describing the sequence analysis."
9431087
]
9441088
]

scrape_accession_rules.py

+28
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
3434
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3535

36+
from __future__ import print_function
3637
import argparse
3738
import json
3839
from typing import List, Tuple, TextIO
@@ -81,6 +82,31 @@ def parse_refseq_rules(text: str) -> RulesData:
8182
data.append(([prefix], database, molecule_type, type_description))
8283
return data
8384

85+
def parse_sra_rules(text: str) -> RulesData:
    """Parse the SRA accession prefix table and derive the EMBL and DDBJ equivalents.

    (Thanks to Torsten Seemann for pointing these accessions out.)

    The table is located through the ``<h4>`` heading whose id is
    ``search.what_do_the_different_sra_accessi``: the element that follows the
    heading is expected to contain the table.  For every row, the first cell
    supplies the prefix, the next cell a short name and the cell after that a
    longer description; name and description are joined with ``': '``.

    The EMBL and DDBJ rules are not read from the page.  They are derived from
    the NCBI rules by replacing the first character of both the prefix and the
    description with the first letter of the database name (``E`` or ``D``).

    Args:
        text: HTML of the page holding the SRA accession table.

    Returns:
        A list of ``([prefix], database, 'SRA', description)`` tuples: the
        NCBI rules first, then the derived EMBL rules, then the DDBJ rules.

    Raises:
        AttributeError: if the heading, the table or an expected cell is
            missing from ``text`` (the lookups are not guarded).
    """
    accession_type = 'SRA'
    soup = BeautifulSoup(text, 'html.parser')
    heading = soup.find('h4', id='search.what_do_the_different_sra_accessi')
    sra_table = heading.next_sibling.table

    data = []
    for row in sra_table.tbody.find_all('tr'):
        prefix_cell = row.td
        name_cell = prefix_cell.next_sibling
        description_cell = name_cell.next_sibling
        type_description = name_cell.text.strip() + ': ' + description_cell.text
        data.append(([prefix_cell.text], 'NCBI', accession_type, type_description))

    # NOTE: the original code also looked up a second table on the page
    # (`.contents[4].table`) but never used the result; the lookup was dropped
    # because it could only fail, never affect the returned rules.
    embl_ddbj_data = []
    for database in ('EMBL', 'DDBJ'):
        prefix_start = database[0]
        for old_prefix, _, old_accession_type, old_type_description in data:
            # Swap the leading letter of both the prefix and the description
            # for the database initial (e.g. 'SRR' -> 'ERR' / 'DRR').
            prefix = prefix_start + old_prefix[0][1:]
            type_description = prefix_start + old_type_description[1:]
            embl_ddbj_data.append(([prefix], database, old_accession_type, type_description))

    data.extend(embl_ddbj_data)
    return data
109+
84110

85111
def fetch(url: str = 'https://www.ncbi.nlm.nih.gov/Sequin/acc.html') -> str:
86112
"""fetch(url:str) -> str
@@ -107,4 +133,6 @@ def save_data(data: RulesData, output: TextIO) -> None:
107133
data = parse_rules(fetch())
108134
refseq_data = parse_refseq_rules(fetch(url='https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly'))
109135
data.extend(refseq_data)
136+
sra_data = parse_sra_rules(fetch(url='https://www.ncbi.nlm.nih.gov/books/NBK56913'))
137+
data.extend(sra_data)
110138
save_data(data, args.output_file)

0 commit comments

Comments
 (0)