Skip to content

Commit cc3fefc

Browse files
committed
restructure code
1 parent fb889f0 commit cc3fefc

File tree

1 file changed

+50
-28
lines changed

1 file changed

+50
-28
lines changed

data/utilities.py

+50-28
Original file line numberDiff line numberDiff line change
@@ -2,32 +2,54 @@
22
import json
33
import uuid
44
from tqdm import tqdm
5+
import argparse
6+
7+
def _parse_image_path(value):
    """Return the cleaned image path for a raw cell value, or None for an empty cell."""
    if value is None:
        return None
    # Same cleaning as before: drop the first character, then trim surrounding
    # whitespace and backslashes.
    # NOTE(review): the dropped first character is presumably a leading
    # separator in the spreadsheet -- confirm against the data.
    return str(value)[1:].strip().strip('\\')


def _parse_coordinate(value):
    """Return a raw cell value as a float, or None if it is empty or not numeric."""
    if isinstance(value, str):
        value = value.strip()
    try:
        # float(None) raises TypeError and float('') raises ValueError; both
        # mean "no usable coordinate" and are stored as None.
        return float(value)
    except (TypeError, ValueError):
        return None


def _parse_text(value):
    """Return a raw cell value as stripped text, or None for an empty cell."""
    if value is None:
        return None
    # str() keeps numeric cells (e.g. an all-digit barcode) from crashing on .strip().
    return str(value).strip()


def generate_database(create_mini=False):
    """
    Read the specimen .xlsx workbook and write it out as a .json database.

    Rows are read from the active sheet starting at row 2. Each non-empty row
    becomes one record stored under a freshly generated UUID4 token (the token
    is also saved inside the record). Empty cells are stored as None instead of
    aborting the export, and latitude/longitude values that cannot be parsed as
    numbers are stored as None.

    :param create_mini: To create a smaller version of the database for debugging
        (at most 10 rows, written to ``mini_database.json`` instead of
        ``database.json``).
    :return: None
    """
    wb = load_workbook("../database/allspecimens.xlsx")
    sheet = wb.active

    n_samples = 10 if create_mini else 197052
    # Never read past the last populated row: row 1 is not exported, so at
    # most ``max_row - 1`` data rows are available.
    n_samples = min(n_samples, max(sheet.max_row - 1, 0))
    if create_mini:
        print('Generating mini database with {}'.format(n_samples))

    # Spreadsheet column letters and the record key each column is exported under.
    field_access = ['BK', 'BJ', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'AH', 'AI', 'AP', 'AQ', 'A']
    field_names = ['image_path', 'image_name', 'family', 'subfamily', 'genus', 'specific_epithet',
                   'subspecific_epithet', 'infraspecific_epithet', 'author', 'country',
                   'primary_division', 'dec_lat', 'dec_long', 'barcode']

    data = {}
    for sample_id in tqdm(range(2, n_samples + 2)):
        raw_values = [sheet['{}{}'.format(column, sample_id)].value for column in field_access]
        if all(value is None for value in raw_values):
            # Completely blank row (e.g. trailing formatted rows): nothing to export.
            continue

        token = str(uuid.uuid4())
        db_element = {'token': token}
        for field_name, value in zip(field_names, raw_values):
            if field_name == 'image_path':
                db_element[field_name] = _parse_image_path(value)
            elif field_name in ('dec_lat', 'dec_long'):
                db_element[field_name] = _parse_coordinate(value)
            else:
                db_element[field_name] = _parse_text(value)
        data[token] = db_element

    db_file = 'mini_database' if create_mini else 'database'
    with open('../database/{}.json'.format(db_file), 'w') as outfile:
        json.dump(data, outfile, indent=4)
47+
48+
49+
if __name__ == '__main__':
    # Command-line entry point: the optional --mini flag switches the export
    # to the small debugging database.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--mini",
        action='store_true',
        help='Generate a mini database for testing/debugging.',
    )
    options = cli.parse_args()

    generate_database(options.mini)
555

# Previous (pre-refactor) module-level version of the export: it runs on import
# and always writes the full database to ../database/database.json.
6-
wb = load_workbook("../database/allspecimens.xlsx")
7-
sheet = wb.active
8-
# Number of rows to export; the loop below reads rows 2 .. n_samples + 1.
9-
n_samples = 197052
10-
# Spreadsheet column letters (field_access) and, position by position, the
# record key each column is stored under (field_names).
11-
field_access = ['BK', 'BJ', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'AH', 'AI', 'AP', 'AQ', 'A']
12-
field_names = ['image_path', 'image_name', 'family', 'subfamily', 'genus', 'specific_epithet', 'subspecific_epithet',
13-
'infraspecific_epithet', 'author', 'country', 'primary_division', 'dec_lat', 'dec_long', 'barcode']
14-
# Maps each record's UUID4 token to the record itself.
15-
data = {}
16-
for sample_id in tqdm(range(2, n_samples + 2)):
17-
db_element = {}
18-
token = str(uuid.uuid4())
19-
db_element['token'] = token
20-
for field_access_key, field_name in zip(field_access, field_names):
# image_path: drop the first character, then trim whitespace and backslashes.
21-
if field_name == 'image_path':
22-
db_element[field_name] = sheet['{}{}'.format(field_access_key, sample_id)].value[1:].strip().strip('\\')
# Coordinates are stored as floats; text that does not parse becomes None.
23-
elif field_name in ['dec_lat', 'dec_long']:
24-
try:
25-
db_element[field_name] = float(sheet['{}{}'.format(field_access_key, sample_id)].value.strip())
26-
except ValueError:
27-
db_element[field_name] = None
# Every other field is stored as whitespace-stripped text.
# NOTE(review): assumes every cell holds a string -- .strip() fails on empty or numeric cells.
28-
else:
29-
db_element[field_name] = sheet['{}{}'.format(field_access_key, sample_id)].value.strip()
30-
data[token] = db_element
31-
# Write all records as JSON with 4-space indentation.
32-
with open('../database/database.json', 'w') as outfile:
33-
json.dump(data, outfile, indent=4)

0 commit comments

Comments
 (0)