|
2 | 2 | import json
|
3 | 3 | import uuid
|
4 | 4 | from tqdm import tqdm
|
| 5 | +import argparse |
| 6 | + |
def _clean_cell(field_name, value):
    """
    Convert one raw worksheet cell value into the value stored in the database.
    :param field_name: Name of the database field the cell feeds.
    :param value: Raw cell value; may be a str, a number, or None for an empty cell.
    :return: A float or None for 'dec_lat'/'dec_long'; otherwise a cleaned string,
             or None when the cell is empty.
    """
    if field_name in ('dec_lat', 'dec_long'):
        if isinstance(value, str):
            value = value.strip()
        try:
            return float(value)
        except (TypeError, ValueError):
            # TypeError: empty cell (None); ValueError: text that is not a number.
            return None

    if value is None:
        return None

    # Numeric cells (e.g. a barcode stored as a number) have no str methods.
    text = value if isinstance(value, str) else str(value)
    if field_name == 'image_path':
        # Drop the first character, then surrounding whitespace and backslashes.
        return text[1:].strip().strip('\\')
    return text.strip()


def generate_database(create_mini=False):
    """
    Read .xlsx file and generate a .json database.

    Each processed worksheet row becomes one record keyed by a freshly generated
    UUID4 token, so tokens differ between runs. Empty cells are stored as None
    and coordinates that cannot be parsed as numbers are stored as None.

    :param create_mini: To create a smaller version of the database for debugging.
    :return: None
    """
    wb = load_workbook("../database/allspecimens.xlsx")
    sheet = wb.active

    # NOTE(review): the row count is hard-coded rather than read from the sheet.
    n_samples = 197052
    if create_mini:
        n_samples = 10
        print('Generating mini database with {}'.format(n_samples))

    # Spreadsheet column letters and the database field each one populates
    # (paired by position).
    field_access = ['BK', 'BJ', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'AH', 'AI', 'AP', 'AQ', 'A']
    field_names = ['image_path', 'image_name', 'family', 'subfamily', 'genus', 'specific_epithet',
                   'subspecific_epithet', 'infraspecific_epithet', 'author', 'country',
                   'primary_division', 'dec_lat', 'dec_long', 'barcode']

    data = {}
    # Rows are read starting at 2; presumably row 1 holds the column headers.
    for sample_id in tqdm(range(2, n_samples + 2)):
        token = str(uuid.uuid4())
        db_element = {'token': token}
        for field_access_key, field_name in zip(field_access, field_names):
            raw_value = sheet['{}{}'.format(field_access_key, sample_id)].value
            db_element[field_name] = _clean_cell(field_name, raw_value)
        data[token] = db_element

    db_file = 'database'
    if create_mini:
        db_file = 'mini_database'
    with open('../database/{}.json'.format(db_file), 'w') as outfile:
        json.dump(data, outfile, indent=4)
| 47 | + |
| 48 | + |
if __name__ == '__main__':
    # Command-line entry point: the optional --mini flag selects the small
    # debugging database instead of the full one.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--mini",
        action='store_true',
        help='Generate a mini database for testing/debugging.',
    )
    options = cli.parse_args()

    generate_database(create_mini=options.mini)
5 | 55 |
|
6 |
| -wb = load_workbook("../database/allspecimens.xlsx") |
7 |
| -sheet = wb.active |
8 |
| - |
9 |
| -n_samples = 197052 |
10 |
| - |
11 |
| -field_access = ['BK', 'BJ', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'AH', 'AI', 'AP', 'AQ', 'A'] |
12 |
| -field_names = ['image_path', 'image_name', 'family', 'subfamily', 'genus', 'specific_epithet', 'subspecific_epithet', |
13 |
| - 'infraspecific_epithet', 'author', 'country', 'primary_division', 'dec_lat', 'dec_long', 'barcode'] |
14 |
| - |
15 |
| -data = {} |
16 |
| -for sample_id in tqdm(range(2, n_samples + 2)): |
17 |
| - db_element = {} |
18 |
| - token = str(uuid.uuid4()) |
19 |
| - db_element['token'] = token |
20 |
| - for field_access_key, field_name in zip(field_access, field_names): |
21 |
| - if field_name == 'image_path': |
22 |
| - db_element[field_name] = sheet['{}{}'.format(field_access_key, sample_id)].value[1:].strip().strip('\\') |
23 |
| - elif field_name in ['dec_lat', 'dec_long']: |
24 |
| - try: |
25 |
| - db_element[field_name] = float(sheet['{}{}'.format(field_access_key, sample_id)].value.strip()) |
26 |
| - except ValueError: |
27 |
| - db_element[field_name] = None |
28 |
| - else: |
29 |
| - db_element[field_name] = sheet['{}{}'.format(field_access_key, sample_id)].value.strip() |
30 |
| - data[token] = db_element |
31 |
| - |
32 |
| -with open('../database/database.json', 'w') as outfile: |
33 |
| - json.dump(data, outfile, indent=4) |
|
0 commit comments