|
def download_gen3_list(input_url, download_limit=GEN3_DOWNLOAD_LIMIT):
    """Download a list of studies from Gen3.

    This function helps download a list of items from Gen3 by downloading
    the list and -- as long as there are as many items as the download_limit --
    by using `offset` to get the next set of results.

    :param input_url: The URL to download. This function will concatenate
        `&limit=...&offset=...` to it, so it should end with arguments or at
        least a question mark.
    :param download_limit: The maximum number of items to download per request
        (as set by `limit=...`). Note that Gen3 has an internal limit, so you
        should make sure your limit is smaller than that -- otherwise, you will
        request e.g. 3000 entries but retrieve the Gen3 limit (say, 2000),
        which this function will interpret to mean that all entries have been
        downloaded.
    :return: A list of retrieved strings. (This function only works when the
        result is a simple JSON list of strings.)
    :raises RuntimeError: if any page request returns a non-OK HTTP status.
    """
    complete_list = []
    offset = 0
    while True:
        url = input_url + f"&limit={download_limit}&offset={offset}"
        logging.debug(f"Requesting GET {url} from Gen3")
        partial_list_response = requests.get(url, timeout=GEN3_DOWNLOAD_TIMEOUT)
        if not partial_list_response.ok:
            raise RuntimeError(
                f"Could not download discovery_metadata from BDC Gen3 {url}: " +
                f"{partial_list_response.status_code} "
                f"{partial_list_response.text}")

        partial_list = partial_list_response.json()
        complete_list.extend(partial_list)

        # BUG FIX: compare against the caller-supplied download_limit, not the
        # module-level GEN3_DOWNLOAD_LIMIT constant. With the constant, any
        # caller passing a smaller custom limit would stop after the first
        # page and silently truncate the results.
        if len(partial_list) < download_limit:
            # A short page means there are no more entries to download.
            break

        # Otherwise, advance the offset by one page and request the next set.
        offset += download_limit

    # Make sure we don't have duplicates -- this is more likely to be an error
    # in the offset algorithm than an actual
    # error.
    if len(complete_list) != len(set(complete_list)):
        duplicate_ids = sorted([ident for ident, count in
                                Counter(complete_list).items() if count > 1])
        logging.warning(f"Found duplicate discovery_metadata: {duplicate_ids}")

    return complete_list
We currently use a limit of 10,000 when querying the MDS, and that seems to work reasonably well (given the ~1,000 HEAL studies, and far fewer dictionaries); however, for future-proofing it would be good to support an arbitrary number of entries.
There is code for doing that in this very repository, so we would really just need to apply it to the HEAL code:
dug-data-ingest/scripts/bdc/get_bdc_studies_from_gen3.py
Lines 32 to 82 in ad5be28