-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathnsidc_icesat2_associated.py
361 lines (338 loc) · 14.7 KB
/
nsidc_icesat2_associated.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
#!/usr/bin/env python
u"""
nsidc_icesat2_associated.py
Written by Tyler Sutterley (03/2024)
Acquires ICESat-2 datafiles from the National Snow and Ice Data Center (NSIDC)
server that is associated with an input file
https://wiki.earthdata.nasa.gov/display/EL/How+To+Access+Data+With+Python
https://nsidc.org/support/faq/what-options-are-available-bulk-downloading-data-
https-earthdata-login-enabled
http://www.voidspace.org.uk/python/articles/authentication.shtml#base64
Register with NASA Earthdata Login system:
https://urs.earthdata.nasa.gov
Add NSIDC_DATAPOOL_OPS to NASA Earthdata Applications
https://urs.earthdata.nasa.gov/oauth/authorize?client_id=_JLuwMHxb2xX6NwYTb4dRA
CALLING SEQUENCE:
python nsidc_icesat2_associated.py --user <username> --release 003
--product ATL03 list_of_ATL06_files
where <username> is your NASA Earthdata username
COMMAND LINE OPTIONS:
--help: list the command line options
-U X, --user X: username for NASA Earthdata Login
-W X, --password X: Password for NASA Earthdata Login
-N X, --netrc X: path to .netrc file for alternative authentication
-D X, --directory: working data directory
-p X, --product: associated product to download
ATL03: Global Geolocated Photon Data
ATL04: Normalized Relative Backscatter
ATL06: Land Ice Height
ATL07: Sea Ice Height
ATL08: Land and Vegetation Height
ATL09: Atmospheric Layer Characteristics
ATL10: Sea Ice Freeboard
ATL12: Ocean Surface Height
ATL13: Inland Water Surface Height
-a, --auxiliary: Download ICESat-2 auxiliary files for each HDF5 file
-F, --flatten: Do not create subdirectories
-P X, --np X: Number of processes to use in file downloads
-T X, --timeout X: Timeout in seconds for blocking operations
-R X, --retry X: Connection retry attempts
-M X, --mode X: Local permissions mode of the directories and files synced
PYTHON DEPENDENCIES:
numpy: Scientific Computing Tools For Python
https://numpy.org
https://numpy.org/doc/stable/user/numpy-for-matlab-users.html
lxml: Pythonic XML and HTML processing library using libxml2/libxslt
https://lxml.de/
https://github.com/lxml/lxml
PROGRAM DEPENDENCIES:
utilities.py: download and management utilities for syncing files
UPDATE HISTORY:
Updated 03/2024: use pathlib to define and operate on paths
Updated 12/2022: single implicit import of altimetry tools
Updated 05/2022: use argparse descriptions within sphinx documentation
Updated 03/2022: use attempt login function to check credentials
Updated 10/2021: using python logging for handling verbose output
Updated 07/2021: set context for multiprocessing to fork child processes
added a file length check to validate downloaded files
Updated 05/2021: added options for connection timeout and retry attempts
Updated 04/2021: set a default netrc file and check access
default credentials from environmental variables
Updated 11/2020: nsidc_list will output a string for errors
Updated 10/2020: using argparse to set parameters
added multiprocessing option for parallel download
Updated 08/2020: moved urllib opener to utilities. add credential check
Updated 05/2020: added option netrc to use alternative authentication
adjust regular expression to allow syncing of ATL07 sea ice products
adjust regular expression for auxiliary products
Updated 09/2019: added ssl context to urlopen headers
Written 08/2019
"""
from __future__ import print_function
import sys
import os
import re
import shutil
import logging
import pathlib
import argparse
import traceback
import posixpath
import lxml.etree
import multiprocessing as mp
import icesat2_toolkit as is2tk
# PURPOSE: download the ICESat-2 elevation data from NSIDC matching an file
def nsidc_icesat2_associated(file_list, PRODUCT, DIRECTORY=None,
    AUXILIARY=False, FLATTEN=False, PROCESSES=0, TIMEOUT=None,
    RETRY=1, MODE=0o775):
    """
    Download ICESat-2 files from NSIDC that are associated with each
    input ICESat-2 granule (matching track, cycle, granule and release)

    Arguments:
        file_list: list of input ICESat-2 files to match
        PRODUCT: associated ICESat-2 data product to download

    Keyword arguments:
        DIRECTORY: working data directory (default: input file directory)
        AUXILIARY: download ICESat-2 auxiliary files for each HDF5 file
        FLATTEN: do not create date subdirectories
        PROCESSES: number of processes for downloads (0: download in series)
        TIMEOUT: timeout in seconds for blocking operations
        RETRY: connection retry attempts
        MODE: permissions mode of the output directories and files
    """
    # compile HTML parser for lxml
    parser = lxml.etree.HTMLParser()
    # logging to standard output
    logging.basicConfig(level=logging.INFO)
    # remote https server for ICESat-2 Data
    HOST = 'https://n5eil01u.ecs.nsidc.org'
    # regular expression operator for extracting information from files
    rx = re.compile(r'(processed_)?(ATL\d{2})(-\d{2})?_(\d{4})(\d{2})(\d{2})'
        r'(\d{2})(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})_(\d{3})_(\d{2})(.*?).h5$')
    # regular expression pattern for finding specific files
    regex_suffix = '(.*?)$' if AUXILIARY else '(h5)$'
    remote_regex_pattern = (r'{0}(-\d{{2}})?_(\d{{4}})(\d{{2}})(\d{{2}})'
        r'(\d{{2}})(\d{{2}})(\d{{2}})_({1})({2})({3})_({4})_(\d{{2}})(.*?).{5}')
    # build list of remote files, remote modification times and local files
    original_files = []
    remote_files = []
    remote_mtimes = []
    local_files = []
    # for each input file
    for input_file in file_list:
        # absolute path for input file
        input_file = pathlib.Path(input_file).expanduser().absolute()
        # extract parameters from ICESat-2 ATLAS HDF5 file name
        SUB,PRD,HEM,YY,MM,DD,HH,MN,SS,TRK,CYC,GRN,RL,VRS,AUX = \
            rx.findall(input_file.name).pop()
        # get directories from remote directory
        product_directory = f'{PRODUCT}.{RL}'
        sd = f'{YY}.{MM}.{DD}'
        PATH = [HOST,'ATLAS',product_directory,sd]
        # local and remote data directories
        remote_dir = posixpath.join(*PATH)
        if (DIRECTORY is None):
            local_dir = input_file.parent
        else:
            local_dir = pathlib.Path(DIRECTORY).expanduser().absolute()
        # append subdirectory
        if not FLATTEN:
            local_dir = local_dir.joinpath(sd)
        # create output directory if not currently existing
        # NOTE: Path.mkdir returns None, so local_dir must not be rebound here
        local_dir.mkdir(mode=MODE, parents=True, exist_ok=True)
        # compile regular expression operator for file parameters
        args = (PRODUCT,TRK,CYC,GRN,RL,regex_suffix)
        R1 = re.compile(remote_regex_pattern.format(*args), re.VERBOSE)
        # find associated ICESat-2 data file
        # find matching files (for granule, release, version, track)
        colnames,collastmod,colerror = is2tk.utilities.nsidc_list(PATH,
            build=False,
            timeout=TIMEOUT,
            parser=parser,
            pattern=R1,
            sort=True)
        # print if file was not found
        if not colnames:
            logging.critical(colerror)
            continue
        # add to lists
        for colname,remote_mtime in zip(colnames,collastmod):
            # save original file to list (expands if getting auxiliary files)
            original_files.append(input_file)
            # remote and local versions of the file
            remote_files.append(posixpath.join(remote_dir,colname))
            local_files.append(local_dir.joinpath(colname))
            remote_mtimes.append(remote_mtime)
    # download in series if PROCESSES = 0
    if (PROCESSES == 0):
        # download each associated ICESat-2 data file
        for i,input_file in enumerate(original_files):
            # download associated ICESat-2 files with NSIDC server
            args = (remote_files[i], remote_mtimes[i], local_files[i])
            kwds = dict(TIMEOUT=TIMEOUT, RETRY=RETRY, MODE=MODE)
            out = http_pull_file(*args,**kwds)
            # print the output string
            logging.info(f'{str(input_file)}\n{out}')
    else:
        # set multiprocessing start method to fork child processes
        ctx = mp.get_context("fork")
        # download in parallel with multiprocessing Pool
        pool = ctx.Pool(processes=PROCESSES)
        # download each associated ICESat-2 data file
        output = []
        for i,input_file in enumerate(original_files):
            # download associated ICESat-2 files with NSIDC server
            args = (remote_files[i], remote_mtimes[i], local_files[i])
            kwds = dict(TIMEOUT=TIMEOUT, RETRY=RETRY, MODE=MODE)
            # keep the AsyncResult object so .get() can be called after join
            # (formatting it into a string here would only capture its repr)
            result = pool.apply_async(multiprocess_sync, args=args, kwds=kwds)
            output.append((input_file, result))
        # start multiprocessing jobs
        # close the pool
        # prevents more tasks from being submitted to the pool
        pool.close()
        # exit the completed processes
        pool.join()
        # print the output string for each download
        for input_file, result in output:
            logging.info(f'{str(input_file)}\n{result.get()}')
# PURPOSE: wrapper for running the sync program in multiprocessing mode
def multiprocess_sync(*args, **kwds):
    """
    Run http_pull_file within a worker process, logging any failure
    instead of propagating the exception back through the pool
    """
    try:
        # pass all positional and keyword arguments through to the download
        return http_pull_file(*args, **kwds)
    except Exception:
        # if there has been an error exception
        # print the type, value, and stack trace of the
        # current exception being handled
        logging.critical(f'process id {os.getpid():d} failed')
        logging.error(traceback.format_exc())
# PURPOSE: pull file from a remote host checking if file exists locally
# and if the remote file is newer than the local file
def http_pull_file(remote_file, remote_mtime, local_file,
    TIMEOUT=None,
    RETRY=1,
    MODE=0o775
    ):
    """
    Download a single remote file to a local path, preserving the
    remote modification time and setting local permissions

    Arguments:
        remote_file: remote URL of the file to download
        remote_mtime: remote modification time to stamp onto the local file
        local_file: local path for the downloaded file

    Keyword arguments:
        TIMEOUT: timeout in seconds for blocking operations
        RETRY: connection retry attempts
        MODE: permissions mode of the downloaded file

    Returns:
        output: string describing the remote-to-local transfer
    """
    # string describing the file transfer for logging
    transfer_notice = f'{str(remote_file)} -->\n\t{str(local_file)}\n'
    # attempt to download a file up to a set number of times
    # using 16 kB chunked transfer encoding
    retry_download(remote_file, LOCAL=local_file, TIMEOUT=TIMEOUT,
        RETRY=RETRY, CHUNK=16*1024)
    # keep remote modification time of file and local access time
    os.utime(local_file, (local_file.stat().st_atime, remote_mtime))
    local_file.chmod(mode=MODE)
    # return the output string
    return transfer_notice
# PURPOSE: Try downloading a file up to a set number of times
def retry_download(remote_file,
    LOCAL=None,
    TIMEOUT=None,
    RETRY=1,
    CHUNK=0
    ):
    """
    Attempt to download a remote file, retrying on failure or on an
    incomplete transfer up to a set number of attempts

    Arguments:
        remote_file: remote URL of the file to download

    Keyword arguments:
        LOCAL: local path for the downloaded file
        TIMEOUT: timeout in seconds for blocking operations
        RETRY: connection retry attempts
        CHUNK: chunk size for transfer encoding

    Raises:
        TimeoutError: if the maximum number of retries is reached
    """
    # attempt to download up to the number of retries
    retry_counter = 0
    while (retry_counter < RETRY):
        # attempt to retrieve file from https server
        try:
            # Create and submit request.
            # There are a range of exceptions that can be thrown here
            # including HTTPError and URLError.
            request = is2tk.utilities.urllib2.Request(remote_file)
            response = is2tk.utilities.urllib2.urlopen(request,
                timeout=TIMEOUT)
            # get the length of the remote file
            remote_length = int(response.headers['content-length'])
            # copy contents to file using chunked transfer encoding
            # transfer should work with ascii and binary data formats
            with open(LOCAL, 'wb') as f:
                shutil.copyfileobj(response, f, CHUNK)
            local_length = LOCAL.lstat().st_size
        except Exception:
            # narrow except: do not swallow KeyboardInterrupt/SystemExit
            # log the failure at debug level before retrying
            logging.debug(traceback.format_exc())
        else:
            # check that downloaded file matches original length
            if (local_length == remote_length):
                break
        # add to retry counter
        retry_counter += 1
    # check if maximum number of retries were reached
    if (retry_counter == RETRY):
        raise TimeoutError('Maximum number of retries reached')
# PURPOSE: create argument parser
def arguments():
    """
    Build the command line argument parser for the program

    Returns:
        parser: configured argparse.ArgumentParser instance
    """
    parser = argparse.ArgumentParser(
        description="""Program to acquire ICESat-2 datafiles from the NSIDC
            server that is associated with an input file
            """
    )
    # mapping of ICESat-2 product short names to descriptions
    PRODUCTS = {
        'ATL03': 'Global Geolocated Photon Data',
        'ATL04': 'Normalized Relative Backscatter',
        'ATL06': 'Land Ice Height',
        'ATL07': 'Sea Ice Height',
        'ATL08': 'Land and Vegetation Height',
        'ATL09': 'Atmospheric Layer Characteristics',
        'ATL10': 'Sea Ice Freeboard',
        'ATL12': 'Ocean Surface Height',
        'ATL13': 'Inland Water Surface Height',
    }
    # positional argument: input ICESat-2 files
    parser.add_argument('file',
        type=pathlib.Path, nargs='+',
        help='ICESat-2 products to associate')
    # NASA Earthdata credentials (default from environmental variables)
    parser.add_argument('--user','-U',
        type=str, default=os.environ.get('EARTHDATA_USERNAME'),
        help='Username for NASA Earthdata Login')
    parser.add_argument('--password','-W',
        type=str, default=os.environ.get('EARTHDATA_PASSWORD'),
        help='Password for NASA Earthdata Login')
    parser.add_argument('--netrc','-N',
        type=pathlib.Path,
        default=pathlib.Path.home().joinpath('.netrc'),
        help='Path to .netrc file for authentication')
    # working data directory (default: current working directory)
    parser.add_argument('--directory','-D',
        type=pathlib.Path,
        default=pathlib.Path.cwd(),
        help='Working data directory')
    # associated ICESat-2 data product to download
    parser.add_argument('--product','-p',
        metavar='PRODUCTS', type=str,
        choices=PRODUCTS.keys(), default='ATL06',
        help='Associated ICESat-2 data product to download')
    # download auxiliary files
    parser.add_argument('--auxiliary','-a',
        default=False, action='store_true',
        help='Sync ICESat-2 auxiliary files for each HDF5 file')
    # output subdirectories
    parser.add_argument('--flatten','-F',
        default=False, action='store_true',
        help='Do not create subdirectories')
    # run download in series if processes is 0
    parser.add_argument('--np','-P',
        metavar='PROCESSES', type=int, default=0,
        help='Number of processes to use in downloading files')
    # connection timeout and number of retry attempts
    parser.add_argument('--timeout','-T',
        type=int, default=120,
        help='Timeout in seconds for blocking operations')
    parser.add_argument('--retry','-R',
        type=int, default=5,
        help='Connection retry attempts')
    # permissions mode of the local directories and files (number in octal)
    parser.add_argument('--mode','-M',
        type=lambda x: int(x,base=8), default=0o775,
        help='Permissions mode of output files')
    # return the parser
    return parser
# This is the main part of the program that calls the individual functions
def main():
    """
    Command line entry point: parse arguments, authenticate with NASA
    Earthdata and download the associated ICESat-2 files
    """
    # Read the system arguments listed after the program
    args, _ = arguments().parse_known_args()
    # NASA Earthdata hostname
    HOST = 'urs.earthdata.nasa.gov'
    # build a urllib opener for NASA Earthdata
    # check internet connection before attempting to run program
    # check NASA earthdata credentials before attempting to run program
    opener = is2tk.utilities.attempt_login(HOST,
        username=args.user,
        password=args.password,
        netrc=args.netrc)
    # acquire the associated ICESat-2 data files
    nsidc_icesat2_associated(args.file, args.product,
        DIRECTORY=args.directory,
        AUXILIARY=args.auxiliary,
        FLATTEN=args.flatten,
        PROCESSES=args.np,
        TIMEOUT=args.timeout,
        RETRY=args.retry,
        MODE=args.mode)
# run main program when invoked as a script (not when imported as a module)
if __name__ == '__main__':
    main()