1
1
import pycurl
2
+ import subprocess
2
3
import select
3
4
import sys
4
5
import threading
16
17
import pywebhdfs .errors
17
18
18
19
DOWNLOAD_TIMEOUT = 30
20
+ SCRIPT_EXIT_TIMEOUT = 30
19
21
20
22
class DownloadMetrics (object ):
21
23
def __init__ (self , total_size ):
@@ -97,6 +99,7 @@ def load(self, job, task, fifo):
97
99
self .task = task
98
100
self .fifo = fifo
99
101
self .key = None
102
+ self .script_proc = None
100
103
101
104
if task .data ['scheme' ] == 's3' :
102
105
self .is_anonymous = job .spec .source .aws_access_key is None or job .spec .source .aws_secret_key is None
@@ -173,7 +176,25 @@ def run(self):
173
176
curl .setopt (pycurl .SSL_VERIFYPEER , 0 )
174
177
curl .setopt (pycurl .SSL_VERIFYHOST , 0 )
175
178
curl .setopt (pycurl .CONNECTTIMEOUT , 30 )
176
- curl .setopt (pycurl .WRITEFUNCTION , self ._write_to_fifo (target_file ))
179
+
180
+ if self .job .spec .options .script is not None :
181
+ self .script_proc = subprocess .Popen (
182
+ ["/bin/bash" , "-c" , self .job .spec .options .script ],
183
+ stdout = target_file .fileno (),
184
+ stdin = subprocess .PIPE )
185
+
186
+ # check that script hasn't errored before downloading
187
+ # NOTE: we wait briefly here so that we can detect a script that exits prematurely;
188
+ # if that happens, we fail the job without requeueing.
189
+ time .sleep (1 )
190
+ if self .script_proc .poll () is not None :
191
+ self .logger .error ('Script `%s` exited prematurely with return code %d' % (self .job .spec .options .script , self .script_proc .returncode ))
192
+ raise WorkerException ('Script `%s` exited prematurely with return code %d' % (self .job .spec .options .script , self .script_proc .returncode ))
193
+
194
+ curl .setopt (pycurl .WRITEFUNCTION , self ._write_to_fifo (self .script_proc .stdin ))
195
+ else :
196
+ curl .setopt (pycurl .WRITEFUNCTION , self ._write_to_fifo (target_file ))
197
+
177
198
if self .task .data ['scheme' ] == 'hdfs' :
178
199
curl .setopt (pycurl .FOLLOWLOCATION , True )
179
200
@@ -187,9 +208,32 @@ def run(self):
187
208
# Catch HTTP client errors, e.g. 404:
188
209
if status_code >= 400 and status_code < 500 :
189
210
raise WorkerException ('HTTP status code %s for file %s' % (status_code , self .key .name ))
211
+
212
+ # If we're piping data through a script, catch timeouts and return codes
213
+ if self .script_proc is not None :
214
+ self .script_proc .stdin .close ()
215
+ for i in range (SCRIPT_EXIT_TIMEOUT ):
216
+ if self .script_proc .poll () is not None :
217
+ break
218
+
219
+ time .sleep (1 )
220
+ else :
221
+ self .logger .error ('Script `%s` failed to exit...killing' % self .job .spec .options .script )
222
+ self .script_proc .kill ()
223
+ raise WorkerException ('Script `%s` failed to exit after %d seconds' % (self .job .spec .options .script , SCRIPT_EXIT_TIMEOUT ))
224
+
225
+ if self .script_proc .returncode != 0 :
226
+ self .logger .error ('Script `%s` exited with return code %d' % (self .job .spec .options .script , self .script_proc .returncode ))
227
+ raise WorkerException ('Script `%s` exited with return code %d' % (self .job .spec .options .script , self .script_proc .returncode ))
190
228
finally :
191
229
with self .task .protect ():
192
230
self .task .stop_step ('download' )
231
+
232
+ if self .script_proc is not None and self .script_proc .returncode is not None :
233
+ try :
234
+ self .script_proc .kill ()
235
+ except OSError as e :
236
+ self .logger .warn ("Failed to kill script `%s`: %s" % (self .job .spec .options .script , str (e )))
193
237
except pycurl .error as e :
194
238
errno = e .args [0 ]
195
239
if errno == pycurl .E_ABORTED_BY_CALLBACK and not self ._should_exit :
0 commit comments