Merge pull request #6 from memsql/pipe_through_script

Hurshal Patel · Hurshal Patel · commit 3b4c48a2aec7 · 2015-04-30T14:33:44.000-07:00
support piping load through a script
diff --git a/memsql_loader/execution/downloader.py b/memsql_loader/execution/downloader.py
@@ -1,4 +1,5 @@
 import pycurl
+import subprocess
 import select
 import sys
 import threading
@@ -16,6 +17,7 @@
 import pywebhdfs.errors
 
 DOWNLOAD_TIMEOUT = 30
+SCRIPT_EXIT_TIMEOUT = 30
 
 class DownloadMetrics(object):
     def __init__(self, total_size):
@@ -97,6 +99,7 @@ def load(self, job, task, fifo):
         self.task = task
         self.fifo = fifo
         self.key = None
+        self.script_proc = None
 
         if task.data['scheme'] == 's3':
             self.is_anonymous = job.spec.source.aws_access_key is None or job.spec.source.aws_secret_key is None
@@ -173,7 +176,25 @@ def run(self):
                     curl.setopt(pycurl.SSL_VERIFYPEER, 0)
                     curl.setopt(pycurl.SSL_VERIFYHOST, 0)
                     curl.setopt(pycurl.CONNECTTIMEOUT, 30)
-                    curl.setopt(pycurl.WRITEFUNCTION, self._write_to_fifo(target_file))
+
+                    if self.job.spec.options.script is not None:
+                        self.script_proc = subprocess.Popen(
+                            ["/bin/bash", "-c", self.job.spec.options.script],
+                            stdout=target_file.fileno(),
+                            stdin=subprocess.PIPE)
+
+                        # Check that the script hasn't errored before we start downloading.
+                        # NOTE: we sleep briefly here so that a script which exits prematurely
+                        # can be detected; in that case we fail the job without requeueing.
+                        time.sleep(1)
+                        if self.script_proc.poll() is not None:
+                            self.logger.error('Script `%s` exited prematurely with return code %d' % (self.job.spec.options.script, self.script_proc.returncode))
+                            raise WorkerException('Script `%s` exited prematurely with return code %d' % (self.job.spec.options.script, self.script_proc.returncode))
+
+                        curl.setopt(pycurl.WRITEFUNCTION, self._write_to_fifo(self.script_proc.stdin))
+                    else:
+                        curl.setopt(pycurl.WRITEFUNCTION, self._write_to_fifo(target_file))
+
                     if self.task.data['scheme'] == 'hdfs':
                         curl.setopt(pycurl.FOLLOWLOCATION, True)
 
@@ -187,9 +208,32 @@ def run(self):
                         # Catch HTTP client errors, e.g. 404:
                         if status_code >= 400 and status_code < 500:
                             raise WorkerException('HTTP status code %s for file %s' % (status_code, self.key.name))
+
+                        # If we're piping data through a script, close its stdin, wait for it to exit, and catch timeouts and non-zero return codes
+                        if self.script_proc is not None:
+                            self.script_proc.stdin.close()
+                            for i in range(SCRIPT_EXIT_TIMEOUT):
+                                if self.script_proc.poll() is not None:
+                                    break
+
+                                time.sleep(1)
+                            else:
+                                self.logger.error('Script `%s` failed to exit...killing' % self.job.spec.options.script)
+                                self.script_proc.kill()
+                                raise WorkerException('Script `%s` failed to exit after %d seconds' % (self.job.spec.options.script, SCRIPT_EXIT_TIMEOUT))
+
+                            if self.script_proc.returncode != 0:
+                                self.logger.error('Script `%s` exited with return code %d' % (self.job.spec.options.script, self.script_proc.returncode))
+                                raise WorkerException('Script `%s` exited with return code %d' % (self.job.spec.options.script, self.script_proc.returncode))
                     finally:
                         with self.task.protect():
                             self.task.stop_step('download')
+
+                            if self.script_proc is not None and self.script_proc.returncode is not None:
+                                try:
+                                    self.script_proc.kill()
+                                except OSError as e:
+                                    self.logger.warn("Failed to kill script `%s`: %s" % (self.job.spec.options.script, str(e)))
             except pycurl.error as e:
                 errno = e.args[0]
                 if errno == pycurl.E_ABORTED_BY_CALLBACK and not self._should_exit:
diff --git a/memsql_loader/util/schema.py b/memsql_loader/util/schema.py
@@ -1,4 +1,5 @@
 import os
+import shlex
 import urlparse
 import voluptuous as V
 
@@ -65,6 +66,7 @@ def get_spec_validator():
         V.Required("file_id_column", default=None): V.Any(basestring, None),
         V.Required("non_local_load", default=False): bool,
         V.Required("duplicate_key_method", default="error"): V.Any("error", "replace", "ignore"),
+        V.Required("script", default=None): V.Any(basestring, None)
     })
 
     _db_schema = V.Schema({
@@ -187,4 +189,9 @@ def validate_spec(spec):
             if file_id_column in spec['options']['columns']:
                 raise V.Invalid('options.columns can not contain the file_id_column, it will be filled in by MemSQL-Loader',
                     path=[ 'options', 'columns' ])
+    if spec.options.script is not None:
+        try:
+            shlex.split(spec.options.script)
+        except ValueError as e:
+            raise V.Invalid('options.script is invalid: %s' % str(e), path=[ 'options', 'script' ])
     return spec