diff --git a/README.md b/README.md
index e1bfc9a..36184b4 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@ Data Ingress
 * [Load a JSON file into an SFrame](load_json.py)
 * [Load a collection XML files into an SFrame](sframe_xml_to_dict.py)
 * [Load an Avro file into an SFrame](load_avro.py)
+* [Load a file on HDFS into an SFrame](load_hdfs.py)
 
 Tabular Data Transformation
 -----------------------------
diff --git a/load_hdfs.py b/load_hdfs.py
new file mode 100644
index 0000000..a37bcd3
--- /dev/null
+++ b/load_hdfs.py
@@ -0,0 +1,35 @@
+import os
+import subprocess
+import graphlab as gl
+
+# Reading from HDFS into an SFrame is easy, as long as you know how to
+# construct your HDFS URL and your system has Java installed in a relatively
+# standard way. This how-to is meant to help if one of those two things is
+# not true for you.
+
+#### Installation-specific variables ####
+# Change these variables for your HDFS setup.
+
+hdfs_url_base = None
+# An example of what should be in this variable:
+#hdfs_url_base = 'hdfs://my.server.com:8020'
+
+username = 'evan'
+
+filepath = 'test.txt'
+
+#### Construct your HDFS URL ####
+# If you don't know how to get the server and port to reach your HDFS
+# installation, here's a way to do it that works on CDH 5.
+if hdfs_url_base is None:
+    hdfs_url_base = subprocess.check_output(
+        ['hdfs', 'getconf', '-confKey', 'fs.defaultFS']).rstrip()
+
+#### Specify a Java installation (OPTIONAL) ####
+# To use a specific Java installation to execute the HDFS commands, set this
+# environment variable BEFORE running any GraphLab Create commands, e.g.:
+#os.environ['GRAPHLAB_JAVA_HOME'] = '/foo/java'
+
+sf = gl.SFrame.read_csv(hdfs_url_base + '/user/' + username + '/' + filepath)
+
+print sf
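
A minimal sketch of the reverse direction, reusing the placeholders defined in load_hdfs.py (hdfs_url_base, username) and assuming SFrame.save accepts hdfs:// URLs in the same form read_csv does; the output name is hypothetical:

# Hypothetical output path built from the same URL pieces as the input.
out_url = hdfs_url_base + '/user/' + username + '/loaded_copy.csv'
# Write the loaded SFrame back to HDFS as CSV; format='binary' (the default)
# would instead store it in the native SFrame format for faster reloads.
sf.save(out_url, format='csv')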