@@ -2,7 +2,7 @@

> Utility belt to handle data on AWS.

- [![Release](https://img.shields.io/badge/release-0.1.0-brightgreen.svg)](https://pypi.org/project/awswrangler/)
+ [![Release](https://img.shields.io/badge/release-0.1.1-brightgreen.svg)](https://pypi.org/project/awswrangler/)
[![Downloads](https://img.shields.io/pypi/dm/awswrangler.svg)](https://pypi.org/project/awswrangler/)
[![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7-brightgreen.svg)](https://pypi.org/project/awswrangler/)
[![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/en/latest/?badge=latest)
@@ -28,7 +28,7 @@
* Pandas -> CSV (S3) (Parallel)
* Pandas -> Glue Catalog Table
* Pandas -> Athena (Parallel)
- * Pandas -> Redshift (Parallel)
+ * Pandas -> Redshift (Append/Overwrite/Upsert) (Parallel) (sketch after the feature list)
* Parquet (S3) -> Pandas (Parallel) (NEW :star:)
* CSV (S3) -> Pandas (One shot or Batching)
* Glue Catalog Table -> Pandas (Parallel) (NEW :star:)
@@ -61,6 +61,7 @@
* Get EMR step state
* Athena query to receive the result as python primitives (*Iterable[Dict[str, Any]]*)
+ * Load and unzip SageMaker job outputs

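The Redshift upsert support flagged above is the headline change here. Below is a minimal sketch of how it might be driven through `wr.pandas.to_redshift`, reusing the arguments from the Redshift example further down; the `mode="upsert"` value and the `primary_keys` argument are illustrative assumptions, not confirmed API:

```py3
import awswrangler as wr

# Hypothetical upsert sketch: rows matching the primary keys are assumed to
# be replaced, while all remaining rows are appended.
wr.pandas.to_redshift(
    dataframe=df,              # Pandas DataFrame to load
    path="s3://temp_path",     # staging prefix used for the parallel load
    schema="...",
    table="...",
    connection=con,            # existing Redshift connection
    iam_role="YOUR_ROLE_ARN",
    mode="upsert",             # assumed value; append/overwrite would be the other modes
    primary_keys=["id"],       # assumed argument naming the match keys
)
```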
## Installation

@@ -84,7 +85,7 @@ Runs anywhere (AWS Lambda, AWS Glue Python Shell, EMR, EC2, on-premises, local,
import awswrangler as wr

wr.pandas.to_parquet(
-   dataframe=dataframe,
+   dataframe=df,
    database="database",
    path="s3://...",
    partition_cols=["col_name"],
@@ -113,7 +114,7 @@ sess.pandas.to_parquet(
```py3
import awswrangler as wr

- dataframe = wr.pandas.read_sql_athena(
+ df = wr.pandas.read_sql_athena(
    sql="select * from table",
    database="database"
)
@@ -140,7 +141,7 @@ for df in df_iter:
import awswrangler as wr

sess = wr.Session(athena_ctas_approach=True)
- dataframe = sess.pandas.read_sql_athena(
+ df = sess.pandas.read_sql_athena(
    sql="select * from table",
    database="database"
)
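# Note: athena_ctas_approach=True is expected to wrap the query in a CTAS
# statement and read the result back from the Parquet files it produces,
# which is typically faster for large result sets than parsing Athena's
# default CSV output.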
@@ -151,7 +152,7 @@ dataframe = sess.pandas.read_sql_athena(
```py3
import awswrangler as wr

- dataframe = wr.pandas.read_csv(path="s3://...")
+ df = wr.pandas.read_csv(path="s3://...")
```

#### Reading from S3 (CSV) to Pandas in chunks (For memory restrictions)
@@ -173,7 +174,7 @@ for df in df_iter:
```py3
import awswrangler as wr

- dataframe = wr.pandas.read_log_query(
+ df = wr.pandas.read_log_query(
    log_group_names=[LOG_GROUP_NAME],
    query="fields @timestamp, @message | sort @timestamp desc | limit 5",
)
@@ -190,7 +191,7 @@ df = pandas.read_... # Read from anywhere
# Typical Pandas, Numpy or Pyarrow transformation HERE!

wr.pandas.to_parquet( # Storing the data and metadata to Data Lake
-   dataframe=dataframe,
+   dataframe=df,
    database="database",
    path="s3://...",
    partition_cols=["col_name"],
@@ -203,7 +204,7 @@ wr.pandas.to_parquet( # Storing the data and metadata to Data Lake
import awswrangler as wr

wr.pandas.to_redshift(
-   dataframe=dataframe,
+   dataframe=df,
    path="s3://temp_path",
    schema="...",
    table="...",
@@ -219,7 +220,7 @@ wr.pandas.to_redshift(
```py3
import awswrangler as wr

- dataframe = wr.pandas.read_sql_redshift(
+ df = wr.pandas.read_sql_redshift(
    sql="SELECT ...",
    iam_role="YOUR_ROLE_ARN",
    connection=con,
@@ -268,6 +269,7 @@ sess.spark.create_glue_table(

```py3
import awswrangler as wr
+
sess = wr.Session(spark_session=spark)
dfs = sess.spark.flatten(dataframe=df_nested)
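# dfs is expected to map each flattened child table name to its DataFrame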
for name, df_flat in dfs.items():
@@ -367,6 +369,14 @@ for row in wr.athena.query(query="...", database="..."):
    print(row)
```

+ #### Load and unzip SageMaker job output
+
+ ```py3
+ import awswrangler as wr
+
+ outputs = wr.sagemaker.get_job_outputs("s3://...")
+ ```
+
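As the feature list above puts it, this loads and unzips the SageMaker job outputs found under the given S3 path, returning the loaded contents as `outputs`.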
## Diving Deep

### Parallelism, Non-picklable objects and GeoPandas
@@ -397,14 +407,14 @@ To work with null object columns you can explicitly set the expected Athena data
import awswrangler as wr
import pandas as pd

- dataframe = pd.DataFrame({
+ df = pd.DataFrame({
    "col": [1, 2],
    "col_string_null": [None, None],
    "col_date_null": [None, None],
})

wr.pandas.to_parquet(
-   dataframe=dataframe,
+   dataframe=df,
    database="DATABASE",
    path=f"s3://...",
    cast_columns={