Commit d587118

Fixing bug for boolean type on spark.to_redshift()
Parent: 7f0e704
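The substantive change is a one-line fix in spark2redshift(): PySpark's DataFrame.dtypes reports a BooleanType column as the string "boolean", but the old mapping only matched "bool", so boolean columns were never recognized (presumably falling through to the function's unsupported-type handling, which is not shown in this diff). A minimal standalone sketch of the before/after behavior, simplified from awswrangler/data_types.py -- the ValueError fallback here is an assumption for illustration, not the library's actual error path:

    # Sketch only; simplified from awswrangler/data_types.py.
    def spark2redshift_before(dtype: str) -> str:
        if dtype == "bool":  # never matches: Spark reports "boolean"
            return "BOOLEAN"
        raise ValueError(f"Unsupported type: {dtype}")  # hypothetical fallback

    def spark2redshift_after(dtype: str) -> str:
        if dtype in ("bool", "boolean"):  # accept both spellings
            return "BOOLEAN"
        raise ValueError(f"Unsupported type: {dtype}")  # hypothetical fallback

    print(spark2redshift_after("boolean"))  # -> BOOLEAN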

File tree: 5 files changed (+73, -37 lines)

  awswrangler/data_types.py
  requirements-dev.txt
  requirements.txt
  setup.py
  testing/test_awswrangler/test_redshift.py

awswrangler/data_types.py
Lines changed: 30 additions & 30 deletions

@@ -12,15 +12,15 @@
 
 def athena2pandas(dtype: str) -> str:
     dtype = dtype.lower()
-    if dtype in ["int", "integer", "bigint", "smallint", "tinyint"]:
+    if dtype in ("int", "integer", "bigint", "smallint", "tinyint"):
         return "Int64"
-    elif dtype in ["float", "double", "real"]:
+    elif dtype in ("float", "double", "real"):
         return "float64"
     elif dtype == "boolean":
         return "bool"
-    elif dtype in ["string", "char", "varchar"]:
+    elif dtype in ("string", "char", "varchar"):
         return "str"
-    elif dtype in ["timestamp", "timestamp with time zone"]:
+    elif dtype in ("timestamp", "timestamp with time zone"):
         return "datetime64"
     elif dtype == "date":
         return "date"

@@ -36,17 +36,17 @@ def athena2pyarrow(dtype: str) -> str:
         return "int8"
     if dtype == "smallint":
         return "int16"
-    elif dtype in ["int", "integer"]:
+    elif dtype in ("int", "integer"):
         return "int32"
     elif dtype == "bigint":
         return "int64"
     elif dtype == "float":
         return "float32"
     elif dtype == "double":
         return "float64"
-    elif dtype in ["boolean", "bool"]:
+    elif dtype in ("boolean", "bool"):
         return "bool"
-    elif dtype in ["string", "char", "varchar", "array", "row", "map"]:
+    elif dtype in ("string", "char", "varchar", "array", "row", "map"):
         return "string"
     elif dtype == "timestamp":
         return "timestamp[ns]"

@@ -58,13 +58,13 @@ def athena2pyarrow(dtype: str) -> str:
 
 def athena2python(dtype: str) -> Optional[type]:
     dtype = dtype.lower()
-    if dtype in ["int", "integer", "bigint", "smallint", "tinyint"]:
+    if dtype in ("int", "integer", "bigint", "smallint", "tinyint"):
         return int
-    elif dtype in ["float", "double", "real"]:
+    elif dtype in ("float", "double", "real"):
         return float
     elif dtype == "boolean":
         return bool
-    elif dtype in ["string", "char", "varchar", "array", "row", "map"]:
+    elif dtype in ("string", "char", "varchar", "array", "row", "map"):
         return str
     elif dtype == "timestamp":
         return datetime

@@ -80,17 +80,17 @@ def athena2redshift(dtype: str) -> str:
     dtype = dtype.lower()
     if dtype == "smallint":
         return "SMALLINT"
-    elif dtype in ["int", "integer"]:
+    elif dtype in ("int", "integer"):
         return "INTEGER"
     elif dtype == "bigint":
         return "BIGINT"
     elif dtype == "float":
         return "FLOAT4"
     elif dtype == "double":
         return "FLOAT8"
-    elif dtype in ["boolean", "bool"]:
+    elif dtype in ("boolean", "bool"):
         return "BOOL"
-    elif dtype in ["string", "char", "varchar", "array", "row", "map"]:
+    elif dtype in ("string", "char", "varchar", "array", "row", "map"):
         return "VARCHAR(256)"
     elif dtype == "timestamp":
         return "TIMESTAMP"

@@ -104,7 +104,7 @@ def pandas2athena(dtype: str) -> str:
     dtype = dtype.lower()
     if dtype == "int32":
         return "int"
-    elif dtype in ["int64", "Int64"]:
+    elif dtype in ("int64", "Int64"):
         return "bigint"
     elif dtype == "float32":
         return "float"

@@ -214,19 +214,19 @@ def python2athena(python_type: type) -> str:
 
 def redshift2athena(dtype: str) -> str:
     dtype_str = str(dtype)
-    if dtype_str in ["SMALLINT", "INT2"]:
+    if dtype_str in ("SMALLINT", "INT2"):
         return "smallint"
-    elif dtype_str in ["INTEGER", "INT", "INT4"]:
+    elif dtype_str in ("INTEGER", "INT", "INT4"):
         return "int"
-    elif dtype_str in ["BIGINT", "INT8"]:
+    elif dtype_str in ("BIGINT", "INT8"):
         return "bigint"
-    elif dtype_str in ["REAL", "FLOAT4"]:
+    elif dtype_str in ("REAL", "FLOAT4"):
         return "float"
-    elif dtype_str in ["DOUBLE PRECISION", "FLOAT8", "FLOAT"]:
+    elif dtype_str in ("DOUBLE PRECISION", "FLOAT8", "FLOAT"):
         return "double"
-    elif dtype_str in ["BOOLEAN", "BOOL"]:
+    elif dtype_str in ("BOOLEAN", "BOOL"):
         return "boolean"
-    elif dtype_str in ["VARCHAR", "CHARACTER VARYING", "NVARCHAR", "TEXT"]:
+    elif dtype_str in ("VARCHAR", "CHARACTER VARYING", "NVARCHAR", "TEXT"):
         return "string"
     elif dtype_str == "DATE":
         return "date"

@@ -238,19 +238,19 @@ def redshift2athena(dtype: str) -> str:
 
 def redshift2pyarrow(dtype: str) -> str:
     dtype_str: str = str(dtype)
-    if dtype_str in ["SMALLINT", "INT2"]:
+    if dtype_str in ("SMALLINT", "INT2"):
         return "int16"
-    elif dtype_str in ["INTEGER", "INT", "INT4"]:
+    elif dtype_str in ("INTEGER", "INT", "INT4"):
         return "int32"
-    elif dtype_str in ["BIGINT", "INT8"]:
+    elif dtype_str in ("BIGINT", "INT8"):
         return "int64"
-    elif dtype_str in ["REAL", "FLOAT4"]:
+    elif dtype_str in ("REAL", "FLOAT4"):
         return "float32"
-    elif dtype_str in ["DOUBLE PRECISION", "FLOAT8", "FLOAT"]:
+    elif dtype_str in ("DOUBLE PRECISION", "FLOAT8", "FLOAT"):
         return "float64"
-    elif dtype_str in ["BOOLEAN", "BOOL"]:
+    elif dtype_str in ("BOOLEAN", "BOOL"):
         return "bool"
-    elif dtype_str in ["VARCHAR", "CHARACTER VARYING", "NVARCHAR", "TEXT"]:
+    elif dtype_str in ("VARCHAR", "CHARACTER VARYING", "NVARCHAR", "TEXT"):
         return "string"
     elif dtype_str == "DATE":
         return "date32"

@@ -272,7 +272,7 @@ def spark2redshift(dtype: str) -> str:
         return "FLOAT4"
     elif dtype == "double":
         return "FLOAT8"
-    elif dtype == "bool":
+    elif dtype in ("bool", "boolean"):
         return "BOOLEAN"
     elif dtype == "timestamp":
         return "TIMESTAMP"

@@ -308,7 +308,7 @@ def extract_pyarrow_schema_from_pandas(dataframe: pd.DataFrame, preserve_index:
     """
     cols = []
     cols_dtypes = {}
-    if indexes_position not in ["right", "left"]:
+    if indexes_position not in ("right", "left"):
         raise ValueError(f"indexes_position must be \"right\" or \"left\"")
 
     # Handle exception data types (e.g. Int64)
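Aside from the spark2redshift() fix, the rest of this file's churn is a mechanical swap of list literals for tuples in membership tests. The two forms are behaviorally identical for `in` checks; the tuple is the conventional choice because it reads as a fixed set of constants (and CPython can fold a constant tuple directly into the compiled bytecode). A quick illustration:

    dtype = "bigint"
    # Same result either way; the tuple form is the idiomatic constant set.
    assert (dtype in ["int", "bigint"]) == (dtype in ("int", "bigint"))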

requirements-dev.txt
Lines changed: 2 additions & 2 deletions

@@ -2,8 +2,8 @@ yapf~=0.28.0
 mypy~=0.740
 flake8~=3.7.9
 pytest-cov~=2.8.1
-cfn-lint~=0.25.0
-twine~=2.0.0
+cfn-lint~=0.25.2
+twine~=3.0.0
 wheel~=0.33.6
 sphinx~=2.2.1
 pyspark~=2.4.4

requirements.txt
Lines changed: 2 additions & 2 deletions

@@ -1,8 +1,8 @@
 numpy~=1.17.4
 pandas~=0.25.3
 pyarrow~=0.15.1
-botocore~=1.13.18
-boto3~=1.10.18
+botocore~=1.13.21
+boto3~=1.10.21
 s3fs~=0.4.0
 tenacity~=6.0.0
 pg8000~=1.13.2

setup.py
Lines changed: 2 additions & 2 deletions

@@ -24,8 +24,8 @@
         "numpy~=1.17.4",
         "pandas~=0.25.3",
         "pyarrow~=0.15.1",
-        "botocore~=1.13.18",
-        "boto3~=1.10.18",
+        "botocore~=1.13.21",
+        "boto3~=1.10.21",
         "s3fs~=0.4.0",
         "tenacity~=6.0.0",
         "pg8000~=1.13.2",

testing/test_awswrangler/test_redshift.py
Lines changed: 37 additions & 1 deletion

@@ -280,6 +280,43 @@ def test_to_redshift_spark_big(session, bucket, redshift_parameters):
     assert len(list(dataframe.columns)) == len(list(rows[0]))
 
 
+def test_to_redshift_spark_bool(session, bucket, redshift_parameters):
+    dataframe = session.spark_session.createDataFrame(
+        pd.DataFrame({
+            "A": [1, 2, 3],
+            "B": [True, False, True]
+        })
+    )
+    print(dataframe)
+    print(dataframe.dtypes)
+    con = Redshift.generate_connection(
+        database="test",
+        host=redshift_parameters.get("RedshiftAddress"),
+        port=redshift_parameters.get("RedshiftPort"),
+        user="test",
+        password=redshift_parameters.get("RedshiftPassword"),
+    )
+    session.spark.to_redshift(
+        dataframe=dataframe,
+        path=f"s3://{bucket}/redshift-load-bool/",
+        connection=con,
+        schema="public",
+        table="test",
+        iam_role=redshift_parameters.get("RedshiftRole"),
+        mode="overwrite",
+        min_num_partitions=1,
+    )
+    cursor = con.cursor()
+    cursor.execute("SELECT * from public.test")
+    rows = cursor.fetchall()
+    cursor.close()
+    con.close()
+    assert dataframe.count() == len(rows)
+    assert len(list(dataframe.columns)) == len(list(rows[0]))
+    assert type(rows[0][0]) == int
+    assert type(rows[0][1]) == bool
+
+
 def test_stress_to_redshift_spark_big(session, bucket, redshift_parameters):
     dataframe = session.spark_session.createDataFrame(
         pd.DataFrame({

@@ -288,7 +325,6 @@ def test_stress_to_redshift_spark_big(session, bucket, redshift_parameters):
             "C": list(range(1_000_000))
         }))
     dataframe.cache()
-
     for i in range(10):
         print(f"Run number: {i}")
         con = Redshift.generate_connection(
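The new test is an integration test: it round-trips a Spark DataFrame with a boolean column through S3 into a Redshift table and asserts the column comes back as a Python bool. A sketch of running just this test, assuming the repo's live-AWS fixtures (session, bucket, redshift_parameters) resolve against a provisioned cluster:

    # Hypothetical invocation; requires the repo's live-AWS test fixtures.
    import pytest
    pytest.main(["testing/test_awswrangler/test_redshift.py::test_to_redshift_spark_bool"])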
