This repository was archived by the owner on Nov 16, 2023. It is now read-only.

Commit d2be374

Author: saidbleik
Commit message: minor edits
1 parent 37f804b

4 files changed, with 26 additions (+) and 41 deletions (-)


utils_nlp/dataset/__init__.py

Lines changed: 3 additions & 3 deletions
@@ -9,6 +9,6 @@


 class Split(str, Enum):
-    TRAIN : str = "train"
-    DEV : str = "dev"
-    TEST : str = "test"
+    TRAIN: str = "train"
+    DEV: str = "dev"
+    TEST: str = "test"
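The change here is cosmetic (PEP 8 spacing around the annotation colons). For context, a minimal sketch of how this str-backed Enum behaves; the assertions and the SNLI file-name pattern are illustrative, not from the diff:

    from enum import Enum

    class Split(str, Enum):
        TRAIN: str = "train"
        DEV: str = "dev"
        TEST: str = "test"

    # Because Split subclasses str, its members compare equal to plain strings
    # and can be used directly in string operations such as building file names.
    assert Split.TRAIN == "train"
    assert "snli_1.0_" + Split.DEV + ".txt" == "snli_1.0_dev.txt"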

utils_nlp/dataset/snli.py

Lines changed: 14 additions & 24 deletions
@@ -20,17 +20,15 @@
 LABEL_COL = "score"


-def load_pandas_df(
-    local_cache_path=None, file_split=Split.TRAIN, file_type="txt", nrows=None
-):
+def load_pandas_df(local_cache_path=None, file_split=Split.TRAIN, file_type="txt", nrows=None):
     """
     Loads the SNLI dataset as pd.DataFrame
     Download the dataset from "https://nlp.stanford.edu/projects/snli/snli_1.0.zip", unzip, and load

     Args:
         local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
-                If None, all the intermediate files will be stored in a temporary directory and removed
-                after use.
+            If None, all the intermediate files will be stored in a temporary directory and removed
+            after use.
         file_split (str): File split to load, defaults to "train"
         file_type (str): File type to load, defaults to "txt"
         nrows (int): Number of rows to load, defaults to None (in which all rows will be returned)

@@ -78,12 +76,8 @@ def _maybe_download_and_extract(zip_path, file_split, file_type):
     extract_path = os.path.join(dir_path, file_name)

     if not os.path.exists(extract_path):
-        dpath = download_snli(zip_path)
-        extract_snli(
-            zip_path,
-            source_path=SNLI_DIRNAME + "/" + file_name,
-            dest_path=extract_path,
-        )
+        _ = download_snli(zip_path)
+        extract_snli(zip_path, source_path=SNLI_DIRNAME + "/" + file_name, dest_path=extract_path)

     return extract_path

@@ -143,24 +137,20 @@ def clean_cols(df):
     )

     snli_df = snli_df.rename(
-        columns={
-            "sentence1": S1_COL,
-            "sentence2": S2_COL,
-            "gold_label": LABEL_COL,
-        }
+        columns={"sentence1": S1_COL, "sentence2": S2_COL, "gold_label": LABEL_COL}
     )

     return snli_df


 def clean_rows(df, label_col=LABEL_COL):
     """Drop badly formatted rows from the input dataframe
-
+
     Args:
         df (pd.DataFrame): Input dataframe
         label_col (str): Name of label column.
-                Defaults to the standardized column name that is set after running the clean_col method.
-
+            Defaults to the standardized column name that is set after running the clean_col method.
+
     Returns:
         pd.DataFrame
     """

@@ -169,23 +159,23 @@ def clean_rows(df, label_col=LABEL_COL):

     return snli_df

+
 def clean_df(df, label_col=LABEL_COL):
     df = clean_cols(df)
     df = clean_rows(df, label_col)

     return df

-def load_azureml_df(
-    local_cache_path=None, file_split=Split.TRAIN, file_type="txt"
-):
+
+def load_azureml_df(local_cache_path=None, file_split=Split.TRAIN, file_type="txt"):
     """
     Loads the SNLI dataset as AzureML dataflow object
     Download the dataset from "https://nlp.stanford.edu/projects/snli/snli_1.0.zip", unzip, and load.

     Args:
         local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
-                If None, all the intermediate files will be stored in a temporary directory and removed
-                after use.
+            If None, all the intermediate files will be stored in a temporary directory and removed
+            after use.
         file_split (str): File split to load. One of (dev, test, train)
         file_type (str): File type to load. One of (txt, jsonl)
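Taken together, the snli.py changes are formatting-only: multi-line signatures and call sites are collapsed onto single lines, docstring indentation and blank-line spacing are normalized, and the unused dpath binding becomes _. For orientation, a minimal usage sketch of the public loaders shown in this diff; the cache path and row count are illustrative:

    from utils_nlp.dataset import Split
    from utils_nlp.dataset import snli

    # First call downloads and unzips snli_1.0.zip into the cache path.
    df = snli.load_pandas_df(
        local_cache_path="./data", file_split=Split.DEV, file_type="txt", nrows=1000
    )

    # Standardize column names and drop badly formatted rows.
    df = snli.clean_df(df)
    print(df.columns)  # includes the renamed S1_COL, S2_COL, and the "score" label column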

utils_nlp/dataset/wikigold.py

Lines changed: 3 additions & 9 deletions
@@ -14,9 +14,7 @@
 )


-def load_train_test_dfs(
-    local_cache_path="./", test_percentage=0.5, random_seed=None
-):
+def load_train_test_dfs(local_cache_path="./", test_percentage=0.5, random_seed=None):
     """
     Get the training and testing data frames based on test_percentage.

@@ -58,13 +56,9 @@ def load_train_test_dfs(
     train_sentence_list = sentence_list[test_sentence_count:]
     train_labels_list = labels_list[test_sentence_count:]

-    train_df = pd.DataFrame(
-        {"sentence": train_sentence_list, "labels": train_labels_list}
-    )
+    train_df = pd.DataFrame({"sentence": train_sentence_list, "labels": train_labels_list})

-    test_df = pd.DataFrame(
-        {"sentence": test_sentence_list, "labels": test_labels_list}
-    )
+    test_df = pd.DataFrame({"sentence": test_sentence_list, "labels": test_labels_list})

     return (train_df, test_df)
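As above, only line wrapping changes here. A short usage sketch of the refactored function, with an assumed cache directory and seed for illustration:

    from utils_nlp.dataset.wikigold import load_train_test_dfs

    # Split wikigold 50/50 into train and test; fix the seed for a reproducible split.
    train_df, test_df = load_train_test_dfs(
        local_cache_path="./data", test_percentage=0.5, random_seed=42
    )
    print(list(train_df.columns))  # ["sentence", "labels"], per the DataFrame construction above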

utils_nlp/dataset/xnli_torch_dataset.py

Lines changed: 6 additions & 5 deletions
@@ -60,15 +60,16 @@ def __init__(
         Load the dataset here
         Args:
             file_split (str, optional):The subset to load.
-                One of: {"train", "dev", "test"}
-                Defaults to "train".
+                One of: {"train", "dev", "test"}
+                Defaults to "train".
             cache_dir (str, optional):Path to store the data.
-                Defaults to "./".
+                Defaults to "./".
             language(str):Language required to load which xnli file (eg - "en", "zh")
             to_lowercase(bool):flag to convert samples in dataset to lowercase
             tok_language(Language, optional): language (Language, optional): The pretrained model's language.
-                Defaults to Language.ENGLISH.
-            data_percent_used(float, optional): Data used to create Torch Dataset.Defaults to "1.0" which is 100% data
+                Defaults to Language.ENGLISH.
+            data_percent_used(float, optional): Data used to create Torch Dataset.
+                Defaults to "1.0" which is 100% data
         """
         if file_split not in VALID_FILE_SPLIT:
             raise ValueError("The file split is not part of ", VALID_FILE_SPLIT)
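This hunk only rewraps the constructor's docstring. For orientation, a hypothetical construction call matching the documented arguments; the class name XnliDataset is an assumption, since only __init__'s docstring appears in this diff:

    # "XnliDataset" is an assumed class name -- not shown in this hunk.
    from utils_nlp.dataset.xnli_torch_dataset import XnliDataset

    dataset = XnliDataset(
        file_split="dev",       # one of {"train", "dev", "test"}
        cache_dir="./",
        language="en",          # which xnli file to load
        to_lowercase=True,
        data_percent_used=0.1,  # use 10% of the data; tok_language keeps its default
    )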
