Merge pull request #50 from factly/feat/check_index_in_columns

paul-tharun · web-flow · commit 541cfe03a4e5 · 2024-11-13T18:02:45.000+05:30
Added a check that flags 'index' used as a column name
diff --git a/app/core/config.py b/app/core/config.py
@@ -319,6 +319,12 @@ class NoteSettings(BaseSettings):
 
 class CustomExpectationsSettings(BaseSettings):
 
+    INDEX_NOT_IN_COLUMN_NAMES_EXPECTATION_NAME: str = (
+        "Index not in Column Names"
+    )
+    INDEX_NOT_IN_COLUMN_NAMES_EXPECTATION_ERR_MSG: str = (
+        "Column names should not have 'index' as a column so please rename - {column}"
+    )
     NULL_DATETIME_VALUE_NAME: str = "Null date values Flag - {column}"
     NULL_DATETIME_VALUE_MSG: str = (
         "Null values should not be permitted for datetime values"
@@ -330,7 +336,7 @@ class CustomExpectationsSettings(BaseSettings):
         "Numeric values in specific pattern - {column}"
     )
     NUMERIC_EXPECTATION_ERR_MSG: str = (
-        "Numeric values should be in proper format both integer and float(roundoff to two decimal places)"
+        "Numeric values should be in proper format both integer and float(round-off to two decimal places)"
     )
 
     NEGATIVE_NUMERIC_VALUES_PATTERN = re.compile(r"^-\d+(\.\d{1,})?$")
@@ -344,7 +350,7 @@ class CustomExpectationsSettings(BaseSettings):
     COLUMN_NAMES_PATTERN = re.compile(r"^[a-z]+(?:_[a-z]+)*$")
     COLUMN_NAMES_EXPECTATION_NAME: str = "Column names in specific pattern"
     COLUMN_NAMES_EXPECTATION_ERR_MSG: str = (
-        "Column names should be in lower case and separated by underscore - {column}"
+        "Column names should be in lower case and separated by underscore - Example 'Sub Category' column should be written as 'sub_category' The improper columns list is: {column}"
     )
 
     TRAIL_OR_LEAD_WHITESPACE_PATTERN = re.compile(r"^\s+.*|.*\s+$")
diff --git a/app/expectations/custom_expectations.py b/app/expectations/custom_expectations.py
@@ -123,11 +123,34 @@ def expect_column_names_to_be_in_specific_pattern(
         include_meta=True,
         find_columns=False,
     ):
-        boolean_list = (
+        boolean_value = (
             pd.Series(column_list.columns)
             .apply(lambda x: True if pattern.match(str(x)) else False)
             .all()
         )
-        boolean_list = pd.Series([boolean_list] * len(column_list))
+        if not boolean_value:
+            boolean_list = pd.Series([False] + [True] * (len(column_list) - 1))
+        else:
+            boolean_list = pd.Series([boolean_value] * len(column_list))
+        return boolean_list
 
+    @MetaPandasDataset.multicolumn_map_expectation
+    def expect_index_not_in_column_values(
+        self,
+        column_list,
+        meta={
+            "expectation_name": "Column names should not have index as a column",
+        },
+        include_meta=True,
+        find_columns=False,
+    ):
+        boolean_value = (
+            pd.Series(column_list.columns)
+            .apply(lambda x: False if x == "index" else True)
+            .all()
+        )
+        if not boolean_value:
+            boolean_list = pd.Series([False] + [True] * (len(column_list) - 1))
+        else:
+            boolean_list = pd.Series([boolean_value] * len(column_list))
         return boolean_list
diff --git a/app/utils/general.py b/app/utils/general.py
@@ -337,7 +337,15 @@ async def column_names_expectation_suite(dataset, result_format):
         "cleaning_pdf_link": settings.DATA_CLEANING_GUIDE_LINK,
         "expectation_name": custom_settings.COLUMN_NAMES_EXPECTATION_NAME,
         "expectation_error_message": custom_settings.COLUMN_NAMES_EXPECTATION_ERR_MSG.format(
-            column=dataset.columns.tolist()
+            column=list(
+                set(dataset.columns.tolist())
+                - set(
+                    [
+                        i.lower().replace(" ", "_")
+                        for i in dataset.columns.tolist()
+                    ]
+                )
+            )
         ),
     }
     response = {
@@ -354,6 +362,36 @@ async def column_names_expectation_suite(dataset, result_format):
     return response
 
 
+async def index_not_in_columns_expectation_suite(dataset, result_format):
+    ge_pandas_dataset = ge.from_pandas(
+        dataset, dataset_class=GenericCustomExpectations
+    )
+    expectation = ge_pandas_dataset.expect_index_not_in_column_values(
+        column_list=dataset.columns.tolist(),
+        result_format=result_format,
+    )
+    expectation_dict = expectation.to_json_dict()
+    expectation_dict["expectation_config"]["meta"] = {
+        "cleaning_pdf_link": settings.DATA_CLEANING_GUIDE_LINK,
+        "expectation_name": custom_settings.INDEX_NOT_IN_COLUMN_NAMES_EXPECTATION_NAME,
+        "expectation_error_message": custom_settings.INDEX_NOT_IN_COLUMN_NAMES_EXPECTATION_ERR_MSG.format(
+            column=[i for i in dataset.columns.tolist() if i == "index"]
+        ),
+    }
+    response = {
+        expectation_dict["expectation_config"]["meta"][
+            "expectation_name"
+        ]: expectation_dict
+    }
+    response[custom_settings.INDEX_NOT_IN_COLUMN_NAMES_EXPECTATION_NAME][
+        "result"
+    ]["partial_unexpected_index_list"] = []
+    response[custom_settings.INDEX_NOT_IN_COLUMN_NAMES_EXPECTATION_NAME][
+        "result"
+    ]["partial_unexpected_list"] = []
+    return response
+
+
 async def general_table_expectation_suite(dataset, result_format):
     """Chaining all general expectaion suites for Datasets
 
@@ -407,6 +445,7 @@ async def general_table_expectation_suite(dataset, result_format):
         ],
         column_names_expectation_suite(dataset, result_format),
         observation_more_than_thresh_expectation_suite(dataset, result_format),
+        index_not_in_columns_expectation_suite(dataset, result_format),
     )
     expectations = ChainMap(*expectations)
     return expectations