Skip to content

Commit 541cfe0

Browse files
authored
Merge pull request #50 from factly/feat/check_index_in_columns
Added check to check index in column names
2 parents eb92bd5 + f8e9411 commit 541cfe0

File tree

3 files changed

+73
-5
lines changed

3 files changed

+73
-5
lines changed

app/core/config.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,12 @@ class NoteSettings(BaseSettings):
319319

320320
class CustomExpectationsSettings(BaseSettings):
321321

322+
INDEX_NOT_IN_COLUMN_NAMES_EXPECTATION_NAME: str = (
323+
"Index not in Column Names"
324+
)
325+
INDEX_NOT_IN_COLUMN_NAMES_EXPECTATION_ERR_MSG: str = (
326+
"Column names should not have 'index' as a column so please rename - {column}"
327+
)
322328
NULL_DATETIME_VALUE_NAME: str = "Null date values Flag - {column}"
323329
NULL_DATETIME_VALUE_MSG: str = (
324330
"Null values should not be permitted for datetime values"
@@ -330,7 +336,7 @@ class CustomExpectationsSettings(BaseSettings):
330336
"Numeric values in specific pattern - {column}"
331337
)
332338
NUMERIC_EXPECTATION_ERR_MSG: str = (
333-
"Numeric values should be in proper format both integer and float(roundoff to two decimal places)"
339+
"Numeric values should be in proper format both integer and float(round-off to two decimal places)"
334340
)
335341

336342
NEGATIVE_NUMERIC_VALUES_PATTERN = re.compile(r"^-\d+(\.\d{1,})?$")
@@ -344,7 +350,7 @@ class CustomExpectationsSettings(BaseSettings):
344350
COLUMN_NAMES_PATTERN = re.compile(r"^[a-z]+(?:_[a-z]+)*$")
345351
COLUMN_NAMES_EXPECTATION_NAME: str = "Column names in specific pattern"
346352
COLUMN_NAMES_EXPECTATION_ERR_MSG: str = (
347-
"Column names should be in lower case and separated by underscore - {column}"
353+
"Column names should be in lower case and separated by underscore - Example 'Sub Category' column should be written as 'sub_category' The improper columns list is: {column}"
348354
)
349355

350356
TRAIL_OR_LEAD_WHITESPACE_PATTERN = re.compile(r"^\s+.*|.*\s+$")

app/expectations/custom_expectations.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,11 +123,34 @@ def expect_column_names_to_be_in_specific_pattern(
123123
include_meta=True,
124124
find_columns=False,
125125
):
126-
boolean_list = (
126+
boolean_value = (
127127
pd.Series(column_list.columns)
128128
.apply(lambda x: True if pattern.match(str(x)) else False)
129129
.all()
130130
)
131-
boolean_list = pd.Series([boolean_list] * len(column_list))
131+
if not boolean_value:
132+
boolean_list = pd.Series([False] + [True] * (len(column_list) - 1))
133+
else:
134+
boolean_list = pd.Series([boolean_value] * len(column_list))
135+
return boolean_list
132136

137+
@MetaPandasDataset.multicolumn_map_expectation
def expect_index_not_in_column_values(
    self,
    column_list,
    meta={
        "expectation_name": "Column names should not have index as a column",
    },
    include_meta=True,
    find_columns=False,
):
    """Flag the dataset when any column is literally named ``"index"``.

    Args:
        column_list: Object exposing ``.columns`` and ``len()``
            (presumably the DataFrame slice handed over by the
            decorator -- TODO confirm).
        meta: Not read in this body; presumably consumed by the
            decorator or the caller -- verify.
        include_meta: Not read in this body.
        find_columns: Not read in this body.

    Returns:
        A ``pd.Series`` of booleans built from ``len(column_list)``.
        Every entry is ``True`` when no column equals ``"index"``;
        otherwise the first entry is ``False`` and the rest are ``True``.
    """
    row_count = len(column_list)
    has_index_column = any(name == "index" for name in column_list.columns)

    if has_index_column:
        # The check is about column names, not row values, so the failure
        # is recorded once (on the first position) instead of on every row.
        flags = [False] + [True] * (row_count - 1)
    else:
        flags = [True] * row_count
    return pd.Series(flags)

app/utils/general.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -337,7 +337,15 @@ async def column_names_expectation_suite(dataset, result_format):
337337
"cleaning_pdf_link": settings.DATA_CLEANING_GUIDE_LINK,
338338
"expectation_name": custom_settings.COLUMN_NAMES_EXPECTATION_NAME,
339339
"expectation_error_message": custom_settings.COLUMN_NAMES_EXPECTATION_ERR_MSG.format(
340-
column=dataset.columns.tolist()
340+
column=list(
341+
set(dataset.columns.tolist())
342+
- set(
343+
[
344+
i.lower().replace(" ", "_")
345+
for i in dataset.columns.tolist()
346+
]
347+
)
348+
)
341349
),
342350
}
343351
response = {
@@ -354,6 +362,36 @@ async def column_names_expectation_suite(dataset, result_format):
354362
return response
355363

356364

365+
async def index_not_in_columns_expectation_suite(dataset, result_format):
    """Run the "index not in column names" expectation against ``dataset``.

    Args:
        dataset: Object passed to ``ge.from_pandas`` and exposing
            ``.columns.tolist()`` (presumably a pandas DataFrame --
            TODO confirm).
        result_format: Forwarded unchanged to the expectation call.

    Returns:
        A one-entry dict keyed by the configured expectation name. The
        value is the expectation result as a JSON dict, with its ``meta``
        replaced and both ``partial_unexpected_index_list`` and
        ``partial_unexpected_list`` reset to empty lists.
    """
    suite_name = custom_settings.INDEX_NOT_IN_COLUMN_NAMES_EXPECTATION_NAME

    wrapped_dataset = ge.from_pandas(
        dataset, dataset_class=GenericCustomExpectations
    )
    validation = wrapped_dataset.expect_index_not_in_column_values(
        column_list=dataset.columns.tolist(),
        result_format=result_format,
    )
    validation_dict = validation.to_json_dict()

    # Only columns literally named "index" are reported in the message.
    offending_columns = [
        name for name in dataset.columns.tolist() if name == "index"
    ]
    error_message = (
        custom_settings.INDEX_NOT_IN_COLUMN_NAMES_EXPECTATION_ERR_MSG.format(
            column=offending_columns
        )
    )
    validation_dict["expectation_config"]["meta"] = {
        "cleaning_pdf_link": settings.DATA_CLEANING_GUIDE_LINK,
        "expectation_name": suite_name,
        "expectation_error_message": error_message,
    }

    # Blank out the partial unexpected lists in the result payload.
    result_section = validation_dict["result"]
    result_section["partial_unexpected_index_list"] = []
    result_section["partial_unexpected_list"] = []

    return {suite_name: validation_dict}
393+
394+
357395
async def general_table_expectation_suite(dataset, result_format):
358396
"""Chaining all general expectation suites for Datasets
359397
@@ -407,6 +445,7 @@ async def general_table_expectation_suite(dataset, result_format):
407445
],
408446
column_names_expectation_suite(dataset, result_format),
409447
observation_more_than_thresh_expectation_suite(dataset, result_format),
448+
index_not_in_columns_expectation_suite(dataset, result_format),
410449
)
411450
expectations = ChainMap(*expectations)
412451
return expectations

0 commit comments

Comments
 (0)