From b9a6b0204f0586ac92b8d8829cdb3e13482f309f Mon Sep 17 00:00:00 2001
From: Osman Sabahat
Date: Thu, 5 Dec 2024 19:51:41 -0500
Subject: [PATCH] sanitize numbers

Replace models.normalize_phone_number with a shared
utils.standardize_phone_number helper and apply it to the home, work
and cell columns of every row during the Volgistics people import.

The helper returns the last 10 digits of the value, or None when the
value is missing (None/NaN) or contains fewer than 10 digits. It
accepts non-string cell values (numbers, None, NaN) because both
pandas columns and spreadsheet cells can yield them.

---
 src/server/models.py              | 25 +++++++------------------
 src/server/utils.py               | 34 ++++++++++++++++++++++++++++++++++
 src/server/volgistics_importer.py |  8 ++++----
 3 files changed, 45 insertions(+), 22 deletions(-)
 create mode 100644 src/server/utils.py

diff --git a/src/server/models.py b/src/server/models.py
index 38c38101..6215d88c 100644
--- a/src/server/models.py
+++ b/src/server/models.py
@@ -20,6 +20,7 @@ from sqlalchemy.dialects.postgresql import JSONB, insert
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.sql.functions import coalesce
 
+from utils import standardize_phone_number
 
 Base = declarative_base()
 
@@ -92,21 +93,6 @@ def dedup_consecutive(table, unique_id, id, order_by, dedup_on):
     return delete(table).where(unique_id == to_delete.c[0])
 
 
-def normalize_phone_number(number):
-    result = None
-
-    if number and str(number) != "nan":
-        number = re.sub("[() -.+]", "", str(number))
-
-        if number and number[0] == "1":
-            number = number[1:]
-
-        if number.isdigit() and len(number) == 10:
-            result = number
-
-    return result
-
-
 class PdpContacts(Base):
     __tablename__ = "pdp_contacts"
     __table_args__ = (
@@ -173,8 +159,10 @@ def insert_from_file_df(cls, df, conn):
         df = df[column_translation.keys()]
         df = df.rename(columns=column_translation)
 
-        df["phone"] = df["phone"].apply(normalize_phone_number)
-        df["mobile"] = df["mobile"].apply(normalize_phone_number)
+        phone_numbers = [standardize_phone_number(phone) for phone in df["phone"]]
+        mobile_numbers = [standardize_phone_number(phone) for phone in df["mobile"]]
+        df["phone"] = phone_numbers
+        df["mobile"] = mobile_numbers
 
         dedup_on = [col for col in cls.__table__.columns if col.name in df.columns]
         df["created_date"] = datetime.datetime.utcnow()
@@ -237,7 +225,8 @@ def insert_from_df(cls, df, conn):
         df = df[column_translation.keys()]
         df = df.rename(columns=column_translation)
 
-        df["phone"] = df["phone"].apply(normalize_phone_number)
+        phone_numbers = [standardize_phone_number(phone) for phone in df["phone"]]
+        df["phone"] = phone_numbers
 
         dedup_on = [col for col in cls.__table__.columns if col.name in df.columns]
         df["created_date"] = datetime.datetime.utcnow()
diff --git a/src/server/utils.py b/src/server/utils.py
new file mode 100644
--- /dev/null
+++ b/src/server/utils.py
@@ -0,0 +1,34 @@
+import re
+
+
+def standardize_phone_number(phone):
+    """Return ``phone`` as a bare 10-digit string, or None if it is unusable.
+
+    Args:
+        phone: Raw value from an import source. May be a string, a number
+            (spreadsheet cells are often numeric), None, or NaN.
+
+    Returns:
+        str or None: The last 10 digits found in ``phone``, or None when the
+        value is missing or contains fewer than 10 digits.
+    """
+    if phone is None:
+        return None
+
+    if isinstance(phone, float):
+        # NaN never equals itself; pandas uses it for missing values.
+        if phone != phone:
+            return None
+        # 2155551234.0 would otherwise pick up a bogus trailing "0" digit.
+        if phone.is_integer():
+            phone = int(phone)
+
+    # Remove all non-numeric characters
+    digits = re.sub(r"\D", "", str(phone))
+
+    # Fewer than 10 digits cannot be a full phone number
+    if len(digits) < 10:
+        return None
+
+    # Keep the last 10 digits, dropping any leading country code
+    return digits[-10:]
diff --git a/src/server/volgistics_importer.py b/src/server/volgistics_importer.py
--- a/src/server/volgistics_importer.py
+++ b/src/server/volgistics_importer.py
@@ -1,10 +1,10 @@
-import re
 from flask.globals import current_app
 from datetime import datetime, timedelta
 from openpyxl import load_workbook
 from jellyfish import jaro_similarity
 
 from config import engine
+from utils import standardize_phone_number
 
 import structlog
 
@@ -194,9 +194,9 @@ def volgistics_people_import(workbook):
                     "state": r[col_state],
                     "zip": r[col_zip],
                     "all_phone_numbers": r[col_all_phones],
-                    "home": r[col_home],
-                    "work": r[col_work],
-                    "cell": r[col_cell],
+                    "home": standardize_phone_number(r[col_home]),
+                    "work": standardize_phone_number(r[col_work]),
+                    "cell": standardize_phone_number(r[col_cell]),
                     "email": r[col_email],
                     "created_date" : time_stamp
                 }