@@ -1,11 +1,13 @@
 from abc import abstractmethod
 from typing import List
 
+import re
 import numpy as np
 import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
 
-from fedot.core.constants import FRACTION_OF_UNIQUE_VALUES, MIN_VOCABULARY_SIZE
+from fedot.core.constants import FRACTION_OF_UNIQUE_VALUES_IN_TEXT, MIN_VOCABULARY_SIZE
+from fedot.core.log import default_log
 from fedot.core.repository.default_params_repository import DefaultOperationParamsRepository
 
 ALLOWED_NAN_PERCENT = 0.9
@@ -28,26 +30,25 @@ class TextDataDetector(DataDetector):
2830 """
2931 Class for detecting text data during its import.
3032 """
31- def define_text_columns (self , data_frame : pd .DataFrame ) -> List [str ]:
33+ def __init__ (self ):
34+ self .logger = default_log (prefix = 'FEDOT logger' )
35+ super ().__init__ ()
36+
37+ def find_text_columns (self , data_frame : pd .DataFrame ) -> List [str ]:
3238 """
3339 :param data_frame: pandas dataframe with data
3440 :return: list of text columns' names
3541 """
36- text_columns = []
37- for column_name in data_frame .columns :
38- if self ._column_contains_text (data_frame [column_name ]):
39- text_columns .append (column_name )
42+ text_columns = [column_name for column_name in data_frame .columns
43+ if self ._column_contains_text (data_frame [column_name ])]
4044 return text_columns
4145
42- def define_link_columns (self , data_frame : pd .DataFrame ) -> List [str ]:
46+ def find_link_columns (self , data_frame : pd .DataFrame ) -> List [str ]:
4347 """
4448 :param data_frame: pandas dataframe with data
4549 :return: list of link columns' names
4650 """
47- link_columns = []
48- for column_name in data_frame .columns :
49- if self .is_link (data_frame [column_name ]):
50- link_columns .append (column_name )
51+ link_columns = [column_name for column_name in data_frame .columns if self .is_link (data_frame [column_name ])]
5152 return link_columns
5253
5354 @staticmethod
@@ -58,9 +59,9 @@ def is_full_of_nans(text_data: np.array) -> bool:
 
     @staticmethod
     def is_link(text_data: np.array) -> bool:
-        if str(next(el for el in text_data if el is not None)).startswith('http'):
-            return True
-        return False
+        link_pattern = \
+            '[(http(s)?):\\/\\/(www\\.)?a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)'
+        return re.search(link_pattern, str(next(el for el in text_data if el is not None))) is not None
 
     @staticmethod
     def prepare_multimodal_data(dataframe: pd.DataFrame, columns: List[str]) -> dict:
@@ -100,14 +101,14 @@ def _column_contains_text(self, column: pd.Series) -> bool:
         if self.is_link(column):
             return False
         elif column.dtype == object and not self._is_float_compatible(column) and self._has_unique_values(column):
+            params = DefaultOperationParamsRepository().get_default_params_for_operation('tfidf')
+            tfidf_vectorizer = TfidfVectorizer(**params)
             try:
-                params = DefaultOperationParamsRepository().get_default_params_for_operation('tfidf')
-                tfidf_vectorizer = TfidfVectorizer(**params)
+                # TODO now grey zone columns (not text, not numerical) are not processed. Need to drop them
                 tfidf_vectorizer.fit(np.where(pd.isna(column), '', column))
-                if len(tfidf_vectorizer.vocabulary_) > MIN_VOCABULARY_SIZE:
-                    return True
+                return len(tfidf_vectorizer.vocabulary_) > MIN_VOCABULARY_SIZE
             except ValueError:
-                print(f'Column {column.name} possibly contains text, but it is not possible to vectorize it')
+                self.logger.warning(f"Column {column.name} possibly contains text, but it's impossible to vectorize it")
         return False
 
     @staticmethod
@@ -132,8 +133,13 @@ def _has_unique_values(column: pd.Series) -> bool:
132133 """
133134 unique_num = len (column .unique ())
134135 nan_num = pd .isna (column ).sum ()
135- return unique_num / len (column ) > FRACTION_OF_UNIQUE_VALUES if nan_num == 0 \
136- else (unique_num - 1 ) / (len (column ) - nan_num ) > FRACTION_OF_UNIQUE_VALUES
136+ # fraction of unique values in column if there is no nans
137+ frac_unique_is_bigger_than_threshold = unique_num / (len (column ) - nan_num ) > FRACTION_OF_UNIQUE_VALUES_IN_TEXT
138+ # fraction of unique values in column if there are nans
139+ frac_unique_is_bigger_than_threshold_with_nans = \
140+ (unique_num - 1 ) / (len (column ) - nan_num ) > FRACTION_OF_UNIQUE_VALUES_IN_TEXT
141+ return frac_unique_is_bigger_than_threshold if nan_num == 0 \
142+ else frac_unique_is_bigger_than_threshold_with_nans
137143
138144
139145class TimeSeriesDataDetector (DataDetector ):
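Below is a minimal standalone sketch (not part of the commit) of the behaviour this diff introduces: regex-based link detection on the first non-None value of a column, and the list-comprehension column scan. The names LINK_PATTERN, is_link, find_link_columns and the toy DataFrame are illustrative re-implementations, not FEDOT's API, under the assumption that the pattern added above compiles with Python's re module.

import re
from typing import List

import pandas as pd

# Illustrative copy of the pattern added in this diff
LINK_PATTERN = \
    '[(http(s)?):\\/\\/(www\\.)?a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)'


def is_link(text_data: pd.Series) -> bool:
    # As in the patched method, only the first non-None element is inspected
    return re.search(LINK_PATTERN, str(next(el for el in text_data if el is not None))) is not None


def find_link_columns(data_frame: pd.DataFrame) -> List[str]:
    # Mirrors the new list-comprehension style of find_link_columns
    return [name for name in data_frame.columns if is_link(data_frame[name])]


if __name__ == '__main__':
    df = pd.DataFrame({
        'url': ['https://example.com/a', 'www.example.org/b'],
        'review': ['good product', 'would buy again'],
    })
    print(find_link_columns(df))  # -> ['url']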