Skip to content

Commit 178110d

Browse files
- added ngram_range=(1,4) to search space
- minor changes
1 parent 290187e commit 178110d

File tree

5 files changed: +35 additions, −31 deletions

fedot/core/constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
MINIMAL_PIPELINE_NUMBER_FOR_EVALUATION = 100
1616

17-
FRACTION_OF_UNIQUE_VALUES = 0.6
17+
FRACTION_OF_UNIQUE_VALUES_IN_TEXT = 0.6
1818
MIN_VOCABULARY_SIZE = 20
1919

2020
default_data_split_ratio_by_task = {

fedot/core/data/data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,7 @@ def num_classes(self) -> Optional[int]:
331331
return len(unique_values) if unique_values is not None else None
332332

333333
@property
334-
def class_labels(self) -> Optional[int]:
334+
def class_labels(self) -> Optional[List[Union[int, str, float]]]:
335335
if self.task.task_type == TaskTypesEnum.classification and self.target is not None:
336336
return np.unique(self.target)
337337
else:

fedot/core/data/data_detection.py

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
from abc import abstractmethod
22
from typing import List
33

4+
import re
45
import numpy as np
56
import pandas as pd
67
from sklearn.feature_extraction.text import TfidfVectorizer
78

8-
from fedot.core.constants import FRACTION_OF_UNIQUE_VALUES, MIN_VOCABULARY_SIZE
9+
from fedot.core.constants import FRACTION_OF_UNIQUE_VALUES_IN_TEXT, MIN_VOCABULARY_SIZE
10+
from fedot.core.log import default_log
911
from fedot.core.repository.default_params_repository import DefaultOperationParamsRepository
1012

1113
ALLOWED_NAN_PERCENT = 0.9
@@ -28,26 +30,25 @@ class TextDataDetector(DataDetector):
2830
"""
2931
Class for detecting text data during its import.
3032
"""
31-
def define_text_columns(self, data_frame: pd.DataFrame) -> List[str]:
33+
def __init__(self):
34+
self.logger = default_log(prefix='FEDOT logger')
35+
super().__init__()
36+
37+
def find_text_columns(self, data_frame: pd.DataFrame) -> List[str]:
3238
"""
3339
:param data_frame: pandas dataframe with data
3440
:return: list of text columns' names
3541
"""
36-
text_columns = []
37-
for column_name in data_frame.columns:
38-
if self._column_contains_text(data_frame[column_name]):
39-
text_columns.append(column_name)
42+
text_columns = [column_name for column_name in data_frame.columns
43+
if self._column_contains_text(data_frame[column_name])]
4044
return text_columns
4145

42-
def define_link_columns(self, data_frame: pd.DataFrame) -> List[str]:
46+
def find_link_columns(self, data_frame: pd.DataFrame) -> List[str]:
4347
"""
4448
:param data_frame: pandas dataframe with data
4549
:return: list of link columns' names
4650
"""
47-
link_columns = []
48-
for column_name in data_frame.columns:
49-
if self.is_link(data_frame[column_name]):
50-
link_columns.append(column_name)
51+
link_columns = [column_name for column_name in data_frame.columns if self.is_link(data_frame[column_name])]
5152
return link_columns
5253

5354
@staticmethod
@@ -58,9 +59,9 @@ def is_full_of_nans(text_data: np.array) -> bool:
5859

5960
@staticmethod
6061
def is_link(text_data: np.array) -> bool:
61-
if str(next(el for el in text_data if el is not None)).startswith('http'):
62-
return True
63-
return False
62+
link_pattern = \
63+
'[(http(s)?):\\/\\/(www\\.)?a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)'
64+
return re.search(link_pattern, str(next(el for el in text_data if el is not None))) is not None
6465

6566
@staticmethod
6667
def prepare_multimodal_data(dataframe: pd.DataFrame, columns: List[str]) -> dict:
@@ -100,14 +101,14 @@ def _column_contains_text(self, column: pd.Series) -> bool:
100101
if self.is_link(column):
101102
return False
102103
elif column.dtype == object and not self._is_float_compatible(column) and self._has_unique_values(column):
104+
params = DefaultOperationParamsRepository().get_default_params_for_operation('tfidf')
105+
tfidf_vectorizer = TfidfVectorizer(**params)
103106
try:
104-
params = DefaultOperationParamsRepository().get_default_params_for_operation('tfidf')
105-
tfidf_vectorizer = TfidfVectorizer(**params)
107+
# TODO now grey zone columns (not text, not numerical) are not processed. Need to drop them
106108
tfidf_vectorizer.fit(np.where(pd.isna(column), '', column))
107-
if len(tfidf_vectorizer.vocabulary_) > MIN_VOCABULARY_SIZE:
108-
return True
109+
return len(tfidf_vectorizer.vocabulary_) > MIN_VOCABULARY_SIZE
109110
except ValueError:
110-
print(f'Column {column.name} possibly contains text, but it is not possible to vectorize it')
111+
self.logger.warning(f"Column {column.name} possibly contains text, but it's impossible to vectorize it")
111112
return False
112113

113114
@staticmethod
@@ -132,8 +133,13 @@ def _has_unique_values(column: pd.Series) -> bool:
132133
"""
133134
unique_num = len(column.unique())
134135
nan_num = pd.isna(column).sum()
135-
return unique_num / len(column) > FRACTION_OF_UNIQUE_VALUES if nan_num == 0 \
136-
else (unique_num - 1) / (len(column) - nan_num) > FRACTION_OF_UNIQUE_VALUES
136+
# fraction of unique values in column if there is no nans
137+
frac_unique_is_bigger_than_threshold = unique_num / (len(column) - nan_num) > FRACTION_OF_UNIQUE_VALUES_IN_TEXT
138+
# fraction of unique values in column if there are nans
139+
frac_unique_is_bigger_than_threshold_with_nans = \
140+
(unique_num - 1) / (len(column) - nan_num) > FRACTION_OF_UNIQUE_VALUES_IN_TEXT
141+
return frac_unique_is_bigger_than_threshold if nan_num == 0 \
142+
else frac_unique_is_bigger_than_threshold_with_nans
137143

138144

139145
class TimeSeriesDataDetector(DataDetector):

fedot/core/data/multi_modal.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,13 +58,11 @@ def data_type(self):
5858

5959
@property
6060
def num_classes(self) -> Optional[int]:
61-
if self.task.task_type == TaskTypesEnum.classification:
62-
return len(np.unique(self.target))
63-
else:
64-
return None
61+
unique_values = self.class_labels
62+
return len(unique_values) if unique_values is not None else None
6563

6664
@property
67-
def class_labels(self) -> Optional[int]:
65+
def class_labels(self) -> Optional[List[Union[int, str, float]]]:
6866
if self.task.task_type == TaskTypesEnum.classification and self.target is not None:
6967
return np.unique(self.target)
7068
else:
@@ -166,9 +164,9 @@ def from_csv(cls,
166164
text_columns = [text_columns] if isinstance(text_columns, str) else text_columns
167165

168166
if not text_columns:
169-
text_columns = text_data_detector.define_text_columns(data_frame)
167+
text_columns = text_data_detector.find_text_columns(data_frame)
170168

171-
link_columns = text_data_detector.define_link_columns(data_frame)
169+
link_columns = text_data_detector.find_link_columns(data_frame)
172170
columns_to_drop = text_columns + link_columns
173171
data_text = text_data_detector.prepare_multimodal_data(data_frame, text_columns)
174172
data_frame_table = data_frame.drop(columns=columns_to_drop)

fedot/core/pipelines/tuning/search_space.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ def get_parameters_dict(self):
285285
'glove-wiki-gigaword-100', 'word2vec-ruscorpora-300']])
286286
},
287287
'tfidf': {
288-
'ngram_range': (hp.choice, [[(1, 1), (1, 2), (1, 3)]]),
288+
'ngram_range': (hp.choice, [[(1, 1), (1, 2), (1, 3), (1, 4)]]),
289289
'min_df': (hp.uniform, [0.0001, 0.01]),
290290
'max_df': (hp.uniform, [0.9, 0.99])
291291
},

Comments (0)