Skip to content

Commit 87a289a

Browse files
- added autodetection of columns with links
- tests adapted for new functionality
1 parent 4671183 commit 87a289a

File tree

5 files changed

+35
-10
lines changed

5 files changed

+35
-10
lines changed

fedot/core/constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
MINIMAL_PIPELINE_NUMBER_FOR_EVALUATION = 100
1616

17-
FRACTION_OF_UNIQUE_VALUES = 0.7
17+
FRACTION_OF_UNIQUE_VALUES = 0.6
1818
MIN_VOCABULARY_SIZE = 20
1919

2020
default_data_split_ratio_by_task = {

fedot/core/data/data_detection.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,29 @@ def define_text_columns(self, data_frame: pd.DataFrame) -> List[str]:
3939
text_columns.append(column_name)
4040
return text_columns
4141

42+
def define_link_columns(self, data_frame: pd.DataFrame) -> List[str]:
    """
    Collect the names of the columns that ``self.is_link`` classifies as links.

    :param data_frame: pandas dataframe with data
    :return: list of link columns' names, in the dataframe's column order
    """
    return [name for name in data_frame.columns if self.is_link(data_frame[name])]
52+
4253
@staticmethod
def is_full_of_nans(text_data: np.array) -> bool:
    """
    Tell whether the share of missing values exceeds ALLOWED_NAN_PERCENT.

    :param text_data: array with column values
    :return: True if the fraction of values reported missing by ``pd.isna``
        is strictly greater than ALLOWED_NAN_PERCENT, False otherwise
    """
    nan_fraction = np.sum(pd.isna(text_data)) / len(text_data)
    return bool(nan_fraction > ALLOWED_NAN_PERCENT)
4758

59+
@staticmethod
def is_link(text_data: np.array) -> bool:
    """
    Check whether a column looks like a column of links.

    Only the first non-missing value is inspected: the column is treated as
    a link column when the string form of that value starts with 'http'.

    :param text_data: iterable with column values
    :return: True if the first non-missing value starts with 'http';
        False otherwise, including when the column is empty or holds
        only missing values
    """
    # Skip both None and float NaN: missing cells read by pandas arrive as
    # NaN, which an `is not None` check alone would not filter out.
    # The default of None keeps next() from raising StopIteration when
    # there is no usable value at all.
    first_value = next(
        (el for el in text_data
         if el is not None and not (isinstance(el, float) and el != el)),
        None,
    )
    if first_value is None:
        return False
    return str(first_value).startswith('http')
64+
4865
@staticmethod
4966
def prepare_multimodal_data(dataframe: pd.DataFrame, columns: List[str]) -> dict:
5067
""" Prepares MultiModal text data in a form of dictionary
@@ -71,12 +88,18 @@ def _column_contains_text(self, column: pd.Series) -> bool:
7188
Column contains text if:
7289
1. it's not float or float compatible
7390
(e.g. ['1.2', '2.3', '3.4', ...] is float too)
74-
2. fraction of unique values (except nans) is more than 0.95
91+
2. fraction of unique values (except nans) is more than 0.6
92+
3. size of tfidf vocabulary is more than 20
93+
94+
If size of tfidf vocabulary is less than 20, then it is probably
95+
text column too, but it cannot be vectorized and used in model
7596
7697
:param column: pandas series with data
77-
:return: True if column contains text
98+
:return: True if the column contains text; False otherwise, including when the column contains links
7899
"""
79-
if column.dtype == object and not self._is_float_compatible(column) and self._has_unique_values(column):
100+
if self.is_link(column):
101+
return False
102+
elif column.dtype == object and not self._is_float_compatible(column) and self._has_unique_values(column):
80103
try:
81104
params = DefaultOperationParamsRepository().get_default_params_for_operation('tfidf')
82105
tfidf_vectorizer = TfidfVectorizer(**params)

fedot/core/data/multi_modal.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,14 +168,16 @@ def from_csv(cls,
168168
if not text_columns:
169169
text_columns = text_data_detector.define_text_columns(data_frame)
170170

171+
link_columns = text_data_detector.define_link_columns(data_frame)
172+
columns_to_drop = text_columns + link_columns
171173
data_text = text_data_detector.prepare_multimodal_data(data_frame, text_columns)
172-
data_frame_table = data_frame.drop(columns=text_columns)
174+
data_frame_table = data_frame.drop(columns=columns_to_drop)
173175
table_features, target = process_target_and_features(data_frame_table, target_columns)
174176

175177
data_part_transformation_func = partial(array_to_input_data,
176178
idx=idx, target_array=target, task=task)
177179

178-
# create labels for text data sources and remove source if there are many nans
180+
# create labels for text data sources and remove a source if it has many nans or its text is a link
179181
sources = dict((text_data_detector.new_key_name(data_part_key),
180182
data_part_transformation_func(features_array=data_part, data_type=DataTypesEnum.text))
181183
for (data_part_key, data_part) in data_text.items()

test/data/multimodal_data_with_complicated_types.csv

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
3,3,,4,4,3,"make a type specimen book. It has survived not only five centuries, but also",4, a ,True,,,0,2,yes
66
4,4,,5,5,0,"the leap into electronic typesetting, remaining essentially unchanged. It was",5, b ,,,,0,3,no
77
5,5,,6,6,0,popularised in the 1960s with the release of Letraset sheets containing Lorem ,6, c ,False,,,0,4,no
8-
6,6,inf,7,7,0,"Ipsum passages, and more recently with desktop publishing software like Aldus ",7, a ,True,sample text,sample text,1,5,no
8+
6,6,inf,7,7,0,"Ipsum passages, and more recently with desktop publishing software like Aldus ",7, a ,True,"Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum. Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of ""de Finibus Bonorum et Malorum"" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, ""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.","Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. 
It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.",1,5,no
99
7,7,inf,8,8,1,PageMaker including versions of Lorem Ipsum.,1, b ,,,4,0,6,no
1010
8,inf,inf,9,9,2,"Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots ",2,,True,,,1,7,no
1111
9,9,inf,10,10,2,"in a piece of classical Latin literature from 45 BC, making it over 2000 years ",3, c ,False,,,0,8,yes
@@ -16,4 +16,4 @@
1616
14,14,,3,3,2,"1.10.32 and 1.10.33 of ""de Finibus Bonorum et Malorum"" (The Extremes of Good ",1,a,False,,,,error,no
1717
15,15,,4,4,1,"and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of",2,a ,False,,,,13,no
1818
16,16,2,5,12,0,"ethics, very popular during the Renaissance. The first line of Lorem Ipsum,",3, d ,True,,,1,16,yes
19-
17,17,3,6,13,0,"""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.",4, d ,False,,another sample text,0,17,no
19+
17,17,3,6,13,0,"""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.",4, d ,False,,"Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of ""de Finibus Bonorum et Malorum"" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, ""Lorem ipsum dolor sit amet.."", comes from a line in section 1.10.32.",0,17,no

test/unit/data/test_multimodal_data.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def test_multi_modal_data():
4343
multi_modal.target = new_target
4444
assert np.array_equal(multi_modal.target, new_target)
4545

46-
46+
# TODO make test of text columns autodetection
4747
def test_multimodal_data_from_csv():
4848
"""
4949
Checking correctness of MultiModalData import from csv file.
@@ -54,7 +54,7 @@ def test_multimodal_data_from_csv():
5454
text_data = np.array(df['description'])
5555
table_data = np.array(df.drop(columns=['id', 'description', 'variety']))
5656
target = np.array(df['variety']).reshape(-1, 1)
57-
actual_data = MultiModalData.from_csv(path)
57+
actual_data = MultiModalData.from_csv(path, text_columns=['description'])
5858
actual_text_features = actual_data['data_source_text/description'].features
5959
actual_table_features = actual_data['data_source_table'].features
6060
actual_target = actual_data.target

0 commit comments

Comments
 (0)