@@ -1,11 +1,13 @@
 from abc import abstractmethod
 from typing import List
 
+import re
 import numpy as np
 import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
 
-from fedot.core.constants import FRACTION_OF_UNIQUE_VALUES, MIN_VOCABULARY_SIZE
+from fedot.core.constants import FRACTION_OF_UNIQUE_VALUES_IN_TEXT, MIN_VOCABULARY_SIZE
+from fedot.core.log import default_log
 from fedot.core.repository.default_params_repository import DefaultOperationParamsRepository
 
 ALLOWED_NAN_PERCENT = 0.9
@@ -28,26 +30,25 @@ class TextDataDetector(DataDetector):
2830 """
2931 Class for detecting text data during its import.
3032 """
31- def define_text_columns (self , data_frame : pd .DataFrame ) -> List [str ]:
33+ def __init__ (self ):
34+ self .logger = default_log (prefix = 'FEDOT logger' )
35+ super ().__init__ ()
36+
37+ def find_text_columns (self , data_frame : pd .DataFrame ) -> List [str ]:
3238 """
3339 :param data_frame: pandas dataframe with data
3440 :return: list of text columns' names
3541 """
36- text_columns = []
37- for column_name in data_frame .columns :
38- if self ._column_contains_text (data_frame [column_name ]):
39- text_columns .append (column_name )
42+ text_columns = [column_name for column_name in data_frame .columns
43+ if self ._column_contains_text (data_frame [column_name ])]
4044 return text_columns
4145
42- def define_link_columns (self , data_frame : pd .DataFrame ) -> List [str ]:
46+ def find_link_columns (self , data_frame : pd .DataFrame ) -> List [str ]:
4347 """
4448 :param data_frame: pandas dataframe with data
4549 :return: list of link columns' names
4650 """
47- link_columns = []
48- for column_name in data_frame .columns :
49- if self .is_link (data_frame [column_name ]):
50- link_columns .append (column_name )
51+ link_columns = [column_name for column_name in data_frame .columns if self .is_link (data_frame [column_name ])]
5152 return link_columns
5253
5354 @staticmethod
@@ -58,9 +59,9 @@ def is_full_of_nans(text_data: np.array) -> bool:
 
     @staticmethod
     def is_link(text_data: np.array) -> bool:
-        if str(next(el for el in text_data if el is not None)).startswith('http'):
-            return True
-        return False
+        link_pattern = \
+            '[(http(s)?):\\/\\/(www\\.)?a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)'
+        return re.search(link_pattern, str(next(el for el in text_data if el is not None))) is not None
 
     @staticmethod
     def prepare_multimodal_data(dataframe: pd.DataFrame, columns: List[str]) -> dict:
@@ -100,14 +101,14 @@ def _column_contains_text(self, column: pd.Series) -> bool:
         if self.is_link(column):
             return False
         elif column.dtype == object and not self._is_float_compatible(column) and self._has_unique_values(column):
+            params = DefaultOperationParamsRepository().get_default_params_for_operation('tfidf')
+            tfidf_vectorizer = TfidfVectorizer(**params)
             try:
-                params = DefaultOperationParamsRepository().get_default_params_for_operation('tfidf')
-                tfidf_vectorizer = TfidfVectorizer(**params)
+                # TODO now grey zone columns (not text, not numerical) are not processed. Need to drop them
                 tfidf_vectorizer.fit(np.where(pd.isna(column), '', column))
-                if len(tfidf_vectorizer.vocabulary_) > MIN_VOCABULARY_SIZE:
-                    return True
+                return len(tfidf_vectorizer.vocabulary_) > MIN_VOCABULARY_SIZE
             except ValueError:
-                print(f'Column {column.name} possibly contains text, but it is not possible to vectorize it')
+                self.logger.warning(f"Column {column.name} possibly contains text, but it's impossible to vectorize it")
         return False
 
     @staticmethod
@@ -132,8 +133,13 @@ def _has_unique_values(column: pd.Series) -> bool:
132133 """
133134 unique_num = len (column .unique ())
134135 nan_num = pd .isna (column ).sum ()
135- return unique_num / len (column ) > FRACTION_OF_UNIQUE_VALUES if nan_num == 0 \
136- else (unique_num - 1 ) / (len (column ) - nan_num ) > FRACTION_OF_UNIQUE_VALUES
136+ # fraction of unique values in column if there is no nans
137+ frac_unique_is_bigger_than_threshold = unique_num / (len (column ) - nan_num ) > FRACTION_OF_UNIQUE_VALUES_IN_TEXT
138+ # fraction of unique values in column if there are nans
139+ frac_unique_is_bigger_than_threshold_with_nans = \
140+ (unique_num - 1 ) / (len (column ) - nan_num ) > FRACTION_OF_UNIQUE_VALUES_IN_TEXT
141+ return frac_unique_is_bigger_than_threshold if nan_num == 0 \
142+ else frac_unique_is_bigger_than_threshold_with_nans
137143
138144
139145class TimeSeriesDataDetector (DataDetector ):
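Below is a minimal standalone sketch (not part of the commit) of the behaviour this diff introduces: regex-based link detection on the first non-None value of a column, and the list-comprehension column scan. The names LINK_PATTERN, is_link, find_link_columns and the toy DataFrame are illustrative re-implementations, not FEDOT's API, under the assumption that the pattern added above compiles with Python's re module.

import re
from typing import List

import pandas as pd

# Illustrative copy of the pattern added in this diff
LINK_PATTERN = \
    '[(http(s)?):\\/\\/(www\\.)?a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*)'


def is_link(text_data: pd.Series) -> bool:
    # As in the patched method, only the first non-None element is inspected
    return re.search(LINK_PATTERN, str(next(el for el in text_data if el is not None))) is not None


def find_link_columns(data_frame: pd.DataFrame) -> List[str]:
    # Mirrors the new list-comprehension style of find_link_columns
    return [name for name in data_frame.columns if is_link(data_frame[name])]


if __name__ == '__main__':
    df = pd.DataFrame({
        'url': ['https://example.com/a', 'www.example.org/b'],
        'review': ['good product', 'would buy again'],
    })
    print(find_link_columns(df))  # -> ['url']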