-
Notifications
You must be signed in to change notification settings - Fork 184
MAINT: Update TSNE for sklearn1.8 #2793
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
db38d4c
9c01a8c
b3f852f
55a8529
d960a14
8ebab02
42d8d50
13ce364
2cbeeb9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -122,36 +122,89 @@ def _daal_tsne(self, P, n_samples, X_embedded): | |
|
|
||
| return X_embedded | ||
|
|
||
| # Comment 2025-11-24: This appears to be a copy-paste from an earlier version of the original | ||
| # scikit-learn with some modifications to make calls to oneDAL under a narrow subset of | ||
| # allowed input parameters, copy-pasting the rest of the sklearn code when oneDAL is not | ||
| # called. Note that the conditions checked here are out of sync with the latest sklearn by now. | ||
| # An early 'is supported' check that offloads to stock sklearn was added later on, which results | ||
| # in having a lot of dead code paths in this function that can be safely removed. | ||
| # Note: this method is called from inside 'fit' from the base class in stock scikit-learn. | ||
| # Hence, the offloading logic is different than in other classes, as falling back to 'fit' | ||
| # from the base class would lead to a circular loop. | ||
| def _fit(self, X, skip_num_points=0): | ||
| """Private function to fit the model using X as training data.""" | ||
| if isinstance(self.init, str) and self.init == "warn": | ||
| warnings.warn( | ||
| "The default initialization in TSNE will change " | ||
| "from 'random' to 'pca' in 1.2.", | ||
| FutureWarning, | ||
| ) | ||
| self._init = "random" | ||
|
|
||
| _patching_status = PatchingConditionsChain("sklearn.manifold.TSNE._tsne") | ||
| _patching_status.and_conditions( | ||
| [ | ||
| ( | ||
| self.method == "barnes_hut", | ||
| 'Used t-SNE method is not "barnes_hut" which is the only supported.', | ||
| ), | ||
| (self.n_components == 2, "Number of components != 2."), | ||
| (self.verbose == 0, "Verbose mode is set."), | ||
| ( | ||
| daal_check_version((2021, "P", 600)), | ||
| "oneDAL version is lower than 2021.6.", | ||
| ), | ||
| ( | ||
| not ( | ||
| isinstance(self.init, str) and self.init == "pca" and issparse(X) | ||
| ), | ||
| "PCA initialization is not supported with sparse input matrices.", | ||
| ), | ||
| # Note: these conditions below should result in errors, but stock scikit-learn | ||
| # does not check for errors at this exact point. Hence, this offloads the erroring | ||
| # out to the base class, wherever in the process they might be encountered. | ||
| ( | ||
| np.isscalar(self.angle) and self.angle > 0.0 and self.angle < 1.0, | ||
| "'angle' must be between 0.0 - 1.0", | ||
| ), | ||
| (self.early_exaggeration >= 1.0, "early_exaggeration must be at least 1"), | ||
| ( | ||
| ( | ||
| isinstance(self.init, str) | ||
| and self.init | ||
| in ["random", "pca"] | ||
| + ( | ||
| ["warn"] | ||
| if sklearn_check_version("1.0") | ||
| and not sklearn_check_version("1.2") | ||
| else [] | ||
| ) | ||
| ) | ||
| or isinstance(self.init, np.ndarray), | ||
| "'init' must be 'random', 'pca', or a numpy array.", | ||
| ), | ||
| ] | ||
| ) | ||
| _dal_ready = _patching_status.get_status(logs=True) | ||
| if not _dal_ready: | ||
| return super()._fit(X, skip_num_points) | ||
|
|
||
| if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): | ||
| if isinstance(self.init, str) and self.init == "warn": | ||
| warnings.warn( | ||
| "The default initialization in TSNE will change " | ||
| "from 'random' to 'pca' in 1.2.", | ||
| FutureWarning, | ||
| ) | ||
| self._init = "random" | ||
| else: | ||
| self._init = self.init | ||
| else: | ||
| self._init = self.init | ||
|
|
||
| if isinstance(self._init, str) and self._init == "pca" and issparse(X): | ||
| raise TypeError( | ||
| "PCA initialization is currently not supported " | ||
| "with the sparse input matrix. Use " | ||
| 'init="random" instead.' | ||
| ) | ||
|
|
||
| if self.method not in ["barnes_hut", "exact"]: | ||
| raise ValueError("'method' must be 'barnes_hut' or 'exact'") | ||
| if self.angle < 0.0 or self.angle > 1.0: | ||
| raise ValueError("'angle' must be between 0.0 - 1.0") | ||
| if self.learning_rate == "warn": | ||
| warnings.warn( | ||
| "The default learning rate in TSNE will change " | ||
| "from 200.0 to 'auto' in 1.2.", | ||
| FutureWarning, | ||
| ) | ||
| self._learning_rate = 200.0 | ||
| if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): | ||
| if self.learning_rate == "warn": | ||
| warnings.warn( | ||
| "The default learning rate in TSNE will change " | ||
| "from 200.0 to 'auto' in 1.2.", | ||
| FutureWarning, | ||
| ) | ||
| self._learning_rate = 200.0 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like more versioning is needed here, as it fails on older versions of scikit:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed. |
||
| else: | ||
| self._learning_rate = self.learning_rate | ||
| else: | ||
| self._learning_rate = self.learning_rate | ||
| if self._learning_rate == "auto": | ||
|
|
@@ -227,28 +280,15 @@ def _fit(self, X, skip_num_points=0): | |
| "or provide the dense distance matrix." | ||
| ) | ||
|
|
||
| if self.method == "barnes_hut" and self.n_components > 3: | ||
| raise ValueError( | ||
| "'n_components' should be inferior to 4 for the " | ||
| "barnes_hut algorithm as it relies on " | ||
| "quad-tree or oct-tree." | ||
| ) | ||
| random_state = check_random_state(self.random_state) | ||
|
|
||
| if self.early_exaggeration < 1.0: | ||
| raise ValueError( | ||
| "early_exaggeration must be at least 1, but is {}".format( | ||
| self.early_exaggeration | ||
| ) | ||
| ) | ||
|
|
||
| if not sklearn_check_version("1.2"): | ||
| if self.n_iter < 250: | ||
| raise ValueError("n_iter should be at least 250") | ||
|
|
||
| n_samples = X.shape[0] | ||
|
|
||
| neighbors_nn = None | ||
| # neighbors_nn = None # <- unused variable in stock sklearn, commented out due to coverity | ||
| if self.method == "exact": | ||
| # Retrieve the distance matrix, either using the precomputed one or | ||
| # computing it. | ||
|
|
@@ -278,9 +318,8 @@ def _fit(self, X, skip_num_points=0): | |
| "All distances should be positive, the " "metric given is not correct" | ||
| ) | ||
|
|
||
| if ( | ||
| self.metric != "euclidean" | ||
| and getattr(self, "square_distances", True) is True | ||
| if self.metric != "euclidean" and ( | ||
| sklearn_check_version("1.2") or self.square_distances is True | ||
| ): | ||
| distances **= 2 | ||
|
|
||
|
|
@@ -339,15 +378,14 @@ def _fit(self, X, skip_num_points=0): | |
| # Free the memory used by the ball_tree | ||
| del knn | ||
|
|
||
| if ( | ||
| getattr(self, "square_distances", True) is True | ||
| or self.metric == "euclidean" | ||
| # knn return the euclidean distance but we need it squared | ||
| # to be consistent with the 'exact' method. Note that the | ||
| # method was derived using the euclidean method as in the | ||
| # input space. Not sure of the implication of using a different | ||
| # metric. | ||
| if sklearn_check_version("1.2") or ( | ||
| self.square_distances is True or self.metric == "euclidean" | ||
| ): | ||
| # knn return the euclidean distance but we need it squared | ||
| # to be consistent with the 'exact' method. Note that the | ||
| # method was derived using the euclidean method as in the | ||
| # input space. Not sure of the implication of using a different | ||
| # metric. | ||
| distances_nn.data **= 2 | ||
|
|
||
| # compute the joint probability distribution for the input space | ||
|
|
@@ -358,16 +396,23 @@ def _fit(self, X, skip_num_points=0): | |
| elif self._init == "pca": | ||
| pca = PCA( | ||
| n_components=self.n_components, | ||
| svd_solver="randomized", | ||
| random_state=random_state, | ||
| ) | ||
| if sklearn_check_version("1.2"): | ||
| # Always output a numpy array, no matter what is configured globally | ||
| pca.set_output(transform="default") | ||
| X_embedded = pca.fit_transform(X).astype(np.float32, copy=False) | ||
| warnings.warn( | ||
| "The PCA initialization in TSNE will change to " | ||
| "have the standard deviation of PC1 equal to 1e-4 " | ||
| "in 1.2. This will ensure better convergence.", | ||
| FutureWarning, | ||
| ) | ||
| if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): | ||
| warnings.warn( | ||
| "The PCA initialization in TSNE will change to " | ||
| "have the standard deviation of PC1 equal to 1e-4 " | ||
| "in 1.2. This will ensure better convergence.", | ||
| FutureWarning, | ||
| ) | ||
| if sklearn_check_version("1.2"): | ||
| # PCA is rescaled so that PC1 has standard deviation 1e-4 which is | ||
| # the default value for random initialization. See issue #18018. | ||
| X_embedded = X_embedded / np.std(X_embedded[:, 0]) * 1e-4 | ||
| elif self._init == "random": | ||
| # The embedding is initialized with iid samples from Gaussians with | ||
| # standard deviation 1e-4. | ||
|
|
@@ -377,40 +422,11 @@ def _fit(self, X, skip_num_points=0): | |
| else: | ||
| raise ValueError("'init' must be 'pca', 'random', or " "a numpy array") | ||
|
|
||
| # Degrees of freedom of the Student's t-distribution. The suggestion | ||
| # degrees_of_freedom = n_components - 1 comes from | ||
| # "Learning a Parametric Embedding by Preserving Local Structure" | ||
| # Laurens van der Maaten, 2009. | ||
| degrees_of_freedom = max(self.n_components - 1, 1) | ||
|
|
||
| _patching_status = PatchingConditionsChain("sklearn.manifold.TSNE._tsne") | ||
| _patching_status.and_conditions( | ||
| [ | ||
| ( | ||
| self.method == "barnes_hut", | ||
| 'Used t-SNE method is not "barnes_hut" which is the only supported.', | ||
| ), | ||
| (self.n_components == 2, "Number of components != 2."), | ||
| (self.verbose == 0, "Verbose mode is set."), | ||
| ( | ||
| daal_check_version((2021, "P", 600)), | ||
| "oneDAL version is lower than 2021.6.", | ||
| ), | ||
| ] | ||
| ) | ||
| _dal_ready = _patching_status.get_status(logs=True) | ||
| # Note: by this point, stock sklearn would calculate degrees of freedom, but oneDAL | ||
| # doesn't use them. | ||
|
|
||
| if _dal_ready: | ||
| X_embedded = check_array(X_embedded, dtype=[np.float32, np.float64]) | ||
| return self._daal_tsne(P, n_samples, X_embedded=X_embedded) | ||
| return self._tsne( | ||
| P, | ||
| degrees_of_freedom, | ||
| n_samples, | ||
| X_embedded=X_embedded, | ||
| neighbors=neighbors_nn, | ||
| skip_num_points=skip_num_points, | ||
| ) | ||
| X_embedded = check_array(X_embedded, dtype=[np.float32, np.float64]) | ||
| return self._daal_tsne(P, n_samples, X_embedded=X_embedded) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it correct that in case method == exact we would still call this function?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This code would not be reached with method 'exact'. |
||
|
|
||
| fit.__doc__ = BaseTSNE.fit.__doc__ | ||
| fit_transform.__doc__ = BaseTSNE.fit_transform.__doc__ | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,9 +34,9 @@ The overall acceleration of TSNE depends on the acceleration of each of these al | |
| - ``metric`` != `'euclidean'` or `'minkowski'` with ``p`` != `2` | ||
| - The Gradient Descent part of the algorithm supports all parameters except: | ||
|
|
||
| - ``n_components`` = `3` | ||
| - ``method`` = `'exact'` | ||
| - ``verbose`` != `0` | ||
| - ``n_components`` > ``2`` | ||
| - ``method`` = ``'exact'`` | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But in case of exact method we don't fallback to sklearn, should we add it as supported?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It falls back on the first condition in the patching chain. |
||
| - ``verbose`` != ``0`` | ||
|
|
||
| To get better performance, use parameters supported by both components. | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,8 @@ | |
| from numpy.testing import assert_allclose | ||
| from sklearn.metrics.pairwise import pairwise_distances | ||
|
|
||
| from daal4py.sklearn._utils import sklearn_check_version | ||
|
|
||
| # Note: n_components must be 2 for now | ||
| from onedal.tests.utils._dataframes_support import ( | ||
| _as_numpy, | ||
|
|
@@ -161,8 +163,12 @@ def test_tsne_functionality_and_edge_cases( | |
| assert np.any(embedding != 0) | ||
|
|
||
|
|
||
| # Note: since sklearn1.2, the PCA initialization divides by standard deviations of components. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need to add another test case in the future instead of removed one?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, what other case would you add? |
||
| # Since those will be zeros for constant data, it will end up producing NaNs, hence it's not tested. | ||
| @pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) | ||
| @pytest.mark.parametrize("init", ["pca", "random"]) | ||
| @pytest.mark.parametrize( | ||
| "init", ["random"] + (["pca"] if not sklearn_check_version("1.2") else []) | ||
| ) | ||
| @pytest.mark.parametrize("dtype", [np.float32, np.float64]) | ||
| def test_tsne_constant_data(init, dataframe, queue, dtype): | ||
| from sklearnex.manifold import TSNE | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think there's a mistake in error message
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
'warn' is no longer allowed in newer sklearn versions, so it's not referenced here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
but init can't be exact? Also it can't be numpy array according to this condition
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, there's no init 'exact'; and there's an 'or' condition where it allows numpy arrays.