Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 106 additions & 90 deletions daal4py/sklearn/manifold/_t_sne.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,36 +122,89 @@ def _daal_tsne(self, P, n_samples, X_embedded):

return X_embedded

# Comment 2025-11-24: This appears to be a copy-paste from an earlier version of the original
# scikit-learn with some modifications to make calls to oneDAL under a narrow subset of
# allowed input parameters, copy-pasting the rest of the sklearn code when oneDAL is not
# called. Note that the conditions checked here are out of sync with the latest sklearn by now.
# An early 'is supported' check that offloads to stock sklearn was added later on, which results
# in having a lot of dead code paths in this function that can be safely removed.
# Note: this method is called from inside 'fit' from the base class in stock scikit-learn.
# Hence, the offloading logic is different than in other classes, as falling back to 'fit'
# from the base class would lead to a circular loop.
def _fit(self, X, skip_num_points=0):
"""Private function to fit the model using X as training data."""
if isinstance(self.init, str) and self.init == "warn":
warnings.warn(
"The default initialization in TSNE will change "
"from 'random' to 'pca' in 1.2.",
FutureWarning,
)
self._init = "random"

_patching_status = PatchingConditionsChain("sklearn.manifold.TSNE._tsne")
_patching_status.and_conditions(
[
(
self.method == "barnes_hut",
'Used t-SNE method is not "barnes_hut" which is the only supported.',
),
(self.n_components == 2, "Number of components != 2."),
(self.verbose == 0, "Verbose mode is set."),
(
daal_check_version((2021, "P", 600)),
"oneDAL version is lower than 2021.6.",
),
(
not (
isinstance(self.init, str) and self.init == "pca" and issparse(X)
),
"PCA initialization is not supported with sparse input matrices.",
),
# Note: these conditions below should result in errors, but stock scikit-learn
# does not check for errors at this exact point. Hence, this offloads the erroring
# out to the base class, wherever in the process they might be encountered.
(
np.isscalar(self.angle) and self.angle > 0.0 and self.angle < 1.0,
"'angle' must be between 0.0 - 1.0",
),
(self.early_exaggeration >= 1.0, "early_exaggeration must be at least 1"),
(
(
isinstance(self.init, str)
and self.init
in ["random", "pca"]
+ (
["warn"]
if sklearn_check_version("1.0")
and not sklearn_check_version("1.2")
else []
)
)
or isinstance(self.init, np.ndarray),
"'init' must be 'exact', 'pca', or a numpy array.",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think there's a mistake in the error message

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

'warn' is no longer allowed in newer sklearn versions, so it's not referenced here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But init can't be 'exact'? Also, it can't be a numpy array according to this condition

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, there's no init 'exact'; and there's an 'or' condition where it allows numpy arrays.

),
]
)
_dal_ready = _patching_status.get_status(logs=True)
if not _dal_ready:
return super()._fit(X, skip_num_points)

if sklearn_check_version("1.0") and not sklearn_check_version("1.2"):
if isinstance(self.init, str) and self.init == "warn":
warnings.warn(
"The default initialization in TSNE will change "
"from 'random' to 'pca' in 1.2.",
FutureWarning,
)
self._init = "random"
else:
self._init = self.init
else:
self._init = self.init

if isinstance(self._init, str) and self._init == "pca" and issparse(X):
raise TypeError(
"PCA initialization is currently not supported "
"with the sparse input matrix. Use "
'init="random" instead.'
)

if self.method not in ["barnes_hut", "exact"]:
raise ValueError("'method' must be 'barnes_hut' or 'exact'")
if self.angle < 0.0 or self.angle > 1.0:
raise ValueError("'angle' must be between 0.0 - 1.0")
if self.learning_rate == "warn":
warnings.warn(
"The default learning rate in TSNE will change "
"from 200.0 to 'auto' in 1.2.",
FutureWarning,
)
self._learning_rate = 200.0
if sklearn_check_version("1.0") and not sklearn_check_version("1.2"):
if self.learning_rate == "warn":
warnings.warn(
"The default learning rate in TSNE will change "
"from 200.0 to 'auto' in 1.2.",
FutureWarning,
)
self._learning_rate = 200.0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like more versioning is needed here, as it fails on older versions of scikit:
AttributeError: 'TSNE' object has no attribute '_learning_rate'

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed.

else:
self._learning_rate = self.learning_rate
else:
self._learning_rate = self.learning_rate
if self._learning_rate == "auto":
Expand Down Expand Up @@ -227,28 +280,15 @@ def _fit(self, X, skip_num_points=0):
"or provide the dense distance matrix."
)

if self.method == "barnes_hut" and self.n_components > 3:
raise ValueError(
"'n_components' should be inferior to 4 for the "
"barnes_hut algorithm as it relies on "
"quad-tree or oct-tree."
)
random_state = check_random_state(self.random_state)

if self.early_exaggeration < 1.0:
raise ValueError(
"early_exaggeration must be at least 1, but is {}".format(
self.early_exaggeration
)
)

if not sklearn_check_version("1.2"):
if self.n_iter < 250:
raise ValueError("n_iter should be at least 250")

n_samples = X.shape[0]

neighbors_nn = None
# neighbors_nn = None # <- unused variable in stock sklearn, commented out due to coverity
if self.method == "exact":
# Retrieve the distance matrix, either using the precomputed one or
# computing it.
Expand Down Expand Up @@ -278,9 +318,8 @@ def _fit(self, X, skip_num_points=0):
"All distances should be positive, the " "metric given is not correct"
)

if (
self.metric != "euclidean"
and getattr(self, "square_distances", True) is True
if self.metric != "euclidean" and (
sklearn_check_version("1.2") or self.square_distances is True
):
distances **= 2

Expand Down Expand Up @@ -339,15 +378,14 @@ def _fit(self, X, skip_num_points=0):
# Free the memory used by the ball_tree
del knn

if (
getattr(self, "square_distances", True) is True
or self.metric == "euclidean"
# knn returns the euclidean distance but we need it squared
# to be consistent with the 'exact' method. Note that
# the method was derived using the euclidean method as in the
# input space. Not sure of the implication of using a different
# metric.
if sklearn_check_version("1.2") or (
self.square_distances is True or self.metric == "euclidean"
):
# knn returns the euclidean distance but we need it squared
# to be consistent with the 'exact' method. Note that
# the method was derived using the euclidean method as in the
# input space. Not sure of the implication of using a different
# metric.
distances_nn.data **= 2

# compute the joint probability distribution for the input space
Expand All @@ -358,16 +396,23 @@ def _fit(self, X, skip_num_points=0):
elif self._init == "pca":
pca = PCA(
n_components=self.n_components,
svd_solver="randomized",
random_state=random_state,
)
if sklearn_check_version("1.2"):
# Always output a numpy array, no matter what is configured globally
pca.set_output(transform="default")
X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
warnings.warn(
"The PCA initialization in TSNE will change to "
"have the standard deviation of PC1 equal to 1e-4 "
"in 1.2. This will ensure better convergence.",
FutureWarning,
)
if sklearn_check_version("1.0") and not sklearn_check_version("1.2"):
warnings.warn(
"The PCA initialization in TSNE will change to "
"have the standard deviation of PC1 equal to 1e-4 "
"in 1.2. This will ensure better convergence.",
FutureWarning,
)
if sklearn_check_version("1.2"):
# PCA is rescaled so that PC1 has standard deviation 1e-4 which is
# the default value for random initialization. See issue #18018.
X_embedded = X_embedded / np.std(X_embedded[:, 0]) * 1e-4
elif self._init == "random":
# The embedding is initialized with iid samples from Gaussians with
# standard deviation 1e-4.
Expand All @@ -377,40 +422,11 @@ def _fit(self, X, skip_num_points=0):
else:
raise ValueError("'init' must be 'pca', 'random', or " "a numpy array")

# Degrees of freedom of the Student's t-distribution. The suggestion
# degrees_of_freedom = n_components - 1 comes from
# "Learning a Parametric Embedding by Preserving Local Structure"
# Laurens van der Maaten, 2009.
degrees_of_freedom = max(self.n_components - 1, 1)

_patching_status = PatchingConditionsChain("sklearn.manifold.TSNE._tsne")
_patching_status.and_conditions(
[
(
self.method == "barnes_hut",
'Used t-SNE method is not "barnes_hut" which is the only supported.',
),
(self.n_components == 2, "Number of components != 2."),
(self.verbose == 0, "Verbose mode is set."),
(
daal_check_version((2021, "P", 600)),
"oneDAL version is lower than 2021.6.",
),
]
)
_dal_ready = _patching_status.get_status(logs=True)
# Note: by this point, stock sklearn would calculate degrees of freedom, but oneDAL
# doesn't use them.

if _dal_ready:
X_embedded = check_array(X_embedded, dtype=[np.float32, np.float64])
return self._daal_tsne(P, n_samples, X_embedded=X_embedded)
return self._tsne(
P,
degrees_of_freedom,
n_samples,
X_embedded=X_embedded,
neighbors=neighbors_nn,
skip_num_points=skip_num_points,
)
X_embedded = check_array(X_embedded, dtype=[np.float32, np.float64])
return self._daal_tsne(P, n_samples, X_embedded=X_embedded)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it correct that in case method == exact we would still call this function?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code would not be reached with method 'exact'.


fit.__doc__ = BaseTSNE.fit.__doc__
fit_transform.__doc__ = BaseTSNE.fit_transform.__doc__
6 changes: 3 additions & 3 deletions doc/sources/algorithms.rst
Original file line number Diff line number Diff line change
Expand Up @@ -185,11 +185,11 @@ Dimensionality Reduction
- All parameters are supported except:

- ``metric`` != `'euclidean'` or `'minkowski'` with ``p`` != `2`

- ``n_components`` can only be `2`

- ``method`` != ``"barnes_hut"``

Refer to :ref:`TSNE acceleration details <acceleration_tsne>` to learn more.
- Sparse data is not supported
- Sparse data with ``init`` = ``"pca"`` is not supported

Nearest Neighbors
*****************
Expand Down
6 changes: 3 additions & 3 deletions doc/sources/guide/acceleration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ The overall acceleration of TSNE depends on the acceleration of each of these al
- ``metric`` != `'euclidean'` or `'minkowski'` with ``p`` != `2`
- The Gradient Descent part of the algorithm supports all parameters except:

- ``n_components`` = `3`
- ``method`` = `'exact'`
- ``verbose`` != `0`
- ``n_components`` > ``2``
- ``method`` = ``'exact'``
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But in case of the exact method we don't fall back to sklearn; should we add it as supported?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It falls back on the first condition in the patching chain.

- ``verbose`` != ``0``

To get better performance, use parameters supported by both components.

Expand Down
8 changes: 7 additions & 1 deletion sklearnex/manifold/tests/test_tsne.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
from numpy.testing import assert_allclose
from sklearn.metrics.pairwise import pairwise_distances

from daal4py.sklearn._utils import sklearn_check_version

# Note: n_components must be 2 for now
from onedal.tests.utils._dataframes_support import (
_as_numpy,
Expand Down Expand Up @@ -161,8 +163,12 @@ def test_tsne_functionality_and_edge_cases(
assert np.any(embedding != 0)


# Note: since sklearn 1.2, the PCA initialization divides by standard deviations of components.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to add another test case in the future instead of the removed one?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, what other case would you add?

# Since those will be zeros for constant data, it will end up producing NaNs, hence it's not tested.
@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
@pytest.mark.parametrize("init", ["pca", "random"])
@pytest.mark.parametrize(
"init", ["random"] + (["pca"] if not sklearn_check_version("1.2") else [])
)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_tsne_constant_data(init, dataframe, queue, dtype):
from sklearnex.manifold import TSNE
Expand Down
Loading