diff --git a/.github/workflows/dev_test_benckmark.yml b/.github/workflows/dev_test_benckmark.yml
index 5f6fc82502..add6f6fd2d 100644
--- a/.github/workflows/dev_test_benckmark.yml
+++ b/.github/workflows/dev_test_benckmark.yml
@@ -27,7 +27,7 @@ jobs:
     - name: Install dependencies
       run: |
         pip install wheel
-        pip install .[tests,onnxruntime,benchmark]
+        pip install .[tests,onnxruntime,benchmark] datasets
         pip install -U git+https://github.com/huggingface/evaluate
         pip install -U git+https://github.com/huggingface/diffusers
         pip install -U git+https://github.com/huggingface/transformers
diff --git a/.github/workflows/test_benckmark.yml b/.github/workflows/test_benckmark.yml
index e859e845d6..fe7df1a20c 100644
--- a/.github/workflows/test_benckmark.yml
+++ b/.github/workflows/test_benckmark.yml
@@ -30,7 +30,7 @@ jobs:
       - name: Install dependencies
         run: |
           pip install wheel
-          pip install .[tests,onnxruntime,benchmark]
+          pip install .[tests,onnxruntime,benchmark] datasets
       - name: Test with unittest
         run: |
           python -m unittest discover --start-directory tests/benchmark --pattern 'test_*.py'
diff --git a/.github/workflows/test_utils.yml b/.github/workflows/test_utils.yml
index b5f2e27fc6..4e849ca317 100644
--- a/.github/workflows/test_utils.yml
+++ b/.github/workflows/test_utils.yml
@@ -37,4 +37,13 @@ jobs:
       - name: Test with pytest
         working-directory: tests
         run: |
-          python -m pytest -s -vvvv utils
+          pytest utils -s -n auto -m "not datasets_test" --durations=0
+
+      - name: Install datasets
+        run: |
+          pip install datasets
+
+      - name: Tests needing datasets
+        working-directory: tests
+        run: |
+          pytest utils -s -n auto -m "datasets_test" --durations=0
diff --git a/optimum/gptq/data.py b/optimum/gptq/data.py
index b8734da478..7e5fc0b43d 100644
--- a/optimum/gptq/data.py
+++ b/optimum/gptq/data.py
@@ -18,7 +18,12 @@
 
 import numpy as np
 import torch
-from datasets import load_dataset
+
+from optimum.utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available
+
+
+if is_datasets_available():
+    from datasets import load_dataset
 
 
 """
@@ -113,6 +118,9 @@ def pad_block(block, pads):
 
 
 def get_wikitext2(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"):
+    if not is_datasets_available():
+        raise ImportError(DATASETS_IMPORT_ERROR.format("get_wikitext2"))
+
     if split == "train":
         data = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
     elif split == "validation":
@@ -132,6 +140,9 @@ def get_wikitext2(tokenizer: Any, seqlen: int, nsamples: int, split: str = "trai
 
 
 def get_c4(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"):
+    if not is_datasets_available():
+        raise ImportError(DATASETS_IMPORT_ERROR.format("get_c4"))
+
     if split == "train":
         data = load_dataset("allenai/c4", split="train", data_files={"train": "en/c4-train.00000-of-01024.json.gz"})
     elif split == "validation":
@@ -157,6 +168,9 @@ def get_c4(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"):
 
 
 def get_c4_new(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"):
+    if not is_datasets_available():
+        raise ImportError(DATASETS_IMPORT_ERROR.format("get_c4_new"))
+
     if split == "train":
         data = load_dataset("allenai/c4", split="train", data_files={"train": "en/c4-train.00000-of-01024.json.gz"})
     elif split == "validation":
diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
index 949d4d260d..849d8821eb 100644
--- a/optimum/gptq/quantizer.py
+++ b/optimum/gptq/quantizer.py
@@ -88,7 +88,7 @@ def __init__(
             dataset (`Union[List[str], str, Any]`, defaults to `None`):
                 The dataset used for quantization. You can provide your own dataset in a list of string or in a list of tokenized data
                 (e.g. [{ "input_ids": [ 1, 100, 15, ... ],"attention_mask": [ 1, 1, 1, ... ]},...])
-                or just use the original datasets used in GPTQ paper ['wikitext2','c4','c4-new','ptb','ptb-new'].
+                or just use the original datasets used in the GPTQ paper ['wikitext2','c4','c4-new'].
             group_size (int, defaults to 128):
                 The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
             damp_percent (`float`, defaults to `0.1`):
diff --git a/optimum/onnxruntime/configuration.py b/optimum/onnxruntime/configuration.py
index 2e3d9f32d6..adc1984795 100644
--- a/optimum/onnxruntime/configuration.py
+++ b/optimum/onnxruntime/configuration.py
@@ -18,9 +18,8 @@
 from dataclasses import asdict, dataclass, field
 from enum import Enum
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
-from datasets import Dataset
 from packaging.version import Version, parse
 
 from onnxruntime import __version__ as ort_version
@@ -33,6 +32,10 @@
 from ..utils import logging
 
 
+if TYPE_CHECKING:
+    from datasets import Dataset
+
+
 logger = logging.get_logger(__name__)
 
 # This value is used to indicate ORT which axis it should use to quantize an operator "per-channel"
@@ -117,7 +120,9 @@ def create_calibrator(
 
 class AutoCalibrationConfig:
     @staticmethod
-    def minmax(dataset: Dataset, moving_average: bool = False, averaging_constant: float = 0.01) -> CalibrationConfig:
+    def minmax(
+        dataset: "Dataset", moving_average: bool = False, averaging_constant: float = 0.01
+    ) -> CalibrationConfig:
         """
         Args:
             dataset (`Dataset`):
@@ -151,7 +156,7 @@ def minmax(dataset: Dataset, moving_average: bool = False, averaging_constant: f
 
     @staticmethod
     def entropy(
-        dataset: Dataset,
+        dataset: "Dataset",
         num_bins: int = 128,
         num_quantized_bins: int = 128,
     ) -> CalibrationConfig:
@@ -188,7 +193,7 @@ def entropy(
         )
 
     @staticmethod
-    def percentiles(dataset: Dataset, num_bins: int = 2048, percentile: float = 99.999) -> CalibrationConfig:
+    def percentiles(dataset: "Dataset", num_bins: int = 2048, percentile: float = 99.999) -> CalibrationConfig:
         """
         Args:
             dataset (`Dataset`):
diff --git a/optimum/onnxruntime/model.py b/optimum/onnxruntime/model.py
index caa662f382..4182abc925 100644
--- a/optimum/onnxruntime/model.py
+++ b/optimum/onnxruntime/model.py
@@ -14,10 +14,9 @@
 
 import logging
 import os
-from typing import Callable, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union
 
 import numpy as np
-from datasets import Dataset
 from transformers import EvalPrediction
 from transformers.trainer_pt_utils import nested_concat
 from transformers.trainer_utils import EvalLoopOutput
@@ -25,6 +24,10 @@
 from onnxruntime import InferenceSession
 
 
+if TYPE_CHECKING:
+    from datasets import Dataset
+
+
 logger = logging.getLogger(__name__)
 
 
@@ -59,7 +62,7 @@ def __init__(
         self.session = InferenceSession(str(model_path), providers=[execution_provider])
         self.onnx_input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())}
 
-    def evaluation_loop(self, dataset: Dataset):
+    def evaluation_loop(self, dataset: "Dataset"):
         """
         Run evaluation and returns metrics and predictions.
 
diff --git a/optimum/onnxruntime/quantization.py b/optimum/onnxruntime/quantization.py
index 056123f8d8..dca3584928 100644
--- a/optimum/onnxruntime/quantization.py
+++ b/optimum/onnxruntime/quantization.py
@@ -21,7 +21,6 @@
 from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union
 
 import onnx
-from datasets import Dataset, load_dataset
 from packaging.version import Version, parse
 from transformers import AutoConfig
 
@@ -29,6 +28,7 @@
 from onnxruntime.quantization import CalibrationDataReader, QuantFormat, QuantizationMode, QuantType
 from onnxruntime.quantization.onnx_quantizer import ONNXQuantizer
 from onnxruntime.quantization.qdq_quantizer import QDQQuantizer
+from optimum.utils.import_utils import requires_backends
 
 from ..quantization_base import OptimumQuantizer
 from ..utils.save_utils import maybe_save_preprocessors
@@ -40,6 +40,7 @@
 
 
 if TYPE_CHECKING:
+    from datasets import Dataset
     from transformers import PretrainedConfig
 
 LOGGER = logging.getLogger(__name__)
@@ -48,7 +49,7 @@
 class ORTCalibrationDataReader(CalibrationDataReader):
     __slots__ = ["batch_size", "dataset", "_dataset_iter"]
 
-    def __init__(self, dataset: Dataset, batch_size: int = 1):
+    def __init__(self, dataset: "Dataset", batch_size: int = 1):
         if dataset is None:
             raise ValueError("Provided dataset is None.")
 
@@ -157,7 +158,7 @@ def from_pretrained(
 
     def fit(
         self,
-        dataset: Dataset,
+        dataset: "Dataset",
         calibration_config: CalibrationConfig,
         onnx_augmented_model_name: Union[str, Path] = "augmented_model.onnx",
         operators_to_quantize: Optional[List[str]] = None,
@@ -211,7 +212,7 @@ def fit(
 
     def partial_fit(
         self,
-        dataset: Dataset,
+        dataset: "Dataset",
         calibration_config: CalibrationConfig,
         onnx_augmented_model_name: Union[str, Path] = "augmented_model.onnx",
         operators_to_quantize: Optional[List[str]] = None,
@@ -427,7 +428,7 @@ def get_calibration_dataset(
         seed: int = 2016,
         use_auth_token: Optional[Union[bool, str]] = None,
         token: Optional[Union[bool, str]] = None,
-    ) -> Dataset:
+    ) -> "Dataset":
         """
         Creates the calibration `datasets.Dataset` to use for the post-training static quantization calibration step.
 
@@ -473,6 +474,10 @@ def get_calibration_dataset(
                 "provided."
             )
 
+        requires_backends(self, ["datasets"])
+
+        from datasets import load_dataset
+
         calib_dataset = load_dataset(
             dataset_name,
             name=dataset_config_name,
@@ -491,7 +496,7 @@ def get_calibration_dataset(
 
         return self.clean_calibration_dataset(processed_calib_dataset)
 
-    def clean_calibration_dataset(self, dataset: Dataset) -> Dataset:
+    def clean_calibration_dataset(self, dataset: "Dataset") -> "Dataset":
         model = onnx.load(self.onnx_model_path)
         model_inputs = {input.name for input in model.graph.input}
         ignored_columns = list(set(dataset.column_names) - model_inputs)
diff --git a/optimum/onnxruntime/runs/calibrator.py b/optimum/onnxruntime/runs/calibrator.py
index c493a94374..bfdcd64d92 100644
--- a/optimum/onnxruntime/runs/calibrator.py
+++ b/optimum/onnxruntime/runs/calibrator.py
@@ -1,6 +1,4 @@
-from typing import Dict, List
-
-from datasets import Dataset
+from typing import TYPE_CHECKING, Dict, List
 
 from ...runs_base import Calibrator
 from .. import ORTQuantizer
@@ -9,10 +7,14 @@
 from ..preprocessors.passes import ExcludeGeLUNodes, ExcludeLayerNormNodes, ExcludeNodeAfter, ExcludeNodeFollowedBy
 
 
+if TYPE_CHECKING:
+    from datasets import Dataset
+
+
 class OnnxRuntimeCalibrator(Calibrator):
     def __init__(
         self,
-        calibration_dataset: Dataset,
+        calibration_dataset: "Dataset",
         quantizer: ORTQuantizer,
         model_path: str,
         qconfig: QuantizationConfig,
diff --git a/optimum/runs_base.py b/optimum/runs_base.py
index 3a1d164c60..dadd445818 100644
--- a/optimum/runs_base.py
+++ b/optimum/runs_base.py
@@ -2,13 +2,12 @@
 import subprocess
 from contextlib import contextmanager
 from time import perf_counter_ns
-from typing import Set
+from typing import TYPE_CHECKING, Set
 
 import numpy as np
 import optuna
 import torch
 import transformers
-from datasets import Dataset
 from tqdm import trange
 
 from . import version as optimum_version
@@ -21,6 +20,9 @@
 from .utils.runs import RunConfig, cpu_info_command
 
 
+if TYPE_CHECKING:
+    from datasets import Dataset
+
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 
@@ -34,7 +36,7 @@ def get_autoclass_name(task):
 
 class Calibrator:
     def __init__(
-        self, calibration_dataset: Dataset, quantizer, model_path, qconfig, calibration_params, node_exclusion
+        self, calibration_dataset: "Dataset", quantizer, model_path, qconfig, calibration_params, node_exclusion
     ):
         self.calibration_dataset = calibration_dataset
         self.quantizer = quantizer
diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py
index 35a6294ab5..405e3815b3 100644
--- a/optimum/utils/import_utils.py
+++ b/optimum/utils/import_utils.py
@@ -69,6 +69,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[
 _auto_gptq_available = _is_package_available("auto_gptq")
 _timm_available = _is_package_available("timm")
 _sentence_transformers_available = _is_package_available("sentence_transformers")
+_datasets_available = _is_package_available("datasets")
 
 torch_version = None
 if is_torch_available():
@@ -131,6 +132,10 @@ def is_sentence_transformers_available():
     return _sentence_transformers_available
 
 
+def is_datasets_available():
+    return _datasets_available
+
+
 def is_auto_gptq_available():
     if _auto_gptq_available:
         version_autogptq = version.parse(importlib_metadata.version("auto_gptq"))
@@ -230,6 +235,12 @@ def require_numpy_strictly_lower(package_version: str, message: str):
 -U transformers`. Please note that you may need to restart your runtime after installation.
 """
 
+DATASETS_IMPORT_ERROR = """
+{0} requires the datasets library but it was not found in your environment. You can install it with pip:
+`pip install datasets`. Please note that you may need to restart your runtime after installation.
+"""
+
+
 BACKENDS_MAPPING = OrderedDict(
     [
         ("diffusers", (is_diffusers_available, DIFFUSERS_IMPORT_ERROR)),
@@ -245,6 +256,7 @@ def require_numpy_strictly_lower(package_version: str, message: str):
             "transformers_434",
             (lambda: check_if_transformers_greater("4.34"), "{0} " + TRANSFORMERS_IMPORT_ERROR.format("4.34")),
         ),
+        ("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)),
     ]
 )
 
diff --git a/optimum/utils/preprocessing/base.py b/optimum/utils/preprocessing/base.py
index dc995ccc50..7cfda13ba7 100644
--- a/optimum/utils/preprocessing/base.py
+++ b/optimum/utils/preprocessing/base.py
@@ -20,15 +20,16 @@
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
 
-from datasets import Dataset, DatasetDict
-from datasets import load_dataset as datasets_load_dataset
 from transformers import PreTrainedTokenizerBase
 from transformers.image_processing_utils import BaseImageProcessor
 
+from optimum.utils.import_utils import requires_backends
+
 from .. import logging
 
 
 if TYPE_CHECKING:
+    from datasets import Dataset, DatasetDict
     from transformers import PretrainedConfig
 
 
@@ -102,11 +103,14 @@ def create_dataset_processing_func(
 
     def prepare_dataset(
         self,
-        dataset: Union[DatasetDict, Dataset],
+        dataset: Union["DatasetDict", "Dataset"],
         data_keys: Dict[str, str],
         ref_keys: Optional[List[str]] = None,
         split: Optional[str] = None,
-    ) -> Union[DatasetDict, Dataset]:
+    ) -> Union["DatasetDict", "Dataset"]:
+        requires_backends(self, ["datasets"])
+        from datasets import Dataset
+
         if isinstance(dataset, Dataset) and split is not None:
             raise ValueError("A Dataset and a split name were provided, but splits are for DatasetDict.")
         elif split is not None:
@@ -131,7 +135,12 @@ def load_dataset(
         num_samples: Optional[int] = None,
         shuffle: bool = False,
         **load_dataset_kwargs,
-    ) -> Union[DatasetDict, Dataset]:
+    ) -> Union["DatasetDict", "Dataset"]:
+        requires_backends(self, ["datasets"])
+
+        from datasets import Dataset, DatasetDict
+        from datasets import load_dataset as datasets_load_dataset
+
         dataset = datasets_load_dataset(path, **load_dataset_kwargs)
 
         if isinstance(dataset, DatasetDict) and load_smallest_split:
diff --git a/optimum/utils/testing_utils.py b/optimum/utils/testing_utils.py
index 76fe9a05b1..88b1acdb78 100644
--- a/optimum/utils/testing_utils.py
+++ b/optimum/utils/testing_utils.py
@@ -28,6 +28,7 @@
 from . import (
     is_accelerate_available,
     is_auto_gptq_available,
+    is_datasets_available,
     is_diffusers_available,
     is_sentence_transformers_available,
     is_timm_available,
@@ -146,6 +147,10 @@ def require_sentence_transformers(test_case):
     return unittest.skipUnless(is_sentence_transformers_available(), "test requires sentence-transformers")(test_case)
 
 
+def require_datasets(test_case):
+    return unittest.skipUnless(is_datasets_available(), "test requires datasets")(test_case)
+
+
 def grid_parameters(
     parameters: Dict[str, Iterable[Any]],
     yield_dict: bool = False,
diff --git a/pyproject.toml b/pyproject.toml
index 99a0f1c85f..17bcd90e06 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ markers = [
     "rocm_ep_test",
     "tensorflow_test",
     "timm_test",
+    "datasets_test",
     "run_in_series",
     "run_slow",
     "accelerate_test",
diff --git a/setup.py b/setup.py
index 82892bfcc8..accf6adf14 100644
--- a/setup.py
+++ b/setup.py
@@ -13,14 +13,11 @@
 
 
 REQUIRED_PKGS = [
-    "coloredlogs",
-    "sympy",
     "transformers>=4.29",
     "torch>=1.11",
     "packaging",
     "numpy",
     "huggingface_hub>=0.8.0",
-    "datasets",
 ]
 
 # TODO: unpin pytest once https://github.com/huggingface/transformers/pull/29154 is merged & released
diff --git a/tests/utils/test_task_processors.py b/tests/utils/test_task_processors.py
index 1656704807..5ffeab07b2 100644
--- a/tests/utils/test_task_processors.py
+++ b/tests/utils/test_task_processors.py
@@ -19,10 +19,16 @@
 from typing import TYPE_CHECKING, Any, Dict, Tuple, Union
 from unittest import TestCase
 
-from datasets import DatasetDict
+import pytest
 from transformers import AutoConfig, AutoFeatureExtractor, AutoTokenizer
 
+from optimum.utils.import_utils import is_datasets_available
 from optimum.utils.preprocessing import TaskProcessorsManager
+from optimum.utils.testing_utils import require_datasets
+
+
+if is_datasets_available():
+    from datasets import DatasetDict
 
 
 if TYPE_CHECKING:
@@ -122,6 +124,8 @@ def test_create_defaults_and_kwargs_from_preprocessor_kwargs_does_not_mutate_pre
         )
         self.assertDictEqual(preprocessor_kwargs, clone)
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset_unallowed_data_keys(self):
         task_processor = TaskProcessorsManager.get_task_processor_class_for_task(self.TASK_NAME)(
             self.CONFIG, self.PREPROCESSOR
@@ -188,15 +192,23 @@ def _test_load_dataset(
 
         return dataset
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset(self):
         return self._test_load_dataset(False, False, False)
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset_by_guessing_data_keys(self):
         return self._test_load_dataset(False, True, False)
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset_and_only_keep_necessary_columns(self):
         return self._test_load_dataset(False, False, True)
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_default_dataset(self):
         return self._test_load_dataset(True, False, False)
 
@@ -207,6 +219,8 @@ class TextClassificationProcessorTest(TestCase, TaskProcessorTestBase):
     PREPROCESSOR = TOKENIZER
     WRONG_PREPROCESSOR = IMAGE_PROCESSOR
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset_with_max_length(self):
         max_length = random.randint(4, 16)
         dataset = self._test_load_dataset(False, False, True, max_length=max_length)
@@ -223,6 +237,8 @@ class TokenClassificationProcessorTest(TestCase, TaskProcessorTestBase):
     PREPROCESSOR = TOKENIZER
     WRONG_PREPROCESSOR = IMAGE_PROCESSOR
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset_with_max_length(self):
         max_length = random.randint(4, 16)
         dataset = self._test_load_dataset(False, False, True, max_length=max_length)
@@ -232,6 +248,8 @@ def test_load_dataset_with_max_length(self):
         input_ids = dataset[0]["input_ids"]
         self.assertEqual(len(input_ids), max_length)
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_default_dataset(self):
         self.skipTest(
             "Skipping so as not to execute conll2003 remote code (test would require trust_remote_code=True)"
@@ -244,6 +262,8 @@ class QuestionAnsweringProcessorTest(TestCase, TaskProcessorTestBase):
     PREPROCESSOR = TOKENIZER
     WRONG_PREPROCESSOR = IMAGE_PROCESSOR
 
+    @require_datasets
+    @pytest.mark.datasets_test
     def test_load_dataset_with_max_length(self):
         max_length = 384
         dataset = self._test_load_dataset(False, False, True, max_length=max_length)