Commit 88b7328

more update
1 parent 0730752 commit 88b7328

File tree: 73 files changed, +481 −482 lines


examples/legacy/seq2seq/finetune_trainer.py

Lines changed: 3 additions & 3 deletions
@@ -231,9 +231,9 @@ def main():
 
     # set decoder_start_token_id for MBart
     if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
-        assert data_args.tgt_lang is not None and data_args.src_lang is not None, (
-            "mBart requires --tgt_lang and --src_lang"
-        )
+        assert (
+            data_args.tgt_lang is not None and data_args.src_lang is not None
+        ), "mBart requires --tgt_lang and --src_lang"
         if isinstance(tokenizer, MBartTokenizer):
             model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.tgt_lang]
         else:

examples/legacy/seq2seq/utils.py

Lines changed: 3 additions & 3 deletions
@@ -283,9 +283,9 @@ def __init__(self, tokenizer, data_args, decoder_start_token_id, tpu_num_cores=N
         self.tokenizer = tokenizer
         self.pad_token_id = tokenizer.pad_token_id
         self.decoder_start_token_id = decoder_start_token_id
-        assert self.pad_token_id is not None, (
-            f"pad_token_id is not defined for ({self.tokenizer.__class__.__name__}), it must be defined."
-        )
+        assert (
+            self.pad_token_id is not None
+        ), f"pad_token_id is not defined for ({self.tokenizer.__class__.__name__}), it must be defined."
         self.data_args = data_args
         self.tpu_num_cores = tpu_num_cores
         self.dataset_kwargs = {"add_prefix_space": True} if isinstance(tokenizer, BartTokenizer) else {}

examples/pytorch/summarization/run_summarization.py

Lines changed: 3 additions & 3 deletions
@@ -504,9 +504,9 @@ def main():
         return
 
     if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
-        assert data_args.lang is not None, (
-            f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument"
-        )
+        assert (
+            data_args.lang is not None
+        ), f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument"
 
         tokenizer.src_lang = data_args.lang
         tokenizer.tgt_lang = data_args.lang

examples/pytorch/text-classification/run_classification.py

Lines changed: 6 additions & 6 deletions
@@ -198,9 +198,9 @@ def __post_init__(self):
             train_extension = self.train_file.split(".")[-1]
             assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
             validation_extension = self.validation_file.split(".")[-1]
-            assert validation_extension == train_extension, (
-                "`validation_file` should have the same extension (csv or json) as `train_file`."
-            )
+            assert (
+                validation_extension == train_extension
+            ), "`validation_file` should have the same extension (csv or json) as `train_file`."
 
 
 @dataclass
@@ -356,9 +356,9 @@ def main():
         if data_args.test_file is not None:
             train_extension = data_args.train_file.split(".")[-1]
             test_extension = data_args.test_file.split(".")[-1]
-            assert test_extension == train_extension, (
-                "`test_file` should have the same extension (csv or json) as `train_file`."
-            )
+            assert (
+                test_extension == train_extension
+            ), "`test_file` should have the same extension (csv or json) as `train_file`."
             data_files["test"] = data_args.test_file
         else:
             raise ValueError("Need either a dataset name or a test file for `do_predict`.")

examples/pytorch/text-classification/run_glue.py

Lines changed: 6 additions & 6 deletions
@@ -155,9 +155,9 @@ def __post_init__(self):
             train_extension = self.train_file.split(".")[-1]
             assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
             validation_extension = self.validation_file.split(".")[-1]
-            assert validation_extension == train_extension, (
-                "`validation_file` should have the same extension (csv or json) as `train_file`."
-            )
+            assert (
+                validation_extension == train_extension
+            ), "`validation_file` should have the same extension (csv or json) as `train_file`."
 
 
 @dataclass
@@ -312,9 +312,9 @@ def main():
         if data_args.test_file is not None:
             train_extension = data_args.train_file.split(".")[-1]
             test_extension = data_args.test_file.split(".")[-1]
-            assert test_extension == train_extension, (
-                "`test_file` should have the same extension (csv or json) as `train_file`."
-            )
+            assert (
+                test_extension == train_extension
+            ), "`test_file` should have the same extension (csv or json) as `train_file`."
             data_files["test"] = data_args.test_file
         else:
             raise ValueError("Need either a GLUE task or a test file for `do_predict`.")

examples/pytorch/translation/run_translation_no_trainer.py

Lines changed: 3 additions & 3 deletions
@@ -435,9 +435,9 @@ def main():
 
     # Set decoder_start_token_id
    if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
-        assert args.target_lang is not None and args.source_lang is not None, (
-            "mBart requires --target_lang and --source_lang"
-        )
+        assert (
+            args.target_lang is not None and args.source_lang is not None
+        ), "mBart requires --target_lang and --source_lang"
         if isinstance(tokenizer, MBartTokenizer):
             model.config.decoder_start_token_id = tokenizer.lang_code_to_id[args.target_lang]
         else:

examples/tensorflow/translation/run_translation.py

Lines changed: 3 additions & 3 deletions
@@ -500,9 +500,9 @@ def preprocess_function(examples):
 
     # region Set decoder_start_token_id
     if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
-        assert data_args.target_lang is not None and data_args.source_lang is not None, (
-            "mBart requires --target_lang and --source_lang"
-        )
+        assert (
+            data_args.target_lang is not None and data_args.source_lang is not None
+        ), "mBart requires --target_lang and --source_lang"
         if isinstance(tokenizer, MBartTokenizer):
             model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.target_lang]
         else:

src/transformers/cache_utils.py

Lines changed: 10 additions & 10 deletions
@@ -1673,7 +1673,7 @@ def __init__(
                 "config and it's not set to None."
             )
         self.config = config
-        self.device = device
+        self.device = device
         self.layer_device_map = layer_device_map
         self.max_cache_len = max_cache_len
         self.max_batch_size = max_batch_size
@@ -1816,13 +1816,13 @@ def reset(self):
 def _get_flat_dict_for_hybrid_cache(hybrid_cache: HybridCache):
     return {
         "config": getattr(hybrid_cache, "config"),
-        "device": str(getattr(hybrid_cache, "device")) if getattr(hybrid_cache, "device", None) != None else None,
+        "device": str(getattr(hybrid_cache, "device")) if getattr(hybrid_cache, "device", None) is not None else None,
         "layer_device_map": getattr(hybrid_cache, "layer_device_map"),
         "key_cache": getattr(hybrid_cache, "key_cache"),
         "value_cache": getattr(hybrid_cache, "value_cache"),
         "max_batch_size": getattr(hybrid_cache, "max_batch_size"),
         "max_cache_len": getattr(hybrid_cache, "max_cache_len"),
-        "_dtype": str(getattr(hybrid_cache, "_dtype")) if getattr(hybrid_cache, "_dtype", None) != None else None,
+        "_dtype": str(getattr(hybrid_cache, "_dtype")) if getattr(hybrid_cache, "_dtype", None) is not None else None,
     }
 
 
@@ -1833,9 +1833,9 @@ def _flatten_hybrid_cache(
     if not isinstance(hybrid_cache, HybridCache):
        raise RuntimeError("This pytree flattening function should only be applied to HybridCache")
 
-    if not is_torch_greater_or_equal_than_2_6:
+    if not is_torch_greater_or_equal_than_2_7:
         logger.warning_once(
-            "HybridCache + torch.export is tested on torch 2.6.0+ and may not work on earlier versions."
+            "HybridCache + torch.export is tested on torch 2.7.0+ and may not work on earlier versions."
         )
 
     return torch.utils._pytree._dict_flatten(_get_flat_dict_for_hybrid_cache(hybrid_cache))
@@ -1851,11 +1851,11 @@ def _unflatten_hybrid_cache(
 ):
     dictionary = torch.utils._pytree._dict_unflatten(values, context)
     hybrid_cache = HybridCache(
-        dictionary["config"],
-        dictionary["max_batch_size"],
-        dictionary["max_cache_len"],
-        torch.device(dictionary["device"]) if dictionary["device"] != None else None,
-        getattr(torch, dictionary["_dtype"][len("torch."):]) if dictionary["_dtype"] != None else None,
+        dictionary["config"],
+        dictionary["max_batch_size"],
+        dictionary["max_cache_len"],
+        torch.device(dictionary["device"]) if dictionary["device"] is not None else None,
+        getattr(torch, dictionary["_dtype"][len("torch.") :]) if dictionary["_dtype"] is not None else None,
         dictionary["layer_device_map"],
     )
 

src/transformers/integrations/tpu.py

Lines changed: 3 additions & 3 deletions
@@ -21,9 +21,9 @@ def tpu_spmd_dataloader(dataloader: DataLoader):
     if is_torch_xla_available():
         import torch_xla.distributed.parallel_loader as pl
 
-        assert isinstance(dataloader, pl.MpDeviceLoader), (
-            "The dataloader must be a `torch_xla.distributed.parallel_loader.MpDeviceLoader`."
-        )
+        assert isinstance(
+            dataloader, pl.MpDeviceLoader
+        ), "The dataloader must be a `torch_xla.distributed.parallel_loader.MpDeviceLoader`."
 
         # This is to support PyTorch/XLA FSDP via SPMD.
         # Here we shard the input data's 0th dim across the fsdp axis.

src/transformers/modeling_utils.py

Lines changed: 12 additions & 12 deletions
@@ -2542,9 +2542,9 @@ def tie_encoder_to_decoder_recursively(
             total_decoder_name="",
             total_encoder_name="",
         ):
-            assert isinstance(decoder_pointer, nn.Module) and isinstance(encoder_pointer, nn.Module), (
-                f"{decoder_pointer} and {encoder_pointer} have to be of type nn.Module"
-            )
+            assert isinstance(decoder_pointer, nn.Module) and isinstance(
+                encoder_pointer, nn.Module
+            ), f"{decoder_pointer} and {encoder_pointer} have to be of type nn.Module"
             if hasattr(decoder_pointer, "weight"):
                 assert hasattr(encoder_pointer, "weight")
                 encoder_pointer.weight = decoder_pointer.weight
@@ -2558,9 +2558,9 @@ def tie_encoder_to_decoder_recursively(
             encoder_modules = encoder_pointer._modules
             decoder_modules = decoder_pointer._modules
             if len(decoder_modules) > 0:
-                assert len(encoder_modules) > 0, (
-                    f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}"
-                )
+                assert (
+                    len(encoder_modules) > 0
+                ), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}"
 
                 all_encoder_weights = {module_name + "/" + sub_name for sub_name in encoder_modules.keys()}
                 encoder_layer_pos = 0
@@ -5464,9 +5464,9 @@ def forward(
         Returns:
             `torch.FloatTensor`: The end logits for SQuAD.
         """
-        assert start_states is not None or start_positions is not None, (
-            "One of start_states, start_positions should be not None"
-        )
+        assert (
+            start_states is not None or start_positions is not None
+        ), "One of start_states, start_positions should be not None"
         if start_positions is not None:
             slen, hsz = hidden_states.shape[-2:]
             start_positions = start_positions[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
@@ -5536,9 +5536,9 @@ def forward(
         """
         # No dependency on end_feature so that we can obtain one single `cls_logits` for each sample.
         hsz = hidden_states.shape[-1]
-        assert start_states is not None or start_positions is not None, (
-            "One of start_states, start_positions should be not None"
-        )
+        assert (
+            start_states is not None or start_positions is not None
+        ), "One of start_states, start_positions should be not None"
         if start_positions is not None:
             start_positions = start_positions[:, None, None].expand(-1, -1, hsz)  # shape (bsz, 1, hsz)
             start_states = hidden_states.gather(-2, start_positions).squeeze(-2)  # shape (bsz, hsz)

src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py

Lines changed: 9 additions & 9 deletions
@@ -127,27 +127,27 @@ def convert_data2vec_checkpoint_to_pytorch(
 
         # self-attention output
         self_output: BertSelfOutput = layer.attention.output
-        assert self_output.dense.weight.shape == data2vec_layer.self_attn.out_proj.weight.shape, (
-            f"Shape for self_output.dense.weight should be {data2vec_layer.self_attn.out_proj.weight.shape}"
-        )
+        assert (
+            self_output.dense.weight.shape == data2vec_layer.self_attn.out_proj.weight.shape
+        ), f"Shape for self_output.dense.weight should be {data2vec_layer.self_attn.out_proj.weight.shape}"
         self_output.dense.weight = data2vec_layer.self_attn.out_proj.weight
         self_output.dense.bias = data2vec_layer.self_attn.out_proj.bias
         self_output.LayerNorm.weight = data2vec_layer.self_attn_layer_norm.weight
         self_output.LayerNorm.bias = data2vec_layer.self_attn_layer_norm.bias
 
         # intermediate
         intermediate: BertIntermediate = layer.intermediate
-        assert intermediate.dense.weight.shape == data2vec_layer.fc1.weight.shape, (
-            f"Shape for intermediate.dense.weight should be {data2vec_layer.fc1.weight.shape}"
-        )
+        assert (
+            intermediate.dense.weight.shape == data2vec_layer.fc1.weight.shape
+        ), f"Shape for intermediate.dense.weight should be {data2vec_layer.fc1.weight.shape}"
         intermediate.dense.weight = data2vec_layer.fc1.weight
         intermediate.dense.bias = data2vec_layer.fc1.bias
 
         # output
         bert_output: BertOutput = layer.output
-        assert bert_output.dense.weight.shape == data2vec_layer.fc2.weight.shape, (
-            f"Shape for bert_output.dense.weight should be {data2vec_layer.fc2.weight.shape}"
-        )
+        assert (
+            bert_output.dense.weight.shape == data2vec_layer.fc2.weight.shape
+        ), f"Shape for bert_output.dense.weight should be {data2vec_layer.fc2.weight.shape}"
         bert_output.dense.weight = data2vec_layer.fc2.weight
         bert_output.dense.bias = data2vec_layer.fc2.bias
         bert_output.LayerNorm.weight = data2vec_layer.final_layer_norm.weight

src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py

Lines changed: 3 additions & 3 deletions
@@ -180,9 +180,9 @@ def check_and_map_params(hf_param, gluon_param):
         gluon_param = to_torch(params[gluon_param])
         shape_gluon = gluon_param.shape
 
-        assert shape_hf == shape_gluon, (
-            f"The gluon parameter {gluon_param} has shape {shape_gluon}, but expects shape {shape_hf} for Transformers"
-        )
+        assert (
+            shape_hf == shape_gluon
+        ), f"The gluon parameter {gluon_param} has shape {shape_gluon}, but expects shape {shape_hf} for Transformers"
 
         return gluon_param
 

src/transformers/models/deprecated/realm/modeling_realm.py

Lines changed: 3 additions & 3 deletions
@@ -139,9 +139,9 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path):
         elif m_name == "kernel":
             array = np.transpose(array)
         try:
-            assert pointer.shape == array.shape, (
-                f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
-            )
+            assert (
+                pointer.shape == array.shape
+            ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
         except AssertionError as e:
             e.args += (pointer.shape, array.shape)
             raise

src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py

Lines changed: 3 additions & 3 deletions
@@ -1095,9 +1095,9 @@ def call(
             batch_size, sequence_length = shape_list(input_ids)[:2]
         else:
             batch_size, sequence_length = shape_list(inputs_embeds)[:2]
-        assert self.config.pad_token_id is not None or batch_size == 1, (
-            "Cannot handle batch sizes > 1 if no padding token is defined."
-        )
+        assert (
+            self.config.pad_token_id is not None or batch_size == 1
+        ), "Cannot handle batch sizes > 1 if no padding token is defined."
 
         if not tf.is_tensor(sequence_lengths):
             in_logits = logits[0:batch_size, sequence_lengths]

src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py

Lines changed: 6 additions & 6 deletions
@@ -155,9 +155,9 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
                 p_i.data = torch.from_numpy(arr_i)
         else:
             try:
-                assert pointer.shape == array.shape, (
-                    f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
-                )
+                assert (
+                    pointer.shape == array.shape
+                ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
             except AssertionError as e:
                 e.args += (pointer.shape, array.shape)
                 raise
@@ -1238,9 +1238,9 @@ def forward(
         else:
             batch_size, sequence_length = inputs_embeds.shape[:2]
 
-        assert self.config.pad_token_id is not None or batch_size == 1, (
-            "Cannot handle batch sizes > 1 if no padding token is defined."
-        )
+        assert (
+            self.config.pad_token_id is not None or batch_size == 1
+        ), "Cannot handle batch sizes > 1 if no padding token is defined."
         if self.config.pad_token_id is None:
             sequence_lengths = -1
         else:

0 commit comments
