
Commit f9d6041

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent e1b834f commit f9d6041

File tree (9 files changed, +33 −28 lines):

litgpt/api.py
litgpt/attention.py
litgpt/config.py
litgpt/generate/base.py
litgpt/model.py
tests/generate/test_main.py
tests/test_batch.py
tests/test_chat.py
tests/test_model.py


litgpt/api.py

Lines changed: 9 additions & 3 deletions
@@ -384,7 +384,9 @@ def distribute(
         else:
             kv_cache_size = fixed_kv_cache_size
         model.set_kv_cache(
-            batch_size=1, max_seq_length=kv_cache_size, device=fabric.device,
+            batch_size=1,
+            max_seq_length=kv_cache_size,
+            device=fabric.device,
         )
         self.kv_cache_initialized = True
         self.fixed_kv_cache_size = fixed_kv_cache_size

@@ -513,7 +515,9 @@ def generate(
         else:
             device = self.preprocessor.device
         self.model.set_kv_cache(
-            batch_size=1, max_seq_length=max_returned_tokens, device=device,
+            batch_size=1,
+            max_seq_length=max_returned_tokens,
+            device=device,
         )
         self.kv_cache_initialized = True

@@ -522,7 +526,9 @@ def generate(
             tmp_device = self.model.mha.mask_cache.device
             self.model.clear_kv_cache()
             self.model.set_kv_cache(
-                batch_size=1, max_seq_length=max_returned_tokens, device=tmp_device,
+                batch_size=1,
+                max_seq_length=max_returned_tokens,
+                device=tmp_device,
             )
         else:
             for block in self.model.transformer.h:
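
These api.py hunks only re-wrap existing `set_kv_cache` calls onto one argument per line; behavior is unchanged. For context, a minimal sketch of the KV-cache lifecycle those calls belong to, using an illustrative tiny config (the config values and stand-alone setup are assumptions, not taken from this diff):

import torch
from litgpt import GPT, Config

# Assumed tiny model, purely to illustrate the call pattern shown above.
config = Config(block_size=128, vocab_size=50, n_layer=2, n_head=4, n_embd=64)
model = GPT(config)
device = torch.device("cpu")

# Allocate key/value caches before incremental decoding, as in the hunks above.
model.set_kv_cache(
    batch_size=1,
    max_seq_length=128,  # illustrative; api.py derives this value at runtime
    device=device,
)
# ... prefill and token-by-token generation would run here ...
model.clear_kv_cache()  # drop the caches before re-initializing with a new size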

litgpt/attention.py

Lines changed: 4 additions & 6 deletions
@@ -136,9 +136,9 @@ def __call__(
         if use_mask:
             # Special case requires building a mask. `mask_cache` is only needed
             # then.
-            assert (
-                self.mask_cache is not None
-            ), "mask_cache must be given if sliding window attention is used, or if input_pos given and T > 1"
+            assert self.mask_cache is not None, (
+                "mask_cache must be given if sliding window attention is used, or if input_pos given and T > 1"
+            )
             if is_causal:
                 mask = self.mask_cache[:T, :T].view(1, 1, T, T)
                 is_causal = False

@@ -156,9 +156,7 @@ def __call__(
             nh_k = self.config.n_query_groups
             q_per_kv = nh_q // nh_k
             if q_per_kv > 1:
-                mask = mask.unsqueeze(2).expand(
-                    -1, -1, q_per_kv, -1, -1
-                ).reshape(B, nh_q, T, -1)
+                mask = mask.unsqueeze(2).expand(-1, -1, q_per_kv, -1, -1).reshape(B, nh_q, T, -1)

         # Efficient attention using Flash Attention CUDA kernels.
         # NOTE: efficient implementation is disabled if `mask` is not None or softcapping is enabled.
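
The second hunk collapses a wrapped expression onto one line; the logic is unchanged. As a shape walk-through of that expression, here is a small sketch with illustrative sizes (the concrete numbers are assumptions, not from the diff):

import torch

# Assumed sizes: batch B=2, query length T=4, key length S=6,
# nh_q=8 query heads sharing nh_k=2 key/value groups, so q_per_kv=4.
B, T, S, nh_q, nh_k = 2, 4, 6, 8, 2
q_per_kv = nh_q // nh_k

mask = torch.ones(B, nh_k, T, S, dtype=torch.bool)  # one mask slice per KV group
# unsqueeze -> (B, nh_k, 1, T, S); expand -> (B, nh_k, q_per_kv, T, S);
# reshape -> (B, nh_q, T, S): every query head in a group reuses its group's mask.
mask = mask.unsqueeze(2).expand(-1, -1, q_per_kv, -1, -1).reshape(B, nh_q, T, -1)
print(mask.shape)  # torch.Size([2, 8, 4, 6])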

litgpt/config.py

Lines changed: 1 addition & 2 deletions
@@ -3,15 +3,14 @@
 from copy import deepcopy
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, Callable, Literal, Optional, Type, Union, List
+from typing import Any, Callable, List, Literal, Optional, Type, Union

 import torch
 import yaml
 from typing_extensions import Self

 from litgpt.utils import find_multiple

-
 # See `Config.start_of_layer_hook`. A start of layer hook is called just before
 # a layer is computed. The call is `hook(x, block_idx, input_pos)`, where
 # `x` is the layer input, `block_idx` the number of the layer, and `input_pos`
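
The comment above documents the start-of-layer hook signature. A hypothetical hook matching that signature (the logging body is an assumption and is not part of this diff; whether a return value is used is not shown here):

from typing import Optional

import torch

def print_activation_norms(x: torch.Tensor, block_idx: int, input_pos: Optional[torch.Tensor]) -> None:
    # Called just before block `block_idx` with its input `x` and the token position(s).
    print(f"block {block_idx}: |x| = {x.norm().item():.3f}, input_pos = {input_pos}")

Such a function could then be passed to `GPT.set_start_of_layer_hook`, whose signature is reformatted in the litgpt/model.py hunks below.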

litgpt/generate/base.py

Lines changed: 1 addition & 1 deletion
@@ -171,7 +171,7 @@ def generate_fn(

     prompt_size = prompt.size(0)
     if prompt_size == 0:
-        raise ValueError(f"prompt must not be empty")
+        raise ValueError("prompt must not be empty")
     sample_kwargs = dict(
         temperature=temperature,
         top_k=top_k,

litgpt/model.py

Lines changed: 7 additions & 3 deletions
@@ -5,9 +5,10 @@
 Based on the nanoGPT implementation: https://github.com/karpathy/nanoGPT and
 https://github.com/EleutherAI/gpt-neox/tree/main/megatron/model.
 """
+
+from dataclasses import replace
 from functools import partial
 from typing import Any, List, Optional, Tuple, Union
-from dataclasses import replace

 import torch
 import torch.nn as nn

@@ -189,7 +190,8 @@ def reset_parameters(self) -> None:
         self.mha.set_seq_length(self.max_seq_length, device=self.cos.device)

     def set_start_of_layer_hook(
-        self, hook: Optional[StartOfLayerHook],
+        self,
+        hook: Optional[StartOfLayerHook],
     ):
         """
         Sets a function `hook(x, block_idx, input_pos)`, which is called

@@ -452,7 +454,9 @@ def get_kv_cache_params(self) -> Optional[KVCacheParams]:
             batch_size = min(c.batch_size for c in caches)
             cache_length = min(c.cache_length for c in caches)
             params = replace(
-                params, batch_size=batch_size, cache_length=cache_length,
+                params,
+                batch_size=batch_size,
+                cache_length=cache_length,
             )
         return params

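
The last hunk re-wraps a `dataclasses.replace` call. A minimal sketch of that pattern on a stand-in dataclass (the fields shown here are assumptions for illustration, not the real `KVCacheParams` definition):

from dataclasses import dataclass, replace

@dataclass(frozen=True)
class KVCacheParams:
    batch_size: int
    cache_length: int

params = KVCacheParams(batch_size=8, cache_length=4096)
# Keep the most restrictive values found across all layer caches, as above.
params = replace(params, batch_size=1, cache_length=2048)
print(params)  # KVCacheParams(batch_size=1, cache_length=2048)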

tests/generate/test_main.py

Lines changed: 3 additions & 4 deletions
@@ -174,7 +174,8 @@ def test_main(fake_checkpoint_dir, monkeypatch, tensor_like):
     )
     assert (
         generate_mock.mock_calls
-        == [call(ANY, tensor_like, len_return_value, **sample_kwargs, eos_id=tokenizer_mock.return_value.eos_id)] * num_samples
+        == [call(ANY, tensor_like, len_return_value, **sample_kwargs, eos_id=tokenizer_mock.return_value.eos_id)]
+        * num_samples
     )
     expected_output = "foo bar baz\n" * num_samples
     # Allow for the config to be printed before the expected repeated strings.

@@ -209,9 +210,7 @@ def test_sample(temperature):
     )
     # Note: Both `sample` and `batched_sample` create only 1 sample, not 3.
     # It is like passing `logits[:, 1-:, :]`
-    token = batched_sample(
-        logits, kwargs=dict(temperature=temperature, top_p=0.8)
-    )
+    token = batched_sample(logits, kwargs=dict(temperature=temperature, top_p=0.8))

     assert token.shape == (2, 1)
     # sample is batch size 1 only for now - this should be [0, 1] once batched generation is supported

tests/test_batch.py

Lines changed: 6 additions & 2 deletions
@@ -32,7 +32,9 @@ def create_llm(tmp_path, batch_size, max_seq_length, device) -> tuple[LLM, GPT]:
     )
     model: GPT = llm.model
     model.set_kv_cache(
-        batch_size=batch_size, max_seq_length=max_seq_length, device=device,
+        batch_size=batch_size,
+        max_seq_length=max_seq_length,
+        device=device,
     )

     return llm, model

@@ -89,7 +91,9 @@ def test_batched_equivalence(tmp_path):
     # Switch to batched generation
     model.clear_kv_cache()
     model.set_kv_cache(
-        batch_size=batch_size, max_seq_length=max_seq_length, device=device,
+        batch_size=batch_size,
+        max_seq_length=max_seq_length,
+        device=device,
     )

     toks_1: torch.Tensor = batched_next_token(

tests/test_chat.py

Lines changed: 2 additions & 6 deletions
@@ -47,12 +47,8 @@ def test_generate(monkeypatch, generated, stop_tokens, expected):
     model.config.block_size = 100
     model.max_seq_length = 100
     # Mock methods called during generation
-    monkeypatch.setattr(
-        model, "kv_cache_max_prefill_length", lambda: 80
-    )
-    monkeypatch.setattr(
-        model, "kv_cache_max_tokens_forward", lambda: 20
-    )
+    monkeypatch.setattr(model, "kv_cache_max_prefill_length", lambda: 80)
+    monkeypatch.setattr(model, "kv_cache_max_tokens_forward", lambda: 20)
     it = iter(generated)

     def multinomial(*_, **__):

tests/test_model.py

Lines changed: 0 additions & 1 deletion
@@ -4,7 +4,6 @@
 import random
 from copy import deepcopy
 from functools import partial
-from unittest import mock

 import pytest
 import torch
