diff --git a/pyproject.toml b/pyproject.toml
index e978edd38..bc65e0c5a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,7 @@ dependencies = [
 plotting = ["matplotlib>=3.10.1", "seaborn>=0.13.2"]
 backend = [
-    "peft>=0.14.0",
+    "peft>=0.18.0",
     "hf-xet>=1.1.0",
     "bitsandbytes>=0.45.2",
     "unsloth==2026.2.1",
@@ -30,7 +30,7 @@ backend = [
     "awscli>=1.38.1",
     "setuptools>=78.1.0",
     "wandb==0.25.0",
-    "transformers>=4.55.2,<=4.57.3",
+    "transformers==5.2.0",
     "duckdb>=1.0.0",
     "pyarrow>=15.0.0",
     "trl==0.20.0",
@@ -65,7 +65,7 @@ tinker = [
    "pydantic>=2.12.5",
     "tinker>=0.8.1",
     "torch>=2.8.0",
-    "transformers>=4.55.2,<=4.57.3",
+    "transformers==5.2.0",
     "uvicorn>=0.35.0",
     "datrie>=0.8.3",
 ]
@@ -122,7 +122,13 @@ required-version = ">=0.6.15"
 # Override numpy to <2.0 for compatibility with megatron-core in the training
 # environment. vLLM 0.15.1 pulls opencv-python-headless>=4.13 which wants
 # numpy>=2 on Python 3.9+, but megatron-core requires numpy<2.
-override-dependencies = ["transformer-engine>=2.11.0", "numpy<2"]
+override-dependencies = [
+    "transformer-engine>=2.11.0",
+    "numpy<2",
+    # Override unsloth's overly strict constraint on transformers — v5.x
+    # is confirmed working per unsloth February-2026 release notes
+    "transformers==5.2.0",
+]
 exclude-dependencies = ["pynvml"]
 no-build-isolation-package = ["apex", "transformer-engine", "transformer-engine-cu12", "transformer-engine-torch", "megatron-core", "megatron-bridge", "nv-grouped-gemm", "mamba-ssm", "causal-conv1d"]
diff --git a/src/art/__init__.py b/src/art/__init__.py
index 3272944da..bc82c4dcb 100644
--- a/src/art/__init__.py
+++ b/src/art/__init__.py
@@ -40,9 +40,13 @@ def __init__(self, **kwargs):
     import transformers
 
     try:
-        from .transformers.patches import patch_preprocess_mask_arguments
+        from .transformers.patches import (
+            patch_apply_chat_template,
+            patch_preprocess_mask_arguments,
+        )
 
         patch_preprocess_mask_arguments()
+        patch_apply_chat_template()
     except Exception:
         pass
 except ImportError:
diff --git a/src/art/dev/model.py b/src/art/dev/model.py
index 84a13f1d0..8c52e92ee 100644
--- a/src/art/dev/model.py
+++ b/src/art/dev/model.py
@@ -197,7 +197,6 @@ class PeftArgs(TypedDict, total=False):
 
 class TrainerArgs(TypedDict, total=False):
     output_dir: str | None
-    overwrite_output_dir: bool
     do_train: bool
     do_eval: bool
     do_predict: bool
@@ -226,7 +225,6 @@ class TrainerArgs(TypedDict, total=False):
     log_level: str
     log_level_replica: str
     log_on_each_node: bool
-    logging_dir: str | None
     logging_strategy: "IntervalStrategy | str"
     logging_first_step: bool
     logging_steps: float
@@ -243,25 +241,21 @@ class TrainerArgs(TypedDict, total=False):
     use_mps_device: bool
     seed: int
     data_seed: int | None
-    jit_mode_eval: bool
     use_ipex: bool
     bf16: bool
     fp16: bool
     fp16_opt_level: str
-    half_precision_backend: str
     bf16_full_eval: bool
     fp16_full_eval: bool
     tf32: bool | None
     local_rank: int
     ddp_backend: str | None
-    tpu_num_cores: int | None
     tpu_metrics_debug: bool
     debug: str | list[DebugOption]
     dataloader_drop_last: bool
     eval_steps: float | None
     dataloader_num_workers: int
     dataloader_prefetch_factor: int | None
-    past_index: int
     run_name: str | None
     disable_tqdm: bool | None
     remove_unused_columns: bool | None
@@ -302,15 +296,8 @@ class TrainerArgs(TypedDict, total=False):
     include_inputs_for_metrics: bool
     include_for_metrics: list[str]
     eval_do_concat_batches: bool
-    fp16_backend: str
-    push_to_hub_model_id: str | None
-    push_to_hub_organization: str | None
-    push_to_hub_token: str | None
-    mp_parameters: str
     auto_find_batch_size: bool
     full_determinism: bool
-    torchdynamo: str | None
-    ray_scope: str | None
     ddp_timeout: int
     torch_compile: bool
     torch_compile_backend: str | None
diff --git a/src/art/preprocessing/tokenize.py b/src/art/preprocessing/tokenize.py
index a8f553c60..f4d5694e9 100644
--- a/src/art/preprocessing/tokenize.py
+++ b/src/art/preprocessing/tokenize.py
@@ -197,9 +197,7 @@ def tokenize_trajectory(
             continue_final_message=True,
         ),
     )
-    sentinal_token_id = max(
-        set(range(cast(int, tokenizer.vocab_size))) - set(original_token_ids)
-    )
+    sentinal_token_id = max(set(range(tokenizer.vocab_size)) - set(original_token_ids))
     sentinal_token = tokenizer.decode(sentinal_token_id)
     token_template_messages: list[dict[str, Any]] = []
     for original, message in zip(messages_and_choices, messages):
@@ -287,11 +285,14 @@ def tokenize_trajectory(
         except (IndexError, ValueError):
             token_ids[start:end] = [
                 token_id if token_id is not None else tokenizer.eos_token_id
-                for token_id in tokenizer.convert_tokens_to_ids(
-                    [
-                        token_logprob.token or tokenizer.eos_token
-                        for token_logprob in token_logprobs
-                    ]
+                for token_id in cast(
+                    list[int],
+                    tokenizer.convert_tokens_to_ids(
+                        [
+                            token_logprob.token or tokenizer.eos_token
+                            for token_logprob in token_logprobs
+                        ]
+                    ),
                 )
             ]
             logprobs[start:end] = (
@@ -346,7 +347,7 @@ def tokenize_trajectory(
     return TokenizedResult(
         advantage=advantage,
         chat=chat,
-        tokens=[tokenizer.decode(token_id) for token_id in token_ids],
+        tokens=[cast(str, tokenizer.decode(token_id)) for token_id in token_ids],
         token_ids=token_ids,
         input_pos=list(range(len(token_ids))),
         assistant_mask=assistant_mask,
diff --git a/src/art/transformers/patches.py b/src/art/transformers/patches.py
index 97e09f6c8..9b2d9088b 100644
--- a/src/art/transformers/patches.py
+++ b/src/art/transformers/patches.py
@@ -1,9 +1,11 @@
+import functools
 from typing import TYPE_CHECKING, Optional, Union
 
 import torch
 from transformers import masking_utils
 from transformers.cache_utils import Cache
 from transformers.configuration_utils import PretrainedConfig
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
 if TYPE_CHECKING:
     from torch.nn.attention.flex_attention import BlockMask
@@ -35,3 +37,19 @@ def _patched_preprocess_mask_arguments(
 
 def patch_preprocess_mask_arguments() -> None:
     masking_utils._preprocess_mask_arguments = _patched_preprocess_mask_arguments  # ty:ignore[invalid-assignment]
+
+
+def patch_apply_chat_template() -> None:
+    """Default return_dict=False in apply_chat_template for transformers v5.
+
+    Transformers v5 flipped the default of return_dict to True, so tokenized
+    calls return a BatchEncoding instead of list[int]. This restores the v4
+    behavior so all call sites get list[int] back.
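+
+    Illustrative sketch only (assuming a chat-tuned tokenizer); with the
+    patch applied, the default matches the v4 API:
+
+        ids = tokenizer.apply_chat_template(messages, tokenize=True)
+        assert isinstance(ids, list)  # list[int], not a BatchEncoding
+
+    Call sites that want the v5 BatchEncoding can still pass return_dict=True
+    explicitly, since setdefault only fills in a missing kwarg.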
+ """ + original = PreTrainedTokenizerBase.apply_chat_template + + @functools.wraps(original) + def _patched(self, *args, **kwargs): # type: ignore + kwargs.setdefault("return_dict", False) + return original(self, *args, **kwargs) + + PreTrainedTokenizerBase.apply_chat_template = _patched # type: ignore diff --git a/src/art/unsloth/service.py b/src/art/unsloth/service.py index cfb95b3ce..94d01b78d 100644 --- a/src/art/unsloth/service.py +++ b/src/art/unsloth/service.py @@ -13,8 +13,8 @@ from datasets import Dataset import peft import torch +from transformers import GenerationMixin, PreTrainedModel from transformers.tokenization_utils_base import PreTrainedTokenizerBase -from transformers.utils.dummy_pt_objects import GenerationMixin, PreTrainedModel from trl import GRPOConfig, GRPOTrainer from vllm import AsyncEngineArgs from vllm.lora.request import LoRARequest @@ -30,6 +30,7 @@ packed_tensors_from_dir, ) from ..preprocessing.tokenize import SFTBatch +from ..utils.convert_moe_lora import convert_checkpoint_if_needed from ..utils.get_model_step import get_step_from_dir from ..utils.output_dirs import get_step_checkpoint_dir from ..vllm import get_llm, get_worker, openai_server_task, run_on_workers @@ -156,6 +157,7 @@ def save_checkpoint( checkpoint_dir = get_step_checkpoint_dir(output_dir, next_step) os.makedirs(checkpoint_dir, exist_ok=True) trainer.save_model(checkpoint_dir) + convert_checkpoint_if_needed(checkpoint_dir) return checkpoint_dir @@ -436,6 +438,7 @@ async def start_openai_server( lora_path = get_step_checkpoint_dir(self.output_dir, 0) os.makedirs(os.path.dirname(lora_path), exist_ok=True) self._state.trainer.save_model(lora_path) + convert_checkpoint_if_needed(lora_path) self._latest_step = 0 else: self._latest_step = get_step_from_dir(self.output_dir) @@ -921,6 +924,11 @@ def _state(self) -> UnslothState: ), ) + # Unsloth's model patching can leave the PEFT model without + # `warnings_issued`, which GRPOTrainer expects during init. + if not hasattr(peft_model, "warnings_issued"): + peft_model.warnings_issued = {} # type: ignore[attr-defined] + # Initialize trainer with dummy dataset data = {"prompt": ""} trainer = GRPOTrainer( diff --git a/src/art/utils/convert_moe_lora.py b/src/art/utils/convert_moe_lora.py new file mode 100644 index 000000000..0ea80f63a --- /dev/null +++ b/src/art/utils/convert_moe_lora.py @@ -0,0 +1,181 @@ +"""Convert fused MoE LoRA adapters to per-expert format for vLLM compatibility. + +Unsloth with transformers v5 saves MoE expert LoRA as fused 2D tensors: + mlp.experts.base_layer.lora_A [num_experts*rank, intermediate*2] (gate_up_proj) + mlp.experts.base_layer.lora_B [hidden, num_experts*rank] (gate_up_proj) + mlp.experts.lora_A [num_experts*rank, hidden] (down_proj) + mlp.experts.lora_B [intermediate, num_experts*rank] (down_proj) + +vLLM expects per-expert keys: + mlp.experts.0.gate_proj.lora_A [rank, hidden] + mlp.experts.0.gate_proj.lora_B [intermediate, rank] + ... 
+""" + +import json +import os +import re + +import safetensors.torch +import torch + + +def _has_fused_moe_lora(tensors: dict[str, torch.Tensor]) -> bool: + """Check if the adapter contains fused MoE LoRA tensors.""" + return any( + re.search(r"mlp\.experts\.(base_layer\.)?lora_[AB]\.weight$", key) + for key in tensors + ) + + +def _infer_moe_params( + tensors: dict[str, torch.Tensor], + adapter_config: dict, +) -> tuple[int, int, int, int]: + """Infer num_experts, rank, intermediate_size, hidden_size from tensor shapes.""" + rank = adapter_config.get("r", adapter_config.get("lora_rank", 8)) + + for key, tensor in tensors.items(): + # gate_up_proj lora_A: [num_experts*rank, intermediate*2] + if re.search(r"mlp\.experts\.base_layer\.lora_A\.weight$", key): + num_experts_times_rank = tensor.shape[0] + intermediate_times_2 = tensor.shape[1] + num_experts = num_experts_times_rank // rank + intermediate_size = intermediate_times_2 // 2 + break + # down_proj lora_B: [intermediate, num_experts*rank] + if re.search(r"mlp\.experts\.lora_B\.weight$", key): + intermediate_size = tensor.shape[0] + num_experts = tensor.shape[1] // rank + break + else: + raise ValueError("Could not find fused MoE tensors to infer parameters") + + # Get hidden_size from gate_up_proj lora_B: [hidden, num_experts*rank] + # or from down_proj lora_A: [num_experts*rank, hidden] + for key, tensor in tensors.items(): + if re.search(r"mlp\.experts\.base_layer\.lora_B\.weight$", key): + hidden_size = tensor.shape[0] + break + if re.search(r"mlp\.experts\.lora_A\.weight$", key): + hidden_size = tensor.shape[1] + break + else: + raise ValueError("Could not infer hidden_size from fused MoE tensors") + + return num_experts, rank, intermediate_size, hidden_size + + +def convert_fused_moe_lora( + tensors: dict[str, torch.Tensor], + num_experts: int, + rank: int, + intermediate_size: int, + hidden_size: int, +) -> dict[str, torch.Tensor]: + """Convert fused MoE LoRA tensors to per-expert format. + + Non-expert tensors (e.g. self_attn) are passed through unchanged. 
+ """ + new_tensors: dict[str, torch.Tensor] = {} + + for key, tensor in tensors.items(): + # Non-expert tensors: keep as-is + m = re.match( + r"(.*\.mlp\.experts)\.(base_layer\.lora_(A|B)|lora_(A|B))\.weight$", + key, + ) + if not m: + new_tensors[key] = tensor + continue + + prefix = m.group(1) + is_base_layer = "base_layer" in key + is_A = "lora_A" in key + + if is_base_layer: + # gate_up_proj (fused gate + up) + if is_A: + # [num_experts*rank, intermediate*2] → per expert + per_expert = tensor.reshape(num_experts, rank, intermediate_size * 2) + for e in range(num_experts): + expert_a = per_expert[e] # [rank, intermediate*2] + gate_a = expert_a[:, :intermediate_size] + up_a = expert_a[:, intermediate_size:] + new_tensors[f"{prefix}.{e}.gate_proj.lora_B.weight"] = ( + gate_a.T.contiguous() + ) + new_tensors[f"{prefix}.{e}.up_proj.lora_B.weight"] = ( + up_a.T.contiguous() + ) + else: + # [hidden, num_experts*rank] → per expert + per_expert = tensor.reshape(hidden_size, num_experts, rank) + for e in range(num_experts): + expert_b = per_expert[:, e, :] # [hidden, rank] + new_tensors[f"{prefix}.{e}.gate_proj.lora_A.weight"] = ( + expert_b.T.contiguous() + ) + new_tensors[f"{prefix}.{e}.up_proj.lora_A.weight"] = ( + expert_b.T.contiguous() + ) + else: + # down_proj + if is_A: + # [num_experts*rank, hidden] → per expert + per_expert = tensor.reshape(num_experts, rank, hidden_size) + for e in range(num_experts): + expert_a = per_expert[e] # [rank, hidden] + new_tensors[f"{prefix}.{e}.down_proj.lora_B.weight"] = ( + expert_a.T.contiguous() + ) + else: + # [intermediate, num_experts*rank] → per expert + per_expert = tensor.reshape(intermediate_size, num_experts, rank) + for e in range(num_experts): + expert_b = per_expert[:, e, :] # [intermediate, rank] + new_tensors[f"{prefix}.{e}.down_proj.lora_A.weight"] = ( + expert_b.T.contiguous() + ) + + return new_tensors + + +def convert_checkpoint_if_needed(checkpoint_dir: str) -> None: + """Convert a checkpoint's MoE LoRA adapter to per-expert format if needed. + + This is a no-op for non-MoE adapters. 
+ """ + adapter_path = os.path.join(checkpoint_dir, "adapter_model.safetensors") + config_path = os.path.join(checkpoint_dir, "adapter_config.json") + + if not os.path.exists(adapter_path) or not os.path.exists(config_path): + return + + tensors = safetensors.torch.load_file(adapter_path) + if not _has_fused_moe_lora(tensors): + return + + with open(config_path) as f: + adapter_config = json.load(f) + + num_experts, rank, intermediate_size, hidden_size = _infer_moe_params( + tensors, adapter_config + ) + + new_tensors = convert_fused_moe_lora( + tensors, num_experts, rank, intermediate_size, hidden_size + ) + + # Overwrite the adapter with the converted tensors + safetensors.torch.save_file(new_tensors, adapter_path) + + # Update adapter_config.json target_modules + adapter_config["target_modules"] = [ + m for m in adapter_config.get("target_modules", []) if "experts" not in m + ] + ["gate_proj", "up_proj", "down_proj"] + # Remove target_parameters if present (not needed for per-expert format) + adapter_config.pop("target_parameters", None) + + with open(config_path, "w") as f: + json.dump(adapter_config, f, indent=2) diff --git a/uv.lock b/uv.lock index f002e314b..7d1dbc6d5 100644 --- a/uv.lock +++ b/uv.lock @@ -24,6 +24,7 @@ resolution-markers = [ overrides = [ { name = "numpy", specifier = "<2" }, { name = "transformer-engine", specifier = ">=2.11.0" }, + { name = "transformers", specifier = "==5.2.0" }, ] excludes = ["pynvml"] @@ -3031,21 +3032,23 @@ wheels = [ [[package]] name = "huggingface-hub" -version = "0.36.2" +version = "1.4.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, { name = "fsspec" }, - { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "hf-xet", marker = "platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "httpx" }, { name = "packaging" }, { name = "pyyaml" }, - { name = "requests" }, + { name = "shellingham" }, { name = "tqdm" }, + { name = "typer-slim" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/7c/b7/8cb61d2eece5fb05a83271da168186721c450eb74e3c31f7ef3169fa475b/huggingface_hub-0.36.2.tar.gz", hash = "sha256:1934304d2fb224f8afa3b87007d58501acfda9215b334eed53072dd5e815ff7a", size = 649782, upload-time = "2026-02-06T09:24:13.098Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c4/fc/eb9bc06130e8bbda6a616e1b80a7aa127681c448d6b49806f61db2670b61/huggingface_hub-1.4.1.tar.gz", hash = "sha256:b41131ec35e631e7383ab26d6146b8d8972abc8b6309b963b306fbcca87f5ed5", size = 642156, upload-time = "2026-02-06T09:20:03.013Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a8/af/48ac8483240de756d2438c380746e7130d1c6f75802ef22f3c6d49982787/huggingface_hub-0.36.2-py3-none-any.whl", hash = "sha256:48f0c8eac16145dfce371e9d2d7772854a4f591bcb56c9cf548accf531d54270", size = 566395, upload-time = "2026-02-06T09:24:11.133Z" }, + { url = "https://files.pythonhosted.org/packages/d5/ae/2f6d96b4e6c5478d87d606a1934b5d436c4a2bce6bb7c6fdece891c128e3/huggingface_hub-1.4.1-py3-none-any.whl", hash = "sha256:9931d075fb7a79af5abc487106414ec5fba2c0ae86104c0c62fd6cae38873d18", size = 553326, upload-time = "2026-02-06T09:20:00.728Z" }, ] [[package]] @@ -5300,7 +5303,7 @@ requires-dist = [ { name = "numpy", marker = "extra == 'tinker'" }, { name = 
"nvidia-ml-py", marker = "extra == 'megatron'", specifier = "==13.580.82" }, { name = "openai", specifier = ">=2.14.0" }, - { name = "peft", marker = "extra == 'backend'", specifier = ">=0.14.0" }, + { name = "peft", marker = "extra == 'backend'", specifier = ">=0.18.0" }, { name = "pillow", marker = "extra == 'tinker'" }, { name = "polars", specifier = ">=1.26.0" }, { name = "pyarrow", marker = "extra == 'backend'", specifier = ">=15.0.0" }, @@ -5318,8 +5321,8 @@ requires-dist = [ { name = "transformer-engine", marker = "extra == 'megatron'", specifier = "==2.11.0" }, { name = "transformer-engine-cu12", marker = "extra == 'megatron'", specifier = "==2.11.0" }, { name = "transformer-engine-torch", marker = "extra == 'megatron'", specifier = "==2.11.0" }, - { name = "transformers", marker = "extra == 'backend'", specifier = ">=4.55.2,<=4.57.3" }, - { name = "transformers", marker = "extra == 'tinker'", specifier = ">=4.55.2,<=4.57.3" }, + { name = "transformers", marker = "extra == 'backend'", specifier = "==5.2.0" }, + { name = "transformers", marker = "extra == 'tinker'", specifier = "==5.2.0" }, { name = "trl", marker = "extra == 'backend'", specifier = "==0.20.0" }, { name = "typer", specifier = ">=0.15.2" }, { name = "unsloth", marker = "extra == 'backend'", specifier = "==2026.2.1" }, @@ -8674,23 +8677,22 @@ sdist = { url = "https://files.pythonhosted.org/packages/09/42/068a40f5b213a3a88 [[package]] name = "transformers" -version = "4.57.3" +version = "5.2.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock" }, { name = "huggingface-hub" }, { name = "numpy" }, { name = "packaging" }, { name = "pyyaml" }, { name = "regex" }, - { name = "requests" }, { name = "safetensors" }, { name = "tokenizers" }, { name = "tqdm" }, + { name = "typer-slim" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/dd/70/d42a739e8dfde3d92bb2fff5819cbf331fe9657323221e79415cd5eb65ee/transformers-4.57.3.tar.gz", hash = "sha256:df4945029aaddd7c09eec5cad851f30662f8bd1746721b34cc031d70c65afebc", size = 10139680, upload-time = "2025-11-25T15:51:30.139Z" } +sdist = { url = "https://files.pythonhosted.org/packages/bd/7e/8a0c57d562015e5b16c97c1f0b8e0e92ead2c7c20513225dc12c2043ba9f/transformers-5.2.0.tar.gz", hash = "sha256:0088b8b46ccc9eff1a1dca72b5d618a5ee3b1befc3e418c9512b35dea9f9a650", size = 8618176, upload-time = "2026-02-16T18:54:02.867Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/6a/6b/2f416568b3c4c91c96e5a365d164f8a4a4a88030aa8ab4644181fdadce97/transformers-4.57.3-py3-none-any.whl", hash = "sha256:c77d353a4851b1880191603d36acb313411d3577f6e2897814f333841f7003f4", size = 11993463, upload-time = "2025-11-25T15:51:26.493Z" }, + { url = "https://files.pythonhosted.org/packages/4e/93/79754b0ca486e556c2b95d4f5afc66aaf4b260694f3d6e1b51da2d036691/transformers-5.2.0-py3-none-any.whl", hash = "sha256:9ecaf243dc45bee11a7d93f8caf03746accc0cb069181bbf4ad8566c53e854b4", size = 10403304, upload-time = "2026-02-16T18:53:59.699Z" }, ] [[package]] @@ -8797,6 +8799,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d5/91/9b286ab899c008c2cb05e8be99814807e7fbbd33f0c0c960470826e5ac82/typer-0.23.1-py3-none-any.whl", hash = "sha256:3291ad0d3c701cbf522012faccfbb29352ff16ad262db2139e6b01f15781f14e", size = 56813, upload-time = "2026-02-13T10:04:32.008Z" }, ] +[[package]] +name = "typer-slim" +version = "0.23.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typer" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/da/22/b9c47b8655937b6877d40791b937931702ba9c5f9d28753199266aa96f50/typer_slim-0.23.1.tar.gz", hash = "sha256:dfe92a6317030ee2380f65bf92e540d7c77fefcc689e10d585b4925b45b5e06a", size = 4762, upload-time = "2026-02-13T10:04:26.416Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ad/8a/5764b851659345f34787f1b6eb30b9d308bbd6c294825cbe38b6b869c97a/typer_slim-0.23.1-py3-none-any.whl", hash = "sha256:8146d5df1eb89f628191c4c604c8464fa841885d0733c58e6e700ff0228adac5", size = 3397, upload-time = "2026-02-13T10:04:27.132Z" }, +] + [[package]] name = "types-paramiko" version = "4.0.0.20250822"