diff --git a/examples/models/lfm2/short_conv.py b/examples/models/lfm2/short_conv.py index ae04580d6c6..08c00a1f414 100644 --- a/examples/models/lfm2/short_conv.py +++ b/examples/models/lfm2/short_conv.py @@ -74,7 +74,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: with torch.no_grad(): self.conv_state.copy_(new_conv_state) - conv_out = self.conv(Bx)[..., : x.size(-1)] # (batch_size, dim, seq_len) + # Manual depthwise conv: Triton has no template for nn.Conv1d with + # groups=dim and dynamic seq_len. kernel_size is always 3. + w = self.conv.weight[:, 0, :] # (dim, 3) + conv_out = ( + Bx[..., :-2] * w[:, 0:1] + + Bx[..., 1:-1] * w[:, 1:2] + + Bx[..., 2:] * w[:, 2:3] + ) # (batch_size, dim, seq_len) y = C * conv_out # (batch_size, dim, seq_len) y = y.transpose(-1, -2) # (batch_size, seq_len, dim) diff --git a/examples/models/lfm2_5_vl/__init__.py b/examples/models/lfm2_5_vl/__init__.py new file mode 100644 index 00000000000..f1fe2afba26 --- /dev/null +++ b/examples/models/lfm2_5_vl/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from executorch.examples.models.lfm2_5_vl.convert_weights import convert_weights +from executorch.examples.models.lfm2_5_vl.model import Lfm2p5VlModel + +__all__ = [ + "convert_weights", + "Lfm2p5VlModel", +] diff --git a/examples/models/lfm2_5_vl/config/lfm2_5_vl_1_6b_config.json b/examples/models/lfm2_5_vl/config/lfm2_5_vl_1_6b_config.json new file mode 100644 index 00000000000..396f7bb7a8a --- /dev/null +++ b/examples/models/lfm2_5_vl/config/lfm2_5_vl_1_6b_config.json @@ -0,0 +1,33 @@ +{ + "dim": 2048, + "ffn_dim_multiplier": 1, + "hidden_dim": 8192, + "n_heads": 32, + "n_kv_heads": 8, + "n_layers": 16, + "norm_eps": 1e-5, + "rope_theta": 1000000.0, + "use_scaled_rope": false, + "vocab_size": 65536, + "use_hf_rope": true, + "use_qk_norm": true, + "qk_norm_before_rope": true, + "layer_types": [ + "conv", + "conv", + "full_attention", + "conv", + "conv", + "full_attention", + "conv", + "conv", + "full_attention", + "conv", + "full_attention", + "conv", + "full_attention", + "conv", + "full_attention", + "conv" + ] +} diff --git a/examples/models/lfm2_5_vl/config/lfm2_5_vl_450m_config.json b/examples/models/lfm2_5_vl/config/lfm2_5_vl_450m_config.json new file mode 100644 index 00000000000..975ccbccca7 --- /dev/null +++ b/examples/models/lfm2_5_vl/config/lfm2_5_vl_450m_config.json @@ -0,0 +1,33 @@ +{ + "dim": 1024, + "ffn_dim_multiplier": 1, + "hidden_dim": 4608, + "n_heads": 16, + "n_kv_heads": 8, + "n_layers": 16, + "norm_eps": 1e-5, + "rope_theta": 1000000.0, + "use_scaled_rope": false, + "vocab_size": 65536, + "use_hf_rope": true, + "use_qk_norm": true, + "qk_norm_before_rope": true, + "layer_types": [ + "conv", + "conv", + "full_attention", + "conv", + "conv", + "full_attention", + "conv", + "conv", + "full_attention", + "conv", + "full_attention", + "conv", + "full_attention", + "conv", + "full_attention", + "conv" + ] +} diff --git a/examples/models/lfm2_5_vl/convert_weights.py b/examples/models/lfm2_5_vl/convert_weights.py new file mode 100644 index 
# Maps HuggingFace LFM2.5-VL language-model parameter names to the names used
# by the ExecuTorch transformer. "{}" is the placeholder for the layer index.
_LFM2_5_VL_TO_META: dict[str, str] = {
    "model.language_model.embed_tokens.weight": "tok_embeddings.weight",
    "model.language_model.embedding_norm.weight": "norm.weight",
    "model.language_model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
    "model.language_model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
    "model.language_model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
    "model.language_model.layers.{}.self_attn.out_proj.weight": "layers.{}.attention.wo.weight",
    "model.language_model.layers.{}.self_attn.q_layernorm.weight": "layers.{}.attention.q_norm_fn.weight",
    "model.language_model.layers.{}.self_attn.k_layernorm.weight": "layers.{}.attention.k_norm_fn.weight",
    "model.language_model.layers.{}.operator_norm.weight": "layers.{}.attention_norm.weight",
    "model.language_model.layers.{}.ffn_norm.weight": "layers.{}.ffn_norm.weight",
    "model.language_model.layers.{}.feed_forward.w1.weight": "layers.{}.feed_forward.w1.weight",
    "model.language_model.layers.{}.feed_forward.w2.weight": "layers.{}.feed_forward.w2.weight",
    "model.language_model.layers.{}.feed_forward.w3.weight": "layers.{}.feed_forward.w3.weight",
    "model.language_model.layers.{}.conv.conv.weight": "layers.{}.conv.conv.weight",
    "model.language_model.layers.{}.conv.out_proj.weight": "layers.{}.conv.out_proj.weight",
    "model.language_model.lm_head.weight": "output.weight",
}

# Names given, in order, to the three equal slices (along dim 0) of a fused
# ``conv.in_proj.weight`` tensor.
_IN_PROJ_SPLITS = ("B_proj", "C_proj", "x_proj")

# Only keys under this prefix belong to the text decoder.
_TEXT_PREFIX = "model.language_model."


def _split_in_proj(key: str, weight: torch.Tensor) -> dict[str, torch.Tensor]:
    """Split a fused ``in_proj`` weight into its B/C/x projections.

    Raises:
        ValueError: if dim 0 of ``weight`` is not a multiple of three.
            ``torch.chunk`` would otherwise return uneven or too few chunks
            and ``zip`` would silently drop or mis-size a projection.
    """
    parts = len(_IN_PROJ_SPLITS)
    rows = weight.shape[0]
    if rows == 0 or rows % parts != 0:
        raise ValueError(
            f"Cannot split {key!r} with shape {tuple(weight.shape)} into "
            f"{parts} equal projections along dim 0."
        )
    chunks = torch.chunk(weight, parts, dim=0)
    return {
        key.replace("in_proj", name): chunk
        for name, chunk in zip(_IN_PROJ_SPLITS, chunks)
    }


def lfm2_5_vl_to_meta(state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
    """Extract and remap language model weights from a full VL state dict.

    Keys that do not start with ``model.language_model.`` are ignored. Keys
    found in ``_LFM2_5_VL_TO_META`` are renamed; any other key just loses the
    prefix. Fused ``conv.in_proj.weight`` tensors are split into
    ``B_proj``/``C_proj``/``x_proj``. When no ``output.weight`` results, the
    token embedding is reused for it (same tensor object, not a copy).

    Args:
        state_dict: Parameter name -> tensor, using HuggingFace names.

    Returns:
        A new dict keyed by the converted names.

    Raises:
        ValueError: if a fused ``in_proj`` weight cannot be split evenly.
        KeyError: if neither an output head nor a token embedding is present.
    """
    converted: dict[str, torch.Tensor] = {}

    for key, value in state_dict.items():
        if not key.startswith(_TEXT_PREFIX):
            continue

        try:
            new_key = get_mapped_key(key, _LFM2_5_VL_TO_META)
        except Exception:
            # Unmapped keys keep their name minus the prefix. The broad catch
            # is kept from the original code; presumably get_mapped_key raises
            # a plain Exception for unknown keys -- TODO confirm and narrow.
            new_key = key.removeprefix(_TEXT_PREFIX)

        if new_key.endswith(".conv.in_proj.weight"):
            converted.update(_split_in_proj(new_key, value))
        else:
            converted[new_key] = value

    if "output.weight" not in converted:
        if "tok_embeddings.weight" not in converted:
            raise KeyError(
                "tok_embeddings.weight: state dict has neither "
                f"'{_TEXT_PREFIX}lm_head.weight' nor "
                f"'{_TEXT_PREFIX}embed_tokens.weight'; is this an "
                "LFM2.5-VL checkpoint?"
            )
        converted["output.weight"] = converted["tok_embeddings.weight"]

    return converted


def convert_weights(input_dir: str, output_file: str) -> None:
    """Convert ``<input_dir>/model.safetensors`` and save it with ``torch.save``.

    Args:
        input_dir: Directory containing a single ``model.safetensors`` file.
        output_file: Destination path of the converted checkpoint.

    Raises:
        FileNotFoundError: if ``model.safetensors`` is not in ``input_dir``.
    """
    checkpoint = Path(input_dir) / "model.safetensors"
    if not checkpoint.is_file():
        raise FileNotFoundError(
            f"{checkpoint} not found; sharded checkpoints are not supported."
        )
    sd = load_file(str(checkpoint))
    sd = lfm2_5_vl_to_meta(sd)
    torch.save(sd, output_file)
    print(f"Saved {len(sd)} tensors to {output_file}")


def main() -> None:
    """Command-line entry point: ``convert_weights.py INPUT_DIR OUTPUT``."""
    parser = argparse.ArgumentParser(description="Convert LFM2.5-VL weights to ET format.")
    parser.add_argument("input_dir", help="Directory containing model.safetensors.")
    parser.add_argument("output", help="Output .pt checkpoint path.")
    args = parser.parse_args()
    convert_weights(args.input_dir, args.output)


if __name__ == "__main__":
    main()
+ +""" +Export LFM2.5-VL as a multi-method PTE for ExecuTorch with CUDA/AOTI backend. + +All three methods are delegated to the CUDA backend. The KV cache and conv +state buffers are registered with torch._dynamo.mark_static_address +so AOTI can trace through their in-place mutations. + +Methods (D = text hidden dim): + vision_encoder : [1, 3, 512, 512] f32 -> [1, 256, D] f32 + token_embedding : [1, seq_len] i64 -> [1, seq_len, D] f32 + text_decoder : ([1, seq_len, D], [seq_len] i64) -> [1, vocab] f32 + +Usage: + python examples/models/lfm2_5_vl/export_lfm2_5_vl.py \\ + --model_dir LiquidAI/LFM2.5-VL-450M --dtype bf16 +""" + +from __future__ import annotations + +import logging +from argparse import ArgumentParser +from pathlib import Path +from typing import Optional + +import torch +from torch.export import Dim, ExportedProgram +from torch.nn.attention import SDPBackend + +from executorch.backends.cuda.cuda_backend import CudaBackend +from executorch.backends.cuda.cuda_partitioner import CudaPartitioner +from executorch.exir import ( + EdgeCompileConfig, + ExecutorchBackendConfig, + to_edge_transform_and_lower, +) +from executorch.exir.passes import MemoryPlanningPass +from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass + +from executorch.examples.models.lfm2_5_vl.model import ( + IMAGE_SIZE, + MAX_SEQ_LEN, + Lfm2p5VlModel, +) + +logging.basicConfig( + level=logging.INFO, + format="[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s", +) + +# --------------------------------------------------------------------------- +# Blackwell (sm_103) workaround: torch._inductor maps arch 103 -> "100f" but +# Triton generates PTX targeting sm_103a. Patch to match. 
# TODO: Remove once PyTorch bump includes the upstream fix in
# torch/_inductor/codegen/cuda/compile_utils.py
# ---------------------------------------------------------------------------
try:
    from torch._inductor.codecache import cuda_compile_utils

    _orig_nvcc_arch = cuda_compile_utils._nvcc_arch_as_compile_option

    def _patched_nvcc_arch() -> str:
        """Return "103a" on arch 103, otherwise defer to the original hook."""
        arch = cuda_compile_utils.cuda_env.get_cuda_arch()
        return "103a" if arch == "103" else _orig_nvcc_arch()

    cuda_compile_utils._nvcc_arch_as_compile_option = _patched_nvcc_arch
except (ImportError, AttributeError):
    # Best effort: this PyTorch build does not expose the hook being patched.
    logging.debug("Skipping sm_103 nvcc arch workaround (hook not found)")

# Directory holding the bundled per-model-size parameter files.
_CONFIG_DIR = Path(__file__).parent / "config"

# Accepted values of --dtype and the torch dtype each one selects.
_DTYPE_MAP: dict[str, torch.dtype] = {
    "fp32": torch.float32,
    "fp16": torch.float16,
    "bf16": torch.bfloat16,
}


def _resolve_params_path(model_dir: str, params: str | None) -> str | None:
    """Choose the params JSON for ``model_dir``.

    An explicit ``params`` always wins. Otherwise the model size is guessed
    from a case-insensitive substring of ``model_dir`` ("450m", "1.6b" or
    "1_6b"). Returns ``None`` when nothing matches.
    """
    if params is not None:
        return params
    name = model_dir.lower()
    if "450m" in name:
        return str(_CONFIG_DIR / "lfm2_5_vl_450m_config.json")
    if "1.6b" in name or "1_6b" in name:
        return str(_CONFIG_DIR / "lfm2_5_vl_1_6b_config.json")
    return None


# ---------------------------------------------------------------------------
# Per-method export
# ---------------------------------------------------------------------------


def _export_image_encoder(lfm2: torch.nn.Module, *, device: str) -> ExportedProgram:
    """Export ``lfm2.image_embedding`` for one fixed-size image.

    The example input is float32 with integer values in [0, 255] and shape
    (1, 3, IMAGE_SIZE, IMAGE_SIZE); no dimension is dynamic.
    """

    class _Encoder(torch.nn.Module):
        def __init__(self, lfm2: torch.nn.Module) -> None:
            super().__init__()
            self.lfm2 = lfm2

        def forward(self, images: torch.Tensor) -> torch.Tensor:
            return self.lfm2.image_embedding(images)

    example = torch.randint(0, 256, (1, 3, IMAGE_SIZE, IMAGE_SIZE), dtype=torch.float32, device=device)
    with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
        return torch.export.export(_Encoder(lfm2), (example,), strict=False)


def _export_text_decoder(lfm2: torch.nn.Module, *, dtype: torch.dtype, device: str) -> ExportedProgram:
    """Export the text decoder with a dynamic sequence length.

    Inputs are embeddings of shape (1, seq, dim) and positions of shape
    (seq,); both share one dynamic dimension. Its upper bound comes from the
    model's configured ``max_seq_len`` (not the module default), so it agrees
    with the ``get_max_seq_len`` metadata written by ``export_all``.
    """
    dim = lfm2.text_model_args.dim
    max_len = lfm2.text_model_args.max_seq_len

    class _Decoder(torch.nn.Module):
        def __init__(self, text_model: torch.nn.Module) -> None:
            super().__init__()
            self.text_model = text_model

        def forward(self, embeddings: torch.Tensor, input_pos: torch.Tensor) -> torch.Tensor:
            out = self.text_model(None, {"input_pos": input_pos}, embeddings)
            # The transformer may return a tuple; only element 0 is exported.
            if isinstance(out, tuple):
                out = out[0]
            return out.contiguous()

    # Keep the example inside the dynamic range when max_len is very small.
    seq = min(8, max_len - 1)
    token_dim = Dim("token_dim", min=1, max=max_len - 1)
    example_emb = torch.randn(1, seq, dim, dtype=dtype, device=device)
    example_pos = torch.arange(seq, dtype=torch.int64, device=device)

    with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
        return torch.export._trace._export(
            _Decoder(lfm2.text_model),
            (example_emb, example_pos),
            dynamic_shapes=({1: token_dim}, {0: token_dim}),
            strict=False,
            prefer_deferred_runtime_asserts_over_guards=True,
        )


def _export_token_embedding(lfm2: torch.nn.Module, *, device: str) -> ExportedProgram:
    """Export the HF input-embedding module with a dynamic token count.

    The bound follows the model's configured ``max_seq_len`` so that a
    non-default ``--max_seq_len`` is honoured here as well.
    """
    embed = lfm2.model_.model.language_model.get_input_embeddings()
    max_len = lfm2.text_model_args.max_seq_len
    token_dim = Dim("token_dim_1", min=1, max=max_len)
    example = torch.zeros(1, max_len, dtype=torch.int64, device=device)
    with torch.no_grad():
        return torch.export.export(embed, (example,), dynamic_shapes=[{1: token_dim}], strict=False)


# ---------------------------------------------------------------------------
# Pipeline
# ---------------------------------------------------------------------------


def export_all(
    model_dir: str,
    output: str,
    *,
    dtype: torch.dtype = torch.bfloat16,
    max_seq_len: int = MAX_SEQ_LEN,
    params_path: str | None = None,
) -> None:
    """Export vision encoder, token embedding and text decoder into one PTE.

    Args:
        model_dir: Model id or directory passed to ``Lfm2p5VlModel``.
        output: Path of the ``.pte`` file. Tensor data is written into the
            same directory, which is created if it does not exist.
        dtype: Dtype the eager model is cast to before export.
        max_seq_len: Used as both ``max_seq_len`` and ``max_context_len``.
        params_path: Params JSON; ``None`` lets the model pick its default.
    """
    device = "cuda"

    logging.info("Loading %s...", model_dir)
    lfm2_model = Lfm2p5VlModel(
        model_dir=model_dir,
        max_seq_len=max_seq_len,
        max_context_len=max_seq_len,
        params_path=params_path,
        use_sdpa_with_kv_cache_op=False,
    )
    lfm2 = lfm2_model.get_eager_model().to(dtype=dtype, device=device)

    # Mark KV cache and conv state buffers as static addresses so AOTI can
    # trace through in-place mutations. Must be after .to("cuda") because
    # marking a CPU buffer that later gets replaced is a no-op.
    for module in lfm2.text_model.modules():
        for name, buf in module.named_buffers(recurse=False):
            if name in ("k_cache", "v_cache", "conv_state"):
                torch._dynamo.mark_static_address(buf)

    logging.info("[1/3] Vision encoder")
    vision_ep = _export_image_encoder(lfm2, device=device)
    logging.info("[2/3] Text decoder")
    decoder_ep = _export_text_decoder(lfm2, dtype=dtype, device=device)
    logging.info("[3/3] Token embedding")
    token_ep = _export_token_embedding(lfm2, device=device)

    programs = {"vision_encoder": vision_ep, "token_embedding": token_ep, "text_decoder": decoder_ep}
    partitioners = {
        k: [CudaPartitioner([CudaBackend.generate_method_name_compile_spec(k)])]
        for k in programs
    }
    metadata = {
        "get_max_seq_len": lfm2.text_model_args.max_seq_len,
        "get_vocab_size": lfm2.text_model_args.vocab_size,
        "use_kv_cache": lfm2.text_model_args.use_kv_cache,
        # NOTE(review): EOS id is hard-coded; confirm against the tokenizer.
        "get_eos_ids": [7],
    }

    logging.info("Lowering to Edge IR + CUDA")
    et_prog = to_edge_transform_and_lower(
        programs,
        partitioner=partitioners,
        compile_config=EdgeCompileConfig(_check_ir_validity=False, _skip_dim_order=True),
        constant_methods=metadata,
    )

    logging.info("Finalizing ExecuTorch program")
    et_program = et_prog.to_executorch(
        ExecutorchBackendConfig(
            memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
            sym_shape_eval_pass={k: ConstraintBasedSymShapeEvalPass() for k in programs},
        )
    )

    output_path = Path(output)
    # Path.parent is Path(".") for a bare filename, so no fallback is needed.
    output_dir = output_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)
    logging.info("Saving %s", output_path)
    with open(output_path, "wb") as f:
        et_program.write_to_file(f)
    et_program.write_tensor_data_to_file(str(output_dir))
    logging.info("Done — methods: %s", et_program.methods)


def main() -> None:
    """Parse command-line arguments and run ``export_all``."""
    parser = ArgumentParser(description="Export LFM2.5-VL to ExecuTorch (CUDA)")
    parser.add_argument("--model_dir", default="LiquidAI/LFM2.5-VL-450M")
    parser.add_argument("--dtype", default="bf16", choices=list(_DTYPE_MAP))
    parser.add_argument("--max_seq_len", type=int, default=MAX_SEQ_LEN)
    parser.add_argument("--params", default=None)
    parser.add_argument("--output", default=None)
    args = parser.parse_args()

    dtype = _DTYPE_MAP[args.dtype]
    params_path = _resolve_params_path(args.model_dir, args.params)
    if params_path is None:
        # The model class then falls back to its own default params file,
        # which may not match this checkpoint.
        logging.warning(
            "No bundled config matches %r; pass --params if the default is wrong",
            args.model_dir,
        )
    output = args.output or f"lfm2_5_vl_{args.dtype}_cuda.pte"

    export_all(args.model_dir, output, dtype=dtype, max_seq_len=args.max_seq_len, params_path=params_path)


if __name__ == "__main__":
    main()
MAX_SEQ_LEN = 2048
IMAGE_SIZE = 512
PATCH_SIZE = 16
# Patch grid of a fixed-size image: 512 // 16 == 32 patches per side.
FIXED_H = FIXED_W = IMAGE_SIZE // PATCH_SIZE

_DEFAULT_PARAMS = Path(__file__).parent / "config" / "lfm2_5_vl_1_6b_config.json"


class Lfm2p5Vl(torch.nn.Module):
    """HF LFM2.5-VL vision stack paired with an ExecuTorch text transformer.

    The HuggingFace model is kept for the vision tower, projector and token
    embedding. The text decoder is rebuilt with ``construct_transformer`` and
    loaded with the converted HuggingFace weights.
    """

    def __init__(self, hf_model: AutoModelForImageTextToText, params: ModelArgs) -> None:
        super().__init__()
        self.model_ = hf_model
        self.text_model_args = params
        self.text_model = construct_transformer(params)

        if params.use_sdpa_with_kv_cache_op:
            self.text_model = replace_kv_cache_with_custom_kv_cache(self.text_model)
            self.text_model = replace_sdpa_with_custom_op(self.text_model)

        # NOTE(review): strict=False hides missing/unexpected keys; consider
        # inspecting the value returned by load_state_dict.
        self.text_model.load_state_dict(
            state_dict=self._translate_weights(), strict=False, assign=True
        )
        self._patch_positional_embeddings()

    def _patch_positional_embeddings(self) -> None:
        """Precompute position embeddings for the fixed FIXED_H x FIXED_W grid.

        The learned table is treated as a square grid, resized once with
        antialiased bilinear interpolation, stored as a non-persistent
        buffer, and ``resize_positional_embeddings`` is replaced by a
        function that returns that buffer whatever its arguments.

        Raises:
            ValueError: if the table length is not a perfect square. With a
                truncated float sqrt the reshape could still succeed and
                silently yield a wrong grid.
        """
        embeddings = self.model_.model.vision_tower.vision_model.embeddings
        orig = embeddings.position_embedding.weight.data
        num_positions = orig.shape[0]
        side = math.isqrt(num_positions)
        if side * side != num_positions:
            raise ValueError(
                f"Expected a square position-embedding table, got {num_positions} rows."
            )

        # (N, C) -> (1, C, side, side), the layout F.interpolate expects.
        grid = orig.reshape(side, side, -1).permute(2, 0, 1).unsqueeze(0)
        resized = F.interpolate(
            grid, size=(FIXED_H, FIXED_W), mode="bilinear", align_corners=False, antialias=True
        )
        # Back to (FIXED_H * FIXED_W, C).
        pe = resized.squeeze(0).permute(1, 2, 0).reshape(FIXED_H * FIXED_W, -1).contiguous()
        embeddings.register_buffer("_precomputed_pe", pe, persistent=False)
        embeddings.resize_positional_embeddings = lambda *_args, **_kw: embeddings._precomputed_pe

    def _translate_weights(self) -> dict[str, torch.Tensor]:
        """Return the HF text weights under the names the text model expects.

        The language-model and lm_head state dicts are re-prefixed with
        ``model.language_model.`` and passed to ``lfm2_5_vl_to_meta``.
        """
        raw: dict[str, torch.Tensor] = {}
        for k, v in self.model_.model.language_model.state_dict().items():
            raw[f"model.language_model.{k}"] = v
        for k, v in self.model_.lm_head.state_dict().items():
            raw[f"model.language_model.lm_head.{k}"] = v
        return lfm2_5_vl_to_meta(raw)

    def embed_tokens(self, tokens: torch.Tensor) -> torch.Tensor:
        """Look up token embeddings with the HF input-embedding module."""
        return self.model_.model.language_model.get_input_embeddings()(tokens)

    def image_embedding(self, nchw_pixels: torch.Tensor) -> torch.Tensor:
        """Embed one image: [1, 3, 512, 512] pixels in [0, 255] -> [1, N, D].

        Only batch size 1 is supported: the patch tensor, ``spatial_shapes``
        and the result are all reshaped with a leading dimension of 1.
        N is presumably 256 after the projector -- TODO confirm.
        """
        # Scale [0, 255] to [-1, 1].
        x = (nchw_pixels / 255.0 - 0.5) / 0.5

        # Cut into non-overlapping PATCH_SIZE x PATCH_SIZE patches, then
        # flatten each patch with channels last.
        x = x.unfold(2, PATCH_SIZE, PATCH_SIZE).unfold(3, PATCH_SIZE, PATCH_SIZE)
        x = x.permute(0, 2, 3, 4, 5, 1).reshape(1, FIXED_H * FIXED_W, PATCH_SIZE * PATCH_SIZE * 3)

        out = self.model_.model.vision_tower(
            pixel_values=x,
            pixel_attention_mask=None,
            spatial_shapes=torch.tensor([[FIXED_H, FIXED_W]], dtype=torch.int64, device=x.device),
            return_dict=True,
        )
        feats = out.last_hidden_state.reshape(-1, FIXED_H, FIXED_W, out.last_hidden_state.shape[-1])
        projected = self.model_.model.multi_modal_projector(feats)
        return projected.reshape(1, -1, projected.shape[-1])

    def forward(self, images: torch.Tensor) -> torch.Tensor:
        """Alias of ``image_embedding``."""
        return self.image_embedding(images)


class Lfm2p5VlModel(EagerModelBase):
    """Loads the HF checkpoint and params, and builds the eager ``Lfm2p5Vl``."""

    def __init__(
        self,
        *,
        use_sdpa_with_kv_cache_op: bool = True,
        use_kv_cache: bool = True,
        max_seq_len: int = MAX_SEQ_LEN,
        max_context_len: int = MAX_SEQ_LEN,
        model_dir: str = "LiquidAI/LFM2.5-VL-1.6B",
        params_path: str | None = None,
    ) -> None:
        """Read the params JSON and load the HF model, processor and tokenizer.

        Args:
            use_sdpa_with_kv_cache_op: Forwarded to ``ModelArgs``.
            use_kv_cache: Forwarded to ``ModelArgs``.
            max_seq_len: Forwarded to ``ModelArgs``.
            max_context_len: Forwarded to ``ModelArgs``.
            model_dir: Passed to the HF ``from_pretrained`` calls.
            params_path: Params JSON; the bundled 1.6B config when falsy.
        """
        self.use_sdpa_with_kv_cache_op = use_sdpa_with_kv_cache_op
        # Stored like the sibling options so callers can inspect it.
        self.use_kv_cache = use_kv_cache
        self.max_context_len = max_context_len
        self.max_seq_len = max_seq_len
        self.model_dir = model_dir

        resolved = Path(params_path) if params_path else _DEFAULT_PARAMS
        params = json.loads(resolved.read_text(encoding="utf-8"))

        self.text_model_args = ModelArgs(
            max_batch_size=1,
            max_seq_len=max_seq_len,
            max_context_len=max_context_len,
            use_kv_cache=use_kv_cache,
            use_sdpa_with_kv_cache_op=use_sdpa_with_kv_cache_op,
            enable_dynamic_shape=False,
            **params,
        )

        self.hf_model = AutoModelForImageTextToText.from_pretrained(
            model_dir, device_map="cpu", torch_dtype=torch.float32
        )
        self.processor = AutoProcessor.from_pretrained(model_dir)
        self.tokenizer = self.processor.tokenizer

    def get_eager_model(self) -> torch.nn.Module:
        """Build a new float32 ``Lfm2p5Vl`` around the loaded HF model."""
        return Lfm2p5Vl(self.hf_model, self.text_model_args).to(dtype=torch.float32)

    def get_example_inputs(self) -> tuple[torch.Tensor, ...]:
        """Return one float32 image tensor with integer values in [0, 255]."""
        return (torch.randint(0, 256, (1, 3, IMAGE_SIZE, IMAGE_SIZE), dtype=torch.float32),)

    def get_dynamic_shapes(self) -> None:
        """Return ``None``: the example input has no dynamic dimensions."""
        return None