From 4f2e41a5145c40662d73dffffd650a842652a3b5 Mon Sep 17 00:00:00 2001
From: potassiummmm <zhou.hansong@outlook.com>
Date: Wed, 12 Mar 2025 18:16:45 +0800
Subject: [PATCH 01/11] add support for bitnet2b_2501 model

---
 3rdparty/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
index 957b59d22..a8ac7072a 160000
--- a/3rdparty/llama.cpp
+++ b/3rdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit 957b59d2207370cd5061dd1bb12d079aa267fbab
+Subproject commit a8ac7072ae02ffd68b4b661db0ebd2689fb82b7f

From 09f91066d65c917a9b081eafce09a1a30f390537 Mon Sep 17 00:00:00 2001
From: potassiummmm <zhou.hansong@outlook.com>
Date: Wed, 12 Mar 2025 18:34:05 +0800
Subject: [PATCH 02/11] add conversion logic for new model

---
 utils/convert-ms-to-gguf-bitnet.py | 1852 ++++++++++++++++++++++++++++
 1 file changed, 1852 insertions(+)
 create mode 100644 utils/convert-ms-to-gguf-bitnet.py

diff --git a/utils/convert-ms-to-gguf-bitnet.py b/utils/convert-ms-to-gguf-bitnet.py
new file mode 100644
index 000000000..23a1a2c89
--- /dev/null
+++ b/utils/convert-ms-to-gguf-bitnet.py
@@ -0,0 +1,1852 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import logging
+import argparse
+import concurrent.futures
+import enum
+import faulthandler
+import functools
+import itertools
+import json
+import math
+import mmap
+import os
+import pickle
+import re
+import signal
+import struct
+import sys
+import textwrap
+import time
+import zipfile
+from abc import ABC, abstractmethod
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Tuple
+
+import configparser
+import numpy as np
+from sentencepiece import SentencePieceProcessor
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+import gguf
+
+if TYPE_CHECKING:
+    from typing_extensions import Self, TypeAlias
+
+logger = logging.getLogger("convert")
+
+if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):  # both are missing on some platforms
+    faulthandler.register(signal.SIGUSR1)  # dump Python tracebacks when the process receives SIGUSR1
+
+NDArray: TypeAlias = 'np.ndarray[Any, Any]'
+
+ARCH = gguf.MODEL_ARCH.BITNET_25
+
+DEFAULT_CONCURRENCY = 16
+
+ADDED_TOKENS_FILE = 'added_tokens.json'  # optional file read from the model directory by the vocab classes
+FAST_TOKENIZER_FILE = 'tokenizer.json'   # Hugging Face "fast" tokenizer definition
+
+#
+# data types
+#
+
+
+@dataclass(frozen=True)
+class DataType:
+    name: str                     # short type label, e.g. 'F16'
+    dtype: np.dtype[Any]          # numpy dtype holding one stored element
+    valid_conversions: list[str]  # names of the data types accepted as conversion targets
+
+    def elements_to_bytes(self, n_elements: int) -> int:
+        return n_elements * self.dtype.itemsize  # one dtype item per element
+
+
+@dataclass(frozen=True)
+class UnquantizedDataType(DataType):
+    """Data type stored as one plain numpy item per element (no block quantization)."""
+
+
+DT_F16  = UnquantizedDataType('F16',  dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
+DT_F32  = UnquantizedDataType('F32',  dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0', 'I2'])
+DT_I32  = UnquantizedDataType('I32',  dtype = np.dtype(np.int32),   valid_conversions = [])  # was int16: 2-byte items halved I32 byte counts
+DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16),  valid_conversions = ['F32', 'F16', 'Q8_0'])  # raw bf16 bits as uint16
+DT_I2   = UnquantizedDataType('I2',   dtype = np.dtype(np.uint8),   valid_conversions = ['F32', 'F16', 'Q8_0'])
+
+@dataclass(frozen=True)
+class QuantizedDataType(DataType):
+    block_size: int                       # number of elements packed into one quantized block
+    quantized_dtype: np.dtype[Any]        # numpy dtype describing one whole block
+    ggml_type: gguf.GGMLQuantizationType
+
+    def quantize(self, arr: NDArray) -> NDArray:  # subclasses override this; the base class has no scheme
+        raise NotImplementedError(f'Quantization for {self.name} not implemented')
+
+    def elements_to_bytes(self, n_elements: int) -> int:  # size is counted in whole blocks, not elements
+        assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
+        return self.quantized_dtype.itemsize * (n_elements // self.block_size)
+
+
+@dataclass(frozen=True)
+class Q8_0QuantizedDataType(QuantizedDataType):
+    # Mini Q8_0 quantization in Python!
+    def quantize(self, arr: NDArray) -> NDArray:
+        assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}'
+        assert arr.dtype == np.float32, f'Bad array type {arr.dtype}'
+        n_blocks = arr.size // self.block_size
+        blocks = arr.reshape((n_blocks, self.block_size))  # one row per quantization block
+        # Much faster implementation of block quantization contributed by @Cebtenzzre
+
+        def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
+            d = abs(blocks).max(axis = 1) / np.float32(127)  # per-block scale: largest magnitude maps to 127
+            with np.errstate(divide = 'ignore'):  # all-zero blocks have d == 0
+                qs = (blocks / d[:, None]).round()
+            qs[d == 0] = 0  # overwrite the results of dividing by a zero scale
+            yield from zip(d, qs)  # one (scale, values) pair per block, matching quantized_dtype's fields
+        return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
+
+# @dataclass(frozen=True)
+# class TransformedDataType(DataType):
+#     transformed_dtype: np.dtype[Any]
+
+#     def transform(self, arr: NDArray) -> NDArray:
+#         raise NotImplementedError(f'Transformation for {self.name} not implemented')
+
+# @dataclass(frozen=True)
+# class I2TransformedDataType(TransformedDataType):
+#     # fp32 -> int2 (dtype is uint8)
+#     def transform(self, arr: NDArray) -> NDArray:
+#         assert(np.prod(arr.shape) % 4 == 0)
+#         # Much faster implementation of block quantization contributed by @Cebtenzzre
+
+#         def transform_to_i2(x : NDArray) -> Iterable[tuple[Any, Any]]:
+#             x_num = np.prod(x.shape)
+#             x = np.reshape(x, x_num)
+#             for i in range(x_num):
+#                 if x[i] != 0:
+#                     d = x[i]
+#                     break
+#             x = np.divide(x, d)
+#             x = x.astype(np.uint8)
+#             x = np.reshape(x, [x.shape[0] // 4, 4])
+#             keep_bit = {0:192, 1:48, 2:12, 3:3}
+#             ans = np.zeros([x_num // 4], dtype=np.uint8)
+#             for i in range(4):
+#                 x_bit_col = x[:, i]
+#                 x_bit_shift = np.left_shift(x_bit_col, 6 - i * 2)
+#                 x_bit_shift = np.bitwise_and(x_bit_shift, keep_bit[i])
+#                 ans = np.bitwise_or(ans, x_bit_shift)
+#             return ans
+#         return transform_to_i2(arr)
+
+#     def elements_to_bytes(self, n_elements: int) -> int:
+#         return n_elements // 4
+
+
+DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
+                                dtype = np.dtype(np.float32), valid_conversions = [],  # quantize() requires float32 input
+                                ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
+                                quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))  # per block: f16 scale + 32 int8 values
+
+# DT_I2 = I2TransformedDataType('I2',
+#                               dtype = np.dtype(np.float32), valid_conversions = [],
+#                               transformed_dtype = np.uint8
+#                               )
+
+# Quantized types skipped here because they may also map to np.float32
+NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
+for dt in (DT_BF16, DT_F16, DT_F32, DT_I32, DT_I2):
+    if dt.dtype in NUMPY_TYPE_TO_DATA_TYPE:  # each numpy dtype must identify exactly one DataType
+        raise ValueError(f'Invalid duplicate data type {dt}')
+    NUMPY_TYPE_TO_DATA_TYPE[dt.dtype] = dt
+
+SAFETENSORS_DATA_TYPES: dict[str, DataType] = {  # keyed by dtype name strings; note there is no I2 entry
+    'BF16': DT_BF16,
+    'F16': DT_F16,
+    'F32': DT_F32,
+    'I32': DT_I32,
+}
+
+# TODO: match this with `llama_ftype`
+# TODO: rename to LLAMAFileType
+# TODO: move to `gguf.py`
+
+
+class GGMLFileType(enum.IntEnum):
+    AllF32     = 0
+    MostlyF16  = 1  # except 1d tensors
+    MostlyI2   = 2  # except 1d tensors
+    MostlyQ8_0 = 7  # except 1d tensors
+
+    def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType:
+        dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
+        if dt is None:
+            raise ValueError(self)
+        # Convert all 1D tensors to F32.  Most of the codebase that takes in 1D tensors only handles F32 tensors, and most of the output tensors are F32.
+        # Also, the 1D tensors aren't much of a performance/size issue.  So instead of having separate F32 and F16 implementations of both, just convert everything to F32 for now.
+        dt = dt if len(tensor.shape) > 1 else DT_F32
+        if name == "token_embd.weight" or name == "output.weight":  # these two tensors are always F32, whatever the file type
+            dt = DT_F32
+        return dt
+
+
+GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {  # default data type for each file type
+    GGMLFileType.AllF32    : DT_F32,
+    GGMLFileType.MostlyF16 : DT_F16,
+    GGMLFileType.MostlyI2  : DT_I2,
+    GGMLFileType.MostlyQ8_0: DT_Q8_0,
+}
+
+#
+# hparams loading
+#
+
+
+@dataclass
+class Params:
+    n_vocab:        int
+    n_embd:         int
+    n_layer:        int
+    n_ctx:          int  # -1 when the params were guessed from tensor shapes
+    n_ff:           int
+    n_head:         int
+    n_head_kv:      int  # equals n_head when the config gives no separate KV-head count
+    n_experts:      int | None = None
+    n_experts_used: int | None = None
+    f_norm_eps:     float | None = None
+
+    rope_scaling_type: gguf.RopeScalingType | None = None
+    f_rope_freq_base: float | None = None
+    f_rope_scale: float | None = None
+    n_orig_ctx: int | None = None
+    rope_finetuned: bool | None = None
+
+    ftype: GGMLFileType | None = None
+
+    # path to the directory containing the model files
+    path_model: Path | None = None
+
+    @staticmethod
+    def guessed(model: LazyModel) -> Params:
+        # try transformer naming first
+        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
+
+        # try transformer naming first; n_layer is the first layer index that has no matching tensor
+        if "model.layers.0.self_attn.q_proj.weight" in model:
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        elif "model.layers.0.self_attn.W_pack.weight" in model:   # next: try baichuan naming
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
+        else:
+            n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+
+        if n_layer < 1:
+            msg = """\
+                failed to guess 'n_layer'. This model is unknown or unsupported.
+                Suggestion: provide 'config.json' of the model in the same directory containing model files."""
+            raise KeyError(textwrap.dedent(msg))
+
+        n_head = n_embd // 128 # guessed
+        n_mult = 256           # guessed
+
+        # TODO: verify this
+        n_ff = int(2 * (4 * n_embd) / 3)
+        n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)  # round up to a multiple of n_mult
+
+        return Params(
+            n_vocab    = n_vocab,
+            n_embd     = n_embd,
+            n_layer    = n_layer,
+            n_ctx      = -1,
+            n_ff       = n_ff,
+            n_head     = n_head,
+            n_head_kv  = n_head,
+            f_norm_eps = 1e-5,
+        )
+
+    @staticmethod
+    def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
+        with open(config_path, encoding="utf-8") as f:  # explicit encoding, like the other JSON reads in this file
+            config = json.load(f)
+
+        rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
+        rope_scaling = config.get("rope_scaling")
+
+        if rope_scaling is not None and (typ := rope_scaling.get("type")):
+            rope_factor = rope_scaling.get("factor")
+            f_rope_scale = rope_factor
+            if typ == "linear":
+                rope_scaling_type = gguf.RopeScalingType.LINEAR
+            elif typ == "yarn":
+                rope_scaling_type = gguf.RopeScalingType.YARN
+                n_orig_ctx = rope_scaling['original_max_position_embeddings']
+                rope_finetuned = rope_scaling['finetuned']
+            else:
+                raise NotImplementedError(f'Unknown rope scaling type: {typ}')
+
+        if "max_sequence_length" in config:
+            n_ctx = config["max_sequence_length"]
+        elif "max_position_embeddings" in config:
+            n_ctx = config["max_position_embeddings"]
+        else:
+            msg = """\
+                failed to guess 'n_ctx'. This model is unknown or unsupported.
+                Suggestion: provide 'config.json' of the model in the same directory containing model files."""
+            raise KeyError(textwrap.dedent(msg))
+
+        n_experts      = None
+        n_experts_used = None
+
+        if "num_local_experts" in config:
+            n_experts = config["num_local_experts"]
+            n_experts_used = config["num_experts_per_tok"]
+
+        return Params(
+            n_vocab           = config["vocab_size"],
+            n_embd            = config["hidden_size"],
+            n_layer           = config["num_hidden_layers"],
+            n_ctx             = n_ctx,
+            n_ff              = config["intermediate_size"],
+            n_head            = (n_head := config["num_attention_heads"]),
+            n_head_kv         = config.get("num_key_value_heads", n_head),
+            n_experts         = n_experts,
+            n_experts_used    = n_experts_used,
+            f_norm_eps        = config["rms_norm_eps"],
+            f_rope_freq_base  = config.get("rope_theta"),
+            rope_scaling_type = rope_scaling_type,
+            f_rope_scale      = f_rope_scale,
+            n_orig_ctx        = n_orig_ctx,
+            rope_finetuned    = rope_finetuned,
+        )
+
+    # LLaMA v2 70B params.json
+    # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
+    @staticmethod
+    def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
+        with open(config_path, encoding="utf-8") as f:  # explicit encoding, like the other JSON reads in this file
+            config = json.load(f)
+
+        n_experts = n_experts_used = None
+        f_rope_freq_base = None
+
+        # hack to determine LLaMA v1 vs v2 vs CodeLlama
+        if config.get("moe"):
+            # Mixtral
+            n_ctx = 32768
+        elif config.get("rope_theta") == 1000000:
+            # CodeLlama
+            n_ctx = 16384
+        elif config["norm_eps"] == 1e-05:
+            # LLaMA v2
+            n_ctx = 4096
+        else:
+            # LLaMA v1
+            n_ctx = 2048
+
+        if config.get("moe"):  # the expert tensor takes precedence over the dense one
+            n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
+            n_experts      = config["moe"]["num_experts"]
+            n_experts_used = config["moe"]["num_experts_per_tok"]
+            f_rope_freq_base = 1e6
+        elif "layers.0.feed_forward.w1.weight" in model:
+            n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
+        else:  # previously n_ff stayed unbound here and the return below failed with UnboundLocalError
+            raise KeyError("failed to determine 'n_ff': tensor 'layers.0.feed_forward.w1.weight' not found in model")
+
+        return Params(
+            n_vocab          = model["tok_embeddings.weight"].shape[0],
+            n_embd           = config["dim"],
+            n_layer          = config["n_layers"],
+            n_ctx            = n_ctx,
+            n_ff             = n_ff,
+            n_head           = (n_head := config["n_heads"]),
+            n_head_kv        = config.get("n_kv_heads", n_head),
+            n_experts        = n_experts,
+            n_experts_used   = n_experts_used,
+            f_norm_eps       = config["norm_eps"],
+            f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
+        )
+
+    @staticmethod
+    def load(model_plus: ModelPlus) -> Params:
+        hf_config_path   = model_plus.paths[0].parent / "config.json"
+        orig_config_path = model_plus.paths[0].parent / "params.json"
+
+        if hf_config_path.exists():  # precedence: config.json, then params.json, then guessing from tensors
+            params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
+        elif orig_config_path.exists():
+            params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
+        elif model_plus.format != 'none':
+            params = Params.guessed(model_plus.model)
+        else:
+            raise ValueError('Cannot guess params when model format is none')
+
+        params.path_model = model_plus.paths[0].parent
+
+        return params
+
+
+#
+# vocab
+#
+
+@runtime_checkable
+class BaseVocab(Protocol):
+    tokenizer_model: ClassVar[str]  # e.g. "gpt2", "llama" or "no_vocab" in the classes below
+    name: ClassVar[str]             # short vocab-type label, e.g. "bpe", "spm", "hfft"
+
+
+class NoVocab(BaseVocab):
+    tokenizer_model = "no_vocab"  # placeholder values: this class carries no token data
+    name = "no_vocab"
+
+    def __repr__(self) -> str:
+        return "<NoVocab for a model without integrated vocabulary>"
+
+
+@runtime_checkable
+class Vocab(BaseVocab, Protocol):
+    vocab_size: int                    # base tokens plus added tokens in the implementations below
+    added_tokens_dict: dict[str, int]  # added token text -> token id
+    added_tokens_list: list[str]       # added token texts ordered by id
+    fname_tokenizer: Path              # tokenizer file the vocab was loaded from
+
+    def __init__(self, base_path: Path): ...
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...  # yields (text bytes, score, token type)
+
+
+class BpeVocab(Vocab):
+    tokenizer_model = "gpt2"
+    name = "bpe"
+
+    def __init__(self, base_path: Path):
+        added_tokens: dict[str, int] = {}
+
+        if (fname_tokenizer := base_path / 'vocab.json').exists():
+            # "slow" tokenizer
+            with open(fname_tokenizer, encoding="utf-8") as f:
+                self.vocab = json.load(f)
+
+            try:
+                # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
+                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
+                    added_tokens = json.load(f)
+            except FileNotFoundError:  # added_tokens.json is optional
+                pass
+        else:
+            # "fast" tokenizer
+            fname_tokenizer = base_path / FAST_TOKENIZER_FILE
+
+            # if this fails, FileNotFoundError propagates to caller
+            with open(fname_tokenizer, encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+
+            tokenizer_model: dict[str, Any] = tokenizer_json['model']
+            if (
+                tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
+                or tokenizer_json['decoder']['type'] != 'ByteLevel'
+            ):
+                raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')  # same exception type as a missing file
+
+            self.vocab = tokenizer_model["vocab"]
+
+            if (added := tokenizer_json.get('added_tokens')) is not None:
+                # Added tokens here can be duplicates of the main vocabulary.
+                added_tokens = {item['content']: item['id']
+                                for item in added
+                                if item['content'] not in self.vocab}
+
+        vocab_size   = len(self.vocab)
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))  # added ids must directly follow the base vocab
+        actual_ids   = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            expected_end_id = vocab_size + len(actual_ids) - 1
+            raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
+                             f"{vocab_size} - {expected_end_id}; got {actual_ids}")
+
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])  # order added tokens by id
+        self.added_tokens_dict    = added_tokens
+        self.added_tokens_list    = [text for (text, idx) in items]
+        self.vocab_size_base      = vocab_size
+        self.vocab_size           = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer      = fname_tokenizer
+
+    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}  # token id -> token text
+
+        for i, _ in enumerate(self.vocab):  # walk ids 0..len-1; a gap in the ids raises KeyError
+            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL  # every added token is emitted as CONTROL
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.bpe_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class SentencePieceVocab(Vocab):
+    tokenizer_model = "llama"
+    name = "spm"
+
+    def __init__(self, base_path: Path):
+        added_tokens: dict[str, int] = {}
+        if (fname_tokenizer := base_path / 'tokenizer.model').exists():
+            # normal location
+            try:
+                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
+                    added_tokens = json.load(f)
+            except FileNotFoundError:  # added_tokens.json is optional
+                pass
+        elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():  # fallback: parent dir; added_tokens.json is not read then
+            # not found in alternate location either
+            raise FileNotFoundError('Cannot find tokenizer.model')
+
+        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        vocab_size = self.sentencepiece_tokenizer.vocab_size()
+
+        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}  # entries with ids below the base size are dropped
+        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
+        actual_new_ids   = sorted(new_tokens.keys())
+
+        if expected_new_ids != actual_new_ids:
+            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
+
+        # Token pieces that were added to the base vocabulary.
+        self.added_tokens_dict  = added_tokens  # the unfiltered mapping as read from added_tokens.json
+        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
+        self.vocab_size_base    = vocab_size
+        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer    = fname_tokenizer
+
+    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        tokenizer = self.sentencepiece_tokenizer
+        for i in range(tokenizer.vocab_size()):
+            piece = tokenizer.id_to_piece(i)
+            text         = piece.encode("utf-8")
+            score: float = tokenizer.get_score(i)
+
+            toktype = gguf.TokenType.NORMAL  # default; each check below overrides the previous result
+            if tokenizer.is_unknown(i):
+                toktype = gguf.TokenType.UNKNOWN
+            if tokenizer.is_control(i):
+                toktype = gguf.TokenType.CONTROL
+
+            # NOTE: I think added_tokens are user defined.
+            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
+            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
+
+            if tokenizer.is_unused(i):
+                toktype = gguf.TokenType.UNUSED
+            if tokenizer.is_byte(i):
+                toktype = gguf.TokenType.BYTE
+
+            yield text, score, toktype
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED  # every added token is emitted as USER_DEFINED
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.sentencepiece_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class LlamaHfVocab(Vocab):
+    tokenizer_model = "llama"
+    name = "hfft"
+
+    def __init__(self, base_path: Path):
+        fname_tokenizer = base_path / FAST_TOKENIZER_FILE
+        # if this fails, FileNotFoundError propagates to caller
+        with open(fname_tokenizer, encoding='utf-8') as f:
+            tokenizer_json = json.load(f)
+
+        # pre-check so we know if we need transformers
+        tokenizer_model: dict[str, Any] = tokenizer_json['model']
+        is_llama3 = (
+            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
+            and not tokenizer_model.get('byte_fallback', True)
+        )
+        if is_llama3:
+            raise TypeError('Llama 3 must be converted with BpeVocab')
+
+        if (  # is_llama3 is always False here (raised above), so it no longer needs re-testing
+            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
+            or tokenizer_json['decoder']['type'] != 'Sequence'
+        ):
+            raise FileNotFoundError('Cannot find Llama BPE tokenizer')
+
+        try:
+            from transformers import AutoTokenizer
+        except ImportError as e:
+            raise ImportError(
+                "To use LlamaHfVocab, please install the `transformers` package. "
+                "You can install it with `pip install transformers`."
+            ) from e
+
+        # Allow the tokenizer to default to slow or fast versions.
+        # Explicitly set tokenizer to use local paths.
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            base_path,
+            cache_dir=base_path,
+            local_files_only=True,
+        )
+        assert self.tokenizer.is_fast  # assume tokenizer.json is used
+
+        # Initialize lists and dictionaries for added tokens
+        self.added_tokens_list = []
+        self.added_tokens_dict = dict()
+        self.added_tokens_ids  = set()
+
+        # Process added tokens
+        for tok, tokidx in sorted(
+            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
+        ):
+            # Only consider added tokens that are not in the base vocabulary
+            if tokidx >= self.tokenizer.vocab_size:
+                self.added_tokens_list.append(tok)
+                self.added_tokens_dict[tok] = tokidx
+                self.added_tokens_ids.add(tokidx)
+
+        # Store special tokens and their IDs
+        vocab = self.tokenizer.get_vocab()  # fetched once instead of once per special token
+        self.specials = {
+            tok: vocab[tok] for tok in self.tokenizer.all_special_tokens
+        }
+        self.special_ids = set(self.tokenizer.all_special_ids)
+
+        # Set vocabulary sizes
+        self.vocab_size_base = self.tokenizer.vocab_size
+        self.vocab_size      = self.vocab_size_base + len(self.added_tokens_list)
+
+        self.fname_tokenizer = fname_tokenizer
+
+    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        reverse_vocab = {
+            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
+        }
+
+        for token_id in range(self.vocab_size_base):
+            # Skip processing added tokens here
+            if token_id in self.added_tokens_ids:
+                continue
+
+            # Convert token text to bytes
+            token_text = reverse_vocab[token_id].encode("utf-8")
+
+            # Yield token text, score, and type
+            yield token_text, self.get_token_score(token_id), self.get_token_type(
+                token_id, token_text, self.special_ids  # Reuse already stored special IDs
+            )
+
+    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
+        # Special case for byte tokens
+        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+            return gguf.TokenType.BYTE
+
+        # Determine token type based on whether it's a special token
+        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
+
+    def get_token_score(self, token_id: int) -> float:
+        # Placeholder for actual logic to determine the token's score
+        # This needs to be implemented based on specific requirements
+        return -1000.0  # Default score
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            if text in self.specials:
+                toktype = self.get_token_type(self.specials[text], b'', self.special_ids)  # b'' never matches the byte-token pattern
+                score = self.get_token_score(self.specials[text])
+            else:
+                toktype = gguf.TokenType.USER_DEFINED
+                score = -1000.0
+
+            yield text.encode("utf-8"), score, toktype
+
+    def has_newline_token(self):
+        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.hf_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+#
+# data loading
+# TODO: reuse (probably move to gguf.py?)
+#
+
+
+def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
+    # Within each head, split the rows into two halves and interleave them; a differing n_head_kv replaces n_head.
+    heads = n_head if n_head_kv is None or n_head == n_head_kv else n_head_kv
+    rows_per_half = weights.shape[0] // heads // 2
+    split = weights.reshape(heads, 2, rows_per_half, *weights.shape[1:])
+    return np.swapaxes(split, 1, 2).reshape(weights.shape)
+
+
+class Tensor(ABC):
+    ndarray: NDArray      # the tensor's data
+    data_type: DataType   # DataType describing ndarray
+
+    @abstractmethod
+    def astype(self, data_type: DataType) -> Self: ...  # convert to another DataType
+    @abstractmethod
+    def permute(self, n_head: int, n_head_kv: int) -> Self: ...  # see the module-level permute() helper
+    @abstractmethod
+    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> Self: ...  # permute one part of a packed tensor
+    @abstractmethod
+    def part(self, n_part: int) -> Self: ...  # extract one part of a packed tensor
+    @abstractmethod
+    def to_ggml(self) -> GGMLCompatibleTensor: ...
+
+
+def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
+    assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
+    # Widen each 16-bit pattern to 32 bits, move it into the high half, then reinterpret the bits as float32.
+    return np.left_shift(bf16_arr.astype(np.uint32), 16).view(np.float32)
+
+def preprocess_weights(w: np.ndarray, bits = 2, g = 4) -> np.ndarray:
+    """Re-pack the 2-D matrix ``w`` into the tiled layout configured in ./build/kcfg.ini.
+
+    The tiling parameters (bm, kfactor, simd_n_in, simd_n_out) are read from the section whose
+    name, split on '_', has "m{2*M}" as its 4th-from-last field and "k{K}" as its 3rd-from-last.
+    Raises ValueError when no section matches, which includes a missing config file.
+    """
+    M, K = w.shape
+
+    cf = configparser.ConfigParser()
+    cf.read("./build/kcfg.ini")  # read() skips missing files silently, leaving no sections
+    for sec in cf.sections():
+        sec_splits = str(sec).split('_')
+        if len(sec_splits) >= 4 and sec_splits[-4] == "m" + str(M*2) and sec_splits[-3] == "k" + str(K):
+            bm = int(cf.get(sec, 'bm'))
+            kfactor = int(cf.get(sec, 'kfactor'))
+            simd_n_in = int(cf.get(sec, 'simd_n_in'))
+            simd_n_out = int(cf.get(sec, 'simd_n_out'))
+            break
+    else:  # without this the code below failed with NameError on the unbound tiling parameters
+        raise ValueError(f"no section for m{M*2}, k{K} found in ./build/kcfg.ini")
+
+    M = M * bits
+    ngroups_per_elem = 8 // g
+
+    # (M // bits, K, bits)
+    w = np.stack([(w >> ib) & 1 for ib in range(bits)], axis=-1)
+    # (M // bits, K, bits) -> (M // bits, bits, K) -> (M // bits, bits, K) -> (M // bits, bits, K // g, g)
+    w = w.transpose(0, 2, 1).reshape(M // bits, bits, K // g, g)
+    w = sum([(w[:, :, :, ig] << ig) for ig in range(g)])
+    # 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
+    # for bits=3
+    # bit0: [0, 8), bit1: [8, 16), bit2: [16, 24), bit0: [24, 32)
+    # (M // bits // simd_n_float16, bits, simd_n_float16, K // g)
+    w = w.reshape(M // bits // simd_n_out, simd_n_out, bits, K // g).transpose(0, 2, 1, 3)
+    mgroup = ngroups_per_elem * simd_n_in
+    w = w.reshape(M // mgroup, ngroups_per_elem, simd_n_in, K // g).transpose(0, 2, 1, 3)
+    #             0        1             2             3                 4                  5
+    w = w.reshape(M // bm, bm // mgroup, simd_n_in, ngroups_per_elem, K // g // kfactor, kfactor).transpose(0, 4, 1, 5, 2, 3)
+    w = sum([(w[:, :, :, :, :, ng] << (ng * g)) for ng in range(ngroups_per_elem)])
+    w = w.reshape(M // bm, K // g // kfactor, bm // mgroup, kfactor, simd_n_in)
+    # input size of current TVM API
+    w = w.reshape(M // bm, K // g, bm // ngroups_per_elem)
+    return w
+
+def transform_to_i2(x : NDArray):
+    """Encode ``x`` as uint8 codes ``int8(x / scale) + 2`` and return ``(codes, scale)``.
+
+    ``scale`` is the first non-zero element of ``x`` in flattened order, or 1 if there is none,
+    so elements equal to -scale, 0 and +scale map to the codes 1, 2 and 3.
+    """
+    flat = np.reshape(x, -1)
+    # argmax over a boolean mask gives the first True index; this replaces a per-element Python scan.
+    first = int(np.argmax(flat != 0)) if flat.size else 0
+    scale = flat[first] if flat.size and flat[first] != 0 else 1
+    codes = (np.divide(flat, scale).astype(np.int8) + 2).astype(np.uint8)
+    return np.reshape(codes, x.shape), scale
+
+class UnquantizedTensor(Tensor):
+    def __init__(self, ndarray: NDArray, i2_scale: NDArray | None = None):
+        assert isinstance(ndarray, np.ndarray)
+        self.ndarray = ndarray
+        self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]  # KeyError for numpy dtypes with no registered DataType
+        self.i2_scale = i2_scale  # scale from transform_to_i2 when produced by astype() to a uint8 type, else as passed in
+
+    def astype(self, data_type: DataType) -> UnquantizedTensor:
+        ndarray, i2_scale = self.ndarray, self.i2_scale  # locals only: rebinding self.ndarray left self.data_type stale
+        if self.data_type == DT_BF16:
+            ndarray = bf16_to_fp32(ndarray)  # bf16 is held as raw uint16 bits and must be widened before any cast
+        if data_type.dtype == np.uint8:
+            ndarray, i2_scale = transform_to_i2(ndarray)
+        return UnquantizedTensor(ndarray.astype(data_type.dtype), i2_scale)
+
+    def to_ggml(self) -> Self:
+        return self
+
+    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
+        r = self.ndarray.shape[0] // 3  # rows are treated as 3 equal parts (presumably packed q/k/v; TODO confirm)
+        return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
+
+    def part(self, n_part: int) -> UnquantizedTensor:
+        r = self.ndarray.shape[0] // 3  # same 3-way row split as permute_part
+        return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
+
+    def permute(self, n_head: int, n_head_kv: int) -> UnquantizedTensor:
+        return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv))  # note: i2_scale is not carried over
+
+
+def load_unquantized(lazy_tensor: LazyTensor, expected_dtype: Any = None, convert: bool = False) -> NDArray:
+    tensor = lazy_tensor.load()
+    assert isinstance(tensor, UnquantizedTensor)
+
+    # double-check:
+    actual_shape = list(tensor.ndarray.shape)
+    assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape)
+    if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype:
+        if convert:
+            tensor.ndarray = tensor.ndarray.astype(expected_dtype)
+        else:
+            raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}')
+
+    return tensor.ndarray
+
+
+GGMLCompatibleTensor = UnquantizedTensor
+
+
+@dataclass
+class LazyTensor:
+    _load: Callable[[], Tensor]
+    shape: list[int]
+    data_type: DataType
+    description: str
+
+    def load(self) -> Tensor:
+        ret = self._load()
+        # Should be okay if it maps to the same numpy type?
+        assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
+            (self.data_type, ret.data_type, self.description)
+        return ret
+
+    def astype(self, data_type: DataType) -> LazyTensor:
+        self.validate_conversion_to(data_type)
+
+        def load() -> Tensor:
+            return self.load().astype(data_type)
+        return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
+
+    def validate_conversion_to(self, data_type: DataType) -> None:
+        if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions:
+            raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.')
+
+
+LazyModel: TypeAlias = 'dict[str, LazyTensor]'
+
+
+@dataclass
+class ModelPlus:
+    model: LazyModel
+    paths: list[Path]  # Where this was read from.
+    format: Literal['ggml', 'torch', 'safetensors', 'none']
+    vocab: BaseVocab | None  # For GGML models (which have vocab built in), the vocab.
+
+
+def merge_sharded(models: list[LazyModel]) -> LazyModel:
+    # Original LLaMA models have each file contain one part of each tensor.
+    # Use a dict instead of a set to preserve order.
+    names = {name: None for model in models for name in model}
+
+    def convert(name: str) -> LazyTensor:
+        lazy_tensors = [model[name] for model in models]
+        if len(lazy_tensors) == 1:
+            # only one file; don't go through this procedure since there might
+            # be quantized tensors
+            return lazy_tensors[0]
+        if len(lazy_tensors[0].shape) == 1:
+            # the tensor is just duplicated in every file
+            return lazy_tensors[0]
+        if name.startswith('tok_embeddings.') or \
+           name.endswith('.attention.wo.weight') or \
+           name.endswith('.feed_forward.w2.weight'):
+            # split by columns
+            axis = 1
+        else:
+            # split by rows
+            axis = 0
+        concatenated_shape = list(lazy_tensors[0].shape)
+        concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors)
+
+        def load() -> UnquantizedTensor:
+            ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
+            concatenated = np.concatenate(ndarrays, axis=axis)
+            return UnquantizedTensor(concatenated)
+        description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
+        return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
+    return {name: convert(name) for name in names}
+
+
+def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
+    formats = set(mp.format for mp in models_plus)
+    assert len(formats) == 1, "different formats?"
+    format = formats.pop()
+    paths = [path for mp in models_plus for path in mp.paths]
+    # Use the first non-None vocab, if any.
+    try:
+        vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None)
+    except StopIteration:
+        vocab = None
+
+    if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
+        # Transformers models put different tensors in different files, but
+        # don't split individual tensors between files.
+        model: LazyModel = {}
+        for mp in models_plus:
+            model.update(mp.model)
+    else:
+        model = merge_sharded([mp.model for mp in models_plus])
+
+    return ModelPlus(model, paths, format, vocab)  # pytype: disable=wrong-arg-types
+
+
+def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
+    def load() -> Tensor:
+        return lazy_tensor.load().permute(n_head, n_head_kv)
+    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
+
+
+def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
+    def load() -> Tensor:
+        return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
+    s = lazy_tensor.shape.copy()
+    s[0] = s[0] // 3
+    return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
+
+def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
+    def load() -> Tensor:
+        return lazy_tensor.load().part(n_part)
+    s = lazy_tensor.shape.copy()
+    s[0] = s[0] // 3
+    return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
+
+import torch
+
+@torch.compile
+def forward_t(x):
+    dtype = x.dtype
+    x = x.float()
+    s = 1.0 / x.abs().mean().clamp_(min=1e-5)
+    x = (x * s).round().clamp(-1, 1) / s
+    return x.to(dtype)
+
+def weight_quant(weight):
+    weight = torch.tensor(weight, dtype=torch.float32)
+    weight = forward_t(weight)
+    weight = weight.numpy().astype(np.float32)
+    return weight
+
+def part_lazy_q(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:  # n_part is accepted but unused
+    def load() -> Tensor:
+        tensor = lazy_tensor.load().ndarray
+        return UnquantizedTensor(np.array(tensor[:2560]))  # copy of rows [0, 2560)
+    s = lazy_tensor.shape.copy()
+    s[0] = 2560  # NOTE(review): hard-coded row count, presumably the Q width of one specific model -- confirm
+    return LazyTensor(load, s, lazy_tensor.data_type, 'partq ' + lazy_tensor.description)
+
+def part_lazy_k(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:  # n_part is accepted but unused
+    def load() -> Tensor:
+        tensor = lazy_tensor.load().ndarray
+        return UnquantizedTensor(np.array(tensor[2560:3200]))  # copy of rows [2560, 3200)
+    s = lazy_tensor.shape.copy()
+    s[0] = 640  # NOTE(review): hard-coded (3200 - 2560), presumably the K width of one specific model -- confirm
+    return LazyTensor(load, s, lazy_tensor.data_type, 'partk ' + lazy_tensor.description)
+
+def part_lazy_v(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:  # n_part is accepted but unused
+    def load() -> Tensor:
+        tensor = lazy_tensor.load().ndarray
+        temp = np.array(tensor[3200:])  # copy of every row from 3200 onwards
+        return UnquantizedTensor(temp)
+    s = lazy_tensor.shape.copy()
+    s[0] = 640  # NOTE(review): only matches the slice above if the source has 3840 rows -- confirm
+    return LazyTensor(load, s, lazy_tensor.data_type, 'partv ' + lazy_tensor.description)
+
+
+def part_lazy_w1(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
+    def load() -> Tensor:
+        tensor = lazy_tensor.load().ndarray
+        st = tensor.shape[0] // 2
+        return UnquantizedTensor(np.array(tensor[:st]))
+    s = lazy_tensor.shape.copy()
+    s[0] = s[0] // 2
+    return LazyTensor(load, s, lazy_tensor.data_type, 'part0 ' + lazy_tensor.description)
+
+def part_lazy_w3(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
+    def load() -> Tensor:
+        tensor = lazy_tensor.load().ndarray
+        st = tensor.shape[0] // 2
+        return UnquantizedTensor(np.array(tensor[st:]))
+    s = lazy_tensor.shape.copy()
+    s[0] = s[0] // 2
+    return LazyTensor(load, s, lazy_tensor.data_type, 'part1 ' + lazy_tensor.description)
+
+def part_lazy_rope(lazy_tensor: LazyTensor) -> LazyTensor:
+    def load() -> Tensor:
+        tensor = lazy_tensor.load().ndarray
+        return UnquantizedTensor(np.array(tensor))
+    s = lazy_tensor.shape.copy()
+    return LazyTensor(load, s, lazy_tensor.data_type, 'part_rope ' + lazy_tensor.description)
+
+def part_lazy_weight_quant(lazy_tensor: LazyTensor, name) -> LazyTensor:
+    print(name)
+    def load() -> Tensor:
+        tensor = lazy_tensor.load().ndarray
+        tensor = np.array(weight_quant(tensor))
+        return UnquantizedTensor(tensor)
+    s = lazy_tensor.shape.copy()
+    return LazyTensor(load, s, lazy_tensor.data_type, 'partlazy ' + lazy_tensor.description)
+
+def pack_experts_lazy(lazy_tensors: list[LazyTensor]) -> LazyTensor:
+    def load() -> Tensor:
+        tensors = [lazy_tensor.load() for lazy_tensor in lazy_tensors]
+        return UnquantizedTensor(np.array([tensor.ndarray for tensor in tensors]))
+    s = lazy_tensors[0].shape.copy()
+    s.insert(0, len(lazy_tensors))
+    return LazyTensor(load, s, lazy_tensors[0].data_type, 'pack_experts ' + ' | '.join(lt.description for lt in lazy_tensors))
+
+
+# Functionality that simulates `torch.load` but where individual tensors are
+# only loaded into memory on demand, not all at once.
+# PyTorch can't do this natively as of time of writing:
+# - https://github.com/pytorch/pytorch/issues/64327
+# This allows us to de-shard without multiplying RAM usage, and the loader itself
+# needs only numpy (this script still imports torch elsewhere for weight quantization).
+
+
+@dataclass
+class LazyStorageKind:
+    data_type: DataType
+
+
+@dataclass
+class LazyStorage:
+    load: Callable[[int, int], NDArray]
+    kind: LazyStorageKind
+    description: str
+
+
+class LazyUnpickler(pickle.Unpickler):
+    def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile):
+        super().__init__(fp)
+        self.data_base_path = data_base_path
+        self.zip_file = zip_file
+
+    def persistent_load(self, pid: Any) -> Any:
+        assert pid[0] == 'storage'
+        assert isinstance(pid[1], LazyStorageKind)
+        data_type = pid[1].data_type
+        filename_stem = pid[2]
+        filename = f'{self.data_base_path}/{filename_stem}'
+        info = self.zip_file.getinfo(filename)
+
+        def load(offset: int, elm_count: int) -> NDArray:
+            dtype = data_type.dtype
+            with self.zip_file.open(info) as fp:
+                fp.seek(offset * dtype.itemsize)
+                size = elm_count * dtype.itemsize
+                data = fp.read(size)
+            assert len(data) == size
+            return np.frombuffer(data, dtype)
+        description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
+        return LazyStorage(load=load, kind=pid[1], description=description)
+
+    @staticmethod
+    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
+                               requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
+        assert isinstance(storage, LazyStorage)
+
+        def load() -> UnquantizedTensor:
+            elm_count = stride[0] * size[0]
+            return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size))
+        description = f'pickled storage_offset={storage_offset} in {storage.description}'
+        return LazyTensor(load, list(size), storage.kind.data_type, description)
+
+    @staticmethod
+    def rebuild_from_type_v2(func, new_type, args, state):
+        return func(*args)
+
+    CLASSES = {
+        # getattr used here as a workaround for mypy not being smart enough to determine
+        # the staticmethods have a __func__ attribute.
+        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
+        ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
+        ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
+        ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
+        ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
+        ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
+        ('torch', 'Tensor'): LazyTensor,
+    }
+
+    def find_class(self, module: str, name: str) -> Any:
+        if not module.startswith('torch'):
+            return super().find_class(module, name)  # NOTE(security): unrestricted lookup -- only unpickle trusted checkpoints
+        return self.CLASSES[(module, name)]  # torch globals map to the lazy stand-ins; unlisted ones raise KeyError
+
+
+def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
+    """Read a PyTorch zip checkpoint, returning tensors that load on demand."""
+    zf = zipfile.ZipFile(outer_fp)  # kept open: tensor storages are read from it later
+    pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')]
+    assert len(pickle_paths) == 1, pickle_paths
+    # The pickle stream is only needed while unpickling, so close it afterwards.
+    with zf.open(pickle_paths[0], 'r') as pickle_fp:
+        model = LazyUnpickler(pickle_fp, data_base_path=pickle_paths[0][:-4], zip_file=zf).load()
+    # Unwrap payloads that keep the tensors under a top-level 'model' entry.
+    if 'model' in model:
+        model = model['model']
+    return ModelPlus(model=dict(model.items()), paths=[path], format='torch', vocab=None)
+
+
+def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
+    header_size, = struct.unpack('<Q', fp.read(8))
+    header: dict[str, dict[str, Any]] = json.loads(fp.read(header_size))
+    # Use mmap for the actual data to avoid race conditions with the file offset.
+    mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
+    byte_buf = mapped[8 + header_size:]
+
+    def convert(info: dict[str, Any]) -> LazyTensor:
+        data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
+        numpy_dtype = data_type.dtype
+        shape: list[int] = info['shape']
+        begin, end = info['data_offsets']
+        assert 0 <= begin <= end <= len(byte_buf)
+        assert end - begin == math.prod(shape) * numpy_dtype.itemsize
+        buf = byte_buf[begin:end]
+
+        def load() -> UnquantizedTensor:
+            return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
+        description = f'safetensors begin={begin} end={end} type={data_type} path={path}'
+        return LazyTensor(load, shape, data_type, description)
+    model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'}
+    return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None)
+
+
+def must_read(fp: IO[bytes], length: int) -> bytes:
+    ret = fp.read(length)
+    if len(ret) < length:
+        raise EOFError("unexpectedly reached end of file")
+    return ret
+
+
+@functools.lru_cache(maxsize=None)
+def lazy_load_file(path: Path) -> ModelPlus:
+    """Sniff the container format of *path* and return its lazily-loaded model."""
+    fp = open(path, 'rb')  # left open on success: the torch loader reads tensor data from it later
+    first8 = fp.read(8)
+    fp.seek(0)
+    if first8[:2] == b'PK':
+        # A zip file, i.e. PyTorch format
+        return lazy_load_torch_file(fp, path)
+    if len(first8) == 8 and struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
+        # Probably safetensors: the leading 8 bytes are a small little-endian header length
+        return lazy_load_safetensors_file(fp, path)
+    fp.close()  # nothing will read from a file we do not recognise (including truncated ones)
+    raise ValueError(f"unknown format: {path}")
+
+
+In = TypeVar('In')
+Out = TypeVar('Out')
+
+
+def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
+    '''Parallel map, but with backpressure.  If the caller doesn't call `next`
+    fast enough, this will stop calling `func` at some point rather than
+    letting results pile up in memory.  Specifically, there is a max of one
+    output value buffered per thread.  Results are yielded in input order.'''
+    if concurrency < 2:
+        yield from map(func, iterable)
+        return  # without this the generator falls through and maps a re-iterable input a second time
+    iterable = iter(iterable)
+    executor_class: type[ThreadPoolExecutor] | type[ProcessPoolExecutor]
+    if use_processpool_executor:
+        executor_class = ProcessPoolExecutor
+    else:
+        executor_class = ThreadPoolExecutor
+    with executor_class(max_workers=max_workers) as executor:
+        futures: list[concurrent.futures.Future[Out]] = []
+        done = False
+        for _ in range(concurrency):  # prime the window with up to `concurrency` tasks
+            try:
+                futures.append(executor.submit(func, next(iterable)))
+            except StopIteration:
+                done = True
+                break
+
+        while futures:
+            result = futures.pop(0).result()  # oldest first keeps output in input order
+            while not done and len(futures) < concurrency:  # refill before handing control back
+                try:
+                    futures.append(executor.submit(func, next(iterable)))
+                except StopIteration:
+                    done = True
+                    break
+            yield result
+
+
+def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) -> None:
+    # Handle special case where the model's vocab size is not set
+    if params.n_vocab == -1:
+        raise ValueError(
+            "The model's vocab size is set to -1 in params.json. Please update it manually."
+            + (f" Maybe {vocab.vocab_size}?" if isinstance(vocab, Vocab) else ""),
+        )
+    if not isinstance(vocab, Vocab):
+        return  # model has no vocab
+
+    # Check for a vocab size mismatch
+    if params.n_vocab == vocab.vocab_size:
+        logger.warning("Ignoring added_tokens.json since model matches vocab size without it.")
+        return
+
+    if pad_vocab and params.n_vocab > vocab.vocab_size:
+        pad_count = params.n_vocab - vocab.vocab_size
+        logger.debug(
+            f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>"
+        )
+        for i in range(1, pad_count + 1):
+            vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1
+            vocab.added_tokens_list.append(f"<dummy{i:05}>")
+        vocab.vocab_size = params.n_vocab
+        return
+
+    msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer} has {vocab.vocab_size})."
+    if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
+        msg += f"  Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
+    if vocab.vocab_size < params.n_vocab:
+        msg += " Add the --pad-vocab option and try again."
+
+    raise ValueError(msg)
+
+
+class OutputFile:
+    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
+        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
+
+    def add_meta_arch(self, params: Params) -> None:
+        name = "bitnet"
+
+        # TODO: better logic to determine model name
+        if params.n_ctx == 4096:
+            name = "bitnet2b_2501"
+        elif params.path_model is not None:
+            name = str(params.path_model.parent).split('/')[-1]
+
+        self.gguf.add_name                (name)
+        self.gguf.add_vocab_size          (params.n_vocab)
+        self.gguf.add_context_length      (params.n_ctx)
+        self.gguf.add_embedding_length    (params.n_embd)
+        self.gguf.add_block_count         (params.n_layer)
+        self.gguf.add_feed_forward_length (params.n_ff)
+        self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
+        self.gguf.add_head_count          (params.n_head)
+        self.gguf.add_head_count_kv       (params.n_head_kv)
+        self.gguf.add_add_bos_token       (True)
+
+        if params.n_experts:
+            self.gguf.add_expert_count(params.n_experts)
+
+        if params.n_experts_used:
+            self.gguf.add_expert_used_count(params.n_experts_used)
+
+        if params.f_norm_eps:
+            self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
+        else:
+            raise ValueError('f_norm_eps is None')
+
+        if params.f_rope_freq_base is not None:
+            self.gguf.add_rope_freq_base(params.f_rope_freq_base)
+
+        if params.n_orig_ctx is not None:
+            self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
+
+        if params.rope_finetuned is not None:
+            self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
+
+        if params.ftype is not None:
+            self.gguf.add_file_type(params.ftype)
+
+    def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
+        tokens = []
+        scores = []
+        toktypes = []
+
+        # NOTE: `all_tokens` returns the base vocabulary and added tokens
+        for text, score, toktype in vocab.all_tokens():
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        assert len(tokens) == vocab.vocab_size
+
+        return tokens, scores, toktypes
+
+    def add_meta_vocab(self, vocab: Vocab) -> None:
+        # Ensure that tokenizer_model is added to the GGUF model
+        self.gguf.add_tokenizer_model(vocab.tokenizer_model)
+        # Extract model vocabulary for model conversion
+        tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
+
+        # Add extracted token information for model conversion
+        self.gguf.add_token_list(tokens)
+        self.gguf.add_token_scores(scores)
+        self.gguf.add_token_types(toktypes)
+
+    def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None:
+        svocab.add_to_gguf(self.gguf)
+
+    def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
+        n_elements = int(np.prod(tensor.shape))
+        raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
+        data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
+        data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
+        if tensor.data_type.name == "I2":
+            # i2 * n + scale (fp32)
+            # print(tensor.shape)
+            # print(data_nbytes)
+            data_nbytes = data_nbytes // 4 + 32
+            # print(data_nbytes)
+            # scale_name = name.replace('.weight', '_scale.weight')
+            # scale_shape = [1]
+            # scale_data_type = np.float32
+            # scale_nbytes = 4
+            # self.gguf.add_tensor_info(scale_name, scale_shape, scale_data_type, scale_nbytes, raw_dtype=None)
+        self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)
+
+    def write_meta(self) -> None:
+        self.gguf.write_header_to_file()
+        self.gguf.write_kv_data_to_file()
+
+    def write_tensor_info(self) -> None:
+        self.gguf.write_ti_data_to_file()
+
+    def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
+        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
+        if ftype == GGMLFileType.MostlyQ8_0:
+            ndarrays = bounded_parallel_map(
+                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
+                use_processpool_executor=True,
+            )
+        # elif ftype == GGMLFileType.MostlyI2:
+        #     # ndarrays = bounded_parallel_map(
+        #     #     OutputFile.maybe_do_transform, ndarrays_inner, concurrency=concurrency, max_workers=concurrency, use_processpool_executor=True,)
+        #     ndarrays = map(OutputFile.maybe_do_transform, ndarrays_inner)
+        else:
+            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
+
+        start = time.time()
+        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
+            ndarray, i2_scale = ndarray
+            elapsed = time.time() - start
+            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
+            padi = len(str(len(model)))
+            logger.info(
+                f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
+            )
+            
+            if i2_scale is not None:
+                i2_scale = np.tile(i2_scale, 8)
+                ndarray = preprocess_weights(ndarray)
+                self.gguf.write_tensor_data(ndarray)
+                self.gguf.write_tensor_data(i2_scale)
+            else:
+                self.gguf.write_tensor_data(ndarray)
+
+    def close(self) -> None:
+        self.gguf.close()
+
+    @staticmethod
+    def write_vocab_only(
+        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
+
+        of = OutputFile(fname_out, endianess=endianess)
+
+        # meta data
+        of.add_meta_arch(params)
+        of.add_meta_vocab(vocab)
+        of.add_meta_special_vocab(svocab)
+
+        of.write_meta()
+
+        of.close()
+
+    @staticmethod
+    def do_item(item: tuple[str, LazyTensor]) -> tuple[DataType, NDArray, NDArray | None]:
+        name, lazy_tensor = item  # name is unpacked but not used here
+        tensor = lazy_tensor.load().to_ggml()
+        return (lazy_tensor.data_type, tensor.ndarray, tensor.i2_scale)  # i2_scale rides along for write_tensor_data
+
+    @staticmethod
+    def maybe_do_quantize(item: tuple[DataType, NDArray, NDArray | None]) -> tuple[NDArray, NDArray | None]:
+        dt, arr, i2_scale = item  # (target type, raw data, optional I2 scale), as produced by do_item
+        if not isinstance(dt, QuantizedDataType):
+            return arr, i2_scale
+        return dt.quantize(arr), None  # always a pair: write_tensor_data unpacks (ndarray, i2_scale)
+
+    @staticmethod
+    def write_all(
+        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
+        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab: bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
+
+        of = OutputFile(fname_out, endianess=endianess)
+
+        # meta data
+        of.add_meta_arch(params)
+        if isinstance(vocab, Vocab):
+            of.add_meta_vocab(vocab)
+            of.add_meta_special_vocab(svocab)
+        else:  # NoVocab
+            of.gguf.add_tokenizer_model(vocab.tokenizer_model)
+
+        # tensor info
+        for name, lazy_tensor in model.items():
+            of.add_tensor_info(name, lazy_tensor)
+
+        of.write_meta()
+        of.write_tensor_info()
+
+        # tensor data
+        of.write_tensor_data(ftype, model, concurrency)
+
+        of.close()
+
+
+def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
+    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
+
+    if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
+        return GGMLFileType.AllF32
+    if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
+        return GGMLFileType.MostlyF16
+    if output_type_str == "q8_0":
+        return GGMLFileType.MostlyQ8_0
+    if output_type_str == "i2":
+        return GGMLFileType.MostlyI2
+
+    name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
+
+    raise ValueError(f"Unexpected combination of types: {name_to_type}")
+
+
+def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
+    # for (name, tensor) in model.items():
+    #     print(name)
+    #     print(tensor)
+    #     print(output_type.type_for_tensor(name, tensor))
+    #     print(tensor.astype(output_type.type_for_tensor(name, tensor)))
+    return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
+            for (name, tensor) in model.items()}
+
+
+def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
+    tmap = gguf.TensorNameMap(ARCH, params.n_layer)
+    should_skip = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
+
+    tmp = model
+
+    # merge experts into one tensor
+    # if params.n_experts and params.n_experts > 0:
+    #     for i_l in range(params.n_layer):
+    #         for w in range(1, 4):
+    #             experts = []
+    #             for e in range(params.n_experts):
+    #                 if f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight" in model:
+    #                     experts.append(model[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"])
+    #                     del tmp[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"]
+    #                 elif f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight" in model:
+    #                     experts.append(model[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"])
+    #                     del tmp[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"]
+    #                 else:
+    #                     raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight")
+    #             tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts)
+    # tmp[f"rope.freqs"] = part_lazy_rope(1.0 / (torch.tensor(500000) ** (torch.arange(0, 128, 2).float().to("cpu") / 128)))
+    # 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+    rope_ndarray = (1.0 / (torch.tensor(500000.0) ** (torch.arange(0, 128, 2).float() / 128))).numpy().astype(np.float32)
+    # print(rope_ndarray)
+    
+
+    def load() -> UnquantizedTensor:
+        return UnquantizedTensor(rope_ndarray)
+    # model[f"rope_freqs.weight"] = LazyTensor(load, list(rope_ndarray.shape), UnquantizedDataType("F32", np.float32, ['F16', 'Q8_0', 'I2']), "check")
+    # print(tmp[f"rope.freqs"])
+
+    # for name, lazy_tensor in model.items():
+    #     # if "rope" in name:
+    #     print(name)
+    #     print(lazy_tensor)
+    # asfasf
+            # print(lazy_tensor.load().ndarray)
+    # asfasf
+
+    # HF models permut or pack some of the tensors, so we need to undo that
+
+    # if ARCH == gguf.MODEL_ARCH.LLAMA or ARCH == gguf.MODEL_ARCH.BITNET:
+    #     print(tmp.keys())
+    #     del tmp["output.weight"]
+    # asfasfasf
+
+    # for i in itertools.count():
+    #     if f"layers.{i}.attention.wqkv.weight" in model:
+    #         print(model[f"layers.{i}.attention.wqkv.weight"].load().ndarray.shape)
+    #         # saf
+    #         tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = part_lazy_q(model[f"layers.{i}.attention.wqkv.weight"], 0)
+    #         tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = part_lazy_k(model[f"layers.{i}.attention.wqkv.weight"], 1)
+    #         tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy_v(model[f"layers.{i}.attention.wqkv.weight"], 2)
+    #         del tmp[f"layers.{i}.attention.wqkv.weight"]
+    #     else:
+    #         break
+
+    # for i in itertools.count():
+    #     if f"layers.{i}.feed_forward.w13.weight" in model:
+    #         tmp[f"layers.{i}.feed_forward.w1.weight"] = part_lazy_w1(model[f"layers.{i}.feed_forward.w13.weight"], 0)
+    #         tmp[f"layers.{i}.feed_forward.w3.weight"] = part_lazy_w3(model[f"layers.{i}.feed_forward.w13.weight"], 1)
+    #         del tmp[f"layers.{i}.feed_forward.w13.weight"]
+    #     else:
+    #         break
+
+    # for name, lazy_tensor in model.items():
+    #     if name.endswith(("w1.weight", "w2.weight", "w3.weight",
+    #                       "wo.weight")):
+    #         tmp[name] = part_lazy_weight_quant(tmp[name], name)
+
+
+    # for i in itertools.count():
+    #     if f"layers.{i}.attention.wqkv.weight" in model:
+    #         print(model[f"layers.{i}.attention.wqkv.weight"].load().ndarray.shape)
+    #         # saf
+    #         tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = part_lazy_q(model[f"layers.{i}.attention.wqkv.weight"], 0)
+    #         tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = part_lazy_k(model[f"layers.{i}.attention.wqkv.weight"], 1)
+    #         tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy_v(model[f"layers.{i}.attention.wqkv.weight"], 2)
+    #         del tmp[f"layers.{i}.attention.wqkv.weight"]
+    #     else:
+    #         break
+
+    # for i in itertools.count():
+    #     if f"layers.{i}.feed_forward.w13.weight" in model:
+    #         tmp[f"layers.{i}.feed_forward.w1.weight"] = part_lazy_w1(model[f"layers.{i}.feed_forward.w13.weight"], 0)
+    #         tmp[f"layers.{i}.feed_forward.w3.weight"] = part_lazy_w3(model[f"layers.{i}.feed_forward.w13.weight"], 1)
+    #         del tmp[f"layers.{i}.feed_forward.w13.weight"]
+    #     else:
+    #         break
+
+    # for name, lazy_tensor in model.items():
+    #     if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight", 
+    #                       "w1.weight", "w2.weight", "w3.weight",
+    #                       "wo.weight")):
+    #         tmp[name] = part_lazy_weight_quant(tmp[name], name)
+
+    # for i in itertools.count():
+    #     if f"model.layers.{i}.self_attn.q_proj.weight" in model:
+    #         logger.debug(f"Permuting layer {i}")
+    #         # tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
+    #         # tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
+    #         # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.v_proj.weight"], params.n_head_kv, params.n_head)
+    #         # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] =              model[f"model.layers.{i}.self_attn.v_proj.weight"]
+    #     elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
+    #         logger.debug(f"Unpacking and permuting layer {i}")
+    #         tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
+    #         tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
+    #         tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy        (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
+    #         del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
+    #     else:
+    #         break
+
+    out: LazyModel = {}
+    for name, lazy_tensor in model.items():
+        tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
+        if name_new is None:
+            if skip_unknown:
+                logger.info(f"Unexpected tensor name: {name} - skipping")
+                continue
+            raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
+
+        # if tensor_type in should_skip:
+        #     logger.info(f"skipping tensor {name_new}")
+        #     continue
+
+        # logger.info(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
+        # asasdsd
+        out[name_new] = lazy_tensor
+
+    return out
+
+
+def nth_multifile_path(path: Path, n: int) -> Path | None:
+    '''Return the sibling of `path` that holds part `n` of a multi-file model.
+
+    Each naming scheme whose regex matches `path.name` is tried in order; the
+    first rewritten name that exists on disk wins, otherwise None is returned.
+    '''
+    name = path.name
+    schemes = (
+        (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),        # x.00.pth, x.01.pth, ...
+        (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'),   # x-00001-of-00002.bin, ...
+        (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}'),  # x.bin, x.bin.1, ...
+    )
+    for pattern, substitute in schemes:
+        if re.search(pattern, name) is None:
+            continue
+        candidate = path.with_name(re.sub(pattern, substitute, name))
+        if candidate.exists():
+            return candidate
+    return None
+
+
+def find_multifile_paths(path: Path) -> list[Path]:
+    '''Return the existing parts of the multi-file model that `path` belongs
+    to, each listed once; `[path]` alone if no part is found.
+    '''
+    ret: list[Path] = []
+    for i in itertools.count():
+        nth_path = nth_multifile_path(path, i)
+        if nth_path is None:
+            break
+        # For 1-based schemes (x-00001-of-N) index 0 falls through to the
+        # catch-all pattern and yields `path` itself, which a later index can
+        # yield again; keep only the first occurrence so no part is loaded twice.
+        if nth_path not in ret:
+            ret.append(nth_path)
+    # Nothing found (e.g. the file is foo.0 and there is no foo): treat `path` as a single file.
+    return ret or [path]
+
+
+def load_some_model(path: Path) -> ModelPlus:
+    '''Load a model in any supported format from a file or a directory.
+
+    A directory must hold exactly one recognised entry-point file. Each part
+    found by find_multifile_paths is loaded, then all parts are merged.
+    '''
+    if path.is_dir():
+        # First-choice names win; the PyTorch names are only a fallback.
+        preferred = ["model-00001-of-*.safetensors", "model.safetensors", "consolidated.safetensors", "model-int2.pth"]
+        fallback = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
+        for globs in (preferred, fallback):
+            files = [file for glob in globs for file in path.glob(glob)]
+            if files:
+                break
+        if not files:
+            raise FileNotFoundError(f"Can't find model in directory {path}")
+        if len(files) > 1:
+            raise ValueError(f"Found multiple models in {path}, not sure which to pick: {files}")
+        path = files[0]
+
+    models_plus: list[ModelPlus] = []
+    for part in find_multifile_paths(path):
+        logger.info(f"Loading model file {part}")
+        models_plus.append(lazy_load_file(part))
+    return merge_multifile_models(models_plus)
+
+
+class VocabFactory:
+    '''Builds the tokenizer vocab and the matching gguf.SpecialVocab.'''
+
+    _VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab]
+
+    def __init__(self, path: Path):
+        '''Remember `path`, which is handed to each vocab class constructor.'''
+        self.path = path
+
+    def _create_special_vocab(self, vocab: BaseVocab, model_parent_path: Path) -> gguf.SpecialVocab:
+        '''Build a gguf.SpecialVocab for `model_parent_path` and `vocab`.'''
+        return gguf.SpecialVocab(
+            model_parent_path,
+            load_merges=(vocab.name == "bpe"),  # merges are requested only for a "bpe" vocab
+            special_token_types=None,  # Predetermined or passed as a parameter
+            n_vocab=vocab.vocab_size if isinstance(vocab, Vocab) else None,
+        )
+
+    def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
+        '''Instantiate the first requested vocab type that can be constructed.
+
+        Raises ValueError for an unknown type name and FileNotFoundError when
+        every requested vocab class raises FileNotFoundError for self.path.
+        '''
+        known = {cls.name: cls for cls in self._VOCAB_CLASSES}
+        try:
+            # Validate every requested name up front; the dict also drops repeats.
+            candidates = {vtype: known[vtype] for vtype in vocab_types}
+        except KeyError as missing:
+            raise ValueError(f"Unsupported vocabulary type {missing.args[0]}") from None
+
+        vocab = None
+        for vocab_cls in candidates.values():
+            try:
+                vocab = vocab_cls(self.path)
+            except FileNotFoundError:
+                continue  # ignore unavailable tokenizers
+            break
+        if vocab is None:
+            raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
+
+        logger.info(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
+        return vocab
+
+    def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
+        '''Return (vocab, special_vocab); `vocab_types=None` selects NoVocab.'''
+        vocab: BaseVocab = NoVocab() if vocab_types is None else self._create_vocab_by_path(vocab_types)
+        # FIXME: Respect --vocab-dir?
+        return vocab, self._create_special_vocab(vocab, model_parent_path)
+
+
+def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
+    '''Return ggml-model-<type>.gguf beside the first input; exit(1) if that path is an input.'''
+    tag = {
+        GGMLFileType.AllF32: "f32", GGMLFileType.MostlyF16: "f16",
+        GGMLFileType.MostlyQ8_0: "q8_0", GGMLFileType.MostlyI2: "i2",
+    }[file_type]
+    outfile = model_paths[0].parent / f"ggml-model-{tag}.gguf"
+    if outfile not in model_paths:
+        return outfile
+    # Refuse to clobber an input file with the converted output.
+    logger.error(
+        f"Error: Default output path ({outfile}) would overwrite the input. "
+        "Please explicitly specify a path using --outfile.")
+    sys.exit(1)
+
+
+def do_dump_model(model_plus: ModelPlus) -> None:
+    '''Print the model's paths, format and vocab, then one line per tensor.'''
+    for attr in ("paths", "format", "vocab"):
+        print(f"model_plus.{attr} = {getattr(model_plus, attr)!r}") # noqa: NP100
+    for name, tensor in model_plus.model.items():
+        print(f"{name}: shape={tensor.shape} type={tensor.data_type}; {tensor.description}") # noqa: NP100
+
+
+def main(args_in: list[str] | None = None) -> None:
+    '''Command-line entry point: parse `args_in` (sys.argv when None), then dump, export the vocab of, or convert a model.'''
+    output_choices = ["f32", "f16", "i2"]
+    # Ask Python for the host byte order; the NumPy scalar .newbyteorder() probe used before was removed in NumPy 2.0.
+    if sys.byteorder == "little":
+        # We currently only support Q8_0 output on little endian systems.
+        output_choices.append("q8_0")
+    parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
+    parser.add_argument("--dump",         action="store_true",    help="don't convert, just show what's in the model")
+    parser.add_argument("--dump-single",  action="store_true",    help="don't convert, just show what's in a single model file")
+    parser.add_argument("--vocab-only",   action="store_true",    help="extract only the vocab")
+    parser.add_argument("--no-vocab",     action="store_true",    help="store model without the vocab")
+    parser.add_argument("--outtype",      choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
+    parser.add_argument("--vocab-dir",    type=Path,              help="directory containing tokenizer.model, if separate from model file")
+    parser.add_argument("--vocab-type",                           help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
+    parser.add_argument("--outfile",      type=Path,              help="path to write to; default: based on input")
+    parser.add_argument("model",          type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    parser.add_argument("--ctx",          type=int,               help="model training context (default: based on input)")
+    parser.add_argument("--concurrency",  type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
+    parser.add_argument("--big-endian",   action="store_true",    help="model is executed on big endian machine")
+    parser.add_argument("--pad-vocab",    action="store_true",    help="add pad tokens when model vocab expects more than tokenizer metadata provides")
+    parser.add_argument("--skip-unknown", action="store_true",    help="skip unknown tensor names instead of failing")
+    parser.add_argument("--verbose",      action="store_true",    help="increase output verbosity")
+
+    args = parser.parse_args(args_in)
+
+    if args.verbose:
+        logging.basicConfig(level=logging.DEBUG)
+    elif args.dump_single or args.dump:
+        # Avoid printing anything besides the dump output
+        logging.basicConfig(level=logging.WARNING)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    if args.no_vocab and args.vocab_only:
+        raise ValueError("--vocab-only does not make sense with --no-vocab")
+
+    if args.dump_single:
+        model_plus = lazy_load_file(args.model)
+        do_dump_model(model_plus)
+        return
+
+    if not args.vocab_only:
+        model_plus = load_some_model(args.model)
+    else:
+        model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
+
+    if args.dump:
+        do_dump_model(model_plus)
+        return
+
+    endianess = gguf.GGUFEndian.BIG if args.big_endian else gguf.GGUFEndian.LITTLE
+
+    params = Params.load(model_plus)
+    if params.n_ctx == -1:
+        if args.ctx is None:
+            msg = """\
+                The model doesn't have a context size, and you didn't specify one with --ctx
+                Please specify one with --ctx:
+                 - LLaMA v1: --ctx 2048
+                 - LLaMA v2: --ctx 4096"""
+            parser.error(textwrap.dedent(msg))
+        params.n_ctx = args.ctx
+
+    if args.outtype:
+        params.ftype = {
+            "f32": GGMLFileType.AllF32,
+            "f16": GGMLFileType.MostlyF16,
+            "i2" : GGMLFileType.MostlyI2,
+            "q8_0": GGMLFileType.MostlyQ8_0,
+        }[args.outtype]
+
+    logger.info(f"params = {params}")
+
+    model_parent_path = model_plus.paths[0].parent
+    vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
+    vocab_factory = VocabFactory(vocab_path)
+    vocab_types = None if args.no_vocab else args.vocab_type.split(",")
+    vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)
+
+    if args.vocab_only:
+        assert isinstance(vocab, Vocab)
+        if not args.outfile:
+            raise ValueError("need --outfile if using --vocab-only")
+        outfile = args.outfile
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
+                                    endianess=endianess, pad_vocab=args.pad_vocab)
+        logger.info(f"Wrote {outfile}")
+        return
+
+    if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
+        vocab = model_plus.vocab
+
+    logger.info(f"Vocab info: {vocab}")
+    logger.info(f"Special vocab info: {special_vocab}")
+    model   = model_plus.model
+    model   = convert_model_names(model, params, args.skip_unknown)
+    ftype   = pick_output_type(model, args.outtype)
+    model   = convert_to_output_type(model, ftype)
+    outfile = args.outfile or default_outfile(model_plus.paths, ftype)
+
+    params.ftype = ftype
+    logger.info(f"Writing {outfile}, format {ftype}")
+
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
+                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
+    logger.info(f"Wrote {outfile}")
+
+
+if __name__ == '__main__':
+    main()  # run the converter only when executed as a script, not when imported

From fd3f355a0babdb802a8f60a0d28e02579b8a9f26 Mon Sep 17 00:00:00 2001
From: Yan Xia <59006636+sd983527@users.noreply.github.com>
Date: Tue, 15 Apr 2025 14:53:56 +0800
Subject: [PATCH 03/11] update readme and setup script to support official
 BitNet b1.58 model  (#171)

* update readme and setup file for new model.

* update model file name

---------

Co-authored-by: Yan Xia <yanxia@microsoft.com>
---
 README.md                       |  48 ++++++++++++++++++++++++++------
 assets/header_model_release.png | Bin 0 -> 14848 bytes
 setup_env.py                    |   9 +++++-
 3 files changed, 48 insertions(+), 9 deletions(-)
 create mode 100644 assets/header_model_release.png

diff --git a/README.md b/README.md
index 013daa025..df6f718ff 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,8 @@
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 ![version](https://img.shields.io/badge/version-1.0-blue)
 
+<img src="./assets/header_model_release.png" alt="BitNet Model on Hugging Face" width="800"/>
+
 bitnet.cpp is the official inference framework for 1-bit LLMs (e.g., BitNet b1.58). It offers a suite of optimized kernels, that support **fast** and **lossless** inference of 1.58-bit models on CPU (with NPU and GPU support coming next).
 
 The first release of bitnet.cpp is to support inference on CPUs. bitnet.cpp achieves speedups of **1.37x** to **5.07x** on ARM CPUs, with larger models experiencing greater performance gains. Additionally, it reduces energy consumption by **55.4%** to **70.0%**, further boosting overall efficiency. On x86 CPUs, speedups range from **2.37x** to **6.17x** with energy reductions between **71.9%** to **82.2%**. Furthermore, bitnet.cpp can run a 100B BitNet b1.58 model on a single CPU, achieving speeds comparable to human reading (5-7 tokens per second), significantly enhancing the potential for running LLMs on local devices. Please refer to the [technical report](https://arxiv.org/abs/2410.16144) for more details.
@@ -18,7 +20,8 @@ A demo of bitnet.cpp running a BitNet b1.58 3B model on Apple M2:
 https://github.com/user-attachments/assets/7f46b736-edec-4828-b809-4be780a3e5b1
 
 ## What's New:
-- 02/18/2025 [Bitnet.cpp: Efficient Edge Inference for Ternary LLMs](https://arxiv.org/abs/2502.11880) ![NEW](https://img.shields.io/badge/NEW-red)
+- 04/14/2025 [BitNet Official 2B Parameter Model on Hugging Face](https://huggingface.co/microsoft/BitNet-b1.58-2B-4T) ![NEW](https://img.shields.io/badge/NEW-red)
+- 02/18/2025 [Bitnet.cpp: Efficient Edge Inference for Ternary LLMs](https://arxiv.org/abs/2502.11880)
 - 11/08/2024 [BitNet a4.8: 4-bit Activations for 1-bit LLMs](https://arxiv.org/abs/2411.04965)
 - 10/21/2024 [1-bit AI Infra: Part 1.1, Fast and Lossless BitNet b1.58 Inference on CPUs](https://arxiv.org/abs/2410.16144)
 - 10/17/2024 bitnet.cpp 1.0 released.
@@ -29,9 +32,38 @@ https://github.com/user-attachments/assets/7f46b736-edec-4828-b809-4be780a3e5b1
 ## Acknowledgements
 
 This project is based on the [llama.cpp](https://github.com/ggerganov/llama.cpp) framework. We would like to thank all the authors for their contributions to the open-source community. Also, bitnet.cpp's kernels are built on top of the Lookup Table methodologies pioneered in [T-MAC](https://github.com/microsoft/T-MAC/). For inference of general low-bit LLMs beyond ternary models, we recommend using T-MAC.
+## Official Models
+<table>
+    </tr>
+    <tr>
+        <th rowspan="2">Model</th>
+        <th rowspan="2">Parameters</th>
+        <th rowspan="2">CPU</th>
+        <th colspan="3">Kernel</th>
+    </tr>
+    <tr>
+        <th>I2_S</th>
+        <th>TL1</th>
+        <th>TL2</th>
+    </tr>
+    <tr>
+        <td rowspan="2"><a href="https://huggingface.co/microsoft/BitNet-b1.58-2B-4T">BitNet-b1.58-2B-4T</a></td>
+        <td rowspan="2">2.4B</td>
+        <td>x86</td>
+        <td>&#9989;</td>
+        <td>&#10060;</td>
+        <td>&#9989;</td>
+    </tr>
+    <tr>
+        <td>ARM</td>
+        <td>&#9989;</td>
+        <td>&#9989;</td>
+        <td>&#10060;</td>
+    </tr>
+</table>
 
 ## Supported Models
-❗️**We use existing 1-bit LLMs available on [Hugging Face](https://huggingface.co/) to demonstrate the inference capabilities of bitnet.cpp. These models are neither trained nor released by Microsoft. We hope the release of bitnet.cpp will inspire the development of 1-bit LLMs in large-scale settings in terms of model size and training tokens.**
+❗️**We use existing 1-bit LLMs available on [Hugging Face](https://huggingface.co/) to demonstrate the inference capabilities of bitnet.cpp. We hope the release of bitnet.cpp will inspire the development of 1-bit LLMs in large-scale settings in terms of model size and training tokens.**
 
 <table>
     </tr>
@@ -143,12 +175,13 @@ pip install -r requirements.txt
 ```
 3. Build the project
 ```bash
-# Download the model from Hugging Face, convert it to quantized gguf format, and build the project
+# Manually download the model and run with local path
+huggingface-cli download microsoft/BitNet-b1.58-2B-4T --local-dir models/BitNet-b1.58-2B-4T
+python setup_env.py -md models/BitNet-b1.58-2B-4T -q i2_s
+
+# Or you can download a model from Hugging Face, convert it to quantized gguf format, and build the project
 python setup_env.py --hf-repo tiiuae/Falcon3-7B-Instruct-1.58bit -q i2_s
 
-# Or you can manually download the model and run with local path
-huggingface-cli download tiiuae/Falcon3-7B-Instruct-1.58bit --local-dir models/Falcon3-7B-Instruct-1.58bit
-python setup_env.py -md models/Falcon3-7B-Instruct-1.58bit -q i2_s
 ```
 <pre>
 usage: setup_env.py [-h] [--hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}] [--model-dir MODEL_DIR] [--log-dir LOG_DIR] [--quant-type {i2_s,tl1}] [--quant-embd]
@@ -173,7 +206,7 @@ optional arguments:
 ### Basic usage
 ```bash
 # Run inference with the quantized model
-python run_inference.py -m models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gguf -p "You are a helpful assistant" -cnv
+python run_inference.py -m models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf -p "You are a helpful assistant" -cnv
 ```
 <pre>
 usage: run_inference.py [-h] [-m MODEL] [-n N_PREDICT] -p PROMPT [-t THREADS] [-c CTX_SIZE] [-temp TEMPERATURE] [-cnv]
@@ -246,4 +279,3 @@ python utils/generate-dummy-bitnet-model.py models/bitnet_b1_58-large --outfile
 python utils/e2e_benchmark.py -m models/dummy-bitnet-125m.tl1.gguf -p 512 -n 128
 ```
 
-
diff --git a/assets/header_model_release.png b/assets/header_model_release.png
new file mode 100644
index 0000000000000000000000000000000000000000..0c955c930df9ac38d054ab2a363b0d255e5b1694
GIT binary patch
literal 14848
zcmd^mXIE2S^d=n(ML|RbM3IhwO7Ef~5PFvm0@4Z6dl3-<>77ud1Pn+O5(y<JD$;um
zAt*J35K8EQOw`}}*L;9^Gi&Zzl-zai$vMy2XV+)P>+7mBUbu0Af`WokQ{#yt1qC&j
z{D1K|I`VHy!N^O0{`eZIKcc7|<iwCa&^W8=s8Ucgq+BH0(~>`*_tr4?rJ!Ky`tw8C
z4gBCpK_Qo=`9#$?$OeZB4r7}@sch-pvY)tk)oQ!(v!d`9Mzypk5kS?48vRF};F?Dt
zwAkKR78|m`;8$Xry^MpJOlcVn*=cmDK0mpnEiAn0Ui(U<n&<U0@$B$LMBrwN?_hs>
zOQ<q_3fp$ed1Wx<#nE7aT&0E(_~DbkS0*ivz`*nOD%G~v2mHHh-t|87?cb~MjjGV)
zzn2qZjN!irdvDNz|9e}C0TBQ9IYIYpnJj;wGnl+5$oFr+kN>X@=vpf6MPphs>g#|C
z7TEgE9^y6rhOskWDn#tx5<y+>B@IN(s3MOp)_a1KpAfgGz0u*Fqu}Gz4|2XyaCxPi
zzo>i|u-C6b!r7)ftcQ{;TCW~XA);n2!rum+`rFMsu=_$4{`T1?y3fm3-<POzvw*J*
z&wX6|hawv%9T=XXLbr*q9ZCv0us(km_?dFLNt63Rf1wDeV~c8QZ|Ycop(r%!08O8t
zPq=N6$WFbxf@%4;F|={2keupTs-iH%A-1%)#FJ62&MxL|Cd*do<2QuUgT?Bt2*PFW
zDeE0lUB^xY$MJGbXYgtd;=i|(RUx&XpWWb*$15)2V0$H<0>w{ABp8CS+Dx5f17$c|
zh*?92*jZY0g+GQrR|{J>T1@GDeZF&N`C+AQL)SmixuB1{2tbhL&y)lxnt;q<XL4Ju
z=<lXqPPf$;nx{%55ohxE9G(*J!Mo2n9#+-Rb{suDL-@IRG%xsLpHo{jwUvagL#7qQ
zZNn~eaI*ddei;H~&%DY`ErX)zFMu$|!~BH2x4(C;4D1~INZO8GLZ9(d{?3=`{2JTP
zrF?qjLtPFD9rlUlTk-%XdPq6(>@+2kVyr@qByHY6bvOXz{qRpvdwo5{1##o&*T(3t
zso1=F2qxTru5@_I8`r!;$`V<O`C?KXrLdX$Bc1@u0Z@{Tn78^RI$8C;F^4WP9+67b
zI-PQ6EKk+67B!cbclM#SD&d0v<YeA8R8@!~GrJd&V@;=Bw1G;fMe4|CqD#3W?<QTx
zobA=QEL4SV+|L`z7CpYS(28H;7(<ihL*ZU2id64z@J%EfAeq-Db+}{x9Ws2=z<WaK
z<EUEe<$)znn5O?U5HZo6Fbg-K>Qm>Qh<jVtL6v=X#P@q|)`(0g{0-g0$4^lQ^J8(}
z;5g}p0$<*@YJEG%!}inv)_pnldZyPbjpBjThD?E1dkCchp7Zo1O+BiZWYw{KSBNqp
zVa_43f*;Zg0_^Q|CW^1Olv?-I<|&$Yxck@jmBc+I;h|1igrxt@#ZAHf_f19uN7=jI
zSj!k0TlUr{33zVM!A@Uik3b)SkFTL?^!xr(U!29l#;0h^m;*xUv_~(hecvZV$n;}s
z=kcIIwD&Xae_ALr<AQ`BZB)XAg#nr9UtQ4`#ZrOyxp1?Tq+ONGhVMuqHLJjE_0Ee&
z2A91TlNzjl%6_$fABbN=yHE8VeC+0=?k{F0uA4oKW7@R6^iO;~eq{tO%b$}y3xf29
zc5|^bp?p&aQWq#<=rD*&9MVBDfJ{w*=9Gj@&GzOe=3#Imk%Xy?X4R#rZ8PkU!A)Ge
zcK;EH_TSzP3eUa2A_~;vBJX9*Tr#K7Y06DMeaX*#0^M8!Ndie6^aT;}!_(Ke0)EGj
zq8R_ZR3vEFyKnE1?0XeeZO`-Yc$ZWD3i&Wi6ZOx@;w0bKG9B*pP20D4jSJcx`7CrX
zJM1PVf{!;f1-`Ge3OUW2{->T((W;niQL+e9oZ!V7*@J1VGWa!fgk!yiZ!N^KM-3M4
z-QGcAB!)CdAF1iq9{*F(jr@pee0C-UhYXcEH{-kOcTV4~CtXRpzR2}~Ek8pq*?MT@
z1*9Tu#;LP2dep@b5%IrabFz*=2&rjhjo){O%D-_GS{xoFF_^k(Q1IhjZy&hIt7Lti
z>scesG*IOh3*}WYnwt^Vf|l9rc3)tqDxs5eh<^seAbTlN|ANMpc=t^a(MfrEf2!Wz
zDD;`>4LSI_@PYV+zIXvDc5B|~qpz{s=pLJJfBQO>f4f%m^&Eh50gjDQA(;=+&cf&e
z<}N>jiOt?KEWymOO+$6yh!MscG>Ru5+jRaD?EA5-nYHG3!UN6@5XY&|>c{kUn^qXQ
zxPQlpep&CaI-m#nFrzSlszN(Wri^z`aqMQQ7upM5@eg9xY!x5?2Bd#6L^mI9*1?8~
zjC}8`2Z68eSKCrDD_sFiUBz`MZ2e9g-O6z2yGlN~<@b3jFPCpGMi&#moof9vfbV*W
z-(R6nK4FkO^&@X+umt>IeiXl=d_tklMZ3|QBN-s+!&~o`b%up5*S1EDl!4PKm;0iB
zJY6VDDL42a<zDWFf6P3CF8^l~z~&LO;3yjNJL**UHdc_WG0p2oonPpwxg*Yfw7Ra)
zCMqOE`G$c~`H1H5cjBQioJ*d9bP#FS{?8Z*Chmxn&y!PM)R=A}{`mE)77YJFV;{ll
zzq$4mze%2>W2D+_p<f$S<2Ia5NQ~|XkLC&(j-xR%p|uKu6lwhhCzI{ydM4FQMHBpn
zOAey?(lgET_q=a;FK2ZUk2+_Zu?{r8D|wyGQveg^e?xPo7Ec7CYFl+*H7nXO>hbZ7
z){lLD*;#M@@L)5YL{<)(0+cFeA4p#}a{ow}YeVk8eNtZv(1*LSZgnv~>`p$uG*VmW
z$0p&*a?iIySu(O2D2|;gq28MF?JSA=!G!*NS@ziFeaUTU^imXclsY%<K8oGNYceiD
z)N+ZjiuLdxgia5==)pQVo!n;>!~?{`)xl$Z@~}-}CnM@C?^nw9ap1b)E@pZ<X1Ziq
zs<lbRYxk*S$wu%8$DS=k<k_b-KBnhW*3pH5()8s61Eb#u_ok>-P9e&fera!;Y}z8)
ziS4UJJ9e*kki#V*$=8(fPsrH&P~%lUJX{XYPsr{h%=t!F7Ei`qNw)7=(pec)P)DSp
z>Q&(|Az`xSPP6yY%BJY%E`G*llR?J*x<nMK-gsPUZNufmnW3f91yfW0i7KJUypHDD
zDyp+%GbI@bv6d&v{;)@)&S~fB{e<d+UV223&V0Ns#5wFrlr$K?Cr5LHu22EAq7BNP
z@5a9!jX$_${L@%@`m+I~Pt2&wS^>O-9r+4)949<6_pz?ouuQ#gi~i<toZ#MK3V<ub
zh1kTR)6HEsPS3q&G$ej3PpesyYUtzyFfkRDB5#D+r=6WcS2+mWf?L7g&F%MUPC{@_
z)_1*&`c+i`?oV47<Sh`g6gt@k9380pNUJF&b!<G~=C;wDmC@UqF1{M?;*uJElc`%g
zrs&aau(0rwj4E%dXP3P>sY+Og1EX)vrD7`(45>=s7SDow)Gofm*9Wa*ku&9jVhloN
z2D9>HRaO+jL3|lOUH}l$1rGIf##rtag_h^LcHEG?bhyYJ4VMq5{(VR=vReim<d&L=
zA03p${jTN@TpvHKg02=&E1t>L4=Ep~euYIS>D!6}-ww&quc0_{rL?$3=0)n(<P3@<
zYgZiyP0YTl$bpHh@f55Ay`V40A>l6e@lT=}9Ka!jud&C&L1YuT)MGdx&*g+NFY02V
zuF$zCowA3GCcA`-$<!5kQ*jl#ygXPH&-~1!lE=&P=>XV#)#@dK+MbEfOQlzSro11&
zAECH4ZZvAASBss#yf!O!e88kYK_UuxcQJOJ7oae+qU7{TWjNW<(uB{qFQZqBcD=@3
zmNYP}k_HN!=jH-9ybn+{*xjboCp;_#$z0XU@%{_ZA-X;{IHKKaT~A|ZyN~xCI)RHU
zpS>7Jj}k4EA<c|V3pxQiDT4J4*LW@qo%pV-Jtj^}^#NmOlHw1S#WO5#HgQH~2sqbz
zOipQe=T{x+6c-)iG@n3ns>rkE%(NxDI1JaR6nEeGWj#Npy9)}KRxKimKckp?sZQV+
zSy@?A$50HH(cg1Q_}DdB)7j0f9w$e~l<8$J4Wzpn{DfOrk3i2b?2Fr<x9qTO-}<^l
zHTy`$S$D(IqY#oZt*>z$-Td-Z)qJ}X+S=cwUgn9asMWO%Phf{P{id4Q&amB&cVB~V
zY<xsVQ5xP{>5o@Zchus4m4U6LHvKs48)Mn<=+#dviC~{~wB4+B5kr4r^(sF<%P&jA
z(y)>IKP$+wRfQOi*6YE%I*(SVPw;gqX-dEDd1JtxjZQ2CuE2tVTTkDm{u&tQv;=-X
zTOo{r8+a&!Rtu-?T_7DuNCpv6L3enJo&Loo|D=-p3CzofRl<&9>-2zy!Klr3jMr|<
zXMg6QzrRPB2pH_+AnW};wEtO-6$@k4Xz-Xyuun*3u_or+oNrGod+%{GzpPfAAN{Gn
z|32)CiHT1~H6)D`_vrXqX@{~u@Pf{x+pZ74X@bGW*S0Hk#(YQJcN~=j?!!=~va1ZG
zk8(DY&KcbNIWQR*d8m3UXbsa9%LXDC!}jz1PIi99b;8h8?vol(Rgvy_nhN;4UskK@
zsJd@noC_>geq-FOw0y<9xF=(=SVP=`HyShDu!H-6ltKSY8m8y|iT)#nljaSJvIyoz
z4rFu2L+LB&XBu7ht}~55+w)r|dl@pH3l*rX;TmRAN1cQpbJbFCjVHSEuz5GcWsp4E
z=N*8OOmL3X^A2;yM*PZPmduu_u`$IbXKc&-8>YRpfcqfPL3!;HDeJy$%uyb+z4+EL
zSW4Tex#H>2eTCoWeBE1E1@dD?*Vhkg8Q0hQMPqVuv}!Vqz40kKJ`3!Ktq75k>2vQm
zoFj9G$CujApo`Ph1ya(z{=c|M^$4}P8rnC^Q@*F<J6+r=HcnofbBCt!{wa#jm4{4C
zFmesk18VEJ$*xVDQ}Iw9Pf(GW`R!h|3T&7x!@KKct09XmLg9q$6rZn&;==xOF9H4=
zAB<nV4C^zS(IzXzJ5MzdYjkuP+hC{)<4RqCsCj8!{52Wf2!();%1_>LRE}R(7i@Ml
z@;KYa0SbKpe)C;%BF#=(8?t1>Hy%Z~-IwPgEM{8um>zt*Z?Zn9u(+DhNAQ1{UK86?
z!5@mv&&oO$x2SBcVdI~5a~{6RrX$++qU^hQVTU%xMmk&>Q&jAVqy}VV|5%k1LTNim
znP_7@6>LOA7Q6MjeX_FY6aw<|mn`0kCuV4m)pK=2eM~JmR5}#+Mqhmfx!SG?nL{x~
zE9B)r>#a)i40+O`EncVFXdNl&;PdHtv);b5Fp!Fs|0hSQ2X<UD8v~5YP^K=9uqb-3
zp!ao1{^GR1*}^dv9E25XH?A7&IH_%;owN>tv}vBMfToG1r7joSq7<_xtFr>uX*k3E
zJZv67y|QFz15tcf(={~cP1mS*wx&@=QHJdW$PJ(+3sWB`amOcIV`c5Mv|<(nT$g*k
zWv3*tUY<cH826o}I=3?^$eF#*s7;)|#qW0L9h#?6(Yq%##d}au^!BYFBYD!bNDvSZ
zcrQg+80|!k5z=UKtXuy$m=m7?THuk}8P&DviNwVe>dL|j=B!$~%{HSBk-e%(f0Tdc
z{cepdY`_U_2bbqC$_x2jk*SS6JBg;}CU~{Wd>A!6|E@bZ%QR*;2_pZ-Y|k3@q<XXj
z3`YtSueE#)m*9w}(8_sAiXa&H_7&Mo@a%CoARFI2nm0KS76EhcaTu`~jvXV%CwdNw
ziz9EgYl?|x(Tm(sKaRv3nhBWfb5-osd`NW|t+7=hVNUL6-3hmplm>__01;f)LtkRR
z!fj^$daRcIBOazHyTYts@HYCJfO{tezN+pxAmCFL$q&fPid243ljy0Wjd(e|lH<bG
z|KsTq;am9pxwrx5#&pyV8>gPM4N0SthRZ?QYH|@~A1S0gQo~lmq#QMVPn75|>S9*;
zPl*kS$q&#wJKt8X8w>X_g;hAu)~VJN+7R4h$}C~upfexM8&QSf9jiq?^8vQS1a}w+
z?E{xPIZ6~ST~(;VzIH_sd=@ocD7<L(c;VA}a^HIObd#Nxd`8>O;2(G2FpiueR*TY;
z<%RvROJPrc6){`%S_ozyw)9ue$@igVeJ;H|TbC5@L{*Jn6c_X#)dVyHxk<B1CJ=RI
z$a4j!1Y;-MuquRXhF#lgo|PD{4gwgjmyRExv5&J7o|L=gx`vSL_W1AbA+?6PtKDLY
zHlad$-&g^s$5Q}YlMzv#H~o0e$Zb{<Y+5(Dnr}<k^GUb$XRvhMt`OEytrnfB!q95U
zBMIr|%yE#A!W!X{TKKFD+pMOO)Bf^(QLWU>pF8<$W8;;m@p5A|HKKRmh1kq|d6VKs
z*t*-y%|T}hFV#MC`5+uU!a?YA#UR?#WW{;#9k<FvUzQ+wgR&@*FE`BI=s1aA*%5mz
zZF4f+lr)m?U{$JkQumWoRW;6J5hcW#9nRv1Tt@_^pAhYZPjrPvjH*ghi+!*&y;ZmZ
zkLREw%?F5=xxEI%iz+{*su~UV({R@qOH00leUumCO!f*KgXay>_jss>D~8|12!rzG
zHdQfW0fxhkj-$i3u)kp)x));mPo`6hes~n8el}8zRgMjkl3Hn8)BRPyF(s+FJ+J#|
z<pp?Uz{qX-9vA@lEo1rsVy$p~Pz+%Oaq7#Qh3dTOeaxKoD}x`S^{b}E976VNWCGDO
z%xp6A*AzTX_NPfQGU}uNNonB;u-3){h3H^|v@5;(8fSHNAfdf=@OYLP;Fi@n!k%wB
zTJwh=#cTvS;gy}j7rH;-XF6E}wDyF)KR6Zp+4JCMY`;ieg{MXKtscpNCu?JVdaY3D
zLs|+-aVqJ*SfiSd$?;F}UKM*!b5i8?n%@vl?(kTi@GAo<Bzrnd?=x280&UuQvp+@d
zg$a;7$)pDm=wz!Ez2>ia;(W@Pf&gF0*|}I2#L>ZG^H8CqaJ4iPlgGQu5eMNsD$IGM
zsNpBD3i44xYeu-ay}~;;DV}>!1HydPeW9?v*s_|I+?a>K6`c#8)!h-%l8!(Vk%q32
zBlzlfC`qRWh)U0U5gh!yk4xVx!pP0gkVkaJCgA#TWs0(-96k5liA1lq%d$jo*Al4v
zl#6@HkTTY~`D=p@xapp(mleSh;yU)b6{y+=t8Cq1IXyuF%j&ul6WXfj<GQx7a8FQE
zYUmeQLzMmZ3z!?f1XSX=hP2|C40QsDhtqfyWV*%^XtY^#qyD+i$8Jh82YK1S-!FtB
zs(Z*sg*5C4ch8d+RqD$W<B--c;iwZH<&QW;tvsl!YIdIjNZnTATio~ruF4c+NgF!n
zH)XYGJ=ojZI7Ri`q3qzO>*cr0A-X#^IF_I73P*y$JMo(OzN<gXS{Y)?+h0F2F%#&n
zGizbuVM#lQc5LOItyk@Bn>yQVj0oQlP{?fS1?)OY*pSmddJX9YWDU3*v+3N_becM_
zFEVN(EzyGVjTU{n_33%6N8PO-$w?}aFw@2{tIzk8!eg-pZn_pQT?kx0e&ysq?@ZLS
zJ9&E|vNh0by)cj<C%m`Q_rU99A0TS=fjLVClw(yoF4p>Oh+cnvC~`9L_GDcStN|dj
zH(>^_4lFbT$zUDSJkhn|4c$NVBA0$n%hUjRnh4w4WQF58bI@^8?$)4O6INOW$|zVj
z+jOknxwI7j85f@p!a;0P%RlK1*;`7Fh439`PTC)HSrk=dXU%j+ce92dwmo}*!7;;?
zc?nmRAdT$E4sDt6hzLlN<hD#UR3`FQy>U!c)dK_yK*0D^<;~BPE}w6V$H|I`B&yiX
ztQeVQPfNQzXp-Hmb^FNdYuM<~?Z+=EncHVxDs<&Rv3iyhl20KL*~a3eP;6}cEk^;7
z*&op?8pAzC^Z-VA>}pBpO%FQvR0wxT4M1)9kArjp0ePNIW6-dk3ggjA9IvJMj)Qz%
zD?XM8ZH`bW*IF9*RHThvfz`db+X^Y*7y2Hm5F2r*+R;E<<OcU;I8u-Nehwyk@KLxV
znM^og^zdaZ!lt;DCgJg$pIhfy2_?UOC-S){{i4Zxik}~G-Rx*J#J5+Hc;=@Y07^S=
zi#Yl)hCd41$X?#+V)w3=ni>EH1(5?!5r5Jti{L6pcp-~UNjCgc)oaUxzQNMdlMp2)
zvfFXpFFjOI0HZCS6y5;^1&LQAw&R{i_`4AgH_<JyvYHzy9YtAL4akSwXw6^V#C?FI
z;7;jKw}^?cMpkwHjTt8BvfmHAr^7zkOvl7m6}OoE=koLWmnX7EjXeeLAv>an24Zot
zO!?`j72VYz*D0Hn=QMkE@}1o)^UfiOBR*mB=zyUDq19VR#$1eI)m`KuS#1<Z6a>%Q
zl<i#QED%II0QEXPx1>|4IQ?Mzn3Iuldl7CV$a?~>c$$um2uzR1+*svZy<uW>hm=3r
zU8sGcl85_{Qdw!7HJ}YVa9kS;5C=ur+5_Ydg4^4y_8X?8s(u)@TdJ_P<|zlitV~E)
zQ})}`D>z+i?uQC@FE4L1kLw(Sm+;O?)@zXU^ksodI!g+gO8)rTaawi_YE2HAr<LuV
zmSJ`wOKIETuZ6+mwr4zDCwoTg*t)$EKsX^SY!^eo1aE(D7%P-DAgQ%o6gOUfAG73H
z-_zvUFh(}79n}*#!M&M>xKx)}e>3o)q7lT*HmNW`Y;5Xw+LE#+U0Oe_YV+?vCgi|J
z5bnvM|LyzMob5mSb9Jkr=-gL6mY2#>I2@I0N`|@%A7#UMBV>4cTs(s{vulVnnoykM
zb8L*UUJs1}#1SdJ)zYA^-2F0jX-<(Jp46U=#B$h#bCT7xUxe#`WK5%{yQSP-=(Fd-
zF~>ohjv=4$&9((g{&({3BEt*|wLSV4x{Z^ykv{`;@8d2aPMNxVq@}*6tm;<m7<#(C
zzm$!&dVn<G9xBvEc1agoO>g#$Ru*cHPK*c9``CD_^0AJcvRXi^Jv$f~=Na*NHx+aD
zYZ~XUK{_hr$)>6`Hv_B#@(km+i-;VfUZ~8FU7hgW=O+Pq9THwSVu1FY1m$eQy7r3k
zWQAT?)mPc%q=N)j8)hWn>)x6icQgGZ5Mj;DvN!Fpjh$*JGYmwr)m$KADgih|Br!<U
z$RN$|WTR{xA5l5JF#`zS&+E9-qGg-oXe7IVK2H{N&x{Q`XMDj=?cEE(l7@EhX^`04
zii@tYA5*UP5i$n?9*lMGSwbgAR~L07x)H7lbVoJpW6H0)&vf}L<u2yGQkH9OSA&5Q
z5)OR|yi~I%{Kl5E6L8q_#s|K3%P&HDH}@?6LLFdL4#HGiq)gUCa&V8u(1$AqHfEWP
zJyu5ey773dW*7^wA|D)*z0xtpCRfmLA;{x|>&>#HHS*Geg_A;MZUm@5V7%M6(l}dZ
z>=5%G$Zn!WsThA<=|aTit^CFB(00$|FNEP)KV9udUebP&VV?;)aF;X!UjR9Un5Ir;
z4PSG}ud$?8JZ&lxNl1u75Y=vIQ$;0lv~%EO5A8H5ww5x;*_IeO$`c&>Y3LaL!mK;k
zo`;+;TSl+D*-Z$Ce^{H)x?$^sY%{;{`I@}3e6|XPcL5dX(7ip@p;SL*vy(<nZfV)7
ze|n**lG+}Mbztw%ppjkWJlawyKHaMurAQESxkD>Gq*%84w)o6=D1NzFF)Fjg%FJRN
zU6~8PYcBQ3hX{#g{dhembY-+V0T$677^CLp?>}nm%9CU-yV9SgEbTln8dn}SHk|5d
z{`mGnF^RAlBX7NqJ>^({RA>A!_A{!iKp<K?cHKBHiZVJLaSc(^1B%0Jw8x}`#1A%s
zqxSNF`N!}0TH*4(T}lq!vneCd)XQfAm!Z3@ws#x{m0Y0ZjUF#TT35?{JdKyqo-(MT
z%P0yr9g5FaMs-;06omkWE`!1QJ9h9FZ954?8BKEx050oR0DZktB_QgUxI4<S*wCFH
zScPm0?o3hnQ`%8*eMzp>EaPJ@E;4`k{d?^@R)w^!d<pgF_5xwdVb{$)I+COyJ>+?S
zidNdEP(GUu^Bt*VuHPLD5gVSnEAUM%@fB>r)atHG%xW>S;{LUIkL%t;N}Z5ODy7hM
zhgF#OGx_qs%3c6ITLrb!Psy)4?z@`^xl7JA-Os#hbTPX<<U6c?UAF=)S2(%)U~B+_
z98t&09l4HwSlb<f&8H83L%E{l$;HxSKBQdG7b+!zUROpF6?Ew`r-iu?Gp4F~nPjtT
zj<C6TYhnWP*wFph?|hWbGP@}I4s5>Bwm(Mjk3}^x&gEW{<=5M18oqNGpDiUKyrc@e
z{V6LkLwD4-jmtl?h}8+yF$g5#S}Kbk*KAK(hUglnhN}eC7dbe~%VcI9(KArfAE#vK
zLhgd{<~LhBHmzEQ@?{NdI2o_q67OPC(7tI0>V3y<)M;kn#>8^kb_y-JL&xwI5jd8?
zetKi<#&Pvn0nESJSRCa$bZGW7L|DjR9h^fD`Zn8P-C<9gX;J@>yK4Ld-**6sGH#Y~
z^)huk@;OR2woO)z$+T!SCB?dUnXa#uIkgoAqyvrp2L^Uf)ygPGL$a?^7+Py+8vz;O
zi^O2pc#S4*venBMnMu8ItQ!1aN*uJW-gka}+|gy7G-WK>YwWMVUDw<LO(DE2bhP-Z
zKX?&U5lp9nx{Wnun(lpC{Nu+2p!2L(MU_@Xl?Ef@3NX2?BT<`eY<p2RUK;L5_N$j?
zp^OU#g@c5`Y*!8i!ggWi-LWIE)EL88Xra!sVa}Q);bnm%vQGGMH{A2)<`08c{K)39
zZ%~E^?d6}?x!L?9!Q#_RHSTWBh!d2(h0GW}!(1g?5fLoQl4r)V7^3OdDxg&i&3IJf
zvZm2o;9Fg_G4%wI*}I#N1nF<)2WxmCg#3*F^O;2<!rtV{!)h_s;Ty8?yH)IzfiaC$
zq|%=nF*Wnr<LLH-<PLL*_U*7`w%|}pQ+^;9CB$v6<r6V-=E1_*i9-ar-kbU4gx#84
zl>bF%M~+=yb{x^IjSESOn<Jr=<aC7|xI*usR&4m{>#oj?nSGeuU`M~1QALnStTp21
z;aD-js`MKJH<Nbm`_-qxChj0*l-1%l<5T~Ihu2s5SnjK|dnPqFkn{S4xo1X|y&U#+
zPP)y})VSS*C(qH@Hi3l)V~5*|V4(Xc28I)owJ0LCX1H=2p1oAo)8OUXp-U3v<_xk6
z8@OdODpy>l4a%N{z46qKpaIt*m<s%_diM(X*iXhtg9M5w$U&{IgB&`SiGwce$Z4ZK
z09U%VZW}fdcHahWTy=rln_>$=5|MH2=H}*74wtX@0iUoN#L%gcRDd_{MRsUoA(cxN
zKypD5oHK`TY1dQ9a<hFL2e^0E#e96tv+Hm>L^pFv$6EP(9QZ<cPVB|@YNEk9?sHj@
zk*o)^f7|%@np|n=SDVv`{iI^o{GJ;MlV_(cu8^?oG>mZluY31iyer~Zu1$B657a5T
zJJGG(^Mo6SF$`zmDTQ|L%O%{olAdEqE~lHn!TjP`4$?oKao(ZZ{k#}O&n+3r?Y29p
z(CJ=>Y1x+dO9f`ObyO=JIoiQcymo~uw%o!p+A?z#ipPRm>(x6ed+%d-fBJ{*oZmV?
zv_`V|cT$d!fJlC<Vo-fpaR|PhrJtYWt-$XZSk%i!-YwlFg}g_<!X!492W0jNh!Sp@
zVL7W#JC1fZ3ndvjo21_~ni1%xrNkdkl?1BvCr7kO(M{u1SUH}bAoj5UNgavn<f=6F
zXu^fyd--c2l?gZnb)#E=9@pyTx3h3<{DHA`%&v6>EVRlhygBgUdfOEjw{-u|Bd3-C
z;$B?~GSx+vGcx6>Bw48Gz}2RjdtV<jr4`QR#{`@N1fN`cq8TqqQ;wYypH1x<)9a>x
zTPUyTo}oet-mpwF(o6EZAN1qh;X`#Hay|0-t}^jb&|=czQTdpbxTc*vxzgwuuiWH4
z^D5p8MJOOTR{sE1am;KlE>?f0(LB;ulm96la_UdksnsP#kL*KJvw&Eoz}oiS&2fm~
zG8+4=_ll7Y*OlQTVKxsW&(CcJEr(ka-sI4a=?H_i9g1(vaxcH<{mB~7Ro79q$}8|w
z;6<9o)=uuwgBFwoq+#Tw#6^(zD1#r24@q+oSx{xT&hkGi6*22Q$Sr%?W7*q&kZm_4
zV#o0OENp@7rCJ4tUMDyPu0wW!;YwR0>c{%P@Dr+&O^(x(R(zDDR~oDY-*Is+?9&yh
zI{p)h60!;V_1R2(A=jaNcJ}O~Pv+yFsE)mY-FDo!)MwAtwA5oFh}9{?(&Q)e0}q)>
zCd*H_$8Xl;V!I}PmqI=Uo%f+;cPyB!FN~|Mmaw&+O2o-t+<~^#)VKFQxvIS?Kpoz)
z;WGKL{rm4bO}P|DItKzcbKzgTKQ(i1emB1QO1pcXE3l)75HLhr5jPTHy%0p($X@|V
z6)~OWzJScF9jM5TkPwuK8SP@qfGHfzi<~-l99Q{0CKr7=xdQ*xcM4w8)%S@Vi?^%s
zbovfqS~|^aHDUKa(wN7;89Df^VIOU{h^3QYE#lah;N5choz&Z-Tm$KRod@Xd<P0&g
zeMbaD9>&moV^-pgtuIkME)H9ugjjg5L;loI>n`e9b9qmggm`6QI(3?F>0P$$Gfk4Y
zT*12ZiZ;)j+3}X_$UvTh%F)zBUUk@;NBbO0+TO=cFOELc;W@d-Lh03RGQ?Jr8aNR&
zcE8y!I(&xb@~%#zYkqvLRG%WL@j>wKLeH^EP~Y_?8G|zQ?m%%?-n_NNJ#1x4TBtVq
z*3{GqJ+-eW>RH~gSNYP+Cd8{%XQ+YW-ov4I_<_ytOC}AOo6ui2+N;HTSRATuX0Z53
za15!$o_q5dayIRr|CIA{tgiED`4SYFroJx6Q!164!x-Y*7VPlrs~zgAEG7prUU>`E
z$G|15Ljul!WKNO^_^9nZeGgN?_x3#N?;Ji$h=aQ8nbJ@A3TwDhp`#nTwdD4*F;w0<
z3%|zLJj6w;9nuq7c$y7?N=H};9Qn)qBG)IyXo+iMfrf%tJe1@Ac-X({Kei6mS!VVE
zPT9Xq_iFJFyvTJKa8>nQLumm&-g$65?psyO-loqt&5F}%alKTupcDSpS7TFF1Dl&F
z)&f~sJ&G)vHaaIF#f#HTH=CRH_u|xZ?vVo$Y`NnM{HBHs63Rr*%KgOW?!(0d`=Amy
zGxe%ue-5OgHuB34X{C&cio=(v9xTzq$N|k>;ob&83O%benwZ#LT<=Ll<J$&d6=I;d
z9%HZcnf>FSh+S`9`8sW{14nY|FCB7sCKs9lseXX)O`K|&9=@bfq>z=JKcrdZbP{51
zsnd7)LNIrteC5T0TZ{a+$gw{)6lk_y0m@t-<%@8b9laSUxHsc`Je~>1id~O5I|6vl
zmNO?w0gK)to2y$cdCZiKhvJ3&u{8iL-*<)zo7WI`XLJ!nupv64T0Hn{bV{792Ybsj
zJL6Iq6CaW>T|#B`%_Np?Z2#m*f5CN9Xq{v0vl%bR$G6Wby1A7KRL@!*g|3fJgCbR+
zN3Xj^aLXR8<T&S7*4#;txW4xRJ?MjDUmi%5f&f1h@&v^_YD5(5`KO}?GI1rUsTV!`
zoA27{BoWa8Wn=lC8!zAw1wh)%BLSQ+{$SK0|KsA?jC3c3Hxgf@Bbx{kSTQ6tj&&>e
zXL$;i*T~$v{P(j12T&Zw%FY2)l!R{S3Xj-Bve(nFL_{b17W%9#o9PxfK37mz4XrmZ
zzrGRXAS~145&F+}DKsOi`3D47g&yN<506#f)n8sl8Sl*47S!Hio;6wbp>}<t?k0Vw
z@kV0plLJuTh|k@@g6q~5Z{Dm)wUqL}#VVY14HFIm@@ve?#)DUdCTgvTbPVa6?PkTL
zhTP%h#zRF9h;zQUXpEVy&_pZjUFc%c2+<Q+zA%sv9*-9gZXVO!-W_a~bV(%#rYknB
zL~xy@dj=-HXh$y9lUT%p)pS-k;uHS)eLkSU&1Flag-$c#Rl(arN6Y#51XrlEPGQy1
z?azO}_2kaG38!h-tmi|(J{nd2Ao0&+Zra_QaTHF0F3Qe0&gS$CSK`RB+iS#MzHUWZ
zjD6MUq3}QWsXvg(f?{T-Bb>D=-Vb44AnlKnY#X!KSrK;q+1k4jSsX}d$&v0|P~4ro
zU?-nefyZQT&%>#QcjQD~Aa8QQ?=D^NCoioEfA;}NvYWcXySyR;woyJlM<zu?Dd4p`
zOf2f~8vC<a9osZft(#P0@Sk%(hs0W+AOwPUG$D~nz?)>haEEI{ZdHTi?>@PP=vV5?
z*36MmRelK60%+k1#|1<<vp0EtzP!6$<9^+Lp|d|M@ZQVPq71H{%-Dp44Via~6WZ-}
ztOxLBX4KBbHLE}1JH6wa7qhZjwriZF$r+}^&Lz)<hb^11m9RKT8}`}!L@_{^t+5H;
z)Y`&F^Y|f=E9s_CRw-9QpVg*bz(ObW-W)Nz2Uq0z%lo&dzp*iI$55BO94WWMR(J9h
zZ9KkOvM=R5p=Gsv<Q2Qka2)T-gjw_fddin?+JDYBrNB-*iCZo_rDH!o?5?6)rlj)r
zfXvR^t>UolSic_;y91*<-j-J{^nAzHz(xZXU-pSYR3N-}*lM<uZ}2hOjW*W|`#heR
zUDl4h@4(Xf>u1)(FXO`(Dc)TxS#8plWf&i)VP<h2$_@wh)84;A^<uC;W%KnYA7{2_
z>c!=|Q;gh9ilnxdG-;KrECvlDK2F_@+p21@k{!A4es!^e%mjHC1zL-phjfYZ25(pE
z$+;~L&;!21aBK6tIePxC^V*(MdrxA!e1<?9E%zCH6a-O?Y8C<<a)!s79yJryU@2Qv
zLpJ~>nB0m_X4IDPq`E3fCHDEDg}bP5It)^6x@>!^>SSBpmG+QKZxlz2$6bXsK4|%s
zD`zy%$rrkPu~!aqX#M$BQNJZQYi$|n!=Y?t(fG0ILWJu1I49y8y70F!%?wA<p^gd)
z>9Ow-obC?NtEIgl@gm6fu845|m&K)?_^iZH!x^!?z4;p4tad&x`?vSY-9ie$_GOYZ
zbolinpN^`k^f*-=9Zw)f(+@IXF2}~-e~Y7@?Dq}(3@Lr-6`)<D5rv+nR{279AAZH>
zTN>P6WaRep@|QceY7eSudu@61f&#YUz}oiQgLc%Xo|d{#?f3m{LfVLp?H-3OS4@mp
ziSpJKZU|Gd&hsssI(?O+ud#3K>G^vqoLddB>}|Y<sT}pxG)QYn8BOL|u31K$vE6;B
zxc|&GqOF`)I`a3o!{o8ZU5?-Z<xpsWtoB5RFRC-T^YcrVrBKg1U~)QTS#;~Xz}jOW
zt{WWS6yx^LjIw0Fk!sQX3jaraA$wQ>r~Br<t5k{X-7B~j0dK-c4alpMuzW7IJh9Rv
z@bvkE@DRV^%p7{#LAd<+h_?K8Jepf}DRtcI;Kx&pf(5!8BB=1hEA1^I36j&Sui@3d
zv&UHgTwgD@ls~AX!@%jD^7ydgl#7F{RoXUbM|^hY$IDmqY0Y68->jrs;IqW7d&Q0+
zt<D)!@O`|v^?fM+HL3fB-NS~H$+A4#M<F#aWs#9Wb*lK=>8VIn*Gt6_!V|z8@u|2V
z>?myz27Jg-0GDsE7y0J)(Hj19Y~Ge+!TsnDfh}szFLdwpnT^+`h@AOYr55grcyqIB
z`_YgbxjL5l$*1+Wwm3^<IkfIQrO+)P4R`2WX&}xb7yu060O85R@CZ_<wA-J7mYgQ<
z;ycf|9x`!Lkk#_g<zL>&3mOvbC?V%%pg!E5`kwu(CLYj-*o%DnmN2HzrvZ?Vz@z0O
zOC_1N+z}LmN-~GAus;JGeW=wPY9M=@h-QY^4U+^8jsexIp6P}aykPgc5#JqkewYGV
z0g(X!^U8wn>#5W*pI<2q<UPkF&G32a(NU}DA<_2;D#@1g^bcclxwN{Dj*cN!xG{oq
zk$_<#=E6tv?|nJ*oEIqm+4duOkGYq;<L+PNd<nOCuB6xc?wI@=pIAZ2cQ0qN$|&W*
z(}m9_l=5g+oLQyFJ+;8E&lvp$FBAQSDa-T=my=WH!k5kaORA@@Xjk5!6E$(gVsT$a
z=2|+}Mn(`__K;}UYSCsxd3F&7_rYO(eB<CH>INBJV{bAEv3-##IAKXr!`O;}uy(uV
z5n0v*USY<-$S7#l2aS1}KrG!|bE+_6LmL;WN$3VS$9ZS(ur**JQiqIHCW5dacwLt}
zKLQ=NuIk$815UwC{#**Ee3fCVib0S>(Q2EpVww_aI*K?hx{VHn`tF-7@0ItjZ6(^~
zqFJD9cb-z{DN_2a-9isO<r)39OD6f0i`g&`_hDqtA{7paMevhpi@ka^jLBc^gT1+V
z?*Y{G$?2tb-lbM;uRvkhBi2kWl(4GF!Dgpdt7bj;WK#hX*CdMI^FKL3NAC`im9(90
zyG4~g0Vjg|(?{{)QWL*y)km5N?ejMNO;W0^vUkJNuL5F}{{bQ($oPBKeYaICqH1rG
zv2E)WP0=5r@Km)f^lRDa6**(vq<Bb<oF^rH9K%%hBX};K*LHV%MeA6SDjDE6s`-a=
z`FPjz?;d_V4?qV|%l@KSKmzw}dm_KZccKEZf4c%=#aSq`#gfiQyF<&Y^<F4m@d=<1
zc75rS&_m^{3$eV1ROPq~C`*`ANUPDj$h-G`Cjkrn678VrO*Hq~n7W0MNR-K*-$L|g
z`sVNNqq%98-N|IaCrkZ*)cjoHMy0U~@aEQ9*$ARFfL2%e`e(edYxw^Am<uwuXrt0u
z&i^cqv^fT%0&QKR*(s!f+5A6byv8Hd{w(%MrvDMC!f(%Q;czjUOo1hbf$J1y>RQ!p
zxLM6b^%#1s9Kl$!a8T`z>E$5I>gbBXuLd33+l}yGF0TLJ0E4dmM>S0+sbUyqG)_;q
z7njSni<BJVREYJ)I8i^unbcM%bwe<K65lR>TDoh)Fz0izo^;l*_>Ua!WDr3QCUZJh
zUmzlOaOF8<iZkpjJe}nzu<vTPh8K3)plWG70Bub*q#RDT{Tyduhm?M=&K!R3AG3RM
z=|Bd|CI1toV)XEMA34gGusQq~Z2O@wa58SGqfb0X27zm`hPknAePeA1_CmEv-1^H^
zE-lDFt3ryqxpVL$Tnaee@VpKJi}zAm=ag-0_aHM4NzJS*IFU}z@Rm#3S@*2{=-*q?
zHQq6!ctnaRQdEpiSU8H^mMmRqlfgM+v{0{!$gGpAqUn*{@6t#lJgL`GpCSk+WjE89
zJMxbT?qmLo4qVs9qdI##jv_^o2gr^BUgh9GIV6_yERd=Dsy5IoY?ODN2APT5v^+T8
zhp99Fq4$X7lEDY+aEEC8pCoufp7WCEZ*mvJP^8#|4Hrm8tGdYxhwQT)N03m&B)1Ad
zK41WM#I<RsoN4>waJsN#SC4t}_ex2z1VxgwGT+%7y8fG%u{|Bw5=y~t%U~7AHCzKy
zzH+!OvQQXU8X34w^M~9xAZ0i)Rc!%0+3&1vU2m!RG^R`D@P^d7c+u&~s@@P^N2}x`
zT#Ysw6%rD`l;xrLti9WR1*4am-0DCkzqf~AS`6?A=8v}GB($dtWIpul==fQKt^(uV
z<lRWEbyMPX>yv$!dm-2545{ovcT#9phUk-3l=ERh4WaLzwc-a&1O84|5P0Q<{`%_P
z)UxL~CB24&MrBb|%4K)bDcAxZMcFVB#if4pys-(U<@uVv-R~z#W*cx3?;gIDU&$Te
zU?-F<@zL4Y0=3;qK=|E%vfDN$9W%Pz2d*0xD5jxfo{amkKEs+T$2#r6Y=>`e=ir8~
z@j&FsL6$O&?DCa`UJ<_faa=o$7`i*q`LY}H*kNzu{9L4p_pjK_1U91AzYeYokH}p(
zQ#I6N@0iL^Ku}UhjIKhD{tZJKqBW2{VET4I0lBq;EbDk>W;xz~XH&JK`FOEbiq#?x
zg^!<~&5u((6r}0TrQG<Ww;>B3ER1#?i|P}nC-+9h{UcN_7m{0Y0&v2rg*cE=ckvkw
zx#fa<{{7+^9Tfs-YCk{f5b#9X95eYV5Vu?wf(PXuy1M#3>rUQLs&*(2=e>t|+3ix4
z^ZWTi+iny!^LX5Uof>~ey~?iT?OK}I)IsLZ`CVQ#wc<RHZYq6b@zZ}Tyx@#bIkM>#
zLU`*b2m#tc(y;+^FL(HxtQAgV%5X@KCGNjYfIH;Yp1n`O65itmqiXNWs#`)gZj<)c
zZleZ2<67$^LtbagG%hbUbg8E#uaIv9inOruQ~c?sxc65rJ)!;63+1f0u#a3;&=)`1
z6d|3F4aFDwZYB{iI$ag(GZhzo4^pWMrmCXqf}rpYlLlAhnXd=?|9(9|72=g+y}`(|
zhq?1*jr;69^JRfzhK(lHd(U47qQPBpT2o@=Xx#=LxGGcj_bbQw?+NM{3-Y4h>|@H9
zFNYs%$lUAZEMty`|Gr>1eeZu-%Y+O8LP{BbneuN1Sr04zzwX%kzucs_LZV!pyPOiL
UDQQIR+N97_(|uC=$nMqu0{vpyE&u=k

literal 0
HcmV?d00001

diff --git a/setup_env.py b/setup_env.py
index 9256324fb..801187228 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -41,6 +41,9 @@
     "tiiuae/Falcon3-1B-Instruct-1.58bit": {
         "model_name": "Falcon3-1B-Instruct-1.58bit",
     },
+    "microsoft/BitNet-b1.58-2B-4T": {
+        "model_name": "BitNet-b1.58-2B-4T",
+    },
 }
 
 SUPPORTED_QUANT_TYPES = {
@@ -161,6 +164,8 @@ def gen_code():
             run_command([sys.executable, "utils/codegen_tl1.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "128,64,128,64", "--bm", "32,64,32,64"], log_step="codegen")
         elif get_model_name() == "bitnet_b1_58-3B":
             run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "64,128,64", "--bm", "32,64,32"], log_step="codegen")
+        elif get_model_name() == "BitNet-b1.58-2B-4T":
+            run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "64,128,64", "--bm", "32,64,32"], log_step="codegen")
         else:
             raise NotImplementedError()
     else:
@@ -177,6 +182,8 @@ def gen_code():
             run_command([sys.executable, "utils/codegen_tl2.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "96,96,96,96", "--bm", "32,32,32,32"], log_step="codegen")
         elif get_model_name() == "bitnet_b1_58-3B":
             run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")
+        elif get_model_name() == "BitNet-b1.58-2B-4T":
+            run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")    
         else:
             raise NotImplementedError()
 
@@ -222,4 +229,4 @@ def signal_handler(sig, frame):
     args = parse_args()
     Path(args.log_dir).mkdir(parents=True, exist_ok=True)
     logging.basicConfig(level=logging.INFO)
-    main()
+    main()
\ No newline at end of file

From 0e7dadba1efa030ecd5a9ea94d5d7881d051ecb5 Mon Sep 17 00:00:00 2001
From: Yan Xia <59006636+sd983527@users.noreply.github.com>
Date: Tue, 15 Apr 2025 15:24:42 +0800
Subject: [PATCH 04/11] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index df6f718ff..fa3f2f5a7 100644
--- a/README.md
+++ b/README.md
@@ -176,7 +176,7 @@ pip install -r requirements.txt
 3. Build the project
 ```bash
 # Manually download the model and run with local path
-huggingface-cli download microsoft/BitNet-b1.58-2B-4T --local-dir models/BitNet-b1.58-2B-4T
+huggingface-cli download microsoft/BitNet-b1.58-2B-4T-gguf --local-dir models/BitNet-b1.58-2B-4T
 python setup_env.py -md models/BitNet-b1.58-2B-4T -q i2_s
 
 # Or you can download a model from Hugging Face, convert it to quantized gguf format, and build the project

From 8f75f99c721c38d4490b31d5cc6802c0d020bd87 Mon Sep 17 00:00:00 2001
From: Yan Xia <59006636+sd983527@users.noreply.github.com>
Date: Tue, 15 Apr 2025 17:07:20 +0800
Subject: [PATCH 05/11] Update README.md (#172)

add two FAQs for Windows build questions.
---
 README.md | 36 ++++++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index fa3f2f5a7..3f72ef0aa 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 ![version](https://img.shields.io/badge/version-1.0-blue)
 
-<img src="./assets/header_model_release.png" alt="BitNet Model on Hugging Face" width="800"/>
+[<img src="./assets/header_model_release.png" alt="BitNet Model on Hugging Face" width="800"/>](https://huggingface.co/microsoft/BitNet-b1.58-2B-4T)
 
 bitnet.cpp is the official inference framework for 1-bit LLMs (e.g., BitNet b1.58). It offers a suite of optimized kernels, that support **fast** and **lossless** inference of 1.58-bit models on CPU (with NPU and GPU support coming next).
 
@@ -158,7 +158,7 @@ This project is based on the [llama.cpp](https://github.com/ggerganov/llama.cpp)
 ### Build from source
 
 > [!IMPORTANT]
-> If you are using Windows, please remember to always use a Developer Command Prompt / PowerShell for VS2022 for the following commands
+> If you are using Windows, please remember to always use a Developer Command Prompt / PowerShell for VS2022 for the following commands. Please refer to the FAQs below if you see any issues.
 
 1. Clone the repo
 ```bash
@@ -278,4 +278,36 @@ python utils/generate-dummy-bitnet-model.py models/bitnet_b1_58-large --outfile
 # Run benchmark with the generated model, use -m to specify the model path, -p to specify the prompt processed, -n to specify the number of token to generate
 python utils/e2e_benchmark.py -m models/dummy-bitnet-125m.tl1.gguf -p 512 -n 128
 ```
+### FAQ (Frequently Asked Questions)📌 
 
+#### Q1: Why does the build fail with errors in llama.cpp related to std::chrono in log.cpp?
+
+**A:**
+This is an issue introduced in a recent version of llama.cpp. Please refer to this [commit](https://github.com/tinglou/llama.cpp/commit/4e3db1e3d78cc1bcd22bcb3af54bd2a4628dd323) in the [discussion](https://github.com/abetlen/llama-cpp-python/issues/1942) to fix this issue.
+
+#### Q2: How to build with clang in a conda environment on Windows?
+
+**A:** 
+Before building the project, verify your clang installation and access to Visual Studio tools by running:
+```
+clang -v
+```
+
+This command checks that you are using the correct version of clang and that the Visual Studio tools are available. If you see an error message such as:
+```
+'clang' is not recognized as an internal or external command, operable program or batch file.
+```
+
+It indicates that your command line window is not properly initialized for Visual Studio tools.
+
+• If you are using Command Prompt, run:
+```
+"C:\Program Files\Microsoft Visual Studio\2022\Professional\Common7\Tools\VsDevCmd.bat" -startdir=none -arch=x64 -host_arch=x64
+```
+
+• If you are using Windows PowerShell, run the following commands:
+```
+Import-Module "C:\Program Files\Microsoft Visual Studio\2022\Professional\Common7\Tools\Microsoft.VisualStudio.DevShell.dll"; Enter-VsDevShell 3f0e31ad -SkipAutomaticLocation -DevCmdArguments "-arch=x64 -host_arch=x64"
+```
+
+These steps will initialize your environment and allow you to use the correct Visual Studio tools.

From 1c77bd8966a4d4d5cfb235ca9c1df41fe2b47643 Mon Sep 17 00:00:00 2001
From: Yan Xia <59006636+sd983527@users.noreply.github.com>
Date: Tue, 15 Apr 2025 17:11:23 +0800
Subject: [PATCH 06/11] Update README.md

---
 README.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/README.md b/README.md
index 3f72ef0aa..bcaff9e3a 100644
--- a/README.md
+++ b/README.md
@@ -179,9 +179,6 @@ pip install -r requirements.txt
 huggingface-cli download microsoft/BitNet-b1.58-2B-4T-gguf --local-dir models/BitNet-b1.58-2B-4T
 python setup_env.py -md models/BitNet-b1.58-2B-4T -q i2_s
 
-# Or you can download a model from Hugging Face, convert it to quantized gguf format, and build the project
-python setup_env.py --hf-repo tiiuae/Falcon3-7B-Instruct-1.58bit -q i2_s
-
 ```
 <pre>
 usage: setup_env.py [-h] [--hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}] [--model-dir MODEL_DIR] [--log-dir LOG_DIR] [--quant-type {i2_s,tl1}] [--quant-embd]

From 71fdd9472fa91d9498e8ebc1f89a7ebbdf5cc172 Mon Sep 17 00:00:00 2001
From: tsong <tsong@microsoft.com>
Date: Tue, 15 Apr 2025 14:36:05 +0000
Subject: [PATCH 07/11] add third-party demo

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index bcaff9e3a..e4708bbca 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,8 @@
 
 [<img src="./assets/header_model_release.png" alt="BitNet Model on Hugging Face" width="800"/>](https://huggingface.co/microsoft/BitNet-b1.58-2B-4T)
 
+Try it out via this [demo hosted by third-party](https://bitnet-demo.azurewebsites.net/)， or [build and run](https://github.com/microsoft/BitNet?tab=readme-ov-file#build-from-source) it on your own CPU.
+
 bitnet.cpp is the official inference framework for 1-bit LLMs (e.g., BitNet b1.58). It offers a suite of optimized kernels, that support **fast** and **lossless** inference of 1.58-bit models on CPU (with NPU and GPU support coming next).
 
 The first release of bitnet.cpp is to support inference on CPUs. bitnet.cpp achieves speedups of **1.37x** to **5.07x** on ARM CPUs, with larger models experiencing greater performance gains. Additionally, it reduces energy consumption by **55.4%** to **70.0%**, further boosting overall efficiency. On x86 CPUs, speedups range from **2.37x** to **6.17x** with energy reductions between **71.9%** to **82.2%**. Furthermore, bitnet.cpp can run a 100B BitNet b1.58 model on a single CPU, achieving speeds comparable to human reading (5-7 tokens per second), significantly enhancing the potential for running LLMs on local devices. Please refer to the [technical report](https://arxiv.org/abs/2410.16144) for more details.

From 874e6bd5fb1453ba39930264132a82c72c06e328 Mon Sep 17 00:00:00 2001
From: tsong <tsong@microsoft.com>
Date: Wed, 16 Apr 2025 04:34:59 +0000
Subject: [PATCH 08/11] refine readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e4708bbca..4af4626b6 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 
 [<img src="./assets/header_model_release.png" alt="BitNet Model on Hugging Face" width="800"/>](https://huggingface.co/microsoft/BitNet-b1.58-2B-4T)
 
-Try it out via this [demo hosted by third-party](https://bitnet-demo.azurewebsites.net/)， or [build and run](https://github.com/microsoft/BitNet?tab=readme-ov-file#build-from-source) it on your own CPU.
+Try it out via this [demo](https://bitnet-demo.azurewebsites.net/), or [build and run](https://github.com/microsoft/BitNet?tab=readme-ov-file#build-from-source) it on your own CPU.
 
 bitnet.cpp is the official inference framework for 1-bit LLMs (e.g., BitNet b1.58). It offers a suite of optimized kernels, that support **fast** and **lossless** inference of 1.58-bit models on CPU (with NPU and GPU support coming next).
 

From 488dc1e876b79ad60745a3a7338d34600d7be00d Mon Sep 17 00:00:00 2001
From: junhuihe <junhui-he@outlook.com>
Date: Tue, 22 Apr 2025 17:28:59 +0800
Subject: [PATCH 09/11] Fix model architecture name

---
 3rdparty/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
index a8ac7072a..5eb47b721 160000
--- a/3rdparty/llama.cpp
+++ b/3rdparty/llama.cpp
@@ -1 +1 @@
-Subproject commit a8ac7072ae02ffd68b4b661db0ebd2689fb82b7f
+Subproject commit 5eb47b72106e3b35f10e8befa616a9241242b226

From 1792346223ac13d4e26b65e3d0397b60d6ed3746 Mon Sep 17 00:00:00 2001
From: Benjamin Wegener <wegeneredv@gmail.com>
Date: Thu, 8 May 2025 10:22:12 +0200
Subject: [PATCH 10/11] Add run_inference_server.py for Running llama.cpp
 Built-in Server (#204)

* Update CMakeLists.txt

I added a CMake option to compile the Llama.cpp server. This update allows us to easily build and deploy the server using BitNet

* Create run_inference_server.py

same as run_inference, but for use with llama.cpp's built in server, for some extra comfort

In particular:
- The build directory is determined based on whether the system is running on Windows or not.
- A list of arguments (`--model`, `-m` etc.) is created.
- The main argument list is parsed and passed to the `subprocess.run()` method to execute the system command.
---
 CMakeLists.txt          |  3 +-
 run_inference_server.py | 64 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 1 deletion(-)
 create mode 100644 run_inference_server.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6ddaa51f7..5c8382e34 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,6 +39,7 @@ endif()
 find_package(Threads REQUIRED)
 
 add_subdirectory(src)
+set(LLAMA_BUILD_SERVER ON CACHE BOOL "Build llama.cpp server" FORCE)
 add_subdirectory(3rdparty/llama.cpp)
 
 # install
@@ -74,4 +75,4 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
 
 set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/llama.h)
-install(TARGETS llama LIBRARY PUBLIC_HEADER)
\ No newline at end of file
+install(TARGETS llama LIBRARY PUBLIC_HEADER)
diff --git a/run_inference_server.py b/run_inference_server.py
new file mode 100644
index 000000000..9b0f10d53
--- /dev/null
+++ b/run_inference_server.py
@@ -0,0 +1,64 @@
+import os
+import sys
+import signal
+import platform
+import argparse
+import subprocess
+
+def run_command(command, shell=False):
+    """Run a system command and ensure it succeeds."""
+    try:
+        subprocess.run(command, shell=shell, check=True)
+    except subprocess.CalledProcessError as e:
+        print(f"Error occurred while running command: {e}")
+        sys.exit(1)
+
+def run_server():
+    build_dir = "build"
+    if platform.system() == "Windows":
+        server_path = os.path.join(build_dir, "bin", "Release", "llama-server.exe")
+        if not os.path.exists(server_path):
+            server_path = os.path.join(build_dir, "bin", "llama-server")
+    else:
+        server_path = os.path.join(build_dir, "bin", "llama-server")
+    
+    command = [
+        f'{server_path}',
+        '-m', args.model,
+        '-c', str(args.ctx_size),
+        '-t', str(args.threads),
+        '-n', str(args.n_predict),
+        '-ngl', '0',
+        '--temp', str(args.temperature),
+        '--host', args.host,
+        '--port', str(args.port),
+        '-cb'  # Enable continuous batching
+    ]
+    
+    if args.prompt:
+        command.extend(['-p', args.prompt])
+    
+    # Note: -cnv flag is removed as it's not supported by the server
+    
+    print(f"Starting server on {args.host}:{args.port}")
+    run_command(command)
+
+def signal_handler(sig, frame):
+    print("Ctrl+C pressed, shutting down server...")
+    sys.exit(0)
+
+if __name__ == "__main__":
+    signal.signal(signal.SIGINT, signal_handler)
+    
+    parser = argparse.ArgumentParser(description='Run llama.cpp server')
+    parser.add_argument("-m", "--model", type=str, help="Path to model file", required=False, default="models/bitnet_b1_58-3B/ggml-model-i2_s.gguf")
+    parser.add_argument("-p", "--prompt", type=str, help="System prompt for the model", required=False)
+    parser.add_argument("-n", "--n-predict", type=int, help="Number of tokens to predict", required=False, default=4096)
+    parser.add_argument("-t", "--threads", type=int, help="Number of threads to use", required=False, default=2)
+    parser.add_argument("-c", "--ctx-size", type=int, help="Size of the context window", required=False, default=2048)
+    parser.add_argument("--temperature", type=float, help="Temperature for sampling", required=False, default=0.8)
+    parser.add_argument("--host", type=str, help="IP address to listen on", required=False, default="127.0.0.1")
+    parser.add_argument("--port", type=int, help="Port to listen on", required=False, default=8080)
+    
+    args = parser.parse_args()
+    run_server()

From c9e752c9d705fbbbdca474a9ce8e112bde9cc8e0 Mon Sep 17 00:00:00 2001
From: Benjamin Wegener <wegeneredv@gmail.com>
Date: Thu, 8 May 2025 10:22:45 +0200
Subject: [PATCH 11/11] Fix build error with GCC by forcing Clang compiler in
 CMake on android/aarch64 (#242)

GCC does not recognize Clang-specific warning flags like
-Wunreachable-code-break and -Wunreachable-code-return, which are passed
by upstream submodules (e.g., ggml). This patch forces CMake to use Clang
via command-line arguments, avoiding the need to patch nested submodules.

This resolves compiler errors without modifying submodule source code.
---
 setup_env.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup_env.py b/setup_env.py
index 801187228..dfad6c3e7 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -199,7 +199,7 @@ def compile():
         logging.error(f"Arch {arch} is not supported yet")
         exit(0)
     logging.info("Compiling the code using CMake.")
-    run_command(["cmake", "-B", "build", *COMPILER_EXTRA_ARGS[arch], *OS_EXTRA_ARGS.get(platform.system(), [])], log_step="generate_build_files")
+    run_command(["cmake", "-B", "build", *COMPILER_EXTRA_ARGS[arch], *OS_EXTRA_ARGS.get(platform.system(), []), "-DCMAKE_C_COMPILER=clang", "-DCMAKE_CXX_COMPILER=clang++"], log_step="generate_build_files")
     # run_command(["cmake", "--build", "build", "--target", "llama-cli", "--config", "Release"])
     run_command(["cmake", "--build", "build", "--config", "Release"], log_step="compile")
 
@@ -229,4 +229,4 @@ def signal_handler(sig, frame):
     args = parse_args()
     Path(args.log_dir).mkdir(parents=True, exist_ok=True)
     logging.basicConfig(level=logging.INFO)
-    main()
\ No newline at end of file
+    main()