yashuatla · yashuatla · Mar 12, 2025 · Mar 12, 2025 · Apr 15, 2025 · Apr 15, 2025
diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -39,6 +39,7 @@ endif()
 find_package(Threads REQUIRED)
 
 add_subdirectory(src)
+# Project-level switch for the llama.cpp server. It defaults to ON; configure
+# with -DBITNET_BUILD_SERVER=OFF to leave the server out of the build.
+option(BITNET_BUILD_SERVER "Build llama.cpp server" ON)
+# The subproject's own cache entry is forced to follow the switch above, so a
+# value cached by an earlier configure cannot disagree with it.
+set(LLAMA_BUILD_SERVER "${BITNET_BUILD_SERVER}" CACHE BOOL "Build llama.cpp server" FORCE)
 add_subdirectory(3rdparty/llama.cpp)
 
 # install
@@ -74,4 +75,4 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
 
 set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/llama.h)
-install(TARGETS llama LIBRARY PUBLIC_HEADER)
+install(TARGETS llama LIBRARY PUBLIC_HEADER)
diff --git a/README.md b/README.md
@@ -2,6 +2,10 @@
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 ![version](https://img.shields.io/badge/version-1.0-blue)
 
+[<img src="./assets/header_model_release.png" alt="BitNet Model on Hugging Face" width="800"/>](https://huggingface.co/microsoft/BitNet-b1.58-2B-4T)
+
+Try it out via this [demo](https://bitnet-demo.azurewebsites.net/), or [build and run](https://github.com/microsoft/BitNet?tab=readme-ov-file#build-from-source) it on your own CPU.
+
 bitnet.cpp is the official inference framework for 1-bit LLMs (e.g., BitNet b1.58). It offers a suite of optimized kernels, that support **fast** and **lossless** inference of 1.58-bit models on CPU (with NPU and GPU support coming next).
 
 The first release of bitnet.cpp is to support inference on CPUs. bitnet.cpp achieves speedups of **1.37x** to **5.07x** on ARM CPUs, with larger models experiencing greater performance gains. Additionally, it reduces energy consumption by **55.4%** to **70.0%**, further boosting overall efficiency. On x86 CPUs, speedups range from **2.37x** to **6.17x** with energy reductions between **71.9%** to **82.2%**. Furthermore, bitnet.cpp can run a 100B BitNet b1.58 model on a single CPU, achieving speeds comparable to human reading (5-7 tokens per second), significantly enhancing the potential for running LLMs on local devices. Please refer to the [technical report](https://arxiv.org/abs/2410.16144) for more details.
@@ -18,7 +22,8 @@ A demo of bitnet.cpp running a BitNet b1.58 3B model on Apple M2:
 https://github.com/user-attachments/assets/7f46b736-edec-4828-b809-4be780a3e5b1
 
 ## What's New:
-- 02/18/2025 [Bitnet.cpp: Efficient Edge Inference for Ternary LLMs](https://arxiv.org/abs/2502.11880) ![NEW](https://img.shields.io/badge/NEW-red)
+- 04/14/2025 [BitNet Official 2B Parameter Model on Hugging Face](https://huggingface.co/microsoft/BitNet-b1.58-2B-4T) ![NEW](https://img.shields.io/badge/NEW-red)
+- 02/18/2025 [Bitnet.cpp: Efficient Edge Inference for Ternary LLMs](https://arxiv.org/abs/2502.11880)
 - 11/08/2024 [BitNet a4.8: 4-bit Activations for 1-bit LLMs](https://arxiv.org/abs/2411.04965)
 - 10/21/2024 [1-bit AI Infra: Part 1.1, Fast and Lossless BitNet b1.58 Inference on CPUs](https://arxiv.org/abs/2410.16144)
 - 10/17/2024 bitnet.cpp 1.0 released.
@@ -29,9 +34,38 @@ https://github.com/user-attachments/assets/7f46b736-edec-4828-b809-4be780a3e5b1
 ## Acknowledgements
 
 This project is based on the [llama.cpp](https://github.com/ggerganov/llama.cpp) framework. We would like to thank all the authors for their contributions to the open-source community. Also, bitnet.cpp's kernels are built on top of the Lookup Table methodologies pioneered in [T-MAC](https://github.com/microsoft/T-MAC/). For inference of general low-bit LLMs beyond ternary models, we recommend using T-MAC.
+## Official Models
+<table>
+    <tr>
+        <th rowspan="2">Model</th>
+        <th rowspan="2">Parameters</th>
+        <th rowspan="2">CPU</th>
+        <th colspan="3">Kernel</th>
+    </tr>
+    <tr>
+        <th>I2_S</th>
+        <th>TL1</th>
+        <th>TL2</th>
+    </tr>
+    <tr>
+        <td rowspan="2"><a href="https://huggingface.co/microsoft/BitNet-b1.58-2B-4T">BitNet-b1.58-2B-4T</a></td>
+        <td rowspan="2">2.4B</td>
+        <td>x86</td>
+        <td>&#9989;</td>
+        <td>&#10060;</td>
+        <td>&#9989;</td>
+    </tr>
+    <tr>
+        <td>ARM</td>
+        <td>&#9989;</td>
+        <td>&#9989;</td>
+        <td>&#10060;</td>
+    </tr>
+</table>
 
 ## Supported Models
-❗️**We use existing 1-bit LLMs available on [Hugging Face](https://huggingface.co/) to demonstrate the inference capabilities of bitnet.cpp. These models are neither trained nor released by Microsoft. We hope the release of bitnet.cpp will inspire the development of 1-bit LLMs in large-scale settings in terms of model size and training tokens.**
+❗️**We use existing 1-bit LLMs available on [Hugging Face](https://huggingface.co/) to demonstrate the inference capabilities of bitnet.cpp. We hope the release of bitnet.cpp will inspire the development of 1-bit LLMs in large-scale settings in terms of model size and training tokens.**
 
 <table>
     </tr>
@@ -126,7 +160,7 @@ This project is based on the [llama.cpp](https://github.com/ggerganov/llama.cpp)
 ### Build from source
 
 > [!IMPORTANT]
-> If you are using Windows, please remember to always use a Developer Command Prompt / PowerShell for VS2022 for the following commands
+> If you are using Windows, please remember to always use a Developer Command Prompt / PowerShell for VS2022 for the following commands. Please refer to the FAQs below if you see any issues.
 
 1. Clone the repo
 ```bash
@@ -143,12 +177,10 @@ pip install -r requirements.txt
 ```
 3. Build the project
 ```bash
-# Download the model from Hugging Face, convert it to quantized gguf format, and build the project
-python setup_env.py --hf-repo tiiuae/Falcon3-7B-Instruct-1.58bit -q i2_s
+# Manually download the model and run with local path
+huggingface-cli download microsoft/BitNet-b1.58-2B-4T-gguf --local-dir models/BitNet-b1.58-2B-4T
+python setup_env.py -md models/BitNet-b1.58-2B-4T -q i2_s
 
-# Or you can manually download the model and run with local path
-huggingface-cli download tiiuae/Falcon3-7B-Instruct-1.58bit --local-dir models/Falcon3-7B-Instruct-1.58bit
-python setup_env.py -md models/Falcon3-7B-Instruct-1.58bit -q i2_s
 ```
 <pre>
 usage: setup_env.py [-h] [--hf-repo {1bitLLM/bitnet_b1_58-large,1bitLLM/bitnet_b1_58-3B,HF1BitLLM/Llama3-8B-1.58-100B-tokens,tiiuae/Falcon3-1B-Instruct-1.58bit,tiiuae/Falcon3-3B-Instruct-1.58bit,tiiuae/Falcon3-7B-Instruct-1.58bit,tiiuae/Falcon3-10B-Instruct-1.58bit}] [--model-dir MODEL_DIR] [--log-dir LOG_DIR] [--quant-type {i2_s,tl1}] [--quant-embd]
@@ -173,7 +205,7 @@ optional arguments:
 ### Basic usage
 ```bash
 # Run inference with the quantized model
-python run_inference.py -m models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gguf -p "You are a helpful assistant" -cnv
+python run_inference.py -m models/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf -p "You are a helpful assistant" -cnv
 ```
 <pre>
 usage: run_inference.py [-h] [-m MODEL] [-n N_PREDICT] -p PROMPT [-t THREADS] [-c CTX_SIZE] [-temp TEMPERATURE] [-cnv]
@@ -245,5 +277,36 @@ python utils/generate-dummy-bitnet-model.py models/bitnet_b1_58-large --outfile
 # Run benchmark with the generated model, use -m to specify the model path, -p to specify the prompt processed, -n to specify the number of token to generate
 python utils/e2e_benchmark.py -m models/dummy-bitnet-125m.tl1.gguf -p 512 -n 128
 ```
+### FAQ (Frequently Asked Questions)📌 
 
+#### Q1: The build fails with errors while compiling llama.cpp, caused by std::chrono in log.cpp. How do I fix it?
+
+**A:**
+This issue was introduced in a recent version of llama.cpp. To fix it, please refer to this [commit](https://github.com/tinglou/llama.cpp/commit/4e3db1e3d78cc1bcd22bcb3af54bd2a4628dd323) from the [discussion](https://github.com/abetlen/llama-cpp-python/issues/1942).
+
+#### Q2: How do I build with clang in a conda environment on Windows?
+
+**A:** 
+Before building the project, verify your clang installation and access to Visual Studio tools by running:
+```
+clang -v
+```
+
+This command checks that you are using the correct version of clang and that the Visual Studio tools are available. If you see an error message such as:
+```
+'clang' is not recognized as an internal or external command, operable program or batch file.
+```
+
+It indicates that your command line window is not properly initialized for Visual Studio tools.
+
+• If you are using Command Prompt, run:
+```
+"C:\Program Files\Microsoft Visual Studio\2022\Professional\Common7\Tools\VsDevCmd.bat" -startdir=none -arch=x64 -host_arch=x64
+```
+
+• If you are using Windows PowerShell, run the following commands:
+```
+Import-Module "C:\Program Files\Microsoft Visual Studio\2022\Professional\Common7\Tools\Microsoft.VisualStudio.DevShell.dll"; Enter-VsDevShell 3f0e31ad -SkipAutomaticLocation -DevCmdArguments "-arch=x64 -host_arch=x64"
+```
 
+These steps will initialize your environment and allow you to use the correct Visual Studio tools.
diff --git a/assets/header_model_release.png b/assets/header_model_release.png
diff --git a/run_inference_server.py b/run_inference_server.py
@@ -0,0 +1,64 @@
+import os
+import sys
+import signal
+import platform
+import argparse
+import subprocess
+
+def run_command(command, shell=False):
+    """Run a system command and ensure it succeeds."""
+    try:
+        subprocess.run(command, shell=shell, check=True)
+    except subprocess.CalledProcessError as e:
+        print(f"Error occurred while running command: {e}")
+        sys.exit(1)
+
+def run_server():
+    build_dir = "build"
+    if platform.system() == "Windows":
+        server_path = os.path.join(build_dir, "bin", "Release", "llama-server.exe")
+        if not os.path.exists(server_path):
+            server_path = os.path.join(build_dir, "bin", "llama-server")
+    else:
+        server_path = os.path.join(build_dir, "bin", "llama-server")
+
+    command = [
+        f'{server_path}',
+        '-m', args.model,
+        '-c', str(args.ctx_size),
+        '-t', str(args.threads),
+        '-n', str(args.n_predict),
+        '-ngl', '0',
+        '--temp', str(args.temperature),
+        '--host', args.host,
+        '--port', str(args.port),
+        '-cb'  # Enable continuous batching
+    ]
+
+    if args.prompt:
+        command.extend(['-p', args.prompt])
+
+    # Note: -cnv flag is removed as it's not supported by the server
+
+    print(f"Starting server on {args.host}:{args.port}")
+    run_command(command)
+
+def signal_handler(sig, frame):
+    print("Ctrl+C pressed, shutting down server...")
+    sys.exit(0)
+
+if __name__ == "__main__":
+    signal.signal(signal.SIGINT, signal_handler)
+
+    parser = argparse.ArgumentParser(description='Run llama.cpp server')
+    parser.add_argument("-m", "--model", type=str, help="Path to model file", required=False, default="models/bitnet_b1_58-3B/ggml-model-i2_s.gguf")
+    parser.add_argument("-p", "--prompt", type=str, help="System prompt for the model", required=False)
+    parser.add_argument("-n", "--n-predict", type=int, help="Number of tokens to predict", required=False, default=4096)
+    parser.add_argument("-t", "--threads", type=int, help="Number of threads to use", required=False, default=2)
+    parser.add_argument("-c", "--ctx-size", type=int, help="Size of the context window", required=False, default=2048)
+    parser.add_argument("--temperature", type=float, help="Temperature for sampling", required=False, default=0.8)
+    parser.add_argument("--host", type=str, help="IP address to listen on", required=False, default="127.0.0.1")
+    parser.add_argument("--port", type=int, help="Port to listen on", required=False, default=8080)
+
+    args = parser.parse_args()
+    run_server()
diff --git a/setup_env.py b/setup_env.py
@@ -41,6 +41,9 @@
     "tiiuae/Falcon3-1B-Instruct-1.58bit": {
         "model_name": "Falcon3-1B-Instruct-1.58bit",
     },
+    "microsoft/BitNet-b1.58-2B-4T": {
+        "model_name": "BitNet-b1.58-2B-4T",
+    },
 }
 
 SUPPORTED_QUANT_TYPES = {
@@ -161,6 +164,8 @@ def gen_code():
             run_command([sys.executable, "utils/codegen_tl1.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "128,64,128,64", "--bm", "32,64,32,64"], log_step="codegen")
         elif get_model_name() == "bitnet_b1_58-3B":
             run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "64,128,64", "--bm", "32,64,32"], log_step="codegen")
+        elif get_model_name() == "BitNet-b1.58-2B-4T":
+            run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "64,128,64", "--bm", "32,64,32"], log_step="codegen")
         else:
             raise NotImplementedError()
     else:
@@ -177,6 +182,8 @@ def gen_code():
             run_command([sys.executable, "utils/codegen_tl2.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "96,96,96,96", "--bm", "32,32,32,32"], log_step="codegen")
         elif get_model_name() == "bitnet_b1_58-3B":
             run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")
+        elif get_model_name() == "BitNet-b1.58-2B-4T":
+            run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")    
         else:
             raise NotImplementedError()
 
@@ -192,7 +199,7 @@ def compile():
         logging.error(f"Arch {arch} is not supported yet")
         exit(0)
     logging.info("Compiling the code using CMake.")
-    run_command(["cmake", "-B", "build", *COMPILER_EXTRA_ARGS[arch], *OS_EXTRA_ARGS.get(platform.system(), [])], log_step="generate_build_files")
+    run_command(["cmake", "-B", "build", *COMPILER_EXTRA_ARGS[arch], *OS_EXTRA_ARGS.get(platform.system(), []), "-DCMAKE_C_COMPILER=clang", "-DCMAKE_CXX_COMPILER=clang++"], log_step="generate_build_files")
-    run_command(["cmake", "-B", "build", *COMPILER_EXTRA_ARGS[arch], *OS_EXTRA_ARGS.get(platform.system(), []), "-DCMAKE_C_COMPILER=clang", "-DCMAKE_CXX_COMPILER=clang++"], log_step="generate_build_files")
+    run_command(["cmake", "-B", "build", *COMPILER_EXTRA_ARGS[arch], *OS_EXTRA_ARGS.get(platform.system(), [])], log_step="generate_build_files")
-    run_command(["cmake", "-B", "build", *COMPILER_EXTRA_ARGS[arch], *OS_EXTRA_ARGS.get(platform.system(), []), "-DCMAKE_C_COMPILER=clang", "-DCMAKE_CXX_COMPILER=clang++"], log_step="generate_build_files")
+    run_command(["cmake", "-B", "build", *COMPILER_EXTRA_ARGS[arch], *OS_EXTRA_ARGS.get(platform.system(), [])], log_step="generate_build_files")
     # run_command(["cmake", "--build", "build", "--target", "llama-cli", "--config", "Release"])
     run_command(["cmake", "--build", "build", "--config", "Release"], log_step="compile")
+24 −0		gguf-py/gguf/constants.py
+5 −0		gguf-py/gguf/tensor_mapping.py
+332 −1		src/llama.cpp