[Serving] Support tool function calls under strict format constraints #3190

Open

Irfnfnkemed wants to merge 22 commits into mlc-ai:main from Irfnfnkemed:tool_call

Commits (22)
712f1d5
[Grammar] Upgrade xgrammar to latest version
Irfnfnkemed Mar 14, 2025
8a84e59
Merge branch 'mlc-ai:main' into main
Irfnfnkemed Mar 20, 2025
6e37a0b
Updated submodule-xgrammar references
Irfnfnkemed Mar 20, 2025
f70a37a
[fix] modify cmake since xgrammar use nanobind to replace pybind
Irfnfnkemed Mar 20, 2025
6e7426e
[feature] Support tool function calling with Structural Tag with xgra…
Irfnfnkemed Mar 20, 2025
4f980f3
[feat] Add Structural-Tag api to RequestResponseFormat
Irfnfnkemed Mar 24, 2025
8b70dd7
[fix] type annotation in test scripts
Irfnfnkemed Mar 24, 2025
7df3f34
[feat] Support tool function calls under strict format constraints
Irfnfnkemed Mar 26, 2025
6a97012
Merge branch 'mlc-ai:main' into tool_call
Irfnfnkemed Mar 26, 2025
379ce42
[fix] Trigger CI on branch
Irfnfnkemed Mar 26, 2025
a5143a2
[format] adjust the format
Irfnfnkemed Mar 26, 2025
5605045
[fix] fix some typos
Irfnfnkemed Mar 26, 2025
3f277e6
[feat] Add 'json' format mode for tool calls
Irfnfnkemed Mar 28, 2025
6119904
Merge branch 'mlc-ai:main' into tool_call
Irfnfnkemed Mar 28, 2025
98d9671
Merge branch 'mlc-ai:main' into tool_call
Irfnfnkemed Apr 15, 2025
5c99831
[feat] initial version of the stag-eval script
Irfnfnkemed Mar 31, 2025
b8c331a
[feat] using structural tag to support func-call with the latest version
Irfnfnkemed Apr 15, 2025
c0f235f
[fix] remove CI check for other branch
Irfnfnkemed Apr 15, 2025
7e4da8f
[fix] missing type check for RequestFormat type
Irfnfnkemed Apr 15, 2025
3ddacf0
Update config.h
Irfnfnkemed Apr 15, 2025
9dace3a
[fix] remove unexpected prompt when don't use func-call
Irfnfnkemed Apr 24, 2025
2fba444
Merge branch 'tool_call' of github.com:Irfnfnkemed/mlc-llm into tool_…
Irfnfnkemed Apr 24, 2025
2 changes: 1 addition & 1 deletion 3rdparty/tvm
Submodule tvm updated 111 files
2 changes: 1 addition & 1 deletion 3rdparty/xgrammar
Submodule xgrammar updated 158 files
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -71,7 +71,7 @@ add_subdirectory(${TOKENZIER_CPP_PATH} tokenizers EXCLUDE_FROM_ALL)
set(XGRAMMAR_PATH 3rdparty/xgrammar)
tvm_file_glob(GLOB_RECURSE MLC_LLM_SRCS cpp/*.cc)
tvm_file_glob(GLOB_RECURSE XGRAMMAR_SRCS ${XGRAMMAR_PATH}/cpp/*.cc)
-list(FILTER XGRAMMAR_SRCS EXCLUDE REGEX "${XGRAMMAR_PATH}/cpp/pybind/.*\\.cc")
+list(FILTER XGRAMMAR_SRCS EXCLUDE REGEX "${XGRAMMAR_PATH}/cpp/nanobind/.*\\.cc")
list(APPEND MLC_LLM_SRCS ${XGRAMMAR_SRCS})
add_library(mlc_llm_objs OBJECT ${MLC_LLM_SRCS})

54 changes: 51 additions & 3 deletions cpp/serve/config.cc
@@ -42,13 +42,43 @@ Result<ResponseFormat> ResponseFormat::FromJSON(const picojson::object& config)
ResponseFormat res;
res.type = json::LookupOrDefault<std::string>(config, "type", "text");

+if (res.type != "text" && res.type != "function" && res.type != "json_object" &&
+    res.type != "json_schema" && res.type != "structural_tag") {
+  return TResult::Error("Unknown response_format type " + res.type);
+}

std::optional<std::string> schema = json::LookupOptional<std::string>(config, "schema");
if (schema.has_value()) {
res.schema = schema.value();
}

if (res.type != "text" && res.type != "function" && res.type != "json_object") {
return TResult::Error("Uknonwn response_format type " + res.type);
if (auto tags_obj = json::LookupOptional<picojson::array>(config, "tags")) {
auto tags = Array<Array<String>>();
for (auto tag_obj : tags_obj.value()) {
Array<String> tag = Array<String>();
std::optional<std::string> begin =
json::LookupOptional<std::string>(tag_obj.get<picojson::object>(), "begin");
std::optional<std::string> schema =
json::LookupOptional<std::string>(tag_obj.get<picojson::object>(), "schema");
std::optional<std::string> end =
json::LookupOptional<std::string>(tag_obj.get<picojson::object>(), "end");
if (!(begin.has_value() && schema.has_value() && end.has_value())) {
return TResult::Error("Missing tag attribute.");
}
tag.push_back(begin.value());
tag.push_back(schema.value());
tag.push_back(end.value());
tags.push_back(std::move(tag));
}
res.tags = tags;
}

if (auto triggers_obj = json::LookupOptional<picojson::array>(config, "triggers")) {
auto triggers = Array<String>();
for (auto trigger : triggers_obj.value()) {
triggers.push_back(trigger.get<std::string>());
}
res.triggers = triggers;
}

return TResult::Ok(res);
@@ -60,6 +90,24 @@ picojson::object ResponseFormat::AsJSON() const {
if (schema.defined()) {
config["schema"] = picojson::value(schema.value().operator std::string());
}
if (tags.defined()) {
picojson::array tags_obj = picojson::array();
for (auto tag : tags.value()) {
picojson::array tag_obj = picojson::array();
tag_obj.emplace_back(tag[0]);
tag_obj.emplace_back(tag[1]);
tag_obj.emplace_back(tag[2]);
tags_obj.emplace_back(tag_obj);
}
config["tags"] = picojson::value(tags_obj);
}
if (triggers.defined()) {
picojson::array trigger_obj = picojson::array();
for (std::string trigger : triggers.value()) {
trigger_obj.emplace_back(trigger);
}
config["triggers"] = picojson::value(trigger_obj);
}
return config;
}

@@ -1073,4 +1121,4 @@ Result<bool> ModelsUseKVCache(const std::vector<picojson::object>& model_configs

} // namespace serve
} // namespace llm
-} // namespace mlc
+} // namespace mlc
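
For reference, the payload shape that ResponseFormat::FromJSON now accepts can be sketched as follows (a sketch only: the keys mirror the lookups above, while the concrete tag strings and the embedded schema are illustrative, not taken from this PR):

# A "structural_tag" response_format, using the keys the parser above looks
# up ("type", "tags" with begin/schema/end, "triggers"). The tag strings and
# the embedded JSON schema are illustrative.
response_format = {
    "type": "structural_tag",
    "tags": [
        {
            "begin": "<function=get_time>",
            "schema": '{"type": "object", "properties": {"location": {"type": "string"}}}',
            "end": "</function>",
        },
    ],
    "triggers": ["<function="],
}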
2 changes: 2 additions & 0 deletions cpp/serve/config.h
@@ -28,6 +28,8 @@ using namespace tvm::runtime;
struct ResponseFormat {
String type = "text";
Optional<String> schema = NullOpt;
Optional<Array<Array<String>>> tags = NullOpt;
Optional<Array<String>> triggers = NullOpt;
/*!
* \brief Create debug config from JSON.
* \param config_json The json string for generation config
41 changes: 31 additions & 10 deletions cpp/serve/engine.cc
@@ -35,6 +35,7 @@
#include "request.h"
#include "request_state.h"
#include "sampler/sampler.h"
#include "xgrammar/tokenizer_info.h"

namespace mlc {
namespace llm {
@@ -64,6 +65,9 @@ inline std::optional<TokenizerInfo> GetTokenizerInfo(const picojson::object& mod
if (tokenizer_info_obj.count("strip_space_in_decode")) {
info->strip_space_in_decode = tokenizer_info_obj.at("strip_space_in_decode").get<bool>();
}
if (model_config.count("vocab_size")) {
info->vocab_size = model_config.at("vocab_size").get<int64_t>();
}
return TokenizerInfo(info);
}

@@ -463,9 +467,17 @@ class EngineImpl : public Engine {
ModelWorkspace{model->AllocEmbeddingTensor(), model->AllocHiddenStatesTensor()});
}
// - Initialize tokenizer and grammar
-n->tokenizer_ = Tokenizer::FromPath(engine_config->model, GetTokenizerInfo(model_configs[0]));
+std::optional<TokenizerInfo> info = GetTokenizerInfo(model_configs[0]);
+n->tokenizer_ = Tokenizer::FromPath(engine_config->model, info);
n->token_table_ = n->tokenizer_->PostProcessedTokenTable();
-n->cached_grammar_compiler_ = xgrammar::CachedGrammarCompiler(n->token_table_);
+int64_t vocab_size = n->tokenizer_->GetVocabSize();
+if (info.has_value() && info.value()->vocab_size != 0) {
+  vocab_size = info.value()->vocab_size;
+}
+n->grammar_compiler_ = xgrammar::GrammarCompiler(
+    xgrammar::TokenizerInfo(n->token_table_, xgrammar::VocabType::RAW, vocab_size));

// - Create the logit processor and sampler, and
// the DraftTokenWorkspaceManager for speculative decoding.
int max_num_tokens = engine_config->max_num_sequence;
@@ -975,13 +987,22 @@
* is not JSON, return std::nullopt. */
std::optional<xgrammar::CompiledGrammar> GetGrammarFromResponseFormat(
const ResponseFormat& response_format) {
-if (response_format.type != "json_object") {
+if (response_format.type == "text") {
(Review comment from a collaborator: "align with openai api")
return std::nullopt;
-} else if (!response_format.schema) {
-  return cached_grammar_compiler_.GetCompiledGrammarForJSON();
-} else {
-  return cached_grammar_compiler_.GetCompiledGrammarForJSONSchema(
-      response_format.schema.value());
+} else if (response_format.type == "json_object") {
+  return grammar_compiler_.CompileBuiltinJSONGrammar();
+} else if (response_format.type == "json_schema") {
+  return grammar_compiler_.CompileJSONSchema(response_format.schema.value());
+} else if (response_format.type == "structural_tag") {
+  std::vector<xgrammar::StructuralTagItem> tags;
+  std::vector<std::string> triggers;
+  for (auto tag : response_format.tags.value()) {
+    tags.emplace_back(xgrammar::StructuralTagItem{tag[0], tag[1], tag[2]});
+  }
+  for (auto trigger : response_format.triggers.value()) {
+    triggers.emplace_back(trigger);
+  }
+  return grammar_compiler_.CompileStructuralTag(std::move(tags), std::move(triggers));
}
}

@@ -992,8 +1013,8 @@
// internal tokenizer
Tokenizer tokenizer_;
std::vector<std::string> token_table_;
-// Cached grammar compiler for grammar matching.
-xgrammar::CachedGrammarCompiler cached_grammar_compiler_;
+// Grammar compiler for grammar matching.
+xgrammar::GrammarCompiler grammar_compiler_;
// Models
Array<Model> models_;
// Device that the models run on.
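The dispatch above maps each response_format type to a compiled grammar. A minimal Python sketch of the same logic, assuming xgrammar's Python bindings mirror the C++ API used here (compile_builtin_json_grammar, compile_json_schema, compile_structural_tag, and StructuralTagItem come from those bindings; the helper name is hypothetical):

from typing import Optional

import xgrammar as xgr

def grammar_from_response_format(
    compiler: xgr.GrammarCompiler, response_format: dict
) -> Optional[xgr.CompiledGrammar]:
    """Python mirror of EngineImpl::GetGrammarFromResponseFormat above."""
    kind = response_format.get("type", "text")
    if kind == "text":
        return None  # no grammar constraint, like the C++ std::nullopt
    if kind == "json_object":
        return compiler.compile_builtin_json_grammar()
    if kind == "json_schema":
        return compiler.compile_json_schema(response_format["schema"])
    if kind == "structural_tag":
        tags = [
            xgr.StructuralTagItem(begin=t["begin"], schema=t["schema"], end=t["end"])
            for t in response_format["tags"]
        ]
        return compiler.compile_structural_tag(tags, response_format["triggers"])
    raise ValueError(f"Unknown response_format type: {kind}")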
4 changes: 2 additions & 2 deletions cpp/serve/request_state.cc
@@ -24,7 +24,7 @@ RequestModelState::RequestModelState(
if (compiled_grammar.has_value()) {
// TODO(yixin): set rollback limit to a configurable value.
n->grammar_matcher =
-xgrammar::GrammarMatcher(compiled_grammar.value(), std::nullopt, false, std::nullopt, 10);
+xgrammar::GrammarMatcher(compiled_grammar.value(), std::nullopt, false, 10);
}

n->request = std::move(request);
@@ -44,7 +44,7 @@ bool RequestModelStateNode::RequireNextTokenBitmask() { return grammar_matcher.h
void RequestModelStateNode::GetNextTokenBitmask(DLTensor* bitmask) {
ICHECK(grammar_matcher.has_value());

-grammar_matcher->GetNextTokenBitmask(bitmask);
+grammar_matcher->FillNextTokenBitmask(bitmask);
}

void RequestModelStateNode::CommitToken(SampleResult sampled_token) {
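The rename tracks the newer xgrammar matcher API, where the caller allocates the bitmask and the matcher fills it. A rough Python sketch of one constrained decoding step, assuming the Python bindings follow the same names (allocate_token_bitmask, fill_next_token_bitmask, apply_token_bitmask_inplace, accept_token):

import torch
import xgrammar as xgr

def constrained_step(matcher: xgr.GrammarMatcher, logits: torch.Tensor, vocab_size: int) -> int:
    """Pick the next token with grammar-disallowed tokens masked out."""
    bitmask = xgr.allocate_token_bitmask(1, vocab_size)  # one sequence
    matcher.fill_next_token_bitmask(bitmask)  # was GetNextTokenBitmask before this PR
    xgr.apply_token_bitmask_inplace(logits, bitmask)  # disallowed logits -> -inf
    token_id = int(torch.argmax(logits, dim=-1).item())
    matcher.accept_token(token_id)  # advance the matcher state
    return token_id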
5 changes: 5 additions & 0 deletions cpp/tokenizers/tokenizers.cc
@@ -30,6 +30,7 @@ String TokenizerInfoNode::AsJSONString() const {
obj["token_postproc_method"] = picojson::value(token_postproc_method);
obj["prepend_space_in_encode"] = picojson::value(prepend_space_in_encode);
obj["strip_space_in_decode"] = picojson::value(strip_space_in_decode);
obj["vocab_size"] = picojson::value(vocab_size);
return picojson::value(obj).serialize(false);
}

@@ -54,6 +55,10 @@ TokenizerInfo TokenizerInfo::FromJSONString(String json_string) {
ICHECK(obj.at("strip_space_in_decode").is<bool>());
n->strip_space_in_decode = obj.at("strip_space_in_decode").get<bool>();
}
if (obj.count("vocab_size")) {
ICHECK(obj.at("vocab_size").is<int64_t>());
n->vocab_size = obj.at("vocab_size").get<int64_t>();
}

return TokenizerInfo(n);
}
3 changes: 3 additions & 0 deletions cpp/tokenizers/tokenizers.h
@@ -43,6 +43,9 @@ class TokenizerInfoNode : public Object {
bool prepend_space_in_encode = false;
/*! \brief Whether to strip the first space during decoding. */
bool strip_space_in_decode = false;
/*! \brief The vocab_size in config.json (length of logits). This may be bigger than the
 * vocabulary size. The value will be 0 if not set. */
int64_t vocab_size = 0;

String AsJSONString() const;

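With the new field, the serialized tokenizer info looks roughly like this (a sketch with illustrative values; the other keys come from AsJSONString above):

tokenizer_info = {
    "token_postproc_method": "byte_fallback",
    "prepend_space_in_encode": False,
    "strip_space_in_decode": False,
    "vocab_size": 128256,  # logit width from config.json; 0 when not set
}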
49 changes: 49 additions & 0 deletions python/mlc_llm/protocol/conversation_protocol.py
@@ -81,6 +81,8 @@ class Conversation(BaseModel):
function_string: str = ""
# whether using function calling or not, helps check for output message format in API call
use_function_calling: bool = False
# Tool function call format mode
_tool_call_format: str = "json"

def __init__(self, role_templates: Optional[Dict[str, str]] = None, **kwargs):
# Defaults templates which would be overridden by model specific templates
@@ -124,6 +126,8 @@ def as_prompt(self, config=None) -> List[Any]:
from ..serve import data # pylint: disable=import-outside-toplevel

# - Get the system message.
if self.use_function_calling:
self.set_tool_call_format_in_system_message()
system_msg = self.system_template.replace(
MessagePlaceholders.SYSTEM.value, self.system_message
)
@@ -195,6 +199,51 @@ def as_prompt(self, config=None) -> List[Any]:

return prompt

    def set_tool_call_format_in_system_message(self):
        """Add tool function information and the call format to the system message."""
        if self._tool_call_format == "json":
            tool_call_instruct = (
                "Tool Instructions:\n"
                "You have access to the following tool functions:\n"
                f"{MessagePlaceholders.FUNCTION.value}\n"
                "If you choose to call a function, you should ONLY reply in the following format:\n"
                '`{"name": func_name, "parameters": parameters(JSON dict)}`\n'
                "Here is an example:\n"
                '`{"name": "get_time", "parameters": {"location": "Pittsburgh"}}`\n'
                "Reminder:\n"
                "- Function calls MUST follow the specified format\n"
                "- Required parameters MUST be specified\n"
                "- You should not repeat or miss the call\n"
                "- You should respond with at least one function call\n"
            )
            self.system_message += tool_call_instruct
        elif self._tool_call_format == "xml":
            tool_call_instruct = (
                "Tool Instructions:\n"
                "You have access to the following tool functions:\n"
                f"{MessagePlaceholders.FUNCTION.value}\n"
                "If you choose to call a function, you should ONLY reply in the following format:\n"
                "`<function={func_name}>{parameters(JSON dict)}</function>`\n"
                "Here is an example:\n"
                '`<function=get_time>{"location": "Pittsburgh"}</function>`\n'
                "Reminder:\n"
                "- Function calls MUST follow the specified format\n"
                "- Required parameters MUST be specified\n"
                "- You should not repeat or miss the call\n"
            )
            self.system_message += tool_call_instruct
        elif self._tool_call_format == "python":
            tool_call_instruct = (
                "Tool Instructions:\n"
                "- You have access to the following tool functions:\n"
                f"{MessagePlaceholders.FUNCTION.value}\n"
                "- Required parameters MUST be specified\n"
                "- You should not repeat or miss the call\n"
            )
            self.system_message += tool_call_instruct
        else:
            raise ValueError("Unknown tool calling format.")


def _get_url_from_item(item: Dict) -> str:
image_url: str
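A rough driver for the hook above (hypothetical usage: it assumes Conversation is constructible with defaults and that the engine propagates EngineConfig.tool_call_format into the private field):

conv = Conversation()
conv.use_function_calling = True
conv._tool_call_format = "xml"  # normally set from EngineConfig.tool_call_format
conv.set_tool_call_format_in_system_message()
# conv.system_message now ends with the XML-style instructions, e.g.
# `<function=get_time>{"location": "Pittsburgh"}</function>`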
43 changes: 38 additions & 5 deletions python/mlc_llm/protocol/openai_api_protocol.py
@@ -86,12 +86,46 @@ class ModelResponse(BaseModel):


class RequestResponseFormat(BaseModel):
-type: Literal["text", "json_object"] = "text"
-json_schema: Optional[str] = Field(default=None, alias="schema")
+type: Literal["text", "json_object", "json_schema", "structural_tag"] = "text"
"""This field is named json_schema instead of schema because BaseModel defines a method called
schema. During construction of RequestResponseFormat, the key "schema" should still be used:
-`RequestResponseFormat(type="json_object", schema="{}")`
+`RequestResponseFormat(type="json_schema", schema="{}")`
"""
+json_schema: Optional[str] = Field(default=None, alias="schema")
+
+"""These fields are only used when type="structural_tag"."""
+tags: Optional[List[Dict[str, str]]] = None
+triggers: Optional[List[str]] = None

    @model_validator(mode="after")
    def check_request_response_format(self) -> "RequestResponseFormat":
        """Check if the RequestResponseFormat is valid."""
        if self.type in ["text", "json_object"]:
            if self.json_schema is not None:
                # Soft mismatch: warn rather than error ("import warnings" assumed at module top).
                warnings.warn("'schema' is only used when type is 'json_schema'.")
            if self.tags is not None or self.triggers is not None:
                warnings.warn(
                    "'tags' and 'triggers' attributes should be used when type='structural_tag'."
                )
        elif self.type == "json_schema":
            if self.json_schema is None:
                raise ValueError("'schema' must be set when type is 'json_schema'.")
            if self.tags is not None or self.triggers is not None:
                warnings.warn(
                    "'tags' and 'triggers' attributes should be used when type='structural_tag'."
                )
        elif self.type == "structural_tag":
            if self.tags is None or self.triggers is None:
                raise ValueError("structural_tag type must contain keys 'tags' and 'triggers'.")
            for tag in self.tags:  # pylint: disable=not-an-iterable
                if set(tag.keys()) != {"begin", "schema", "end"}:
                    raise ValueError(
                        "Each tag must contain exactly 'begin', 'schema' and 'end' keys. "
                        f"Got keys: {list(tag.keys())}."
                    )
            if self.json_schema is not None:
                warnings.warn("'schema' is only used when type is 'json_schema'.")
        return self
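
Two quick constructions of the new formats (a sketch; note that the "schema" key feeds the json_schema field through its alias, and the tag contents are illustrative):

fmt_json_schema = RequestResponseFormat(
    type="json_schema",
    schema='{"type": "object", "properties": {"answer": {"type": "string"}}}',
)
fmt_structural_tag = RequestResponseFormat(
    type="structural_tag",
    tags=[{"begin": "<function=get_time>", "schema": '{"type": "object"}', "end": "</function>"}],
    triggers=["<function="],
)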


class CompletionRequest(BaseModel):
@@ -181,6 +215,7 @@ class ChatFunction(BaseModel):
description: Optional[str] = None
name: str
parameters: Dict
strict: bool = True


class ChatTool(BaseModel):
@@ -318,12 +353,10 @@ def check_function_call_usage(self, conv_template: Conversation) -> None:
"""Check if function calling is used and update the conversation template.
Return an error message if the request format for function calling is invalid.
"""

# return if no tools are provided or tool_choice is set to none
if self.tools is None or (isinstance(self.tool_choice, str) and self.tool_choice == "none"):
conv_template.use_function_calling = False
return

# select the tool based on the tool_choice if specified
if isinstance(self.tool_choice, dict):
if self.tool_choice["type"] != "function": # pylint: disable=unsubscriptable-object
12 changes: 12 additions & 0 deletions python/mlc_llm/serve/config.py
@@ -130,6 +130,17 @@ class EngineConfig:  # pylint: disable=too-many-instance-attributes
"hybrid" means the hybrid prefill or split-fuse,
so that decode step will be converted into prefill.

tool_call_format : Literal["json", "xml", "python"]
The tool function call format.
"json" means the model calls tool functions in JSON-style format
'{"name": func_name, "parameters": parameters(JSON dict)}',
e.g. '{"name": "get_time", "parameters": {"location": "Pittsburgh"}}'.
"xml" means the model calls tool functions in XML-style format
'<function=func_name>{parameters(JSON dict)}</function>',
e.g. '<function=get_time>{"location": "Pittsburgh"}</function>'.
"python" means the model calls tool functions in Python-style format,
e.g. 'wolfram_alpha.call(query="solve x^3 - 4x^2 + 6x - 24 = 0")'.

verbose : bool
A boolean indicating whether to print logging info in engine.
"""
@@ -157,6 +168,7 @@ class EngineConfig:  # pylint: disable=too-many-instance-attributes
prefix_cache_mode: Literal["disable", "radix"] = "radix"
prefix_cache_max_num_recycling_seqs: Optional[int] = None
prefill_mode: Literal["chunked", "hybrid"] = "hybrid"
tool_call_format: Literal["json", "xml", "python"] = "json"
verbose: bool = True

def asjson(self) -> str:
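A minimal sketch of selecting the format at engine-construction time (assumes EngineConfig's other fields keep their defaults):

config = EngineConfig(tool_call_format="xml")
print(config.asjson())  # serialized config now carries "tool_call_format": "xml"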