#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# ===========================================================================
#
# This file is adapted from
# https://huggingface.co/THUDM/codegeex2-6b/blob/ee1e7db429e587645bd3f0f4c3f5d8e6e843f2f6/tokenization_chatglm.py
#
# Apache 2.0 license
# https://huggingface.co/THUDM/codegeex2-6b/blob/main/LICENSE

import os
import torch
from typing import List, Optional, Union, Dict
from sentencepiece import SentencePieceProcessor
from transformers import PreTrainedTokenizer
from transformers.utils import logging, PaddingStrategy
from transformers.tokenization_utils_base import EncodedInput, BatchEncoding


class SPTokenizer:
    def __init__(self, model_path: str):
        # reload tokenizer
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.unk_id()
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"]
        self.special_tokens = {}
        self.index_special_tokens = {}
        for token in special_tokens:
            self.special_tokens[token] = self.n_words
            self.index_special_tokens[self.n_words] = token
            self.n_words += 1

    def tokenize(self, s: str):
        return self.sp_model.EncodeAsPieces(s)

    def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
        assert type(s) is str
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        return self.sp_model.decode(t)

    def decode_tokens(self, tokens: List[str]) -> str:
        text = self.sp_model.DecodePieces(tokens)
        return text

    def convert_token_to_id(self, token):
        """ Converts a token (str) to an id using the vocab. """
        if token in self.special_tokens:
            return self.special_tokens[token]
        return self.sp_model.PieceToId(token)

    def convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        if index in self.index_special_tokens or index in [self.eos_id, self.bos_id, self.pad_id] or index < 0:
            return ""
        return self.sp_model.IdToPiece(index)

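
# Illustrative note (not part of the adapted source): SPTokenizer assigns the extra
# control tokens ids immediately after the SentencePiece vocabulary, so with
# n = sp_model.vocab_size() the mapping built in __init__ is
#
#     special_tokens == {"[MASK]": n, "[gMASK]": n + 1, "[sMASK]": n + 2,
#                        "sop": n + 3, "eop": n + 4}
#
# and convert_token_to_id("[gMASK]") returns n + 1, while ordinary pieces fall back
# to sp_model.PieceToId().
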
class ChatGLMTokenizer(PreTrainedTokenizer):
    vocab_files_names = {"vocab_file": "tokenizer.model"}

    model_input_names = ["input_ids", "attention_mask", "position_ids"]

    def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs):
        self.tokenizer = SPTokenizer(vocab_file)
        super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
        self.name = "GLMTokenizer"

        self.vocab_file = vocab_file

        self.special_tokens = {
            "<bos>": self.tokenizer.bos_id,
            "<eos>": self.tokenizer.eos_id,
            "<pad>": self.tokenizer.pad_id
        }

    def get_command(self, token):
        if token in self.special_tokens:
            return self.special_tokens[token]
        assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
        return self.tokenizer.special_tokens[token]

    @property
    def unk_token(self) -> str:
        return "<unk>"

    @property
    def pad_token(self) -> str:
        return "<unk>"

    @property
    def pad_token_id(self):
        return self.get_command("<pad>")

    @property
    def eos_token(self) -> str:
        return "</s>"

    @property
    def eos_token_id(self):
        return self.get_command("<eos>")

    @property
    def vocab_size(self):
        return self.tokenizer.n_words

    def get_vocab(self):
        """ Returns vocab as a dict """
        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text, **kwargs):
        return self.tokenizer.tokenize(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) to an id using the vocab. """
        return self.tokenizer.convert_token_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.tokenizer.convert_id_to_token(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return self.tokenizer.decode_tokens(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the name of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, self.vocab_files_names["vocab_file"]
            )
        else:
            vocab_file = save_directory

        with open(self.vocab_file, 'rb') as fin:
            proto_str = fin.read()

        with open(vocab_file, "wb") as writer:
            writer.write(proto_str)

        return (vocab_file,)

    def get_prefix_tokens(self):
        prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")]
        return prefix_tokens

    def build_prompt(self, query, history=None):
        if history is None:
            history = []
        prompt = ""
        for i, (old_query, response) in enumerate(history):
            prompt += "[Round {}]\n\n问:{}\n\n答:{}\n\n".format(i + 1, old_query, response)
        prompt += "[Round {}]\n\n问:{}\n\n答:".format(len(history) + 1, query)
        return prompt
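
    # Illustrative example (not in the original source): with an empty history,
    # build_prompt("Hello") renders the single-round ChatGLM-style prompt
    #
    #     "[Round 1]\n\n问:Hello\n\n答:"
    #
    # and each (old_query, response) pair in `history` contributes one earlier
    # "[Round i]" block before the final round.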

    def build_inputs_with_special_tokens(
            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by concatenating and
        adding special tokens. A ChatGLM sequence has the following format:

        - single sequence: `[gMASK] sop X`
        - pair of sequences: `[gMASK] sop A B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        prefix_tokens = self.get_prefix_tokens()
        token_ids_0 = prefix_tokens + token_ids_0
        if token_ids_1 is not None:
            token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("<eos>")]
        return token_ids_0
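
    # Illustrative example (not in the original source): if get_prefix_tokens() yields
    # [gmask_id, sop_id], then
    #
    #     build_inputs_with_special_tokens([a, b, c])        -> [gmask_id, sop_id, a, b, c]
    #     build_inputs_with_special_tokens([a, b], [c, d])   -> [gmask_id, sop_id, a, b, c, d, eos_id]
    #
    # where gmask_id, sop_id and eos_id are the ids returned by get_command().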

    def _pad(
            self,
            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
            max_length: Optional[int] = None,
            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
            pad_to_multiple_of: Optional[int] = None,
            return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:

                - 'left': pads on the left of the sequences
                - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        # Load from model defaults
        # assert self.padding_side == "left"

        required_input = encoded_inputs[self.model_input_names[0]]
        seq_length = len(required_input)

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

        # Initialize attention mask if not present.
        if "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * seq_length

        if "position_ids" not in encoded_inputs:
            encoded_inputs["position_ids"] = list(range(seq_length))

        if needs_to_be_padded:
            difference = max_length - len(required_input)

            if self.padding_side == "left":
                if "attention_mask" in encoded_inputs:
                    encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
                if "position_ids" in encoded_inputs:
                    encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
            else:
                if "attention_mask" in encoded_inputs:
                    encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
                if "position_ids" in encoded_inputs:
                    encoded_inputs["position_ids"] = encoded_inputs["position_ids"] + [0] * difference
                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference

        return encoded_inputs
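

# A minimal usage sketch, not part of the adapted file. It assumes a local SentencePiece
# vocabulary at "tokenizer.model" (e.g. downloaded from the codegeex2-6b repository); the
# path and the example query are illustrative only.
if __name__ == "__main__":
    tokenizer = ChatGLMTokenizer("tokenizer.model")  # hypothetical local path

    # Render a single-round prompt and encode it; special tokens ([gMASK], sop) are
    # prepended by build_inputs_with_special_tokens during encoding.
    prompt = tokenizer.build_prompt("def bubble_sort(arr):")
    encoded = tokenizer(prompt)
    print(encoded["input_ids"])

    # Left-pad to a fixed length via the public pad() API, which dispatches to _pad();
    # attention_mask and position_ids are padded with zeros on the left.
    padded = tokenizer.pad(
        {"input_ids": encoded["input_ids"], "attention_mask": encoded["attention_mask"]},
        padding="max_length", max_length=64,
    )
    print(len(padded["input_ids"]), padded["attention_mask"][:8])

    # Inspect the raw SentencePiece pieces for the same prompt.
    print(tokenizer.tokenize(prompt)[:10])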