153 | 153 | "context_window_size": 2048,
154 | 154 | "prefill_chunk_size": 2048,
155 | 155 | },
| 156 | + "tinyllama_1b_chat_v0.4": { |
| 157 | + "_name_or_path": "/data/tianduo/tinyllama-ft/checkpoint-3890", |
| 158 | + "architectures": ["LlamaForCausalLM"], |
| 159 | + "bos_token_id": 1, |
| 160 | + "eos_token_id": 2, |
| 161 | + "hidden_act": "silu", |
| 162 | + "hidden_size": 2048, |
| 163 | + "initializer_range": 0.02, |
| 164 | + "intermediate_size": 5632, |
| 165 | + "max_position_embeddings": 2048, |
| 166 | + "model_type": "llama", |
| 167 | + "num_attention_heads": 32, |
| 168 | + "num_hidden_layers": 22, |
| 169 | + "num_key_value_heads": 4, |
| 170 | + "pretraining_tp": 1, |
| 171 | + "rms_norm_eps": 1e-05, |
| 172 | + "rope_scaling": None, |
| 173 | + "rope_theta": 10000.0, |
| 174 | + "tie_word_embeddings": False, |
| 175 | + "torch_dtype": "float32", |
| 176 | + "transformers_version": "4.33.1", |
| 177 | + "use_cache": False, |
| 178 | + "vocab_size": 32003, |
| 179 | + }, |
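The new tinyllama_1b_chat_v0.4 entry uses grouped-query attention; a minimal sketch of the attention geometry these numbers imply, assuming the usual Llama-style split of hidden_size across attention heads (this check is not part of the diff itself):

# Attention geometry implied by the "tinyllama_1b_chat_v0.4" preset above.
hidden_size = 2048
num_attention_heads = 32
num_key_value_heads = 4

head_dim = hidden_size // num_attention_heads            # 64
gqa_group = num_attention_heads // num_key_value_heads   # 8 query heads share each KV head
print(head_dim, gqa_group)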
156 | 180 | "tinyllama_1b_chat_v1.0": {
157 | 181 | "architectures": ["LlamaForCausalLM"],
158 | 182 | "attention_bias": False,

201 | 225 | "prefill_chunk_size": 128,
202 | 226 | "attention_sink_size": 4,
203 | 227 | },
| 228 | + "mistral_7b_v03": { |
| 229 | + "architectures": ["MistralForCausalLM"], |
| 230 | + "attention_dropout": 0.0, |
| 231 | + "bos_token_id": 1, |
| 232 | + "eos_token_id": 2, |
| 233 | + "hidden_act": "silu", |
| 234 | + "hidden_size": 4096, |
| 235 | + "initializer_range": 0.02, |
| 236 | + "intermediate_size": 14336, |
| 237 | + "max_position_embeddings": 32768, |
| 238 | + "model_type": "mistral", |
| 239 | + "num_attention_heads": 32, |
| 240 | + "num_hidden_layers": 32, |
| 241 | + "num_key_value_heads": 8, |
| 242 | + "rms_norm_eps": 1e-05, |
| 243 | + "rope_theta": 1000000.0, |
| 244 | + "sliding_window": None, |
| 245 | + "tie_word_embeddings": False, |
| 246 | + "torch_dtype": "bfloat16", |
| 247 | + "transformers_version": "4.42.0.dev0", |
| 248 | + "use_cache": True, |
| 249 | + "vocab_size": 32768, |
| 250 | + }, |
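mistral_7b_v03 sets sliding_window to None, so attention (and the KV cache) spans the full 32768-token window. A rough cache-sizing sketch from the values above, assuming a plain unquantized bfloat16 cache with no paging:

# Rough KV-cache footprint for the "mistral_7b_v03" preset above
# (bfloat16 cache, 2 bytes per element).
num_hidden_layers = 32
num_key_value_heads = 8
head_dim = 4096 // 32                                    # hidden_size / num_attention_heads = 128
bytes_per_token = 2 * num_hidden_layers * num_key_value_heads * head_dim * 2  # K and V
print(bytes_per_token)                                   # 131072 bytes ≈ 128 KiB per token
print(bytes_per_token * 32768 / 2**30)                   # ≈ 4.0 GiB at the full 32768-token window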
204 | 251 | "gpt2": {
| 252 | + "activation_function": "gelu_new", |
205 | 253 | "architectures": ["GPT2LMHeadModel"],
| 254 | + "attn_pdrop": 0.1, |
206 | 255 | "bos_token_id": 50256,
| 256 | + "embd_pdrop": 0.1, |
207 | 257 | "eos_token_id": 50256,
208 | | - "hidden_act": "gelu_new", |
209 | | - "n_embd": 768, |
210 | 258 | "initializer_range": 0.02,
211 | | - "n_positions": 1024, |
| 259 | + "layer_norm_epsilon": 1e-05, |
212 | 260 | "model_type": "gpt2",
| 261 | + "n_ctx": 1024, |
| 262 | + "n_embd": 768, |
213 | 263 | "n_head": 12,
214 | 264 | "n_layer": 12,
| 265 | + "n_positions": 1024, |
| 266 | + "resid_pdrop": 0.1, |
| 267 | + "summary_activation": None, |
| 268 | + "summary_first_dropout": 0.1, |
| 269 | + "summary_proj_to_labels": True, |
| 270 | + "summary_type": "cls_index", |
| 271 | + "summary_use_proj": True, |
| 272 | + "task_specific_params": {"text-generation": {"do_sample": True, "max_length": 50}}, |
| 273 | + "vocab_size": 50257, |
| 274 | + }, |
| 275 | + "gpt2_medium": { |
| 276 | + "activation_function": "gelu_new", |
| 277 | + "architectures": ["GPT2LMHeadModel"], |
| 278 | + "attn_pdrop": 0.1, |
| 279 | + "bos_token_id": 50256, |
| 280 | + "embd_pdrop": 0.1, |
| 281 | + "eos_token_id": 50256, |
| 282 | + "initializer_range": 0.02, |
215 | 283 | "layer_norm_epsilon": 1e-05,
216 | | - "transformers_version": "4.26.0.dev0", |
217 | | - "use_cache": True, |
| 284 | + "model_type": "gpt2", |
| 285 | + "n_ctx": 1024, |
| 286 | + "n_embd": 1024, |
| 287 | + "n_head": 16, |
| 288 | + "n_layer": 24, |
| 289 | + "n_positions": 1024, |
| 290 | + "n_special": 0, |
| 291 | + "predict_special_tokens": True, |
| 292 | + "resid_pdrop": 0.1, |
| 293 | + "summary_activation": None, |
| 294 | + "summary_first_dropout": 0.1, |
| 295 | + "summary_proj_to_labels": True, |
| 296 | + "summary_type": "cls_index", |
| 297 | + "summary_use_proj": True, |
| 298 | + "task_specific_params": {"text-generation": {"do_sample": True, "max_length": 50}}, |
218 | 299 | "vocab_size": 50257,
219 | | - "context_window_size": 2048, |
220 | | - "prefill_chunk_size": 2048, |
221 | 300 | },
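The gpt2 entry is rebuilt to mirror the upstream Hugging Face config keys (activation_function, n_embd, n_positions, and so on), and gpt2_medium is added alongside it. A back-of-the-envelope check that the two presets line up with the familiar GPT-2 small and medium sizes; this sketch ignores biases and layer norms, so the totals are slight underestimates:

# Approximate parameter counts from the "gpt2" and "gpt2_medium" presets above.
def approx_params(n_layer, n_embd, n_positions, vocab_size):
    per_block = 12 * n_embd * n_embd                    # ~4*d^2 attention + ~8*d^2 MLP per block
    embeddings = (vocab_size + n_positions) * n_embd    # token + position tables (output head is tied)
    return n_layer * per_block + embeddings

print(approx_params(12, 768, 1024, 50257) / 1e6)    # ≈ 124.3M  ("gpt2")
print(approx_params(24, 1024, 1024, 50257) / 1e6)   # ≈ 354.5M  ("gpt2_medium")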
222 | 301 | "gpt_bigcode": {
223 | 302 | "activation_function": "gelu_pytorch_tanh",

796 | 875 | "type_vocab_size": 2,
797 | 876 | "vocab_size": 30522,
798 | 877 | },
| 878 | + "stablelm-2-zephyr-1_6b": { |
| 879 | + "architectures": ["StableLmForCausalLM"], |
| 880 | + "bos_token_id": 100257, |
| 881 | + "eos_token_id": 100257, |
| 882 | + "hidden_act": "silu", |
| 883 | + "hidden_size": 2048, |
| 884 | + "initializer_range": 0.02, |
| 885 | + "intermediate_size": 5632, |
| 886 | + "max_position_embeddings": 4096, |
| 887 | + "model_type": "stablelm", |
| 888 | + "layer_norm_eps": 1e-05, |
| 889 | + "num_attention_heads": 32, |
| 890 | + "num_hidden_layers": 24, |
| 891 | + "num_key_value_heads": 32, |
| 892 | + "partial_rotary_factor": 0.25, |
| 893 | + "rope_theta": 10000, |
| 894 | + "tie_word_embeddings": False, |
| 895 | + "torch_dtype": "float16", |
| 896 | + "transformers_version": "4.38.0", |
| 897 | + "use_cache": True, |
| 898 | + "use_qkv_bias": True, |
| 899 | + "vocab_size": 100352, |
| 900 | + }, |
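stablelm-2-zephyr-1_6b applies rotary embeddings to only part of each head (partial_rotary_factor of 0.25); a quick check of the rotary slice implied by the preset, assuming the usual head_dim = hidden_size / num_attention_heads split:

# Rotary slice implied by the "stablelm-2-zephyr-1_6b" preset above.
head_dim = 2048 // 32                    # hidden_size / num_attention_heads = 64
rotary_dim = int(head_dim * 0.25)        # partial_rotary_factor -> RoPE on 16 of 64 dims per head
print(head_dim, rotary_dim)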
| 901 | + "qwen2_0_5b": { |
| 902 | + "architectures": ["Qwen2ForCausalLM"], |
| 903 | + "attention_dropout": 0.0, |
| 904 | + "bos_token_id": 151643, |
| 905 | + "eos_token_id": 151645, |
| 906 | + "hidden_act": "silu", |
| 907 | + "hidden_size": 896, |
| 908 | + "initializer_range": 0.02, |
| 909 | + "intermediate_size": 4864, |
| 910 | + "max_position_embeddings": 32768, |
| 911 | + "max_window_layers": 24, |
| 912 | + "model_type": "qwen2", |
| 913 | + "num_attention_heads": 14, |
| 914 | + "num_hidden_layers": 24, |
| 915 | + "num_key_value_heads": 2, |
| 916 | + "rms_norm_eps": 1e-06, |
| 917 | + "rope_theta": 1000000.0, |
| 918 | + "sliding_window": 32768, |
| 919 | + "tie_word_embeddings": True, |
| 920 | + "torch_dtype": "bfloat16", |
| 921 | + "transformers_version": "4.40.1", |
| 922 | + "use_cache": True, |
| 923 | + "use_sliding_window": False, |
| 924 | + "vocab_size": 151936, |
| 925 | + }, |
| 926 | + "qwen2_1_5b": { |
| 927 | + "architectures": ["Qwen2ForCausalLM"], |
| 928 | + "attention_dropout": 0.0, |
| 929 | + "bos_token_id": 151643, |
| 930 | + "eos_token_id": 151645, |
| 931 | + "hidden_act": "silu", |
| 932 | + "hidden_size": 1536, |
| 933 | + "initializer_range": 0.02, |
| 934 | + "intermediate_size": 8960, |
| 935 | + "max_position_embeddings": 32768, |
| 936 | + "max_window_layers": 28, |
| 937 | + "model_type": "qwen2", |
| 938 | + "num_attention_heads": 12, |
| 939 | + "num_hidden_layers": 28, |
| 940 | + "num_key_value_heads": 2, |
| 941 | + "rms_norm_eps": 1e-06, |
| 942 | + "rope_theta": 1000000.0, |
| 943 | + "sliding_window": 32768, |
| 944 | + "tie_word_embeddings": True, |
| 945 | + "torch_dtype": "bfloat16", |
| 946 | + "transformers_version": "4.40.1", |
| 947 | + "use_cache": True, |
| 948 | + "use_sliding_window": False, |
| 949 | + "vocab_size": 151936, |
| 950 | + }, |
| 951 | + "qwen2_7b": { |
| 952 | + "architectures": ["Qwen2ForCausalLM"], |
| 953 | + "attention_dropout": 0.0, |
| 954 | + "bos_token_id": 151643, |
| 955 | + "eos_token_id": 151645, |
| 956 | + "hidden_act": "silu", |
| 957 | + "hidden_size": 3584, |
| 958 | + "initializer_range": 0.02, |
| 959 | + "intermediate_size": 18944, |
| 960 | + "max_position_embeddings": 32768, |
| 961 | + "max_window_layers": 28, |
| 962 | + "model_type": "qwen2", |
| 963 | + "num_attention_heads": 28, |
| 964 | + "num_hidden_layers": 28, |
| 965 | + "num_key_value_heads": 4, |
| 966 | + "rms_norm_eps": 1e-06, |
| 967 | + "rope_theta": 1000000.0, |
| 968 | + "sliding_window": 131072, |
| 969 | + "tie_word_embeddings": False, |
| 970 | + "torch_dtype": "bfloat16", |
| 971 | + "transformers_version": "4.41.2", |
| 972 | + "use_cache": True, |
| 973 | + "use_sliding_window": False, |
| 974 | + "vocab_size": 152064, |
| 975 | + }, |
799 | 976 | }
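Across the three new Qwen2 presets, only the 7B variant leaves tie_word_embeddings as False. A short sketch of why tying matters more at the small end, counting just the vocab-sized projections from the values above; untied models carry that matrix twice (input embedding plus lm_head):

# Embedding-parameter share for the three "qwen2_*" presets above.
presets = {
    "qwen2_0_5b": (151936, 896, True),
    "qwen2_1_5b": (151936, 1536, True),
    "qwen2_7b":   (152064, 3584, False),
}
for name, (vocab_size, hidden_size, tied) in presets.items():
    emb = vocab_size * hidden_size
    total = emb if tied else 2 * emb
    print(f"{name}: {total / 1e6:.0f}M embedding parameters (tied={tied})")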