@@ -318,7 +318,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
     fin.close();
 
     std::vector<uint8_t> tmp;
-
+
     for (int i = 0; i < n_parts; ++i) {
         const int part_id = i;
         // const int part_id = n_parts - i - 1;
@@ -797,14 +797,6 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    params.temp = 0.1f;
-    params.top_p = 0.95f;
-    params.n_ctx = 2048;
-    params.interactive = true;
-    params.interactive_start = true;
-    params.use_color = true;
-    params.model = "ggml-alpaca-7b-q4.bin";
-
     if (gpt_params_parse(argc, argv, params) == false) {
         return 1;
     }
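
The block removed above had pinned these settings at build time; after this change params keeps its struct defaults unless gpt_params_parse() overrides them from the command line. For reference, a minimal sketch of just the fields the removed lines touched (the field names come from the diff; the real struct lives in utils.h and has more members, so treat this layout as an assumption):

#include <string>

struct gpt_params {
    float       temp;              // sampling temperature (was hard-coded to 0.1f)
    float       top_p;             // nucleus sampling cutoff (was 0.95f)
    int32_t     n_ctx;             // context size (was 2048)
    bool        interactive;       // chat-style mode (was forced on)
    bool        interactive_start; // wait for user input first (was forced on)
    bool        use_color;         // ANSI-colored output (was forced on)
    std::string model;             // model path (was "ggml-alpaca-7b-q4.bin")
};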
@@ -856,13 +848,26 @@ int main(int argc, char ** argv) {
     // Add a space in front of the first character to match OG llama tokenizer behavior
     // params.prompt.insert(0, 1, ' ');
     // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp;// = ::llama_tokenize(vocab, params.prompt, true);
+    std::vector<gpt_vocab::id> embd_inp;
 
     // params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
 
     // // tokenize the reverse prompt
     // std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
 
+
+    std::vector<gpt_vocab::id> instruct_inp = ::llama_tokenize(vocab, " Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n", true);
+    std::vector<gpt_vocab::id> prompt_inp = ::llama_tokenize(vocab, "### Instruction:\n\n", true);
+    std::vector<gpt_vocab::id> response_inp = ::llama_tokenize(vocab, "### Response:\n\n", false);
+    embd_inp.insert(embd_inp.end(), instruct_inp.begin(), instruct_inp.end());
+
+    if (!params.prompt.empty()) {
+        std::vector<gpt_vocab::id> param_inp = ::llama_tokenize(vocab, params.prompt, true);
+        embd_inp.insert(embd_inp.end(), prompt_inp.begin(), prompt_inp.end());
+        embd_inp.insert(embd_inp.end(), param_inp.begin(), param_inp.end());
+        embd_inp.insert(embd_inp.end(), response_inp.begin(), response_inp.end());
+    }
+
     // fprintf(stderr, "\n");
     // fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
     // fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
@@ -871,13 +876,6 @@ int main(int argc, char ** argv) {
     // }
     // fprintf(stderr, "\n");
 
-    std::vector<gpt_vocab::id> instruct_inp = ::llama_tokenize(vocab, " Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n", true);
-    std::vector<gpt_vocab::id> prompt_inp = ::llama_tokenize(vocab, "### Instruction:\n\n", true);
-    std::vector<gpt_vocab::id> response_inp = ::llama_tokenize(vocab, "### Response:\n\n", false);
-
-    embd_inp.insert(embd_inp.end(), instruct_inp.begin(), instruct_inp.end());
-
-
     if (params.interactive) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
         struct sigaction sigint_action;
@@ -1076,9 +1074,14 @@ int main(int argc, char ** argv) {
 
         // end of text token
         if (embd.back() == 2) {
-            // fprintf(stderr, " [end of text]\n");
-            is_interacting = true;
-            continue;
+            if (params.interactive) {
+                is_interacting = true;
+                continue;
+            } else {
+                printf("\n");
+                fprintf(stderr, " [end of text]\n");
+                break;
+            }
         }
     }
 
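
Taken together, the new tokenization code in this commit builds an Alpaca-style instruct scaffold: the fixed preamble is always fed to the model, and the user's prompt, when present, is wrapped between "### Instruction:" and "### Response:" headers. A minimal sketch of the same scaffold as a single string (illustrative only, not part of the commit; build_alpaca_prompt and user_prompt are made-up names, and the real code tokenizes each piece separately with ::llama_tokenize):

#include <string>

// Sketch: the prompt layout the diff assembles token-by-token into embd_inp.
std::string build_alpaca_prompt(const std::string & user_prompt) {
    std::string prompt =
        " Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n";  // instruct_inp
    if (!user_prompt.empty()) {
        prompt += "### Instruction:\n\n";   // prompt_inp
        prompt += user_prompt;              // param_inp
        prompt += "### Response:\n\n";      // response_inp
    }
    return prompt;
}

With an empty params.prompt only the preamble tokens end up in embd_inp, and the end-of-text handling added at the bottom of the diff decides what happens next: in interactive mode control returns to the user, otherwise the program prints "[end of text]" and breaks out of the generation loop.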