LLM.kt
package com.swmansion.rnexecutorch

import android.util.Log
import com.facebook.react.bridge.Promise
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.bridge.ReadableArray
import com.swmansion.rnexecutorch.utils.ArrayUtils
import com.swmansion.rnexecutorch.utils.llms.ChatRole
import com.swmansion.rnexecutorch.utils.llms.ConversationManager
import com.swmansion.rnexecutorch.utils.llms.END_OF_TEXT_TOKEN
import org.pytorch.executorch.LlamaCallback
import org.pytorch.executorch.LlamaModule
import java.net.URL

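/**
 * Native module wrapping ExecuTorch's LlamaModule: it loads an LLM and its tokenizer,
 * streams generated tokens to the JavaScript side, and keeps conversation state
 * (system prompt, message history, context window) in a ConversationManager.
 */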
class LLM(
    reactContext: ReactApplicationContext,
) : NativeLLMSpec(reactContext),
    LlamaCallback {
    private var llamaModule: LlamaModule? = null
    private var tempLlamaResponse = StringBuilder()
    private lateinit var conversationManager: ConversationManager

    override fun getName(): String = NAME

    override fun initialize() {
        super.initialize()
    }

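    // LlamaCallback invoked once per generated token: the token is forwarded to JS through the
    // onToken event and appended to tempLlamaResponse so the full reply can be stored later.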
    override fun onResult(result: String) {
        emitOnToken(result)
        this.tempLlamaResponse.append(result)
    }

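    // LlamaCallback reporting generation statistics; currently only logs tokens per second.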
    override fun onStats(tps: Float) {
        Log.d("rn_executorch", "TPS: $tps")
    }

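    // Creates the ConversationManager from the system prompt, message history, and context window
    // length, then loads the model and tokenizer from their local paths. The promise resolves on
    // success and is rejected if loading fails.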
    override fun loadLLM(
        modelSource: String,
        tokenizerSource: String,
        systemPrompt: String,
        messageHistory: ReadableArray,
        contextWindowLength: Double,
        promise: Promise,
    ) {
        try {
            this.conversationManager =
                ConversationManager(
                    contextWindowLength.toInt(),
                    systemPrompt,
                    ArrayUtils.createMapArray<String>(messageHistory),
                )
            llamaModule = LlamaModule(1, URL(modelSource).path, URL(tokenizerSource).path, 0.7f)
            this.tempLlamaResponse.clear()
            promise.resolve("Model loaded successfully")
        } catch (e: Exception) {
            promise.reject("Model loading failed", e.message)
        }
    }

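    // Adds the user message to the conversation and runs generation on a background thread.
    // Tokens are streamed to JS via onResult; the promise resolves once generation has been
    // scheduled, not when it completes.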
    override fun runInference(
        input: String,
        promise: Promise,
    ) {
        this.conversationManager.addResponse(input, ChatRole.USER)
        val conversation = this.conversationManager.getConversation()

        Thread {
            llamaModule!!.generate(conversation, (conversation.length * 0.75).toInt() + 64, this, false)

            // When .interrupt() is called the LLM does not produce an EOT token; the same can happen
            // when the generated sequence is longer than the length passed to the JNI callback. If EOT
            // is missing, we append it to the output and emit it to the JS side.
            if (!this.tempLlamaResponse.endsWith(END_OF_TEXT_TOKEN)) {
                this.onResult(END_OF_TEXT_TOKEN)
            }

            // Add the LLM response to the conversation once all the tokens are generated.
            // Each token is appended to the tempLlamaResponse StringBuilder in the onResult callback.
            this.conversationManager.addResponse(this.tempLlamaResponse.toString(), ChatRole.ASSISTANT)
            this.tempLlamaResponse.clear()
            Log.d("ExecutorchLib", this.conversationManager.getConversation())
        }.start()

        promise.resolve("Inference completed successfully")
    }

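    // Stops the ongoing generation; the missing EOT token is handled in runInference.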
    override fun interrupt() {
        llamaModule!!.stop()
    }

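    // Drops the reference to the loaded module so it can be garbage collected.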
    override fun deleteModule() {
        llamaModule = null
    }

    companion object {
        const val NAME = "LLM"
    }
}