143 | 143 | MaxMemory = [0]
144 | 144 | MaxFreeMemory = [0]
145 | 145 |
| 146 | +server_process: subprocess.Popen | None = None  # handle to the spawned worker process
| 147 | +is_proxy = "KOBOLDCPP_SERVER" not in os.environ  # True unless we are the spawned worker (which has KOBOLDCPP_SERVER set)
| 148 | +current_model = None  # model file the worker is currently serving
| 149 | +
146 | 150 | class logit_bias(ctypes.Structure):
147 | 151 |     _fields_ = [("token_id", ctypes.c_int32),
148 | 152 |                 ("bias", ctypes.c_float)]
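
The three globals above are the heart of the scheme: a process that does not see `KOBOLDCPP_SERVER` in its environment becomes a proxy, and the worker it spawns (with the variable set) is the one that actually loads the model. A minimal standalone sketch of the same pattern, using an illustrative `WORKER` marker rather than the patch's actual variable:

    import os
    import subprocess
    import sys

    if "WORKER" in os.environ:
        # Child process: this is where the real server would load the model.
        print("worker: loading model and serving requests")
    else:
        # Parent process: re-exec this same script as the worker, with the marker set.
        # Merging os.environ keeps PATH etc. intact for the child.
        subprocess.run([sys.executable] + sys.argv, env={**os.environ, "WORKER": "1"})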
@@ -1323,6 +1327,12 @@ def auto_set_backend_cli():
1323 | 1327 |     print(f"Auto Selected Default Backend (flag={cpusupport})\n")
1324 | 1328 |
1325 | 1329 | def load_model(model_filename):
| 1330 | +    global current_model
| 1331 | +    if is_proxy:
| 1332 | +        current_model = model_filename  # record only; the spawned worker does the real load
| 1333 | +        print("Deferred model loading.", current_model)
| 1334 | +        return True
| 1335 | +
1326 | 1336 |     global args
1327 | 1337 |     inputs = load_model_inputs()
1328 | 1338 |     inputs.model_filename = model_filename.encode("UTF-8")
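
One fix worth calling out in the hunk above: the added `global current_model` declaration. Without it, Python treats the assignment inside `load_model` as creating a new function-local name, and the module-level `current_model` silently stays `None`. A generic illustration of that scoping rule (not code from this patch):

    counter = None

    def set_wrong():
        counter = 1           # binds a new local name; the module-level counter is untouched

    def set_right():
        global counter        # rebind the module-level name instead
        counter = 1

    set_wrong(); print(counter)   # None
    set_right(); print(counter)   # 1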
@@ -3851,6 +3861,58 @@ def do_POST(self):
3851 | 3861 |         if args.foreground:
3852 | 3862 |             bring_terminal_to_foreground()
3853 | 3863 |
| 3864 | +        # Proxy mode: forward the request to the worker process, respawning it when the model changes
| 3865 | +        if is_proxy:
| 3866 | +            global server_process
| 3867 | +            global current_model
| 3868 | +
| 3869 | +            model = genparams["model"]
| 3870 | +            if server_process is not None and current_model != model:
| 3871 | +                # A different model was requested: stop the worker and its whole process tree first
| 3872 | +                import psutil
| 3873 | +                parent = psutil.Process(server_process.pid)
| 3874 | +                processes = parent.children(recursive=True) + [parent]
| 3875 | +                for process in processes:
| 3876 | +                    process.terminate()
| 3877 | +                for process in processes:
| 3878 | +                    process.wait()
| 3879 | +
| 3880 | +                server_process = None
| 3881 | +
| 3882 | +            if server_process is None:
| 3883 | +                current_model = model
| 3884 | +                # Re-launch this script as the worker; the trailing --port/--model override any earlier flags
| 3885 | +                server_process = subprocess.Popen([sys.executable] + sys.argv + ["--port", str(args.port + 1), "--model", model],
| 3886 | +                                                  env={**os.environ, "KOBOLDCPP_SERVER": "True"})
| 3887 | +
| 3888 | +                # Poke the worker until it's alive; each attempt times out after one second
| 3889 | +                while True:
| 3890 | +                    try:
| 3891 | +                        with urllib.request.urlopen(urllib.request.Request(f"http://localhost:{args.port + 1}", method="HEAD"), timeout=1) as response:
| 3892 | +                            if response.status == 200:
| 3893 | +                                break
| 3894 | +                            time.sleep(1)
| 3895 | +                    except Exception:
| 3896 | +                        time.sleep(1)
| 3897 | +
| 3898 | +            # Relay the request and stream the worker's response back to the client
| 3899 | +            request = urllib.request.Request(f"http://localhost:{args.port + 1}" + self.path, data=body, headers=dict(self.headers), method="POST")
| 3900 | +            with urllib.request.urlopen(request) as response:
| 3901 | +                self.send_response_only(response.status)
| 3902 | +                for keyword, value in response.headers.items():
| 3903 | +                    self.send_header(keyword, value)
| 3904 | +                super(KcppServerRequestHandler, self).end_headers()
| 3905 | +
| 3906 | +                while True:
| 3907 | +                    chunk = response.read(4096)  # bounded reads so streamed (SSE) responses are relayed as they arrive
| 3908 | +                    if not chunk:
| 3909 | +                        break
| 3910 | +                    self.wfile.write(chunk)
| 3911 | +
| 3912 | +            self.wfile.flush()
| 3913 | +            self.close_connection = True
| 3914 | +            return
| 3915 | +
3854 | 3916 |         if api_format > 0: #text gen
3855 | 3917 |             # Check if streaming chat completions, if so, set stream mode to true
3856 | 3918 |             if (api_format == 4 or api_format == 3) and "stream" in genparams and genparams["stream"]:
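
The poll-and-forward sequence above is easy to get subtly wrong, which is why the rewrite uses a one-second per-attempt timeout and bounded reads. For reference, a self-contained version of the readiness check; `wait_until_ready` is a hypothetical helper name, not something defined in the patch:

    import time
    import urllib.request

    def wait_until_ready(url: str, poll_interval: float = 1.0) -> None:
        # Block until `url` answers a HEAD request with HTTP 200.
        while True:
            try:
                req = urllib.request.Request(url, method="HEAD")
                with urllib.request.urlopen(req, timeout=1) as resp:
                    if resp.status == 200:
                        return
            except Exception:
                pass  # typically "connection refused" while the worker is still starting
            time.sleep(poll_interval)

    # wait_until_ready(f"http://localhost:{args.port + 1}")  -- same URL the proxy polls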