In the non-`low_resource` LLaMA loading path, pass `low_cpu_mem_usage=True` to `LlamaForCausalLM.from_pretrained`. This drastically speeds up model loading (cuts more than a minute off load time).

Cory · Cory · commit 5ab62fe8104b · 2023-10-10T20:15:08.000-04:00
diff --git a/minigpt4/models/mini_gpt4.py b/minigpt4/models/mini_gpt4.py
@@ -118,6 +118,7 @@ def __init__(
             self.llama_model = LlamaForCausalLM.from_pretrained(
                 llama_model,
                 torch_dtype=torch.float16,
+                low_cpu_mem_usage=True,
             )
 
         if lora_r > 0:

Original file line number	Diff line number	Diff line change
`@@ -118,6 +118,7 @@ def __init__(`
`118`	`118`	`self.llama_model = LlamaForCausalLM.from_pretrained(`
`119`	`119`	`llama_model,`
`120`	`120`	`torch_dtype=torch.float16,`
	`121`	`+ low_cpu_mem_usage=True,`
`121`	`122`	`)`
`122`	`123`
`123`	`124`	`if lora_r > 0:`