Commit

CodeLlama
KillianLucas committed Aug 25, 2023
1 parent a6b1c06 commit 6ee99b5
Showing 2 changed files with 19 additions and 16 deletions.
33 changes: 18 additions & 15 deletions interpreter/llama_2.py
@@ -13,41 +13,45 @@ def get_llama_2_instance():
# First, we ask them which model they want to use.
print('', Markdown("Please select a `Llama-2` model (use arrow keys)."), '')

- models_7b = [
+ llama_2_7b = [
# Smallest/Fastest
- {'Name': 'llama-2-7b-chat.ggmlv3.q2_K.bin', 'Param': '7B', 'Bits': 2, 'Size': '2.87 GB', 'RAM': '5.37 GB', 'Description': 'New k-quant method. Uses GGML_TYPE_Q4_K for the attention.vw and feed_forward.w2 tensors, GGML_TYPE_Q2_K for the other tensors.'},
+ {'URL': 'https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q2_K.bin', 'Param': '7B', 'Bits': 2, 'Size': '2.87 GB', 'RAM': '5.37 GB', 'Description': 'New k-quant method. Uses GGML_TYPE_Q4_K for the attention.vw and feed_forward.w2 tensors, GGML_TYPE_Q2_K for the other tensors.'},
# Middle Ground
- {'Name': 'llama-2-7b-chat.ggmlv3.q4_1.bin', 'Param': '7B', 'Bits': 4, 'Size': '4.21 GB', 'RAM': '6.71 GB', 'Description': 'Original quant method, 4-bit. Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.'},
+ {'URL': 'https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_1.bin', 'Param': '7B', 'Bits': 4, 'Size': '4.21 GB', 'RAM': '6.71 GB', 'Description': 'Original quant method, 4-bit. Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.'},
# Middle Ground
- # {'Name': 'llama-2-7b-chat.ggmlv3.q5_0.bin', 'Param': '7B', 'Bits': 5, 'Size': '4.63 GB', 'RAM': '7.13 GB', 'Description': 'Original quant method, 5-bit. Higher accuracy, higher resource usage and slower inference.'},
+ # {'URL': 'https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q5_0.bin', 'Param': '7B', 'Bits': 5, 'Size': '4.63 GB', 'RAM': '7.13 GB', 'Description': 'Original quant method, 5-bit. Higher accuracy, higher resource usage and slower inference.'},
# Best/Slowest
- {'Name': 'llama-2-7b-chat.ggmlv3.q8_0.bin', 'Param': '7B', 'Bits': 8, 'Size': '7.16 GB', 'RAM': '9.66 GB', 'Description': 'Original quant method, 8-bit. Almost indistinguishable from float16. High resource use and slow. Not recommended for most users.'}
+ {'URL': 'https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q8_0.bin', 'Param': '7B', 'Bits': 8, 'Size': '7.16 GB', 'RAM': '9.66 GB', 'Description': 'Original quant method, 8-bit. Almost indistinguishable from float16. High resource use and slow. Not recommended for most users.'}
]
- models_13b = [
+ llama_2_13b = [
# Smallest/Fastest
- {'Name': 'llama-2-13b-chat.ggmlv3.q2_K.bin', 'Param': '13B', 'Bits': 2, 'Size': '5.51 GB', 'RAM': '8.01 GB', 'Description': 'New k-quant method. Uses GGML_TYPE_Q4_K for the attention.vw and feed_forward.w2 tensors, GGML_TYPE_Q2_K for the other tensors.'},
+ {'URL': 'https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q2_K.bin', 'Param': '13B', 'Bits': 2, 'Size': '5.51 GB', 'RAM': '8.01 GB', 'Description': 'New k-quant method. Uses GGML_TYPE_Q4_K for the attention.vw and feed_forward.w2 tensors, GGML_TYPE_Q2_K for the other tensors.'},
# Middle Ground
- {'Name': 'llama-2-13b-chat.ggmlv3.q3_K_L.bin', 'Param': '13B', 'Bits': 3, 'Size': '6.93 GB', 'RAM': '9.43 GB', 'Description': 'New k-quant method. Uses GGML_TYPE_Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else GGML_TYPE_Q3_K'},
+ {'URL': 'https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q3_K_L.bin', 'Param': '13B', 'Bits': 3, 'Size': '6.93 GB', 'RAM': '9.43 GB', 'Description': 'New k-quant method. Uses GGML_TYPE_Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else GGML_TYPE_Q3_K'},
# Middle Ground
- # {'Name': 'llama-2-13b-chat.ggmlv3.q4_1.bin', 'Param': '13B', 'Bits': 4, 'Size': '8.14 GB', 'RAM': '10.64 GB', 'Description': 'Original quant method, 4-bit. Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.'},
+ # {'URL': 'https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q4_1.bin', 'Param': '13B', 'Bits': 4, 'Size': '8.14 GB', 'RAM': '10.64 GB', 'Description': 'Original quant method, 4-bit. Higher accuracy than q4_0 but not as high as q5_0. However has quicker inference than q5 models.'},
# Best/Slowest
- {'Name': 'llama-2-13b-chat.ggmlv3.q8_0.bin', 'Param': '13B', 'Bits': 8, 'Size': '13.83 GB', 'RAM': '16.33 GB', 'Description': 'Original quant method, 8-bit. Almost indistinguishable from float16. High resource use and slow. Not recommended for most users.'}
+ {'URL': 'https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q8_0.bin', 'Param': '13B', 'Bits': 8, 'Size': '13.83 GB', 'RAM': '16.33 GB', 'Description': 'Original quant method, 8-bit. Almost indistinguishable from float16. High resource use and slow. Not recommended for most users.'}
]
+ code_llama_13b = [
+ {'URL': 'https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGML/resolve/main/codellama-7b-instruct.ggmlv3.Q2_K.bin', 'Param': '13B', 'Bits': 8, 'Size': '13.83 GB', 'RAM': '16.33 GB', 'Description': 'Original quant method, 8-bit. Almost indistinguishable from float16. High resource use and slow. Not recommended for most users.'}
+ ]

- all_models = models_7b + models_13b
+ all_models = llama_2_7b + llama_2_13b + code_llama_13b
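
Each entry above pairs a direct download URL with display metadata: parameter count, quantization bits, file size on disk, and the approximate RAM needed at inference time. Those 'RAM' strings make it easy to screen the menu against the host machine; a minimal sketch under the assumption that psutil is available (it is not a dependency of this commit):

    import psutil

    def fits_in_ram(model, headroom_gb=1.0):
        # 'RAM' values are strings like '5.37 GB'; take the leading number.
        required_gb = float(model['RAM'].split()[0])
        available_gb = psutil.virtual_memory().available / 1e9
        return required_gb + headroom_gb <= available_gb

    # Hypothetical usage: offer only models the machine can actually hold.
    # runnable = [m for m in all_models if fits_in_ram(m)]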

# Function to format the model choice for display
def format_choice(model):
return f"{model['Param']} Parameter, {model['Bits']}-Bit | Size: {model['Size']}, RAM usage: {model['RAM']}"

questions = [
- inquirer.List('model',
- choices=[(format_choice(m), m['Name']) for m in all_models])
+ inquirer.List('URL',
+ choices=[(format_choice(m), m['URL']) for m in all_models])
]

answers = inquirer.prompt(questions)

- file_name = answers['model']
+ url = answers['URL']
+ file_name = url.split("/")[-1]

# Get user data directory for your application
user_data_dir = appdirs.user_data_dir("open-interpreter")
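
A note on the selection flow above: inquirer.prompt returns a dict keyed by each question's name, so naming the List 'URL' and giving choices as (label, value) tuples means answers['URL'] holds the chosen model's download link, and the filename falls out of the URL's last path segment. A self-contained sketch of the same pattern, with hypothetical example.com URLs standing in for the real lists:

    import inquirer

    all_models = [
        {'URL': 'https://example.com/llama-2-7b-chat.ggmlv3.q2_K.bin', 'Param': '7B', 'Bits': 2, 'Size': '2.87 GB', 'RAM': '5.37 GB'},
        {'URL': 'https://example.com/llama-2-13b-chat.ggmlv3.q8_0.bin', 'Param': '13B', 'Bits': 8, 'Size': '13.83 GB', 'RAM': '16.33 GB'},
    ]

    def format_choice(model):
        return f"{model['Param']} Parameter, {model['Bits']}-Bit | Size: {model['Size']}, RAM usage: {model['RAM']}"

    # Each choice is a (label, value) tuple: the label is displayed, the value is returned.
    questions = [inquirer.List('URL', choices=[(format_choice(m), m['URL']) for m in all_models])]
    answers = inquirer.prompt(questions)  # e.g. {'URL': 'https://example.com/llama-2-7b-chat.ggmlv3.q2_K.bin'}

    url = answers['URL']
    file_name = url.split("/")[-1]  # 'llama-2-7b-chat.ggmlv3.q2_K.bin'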
@@ -75,7 +79,6 @@ def format_choice(model):
download_path = os.path.join(default_path, file_name)
message = f"This instance of `Llama-2` was not found. Would you like to download it to `{download_path}`?"
if confirm_action(message):
url = f"https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML/resolve/main/{file_name}"
subprocess.run(f"curl -L '{url}' -o '{download_path}'", shell=True)
model_path = download_path
print('\n', "Finished downloading `Llama-2`.", '\n')
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -3,7 +3,7 @@ name = "open-interpreter"
packages = [
{include = "interpreter"},
]
version = "0.0.294"
version = "0.0.295"
description = "Ask GPT-4 to run code locally."
authors = ["Killian Lucas <[email protected]>"]
readme = "README.md"
