I'm trying to load mixtral-8x7b-instruct-v0.1.Q6_K.gguf, but I'm getting an initialization error
#692 · Answered by michael-conrad

michael-conrad asked this question in Q&A
I'm using the following to load the model, and I'm getting the stack trace below. I tried passing the model path and model kwargs directly to the guidance LlamaCpp constructor and got the same result.

```python
import guidance
from guidance.models import LlamaCpp
from huggingface_hub import hf_hub_download
from llama_cpp import Llama


def load_mixtral_8x7b(verbose: bool = False, n_ctx=8192) -> LlamaCpp:
    offload_kqv = True
    repo_id = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"
    gguf_filename = "mixtral-8x7b-instruct-v0.1.Q6_K.gguf"
    gguf_path = hf_hub_download(repo_id=repo_id, filename=gguf_filename)
    if gguf_filename == "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf":
        layers = 25  # // 3
    elif gguf_filename == "mixtral-8x7b-instruct-v0.1.Q5_K_M.gguf":
        layers = 21  # // 3
    elif gguf_filename == "mixtral-8x7b-instruct-v0.1.Q6_K.gguf":
        layers = 17  # // 3, reduce for extra large context windows
    else:
        raise Exception("Invalid gguf_id")
    if offload_kqv:
        layers -= 1
    model_path = str(gguf_path)
    model_kwargs = dict(
        n_ctx=n_ctx,              # Max sequence length; longer sequences require much more resources
        n_threads=32,             # Number of CPU threads; tailor to your system and the resulting performance
        n_gpu_layers=layers,      # Number of layers to offload to GPU, if GPU acceleration is available
        verbose=verbose,          # Whether to print verbose output during inference
        offload_kqv=offload_kqv,  # Whether to offload the KV cache to GPU
        n_threads_batch=32,       # Number of CPU threads to use for batched inference
        logits_all=True,          # Needed for controlling output format
    )
    model = Llama(model_path=model_path, **model_kwargs)
    llm = guidance.models.LlamaCpp(model=model)
    # llm = guidance.models.LlamaCpp(model=model_path, **model_kwargs)
    return llm
```

Traceback:

```
Traceback (most recent call last):
  File "/home/muksihs/Documents/Mohegan/books/An Introduction to Mohegan/local_models/mixtral_guidance_gguf.py", line 144, in <module>
    main()
  File ".../local_models/mixtral_guidance_gguf.py", line 59, in main
    llm: LlamaCpp = load_mixtral_8x7b(verbose=False, n_ctx=8192)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File ".../local_models/mixtral_guidance_gguf.py", line 54, in load_mixtral_8x7b
    llm = guidance.models.LlamaCpp(model=model)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File ".../lib/python3.12/site-packages/guidance/models/llama_cpp/_llama_cpp.py", line 74, in __init__
    self._context = _LlamaBatchContext(self.model_obj.n_batch, self.model_obj.n_ctx())
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File ".../lib/python3.12/site-packages/guidance/models/llama_cpp/_llama_cpp.py", line 23, in __init__
    self.batch = llama_cpp.llama_batch_init(n_tokens=n_batch, embd=0, n_seq_max=n_ctx)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: this function takes at least 3 arguments (0 given)
```
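For what it's worth, that final TypeError looks like the message ctypes raises when a plain ctypes-bound C function is called with keyword arguments only, which suggests the installed llama-cpp-python exposes llama_batch_init as a raw ctypes function while this guidance version calls it with keyword arguments. A minimal sketch of the difference (assuming such a raw binding; the argument values below are placeholders, not anything from the trace):

```python
import llama_cpp

# Keyword arguments are not mapped onto a plain ctypes function, so this
# call reproduces the error from the trace:
#   llama_cpp.llama_batch_init(n_tokens=512, embd=0, n_seq_max=8192)
#   -> TypeError: this function takes at least 3 arguments (0 given)

# Positional arguments reach the underlying C function as expected:
batch = llama_cpp.llama_batch_init(512, 0, 8192)
llama_cpp.llama_batch_free(batch)
```

If that reading is right, pinning or upgrading guidance and llama-cpp-python so the binding and the call convention match is likely the way out of this kind of mismatch.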
Answered by michael-conrad on Mar 14, 2024
Replies: 1 comment · Answer selected by michael-conrad