From 51296b432a938f0b9558b778a9374000b04e1743 Mon Sep 17 00:00:00 2001
From: hassaanQadir <86531769+hassaanQadir@users.noreply.github.com>
Date: Thu, 5 Sep 2024 14:24:22 -0500
Subject: [PATCH] Create inference.py (#36)

* Create inference.py

This is script 2 out of 2 accompanying my 30Aug24 Fine-Tuning Llama3 Article.

* Update inference.py

I added "Deploy to beam by running `$ python inference.py` in the terminal"

* Update inference.py

I added Deploy to beam by running `$ beam deploy inference.py:predict` in the terminal

* Update inference.py

modify Beam deploy command
---
 finetuning/llama/inference.py | 79 +++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 finetuning/llama/inference.py

diff --git a/finetuning/llama/inference.py b/finetuning/llama/inference.py
new file mode 100644
index 0000000..dc766eb
--- /dev/null
+++ b/finetuning/llama/inference.py
@@ -0,0 +1,79 @@
+# inference.py
+# Deploy to Beam by running `$ beam deploy inference.py:predict --name llama-ft` in the terminal
+
+from beam import Image, endpoint, env, Volume, QueueDepthAutoscaler
+
+MOUNT_PATH = "./llama-ft"
+FINETUNE_PATH = "./llama-ft/llama-finetuned"
+MODEL_PATH = "./llama-ft/weights"
+
+# This ensures that these packages are only loaded when the script is running remotely on Beam
+if env.is_remote():
+    from transformers import AutoTokenizer, AutoModelForCausalLM
+    from peft import PeftModel
+
+
+def load_finetuned_model():
+    global model, tokenizer, stop_token_ids
+    print("Loading latest...")
+
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_PATH, attn_implementation="eager", device_map="auto", is_decoder=True
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+
+    # Apply our LoRA fine-tuning result via the PEFT library
+    model = PeftModel.from_pretrained(model, FINETUNE_PATH)
+    print(model.config)
+
+    stop_token = "<|im_end|>"
+    stop_token_ids = tokenizer.encode(stop_token, add_special_tokens=False)
+
+
+
+@endpoint(
+    name="llama-inference",
+    on_start=load_finetuned_model,
+    volumes=[Volume(name="llama-ft", mount_path=MOUNT_PATH)],
+    cpu=1,
+    memory="16Gi",
+    # Inference can run on a smaller, more cost-effective GPU than fine-tuning
+    gpu="T4",
+    image=Image(
+        python_version="python3.9",
+        python_packages=["transformers==4.42.0", "torch", "peft"],
+    ),
+    # This autoscaler spawns new containers (up to 5) if the task queue depth exceeds 1
+    autoscaler=QueueDepthAutoscaler(max_containers=5, tasks_per_container=1),
+)
+
+def predict(**inputs):
+    global model, tokenizer, stop_token_ids  # Set by load_finetuned_model on container start
+
+    prompt = inputs.get("prompt", None)
+    if not prompt:
+        return {"error": "Please provide a prompt."}
+
+    # Format the user-provided prompt to match the chat template
+    # that the fine-tuning dataset established
+    prompt = f"<|im_start|>user\n{prompt}\n<|im_end|>\n<|im_start|>assistant\n"
+
+    # Tokenize the prompt; generation stops at the last token of <|im_end|>
+    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
+    output = model.generate(
+        input_ids,
+        max_length=100,
+        num_return_sequences=1,
+        use_cache=False,
+        eos_token_id=stop_token_ids[-1],
+        pad_token_id=tokenizer.eos_token_id,
+    )
+    # Trim the prompt tokens from the output so only the newly generated text is returned
+    text = tokenizer.decode(output[0][len(input_ids[0]) :])
+    print(text)
+
+    return {"text": text}
+
+if __name__ == "__main__":
+    predict.remote()
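
Usage sketch: once deployed, the endpoint accepts a JSON body with a "prompt" field and returns {"text": ...}. The client below is a minimal, hypothetical example; the URL and auth token are placeholders standing in for the deployment-specific values that Beam prints after running `beam deploy inference.py:predict --name llama-ft`.

# call_endpoint.py -- hypothetical client sketch; URL and token below are placeholders
import requests

ENDPOINT_URL = "https://<your-deployment>.app.beam.cloud"  # placeholder: use the URL printed on deploy
AUTH_TOKEN = "<your-beam-auth-token>"                       # placeholder: use your Beam auth token

response = requests.post(
    ENDPOINT_URL,
    headers={
        "Authorization": f"Bearer {AUTH_TOKEN}",
        "Content-Type": "application/json",
    },
    json={"prompt": "What does the fine-tuned model know?"},
    timeout=120,
)
response.raise_for_status()
# predict() returns {"text": ...}, so the generated text is under the "text" key
print(response.json()["text"])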