# Databricks notebook source
# MAGIC %md
# MAGIC # Evaluations
# MAGIC Running evaluations on RAG applications is still more art than science \
# MAGIC We will use llama_index to help generate evaluation questions \
# MAGIC and to score responses with its built-in assessment prompts
# COMMAND ----------
# MAGIC %pip install llama_index==0.10.25 langchain==0.1.13 llama-index-llms-langchain llama-index-embeddings-langchain
# COMMAND ----------
# restart Python so the freshly installed packages are picked up
dbutils.library.restartPython()
# COMMAND ----------
import os
import pandas as pd
import nest_asyncio
# needed for llama_index's async calls to work inside the notebook's event loop
nest_asyncio.apply()
# COMMAND ----------
# MAGIC %md
# MAGIC # Intro to Llama Index
# MAGIC Much like langchain, llama_index is an orchestration layer for LLM logic \
# MAGIC Where they differ is that llama_index is much more focused on RAG and intelligent indexing \
# MAGIC Langchain is more generalist and has focused on enabling complex workflows
# MAGIC
# MAGIC Llama Index has a few key concepts we will use in this notebook:
# MAGIC - Settings - a global configuration object holding the default llm / embedding models (it replaces the older Service Context wrapper)
# MAGIC - An Index - the core of llama_index. At its base, an index is a structure of nodes, each holding a chunk of text and its embedding (sketched in the next cell)
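# COMMAND ----------
# MAGIC %md
# MAGIC As a quick illustration of the node concept, here is a hand-built node. This is only a sketch; the text and metadata values are made-up placeholders, and in the workflow below llama_index builds nodes for us when it chunks documents
# COMMAND ----------
from llama_index.core.schema import TextNode
# a minimal node: some text plus optional metadata (an embedding gets attached at index time)
example_node = TextNode(
    text="A neural network is a function approximator built from layers.",
    metadata={"source": "example.txt"},  # placeholder metadata
)
print(example_node.get_content())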
# COMMAND ----------
# MAGIC %run ./utils
# COMMAND ----------
# MAGIC %md
# MAGIC ## Setting up Llama Index default models
# MAGIC If we don't set new defaults, llama_index will go to OpenAI by default
# COMMAND ----------
from langchain_community.chat_models import ChatDatabricks
from langchain_community.embeddings import DatabricksEmbeddings
from llama_index.core import Settings
from llama_index.llms.langchain import LangChainLLM
from llama_index.embeddings.langchain import LangchainEmbedding
embedding_model = 'databricks-bge-large-en'
model_name = 'databricks-dbrx-instruct'
llm_model = ChatDatabricks(
    target_uri='databricks',
    endpoint=model_name,
    temperature=0.1
)
embeddings = DatabricksEmbeddings(endpoint=embedding_model)
llama_index_chain = LangChainLLM(llm=llm_model)
llama_index_embeddings = LangchainEmbedding(langchain_embeddings=embeddings)
Settings.llm = llama_index_chain
Settings.embed_model = llama_index_embeddings
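# COMMAND ----------
# MAGIC %md
# MAGIC A quick sanity check that the new defaults are wired up. This is a sketch; the prompt and the length check are arbitrary, but both calls should now route through the Databricks endpoints configured above
# COMMAND ----------
# the default llm should respond via the Databricks endpoint
print(Settings.llm.complete('Reply with the single word: ok'))
# the default embedding model should return a vector; we just check its size
print(len(Settings.embed_model.get_text_embedding('test sentence')))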
# COMMAND ----------
# MAGIC %md
# MAGIC # Load and Chunk Documents
# MAGIC We will load a sample doc to test on, starting with a naive default chunking strategy
# MAGIC
# COMMAND ----------
vol_path = f'/Volumes/{db_catalog}/{db_schema}/{db_volume}/'
# validate we have files
os.listdir(vol_path)
# COMMAND ----------
from llama_index.core import (
    SimpleDirectoryReader, VectorStoreIndex, Response
)
reader = SimpleDirectoryReader(vol_path)
documents = reader.load_data()
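# COMMAND ----------
# MAGIC %md
# MAGIC A quick look at what the reader produced before we index it. A sanity-check sketch; the 500-character preview length is arbitrary
# COMMAND ----------
print(f'Loaded {len(documents)} documents')
print(documents[0].metadata)
print(documents[0].text[:500])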
# COMMAND ----------
# we are just setting up a simple in-memory vector store here
index = VectorStoreIndex.from_documents(documents)
# and turning it into a query engine
query_engine = index.as_query_engine()
# Let's validate that it is all working
reply = query_engine.query('what is a neural network?')
print(reply.response)
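# COMMAND ----------
# MAGIC %md
# MAGIC The index above relied on llama_index's default chunking. If we want explicit control, we can split the documents into nodes ourselves and index those instead. This is a sketch; the chunk_size / chunk_overlap values are illustrative, not tuned
# COMMAND ----------
from llama_index.core.node_parser import SentenceSplitter
# chunk explicitly, then build the index from the resulting nodes
splitter = SentenceSplitter(chunk_size=512, chunk_overlap=64)
nodes = splitter.get_nodes_from_documents(documents)
custom_index = VectorStoreIndex(nodes)
custom_reply = custom_index.as_query_engine().query('what is a neural network?')
print(custom_reply.response)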
# COMMAND ----------
# MAGIC %md
# MAGIC # Build out evaluation Questions
# MAGIC In order to run evaluations we need feasible questions to feed the model \
# MAGIC It is time-consuming to construct questions manually, so we will use an LLM to do this \
# MAGIC Note that this will have limitations, namely in the types of questions it will generate
# COMMAND ----------
from llama_index.core.evaluation import DatasetGenerator
data_generator = DatasetGenerator.from_documents(documents)
# this is the call that generates the questions
# if you set num it will run multithreaded and be faster
eval_questions = data_generator.generate_questions_from_nodes(num=64)
eval_questions
# Some of these questions may not be very useful. That could be due to the model we use for generation
# It could also be that the source chunk was particularly bad
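# COMMAND ----------
# MAGIC %md
# MAGIC One cheap mitigation is filtering the generated set with simple heuristics before use. A sketch: the rules below (must end in a question mark, must not be trivially short) are illustrative assumptions, not a vetted recipe
# COMMAND ----------
# drop generated strings that are clearly not usable questions
filtered_questions = [
    q.strip() for q in eval_questions
    if q.strip().endswith('?') and len(q.split()) > 4
]
print(f'Kept {len(filtered_questions)} of {len(eval_questions)} questions')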
# COMMAND ----------
# When running in a lab env we may pregenerate these ahead of class and store them for reloading
#question_frame = spark.sql(f"SELECT * FROM {db_catalog}.{db_schema}.evaluation_questions").toPandas()
question_frame = pd.DataFrame(eval_questions, columns=["eval_questions"])
dataframe = spark.createDataFrame(question_frame)
dataframe.write.mode("overwrite").saveAsTable(f"{db_catalog}.{db_schema}.evaluation_questions")
display(dataframe)
# COMMAND ----------
# MAGIC %md
# MAGIC # Use Questions to generate evaluations
# MAGIC Now that we have our questions, we need to generate some responses
# MAGIC
# MAGIC This next step can be slow so we will cut it down to 20 questions \
# MAGIC We can then use the `RelevancyEvaluator`, which checks whether the query is answered by the response
# COMMAND ----------
import pandas as pd
from llama_index.core.evaluation import RelevancyEvaluator
from llama_index.core.evaluation import EvaluationResult
eval_questions = eval_questions[0:20]
# Yes, we are using an LLM to evaluate an LLM
## When doing this for real you might use a more powerful (but more expensive) evaluator model
## to assess the quality of the responses
evaluator = RelevancyEvaluator(llm=llama_index_chain)
# define a jupyter display helper for one evaluation result
def display_eval_df(
    query: str, response: Response, eval_result: EvaluationResult
) -> None:
    eval_df = pd.DataFrame(
        {
            "Query": query,
            "Response": str(response),
            "Source": response.source_nodes[0].node.text[:1000] + "...",
            "Evaluation Result": "Pass" if eval_result.passing else "Fail",
            "Reasoning": eval_result.feedback,
        },
        index=[0],
    )
    eval_df = eval_df.style.set_properties(
        **{
            "inline-size": "600px",
            "overflow-wrap": "break-word",
        },
        subset=["Response", "Source"]
    )
    display(eval_df)
# COMMAND ----------
query_str = "What is the best approach to finetuning llms?"
query_engine = index.as_query_engine()
response_vector = query_engine.query(query_str)
eval_result = evaluator.evaluate_response(
    query=query_str, response=response_vector
)
# COMMAND ----------
display_eval_df(query_str, response_vector, eval_result)
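# COMMAND ----------
# MAGIC %md
# MAGIC The cell above scores a single hand-written query. To get an aggregate signal, we can loop over the generated questions and compute a pass rate. A sketch only; the sample of 5 questions is just to keep runtime down
# COMMAND ----------
# evaluate a small sample of the generated questions and report the pass rate
sample_questions = eval_questions[:5]
passing = []
for question in sample_questions:
    response = query_engine.query(question)
    result = evaluator.evaluate_response(query=question, response=response)
    passing.append(result.passing)
print(f'Passed {sum(passing)} of {len(passing)} ({sum(passing) / len(passing):.0%})')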