# Basic Flask stuff for building HTTP APIs and rendering HTML templates
from flask import Flask, render_template, redirect, url_for, request, session
# Bootstrap integration with flask so we can make pretty pages
from flask_bootstrap import Bootstrap
# Flask forms integrations which save insane amounts of time
from flask_wtf import FlaskForm
from wtforms import StringField, SubmitField, TextAreaField, SelectField, PasswordField, IntegerField, FloatField
from wtforms.validators import DataRequired
# Basic python stuff
import os
import json
import functools
import random
import threading
# Basic mongo python stuff
import pymongo
from bson import ObjectId
from bson import json_util
# Nice way to load environment variables for deployments
from dotenv import load_dotenv
# Instructor-large embedding model for creating vectors
from InstructorEmbedding import INSTRUCTOR
instructor_model = INSTRUCTOR('hkunlp/instructor-large')
# Use the wonderful llama.cpp library to execute our LLM (mistral-7b with dolphin fine tune)
from llama_cpp import Llama
llama_model = Llama(model_path="dolphin-2.1-mistral-7b.Q5_K_S.gguf", n_ctx=2048, use_mlock=False)
prompt_format = "<|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant:"
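# For example, once {system} and {prompt} are filled in, the model sees:
#   <|im_start|>system
#   You are a helpful assistant...<|im_end|>
#   <|im_start|>user
#   Answer the following question...<|im_end|>
#   <|im_start|>assistant: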
ban_token = "<|" # This is to prevent the model from leaking additional questions
data_lock = threading.Lock() # This is a lock to prevent the LLM library from being called concurrently
# Some default constants, feel free to change any of these!
# ----------------------------------------------------------
DEFAULT_SCORE_CUT = 0.9 # The score cutoff for instructor results; anywhere from 0.8 to 0.92 seems good
DEFAULT_TEMP = 0.1 # The LLM temperature value, 0.1 is deterministic results, 0.7 is more creative
DEFAULT_K = 100 # The over-request value for the ANN query. 100-200 is good.
DEFAULT_TOKENS = 64 # The default number of tokens for the LLM to produce. 64 is fast, 128 gives longer results.
# This is the default prompt with replaceable question (%q%) and data (%d%) tokens
DEFAULT_PROMPT = "Answer the following question \"%q%\" using only this data while ignoring any data irrelevant to this question: %d%"
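# e.g. DEFAULT_PROMPT.replace("%q%", question).replace("%d%", answers), exactly as done in get_rag() below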
# This is the default system message for controlling the LLM behavior
DEFAULT_SYSTEM = "You are a helpful assistant who will always answer the question with only the data provided and in 2 sentences"
# ----------------------------------------------------------
# Get environment variables
load_dotenv()
# Create the Flask app object
app = Flask(__name__)
# Load API key from .env file - super secure
if "API_KEY" in os.environ:
    api_key = os.environ["API_KEY"]
else:
    api_key = None
# Need this for storing anything in session object
if "SECRET_KEY" in os.environ:
app.config['SECRET_KEY'] = os.environ["SECRET_KEY"].strip()
else:
app.config['SECRET_KEY'] = "ohboyyoureallyshouldachangedthis"
# Connect to mongo using our loaded environment variables from the .env file
if "SPECUIMDBCONNSTR" in os.environ:
conn = os.environ["SPECUIMDBCONNSTR"].strip()
else:
conn = os.environ["MONGO_CON"].strip()
if "MONGO_DB" in os.environ:
database = os.environ["MONGO_DB"].strip()
else:
database = "specialists"
if "MONGO_COL" in os.environ:
collection = os.environ["MONGO_COL"].strip()
else:
collection = "ragtagchunks"
client = pymongo.MongoClient(conn)
db = client[database]
col = db[collection]
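# Each document in this collection is a "chunk" shaped roughly like this
# (inferred from how the chunk add/edit route below builds documents):
#   { "chunk_question": str, "chunk_answer": str, "chunk_enabled": bool,
#     "chunk_tokens": int, "chunk_embedding": [float, ...] }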
# Load users from .env file
if "USERS" in os.environ:
users_string = os.environ["USERS"]
users = json.loads(users_string)
else:
users = None
# Make it pretty because I can't :(
Bootstrap(app)
# Flask forms is magic
class ChunkForm(FlaskForm):
    chunk_question = StringField('Question', validators=[DataRequired()])
    chunk_answer = TextAreaField('Answer', validators=[DataRequired()])
    # SelectField submits strings, so use string choices and convert to bool later
    chunk_enabled = SelectField('Enabled', choices=[('True', 'True'), ('False', 'False')])
    submit = SubmitField('Submit')
# Amazing, I hate writing this stuff
class LoginForm(FlaskForm):
    username = StringField('Username', validators=[DataRequired()])
    password = PasswordField('Password', validators=[DataRequired()])
    submit = SubmitField('Login')
# Always have a search bar
class SearchForm(FlaskForm):
    search_string = StringField('Question/Answer Search', validators=[DataRequired()])
    submit = SubmitField('Submit')
# Always have a search bar - for vectors too
class VectorSearchForm(FlaskForm):
    search_string = StringField('Vector Search', validators=[DataRequired()])
    search_k = IntegerField("K Value", validators=[DataRequired()])
    search_score_cut = FloatField("Score Cut Off", validators=[DataRequired()])
    submit = SubmitField('Submit')
# Vector search but now for the chatbot LLM
class LLMForm(FlaskForm):
    question = StringField('Question', validators=[DataRequired()])
    search_k = IntegerField("K Value", validators=[DataRequired()])
    search_score_cut = FloatField("Score Cut Off", validators=[DataRequired()])
    llm_system = TextAreaField('System Message', validators=[DataRequired()])
    llm_prompt = TextAreaField('Prompt', validators=[DataRequired()])
    llm_temp = FloatField("Temperature", validators=[DataRequired()])
    llm_tokens = IntegerField("Number of tokens from LLM", validators=[DataRequired()])
    submit = SubmitField('Submit')
# Return embedding with instruction and text
def get_embedding(ins, text):
    return instructor_model.encode([[ins, text]]).tolist()[0]
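# Example usage - instructor-large should return a 768-dimension vector here,
# assuming the stock hkunlp/instructor-large configuration:
#   v = get_embedding("Represent the document for retrieval:", "Some chunk text")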
# Return the retrieval-augmented generation (RAG) result
def get_rag(question, search_k, search_score_cut, llm_prompt, llm_system, llm_temp, llm_tokens):
    # Get all the chunks
    print("GETTING CHUNKS", flush=True)
    chunks = list(vector_search_chunks(question, search_k, search_score_cut))
    answer_scores = []
    # Concatenate the chunk answers for the LLM and build up our answer scores for later
    answers = ""
    print("ANSWERS", flush=True)
    for answer in chunks:
        print("\tAnswer", flush=True)
        answers = answers + answer["chunk_answer"] + " "
        score_data = {"chunk_answer": answer["chunk_answer"], "score": answer["score"]}
        answer_scores.append(score_data)
    # Oh no! We have no chunks. Just return a generic "we can't help you".
    # Score cutoffs really help prevent LLM abuse. This is your first guardrail.
    if answers == "":
        return {"input": "no chunks found", "output": "No data was found to answer this question", "chunks": {}}
    # Replace the template tokens with the question and the answers
    prompt = llm_prompt.replace("%q%", question)
    prompt = prompt.replace("%d%", answers)
    # One more replacement step to give our chat model a system prompt and proper control tokens
    llm_result = {}
    llm_result["input"] = prompt_format.replace("{prompt}", prompt)
    llm_result["input"] = llm_result["input"].replace("{system}", llm_system)
    print("MODEL", flush=True)
    # Generate the LLM response and return the text, but only allow one call at a time
    with data_lock:
        llm_result["output"] = llama_model(llm_result["input"], max_tokens=llm_tokens, temperature=llm_temp)["choices"][0]["text"]
    # Find any banned tokens
    print("BAN", flush=True)
    index = llm_result["output"].find(ban_token)
    # Check if the ban token is found in the string
    print("TRIM", flush=True)
    if index != -1:
        # Trim the string, including the marker and everything after it
        print("\tTrim", flush=True)
        llm_result["output"] = llm_result["output"][:index]
    # Sure, throw the chunks in there too!
    llm_result["chunks"] = answer_scores
    return llm_result
# Approximate the number of tokens in the string by counting whitespace-separated
# words (useful for setting limits; real LLM tokenization will differ somewhat)
def token_count(text):
    words = text.split()  # Split the text into words using whitespace as the delimiter
    return len(words)
# Atlas search query for chunks
def search_chunks(search_string):
    search_query = [
        {
            "$search": {
                "text": {
                    "path": ["chunk_question", "chunk_answer"],
                    "query": search_string
                }
            }
        },
        {
            "$limit": 25
        },
        {
            "$project": {
                "_id": 1,
                "chunk_question": 1,
                "chunk_answer": 1,
                "chunk_enabled": 1,
                "score": {"$meta": "searchScore"}
            }
        }
    ]
    return col.aggregate(search_query)
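# NOTE: the $search text operator above assumes an Atlas Search index exists on
# this collection (a default dynamic index covering both string paths is enough).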
# Atlas vector search query for testing chunks semantically using embeddings
def vector_search_chunks(search_string, k, cut):
    print("GETEMBEDDING", flush=True)
    v = get_embedding("Represent the question for retrieving supporting documents:", search_string)
    search_query = [
        {
            "$search": {
                "knnBeta": {
                    "path": "chunk_embedding",
                    "vector": v,
                    "filter": {"equals": {"path": "chunk_enabled", "value": True}},
                    "k": int(k)
                }
            }
        },
        {
            "$limit": 5
        },
        {
            "$project": {
                "_id": 1,
                "chunk_question": 1,
                "chunk_answer": 1,
                "chunk_enabled": 1,
                "score": {"$meta": "searchScore"}
            }
        },
        {
            "$match": {"score": {"$gte": float(cut)}}
        }
    ]
    print("VECQUERY", flush=True)
    return col.aggregate(search_query)
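# NOTE: knnBeta assumes an Atlas Search index with a knnVector mapping on
# chunk_embedding, roughly like this (768 dimensions to match instructor-large):
#   {
#     "mappings": {
#       "dynamic": true,
#       "fields": {
#         "chunk_embedding": {"type": "knnVector", "dimensions": 768, "similarity": "cosine"}
#       }
#     }
#   }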
# Define a decorator to check if the user is authenticated
# functools.wraps copies the view's name and metadata onto the wrapper so
# Flask's routing and error pages still see the original view
def login_required(view):
    @functools.wraps(view)
    def wrapped_view(**kwargs):
        if users is not None:
            if session.get("user") is None:
                return redirect(url_for('login'))
        return view(**kwargs)
    return wrapped_view
# The default chunk view with pagination and lexical search
@app.route('/', methods=['GET', 'POST'])
@login_required
def index():
    # We're doing a lexical search here
    form = SearchForm()
    if request.method == "POST":
        form_result = request.form.to_dict(flat=True)
        chunks = search_chunks(form_result["search_string"])
        return render_template('search.html', chunks=chunks)
    # Get the chunks!
    chunk_query = col.find().skip(0).limit(50)
    chunks = []
    for chunk_item in chunk_query:
        chunks.append(chunk_item)
    # Spit out the template
    return render_template('index.html', chunks=chunks, form=form)
# We use this for doing semantic search testing on the chunks
@app.route('/test', methods=['GET', 'POST'])
@login_required
def test():
    # No chunks by default
    chunks = []
    # We're doing a vector search here
    form = VectorSearchForm(search_k=DEFAULT_K, search_score_cut=DEFAULT_SCORE_CUT)
    if request.method == "POST":
        form_result = request.form.to_dict(flat=True)
        chunks = vector_search_chunks(form_result["search_string"], form_result["search_k"], form_result["search_score_cut"])
        return render_template('test.html', chunks=chunks, form=form)
    # Spit out the template
    return render_template('test.html', chunks=chunks, form=form)
# We use this for testing the full RAG pipeline against the LLM
@app.route('/llm', methods=['GET', 'POST'])
@login_required
def llm():
    # We're doing a vector search here, then feeding the chunks to the LLM
    chunks = []
    form = LLMForm(search_k=DEFAULT_K, search_score_cut=DEFAULT_SCORE_CUT, llm_prompt=str(DEFAULT_PROMPT), llm_system=str(DEFAULT_SYSTEM), llm_temp=DEFAULT_TEMP, llm_tokens=DEFAULT_TOKENS)
    if request.method == "POST":
        form_result = request.form.to_dict(flat=True)
        llm_response = get_rag(form_result["question"], form_result["search_k"], form_result["search_score_cut"], form_result["llm_prompt"], form_result["llm_system"], float(form_result["llm_temp"]), int(form_result["llm_tokens"]))
        return render_template('llm.html', chunks=llm_response["chunks"], form=form, llm_response=llm_response["output"], prompt=llm_response["input"])
    # Spit out the template
    return render_template('llm.html', chunks=chunks, form=form)
# Create or edit chunks. Basic CRUD functionality.
@app.route('/chunk', methods=['GET', 'POST'])
@app.route('/chunk/<id>', methods=['GET', 'POST'])
@login_required
def chunk(id=None):
    # This is the input form we want to load for doing chunk add/edit
    form = ChunkForm()
    # POST means we're getting a completed form
    if request.method == "POST":
        # Get the form result back and clean up the data set
        form_result = request.form.to_dict(flat=True)
        form_result.pop('csrf_token')
        form_result.pop('submit')
        # Change the text True/False to a proper python type - Thanks WTForms :(
        if form_result["chunk_enabled"] == "True":
            form_result["chunk_enabled"] = True
        if form_result["chunk_enabled"] == "False":
            form_result["chunk_enabled"] = False
        embed_text = form_result["chunk_question"] + " " + form_result["chunk_answer"]
        # Stuff the token count and the embedding into the form result
        form_result["chunk_tokens"] = token_count(embed_text)
        form_result["chunk_embedding"] = get_embedding("Represent the document for retrieval:", embed_text)
        # Store the result in the mongo collection
        if id:
            col.replace_one({'_id': ObjectId(id)}, form_result)
        else:
            col.insert_one(form_result)
        # Back to the chunk view
        return redirect("/")
    else:
        # This is if we got passed a mongo document ID and we need to edit it.
        # Load the doc up and render the edit form.
        if id:
            chunk = col.find_one({'_id': ObjectId(id)})
            form.chunk_question.data = chunk["chunk_question"]
            form.chunk_answer.data = chunk["chunk_answer"]
            # SelectField data is a string, so convert the stored bool back
            form.chunk_enabled.data = str(chunk["chunk_enabled"])
    return render_template('chunk.html', form=form)
# This chunk is bad, we need to make it feel bad
@app.route('/chunk_disable/<id>')
@login_required
def chunk_disable(id):
    update_doc = {
        "chunk_enabled": False
    }
    col.update_one({'_id': ObjectId(id)}, {"$set": update_doc})
    return redirect('/')
# This chunk is really bad, so delete it outright
@app.route('/chunk_delete/<id>')
@login_required
def chunk_delete(id):
    col.delete_one({'_id': ObjectId(id)})
    return redirect('/')
# Login/logout routes that rely on the user being stored in session
@app.route('/login', methods=['GET', 'POST'])
def login():
    form = LoginForm()
    if form.validate_on_submit():
        # Guard against USERS being unset so the membership check can't blow up
        if users is not None and form.username.data in users:
            if form.password.data == users[form.username.data]:
                session["user"] = form.username.data
                return redirect(url_for('index'))
    return render_template('login.html', form=form)
# We finally have a link for this now!
@app.route('/logout')
def logout():
    session["user"] = None
    return redirect(url_for('login'))
# API endpoint for sending a question and getting the LLM output (RAG)
# This is what you want to call from your website, slack or discord bot.
@app.route('/api/rag')
def api_rag():
    key = request.args.get("key")
    q = request.args.get("q")
    # Make sure we have a valid key and question
    if not q:
        return {'error': 'No q parameter found. You must ask a question - /api/rag?q=<string>'}
    if (key != api_key) and (api_key is not None):
        return {'error': 'API key does not match'}
    # Get the LLM result for the question with default settings then return it
    llm_response = get_rag(q, DEFAULT_K, DEFAULT_SCORE_CUT, DEFAULT_PROMPT, DEFAULT_SYSTEM, DEFAULT_TEMP, DEFAULT_TOKENS)
    return llm_response
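# Example call, assuming the Flask dev server on its default port:
#   curl "http://127.0.0.1:5000/api/rag?q=What+is+a+chunk%3F&key=$API_KEY"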
# API endpoint that mimics the response shape of the previous one,
# so you can test API clients quickly without invoking the LLM
@app.route('/api/ragfake')
def api_ragfake():
    key = request.args.get("key")
    q = request.args.get("q")
    # Make sure we have a valid key and question
    if not q:
        return {'error': 'No q parameter found. You must ask a question - /api/ragfake?q=<string>'}
    if (key != api_key) and (api_key is not None):
        return {'error': 'API key does not match'}
    resp = {}
    resp["input"] = q
    resp["output"] = "this is a mock response from the api."
    resp["chunks"] = []
    for i in range(3):
        c = {}
        c["chunk_answer"] = "This is chunk answer number " + str(i)
        c["score"] = random.random()
        resp["chunks"].append(c)
    return resp
# API endpoint for getting a text embedding from instructor.
@app.route('/api/vector')
def api_vector():
    key = request.args.get("key")
    q = request.args.get("q")
    # Make sure we have a valid key and question
    if not q:
        return {'error': 'No q parameter found. You must provide a string to vectorize - /api/vector?q=<string>'}
    if (key != api_key) and (api_key is not None):
        return {'error': 'API key does not match'}
    # Get the vector result for the string
    return get_embedding("Represent the question for retrieving supporting documents:", q)
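# Example call, again assuming a local dev server:
#   curl "http://127.0.0.1:5000/api/vector?q=some+text+to+vectorize&key=$API_KEY"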
# API Endpoint to dump all stored chunks
@app.route('/api/list')
def api_list():
    key = request.args.get("key")
    # Make sure we have a valid key
    if (key != api_key) and (api_key is not None):
        return {'error': 'API key does not match'}
    # Get the chunks!
    chunk_query = col.find().skip(0).limit(50)
    chunks = []
    for chunk_item in chunk_query:
        chunks.append(chunk_item)
    return json.loads(json_util.dumps(chunks))
# API Endpoint to perform lexical search
@app.route('/api/search')
def api_search():
    key = request.args.get("key")
    q = request.args.get("q")
    # Make sure we have a valid key and question
    if not q:
        return {'error': 'No q parameter found. You must ask a question - /api/search?q=<string>'}
    if (key != api_key) and (api_key is not None):
        return {'error': 'API key does not match'}
    chunks = search_chunks(q)
    return json.loads(json_util.dumps(chunks))
# API Endpoint to perform vector search
@app.route('/api/vector_search')
def api_vector_search():
    key = request.args.get("key")
    q = request.args.get("q")
    # Make sure we have a valid key and question
    if not q:
        return {'error': 'No q parameter found. You must ask a question - /api/vector_search?q=<string>'}
    if (key != api_key) and (api_key is not None):
        return {'error': 'API key does not match'}
    chunks = vector_search_chunks(q, DEFAULT_K, DEFAULT_SCORE_CUT)
    return json.loads(json_util.dumps(chunks))
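# Hypothetical local entry point - not part of the original file; in practice
# you would typically launch this with `flask run` or a WSGI server instead
if __name__ == "__main__":
    app.run(host="127.0.0.1", port=5000)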