streamlittest.py
import streamlit as st
import torch
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Quick check that CUDA is visible before loading the model
print(torch.cuda.is_available())
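# Assumed runtime dependencies (inferred from the imports above, not stated in the
# original script): streamlit, langchain, llama-cpp-python (built with CUDA for GPU
# offload), chromadb, sentence-transformers and torch.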
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
{context}
Question: {question}
Helpful Answer:"""
prompt = PromptTemplate(template=template, input_variables=["context","question"])
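# {context} is filled with the retrieved document chunks and {question} with the
# user's query; the prompt is wired into the RetrievalQA chain further below.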
n_gpu_layers = 80  # Change this value based on your model and your GPU VRAM pool.
n_batch = 512  # Should be between 1 and n_ctx; consider the amount of VRAM in your GPU.

# Load the local Llama 2 model via llama.cpp
llm = LlamaCpp(
    model_path=r"F:\Aakash\models\TheBloke\Llama-2-7B-Chat-GGUF\llama-2-7b-chat.Q8_0.gguf",
    max_tokens=1024,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    verbose=True,
    n_ctx=4096,  # Context window
    stop=['USER:'],  # Stop generation when this token is detected.
    temperature=0.4,
)
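# Note (assumption, not from the original script): Llama-2-7B has 32 transformer
# layers, so n_gpu_layers=80 effectively offloads the whole model to the GPU.
# A quick, optional smoke test of the loaded model could look like:
#   print(llm("Q: What is retrieval-augmented generation? A:"))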
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = {"device": "cuda"}
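# Optional tweak (not in the original script): BGE embeddings are usually
# L2-normalised; HuggingFaceEmbeddings also accepts
#   encode_kwargs={"normalize_embeddings": True}
# if cosine-style similarity is wanted in the vector store.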
def generate_response(uploaded_file, query_text):
    # Load document if file is uploaded
    if uploaded_file is not None:
        documents = [uploaded_file.read().decode()]
        # Split documents into chunks
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        texts = text_splitter.create_documents(documents)
        # Select embeddings
        embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
        # Create a vectorstore from documents
        db = Chroma.from_documents(texts, embeddings)
        # Create retriever interface
        retriever = db.as_retriever()
        # Create QA chain, using the custom prompt defined above
        qa = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type='stuff',
            retriever=retriever,
            chain_type_kwargs={'prompt': prompt},
        )
        return qa.run(query_text)
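# Note: the document is re-embedded and the Chroma index rebuilt on every submit.
# A possible optimisation (assumption, not part of the original script) is to cache
# the embedding model across Streamlit reruns, e.g.:
#
#   @st.cache_resource
#   def load_embeddings():
#       return HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)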
# Page title
st.set_page_config(page_title='Ask questions on your document')
st.title('Ask questions on your document')
# File upload
uploaded_file = st.file_uploader('Upload an article')
# Query text
query_text = st.text_input('Enter your question:', placeholder = 'Please provide a short summary.', disabled=not uploaded_file)
# Form input and query
result = []
with st.form('myform', clear_on_submit=True):
    submitted = st.form_submit_button('Submit', disabled=not (uploaded_file and query_text))
    if submitted:
        with st.spinner('Calculating...'):
            response = generate_response(uploaded_file, query_text)
            result.append(response)

if len(result):
    st.info(result[0])