-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
340 lines (277 loc) · 14.4 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
import streamlit as st
from dotenv import load_dotenv
from supabase import create_client, Client
from sentence_transformers import SentenceTransformer
import numpy as np
from sqlalchemy import create_engine
from htmlTemplates import css, bot_template, user_template
from langchain.llms import HuggingFaceHub
from PyPDF2 import PdfReader
from transformers import pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
import requests
import re
import httpx
import logging
import os
from google.auth import default
from google.auth.transport.requests import Request
import google.generativeai as genai
# --- Configuration ------------------------------------------------------------
# All credentials come from Streamlit's secrets store (.streamlit/secrets.toml).
# For local development the same names can be placed in the secrets file; the
# old dotenv-based loading was removed in favour of a single source of truth.
# NOTE(review): a hard-coded API key previously sat in a comment here; it was
# removed from the source — any key that was committed should be rotated.
DATABASE_URL = st.secrets['DATABASE_URL']
SUPABASE_URL = st.secrets["SUPABASE_URL"]
SUPABASE_KEY = st.secrets["SUPABASE_KEY"]
api_token = st.secrets["API_TOKEN"]  # HuggingFace API token (legacy; kept for compatibility)
api_key = st.secrets["API_KEY"]      # Google Generative AI (Gemini) key

# Shared module-level handles: SQLAlchemy engine and Supabase client.
engine = create_engine(DATABASE_URL)
supabase_client: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
def test_connection():
    """Probe the Supabase URL and surface connection failures in the UI.

    Returns:
        bool: True when an HTTP response was obtained, False when the
        request raised (DNS failure, timeout, refused connection, ...).
    """
    try:
        # The response body is irrelevant; we only care that the host answers.
        httpx.get(SUPABASE_URL)
    except Exception as e:  # broad on purpose: any transport failure must be shown
        st.error(f"Failed to connect to Supabase URL: {e}")
        return False
    return True
def get_pdf_text(pdf_docs):
    """Extract and clean the concatenated text of all uploaded PDFs.

    Args:
        pdf_docs: iterable of file-like objects accepted by PyPDF2.PdfReader
            (e.g. Streamlit UploadedFile objects).

    Returns:
        str: the cleaned text of every page, or None if extraction failed.
    """
    text = ""
    try:
        for pdf in pdf_docs:
            pdf_reader = PdfReader(pdf)
            for page in pdf_reader.pages:
                # extract_text() returns None for image-only/empty pages;
                # guard so the concatenation does not raise TypeError.
                page_text = page.extract_text()
                if page_text:
                    text += page_text
        text = clean_text(text)  # Clean the extracted text
    except Exception as e:
        st.error(f"Error reading PDF: {e}")
        text = None  # Signal failure to the caller
    return text
def get_text_chunks(text):
    """Split *text* into overlapping chunks for embedding.

    Uses LangChain's RecursiveCharacterTextSplitter with 5000-character
    chunks and a 100-character overlap; returns a list of strings.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=5000,
        chunk_overlap=100,
        length_function=len,
    )
    return splitter.split_text(text)
def get_embeddings(text_chunks):
    """Encode *text_chunks* with all-MiniLM-L6-v2.

    The SentenceTransformer model is loaded once and cached on the function
    object instead of being re-initialised (and potentially re-downloaded)
    on every call — model construction is by far the dominant cost here.

    Args:
        text_chunks: list of strings to embed.

    Returns:
        Tensor of shape (len(text_chunks), embedding_dim).
    """
    model = getattr(get_embeddings, "_model", None)
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')
        get_embeddings._model = model
    return model.encode(text_chunks, convert_to_tensor=True)
def find_top_chunks(user_query, content, content_embeddings, top_n=3):
    """Return up to *top_n* chunks of *content* most similar to *user_query*.

    Args:
        user_query: the question text.
        content: list of text chunks.
        content_embeddings: precomputed embeddings for *content* (NumPy
            array); when empty, *content* is encoded on the fly.
        top_n: maximum number of chunks to return.

    Returns:
        list[str]: the best-matching chunks, most similar first.
    """
    # NOTE(review): the model is still constructed per call here; consider
    # sharing a cached instance once this path is confirmed hot.
    model = SentenceTransformer('all-MiniLM-L6-v2')
    # Encode to NumPy directly: stored embeddings arrive as NumPy arrays,
    # and mixing torch tensors with np.dot is fragile.
    query_embedding = model.encode([user_query], convert_to_tensor=False)
    if len(content_embeddings) == 0:
        content_embeddings = model.encode(content, convert_to_tensor=False)
    matrix = np.asarray(content_embeddings)
    query_vec = np.asarray(query_embedding)
    # atleast_1d: with a single chunk, squeeze() yields a 0-d array that
    # np.argsort cannot sort.
    similarities = np.atleast_1d(np.dot(matrix, query_vec.T).squeeze())
    top_indices = np.argsort(similarities)[-top_n:][::-1]
    return [content[idx] for idx in top_indices]
def clean_text(text):
    """Normalise extracted PDF text.

    Flattens newlines, collapses runs of whitespace to single spaces,
    drops every character that is not a word character, whitespace, or
    one of '.', ',', '@', '-', and strips surrounding whitespace.
    """
    flattened = text.replace('\n', ' ')
    collapsed = re.sub(r'\s+', ' ', flattened)
    filtered = re.sub(r'[^\w\s.,@-]', '', collapsed)
    return filtered.strip()
def extract_chunks(texts):
    """Run clean_text over every chunk in *texts* and return the cleaned list."""
    return [clean_text(chunk) for chunk in texts]
def handle_userinput(user_question):
    """Answer *user_question* from the current user's stored PDF chunks.

    Fetches content and embeddings for st.session_state['id'] from the
    Supabase 'pdfs' table, retrieves the most relevant chunks, asks the
    Gemini model for an answer, and renders the exchange with the HTML
    chat templates. Errors are reported through st.error and abort early.
    """
    import ast  # local: only needed to decode legacy string-encoded content

    id_value = st.session_state.get('id')  # Safely access session state
    if id_value is None:
        st.error("User ID is not set. Please set your user ID first.")
        return

    response = supabase_client.table('pdfs').select('embeddings', 'content').eq('id', id_value).execute()
    if not response.data:
        st.error("No data found for the provided user ID.")
        return

    data = response.data[0]
    content = data.get('content', [])
    content_embeddings = np.array(data.get('embeddings', []))

    # Legacy rows stored the content list as its repr() string. Decode it
    # with ast.literal_eval rather than eval(): the value comes from the
    # database and must never be executed as arbitrary code.
    if isinstance(content, str):
        try:
            content = ast.literal_eval(content)
        except Exception as e:
            st.error(f"Content is not in the expected format: {e}")
            return

    # Ensure content is a list of strings
    if not isinstance(content, list):
        content = [content]

    best_chunk = find_top_chunks(user_question, content, content_embeddings)
    best_chunk = extract_chunks(best_chunk)

    # Prompt assembled from the query plus the retrieved chunks.
    input_text = f"You are an AI language model designed for a Retrieval-Augmented Generation (RAG) application. You will receive a query along with relevant chunks of text. Based on these chunks, generate a coherent and accurate response to the query. Query: {user_question} Relevant Chunks: {best_chunk}"

    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-1.0-pro')
    output = model.generate_content(input_text)
    response = output.text

    # Only the latest exchange is kept; history is replaced each turn.
    st.session_state.chat_history = [
        {"role": "user", "content": user_question},
        {"role": "bot", "content": response},
    ]
    for message in st.session_state.chat_history:
        if message['role'] == 'user':
            st.write(user_template.replace("{{MSG}}", message['content']), unsafe_allow_html=True)
        else:
            st.write(bot_template.replace("{{MSG}}", message['content']), unsafe_allow_html=True)
def fetch_user_data(id_value):
    """Return True when a 'pdfs' row with *id_value* exists; error the UI otherwise."""
    result = supabase_client.table('pdfs').select('id').eq('id', id_value).limit(1).execute()
    if not result.data:
        st.error(f"No data found for ID, try again!: {id_value}")
        return False
    return True
def is_id_unique(new_id_value):
    """Return True when no 'pdfs' row already uses *new_id_value* as its id."""
    matches = supabase_client.table('pdfs').select('id').eq('id', new_id_value).execute()
    # Unique exactly when the lookup came back empty.
    return len(matches.data) == 0
def main():
    """Streamlit entry point.

    Sidebar offers three flows: register a new user ID with fresh PDFs,
    append PDFs to an existing ID, or continue with previously stored
    documents. Once documents are processed, the main panel accepts
    questions routed through handle_userinput().

    Fixes over the previous revision:
    - the old-user file uploader now uses the ``key_old_user`` variable
      instead of the stray string literal ``'key_old_user'``;
    - ``response.data[0]`` is guarded so an unknown old ID reports an
      error instead of raising IndexError;
    - legacy string-encoded content is decoded with ast.literal_eval, not
      eval(), since it comes from the database;
    - success messages are emitted only on the try/else success path.
    """
    import ast  # local: only needed to decode legacy string-encoded content

    load_dotenv()
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")

    # Stable widget keys for the two file uploaders.
    key_old_user = "file_uploader_old_user"
    key_new_user = "file_uploader_new_user"

    # Session-state defaults (survive Streamlit reruns).
    if "id" not in st.session_state:
        st.session_state.id = None
    if "user_pdf_docs" not in st.session_state:
        st.session_state.user_pdf_docs = None
    if "existing_content" not in st.session_state:
        st.session_state.existing_content = []
    if "pdf_processed" not in st.session_state:
        st.session_state.pdf_processed = False

    st.header("Chat with multiple PDFs :books:")
    st.subheader("Welcome to the AI-powered document chatbot!")

    with st.sidebar:
        user_type = st.radio("Are you a new user or an old user?", ("New User", "Old User", "Continue with previous docs"))

        if user_type == "New User":
            new_id_value = st.text_input("Enter your ID", value="")
            new_user_pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True, key=key_new_user)
            if st.button("Process New User Data"):
                if new_id_value and new_user_pdf_docs:
                    if is_id_unique(new_id_value):
                        with st.spinner("Processing"):
                            try:
                                raw_text = get_pdf_text(new_user_pdf_docs)
                                text_chunks = get_text_chunks(raw_text)
                                embeddings = get_embeddings(text_chunks)
                                embedding_list = embeddings.tolist()
                                st.session_state.id = new_id_value
                                data = {'id': new_id_value, 'content': text_chunks, 'embeddings': embedding_list}
                                supabase_client.table('pdfs').insert(data).execute()
                                st.session_state.pdf_processed = True
                            except Exception as e:
                                st.error(f"Error occurred in processing new ID and new PDFs: {e}")
                            else:
                                st.success("Processing complete!")
                                st.write("You can now ask a question to the chatbot.")
                    else:
                        st.error("This User ID already exists. Please provide a unique ID.")
                else:
                    st.error("Please provide both ID and PDFs to proceed.")

        elif user_type == "Old User":
            old_id_value = st.text_input("Enter your old ID", value="")
            old_user_pdf_docs = st.file_uploader("Upload your PDFs and click on 'Process'", accept_multiple_files=True, key=key_old_user)
            if old_user_pdf_docs:
                st.session_state.user_pdf_docs = old_user_pdf_docs
            if st.button("Process Data"):
                st.session_state.id = old_id_value
                # Fetch existing data associated with old_id_value.
                response = supabase_client.table('pdfs').select('content', 'embeddings').eq('id', st.session_state.id).execute()
                if not response.data:
                    # Guard: an unknown ID previously crashed on data[0].
                    st.error(f"No data found for the provided ID: {old_id_value}")
                else:
                    existing_data = response.data[0]
                    existing_content = existing_data.get('content', [])
                    if isinstance(existing_content, str):
                        # Legacy rows stored the list as its repr(); decode
                        # safely — never eval() database content.
                        existing_content = ast.literal_eval(existing_content)
                    if not isinstance(existing_content, list):
                        existing_content = [existing_content]
                    st.session_state.existing_content = existing_content
                    existing_embeddings = np.array(existing_data['embeddings']) if 'embeddings' in existing_data else np.array([])

                    if st.session_state.user_pdf_docs:
                        with st.spinner("Processing"):
                            try:
                                new_raw_text = get_pdf_text(st.session_state.user_pdf_docs)
                                new_text_chunks = get_text_chunks(new_raw_text)
                                # Ensure new_text_chunks is always a list of strings.
                                if isinstance(new_text_chunks, str):
                                    new_text_chunks = [new_text_chunks]
                                existing_content.extend(new_text_chunks)
                                new_embeddings = get_embeddings(new_text_chunks)
                                combined_embeddings = np.concatenate([existing_embeddings, new_embeddings], axis=0)
                                data = {
                                    'id': st.session_state.id,
                                    'content': existing_content,
                                    'embeddings': combined_embeddings.tolist(),
                                }
                                # Replace the row: delete then re-insert the merged data.
                                supabase_client.table('pdfs').delete().eq('id', st.session_state.id).execute()
                                supabase_client.table('pdfs').insert(data).execute()
                                st.session_state.pdf_processed = True
                            except Exception as e:
                                st.error(f"An error occurred with processing old ID and new documents: {e}")
                            else:
                                st.success("You can now ask a question to the chatbot.")

        elif user_type == "Continue with previous docs":
            old_id_value = st.text_input("Enter your old ID", value="")
            st.session_state.id = old_id_value
            if st.button("Process the old documents"):
                with st.spinner("Processing"):
                    try:
                        # Only verify that stored data exists for this ID;
                        # handle_userinput() re-fetches it per question.
                        response = supabase_client.table('pdfs').select('content', 'embeddings').eq('id', old_id_value).execute()
                        if response.data:
                            st.session_state.pdf_processed = True
                        else:
                            st.error(f"No data found for the provided ID: {old_id_value}")
                    except Exception as e:
                        st.error(f"An error occurred with processing old ID and new documents: {e}")
                if st.session_state.pdf_processed:
                    st.success("You can now ask a question to the chatbot.")

    # Main panel: question box appears only once documents are ready.
    if st.session_state.pdf_processed:
        user_question = st.text_input("Ask a question about your documents:")
        if st.button("Get Response"):
            handle_userinput(user_question)
# Run the Streamlit app when executed as a script (streamlit run app.py).
if __name__ == '__main__':
    main()