Skip to content

Commit

Permalink
Initial Commit
Browse files Browse the repository at this point in the history
  • Loading branch information
vishalkc9565 committed Feb 27, 2024
0 parents commit 1c6bd00
Show file tree
Hide file tree
Showing 5 changed files with 202 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .env.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
MONGODB_URI=
MONGODB_DATABASE=
MONGODB_COLLECTION=
OPENAI_API_KEY=
MONGODB_VECTORS=
MONGODB_VECTOR_INDEX=
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.env
__pycache__/
venv/

63 changes: 63 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from dotenv import load_dotenv
load_dotenv()

from flask import Flask, request, jsonify
from flask_cors import CORS, cross_origin
import os
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from llama_index.indices.vector_store.base import VectorStoreIndex
from process import process_entries

# Create a new client and connect to the server.
# ServerApi('1') opts in to the MongoDB Stable API so driver behavior stays
# consistent across Atlas server upgrades.
client = MongoClient(os.getenv("MONGODB_URI"), server_api=ServerApi('1'))

# connect to Atlas as a vector store
store = MongoDBAtlasVectorSearch(
    client,
    db_name=os.getenv('MONGODB_DATABASE'),  # this is the database where you stored your embeddings
    collection_name=os.getenv('MONGODB_VECTORS'),  # this is where your embeddings were stored in 2_load_and_index.py
    index_name=os.getenv('MONGODB_VECTOR_INDEX')  # this is the name of the index you created after loading your data
)
# Wrap the existing vector store so /query can retrieve through LlamaIndex
# without re-indexing anything.
index = VectorStoreIndex.from_vector_store(store)

# Flask app with CORS enabled on all routes so browser front-ends on other
# origins can call the API.
app = Flask(__name__)
cors = CORS(app)
app.config['CORS_HEADERS'] = 'Content-Type'

# Simple liveness probe so you can easily tell the app is running.
@app.route('/')
def hello_world():
    """Return a static success payload as a health check."""
    payload = {
        "status": "success",
        "message": "hello world",
    }
    return jsonify(payload)

@app.route('/process', methods=['POST'])
@cross_origin()
def process():
    """Trigger indexing of unprocessed entries; report success or failure."""
    # All the heavy lifting (embedding + flagging entries) lives in
    # process.process_entries; this route only translates its boolean
    # result into an HTTP response.
    if process_entries(client=client):
        return jsonify({"status": "success", "message": "process successful"})
    return jsonify({"status": "failed", "error": "process failed"}), 400


@app.route('/query', methods=['POST'])
@cross_origin()
def process_form():
    """Answer a natural-language query against the vector index.

    Expects a JSON body like {"query": "..."} and returns
    {"response": "..."} with the generated answer, or HTTP 400 when the
    body is missing, is not JSON, or lacks a "query" field.
    """
    # BUG FIX: the original used request.json["query"], which raises on a
    # missing key (and request.json itself raises on a non-JSON body), so
    # the 400 error branch below was unreachable. get_json(silent=True)
    # returns None instead of raising, and .get() tolerates a missing key.
    body = request.get_json(silent=True) or {}
    query = body.get("query")

    if query is not None:
        # query your data!
        # here we have customized the number of documents returned per query
        # to 20, because tweets are really short
        query_engine = index.as_query_engine(similarity_top_k=20)
        response = query_engine.query(query)
        return jsonify({"response": str(response)})
    else:
        return jsonify({"error": "query field is missing"}), 400

if __name__ == '__main__':
    # Development entry point: Flask's built-in server with the debugger
    # and auto-reloader enabled, listening on port 9000.
    app.run(debug=True, port=9000)

71 changes: 71 additions & 0 deletions process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
## This script loads data from a mongo database into an index
## This will convert all the documents in the database into vectors
## which requires a call to OpenAI for each one, so it can take some time.
## Once the data is indexed, it will be stored as a new collection in mongodb
## and you can query it without having to re-index every time.
from dotenv import load_dotenv
load_dotenv()
from bson import ObjectId
# This will turn on really noisy logging if you want it, but it will slow things down
# import logging
# import sys
# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

import os
from llama_index.readers.mongo import SimpleMongoReader
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from llama_index.indices.vector_store.base import VectorStoreIndex
from llama_index.storage.storage_context import StorageContext

# Create a new client and connect to the server
# client = MongoClient(os.getenv("MONGODB_URI"), server_api=ServerApi('1'))


# load objects from mongo and convert them into LlamaIndex Document objects
# llamaindex has a special class that does this for you
# it pulls every object in a given collection

def process_entries(client):
    """Embed all unprocessed documents into the Atlas vector store and flag them.

    Args:
        client: a connected pymongo ``MongoClient``.

    Returns:
        True when at least one entry was indexed and marked processed;
        False when there was nothing to do or a flag update failed.
    """
    collection = client[os.getenv("MONGODB_DATABASE")][os.getenv("MONGODB_COLLECTION")]
    query_dict = {"processed": False}

    # BUG FIX: collection.find() returns a Cursor, never None, so the
    # original `if unprocessed_entries is not None` check always passed and
    # the "no unprocessed entries" branch was unreachable. Materializing the
    # cursor into a list gives a real emptiness test, and bailing out here
    # avoids paying for OpenAI embedding calls when there is nothing to do.
    unprocessed_entries = list(collection.find(query_dict))
    if not unprocessed_entries:
        print("No unprocessed entries found")
        return False

    # Pull the same matching objects through LlamaIndex's Mongo reader,
    # which converts each one into a Document ready for embedding.
    reader = SimpleMongoReader(uri=os.getenv("MONGODB_URI"))
    documents = reader.load_data(
        os.getenv("MONGODB_DATABASE"),
        os.getenv("MONGODB_COLLECTION"),  # collection populated by the import step
        field_names=["text"],  # top-level field(s) whose text will be embedded
        query_dict=query_dict,  # only load entries still flagged unprocessed
    )

    # Embeddings land in a separate collection that the Atlas Vector Search
    # index (MONGODB_VECTOR_INDEX) queries against.
    store = MongoDBAtlasVectorSearch(
        client,
        db_name=os.getenv('MONGODB_DATABASE'),
        collection_name=os.getenv('MONGODB_VECTORS'),  # where embeddings are stored
        index_name=os.getenv('MONGODB_VECTOR_INDEX')  # Atlas search index name
    )
    # Index every Document (one OpenAI embedding call each) into Atlas.
    storage_context = StorageContext.from_defaults(vector_store=store)
    VectorStoreIndex.from_documents(
        documents, storage_context=storage_context,
        show_progress=True,  # progress bar while embeddings are created
    )

    # Flag each source entry so it is not re-embedded on the next run.
    for entry in unprocessed_entries:
        result = collection.update_one({'_id': entry['_id']}, {'$set': {'processed': True}})
        if result.modified_count > 0:
            print("Entry updated successfully")
        else:
            # Preserve original behavior: abort on the first failed update.
            print("Failed to update entry")
            return False
    return True


58 changes: 58 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
aiohttp==3.8.6
aiosignal==1.3.1
aiostream==0.5.2
annotated-types==0.6.0
anyio==3.7.1
async-timeout==4.0.3
attrs==23.1.0
certifi==2023.7.22
charset-normalizer==3.3.1
click==8.1.7
dataclasses-json==0.5.14
Deprecated==1.2.14
dnspython==2.4.2
Flask==2.2.5
Flask-Cors==4.0.0
frozenlist==1.4.0
fsspec==2023.10.0
greenlet==3.0.1
gunicorn==21.2.0
idna==3.4
itsdangerous==2.1.2
Jinja2==3.1.2
joblib==1.3.2
jsonpatch==1.33
jsonpointer==2.4
langchain==0.0.325
langsmith==0.0.53
llama-index==0.8.55
MarkupSafe==2.1.3
marshmallow==3.20.1
multidict==6.0.4
mypy-extensions==1.0.0
nest-asyncio==1.5.8
nltk==3.8.1
openai==0.28.1
packaging==23.2
pydantic==2.4.2
pydantic_core==2.10.1
pymongo==4.5.0
python-dateutil==2.8.2
python-dotenv==1.0.0
pytz==2023.3.post1
PyYAML==6.0.1
regex==2023.10.3
requests==2.31.0
six==1.16.0
sniffio==1.3.0
SQLAlchemy==2.0.22
tenacity==8.2.3
tiktoken==0.5.1
tqdm==4.66.1
typing-inspect==0.9.0
typing_extensions==4.8.0
tzdata==2023.3
urllib3==1.26.18
Werkzeug==2.2.3
wrapt==1.15.0
yarl==1.9.2

0 comments on commit 1c6bd00

Please sign in to comment.