-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 1c6bd00
Showing
5 changed files
with
202 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
MONGODB_URI= | ||
MONGODB_DATABASE= | ||
MONGODB_COLLECTION= | ||
OPENAI_API_KEY= | ||
MONGODB_VECTORS= | ||
MONGODB_VECTOR_INDEX= |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
.env | ||
__pycache__/ | ||
venv/ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
from dotenv import load_dotenv | ||
load_dotenv() | ||
|
||
from flask import Flask, request, jsonify | ||
from flask_cors import CORS, cross_origin | ||
import os | ||
from pymongo.mongo_client import MongoClient | ||
from pymongo.server_api import ServerApi | ||
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch | ||
from llama_index.indices.vector_store.base import VectorStoreIndex | ||
from process import process_entries | ||
|
||
# Shared MongoDB client for the whole app; the connection string is read
# from the MONGODB_URI environment variable.
client = MongoClient(os.getenv("MONGODB_URI"), server_api=ServerApi("1"))

# Use Atlas as the vector store. The database, the collection that holds the
# embeddings, and the vector index name all come from the environment.
store = MongoDBAtlasVectorSearch(
    client,
    db_name=os.getenv("MONGODB_DATABASE"),
    collection_name=os.getenv("MONGODB_VECTORS"),
    index_name=os.getenv("MONGODB_VECTOR_INDEX"),
)
# Query-side index built on top of the existing vector store.
index = VectorStoreIndex.from_vector_store(store)

# Flask application with CORS enabled.
app = Flask(__name__)
cors = CORS(app)
app.config["CORS_HEADERS"] = "Content-Type"
|
||
# This is just so you can easily tell the app is running | ||
@app.route("/")
def hello_world():
    """Return a fixed JSON payload so you can tell the app is running."""
    payload = {"status": "success", "message": "hello world"}
    return jsonify(payload)
|
||
@app.route("/process", methods=["POST"])
@cross_origin()
def process():
    """Run ``process_entries`` and report the outcome as JSON.

    Passes the module-level Mongo client to ``process_entries``. Responds
    with HTTP 400 and a "failed" status when it returns a falsy value;
    otherwise responds with a "success" status.
    """
    if process_entries(client=client):
        return jsonify({"status": "success", "message": "process successful"})
    return jsonify({"status": "failed", "error": "process failed"}), 400
|
||
|
||
@app.route("/query", methods=["POST"])
@cross_origin()
def process_form():
    """Answer a query against the vector index.

    Expects a JSON object body with a ``query`` field.

    Returns:
        ``{"response": <text>}`` on success, or HTTP 400 with
        ``{"error": "query field is missing"}`` when the body is not a JSON
        object or the ``query`` field is absent or null.
    """
    # silent=True yields None instead of raising on a missing or malformed
    # JSON body, and .get() avoids a KeyError (HTTP 500) when the field is
    # absent, so every bad request reaches the same 400 response below.
    payload = request.get_json(silent=True)
    query = payload.get("query") if isinstance(payload, dict) else None

    if query is None:
        return jsonify({"error": "query field is missing"}), 400

    # The number of documents returned per query is raised to 20 because
    # the indexed texts (tweets) are really short.
    query_engine = index.as_query_engine(similarity_top_k=20)
    response = query_engine.query(query)
    return jsonify({"response": str(response)})
|
||
if __name__ == "__main__":
    # NOTE(review): debug=True is intended for local development - confirm
    # this entry point is never used to serve production traffic.
    app.run(port=9000, debug=True)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
## This script loads data from a mongo database into an index | ||
## This will convert all the documents in the database into vectors | ||
## which requires a call to OpenAI for each one, so it can take some time. | ||
## Once the data is indexed, it will be stored as a new collection in mongodb | ||
## and you can query it without having to re-index every time. | ||
from dotenv import load_dotenv | ||
load_dotenv() | ||
from bson import ObjectId | ||
# This will turn on really noisy logging if you want it, but it will slow things down | ||
# import logging | ||
# import sys | ||
# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) | ||
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout)) | ||
|
||
import os | ||
from llama_index.readers.mongo import SimpleMongoReader | ||
from pymongo.mongo_client import MongoClient | ||
from pymongo.server_api import ServerApi | ||
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch | ||
from llama_index.indices.vector_store.base import VectorStoreIndex | ||
from llama_index.storage.storage_context import StorageContext | ||
|
||
# Create a new client and connect to the server | ||
# client = MongoClient(os.getenv("MONGODB_URI"), server_api=ServerApi('1')) | ||
|
||
|
||
# load objects from mongo and convert them into LlamaIndex Document objects | ||
# llamaindex has a special class that does this for you | ||
# it pulls every object in a given collection | ||
|
||
def process_entries(client):
    """Index every unprocessed entry into the vector store and flag it.

    Reads the database and collection names from the MONGODB_DATABASE and
    MONGODB_COLLECTION environment variables, selects the documents whose
    ``processed`` field is False, builds a vector index from their ``text``
    field, and then sets ``processed`` to True on each of them.

    Args:
        client: A pymongo ``MongoClient`` connected to the target deployment.

    Returns:
        True when every selected entry was indexed and flagged. False when
        there was nothing to process or when flagging an entry modified no
        document.
    """
    db_name = os.getenv("MONGODB_DATABASE")
    collection_name = os.getenv("MONGODB_COLLECTION")
    collection = client[db_name][collection_name]

    # Snapshot the ids up front. A pymongo cursor is lazy, so iterating it
    # only after indexing would also flag entries inserted while the
    # embeddings were being created, and those would never be indexed.
    entry_ids = [
        entry["_id"]
        for entry in collection.find({"processed": False}, {"_id": 1})
    ]
    # find() never returns None, so emptiness has to be checked explicitly.
    if not entry_ids:
        print("No unprocessed entries found")
        return False

    # Load exactly the snapshotted entries as LlamaIndex Document objects.
    reader = SimpleMongoReader(uri=os.getenv("MONGODB_URI"))
    documents = reader.load_data(
        db_name,
        collection_name,
        field_names=["text"],  # each entry must have a "text" field
        query_dict={"_id": {"$in": entry_ids}},
    )

    # Atlas is the vector store: embeddings go to the MONGODB_VECTORS
    # collection and are served through the MONGODB_VECTOR_INDEX index.
    store = MongoDBAtlasVectorSearch(
        client,
        db_name=db_name,
        collection_name=os.getenv("MONGODB_VECTORS"),
        index_name=os.getenv("MONGODB_VECTOR_INDEX"),
    )
    storage_context = StorageContext.from_defaults(vector_store=store)
    # Build the index from the documents; the result object is not needed
    # here because the vectors are persisted through the storage context.
    VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        show_progress=True,  # progress bar while the embeddings are created
    )

    for entry_id in entry_ids:
        result = collection.update_one(
            {"_id": entry_id}, {"$set": {"processed": True}}
        )
        if result.modified_count > 0:
            print("Entry updated successfully")
        else:
            print("Failed to update entry")
            return False
    return True
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
aiohttp==3.8.6 | ||
aiosignal==1.3.1 | ||
aiostream==0.5.2 | ||
annotated-types==0.6.0 | ||
anyio==3.7.1 | ||
async-timeout==4.0.3 | ||
attrs==23.1.0 | ||
certifi==2023.7.22 | ||
charset-normalizer==3.3.1 | ||
click==8.1.7 | ||
dataclasses-json==0.5.14 | ||
Deprecated==1.2.14 | ||
dnspython==2.4.2 | ||
Flask==2.2.5 | ||
Flask-Cors==4.0.0 | ||
frozenlist==1.4.0 | ||
fsspec==2023.10.0 | ||
greenlet==3.0.1 | ||
gunicorn==21.2.0 | ||
idna==3.4 | ||
itsdangerous==2.1.2 | ||
Jinja2==3.1.2 | ||
joblib==1.3.2 | ||
jsonpatch==1.33 | ||
jsonpointer==2.4 | ||
langchain==0.0.325 | ||
langsmith==0.0.53 | ||
llama-index==0.8.55 | ||
MarkupSafe==2.1.3 | ||
marshmallow==3.20.1 | ||
multidict==6.0.4 | ||
mypy-extensions==1.0.0 | ||
nest-asyncio==1.5.8 | ||
nltk==3.8.1 | ||
openai==0.28.1 | ||
packaging==23.2 | ||
pydantic==2.4.2 | ||
pydantic_core==2.10.1 | ||
pymongo==4.5.0 | ||
python-dateutil==2.8.2 | ||
python-dotenv==1.0.0 | ||
pytz==2023.3.post1 | ||
PyYAML==6.0.1 | ||
regex==2023.10.3 | ||
requests==2.31.0 | ||
six==1.16.0 | ||
sniffio==1.3.0 | ||
SQLAlchemy==2.0.22 | ||
tenacity==8.2.3 | ||
tiktoken==0.5.1 | ||
tqdm==4.66.1 | ||
typing-inspect==0.9.0 | ||
typing_extensions==4.8.0 | ||
tzdata==2023.3 | ||
urllib3==1.26.18 | ||
Werkzeug==2.2.3 | ||
wrapt==1.15.0 | ||
yarl==1.9.2 |