Skip to content

Commit

Permalink
Initial Commit
Browse files Browse the repository at this point in the history
  • Loading branch information
vishalkc9565 committed Feb 27, 2024
0 parents commit 1c6bd00
Show file tree
Hide file tree
Showing 5 changed files with 202 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .env.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
MONGODB_URI=
MONGODB_DATABASE=
MONGODB_COLLECTION=
OPENAI_API_KEY=
MONGODB_VECTORS=
MONGODB_VECTOR_INDEX=
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.env
__pycache__/
venv/

63 changes: 63 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from dotenv import load_dotenv
load_dotenv()

from flask import Flask, request, jsonify
from flask_cors import CORS, cross_origin
import os
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from llama_index.indices.vector_store.base import VectorStoreIndex
from process import process_entries

# Create a new client and connect to the server.
# ServerApi('1') opts in to the MongoDB Stable API so driver behavior stays
# consistent across Atlas server upgrades.
client = MongoClient(os.getenv("MONGODB_URI"), server_api=ServerApi('1'))

# connect to Atlas as a vector store
store = MongoDBAtlasVectorSearch(
    client,
    db_name=os.getenv('MONGODB_DATABASE'),  # this is the database where you stored your embeddings
    collection_name=os.getenv('MONGODB_VECTORS'),  # this is where your embeddings were stored in 2_load_and_index.py
    index_name=os.getenv('MONGODB_VECTOR_INDEX')  # this is the name of the index you created after loading your data
)
# Wrap the existing vector store so /query can retrieve through LlamaIndex
# without re-indexing anything.
index = VectorStoreIndex.from_vector_store(store)

# Flask app with CORS enabled on all routes so browser front-ends on other
# origins can call the API.
app = Flask(__name__)
cors = CORS(app)
app.config['CORS_HEADERS'] = 'Content-Type'

# Simple liveness probe so you can easily tell the app is running.
@app.route('/')
def hello_world():
    """Return a static success payload as a health check."""
    payload = {
        "status": "success",
        "message": "hello world",
    }
    return jsonify(payload)

@app.route('/process', methods=['POST'])
@cross_origin()
def process():
    """Trigger indexing of unprocessed entries; report success or failure."""
    # All the heavy lifting (embedding + flagging entries) lives in
    # process.process_entries; this route only translates its boolean
    # result into an HTTP response.
    if process_entries(client=client):
        return jsonify({"status": "success", "message": "process successful"})
    return jsonify({"status": "failed", "error": "process failed"}), 400


@app.route('/query', methods=['POST'])
@cross_origin()
def process_form():
    """Answer a natural-language query against the vector index.

    Expects a JSON body like {"query": "..."} and returns
    {"response": "..."} with the generated answer, or HTTP 400 when the
    body is missing, is not JSON, or lacks a "query" field.
    """
    # BUG FIX: the original used request.json["query"], which raises on a
    # missing key (and request.json itself raises on a non-JSON body), so
    # the 400 error branch below was unreachable. get_json(silent=True)
    # returns None instead of raising, and .get() tolerates a missing key.
    body = request.get_json(silent=True) or {}
    query = body.get("query")

    if query is not None:
        # query your data!
        # here we have customized the number of documents returned per query
        # to 20, because tweets are really short
        query_engine = index.as_query_engine(similarity_top_k=20)
        response = query_engine.query(query)
        return jsonify({"response": str(response)})
    else:
        return jsonify({"error": "query field is missing"}), 400

if __name__ == '__main__':
    # Development entry point: Flask's built-in server with the debugger
    # and auto-reloader enabled, listening on port 9000.
    app.run(debug=True, port=9000)

71 changes: 71 additions & 0 deletions process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
## This script loads data from a mongo database into an index
## This will convert all the documents in the database into vectors
## which requires a call to OpenAI for each one, so it can take some time.
## Once the data is indexed, it will be stored as a new collection in mongodb
## and you can query it without having to re-index every time.
from dotenv import load_dotenv
load_dotenv()
from bson import ObjectId
# This will turn on really noisy logging if you want it, but it will slow things down
# import logging
# import sys
# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

import os
from llama_index.readers.mongo import SimpleMongoReader
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from llama_index.indices.vector_store.base import VectorStoreIndex
from llama_index.storage.storage_context import StorageContext

# Create a new client and connect to the server
# client = MongoClient(os.getenv("MONGODB_URI"), server_api=ServerApi('1'))


# load objects from mongo and convert them into LlamaIndex Document objects
# llamaindex has a special class that does this for you
# it pulls every object in a given collection

def process_entries(client):
    """Embed all unprocessed documents into the Atlas vector store and flag them.

    Args:
        client: a connected pymongo ``MongoClient``.

    Returns:
        True when at least one entry was indexed and marked processed;
        False when there was nothing to do or a flag update failed.
    """
    collection = client[os.getenv("MONGODB_DATABASE")][os.getenv("MONGODB_COLLECTION")]
    query_dict = {"processed": False}

    # BUG FIX: collection.find() returns a Cursor, never None, so the
    # original `if unprocessed_entries is not None` check always passed and
    # the "no unprocessed entries" branch was unreachable. Materializing the
    # cursor into a list gives a real emptiness test, and bailing out here
    # avoids paying for OpenAI embedding calls when there is nothing to do.
    unprocessed_entries = list(collection.find(query_dict))
    if not unprocessed_entries:
        print("No unprocessed entries found")
        return False

    # Pull the same matching objects through LlamaIndex's Mongo reader,
    # which converts each one into a Document ready for embedding.
    reader = SimpleMongoReader(uri=os.getenv("MONGODB_URI"))
    documents = reader.load_data(
        os.getenv("MONGODB_DATABASE"),
        os.getenv("MONGODB_COLLECTION"),  # collection populated by the import step
        field_names=["text"],  # top-level field(s) whose text will be embedded
        query_dict=query_dict,  # only load entries still flagged unprocessed
    )

    # Embeddings land in a separate collection that the Atlas Vector Search
    # index (MONGODB_VECTOR_INDEX) queries against.
    store = MongoDBAtlasVectorSearch(
        client,
        db_name=os.getenv('MONGODB_DATABASE'),
        collection_name=os.getenv('MONGODB_VECTORS'),  # where embeddings are stored
        index_name=os.getenv('MONGODB_VECTOR_INDEX')  # Atlas search index name
    )
    # Index every Document (one OpenAI embedding call each) into Atlas.
    storage_context = StorageContext.from_defaults(vector_store=store)
    VectorStoreIndex.from_documents(
        documents, storage_context=storage_context,
        show_progress=True,  # progress bar while embeddings are created
    )

    # Flag each source entry so it is not re-embedded on the next run.
    for entry in unprocessed_entries:
        result = collection.update_one({'_id': entry['_id']}, {'$set': {'processed': True}})
        if result.modified_count > 0:
            print("Entry updated successfully")
        else:
            # Preserve original behavior: abort on the first failed update.
            print("Failed to update entry")
            return False
    return True


58 changes: 58 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
aiohttp==3.8.6
aiosignal==1.3.1
aiostream==0.5.2
annotated-types==0.6.0
anyio==3.7.1
async-timeout==4.0.3
attrs==23.1.0
certifi==2023.7.22
charset-normalizer==3.3.1
click==8.1.7
dataclasses-json==0.5.14
Deprecated==1.2.14
dnspython==2.4.2
Flask==2.2.5
Flask-Cors==4.0.0
frozenlist==1.4.0
fsspec==2023.10.0
greenlet==3.0.1
gunicorn==21.2.0
idna==3.4
itsdangerous==2.1.2
Jinja2==3.1.2
joblib==1.3.2
jsonpatch==1.33
jsonpointer==2.4
langchain==0.0.325
langsmith==0.0.53
llama-index==0.8.55
MarkupSafe==2.1.3
marshmallow==3.20.1
multidict==6.0.4
mypy-extensions==1.0.0
nest-asyncio==1.5.8
nltk==3.8.1
openai==0.28.1
packaging==23.2
pydantic==2.4.2
pydantic_core==2.10.1
pymongo==4.5.0
python-dateutil==2.8.2
python-dotenv==1.0.0
pytz==2023.3.post1
PyYAML==6.0.1
regex==2023.10.3
requests==2.31.0
six==1.16.0
sniffio==1.3.0
SQLAlchemy==2.0.22
tenacity==8.2.3
tiktoken==0.5.1
tqdm==4.66.1
typing-inspect==0.9.0
typing_extensions==4.8.0
tzdata==2023.3
urllib3==1.26.18
Werkzeug==2.2.3
wrapt==1.15.0
yarl==1.9.2

0 comments on commit 1c6bd00

Please sign in to comment.