From 4447e4143ff145d45d4b3b56c4e7de08ceec9186 Mon Sep 17 00:00:00 2001
From: AKhileshPothuri <“pothuriakhilesh@yahoo.com”>
Date: Mon, 18 Mar 2024 12:08:31 -0400
Subject: [PATCH] Added Functionality to read JSON files

This new class reads JSON files. Each dictionary in a JSON file is treated as a separate document. It can handle lists and nested dictionaries efficiently.
---
 constants.py | 43 ++++++++++++++++++++++++++++++++++++++++---
 ingest.py    |  1 +
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/constants.py b/constants.py
index 380c650a..28a2f9d2 100644
--- a/constants.py
+++ b/constants.py
@@ -1,10 +1,10 @@
 import os
-
+import json
 # from dotenv import load_dotenv
 from chromadb.config import Settings
 
 # https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/excel.html?highlight=xlsx#microsoft-excel
-from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredExcelLoader, Docx2txtLoader
+from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredExcelLoader, Docx2txtLoader,JSONLoader
 from langchain.document_loaders import UnstructuredFileLoader, UnstructuredMarkdownLoader
 from langchain.document_loaders import UnstructuredHTMLLoader
 
@@ -40,7 +40,43 @@
 ### From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
 # N_GPU_LAYERS = 20
 # N_BATCH = 512
-
+class JSONArrayLoader:
+    """Load a JSON file, turning each dictionary of a top-level list into a document.
+
+    Any other JSON layout is passed to ``JSONLoader`` with the caller's settings unchanged.
+    """
+
+    def __init__(self, file_path: str, jq_schema: str = ".", text_content: bool = True):
+        """Store the path, the per-entry jq expression and the string-content flag."""
+        self.file_path = file_path
+        self.jq_schema = jq_schema
+        self.text_content = text_content
+
+    def load(self):
+        """Return the documents that ``JSONLoader`` extracts from the file.
+
+        For a list of dictionaries, ``jq_schema`` is applied to every entry and
+        yields one document per entry; a bare "." selects whole dictionaries, which
+        are serialized because they are not plain strings.
+        """
+        with open(self.file_path, encoding="utf-8") as json_file:
+            raw_data = json_file.read()
+        try:
+            data = json.loads(raw_data)
+        except ValueError:
+            data = None  # malformed JSON: let JSONLoader below report the error
+        jq_schema, text_content = self.jq_schema, self.text_content
+        if isinstance(data, list) and all(isinstance(entry, dict) for entry in data):
+            # ".[]" iterates the array, so one load() call emits a document per entry.
+            jq_schema = f".[] | {self.jq_schema}"
+            # Whole-dict entries are not strings, so JSONLoader must serialize them.
+            text_content = text_content and self.jq_schema.strip() != "."
+        loader = JSONLoader(
+            file_path=self.file_path,
+            jq_schema=jq_schema,
+            text_content=text_content,
+        )
+        return loader.load()
 
 # https://python.langchain.com/en/latest/_modules/langchain/document_loaders/excel.html#UnstructuredExcelLoader
 DOCUMENT_MAP = {
@@ -55,6 +91,7 @@
     ".xlsx": UnstructuredExcelLoader,
     ".docx": Docx2txtLoader,
     ".doc": Docx2txtLoader,
+    ".json":JSONArrayLoader
 }
 
 # Default Instructor Model
diff --git a/ingest.py b/ingest.py
index 5e61627e..cbafbc19 100644
--- a/ingest.py
+++ b/ingest.py
@@ -6,6 +6,7 @@
 import torch
 from langchain.docstore.document import Document
 from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
+from langchain_text_splitters import RecursiveJsonSplitter
 from langchain.vectorstores import Chroma
 from utils import get_embeddings