From b265e0095db4bc910a52c8fb96c96929732285ed Mon Sep 17 00:00:00 2001
From: Don B <don.branson@outlook.com>
Date: Tue, 13 Feb 2024 13:50:46 -0800
Subject: [PATCH] add support for json files and to allow for more than 41666
 embeddings

---
 .../langchain-python-rag-privategpt/ingest.py | 27 ++++++++++++++-----
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/examples/langchain-python-rag-privategpt/ingest.py b/examples/langchain-python-rag-privategpt/ingest.py
index 35324775..de557df1 100755
--- a/examples/langchain-python-rag-privategpt/ingest.py
+++ b/examples/langchain-python-rag-privategpt/ingest.py
@@ -8,6 +8,7 @@ from tqdm import tqdm
 from langchain.document_loaders import (
     CSVLoader,
     EverNoteLoader,
+    JSONLoader,
     PyMuPDFLoader,
     TextLoader,
     UnstructuredEmailLoader,
@@ -66,6 +67,7 @@ LOADER_MAPPING = {
     ".eml": (MyElmLoader, {}),
     ".epub": (UnstructuredEPubLoader, {}),
     ".html": (UnstructuredHTMLLoader, {}),
+    ".json": (TextLoader, {"encoding": "utf8"}),
     ".md": (UnstructuredMarkdownLoader, {}),
     ".odt": (UnstructuredODTLoader, {}),
     ".pdf": (PyMuPDFLoader, {}),
@@ -133,26 +135,37 @@ def does_vectorstore_exist(persist_directory: str) -> bool:
                 return True
     return False
 
+def split_into_batches(lst, batch_size):
+    """Yield successive batches of `batch_size` from `lst`."""
+    for i in range(0, len(lst), batch_size):
+        yield lst[i:i + batch_size]
+
 def main():
     # Create embeddings
     embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
 
+    # Initialize db variable outside of the if-else scope
+    db = None
+
     if does_vectorstore_exist(persist_directory):
-        # Update and store locally vectorstore
         print(f"Appending to existing vectorstore at {persist_directory}")
         db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
         collection = db.get()
         texts = process_documents([metadata['source'] for metadata in collection['metadatas']])
-        print(f"Creating embeddings. May take some minutes...")
-        db.add_documents(texts)
     else:
-        # Create and store locally vectorstore
         print("Creating new vectorstore")
         texts = process_documents()
-        print(f"Creating embeddings. May take some minutes...")
-        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)
+
+    batch_size = 41666  # Maximum number of embeddings per batch
+    for batch_index, batch in enumerate(split_into_batches(texts, batch_size)):
+        print(f"Processing batch {batch_index + 1} of {len(batch)} texts...")
+        if db is None:  # This check is necessary only if db could still be None here
+            db = Chroma.from_documents(batch, embeddings, persist_directory=persist_directory)
+        else:
+            db.add_documents(batch)
+        print("Batch processed successfully.")
+
     db.persist()
-    db = None
 
     print(f"Ingestion complete! You can now run privateGPT.py to query your documents")