ingest.py
import gc
import os
import time
from multiprocessing import Pool
from uuid import uuid4

from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings

# load_dotenv()

class Models:
    def __init__(self):
        # Use Ollama embeddings
        self.embeddings_ollama = OllamaEmbeddings(model="llama3.1")
        self.llm_ollama = "qwen2.5-coder:7b"

models = Models()
embeddings = models.embeddings_ollama
llm = models.llm_ollama
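
# Note: llm_ollama is just a model-name string, not a client object, and this
# script never uses it. Presumably a companion chat script consumes it; a
# hypothetical pairing using ChatOllama, the langchain_ollama chat class:
#
#     from langchain_ollama import ChatOllama
#     chat = ChatOllama(model=llm)  # llm == "qwen2.5-coder:7b"
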
# Define constants
data_folder = "./data"
chunk_size = 1000
chunk_overlap = 50
check_interval = 10
batch_size = 40
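
# Worked example of the splitter settings (illustrative numbers, not from the
# source): a ~2,500-character page with chunk_size=1000 and chunk_overlap=50
# yields roughly three chunks of ~1000, ~1000 and ~600 characters, with the
# last 50 characters of each chunk repeated at the start of the next so that
# no chunk boundary loses its surrounding context.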

vector_store = Chroma(
    collection_name="documents",
    embedding_function=embeddings,
    persist_directory="vectorDB",
)
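
# Minimal retrieval sketch against the same store (hypothetical helper, not
# called anywhere in this script): similarity_search is the standard
# langchain_chroma query method; the parameter names and default k are made up.
def search_documents(query: str, k: int = 5):
    return vector_store.similarity_search(query, k=k)
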
# Function to process a batch of documents and store them in the vector store
def process_batch(batch_data):
    # Unpack before the try block so the finally clause cannot hit an
    # unbound name if unpacking itself fails
    documents, uuids = batch_data
    try:
        vector_store.add_documents(documents=documents, ids=uuids)
        print(f"Successfully added {len(documents)} documents.")
    except Exception as e:
        print(f"Error adding batch: {e}")
    finally:
        del documents, uuids
        gc.collect()
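
# Caveat (assumption about runtime behaviour, not stated in the source):
# pool.map in ingest_file relies on worker processes seeing vector_store.
# Under the "fork" start method (the Linux default) workers inherit it; under
# "spawn" (the Windows/macOS default) each worker re-imports this module and
# builds its own Chroma client on the same persist_directory. A sequential
# fallback, sketched here as a hypothetical helper, sidesteps this entirely:
def process_batches_sequentially(batches):
    for batch in batches:
        process_batch(batch)
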
# Ingest a file
def ingest_file(file_path):
    if not file_path.lower().endswith(".pdf"):
        print(f"Skipping non-PDF file: {file_path}")
        return
    print(f"Starting to ingest file: {file_path}")
    loader = PyPDFLoader(file_path)
    loaded_documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=["\n", " ", ""]
    )
    documents = text_splitter.split_documents(loaded_documents)
    uuids = [str(uuid4()) for _ in range(len(documents))]
    batches = [
        (documents[i:i + batch_size], uuids[i:i + batch_size])
        for i in range(0, len(documents), batch_size)
    ]
    # Use multiprocessing to process batches concurrently
    with Pool(processes=4) as pool:
        pool.map(process_batch, batches)
    print(f"Finished ingesting file: {file_path}")
def main_loop():
    while True:
        for filename in os.listdir(data_folder):
            if not filename.startswith("_"):
                file_path = os.path.join(data_folder, filename)
                ingest_file(file_path)
                new_filename = "_" + filename
                new_file_path = os.path.join(data_folder, new_filename)
                os.rename(file_path, new_file_path)
        time.sleep(check_interval)

if __name__ == "__main__":
    main_loop()
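
# Run sketch (assumed workflow): start `python ingest.py`, then drop PDFs into
# ./data. Each check_interval (10 s) scan picks up files whose names do not
# start with "_", chunks and embeds them via Ollama into the "documents"
# Chroma collection under ./vectorDB, then renames them with a "_" prefix so
# they are not ingested twice on later scans.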