from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json


def create_vector_database(input_path, persist_directory):
    """Build a persistent Chroma vector store from preprocessed JSON documents."""
    # Load preprocessed data: a JSON list of objects, each with a "content" field
    with open(input_path, "r") as f:
        docs = json.load(f)

    # Load a sentence-transformers embedding model
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Split each document's text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    all_texts = [chunk for doc in docs for chunk in text_splitter.split_text(doc["content"])]

    # Create a Chroma vector store, persisted to persist_directory
    vector_db = Chroma.from_texts(texts=all_texts, embedding=embedding_model, persist_directory=persist_directory)

    print("Vector database created successfully!")
    return vector_db


if __name__ == "__main__":
    create_vector_database("preprocessed_data.json", "db")  # Change paths as needed
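
# A minimal sketch of how the persisted store could be reloaded for querying,
# assuming the same embedding model and persist_directory used above;
# the query string and k value are placeholders.
#
# embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# vector_db = Chroma(persist_directory="db", embedding_function=embedding_model)
# results = vector_db.similarity_search("example query", k=3)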