Spaces:
Runtime error
Runtime error
File size: 1,049 Bytes
fea418b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json
def create_vector_database(input_path, persist_directory):
    """Build a Chroma vector store from a JSON file of preprocessed documents.

    The file at ``input_path`` is parsed as JSON and iterated; the
    ``"content"`` entry of each item is split into overlapping chunks, and
    the chunks are embedded and stored through ``Chroma.from_texts``.

    Args:
        input_path: Path of the JSON file to load.
        persist_directory: Directory handed to Chroma as its
            ``persist_directory``.

    Returns:
        The Chroma vector store created from the text chunks.

    Raises:
        OSError: If ``input_path`` cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If a document mapping has no ``"content"`` key.
    """
    # Load preprocessed data. JSON text is UTF-8 by specification, so decode
    # it explicitly instead of relying on the platform's locale default.
    with open(input_path, "r", encoding="utf-8") as f:
        docs = json.load(f)

    # Load an embedding model
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # Split text into smaller, overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    all_texts = [
        chunk
        for doc in docs
        for chunk in text_splitter.split_text(doc["content"])
    ]

    # Create a ChromaDB vector store
    vector_db = Chroma.from_texts(
        texts=all_texts,
        embedding=embedding_model,
        persist_directory=persist_directory,
    )
    print("Vector database created successfully!")

    # Hand the store back so callers can query it without reopening the
    # persist directory; callers that ignore the result are unaffected.
    return vector_db
if __name__ == "__main__":
    # Script entry point: build the store from the default input file into
    # the default output directory. Change paths as needed.
    create_vector_database(
        input_path="preprocessed_data.json",
        persist_directory="db",
    )