# Spaces: Runtime error
from langchain.embeddings import HuggingFaceEmbeddings | |
from langchain.vectorstores import Chroma | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
import json | |
def create_vector_database(input_path, persist_directory):
    """Build a Chroma vector store from a JSON file of documents.

    The file at ``input_path`` is parsed as JSON and iterated; the
    ``"content"`` value of every item is split with a
    ``RecursiveCharacterTextSplitter`` (``chunk_size=500``,
    ``chunk_overlap=100``), and the resulting chunks are passed to
    ``Chroma.from_texts`` together with a
    ``sentence-transformers/all-MiniLM-L6-v2`` embedding model and
    ``persist_directory``.

    Args:
        input_path: Path of the JSON file to read. The code indexes each
            item with ``["content"]``, so a list of objects is assumed.
        persist_directory: Directory handed to Chroma as its persistence
            location.

    Returns:
        The vector store object returned by ``Chroma.from_texts``.

    Raises:
        ValueError: If the file contains no non-blank ``"content"`` text.
        KeyError: If an item has no ``"content"`` key.
    """
    # JSON text is UTF-8 by specification; do not depend on the platform's
    # default encoding when decoding it.
    with open(input_path, "r", encoding="utf-8") as f:
        docs = json.load(f)

    contents = [doc["content"] for doc in docs]

    # Fail fast, before the embedding model is loaded: with nothing to index
    # there is no meaningful database to build.
    if not any(text.strip() for text in contents):
        raise ValueError(f"No text content to index in {input_path!r}")

    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    all_texts = [
        chunk for text in contents for chunk in text_splitter.split_text(text)
    ]

    # NOTE(review): whether the store is flushed to disk automatically or
    # needs an explicit persist step depends on the installed Chroma version
    # - confirm against the pinned dependency.
    vector_db = Chroma.from_texts(
        texts=all_texts,
        embedding=embedding_model,
        persist_directory=persist_directory,
    )
    print("Vector database created successfully!")
    return vector_db
if __name__ == "__main__":
    # Script entry point: index "preprocessed_data.json" into the "db" directory.
    create_vector_database(
        input_path="preprocessed_data.json",
        persist_directory="db",
    )