Spaces:

ohalkhateeb
/

Dubai_Legislation

Runtime error

Dubai_Legislation / create_database.py

Create create_database.py

fea418b verified 4 months ago

1.05 kB

	from langchain.embeddings import HuggingFaceEmbeddings
	from langchain.vectorstores import Chroma
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	import json

	def create_vector_database(input_path, persist_directory):
	    """Build a Chroma vector store from a JSON file of preprocessed documents.

	    The JSON file is loaded, the ``"content"`` value of every entry is split
	    into overlapping chunks, each chunk is embedded with the
	    ``sentence-transformers/all-MiniLM-L6-v2`` model, and the result is handed
	    to ``Chroma.from_texts`` together with ``persist_directory``.

	    Args:
	        input_path: Path of the JSON file to read. It must decode to an
	            iterable of mappings, each providing a ``"content"`` key.
	        persist_directory: Directory passed to ``Chroma.from_texts`` as
	            ``persist_directory``.

	    Returns:
	        The vector store object returned by ``Chroma.from_texts``.

	    Raises:
	        OSError: If ``input_path`` cannot be opened.
	        json.JSONDecodeError: If the file is not valid JSON.
	        KeyError: If an entry has no ``"content"`` key.
	    """
	    # Decode explicitly as UTF-8 instead of relying on the platform default
	    # encoding, which may fail or mis-decode non-ASCII text.
	    with open(input_path, "r", encoding="utf-8") as f:
	        docs = json.load(f)

	    # Load an embedding model
	    embedding_model = HuggingFaceEmbeddings(
	        model_name="sentence-transformers/all-MiniLM-L6-v2"
	    )

	    # Split text into smaller chunks (500 characters, 100 shared between
	    # neighbouring chunks) and flatten them into a single list.
	    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
	    all_texts = [
	        chunk
	        for doc in docs
	        for chunk in text_splitter.split_text(doc["content"])
	    ]

	    # Create a ChromaDB vector store
	    # NOTE(review): some older Chroma releases needed an explicit persist()
	    # call to flush to disk - confirm against the installed version.
	    vector_db = Chroma.from_texts(
	        texts=all_texts,
	        embedding=embedding_model,
	        persist_directory=persist_directory,
	    )

	    print("Vector database created successfully!")

	    # Hand the store back so callers can query it without reloading from disk.
	    return vector_db

	if __name__ == "__main__":
	    # Script entry point: build the database from the default input file.
	    # Change paths as needed.
	    create_vector_database(
	        input_path="preprocessed_data.json",
	        persist_directory="db",
	    )