ohalkhateeb committed on
Commit
fea418b
·
verified ·
1 Parent(s): 873f4c6

Create create_database.py

Browse files
Files changed (1) hide show
  1. create_database.py +24 -0
create_database.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.embeddings import HuggingFaceEmbeddings
2
+ from langchain.vectorstores import Chroma
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ import json
5
+
6
def create_vector_database(input_path, persist_directory):
    """Build a Chroma vector store from a JSON file of preprocessed documents.

    The JSON file is loaded, the ``"content"`` value of every entry is split
    into overlapping chunks, and the chunks are embedded and stored in Chroma.

    Args:
        input_path: Path to a JSON file containing an iterable of mappings,
            each of which has a ``"content"`` text entry.
        persist_directory: Directory handed to Chroma as its persistence
            location.

    Returns:
        The Chroma vector store created from the chunked texts.

    Raises:
        KeyError: If an entry has no ``"content"`` key.
    """
    # Read as UTF-8 explicitly: without it the decoding depends on the
    # platform locale and non-ASCII text can fail to load or be corrupted.
    with open(input_path, "r", encoding="utf-8") as f:
        docs = json.load(f)

    # Load an embedding model
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # Split text into smaller chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    all_texts = [
        chunk
        for doc in docs
        for chunk in text_splitter.split_text(doc["content"])
    ]

    # Create a ChromaDB vector store
    vector_db = Chroma.from_texts(
        texts=all_texts,
        embedding=embedding_model,
        persist_directory=persist_directory,
    )

    print("Vector database created successfully!")

    # Hand the store back so callers can query it without reloading from disk.
    return vector_db
22
+
23
if __name__ == "__main__":
    # Script entry point. Change paths as needed.
    create_vector_database(
        input_path="preprocessed_data.json",
        persist_directory="db",
    )