ohalkhateeb committed on
Commit
8e78866
·
verified ·
1 Parent(s): cc8f4aa

Update create_database.py

Browse files
Files changed (1) hide show
  1. create_database.py +10 -16
create_database.py CHANGED
@@ -3,22 +3,16 @@ from langchain.vectorstores import Chroma
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  import json
5
 
6
- def create_vector_database(input_path, persist_directory):
7
- # Load preprocessed data
8
- with open(input_path, "r") as f:
9
- docs = json.load(f)
10
 
11
- # Load an embedding model
12
- embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 
13
 
14
- # Split text into smaller chunks
15
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
16
- all_texts = [chunk for doc in docs for chunk in text_splitter.split_text(doc["content"])]
17
 
18
- # Create a ChromaDB vector store
19
- vector_db = Chroma.from_texts(texts=all_texts, embedding=embedding_model, persist_directory=persist_directory)
20
-
21
- print("Vector database created successfully!")
22
-
23
- if __name__ == "__main__":
24
- create_vector_database("preprocessed_data.json", "db") # Change paths as needed
 
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  import json
5
 
6
+ def create_vector_database(input_path, persist_directory):
7
+ with open(input_path, "r") as f:
8
+ docs = json.load(f)
 
9
 
10
+ embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
11
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
12
+ all_texts = [chunk for doc in docs for chunk in text_splitter.split_text(doc["content"])]
13
 
14
+ vector_db = Chroma.from_texts(texts=all_texts, embedding=embedding_model, persist_directory=persist_directory)
15
+ print("Vector database created successfully!")
 
16
 
17
+ if __name__ == "__main__":
18
+ create_vector_database("preprocessed_data.json", "db")