ohalkhateeb committed on
Commit
873f4c6
·
verified ·
1 Parent(s): aa9796b

Rename document_loader.py to preprocess.py

Browse files
Files changed (2) hide show
  1. document_loader.py +0 -20
  2. preprocess.py +24 -0
document_loader.py DELETED
@@ -1,20 +0,0 @@
1
- import os
2
- from bs4 import BeautifulSoup
3
- import html2text
4
-
5
def load_html_documents(folder_path, encoding="latin-1"):
    """Load every ``.html`` file directly inside a folder as readable text.

    Each file is parsed with ``BeautifulSoup`` (``html.parser``), re-serialised
    with ``prettify()`` and converted to text by ``html2text.html2text``.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended into.
        encoding: Text encoding used to read each file. Defaults to
            ``"latin-1"``, the value that was previously hard-coded.

    Returns:
        A list of ``{"filename": <file name>, "content": <converted text>}``
        dicts, ordered by file name. Empty if no HTML files are found.
    """
    documents = []
    # os.listdir() yields entries in arbitrary order; sorting makes the
    # resulting document list reproducible across runs and platforms.
    for file_name in sorted(os.listdir(folder_path)):
        # Compare case-insensitively so "PAGE.HTML" is not silently skipped.
        if not file_name.lower().endswith(".html"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # A directory whose name ends in ".html" would make open() raise.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding=encoding) as file:
            soup = BeautifulSoup(file, "html.parser")
        # Convert the parsed HTML to readable text.
        text = html2text.html2text(soup.prettify())
        documents.append({"filename": file_name, "content": text})
    return documents
15
-
16
- # Change "documents" below to the path of your HTML folder
17
- #docs = load_html_documents("documents")
18
-
19
- # Print a sample document
20
- #print(docs[0])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
preprocess.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from bs4 import BeautifulSoup
3
+ import html2text
4
+
5
def load_html_documents(folder_path, encoding="latin-1"):
    """Load every ``.html`` file directly inside a folder as readable text.

    Each file is parsed with ``BeautifulSoup`` (``html.parser``), re-serialised
    with ``prettify()`` and converted to text by ``html2text.html2text``.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended into.
        encoding: Text encoding used to read each file. Defaults to
            ``"latin-1"``, the value that was previously hard-coded.

    Returns:
        A list of ``{"filename": <file name>, "content": <converted text>}``
        dicts, ordered by file name. Empty if no HTML files are found.
    """
    documents = []
    # os.listdir() yields entries in arbitrary order; sorting makes the
    # resulting document list reproducible across runs and platforms.
    for file_name in sorted(os.listdir(folder_path)):
        # Compare case-insensitively so "PAGE.HTML" is not silently skipped.
        if not file_name.lower().endswith(".html"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # A directory whose name ends in ".html" would make open() raise.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding=encoding) as file:
            soup = BeautifulSoup(file, "html.parser")
        # Convert the parsed HTML to readable text.
        text = html2text.html2text(soup.prettify())
        documents.append({"filename": file_name, "content": text})
    return documents
15
+
16
def preprocess_and_save(folder_path, output_path):
    """Convert the HTML files in a folder to text and save them as one JSON file.

    Args:
        folder_path: Directory handed to ``load_html_documents``.
        output_path: Destination JSON file. Missing parent directories are
            created; an existing file is overwritten.

    Returns:
        None. The JSON file contains whatever ``load_html_documents`` returned.
    """
    # Imported locally so this function does not depend on a module-level
    # ``import json`` being present.
    import json

    # Load documents
    docs = load_html_documents(folder_path)

    # Make sure the target directory exists so open() does not fail with
    # FileNotFoundError; a bare file name has no parent to create.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Save preprocessed documents as JSON. The encoding is explicit so the
    # output does not depend on the platform's default locale.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(docs, f)