Spaces:
Runtime error
Runtime error
Rename document_loader.py to preprocess.py
Browse files
- document_loader.py +0 -20
- preprocess.py +24 -0
document_loader.py
DELETED
@@ -1,20 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
from bs4 import BeautifulSoup
|
3 |
-
import html2text
|
4 |
-
|
5 |
-
def load_html_documents(folder_path):
    """Load every ``.html`` file in *folder_path* and convert it to text.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list of ``{"filename": <file name>, "content": <text>}`` dicts,
        one per regular file whose name ends with ``.html``, ordered by
        file name.

    Raises:
        OSError: If *folder_path* cannot be listed or a file cannot be read.
    """
    documents = []
    # os.listdir() yields entries in arbitrary order; sort so the result is
    # reproducible from run to run.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".html"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # A sub-directory may also be named "*.html"; open() would raise on it.
        if not os.path.isfile(file_path):
            continue
        # latin-1 decodes any byte sequence without raising.
        with open(file_path, "r", encoding="latin-1") as file:
            soup = BeautifulSoup(file, "html.parser")
        text = html2text.html2text(soup.prettify())  # Convert HTML to readable text
        documents.append({"filename": file_name, "content": text})
    return documents
|
15 |
-
|
16 |
-
# Change "documents" to your actual HTML folder path
|
17 |
-
#docs = load_html_documents("documents")
|
18 |
-
|
19 |
-
# Print a sample document
|
20 |
-
#print(docs[0])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
preprocess.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from bs4 import BeautifulSoup
|
3 |
+
import html2text
|
4 |
+
|
5 |
+
def load_html_documents(folder_path):
    """Load every ``.html`` file in *folder_path* and convert it to text.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list of ``{"filename": <file name>, "content": <text>}`` dicts,
        one per regular file whose name ends with ``.html``, ordered by
        file name.

    Raises:
        OSError: If *folder_path* cannot be listed or a file cannot be read.
    """
    documents = []
    # os.listdir() yields entries in arbitrary order; sort so the result is
    # reproducible from run to run.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".html"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # A sub-directory may also be named "*.html"; open() would raise on it.
        if not os.path.isfile(file_path):
            continue
        # latin-1 decodes any byte sequence without raising.
        with open(file_path, "r", encoding="latin-1") as file:
            soup = BeautifulSoup(file, "html.parser")
        text = html2text.html2text(soup.prettify())  # Convert HTML to readable text
        documents.append({"filename": file_name, "content": text})
    return documents
|
15 |
+
|
16 |
+
def preprocess_and_save(folder_path, output_path):
    """Convert the HTML files in *folder_path* to text and store them as JSON.

    Args:
        folder_path: Directory passed through to ``load_html_documents``.
        output_path: File that receives the JSON-encoded list of documents;
            it is opened in ``"w"`` mode, so existing content is replaced.
    """
    import json

    # Gather the converted documents first, then serialise them in one go.
    loaded = load_html_documents(folder_path)
    with open(output_path, "w") as sink:
        sink.write(json.dumps(loaded))
|