Spaces:
Runtime error
Runtime error
Update preprocess.py
Browse files- preprocess.py +17 -18
preprocess.py
CHANGED
@@ -1,24 +1,23 @@
|
|
1 |
import os
|
2 |
from bs4 import BeautifulSoup
|
3 |
import html2text
|
|
|
4 |
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
import json
|
23 |
-
with open(output_path, "w") as f:
|
24 |
-
json.dump(docs, f)
|
|
|
1 |
import os
|
2 |
from bs4 import BeautifulSoup
|
3 |
import html2text
|
4 |
+
import json
|
5 |
|
6 |
+
def load_html_documents(folder_path):
    """Convert every ``.html`` file directly inside *folder_path* to text.

    Each matching file is opened as latin-1 text, parsed with
    BeautifulSoup's ``html.parser``, pretty-printed, and then passed
    through ``html2text.html2text``.

    Args:
        folder_path: Directory to scan. Only its direct entries are
            examined; subdirectories are not walked.

    Returns:
        A list of ``{"filename": ..., "content": ...}`` dicts, one per
        HTML file, ordered by file name.

    Raises:
        OSError: If *folder_path* cannot be listed or a file cannot be
            opened.
    """
    documents = []
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # resulting list (and anything serialized from it) stable between runs.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".html"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # A directory whose name ends in ".html" cannot be opened as a
        # file; skip it instead of aborting the whole scan.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding="latin-1") as file:
            soup = BeautifulSoup(file, "html.parser")
            text = html2text.html2text(soup.prettify())
        documents.append({"filename": file_name, "content": text})
    return documents
|
16 |
|
17 |
+
def preprocess_and_save(folder_path, output_path):
    """Load the HTML documents under *folder_path* and dump them as JSON.

    The list returned by ``load_html_documents(folder_path)`` is written
    with ``json.dump`` to *output_path*, which is opened in ``"w"`` mode,
    so an existing file at that path is overwritten.
    """
    records = load_html_documents(folder_path)
    with open(output_path, "w") as sink:
        json.dump(records, sink)
|
21 |
|
22 |
+
if __name__ == "__main__":
    # Script entry point: read HTML from ./documents and write the result
    # to preprocessed_data.json. Update these paths to match your layout.
    preprocess_and_save(
        folder_path="./documents",
        output_path="preprocessed_data.json",
    )
|
|
|
|
|
|