# Spaces: Runtime error
# File size: 823 Bytes
# Revisions: 873f4c6 cc8f4aa
import os
from bs4 import BeautifulSoup
import html2text
import json
def load_html_documents(folder_path):
    """Load every ``.html`` file in *folder_path* and convert it to text.

    Each file is parsed with BeautifulSoup's ``html.parser``, re-serialised
    with ``prettify()`` and then passed through ``html2text.html2text``.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended
            into.

    Returns:
        A list of ``{"filename": ..., "content": ...}`` dicts, one per HTML
        file, ordered by file name.

    Raises:
        OSError: If *folder_path* cannot be listed or a file cannot be read.
    """
    documents = []
    # os.listdir() yields entries in arbitrary order; sorting makes the
    # returned list (and any JSON written from it) reproducible across runs.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".html"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # A directory whose name happens to end in ".html" cannot be opened
        # as a file, so skip anything that is not a regular file.
        if not os.path.isfile(file_path):
            continue
        # latin-1 maps every possible byte to a character, so decoding never
        # raises regardless of the file's real encoding.
        with open(file_path, "r", encoding="latin-1") as file:
            soup = BeautifulSoup(file, "html.parser")
        text = html2text.html2text(soup.prettify())
        documents.append({"filename": file_name, "content": text})
    return documents
def preprocess_and_save(folder_path, output_path):
    """Convert the HTML files in *folder_path* and store them as JSON.

    The list returned by ``load_html_documents(folder_path)`` is serialised
    with ``json.dump`` and written to *output_path*, which is opened in
    write mode (an existing file is overwritten).
    """
    records = load_html_documents(folder_path)
    with open(output_path, mode="w") as sink:
        json.dump(records, fp=sink)
if __name__ == "__main__":
    # Update path: first argument is the folder of .html files to read,
    # second is the JSON file to write.
    preprocess_and_save("./documents", "preprocessed_data.json")