Spaces:
Runtime error
Runtime error
import os | |
from bs4 import BeautifulSoup | |
import html2text | |
import json | |
def load_html_documents(folder_path):
    """Convert every ``.html`` file directly inside a folder to text.

    Each matching file is parsed with BeautifulSoup, re-serialised with
    ``prettify()`` and then passed through ``html2text.html2text``.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended
            into.

    Returns:
        A list of ``{"filename": <name>, "content": <text>}`` dicts, one per
        HTML file, ordered by file name.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be
            opened.
    """
    documents = []
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # resulting list (and anything serialised from it) reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".html"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # A directory that happens to be named "*.html" would make open()
        # raise, so only regular files are processed.
        if not os.path.isfile(file_path):
            continue
        # latin-1 maps every byte to a code point, so decoding cannot fail.
        # NOTE(review): UTF-8 encoded pages will be mis-decoded with this
        # setting; confirm the encoding of the document set.
        with open(file_path, "r", encoding="latin-1") as file:
            soup = BeautifulSoup(file, "html.parser")
        text = html2text.html2text(soup.prettify())
        documents.append({"filename": file_name, "content": text})
    return documents
def preprocess_and_save(folder_path, output_path):
    """Extract text from the HTML files in a folder and store it as JSON.

    Args:
        folder_path: Directory passed on to ``load_html_documents``.
        output_path: File the JSON is written to; opened in ``"w"`` mode, so
            an existing file is overwritten.
    """
    serialized = json.dumps(load_html_documents(folder_path))
    with open(output_path, "w") as out_file:
        out_file.write(serialized)
if __name__ == "__main__":
    # Script entry point: adjust the input folder and output file as needed.
    preprocess_and_save(
        folder_path="./documents",
        output_path="preprocessed_data.json",
    )