"""Convert a folder of HTML files to text and store the result as JSON."""

import json
import os

import html2text
from bs4 import BeautifulSoup


def load_html_documents(folder_path, encoding="latin-1"):
    """Load every ``.html`` file in *folder_path* and convert it to text.

    Each file is parsed with BeautifulSoup's ``html.parser``, re-serialized
    with ``prettify()``, and passed through ``html2text.html2text``.

    Args:
        folder_path: Directory to scan (not recursive).
        encoding: Text encoding used to read the HTML files. Defaults to
            ``"latin-1"``, which decodes any byte sequence without raising.

    Returns:
        A list of ``{"filename": ..., "content": ...}`` dicts, ordered by
        file name so the result is reproducible across runs and platforms.

    Raises:
        OSError: If *folder_path* cannot be listed or a file cannot be read.
    """
    documents = []
    # os.listdir() returns entries in arbitrary order; sort for stable output.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".html"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # A directory whose name ends in ".html" would make open() fail.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding=encoding) as file:
            soup = BeautifulSoup(file, "html.parser")
        text = html2text.html2text(soup.prettify())
        documents.append({"filename": file_name, "content": text})
    return documents


def preprocess_and_save(folder_path, output_path, encoding="latin-1"):
    """Convert the HTML files in *folder_path* and dump them to *output_path*.

    Args:
        folder_path: Directory containing the ``.html`` files.
        output_path: Destination JSON file; overwritten if it exists.
        encoding: Text encoding used to read the HTML files.

    Raises:
        OSError: If the input folder or files cannot be read, or the output
            file cannot be written.
    """
    docs = load_html_documents(folder_path, encoding=encoding)
    # json.dump escapes non-ASCII by default, so pinning UTF-8 here only
    # removes the dependency on the platform's default encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(docs, f)


if __name__ == "__main__":
    preprocess_and_save("./documents", "preprocessed_data.json")  # Update path