import json
import os

import html2text
from bs4 import BeautifulSoup


def load_html_documents(folder_path):
    """Read every ``.html`` file in *folder_path* and convert it to plain text.

    Only the top level of the folder is scanned (no recursion), and only
    regular files whose name ends with ``.html`` are considered.

    Args:
        folder_path: Directory containing the HTML files.

    Returns:
        A list of dicts, one per file, each with the keys ``"filename"``
        (the bare file name) and ``"content"`` (the text produced by
        ``html2text``). Entries are ordered by file name.

    Raises:
        OSError: If *folder_path* cannot be listed or a file cannot be read.
    """
    documents = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # result (and anything serialized from it) stable between runs.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".html"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # A directory that happens to be named "*.html" cannot be opened as
        # a file, so skip anything that is not a regular file.
        if not os.path.isfile(file_path):
            continue
        # latin-1 maps every byte to a code point, so decoding never raises.
        # NOTE(review): files stored in another encoding (e.g. UTF-8) will
        # decode without error but may contain wrong characters - confirm
        # the encoding of the input corpus.
        with open(file_path, "r", encoding="latin-1") as file:
            soup = BeautifulSoup(file, "html.parser")
            # Convert the parsed (and re-serialized) HTML to readable text.
            text = html2text.html2text(soup.prettify())
        documents.append({"filename": file_name, "content": text})
    return documents


def preprocess_and_save(folder_path, output_path):
    """Convert the HTML files in *folder_path* to text and save them as JSON.

    The output is a JSON array of ``{"filename": ..., "content": ...}``
    objects, as returned by :func:`load_html_documents`. The saving format
    can be customized here if something other than JSON is needed.

    Args:
        folder_path: Directory containing the HTML files to preprocess.
        output_path: Path of the JSON file to write; an existing file is
            overwritten.

    Raises:
        OSError: If the input folder cannot be read or the output file
            cannot be written.
    """
    docs = load_html_documents(folder_path)
    # Explicit encoding so the result does not depend on the platform's
    # default locale.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(docs, f)