Dubai_Legislation / preprocess.py
ohalkhateeb's picture
Update preprocess.py
cc8f4aa verified
raw
history blame
823 Bytes
import os
from bs4 import BeautifulSoup
import html2text
import json
def load_html_documents(folder_path):
    """Load every ``.html`` file in a folder and convert it to text.

    Each matching file is parsed with BeautifulSoup (``html.parser``),
    re-serialised with ``prettify()`` and then converted to text with
    ``html2text.html2text``.

    Args:
        folder_path: Directory to scan (non-recursively) for files whose
            names end in ``.html``.

    Returns:
        A list of ``{"filename": <file name>, "content": <converted text>}``
        dicts, one per HTML file, ordered by file name. The list is empty
        when the folder contains no ``.html`` files.
    """
    documents = []
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # resulting list (and any JSON written from it) reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".html"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # Hand BeautifulSoup the raw bytes so it can detect the document's
        # real encoding (BOM / <meta charset> / UTF-8 sniffing). Forcing
        # latin-1 never raises, but it silently turns UTF-8 text such as
        # Arabic into mojibake.
        with open(file_path, "rb") as file:
            soup = BeautifulSoup(file, "html.parser")
        text = html2text.html2text(soup.prettify())
        documents.append({"filename": file_name, "content": text})
    return documents
def preprocess_and_save(folder_path, output_path):
    """Convert the HTML files in ``folder_path`` and dump them as JSON.

    The documents are collected with ``load_html_documents`` before the
    output file is opened; the resulting list of dicts is then serialised
    to ``output_path`` with ``json.dump``.

    Args:
        folder_path: Directory containing the ``.html`` source files.
        output_path: Path of the JSON file to write (overwritten if present).
    """
    records = load_html_documents(folder_path)
    with open(output_path, "w") as out_file:
        json.dump(records, out_file)
if __name__ == "__main__":
    # Script entry point. Adjust the input folder and output file below to
    # match your local layout before running.
    preprocess_and_save(
        folder_path="./documents",
        output_path="preprocessed_data.json",
    )