File size: 823 Bytes
873f4c6
 
 
cc8f4aa
873f4c6
cc8f4aa
 
 
 
 
 
 
 
 
 
873f4c6
cc8f4aa
 
 
 
873f4c6
cc8f4aa
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import os
from bs4 import BeautifulSoup
import html2text
import json

def load_html_documents(folder_path):
    """Load every ``.html`` file directly inside *folder_path* as text.

    Each file is parsed with BeautifulSoup's ``html.parser``, re-serialised
    with ``prettify()`` and passed through ``html2text.html2text``.

    Args:
        folder_path: Directory to scan. Only its immediate entries are
            considered; sub-directories are not descended into.

    Returns:
        A list of ``{"filename": <entry name>, "content": <converted text>}``
        dicts, ordered by file name so repeated runs give the same output.

    Raises:
        OSError: If *folder_path* cannot be listed or a file cannot be read.
    """
    documents = []
    # os.listdir() yields entries in arbitrary order; sort for reproducibility.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".html"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # A directory that merely happens to be named "*.html" cannot be
        # opened as a file; skip it instead of crashing the whole run.
        if not os.path.isfile(file_path):
            continue
        # NOTE(review): latin-1 decodes any byte sequence without error, but
        # UTF-8 encoded pages will come out garbled — confirm the source
        # documents really are latin-1.
        with open(file_path, "r", encoding="latin-1") as file:
            soup = BeautifulSoup(file, "html.parser")
            text = html2text.html2text(soup.prettify())
        documents.append({"filename": file_name, "content": text})
    return documents

def preprocess_and_save(folder_path, output_path):
    """Convert the HTML files in *folder_path* and dump them to a JSON file.

    The list returned by ``load_html_documents(folder_path)`` is serialised
    with ``json.dump`` into *output_path*, which is opened in write mode
    (an existing file is overwritten).
    """
    records = load_html_documents(folder_path)
    with open(output_path, "w") as out_file:
        json.dump(records, fp=out_file)

if __name__ == "__main__":
    # Script entry point. Update these paths to match your input folder and
    # desired output file before running.
    preprocess_and_save(
        folder_path="./documents",
        output_path="preprocessed_data.json",
    )