ohalkhateeb committed on
Commit
cc8f4aa
·
verified ·
1 Parent(s): 1232f3f

Update preprocess.py

Browse files
Files changed (1) hide show
  1. preprocess.py +17 -18
preprocess.py CHANGED
@@ -1,24 +1,23 @@
1
  import os
2
  from bs4 import BeautifulSoup
3
  import html2text
 
4
 
5
- def load_html_documents(folder_path):
6
- documents = []
7
- for file_name in os.listdir(folder_path):
8
- if file_name.endswith(".html"):
9
- file_path = os.path.join(folder_path, file_name)
10
- with open(file_path, "r", encoding="latin-1") as file:
11
- soup = BeautifulSoup(file, "html.parser")
12
- text = html2text.html2text(soup.prettify()) # Convert HTML to readable text
13
- documents.append({"filename": file_name, "content": text})
14
- return documents
15
 
16
- def preprocess_and_save(folder_path, output_path):
17
- # Load documents
18
- docs = load_html_documents(folder_path)
 
19
 
20
- # Save preprocessed documents (you can customize the saving format)
21
- # For example, save as a JSON file
22
- import json
23
- with open(output_path, "w") as f:
24
- json.dump(docs, f)
 
1
  import os
2
  from bs4 import BeautifulSoup
3
  import html2text
4
+ import json
5
 
6
+ def load_html_documents(folder_path):
7
+ documents = []
8
+ for file_name in os.listdir(folder_path):
9
+ if file_name.endswith(".html"):
10
+ file_path = os.path.join(folder_path, file_name)
11
+ with open(file_path, "r", encoding="latin-1") as file:
12
+ soup = BeautifulSoup(file, "html.parser")
13
+ text = html2text.html2text(soup.prettify())
14
+ documents.append({"filename": file_name, "content": text})
15
+ return documents
16
 
17
+ def preprocess_and_save(folder_path, output_path):
18
+ docs = load_html_documents(folder_path)
19
+ with open(output_path, "w") as f:
20
+ json.dump(docs, f)
21
 
22
+ if __name__ == "__main__":
23
+ preprocess_and_save("./documents", "preprocessed_data.json") # Update path