drewThomasson committed on
Commit
2b19583
·
verified ·
1 Parent(s): 24bd570

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -35
app.py CHANGED
@@ -3,60 +3,122 @@ import tempfile, os
3
  from pdf2image import convert_from_path
4
  import pytesseract, pdfplumber, camelot
5
  from PIL import Image, ImageOps
 
 
6
 
7
- # ✅ Must be named "file" for Gradio API to detect correctly
8
  def extract_text_from_pdf(file):
9
  extracted = []
10
  pdf_path = file.name
11
-
12
- # 1. Extract using pdfplumber
 
 
 
 
13
  try:
14
- with pdfplumber.open(pdf_path) as pdf:
15
- for page in pdf.pages:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  text = page.extract_text(layout=True)
17
  if text:
18
- extracted.append(text)
 
 
19
  tables = page.extract_tables()
20
- for table in tables:
21
- extracted.append("TABLE:\n" + "\n".join([", ".join(row) for row in table]))
22
- except Exception as e:
23
- print("pdfplumber error:", e)
24
-
25
- # 2. Table extraction with Camelot
26
- try:
27
- tables = camelot.read_pdf(pdf_path, pages="all", flavor="lattice")
28
- for table in tables:
29
- extracted.append("CAMELOT TABLE:\n" + table.df.to_csv(index=False))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  except Exception as e:
31
- print("Camelot error:", e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- # 3. OCR fallback if text is too short
34
- combined = "\n".join(extracted).strip()
35
- if len(combined) < 100:
36
- images = convert_from_path(pdf_path, dpi=300)
37
- for img in images:
38
- img = img.convert("L")
39
- img = ImageOps.invert(img)
40
- combined += pytesseract.image_to_string(img, config="--psm 6") + "\n"
41
 
42
- # Save output
43
- output_path = os.path.join(tempfile.gettempdir(), "extracted_text.txt")
44
- with open(output_path, "w", encoding="utf-8") as f:
45
- f.write(combined)
46
 
47
- return combined, output_path
48
 
49
- # ✅ Use gr.Interface (NOT Blocks) with correct api_name
50
  app = gr.Interface(
51
  fn=extract_text_from_pdf,
52
  inputs=gr.File(label="📤 Upload PDF", file_types=[".pdf"]),
53
  outputs=[
54
  gr.Textbox(label="📄 Extracted Text", lines=25, show_copy_button=True),
55
- gr.File(label="📥 Download .txt")
 
56
  ],
57
- title="Advanced PDF Extractor",
58
- description="Extract text + tables + OCR from scanned/digital PDFs.",
59
  allow_flagging="never",
60
  )
61
 
62
- app.launch()
 
 
3
  from pdf2image import convert_from_path
4
  import pytesseract, pdfplumber, camelot
5
  from PIL import Image, ImageOps
6
+ import ocrmypdf
7
+ import subprocess
8
 
 
9
  def extract_text_from_pdf(file):
10
  extracted = []
11
  pdf_path = file.name
12
+
13
+ # Create temporary paths for OCR'd PDF and text output
14
+ temp_dir = tempfile.gettempdir()
15
+ ocr_pdf_path = os.path.join(temp_dir, "ocr_searchable.pdf")
16
+ output_txt_path = os.path.join(temp_dir, "extracted_text.txt")
17
+
18
  try:
19
+ # Step 1: Use OCRmyPDF to create a searchable PDF
20
+ print("Processing PDF with OCRmyPDF...")
21
+ ocrmypdf.ocr(
22
+ pdf_path,
23
+ ocr_pdf_path,
24
+ deskew=True,
25
+ clean=True,
26
+ force_ocr=False, # Only OCR if needed
27
+ skip_text=False,
28
+ optimize=1
29
+ )
30
+
31
+ # Step 2: Extract text from the OCR'd searchable PDF using pdfplumber
32
+ print("Extracting text from OCR'd PDF...")
33
+ with pdfplumber.open(ocr_pdf_path) as pdf:
34
+ for page_num, page in enumerate(pdf.pages):
35
  text = page.extract_text(layout=True)
36
  if text:
37
+ extracted.append(f"--- Page {page_num + 1} ---\n{text}")
38
+
39
+ # Extract tables if any
40
  tables = page.extract_tables()
41
+ for table_num, table in enumerate(tables):
42
+ if table:
43
+ table_text = f"TABLE {table_num + 1} (Page {page_num + 1}):\n"
44
+ table_text += "\n".join([", ".join([str(cell) if cell else "" for cell in row]) for row in table])
45
+ extracted.append(table_text)
46
+
47
+ # Step 3: Try Camelot for additional table extraction
48
+ try:
49
+ tables = camelot.read_pdf(ocr_pdf_path, pages="all", flavor="lattice")
50
+ for i, table in enumerate(tables):
51
+ extracted.append(f"CAMELOT TABLE {i + 1}:\n{table.df.to_csv(index=False)}")
52
+ except Exception as e:
53
+ print(f"Camelot extraction failed: {e}")
54
+
55
+ # Combine all extracted text
56
+ combined_text = "\n\n".join(extracted).strip()
57
+
58
+ # If still no text, fallback to direct OCR
59
+ if len(combined_text) < 50:
60
+ print("Fallback to direct OCR...")
61
+ images = convert_from_path(pdf_path, dpi=300)
62
+ ocr_text = []
63
+ for i, img in enumerate(images):
64
+ img = img.convert("L")
65
+ img = ImageOps.invert(img)
66
+ page_text = pytesseract.image_to_string(img, config="--psm 6")
67
+ if page_text.strip():
68
+ ocr_text.append(f"--- Page {i + 1} (Direct OCR) ---\n{page_text}")
69
+ combined_text = "\n\n".join(ocr_text)
70
+
71
+ # Save the extracted text
72
+ with open(output_txt_path, "w", encoding="utf-8") as f:
73
+ f.write(combined_text)
74
+
75
+ return combined_text, output_txt_path, ocr_pdf_path
76
+
77
  except Exception as e:
78
+ error_msg = f"Error processing PDF: {str(e)}\n\nFalling back to original extraction methods..."
79
+ print(error_msg)
80
+
81
+ # Fallback to original method if OCRmyPDF fails
82
+ try:
83
+ with pdfplumber.open(pdf_path) as pdf:
84
+ for page in pdf.pages:
85
+ text = page.extract_text(layout=True)
86
+ if text:
87
+ extracted.append(text)
88
+ tables = page.extract_tables()
89
+ for table in tables:
90
+ extracted.append("TABLE:\n" + "\n".join([", ".join(row) for row in table]))
91
+ except Exception as e2:
92
+ print("pdfplumber error:", e2)
93
 
94
+ # OCR fallback if text is too short
95
+ combined = "\n".join(extracted).strip()
96
+ if len(combined) < 100:
97
+ images = convert_from_path(pdf_path, dpi=300)
98
+ for img in images:
99
+ img = img.convert("L")
100
+ img = ImageOps.invert(img)
101
+ combined += pytesseract.image_to_string(img, config="--psm 6") + "\n"
102
 
103
+ # Save fallback output
104
+ with open(output_txt_path, "w", encoding="utf-8") as f:
105
+ f.write(combined)
 
106
 
107
+ return combined, output_txt_path, pdf_path # Return original PDF if OCR failed
108
 
109
+ # Create Gradio interface
110
  app = gr.Interface(
111
  fn=extract_text_from_pdf,
112
  inputs=gr.File(label="📤 Upload PDF", file_types=[".pdf"]),
113
  outputs=[
114
  gr.Textbox(label="📄 Extracted Text", lines=25, show_copy_button=True),
115
+ gr.File(label="📥 Download Extracted Text (.txt)"),
116
+ gr.File(label="📥 Download OCR'd Searchable PDF")
117
  ],
118
+ title="Advanced PDF OCR Extractor with OCRmyPDF",
119
+ description="Upload a PDF to get: 1) Extracted text displayed and downloadable as .txt, 2) OCR'd searchable PDF download. Uses OCRmyPDF for superior OCR quality.",
120
  allow_flagging="never",
121
  )
122
 
123
+ if __name__ == "__main__":
124
+ app.launch()