Spaces:

drewThomasson
/

PDF-to-TXT-OCR

Running

App Files Files Community

drewThomasson committed 17 days ago

Commit

2b19583

verified ·

1 Parent(s): 24bd570

Update app.py

Browse files

Files changed (1) hide show

app.py +97 -35

app.py CHANGED Viewed

@@ -3,60 +3,122 @@ import tempfile, os
 from pdf2image import convert_from_path
 import pytesseract, pdfplumber, camelot
 from PIL import Image, ImageOps
-# ✅ Must be named "file" for Gradio API to detect correctly
 def extract_text_from_pdf(file):
     extracted = []
     pdf_path = file.name
-    # 1. Extract using pdfplumber
     try:
-        with pdfplumber.open(pdf_path) as pdf:
-            for page in pdf.pages:
                 text = page.extract_text(layout=True)
                 if text:
-                    extracted.append(text)
                 tables = page.extract_tables()
-                for table in tables:
-                    extracted.append("TABLE:\n" + "\n".join([", ".join(row) for row in table]))
-    except Exception as e:
-        print("pdfplumber error:", e)
-    # 2. Table extraction with Camelot
-    try:
-        tables = camelot.read_pdf(pdf_path, pages="all", flavor="lattice")
-        for table in tables:
-            extracted.append("CAMELOT TABLE:\n" + table.df.to_csv(index=False))
     except Exception as e:
-        print("Camelot error:", e)
-    # 3. OCR fallback if text is too short
-    combined = "\n".join(extracted).strip()
-    if len(combined) < 100:
-        images = convert_from_path(pdf_path, dpi=300)
-        for img in images:
-            img = img.convert("L")
-            img = ImageOps.invert(img)
-            combined += pytesseract.image_to_string(img, config="--psm 6") + "\n"
-    # Save output
-    output_path = os.path.join(tempfile.gettempdir(), "extracted_text.txt")
-    with open(output_path, "w", encoding="utf-8") as f:
-        f.write(combined)
-    return combined, output_path
-# ✅ Use Gr.Interface (NOT Blocks) with correct api_name
 app = gr.Interface(
     fn=extract_text_from_pdf,
     inputs=gr.File(label="📤 Upload PDF", file_types=[".pdf"]),
     outputs=[
         gr.Textbox(label="📄 Extracted Text", lines=25, show_copy_button=True),
-        gr.File(label="📥 Download .txt")
     ],
-    title="Advanced PDF Extractor",
-    description="Extract text + tables + OCR from scanned/digital PDFs.",
     allow_flagging="never",
 )
-app.launch()

 from pdf2image import convert_from_path
 import pytesseract, pdfplumber, camelot
 from PIL import Image, ImageOps
+import ocrmypdf
+import subprocess
 def extract_text_from_pdf(file):
     extracted = []
     pdf_path = file.name
+    # Create temporary paths for OCR'd PDF and text output
+    temp_dir = tempfile.gettempdir()
+    ocr_pdf_path = os.path.join(temp_dir, "ocr_searchable.pdf")
+    output_txt_path = os.path.join(temp_dir, "extracted_text.txt")
     try:
+        # Step 1: Use OCRmyPDF to create a searchable PDF
+        print("Processing PDF with OCRmyPDF...")
+        ocrmypdf.ocr(
+            pdf_path,
+            ocr_pdf_path,
+            deskew=True,
+            clean=True,
+            force_ocr=False,  # Only OCR if needed
+            skip_text=False,
+            optimize=1
+        )
+        # Step 2: Extract text from the OCR'd searchable PDF using pdfplumber
+        print("Extracting text from OCR'd PDF...")
+        with pdfplumber.open(ocr_pdf_path) as pdf:
+            for page_num, page in enumerate(pdf.pages):
                 text = page.extract_text(layout=True)
                 if text:
+                    extracted.append(f"--- Page {page_num + 1} ---\n{text}")
+                # Extract tables if any
                 tables = page.extract_tables()
+                for table_num, table in enumerate(tables):
+                    if table:
+                        table_text = f"TABLE {table_num + 1} (Page {page_num + 1}):\n"
+                        table_text += "\n".join([", ".join([str(cell) if cell else "" for cell in row]) for row in table])
+                        extracted.append(table_text)
+        # Step 3: Try Camelot for additional table extraction
+        try:
+            tables = camelot.read_pdf(ocr_pdf_path, pages="all", flavor="lattice")
+            for i, table in enumerate(tables):
+                extracted.append(f"CAMELOT TABLE {i + 1}:\n{table.df.to_csv(index=False)}")
+        except Exception as e:
+            print(f"Camelot extraction failed: {e}")
+        # Combine all extracted text
+        combined_text = "\n\n".join(extracted).strip()
+        # If still no text, fallback to direct OCR
+        if len(combined_text) < 50:
+            print("Fallback to direct OCR...")
+            images = convert_from_path(pdf_path, dpi=300)
+            ocr_text = []
+            for i, img in enumerate(images):
+                img = img.convert("L")
+                img = ImageOps.invert(img)
+                page_text = pytesseract.image_to_string(img, config="--psm 6")
+                if page_text.strip():
+                    ocr_text.append(f"--- Page {i + 1} (Direct OCR) ---\n{page_text}")
+            combined_text = "\n\n".join(ocr_text)
+        # Save the extracted text
+        with open(output_txt_path, "w", encoding="utf-8") as f:
+            f.write(combined_text)
+        return combined_text, output_txt_path, ocr_pdf_path
     except Exception as e:
+        error_msg = f"Error processing PDF: {str(e)}\n\nFalling back to original extraction methods..."
+        print(error_msg)
+        # Fallback to original method if OCRmyPDF fails
+        try:
+            with pdfplumber.open(pdf_path) as pdf:
+                for page in pdf.pages:
+                    text = page.extract_text(layout=True)
+                    if text:
+                        extracted.append(text)
+                    tables = page.extract_tables()
+                    for table in tables:
+                        extracted.append("TABLE:\n" + "\n".join([", ".join(row) for row in table]))
+        except Exception as e2:
+            print("pdfplumber error:", e2)
+        # OCR fallback if text is too short
+        combined = "\n".join(extracted).strip()
+        if len(combined) < 100:
+            images = convert_from_path(pdf_path, dpi=300)
+            for img in images:
+                img = img.convert("L")
+                img = ImageOps.invert(img)
+                combined += pytesseract.image_to_string(img, config="--psm 6") + "\n"
+        # Save fallback output
+        with open(output_txt_path, "w", encoding="utf-8") as f:
+            f.write(combined)
+        return combined, output_txt_path, pdf_path  # Return original PDF if OCR failed
+# Create Gradio interface
 app = gr.Interface(
     fn=extract_text_from_pdf,
     inputs=gr.File(label="📤 Upload PDF", file_types=[".pdf"]),
     outputs=[
         gr.Textbox(label="📄 Extracted Text", lines=25, show_copy_button=True),
+        gr.File(label="📥 Download Extracted Text (.txt)"),
+        gr.File(label="📥 Download OCR'd Searchable PDF")
     ],
+    title="Advanced PDF OCR Extractor with OCRmyPDF",
+    description="Upload a PDF to get: 1) Extracted text displayed and downloadable as .txt, 2) OCR'd searchable PDF download. Uses OCRmyPDF for superior OCR quality.",
     allow_flagging="never",
 )
+if __name__ == "__main__":
+    app.launch()