"""Gradio app that turns an uploaded PDF into text plus a searchable PDF.

The primary path runs OCRmyPDF to add a text layer, then reads text and
tables back with pdfplumber and Camelot. If any part of that path fails,
the original PDF is read directly and, when little text is found, its pages
are rasterized and passed through Tesseract.
"""

import os
import subprocess
import tempfile

import camelot
import gradio as gr
import ocrmypdf
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
from PIL import Image, ImageOps

# Below these character counts the extracted text is treated as "nothing
# useful found" and page images are OCR'd directly instead.
_MIN_CHARS_AFTER_OCRMYPDF = 50
_MIN_CHARS_IN_FALLBACK = 100


def _format_table(table):
    """Render a pdfplumber table (a list of rows) as comma-separated lines.

    pdfplumber reports empty cells as ``None``; those become empty strings so
    that joining a row never raises ``TypeError``.
    """
    return "\n".join(
        ", ".join(str(cell) if cell else "" for cell in row) for row in table
    )


def _ocr_page_images(pdf_path, dpi=300):
    """Rasterize every page of *pdf_path* and return one OCR string per page."""
    page_texts = []
    for img in convert_from_path(pdf_path, dpi=dpi):
        img = img.convert("L")
        # NOTE(review): inverting suits light-on-dark scans; for ordinary
        # dark-on-light documents it may reduce Tesseract accuracy. Kept to
        # preserve existing behavior -- confirm against real inputs.
        img = ImageOps.invert(img)
        page_texts.append(pytesseract.image_to_string(img, config="--psm 6"))
    return page_texts


def _extract_via_ocrmypdf(pdf_path, ocr_pdf_path):
    """Create a searchable PDF at *ocr_pdf_path* and return its text.

    Raises whatever OCRmyPDF or pdfplumber raise; the caller treats any
    exception as a signal to use the fallback path.
    """
    # Step 1: Use OCRmyPDF to create a searchable PDF
    print("Processing PDF with OCRmyPDF...")
    ocrmypdf.ocr(
        pdf_path,
        ocr_pdf_path,
        deskew=True,
        clean=True,
        force_ocr=False,
        # Only OCR pages that need it. With skip_text=False OCRmyPDF aborts
        # as soon as it meets a page that already carries text.
        skip_text=True,
        optimize=1,
    )

    # Step 2: Extract text from the OCR'd searchable PDF using pdfplumber
    print("Extracting text from OCR'd PDF...")
    extracted = []
    with pdfplumber.open(ocr_pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text(layout=True)
            if text:
                extracted.append(f"--- Page {page_num} ---\n{text}")

            for table_num, table in enumerate(page.extract_tables(), start=1):
                if table:
                    header = f"TABLE {table_num} (Page {page_num}):\n"
                    extracted.append(header + _format_table(table))

    # Step 3: Try Camelot for additional table extraction (best effort)
    try:
        tables = camelot.read_pdf(ocr_pdf_path, pages="all", flavor="lattice")
        for i, table in enumerate(tables):
            extracted.append(
                f"CAMELOT TABLE {i + 1}:\n{table.df.to_csv(index=False)}"
            )
    except Exception as e:
        print(f"Camelot extraction failed: {e}")

    combined_text = "\n\n".join(extracted).strip()

    # If still (almost) no text, OCR the page images of the original PDF.
    if len(combined_text) < _MIN_CHARS_AFTER_OCRMYPDF:
        print("Fallback to direct OCR...")
        ocr_text = [
            f"--- Page {i + 1} (Direct OCR) ---\n{page_text}"
            for i, page_text in enumerate(_ocr_page_images(pdf_path))
            if page_text.strip()
        ]
        combined_text = "\n\n".join(ocr_text)

    return combined_text


def _extract_without_ocrmypdf(pdf_path):
    """Extract text straight from the original PDF when OCRmyPDF failed."""
    # Start from an empty list so nothing gathered by the failed primary
    # path is duplicated in the output.
    extracted = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text(layout=True)
                if text:
                    extracted.append(text)
                for table in page.extract_tables():
                    if table:
                        extracted.append("TABLE:\n" + _format_table(table))
    except Exception as e2:
        print("pdfplumber error:", e2)

    # OCR fallback if text is too short
    combined = "\n".join(extracted).strip()
    if len(combined) < _MIN_CHARS_IN_FALLBACK:
        combined += "".join(
            page_text + "\n" for page_text in _ocr_page_images(pdf_path)
        )
    return combined


def extract_text_from_pdf(file):
    """Extract text from an uploaded PDF and produce downloadable artifacts.

    Args:
        file: The value Gradio passes for the ``gr.File`` input. Either an
            object with a ``name`` attribute holding the path, or the path
            itself (assumed to depend on the Gradio version -- TODO confirm).
            ``None`` when nothing was uploaded.

    Returns:
        A 3-tuple ``(text, txt_path, pdf_path)``: the extracted text, the
        path of a UTF-8 ``.txt`` file containing it, and the path of the
        searchable PDF. When OCRmyPDF fails, ``pdf_path`` is the original
        upload. When no file was uploaded, the text is a short notice and
        both paths are ``None``.
    """
    if file is None:
        return "No PDF file was uploaded.", None, None

    pdf_path = getattr(file, "name", file)

    # A fresh directory per request: fixed names in the shared temp dir
    # would let concurrent requests overwrite each other's output files.
    # The directory is intentionally not deleted here because the returned
    # paths must stay readable after this function returns.
    work_dir = tempfile.mkdtemp(prefix="pdf_ocr_")
    ocr_pdf_path = os.path.join(work_dir, "ocr_searchable.pdf")
    output_txt_path = os.path.join(work_dir, "extracted_text.txt")

    try:
        combined_text = _extract_via_ocrmypdf(pdf_path, ocr_pdf_path)
        result_pdf_path = ocr_pdf_path
    except Exception as e:
        error_msg = f"Error processing PDF: {str(e)}\n\nFalling back to original extraction methods..."
        print(error_msg)
        combined_text = _extract_without_ocrmypdf(pdf_path)
        result_pdf_path = pdf_path  # Return original PDF if OCR failed

    # Save the extracted text
    with open(output_txt_path, "w", encoding="utf-8") as f:
        f.write(combined_text)

    return combined_text, output_txt_path, result_pdf_path


# Create Gradio interface
app = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(label="📤 Upload PDF", file_types=[".pdf"]),
    outputs=[
        gr.Textbox(label="📄 Extracted Text", lines=25, show_copy_button=True),
        gr.File(label="📥 Download Extracted Text (.txt)"),
        gr.File(label="📥 Download OCR'd Searchable PDF"),
    ],
    title="Advanced PDF OCR Extractor with OCRmyPDF",
    description="Upload a PDF to get: 1) Extracted text displayed and downloadable as .txt, 2) OCR'd searchable PDF download. Uses OCRmyPDF for superior OCR quality.",
    allow_flagging="never",
)

if __name__ == "__main__":
    app.launch()