from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType

# Import required classes for building a custom converter
from docling.document_converter import DocumentConverter, PdfFormatOption, InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
import spaces


@spaces.GPU
def convert_to_markdown(file_objs, url, do_ocr: bool, do_table_structure: bool) -> str:
    """Convert the given PDF source(s) to Markdown with a custom Docling converter.

    Args:
        file_objs: Uploaded file path(s) handed to ``DoclingLoader`` as
            ``file_path``. Takes precedence over ``url`` when non-empty.
            NOTE(review): presumably a path or list of paths coming from a UI
            file widget -- confirm against the caller.
        url: Fallback source used when ``file_objs`` is missing or empty.
        do_ocr: Value assigned to ``PdfPipelineOptions.do_ocr``.
        do_table_structure: Value assigned to
            ``PdfPipelineOptions.do_table_structure``.

    Returns:
        The Markdown content of every loaded document, separated by a blank
        line. With a single loaded document this is exactly that document's
        ``page_content``; an empty string is returned if nothing was loaded.

    Raises:
        ValueError: If neither ``file_objs`` nor ``url`` provides a source.
    """
    # Truthiness (rather than ``is not None``) lets an empty upload such as
    # ``[]`` or ``""`` fall back to the URL instead of shadowing it.
    source = file_objs or url
    if not source:
        raise ValueError("Provide a file or a URL to convert.")

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = do_ocr
    pipeline_options.do_table_structure = do_table_structure

    pdf_format_options = PdfFormatOption(
        pipeline_options=pipeline_options,
        backend=PyPdfiumDocumentBackend,
    )

    # The converter is restricted to PDF input and uses the options above.
    doc_converter = DocumentConverter(
        allowed_formats=[InputFormat.PDF],
        format_options={InputFormat.PDF: pdf_format_options},
    )

    # Pass the custom converter to the DoclingLoader.
    loader = DoclingLoader(
        file_path=source,
        export_type=ExportType.MARKDOWN,
        converter=doc_converter,
    )

    docs = loader.load()
    # Join every document instead of indexing ``docs[0]``: this avoids an
    # IndexError on an empty result and keeps the output of additional inputs.
    return "\n\n".join(doc.page_content for doc in docs)