import marimo

__generated_with = "0.10.17"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    import pdfplumber
    return mo, pdfplumber


@app.cell
def _(mo):
    # File picker used to choose the PDF that the cells below process.
    file = mo.ui.file_browser(initial_path="")
    file
    return (file,)


@app.cell
def _(file, mo, pdfplumber):
    # Nothing is selected when the notebook first loads; halt this cell (and
    # everything downstream) instead of failing inside pdfplumber.open(None).
    mo.stop(
        file.path() is None,
        mo.md("Select a PDF in the file browser to continue."),
    )

    with pdfplumber.open(file.path()) as pdf:
        # Join all pages into a single string. A page without extractable
        # text may yield None, which str.join rejects, so fall back to "".
        pdf_text = " ".join(page.extract_text() or "" for page in pdf.pages)
    # NOTE: `pdf` is already closed at this point; only `pdf_text` is useful
    # to other cells.
    return pdf, pdf_text


@app.cell
def _():
    from everycure.extract import extract_pdf_entities
    return (extract_pdf_entities,)


@app.cell
def _(pdf_text):
    from transformers import (
        AutoModelForTokenClassification,
        AutoTokenizer,
        pipeline,
    )

    # Token-classification pipeline built on the d4data biomedical NER model.
    tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
    model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")

    pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")  # pass device=0 if using gpu
    # NOTE(review): the whole document is passed in a single call; confirm
    # how the model's maximum input length affects long PDFs.
    result = pipe(pdf_text)
    result
    return (
        AutoModelForTokenClassification,
        AutoTokenizer,
        model,
        pipe,
        pipeline,
        result,
        tokenizer,
    )


@app.cell
def _():
    from gliner import GLiNER

    # Candidate entity labels handed to GLiNER, grouped by parent category.
    MEDICAL_LABELS = [
        # Parent: NamedThing -> biological_entity
        "gene",
        "protein",
        "protein_isoform",
        "cell",
        "disease",
        "phenotypic_feature",
        "clinical_finding",
        "anatomical_entity",
        "pathway",
        "biological_process",
        # Parent: NamedThing -> chemical_entity
        "drug",
        "small_molecule",
        "food_additive",
        "chemical_mixture",
        "molecular_entity",
        # Parent: NamedThing -> clinical_entity
        "clinical_intervention",
        "clinical_trial",
        "hospitalization",
        # Parent: NamedThing -> planetary_entity
        "geographic_location",
        "environmental_feature",
        "environmental_process",
        # Parent: NamedThing -> information_content_entity
        "publication",
        "journal_article",
        "book",
        "patent",
        "dataset",
        "study_result",
        # Parent: NamedThing -> organismal_entity
        "human",
        "mammal",
        "plant",
        "virus",
        "bacterium",
        "cell_line",
        # Parent: NamedThing -> attribute
        "biological_sex",
        "clinical_attribute",
        "socioeconomic_attribute",
        "environmental_exposure",
        "drug_exposure",
        # Parent: NamedThing -> procedure
        "procedure",
        # Parent: NamedThing -> treatment
        "treatment",
        # Parent: NamedThing -> device
        "device",
        # Parent: NamedThing -> diagnostic_aid
        "diagnostic_aid",
        # Parent: NamedThing -> event
        "event",
    ]

    gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")

    def gliner_medical_ner(text, threshold=0.7):
        """Return GLiNER entities found in *text* as ``{"text", "label"}`` dicts.

        *text* is matched against ``MEDICAL_LABELS`` and *threshold* is
        forwarded to ``predict_entities``. Matches whose text is two
        characters or shorter are discarded as likely fragments.
        """
        # NOTE(review): the full text is sent in one call; confirm whether the
        # model truncates long inputs.
        entities = gliner_model.predict_entities(
            text, MEDICAL_LABELS, threshold=threshold
        )
        return [
            {"text": ent["text"], "label": ent["label"]}
            for ent in entities
            if len(ent["text"]) > 2  # Filter short fragments
        ]
    return GLiNER, MEDICAL_LABELS, gliner_medical_ner, gliner_model


@app.cell
def _(gliner_medical_ner, pdf_text):
    result_gli = gliner_medical_ner(pdf_text)
    return (result_gli,)


@app.cell
def _(result_gli):
    result_gli
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()