# Hugging Face Space: lucharo/everycure-ner-pdf
# Space status shown on page: Runtime error
# File size: 3,660 Bytes
# Revision shown on page: 73b49a2

import marimo

# Version string written into the notebook header (presumably the marimo
# release that last saved this file — confirm before editing by hand).
__generated_with = "0.10.17"
# The notebook application object; every `@app.cell` function below is
# registered on it, and `app.run()` at the bottom executes it.
app = marimo.App(width="medium")


@app.cell
def _():
    # Import cell: binds `mo` (marimo) and `pdfplumber` and returns them so
    # the cells that list these names as parameters can use them.
    import marimo as mo
    import pdfplumber
    return mo, pdfplumber


@app.cell
def _(mo):
    # File picker cell: builds a `mo.ui.file_browser` widget with
    # `initial_path=""` and exposes it as `file` for the PDF-reading cell.
    file = mo.ui.file_browser(initial_path="")
    # Bare trailing expression — presumably what marimo renders as this
    # cell's output (the browser widget itself); keep it last.
    file
    return (file,)


@app.cell
def _(file, pdfplumber):
    # Reads the PDF chosen in the file browser and flattens it into a single
    # string, `pdf_text`, which the NER cells take as input.
    #
    # NOTE(review): `file.path()` presumably yields None until the user has
    # selected something, in which case `pdfplumber.open` will fail — confirm
    # and consider gating this cell on a selection.
    with pdfplumber.open(file.path()) as pdf:
        # Join all pages into single string. `extract_text()` can come back
        # as None for a page with no text layer (seen in some pdfplumber
        # versions), and `str.join` raises TypeError on None, so substitute
        # an empty string for such pages.
        pdf_text = " ".join(page.extract_text() or "" for page in pdf.pages)
    # `pdf` is still returned so the cell's exported names stay the same, but
    # the `with` block has already exited here — treat the handle as closed.
    return pdf, pdf_text


@app.cell
def _():
    # Imports the project's own `extract_pdf_entities` helper and exports it.
    # NOTE(review): no cell visible in this file takes `extract_pdf_entities`
    # as a parameter, so the import looks unused here; the `everycure`
    # package must nevertheless be importable for this cell to run.
    from everycure.extract import extract_pdf_entities
    return (extract_pdf_entities,)


@app.cell
def _(pdf_text):
    # Token-classification NER over the extracted PDF text using the
    # "d4data/biomedical-ner-all" checkpoint; the aggregated predictions are
    # shown as the cell output and exported as `result`.
    from transformers import (
        AutoModelForTokenClassification,
        AutoTokenizer,
        pipeline,
    )

    # Underscore-prefixed so the name stays private to this cell and the set
    # of exported names is unchanged.
    _checkpoint = "d4data/biomedical-ner-all"

    tokenizer = AutoTokenizer.from_pretrained(_checkpoint)
    model = AutoModelForTokenClassification.from_pretrained(_checkpoint)

    # pass device=0 if using gpu
    pipe = pipeline(
        "ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
    )
    result = pipe(pdf_text)

    # Trailing bare expression: keeps the predictions as the cell's output.
    result
    return (
        AutoModelForTokenClassification,
        AutoTokenizer,
        model,
        pipe,
        pipeline,
        result,
        tokenizer,
    )


@app.cell
def _():
    # GLiNER setup cell: defines the candidate label list, loads the
    # "knowledgator/gliner-multitask-large-v0.5" model, and exports a small
    # helper that runs the model and trims its output.
    from gliner import GLiNER

    # Candidate entity labels passed to GLiNER, grouped by parent category.
    MEDICAL_LABELS = [
        # Parent: NamedThing -> biological_entity
        "gene",
        "protein",
        "protein_isoform",
        "cell",
        "disease",
        "phenotypic_feature",
        "clinical_finding",
        "anatomical_entity",
        "pathway",
        "biological_process",

        # Parent: NamedThing -> chemical_entity
        "drug",
        "small_molecule",
        "food_additive",
        "chemical_mixture",
        "molecular_entity",

        # Parent: NamedThing -> clinical_entity
        "clinical_intervention",
        "clinical_trial",
        "hospitalization",

        # Parent: NamedThing -> planetary_entity
        "geographic_location",
        "environmental_feature",
        "environmental_process",

        # Parent: NamedThing -> information_content_entity
        "publication",
        "journal_article",
        "book",
        "patent",
        "dataset",
        "study_result",

        # Parent: NamedThing -> organismal_entity
        "human",
        "mammal",
        "plant",
        "virus",
        "bacterium",
        "cell_line",

        # Parent: NamedThing -> attribute
        "biological_sex",
        "clinical_attribute",
        "socioeconomic_attribute",
        "environmental_exposure",
        "drug_exposure",

        # Parent: NamedThing -> procedure
        "procedure",

        # Parent: NamedThing -> treatment
        "treatment",

        # Parent: NamedThing -> device
        "device",

        # Parent: NamedThing -> diagnostic_aid
        "diagnostic_aid",

        # Parent: NamedThing -> event
        "event",
    ]

    gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")

    def gliner_medical_ner(text, threshold=0.7):
        """Run GLiNER over *text* and return the kept entities.

        Args:
            text: Input string handed to ``gliner_model.predict_entities``.
            threshold: Forwarded unchanged as the ``threshold`` keyword of
                ``predict_entities``.

        Returns:
            A list of ``{"text": ..., "label": ...}`` dicts, one per
            predicted entity whose text is longer than two characters.
        """
        predictions = gliner_model.predict_entities(
            text, MEDICAL_LABELS, threshold=threshold
        )
        kept = []
        for prediction in predictions:
            if len(prediction["text"]) <= 2:
                # Filter short fragments
                continue
            kept.append(
                {"text": prediction["text"], "label": prediction["label"]}
            )
        return kept

    return GLiNER, MEDICAL_LABELS, gliner_medical_ner, gliner_model


@app.cell
def _(gliner_medical_ner, pdf_text):
    # Runs the GLiNER helper on the full PDF text with its default threshold
    # and exports the filtered entity list as `result_gli`.
    result_gli = gliner_medical_ner(pdf_text)
    return (result_gli,)


@app.cell
def _(result_gli):
    # Display-only cell: the bare expression below is its sole statement, so
    # nothing is exported (presumably marimo renders it as the cell output).
    result_gli
    return


@app.cell
def _():
    # Empty placeholder cell: takes no inputs and defines nothing.
    return


# Script entry point: execute the notebook app when this file is run
# directly rather than imported.
if __name__ == "__main__":
    app.run()