File size: 3,660 Bytes
73b49a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import marimo

# Version string stored with the notebook file (presumably the marimo release
# that last wrote it — confirm before editing by hand).
__generated_with = "0.10.17"
# Application object; its `cell` decorator is applied to every cell function below.
app = marimo.App(width="medium")


@app.cell
def _():
    # Import the notebook UI library (as `mo`) and the PDF reader, and hand
    # both to other cells through the returned tuple.
    import marimo as mo
    import pdfplumber
    return mo, pdfplumber


@app.cell
def _(mo):
    # Build a file-browser widget with an empty initial path.
    file = mo.ui.file_browser(initial_path="")
    # Bare expression: presumably what marimo renders as this cell's output.
    file
    return (file,)


@app.cell
def _(file, pdfplumber):
    # Open the PDF at the path reported by the file browser and flatten its
    # pages into one string for the NER cells.
    with pdfplumber.open(file.path()) as pdf:
        # Join all pages into a single space-separated string.
        # `extract_text()` can return None for a page with no extractable text
        # (older pdfplumber releases); `str.join` would then raise TypeError,
        # so such pages contribute an empty string instead.
        pdf_text = " ".join(p.extract_text() or "" for p in pdf.pages)
    # NOTE: `pdf` has already been closed by the `with` block; it is returned
    # only so the cell's exported names stay unchanged.
    return pdf, pdf_text


@app.cell
def _():
    # Import the project's PDF entity extractor and export it to other cells.
    # NOTE(review): no cell visible in this file uses it — confirm it is still needed.
    from everycure.extract import extract_pdf_entities
    return (extract_pdf_entities,)


@app.cell
def _(pdf_text):
    from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

    # Tokenizer and model are loaded from the same checkpoint; the id is kept
    # in an underscore-prefixed name so it does not become a new exported
    # variable of this cell.
    _checkpoint = "d4data/biomedical-ner-all"
    tokenizer = AutoTokenizer.from_pretrained(_checkpoint)
    model = AutoModelForTokenClassification.from_pretrained(_checkpoint)

    # Token-classification pipeline using the "simple" aggregation strategy.
    # pass device=0 if using gpu
    pipe = pipeline(
        "ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
    )

    # NOTE(review): the whole PDF text goes through in a single call — confirm
    # how the pipeline treats input longer than the model's maximum sequence
    # length.
    result = pipe(pdf_text)
    result
    return (
        AutoModelForTokenClassification,
        AutoTokenizer,
        model,
        pipe,
        pipeline,
        result,
        tokenizer,
    )


@app.cell
def _():
    from gliner import GLiNER

    # Entity labels handed to GLiNER, grouped by the parent category named in
    # each section comment.
    MEDICAL_LABELS = [
        # --- biological_entity ---
        "gene",
        "protein",
        "protein_isoform",
        "cell",
        "disease",
        "phenotypic_feature",
        "clinical_finding",
        "anatomical_entity",
        "pathway",
        "biological_process",

        # --- chemical_entity ---
        "drug",
        "small_molecule",
        "food_additive",
        "chemical_mixture",
        "molecular_entity",

        # --- clinical_entity ---
        "clinical_intervention",
        "clinical_trial",
        "hospitalization",

        # --- planetary_entity ---
        "geographic_location",
        "environmental_feature",
        "environmental_process",

        # --- information_content_entity ---
        "publication",
        "journal_article",
        "book",
        "patent",
        "dataset",
        "study_result",

        # --- organismal_entity ---
        "human",
        "mammal",
        "plant",
        "virus",
        "bacterium",
        "cell_line",

        # --- attribute ---
        "biological_sex",
        "clinical_attribute",
        "socioeconomic_attribute",
        "environmental_exposure",
        "drug_exposure",

        # --- procedure ---
        "procedure",

        # --- treatment ---
        "treatment",

        # --- device ---
        "device",

        # --- diagnostic_aid ---
        "diagnostic_aid",

        # --- event ---
        "event",
    ]

    gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")

    def gliner_medical_ner(text, threshold=0.7):
        """Return GLiNER entities found in *text* as ``{"text", "label"}`` dicts.

        Predictions come from ``gliner_model.predict_entities`` called with
        ``MEDICAL_LABELS`` and the given ``threshold``. Entities whose text is
        two characters or shorter are dropped.
        """
        kept = []
        for span in gliner_model.predict_entities(text, MEDICAL_LABELS, threshold=threshold):
            surface = span["text"]
            if len(surface) <= 2:
                # Too short to be a useful mention; skip the fragment.
                continue
            kept.append({"text": surface, "label": span["label"]})
        return kept

    return GLiNER, MEDICAL_LABELS, gliner_medical_ner, gliner_model


@app.cell
def _(gliner_medical_ner, pdf_text):
    # Run the GLiNER helper over the extracted PDF text with its default threshold.
    result_gli = gliner_medical_ner(pdf_text)
    return (result_gli,)


@app.cell
def _(result_gli):
    # Bare expression: presumably rendered by marimo as this cell's output.
    result_gli
    return


@app.cell
def _():
    # Intentionally empty cell (no code, no exported names).
    return


# Run the app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()