Commit 73b49a2
Parent(s): 83728f4
Luis Chaves committed: first commit
Browse files
- .gitattributes +0 -35
- .gitignore +13 -0
- .python-version +1 -0
- Dockerfile +15 -0
- README.md +57 -1
- app.py +50 -0
- explore.py +158 -0
- extractor.py +87 -0
- learning.md +73 -0
- openapi.yaml +66 -0
- pyproject.toml +15 -0
- test_api.py +24 -0
- uv.lock +0 -0
.gitattributes
DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,13 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # Virtual environments
+ .venv
+
+ *pdf
+ __pycache__/
.python-version
ADDED
@@ -0,0 +1 @@
+ 3.12
Dockerfile
ADDED
@@ -0,0 +1,15 @@
+ FROM python:3.12-slim
+
+ WORKDIR /code
+
+ # Copy only the necessary files
+ COPY ./app.py /code/
+ COPY ./extractor.py /code/
+ COPY ./pyproject.toml /code/
+ COPY ./openapi.yaml /code/
+
+ # Install dependencies
+ RUN pip install --no-cache-dir .[all]
+
+ # Run the application
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
@@ -7,4 +7,60 @@ sdk: docker
  pinned: false
  ---

-
+ ## Local dev
+
+ ```
+ uv run uvicorn app:app --reload --port 8000
+ ```
+
+ If your PDFs are in a folder called `pdfs/`, run:
+
+ ```
+ curl -v -X POST -F "file=@pdfs/MECFS systematic review.pdf" http://localhost:8000/api/v1/extract
+ ```
+
+ Or use the automatic Swagger documentation at `http://localhost:8000/docs`.
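+
+ An illustrative response body, reusing the example values from `openapi.yaml` (real output depends on the PDF and the model):
+
+ ```json
+ [
+   {
+     "entity": "CCR5",
+     "context": "... have identified rare mutations in CCR5 that confer resilience against ...",
+     "start": 25,
+     "end": 34
+   }
+ ]
+ ```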
+
+ ## Deploying to HuggingFace Spaces
+
+ ```dockerfile
+ FROM python:3.12-slim
+
+ WORKDIR /code
+
+ # Copy only the necessary files
+ COPY ./app.py /code/
+ COPY ./extractor.py /code/
+ COPY ./pyproject.toml /code/
+ COPY ./openapi.yaml /code/
+
+ # Install dependencies
+ RUN pip install --no-cache-dir "fastapi[all]" python-multipart pydantic
+
+ # Run the application
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+ ```
+
+ Then push to HuggingFace:
+
+ ```bash
+ # Initialize git repository if not already done
+ git init
+ git add .
+ git commit -m "Initial commit"
+
+ # Add HuggingFace remote
+ git remote add hf https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
+
+ # Push to HuggingFace
+ git push hf main
+ ```
+
+ Note: Replace `YOUR_USERNAME` and `YOUR_SPACE_NAME` with your HuggingFace username and the name you chose for your Space.
+
+ ### Important Notes
+
+ - HuggingFace Spaces uses port 7860 by default
+ - The API will be available at `https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME`
+ - Make sure your model files are included in the repository if needed
+ - The free tier of HuggingFace Spaces has limitations on CPU/RAM usage
app.py
ADDED
@@ -0,0 +1,50 @@
+ from fastapi import FastAPI, UploadFile, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from typing import List
+ from extractor import Entity, extract_entities_from_pdf
+ import logging
+ import uvicorn
+
+ # Set up logging
+ logging.basicConfig(level=logging.DEBUG)
+ logger = logging.getLogger(__name__)
+
+ app = FastAPI(
+     title="Medical Entity Extraction API",
+     description="This API allows users to extract medically relevant entities from PDF documents using a pre-trained NER model.",
+     version="1.0.0"
+ )
+
+ # Add CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ @app.post("/api/v1/extract", response_model=List[Entity])
+ async def extract_entities(file: UploadFile):
+     logger.debug(f"Received request for file: {file.filename}")
+
+     if not file:
+         logger.error("No file provided")
+         raise HTTPException(status_code=400, detail="No file provided")
+
+     if not file.filename.lower().endswith('.pdf'):
+         logger.error(f"Invalid file type: {file.filename}")
+         raise HTTPException(status_code=415, detail="File must be a PDF")
+
+     try:
+         logger.debug("Starting entity extraction")
+         result = extract_entities_from_pdf(file)
+         logger.debug(f"Successfully extracted {len(result)} entities")
+         return result
+     except Exception as e:
+         logger.error(f"Error during extraction: {str(e)}", exc_info=True)
+         raise HTTPException(status_code=500, detail=str(e))
+
+ if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=7860)
explore.py
ADDED
@@ -0,0 +1,158 @@
+ import marimo
+
+ __generated_with = "0.10.17"
+ app = marimo.App(width="medium")
+
+
+ @app.cell
+ def _():
+     import marimo as mo
+     import pdfplumber
+     return mo, pdfplumber
+
+
+ @app.cell
+ def _(mo):
+     file = mo.ui.file_browser(initial_path="")
+     file
+     return (file,)
+
+
+ @app.cell
+ def _(file, pdfplumber):
+     with pdfplumber.open(file.path()) as pdf:
+         # Join all pages into a single string (extract_text() can return
+         # None for image-only pages, hence the `or ""`)
+         pdf_text = " ".join((p.extract_text() or "") for p in pdf.pages)
+     return pdf, pdf_text
+
+
+ @app.cell
+ def _():
+     from everycure.extract import extract_pdf_entities
+     return (extract_pdf_entities,)
+
+
+ @app.cell
+ def _(pdf_text):
+     from transformers import pipeline
+     from transformers import AutoTokenizer, AutoModelForTokenClassification
+
+     tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
+     model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")
+
+     pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")  # pass device=0 if using gpu
+     result = pipe(pdf_text)
+     result
+     return (
+         AutoModelForTokenClassification,
+         AutoTokenizer,
+         model,
+         pipe,
+         pipeline,
+         result,
+         tokenizer,
+     )
+
+
+ @app.cell
+ def _():
+     from gliner import GLiNER
+
+     # Curated medical labels based on your domain needs
+     MEDICAL_LABELS = [
+         # Parent: NamedThing -> biological_entity
+         "gene",
+         "protein",
+         "protein_isoform",
+         "cell",
+         "disease",
+         "phenotypic_feature",
+         "clinical_finding",
+         "anatomical_entity",
+         "pathway",
+         "biological_process",
+
+         # Parent: NamedThing -> chemical_entity
+         "drug",
+         "small_molecule",
+         "food_additive",
+         "chemical_mixture",
+         "molecular_entity",
+
+         # Parent: NamedThing -> clinical_entity
+         "clinical_intervention",
+         "clinical_trial",
+         "hospitalization",
+
+         # Parent: NamedThing -> planetary_entity
+         "geographic_location",
+         "environmental_feature",
+         "environmental_process",
+
+         # Parent: NamedThing -> information_content_entity
+         "publication",
+         "journal_article",
+         "book",
+         "patent",
+         "dataset",
+         "study_result",
+
+         # Parent: NamedThing -> organismal_entity
+         "human",
+         "mammal",
+         "plant",
+         "virus",
+         "bacterium",
+         "cell_line",
+
+         # Parent: NamedThing -> attribute
+         "biological_sex",
+         "clinical_attribute",
+         "socioeconomic_attribute",
+         "environmental_exposure",
+         "drug_exposure",
+
+         # Parent: NamedThing -> procedure
+         "procedure",
+
+         # Parent: NamedThing -> treatment
+         "treatment",
+
+         # Parent: NamedThing -> device
+         "device",
+
+         # Parent: NamedThing -> diagnostic_aid
+         "diagnostic_aid",
+
+         # Parent: NamedThing -> event
+         "event",
+     ]
+
+     gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")
+
+     def gliner_medical_ner(text, threshold=0.7):
+         entities = gliner_model.predict_entities(text, MEDICAL_LABELS, threshold=threshold)
+         return [{"text": ent["text"], "label": ent["label"]}
+                 for ent in entities if len(ent["text"]) > 2]  # Filter short fragments
+     return GLiNER, MEDICAL_LABELS, gliner_medical_ner, gliner_model
+
+
+ @app.cell
+ def _(gliner_medical_ner, pdf_text):
+     result_gli = gliner_medical_ner(pdf_text)
+     return (result_gli,)
+
+
+ @app.cell
+ def _(result_gli):
+     result_gli
+     return
+
+
+ @app.cell
+ def _():
+     return
+
+
+ if __name__ == "__main__":
+     app.run()
extractor.py
ADDED
@@ -0,0 +1,87 @@
+ from typing import List
+ from pydantic import BaseModel
+ import pdfplumber
+ from fastapi import UploadFile
+ from gliner import GLiNER
+ import logging
+
+ # Set up logging
+ logging.basicConfig(level=logging.DEBUG)
+ logger = logging.getLogger(__name__)
+
+ class Entity(BaseModel):
+     entity: str
+     context: str
+     start: int
+     end: int
+
+ # Curated medical labels
+ MEDICAL_LABELS = [
+     "gene", "protein", "protein_isoform", "cell", "disease",
+     "phenotypic_feature", "clinical_finding", "anatomical_entity",
+     "pathway", "biological_process", "drug", "small_molecule",
+     "food_additive", "chemical_mixture", "molecular_entity",
+     "clinical_intervention", "clinical_trial", "hospitalization",
+     "geographic_location", "environmental_feature", "environmental_process",
+     "publication", "journal_article", "book", "patent", "dataset",
+     "study_result", "human", "mammal", "plant", "virus", "bacterium",
+     "cell_line", "biological_sex", "clinical_attribute",
+     "socioeconomic_attribute", "environmental_exposure", "drug_exposure",
+     "procedure", "treatment", "device", "diagnostic_aid", "event"
+ ]
+
+ # Initialize model
+ gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")
+
+ def extract_entities_from_pdf(file: UploadFile) -> List[Entity]:
+     """
+     Extract medical entities from a PDF file using GLiNER.
+
+     Args:
+         file (UploadFile): The uploaded PDF file
+
+     Returns:
+         List[Entity]: List of extracted entities with their context
+     """
+     logger.debug(f"Starting extraction for file: {file.filename}")
+
+     try:
+         # pdfplumber can read the uploaded file object directly
+         with pdfplumber.open(file.file) as pdf:
+             logger.debug(f"Successfully opened PDF with {len(pdf.pages)} pages")
+             # Join all pages into a single string (extract_text() can return
+             # None for image-only pages, hence the `or ""`)
+             pdf_text = " ".join((p.extract_text() or "") for p in pdf.pages)
+             logger.debug(f"Extracted text length: {len(pdf_text)} characters")
+
+         # Extract entities using GLiNER
+         logger.debug("Starting GLiNER entity extraction")
+         entities = gliner_model.predict_entities(pdf_text, MEDICAL_LABELS, threshold=0.7)
+         logger.debug(f"Found {len(entities)} entities")
+
+         # Convert to our Entity model format
+         result = []
+         for ent in entities:
+             if len(ent["text"]) <= 2:  # Skip very short entities
+                 continue
+
+             # Find the context (text surrounding the first occurrence of the entity)
+             start_idx = pdf_text.find(ent["text"])
+             if start_idx != -1:
+                 # Get surrounding context (50 chars before and after)
+                 context_start = max(0, start_idx - 50)
+                 context_end = min(len(pdf_text), start_idx + len(ent["text"]) + 50)
+                 context = pdf_text[context_start:context_end]
+
+                 result.append(Entity(
+                     entity=ent["text"],
+                     context=context,
+                     start=start_idx - context_start,  # Adjust start position relative to context
+                     end=start_idx - context_start + len(ent["text"])
+                 ))
+
+         logger.debug(f"Returning {len(result)} processed entities")
+         return result
+
+     except Exception as e:
+         logger.error(f"Error during extraction: {str(e)}", exc_info=True)
+         raise
learning.md
ADDED
@@ -0,0 +1,73 @@
+ # Every Cure Take Home
+
+ ## How to create an API endpoint that adheres to an OpenAPI spec?
+
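+ FastAPI answers this almost for free: it generates an OpenAPI schema from the route definitions, which can be sanity-checked against a hand-written spec. A minimal sketch (assumes PyYAML is available; not part of the app):
+
+ ```python
+ # FastAPI exposes its generated spec via app.openapi() (served at /openapi.json);
+ # compare it against the hand-written openapi.yaml
+ import yaml
+ from app import app
+
+ with open("openapi.yaml") as f:
+     spec = yaml.safe_load(f)
+
+ generated = app.openapi()
+ assert set(spec["paths"]) <= set(generated["paths"])  # every spec'd path is served
+ ```
+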
+ ## How to host publicly and for free an API?
+
+ Can use Docker + Hugging Face.
+
+ ## What type of Hugging Face models do entity extraction?
+
+ NER models; some are fine-tuned on medical terminology, such as d4data/biomedical-ner-all, BioBERT or ClinicalBERT.
+ Could also use LLM calls, but it's hard to judge which would perform better without benchmarking (potential improvement), and LLMs might be more expensive than a simpler fine-tuned BERT model.
+
+ BioBERT was trained in 2020; not much documentation on HF, but it's the most popular at ~700k downloads last month.
+ ClinicalBERT: ~47k last month (2023).
+
+ Bio_ClinicalBERT: ~3M downloads (2019).
+
+ The clinical NER leaderboard is useful: <https://huggingface.co/spaces/m42-health/clinical_ner_leaderboard#:~:text=The%20main%20goal%20of%20the,entities%20across%20diverse%20medical%20domains>.
+
+ Indeed, LLMs are up there.
+
+ ## What do entities mean in the context of this challenge?
+
+ In this context, entities refer to [Named Entity Recognition](https://en.wikipedia.org/wiki/Named-entity_recognition),
+ and in particular medical entities (diseases, names of molecules, proteins, medical procedures, etc.).
+
+ There are models specifically trained to do NER detection from text; we'll leverage those.
+
+ ## How to extract text out of a PDF?
+
+ pdfplumber works pretty well. As stated below, we'll keep images and tables out of scope; pdfplumber does extract text from tables, but without time to assess that extraction we don't know how reliable it is.
+
+ ## How to extract meaningful context that's not just the text surrounding an entity? Workarounds?
+
+ The attention mechanism comes to mind.
+
+ ## Caveats of pdfplumber
+
+ We shouldn't include the appendix and references in the mix.
+
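+ A cheap heuristic for the references problem, sketched with a hypothetical helper (not wired into the app):
+
+ ```python
+ import re
+
+ def strip_back_matter(text: str) -> str:
+     # cut everything from the first "References"/"Bibliography"/"Appendix"
+     # heading onwards, so citations don't pollute entity extraction
+     match = re.search(r"\n\s*(references|bibliography|appendix)\b", text, re.IGNORECASE)
+     return text[:match.start()] if match else text
+ ```
+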
+ ## torch and uv
+
+ torch doesn't support the newest Python releases yet, so pin Python 3.12:
+
+ ```
+ UV_PYTHON=3.12 uv init
+ uv add transformers torch pdfplumber marimo gliner
+ ```
+
+ ## Separate model and app -> probably cleaner, but I don't have the time
+
+ To do: separate model/app deployments (two APIs, etc.). For now, the model on HF with a GPU should run fine.
+
+ ## What's the context size of these BERT models? Do I need to chunk the text?
+
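+ Open question. BERT-style encoders typically cap out around 512 tokens, so long papers may need chunking. A naive, word-based sketch (not token-exact, not wired in):
+
+ ```python
+ def chunks(text: str, size: int = 300, overlap: int = 50):
+     # overlapping word windows, so an entity straddling a boundary
+     # still appears whole in at least one chunk
+     words = text.split()
+     for i in range(0, len(words), size - overlap):
+         yield " ".join(words[i:i + size])
+ ```
+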
+ ## Test the FastAPI app
+
+ It's got a nice test module.
+
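+ A minimal sketch with FastAPI's `TestClient` (hypothetical test, separate from `test_api.py`, which hits a running server instead):
+
+ ```python
+ from fastapi.testclient import TestClient
+ from app import app
+
+ client = TestClient(app)
+
+ def test_rejects_non_pdf():
+     # the endpoint should return 415 for anything that isn't a .pdf
+     files = {"file": ("notes.txt", b"hello", "text/plain")}
+     assert client.post("/api/v1/extract", files=files).status_code == 415
+ ```
+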
+ ## Looks good
+
+ <https://huggingface.co/blaze999/Medical-NER>
+
+ <https://docs.astral.sh/uv/guides/integration/pytorch/#installing-pytorch>
+
+ ## Parts to the problem
+
+ - Check how good pdfplumber or PyMuPDF is at extracting text without butchering it.
+ - I think for now I could focus on text, and list image/table parsing as an improvement.
+ - Identify a suitable model for the task
+ - Write out a FastAPI endpoint matching the OpenAPI spec
+ - Write out caching based on filename/content (sha); see the sketch after this list
+ - Write out effective logging in the API backend
+ - Write out testing of the endpoint
+ - Deploy
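+
+ The caching idea from the list above, sketched with a hypothetical in-memory helper:
+
+ ```python
+ import hashlib
+
+ _cache: dict[str, list] = {}
+
+ def cached_extract(pdf_bytes: bytes, extract):
+     # key on the content hash so renamed duplicates still hit the cache
+     key = hashlib.sha256(pdf_bytes).hexdigest()
+     if key not in _cache:
+         _cache[key] = extract(pdf_bytes)
+     return _cache[key]
+ ```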
openapi.yaml
ADDED
@@ -0,0 +1,66 @@
+ openapi: 3.0.0
+ info:
+   title: Medical Entity Extraction API
+   description: This API allows users to extract medically relevant entities from PDF documents using a pre-trained NER model.
+   version: "1.0.0"
+ servers:
+   - url: 'http://localhost:5000'
+     description: Development server
+
+ paths:
+   /api/v1/extract:
+     post:
+       summary: Extract medical entities from a PDF document.
+       requestBody:
+         content:
+           multipart/form-data:
+             schema:
+               type: object
+               properties:
+                 file:
+                   type: string
+                   format: binary
+                   description: PDF file to be processed
+               required:
+                 - file
+         description: PDF file to extract medical entities from.
+       responses:
+         '200':
+           description: Successfully extracted entities.
+           content:
+             application/json:
+               schema:
+                 type: array
+                 items:
+                   $ref: '#/components/schemas/Entity'
+         '400':
+           description: Bad request, file not included or empty filename.
+         '415':
+           description: Unsupported file type.
+         '500':
+           description: Server error.
+
+ components:
+   schemas:
+     Entity:
+       type: object
+       properties:
+         entity:
+           type: string
+           example: 'CCR5'
+           description: The identified medical entity.
+         context:
+           type: string
+           example: '... uses on the relief of symptoms rather than on a biological ‘cure’. have identified rare mutations in CCR5 that confer resilience against ...'
+           description: Context where the entity was found, including text surrounding the entity for clarity.
+         start:
+           type: integer
+           format: int32
+           example: 25
+           description: The start position of the entity in the context with respect to the original text.
+         end:
+           type: integer
+           format: int32
+           example: 34
+           description: The end position of the entity in the context with respect to the original text.
pyproject.toml
ADDED
@@ -0,0 +1,15 @@
+ [project]
+ name = "everycure"
+ version = "0.1.0"
+ readme = "README.md"
+ requires-python = ">=3.12"
+ description = "Medical Entity Extraction API"
+ dependencies = [
+     "marimo>=0.10.16",
+     "pdfplumber>=0.11.5",
+     "torch>=2.5.1",
+     "transformers>=4.48.1",
+     "fastapi>=0.109.0",
+     "python-multipart>=0.0.6", # Required for handling file uploads
+     "pydantic>=2.5.3"
+ ]
test_api.py
ADDED
@@ -0,0 +1,24 @@
+ import requests
+
+ url = 'http://localhost:7860/api/v1/extract'
+ pdf_path = "pdfs/Enfothelial dysfunction.pdf"
+
+ try:
+     with open(pdf_path, 'rb') as f:
+         files = {'file': f}
+         response = requests.post(url, files=files)
+
+     print(f"Status Code: {response.status_code}")
+     print(f"Response Headers: {response.headers}")
+     print(f"Response Text: {response.text}")
+
+     if response.ok:
+         print("JSON Response:", response.json())
+     else:
+         print(f"Error: {response.text}")
+ except FileNotFoundError:
+     print(f"Error: Could not find PDF file at {pdf_path}")
+ except requests.exceptions.ConnectionError:
+     print(f"Error: Could not connect to server at {url}. Make sure the server is running.")
+ except Exception as e:
+     print(f"Unexpected error: {str(e)}")
uv.lock
ADDED
The diff for this file is too large to render.
See raw diff