Luis Chaves committed on
Commit 73b49a2 · 1 Parent(s): 83728f4

first commit

Files changed (13)
  1. .gitattributes +0 -35
  2. .gitignore +13 -0
  3. .python-version +1 -0
  4. Dockerfile +15 -0
  5. README.md +57 -1
  6. app.py +50 -0
  7. explore.py +158 -0
  8. extractor.py +87 -0
  9. learning.md +73 -0
  10. openapi.yaml +66 -0
  11. pyproject.toml +15 -0
  12. test_api.py +24 -0
  13. uv.lock +0 -0
.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED
@@ -0,0 +1,13 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # Virtual environments
+ .venv
+
+ *pdf
+ __pycache__/
.python-version ADDED
@@ -0,0 +1 @@
+ 3.12
Dockerfile ADDED
@@ -0,0 +1,15 @@
+ FROM python:3.12-slim
+
+ WORKDIR /code
+
+ # Copy only the necessary files
+ COPY ./app.py /code/
+ COPY ./extractor.py /code/
+ COPY ./pyproject.toml /code/
+ COPY ./openapi.yaml /code/
+
+ # Install dependencies
+ RUN pip install --no-cache-dir .[all]
+
+ # Run the application
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -7,4 +7,60 @@ sdk: docker
  pinned: false
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ## local dev
+
+ ```
+ uv run uvicorn app:app --reload --port 8000
+ ```
+
+ If your PDFs are in a folder called `pdfs/`, run:
+
+ ```
+ curl -v -X POST -F "file=@pdfs/MECFS systematic review.pdf" http://localhost:8000/api/v1/extract
+ ```
+
+ Or use the automatic Swagger documentation at `http://localhost:8000/docs`.
+
+ ## Deploying to HuggingFace Spaces
+
+ ```dockerfile
+ FROM python:3.12-slim
+
+ WORKDIR /code
+
+ # Copy only the necessary files
+ COPY ./app.py /code/
+ COPY ./extractor.py /code/
+ COPY ./pyproject.toml /code/
+ COPY ./openapi.yaml /code/
+
+ # Install dependencies
+ RUN pip install --no-cache-dir .[all]
+
+ # Run the application
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+ ```
+
+ Then push to HuggingFace:
+
+ ```bash
+ # Initialize git repository if not already done
+ git init
+ git add .
+ git commit -m "Initial commit"
+
+ # Add HuggingFace remote
+ git remote add hf https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
+
+ # Push to HuggingFace
+ git push hf main
+ ```
+
+ Note: Replace `YOUR_USERNAME` and `YOUR_SPACE_NAME` with your HuggingFace username and the name you chose for your Space.
+
+ ### Important Notes
+
+ - HuggingFace Spaces uses port 7860 by default
+ - The API will be available at `https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME`
+ - Make sure your model files are included in the repository if needed
+ - The free tier of HuggingFace Spaces has limitations on CPU/RAM usage
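For reference, a successful extraction returns a JSON array of entities; a sketch of the response shape, reusing the example values from `openapi.yaml` (values illustrative, not from a real run):

```json
[
  {
    "entity": "CCR5",
    "context": "... uses on the relief of symptoms rather than on a biological ‘cure’. have identified rare mutations in CCR5 that confer resilience against ...",
    "start": 25,
    "end": 34
  }
]
```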
app.py ADDED
@@ -0,0 +1,50 @@
+ from fastapi import FastAPI, UploadFile, HTTPException
+ from typing import List
+ from extractor import Entity, extract_entities_from_pdf
+ import logging
+ import uvicorn
+
+ # Set up logging
+ logging.basicConfig(level=logging.DEBUG)
+ logger = logging.getLogger(__name__)
+
+ app = FastAPI(
+     title="Medical Entity Extraction API",
+     description="This API allows users to extract medically relevant entities from PDF documents using a pre-trained NER model.",
+     version="1.0.0"
+ )
+
+ from fastapi.middleware.cors import CORSMiddleware
+
+ # Add CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Extraction endpoint
+ @app.post("/api/v1/extract", response_model=List[Entity])
+ async def extract_entities(file: UploadFile):
+     logger.debug(f"Received request for file: {file.filename}")
+
+     if not file:
+         logger.error("No file provided")
+         raise HTTPException(status_code=400, detail="No file provided")
+
+     if not file.filename.lower().endswith('.pdf'):
+         logger.error(f"Invalid file type: {file.filename}")
+         raise HTTPException(status_code=415, detail="File must be a PDF")
+
+     try:
+         logger.debug("Starting entity extraction")
+         result = extract_entities_from_pdf(file)
+         logger.debug(f"Successfully extracted {len(result)} entities")
+         return result
+     except Exception as e:
+         logger.error(f"Error during extraction: {str(e)}", exc_info=True)
+         raise HTTPException(status_code=500, detail=str(e))
+
+ if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=7860)
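Since learning.md notes that FastAPI "has a nice test module", here is a minimal in-process test sketch for this endpoint using FastAPI's `TestClient` (requires `httpx`; note that importing `app` pulls in `extractor`, so the first run downloads the GLiNER weights):

```python
from fastapi.testclient import TestClient

from app import app

client = TestClient(app)

def test_rejects_non_pdf():
    # A .txt upload should trip the extension check and return 415,
    # without ever reaching the extraction code
    resp = client.post(
        "/api/v1/extract",
        files={"file": ("notes.txt", b"plain text", "text/plain")},
    )
    assert resp.status_code == 415
```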
explore.py ADDED
@@ -0,0 +1,158 @@
+ import marimo
+
+ __generated_with = "0.10.17"
+ app = marimo.App(width="medium")
+
+
+ @app.cell
+ def _():
+     import marimo as mo
+     import pdfplumber
+     return mo, pdfplumber
+
+
+ @app.cell
+ def _(mo):
+     file = mo.ui.file_browser(initial_path="")
+     file
+     return (file,)
+
+
+ @app.cell
+ def _(file, pdfplumber):
+     with pdfplumber.open(file.path()) as pdf:
+         # Join all pages into single string
+         pdf_text = " ".join(p.extract_text() for p in pdf.pages)
+     return pdf, pdf_text
+
+
+ @app.cell
+ def _():
+     from everycure.extract import extract_pdf_entities
+     return (extract_pdf_entities,)
+
+
+ @app.cell
+ def _(pdf_text):
+     from transformers import pipeline
+     from transformers import AutoTokenizer, AutoModelForTokenClassification
+
+     tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
+     model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")
+
+     pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")  # pass device=0 if using gpu
+     result = pipe(pdf_text)
+     result
+     return (
+         AutoModelForTokenClassification,
+         AutoTokenizer,
+         model,
+         pipe,
+         pipeline,
+         result,
+         tokenizer,
+     )
+
+
+ @app.cell
+ def _():
+     from gliner import GLiNER
+
+     # Curated medical labels based on your domain needs
+     MEDICAL_LABELS = [
+         # Parent: NamedThing -> biological_entity
+         "gene",
+         "protein",
+         "protein_isoform",
+         "cell",
+         "disease",
+         "phenotypic_feature",
+         "clinical_finding",
+         "anatomical_entity",
+         "pathway",
+         "biological_process",
+
+         # Parent: NamedThing -> chemical_entity
+         "drug",
+         "small_molecule",
+         "food_additive",
+         "chemical_mixture",
+         "molecular_entity",
+
+         # Parent: NamedThing -> clinical_entity
+         "clinical_intervention",
+         "clinical_trial",
+         "hospitalization",
+
+         # Parent: NamedThing -> planetary_entity
+         "geographic_location",
+         "environmental_feature",
+         "environmental_process",
+
+         # Parent: NamedThing -> information_content_entity
+         "publication",
+         "journal_article",
+         "book",
+         "patent",
+         "dataset",
+         "study_result",
+
+         # Parent: NamedThing -> organismal_entity
+         "human",
+         "mammal",
+         "plant",
+         "virus",
+         "bacterium",
+         "cell_line",
+
+         # Parent: NamedThing -> attribute
+         "biological_sex",
+         "clinical_attribute",
+         "socioeconomic_attribute",
+         "environmental_exposure",
+         "drug_exposure",
+
+         # Parent: NamedThing -> procedure
+         "procedure",
+
+         # Parent: NamedThing -> treatment
+         "treatment",
+
+         # Parent: NamedThing -> device
+         "device",
+
+         # Parent: NamedThing -> diagnostic_aid
+         "diagnostic_aid",
+
+         # Parent: NamedThing -> event
+         "event",
+     ]
+
+     gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")
+
+     def gliner_medical_ner(text, threshold=0.7):
+         entities = gliner_model.predict_entities(text, MEDICAL_LABELS, threshold=threshold)
+         return [{"text": ent["text"], "label": ent["label"]}
+                 for ent in entities if len(ent["text"]) > 2]  # Filter short fragments
+     return GLiNER, MEDICAL_LABELS, gliner_medical_ner, gliner_model
+
+
+ @app.cell
+ def _(gliner_medical_ner, pdf_text):
+     result_gli = gliner_medical_ner(pdf_text)
+     return (result_gli,)
+
+
+ @app.cell
+ def _(result_gli):
+     result_gli
+     return
+
+
+ @app.cell
+ def _():
+     return
+
+
+ if __name__ == "__main__":
+     app.run()
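explore.py is a marimo notebook; to open it interactively, the standard marimo CLI invocation should work from the repo root:

```bash
uv run marimo edit explore.py
```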
extractor.py ADDED
@@ -0,0 +1,87 @@
+ from typing import List
+ from pydantic import BaseModel
+ import pdfplumber
+ from fastapi import UploadFile
+ from gliner import GLiNER
+ import logging
+
+ # Set up logging
+ logging.basicConfig(level=logging.DEBUG)
+ logger = logging.getLogger(__name__)
+
+ class Entity(BaseModel):
+     entity: str
+     context: str
+     start: int
+     end: int
+
+ # Curated medical labels
+ MEDICAL_LABELS = [
+     "gene", "protein", "protein_isoform", "cell", "disease",
+     "phenotypic_feature", "clinical_finding", "anatomical_entity",
+     "pathway", "biological_process", "drug", "small_molecule",
+     "food_additive", "chemical_mixture", "molecular_entity",
+     "clinical_intervention", "clinical_trial", "hospitalization",
+     "geographic_location", "environmental_feature", "environmental_process",
+     "publication", "journal_article", "book", "patent", "dataset",
+     "study_result", "human", "mammal", "plant", "virus", "bacterium",
+     "cell_line", "biological_sex", "clinical_attribute",
+     "socioeconomic_attribute", "environmental_exposure", "drug_exposure",
+     "procedure", "treatment", "device", "diagnostic_aid", "event"
+ ]
+
+ # Initialize model
+ gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")
+
+ def extract_entities_from_pdf(file: UploadFile) -> List[Entity]:
+     """
+     Extract medical entities from a PDF file using GLiNER.
+
+     Args:
+         file (UploadFile): The uploaded PDF file
+
+     Returns:
+         List[Entity]: List of extracted entities with their context
+     """
+     logger.debug(f"Starting extraction for file: {file.filename}")
+
+     try:
+         # pdfplumber can read the uploaded file object directly
+         with pdfplumber.open(file.file) as pdf:
+             logger.debug(f"Successfully opened PDF with {len(pdf.pages)} pages")
+             # Join all pages into single string
+             pdf_text = " ".join(p.extract_text() for p in pdf.pages)
+             logger.debug(f"Extracted text length: {len(pdf_text)} characters")
+
+         # Extract entities using GLiNER
+         logger.debug("Starting GLiNER entity extraction")
+         entities = gliner_model.predict_entities(pdf_text, MEDICAL_LABELS, threshold=0.7)
+         logger.debug(f"Found {len(entities)} entities")
+
+         # Convert to our Entity model format
+         result = []
+         for ent in entities:
+             if len(ent["text"]) <= 2:  # Skip very short entities
+                 continue
+
+             # Find the context; note find() returns only the first occurrence
+             start_idx = pdf_text.find(ent["text"])
+             if start_idx != -1:
+                 # Get surrounding context (50 chars before and after)
+                 context_start = max(0, start_idx - 50)
+                 context_end = min(len(pdf_text), start_idx + len(ent["text"]) + 50)
+                 context = pdf_text[context_start:context_end]
+
+                 result.append(Entity(
+                     entity=ent["text"],
+                     context=context,
+                     start=start_idx - context_start,  # Adjust start position relative to context
+                     end=start_idx - context_start + len(ent["text"])
+                 ))
+
+         logger.debug(f"Returning {len(result)} processed entities")
+         return result
+
+     except Exception as e:
+         logger.error(f"Error during extraction: {str(e)}", exc_info=True)
+         raise
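learning.md leaves open whether long documents need chunking before NER. A hedged sketch of a word-window chunker that could wrap `gliner_model.predict_entities` (the window and overlap sizes are untuned guesses, not values from the repo):

```python
def chunk_text(text: str, max_words: int = 300, overlap: int = 30):
    """Yield overlapping word windows so no chunk exceeds the model's context."""
    words = text.split()
    step = max_words - overlap
    for i in range(0, len(words), step):
        yield " ".join(words[i:i + max_words])

# Usage sketch against the model above:
# entities = []
# for chunk in chunk_text(pdf_text):
#     entities.extend(gliner_model.predict_entities(chunk, MEDICAL_LABELS, threshold=0.7))
```

Note that entity offsets would then be chunk-relative and would need re-mapping onto the full text.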
learning.md ADDED
@@ -0,0 +1,73 @@
+ # Every Cure Take Home
+
+ ## How to create an API endpoint that adheres to an OpenAPI spec?
+
+ ## How to host an API publicly and for free?
+
+ Can use Docker + Hugging Face.
+
+ ## What type of Hugging Face models do entity extraction?
+
+ NER models; some are fine-tuned on medical terminology, such as d4data/biomedical-ner-all, BioBERT, or ClinicalBERT.
+ Could also use LLM calls, but it's hard to judge which would perform better without benchmarking (potential improvement), and it might be more expensive than a simpler fine-tuned BERT model.
+
+ BioBERT was trained in 2020; not much documentation on HF, but it's the most popular: 700k downloads last month.
+ ClinicalBERT: 47k last month (2023).
+
+ Bio+Clinical BERT: 3M downloads (2019).
+
+ The clinical NER leaderboard is useful: <https://huggingface.co/spaces/m42-health/clinical_ner_leaderboard#:~:text=The%20main%20goal%20of%20the,entities%20across%20diverse%20medical%20domains>.
+
+ Indeed, LLMs are up there.
+
+ ## What do entities mean in the context of this challenge?
+
+ In this context, entities refer to [Named Entity Recognition](https://en.wikipedia.org/wiki/Named-entity_recognition),
+ and in particular medical entities (diseases, names of molecules, proteins, medical procedures, etc.).
+
+ There are models specifically trained to do NER on text; we'll leverage those.
+
+ ## How to extract text out of a PDF?
+
+ pdfplumber works pretty well. As stated below, we'll keep images and tables out of scope; pdfplumber does extract text from tables, but without time to assess how good the extraction is, we don't know how reliable it is.
+
+ ## How to extract meaningful context that's not just the adjacent text? Workarounds?
+
+ The attention mechanism comes to mind.
+
+ ## Caveats of pdfplumber
+
+ We shouldn't include the appendix and references in the mix.
+
+ ## torch and uv
+
+ torch only works with Python 3.12:
+
+ UV_PYTHON=3.12 uv init
+ uv add transformers torch pdfplumber marimo gliner
+
+ ## Separate model and app -> probably cleaner, but don't have the time
+
+ To do separate model/app deployments (two APIs, etc.); for now the model on HF with a GPU should run fine.
+
+ ## What's the context size of these BERT models? Do I need to chunk the input?
+
+ ## Test the FastAPI app
+
+ It's got a nice test module.
+
+ ## Looks good
+
+ <https://huggingface.co/blaze999/Medical-NER>
+
+ <https://docs.astral.sh/uv/guides/integration/pytorch/#installing-pytorch>
+
+ ## Parts to the problem
+
+ - Check how good pdfplumber or PyMuPDF is at extracting text without butchering it.
+   - I think for now I could focus on text and list image/table parsing as an improvement.
+ - Identify a suitable model for the task
+ - Write the FastAPI endpoint matching the OpenAPI spec
+ - Write caching based on filename/content (sha); see the sketch after this list
+ - Write effective logging in the API backend
+ - Write tests for the endpoint
+ - Deploy
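A minimal sketch of the sha-based caching idea from the list above: hash the uploaded bytes and memoize results so identical files skip re-extraction (`extract_fn` stands in for a hypothetical bytes-accepting wrapper around the extractor, which takes an `UploadFile` as written):

```python
import hashlib

# In-memory cache keyed by content hash; identical uploads skip re-extraction
_cache: dict[str, list] = {}

def cached_extract(content: bytes, extract_fn):
    key = hashlib.sha256(content).hexdigest()
    if key not in _cache:
        _cache[key] = extract_fn(content)
    return _cache[key]
```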
openapi.yaml ADDED
@@ -0,0 +1,66 @@
+ openapi: 3.0.0
+ info:
+   title: Medical Entity Extraction API
+   description: This API allows users to extract medically relevant entities from PDF documents using a pre-trained NER model.
+   version: "1.0.0"
+ servers:
+   - url: 'http://localhost:5000'
+     description: Development server
+
+ paths:
+   /api/v1/extract:
+     post:
+       summary: Extract medical entities from a PDF document.
+       requestBody:
+         content:
+           multipart/form-data:
+             schema:
+               type: object
+               properties:
+                 file:
+                   type: string
+                   format: binary
+                   description: PDF file to be processed
+               required:
+                 - file
+         description: PDF file to extract medical entities from.
+       responses:
+         '200':
+           description: Successfully extracted entities.
+           content:
+             application/json:
+               schema:
+                 type: array
+                 items:
+                   $ref: '#/components/schemas/Entity'
+         '400':
+           description: Bad request, file not included or empty filename.
+         '415':
+           description: Unsupported file type.
+         '500':
+           description: Server error.
+
+ components:
+   schemas:
+     Entity:
+       type: object
+       properties:
+         entity:
+           type: string
+           example: 'CCR5'
+           description: The identified medical entity.
+         context:
+           type: string
+           example: '... uses on the relief of symptoms rather than on a biological ‘cure’. have identified rare mutations in CCR5 that confer resilience against ...'
+           description: Context where the entity was found, including text surrounding the entity for clarity.
+         start:
+           type: integer
+           format: int32
+           example: 25
+           description: The start position of the entity in the context with respect to the original text.
+         end:
+           type: integer
+           format: int32
+           example: 34
+           description: The end position of the entity in the context with respect to the original text.
+
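A quick sanity-check sketch to confirm the FastAPI app actually serves every path declared in this spec (assumes PyYAML is available as a dev dependency; `app.openapi()` is FastAPI's own generated document):

```python
import yaml

from app import app

with open("openapi.yaml") as fh:
    spec = yaml.safe_load(fh)

# Compare declared paths against the paths the app actually serves
missing = set(spec["paths"]) - set(app.openapi()["paths"])
assert not missing, f"paths declared in openapi.yaml but not served: {missing}"
```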
pyproject.toml ADDED
@@ -0,0 +1,16 @@
+ [project]
+ name = "everycure"
+ version = "0.1.0"
+ readme = "README.md"
+ requires-python = ">=3.12"
+ description = "Medical Entity Extraction API"
+ dependencies = [
+     "marimo>=0.10.16",
+     "pdfplumber>=0.11.5",
+     "torch>=2.5.1",
+     "transformers>=4.48.1",
+     "fastapi>=0.109.0",
+     "python-multipart>=0.0.6",  # Required for handling file uploads
+     "pydantic>=2.5.3",
+     "gliner"  # Imported by extractor.py (added via `uv add gliner` per learning.md)
+ ]
test_api.py ADDED
@@ -0,0 +1,24 @@
+ import requests
+
+ url = 'http://localhost:7860/api/v1/extract'
+ pdf_path = "pdfs/Enfothelial dysfunction.pdf"
+
+ try:
+     with open(pdf_path, 'rb') as f:
+         files = {'file': f}
+         response = requests.post(url, files=files)
+
+     print(f"Status Code: {response.status_code}")
+     print(f"Response Headers: {response.headers}")
+     print(f"Response Text: {response.text}")
+
+     if response.ok:
+         print("JSON Response:", response.json())
+     else:
+         print(f"Error: {response.text}")
+ except FileNotFoundError:
+     print(f"Error: Could not find PDF file at {pdf_path}")
+ except requests.exceptions.ConnectionError:
+     print(f"Error: Could not connect to server at {url}. Make sure the server is running.")
+ except Exception as e:
+     print(f"Unexpected error: {str(e)}")
uv.lock ADDED
The diff for this file is too large to render. See raw diff