File size: 3,382 Bytes
73b49a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from typing import List
from pydantic import BaseModel
import pdfplumber
from fastapi import UploadFile
from gliner import GLiNER
import logging

# Set up logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

class Entity(BaseModel):
    """A single extracted entity plus the text window it was found in.

    ``start``/``end`` are character offsets of the entity *within*
    ``context`` (not within the full document text).
    """
    entity: str  # entity surface text as matched in the document
    context: str  # surrounding text, ~50 chars on each side of the entity
    start: int  # offset of the entity's first char inside `context`
    end: int  # offset one past the entity's last char inside `context`

# Curated medical/biomedical label set passed to GLiNER as the candidate
# entity types for zero-shot extraction (see extract_entities_from_pdf).
MEDICAL_LABELS = [
    "gene", "protein", "protein_isoform", "cell", "disease",
    "phenotypic_feature", "clinical_finding", "anatomical_entity",
    "pathway", "biological_process", "drug", "small_molecule",
    "food_additive", "chemical_mixture", "molecular_entity",
    "clinical_intervention", "clinical_trial", "hospitalization",
    "geographic_location", "environmental_feature", "environmental_process",
    "publication", "journal_article", "book", "patent", "dataset",
    "study_result", "human", "mammal", "plant", "virus", "bacterium",
    "cell_line", "biological_sex", "clinical_attribute",
    "socioeconomic_attribute", "environmental_exposure", "drug_exposure",
    "procedure", "treatment", "device", "diagnostic_aid", "event"
]

# Initialize model once at import time so every request reuses the same
# instance. NOTE: from_pretrained may download weights on first run, so
# module import can be slow / require network access.
gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")

def extract_entities_from_pdf(file: UploadFile) -> List[Entity]:
    """
    Extract medical entities from a PDF file using GLiNER.

    Args:
        file (UploadFile): The uploaded PDF file

    Returns:
        List[Entity]: Extracted entities; each carries a ~100-char context
        window with start/end offsets relative to that window.

    Raises:
        Exception: any pdfplumber/GLiNER error is logged (with traceback)
        and re-raised for the caller to handle.
    """
    logger.debug("Starting extraction for file: %s", file.filename)

    try:
        with pdfplumber.open(file.file) as pdf:
            logger.debug("Successfully opened PDF with %d pages", len(pdf.pages))
            # Join all pages into a single string. extract_text() returns
            # None for pages with no text layer (e.g. scanned images), so
            # substitute "" to keep join() from raising TypeError.
            pdf_text = " ".join((p.extract_text() or "") for p in pdf.pages)
            logger.debug("Extracted text length: %d characters", len(pdf_text))

        # Extract entities using GLiNER
        logger.debug("Starting GLiNER entity extraction")
        entities = gliner_model.predict_entities(pdf_text, MEDICAL_LABELS, threshold=0.7)
        logger.debug("Found %d entities", len(entities))

        # Convert to our Entity model format
        result = []
        for ent in entities:
            if len(ent["text"]) <= 2:  # Skip very short entities (noise)
                continue

            # Locate the entity to build a surrounding context window.
            # NOTE(review): find() returns the FIRST occurrence, so repeated
            # entities all share one context window — confirm this is the
            # intended behavior (GLiNER's own offsets would be exact).
            start_idx = pdf_text.find(ent["text"])
            if start_idx == -1:
                # Entity text not found verbatim (e.g. whitespace mangling
                # across page joins) — skip rather than emit a bogus context.
                continue

            # Get surrounding context (50 chars before and after)
            context_start = max(0, start_idx - 50)
            context_end = min(len(pdf_text), start_idx + len(ent["text"]) + 50)
            context = pdf_text[context_start:context_end]

            result.append(Entity(
                entity=ent["text"],
                context=context,
                start=start_idx - context_start,  # position relative to context
                end=start_idx - context_start + len(ent["text"]),
            ))

        logger.debug("Returning %d processed entities", len(result))
        return result

    except Exception:
        # logger.exception records the full traceback; re-raise so the
        # API layer can translate it into an HTTP error response.
        logger.exception("Error during extraction")
        raise