Anupam251272 committed on
Commit
aaa1854
·
verified ·
1 Parent(s): 1e22ffc

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +142 -0
app.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import tempfile
4
+ from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
5
+ import gradio as gr
6
+ import fitz # PyMuPDF
7
+ import requests
8
+ from PIL import Image
9
+ import pytesseract
10
+ from langid import langid
11
+ from deep_translator import GoogleTranslator
12
+
13
# `torch` is needed for the CUDA check below. The original file used it without
# importing it, so the module failed with NameError as soon as it was loaded.
import torch

logging.basicConfig(level=logging.INFO)

# transformers pipelines take a device index: 0 selects the first CUDA GPU,
# -1 keeps inference on the CPU.
device = 0 if torch.cuda.is_available() else -1

# Initialize multilingual QA pipeline
model_name = "mrm8488/bert-multi-cased-finetuned-xquadv1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=device)

# Supported languages: two-letter language code -> display name.
# The keys are used to filter language-detection results, as the choices of the
# output-language dropdown, and as translation targets.
INDIAN_LANGUAGES = {
    'hi': 'Hindi',
    'pa': 'Punjabi',
    'bn': 'Bengali',
    'gu': 'Gujarati',
    'mr': 'Marathi',
    'ta': 'Tamil',
    'te': 'Telugu',
    'kn': 'Kannada',
    'ml': 'Malayalam',
    'en': 'English'
}
34
+
35
def download_pdf_from_url(url, timeout=30):
    """Download the PDF at *url* into a temporary file.

    Args:
        url: HTTP(S) address of the document to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the request indefinitely.

    Returns:
        The path of the temporary ``.pdf`` file, or ``None`` if the download
        failed for any reason (the error is logged). The file is created with
        ``delete=False``, so the caller is responsible for removing it.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Reject 4xx/5xx responses; otherwise the server's error page would be
        # written to disk and later handed to the PDF parser as a document.
        response.raise_for_status()
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
            temp_pdf.write(response.content)
        return temp_pdf.name
    except Exception as e:
        logging.error(f"Error downloading PDF: {e}")
        return None
44
+
45
def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF, falling back to OCR when it has no text layer.

    The embedded text of every page is read first. Only if that yields nothing
    but whitespace is each page rendered to an image and passed to Tesseract
    with the English and Indian language packs enabled.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages. Errors are logged rather than
        raised; whatever was extracted before the failure is still returned
        (possibly an empty string).
    """
    ocr_languages = '+'.join(
        ['eng', 'hin', 'pan', 'ben', 'guj', 'mar', 'tam', 'tel', 'kan', 'mal']
    )
    parts = []
    try:
        # The context manager closes the document even if a page fails to
        # parse; the original left the file handle open.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                parts.append(page.get_text("text") or "")

            if not any(part.strip() for part in parts):
                # No usable text layer: OCR one page at a time so that only a
                # single rendered page image is held in memory.
                for page in doc:
                    pix = page.get_pixmap()
                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    parts.append(pytesseract.image_to_string(img, lang=ocr_languages))
    except Exception as e:
        logging.error(f"Error extracting text: {e}")
    return "".join(parts)
70
+
71
def detect_language(text):
    """Return the supported language code detected for *text*.

    Args:
        text: The string to classify. ``None`` and blank strings are accepted.

    Returns:
        A key of ``INDIAN_LANGUAGES``. ``'en'`` is returned when the input is
        missing or blank, when detection fails, or when the detected language
        is not one of the supported ones.
    """
    # Nothing to classify; this also guards against None, which would
    # otherwise raise AttributeError on .strip().
    if not text or not text.strip():
        return 'en'
    try:
        lang_code, _ = langid.classify(text)
    except Exception as e:
        logging.error(f"Language detection error: {e}")
        return 'en'
    return lang_code if lang_code in INDIAN_LANGUAGES else 'en'
83
+
84
def process_qa(question, context, output_lang):
    """Answer *question* from *context*, translating the answer if requested.

    Args:
        question: The question to ask.
        context: The passage the answer is extracted from.
        output_lang: Target language code for the answer. ``'en'`` (or a
            missing value) returns the extracted answer untranslated.

    Returns:
        The answer text, or an empty string if answering or translating
        failed. Failures are logged instead of being returned as text, so an
        error message can never be mistaken for an answer and callers can
        drop failed chunks with a simple truthiness filter.
    """
    try:
        result = qa_pipeline(question=question, context=context)
        answer = result['answer']

        # Translate answer to the specified output language.
        # The answer is a span copied from the document, so it may be in any
        # language; let the translator detect the source instead of assuming
        # English. Blank answers are skipped because there is nothing to
        # translate.
        if output_lang and output_lang != 'en' and answer.strip():
            answer = GoogleTranslator(source='auto', target=output_lang).translate(answer)

        return answer or ""
    except Exception as e:
        logging.error(f"QA processing error: {e}")
        return ""
97
+
98
def analyze_input(input_source, question, output_lang):
    """Answer *question* about a PDF and format the result for display.

    Args:
        input_source: An ``http(s)://`` URL string, a local file path string,
            or an uploaded-file object that exposes its path as ``.name``.
        question: The question to ask about the document.
        output_lang: Key of ``INDIAN_LANGUAGES`` selecting the answer language.

    Returns:
        A display string: the labelled answer on success, otherwise a short
        message describing what went wrong. This function never raises.
    """
    downloaded_path = None  # temp file created here, so it must be deleted here
    try:
        if input_source is None:
            return "Error: Invalid input source"
        # Check the question before any download or parsing work is done.
        if not question or not question.strip():
            return "Error: Please enter a question"

        if isinstance(input_source, str):
            if input_source.startswith(('http://', 'https://')):
                downloaded_path = download_pdf_from_url(input_source)
                pdf_path = downloaded_path
            else:
                # A plain string that is not a URL is treated as a file path.
                # NOTE(review): some Gradio versions pass uploads this way -
                # confirm against the installed version.
                pdf_path = input_source
        else:
            pdf_path = getattr(input_source, 'name', None)

        if not pdf_path:
            return "Error: Invalid input source"

        text = extract_text_from_pdf(pdf_path)
        if not text.strip():
            return "No text extracted from document"

        question_lang = detect_language(question)
        logging.info(f"Detected question language: {question_lang}")

        # The document is queried in fixed 1000-character windows.
        # NOTE(review): every window contributes an answer regardless of the
        # model's confidence; ranking by score may give better results.
        chunks = (text[i:i + 1000] for i in range(0, len(text), 1000))
        answers = [process_qa(question, chunk, output_lang) for chunk in chunks if chunk.strip()]

        final_answer = " ".join(filter(None, answers))
        if not final_answer:
            return "No answer found in document"
        return f"Answer ({INDIAN_LANGUAGES.get(output_lang, 'English')}): {final_answer}"

    except Exception as e:
        logging.error(f"Analysis error: {e}")
        return f"Error: {str(e)}"
    finally:
        # Only files downloaded by this call are removed; user uploads are
        # left untouched.
        if downloaded_path:
            try:
                os.remove(downloaded_path)
            except OSError as e:
                logging.warning(f"Could not remove temporary file {downloaded_path}: {e}")
124
+
125
+ # Gradio Interface
126
def create_interface():
    """Build the Gradio interface that exposes ``analyze_input``.

    The form has three inputs, in the order ``analyze_input`` expects them: a
    file upload, a question text box, and an output-language dropdown that is
    populated from the keys of ``INDIAN_LANGUAGES`` and defaults to ``'en'``.

    Returns:
        The configured ``gr.Interface``; it is not launched here.
    """
    language_codes = list(INDIAN_LANGUAGES.keys())
    components = [
        gr.File(label="Upload PDF or Enter PDF URL"),
        gr.Textbox(label="Enter your question"),
        gr.Dropdown(choices=language_codes, label="Select Output Language", value='en'),
    ]
    return gr.Interface(
        fn=analyze_input,
        inputs=components,
        outputs="text",
        title="Indian Languages PDF QA System",
        description="Support for Hindi, Punjabi, Bengali, Gujarati, Marathi, Tamil, Telugu, Kannada, Malayalam, and English",
    )
139
+
140
# Script entry point: build the UI and start serving it. Nothing is launched
# when this module is merely imported.
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()