nguyenbh committed
Commit 978aa95 · 1 Parent(s): 76ec88e
app.py ADDED
@@ -0,0 +1,312 @@
import gradio as gr
import json
import requests
import urllib.request
import os
import ssl
import base64
import tempfile
import edge_tts
import re
import logging
from PIL import Image
from io import BytesIO
from typing import Dict, List, Optional, Tuple, Union

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Azure ML endpoint configuration - these should be set as environment variables
url = os.getenv("AZURE_ENDPOINT")
api_key = os.getenv("AZURE_API_KEY")


def call_aml_endpoint(payload, url, api_key):
    """Call Azure ML endpoint with the given payload."""
    # Allow self-signed HTTPS certificates
    def allow_self_signed_https(allowed):
        if allowed and not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None):
            ssl._create_default_https_context = ssl._create_unverified_context

    allow_self_signed_https(True)

    # Set parameters (can be adjusted based on your needs)
    parameters = {"temperature": 0.7}
    if "parameters" not in payload["input_data"]:
        payload["input_data"]["parameters"] = parameters

    # Encode the request body
    body = str.encode(json.dumps(payload))

    if not api_key:
        raise Exception("A key should be provided to invoke the endpoint")

    # Set up headers
    headers = {'Content-Type': 'application/json', 'Authorization': ('Bearer ' + api_key)}

    # Create and send the request
    req = urllib.request.Request(url, body, headers)

    try:
        logger.info(f"Sending request to {url}")
        response = urllib.request.urlopen(req)
        result = response.read().decode('utf-8')
        logger.info("Received response successfully")
        return json.loads(result)
    except urllib.error.HTTPError as error:
        logger.error(f"Request failed with status code: {error.code}")
        logger.error(f"Headers: {error.info()}")
        error_message = error.read().decode("utf8", 'ignore')
        logger.error(f"Error message: {error_message}")
        return {"error": error_message}

def encode_base64_from_file(file_path):
    """Encode file content to base64 string and determine MIME type."""
    file_extension = os.path.splitext(file_path)[1].lower()

    # Map file extensions to MIME types
    if file_extension in ['.jpg', '.jpeg']:
        mime_type = "image/jpeg"
    elif file_extension == '.png':
        mime_type = "image/png"
    elif file_extension == '.gif':
        mime_type = "image/gif"
    elif file_extension in ['.bmp', '.tiff', '.webp']:
        mime_type = f"image/{file_extension[1:]}"
    else:
        mime_type = "image/jpeg"  # Default to JPEG

    # Read and encode file content
    with open(file_path, "rb") as file:
        encoded_string = base64.b64encode(file.read()).decode('utf-8')

    return encoded_string, mime_type

class ImageOCRApp:
    def __init__(self):
        """Initialize the app with Azure ML endpoint configurations"""
        # Check if Azure endpoint and key are set
        if not url or not api_key:
            logger.warning("Azure ML endpoint or API key not set. Set AZURE_ENDPOINT and AZURE_API_KEY environment variables.")

    def recognize_text(self, image_path: str) -> str:
        """Recognize text from the image using Azure ML endpoint"""
        try:
            # Encode image to base64
            base64_image, mime_type = encode_base64_from_file(image_path)

            # Prepare prompt for OCR
            ocr_prompt = "Please identify the handwritten text in the image."

            # Create content array for the payload
            content_items = [
                {"type": "text", "text": ocr_prompt},
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}}
            ]

            # Create conversation state
            conversation_state = [
                {
                    "role": "user",
                    "content": content_items
                }
            ]

            # Create the payload
            payload = {
                "input_data": {
                    "input_string": conversation_state
                }
            }

            # Call Azure ML endpoint
            response = call_aml_endpoint(payload, url, api_key)

            # Extract text response from the Azure ML endpoint response
            if isinstance(response, dict):
                if "result" in response:
                    result = response["result"]
                elif "output" in response:
                    # Depending on your API's response format
                    if isinstance(response["output"], list) and len(response["output"]) > 0:
                        result = response["output"][0]
                    else:
                        result = str(response["output"])
                elif "error" in response:
                    logger.error(f"Error from Azure ML endpoint: {response['error']}")
                    result = f"Error: {response['error']}"
                else:
                    # Just return the whole response as a string if we can't parse it
                    result = f"Received response: {json.dumps(response)}"
            else:
                result = str(response)

            return result

        except Exception as e:
            logger.error(f"Error recognizing text: {str(e)}", exc_info=True)
            return f"Error recognizing text: {str(e)}"

    async def text_to_speech(self, text: str, voice: str = "en-US-EricNeural") -> Optional[str]:
        """Convert text to speech using Edge TTS"""
        if not text.strip():
            return None

        try:
            communicate = edge_tts.Communicate(text, voice)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                tmp_path = tmp_file.name
            await communicate.save(tmp_path)
            return tmp_path
        except Exception as e:
            logger.error(f"TTS Error: {str(e)}")
            return None

    def create_interface(self):
        """Create the Gradio interface"""
        custom_css = """
        .container { max-width: 900px; margin: auto; }
        .input-section {
            background: #f8f9fa;
            padding: 20px;
            border-radius: 10px;
            margin-bottom: 20px;
        }
        .output-section {
            background: #ffffff;
            padding: 20px;
            border-radius: 10px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        """

        with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as interface:
            # Header
            gr.Markdown("""
            # ✨ Stories Come Alive
            ### Transform handwritten moments into spoken memories

            Turn precious handwritten stories, notes, and drawings into living words.
            Whether it's a child's imaginative tale, a heartfelt letter, or a creative
            story - let's bring those special handwritten moments to life through sight
            and sound. 🎨📝🎧
            """)

            with gr.Row():
                # Input section
                with gr.Column(scale=1):
                    image_input = gr.Image(
                        label="Upload or Capture Image",
                        sources=["upload", "webcam"],
                        type="filepath"
                    )

                    # Example selector
                    gr.Markdown("### Try with Examples")
                    example_images = [
                        ["content/kid.handwriting.draw.01.jpg", "Tiny Seed"],
                        ["content/race.for.the.moon.jpg", "To the Moon!"],
                        ["content/john.adam.move.to.dc.png", "Move to DC"],
                    ]
                    gr.Examples(
                        examples=example_images,
                        inputs=image_input,
                        label="Example Images"
                    )

                    with gr.Row():
                        process_btn = gr.Button("🔍 Recognize Text", variant="primary")
                        clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                    status_msg = gr.Markdown("Ready to process image...")

                # Output section
                with gr.Column(scale=1):
                    recognized_text = gr.Textbox(
                        label="Recognized Text",
                        lines=5,
                        # readonly=True
                    )

                    tts_audio = gr.Audio(
                        label="Text-to-Speech Output",
                        visible=True,
                        interactive=False
                    )

            # Event handlers
            async def process_image(image):
                if image is None:
                    return "Please upload or capture an image.", None, "⚠️ Please provide an image"

                # Check if Azure ML endpoint and API key are set
                if not url or not api_key:
                    return "Azure ML endpoint or API key not set. Please configure the environment variables.", None, "⚠️ Configuration error"

                # Recognize text using Azure ML endpoint
                text = self.recognize_text(image)

                if not text or text.strip() == "":
                    return "No text was recognized in the image.", None, "⚠️ No text recognized"

                # Clean up text - replace newlines with spaces and remove multiple spaces
                cleaned_text = re.sub(r'\s+', ' ', text.replace('\n', ' ')).strip()

                # Generate audio immediately
                audio_path = await self.text_to_speech(cleaned_text)

                return text, audio_path, "✅ Text recognized and audio generated"

            def clear_inputs():
                return None, "", None, "Ready to process image..."

            process_btn.click(
                fn=process_image,
                inputs=[image_input],
                outputs=[
                    recognized_text,
                    tts_audio,
                    status_msg
                ],
                api_name="process_image"
            )

            clear_btn.click(
                fn=clear_inputs,
                inputs=[],
                outputs=[
                    image_input,
                    recognized_text,
                    tts_audio,
                    status_msg
                ],
                api_name="clear_inputs"
            )

            # Instructions
            with gr.Accordion("ℹ️ How to Use", open=False):
                gr.Markdown("""
                1. **Upload or Capture**: Use your webcam or upload an image containing text
                2. **Process**: Click 'Recognize Text' to extract text from the image
                3. **Listen**: The audio will automatically play once text is recognized

                Note: The system works best with clear, well-lit images of handwritten text.

                ### Configuration
                Before using this app, set these environment variables:
                - AZURE_ENDPOINT: Your Azure ML endpoint URL
                - AZURE_API_KEY: Your Azure ML API key
                """)

        return interface

def run_app():
    app = ImageOCRApp()
    interface = app.create_interface()
    interface.launch(
        share=True,
        server_name="0.0.0.0",
    )

if __name__ == "__main__":
    run_app()
content/john.adam.move.to.dc.png ADDED
content/kid.handwriting.draw.01.jpg ADDED
content/race.for.the.moon.jpg ADDED
requirements.txt ADDED
File without changes
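Note: requirements.txt is added empty in this commit, while app.py imports gradio, edge_tts, requests, and PIL. A minimal dependency list for the Space would plausibly look like the sketch below; these package names are inferred from the imports and are an assumption, not the author's actual file, and no versions are pinned in the source.

    # inferred from app.py imports; versions unpinned (assumption)
    gradio
    edge-tts
    requests
    Pillow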