nguyenbh committed
Commit 978aa95 · 1 Parent(s): 76ec88e
app.py ADDED
@@ -0,0 +1,312 @@
import gradio as gr
import json
import requests
import urllib.request
import os
import ssl
import base64
import tempfile
import edge_tts
import re
import logging
from PIL import Image
from io import BytesIO
from typing import Dict, List, Optional, Tuple, Union

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Azure ML endpoint configuration - these should be set as environment variables
url = os.getenv("AZURE_ENDPOINT")
api_key = os.getenv("AZURE_API_KEY")


def call_aml_endpoint(payload, url, api_key):
    """Call Azure ML endpoint with the given payload."""
    # Allow self-signed HTTPS certificates
    def allow_self_signed_https(allowed):
        if allowed and not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None):
            ssl._create_default_https_context = ssl._create_unverified_context

    allow_self_signed_https(True)

    # Set parameters (can be adjusted based on your needs)
    parameters = {"temperature": 0.7}
    if "parameters" not in payload["input_data"]:
        payload["input_data"]["parameters"] = parameters

    # Encode the request body
    body = str.encode(json.dumps(payload))

    if not api_key:
        raise Exception("A key should be provided to invoke the endpoint")

    # Set up headers
    headers = {'Content-Type': 'application/json', 'Authorization': ('Bearer ' + api_key)}

    # Create and send the request
    req = urllib.request.Request(url, body, headers)

    try:
        logger.info(f"Sending request to {url}")
        response = urllib.request.urlopen(req)
        result = response.read().decode('utf-8')
        logger.info("Received response successfully")
        return json.loads(result)
    except urllib.error.HTTPError as error:
        logger.error(f"Request failed with status code: {error.code}")
        logger.error(f"Headers: {error.info()}")
        error_message = error.read().decode("utf8", 'ignore')
        logger.error(f"Error message: {error_message}")
        return {"error": error_message}

def encode_base64_from_file(file_path):
    """Encode file content to base64 string and determine MIME type."""
    file_extension = os.path.splitext(file_path)[1].lower()

    # Map file extensions to MIME types
    if file_extension in ['.jpg', '.jpeg']:
        mime_type = "image/jpeg"
    elif file_extension == '.png':
        mime_type = "image/png"
    elif file_extension == '.gif':
        mime_type = "image/gif"
    elif file_extension in ['.bmp', '.tiff', '.webp']:
        mime_type = f"image/{file_extension[1:]}"
    else:
        mime_type = "image/jpeg"  # Default to JPEG

    # Read and encode file content
    with open(file_path, "rb") as file:
        encoded_string = base64.b64encode(file.read()).decode('utf-8')

    return encoded_string, mime_type

class ImageOCRApp:
    def __init__(self):
        """Initialize the app with Azure ML endpoint configurations"""
        # Check if Azure endpoint and key are set
        if not url or not api_key:
            logger.warning("Azure ML endpoint or API key not set. Set AZURE_ENDPOINT and AZURE_API_KEY environment variables.")

    def recognize_text(self, image_path: str) -> str:
        """Recognize text from the image using Azure ML endpoint"""
        try:
            # Encode image to base64
            base64_image, mime_type = encode_base64_from_file(image_path)

            # Prepare prompt for OCR
            ocr_prompt = "Please identify the handwritten text in the image."

            # Create content array for the payload
            content_items = [
                {"type": "text", "text": ocr_prompt},
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}}
            ]

            # Create conversation state
            conversation_state = [
                {
                    "role": "user",
                    "content": content_items
                }
            ]

            # Create the payload
            payload = {
                "input_data": {
                    "input_string": conversation_state
                }
            }

            # Call Azure ML endpoint
            response = call_aml_endpoint(payload, url, api_key)

            # Extract text response from the Azure ML endpoint response
            if isinstance(response, dict):
                if "result" in response:
                    result = response["result"]
                elif "output" in response:
                    # Depending on your API's response format
                    if isinstance(response["output"], list) and len(response["output"]) > 0:
                        result = response["output"][0]
                    else:
                        result = str(response["output"])
                elif "error" in response:
                    logger.error(f"Error from Azure ML endpoint: {response['error']}")
                    result = f"Error: {response['error']}"
                else:
                    # Just return the whole response as a string if we can't parse it
                    result = f"Received response: {json.dumps(response)}"
            else:
                result = str(response)

            return result

        except Exception as e:
            logger.error(f"Error recognizing text: {str(e)}", exc_info=True)
            return f"Error recognizing text: {str(e)}"

    async def text_to_speech(self, text: str, voice: str = "en-US-EricNeural") -> Optional[str]:
        """Convert text to speech using Edge TTS"""
        if not text.strip():
            return None

        try:
            communicate = edge_tts.Communicate(text, voice)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                tmp_path = tmp_file.name
            await communicate.save(tmp_path)
            return tmp_path
        except Exception as e:
            logger.error(f"TTS Error: {str(e)}")
            return None

    def create_interface(self):
        """Create the Gradio interface"""
        custom_css = """
        .container { max-width: 900px; margin: auto; }
        .input-section {
            background: #f8f9fa;
            padding: 20px;
            border-radius: 10px;
            margin-bottom: 20px;
        }
        .output-section {
            background: #ffffff;
            padding: 20px;
            border-radius: 10px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        """

        with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as interface:
            # Header
            gr.Markdown("""
            # ✨ Stories Come Alive
            ### Transform handwritten moments into spoken memories

            Turn precious handwritten stories, notes, and drawings into living words.
            Whether it's a child's imaginative tale, a heartfelt letter, or a creative
            story - let's bring those special handwritten moments to life through sight
            and sound. 🎨📝🎧
            """)

            with gr.Row():
                # Input section
                with gr.Column(scale=1):
                    image_input = gr.Image(
                        label="Upload or Capture Image",
                        sources=["upload", "webcam"],
                        type="filepath"
                    )

                    # Example selector
                    gr.Markdown("### Try with Examples")
                    example_images = [
                        ["content/kid.handwriting.draw.01.jpg", "Tiny Seed"],
                        ["content/race.for.the.moon.jpg", "To the Moon!"],
                        ["content/john.adam.move.to.dc.png", "Move to DC"],
                    ]
                    gr.Examples(
                        examples=example_images,
                        inputs=image_input,
                        label="Example Images"
                    )

                    with gr.Row():
                        process_btn = gr.Button("🔍 Recognize Text", variant="primary")
                        clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                    status_msg = gr.Markdown("Ready to process image...")

                # Output section
                with gr.Column(scale=1):
                    recognized_text = gr.Textbox(
                        label="Recognized Text",
                        lines=5,
                        # readonly=True
                    )

                    tts_audio = gr.Audio(
                        label="Text-to-Speech Output",
                        visible=True,
                        interactive=False
                    )

            # Event handlers
            async def process_image(image):
                if image is None:
                    return "Please upload or capture an image.", None, "⚠️ Please provide an image"

                # Check if Azure ML endpoint and API key are set
                if not url or not api_key:
                    return "Azure ML endpoint or API key not set. Please configure the environment variables.", None, "⚠️ Configuration error"

                # Recognize text using Azure ML endpoint
                text = self.recognize_text(image)

                if not text or text.strip() == "":
                    return "No text was recognized in the image.", None, "⚠️ No text recognized"

                # Clean up text - replace newlines with spaces and remove multiple spaces
                cleaned_text = re.sub(r'\s+', ' ', text.replace('\n', ' ')).strip()

                # Generate audio immediately
                audio_path = await self.text_to_speech(cleaned_text)

                return text, audio_path, "✅ Text recognized and audio generated"

            def clear_inputs():
                return None, "", None, "Ready to process image..."

            process_btn.click(
                fn=process_image,
                inputs=[image_input],
                outputs=[
                    recognized_text,
                    tts_audio,
                    status_msg
                ],
                api_name="process_image"
            )

            clear_btn.click(
                fn=clear_inputs,
                inputs=[],
                outputs=[
                    image_input,
                    recognized_text,
                    tts_audio,
                    status_msg
                ],
                api_name="clear_inputs"
            )

            # Instructions
            with gr.Accordion("ℹ️ How to Use", open=False):
                gr.Markdown("""
                1. **Upload or Capture**: Use your webcam or upload an image containing text
                2. **Process**: Click 'Recognize Text' to extract text from the image
                3. **Listen**: The audio will automatically play once text is recognized

                Note: The system works best with clear, well-lit images of handwritten text.

                ### Configuration
                Before using this app, set these environment variables:
                - AZURE_ENDPOINT: Your Azure ML endpoint URL
                - AZURE_API_KEY: Your Azure ML API key
                """)

        return interface

def run_app():
    app = ImageOCRApp()
    interface = app.create_interface()
    interface.launch(
        share=True,
        server_name="0.0.0.0",
    )

if __name__ == "__main__":
    run_app()
content/john.adam.move.to.dc.png ADDED
content/kid.handwriting.draw.01.jpg ADDED
content/race.for.the.moon.jpg ADDED
requirements.txt ADDED
File without changes
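Note: requirements.txt is added empty in this commit, while app.py imports gradio, edge_tts, requests, and PIL. A minimal dependency list for the Space would plausibly look like the sketch below; these package names are inferred from the imports and are an assumption, not the author's actual file, and no versions are pinned in the source.

    # inferred from app.py imports; versions unpinned (assumption)
    gradio
    edge-tts
    requests
    Pillow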