""" AI Story-to-Video App - Single File Prototype Features: - Accepts a text story input - Splits into scenes using OpenAI API (if key provided), else naive split - Generates scene images via Replicate API (if token provided), else placeholders - Voiceover via ElevenLabs API (if key provided), else gTTS fallback - Assembles into a Ken Burns style video with MoviePy """ import os import io import uuid import json import textwrap import tempfile import re from typing import List from PIL import Image, ImageDraw, ImageFont import gradio as gr from moviepy.editor import ImageClip, AudioFileClip, concatenate_videoclips try: import openai except Exception: openai = None import requests # -------------------- CONFIG -------------------- OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") REPLICATE_API_TOKEN = os.environ.get("REPLICATE_API_TOKEN") ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY") VIDEO_WIDTH = 720 VIDEO_HEIGHT = 1280 # vertical 9:16 PER_SCENE_DURATION = 4 FPS = 24 # -------------------- SCENE SPLITTING -------------------- def simple_scene_split(story: str, max_scenes: int = 6) -> List[str]: paragraphs = [p.strip() for p in story.split('\n') if p.strip()] if len(paragraphs) >= max_scenes: return paragraphs[:max_scenes] sentences = re.split(r'(?<=[.!?])\s+', story.strip()) sentences = [s for s in sentences if s] if len(sentences) <= max_scenes: return sentences chunk_size = max(1, len(sentences) // max_scenes) scenes = [] for i in range(0, len(sentences), chunk_size): scenes.append(' '.join(sentences[i:i + chunk_size])) if len(scenes) == max_scenes: break return scenes def openai_scene_split(story: str, max_scenes: int = 6) -> List[str]: if not OPENAI_API_KEY or not openai: return simple_scene_split(story, max_scenes) openai.api_key = OPENAI_API_KEY prompt = ( f"Split the following story into at most {max_scenes} scenes. " "Return a JSON array of objects with keys: 'title' and 'description'. " "Keep titles short (3-6 words). 
Story:\n\n" + story ) resp = openai.ChatCompletion.create( model=os.environ.get('OPENAI_MODEL', 'gpt-4o-mini'), messages=[{"role": "user", "content": prompt}], temperature=0.6, max_tokens=800, ) text = resp['choices'][0]['message']['content'] try: parsed = json.loads(text) return [ f"{item.get('title', 'Scene')} - {item.get('description', '')}" for item in parsed ] except Exception: return simple_scene_split(story, max_scenes) # -------------------- IMAGE GENERATION -------------------- def generate_placeholder_image(text: str, size=(VIDEO_WIDTH, VIDEO_HEIGHT)) -> Image.Image: img = Image.new('RGB', size, color=(245, 240, 230)) draw = ImageDraw.Draw(img) try: font = ImageFont.truetype("DejaVuSans-Bold.ttf", 36) except Exception: font = ImageFont.load_default() margin = 60 wrapped = textwrap.fill(text, width=30) draw.multiline_text((margin, margin), wrapped, fill=(30, 30, 30), font=font) return img def generate_image_via_replicate(prompt: str, width=VIDEO_WIDTH, height=VIDEO_HEIGHT) -> Image.Image: if not REPLICATE_API_TOKEN: return generate_placeholder_image(prompt, size=(width, height)) API_URL = "https://api.replicate.com/v1/predictions" headers = { "Authorization": f"Token {REPLICATE_API_TOKEN}", "Content-Type": "application/json", } model = "stability-ai/stable-diffusion-xl" payload = { "version": "latest", "input": {"prompt": prompt, "width": width, "height": height, "num_inference_steps": 20} } try: r = requests.post(API_URL, headers=headers, data=json.dumps(payload), timeout=60) r.raise_for_status() j = r.json() urls = [] def walk(jv): if isinstance(jv, dict): for k, v in jv.items(): if isinstance(v, str) and v.startswith('http') and (v.endswith('.png') or v.endswith('.jpg')): urls.append(v) else: walk(v) elif isinstance(jv, list): for it in jv: walk(it) walk(j) if urls: img_data = requests.get(urls[0]).content return Image.open(io.BytesIO(img_data)).convert('RGB') except Exception as e: print('Replicate image generation failed:', e) return generate_placeholder_image(prompt, size=(width, height)) # -------------------- TTS -------------------- def generate_voice_elevenlabs(text: str, voice: str = "alloy", out_path: str = "voice.mp3") -> str: if ELEVENLABS_API_KEY: try: url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice}" headers = {"xi-api-key": ELEVENLABS_API_KEY, "Content-Type": "application/json"} payload = {"text": text, "voice_settings": {"stability": 0.6, "similarity_boost": 0.75}} r = requests.post(url, headers=headers, json=payload, timeout=60) r.raise_for_status() with open(out_path, 'wb') as f: f.write(r.content) return out_path except Exception as e: print('ElevenLabs TTS failed:', e) # Fallback: silent audio from moviepy.editor import ColorClip duration = max(1, len(text.split()) // 2) silent = ColorClip(size=(1, 1), color=(0, 0, 0), duration=duration) silent.write_audiofile(out_path, fps=22050, codec='mp3') return out_path # -------------------- VIDEO ASSEMBLY -------------------- def create_ken_burns_clip(img: Image.Image, duration: float, fps: int = FPS) -> ImageClip: tmp = tempfile.NamedTemporaryFile(suffix='.png', delete=False) img.save(tmp.name) clip = ImageClip(tmp.name).set_duration(duration) clip = clip.resize(height=VIDEO_HEIGHT) return clip.set_fps(fps) def assemble_video_from_scenes(images: List[Image.Image], audio_path: str = None) -> str: clips = [create_ken_burns_clip(img, PER_SCENE_DURATION) for img in images] final = concatenate_videoclips(clips, method='compose') if audio_path and os.path.exists(audio_path): audio = AudioFileClip(audio_path) 
        # Trim the narration rather than stretching it past its real length.
        final = final.set_audio(audio.subclip(0, min(audio.duration, final.duration)))
    out_path = os.path.join(tempfile.gettempdir(), f"story_video_{uuid.uuid4().hex}.mp4")
    final.write_videofile(out_path, fps=FPS, codec='libx264', audio_codec='aac',
                          verbose=False, logger=None)
    return out_path


# -------------------- MAIN PIPELINE --------------------

def story_to_video_pipeline(story: str, scenes: int = 6, voice: bool = True) -> dict:
    scene_texts = openai_scene_split(story, max_scenes=int(scenes))
    images = []
    for s in scene_texts:
        prompt = f"Cinematic, ultra-detailed, 4k, vertical {VIDEO_WIDTH}x{VIDEO_HEIGHT} -- {s}"
        img = generate_image_via_replicate(prompt)
        images.append(img)

    audio_path = None
    if voice:
        combined = '\n\n'.join(scene_texts)
        out_path = os.path.join(tempfile.gettempdir(), f"voice_{uuid.uuid4().hex}.mp3")
        audio_path = generate_voice_elevenlabs(combined, out_path=out_path)

    video_path = assemble_video_from_scenes(images, audio_path=audio_path)
    return {"video_path": video_path}


# -------------------- GRADIO APP --------------------

with gr.Blocks(title="AI Story → Video (Prototype)") as demo:
    gr.Markdown("## AI Story-to-Video — Prototype")
    with gr.Row():
        with gr.Column(scale=3):
            story_input = gr.Textbox(lines=8, label="Story", placeholder="Paste your story here...")
            scenes_slider = gr.Slider(minimum=1, maximum=8, step=1, value=6, label="Max scenes")
            voice_toggle = gr.Checkbox(value=True, label="Generate voiceover")
            btn = gr.Button("Generate Video")
        with gr.Column(scale=2):
            video_output = gr.Video(label="Generated video")
            status = gr.Label(value="Ready", label="Status")

    def on_generate(story, scenes, voice):
        # Return plain values (not component objects) so Gradio updates the outputs.
        out = story_to_video_pipeline(story, scenes, voice)
        return out['video_path'], "Done"

    btn.click(on_generate, inputs=[story_input, scenes_slider, voice_toggle],
              outputs=[video_output, status])


if __name__ == "__main__":
    demo.launch(share=False)
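
# How to run (usage sketch): all API keys are optional; without them the app
# falls back to the naive splitter, placeholder images, and gTTS narration.
#   OPENAI_API_KEY=... REPLICATE_API_TOKEN=... REPLICATE_MODEL_VERSION=... \
#   ELEVENLABS_API_KEY=... ELEVENLABS_VOICE_ID=... \
#   python app.py   # or whatever this file is saved as
# Packages used above: gradio, moviepy, Pillow, numpy, requests, and
# optionally openai and gtts; MoviePy also needs ffmpeg on the PATH.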