multimodal-chat-MBTI-ESFP

Sleeping

App Files Files Community

seawolf2357 committed on Mar 16

Commit

ced8ba1

verified ·

1 Parent(s): 56354e9

Update app.py

Browse files

Files changed (1) hide show

app.py +160 -74

app.py CHANGED Viewed

@@ -20,7 +20,7 @@ import pandas as pd
 # PDF 텍스트 추출
 import PyPDF2
-MAX_CONTENT_CHARS = 8000  # 너무 큰 파일을 막기 위해 최대 표시 8000자
 model_id = os.getenv("MODEL_ID", "google/gemma-3-27b-it")
 processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
@@ -43,6 +43,10 @@ def analyze_csv_file(path: str) -> str:
     """
     try:
         df = pd.read_csv(path)
         df_str = df.to_string()
         if len(df_str) > MAX_CONTENT_CHARS:
             df_str = df_str[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
@@ -73,11 +77,20 @@ def pdf_to_markdown(pdf_path: str) -> str:
     try:
         with open(pdf_path, "rb") as f:
             reader = PyPDF2.PdfReader(f)
-            for page_num, page in enumerate(reader.pages, start=1):
                 page_text = page.extract_text() or ""
                 page_text = page_text.strip()
                 if page_text:
-                    text_chunks.append(f"## Page {page_num}\n\n{page_text}\n")
     except Exception as e:
         return f"Failed to read PDF ({os.path.basename(pdf_path)}): {str(e)}"
@@ -97,7 +110,7 @@ def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
     for path in paths:
         if path.endswith(".mp4"):
             video_count += 1
-        else:
             image_count += 1
     return image_count, video_count
@@ -108,10 +121,13 @@ def count_files_in_history(history: list[dict]) -> tuple[int, int]:
     for item in history:
         if item["role"] != "user" or isinstance(item["content"], str):
             continue
-        if item["content"][0].endswith(".mp4"):
-            video_count += 1
-        else:
-            image_count += 1
     return image_count, video_count
@@ -123,11 +139,9 @@ def validate_media_constraints(message: dict, history: list[dict]) -> bool:
     - <image> 태그가 있으면 태그 수와 실제 이미지 수 일치
     - CSV, TXT, PDF 등은 여기서 제한하지 않음
     """
     media_files = []
     for f in message["files"]:
-        # 이미지: png/jpg/jpeg/gif/webp
-        # 비디오: mp4
-        # cf) PDF, CSV, TXT 등은 제외
         if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE) or f.endswith(".mp4"):
             media_files.append(f)
@@ -149,9 +163,15 @@ def validate_media_constraints(message: dict, history: list[dict]) -> bool:
     if video_count == 0 and image_count > MAX_NUM_IMAGES:
         gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
         return False
-    if "<image>" in message["text"] and message["text"].count("<image>") != new_image_count:
-        gr.Warning("The number of <image> tags in the text does not match the number of images.")
-        return False
     return True
@@ -164,7 +184,8 @@ def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-    frame_interval = int(fps / 3)
     frames = []
     for i in range(0, total_frames, frame_interval):
@@ -175,6 +196,10 @@ def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
             pil_image = Image.fromarray(image)
             timestamp = round(i / fps, 2)
             frames.append((pil_image, timestamp))
     vidcap.release()
     return frames
@@ -200,9 +225,13 @@ def process_interleaved_images(message: dict) -> list[dict]:
     parts = re.split(r"(<image>)", message["text"])
     content = []
     image_index = 0
     for part in parts:
-        if part == "<image>":
-            content.append({"type": "image", "url": message["files"][image_index]})
             image_index += 1
         elif part.strip():
             content.append({"type": "text", "text": part.strip()})
@@ -216,13 +245,30 @@ def process_interleaved_images(message: dict) -> list[dict]:
 ##################################################
 # PDF + CSV + TXT + 이미지/비디오
 ##################################################
 def process_new_user_message(message: dict) -> list[dict]:
     if not message["files"]:
         return [{"type": "text", "text": message["text"]}]
     # 1) 파일 분류
-    video_files = [f for f in message["files"] if f.endswith(".mp4")]
-    image_files = [f for f in message["files"] if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE)]
     csv_files = [f for f in message["files"] if f.lower().endswith(".csv")]
     txt_files = [f for f in message["files"] if f.lower().endswith(".txt")]
     pdf_files = [f for f in message["files"] if f.lower().endswith(".pdf")]
@@ -251,9 +297,13 @@ def process_new_user_message(message: dict) -> list[dict]:
         return content_list
     # 7) 이미지 처리
-    if "<image>" in message["text"] and image_files:  # 이미지 파일이 있는 경우에만
         # interleaved
-        return process_interleaved_images({"text": message["text"], "files": image_files})
     else:
         # 일반 여러 장
         for img_path in image_files:
@@ -261,6 +311,7 @@ def process_new_user_message(message: dict) -> list[dict]:
     return content_list
 ##################################################
 # history -> LLM 메시지 변환
 ##################################################
@@ -280,9 +331,18 @@ def process_history(history: list[dict]) -> list[dict]:
             content = item["content"]
             if isinstance(content, str):
                 current_user_content.append({"type": "text", "text": content})
-            else:
-                # 이미지나 기타
-                current_user_content.append({"type": "image", "url": content[0]})
     return messages
@@ -295,55 +355,79 @@ def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tok
         yield ""
         return
-    messages = []
-    if system_prompt:
-        messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
-    messages.extend(process_history(history))
-    # 사용자 메시지 처리
-    user_content = process_new_user_message(message)
-    # 이미지가 아닌 파일들만 텍스트로 변환
-    processed_content = []
-    for item in user_content:
-        if item["type"] == "image":
-            # 이미지 파일인지 확인
-            if re.search(r"\.(png|jpg|jpeg|gif|webp)$", item["url"], re.IGNORECASE):
-                processed_content.append(item)
-            else:
-                # 이미지가 아닌 파일은 텍스트로 변환
-                processed_content.append({"type": "text", "text": f"[File: {os.path.basename(item['url'])}]"})
-        else:
-            processed_content.append(item)
-    messages.append({"role": "user", "content": processed_content})
-    # LLM 처리는 그대로 진행
-    inputs = processor.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        tokenize=True,
-        return_dict=True,
-        return_tensors="pt",
-    ).to(device=model.device, dtype=torch.bfloat16)
-    streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
-    gen_kwargs = dict(
-        inputs,
-        streamer=streamer,
-        max_new_tokens=max_new_tokens,
-    )
-    t = Thread(target=model.generate, kwargs=gen_kwargs)
-    t.start()
-    output = ""
-    for new_text in streamer:
-        output += new_text
-        yield output
-##################################################
-# 예시들 (기존)
-##################################################
 ##################################################
 # 예시들 (한글화 버전)
 ##################################################
@@ -477,7 +561,9 @@ examples = [
 ]
 demo = gr.ChatInterface(
     fn=run,
     type="messages",
@@ -512,4 +598,4 @@ demo = gr.ChatInterface(
 )
 if __name__ == "__main__":
-    demo.launch()

 # PDF 텍스트 추출
 import PyPDF2
+MAX_CONTENT_CHARS = 4000  # 너무 큰 파일을 막기 위해 최대 표시 4000자
 model_id = os.getenv("MODEL_ID", "google/gemma-3-27b-it")
 processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
     """
     try:
         df = pd.read_csv(path)
+        # 데이터 프레임 크기 제한 (행/열 수가 많은 경우)
+        if df.shape[0] > 50 or df.shape[1] > 10:
+            df = df.iloc[:50, :10]
         df_str = df.to_string()
         if len(df_str) > MAX_CONTENT_CHARS:
             df_str = df_str[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
     try:
         with open(pdf_path, "rb") as f:
             reader = PyPDF2.PdfReader(f)
+            # 최대 5페이지만 처리
+            max_pages = min(5, len(reader.pages))
+            for page_num in range(max_pages):
+                page = reader.pages[page_num]
                 page_text = page.extract_text() or ""
                 page_text = page_text.strip()
                 if page_text:
+                    # 페이지별 텍스트도 제한
+                    if len(page_text) > MAX_CONTENT_CHARS // max_pages:
+                        page_text = page_text[:MAX_CONTENT_CHARS // max_pages] + "...(truncated)"
+                    text_chunks.append(f"## Page {page_num+1}\n\n{page_text}\n")
+            if len(reader.pages) > max_pages:
+                text_chunks.append(f"\n...(Showing {max_pages} of {len(reader.pages)} pages)...")
     except Exception as e:
         return f"Failed to read PDF ({os.path.basename(pdf_path)}): {str(e)}"
     for path in paths:
         if path.endswith(".mp4"):
             video_count += 1
+        elif re.search(r"\.(png|jpg|jpeg|gif|webp)$", path, re.IGNORECASE):
             image_count += 1
     return image_count, video_count
     for item in history:
         if item["role"] != "user" or isinstance(item["content"], str):
             continue
+        if isinstance(item["content"], list) and len(item["content"]) > 0:
+            file_path = item["content"][0]
+            if isinstance(file_path, str):
+                if file_path.endswith(".mp4"):
+                    video_count += 1
+                elif re.search(r"\.(png|jpg|jpeg|gif|webp)$", file_path, re.IGNORECASE):
+                    image_count += 1
     return image_count, video_count
     - <image> 태그가 있으면 태그 수와 실제 이미지 수 일치
     - CSV, TXT, PDF 등은 여기서 제한하지 않음
     """
+    # 이미지와 비디오 파일만 필터링
     media_files = []
     for f in message["files"]:
         if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE) or f.endswith(".mp4"):
             media_files.append(f)
     if video_count == 0 and image_count > MAX_NUM_IMAGES:
         gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
         return False
+    # 이미지 태그 검증 (실제 이미지 파일만 계산)
+    if "<image>" in message["text"]:
+        # 이미지 파일만 필터링
+        image_files = [f for f in message["files"] if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE)]
+        image_tag_count = message["text"].count("<image>")
+        if image_tag_count != len(image_files):
+            gr.Warning("The number of <image> tags in the text does not match the number of image files.")
+            return False
     return True
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+    # 더 적은 프레임을 추출하도록 조정
+    frame_interval = max(int(fps), int(total_frames / 10))  # 초당 1프레임 또는 최대 10프레임
     frames = []
     for i in range(0, total_frames, frame_interval):
             pil_image = Image.fromarray(image)
             timestamp = round(i / fps, 2)
             frames.append((pil_image, timestamp))
+            # 최대 5프레임만 사용
+            if len(frames) >= 5:
+                break
     vidcap.release()
     return frames
     parts = re.split(r"(<image>)", message["text"])
     content = []
     image_index = 0
+    # 이미지 파일만 필터링
+    image_files = [f for f in message["files"] if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE)]
     for part in parts:
+        if part == "<image>" and image_index < len(image_files):
+            content.append({"type": "image", "url": image_files[image_index]})
             image_index += 1
         elif part.strip():
             content.append({"type": "text", "text": part.strip()})
 ##################################################
 # PDF + CSV + TXT + 이미지/비디오
 ##################################################
+def is_image_file(file_path: str) -> bool:
+    """이미지 파일인지 확인"""
+    return bool(re.search(r"\.(png|jpg|jpeg|gif|webp)$", file_path, re.IGNORECASE))
+def is_video_file(file_path: str) -> bool:
+    """비디오 파일인지 확인"""
+    return file_path.endswith(".mp4")
+def is_document_file(file_path: str) -> bool:
+    """문서 파일인지 확인 (PDF, CSV, TXT)"""
+    return (file_path.lower().endswith(".pdf") or
+            file_path.lower().endswith(".csv") or
+            file_path.lower().endswith(".txt"))
 def process_new_user_message(message: dict) -> list[dict]:
     if not message["files"]:
         return [{"type": "text", "text": message["text"]}]
     # 1) 파일 분류
+    video_files = [f for f in message["files"] if is_video_file(f)]
+    image_files = [f for f in message["files"] if is_image_file(f)]
     csv_files = [f for f in message["files"] if f.lower().endswith(".csv")]
     txt_files = [f for f in message["files"] if f.lower().endswith(".txt")]
     pdf_files = [f for f in message["files"] if f.lower().endswith(".pdf")]
         return content_list
     # 7) 이미지 처리
+    if "<image>" in message["text"] and image_files:
         # interleaved
+        interleaved_content = process_interleaved_images({"text": message["text"], "files": image_files})
+        # 원본 content_list 앞부분(텍스트)을 제거하고 interleaved로 대체
+        if content_list[0]["type"] == "text":
+            content_list = content_list[1:]  # 원본 텍스트 제거
+        return interleaved_content + content_list  # interleaved + 나머지 문서 분석 내용
     else:
         # 일반 여러 장
         for img_path in image_files:
     return content_list
 ##################################################
 # history -> LLM 메시지 변환
 ##################################################
             content = item["content"]
             if isinstance(content, str):
                 current_user_content.append({"type": "text", "text": content})
+            elif isinstance(content, list) and len(content) > 0:
+                file_path = content[0]
+                if is_image_file(file_path):
+                    current_user_content.append({"type": "image", "url": file_path})
+                else:
+                    # 비이미지 파일은 텍스트로 처리
+                    current_user_content.append({"type": "text", "text": f"[File: {os.path.basename(file_path)}]"})
+    # 마지막 사용자 메시지가 처리되지 않은 경우 추가
+    if current_user_content:
+        messages.append({"role": "user", "content": current_user_content})
     return messages
         yield ""
         return
+    try:
+        messages = []
+        if system_prompt:
+            messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
+        messages.extend(process_history(history))
+        # 사용자 메시지 처리
+        user_content = process_new_user_message(message)
+        # 토큰 수를 줄이기 위해 너무 긴 텍스트는 잘라내기
+        for item in user_content:
+            if item["type"] == "text" and len(item["text"]) > MAX_CONTENT_CHARS:
+                item["text"] = item["text"][:MAX_CONTENT_CHARS] + "\n...(truncated)..."
+        messages.append({"role": "user", "content": user_content})
+        # 모델 입력 생성 전 최종 확인
+        # 이미지나 비디오가 아닌 파일들은 모델의 "image" 파이프라인으로 전달되지 않도록 필터링
+        for msg in messages:
+            if msg["role"] != "user":
+                continue
+            filtered_content = []
+            for item in msg["content"]:
+                if item["type"] == "image":
+                    if is_image_file(item["url"]):
+                        filtered_content.append(item)
+                    else:
+                        # 이미지 파일이 아닌 경우 텍스트로 변환
+                        filtered_content.append({
+                            "type": "text",
+                            "text": f"[Non-image file: {os.path.basename(item['url'])}]"
+                        })
+                else:
+                    filtered_content.append(item)
+            msg["content"] = filtered_content
+        # 모델 입력 생성
+        inputs = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(device=model.device, dtype=torch.bfloat16)
+        # 텍스트 생성 스트리머 설정
+        streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
+        gen_kwargs = dict(
+            inputs,
+            streamer=streamer,
+            max_new_tokens=max_new_tokens,
+        )
+        # 별도 스레드에서 텍스트 생성
+        t = Thread(target=model.generate, kwargs=gen_kwargs)
+        t.start()
+        # 결과 스트리밍
+        output = ""
+        for new_text in streamer:
+            output += new_text
+            yield output
+    except Exception as e:
+        logger.error(f"Error in run: {str(e)}")
+        yield f"죄송합니다. 오류가 발생했습니다: {str(e)}"
 ##################################################
 # 예시들 (한글화 버전)
 ##################################################
 ]
+##################################################
+# Gradio 인터페이스 설정
+##################################################
 demo = gr.ChatInterface(
     fn=run,
     type="messages",
 )
 if __name__ == "__main__":
+    demo.launch()