Coqui-Xtts-Demo

Sleeping

App Files Files Community

Jimmy Vu committed on Feb 12

Commit

2e99c77

1 Parent(s): 8460d0e

Add files

Browse files

Files changed (11) hide show

.gitignore +173 -0
README.md +1 -1
app.py +0 -7
gradio_app.py +298 -0
requirements.txt +12 -0
utils/__init__.py +0 -0
utils/cuda_toolkit.py +19 -0
utils/logger.py +146 -0
utils/sentence.py +75 -0
utils/spaces.py +10 -0
utils/vietnamese_normalization.py +360 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,173 @@

+WadaSNR/
+.idea/
+*.pyc
+.DS_Store
+./__init__.py
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+.static_storage/
+.media/
+local_settings.py
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# pyenv
+.python-version
+# celery beat schedule file
+celerybeat-schedule
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+# vim
+*.swp
+*.swm
+*.swn
+*.swo
+# pytorch models
+*.pth
+*.pth.tar
+!dummy_speakers.pth
+result/
+# setup.py
+version.py
+# jupyter dummy files
+core
+# ignore local datasets
+recipes/WIP/*
+recipes/ljspeech/LJSpeech-1.1/*
+recipes/vctk/VCTK/*
+recipes/**/*.npy
+recipes/**/*.json
+VCTK-Corpus-removed-silence/*
+# ignore training logs
+trainer_*_log.txt
+# files used internally for dev, test etc.
+tests/outputs/*
+tests/train_outputs/*
+TODO.txt
+.vscode/*
+data/*
+notebooks/data/*
+TTS/tts/utils/monotonic_align/core.c
+.vscode-upload.json
+temp_build/*
+events.out*
+old_configs/*
+model_importers/*
+model_profiling/*
+docs/source/TODO/*
+.noseids
+.dccache
+log.txt
+umap.png
+*.out
+SocialMedia.txt
+output.wav
+tts_output.wav
+deps.json
+speakers.json
+internal/*
+*_pitch.npy
+*_phoneme.npy
+wandb
+depot/*
+coqui_recipes/*
+local_scripts/*
+coqui_demos/*
+cache/*

README.md CHANGED Viewed

@@ -5,7 +5,7 @@ colorFrom: green
 colorTo: gray
 sdk: gradio
 sdk_version: 5.15.0
-app_file: app.py
 pinned: false
 license: mpl-2.0
 short_description: Coqui-XTTS Text-to-Speech Demo with Vietnamese

 colorTo: gray
 sdk: gradio
 sdk_version: 5.15.0
+app_file: gradio_app.py
 pinned: false
 license: mpl-2.0
 short_description: Coqui-XTTS Text-to-Speech Demo with Vietnamese

app.py DELETED Viewed

@@ -1,7 +0,0 @@
-import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

gradio_app.py ADDED Viewed

	@@ -0,0 +1,298 @@

+import os
+import time
+import uuid
+import hashlib
+from pathlib import Path
+import gradio as gr
+import torch
+import torchaudio
+import numpy as np
+from underthesea import sent_tokenize
+from df.enhance import enhance, init_df, load_audio, save_audio
+from huggingface_hub import hf_hub_download, snapshot_download
+from langdetect import detect
+from utils.vietnamese_normalization import normalize_vietnamese_text
+from utils.logger import setup_logger
+from utils.sentence import split_sentence, merge_sentences
+import warnings
+warnings.filterwarnings("ignore")
+# Module-wide logger, named after this file.
+logger = setup_logger(__file__)
+# DeepFilterNet model and state; created on first use in generate_speech.
+df_model = None
+df_state = None
+# Every runtime artifact (checkpoint, temp files, audio) lives under <app dir>/cache.
+APP_DIR = os.path.dirname(os.path.abspath(__file__))
+checkpoint_dir = f"{APP_DIR}/cache"
+temp_dir = f"{checkpoint_dir}/temp/"
+sample_audio_dir = f"{checkpoint_dir}/audio_samples/"
+enhance_audio_dir = f"{checkpoint_dir}/audio_enhances/"
+for d in (checkpoint_dir, temp_dir, sample_audio_dir, enhance_audio_dir):
+    os.makedirs(d, exist_ok=True)
+# Display name shown in the UI -> language code handed to the model.
+# 'Auto' maps to the pseudo-code 'auto' (detect the language from the text).
+language_dict = {
+    'English': 'en',
+    'Español (Spanish)': 'es',
+    'Français (French)': 'fr',
+    'Deutsch (German)': 'de',
+    'Italiano (Italian)': 'it',
+    'Português (Portuguese)': 'pt',
+    'Polski (Polish)': 'pl',
+    'Türkçe (Turkish)': 'tr',
+    'Русский (Russian)': 'ru',
+    'Nederlands (Dutch)': 'nl',
+    'Čeština (Czech)': 'cs',
+    'العربية (Arabic)': 'ar',
+    '中文 (Chinese)': 'zh-cn',
+    'Magyar nyelv (Hungarian)': 'hu',
+    '한국어 (Korean)': 'ko',
+    '日本語 (Japanese)': 'ja',
+    'Tiếng Việt (Vietnamese)': 'vi',
+    'Auto': 'auto',
+}
+default_language = 'Auto'
+# Codes accepted as a detection result by lang_detect.
+language_codes = list(language_dict.values())
+def lang_detect(text):
+    """Detect the language of ``text`` and map it to a supported language code.
+    Args:
+        text (str): Text passed to ``langdetect.detect``.
+    Returns:
+        str: ``'zh-cn'`` when ``'zh-tw'`` is detected, the detected code when it
+        is listed in ``language_codes``, otherwise ``'en'``. ``'en'`` is also
+        returned when detection itself fails.
+    """
+    try:
+        lang = detect(text)
+    except Exception:
+        # Detection is best effort, so any detector failure falls back to
+        # English. Unlike a bare ``except:``, this lets KeyboardInterrupt and
+        # SystemExit propagate.
+        return 'en'
+    if lang == 'zh-tw':
+        return 'zh-cn'
+    return lang if lang in language_codes else 'en'
+# Maximum number of characters accepted per request; checked in
+# generate_speech and used as max_length of the UI text boxes.
+input_text_max_length = 3000
+# Forwarded to load_safetensors_checkpoint in load_model.
+use_deepspeed = False
+# Use the installed `spaces` package when available, otherwise the local
+# stand-in module from utils.
+try:
+    import spaces
+except ImportError:
+    from utils import spaces
+# Assigned by load_model().
+xtts_model = None
+def load_model():
+    """Download the XTTS checkpoint files and load the model into ``xtts_model``.
+    The snapshot of the ``jimmyvu/xtts`` repository is stored in
+    ``checkpoint_dir``; the model is moved to CUDA when CUDA is available.
+    """
+    global xtts_model
+    # Imported here rather than at module level, as in the original code.
+    from TTS.tts.configs.xtts_config import XttsConfig
+    from TTS.tts.models.xtts import Xtts
+    snapshot_download(
+        repo_id="jimmyvu/xtts",
+        local_dir=checkpoint_dir,
+        allow_patterns=["*.safetensors", "*.wav", "*.json"],
+        ignore_patterns="*.pth",
+    )
+    config_path = os.path.join(checkpoint_dir, "config.json")
+    config = XttsConfig()
+    config.load_json(config_path)
+    xtts_model = Xtts.init_from_config(config)
+    logger.info("Loading model...")
+    xtts_model.load_safetensors_checkpoint(
+        config,
+        checkpoint_dir=checkpoint_dir,
+        use_deepspeed=use_deepspeed,
+    )
+    cuda_ready = torch.cuda.is_available()
+    if cuda_ready:
+        xtts_model.cuda()
+    logger.info(f"Successfully loaded model from {checkpoint_dir}")
+# The model is loaded once, at import time.
+load_model()
+# Reference audio preselected in the "Text to Speech" tab.
+default_speaker_reference_audio = os.path.join(sample_audio_dir, 'harvard.wav')
+@spaces.GPU
+def generate_speech(input_text, speaker_reference_audio, enhance_speech, temperature=0.3, top_p=0.85, top_k=50, repetition_penalty=10.0, language='Auto', *args):
+    """Validate the request, prepare the speaker reference and synthesize speech.
+    Args:
+        input_text (str): Text to synthesize. Rejected when longer than
+            ``input_text_max_length`` characters, or shorter than two words
+            (two characters for Japanese, Korean and Chinese).
+        speaker_reference_audio (str): Path of the reference audio file.
+        enhance_speech (bool): Denoise the reference audio before using it.
+        temperature, top_p, top_k, repetition_penalty: Sampling settings
+            forwarded unchanged to ``inference``.
+        language (str): A key of ``language_dict``; unknown keys fall back to ``'en'``.
+        *args: Accepted and ignored.
+    Returns:
+        tuple: ``((24000, wav_array), log_messages)`` on success, or
+        ``(None, log_messages)`` when the request is rejected.
+    """
+    global df_model, df_state, xtts_model
+    log_messages = ""
+    if len(input_text) > input_text_max_length:
+        gr.Warning("Text is too long! Please provide a shorter text.")
+        log_messages += "Text is too long! Please provide a shorter text.\n"
+        return None, log_messages
+    language_code = language_dict.get(language, 'en')
+    logger.info(f"Language [{language}], code: [{language_code}]")
+    lang = lang_detect(input_text) if language_code == 'auto' else language_code
+    # Japanese, Korean and Chinese input is measured in characters, everything
+    # else in whitespace-separated words. Korean is 'ko' in language_dict; the
+    # previous check used 'kr', which never matched.
+    if lang in ('ja', 'ko', 'zh-cn'):
+        too_short = len(input_text) < 2
+    else:
+        too_short = len(input_text.split()) < 2
+    if too_short:
+        gr.Warning("Text is too short! Please provide a longer text.")
+        log_messages += "Text is too short! Please provide a longer text.\n"
+        return None, log_messages
+    if not speaker_reference_audio:
+        gr.Warning("Please provide at least one reference audio!")
+        log_messages += "Please provide at least one reference audio!\n"
+        return None, log_messages
+    start = time.time()
+    logger.info(f"Start processing text: {input_text[:30]}... [length: {len(input_text)}]")
+    if enhance_speech:
+        logger.info("Enhancing reference audio...")
+        audio_file = os.path.basename(speaker_reference_audio)
+        # NOTE(review): the cache key is the bare file name, so two different
+        # uploads sharing a name reuse the first enhanced file - confirm this
+        # is acceptable.
+        enhanced_audio_path = os.path.join(enhance_audio_dir, f"{audio_file}.enh.wav")
+        if not os.path.exists(enhanced_audio_path):
+            # Create the denoiser lazily, on the first request that needs it.
+            if df_model is None:
+                df_model, df_state, _ = init_df()
+            audio, _ = load_audio(speaker_reference_audio, sr=df_state.sr())
+            # denoise audio
+            enhanced_audio = enhance(df_model, df_state, audio)
+            # save enhanced audio
+            save_audio(enhanced_audio_path, enhanced_audio, sr=df_state.sr())
+        speaker_reference_audio = enhanced_audio_path
+    gpt_cond_latent, speaker_embedding = xtts_model.get_conditioning_latents(
+        audio_path=speaker_reference_audio,
+        gpt_cond_len=xtts_model.config.gpt_cond_len,
+        max_ref_length=xtts_model.config.max_ref_len,
+        sound_norm_refs=xtts_model.config.sound_norm_refs,
+    )
+    # Split text by sentence
+    if lang in ["ja", "zh-cn"]:
+        sentences = input_text.split("。")
+    else:
+        sentences = sent_tokenize(input_text)
+    # merge short sentences to next/prev ones
+    sentences = merge_sentences(sentences)
+    # language_code (possibly 'auto') is passed on purpose, so that inference
+    # can detect the language of each sentence separately.
+    wav_array = inference(sentences, language_code, gpt_cond_latent, speaker_embedding, temperature, top_p, top_k, repetition_penalty)
+    end = time.time()
+    logger.info(f"End processing text: {input_text[:30]}... Processing time: {end - start:.2f}s")
+    log_messages += f"Processing time: {end - start:.2f}s"
+    return (24000, wav_array), log_messages
+def inference(sentences, language_code, gpt_cond_latent, speaker_embedding, temperature, top_p, top_k, repetition_penalty):
+    """Synthesize every sentence with the XTTS model and join the audio.
+    Args:
+        sentences (list[str]): Sentences to synthesize; blank ones are skipped.
+        language_code (str): Language code for all sentences, or ``'auto'`` to
+            detect the language of each sentence with ``lang_detect``.
+        gpt_cond_latent, speaker_embedding: Speaker conditioning forwarded to
+            ``xtts_model.inference``.
+        temperature, top_p, top_k, repetition_penalty: Sampling settings
+            forwarded to ``xtts_model.inference``.
+    Returns:
+        The concatenation (``np.concatenate``) of the ``"wav"`` outputs of all
+        chunks that were synthesized successfully.
+    Raises:
+        ValueError: If no chunk produced audio (all sentences blank or failed).
+    """
+    # Sentences longer than this many characters are split before synthesis.
+    max_text_length = 180
+    def dynamic_length_penalty(text_length):
+        """Scale the length penalty linearly from -1.0 to 1.0 with text length."""
+        return (2 * (min(max_text_length, text_length) / max_text_length)) - 1
+    out_wavs = []
+    for sentence in sentences:
+        if not sentence.strip():
+            continue
+        lang = lang_detect(sentence) if language_code == 'auto' else language_code
+        if lang == 'vi':
+            sentence = normalize_vietnamese_text(sentence)
+        # split too long sentence
+        texts = split_sentence(sentence) if len(sentence) > max_text_length else [sentence]
+        for text in texts:
+            # A split can leave an empty half; there is nothing to synthesize.
+            if not text.strip():
+                continue
+            logger.info(f"[{lang}] {text}")
+            try:
+                out = xtts_model.inference(
+                    text=text,
+                    language=lang,
+                    gpt_cond_latent=gpt_cond_latent,
+                    speaker_embedding=speaker_embedding,
+                    temperature=temperature,
+                    top_p=top_p,
+                    top_k=top_k,
+                    repetition_penalty=repetition_penalty,
+                    length_penalty=dynamic_length_penalty(len(text)),
+                    enable_text_splitting=True,
+                )
+                out_wavs.append(out["wav"])
+            except Exception as e:
+                # Best effort: a failing chunk is logged and left out of the result.
+                logger.error(f"Error processing text: {text} - {e}")
+    if not out_wavs:
+        # np.concatenate([]) would raise the same exception type with an
+        # unhelpful message; state the actual cause instead.
+        raise ValueError("No audio was generated: every sentence was empty or failed to synthesize.")
+    return np.concatenate(out_wavs)
+def build_gradio_ui():
+    """Build the Gradio Blocks UI and return it.
+    The UI is not launched here; the caller (see the ``__main__`` guard below)
+    calls ``launch`` on the returned object.
+    Returns:
+        The ``gr.Blocks`` instance with three tabs: "Text to Speech",
+        "Clone Your Voice" and "Advanced Settings".
+    """
+    theme=gr.Theme.from_hub('JohnSmith9982/small_and_pretty')
+    # Override individual values on the theme loaded from the hub.
+    setattr(theme, 'button_secondary_background_fill', '#fcd53f')
+    setattr(theme, 'checkbox_border_color', '#02c160')
+    # NOTE(review): the names below use hyphens while the two above use
+    # underscores - confirm that gradio picks up the hyphenated form.
+    setattr(theme, 'input-border-width', '1px')
+    setattr(theme, 'input-background-fill', '#ffffff')
+    setattr(theme, 'input-background-fill_focus', '#ffffff')
+    setattr(theme, 'input-border-color', '#d1d5db')
+    setattr(theme, 'input-border-color_focus', '#fcd53f')
+    # Initial content of the text box in the "Text to Speech" tab.
+    default_prompt = ("Hi, I am a multilingual text-to-speech AI model.\n"
+                      "Bonjour, je suis un modèle d'IA de synthèse vocale multilingue.\n"
+                      "Hallo, ich bin ein mehrsprachiges Text-zu-Sprache KI-Modell.\n"
+                      "Ciao, sono un modello di intelligenza artificiale di sintesi vocale multilingue.\n"
+                      "Привет, я многоязычная модель искусственного интеллекта, преобразующая текст в речь.\n"
+                      "Xin chào, tôi là một mô hình AI chuyển đổi văn bản thành giọng nói đa ngôn ngữ.\n")
+    with gr.Blocks(title="Coqui XTTS Demo", theme=theme) as ui:
+        gr.Markdown(
+          """
+          # 🐸 Coqui-XTTS Text-to-Speech Demo
+          Convert text to speech with advanced voice cloning and enhancement.
+          Support 17 languages, \u2605 **Vietnamese** \u2605 newly added.
+          """
+        )
+        # Tab 1: synthesize with a reference audio given as a file path.
+        with gr.Tab("Text to Speech"):
+          with gr.Row():
+            with gr.Column():
+                input_text = gr.Text(label="Enter Text Here",
+                                     placeholder="Write the text you want to convert...",
+                                     value=default_prompt,
+                                     lines=5,
+                                     max_length=input_text_max_length)
+                speaker_reference_audio = gr.Audio(
+                    label="Speaker reference audio:",
+                    type="filepath",
+                    editable=False,
+                    min_length=3,
+                    max_length=300,
+                    value=default_speaker_reference_audio
+                )
+                enhance_speech = gr.Checkbox(label="Enhance Reference Audio", value=False)
+                language = gr.Dropdown(label="Target Language", choices=[k for k in language_dict.keys()], value=default_language)
+                generate_button = gr.Button("Generate Speech")
+            with gr.Column():
+                audio_output = gr.Audio(label="Generated Audio")
+                log_output = gr.Text(label="Log Output")
+        # Tab 2: synthesize with a reference recorded from the microphone.
+        with gr.Tab("Clone Your Voice"):
+          with gr.Row():
+            with gr.Column():
+                input_text_mic = gr.Text(label="Enter Text Here",
+                                     placeholder="Write the text you want to convert...",
+                                     lines=5,
+                                     max_length=input_text_max_length)
+                mic_ref_audio = gr.Audio(label="Record Reference Audio", sources=["microphone"])
+                enhance_speech_mic = gr.Checkbox(label="Enhance Reference Audio", value=True)
+                language_mic = gr.Dropdown(label="Target Language", choices=[k for k in language_dict.keys()], value=default_language)
+                generate_button_mic = gr.Button("Generate Speech")
+            with gr.Column():
+                audio_output_mic = gr.Audio(label="Generated Audio")
+                log_output_mic = gr.Text(label="Log Output")
+        def process_mic_and_generate(input_text_mic, mic_ref_audio, enhance_speech_mic, temperature, top_p, top_k, repetition_penalty, language_mic):
+              """Save the recording to a WAV file in temp_dir and run generate_speech on it.
+              Returns the result of generate_speech, or (None, message) when
+              nothing was recorded or saving the recording failed.
+              """
+              if mic_ref_audio:
+                  # File name derived from the current time.
+                  # NOTE(review): the local name `hash` shadows the builtin.
+                  data = str(time.time()).encode("utf-8")
+                  hash = hashlib.sha1(data).hexdigest()[:10]
+                  output_path = os.path.join(temp_dir, (f"mic_{hash}.wav"))
+                  # mic_ref_audio[1] holds the samples and mic_ref_audio[0] is
+                  # passed to torchaudio.save as the sample rate.
+                  torch_audio = torch.from_numpy(mic_ref_audio[1].astype(float))
+                  try:
+                      torchaudio.save(output_path, torch_audio.unsqueeze(0), mic_ref_audio[0])
+                      return generate_speech(input_text_mic, output_path, enhance_speech_mic, temperature, top_p, top_k, repetition_penalty, language_mic)
+                  except Exception as e:
+                      # This also catches errors raised inside generate_speech,
+                      # although the message only mentions saving.
+                      logger.error(f"Error saving audio file: {e}")
+                      return None, f"Error saving audio file: {e}"
+              else:
+                  return None, "Please record an audio!"
+        # Tab 3: sampling controls shared by both "Generate Speech" buttons.
+        with gr.Tab("Advanced Settings"):
+            with gr.Row():
+                with gr.Column():
+                    temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.3, step=0.05)
+                    repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=50.0, value=9.5, step=1.0)
+                with gr.Column():
+                    top_p = gr.Slider(label="Top P", minimum=0.5, maximum=1.0, value=0.85, step=0.05)
+                    top_k = gr.Slider(label="Top K", minimum=0, maximum=100, value=50, step=5)
+        # The order of `inputs` must match the positional parameters of the handlers.
+        generate_button.click(
+            generate_speech,
+            inputs=[input_text, speaker_reference_audio, enhance_speech, temperature, top_p, top_k, repetition_penalty, language],
+            outputs=[audio_output, log_output],
+        )
+        generate_button_mic.click(
+            process_mic_and_generate,
+            inputs=[input_text_mic, mic_ref_audio, enhance_speech_mic, temperature, top_p, top_k, repetition_penalty, language_mic],
+            outputs=[audio_output_mic, log_output_mic],
+        )
+    return ui
+# Script entry point: build the UI and start serving it.
+if __name__ == "__main__":
+    ui = build_gradio_ui()
+    ui.launch(debug=False)

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+gradio==4.44.1
+deepfilternet==0.5.6
+underthesea==6.8.0
+deepspeed
+colorama
+pyvi
+langdetect
+cutlet
+unidic
+# for Japanese
+# python -m unidic download
+git+https://github.com/quangvu3/coqui-xtts.git

utils/__init__.py ADDED Viewed

File without changes

utils/cuda_toolkit.py ADDED Viewed

	@@ -0,0 +1,19 @@

+import subprocess
+import os
+def install_cuda_toolkit():
+    """Download and run the CUDA 12.2 toolkit installer, then export CUDA variables.
+    The installer is fetched with ``wget`` into ``/tmp``, made executable and
+    run with ``--silent --toolkit``. Afterwards ``CUDA_HOME``, ``PATH``,
+    ``LD_LIBRARY_PATH`` and ``TORCH_CUDA_ARCH_LIST`` are set in ``os.environ``.
+    The exit codes of the three commands are not checked.
+    """
+    # Previously used release:
+    # https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
+    toolkit_url = "https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run"
+    installer_path = f"/tmp/{os.path.basename(toolkit_url)}"
+    commands = (
+        ["wget", "-q", toolkit_url, "-O", installer_path],
+        ["chmod", "+x", installer_path],
+        [installer_path, "--silent", "--toolkit"],
+    )
+    for command in commands:
+        subprocess.call(command)
+    cuda_home = "/usr/local/cuda"
+    os.environ["CUDA_HOME"] = cuda_home
+    os.environ["PATH"] = f"{cuda_home}/bin:{os.environ['PATH']}"
+    # An unset LD_LIBRARY_PATH contributes an empty trailing entry.
+    os.environ["LD_LIBRARY_PATH"] = f"{cuda_home}/lib:{os.environ.get('LD_LIBRARY_PATH', '')}"
+    # Fix: arch_list[-1] += '+PTX'; IndexError: list index out of range
+    os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6"

utils/logger.py ADDED Viewed

	@@ -0,0 +1,146 @@

+import logging
+import sys
+from pathlib import Path
+from datetime import datetime
+import colorama
+from colorama import Fore, Back, Style
+from typing import Optional, Union
+import re
+import traceback
+import copy
+import os
+# Initialize colorama
+colorama.init()
+class ColoredFormatter(logging.Formatter):
+    """Colored formatter for structured log output.
+    Each record is rendered as ``timestamp | file:line | icon LEVEL | message``
+    with ANSI colors chosen by log level, followed by the formatted traceback
+    when the record carries exception info.
+    Attributes:
+        COLORS (dict): Color scheme per level name, with the keys:
+            - color: Foreground color
+            - style: Text style (dim, normal, bright)
+            - icon: Emoji icon for the log level
+            - bg: Background color (defined for CRITICAL only; ``format``
+              does not currently apply it)
+    """
+    COLORS = {
+        'DEBUG': {
+            'color': Fore.CYAN,
+            'style': Style.DIM,
+            'icon': '🔍'
+        },
+        'INFO': {
+            'color': Fore.GREEN,
+            'style': Style.NORMAL,
+            'icon': 'ℹ️'
+        },
+        'WARNING': {
+            'color': Fore.YELLOW,
+            'style': Style.BRIGHT,
+            'icon': '⚠️'
+        },
+        'ERROR': {
+            'color': Fore.RED,
+            'style': Style.BRIGHT,
+            'icon': '❌'
+        },
+        'CRITICAL': {
+            'color': Fore.WHITE,
+            'style': Style.BRIGHT,
+            'bg': Back.RED,
+            'icon': '💀'
+        }
+    }
+    def format(self, record: logging.LogRecord) -> str:
+        """Format a log record with color and structure.
+        The output contains, separated by ``" | "``:
+        - Timestamp in HH:MM:SS.mmm format
+        - File location (filename:line)
+        - Color-coded level name with icon
+        - Color-coded message, with %-style arguments interpolated
+        - Formatted exception traceback if present
+        Args:
+            record (logging.LogRecord): Log record to format.
+        Returns:
+            str: Formatted log message with color and structure.
+        """
+        # Levels without an entry in COLORS fall back to a neutral scheme.
+        scheme = self.COLORS.get(record.levelname, {
+            'color': Fore.WHITE,
+            'style': Style.NORMAL,
+            'icon': '•'
+        })
+        # strftime yields microseconds; drop the last three digits for milliseconds.
+        timestamp = datetime.fromtimestamp(record.created).strftime('%H:%M:%S.%f')[:-3]
+        file_location = f"{os.path.basename(record.pathname)}:{record.lineno}"
+        # getMessage() merges record.args into the message; record.msg alone
+        # would print the raw template for calls like logger.info("x=%s", x).
+        message = record.getMessage()
+        components = [
+            f"{Fore.BLUE}{timestamp}{Style.RESET_ALL}",
+            f"{Fore.WHITE}{Style.DIM}{file_location}{Style.RESET_ALL}",
+            f"{scheme['color']}{scheme['style']}{scheme['icon']} {record.levelname:8}{Style.RESET_ALL}",
+            f"{scheme['color']}{message}{Style.RESET_ALL}"
+        ]
+        # Add exception info
+        if record.exc_info:
+            components.append(
+                f"\n{Fore.RED}{Style.BRIGHT}"
+                f"{''.join(traceback.format_exception(*record.exc_info))}"
+                f"{Style.RESET_ALL}"
+            )
+        return " | ".join(components)
+def setup_logger(
+        name: Optional[Union[str, Path]] = None,
+        level: int = logging.INFO
+) -> logging.Logger:
+    """Return a logger that writes colored output to stdout.
+    When ``name`` is a path ending in ``.py`` (typically ``__file__``), the
+    file name without its extension becomes the logger name. A stdout handler
+    with ``ColoredFormatter`` is attached only if the logger has no handler
+    yet, so repeated calls do not duplicate output.
+    Args:
+        name (Optional[Union[str, Path]], optional): Logger name or __file__ for
+            module name. Defaults to None.
+        level (int, optional): Logging level. Defaults to logging.INFO.
+    Returns:
+        logging.Logger: Configured logger instance.
+    """
+    logger_name = name
+    if isinstance(name, (str, Path)):
+        candidate = Path(name)
+        if candidate.suffix == '.py':
+            logger_name = candidate.stem
+    log = logging.getLogger(logger_name)
+    log.setLevel(level)
+    # Already configured: leave the existing handlers untouched.
+    if log.handlers:
+        return log
+    stdout_handler = logging.StreamHandler(sys.stdout)
+    stdout_handler.setFormatter(ColoredFormatter())
+    log.addHandler(stdout_handler)
+    return log

utils/sentence.py ADDED Viewed

	@@ -0,0 +1,75 @@

+def split_sentence(sentence, delimiters=",;-!?"):
+    """
+    Splits a sentence into two halves, prioritizing the delimiter closest to the middle.
+    A delimiter is only used when both sides of it contain text other than
+    whitespace and delimiters, so a sentence whose only delimiter is its final
+    "?" or "!" is still split into two real halves. The delimiter used for the
+    split is dropped from the result. Without a usable delimiter the sentence
+    is split at the space nearest to the middle, or at the midpoint when there
+    is no space.
+    Args:
+        sentence (str): The input sentence to split.
+        delimiters (str): A string of delimiters to prioritize for splitting (default: ",;-!?").
+    Returns:
+        tuple: A tuple containing the two halves of the sentence, both stripped.
+    """
+    def _has_text(part):
+        """Return True if ``part`` holds a character that is neither space nor delimiter."""
+        return any(not ch.isspace() and ch not in delimiters for ch in part)
+    midpoint = len(sentence) // 2
+    # Delimiters at the edges would produce an empty (or punctuation-only) half.
+    usable_indices = [
+        i for i, char in enumerate(sentence)
+        if char in delimiters and _has_text(sentence[:i]) and _has_text(sentence[i + 1:])
+    ]
+    if usable_indices:
+        # Ties are resolved in favour of the leftmost delimiter.
+        closest_delimiter = min(usable_indices, key=lambda x: abs(x - midpoint))
+        first_half = sentence[:closest_delimiter].strip()
+        second_half = sentence[closest_delimiter + 1:].strip()
+        return first_half, second_half
+    # No usable delimiter: split at the nearest space (word boundary).
+    left_space = sentence.rfind(" ", 0, midpoint)
+    right_space = sentence.find(" ", midpoint)
+    if left_space == -1 and right_space == -1:
+        # No spaces found (single word), split at midpoint
+        split_index = midpoint
+    elif left_space == -1:
+        split_index = right_space
+    elif right_space == -1:
+        split_index = left_space
+    else:
+        # Choose the closest space to the midpoint; the left one wins ties.
+        split_index = left_space if (midpoint - left_space) <= (right_space - midpoint) else right_space
+    first_half = sentence[:split_index].strip()
+    second_half = sentence[split_index:].strip()
+    return first_half, second_half
+def merge_sentences(sentences):
+    """Merge short sentences into their neighbours.
+    Going left to right, a sentence of six words or fewer absorbs the
+    following sentences (joined with a single space) until it has more than
+    six words or the input is exhausted. Afterwards, while the last merged
+    sentence has fewer than six words, it is appended to the one before it.
+    Args:
+        sentences (list[str]): Sentences in reading order.
+    Returns:
+        list[str]: The merged sentences.
+    """
+    result = []
+    total = len(sentences)
+    pos = 0
+    while pos < total:
+        chunk = sentences[pos]
+        pos += 1
+        # Keep pulling in the next sentence until the chunk is long enough.
+        while len(chunk.split()) <= 6 and pos < total:
+            chunk = chunk + ' ' + sentences[pos]
+            pos += 1
+        result.append(chunk)
+    # A short trailing chunk has nothing after it, so fold it backwards.
+    while len(result) > 1 and len(result[-1].split()) < 6:
+        tail = result.pop()
+        result[-1] += ' ' + tail
+    return result

utils/spaces.py ADDED Viewed

	@@ -0,0 +1,10 @@

+import functools
+def GPU(func=None, **options):
+    """Stand-in for the ``spaces.GPU`` decorator when ``spaces`` is not installed.
+    The decorated function is simply called as is. Both decorator forms are
+    accepted so that code written for Hugging Face Spaces runs unchanged:
+    the bare form ``@GPU`` and the parameterized form ``@GPU(duration=60)``.
+    Args:
+        func (callable, optional): Function to wrap (bare form).
+        **options: Options of the parameterized form; accepted and ignored.
+    Returns:
+        callable: The wrapped function, or a decorator when called without ``func``.
+    """
+    def decorate(target):
+        @functools.wraps(target)  # Preserves original function's metadata
+        def wrapper(*args, **kwargs):
+            return target(*args, **kwargs)
+        return wrapper
+    if func is None:
+        # Parameterized form: return the decorator to apply next.
+        return decorate
+    return decorate(func)

utils/vietnamese_normalization.py ADDED Viewed

	@@ -0,0 +1,360 @@

+import re
+from underthesea import text_normalize
+# Dictionary to map numbers to Vietnamese words: the values 0-10 plus the
+# scale units 100, 1,000, 1,000,000 and 1,000,000,000. Other numbers are
+# composed from these entries by the conversion helpers below.
+number_to_words = {
+    0: 'không',
+    1: 'một',
+    2: 'hai',
+    3: 'ba',
+    4: 'bốn',
+    5: 'năm',
+    6: 'sáu',
+    7: 'bảy',
+    8: 'tám',
+    9: 'chín',
+    10: 'mười',
+    100: 'trăm',
+    1000: 'nghìn',
+    1000000: 'triệu',
+    1000000000: 'tỷ'
+}
+# Value of each Roman numeral symbol.
+roman_to_int = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
+def roman_to_integer(roman):
+    """Convert a Roman numeral to an integer.
+    Symbols are read from right to left; a symbol smaller than the one to its
+    right is subtracted, otherwise it is added. Characters that are not in
+    ``roman_to_int`` count as 0.
+    Args:
+        roman (str): Roman numeral in upper case, e.g. ``"XIV"``.
+    Returns:
+        int: The numeric value (0 for an empty string).
+    """
+    result = 0
+    right_neighbour = 0
+    for symbol in roman[::-1]:
+        current = roman_to_int.get(symbol, 0)
+        result += -current if current < right_neighbour else current
+        right_neighbour = current
+    return result
+# Spoken Vietnamese form of currency signs (plus '~' and '%').
+currency_symbols = {
+    '~': '~ ',
+    '%': 'phần trăm',
+    '$': 'đô la',
+    '₫': 'đồng',
+    'đ': 'đồng',
+    '€': 'ơ rô',
+    '£': 'bảng',
+    '¥': 'yên',
+    '₹': 'ru pi',
+    '₽': 'rúp',
+    '₺': 'li ra',
+    '₩': 'uôn',
+}
+def currency_symbol_to_word(currency_sign):
+    """Return the spoken form of ``currency_sign``, or the sign itself if unknown."""
+    return currency_symbols.get(currency_sign, currency_sign)
+def detect_number_format(number_str):
+    """Classify the separator convention of a number string.
+    "Vietnamese" means dots group thousands and a comma marks decimals
+    (``1.234,5``); "US" means the opposite (``1,234.5``). With a single kind
+    of separator, the thousands-grouping reading is tried first.
+    Args:
+        number_str (str): Number as written in the text.
+    Returns:
+        str: ``"Vietnamese"``, ``"US"`` or ``"Invalid"``. Strings without any
+        separator are reported as ``"Vietnamese"``.
+    """
+    vietnamese_grouped = r'^\d{1,3}(?:\.\d{3})*(?:,\d+)?$'
+    us_grouped = r'^\d{1,3}(?:,\d{3})*(?:\.\d+)?$'
+    has_comma = ',' in number_str
+    has_dot = '.' in number_str
+    if has_comma and has_dot:
+        # The separator that comes last is the decimal mark.
+        if number_str.rfind(',') > number_str.rfind('.'):
+            return "Vietnamese" if re.match(vietnamese_grouped, number_str) else "Invalid"
+        return "US" if re.match(us_grouped, number_str) else "Invalid"
+    if has_comma:
+        if re.match(us_grouped, number_str):
+            return "US"
+        if re.match(r'^(\d+,\d+)?$', number_str):
+            return "Vietnamese"
+        return "Invalid"
+    if has_dot:
+        if re.match(vietnamese_grouped, number_str):
+            return "Vietnamese"
+        if re.match(r'^(\d+\.\d+)?$', number_str):
+            return "US"
+        return "Invalid"
+    # No separators at all: Vietnamese is the default.
+    return "Vietnamese"
+def number_to_vietnamese_words(number_str):
+    """Spell out a number in Vietnamese words.
+    The separator convention is determined with ``detect_number_format``;
+    input classified as invalid is returned unchanged (as a string). Decimal
+    numbers are read as ``<integer part> phẩy <digits>``.
+    Args:
+        number_str: Number to convert; converted with ``str`` first.
+    Returns:
+        str: The number in words, or the input string when its format is invalid.
+    """
+    number_str = str(number_str)
+    number_format = detect_number_format(number_str)
+    if number_format == 'Invalid':
+        return number_str
+    if number_format == 'US':
+        # 1,234.5 -> 1234,5: drop the grouping commas, then use a decimal comma.
+        number = number_str.replace(',', '').replace('.', ',')
+    else:
+        # Dots only group thousands here; remove them.
+        number = number_str.replace('.', '')
+    if ',' not in number:
+        return _convert_integer_part(int(number))
+    integer_part, decimal_part = number.split(',')
+    return f"{_convert_integer_part(int(integer_part))} phẩy {_convert_decimal_part(decimal_part)}"
+def _convert_integer_part(number):
+    """Spell out a non-negative integer in Vietnamese words.
+    The number is decomposed into billions, millions, thousands, hundreds,
+    tens and units; the count in front of each scale word is converted
+    recursively.
+    Args:
+        number (int): Non-negative integer to convert.
+    Returns:
+        str: The words joined by single spaces.
+    """
+    if number == 0:
+        return number_to_words[0]
+    words = []
+    # Handle billions
+    if number >= 1000000000:
+        billion = number // 1000000000
+        words.append(_convert_integer_part(billion))
+        words.append(number_to_words[1000000000])
+        number %= 1000000000
+    # Handle millions
+    if number >= 1000000:
+        million = number // 1000000
+        words.append(_convert_integer_part(million))
+        words.append(number_to_words[1000000])
+        number %= 1000000
+    # Handle thousands
+    if number >= 1000:
+        thousand = number // 1000
+        words.append(_convert_integer_part(thousand))
+        words.append(number_to_words[1000])
+        number %= 1000
+        # Read the empty hundreds (and tens) position aloud, e.g. 1005.
+        if number < 100 and number > 0:
+            words.append('không trăm')
+        if number < 10 and number > 0:
+            words.append('không')
+    # Handle hundreds
+    if number >= 100:
+        hundred = number // 100
+        words.append(number_to_words[hundred])
+        words.append(number_to_words[100])
+        number %= 100
+        if number > 0 and number < 10:
+            words.append('lẻ')  # Add "lẻ" for numbers like 106 (một trăm lẻ sáu)
+    # Handle tens and units
+    if number >= 20:
+        ten = number // 10
+        words.append(number_to_words[ten])
+        words.append('mươi')
+        number %= 10
+    elif number >= 10:
+        words.append(number_to_words[10])
+        number %= 10
+    # Handle units (1-9)
+    if number > 0:
+        previous = words[-1] if words else None
+        # "lăm" replaces "năm" only after a tens word (15 -> mười lăm,
+        # 25 -> hai mươi lăm) and "mốt" replaces "một" only after "mươi"
+        # (21 -> hai mươi mốt, but 11 -> mười một). The earlier
+        # `len(words) > 1` test missed 15 and applied both forms directly
+        # after scale words such as "triệu".
+        if number == 5 and previous in ('mươi', 'mười'):
+            w = 'lăm'
+        elif number == 1 and previous == 'mươi':
+            w = 'mốt'
+        else:
+            w = number_to_words[number]
+        words.append(w)
+    return ' '.join(words)
+def _convert_decimal_part(decimal_part):
+    """Read the decimal digits one by one.
+    Args:
+        decimal_part (str): The digits after the decimal separator.
+    Returns:
+        str: The word for each digit, joined by single spaces.
+    """
+    return ' '.join(number_to_words[int(digit)] for digit in decimal_part)
+# Abbreviation replacement: written abbreviation / symbol -> words to be spoken.
+# Keys are matched case-sensitively and literally (they are regex-escaped below).
+abbreviation_map = {
+    "AI": "Ây Ai",
+    "ASEAN": "A Xê An",
+    "ATGT": "An toàn giao thông",
+    "BCA": "Bộ Công an",
+    "BCH": "Ban chấp hành",
+    "BCHTW": "Ban Chấp hành Trung ương",
+    "BCT": "Bộ Chính trị",
+    "BGD": "Bộ Giáo dục",
+    "BKH": "Bộ Khoa học và Công nghệ",
+    "BNN": "Bộ Nông nghiệp",
+    "BQP": "Bộ Quốc phòng",
+    "BTC": "Ban tổ chức",
+    "BTL": "Bộ Tư lệnh",
+    "BYT": "Bộ Y tế",
+    "CA" : "công an",
+    "CAND" : "Công an nhân dân",
+    "CNCS": "chủ nghĩa cộng sản",
+    "CNTB": "chủ nghĩa tư bản",
+    "CNXH": "chủ nghĩa xã hội",
+    "CNY": "nhân dân tệ",
+    "CSGT": "Cảnh sát giao thông",
+    "CTN": "Chủ tịch nước",
+    "ĐBQH": "Đại biểu Quốc hội",
+    "ĐBSCL": "Đồng bằng sông Cửu Long",
+    "ĐCS": "Đảng cộng sản",
+    "ĐH": "Đại học",
+    "ĐHBK": "Đại học Bách khoa",
+    "ĐHKHTN": "Đại học Khoa học tự nhiên",
+    "ĐHQG": "Đại học Quốc gia",
+    "ĐSQ": "Đại sứ quán",
+    "EU": "Ơ u",
+    "GD": "Giáo dục",
+    "HCM": "Hồ Chí Minh",
+    "HĐBA": "Hội đồng bảo an",
+    "HĐND": "Hội đồng nhân dân",
+    "HĐQT": "Hội đồng quản trị",
+    "HN": "Hà Nội",
+    "HV": "Học viện",
+    "KHXH&NV": "Khoa học Xã hội và Nhân văn",
+    "KT": "Kinh tế",
+    "KTQS": "Kỹ thuật Quân sự",
+    "LĐ": "lao động",
+    "KHKT": "khoa học kỹ thuật",
+    "km": "ki lô mét",
+    "LHQ": "Liên Hiệp Quốc",
+    "NATO": "Na tô",
+    "ND": "nhân dân",
+    "NHNN": "ngân hàng nhà nước",
+    "NXB": "Nhà xuất bản",
+    "PCCC": "Phòng cháy chữa cháy",
+    "PTTH": "Phổ thông trung học",
+    "PTCS": "Phổ thông cơ sở",
+    "QĐND" : "Quân đội nhân dân",
+    "QĐNDVN" : "Quân đội nhân dân Việt Nam",
+    "QG": "Quốc gia",
+    "QK": "Quân khu",
+    "sau CN": "sau công nguyên",
+    "SG": "Sài Gòn",
+    "TAND": "Tòa án nhân dân",
+    "TBCN": "tư bản chủ nghĩa",
+    "TBT": "Tổng bí thư",
+    "TCN": "trước công nguyên",
+    "TCT": "Tổng công ty",
+    "THCS": "Trung học cơ sở",
+    "THPT": "Trung học phổ thông",
+    "TNHH": "Trách nhiệm hữu hạn",
+    "TNHH MTV": "Trách nhiệm hữu hạn một thành viên",
+    "TP": "thành phố",
+    "TP.": "thành phố",
+    "TPHCM": "Thành phố Hồ Chí Minh",
+    "TT": "Thủ tướng",
+    "TTCK": "Thị trường chứng khoán",
+    "TTTC": "Thị trường tài chính",
+    "TTCP": "Thủ tướng chính phủ",
+    "TTNT": "Trí tuệ nhân tạo",
+    "TTXVN": "Thông tấn xã Việt Nam",
+    "TƯ": "Trung ương",
+    "TW": "Trung ương",
+    "UB": "Ủy ban",
+    "UBND": "Ủy ban nhân dân",
+    "VH": "Văn hóa",
+    "VKSND": "Viện kiểm sát nhân dân",
+    "VN": "Việt Nam",
+    "VND": "Việt Nam đồng",
+    "XH": "Xã hội",
+    "XHCN": "xã hội chủ nghĩa",
+    "%": "phần trăm",
+    "@": "a còng",
+    "&": "và",
+}
+def _is_word_char(char):
+    """Return True when ``char`` is what the regex class ``\\w`` matches."""
+    return char.isalnum() or char == '_'
+def _abbreviation_alternative(key):
+    """Build the regex alternative for one abbreviation key.
+
+    A "not adjacent to a word character" lookaround is added only on an edge
+    where the key itself has a word character. A plain ``\\b`` next to a symbol
+    such as ``%`` or the dot of ``TP.`` would instead *require* a neighbouring
+    word character, so those keys would almost never match.
+    """
+    left = r'(?<!\w)' if _is_word_char(key[0]) else ''
+    right = r'(?!\w)' if _is_word_char(key[-1]) else ''
+    return left + re.escape(key) + right
+# Alternation is tried left to right, so longer keys must come first; otherwise
+# "TNHH" wins over "TNHH MTV" and "TP" wins over "TP.". The outer capturing
+# group is kept so group(1) still equals the matched key.
+abbreviation_pattern = re.compile(
+    '('
+    + '|'.join(
+        _abbreviation_alternative(key)
+        for key in sorted(abbreviation_map, key=len, reverse=True)
+    )
+    + ')'
+)
+def replace_abbreviations(text):
+    """Replace every abbreviation/symbol from ``abbreviation_map`` in ``text``.
+
+    Args:
+        text: input string.
+
+    Returns:
+        ``text`` with each match of ``abbreviation_pattern`` replaced by its
+        mapped words. When a key starts or ends with a symbol and that edge
+        touches a word character (``50%``, ``R&D``, ``TP.HCM``), a space is
+        inserted on that side so the spoken words do not fuse with their
+        neighbour.
+    """
+    def replacement(match):
+        key = match.group(0)
+        spoken = abbreviation_map[key]
+        source = match.string
+        start, end = match.span()
+        # Pad only symbol edges that are glued to a word character.
+        if not _is_word_char(key[0]) and start > 0 and _is_word_char(source[start - 1]):
+            spoken = ' ' + spoken
+        if not _is_word_char(key[-1]) and end < len(source) and _is_word_char(source[end]):
+            spoken = spoken + ' '
+        return spoken
+    return abbreviation_pattern.sub(replacement, text)
+def convert_abbreviations(text):
+    """Collapse dotted upper-case initialisms, e.g. ``M.A.S.H.`` -> ``MASH``.
+
+    Only runs of two or more ``<A-Z>.`` pairs are touched; a single ``A.`` is
+    left alone, and so are lower-case sequences.
+    """
+    def drop_dots(found):
+        # A match consists solely of capital letters and dots.
+        return found.group(0).replace(".", "")
+    return re.sub(r"([A-Z]\.){2,}", drop_dots, text)
+# Entry point: turn raw Vietnamese text into a fully spelled-out form
+def normalize_vietnamese_text(text):
+    """Run the normalization pipeline over ``text`` and return the result.
+
+    Steps, in order (each one works on the output of the previous one):
+      1. ``text_normalize`` (defined elsewhere in this module).
+      2. ``/word`` becomes `` mỗi word`` for time units, `` trên word`` otherwise.
+      3. Currency amounts with the sign before or after the number (the
+         suffix form also accepts ``%``) are spelled out through
+         ``number_to_vietnamese_words`` and ``currency_symbol_to_word``.
+      4. Dotted initialisms are collapsed and known abbreviations expanded.
+      5. Runs of the letters ``IVXLCDM`` are converted via ``roman_to_integer``.
+      6. Remaining digit groups are spelled out.
+      7. Whitespace and punctuation spacing are tidied up.
+    """
+    time_units = ('ngày', 'giờ', 'tháng', 'quí', 'quý', 'năm')
+    def speak_slash(found):
+        # "/word" reads as "per word"; time units take "mỗi", the rest "trên".
+        unit = found.group(1)
+        return f" mỗi {unit}" if unit in time_units else f" trên {unit}"
+    def speak_money(amount, sign):
+        # Amount is converted before the sign, same as the inline f-string order.
+        return f"{number_to_vietnamese_words(amount)} {currency_symbol_to_word(sign)}"
+    text = text_normalize(text)
+    text = re.sub(r'/(\w+)', speak_slash, text)
+    # Sign first: $200, ₫200, €50, £75, ¥1000
+    text = re.sub(
+        r'([$₫đ€£¥₹₽₩₺])([\d.,]+)',
+        lambda found: speak_money(found.group(2), found.group(1)),
+        text,
+    )
+    # Sign last: 200$, 200đ, 50€, 75£, 1000¥ (and percentages)
+    text = re.sub(
+        r'([\d.,]+)([$₫đ€£¥₹₽₩₺%])',
+        lambda found: speak_money(found.group(1), found.group(2)),
+        text,
+    )
+    # "¥" followed by CNY denotes the Chinese currency, not yen
+    text = text.replace('yên CNY', 'nhân dân tệ')
+    # Abbreviations: collapse dotted initialisms first, then expand known ones
+    text = replace_abbreviations(convert_abbreviations(text))
+    # Roman numerals -> decimal digits, so the next step can spell them out
+    # NOTE(review): the pattern also matches plain words made only of these
+    # letters (e.g. a lone "I" or "C") -- confirm this is intended.
+    text = re.sub(r'\b[IVXLCDM]+\b', lambda found: str(roman_to_integer(found.group())), text)
+    # Standalone numbers -> words
+    text = re.sub(r'\b[\d.,]+\b', lambda found: number_to_vietnamese_words(found.group()), text)
+    # Spacing clean-up: collapse whitespace, no space before , or . and one after
+    text = re.sub(r'\s+', ' ', text)
+    text = re.sub(r'\s([,\.])', r'\1', text)
+    text = re.sub(r'([,\.])(\S)', r'\1 \2', text)
+    # Literal fix-ups applied one after another, in this exact order
+    fixups = (
+        ("..", "."),
+        ("!.", "!"),
+        ("?.", "?"),
+        (" .", "."),
+        (" ,", ","),
+        (" (", ", "),
+        (") ", ", "),
+    )
+    for old, new in fixups:
+        text = text.replace(old, new)
+    return text.strip()