Hjgugugjhuhjggg committed on
Commit e83e49f · verified · 1 Parent(s): e3e53f6

Upload 28 files

api.py CHANGED
@@ -1,18 +1,17 @@
1
  from main import *
2
- from tts_api import *
3
- from stt_api import *
4
- from sentiment_api import *
5
- from imagegen_api import *
6
- from musicgen_api import *
7
- from translation_api import *
8
- from codegen_api import *
9
- from text_to_video_api import *
10
- from summarization_api import *
11
- from image_to_3d_api import *
12
  from flask import Flask, request, jsonify, Response, send_file, stream_with_context
13
  from flask_cors import CORS
14
  import torch
15
- import torch.nn as nn
16
  import torch.nn.functional as F
17
  import torchaudio
18
  import numpy as np
@@ -22,9 +21,12 @@ import tempfile
22
  import queue
23
  import json
24
  import base64
25
 
26
  app = Flask(__name__)
27
  CORS(app)
 
28
  html_code = """<!DOCTYPE html>
29
  <html lang="en">
30
  <head>
@@ -225,59 +227,6 @@ html_code = """<!DOCTYPE html>
225
  """
226
  feedback_queue = queue.Queue()
227
 
228
- class TextGenerationModel(nn.Module):
229
- def __init__(self, vocab_size, embed_dim, hidden_dim):
230
- super(TextGenerationModel, self).__init__()
231
- self.embedding = nn.Embedding(vocab_size, embed_dim)
232
- self.rnn = nn.GRU(embed_dim, hidden_dim, batch_first=True)
233
- self.fc = nn.Linear(hidden_dim, vocab_size)
234
- def forward(self, x, hidden=None):
235
- x = self.embedding(x)
236
- out, hidden = self.rnn(x, hidden)
237
- out = self.fc(out)
238
- return out, hidden
239
-
240
- vocab = ["hola", "mundo", "este", "es", "un", "ejemplo", "de", "texto", "generado", "con", "torch"]
241
- vocab_size = len(vocab)
242
- embed_dim = 16
243
- hidden_dim = 32
244
- text_model = TextGenerationModel(vocab_size, embed_dim, hidden_dim)
245
- text_model.eval()
246
-
247
- def tokenize(text):
248
- tokens = text.lower().split()
249
- indices = [vocab.index(token) if token in vocab else 0 for token in tokens]
250
- return torch.tensor(indices, dtype=torch.long).unsqueeze(0)
251
-
252
- def perform_reasoning_stream(text, temperature, top_k, top_p, repetition_penalty):
253
- input_tensor = tokenize(text)
254
- hidden = None
255
- while True:
256
- outputs, hidden = text_model(input_tensor, hidden)
257
- logits = outputs[:, -1, :] / temperature
258
- probs = F.softmax(logits, dim=-1)
259
- topk_probs, topk_indices = torch.topk(probs, min(top_k, logits.shape[-1]))
260
- chosen_index = topk_indices[0, torch.multinomial(topk_probs[0], 1).item()].item()
261
- token_str = vocab[chosen_index]
262
- yield token_str
263
- input_tensor = torch.cat([input_tensor, torch.tensor([[chosen_index]], dtype=torch.long)], dim=1)
264
- if token_str == "mundo":
265
- yield "<END_STREAM>"
266
- break
267
-
268
-
269
- class SentimentModel(nn.Module):
270
- def __init__(self, input_dim, hidden_dim, output_dim):
271
- super(SentimentModel, self).__init__()
272
- self.fc1 = nn.Linear(input_dim, hidden_dim)
273
- self.fc2 = nn.Linear(hidden_dim, output_dim)
274
- def forward(self, x):
275
- x = F.relu(self.fc1(x))
276
- x = self.fc2(x)
277
- return x
278
-
279
- sentiment_model = SentimentModel(10, 16, 2)
280
- sentiment_model.eval()
281
 
282
  @app.route("/")
283
  def index():
@@ -290,16 +239,30 @@ def generate_stream():
290
  top_k = int(request.args.get("top_k", 40))
291
  top_p = float(request.args.get("top_p", 0.0))
292
  reppenalty = float(request.args.get("reppenalty", 1.2))
293
  @stream_with_context
294
  def event_stream():
295
- try:
296
- for token in perform_reasoning_stream(text, temperature=temp, top_k=top_k, top_p=top_p, repetition_penalty=reppenalty):
297
- if token == "<END_STREAM>":
298
- yield "data: <END_STREAM>\n\n"
299
- break
300
- yield "data: " + token + "\n\n"
301
- except Exception as e:
302
- yield "data: <ERROR>\n\n"
303
  return Response(event_stream(), mimetype="text/event-stream")
304
 
305
  @app.route("/api/v1/generate", methods=["POST"])
@@ -310,15 +273,20 @@ def generate():
310
  top_k = int(data.get("top_k", 40))
311
  top_p = float(data.get("top_p", 0.0))
312
  reppenalty = float(data.get("reppenalty", 1.2))
313
- result = ""
314
- try:
315
- for token in perform_reasoning_stream(text, temperature=temp, top_k=top_k, top_p=top_p, repetition_penalty=reppenalty):
316
- if token == "<END_STREAM>":
317
- break
318
- result += token + " "
319
- except Exception as e:
320
- return jsonify({"error": str(e)}), 500
321
- return jsonify({"solidity": result.strip()})
322
 
323
  @app.route("/api/v1/feedback", methods=["POST"])
324
  def feedback():
@@ -332,116 +300,48 @@ def feedback():
332
 
333
  @app.route("/api/v1/tts", methods=["POST"])
334
  def tts_api():
335
- data = request.get_json()
336
- text = data.get("text", "")
337
- sr = 22050
338
- duration = 3.0
339
- t = torch.linspace(0, duration, int(sr * duration))
340
- frequency = 440.0
341
- audio = 0.5 * torch.sin(2 * torch.pi * frequency * t)
342
- audio = audio.unsqueeze(0)
343
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
344
- torchaudio.save(tmp.name, audio, sr)
345
- tmp_path = tmp.name
346
- return send_file(tmp_path, mimetype="audio/wav", as_attachment=True, download_name="output.wav")
347
 
348
  @app.route("/api/v1/stt", methods=["POST"])
349
  def stt_api():
350
- data = request.get_json()
351
- audio_b64 = data.get("audio", "")
352
- if audio_b64:
353
- audio_bytes = base64.b64decode(audio_b64)
354
- buf = io.BytesIO(audio_bytes)
355
- waveform, sr = torchaudio.load(buf)
356
- mean_amp = waveform.abs().mean().item()
357
- recognized_text = f"Audio processed with mean amplitude {mean_amp:.3f}"
358
- return jsonify({"text": recognized_text})
359
- return jsonify({"text": ""})
360
 
361
  @app.route("/api/v1/sentiment", methods=["POST"])
362
  def sentiment_api():
363
- data = request.get_json()
364
- text = data.get("text", "")
365
- if not text:
366
- return jsonify({"sentiment": "neutral"})
367
- ascii_vals = [ord(c) for c in text[:10]]
368
- while len(ascii_vals) < 10:
369
- ascii_vals.append(0)
370
- features = torch.tensor(ascii_vals, dtype=torch.float32).unsqueeze(0)
371
- output = sentiment_model(features)
372
- sentiment_idx = torch.argmax(output, dim=1).item()
373
- sentiment = "positivo" if sentiment_idx == 1 else "negativo"
374
- return jsonify({"sentiment": sentiment})
375
 
376
  @app.route("/api/v1/imagegen", methods=["POST"])
377
  def imagegen_api():
378
- data = request.get_json()
379
- prompt = data.get("prompt", "")
380
- image_tensor = torch.rand(3, 256, 256)
381
- np_image = image_tensor.mul(255).clamp(0, 255).byte().numpy().transpose(1, 2, 0)
382
- img = Image.fromarray(np_image)
383
- buf = io.BytesIO()
384
- img.save(buf, format="PNG")
385
- buf.seek(0)
386
- return send_file(buf, mimetype="image/png", as_attachment=True, download_name="image.png")
387
 
388
  @app.route("/api/v1/musicgen", methods=["POST"])
389
  def musicgen_api():
390
- data = request.get_json()
391
- prompt = data.get("prompt", "")
392
- sr = 22050
393
- duration = 5.0
394
- t = torch.linspace(0, duration, int(sr * duration))
395
- frequency = 440.0
396
- audio = 0.5 * torch.sin(2 * torch.pi * frequency * t)
397
- audio = audio.unsqueeze(0)
398
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
399
- torchaudio.save(tmp.name, tmp.name, sr)
400
- tmp_path = tmp.name
401
- return send_file(tmp_path, mimetype="audio/wav", as_attachment=True, download_name="music.wav")
402
 
403
  @app.route("/api/v1/translation", methods=["POST"])
404
  def translation_api():
405
- data = request.get_json()
406
- text = data.get("text", "")
407
- translated = " ".join(text.split()[::-1])
408
- return jsonify({"translated_text": translated})
409
 
410
  @app.route("/api/v1/codegen", methods=["POST"])
411
  def codegen_api():
412
- data = request.get_json()
413
- prompt = data.get("prompt", "")
414
- generated_code = f"# Generated code based on prompt: {prompt}\nprint('Hello from Torch-generated code')"
415
- return jsonify({"code": generated_code})
416
 
417
  @app.route("/api/v1/text_to_video", methods=["POST"])
418
  def text_to_video_api():
419
- data = request.get_json()
420
- prompt = data.get("prompt", "")
421
- video_tensor = torch.randint(0, 255, (10, 3, 64, 64), dtype=torch.uint8)
422
- video_bytes = video_tensor.numpy().tobytes()
423
- buf = io.BytesIO(video_bytes)
424
- return send_file(buf, mimetype="video/mp4", as_attachment=True, download_name="video.mp4")
425
 
426
  @app.route("/api/v1/summarization", methods=["POST"])
427
  def summarization_api():
428
- data = request.get_json()
429
- text = data.get("text", "")
430
- sentences = text.split('.')
431
- summary = sentences[0] if sentences[0] else text
432
- return jsonify({"summary": summary})
433
 
434
  @app.route("/api/v1/image_to_3d", methods=["POST"])
435
  def image_to_3d_api():
436
- data = request.get_json()
437
- prompt = data.get("prompt", "")
438
- obj_data = "o Cube\nv 0 0 0\nv 1 0 0\nv 1 1 0\nv 0 1 0\nf 1 2 3 4"
439
- buf = io.BytesIO(obj_data.encode("utf-8"))
440
- return send_file(buf, mimetype="text/plain", as_attachment=True, download_name="model.obj")
441
 
442
- @app.route("/api/v1/sadtalker", methods=["GET"])
443
  def sadtalker():
444
- return jsonify({"message": "Respuesta de sadtalker"})
 
445
 
446
  if __name__ == "__main__":
447
  app.run(host="0.0.0.0", port=7860)
 
1
  from main import *
2
+ from tts_api import tts_api as tts_route
3
+ from stt_api import stt_api as stt_route
4
+ from sentiment_api import sentiment_api as sentiment_route
5
+ from imagegen_api import imagegen_api as imagegen_route
6
+ from musicgen_api import musicgen_api as musicgen_route
7
+ from translation_api import translation_api as translation_route
8
+ from codegen_api import codegen_api as codegen_route
9
+ from text_to_video_api import text_to_video_api as text_to_video_route
10
+ from summarization_api import summarization_api as summarization_route
11
+ from image_to_3d_api import image_to_3d_api as image_to_3d_route
12
  from flask import Flask, request, jsonify, Response, send_file, stream_with_context
13
  from flask_cors import CORS
14
  import torch
 
15
  import torch.nn.functional as F
16
  import torchaudio
17
  import numpy as np
 
21
  import queue
22
  import json
23
  import base64
24
+ from markupsafe import Markup
25
+ from markupsafe import escape
26
 
27
  app = Flask(__name__)
28
  CORS(app)
29
+
30
  html_code = """<!DOCTYPE html>
31
  <html lang="en">
32
  <head>
 
227
  """
228
  feedback_queue = queue.Queue()
229
 
230
 
231
  @app.route("/")
232
  def index():
 
239
  top_k = int(request.args.get("top_k", 40))
240
  top_p = float(request.args.get("top_p", 0.0))
241
  reppenalty = float(request.args.get("reppenalty", 1.2))
242
+ response_queue = queue.Queue()
243
+ reasoning_queue.put({
244
+ 'text_input': text,
245
+ 'temperature': temp,
246
+ 'top_k': top_k,
247
+ 'top_p': top_p,
248
+ 'repetition_penalty': reppenalty,
249
+ 'response_queue': response_queue
250
+ })
251
  @stream_with_context
252
  def event_stream():
253
+ while True:
254
+ output = response_queue.get()
255
+ if "error" in output:
256
+ yield "data: <ERROR>\n\n"
257
+ break
258
+ text_chunk = output.get("text")
259
+ if text_chunk:
260
+ for word in text_chunk.split(' '):
261
+ clean_word = word.strip()
262
+ if clean_word:
263
+ yield "data: " + clean_word + "\n\n"
264
+ yield "data: <END_STREAM>\n\n"
265
+ break
266
  return Response(event_stream(), mimetype="text/event-stream")
267
 
268
  @app.route("/api/v1/generate", methods=["POST"])
 
273
  top_k = int(data.get("top_k", 40))
274
  top_p = float(data.get("top_p", 0.0))
275
  reppenalty = float(data.get("reppenalty", 1.2))
276
+ response_queue = queue.Queue()
277
+ reasoning_queue.put({
278
+ 'text_input': text,
279
+ 'temperature': temp,
280
+ 'top_k': top_k,
281
+ 'top_p': top_p,
282
+ 'repetition_penalty': reppenalty,
283
+ 'response_queue': response_queue
284
+ })
285
+ output = response_queue.get()
286
+ if "error" in output:
287
+ return jsonify({"error": output["error"]}), 500
288
+ result_text = output.get("text", "").strip()
289
+ return jsonify({"response": result_text})
290
 
291
  @app.route("/api/v1/feedback", methods=["POST"])
292
  def feedback():
 
300
 
301
  @app.route("/api/v1/tts", methods=["POST"])
302
  def tts_api():
303
+ return tts_route()
304
 
305
  @app.route("/api/v1/stt", methods=["POST"])
306
  def stt_api():
307
+ return stt_route()
308
 
309
  @app.route("/api/v1/sentiment", methods=["POST"])
310
  def sentiment_api():
311
+ return sentiment_route()
312
 
313
  @app.route("/api/v1/imagegen", methods=["POST"])
314
  def imagegen_api():
315
+ return imagegen_route()
316
 
317
  @app.route("/api/v1/musicgen", methods=["POST"])
318
  def musicgen_api():
319
+ return musicgen_route()
320
 
321
  @app.route("/api/v1/translation", methods=["POST"])
322
  def translation_api():
323
+ return translation_route()
324
 
325
  @app.route("/api/v1/codegen", methods=["POST"])
326
  def codegen_api():
327
+ return codegen_route()
328
 
329
  @app.route("/api/v1/text_to_video", methods=["POST"])
330
  def text_to_video_api():
331
+ return text_to_video_route()
332
 
333
  @app.route("/api/v1/summarization", methods=["POST"])
334
  def summarization_api():
335
+ return summarization_route()
336
 
337
  @app.route("/api/v1/image_to_3d", methods=["POST"])
338
  def image_to_3d_api():
339
+ return image_to_3d_route()
340
 
341
+ @app.route("/api/v1/sadtalker", methods=["POST"])
342
  def sadtalker():
343
+ from sadtalker_api import router as sadtalker_router
344
+ return sadtalker_router.create_video()
345
 
346
  if __name__ == "__main__":
347
  app.run(host="0.0.0.0", port=7860)
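
Note: a minimal client sketch for the refactored api.py endpoints above. This is a hedged example, not part of the commit: it assumes the Flask app is running locally on port 7860, that the streaming route is exposed at /api/v1/generate_stream as wired up in constants.py below, and that the requests package is available.

import requests

BASE = "http://localhost:7860"

# Non-streaming generation: POST /api/v1/generate returns {"response": "..."}.
resp = requests.post(BASE + "/api/v1/generate", json={"text": "hello world"})
print(resp.json())

# Streaming generation: the SSE endpoint emits "data: <token>" lines until <END_STREAM>.
with requests.get(BASE + "/api/v1/generate_stream", params={"text": "hello world"}, stream=True) as r:
    for line in r.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        token = line[len("data: "):]
        if token == "<END_STREAM>":
            break
        print(token, end=" ")
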
background_tasks.py CHANGED
@@ -114,46 +114,12 @@ def background_training():
114
  except Exception:
115
  time.sleep(5)
116
 
117
- class ReasoningModel(nn.Module):
118
- def __init__(self, vocab_size, embed_dim=128, hidden_dim=128):
119
- super(ReasoningModel, self).__init__()
120
- self.embedding = nn.Embedding(vocab_size, embed_dim)
121
- self.rnn = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
122
- self.fc = nn.Linear(hidden_dim, vocab_size)
123
- def forward(self, x, hidden=None):
124
- emb = self.embedding(x)
125
- output, hidden = self.rnn(emb, hidden)
126
- logits = self.fc(output)
127
- return logits, hidden
128
- def generate(self, input_seq, max_length=999999999, temperature=1.0):
129
- self.eval()
130
- tokens = input_seq.copy()
131
- hidden = None
132
- generated = []
133
- while True:
134
- input_tensor = torch.tensor([tokens], dtype=torch.long)
135
- logits, hidden = self.forward(input_tensor, hidden)
136
- next_token_logits = logits[0, -1, :] / temperature
137
- probabilities = torch.softmax(next_token_logits, dim=0)
138
- next_token = torch.multinomial(probabilities, 1).item()
139
- tokens.append(next_token)
140
- generated.append(next_token)
141
- if next_token == word_to_index.get("<EOS>"):
142
- break
143
- if len(generated) > max_length:
144
- break
145
- return generated
146
-
147
- reasoning_model = ReasoningModel(len(vocabulary))
148
-
149
  def perform_reasoning_stream(text_input, temperature=0.7, top_k=40, top_p=0.0, repetition_penalty=1.2):
150
- tokens = tokenize_text(text_input)
151
- update_vocabulary(tokens)
152
- tokens_indices = [word_to_index.get(token, 0) for token in tokens]
153
- generated_indices = reasoning_model.generate(tokens_indices, max_length=999999999, temperature=temperature)
154
- for idx in generated_indices:
155
- yield vocabulary[idx] + " "
156
- yield "<END_STREAM>"
157
 
158
  def background_reasoning_queue():
159
  global reasoning_queue, seen_responses
@@ -179,7 +145,7 @@ def background_reasoning_queue():
179
  if chunk == "<END_STREAM>":
180
  break
181
  full_response += chunk
182
- cleaned_response = re.sub(r'\s+(?=[.,,。])', '', full_response.replace("<|endoftext|>", "")).strip()
183
  if cleaned_response in seen_responses:
184
  final_response = "**Response is repetitive. Please try again or rephrase your query.**";
185
  resp_queue.put({"text": final_response})
 
114
  except Exception:
115
  time.sleep(5)
116
 
117
  def perform_reasoning_stream(text_input, temperature=0.7, top_k=40, top_p=0.0, repetition_penalty=1.2):
118
+ for token in sample_sequence(text_input, model_gpt2, enc, length=999999999, temperature=temperature, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, device=device):
119
+ if token == "<END_STREAM>":
120
+ yield "<END_STREAM>"
121
+ break
122
+ yield token + " "
123
 
124
  def background_reasoning_queue():
125
  global reasoning_queue, seen_responses
 
145
  if chunk == "<END_STREAM>":
146
  break
147
  full_response += chunk
148
+ cleaned_response = re.sub(r'\s+(?=[.,,。])', '', full_response.replace("<|endoftext|>", "").strip())
149
  if cleaned_response in seen_responses:
150
  final_response = "**Response is repetitive. Please try again or rephrase your query.**";
151
  resp_queue.put({"text": final_response})
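
The rewrite above hands each request to the background worker through reasoning_queue: the handler enqueues a job dict carrying its own response_queue, and the worker streams GPT-2 output back on it. A stripped-down, self-contained sketch of that handshake follows (the echo worker is a hypothetical stand-in, not the real sample_sequence-based consumer):

import queue
import threading

reasoning_queue = queue.Queue()

def worker():
    # Background consumer: pull a job and answer on its per-request queue.
    while True:
        job = reasoning_queue.get()
        resp_q = job["response_queue"]
        try:
            resp_q.put({"text": " ".join(job["text_input"].split()[:5])})  # stand-in for real generation
        except Exception as exc:
            resp_q.put({"error": str(exc)})

threading.Thread(target=worker, daemon=True).start()

# Producer side, mirroring what the Flask handlers do:
response_queue = queue.Queue()
reasoning_queue.put({"text_input": "hello queue world", "temperature": 0.7,
                     "top_k": 40, "top_p": 0.0, "repetition_penalty": 1.2,
                     "response_queue": response_queue})
print(response_queue.get())  # {'text': 'hello queue world'} or {'error': ...}
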
codegen_api.py CHANGED
@@ -2,10 +2,10 @@ from flask import jsonify, send_file, request
2
  from main import *
3
 
4
  def generate_code(prompt, output_path="output_code.py"):
5
- if codegen_model is None:
6
- return "Code generation model not initialized."
7
- input_ids = codegen_tokenizer.encode(prompt, return_tensors='pt').to(device)
8
- output = codegen_model.generate(input_ids, max_length=999999999, temperature=0.7, top_p=0.9)
9
  code = codegen_tokenizer.decode(output[0], skip_special_tokens=True)
10
  with open(output_path, "w") as file:
11
  file.write(code)
@@ -17,6 +17,6 @@ def codegen_api():
17
  if not prompt:
18
  return jsonify({"error": "Prompt is required"}), 400
19
  output_file = generate_code(prompt)
20
- if output_file == "Code generation model not initialized.":
21
  return jsonify({"error": "Code generation failed"}), 500
22
  return send_file(output_file, mimetype="text/x-python", as_attachment=True, download_name="output.py")
 
2
  from main import *
3
 
4
  def generate_code(prompt, output_path="output_code.py"):
5
+ if codegen_model is None or codegen_tokenizer is None:
6
+ return "Code generation model or tokenizer not initialized."
7
+ input_ids = codegen_tokenizer(prompt, return_tensors='pt').input_ids.to(device)
8
+ output = codegen_model.generate(input_ids, max_length=2048, temperature=0.7, top_p=0.9)
9
  code = codegen_tokenizer.decode(output[0], skip_special_tokens=True)
10
  with open(output_path, "w") as file:
11
  file.write(code)
 
17
  if not prompt:
18
  return jsonify({"error": "Prompt is required"}), 400
19
  output_file = generate_code(prompt)
20
+ if output_file == "Code generation model or tokenizer not initialized.":
21
  return jsonify({"error": "Code generation failed"}), 500
22
  return send_file(output_file, mimetype="text/x-python", as_attachment=True, download_name="output.py")
configs.py CHANGED
@@ -58,11 +58,33 @@ class CodeGenConfig:
58
 
59
  class SummarizationConfig:
60
  def __init__(self):
61
- self.vocab_size = 10000
62
- self.embedding_dim = 256
63
- self.hidden_dim = 512
64
- self.num_layers = 2
65
- self.max_seq_len = 512
66
 
67
  class Clip4ClipConfig:
68
  def __init__(self, vocab_size=30522, hidden_size=512, num_hidden_layers=6, num_attention_heads=8, intermediate_size=2048, hidden_act="gelu", hidden_dropout_prob=0.0, attention_probs_dropout_prob=0.0, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, bos_token_id=1, eos_token_id=2, **kwargs):
 
58
 
59
  class SummarizationConfig:
60
  def __init__(self):
61
+ self.vocab_size = 50265
62
+ self.max_position_embeddings = 1024
63
+ self.encoder_layers = 12
64
+ self.encoder_ffn_dim = 4096
65
+ self.encoder_attention_heads = 16
66
+ self.decoder_layers = 12
67
+ self.decoder_ffn_dim = 4096
68
+ self.decoder_attention_heads = 16
69
+ self.encoder_layerdrop = 0.0
70
+ self.decoder_layerdrop = 0.0
71
+ self.activation_function = "gelu"
72
+ self.d_model = 1024
73
+ self.dropout = 0.1
74
+ self.attention_dropout = 0.0
75
+ self.activation_dropout = 0.0
76
+ self.init_std = 0.02
77
+ self.classifier_dropout = 0.0
78
+ self.num_labels = 3
79
+ self.pad_token_id = 1
80
+ self.bos_token_id = 0
81
+ self.eos_token_id = 2
82
+ self.layer_norm_eps = 1e-05
83
+ self.num_beams = 4
84
+ self.early_stopping = True
85
+ self.max_length = 100
86
+ self.min_length = 30
87
+ self.scale_embedding = False
88
 
89
  class Clip4ClipConfig:
90
  def __init__(self, vocab_size=30522, hidden_size=512, num_hidden_layers=6, num_attention_heads=8, intermediate_size=2048, hidden_act="gelu", hidden_dropout_prob=0.0, attention_probs_dropout_prob=0.0, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, bos_token_id=1, eos_token_id=2, **kwargs):
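
The rewritten SummarizationConfig now carries BART-large-style hyperparameters plus generation defaults. A small sketch of how those generation fields would typically be collected into Hugging Face-style generate() keyword arguments; the consuming model is an assumption and is not part of this commit:

cfg = SummarizationConfig()

# Generation kwargs taken from the config; any seq2seq model following the
# Hugging Face generate() convention could consume them.
gen_kwargs = dict(
    num_beams=cfg.num_beams,
    max_length=cfg.max_length,
    min_length=cfg.min_length,
    early_stopping=cfg.early_stopping,
)
print(gen_kwargs)  # {'num_beams': 4, 'max_length': 100, 'min_length': 30, 'early_stopping': True}
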
constants.py CHANGED
@@ -158,13 +158,7 @@ html_code = """<!DOCTYPE html>
158
  top_p: top_p_val,
159
  reppenalty: repetition_penalty_val
160
  };
161
- eventSource = new EventSource('/generate_stream', {
162
- headers: {
163
- 'Content-Type': 'application/json'
164
- },
165
- method: 'POST',
166
- body: JSON.stringify(requestData)
167
- });
168
  eventSource.onmessage = function(event) {
169
  if (event.data === "<END_STREAM>") {
170
  eventSource.close();
 
158
  top_p: top_p_val,
159
  reppenalty: repetition_penalty_val
160
  };
161
+ eventSource = new EventSource('/api/v1/generate_stream?' + new URLSearchParams(requestData).toString());
 
162
  eventSource.onmessage = function(event) {
163
  if (event.data === "<END_STREAM>") {
164
  eventSource.close();
extensions.py CHANGED
@@ -159,6 +159,86 @@ class RealESRGANer():
159
  output_img = cv2.cvtColor(output_img, cv2.COLOR_BGR2RGB)
160
  return [output_img, None]
161
 
162
  def save_video_with_watermark(video_frames, audio_path, output_path, watermark_path='./assets/sadtalker_logo.png'):
163
  try:
164
  watermark = imageio.imread(watermark_path)
@@ -249,4 +329,4 @@ def get_prior_from_bfm(bfm_path):
249
  'u_tex': u_tex,
250
  'u_exp': u_exp
251
  }
252
- return prior_coeff
 
159
  output_img = cv2.cvtColor(output_img, cv2.COLOR_BGR2RGB)
160
  return [output_img, None]
161
 
162
+ def enhance(self, img, outscale=None, tile=None, tile_pad=None, pre_pad=None, half=None):
163
+ h_input, w_input = img.shape[0:2]
164
+ if outscale is None:
165
+ outscale = self.scale
166
+ if tile is None:
167
+ tile = self.tile
168
+ if tile_pad is None:
169
+ tile_pad = self.tile_pad
170
+ if pre_pad is None:
171
+ pre_pad = self.pre_pad
172
+ if half is None:
173
+ half = self.half
174
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
175
+ img_tensor = img2tensor(img)
176
+ img_tensor = img_tensor.unsqueeze(0).to(self.device)
177
+ if half:
178
+ img_tensor = img_tensor.half()
179
+ mod_scale = self.mod_scale
180
+ h_pad, w_pad = 0, 0
181
+ if mod_scale is not None:
182
+ h_pad, w_pad = int(np.ceil(h_input / mod_scale) * mod_scale - h_input), int(np.ceil(w_input / mod_scale) * mod_scale - w_input)
183
+ img_tensor = F.pad(img_tensor, (0, w_pad, 0, h_pad), 'reflect')
184
+ window_size = 256
185
+ scale = self.scale
186
+ overlap_ratio = 0.5
187
+ if w_input * h_input < window_size**2:
188
+ tile = None
189
+ if tile is not None and tile > 0:
190
+ tile_overlap = tile * overlap_ratio
191
+ sf = scale
192
+ stride_w = math.ceil(tile - tile_overlap)
193
+ stride_h = math.ceil(tile - tile_overlap)
194
+ numW = math.ceil((w_input + tile_overlap) / stride_w)
195
+ numH = math.ceil((h_input + tile_overlap) / stride_h)
196
+ paddingW = (numW - 1) * stride_w + tile - w_input
197
+ paddingH = (numH - 1) * stride_h + tile - h_input
198
+ padding_bottom = int(max(paddingH, 0))
199
+ padding_right = int(max(paddingW, 0))
200
+ padding_left, padding_top = 0, 0
201
+ img_tensor = F.pad(img_tensor, (padding_left, padding_right, padding_top, padding_bottom), mode='reflect')
202
+ output_h, output_w = padding_top + h_input * scale + padding_bottom, padding_left + w_input * scale + padding_right
203
+ output_tensor = torch.zeros([1, 3, output_h, output_w], dtype=img_tensor.dtype, device=self.device)
204
+ windows = []
205
+ for row in range(numH):
206
+ for col in range(numW):
207
+ start_x = col * stride_w
208
+ start_y = row * stride_h
209
+ end_x = min(start_x + tile, img_tensor.shape[3])
210
+ end_y = min(start_y + tile, img_tensor.shape[2])
211
+ windows.append(img_tensor[:, :, start_y:end_y, start_x:end_x])
212
+ results = []
213
+ batch_size = 8
214
+ for i in range(0, len(windows), batch_size):
215
+ batch_windows = torch.stack(windows[i:min(i + batch_size, len(windows))], dim=0)
216
+ with torch.no_grad():
217
+ results.append(self.model(batch_windows))
218
+ results = torch.cat(results, dim=0)
219
+ count = 0
220
+ for row in range(numH):
221
+ for col in range(numW):
222
+ start_x = col * stride_w
223
+ start_y = row * stride_h
224
+ end_x = min(start_x + tile, img_tensor.shape[3])
225
+ end_y = min(start_y + tile, img_tensor.shape[2])
226
+ out_start_x, out_start_y = start_x * sf, start_y * sf
227
+ out_end_x, out_end_y = end_x * sf, end_y * sf
228
+ output_tensor[:, :, out_start_y:out_end_y, out_start_x:out_end_x] += results[count][:, :, :end_y * sf - out_start_y, :end_x * sf - out_start_x]
229
+ count += 1
230
+ forward_img = output_tensor[:, :, :h_input * sf, :w_input * sf]
231
+ else:
232
+ with torch.no_grad():
233
+ forward_img = self.model(img_tensor)
234
+ if half:
235
+ forward_img = forward_img.float()
236
+ output_img = tensor2img(forward_img.squeeze(0).clamp_(0, 1))
237
+ if mod_scale is not None:
238
+ output_img = output_img[:h_input * self.scale, :w_input * self.scale, ...]
239
+ output_img = cv2.cvtColor(output_img, cv2.COLOR_BGR2RGB)
240
+ return [output_img, None]
241
+
242
  def save_video_with_watermark(video_frames, audio_path, output_path, watermark_path='./assets/sadtalker_logo.png'):
243
  try:
244
  watermark = imageio.imread(watermark_path)
 
329
  'u_tex': u_tex,
330
  'u_exp': u_exp
331
  }
332
+ return prior_coeff
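
Usage sketch for the re-added tiled enhance() method above (hedged: it assumes an already-constructed RealESRGANer instance named upsampler and an image file on disk; the 256-pixel window in the implementation caps per-tile memory):

import cv2

# upsampler is assumed to be a RealESRGANer built elsewhere (model, scale, device, ...).
img = cv2.cvtColor(cv2.imread("face.png"), cv2.COLOR_BGR2RGB)  # enhance() expects RGB input
output_img, _ = upsampler.enhance(img, outscale=2, tile=256, tile_pad=10, pre_pad=0, half=False)
cv2.imwrite("face_upscaled.png", cv2.cvtColor(output_img, cv2.COLOR_RGB2BGR))
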
imagegen_api.py CHANGED
@@ -10,10 +10,11 @@ def generate_image(prompt, output_path="output_image.png"):
10
  return "Image generation model not initialized."
11
 
12
  generator = torch.Generator(device=device).manual_seed(0)
13
- image = imagegen_model(
14
- prompt,
15
- generator=generator,
16
- ).images[0]
 
17
  image.save(output_path)
18
  return output_path
19
 
 
10
  return "Image generation model not initialized."
11
 
12
  generator = torch.Generator(device=device).manual_seed(0)
13
+ with torch.no_grad():
14
+ image = imagegen_model(
15
+ prompt,
16
+ generator=generator,
17
+ ).images[0]
18
  image.save(output_path)
19
  return output_path
20
 
main.py CHANGED
@@ -7,7 +7,7 @@ import re
7
  import json
8
  from flask import Flask
9
  from flask_cors import CORS
10
- from api import *
11
  from extensions import *
12
  from constants import *
13
  from configs import *
@@ -17,8 +17,7 @@ from model_loader import *
17
  from utils import *
18
  from background_tasks import generate_and_queue_text, background_training, background_reasoning_queue
19
  from text_generation import *
20
- from sadtalker_utils import *
21
- import torch
22
 
23
  state_dict = None
24
  enc = None
@@ -59,7 +58,7 @@ tts_model = None
59
  musicgen_model = None
60
 
61
  def load_models():
62
- global model_gpt2, enc, translation_model, codegen_model, codegen_tokenizer, codegen_vocabulary, codegen_index_to_word, codegen_word_to_index, summarization_model, imagegen_model, image_to_3d_model, text_to_video_model, sadtalker_instance, sentiment_model, stt_model, tts_model, musicgen_model, checkpoint_path, gfpgan_model_file, restoreformer_model_file, codeformer_model_file, realesrgan_model_file, kp_file, aud_file, wav_file, gen_file, mapx_file, den_file
63
  model_gpt2, enc = initialize_gpt2_model(GPT2_FOLDER, {MODEL_FILE: MODEL_URL, ENCODER_FILE: ENCODER_URL, VOCAB_FILE: VOCAB_URL, CONFIG_FILE: GPT2CONFHG})
64
  translation_model = initialize_translation_model(TRANSLATION_FOLDER, TRANSLATION_MODEL_FILES_URLS)
65
  codegen_model, codegen_tokenizer, codegen_vocabulary, codegen_index_to_word, codegen_word_to_index = initialize_codegen_model(CODEGEN_FOLDER, CODEGEN_FILES_URLS)
@@ -71,35 +70,7 @@ def load_models():
71
  stt_model = initialize_stt_model(STT_FOLDER, STT_FILES_URLS)
72
  tts_model = initialize_tts_model(TTS_FOLDER, TTS_FILES_URLS)
73
  musicgen_model = initialize_musicgen_model(MUSICGEN_FOLDER, MUSICGEN_FILES_URLS)
74
-
75
- class SimpleClassifier(torch.nn.Module):
76
- def __init__(self, vocab_size, num_classes):
77
- super(SimpleClassifier, self).__init__()
78
- self.embedding = torch.nn.Embedding(vocab_size, 128)
79
- self.linear = torch.nn.Linear(128, num_classes)
80
- def forward(self, x):
81
- embedded = self.embedding(x)
82
- pooled = torch.mean(embedded, dim=1)
83
- return self.linear(pooled)
84
-
85
- def tokenize_text(text):
86
- global vocabulary, word_to_index, index_to_word
87
- tokens = text.lower().split()
88
- for token in tokens:
89
- if token not in vocabulary:
90
- vocabulary.add(token)
91
- word_to_index[token] = len(index_to_word)
92
- index_to_word.append(token)
93
- return tokens
94
-
95
- def text_to_vector(text):
96
- global vocabulary, word_to_index
97
- tokens = tokenize_text(text)
98
- vector = torch.zeros(len(vocabulary))
99
- for token in tokens:
100
- if token in word_to_index:
101
- vector[word_to_index[token]] += 1
102
- return vector
103
 
104
  if __name__ == "__main__":
105
  nltk.download('punkt')
@@ -115,4 +86,4 @@ if __name__ == "__main__":
115
  background_threads.append(threading.Thread(target=background_reasoning_queue, daemon=True))
116
  for thread in background_threads:
117
  thread.start()
118
- app.run(host='0.0.0.0', port=7860)
 
7
  import json
8
  from flask import Flask
9
  from flask_cors import CORS
10
+ from api import app
11
  from extensions import *
12
  from constants import *
13
  from configs import *
 
17
  from utils import *
18
  from background_tasks import generate_and_queue_text, background_training, background_reasoning_queue
19
  from text_generation import *
20
+ from sadtalker_utils import SadTalker
 
21
 
22
  state_dict = None
23
  enc = None
 
58
  musicgen_model = None
59
 
60
  def load_models():
61
+ global model_gpt2, enc, translation_model, codegen_model, codegen_tokenizer, codegen_vocabulary, codegen_index_to_word, codegen_word_to_index, summarization_model, imagegen_model, image_to_3d_model, text_to_video_model, sadtalker_instance, sentiment_model, stt_model, tts_model, musicgen_model
62
  model_gpt2, enc = initialize_gpt2_model(GPT2_FOLDER, {MODEL_FILE: MODEL_URL, ENCODER_FILE: ENCODER_URL, VOCAB_FILE: VOCAB_URL, CONFIG_FILE: GPT2CONFHG})
63
  translation_model = initialize_translation_model(TRANSLATION_FOLDER, TRANSLATION_MODEL_FILES_URLS)
64
  codegen_model, codegen_tokenizer, codegen_vocabulary, codegen_index_to_word, codegen_word_to_index = initialize_codegen_model(CODEGEN_FOLDER, CODEGEN_FILES_URLS)
 
70
  stt_model = initialize_stt_model(STT_FOLDER, STT_FILES_URLS)
71
  tts_model = initialize_tts_model(TTS_FOLDER, TTS_FILES_URLS)
72
  musicgen_model = initialize_musicgen_model(MUSICGEN_FOLDER, MUSICGEN_FILES_URLS)
73
+ sadtalker_instance = SadTalker(checkpoint_path='./checkpoints', config_path='./src/config')
74
 
75
  if __name__ == "__main__":
76
  nltk.download('punkt')
 
86
  background_threads.append(threading.Thread(target=background_reasoning_queue, daemon=True))
87
  for thread in background_threads:
88
  thread.start()
89
+ app.run(host='0.0.0.0', port=7860)
model_loader.py CHANGED
@@ -265,7 +265,7 @@ class ResnetBlock(nn.Module):
265
  sc = self.conv_shortcut(x)
266
  h = F.silu(self.norm1(x))
267
  h = self.conv1(h)
268
- h = F.silu(self.norm2(h))
269
  h = self.conv2(h)
270
  return h + sc
271
 
 
265
  sc = self.conv_shortcut(x)
266
  h = F.silu(self.norm1(x))
267
  h = self.conv1(h)
268
+ h = F.silu(self.norm2(x))
269
  h = self.conv2(h)
270
  return h + sc
271
 
models.py CHANGED
@@ -91,4 +91,4 @@ class MusicGenModel(nn.Module):
91
  audio_output.append(predicted_token.cpu())
92
  input_tokens = torch.cat((input_tokens, predicted_token), dim=1)
93
  audio_output = torch.cat(audio_output, dim=1).float()
94
- return audio_output
 
91
  audio_output.append(predicted_token.cpu())
92
  input_tokens = torch.cat((input_tokens, predicted_token), dim=1)
93
  audio_output = torch.cat(audio_output, dim=1).float()
94
+ return audio_output
musicgen_api.py CHANGED
@@ -11,7 +11,7 @@ def generate_music(prompt, output_path="output_music.wav"):
11
 
12
  attributes = [prompt]
13
  sample_rate = 32000
14
- duration = 60
15
  audio_values = musicgen_model.sample(
16
  attributes=attributes,
17
  sample_rate=sample_rate,
 
11
 
12
  attributes = [prompt]
13
  sample_rate = 32000
14
+ duration = 10
15
  audio_values = musicgen_model.sample(
16
  attributes=attributes,
17
  sample_rate=sample_rate,
sadtalker_api.py CHANGED
@@ -157,7 +157,7 @@ async def websocket_endpoint(websocket: WebSocket):
157
  transcription_text_file = speech_to_text_func(tmp_audio_file.name)
158
  with open(transcription_text_file, 'r') as f:
159
  transcription_text = f.read()
160
- response_stream = perform_reasoning_stream(f"respond to this sentence in 10 words or less {transcription_text}", 0.7, 40, 0.0, 1.2)
161
  response_text = ""
162
  for chunk in response_stream:
163
  if chunk == "<END_STREAM>":
@@ -198,3 +198,7 @@ async def websocket_endpoint(websocket: WebSocket):
198
  except Exception as e:
199
  print(e)
200
  await websocket.send_json({"error":str(e)})
 
157
  transcription_text_file = speech_to_text_func(tmp_audio_file.name)
158
  with open(transcription_text_file, 'r') as f:
159
  transcription_text = f.read()
160
+ response_stream = perform_reasoning_stream(transcription_text, 0.7, 40, 0.0, 1.2)
161
  response_text = ""
162
  for chunk in response_stream:
163
  if chunk == "<END_STREAM>":
 
198
  except Exception as e:
199
  print(e)
200
  await websocket.send_json({"error":str(e)})
201
+
202
+ router = APIRouter()
203
+ router.add_api_route("/sadtalker", create_video, methods=["POST"])
204
+ router.add_api_websocket_route("/ws", websocket_endpoint)
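
The Flask route in api.py calls this router's handler directly; if the router were served on its own, a hypothetical mounting would look like the sketch below (the uvicorn/FastAPI wiring is an illustration, not part of this commit):

import uvicorn
from fastapi import FastAPI
from sadtalker_api import router as sadtalker_router

app = FastAPI()
app.include_router(sadtalker_router, prefix="/api/v1")  # POST /api/v1/sadtalker, WS /api/v1/ws

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
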
sadtalker_utils.py CHANGED
@@ -269,32 +269,33 @@ class SadTalker:
269
  self.sadtalker_model = SadTalkerModel(self.cfg, device_id=[0])
270
 
271
  def get_cfg_defaults(self):
272
- return {
273
- 'MODEL': {
274
- 'CHECKPOINTS_DIR': '',
275
- 'CONFIG_DIR': '',
276
- 'DEVICE': self.device,
277
- 'SCALE': 64,
278
- 'NUM_VOXEL_FRAMES': 8,
279
- 'NUM_MOTION_FRAMES': 10,
280
- 'MAX_FEATURES': 256,
281
- 'DRIVEN_AUDIO_SAMPLE_RATE': 16000,
282
- 'VIDEO_FPS': 25,
283
- 'OUTPUT_VIDEO_FPS': None,
284
- 'OUTPUT_AUDIO_SAMPLE_RATE': None,
285
- 'USE_ENHANCER': False,
286
- 'ENHANCER_NAME': '',
287
- 'BG_UPSAMPLER': None,
288
- 'IS_HALF': False
289
- },
290
- 'INPUT_IMAGE': {}
291
- }
292
 
293
  def merge_from_file(self, filepath):
294
  if os.path.exists(filepath):
295
  with open(filepath, 'r') as f:
296
  cfg_from_file = yaml.safe_load(f)
297
- self.cfg.update(cfg_from_file)
 
298
 
299
  def test(self, source_image, driven_audio, preprocess='crop', still_mode=False, use_enhancer=False,
300
  batch_size=1, size=256, pose_style=0, exp_scale=1.0, use_ref_video=False, ref_video=None,
@@ -310,7 +311,7 @@ class SadTalkerModel:
310
 
311
  def __init__(self, sadtalker_cfg, device_id=[0]):
312
  self.cfg = sadtalker_cfg
313
- self.device = sadtalker_cfg['MODEL'].get('DEVICE', 'cpu')
314
  self.sadtalker = SadTalkerInnerModel(sadtalker_cfg, device_id)
315
  self.preprocesser = self.sadtalker.preprocesser
316
  self.kp_extractor = self.sadtalker.kp_extractor
@@ -389,7 +390,7 @@ class SadTalkerInner:
389
  ref_pose_coeff = None
390
  ref_expression_coeff = None
391
  audio_tensor, audio_sample_rate = proc.process_audio(self.driven_audio,
392
- self.sadtalker_model.cfg['MODEL']['DRIVEN_AUDIO_SAMPLE_RATE'])
393
  batch = {
394
  'source_image': source_image_tensor.unsqueeze(0).to(self.device),
395
  'audio': audio_tensor.unsqueeze(0).to(self.device),
@@ -455,12 +456,11 @@ class SadTalkerInner:
455
  audio_name = os.path.splitext(os.path.basename(self.driven_audio))[0]
456
  output_video_path = os.path.join(self.result_dir, base_name + '_' + audio_name + '.mp4')
457
  self.output_path = output_video_path
458
- video_fps = self.sadtalker_model.cfg['MODEL']['VIDEO_FPS'] if self.sadtalker_model.cfg['MODEL'][
459
- 'OUTPUT_VIDEO_FPS'] is None else \
460
- self.sadtalker_model.cfg['MODEL']['OUTPUT_VIDEO_FPS']
461
- audio_output_sample_rate = self.sadtalker_model.cfg['MODEL']['DRIVEN_AUDIO_SAMPLE_RATE'] if \
462
- self.sadtalker_model.cfg['MODEL']['OUTPUT_AUDIO_SAMPLE_RATE'] is None else \
463
- self.sadtalker_model.cfg['MODEL']['OUTPUT_AUDIO_SAMPLE_RATE']
464
  if self.use_enhancer:
465
  enhanced_path = os.path.join(self.result_dir, base_name + '_' + audio_name + '_enhanced.mp4')
466
  save_video_with_watermark(output_video, self.driven_audio, enhanced_path)
@@ -489,13 +489,12 @@ class SadTalkerInnerModel:
489
 
490
  def __init__(self, sadtalker_cfg, device_id=[0]):
491
  self.cfg = sadtalker_cfg
492
- self.device = sadtalker_cfg['MODEL'].get('DEVICE', 'cpu')
493
  self.preprocesser = Preprocesser(sadtalker_cfg, self.device)
494
  self.kp_extractor = KeyPointExtractor(sadtalker_cfg, self.device)
495
  self.audio_to_coeff = Audio2Coeff(sadtalker_cfg, self.device)
496
  self.animate_from_coeff = AnimateFromCoeff(sadtalker_cfg, self.device)
497
- self.face_enhancer = FaceEnhancer(sadtalker_cfg, self.device) if sadtalker_cfg['MODEL'][
498
- 'USE_ENHANCER'] else None
499
  self.generator = Generator(sadtalker_cfg, self.device)
500
  self.mapping = Mapping(sadtalker_cfg, self.device)
501
  self.he_estimator = OcclusionAwareDenseMotion(sadtalker_cfg, self.device)
@@ -506,10 +505,10 @@ class Preprocesser:
506
  def __init__(self, sadtalker_cfg, device):
507
  self.cfg = sadtalker_cfg
508
  self.device = device
509
- if self.cfg['INPUT_IMAGE'].get('OLD_VERSION', False):
510
- self.face3d_helper = Face3DHelperOld(self.cfg['INPUT_IMAGE'].get('LOCAL_PCA_PATH', ''), device)
511
  else:
512
- self.face3d_helper = Face3DHelper(self.cfg['INPUT_IMAGE'].get('LOCAL_PCA_PATH', ''), device)
513
  self.mouth_detector = MouthDetector()
514
 
515
  def crop(self, source_image_pil, preprocess_type, size=256):
@@ -543,7 +542,7 @@ class Preprocesser:
543
  cropped_image_pil = cropped_image_pil.resize((size, size), Image.Resampling.LANCZOS)
544
  source_image_tensor = self.img2tensor(cropped_image_pil)
545
  return source_image_tensor, [[y_min, y_max], [x_min, x_max], old_size, cropped_image_pil.size], os.path.basename(
546
- self.cfg['INPUT_IMAGE'].get('SOURCE_IMAGE', ''))
547
 
548
  def img2tensor(self, img):
549
  img = np.array(img).astype(np.float32) / 255.0
@@ -577,7 +576,7 @@ class Preprocesser:
577
  return ref_expression_coeff
578
 
579
  def generate_idles_pose(self, length_of_audio, pose_style):
580
- num_frames = int(length_of_audio * self.cfg['MODEL']['VIDEO_FPS'])
581
  ref_pose_coeff = torch.zeros((num_frames, 64), dtype=torch.float32).to(self.device)
582
  start_pose = self.generate_still_pose(pose_style)
583
  end_pose = self.generate_still_pose(pose_style)
@@ -587,7 +586,7 @@ class Preprocesser:
587
  return ref_pose_coeff
588
 
589
  def generate_idles_expression(self, length_of_audio):
590
- num_frames = int(length_of_audio * self.cfg['MODEL']['VIDEO_FPS'])
591
  ref_expression_coeff = torch.zeros((num_frames, 64), dtype=torch.float32).to(self.device)
592
  start_exp = self.generate_still_expression(1.0)
593
  end_exp = self.generate_still_expression(1.0)
@@ -601,11 +600,11 @@ class KeyPointExtractor(nn.Module):
601
 
602
  def __init__(self, sadtalker_cfg, device):
603
  super(KeyPointExtractor, self).__init__()
604
- self.kp_extractor = OcclusionAwareKPDetector(kp_channels=sadtalker_cfg['MODEL']['NUM_MOTION_FRAMES'],
605
  num_kp=10,
606
  num_dilation_blocks=2,
607
  dropout_rate=0.1).to(device)
608
- checkpoint_path = os.path.join(sadtalker_cfg['MODEL']['CHECKPOINTS_DIR'], 'kp_detector.safetensors')
609
  self.load_kp_detector(checkpoint_path, device)
610
 
611
  def load_kp_detector(self, checkpoint_path, device):
@@ -628,12 +627,12 @@ class Audio2Coeff(nn.Module):
628
  def __init__(self, sadtalker_cfg, device):
629
  super(Audio2Coeff, self).__init__()
630
  self.audio_model = Wav2Vec2Model().to(device)
631
- checkpoint_path = os.path.join(sadtalker_cfg['MODEL']['CHECKPOINTS_DIR'], 'wav2vec2.pth')
632
  self.load_audio_model(checkpoint_path, device)
633
  self.pose_mapper = AudioCoeffsPredictor(2048, 64).to(device)
634
  self.exp_mapper = AudioCoeffsPredictor(2048, 64).to(device)
635
  self.blink_mapper = AudioCoeffsPredictor(2048, 1).to(device)
636
- mapping_checkpoint = os.path.join(sadtalker_cfg['MODEL']['CHECKPOINTS_DIR'], 'audio2pose_00140-model.pth')
637
  self.load_mapping_model(mapping_checkpoint, device)
638
 
639
  def load_audio_model(self, checkpoint_path, device):
@@ -753,13 +752,13 @@ class Generator(nn.Module):
753
 
754
  def __init__(self, sadtalker_cfg, device):
755
  super(Generator, self).__init__()
756
- self.generator = Hourglass(block_expansion=sadtalker_cfg['MODEL']['SCALE'],
757
- num_blocks=sadtalker_cfg['MODEL']['NUM_VOXEL_FRAMES'],
758
- max_features=sadtalker_cfg['MODEL']['MAX_FEATURES'],
759
  num_channels=3,
760
  kp_size=10,
761
- num_deform_blocks=sadtalker_cfg['MODEL']['NUM_MOTION_FRAMES']).to(device)
762
- checkpoint_path = os.path.join(sadtalker_cfg['MODEL']['CHECKPOINTS_DIR'], 'generator.pth')
763
  self.load_generator(checkpoint_path, device)
764
 
765
  def load_generator(self, checkpoint_path, device):
@@ -786,7 +785,7 @@ class Mapping(nn.Module):
786
  def __init__(self, sadtalker_cfg, device):
787
  super(Mapping, self).__init__()
788
  self.mapping_net = MappingNet(num_coeffs=64, num_layers=3, hidden_dim=128).to(device)
789
- checkpoint_path = os.path.join(sadtalker_cfg['MODEL']['CHECKPOINTS_DIR'], 'mapping.pth')
790
  self.load_mapping_net(checkpoint_path, device)
791
  self.f_3d_mean = torch.zeros(1, 64, device=device)
792
 
@@ -814,10 +813,10 @@ class OcclusionAwareDenseMotion(nn.Module):
814
  super(OcclusionAwareDenseMotion, self).__init__()
815
  self.dense_motion_network = DenseMotionNetwork(num_kp=10,
816
  num_channels=3,
817
- block_expansion=sadtalker_cfg['MODEL']['SCALE'],
818
- num_blocks=sadtalker_cfg['MODEL']['NUM_MOTION_FRAMES'] - 1,
819
- max_features=sadtalker_cfg['MODEL']['MAX_FEATURES']).to(device)
820
- checkpoint_path = os.path.join(sadtalker_cfg['MODEL']['CHECKPOINTS_DIR'], 'dense_motion.pth')
821
  self.load_dense_motion_network(checkpoint_path, device)
822
 
823
  def load_dense_motion_network(self, checkpoint_path, device):
@@ -839,20 +838,20 @@ class FaceEnhancer(nn.Module):
839
 
840
  def __init__(self, sadtalker_cfg, device):
841
  super(FaceEnhancer, self).__init__()
842
- enhancer_name = sadtalker_cfg['MODEL']['ENHANCER_NAME']
843
- bg_upsampler = sadtalker_cfg['MODEL']['BG_UPSAMPLER']
844
  if enhancer_name == 'gfpgan':
845
  from gfpgan import GFPGANer
846
- self.face_enhancer = GFPGANer(model_path=os.path.join(sadtalker_cfg['MODEL']['CHECKPOINTS_DIR'], 'GFPGANv1.4.pth'),
847
  upscale=1,
848
  arch='clean',
849
  channel_multiplier=2,
850
  bg_upsampler=bg_upsampler)
851
  elif enhancer_name == 'realesrgan':
852
  from realesrgan import RealESRGANer
853
- half = False if device == 'cpu' else sadtalker_cfg['MODEL']['IS_HALF']
854
  self.face_enhancer = RealESRGANer(scale=2,
855
- model_path=os.path.join(sadtalker_cfg['MODEL']['CHECKPOINTS_DIR'],
856
  'RealESRGAN_x2plus.pth'),
857
  tile=0,
858
  tile_pad=10,
 
269
  self.sadtalker_model = SadTalkerModel(self.cfg, device_id=[0])
270
 
271
  def get_cfg_defaults(self):
272
+ return CN(
273
+ MODEL=CN(
274
+ CHECKPOINTS_DIR='',
275
+ CONFIG_DIR='',
276
+ DEVICE=self.device,
277
+ SCALE=64,
278
+ NUM_VOXEL_FRAMES=8,
279
+ NUM_MOTION_FRAMES=10,
280
+ MAX_FEATURES=256,
281
+ DRIVEN_AUDIO_SAMPLE_RATE=16000,
282
+ VIDEO_FPS=25,
283
+ OUTPUT_VIDEO_FPS=None,
284
+ OUTPUT_AUDIO_SAMPLE_RATE=None,
285
+ USE_ENHANCER=False,
286
+ ENHANCER_NAME='',
287
+ BG_UPSAMPLER=None,
288
+ IS_HALF=False
289
+ ),
290
+ INPUT_IMAGE=CN()
291
+ )
292
 
293
  def merge_from_file(self, filepath):
294
  if os.path.exists(filepath):
295
  with open(filepath, 'r') as f:
296
  cfg_from_file = yaml.safe_load(f)
297
+ self.cfg.MODEL.update(CN(cfg_from_file['MODEL']))
298
+ self.cfg.INPUT_IMAGE.update(CN(cfg_from_file['INPUT_IMAGE']))
299
 
300
  def test(self, source_image, driven_audio, preprocess='crop', still_mode=False, use_enhancer=False,
301
  batch_size=1, size=256, pose_style=0, exp_scale=1.0, use_ref_video=False, ref_video=None,
 
311
 
312
  def __init__(self, sadtalker_cfg, device_id=[0]):
313
  self.cfg = sadtalker_cfg
314
+ self.device = sadtalker_cfg.MODEL.get('DEVICE', 'cpu')
315
  self.sadtalker = SadTalkerInnerModel(sadtalker_cfg, device_id)
316
  self.preprocesser = self.sadtalker.preprocesser
317
  self.kp_extractor = self.sadtalker.kp_extractor
 
390
  ref_pose_coeff = None
391
  ref_expression_coeff = None
392
  audio_tensor, audio_sample_rate = proc.process_audio(self.driven_audio,
393
+ self.sadtalker_model.cfg.MODEL.DRIVEN_AUDIO_SAMPLE_RATE)
394
  batch = {
395
  'source_image': source_image_tensor.unsqueeze(0).to(self.device),
396
  'audio': audio_tensor.unsqueeze(0).to(self.device),
 
456
  audio_name = os.path.splitext(os.path.basename(self.driven_audio))[0]
457
  output_video_path = os.path.join(self.result_dir, base_name + '_' + audio_name + '.mp4')
458
  self.output_path = output_video_path
459
+ video_fps = self.sadtalker_model.cfg.MODEL.VIDEO_FPS if self.sadtalker_model.cfg.MODEL.OUTPUT_VIDEO_FPS is None else \
460
+ self.sadtalker_model.cfg.MODEL.OUTPUT_VIDEO_FPS
461
+ audio_output_sample_rate = self.sadtalker_model.cfg.MODEL.DRIVEN_AUDIO_SAMPLE_RATE if \
462
+ self.sadtalker_model.cfg.MODEL.OUTPUT_AUDIO_SAMPLE_RATE is None else \
463
+ self.sadtalker_model.cfg.MODEL.OUTPUT_AUDIO_SAMPLE_RATE
 
464
  if self.use_enhancer:
465
  enhanced_path = os.path.join(self.result_dir, base_name + '_' + audio_name + '_enhanced.mp4')
466
  save_video_with_watermark(output_video, self.driven_audio, enhanced_path)
 
489
 
490
  def __init__(self, sadtalker_cfg, device_id=[0]):
491
  self.cfg = sadtalker_cfg
492
+ self.device = sadtalker_cfg.MODEL.DEVICE
493
  self.preprocesser = Preprocesser(sadtalker_cfg, self.device)
494
  self.kp_extractor = KeyPointExtractor(sadtalker_cfg, self.device)
495
  self.audio_to_coeff = Audio2Coeff(sadtalker_cfg, self.device)
496
  self.animate_from_coeff = AnimateFromCoeff(sadtalker_cfg, self.device)
497
+ self.face_enhancer = FaceEnhancer(sadtalker_cfg, self.device) if sadtalker_cfg.MODEL.USE_ENHANCER else None
 
498
  self.generator = Generator(sadtalker_cfg, self.device)
499
  self.mapping = Mapping(sadtalker_cfg, self.device)
500
  self.he_estimator = OcclusionAwareDenseMotion(sadtalker_cfg, self.device)
 
505
  def __init__(self, sadtalker_cfg, device):
506
  self.cfg = sadtalker_cfg
507
  self.device = device
508
+ if self.cfg.INPUT_IMAGE.get('OLD_VERSION', False):
509
+ self.face3d_helper = Face3DHelperOld(self.cfg.INPUT_IMAGE.get('LOCAL_PCA_PATH', ''), device)
510
  else:
511
+ self.face3d_helper = Face3DHelper(self.cfg.INPUT_IMAGE.get('LOCAL_PCA_PATH', ''), device)
512
  self.mouth_detector = MouthDetector()
513
 
514
  def crop(self, source_image_pil, preprocess_type, size=256):
 
542
  cropped_image_pil = cropped_image_pil.resize((size, size), Image.Resampling.LANCZOS)
543
  source_image_tensor = self.img2tensor(cropped_image_pil)
544
  return source_image_tensor, [[y_min, y_max], [x_min, x_max], old_size, cropped_image_pil.size], os.path.basename(
545
+ self.cfg.INPUT_IMAGE.get('SOURCE_IMAGE', ''))
546
 
547
  def img2tensor(self, img):
548
  img = np.array(img).astype(np.float32) / 255.0
 
576
  return ref_expression_coeff
577
 
578
  def generate_idles_pose(self, length_of_audio, pose_style):
579
+ num_frames = int(length_of_audio * self.cfg.MODEL.VIDEO_FPS)
580
  ref_pose_coeff = torch.zeros((num_frames, 64), dtype=torch.float32).to(self.device)
581
  start_pose = self.generate_still_pose(pose_style)
582
  end_pose = self.generate_still_pose(pose_style)
 
586
  return ref_pose_coeff
587
 
588
  def generate_idles_expression(self, length_of_audio):
589
+ num_frames = int(length_of_audio * self.cfg.MODEL.VIDEO_FPS)
590
  ref_expression_coeff = torch.zeros((num_frames, 64), dtype=torch.float32).to(self.device)
591
  start_exp = self.generate_still_expression(1.0)
592
  end_exp = self.generate_still_expression(1.0)
 
600
 
601
  def __init__(self, sadtalker_cfg, device):
602
  super(KeyPointExtractor, self).__init__()
603
+ self.kp_extractor = OcclusionAwareKPDetector(kp_channels=sadtalker_cfg.MODEL.NUM_MOTION_FRAMES,
604
  num_kp=10,
605
  num_dilation_blocks=2,
606
  dropout_rate=0.1).to(device)
607
+ checkpoint_path = os.path.join(sadtalker_cfg.MODEL.CHECKPOINTS_DIR, 'kp_detector.safetensors')
608
  self.load_kp_detector(checkpoint_path, device)
609
 
610
  def load_kp_detector(self, checkpoint_path, device):
 
627
  def __init__(self, sadtalker_cfg, device):
628
  super(Audio2Coeff, self).__init__()
629
  self.audio_model = Wav2Vec2Model().to(device)
630
+ checkpoint_path = os.path.join(sadtalker_cfg.MODEL.CHECKPOINTS_DIR, 'wav2vec2.pth')
631
  self.load_audio_model(checkpoint_path, device)
632
  self.pose_mapper = AudioCoeffsPredictor(2048, 64).to(device)
633
  self.exp_mapper = AudioCoeffsPredictor(2048, 64).to(device)
634
  self.blink_mapper = AudioCoeffsPredictor(2048, 1).to(device)
635
+ mapping_checkpoint = os.path.join(sadtalker_cfg.MODEL.CHECKPOINTS_DIR, 'audio2pose_00140-model.pth')
636
  self.load_mapping_model(mapping_checkpoint, device)
637
 
638
  def load_audio_model(self, checkpoint_path, device):
 
752
 
753
  def __init__(self, sadtalker_cfg, device):
754
  super(Generator, self).__init__()
755
+ self.generator = Hourglass(block_expansion=sadtalker_cfg.MODEL.SCALE,
756
+ num_blocks=sadtalker_cfg.MODEL.NUM_VOXEL_FRAMES,
757
+ max_features=sadtalker_cfg.MODEL.MAX_FEATURES,
758
  num_channels=3,
759
  kp_size=10,
760
+ num_deform_blocks=sadtalker_cfg.MODEL.NUM_MOTION_FRAMES).to(device)
761
+ checkpoint_path = os.path.join(sadtalker_cfg.MODEL.CHECKPOINTS_DIR, 'generator.pth')
762
  self.load_generator(checkpoint_path, device)
763
 
764
  def load_generator(self, checkpoint_path, device):
 
785
  def __init__(self, sadtalker_cfg, device):
786
  super(Mapping, self).__init__()
787
  self.mapping_net = MappingNet(num_coeffs=64, num_layers=3, hidden_dim=128).to(device)
788
+ checkpoint_path = os.path.join(sadtalker_cfg.MODEL.CHECKPOINTS_DIR, 'mapping.pth')
789
  self.load_mapping_net(checkpoint_path, device)
790
  self.f_3d_mean = torch.zeros(1, 64, device=device)
791
 
 
813
  super(OcclusionAwareDenseMotion, self).__init__()
814
  self.dense_motion_network = DenseMotionNetwork(num_kp=10,
815
  num_channels=3,
816
+ block_expansion=sadtalker_cfg.MODEL.SCALE,
817
+ num_blocks=sadtalker_cfg.MODEL.NUM_MOTION_FRAMES - 1,
818
+ max_features=sadtalker_cfg.MODEL.MAX_FEATURES).to(device)
819
+ checkpoint_path = os.path.join(sadtalker_cfg.MODEL.CHECKPOINTS_DIR, 'dense_motion.pth')
820
  self.load_dense_motion_network(checkpoint_path, device)
821
 
822
  def load_dense_motion_network(self, checkpoint_path, device):
 
838
 
839
  def __init__(self, sadtalker_cfg, device):
840
  super(FaceEnhancer, self).__init__()
841
+ enhancer_name = sadtalker_cfg.MODEL.ENHANCER_NAME
842
+ bg_upsampler = sadtalker_cfg.MODEL.BG_UPSAMPLER
843
  if enhancer_name == 'gfpgan':
844
  from gfpgan import GFPGANer
845
+ self.face_enhancer = GFPGANer(model_path=os.path.join(sadtalker_cfg.MODEL.CHECKPOINTS_DIR, 'GFPGANv1.4.pth'),
846
  upscale=1,
847
  arch='clean',
848
  channel_multiplier=2,
849
  bg_upsampler=bg_upsampler)
850
  elif enhancer_name == 'realesrgan':
851
  from realesrgan import RealESRGANer
852
+ half = False if device == 'cpu' else sadtalker_cfg.MODEL.IS_HALF
853
  self.face_enhancer = RealESRGANer(scale=2,
854
+ model_path=os.path.join(sadtalker_cfg.MODEL.CHECKPOINTS_DIR,
855
  'RealESRGAN_x2plus.pth'),
856
  tile=0,
857
  tile_pad=10,
sentiment_api.py CHANGED
@@ -2,25 +2,28 @@ from flask import jsonify
2
  from main import *
3
  import torch
4
 
5
- def analyze_sentiment(text, output_path="output_sentiment.json"):
6
  if sentiment_model is None:
7
- return "Sentiment model not initialized."
8
 
9
- input_tokens = sentiment_model.tokenizer(text, return_tensors="pt", padding=True).to(device)
10
  with torch.no_grad():
11
- sentiment_logits = sentiment_model(input_tokens['input_ids'])
12
- predicted_class_id = torch.argmax(sentiment_logits, dim=-1).item()
13
- sentiment_label = sentiment_model.config.id2label[predicted_class_id]
14
- probability = torch.softmax(sentiment_logits, dim=-1)[0][predicted_class_id].item()
15
 
16
- return {"sentiment": sentiment_label, "probability": probability}
17
 
18
  def sentiment_api():
19
  data = request.get_json()
20
  text = data.get('text')
21
  if not text:
22
  return jsonify({"error": "Text is required"}), 400
23
- output_file = analyze_sentiment(text)
24
- if output_file == "Sentiment model not initialized.":
25
- return jsonify({"error": "Sentiment analysis failed"}), 500
26
- return jsonify(output_file)
 
2
  from main import *
3
  import torch
4
 
5
+ def analyze_sentiment(text):
6
  if sentiment_model is None:
7
+ return {"error": "Sentiment model not initialized."}
8
+
9
+ features = [ord(c) for c in text[:10]]
10
+ while len(features) < 10:
11
+ features.append(0)
12
+ features_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
13
 
 
14
  with torch.no_grad():
15
+ output = sentiment_model(features_tensor)
16
+ sentiment_idx = torch.argmax(output, dim=1).item()
17
+ sentiment_label = "positive" if sentiment_idx == 1 else "negative"
 
18
 
19
+ return {"sentiment": sentiment_label}
20
 
21
  def sentiment_api():
22
  data = request.get_json()
23
  text = data.get('text')
24
  if not text:
25
  return jsonify({"error": "Text is required"}), 400
26
+ output = analyze_sentiment(text)
27
+ if "error" in output:
28
+ return jsonify({"error": output["error"]}), 500
29
+ return jsonify(output)
stt_api.py CHANGED
@@ -5,9 +5,9 @@ from main import *
5
  import torch
6
  import torchaudio
7
 
8
- def speech_to_text_func(audio_path, output_path="output_stt.txt"):
9
  if stt_model is None:
10
- return "STT model not initialized."
11
 
12
  waveform, sample_rate = torchaudio.load(audio_path)
13
  if waveform.ndim > 1:
@@ -18,9 +18,7 @@ def speech_to_text_func(audio_path, output_path="output_stt.txt"):
18
  predicted_ids = torch.argmax(logits, dim=-1)
19
  transcription = stt_model.tokenizer.decode(predicted_ids[0].cpu().tolist())
20
 
21
- with open(output_path, "w") as file:
22
- file.write(transcription)
23
- return output_path
24
 
25
  def stt_api():
26
  if 'audio' not in request.files:
@@ -28,8 +26,8 @@ def stt_api():
28
  audio_file = request.files['audio']
29
  temp_audio_path = f"temp_audio_{uuid.uuid4()}.wav"
30
  audio_file.save(temp_audio_path)
31
- output_file = speech_to_text_func(temp_audio_path)
32
  os.remove(temp_audio_path)
33
- if output_file == "STT model not initialized.":
34
- return jsonify({"error": "STT failed"}), 500
35
- return send_file(output_file, mimetype="text/plain", as_attachment=True, download_name="output.txt")
 
5
  import torch
6
  import torchaudio
7
 
8
+ def speech_to_text_func(audio_path):
9
  if stt_model is None:
10
+ return {"error": "STT model not initialized."}
11
 
12
  waveform, sample_rate = torchaudio.load(audio_path)
13
  if waveform.ndim > 1:
 
18
  predicted_ids = torch.argmax(logits, dim=-1)
19
  transcription = stt_model.tokenizer.decode(predicted_ids[0].cpu().tolist())
20
 
21
+ return {"text": transcription}
22
 
23
  def stt_api():
24
  if 'audio' not in request.files:
 
26
  audio_file = request.files['audio']
27
  temp_audio_path = f"temp_audio_{uuid.uuid4()}.wav"
28
  audio_file.save(temp_audio_path)
29
+ output = speech_to_text_func(temp_audio_path)
30
  os.remove(temp_audio_path)
31
+ if "error" in output:
32
+ return jsonify({"error": output["error"]}), 500
33
+ return jsonify(output)
summarization_api.py CHANGED
@@ -3,15 +3,14 @@ from main import *
 import torch
 
 def summarize_text(text, output_path="output_summary.txt"):
-    if summarization_model is None:
-        return "Summarization model not initialized."
+    if summarization_model is None or summarization_tokenizer is None:
+        return "Summarization model or tokenizer not initialized."
 
-    input_tokens = [summarization_word_to_index.get(word.lower(), 1) for word in text.split()]
-    input_tensor = torch.tensor([input_tokens], dtype=torch.long).to(device)
+    input_ids = summarization_tokenizer.encode(text, return_tensors="pt").to(device)
 
     with torch.no_grad():
-        summary_ids = summarization_model.generate(input_tensor, num_beams=4, max_length=999999999, early_stopping=True)
-        summary_text = summarization_model.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+        summary_ids = summarization_model.generate(input_ids, num_beams=4, max_length=100, early_stopping=True)
+        summary_text = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 
     with open(output_path, "w") as file:
         file.write(summary_text)
@@ -23,6 +22,6 @@ def summarization_api():
     if not text:
         return jsonify({"error": "Text is required"}), 400
     output_file = summarize_text(text)
-    if output_file == "Summarization model not initialized.":
+    if output_file == "Summarization model or tokenizer not initialized.":
        return jsonify({"error": "Summarization failed"}), 500
     return send_file(output_file, mimetype="text/plain", as_attachment=True, download_name="output_summary.txt")
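Two behavioural changes are worth noting here: generation is now capped at max_length=100 beam-search tokens instead of the previous effectively unbounded value, and input is tokenized with summarization_tokenizer rather than a word-to-index map. A hedged client sketch follows; the "/summarize" route name is an assumption, while the text-attachment response matches the code above.

# Hypothetical client call; the "/summarize" route name is an assumption.
import requests

resp = requests.post("http://localhost:5000/summarize", json={"text": "Long article text ..."})
with open("summary.txt", "wb") as f:
    f.write(resp.content)  # the endpoint sends output_summary.txt as an attachment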
text_generation.py CHANGED
@@ -22,124 +22,114 @@ def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')
     return logits
 
 def sample_sequence(prompt, model, enc, length, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0, device="cpu"):
-    start_time = time.time()
     context_tokens = enc.encode(prompt)
     context_tokens_tensor = torch.tensor([context_tokens], dtype=torch.long, device=device)
     generated = context_tokens
-    past = None
-    text_generated_count = 0
-    past_key_values = past if past is not None else None
+    past_key_values = None
 
     with torch.no_grad():
-        outputs = model(context_tokens_tensor, past_key_values=past_key_values)
-        next_token_logits = outputs[0][:, -1, :] / temperature
-        past = outputs[1]
-        for token_index in set(generated):
-            next_token_logits[0, token_index] /= repetition_penalty
-        filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
-        if temperature == 0:
-            next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(0)
-        else:
-            next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
-        generated += next_token.tolist()[0]
-        text_generated_count += 1
-        token = next_token.tolist()[0][0]
-        yield enc.decode([token])
-        if token == enc.encoder[END_OF_TEXT_TOKEN]:
-            yield "<END_STREAM>"
-
+        for _ in range(length):
+            outputs = model(context_tokens_tensor, past_key_values=past_key_values)
+            next_token_logits = outputs[0][:, -1, :] / temperature
+            past_key_values = outputs[1]
+            for token_index in set(generated):
+                next_token_logits[0, token_index] /= repetition_penalty
+            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
+            if temperature == 0:
+                next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(0)
+            else:
+                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
+            generated += next_token.tolist()[0]
+            token = next_token.tolist()[0][0]
+            yield enc.decode([token])
+            if token == enc.encoder[END_OF_TEXT_TOKEN]:
+                yield "<END_STREAM>"
+                return
 
 def sample_sequence_codegen(prompt, model, tokenizer, length, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0, device="cpu"):
-    start_time = time.time()
     context_tokens = tokenizer.encode(prompt)
     context_tokens_tensor = torch.tensor([context_tokens], dtype=torch.long, device=device).unsqueeze(0)
     generated = context_tokens
-    past = None
-    text_generated_count = 0
+    past_key_values = None
     with torch.no_grad():
-        outputs = model(input_ids=context_tokens_tensor, past_key_values=past, labels=None)
-        next_token_logits = outputs[0][:, -1, :] / temperature
-        past = outputs[1]
-        for token_index in set(generated):
-            next_token_logits[0, token_index] /= repetition_penalty
-        filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
-        if temperature == 0:
-            next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(0)
-        else:
-            next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
-        generated.append(next_token.tolist()[0][0])
-        text_generated_count += 1
-        token = next_token.tolist()[0][0]
-        yield tokenizer.decode([token])
-        if token == 50256:
-            yield "<END_STREAM>"
-
+        for _ in range(length):
+            outputs = model(input_ids=context_tokens_tensor, past_key_values=past_key_values, labels=None)
+            next_token_logits = outputs[0][:, -1, :] / temperature
+            past_key_values = outputs[1]
+            for token_index in set(generated):
+                next_token_logits[0, token_index] /= repetition_penalty
+            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
+            if temperature == 0:
+                next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(0)
+            else:
+                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
+            generated.append(next_token.tolist()[0][0])
+            token = next_token.tolist()[0][0]
+            yield tokenizer.decode([token])
+            if token == 50256:
+                yield "<END_STREAM>"
+                return
 
 def perform_reasoning_stream(text_input, temperature, top_k, top_p, repetition_penalty):
-    try:
-        prompt_text = system_prompt + "\n\n"
-        prompt_text += "User: " + text_input + "\nCyrah: "
-        reasoning_prompt = prompt_text
+    prompt_text = SYSTEM_PROMPT + "\n\n"
+    prompt_text += "User: " + text_input + "\nAssistant:"
+    reasoning_prompt = prompt_text
 
-        ddgs = DDGS()
-        search_results = [r for r in ddgs.text(text_input, max_results=MAX_XDD)]
-        if search_results:
-            prompt_text += "\nWeb Search Results:\n"
-            for result in search_results:
-                prompt_text += f"- {result['body']}\n"
-            prompt_text += "\n"
+    ddgs = DDGS()
+    search_results = [r for r in ddgs.text(text_input, max_results=MAX_XDD)]
+    if search_results:
+        prompt_text += "\nWeb Search Results:\n"
+        for result in search_results:
+            prompt_text += f"- {result['body']}\n"
+        prompt_text += "\n"
 
-        generated_text_stream = []
-        stream_type = "text"
+    generated_text_stream = []
+    stream_type = "text"
 
-        if "code" in text_input.lower() or "program" in text_input.lower():
-            if codegen_model and codegen_tokenizer:
-                generated_text_stream = sample_sequence_codegen(
-                    prompt=reasoning_prompt,
-                    model=codegen_model,
-                    tokenizer=codegen_tokenizer,
-                    length=999999999,
-                    temperature=temperature,
-                    top_k=top_k,
-                    top_p=top_p,
-                    repetition_penalty=repetition_penalty,
-                    device=device
-                )
-            stream_type = "text"
-        elif "summarize" in text_input.lower() or "summary" in text_input.lower():
-            if summarization_model:
-                summary = summarize_func(text_input)
-                yield f"SUMMARY_TEXT:{summary}"
-                yield "<END_STREAM>"
-            stream_type = "summary"
-        else:
-            if model_gpt2 and enc:
-                generated_text_stream = sample_sequence(
-                    prompt=reasoning_prompt,
-                    model=model_gpt2,
-                    enc=enc,
-                    length=999999999,
-                    temperature=temperature,
-                    top_k=top_k,
-                    top_p=top_p,
-                    repetition_penalty=repetition_penalty,
-                    device=device
-                )
-            stream_type = "text"
+    if "code" in text_input.lower() or "program" in text_input.lower():
+        if codegen_model and codegen_tokenizer:
+            generated_text_stream = sample_sequence_codegen(
+                prompt=reasoning_prompt,
+                model=codegen_model,
+                tokenizer=codegen_tokenizer,
+                length=999999999,
+                temperature=temperature,
+                top_k=top_k,
+                top_p=top_p,
+                repetition_penalty=repetition_penalty,
+                device=device
+            )
+        stream_type = "text"
+    elif "summarize" in text_input.lower() or "summary" in text_input.lower():
+        if summarization_model:
+            summary = summarize_text(text_input)
+            yield f"SUMMARY_TEXT:{summary}"
+            yield "<END_STREAM>"
+        stream_type = "summary"
+    else:
+        if model_gpt2 and enc:
+            generated_text_stream = sample_sequence(
+                prompt=reasoning_prompt,
+                model=model_gpt2,
+                enc=enc,
+                length=999999999,
+                temperature=temperature,
+                top_k=top_k,
+                top_p=top_p,
+                repetition_penalty=repetition_penalty,
+                device=device
+            )
+        stream_type = "text"
 
-        accumulated_text = ""
-        if stream_type == "text":
-            for token in generated_text_stream:
-                if token == "<END_STREAM>":
-                    yield accumulated_text
-                    yield "<END_STREAM>"
-                    return
-                if token == END_OF_TEXT_TOKEN:
-                    accumulated_text += END_OF_TEXT_TOKEN
-                    continue
-                if token:
-                    accumulated_text += token
-    except Exception as e:
-        print(f"Reasoning Error: {e}")
-        yield "Error during reasoning. Please try again."
-        yield "<END_STREAM>"
+    accumulated_text = ""
+    if stream_type == "text":
+        for token in generated_text_stream:
+            if token == "<END_STREAM>":
+                yield accumulated_text
+                yield "<END_STREAM>"
+                return
+            if token == END_OF_TEXT_TOKEN:
+                accumulated_text += END_OF_TEXT_TOKEN
+                continue
+            if token:
+                accumulated_text += token
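Both samplers route their logits through top_k_top_p_filtering (defined earlier in this file) before sampling. A toy sketch of the intended effect follows, assuming the helper follows the usual convention of masking filtered positions to -inf; the example only demonstrates the top-k part.

# Toy illustration of top-k filtering followed by sampling; assumes -inf masking semantics.
import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])
top_k = 2
kth_best = torch.topk(logits, top_k)[0][..., -1, None]        # value of the k-th best logit
filtered = logits.masked_fill(logits < kth_best, float("-inf"))
probs = F.softmax(filtered, dim=-1)
print(probs)                                   # only the two highest-scoring tokens keep mass
next_token = torch.multinomial(probs, num_samples=1)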
tokenxxx.py CHANGED
@@ -139,4 +139,4 @@ def codegen_tokenize(text, tokenizer):
     return tokenizer.encode(text)
 
 def codegen_decode(tokens, tokenizer):
-    return tokenizer.decode(tokens)
+    return tokenizer.decode(tokens)
translation_api.py CHANGED
@@ -1,17 +1,15 @@
 from flask import jsonify, send_file, request
 from main import *
 
-def perform_translation(text, target_language_code='es_XX', source_language_code='en_XX', output_path="output_translation.txt"):
+def perform_translation(text, target_language_code='es_XX', source_language_code='en_XX'):
     if translation_model is None:
-        return "Translation model not initialized."
+        return {"error": "Translation model not initialized."}
 
     encoded_text = translation_model.tokenizer(text, return_tensors="pt", padding=True).to(device)
     generated_tokens = translation_model.generate(input_ids=encoded_text['input_ids'], attention_mask=encoded_text['attention_mask'], forced_bos_token_id=translation_model.config.lang_code_to_id[target_language_code])
     translation = translation_model.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
 
-    with open(output_path, "w") as file:
-        file.write(translation)
-    return output_path
+    return {"translated_text": translation}
 
 def translation_api():
     data = request.get_json()
@@ -20,7 +18,7 @@ def translation_api():
     source_lang = data.get('source_lang', 'en')
     if not text:
         return jsonify({"error": "Text is required"}), 400
-    output_file = perform_translation(text, target_language_code=f'{target_lang}_XX', source_language_code=f'{source_lang}_XX')
-    if output_file == "Translation model not initialized.":
-        return jsonify({"error": "Translation failed"}), 500
-    return send_file(output_file, mimetype="text/plain", as_attachment=True, download_name="output_translation.txt")
+    output = perform_translation(text, target_language_code=f'{target_lang}_XX', source_language_code=f'{source_lang}_XX')
+    if "error" in output:
+        return jsonify({"error": output["error"]}), 500
+    return jsonify(output)
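The translation endpoint now answers with JSON instead of a file download. A hedged request/response sketch follows; the "/translate" route name is an assumption, and the *_XX suffixes follow the mBART-style language codes already used above.

# Hypothetical client call; the "/translate" route name is an assumption.
import requests

payload = {"text": "Hello world", "target_lang": "es", "source_lang": "en"}
resp = requests.post("http://localhost:5000/translate", json=payload)
print(resp.json())  # e.g. {"translated_text": "..."} or {"error": "..."} with HTTP 500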
tts_api.py CHANGED
@@ -1,15 +1,19 @@
 import os
 from flask import jsonify, send_file, request
 from main import *
+import torch
+import torchaudio
+import uuid
 
-def text_to_speech_func(text, output_path="output_tts.wav"):
+def text_to_speech_func(text):
     if tts_model is None:
-        return "TTS model not initialized."
+        return {"error": "TTS model not initialized."}
     input_tokens = tts_model.tokenizer(text, return_tensors="pt", padding=True).to(device)
     with torch.no_grad():
         audio_output = tts_model(input_tokens['input_ids'])
-    torchaudio.save(output_path, audio_output.cpu(), 16000)
-    return output_path
+    temp_audio_path = f"temp_audio_{uuid.uuid4()}.wav"
+    torchaudio.save(temp_audio_path, audio_output.cpu(), 16000)
+    return temp_audio_path
 
 def tts_api():
     data = request.get_json()
@@ -17,6 +21,6 @@ def tts_api():
     if not text:
         return jsonify({"error": "Text is required"}), 400
     output_file = text_to_speech_func(text)
-    if output_file == "TTS model not initialized.":
-        return jsonify({"error": "TTS generation failed"}), 500
+    if isinstance(output_file, dict) and "error" in output_file:
+        return jsonify({"error": output_file["error"]}), 500
     return send_file(output_file, mimetype="audio/wav", as_attachment=True, download_name="output.wav")
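A hedged client sketch for the audio endpoint follows; the "/tts" route name is an assumption. One design note: text_to_speech_func now writes a uuid-named temp file that send_file streams back but nothing deletes, so cleaning it up after the response (for example in a Flask teardown hook or a background task) would keep the working directory from accumulating temp_audio_*.wav files.

# Hypothetical client call; the "/tts" route name is an assumption.
import requests

resp = requests.post("http://localhost:5000/tts", json={"text": "Hello there"})
if resp.status_code == 200:
    with open("output.wav", "wb") as f:
        f.write(resp.content)       # raw WAV bytes returned by send_file
else:
    print(resp.json())              # {"error": "..."}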