# -*- coding: utf-8 -*-
"""🎬 Keras Video Classification CNN-RNN model

Spaces for showing the model usage.

Author:
    - Thomas Chaigneau @ChainYo
"""
import os

import cv2
import gradio as gr
import numpy as np

from tensorflow import keras
from huggingface_hub import from_pretrained_keras

# Input image size and number of extracted features per frame
IMG_SIZE = 224
NUM_FEATURES = 2048

# Load the CNN-RNN model from the Hugging Face Hub
model = from_pretrained_keras("keras-io/video-classification-cnn-rnn")

# Build the list of example videos from the Samples directory
samples = []
for file in os.listdir("Samples"):
    samples.append([os.path.join("Samples", file)])


def crop_center_square(frame):
    """Crop the largest centered square from a frame."""
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]


def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    """Read a video and preprocess every frame (center crop, resize, BGR -> RGB)."""
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]  # BGR -> RGB
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)


def build_feature_extractor():
    """Build the InceptionV3 feature extractor (ImageNet weights, average pooling)."""
    feature_extractor = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.inception_v3.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)
    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


# Build the feature extractor once at startup
feature_extractor = build_feature_extractor()


def prepare_video(frames, max_seq_length: int = 20):
    """Extract per-frame features and build the boolean frame mask."""
    frames = frames[None, ...]
    frame_mask = np.zeros(shape=(1, max_seq_length), dtype="bool")
    frame_features = np.zeros(shape=(1, max_seq_length, NUM_FEATURES), dtype="float32")

    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(max_seq_length, video_length)
        for j in range(length):
            frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
        frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

    return frame_features, frame_mask


def sequence_prediction(path):
    """Predict the action class of a video and return per-class probabilities."""
    class_vocab = ["CricketShot", "PlayingCello", "Punch", "ShavingBeard", "TennisSwing"]

    frames = load_video(path)
    frame_features, frame_mask = prepare_video(frames)
    probabilities = model.predict([frame_features, frame_mask])[0]

    preds = {}
    for i in np.argsort(probabilities)[::-1]:
        preds[class_vocab[i]] = float(probabilities[i])
    return preds


# HTML description rendered below the app
article = ""
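
# Sanity-check sketch (assumption: shapes follow from prepare_video above).
# A 5-frame dummy clip is padded to max_seq_length frames, so the model always
# sees fixed-size inputs; uncomment to verify locally (runs the feature extractor):
# dummy = np.zeros((5, IMG_SIZE, IMG_SIZE, 3), dtype="float32")
# feats, mask = prepare_video(dummy)
# assert feats.shape == (1, 20, NUM_FEATURES) and mask.shape == (1, 20)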
" # Tạo giao diện Gradio app = gr.Interface( fn=sequence_prediction, inputs=[gr.Video(label="Video")], outputs=gr.Label(label="Prediction"), title="Keras Video Classification with CNN-RNN", description="Video classification demo using CNN-RNN based model.", article=article, examples=samples ) # Khởi chạy ứng dụng app.launch()