yhsong
/

ViTGaze

Model card Files Files and versions Community

File size: 6,113 Bytes

f9561b9

from typing import Tuple
import torch
from torchvision import transforms
import numpy as np
import pandas as pd


def to_numpy(tensor: torch.Tensor):
    """Return ``tensor`` as a numpy object.

    Torch tensors are moved to the CPU, detached from the autograd graph and
    converted with ``.numpy()``. Objects whose type is defined in the
    ``numpy`` module are handed back untouched.

    Raises:
        ValueError: If the argument is neither a torch tensor nor a numpy
            object.
    """
    if not torch.is_tensor(tensor):
        # Anything defined by numpy itself is already in the target form.
        if type(tensor).__module__ == "numpy":
            return tensor
        raise ValueError("Cannot convert {} to numpy array".format(type(tensor)))
    return tensor.cpu().detach().numpy()


def to_torch(ndarray: np.ndarray):
    """Return ``ndarray`` as a torch tensor.

    Objects whose type is defined in the ``numpy`` module are converted with
    ``torch.from_numpy``; torch tensors are handed back untouched.

    Raises:
        ValueError: If the argument is neither a numpy object nor a torch
            tensor.
    """
    defined_by_numpy = type(ndarray).__module__ == "numpy"
    if defined_by_numpy:
        return torch.from_numpy(ndarray)
    if torch.is_tensor(ndarray):
        return ndarray
    raise ValueError("Cannot convert {} to torch tensor".format(type(ndarray)))


def get_head_box_channel(
    x_min, y_min, x_max, y_max, width, height, resolution, coordconv=False
):
    """Rasterize a head bounding box onto a ``resolution`` x ``resolution`` grid.

    The box corners are divided by ``width`` / ``height``, scaled by
    ``resolution``, truncated to integers and clipped to
    ``[0, resolution - 1]``. A box that collapses to zero extent along an
    axis is widened to one cell along that axis.

    Args:
        x_min, y_min, x_max, y_max: Box corners, in the same units as
            ``width`` and ``height``.
        width, height: Extent used to normalize the box corners.
        resolution: Side length of the square output grid.
        coordconv: When false, the result is a float32 mask that is 1 inside
            the box and 0 elsewhere. When true, the result is a ramp
            ``(row + col) / max`` with the box region set to 0.

    Returns:
        A torch tensor created from the numpy grid.
    """
    scaled_box = (
        np.array([x_min / width, y_min / height, x_max / width, y_max / height])
        * resolution
    )
    last_cell = resolution - 1
    cells = np.clip(scaled_box.astype(int), 0, last_cell)

    # Entries 0/2 are the x bounds, entries 1/3 the y bounds. Both axes get
    # the same treatment when the integer bounds coincide.
    for lo, hi in ((0, 2), (1, 3)):
        if cells[lo] != cells[hi]:
            continue
        if cells[lo] == 0:
            # Collapsed at the low border: grow towards the interior.
            cells[hi] = 1
        elif cells[hi] == last_cell:
            # Collapsed at the high border: grow towards the interior.
            cells[lo] = resolution - 2
        elif abs(scaled_box[hi] - cells[hi]) > abs(scaled_box[lo] - cells[lo]):
            # The upper bound lost more to truncation/clipping: extend it.
            cells[hi] += 1
        else:
            cells[lo] -= 1

    left, top, right, bottom = cells
    box_rows = slice(top, bottom)
    box_cols = slice(left, right)

    if coordconv:
        ramp = np.array(range(resolution), dtype=np.float32)
        # grid[i, j] = i + j, normalized by its largest entry.
        grid = np.squeeze(ramp[:, np.newaxis] + ramp[np.newaxis, :])
        head_channel = grid / float(grid.max())
        head_channel[box_rows, box_cols] = 0
    else:
        head_channel = np.zeros((resolution, resolution), dtype=np.float32)
        head_channel[box_rows, box_cols] = 1

    return torch.from_numpy(head_channel)


def draw_labelmap(img, pt, sigma, type="Gaussian"):
    """Add an unnormalized 2D kernel patch centred near ``pt`` onto ``img``.

    Adopted from
    https://github.com/anewell/pose-hg-train/blob/master/src/pypose/draw.py

    The kernel is evaluated on a square patch of ``int(6 * sigma + 1)``
    pixels per side whose upper-left corner is ``int(pt - 3 * sigma)``; only
    the part of the patch that overlaps the image is accumulated.

    Args:
        img: 2D map (torch tensor or numpy array) indexed ``[row, col]``.
        pt: ``(x, y)`` centre; ``pt[0]`` is measured along ``img.shape[1]``
            and ``pt[1]`` along ``img.shape[0]``.
        sigma: Scale of the kernel.
        type: ``"Gaussian"`` (centre value 1) or ``"Cauchy"``.

    Returns:
        The result of ``to_torch`` applied to the updated map. The patch is
        added in place to the array returned by ``to_numpy``.

    Raises:
        ValueError: If ``type`` is not one of the supported kernel names.
    """
    # Fail fast with a clear message; previously an unknown name surfaced as
    # an UnboundLocalError (or was silently ignored for off-image points).
    if type not in ("Gaussian", "Cauchy"):
        raise ValueError(
            "Unsupported label map type: {!r} "
            "(expected 'Gaussian' or 'Cauchy')".format(type)
        )

    img = to_numpy(img)
    n_rows, n_cols = img.shape[0], img.shape[1]

    # Check that any part of the patch is in-bounds
    size = int(6 * sigma + 1)
    ul = [int(pt[0] - 3 * sigma), int(pt[1] - 3 * sigma)]
    br = [ul[0] + size, ul[1] + size]
    if ul[0] >= n_cols or ul[1] >= n_rows or br[0] < 0 or br[1] < 0:
        # If not, just return the image as is
        return to_torch(img)

    # Squared distance of every patch pixel from the patch centre
    x = np.arange(0, size, 1, float)
    y = x[:, np.newaxis]
    x0 = y0 = size // 2
    sq_dist = (x - x0) ** 2 + (y - y0) ** 2

    # The kernel is not normalized; the Gaussian centre value equals 1
    if type == "Gaussian":
        g = np.exp(-sq_dist / (2 * sigma**2))
    else:  # "Cauchy"
        g = sigma / ((sq_dist + sigma**2) ** 1.5)

    # Usable kernel range (patch coordinates)
    g_x = max(0, -ul[0]), min(br[0], n_cols) - ul[0]
    g_y = max(0, -ul[1]), min(br[1], n_rows) - ul[1]
    # Matching image range (image coordinates)
    img_x = max(0, ul[0]), min(br[0], n_cols)
    img_y = max(0, ul[1]), min(br[1], n_rows)

    img[img_y[0] : img_y[1], img_x[0] : img_x[1]] += g[g_y[0] : g_y[1], g_x[0] : g_x[1]]
    return to_torch(img)


def draw_labelmap_no_quant(img, pt, sigma, type="Gaussian"):
    """Add an unnormalized 2D kernel centred exactly at ``pt`` onto ``img``.

    Unlike a patch-based draw, the centre is used as a float (no integer
    truncation) and the kernel is evaluated over the whole image, then set to
    zero wherever the squared distance to ``pt`` exceeds ``10 * sigma**2``.

    Args:
        img: 2D map (torch tensor or numpy array) indexed ``[row, col]``.
        pt: ``(x, y)`` centre; ``pt[0]`` is compared with column indices and
            ``pt[1]`` with row indices.
        sigma: Scale of the kernel.
        type: ``"Gaussian"`` or ``"Cauchy"``.

    Returns:
        The result of ``to_torch`` applied to the updated map. The kernel is
        added in place to the array returned by ``to_numpy``.

    Raises:
        ValueError: If ``type`` is not one of the supported kernel names.
    """
    # Fail fast with a clear message; previously an unknown name surfaced as
    # an UnboundLocalError.
    if type not in ("Gaussian", "Cauchy"):
        raise ValueError(
            "Unsupported label map type: {!r} "
            "(expected 'Gaussian' or 'Cauchy')".format(type)
        )

    img = to_numpy(img)
    shape = img.shape
    # With "ij" indexing, row_idx varies along axis 0 and col_idx along axis 1.
    row_idx, col_idx = np.meshgrid(
        np.arange(shape[0]), np.arange(shape[1]), indexing="ij"
    )
    dist_matrix = (col_idx - float(pt[0])) ** 2 + (row_idx - float(pt[1])) ** 2

    if type == "Gaussian":
        g = np.exp(-dist_matrix / (2 * sigma**2))
    else:  # "Cauchy"
        g = sigma / ((dist_matrix + sigma**2) ** 1.5)

    # Truncate the kernel support
    g[dist_matrix > 10 * sigma**2] = 0
    img += g
    return to_torch(img)


def multi_hot_targets(gaze_pts, out_res):
    """Build a multi-hot map with a 1 at the cell of every valid gaze point.

    Args:
        gaze_pts: Iterable of points; ``p[0]`` is scaled by the output width
            and ``p[1]`` by the output height. Points whose first coordinate
            is not ``>= 0`` are skipped.
        out_res: ``(w, h)`` of the output map.

    Returns:
        A numpy array of shape ``(h, w)`` holding zeros and ones.
    """
    width, height = out_res
    hot_map = np.zeros((height, width))
    for point in gaze_pts:
        if not point[0] >= 0:
            continue
        scaled = [point[0] * float(width), point[1] * float(height)]
        col, row = (int(value) for value in scaled)
        # A coordinate of exactly 1.0 would land one past the last cell.
        col = min(col, width - 1)
        row = min(row, height - 1)
        hot_map[row, col] = 1
    return hot_map


def get_cone(tgt, src, wh, theta=150):
    """Build a gaze-cone map: cosine similarity to the gaze ray inside a cone.

    ``src`` and ``tgt`` are multiplied element-wise by ``wh`` to obtain the
    eye and gaze positions in pixels. For every pixel, the cosine of the
    angle between (pixel - eye) and (gaze - eye) is computed. Pixels whose
    angle is at least ``theta / 2`` are set to 0, and negative cosines are
    clipped to 0.

    Args:
        tgt: Gaze target as an array-like ``(x, y)`` that supports
            element-wise multiplication with ``wh``.
        src: Eye position, same convention as ``tgt``.
        wh: ``(width, height)`` of the output map.
        theta: Full opening angle of the cone in degrees.

    Returns:
        A float32 torch tensor of shape ``(1, wh[1], wh[0])``.
    """
    eye = src * wh
    gaze = tgt * wh

    # pixel_mat[y, x] == (x, y) thanks to meshgrid's default "xy" indexing.
    pixel_mat = np.stack(
        np.meshgrid(np.arange(wh[0]), np.arange(wh[1])),
        -1,
    )
    eye_to_pixel = pixel_mat - eye
    eye_to_gaze = gaze - eye

    dot_prod = np.sum(eye_to_pixel * eye_to_gaze, axis=-1)
    gaze_vector_norm = np.sqrt(np.sum(eye_to_gaze**2))
    pixel_mat_norm = np.sqrt(np.sum(eye_to_pixel**2, axis=-1))

    # The pixel coinciding with the eye (or a zero-length gaze vector) gives
    # 0 / 0; that is expected and mapped to 1 below, so silence the warning.
    with np.errstate(divide="ignore", invalid="ignore"):
        gaze_cones = dot_prod / (gaze_vector_norm * pixel_mat_norm)
    gaze_cones = np.nan_to_num(gaze_cones, nan=1)

    # Rounding can push a cosine marginally outside [-1, 1]; arccos would
    # then return NaN and the pixel (exactly on the gaze ray) would be
    # dropped from the cone. Clamp before taking the angle.
    gaze_cones = np.clip(gaze_cones, -1.0, 1.0)

    half_angle = theta * (np.pi / 180) / 2
    beta = np.arccos(gaze_cones)
    # True where the pixel lies strictly inside the cone
    pixel_mat_presence = beta < half_angle

    # Zero out values outside the gaze cone and drop negative cosines
    gaze_cones[~pixel_mat_presence] = 0
    gaze_cones = np.clip(gaze_cones, 0, None)

    return torch.from_numpy(gaze_cones).unsqueeze(0).float()


def get_transform(
    input_resolution: int, mean: Tuple[int, int, int], std: Tuple[int, int, int]
):
    """Compose the image preprocessing pipeline.

    The pipeline resizes to a square of side ``input_resolution``, converts
    to a tensor, and normalizes with the given ``mean`` and ``std``.

    Args:
        input_resolution: Side length passed to ``transforms.Resize``.
        mean: Per-channel mean forwarded to ``transforms.Normalize``.
        std: Per-channel standard deviation forwarded to
            ``transforms.Normalize``.

    Returns:
        A ``transforms.Compose`` instance applying the three steps in order.
    """
    square_size = (input_resolution, input_resolution)
    steps = [
        transforms.Resize(square_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ]
    return transforms.Compose(steps)


def smooth_by_conv(window_size, df, col):
    """Smooth one column of ``df`` with a uniform moving average.

    The signal is extended on each side by ``window_size // 2`` copies of its
    first / last value and then convolved with a length-``window_size`` box
    kernel in ``"valid"`` mode. For an odd ``window_size`` the output has the
    same length as the column; for an even one it is one element longer.

    Args:
        window_size: Number of samples in the averaging window (>= 1).
        df: DataFrame holding the signal.
        col: Label of the column to smooth.

    Returns:
        A 1-D numpy array with the smoothed signal. An empty column yields an
        empty array.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(
            "window_size must be a positive integer, got {}".format(window_size)
        )

    signal = np.asarray(df[col])
    if signal.size == 0:
        return np.zeros(0, dtype=float)

    # Pad with NumPy rather than concatenating DataFrame and Series pieces:
    # the latter only yields a single column if pandas renames the Series to
    # match the padding frames, which is an implementation detail.
    padded_track = np.pad(signal, window_size // 2, mode="edge")
    box_kernel = np.ones(window_size) / window_size
    return np.convolve(padded_track, box_kernel, mode="valid")