import json
import math
import os
import pickle
import random

import numpy as np
import torch
from tqdm import tqdm


class Predictor(object):
    """Base class for saving predictions to disk
    (to be consumed by an evaluator later).
    `Predictor` has minimal support for single-GPU prediction.
    """

    def __init__(self, config):
        self.pred_dir = None  # stays None unless `config.eval` provides a save path.
        if hasattr(config, "eval") and config.eval is not None:
            self.pred_dir = config.eval.save_path
            os.makedirs(self.pred_dir, exist_ok=True)

    def __call__(self, outputs):
        """Extract the prediction from `outputs` and save it."""
        raise NotImplementedError

    def predict_loop(self, model, eval_dataloader, output_file=None):
        """On-the-fly prediction on a single GPU."""
        self.full_scores = []
        model.eval()
        model = model.to(0)
        with torch.no_grad():
            for data in eval_dataloader:
                data = self.to_ctx(data)
                outputs = model(**data)
                outputs.update(data)
                self(outputs)
        return self.finalize(output_file)

    def finalize(self, output_file):
        pass

    def to_ctx(self, data, ctx=0, dtype=None):
        """Move a dict of tensors to the target device, optionally
        casting float32 tensors to `dtype` first."""
        if isinstance(data, dict):
            for key in data:
                if torch.is_tensor(data[key]):
                    if dtype is not None and data[key].dtype == torch.float32:
                        data[key] = data[key].to(dtype)
                    data[key] = data[key].to(ctx)
            return data
        else:
            raise ValueError("non-dict type of batch is not supported yet.")

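
# A minimal sketch (not part of the original API) of how a concrete
# subclass plugs into `Predictor.predict_loop`: only `__call__` and
# `finalize` need overriding. The class below is illustrative only and
# assumes `config.eval.save_path` was set so `self.pred_dir` is valid.
class _ExampleArgmaxPredictor(Predictor):
    """Hypothetical predictor that saves argmax class ids from `logits`."""

    def __call__(self, outputs):
        # `predict_loop` initializes `self.full_scores` before the loop.
        self.full_scores.append(outputs["logits"].argmax(-1).cpu().numpy())

    def finalize(self, output_file=None):
        preds = np.concatenate(self.full_scores, axis=0)
        if output_file is not None:
            np.save(os.path.join(self.pred_dir, output_file + ".npy"), preds)
        return {"outputs": preds}
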

class NLGPredictor(Predictor):
    """Predicts text from MMFusion models.
    TODO: make a context.
    """

    def __init__(self, config):
        super().__init__(config)
        from transformers import AutoTokenizer

        self.tokenizer = AutoTokenizer.from_pretrained(
            config.dataset.bert_name,
            bos_token="[CLS]", eos_token="[SEP]")
        self.bos_token_id = self.tokenizer.bos_token_id
        self.eos_token_id = self.tokenizer.eos_token_id

    def predict_loop(self, model, eval_dataloader, output_file=None):
        """TODO: refactor base classes."""
        ctx = 0
        outputs = {"outputs": [], "targets": [[]]}
        model.eval()
        model = model.to(ctx)
        with torch.no_grad():
            for data in tqdm(eval_dataloader):
                data = self.to_ctx(data, ctx)
                self(data, model, outputs)
        return self.finalize(outputs, output_file)

    def __call__(self, data, model, outputs):
        data.update({
            "bos_token_id": self.bos_token_id,
            "eos_token_id": self.eos_token_id
        })

        output = model.generate(**data)
        assert len(output) == len(data["ref"])
        for idx, _output in enumerate(output):
            generated_text = self.tokenizer.decode(
                _output, skip_special_tokens=True)
            if generated_text == "":
                generated_text = "none"
            outputs["outputs"].append(generated_text)
            outputs["targets"][0].append(data["ref"][idx])
            if random.random() < 0.001:  # print ~0.1% of samples for spot checks.
                print("_output", _output)
                print("generated_text", generated_text)
                print("ref", data["ref"][idx])

    def finalize(self, outputs, output_file=None):
        if output_file is not None:
            with open(os.path.join(
                    self.pred_dir, output_file + ".json"), "w") as fw:
                json.dump(outputs, fw, indent=4)
        return outputs

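
# A hedged sketch of `NLGPredictor`'s decode-with-fallback step: special
# tokens are stripped and empty generations become the placeholder
# "none". `tokenizer` and `token_ids` are assumed inputs, not module state.
def _example_decode(tokenizer, token_ids):
    """Hypothetical demo mirroring the decode step in NLGPredictor.__call__."""
    text = tokenizer.decode(token_ids, skip_special_tokens=True)
    return text if text != "" else "none"
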

class RetrievalPredictor(Predictor):
    """Generates `pooled_video` and `pooled_text`."""

    def __init__(self, config):
        super().__init__(config)
        from transformers import AutoTokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            config.dataset.bert_name)

    def predict_loop(
        self,
        model,
        eval_dataloader,
        output_file="retrieval"
    ):
        """On-the-fly prediction on a single GPU."""
        full_scores = []
        texts = []
        model.eval()
        model = model.cuda()
        with torch.no_grad():
            for data in eval_dataloader:
                # tuple batches are mapped to the dict format used elsewhere.
                if not isinstance(data, dict):
                    data = {
                        "caps": data[0],
                        "cmasks": data[1],
                        "vfeats": data[2],
                        "vmasks": data[3],
                        "video_id": data[4]
                    }
                data = self.to_ctx(data)
                outputs = model(**data)
                outputs.update(data)
                self(outputs, full_scores)
                for _cap in data["caps"]:
                    texts.append(
                        self.tokenizer.decode(_cap, skip_special_tokens=True)
                    )

        return self.finalize(full_scores, texts, output_file)

    def __call__(self, sample, full_scores):
        scores = self._get_pooled_outputs(sample)
        self._append_scores(scores, full_scores)

    def finalize(self, full_scores, texts, output_file=None):
        outputs = self._aggregate_scores(full_scores)
        if output_file is not None:
            np.save(os.path.join(self.pred_dir, output_file + ".npy"), outputs)
        return {"outputs": outputs, "texts": texts}

    def _get_pooled_outputs(self, outputs):
        if "pooled_video" in outputs:
            return outputs["pooled_video"], outputs["pooled_text"]
        else:
            raise ValueError("unknown format of outputs.")

    def _append_scores(self, scores, full_scores):
        assert len(scores) == 2
        if len(full_scores) == 0:
            full_scores.append([])
            full_scores.append([])
        full_scores[0].append(scores[0].cpu().detach().numpy())
        full_scores[1].append(scores[1].cpu().detach().numpy())

    def _aggregate_scores(self, scores):
        assert len(scores) == 2
        video_hidden = np.concatenate(scores[0], axis=0)
        text_hidden = np.concatenate(scores[1], axis=0)

        self.full_scores = []
        # text-to-video similarity matrix: row = text, column = video.
        return np.matmul(text_hidden, video_hidden.T)

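
# A toy sketch (made-up shapes) of what `RetrievalPredictor._aggregate_scores`
# computes: per-batch pooled embeddings are concatenated, then a single
# text-to-video similarity matrix is formed whose (i, j) entry scores
# text i against video j.
def _example_retrieval_scores():
    """Hypothetical demo; returns a (4, 4) text-to-video score matrix."""
    rng = np.random.default_rng(0)
    # two batches of 2 clips each, embedding size 8.
    video_batches = [rng.standard_normal((2, 8)) for _ in range(2)]
    text_batches = [rng.standard_normal((2, 8)) for _ in range(2)]
    video_hidden = np.concatenate(video_batches, axis=0)  # (4, 8)
    text_hidden = np.concatenate(text_batches, axis=0)  # (4, 8)
    return np.matmul(text_hidden, video_hidden.T)  # (4, 4)
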

class QAPredictor(Predictor):
    """Multiple-choice QA over `pooled_video` and `pooled_text`."""

    def __init__(self, config):
        super().__init__(config)
        # the predictor maintains scores and aggregates them in `finalize`.

    def predict_loop(self, model, eval_dataloader, output_file="qa"):
        """On-the-fly prediction on a single GPU."""
        self.full_scores = []
        model.eval()
        model = model.cuda()
        with torch.no_grad():
            for data in eval_dataloader:
                # repeat the video tensors once per answer candidate (5-way).
                v_len = data["vfeats"].size(1)
                hidden_size = data["vfeats"].size(2)
                data["vfeats"] = data["vfeats"].unsqueeze(1).repeat(
                    1, 5, 1, 1).view(-1, v_len, hidden_size)
                data["vmasks"] = data["vmasks"].unsqueeze(1).repeat(
                    1, 5, 1).view(-1, v_len)

                # flatten the 5 candidate captions into the batch dim.
                t_len = data["caps"].size(-1)
                data["caps"] = data["caps"].view(-1, t_len)
                data["cmasks"] = data["cmasks"].view(-1, t_len)

                data = self.to_ctx(data)
                outputs = model(**data)
                outputs.update(data)
                self(outputs)
        return self.finalize(output_file)

    def __call__(self, sample):
        hidden_size = sample["pooled_video"].size(-1)
        pooled_video = sample["pooled_video"].view(-1, 5, hidden_size)
        pooled_text = sample["pooled_text"].view(-1, 5, hidden_size)
        scores = torch.bmm(pooled_video, pooled_text.transpose(2, 1))
        scores = scores.argmax(-1)
        self._append_scores(scores[:, 0], sample["answers"], self.full_scores)

    def finalize(self, output_file=None):
        outputs, targets = self._aggregate_scores(self.full_scores)
        if output_file is not None:
            np.save(os.path.join(self.pred_dir, output_file + ".npy"), outputs)
        return {"outputs": outputs, "targets": targets}

    def _append_scores(self, scores, answers, full_scores):
        if len(full_scores) == 0:
            full_scores.append([])
            full_scores.append([])
        full_scores[0].append(scores.cpu().detach().numpy())
        full_scores[1].append(answers.cpu().detach().numpy())

    def _aggregate_scores(self, scores):
        assert len(scores) == 2
        outputs = np.concatenate(scores[0], axis=0)
        targets = np.concatenate(scores[1], axis=0)

        self.full_scores = []
        return outputs, targets

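
# An illustrative sketch (made-up sizes) of the 5-way scoring in
# `QAPredictor.__call__`: pooled video/text are reshaped to
# (batch, 5, hidden), scored pairwise with `bmm`, and row 0 of the
# candidate-by-candidate argmax picks the predicted answer.
def _example_qa_argmax():
    """Hypothetical demo; returns per-sample predicted candidate ids."""
    batch, hidden = 2, 8
    pooled_video = torch.randn(batch * 5, hidden).view(-1, 5, hidden)
    pooled_text = torch.randn(batch * 5, hidden).view(-1, 5, hidden)
    scores = torch.bmm(pooled_video, pooled_text.transpose(2, 1))  # (batch, 5, 5)
    # row 0 pairs the original (unrepeated) video with all 5 captions.
    return scores.argmax(-1)[:, 0]  # (batch,)
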

class CrossTaskPredictor(Predictor):
    """
    CrossTaskPredictor averages the logits
    over overlapping sliding windows.
    """

    def __init__(self, config):
        super().__init__(config)
        self.lsm = torch.nn.LogSoftmax(dim=1)
        self.max_video_len = config.dataset.max_video_len
        self.sliding_window = config.dataset.sliding_window
        self.sliding_window_size = config.dataset.sliding_window_size
        self.annotation_path = config.dataset.annotation_path

    def predict_loop(self, model, eval_dataloader, output_file="result"):
        """refactored from line 144:
        https://github.com/DmZhukov/CrossTask/blob/master/train.py
        """
        ctx = 0
        model.eval()
        model = model.to(ctx)

        Y_pred = {}
        Y_true = {}
        with torch.no_grad():
            for batch in eval_dataloader:
                self(batch, model, Y_pred, Y_true)
        return self.finalize(Y_pred, Y_true, output_file)

    def __call__(self, sample, model, Y_pred, Y_true):
        # `dp` is the dynamic-programming alignment routine from the
        # CrossTask codebase referenced above.
        from dp import dp
        vid, task = sample['video_id'][0], sample['task'][0]
        sample = self.to_ctx(sample)

        output = model(**sample)
        batch_logits = output["logits"].cpu()

        video_len = sample["video_len"][0]

        # accumulate window logits and per-frame hit counts,
        # then average the overlaps.
        logits = torch.zeros((video_len, batch_logits.size(1)))
        logits_counts = torch.zeros((video_len, 1), dtype=torch.long)

        batch_logit_idx = 0
        for window_start in range(0, video_len, self.sliding_window):
            video_end = min(video_len - window_start, self.sliding_window_size)
            logits[window_start: window_start + video_end] += batch_logits[
                batch_logit_idx: batch_logit_idx + video_end]
            batch_logit_idx += video_end
            logits_counts[window_start: window_start + video_end] += torch.ones(
                (video_end, 1), dtype=torch.long)

            if (video_len - window_start) <= self.sliding_window_size:
                break

        logits /= logits_counts
        assert logits.size() == (video_len, batch_logits.size(1)), \
            "{}, {}".format(logits.size(), video_len)

        O = self.lsm(logits)
        y = np.zeros(O.size(), dtype=np.float32)
        dp(y, -O.detach().cpu().numpy())
        if task not in Y_pred:
            Y_pred[task] = {}
        Y_pred[task][vid] = y
        annot_path = os.path.join(
            self.annotation_path, task + '_' + vid + '.csv')
        if os.path.exists(annot_path):
            if task not in Y_true:
                Y_true[task] = {}
            Y_true[task][vid] = self._read_assignment(
                *y.shape, annot_path)

    def finalize(self, Y_pred, Y_true, output_file=None):
        if output_file is not None:
            with open(
                    os.path.join(self.pred_dir, output_file + ".pkl"),
                    "wb") as fw:
                pickle.dump(
                    {"Y_pred": Y_pred, "Y_true": Y_true}, fw,
                    protocol=pickle.HIGHEST_PROTOCOL)
        return {"outputs": Y_pred, "targets": Y_true}

    def _read_assignment(self, T, K, path):
        """
        Refactored from https://github.com/DmZhukov/CrossTask/blob/master/data.py
        How to interpret the constraints on the loss being minimized:
        `lambd` is a big number, so `self.lambd * C` is a big number
        for all valid positions (the csv stores the invalid ones):

        def forward(self, O, Y, C):
            return (Y*(self.lambd * C - self.lsm(O))).mean(dim=0).sum()

        This loads the csv file and fills in the step column
        from the start row to the end row.
        """
        Y = np.zeros([T, K], dtype=np.uint8)
        with open(path, 'r') as f:
            for line in f:
                step, start, end = line.strip().split(',')
                start = int(math.floor(float(start)))
                end = int(math.ceil(float(end)))
                step = int(step) - 1  # steps in the csv are 1-indexed.
                Y[start:end, step] = 1
        return Y

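
# A self-contained sketch (hypothetical sizes) of the overlap-averaging
# loop shared by `CrossTaskPredictor.__call__` and
# `COINPredictor._merge_windows`: window logits are summed into
# per-frame slots, then divided by how often each frame was covered.
# Example call: _example_window_average(torch.randn(10, 3), 8, 4, 6).
def _example_window_average(batch_logits, video_len, stride, window_size):
    """Hypothetical demo of averaging logits over overlapping windows."""
    logits = torch.zeros((video_len, batch_logits.size(1)))
    counts = torch.zeros((video_len, 1), dtype=torch.long)
    idx = 0
    for start in range(0, video_len, stride):
        end = min(video_len - start, window_size)
        logits[start: start + end] += batch_logits[idx: idx + end]
        idx += end
        counts[start: start + end] += 1
        if (video_len - start) <= window_size:
            break
    return logits / counts
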

class COINPredictor(Predictor):
    """
    COINPredictor is similar to CrossTaskPredictor
    in its handling of sliding windows.
    """

    def __init__(self, config):
        super().__init__(config)
        self.max_video_len = config.dataset.max_video_len
        self.sliding_window = config.dataset.sliding_window
        self.sliding_window_size = config.dataset.sliding_window_size

    def predict_loop(self, model, eval_dataloader, output_file="result"):
        """refactored from line 144:
        https://github.com/DmZhukov/CrossTask/blob/master/train.py
        """
        ctx = 0
        model.eval()
        model = model.to(ctx)

        Y_pred = []
        Y_true = []
        with torch.no_grad():
            for batch in eval_dataloader:
                self(batch, model, Y_pred, Y_true)
        return self.finalize(Y_pred, Y_true, output_file)

    def __call__(self, sample, model, Y_pred, Y_true):
        sample = self.to_ctx(sample)

        output = model(**sample)
        logits = self._merge_windows(sample, output)
        Y_pred.append(logits.argmax(dim=1))
        Y_true.append(sample["video_targets"].squeeze(0).cpu())

    def _merge_windows(self, sample, output):
        # drop padded positions (targets of -100) before merging.
        targets = sample["targets"].reshape(-1).cpu()
        valid_mask = targets != -100
        targets = targets[valid_mask]
        batch_logits = output["logits"].cpu()
        batch_logits = batch_logits.reshape(-1, batch_logits.size(-1))
        batch_logits = batch_logits[valid_mask]

        video_len = sample["video_len"][0]

        # average logits over overlapping sliding windows.
        logits = torch.zeros((video_len, batch_logits.size(1)))
        logits_counts = torch.zeros((video_len, 1), dtype=torch.long)

        batch_logit_idx = 0
        for window_start in range(0, video_len, self.sliding_window):
            video_end = min(video_len - window_start, self.sliding_window_size)
            logits[window_start: window_start + video_end] += batch_logits[
                batch_logit_idx: batch_logit_idx + video_end]
            batch_logit_idx += video_end
            logits_counts[window_start: window_start + video_end] += torch.ones(
                (video_end, 1), dtype=torch.long)
            if (video_len - window_start) <= self.sliding_window_size:
                break
        logits /= logits_counts
        assert logits.size() == (video_len, batch_logits.size(1)), \
            "{}, {}".format(logits.size(), video_len)
        return logits

    def finalize(self, Y_pred, Y_true, output_file=None):
        Y_pred = torch.cat(Y_pred, dim=0).numpy()
        Y_true = torch.cat(Y_true, dim=0).numpy()
        assert len(Y_pred) == len(Y_true)

        # print a few mismatched (pred, true) pairs for spot checks.
        error_mask = Y_pred != Y_true
        print("sample error", Y_pred[error_mask][:10], Y_true[error_mask][:10])
        print("sample error", Y_pred[error_mask][10:20], Y_true[error_mask][10:20])

        if output_file is not None:
            with open(
                    os.path.join(self.pred_dir, output_file + ".pkl"),
                    "wb") as fw:
                pickle.dump(
                    {"Y_pred": Y_pred, "Y_true": Y_true}, fw,
                    protocol=pickle.HIGHEST_PROTOCOL)
        return {"outputs": Y_pred, "targets": Y_true}


class COINZSPredictor(COINPredictor):
    """
    COINZSPredictor for COIN zero-shot prediction.
    """

    def __init__(self, config):
        super().__init__(config)
        self.dataset_config = config.dataset

    def predict_loop(self, model, eval_dataloader, output_file="result"):
        """refactored from line 144:
        https://github.com/DmZhukov/CrossTask/blob/master/train.py
        """
        ctx = 0
        model.eval()
        model = model.to(ctx)

        with torch.no_grad():
            # encode all label texts once; the rejection threshold `lbd`
            # is the largest off-diagonal label-to-label similarity.
            outputs = eval_dataloader.dataset.meta_processor.meta_text_labels(
                self.dataset_config)
            outputs = self.to_ctx(outputs, ctx)
            label_hidden_states = model.forward_text(**outputs).cpu()
            label_sim = label_hidden_states @ label_hidden_states.t()
            num_labels = label_sim.size(0)
            eye_mask = ~torch.eye(num_labels, dtype=torch.bool)
            label_sim = label_sim.masked_select(eye_mask).view(
                num_labels, num_labels - 1)
            lbd = label_sim.max()

        Y_pred = []
        Y_true = []
        with torch.no_grad():
            for batch in eval_dataloader:
                self(batch, label_hidden_states, model, lbd, Y_pred, Y_true)
        return self.finalize(Y_pred, Y_true, output_file)

    def reshape_subsample(self, sample):
        for key in sample:
            if torch.is_tensor(sample[key]):
                sample[key] = self.flat_subsample(sample[key])
        return sample

    def flat_subsample(self, tensor):
        if len(tensor.size()) > 1 and tensor.size(0) == 1:
            tensor = tensor.squeeze(0)
        return tensor

    def __call__(self, sample, label_hidden_states, model, lbd, Y_pred, Y_true):
        sample = self.reshape_subsample(sample)
        sample = self.to_ctx(sample)

        sample["output_hidden_states"] = True
        video_outputs = model.forward_video(**sample).cpu()
        # score each video position against all label embeddings,
        # skipping the leading special-token hidden state.
        output = {
            "logits": video_outputs[
                :, 1:sample["vmasks"].size(1) + 1] @ label_hidden_states.t()
        }
        logits = self._merge_windows(sample, output)
        # label ids are shifted by one so that 0 means "no confident label".
        logits_argmax = logits.argmax(dim=1) + 1
        logits_max = logits.max(dim=1)[0]

        pred = torch.zeros_like(logits_argmax)
        label_select = logits_max > lbd
        pred[label_select] = logits_argmax[label_select]

        Y_pred.append(pred)
        Y_true.append(sample["video_targets"].squeeze(0).cpu())

    def finalize(self, Y_pred, Y_true, output_file=None):
        Y_pred = torch.cat(Y_pred, dim=0).numpy()
        Y_true = torch.cat(Y_true, dim=0).numpy()
        assert len(Y_pred) == len(Y_true)

        # print a few mismatched (pred, true) pairs for spot checks.
        error_mask = Y_pred != Y_true
        print("sample error", Y_pred[error_mask][:10], Y_true[error_mask][:10])
        print("sample error", Y_pred[error_mask][10:20], Y_true[error_mask][10:20])

        if output_file is not None:
            with open(
                    os.path.join(self.pred_dir, output_file + ".pkl"),
                    "wb") as fw:
                pickle.dump(
                    {"Y_pred": Y_pred, "Y_true": Y_true}, fw,
                    protocol=pickle.HIGHEST_PROTOCOL)
        return {"outputs": Y_pred, "targets": Y_true}

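
# A small sketch (toy numbers) of the zero-shot rejection rule used by
# `COINZSPredictor`: the threshold is the maximum off-diagonal
# label-to-label similarity, and frames scoring below it fall back to
# 0 ("no label"). All sizes below are made up.
def _example_zs_threshold():
    """Hypothetical demo; returns per-frame predictions with rejection."""
    label_hidden = torch.nn.functional.normalize(torch.randn(4, 8), dim=-1)
    label_sim = label_hidden @ label_hidden.t()
    eye_mask = ~torch.eye(4, dtype=torch.bool)
    lbd = label_sim.masked_select(eye_mask).view(4, 3).max()

    frame_logits = torch.randn(6, 4)  # 6 frames scored against 4 labels.
    pred = torch.zeros(6, dtype=torch.long)
    confident = frame_logits.max(dim=1)[0] > lbd
    pred[confident] = frame_logits.argmax(dim=1)[confident] + 1
    return pred
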

class DiDeMoPredictor(Predictor):
    """reference: https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/eval.py
    https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/data_processing.py
    """

    def __init__(self, config):
        super().__init__(config)
        # load the ground-truth targets for the evaluator.
        with open(config.dataset.test_path) as data_file:
            self.test_data = json.load(data_file)

    def predict_loop(self, model, eval_dataloader, output_file="didemo"):
        """
        TODO: two solutions here.
        """
        import itertools
        # candidate moments: 6 single segments plus all 15 pairs of
        # distinct segment indices (21 in total).
        self.possible_segments = [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)]
        for i in itertools.combinations(range(6), 2):
            self.possible_segments.append(i)

        # on-the-fly prediction on a single gpu.
        self.full_scores = []
        model.eval()
        model = model.cuda()
        with torch.no_grad():
            for data in eval_dataloader:
                data = self.to_ctx(data)
                data["output_hidden_states"] = True
                hidden_video = model.forward_video(**data)
                data["output_hidden_states"] = False
                pooled_text = model.forward_text(**data)
                outputs = {
                    "hidden_video": hidden_video,
                    "pooled_text": pooled_text
                }
                outputs.update(data)
                self(outputs)
        return self.finalize(output_file)

    def __call__(self, sample):
        hidden_video = sample["hidden_video"]
        pooled_text = sample["pooled_text"]
        vmasks = sample["vmasks"]

        # drop the first and last hidden states (special tokens).
        hidden_video = hidden_video[:, 1:-1, :]
        # mean-pool the 5 video features covered by each candidate segment.
        pooled_video = []
        for s, e in self.possible_segments:
            pooled_video.append(
                torch.mean(
                    hidden_video[:, int(s * 5):int((e + 1) * 5), :],
                    dim=1, keepdim=True)
            )
        pooled_video = torch.cat(pooled_video, dim=1)
        scores = torch.bmm(
            pooled_video, pooled_text.unsqueeze(-1)).squeeze(-1).cpu()

        ranks = scores.argsort(dim=-1, descending=True)

        for batch_idx, rank in enumerate(ranks):
            rank_of_moment = []
            for moment in rank:
                s, e = self.possible_segments[moment.item()]
                # keep only moments that overlap unmasked video features.
                if torch.any(
                        vmasks[batch_idx, int(s * 5):int((e + 1) * 5)]):
                    rank_of_moment.append((s, e))
            self.full_scores.append(rank_of_moment)

    def finalize(self, output_file=None):
        outputs = self._aggregate_scores(self.full_scores)
        if output_file is not None:
            np.save(os.path.join(self.pred_dir, output_file + ".npy"), outputs)
        return {"outputs": outputs, "targets": self.test_data}

    def _aggregate_scores(self, scores):
        self.full_scores = []
        return scores

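
# An illustrative sketch of the moment enumeration in
# `DiDeMoPredictor.predict_loop`: a DiDeMo video has 6 segments, and a
# candidate moment is any single segment or pair of distinct segments,
# 6 + 15 = 21 in total.
def _example_didemo_segments():
    """Hypothetical demo; returns the 21 (start, end) candidate moments."""
    import itertools
    segments = [(i, i) for i in range(6)]  # 6 single-segment moments
    segments.extend(itertools.combinations(range(6), 2))  # 15 pairs
    assert len(segments) == 21
    return segments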