inputs = feature_extractor(
    ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], pad_end=True, return_tensors="pt"
)
with torch.no_grad():
    audio = model(**inputs)
Remove the extra padding at the end of the output.