#5114 TaskCfgSTT(uuid='3a183d1184', name='E:/QMDownload/[email protected]', dirname='E:/QMDownload', noextname='www.98T

2408:823c* Posted at: 2 hours ago

语音识别阶段出错[Huggingface_ASR] The elements of the batch contain different keys. Cannot batch them ({'is_last', 'attention_mask', 'input_features'} != {'is_last', 'num_frames', 'attention_mask', 'input_features'}):Traceback (most recent call last):
File "videotrans\process\stt_fun.py", line 425, in pipe_asr
File "transformers\pipelines\pt_utils.py", line 126, in next

item = next(self.iterator)

File "transformers\pipelines\pt_utils.py", line 271, in next

processed = self.infer(next(self.iterator), **self.params)

File "torch\utils\data\dataloader.py", line 733, in next

data = self._next_data()

File "torch\utils\data\dataloader.py", line 789, in _next_data

data = self._dataset_fetcher.fetch(index)  # may raise StopIteration

File "torch\utils\data\_utils\fetch.py", line 43, in fetch

return self.collate_fn(data)

File "transformers\pipelines\base.py", line 182, in inner

raise ValueError(

ValueError: The elements of the batch contain different keys. Cannot batch them ({'is_last', 'attention_mask', 'input_features'} != {'is_last', 'num_frames', 'attention_mask', 'input_features'})

Traceback (most recent call last):

File "videotrans\task\job.py", line 35, in run

File "videotrans\task\job.py", line 100, in process_task

File "videotrans\task\speech2text.py", line 126, in recogn

File "videotrans\recognition\__init__.py", line 190, in run

File "videotrans\recognition\_base.py", line 94, in run

File "videotrans\recognition\_huggingface.py", line 42, in _exec

File "videotrans\recognition\_huggingface.py", line 66, in _pipe_asr

File "videotrans\configure\base.py", line 253, in _new_process

videotrans.configure.excepts.VideoTransError: The elements of the batch contain different keys. Cannot batch them ({'is_last', 'attention_mask', 'input_features'} != {'is_last', 'num_frames', 'attention_mask', 'input_features'}):Traceback (most recent call last):
File "videotrans\process\stt_fun.py", line 425, in pipe_asr
File "transformers\pipelines\pt_utils.py", line 126, in next

item = next(self.iterator)

File "transformers\pipelines\pt_utils.py", line 271, in next

processed = self.infer(next(self.iterator), **self.params)

File "torch\utils\data\dataloader.py", line 733, in next

data = self._next_data()

File "torch\utils\data\dataloader.py", line 789, in _next_data

data = self._dataset_fetcher.fetch(index)  # may raise StopIteration

File "torch\utils\data\_utils\fetch.py", line 43, in fetch

return self.collate_fn(data)

File "transformers\pipelines\base.py", line 182, in inner

raise ValueError(

ValueError: The elements of the batch contain different keys. Cannot batch them ({'is_last', 'attention_mask', 'input_features'} != {'is_last', 'num_frames', 'attention_mask', 'input_features'})
TaskCfgSTT(uuid='3a183d1184', name='E:/QMDownload/www.98T[email protected]4', dirname='E:/QMDownload', noextname='www.98T.la@fc4852372', basename='www.98T[email protected]4', ext='mp4', target_dir='H:/BaiduNetdiskDownload/win-pyvideotrans-v4.01-0602/output/recogn', cache_folder='H:/BaiduNetdiskDownload/win-pyvideotrans-v4.01-0602/tmp/6652/3a183d1184', is_cuda=True, source_language=None, source_language_code=None, source_sub=None, source_wav=None, source_wav_output=None, target_language=None, target_language_code=None, target_sub='H:/BaiduNetdiskDownload/win-pyvideotrans-v4.01-0602/output/recogn/www.98T[email protected]', target_wav=None, target_wav_output=None, detect_language='ja', recogn_type=4, model_name='kotoba-tech/kotoba-whisper-v2.0', shibie_audio='H:/BaiduNetdiskDownload/win-pyvideotrans-v4.01-0602/tmp/6652/3a183d1184/[email protected]', remove_noise=False, enable_diariz=False, nums_diariz=0, rephrase=0, fix_punc=False)


=
system:Windows-10-10.0.22631-SP0
version:v4.01
frozen:True
language:zh
root_dir:H:/BaiduNetdiskDownload/win-pyvideotrans-v4.01-0602

1 Replies

该错误是因为 Huggingface_ASR 渠道在处理音频时,某些切片缺少 num_frames 字段,导致批次内数据键不一致,无法进行批量推理。常见于 kotoba-whisper 等模型与默认 VAD 切分方式不兼容。

解决方法(按优先级尝试):

  1. 在高级选项中调整 VAD 参数

    • 菜单 → 工具 → 高级选项 → 语音识别参数
    • 最长语音持续秒数 设为 6(默认可能过大)
    • 静音分割持续毫秒 设为 300~500(确保片段长度稳定)
    • 取消勾选 Whisper预分割音频?(避免产生额外字段变动)
  2. 更换语音识别渠道

    • 对于日语,推荐改用 faster-whisper(本地) + large-v3 模型,兼容性更好且准确率相近。
    • 或在 Huggingface_ASR 渠道中选择其他模型(如 parakeet-ctc for English)。
  3. 降低批次大小(若源码部署)

    • 若你使用源码部署,可尝试在 videotrans/recognition/_huggingface.py 中将 pipeline 的 batch_size 设置为 1(避免批次合并问题)。
  4. 清理缓存后重试

    • 主界面勾选 清理已生成,强制重新处理音频切分。

请查阅相关文档:

Post Your Reply
Maintaining a free, open-source project is not easy. If this project is helpful to you, please consider making a small donation to help keep it maintained and updated. Donate: ko-fi.com/jianchang512

Related resource