#5115 TaskCfgSTT(uuid='01783ad87e', name='E:/QMDownload/[email protected]', dirname='E:/QMDownload', noextname='www.9

2408:823c* Posted at: 2 hours ago

语音识别阶段出错[Huggingface_ASR] The elements of the batch contain different keys. Cannot batch them ({'attention_mask', 'num_frames', 'input_features', 'is_last'} != {'attention_mask', 'input_features', 'is_last'}):Traceback (most recent call last):
File "videotrans\process\stt_fun.py", line 425, in pipe_asr
File "transformers\pipelines\pt_utils.py", line 126, in next

item = next(self.iterator)

File "transformers\pipelines\pt_utils.py", line 271, in next

processed = self.infer(next(self.iterator), **self.params)

File "torch\utils\data\dataloader.py", line 733, in next

data = self._next_data()

File "torch\utils\data\dataloader.py", line 789, in _next_data

data = self._dataset_fetcher.fetch(index)  # may raise StopIteration

File "torch\utils\data\_utils\fetch.py", line 43, in fetch

return self.collate_fn(data)

File "transformers\pipelines\base.py", line 182, in inner

raise ValueError(

ValueError: The elements of the batch contain different keys. Cannot batch them ({'attention_mask', 'num_frames', 'input_features', 'is_last'} != {'attention_mask', 'input_features', 'is_last'})

Traceback (most recent call last):

File "videotrans\task\job.py", line 35, in run

File "videotrans\task\job.py", line 100, in process_task

File "videotrans\task\speech2text.py", line 126, in recogn

File "videotrans\recognition\__init__.py", line 190, in run

File "videotrans\recognition\_base.py", line 94, in run

File "videotrans\recognition\_huggingface.py", line 42, in _exec

File "videotrans\recognition\_huggingface.py", line 66, in _pipe_asr

File "videotrans\configure\base.py", line 253, in _new_process

videotrans.configure.excepts.VideoTransError: The elements of the batch contain different keys. Cannot batch them ({'attention_mask', 'num_frames', 'input_features', 'is_last'} != {'attention_mask', 'input_features', 'is_last'}):Traceback (most recent call last):
File "videotrans\process\stt_fun.py", line 425, in pipe_asr
File "transformers\pipelines\pt_utils.py", line 126, in next

item = next(self.iterator)

File "transformers\pipelines\pt_utils.py", line 271, in next

processed = self.infer(next(self.iterator), **self.params)

File "torch\utils\data\dataloader.py", line 733, in next

data = self._next_data()

File "torch\utils\data\dataloader.py", line 789, in _next_data

data = self._dataset_fetcher.fetch(index)  # may raise StopIteration

File "torch\utils\data\_utils\fetch.py", line 43, in fetch

return self.collate_fn(data)

File "transformers\pipelines\base.py", line 182, in inner

raise ValueError(

ValueError: The elements of the batch contain different keys. Cannot batch them ({'attention_mask', 'num_frames', 'input_features', 'is_last'} != {'attention_mask', 'input_features', 'is_last'})
TaskCfgSTT(uuid='01783ad87e', name='E:/QMDownload/www.98T[email protected]4', dirname='E:/QMDownload', noextname='www.98T.la@fc2-4673073', basename='www.98T[email protected]4', ext='mp4', target_dir='H:/BaiduNetdiskDownload/win-pyvideotrans-v4.01-0602/output/recogn', cache_folder='H:/BaiduNetdiskDownload/win-pyvideotrans-v4.01-0602/tmp/6652/01783ad87e', is_cuda=True, source_language=None, source_language_code=None, source_sub=None, source_wav=None, source_wav_output=None, target_language=None, target_language_code=None, target_sub='H:/BaiduNetdiskDownload/win-pyvideotrans-v4.01-0602/output/recogn/www.98T[email protected]', target_wav=None, target_wav_output=None, detect_language='ja', recogn_type=4, model_name='kotoba-tech/kotoba-whisper-v2.0', shibie_audio='H:/BaiduNetdiskDownload/win-pyvideotrans-v4.01-0602/tmp/6652/01783ad87e/[email protected]', remove_noise=False, enable_diariz=False, nums_diariz=0, rephrase=0, fix_punc=False)


=
system:Windows-10-10.0.22631-SP0
version:v4.01
frozen:True
language:zh
root_dir:H:/BaiduNetdiskDownload/win-pyvideotrans-v4.01-0602

1 Reply

Huggingface_ASR 渠道的 kotoba-whisper-v2.0 模型在处理日语音频时,因批处理(batch)中各片段的特征字典键不一致(部分包含 num_frames 字段)而无法合并,这是该模型输入预处理的一个已知兼容性问题。

解决方案:

  1. 切换为更稳定的语音识别渠道:推荐使用 faster-whisper(本地) 并选择 large-v3 模型(请勿选择带 .en 后缀的模型,这类模型仅支持英语、不支持日语),该渠道对日语支持完善且无上述批处理冲突。
  2. 改用其他 HuggingFace ASR 模型:若坚持使用 HuggingFace 渠道,可尝试换用 parakeet-ctc 模型。注意该模型仅支持英语,无法正确识别日语,此步骤仅用于排查问题是否出在 kotoba-whisper 模型本身。
  3. 降低音频复杂度:确保音频文件无过长静音、背景噪声,可先试听确认内容正常。若文件较长,可裁剪为短片段测试。

如果以上无效,请提供 logs/ 文件夹中按日期命名的日志文件底部约 30 行内容,以便进一步分析。

请查阅相关文档:

Post Your Reply
Maintaining a free, open-source project is not easy. If this project is helpful to you, please consider making a small donation to help keep it maintained and updated. Donate: ko-fi.com/jianchang512

Related resource