#3733 TaskCfgVTT(is_cuda=True, uuid='354aaa2875', cache_folder='E:/APP/win-pyvideotrans-v3.98-317/tmp/26156/354aaa2875', targe

2408:8262* Posted at: 3 hours ago 👁9

语音识别阶段出错 [Huggingface_ASR] Traceback (most recent call last):
File "videotrans\process\stt_fun.py", line 585, in pipe_asr
File "transformers\pipelines\pt_utils.py", line 126, in next

item = next(self.iterator)

File "transformers\pipelines\pt_utils.py", line 271, in next

processed = self.infer(next(self.iterator), **self.params)

File "torch\utils\data\dataloader.py", line 733, in next

data = self._next_data()

File "torch\utils\data\dataloader.py", line 789, in _next_data

data = self._dataset_fetcher.fetch(index)  # may raise StopIteration

File "torch\utils\data\_utils\fetch.py", line 43, in fetch

return self.collate_fn(data)

File "transformers\pipelines\base.py", line 182, in inner

raise ValueError(

ValueError: The elements of the batch contain different keys. Cannot batch them ({'attention_mask', 'is_last', 'num_frames', 'input_features'} != {'attention_mask', 'is_last', 'input_features'})

Traceback (most recent call last):
File "videotrans\task\job.py", line 105, in run
File "videotrans\task\trans_create.py", line 360, in recogn
File "videotrans\recognition\__init__.py", line 263, in run
File "videotrans\recognition\_base.py", line 143, in run
File "videotrans\recognition\_huggingface.py", line 38, in _exec
File "videotrans\recognition\_huggingface.py", line 63, in _pipe_asr
File "videotrans\configure\_base.py", line 289, in _new_process
RuntimeError: Traceback (most recent call last):
File "videotrans\process\stt_fun.py", line 585, in pipe_asr
File "transformers\pipelines\pt_utils.py", line 126, in next

item = next(self.iterator)

File "transformers\pipelines\pt_utils.py", line 271, in next

processed = self.infer(next(self.iterator), **self.params)

File "torch\utils\data\dataloader.py", line 733, in next

data = self._next_data()

File "torch\utils\data\dataloader.py", line 789, in _next_data

data = self._dataset_fetcher.fetch(index)  # may raise StopIteration

File "torch\utils\data\_utils\fetch.py", line 43, in fetch

return self.collate_fn(data)

File "transformers\pipelines\base.py", line 182, in inner

raise ValueError(

ValueError: The elements of the batch contain different keys. Cannot batch them ({'attention_mask', 'is_last', 'num_frames', 'input_features'} != {'attention_mask', 'is_last', 'input_features'})
TaskCfgVTT(is_cuda=True, uuid='354aaa2875', cache_folder='E:/APP/win-pyvideotrans-v3.98-317/tmp/26156/354aaa2875', target_dir='F:/影音/4 Temp/新建文件夹/av/_video_out/489155.com@NHDTC-155-mp4', source_language='日语', source_language_code='ja', source_sub='F:/影音/4 Temp/新建文件夹/av/_video_out/489155.com@NHDTC-155-mp4/ja.srt', source_wav='E:/APP/win-pyvideotrans-v3.98-317/tmp/26156/354aaa2875/ja.wav', source_wav_output='F:/影音/4 Temp/新建文件夹/av/_video_out/489155.com@NHDTC-155-mp4/ja.m4a', target_language='简体中文', target_language_code='zh-cn', target_sub='F:/影音/4 Temp/新建文件夹/av/_video_out/489155.com@NHDTC-155-mp4/zh-cn.srt', target_wav='E:/APP/win-pyvideotrans-v3.98-317/tmp/26156/354aaa2875/target.wav', target_wav_output='F:/影音/4 Temp/新建文件夹/av/_video_out/489155.com@NHDTC-155-mp4/zh-cn.m4a', name='F:/影音/4 Temp/新建文件夹/av/489155.com@NHDTC-155.mp4', noextname='489155.com@NHDTC-155', basename='489155.com@NHDTC-155.mp4', ext='mp4', dirname='F:/影音/4 Temp/新建文件夹/av', shound_del_name=None, translate_type=8, tts_type=0, volume='+0%', pitch='+0Hz', voice_rate='+0%', voice_role='No', voice_autorate=False, video_autorate=False, remove_silent_mid=False, align_sub_audio=True, detect_language='ja', recogn_type=4, model_name='kotoba-tech/kotoba-whisper-v2.0', shibie_audio=None, remove_noise=False, enable_diariz=False, nums_diariz=0, rephrase=0, fix_punc=False, subtitle_language=None, app_mode='tiqu', subtitles='', targetdir_mp4='F:/影音/4 Temp/新建文件夹/av/_video_out/489155.com@NHDTC-155-mp4/489155.com@NHDTC-155.mp4', novoice_mp4='E:/APP/win-pyvideotrans-v3.98-317/tmp/26156/354aaa2875/novoice.mp4', is_separate=False, instrument=None, vocal=None, back_audio='', clear_cache=True, background_music=None, subtitle_type=0, only_out_mp4=False, recogn2pass=True, output_srt=0, copysrt_rawvideo=True)


=
system:Windows-10-10.0.26200-SP0
version:v3.98
frozen:True
language:zh
root_dir:E:/APP/win-pyvideotrans-v3.98-317

1 Replies

这是由于 HuggingFace ASR 渠道在处理音频批次时,内部数据键不一致导致的错误:同一批次中有的音频片段包含 num_frames 键,有的则没有,因此无法合并成一个批次。

解决方案:

  1. 更换语音识别渠道或模型:这是最直接的解决方法。对于日语视频,建议采用以下方案之一:

    • 免费/本地:在“语音识别”下拉菜单中,选择 Huggingface_ASR,但将模型从 kotoba-tech/kotoba-whisper-v2.0 更换为 reazon-research/japanese-wav2vec2-large-rs35kh
    • 收费/在线:如果追求最佳效果,可选择 OpenAI语音识别API 或 Gemini大模型识别
  2. 检查并调整高级设置:如果仍需使用 kotoba-whisper 模型,请尝试在 菜单 -> 工具 -> 高级选项 -> 语音识别参数 中,将 最长语音持续秒数 调整为一个更小的值(例如 5 或 3),以改变音频切分方式,可能规避此内部错误。
  3. 清理缓存并重试:在主界面左上角勾选 清理已生成,然后重新处理视频,确保使用全新的处理流程。

请注意:此错误与特定模型或音频处理批次有关,更换模型或调整参数是主要解决方向。

请查阅相关文档:

Post Your Reply
Maintaining a free, open-source project is not easy. If this project is helpful to you, please consider making a small donation to help keep it maintained and updated.

Related resource