#4406 TaskCfgVTT(is_cuda=True, uuid='d9d918925d', cache_folder='D:/win-pyv/tmp/32780/d9d918925d', target_dir='G:/camp/31-Color

103.3* Posted at: 2 hours ago 👁9

语音识别阶段出错 [openai-whisper(本地)] Traceback (most recent call last):
File "videotrans\process\stt_fun.py", line 109, in openai_whisper
File "whisper\transcribe.py", line 295, in transcribe
File "whisper\transcribe.py", line 201, in decode_with_fallback
File "torch\utils\_contextlib.py", line 116, in decorate_context

return func(*args, **kwargs)

File "whisper\decoding.py", line 824, in decode
File "torch\utils\_contextlib.py", line 116, in decorate_context

return func(*args, **kwargs)

File "whisper\decoding.py", line 737, in run
File "whisper\decoding.py", line 703, in _main_loop
File "whisper\decoding.py", line 283, in update
File "torch\distributions\categorical.py", line 73, in __init__

super().__init__(batch_shape, validate_args=validate_args)

File "torch\distributions\distribution.py", line 72, in __init__

raise ValueError(

ValueError: Expected parameter logits (Tensor of shape (1, 51866)) of distribution Categorical(logits: torch.S
......
on\_base.py", line 143, in run
File "videotrans\recognition\_overall.py", line 31, in _exec
File "videotrans\recognition\_overall.py", line 73, in _openai
File "videotrans\configure\_base.py", line 289, in _new_process
RuntimeError: Traceback (most recent call last):
File "videotrans\process\stt_fun.py", line 109, in openai_whisper
File "whisper\transcribe.py", line 295, in transcribe
File "whisper\transcribe.py", line 201, in decode_with_fallback
File "torch\utils\_contextlib.py", line 116, in decorate_context

return func(*args, **kwargs)

File "whisper\decoding.py", line 824, in decode
File "torch\utils\_contextlib.py", line 116, in decorate_context

return func(*args, **kwargs)

File "whisper\decoding.py", line 737, in run
File "whisper\decoding.py", line 703, in _main_loop
File "whisper\decoding.py", line 283, in update
File "torch\distributions\categorical.py", line 73, in __init__

super().__init__(batch_shape, validate_args=validate_args)

File "torch\distributions\distribution.py", line 72, in __init__

raise ValueError(

ValueError: Expected parameter logits (Tensor of shape (1, 51866)) of distribution Categorical(logits: torch.Size([1, 51866])) to satisfy the constraint IndependentConstraint(Real(), 1), but found invalid values:
tensor([[nan, nan, nan, ..., nan, nan, nan]], device='cuda:0')
TaskCfgVTT(is_cuda=True, uuid='d9d918925d', cache_folder='D:/win-pyv/tmp/32780/d9d918925d', target_dir='G:/camp/31-Color_Grading_Workflow_Techniques_LUTs_and_More/_video_out/231-How_to_Apply_LUTs_to_Nodes_and_Clips_The_Key_Tab_and_Node_Opacity-mkv', source_language='英语', source_language_code='en', source_sub='G:/camp/31-Color_Grading_Workflow_Techniques_LUTs_and_More/_video_out/231-How_to_Apply_LUTs_to_Nodes_and_Clips_The_Key_Tab_and_Node_Opacity-mkv/en.srt', source_wav='D:/win-pyv/tmp/32780/d9d918925d/en.wav', source_wav_output='G:/camp/31-Color_Grading_Workflow_Techniques_LUTs_and_More/_video_out/231-How_to_Apply_LUTs_to_Nodes_and_Clips_The_Key_Tab_and_Node_Opacity-mkv/en.m4a', target_language='简体中文', target_language_code='zh-cn', target_sub='G:/camp/31-Color_Grading_Workflow_Techniques_LUTs_and_More/_video_out/231-How_to_Apply_LUTs_to_Nodes_and_Clips_The_Key_Tab_and_Node_Opacity-mkv/zh-cn.srt', target_wav='D:/win-pyv/tmp/32780/d9d918925d/target.wav', target_wav_output='G:/camp/31-Color_Grading_Workflow_Techniques_LUTs_and_More/_video_out/231-How_to_Apply_LUTs_to_Nodes_and_Clips_The_Key_Tab_and_Node_Opacity-mkv/zh-cn.m4a', name='G:/camp/31-Color_Grading_Workflow_Techniques_LUTs_and_More/231-How_to_Apply_LUTs_to_Nodes_and_Clips_The_Key_Tab_and_Node_Opacity.mkv', noextname='231-How_to_Apply_LUTs_to_Nodes_and_Clips_The_Key_Tab_and_Node_Opacity', basename='231-How_to_Apply_LUTs_to_Nodes_and_Clips_The_Key_Tab_and_Node_Opacity.mkv', ext='mkv', dirname='G:/camp/31-Color_Grading_Workflow_Techniques_LUTs_and_More', shound_del_name=None, translate_type=13, tts_type=0, volume='+0%', pitch='+0Hz', voice_rate='+0%', voice_role='No', voice_autorate=True, video_autorate=False, remove_silent_mid=False, align_sub_audio=True, detect_language='en', recogn_type=1, model_name='large-v3-turbo', shibie_audio=None, remove_noise=False, enable_diariz=False, nums_diariz=0, rephrase=0, fix_punc=False, subtitle_language=None, app_mode='biaozhun', subtitles='', 
targetdir_mp4='G:/camp/31-Color_Grading_Workflow_Techniques_LUTs_and_More/_video_out/231-How_to_Apply_LUTs_to_Nodes_and_Clips_The_Key_Tab_and_Node_Opacity-mkv/231-How_to_Apply_LUTs_to_Nodes_and_Clips_The_Key_Tab_and_Node_Opacity.mp4', novoice_mp4='D:/win-pyv/tmp/32780/d9d918925d/novoice.mp4', is_separate=False, embed_bgm=True, instrument=None, vocal='D:/win-pyv/tmp/32780/d9d918925d/vocal.wav', back_audio='', clear_cache=False, background_music=None, subtitle_type=3, only_out_mp4=True, recogn2pass=True, output_srt=2, copysrt_rawvideo=False)


=
system:Windows-10-10.0.26200-SP0
version:v3.99
frozen:True
language:zh
root_dir:D:/win-pyv

1 Replies

错误原因是 openai-whisper 在 GPU (CUDA) 解码时出现了 NaN 值,通常由显存不足或相关设置过高引起。日志中显示 recogn2pass=True(即启用了“二次识别”),这会额外消耗大量显存,导致当前 CUDA 计算崩溃。

请按以下步骤操作(按顺序尝试):

  1. 关闭二次识别:在主界面右上角,取消勾选 二次识别 选项,然后重新运行任务。
  2. 使用更小的模型:将语音识别模型从 large-v3-turbo 更换为 medium、small 或 base,大幅降低显存占用。
  3. 调整高级选项中的 CUDA 参数:进入 菜单 → 工具/选项 → 高级选项,找到语音识别参数区域,进行以下修改以牺牲少量精度换取更低的显存需求:

    • CUDA数据类型: 改为 int8 或 float16
    • beam_size: 改为 1
    • best_of: 改为 1
    • 启用上下文感知: 设为 false
  4. 检查显卡驱动与 CUDA 环境:确保已安装 CUDA 12.8 和 cuDNN 9.11,显卡驱动为最新版本(NVIDIA 卡)。如果仍无效,可暂时不勾选 CUDA 加速,改用 CPU 识别;尽管速度会下降,但可避免显存不足导致的 NaN 错误。

请查阅相关文档:

Post Your Reply
Open source and free maintenance is not easy. If this project is helpful to you, please consider making a small donation to help the project continue to maintain and update.

Related resource