#4861 TaskCfgVTT(uuid='079bb2b37a', name='C:/Users/juanc/Videos/aTubeCatcher/Ojas_5-mp4/Ojas_5.en.mp4', dirname='C:/Users/juan

2800:320* Posted at: 11 hours ago

Argument #4: Padding size should be less than the corresponding input dimension, but got: padding (3, 3) at dimension 2 of input [1, 64, 3]
Traceback (most recent call last):
File "videotrans\process\tts_fun.py", line 116, in qwen3tts_fun
File "torch\utils\_contextlib.py", line 116, in decorate_context

return func(*args, **kwargs)

File "C:\win-pyvideotrans\win-pyvideotrans-v4.00-528\_internal\qwen_tts\inference\qwen3_tts_model.py", line 568, in generate_voice_clone

prompt_items = self.create_voice_clone_prompt(ref_audio=ref_audio, ref_text=ref_text, x_vector_only_mode=x_vector_only_mode)

File "torch\utils\_contextlib.py", line 116, in decorate_context

return func(*args, **kwargs)

File "C:\win-pyvideotrans\win-pyvideotrans-v4.00-528\_internal\qwen_tts\inference\qwen3_tts_model.py", line 446, in create_voice_clone_prompt

spk_emb = self.model.extract_speaker_embedding(audio=wav_resample,

File "torch\utils\_contextlib.py", line 116, in decorate_context
......
l(*args, **kwargs)
File "C:\win-pyvideotrans\win-pyvideotrans-v4.00-528\_internal\qwen_tts\core\models\modeling_qwen3_tts.py", line 379, in forward

hidden_states = layer(hidden_states)

File "torch\nn\modules\module.py", line 1751, in _wrapped_call_impl

return self._call_impl(*args, **kwargs)

File "torch\nn\modules\module.py", line 1762, in _call_impl

return forward_call(*args, **kwargs)

File "C:\win-pyvideotrans\win-pyvideotrans-v4.00-528\_internal\qwen_tts\core\models\modeling_qwen3_tts.py", line 304, in forward

hidden_state = self.res2net_block(hidden_state)

File "torch\nn\modules\module.py", line 1751, in _wrapped_call_impl

return self._call_impl(*args, **kwargs)

File "torch\nn\modules\module.py", line 1762, in _call_impl

return forward_call(*args, **kwargs)

File "C:\win-pyvideotrans\win-pyvideotrans-v4.00-528\_internal\qwen_tts\core\models\modeling_qwen3_tts.py", line 121, in forward

output_part = self.blocks[i - 1](hidden_part)

File "torch\nn\modules\module.py", line 1751, in _wrapped_call_impl

return self._call_impl(*args, **kwargs)

File "torch\nn\modules\module.py", line 1762, in _call_impl

return forward_call(*args, **kwargs)

File "C:\win-pyvideotrans\win-pyvideotrans-v4.00-528\_internal\qwen_tts\core\models\modeling_qwen3_tts.py", line 267, in forward

return self.activation(self.conv(hidden_states))

File "torch\nn\modules\module.py", line 1751, in _wrapped_call_impl

return self._call_impl(*args, **kwargs)

File "torch\nn\modules\module.py", line 1762, in _call_impl

return forward_call(*args, **kwargs)

File "torch\nn\modules\conv.py", line 375, in forward

return self._conv_forward(input, self.weight, self.bias)

File "torch\nn\modules\conv.py", line 360, in _conv_forward

F.pad(

File "torch\nn\functional.py", line 5209, in pad

return torch._C._nn.pad(input, pad, mode, value)

RuntimeError: Argument #4: Padding size should be less than the corresponding input dimension, but got: padding (3, 3) at dimension 2 of input [1, 64, 3]
TaskCfgVTT(uuid='079bb2b37a', name='C:/Users/juanc/Videos/aTubeCatcher/Ojas_5-mp4/Ojas_5.en.mp4', dirname='C:/Users/juanc/Videos/aTubeCatcher/Ojas_5-mp4', noextname='Ojas_5.en', basename='Ojas_5.en.mp4', ext='mp4', target_dir='C:/Temp/Ojas_5.en-mp4', cache_folder='C:/win-pyvideotrans/win-pyvideotrans-v4.00-528/tmp/22472/079bb2b37a', is_cuda=False, source_language='English', source_language_code='en', source_sub='C:/Temp/Ojas_5.en-mp4/en.srt', source_wav='C:/win-pyvideotrans/win-pyvideotrans-v4.00-528/tmp/22472/079bb2b37a/en.wav', source_wav_output='C:/Temp/Ojas_5.en-mp4/en.m4a', target_language='Spanish', target_language_code='es', target_sub='C:/Temp/Ojas_5.en-mp4/es.srt', target_wav='C:/win-pyvideotrans/win-pyvideotrans-v4.00-528/tmp/22472/079bb2b37a/target.wav', target_wav_output='C:/Temp/Ojas_5.en-mp4/es.m4a', translate_type=0, tts_type=1, volume='+0%', pitch='+0Hz', voice_rate='+0%', voice_role='clone', voice_autorate=True, video_autorate=False, remove_silent_mid=False, align_sub_audio=True, detect_language='en', recogn_type=1, model_name='small.en', shibie_audio=None, remove_noise=False, enable_diariz=False, nums_diariz=0, rephrase=0, fix_punc=False, subtitle_language=None, app_mode='biaozhun', subtitles='', targetdir_mp4='C:/Temp/Ojas_5.en-mp4/Ojas_5.en.mp4', novoice_mp4='C:/win-pyvideotrans/win-pyvideotrans-v4.00-528/tmp/22472/079bb2b37a/novoice.mp4', is_separate=True, embed_bgm=True, instrument='C:/win-pyvideotrans/win-pyvideotrans-v4.00-528/tmp/22472/079bb2b37a/instrument.wav', vocal='C:/win-pyvideotrans/win-pyvideotrans-v4.00-528/tmp/22472/079bb2b37a/vocal.wav', clear_cache=True, background_music='', subtitle_type=0, only_out_mp4=False, recogn2pass=False, output_srt=0, copysrt_rawvideo=False, loop_backaudio=1, backaudio_volume=0.8)


=
system:Windows-10-10.0.26200-SP0
version:v4.00
frozen:True
language:en
root_dir:C:/win-pyvideotrans/win-pyvideotrans-v4.00-528

1 Reply

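This error occurs because the reference audio segment used for Qwen3-TTS voice cloning is too short. After feature extraction it yields only 3 mel-spectrogram frames (input shape [1, 64, 3]), while a convolutional layer in the speaker encoder pads 3 frames on each side; the padding size must be smaller than the input length on that dimension, so the reference clip is far below the minimum length the encoder requires.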

Solution

  1. Open Menu → Tools/Options → Advanced Options → Speech Recognition Parameters.
  2. Set "Shortest voice duration (ms)" to 3000.
  3. Set "Maximum voice duration (seconds)" to 10.
  4. Enable both "Pre-segment audio for whisper?" and "Merge short subtitles into adjacent ones".
  5. Re-run the task.

Alternative

  • If the subtitles are still very short (<3s), consider switching to a TTS channel that handles short references better, such as OmniVoice-TTS.
Post Your Reply
Maintaining a free, open-source project is not easy. If this project is helpful to you, please consider making a small donation to help keep it maintained and updated.

Related resource