Traceback (most recent call last):
File "videotrans\process\tts_fun.py", line 122, in qwen3tts_fun
File "torch\utils\_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "D:\Ruanjian-m\win-pyvideotrans-v3.97-0304\_internal\qwen_tts\inference\qwen3_tts_model.py", line 568, in generate_voice_clone
    prompt_items = self.create_voice_clone_prompt(ref_audio=ref_audio, ref_text=ref_text, x_vector_only_mode=x_vector_only_mode)
  File "torch\utils\_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "D:\Ruanjian-m\win-pyvideotrans-v3.97-0304\_internal\qwen_tts\inference\qwen3_tts_model.py", line 446, in create_voice_clone_prompt
    spk_emb = self.model.extract_speaker_embedding(audio=wav_resample,
  File "torch\utils\_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "D:\Ruanjian-m\win-pyvideotrans-v3.97-0304\_internal\qwen_tts\core\models\modeling_qwen3_tts.py", line 1953, in extract_speaker_embedding
    speaker_embedding = self.speaker_encoder(mels.to(self.device).to(self.dtype))[0]
  File "torch\nn\modules\module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "torch\nn\modules\module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "D:\Ruanjian-m\win-pyvideotrans-v3.97-0304\_internal\qwen_tts\core\models\modeling_qwen3_tts.py", line 379, in forward
    hidden_states = layer(hidden_states)
  File "torch\nn\modules\module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "torch\nn\modules\module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "D:\Ruanjian-m\win-pyvideotrans-v3.97-0304\_internal\qwen_tts\core\models\modeling_qwen3_tts.py", line 267, in forward
    return self.activation(self.conv(hidden_states))
  File "torch\nn\modules\module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "torch\nn\modules\module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "torch\nn\modules\conv.py", line 375, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File "torch\nn\modules\conv.py", line 360, in _conv_forward
    F.pad(
  File "torch\nn\functional.py", line 5209, in pad
    return torch._C._nn.pad(input, pad, mode, value)
RuntimeError: Argument #4: Padding size should be less than the corresponding input dimension, but got: padding (2, 2) at dimension 2 of input [1, 128, 1]
=
system:Windows-10-10.0.26200-SP0
version:v3.97
frozen:True
language:zh
root_dir:D:/Ruanjian-m/win-pyvideotrans-v3.97-0304