Traceback (most recent call last):
  File "videotrans\process\tts_fun.py", line 122, in qwen3tts_fun
  File "torch\utils\_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "E:\pyvideotrans\win-pyvideotrans-v3.98-327\_internal\qwen_tts\inference\qwen3_tts_model.py", line 568, in generate_voice_clone
    prompt_items = self.create_voice_clone_prompt(ref_audio=ref_audio, ref_text=ref_text, x_vector_only_mode=x_vector_only_mode)
  File "torch\utils\_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "E:\pyvideotrans\win-pyvideotrans-v3.98-327\_internal\qwen_tts\inference\qwen3_tts_model.py", line 427, in create_voice_clone_prompt
    enc = self.model.speech_tokenizer.encode(ref_wavs_for_code, sr=ref_sr_for_code[0])
  File "E:\pyvideotrans\win-pyvideotrans-v3.98-327\_internal\qwen_tts\inference\qwen3_tts_tokenizer.py", line 252, in encode
    enc = self.model.encode(
  File "E:\pyvideotrans\win-pyvideotrans-v3.98-327\_internal\qwen_tts\core\tokenizer_12hz\modeling_qwen3_tts_tokenizer_v2.py", line 981, in encode
    encoded_frames = self.encoder.encode(input_values=input_values.unsqueeze(1),
  File "transformers\models\mimi\modeling_mimi.py", line 1577, in encode
    encoded_frames, encoder_past_key_values, padding_cache = self._encode_frame(
  File "transformers\models\mimi\modeling_mimi.py", line 1456, in _encode_frame
    embeddings = self.encoder(input_values, padding_cache=padding_cache)
  File "torch\nn\modules\module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "torch\nn\modules\module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "transformers\models\mimi\modeling_mimi.py", line 483, in forward
    hidden_states = layer(hidden_states, padding_cache=padding_cache)
  File "torch\nn\modules\module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "torch\nn\modules\module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "transformers\models\mimi\modeling_mimi.py", line 340, in forward
    hidden_states = self.conv(hidden_states)
  File "torch\nn\modules\module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "torch\nn\modules\module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "torch\nn\modules\conv.py", line 375, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File "torch\nn\modules\conv.py", line 370, in _conv_forward
    return F.conv1d(
RuntimeError: Calculated padded input size per channel: (6). Kernel size: (7). Kernel size can't be greater than actual input size
=
system:Windows-10-10.0.26200-SP0
version:v3.98
frozen:True
language:en
root_dir:E:/pyvideotrans/win-pyvideotrans-v3.98-327