import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
from torch import Tensor
from torchaudio._internal import load_state_dict_from_url
from torchaudio.functional import mu_law_decoding
from torchaudio.models import Tacotron2, WaveRNN
from torchaudio.transforms import GriffinLim, InverseMelScale

from . import utils
from .interface import Tacotron2TTSBundle

__all__ = []

_BASE_URL = "https://download.pytorch.org/torchaudio/models"


class _EnglishCharProcessor(Tacotron2TTSBundle.TextProcessor):
    def __init__(self):
        super().__init__()
        self._tokens = utils._get_chars()
        self._mapping = {s: i for i, s in enumerate(self._tokens)}

    @property
    def tokens(self):
        return self._tokens

    def __call__(self, texts: Union[str, List[str]]) -> Tuple[Tensor, Tensor]:
        if isinstance(texts, str):
            texts = [texts]
        # Lower-case the input and drop characters outside the token set.
        indices = [[self._mapping[c] for c in t.lower() if c in self._mapping] for t in texts]
        return utils._to_tensor(indices)


class _EnglishPhoneProcessor(Tacotron2TTSBundle.TextProcessor):
    def __init__(self, *, dl_kwargs=None):
        super().__init__()
        self._tokens = utils._get_phones()
        self._mapping = {p: i for i, p in enumerate(self._tokens)}
        self._phonemizer = utils._load_phonemizer("en_us_cmudict_forward.pt", dl_kwargs=dl_kwargs)
        self._pattern = r"(\[[A-Z]+?\]|[_!'(),.:;? -])"

    @property
    def tokens(self):
        return self._tokens

    def __call__(self, texts: Union[str, List[str]]) -> Tuple[Tensor, Tensor]:
        if isinstance(texts, str):
            texts = [texts]

        indices = []
        for phones in self._phonemizer(texts, lang="en_us"):
            # "[F][UW][B][AA][R]!" -> ["F", "UW", "B", "AA", "R", "!"]
            ret = [re.sub(r"[\[\]]", "", r) for r in re.findall(self._pattern, phones)]
            indices.append([self._mapping[p] for p in ret])
        return utils._to_tensor(indices)


class _WaveRNNVocoder(torch.nn.Module, Tacotron2TTSBundle.Vocoder):
    def __init__(self, model: WaveRNN, min_level_db: Optional[float] = -100):
        super().__init__()
        self._sample_rate = 22050
        self._model = model
        self._min_level_db = min_level_db

    @property
    def sample_rate(self):
        return self._sample_rate

    def forward(self, mel_spec, lengths=None):
        # Tacotron2 produces log-magnitude mel spectrograms; convert to the
        # normalized decibel scale that the WaveRNN model was trained on.
        mel_spec = torch.exp(mel_spec)
        mel_spec = 20 * torch.log10(torch.clamp(mel_spec, min=1e-5))
        if self._min_level_db is not None:
            mel_spec = (self._min_level_db - mel_spec) / self._min_level_db
            mel_spec = torch.clamp(mel_spec, min=0, max=1)
        waveform, lengths = self._model.infer(mel_spec, lengths)
        waveform = utils._unnormalize_waveform(waveform, self._model.n_bits)
        waveform = mu_law_decoding(waveform, self._model.n_classes)
        waveform = waveform.squeeze(1)
        return waveform, lengths


class _GriffinLimVocoder(torch.nn.Module, Tacotron2TTSBundle.Vocoder):
    def __init__(self):
        super().__init__()
        self._sample_rate = 22050
        self._inv_mel = InverseMelScale(
            n_stft=(1024 // 2 + 1),
            n_mels=80,
            sample_rate=self.sample_rate,
            f_min=0.0,
            f_max=8000.0,
            mel_scale="slaney",
            norm="slaney",
        )
        self._griffin_lim = GriffinLim(
            n_fft=1024,
            power=1,
            hop_length=256,
            win_length=1024,
        )

    @property
    def sample_rate(self):
        return self._sample_rate

    def forward(self, mel_spec, lengths=None):
        mel_spec = torch.exp(mel_spec)
        # InverseMelScale estimates the linear spectrogram via optimization,
        # which requires gradients; they are detached again afterwards.
        mel_spec = mel_spec.clone().detach().requires_grad_(True)
        spec = self._inv_mel(mel_spec)
        spec = spec.detach().requires_grad_(False)
        waveforms = self._griffin_lim(spec)
        return waveforms, lengths


class _CharMixin:
    def get_text_processor(self) -> Tacotron2TTSBundle.TextProcessor:
        return _EnglishCharProcessor()


class _PhoneMixin:
    def get_text_processor(self, *, dl_kwargs=None) -> Tacotron2TTSBundle.TextProcessor:
        return _EnglishPhoneProcessor(dl_kwargs=dl_kwargs)


@dataclass
class _Tacotron2Mixin:
    _tacotron2_path: str
    _tacotron2_params: Dict[str, Any]

    def get_tacotron2(self, *, dl_kwargs=None) -> Tacotron2:
        model = Tacotron2(**self._tacotron2_params)
        url = f"{_BASE_URL}/{self._tacotron2_path}"
        dl_kwargs = {} if dl_kwargs is None else dl_kwargs
        state_dict = load_state_dict_from_url(url, **dl_kwargs)
        model.load_state_dict(state_dict)
        model.eval()
        return model


@dataclass
class _WaveRNNMixin:
    _wavernn_path: Optional[str]
    _wavernn_params: Optional[Dict[str, Any]]

    def get_vocoder(self, *, dl_kwargs=None):
        wavernn = self._get_wavernn(dl_kwargs=dl_kwargs)
        return _WaveRNNVocoder(wavernn)

    def _get_wavernn(self, *, dl_kwargs=None):
        model = WaveRNN(**self._wavernn_params)
        url = f"{_BASE_URL}/{self._wavernn_path}"
        dl_kwargs = {} if dl_kwargs is None else dl_kwargs
        state_dict = load_state_dict_from_url(url, **dl_kwargs)
        model.load_state_dict(state_dict)
        model.eval()
        return model


class _GriffinLimMixin:
    def get_vocoder(self, **_):
        return _GriffinLimVocoder()


@dataclass
class _Tacotron2WaveRNNCharBundle(_WaveRNNMixin, _Tacotron2Mixin, _CharMixin, Tacotron2TTSBundle):
    pass


@dataclass
class _Tacotron2WaveRNNPhoneBundle(_WaveRNNMixin, _Tacotron2Mixin, _PhoneMixin, Tacotron2TTSBundle):
    pass


@dataclass
class _Tacotron2GriffinLimCharBundle(_GriffinLimMixin, _Tacotron2Mixin, _CharMixin, Tacotron2TTSBundle):
    pass


@dataclass
class _Tacotron2GriffinLimPhoneBundle(_GriffinLimMixin, _Tacotron2Mixin, _PhoneMixin, Tacotron2TTSBundle):
    pass


TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH = _Tacotron2GriffinLimCharBundle(
    _tacotron2_path="tacotron2_english_characters_1500_epochs_ljspeech.pth",
    _tacotron2_params=utils._get_taco_params(n_symbols=38),
)
TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.__doc__ = """Character-based TTS pipeline with :py:class:`torchaudio.models.Tacotron2` and
:py:class:`torchaudio.transforms.GriffinLim`.

The text processor encodes the input texts character-by-character.

Tacotron2 was trained on *LJSpeech* [:footcite:`ljspeech17`] for 1,500 epochs.
You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The default parameters were used.

The vocoder is based on :py:class:`torchaudio.transforms.GriffinLim`.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

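A minimal usage sketch (the input text and output path here are only
illustrative)::

    >>> import torch
    >>> import torchaudio
    >>> bundle = torchaudio.pipelines.TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH
    >>> processor = bundle.get_text_processor()
    >>> tacotron2 = bundle.get_tacotron2()
    >>> vocoder = bundle.get_vocoder()
    >>> with torch.inference_mode():
    ...     tokens, lengths = processor("Hello world!")
    ...     spec, spec_lengths, _ = tacotron2.infer(tokens, lengths)
    ...     waveforms, _ = vocoder(spec, spec_lengths)
    >>> torchaudio.save("output.wav", waveforms, vocoder.sample_rate)
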
Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
"""

TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH = _Tacotron2GriffinLimPhoneBundle(
    _tacotron2_path="tacotron2_english_phonemes_1500_epochs_ljspeech.pth",
    _tacotron2_params=utils._get_taco_params(n_symbols=96),
)
TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.__doc__ = """Phoneme-based TTS pipeline with :py:class:`torchaudio.models.Tacotron2` and
:py:class:`torchaudio.transforms.GriffinLim`.

The text processor encodes the input texts based on phonemes.
It uses `DeepPhonemizer <https://github.com/as-ideas/DeepPhonemizer>`__ to convert
graphemes to phonemes.
The model (*en_us_cmudict_forward*) was trained on
`CMUDict <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>`__.

Tacotron2 was trained on *LJSpeech* [:footcite:`ljspeech17`] for 1,500 epochs.
You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The text processor was set to *"english_phonemes"*.

The vocoder is based on :py:class:`torchaudio.transforms.GriffinLim`.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

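Note that, unlike the character-based pipeline, instantiating the text
processor triggers a download of the DeepPhonemizer checkpoint. A short
sketch of this step (the input text is only illustrative)::

    >>> import torchaudio
    >>> bundle = torchaudio.pipelines.TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH
    >>> processor = bundle.get_text_processor()  # downloads en_us_cmudict_forward.pt
    >>> tokens, lengths = processor("Hello world!")
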
Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
"""

TACOTRON2_WAVERNN_CHAR_LJSPEECH = _Tacotron2WaveRNNCharBundle(
    _tacotron2_path="tacotron2_english_characters_1500_epochs_wavernn_ljspeech.pth",
    _tacotron2_params=utils._get_taco_params(n_symbols=38),
    _wavernn_path="wavernn_10k_epochs_8bits_ljspeech.pth",
    _wavernn_params=utils._get_wrnn_params(),
)
TACOTRON2_WAVERNN_CHAR_LJSPEECH.__doc__ = """Character-based TTS pipeline with :py:class:`torchaudio.models.Tacotron2` and
:py:class:`torchaudio.models.WaveRNN`.

The text processor encodes the input texts character-by-character.

Tacotron2 was trained on *LJSpeech* [:footcite:`ljspeech17`] for 1,500 epochs.
You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The following parameters were used: ``win_length=1100``, ``hop_length=275``, ``n_fft=2048``,
``mel_fmin=40``, and ``mel_fmax=11025``.

The vocoder is based on :py:class:`torchaudio.models.WaveRNN`.
It was trained on 8-bit depth waveforms of *LJSpeech* [:footcite:`ljspeech17`] for 10,000 epochs.
You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_wavernn>`__.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

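The call sequence mirrors the Griffin-Lim pipelines; a minimal sketch
(the input text is only illustrative)::

    >>> import torch
    >>> import torchaudio
    >>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
    >>> processor = bundle.get_text_processor()
    >>> tacotron2 = bundle.get_tacotron2()
    >>> vocoder = bundle.get_vocoder()  # WaveRNN-based, 22050 Hz output
    >>> with torch.inference_mode():
    ...     tokens, lengths = processor("Hello world!")
    ...     spec, spec_lengths, _ = tacotron2.infer(tokens, lengths)
    ...     waveforms, _ = vocoder(spec, spec_lengths)
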
Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
"""

TACOTRON2_WAVERNN_PHONE_LJSPEECH = _Tacotron2WaveRNNPhoneBundle(
    _tacotron2_path="tacotron2_english_phonemes_1500_epochs_wavernn_ljspeech.pth",
    _tacotron2_params=utils._get_taco_params(n_symbols=96),
    _wavernn_path="wavernn_10k_epochs_8bits_ljspeech.pth",
    _wavernn_params=utils._get_wrnn_params(),
)
TACOTRON2_WAVERNN_PHONE_LJSPEECH.__doc__ = """Phoneme-based TTS pipeline with :py:class:`torchaudio.models.Tacotron2` and
:py:class:`torchaudio.models.WaveRNN`.

The text processor encodes the input texts based on phonemes.
It uses `DeepPhonemizer <https://github.com/as-ideas/DeepPhonemizer>`__ to convert
graphemes to phonemes.
The model (*en_us_cmudict_forward*) was trained on
`CMUDict <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>`__.

Tacotron2 was trained on *LJSpeech* [:footcite:`ljspeech17`] for 1,500 epochs.
You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The following parameters were used: ``win_length=1100``, ``hop_length=275``, ``n_fft=2048``,
``mel_fmin=40``, and ``mel_fmax=11025``.

The vocoder is based on :py:class:`torchaudio.models.WaveRNN`.
It was trained on 8-bit depth waveforms of *LJSpeech* [:footcite:`ljspeech17`] for 10,000 epochs.
You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_wavernn>`__.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

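Usage follows the same pattern as the other bundles; the only difference
is that ``get_text_processor()`` downloads the DeepPhonemizer checkpoint::

    >>> import torchaudio
    >>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
    >>> processor = bundle.get_text_processor()  # downloads en_us_cmudict_forward.pt
    >>> tacotron2 = bundle.get_tacotron2()
    >>> vocoder = bundle.get_vocoder()
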
Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
"""