U
    %d;                     @   s   d dl Z d dlmZmZmZ d dlZd dlm  mZ	 d dlmZm
Z
 dddddgZG d	d dejZG d
d dejZG dd dejZG dd dejZG dd dejZdS )    N)ListOptionalTuple)nnTensorResBlock	MelResNet	Stretch2dUpsampleNetworkWaveRNNc                       s:   e Zd ZdZd
edd fddZeeddd	Z  ZS )r   al  ResNet block based on *Efficient Neural Audio Synthesis* [:footcite:`kalchbrenner2018efficient`].

    Args:
        n_freq: the number of bins in a spectrogram. (Default: ``128``)

    Examples
        >>> resblock = ResBlock()
        >>> input = torch.rand(10, 128, 512)  # a random spectrogram
        >>> output = resblock(input)  # shape: (10, 128, 512)
       N)n_freqreturnc                    sR   t    ttj||dddt|tjddtj||dddt|| _d S )N   Fin_channelsout_channelskernel_sizebiasTZinplace)super__init__r   
SequentialConv1dBatchNorm1dReLUresblock_model)selfr   	__class__ =/tmp/pip-unpacked-wheel-lbdmvq91/torchaudio/models/wavernn.pyr      s    

zResBlock.__init__specgramr   c                 C   s   |  || S )zPass the input through the ResBlock layer.
        Args:
            specgram (Tensor): the input sequence to the ResBlock layer (n_batch, n_freq, n_time).

        Return:
            Tensor shape: (n_batch, n_freq, n_time)
        )r   r   r#   r    r    r!   forward(   s    	zResBlock.forward)r   	__name__
__module____qualname____doc__intr   r   r%   __classcell__r    r    r   r!   r      s   c                       sB   e Zd ZdZdeeeeedd fddZeed	d
dZ  ZS )r   a  MelResNet layer uses a stack of ResBlocks on spectrogram.

    Args:
        n_res_block: the number of ResBlock in stack. (Default: ``10``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)
        kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``)

    Examples
        >>> melresnet = MelResNet()
        >>> input = torch.rand(10, 128, 512)  # a random spectrogram
        >>> output = melresnet(input)  # shape: (10, 128, 508)
    
   r      N)n_res_blockr   n_hiddenn_outputr   r   c                    sf   t     fddt|D }tjtj| |ddt tjddf|tj |ddf | _d S )	Nc                    s   g | ]}t  qS r    )r   ).0_r0   r    r!   
<listcomp>I   s     z&MelResNet.__init__.<locals>.<listcomp>Fr   Tr   r   )r   r   r   )	r   r   ranger   r   r   r   r   melresnet_model)r   r/   r   r0   r1   r   Z	ResBlocksr   r4   r!   r   D   s    

zMelResNet.__init__r"   c                 C   s
   |  |S )zPass the input through the MelResNet layer.
        Args:
            specgram (Tensor): the input sequence to the MelResNet layer (n_batch, n_freq, n_time).

        Return:
            Tensor shape: (n_batch, n_output, n_time - kernel_size + 1)
        )r7   r$   r    r    r!   r%   S   s    	zMelResNet.forward)r-   r   r   r   r.   r&   r    r    r   r!   r   4   s                c                       s:   e Zd ZdZeedd fddZeedddZ  ZS )	r	   a  Upscale the frequency and time dimensions of a spectrogram.

    Args:
        time_scale: the scale factor in time dimension
        freq_scale: the scale factor in frequency dimension

    Examples
        >>> stretch2d = Stretch2d(time_scale=10, freq_scale=5)

        >>> input = torch.rand(10, 100, 512)  # a random spectrogram
        >>> output = stretch2d(input)  # shape: (10, 500, 5120)
    N)
time_scale
freq_scaler   c                    s   t    || _|| _d S N)r   r   r9   r8   )r   r8   r9   r   r    r!   r   m   s    
zStretch2d.__init__r"   c                 C   s   | | jd | jdS )zPass the input through the Stretch2d layer.

        Args:
            specgram (Tensor): the input sequence to the Stretch2d layer (..., n_freq, n_time).

        Return:
            Tensor shape: (..., n_freq * freq_scale, n_time * time_scale)
        )Zrepeat_interleaver9   r8   r$   r    r    r!   r%   s   s    
zStretch2d.forwardr&   r    r    r   r!   r	   _   s   c                	       sP   e Zd ZdZdee eeeeedd fddZeeeef d	d
dZ	  Z
S )r
   a  Upscale the dimensions of a spectrogram.

    Args:
        upsample_scales: the list of upsample scales.
        n_res_block: the number of ResBlock in stack. (Default: ``10``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)
        kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``)

    Examples
        >>> upsamplenetwork = UpsampleNetwork(upsample_scales=[4, 4, 16])
        >>> input = torch.rand(10, 128, 10)  # a random spectrogram
        >>> output = upsamplenetwork(input)  # shape: (10, 128, 1536), (10, 128, 1536)
    r-   r   r.   N)upsample_scalesr/   r   r0   r1   r   r   c                    s   t    d}|D ]}||9 }q|| _|d d | | _t|||||| _t|d| _g }	|D ]d}
t|
d}tj	ddd|
d d fd|
fdd}t
jj|jd|
d d   |	| |	| q^tj|	 | _d S )Nr      r   F)r   r   r   paddingr         ?)r   r   total_scaleindentr   resnetr	   resnet_stretchr   ZConv2dtorchinitZ	constant_Zweightappendr   upsample_layers)r   r=   r/   r   r0   r1   r   rA   upsample_scaleZ	up_layersZscaleZstretchconvr   r    r!   r      s,    	


    
zUpsampleNetwork.__init__r"   c                 C   sf   |  |d}| |}|d}|d}| |}|ddddd| j| j f }||fS )a  Pass the input through the UpsampleNetwork layer.

        Args:
            specgram (Tensor): the input sequence to the UpsampleNetwork layer (n_batch, n_freq, n_time)

        Return:
            Tensor shape: (n_batch, n_freq, (n_time - kernel_size + 1) * total_scale),
                          (n_batch, n_output, (n_time - kernel_size + 1) * total_scale)
        where total_scale is the product of all elements in upsample_scales.
        r   N)rC   	unsqueezerD   squeezerH   rB   )r   r#   Zresnet_outputZupsampling_outputr    r    r!   r%      s    



&zUpsampleNetwork.forward)r-   r   r   r   r.   )r'   r(   r)   r*   r   r+   r   r   r   r%   r,   r    r    r   r!   r
      s         c                       s~   e Zd ZdZdee eeeeeeeeedd fdd	Zeeed
ddZe	j
jdeee eeee f dddZ  ZS )r   a<  WaveRNN model based on the implementation from `fatchord <https://github.com/fatchord/WaveRNN>`_.

    The original implementation was introduced in *Efficient Neural Audio Synthesis*
    [:footcite:`kalchbrenner2018efficient`]. The input channels of waveform and spectrogram have to be 1.
    The product of `upsample_scales` must equal `hop_length`.

    Args:
        upsample_scales: the list of upsample scales.
        n_classes: the number of output classes.
        hop_length: the number of samples between the starts of consecutive frames.
        n_res_block: the number of ResBlock in stack. (Default: ``10``)
        n_rnn: the dimension of RNN layer. (Default: ``512``)
        n_fc: the dimension of fully connected layer. (Default: ``512``)
        kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)

    Example
        >>> wavernn = WaveRNN(upsample_scales=[5,5,8], n_classes=512, hop_length=200)
        >>> waveform, sample_rate = torchaudio.load(file)
        >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length)
        >>> specgram = MelSpectrogram(sample_rate)(waveform)  # shape: (n_batch, n_channel, n_freq, n_time)
        >>> output = wavernn(waveform, specgram)
        >>> # output shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length, n_classes)
    r-      r.   r   N)r=   	n_classes
hop_lengthr/   n_rnnn_fcr   r   r0   r1   r   c                    s:  t    || _|d r |d n|d | _|| _|
d | _|| _|| _tt	
| j| _d}|D ]}||9 }q`|| jkrtd| d| t||||	|
|| _t|| j d || _tj||dd| _tj|| j |dd| _tjdd| _tjdd| _t|| j || _t|| j || _t|| j| _d S )	Nr>   r      z/Expected: total_scale == hop_length, but found z != T)Zbatch_firstr   )r   r   r   _padrP   n_auxrO   rN   r+   mathlog2n_bits
ValueErrorr
   upsampler   ZLinearfcZGRUrnn1rnn2r   relu1relu2fc1fc2fc3)r   r=   rN   rO   r/   rP   rQ   r   r   r0   r1   rA   rI   r   r    r!   r      s,    



zWaveRNN.__init__)waveformr#   r   c                    s  | ddkstd| ddks,td|d|d }}| d}tjd| j|j|jd}tjd| j|j|jd} |\}}|	dd}|	dd} fddt
d	D }|d
d
d
d
|d |d f }|d
d
d
d
|d |d f }	|d
d
d
d
|d |d f }
|d
d
d
d
|d |d f }tj|d||gdd} |}|} ||\}}|| }|}tj||	gdd} ||\}}|| }tj||
gdd} |} |}tj||gdd} |} |} |}|dS )a  Pass the input through the WaveRNN model.

        Args:
            waveform: the input waveform to the WaveRNN layer (n_batch, 1, (n_time - kernel_size + 1) * hop_length)
            specgram: the input spectrogram to the WaveRNN layer (n_batch, 1, n_freq, n_time)

        Return:
            Tensor: shape (n_batch, 1, (n_time - kernel_size + 1) * hop_length, n_classes)
        r   z*Require the input channel of waveform is 1z*Require the input channel of specgram is 1r   )dtypedevicer>   c                    s   g | ]} j | qS r    rT   r2   ir   r    r!   r5   '  s     z#WaveRNN.forward.<locals>.<listcomp>r.   N   rR   r<   Zdim)sizeAssertionErrorrL   rE   zerosrP   rc   rd   rY   Z	transposer6   catrK   rZ   r[   r\   r_   r]   r`   r^   ra   )r   rb   r#   Z
batch_sizeh1h2auxZaux_idxZa1Za2a3Za4xresr3   r    rh   r!   r%     s>    
""""





zWaveRNN.forward)r#   lengthsr   c                    s  |j }|j}tjj|jjf}|\} |dk	rF|jj }g }|	 \}}}tj
d|jf||d}	tj
d|jf||d}
tj
|df||d} fddtdD }t|D ]8|ddddf }fdd|D \}}}}tj|||gdd}|}|d|	\}}	||	d	  }tj||gdd}|d|
\}}
||
d	  }tj||gdd}t|}tj||gdd}t|}|}tj|dd}t|d }d
| d
j d  d }|| qt|dd
d	|fS )a  Inference method of WaveRNN.

        This function currently only supports multinomial sampling, which assumes the
        network is trained on cross entropy loss.

        Args:
            specgram (Tensor):
                Batch of spectrograms. Shape: `(n_batch, n_freq, n_time)`.
            lengths (Tensor or None, optional):
                Indicates the valid length of each audio in the batch.
                Shape: `(batch, )`.
                When the ``specgram`` contains spectrograms with different durations,
                by providing ``lengths`` argument, the model will compute
                the corresponding valid output lengths.
                If ``None``, it is assumed that all the audio in ``waveforms``
                have valid length. Default: ``None``.

        Returns:
            (Tensor, Optional[Tensor]):
            Tensor
                The inferred waveform of size `(n_batch, 1, n_time)`.
                1 stands for a single channel.
            Tensor or None
                If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
                is returned.
                It indicates the valid length in time axis of the output Tensor.
        Nr   )rd   rc   c                    s6   g | ].} d d j | j |d  d d f qS )Nr   re   rf   )rq   r   r    r!   r5   q  s     z!WaveRNN.infer.<locals>.<listcomp>rR   c                    s"   g | ]}|d d d d  f qS r:   r    )r2   a)rg   r    r!   r5   w  s     rj   r   r>   r@   )rd   rc   rE   r   
functionalpadrS   rY   rA   rk   rm   rP   r6   rn   rZ   r[   rK   r\   FZrelur_   r`   ra   ZsoftmaxZmultinomialfloatrW   rG   stackZpermute)r   r#   ru   rd   rc   outputZb_sizer3   Zseq_lenro   rp   rs   Z	aux_splitZm_tZa1_tZa2_tZa3_tZa4_tinpZlogitsZ	posteriorr    )rq   rg   r   r!   inferD  s@    

zWaveRNN.infer)r-   rM   rM   r.   r   r   r   )N)r'   r(   r)   r*   r   r+   r   r   r%   rE   Zjitexportr   r   r~   r,   r    r    r   r!   r      s0           *7)rU   typingr   r   r   rE   Ztorch.nn.functionalr   rw   ry   r   __all__Moduler   r   r	   r
   r   r    r    r    r!   <module>   s   	#+!G