# torchaudio/models/conformer.py
from typing import Optional, Tuple

import torch

__all__ = ["Conformer"]


def _lengths_to_padding_mask(lengths: torch.Tensor) -> torch.Tensor:
    # Build a boolean mask of shape `(B, max(lengths))` in which True marks
    # padded (invalid) positions.
    batch_size = lengths.shape[0]
    max_length = int(torch.max(lengths).item())
    padding_mask = torch.arange(max_length, device=lengths.device, dtype=lengths.dtype).expand(
        batch_size, max_length
    ) >= lengths.unsqueeze(1)
    return padding_mask
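
# Added worked example (not part of the original module): for lengths [2, 4]
# the helper marks every position at or past each sequence's length as True.
#
#   >>> _lengths_to_padding_mask(torch.tensor([2, 4]))
#   tensor([[False, False,  True,  True],
#           [False, False, False, False]])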


class _ConvolutionModule(torch.nn.Module):
    r"""Conformer convolution module.

    Args:
        input_dim (int): input dimension.
        num_channels (int): number of depthwise convolution layer input channels.
        depthwise_kernel_size (int): kernel size of depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        bias (bool, optional): indicates whether to add bias term to each convolution layer. (Default: ``False``)
        use_group_norm (bool, optional): use GroupNorm rather than BatchNorm. (Default: ``False``)
    """

    def __init__(
        self,
        input_dim: int,
        num_channels: int,
        depthwise_kernel_size: int,
        dropout: float = 0.0,
        bias: bool = False,
        use_group_norm: bool = False,
    ) -> None:
        super().__init__()
        assert (depthwise_kernel_size - 1) % 2 == 0, "depthwise_kernel_size must be odd to achieve 'SAME' padding."
        self.layer_norm = torch.nn.LayerNorm(input_dim)
        self.sequential = torch.nn.Sequential(
            # Pointwise conv doubles the channels; GLU gates them back down.
            torch.nn.Conv1d(input_dim, 2 * num_channels, 1, stride=1, padding=0, bias=bias),
            torch.nn.GLU(dim=1),
            # Depthwise conv with 'SAME' padding, so the time dimension is preserved.
            torch.nn.Conv1d(
                num_channels,
                num_channels,
                depthwise_kernel_size,
                stride=1,
                padding=(depthwise_kernel_size - 1) // 2,
                groups=num_channels,
                bias=bias,
            ),
            torch.nn.GroupNorm(num_groups=1, num_channels=num_channels)
            if use_group_norm
            else torch.nn.BatchNorm1d(num_channels),
            torch.nn.SiLU(),
            torch.nn.Conv1d(num_channels, input_dim, kernel_size=1, stride=1, padding=0, bias=bias),
            torch.nn.Dropout(dropout),
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        r"""
        Args:
            input (torch.Tensor): with shape `(B, T, D)`.

        Returns:
            torch.Tensor: output, with shape `(B, T, D)`.
        """
        x = self.layer_norm(input)
        x = x.transpose(1, 2)  # (B, T, D) -> (B, D, T) for Conv1d
        x = self.sequential(x)
        return x.transpose(1, 2)
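
# Added shape sketch (not original code): the pointwise-GLU-depthwise-pointwise
# stack preserves both the time and feature dimensions.
#
#   >>> conv = _ConvolutionModule(input_dim=80, num_channels=80, depthwise_kernel_size=31)
#   >>> conv(torch.rand(4, 100, 80)).shape
#   torch.Size([4, 100, 80])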
eeedd fddZejejddd	Z	  Z
S )_FeedForwardModulezPositionwise feed forward layer.

    Args:
        input_dim (int): input dimension.
        hidden_dim (int): hidden dimension.
        dropout (float, optional): dropout probability. (Default: 0.0)
    r   N)r   
hidden_dimr   r   c                    s`   t    tjtj|tjj||ddtj tj|tjj||ddtj|| _	d S )NT)r   )
r    r!   r   r#   r&   r$   ZLinearr'   r(   r)   )r*   r   r;   r   r+   r   r   r!   c   s    



z_FeedForwardModule.__init__r-   c                 C   s
   |  |S )z
        Args:
            input (torch.Tensor): with shape `(*, D)`.

        Returns:
            torch.Tensor: output, with shape `(*, D)`.
        )r)   )r*   r.   r   r   r   r1   n   s    z_FeedForwardModule.forward)r   )r2   r3   r4   r5   r   r6   r!   r   r8   r1   r9   r   r   r+   r   r:   Z   s   r:   c                
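
# Added note (not original code): the module maps `(*, D)` to `(*, D)`, and
# ConformerLayer applies it twice with half-step (0.5) residuals, Macaron style.
#
#   >>> ffn = _FeedForwardModule(input_dim=80, hidden_dim=128)
#   >>> ffn(torch.rand(10, 4, 80)).shape
#   torch.Size([10, 4, 80])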


class ConformerLayer(torch.nn.Module):
    r"""Conformer layer that constitutes Conformer.

    Args:
        input_dim (int): input dimension.
        ffn_dim (int): hidden layer dimension of feedforward network.
        num_attention_heads (int): number of attention heads.
        depthwise_conv_kernel_size (int): kernel size of depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d``
            in the convolution module. (Default: ``False``)
        convolution_first (bool, optional): apply the convolution module ahead of
            the attention module. (Default: ``False``)
    """

    def __init__(
        self,
        input_dim: int,
        ffn_dim: int,
        num_attention_heads: int,
        depthwise_conv_kernel_size: int,
        dropout: float = 0.0,
        use_group_norm: bool = False,
        convolution_first: bool = False,
    ) -> None:
        super().__init__()
        self.ffn1 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout)
        self.self_attn_layer_norm = torch.nn.LayerNorm(input_dim)
        self.self_attn = torch.nn.MultiheadAttention(input_dim, num_attention_heads, dropout=dropout)
        self.self_attn_dropout = torch.nn.Dropout(dropout)
        self.conv_module = _ConvolutionModule(
            input_dim=input_dim,
            num_channels=input_dim,
            depthwise_kernel_size=depthwise_conv_kernel_size,
            dropout=dropout,
            bias=True,
            use_group_norm=use_group_norm,
        )
        self.ffn2 = _FeedForwardModule(input_dim, ffn_dim, dropout=dropout)
        self.final_layer_norm = torch.nn.LayerNorm(input_dim)
        self.convolution_first = convolution_first

    def _apply_convolution(self, input: torch.Tensor) -> torch.Tensor:
        residual = input
        x = input.transpose(0, 1)  # (T, B, D) -> (B, T, D) for the convolution module
        x = self.conv_module(x)
        x = x.transpose(0, 1)
        return residual + x

    def forward(self, input: torch.Tensor, key_padding_mask: Optional[torch.Tensor]) -> torch.Tensor:
        r"""
        Args:
            input (torch.Tensor): input, with shape `(T, B, D)`.
            key_padding_mask (torch.Tensor or None): key padding mask to use in self attention layer.

        Returns:
            torch.Tensor: output, with shape `(T, B, D)`.
        """
        residual = input
        x = self.ffn1(input)
        x = x * 0.5 + residual  # Macaron-style half-step residual

        if self.convolution_first:
            x = self._apply_convolution(x)

        residual = x
        x = self.self_attn_layer_norm(x)
        x, _ = self.self_attn(
            query=x,
            key=x,
            value=x,
            key_padding_mask=key_padding_mask,
            need_weights=False,
        )
        x = self.self_attn_dropout(x)
        x = x + residual

        if not self.convolution_first:
            x = self._apply_convolution(x)

        residual = x
        x = self.ffn2(x)
        x = x * 0.5 + residual

        return self.final_layer_norm(x)


class Conformer(torch.nn.Module):
    r"""Implements the Conformer architecture introduced in
    *Conformer: Convolution-augmented Transformer for Speech Recognition*
    [:footcite:`gulati2020conformer`].

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads in each Conformer layer.
        ffn_dim (int): hidden layer dimension of feedforward networks.
        num_layers (int): number of Conformer layers to instantiate.
        depthwise_conv_kernel_size (int): kernel size of each Conformer layer's depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d``
            in the convolution module. (Default: ``False``)
        convolution_first (bool, optional): apply the convolution module ahead of
            the attention module. (Default: ``False``)

    Examples:
        >>> conformer = Conformer(
        >>>     input_dim=80,
        >>>     num_heads=4,
        >>>     ffn_dim=128,
        >>>     num_layers=4,
        >>>     depthwise_conv_kernel_size=31,
        >>> )
        >>> lengths = torch.randint(1, 400, (10,))  # (batch,)
        >>> input = torch.rand(10, int(lengths.max()), input_dim)  # (batch, num_frames, input_dim)
        >>> output = conformer(input, lengths)
    """

    def __init__(
        self,
        input_dim: int,
        num_heads: int,
        ffn_dim: int,
        num_layers: int,
        depthwise_conv_kernel_size: int,
        dropout: float = 0.0,
        use_group_norm: bool = False,
        convolution_first: bool = False,
    ):
        super().__init__()
        self.conformer_layers = torch.nn.ModuleList(
            [
                ConformerLayer(
                    input_dim,
                    ffn_dim,
                    num_heads,
                    depthwise_conv_kernel_size,
                    dropout=dropout,
                    use_group_norm=use_group_norm,
                    convolution_first=convolution_first,
                )
                for _ in range(num_layers)
            ]
        )

    def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        r"""
        Args:
            input (torch.Tensor): with shape `(B, T, input_dim)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.

        Returns:
            (torch.Tensor, torch.Tensor)
                torch.Tensor
                    output frames, with shape `(B, T, input_dim)`
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid frames for i-th batch element in output frames.
        """
        encoder_padding_mask = _lengths_to_padding_mask(lengths)

        x = input.transpose(0, 1)  # (B, T, D) -> (T, B, D): layers run time-first
        for layer in self.conformer_layers:
            x = layer(x, encoder_padding_mask)
        return x.transpose(0, 1), lengths