import math
from typing import List, Optional, Tuple

import torch

__all__ = ["Emformer"]


def _lengths_to_padding_mask(lengths: torch.Tensor) -> torch.Tensor:
    batch_size = lengths.shape[0]
    max_length = int(torch.max(lengths).item())
    padding_mask = torch.arange(max_length, device=lengths.device, dtype=lengths.dtype).expand(
        batch_size, max_length
    ) >= lengths.unsqueeze(1)
    return padding_mask


def _gen_padding_mask(
    utterance: torch.Tensor,
    right_context: torch.Tensor,
    summary: torch.Tensor,
    lengths: torch.Tensor,
    mems: torch.Tensor,
    left_context_key: Optional[torch.Tensor] = None,
) -> Optional[torch.Tensor]:
    T = right_context.size(0) + utterance.size(0) + summary.size(0)
    B = right_context.size(1)
    if B == 1:
        padding_mask = None
    else:
        right_context_blocks_length = T - torch.max(lengths).int() - summary.size(0)
        left_context_blocks_length = left_context_key.size(0) if left_context_key is not None else 0
        klengths = lengths + mems.size(0) + right_context_blocks_length + left_context_blocks_length
        padding_mask = _lengths_to_padding_mask(lengths=klengths)
    return padding_mask


def _get_activation_module(activation: str) -> torch.nn.Module:
    if activation == "relu":
        return torch.nn.ReLU()
    elif activation == "gelu":
        return torch.nn.GELU()
    elif activation == "silu":
        return torch.nn.SiLU()
    else:
        raise ValueError(f"Unsupported activation {activation}")


def _get_weight_init_gains(weight_init_scale_strategy: Optional[str], num_layers: int) -> List[Optional[float]]:
    if weight_init_scale_strategy is None:
        return [None for _ in range(num_layers)]
    elif weight_init_scale_strategy == "depthwise":
        return [1.0 / math.sqrt(layer_idx + 1) for layer_idx in range(num_layers)]
    elif weight_init_scale_strategy == "constant":
        return [1.0 / math.sqrt(2) for layer_idx in range(num_layers)]
    else:
        raise ValueError(f"Unsupported weight_init_scale_strategy value {weight_init_scale_strategy}")


def _gen_attention_mask_block(
    col_widths: List[int], col_mask: List[bool], num_rows: int, device: torch.device
) -> torch.Tensor:
    assert len(col_widths) == len(col_mask), "Length of col_widths must match that of col_mask"
    mask_block = [
        torch.ones(num_rows, col_width, device=device)
        if is_ones_col
        else torch.zeros(num_rows, col_width, device=device)
        for col_width, is_ones_col in zip(col_widths, col_mask)
    ]
    return torch.cat(mask_block, dim=1)
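
# Illustrative behavior of the helpers above (comment-only examples added for
# clarity; not part of the original module):
# >>> _lengths_to_padding_mask(torch.tensor([3, 5]))
# tensor([[False, False, False,  True,  True],
#         [False, False, False, False, False]])
# >>> _get_weight_init_gains("depthwise", 3)
# [1.0, 0.7071067811865475, 0.5773502691896258]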


class _EmformerAttention(torch.nn.Module):
    r"""Emformer layer attention module.

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads in each Emformer layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        weight_init_gain (float or None, optional): scale factor to apply when initializing
            attention module parameters. (Default: ``None``)
        tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
        negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)
    """

    def __init__(
        self,
        input_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        weight_init_gain: Optional[float] = None,
        tanh_on_mem: bool = False,
        negative_inf: float = -1e8,
    ):
        super().__init__()

        if input_dim % num_heads != 0:
            raise ValueError(f"input_dim ({input_dim}) is not a multiple of num_heads ({num_heads}).")

        self.input_dim = input_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.tanh_on_mem = tanh_on_mem
        self.negative_inf = negative_inf

        self.scaling = (self.input_dim // self.num_heads) ** -0.5

        self.emb_to_key_value = torch.nn.Linear(input_dim, 2 * input_dim, bias=True)
        self.emb_to_query = torch.nn.Linear(input_dim, input_dim, bias=True)
        self.out_proj = torch.nn.Linear(input_dim, input_dim, bias=True)

        if weight_init_gain:
            torch.nn.init.xavier_uniform_(self.emb_to_key_value.weight, gain=weight_init_gain)
            torch.nn.init.xavier_uniform_(self.emb_to_query.weight, gain=weight_init_gain)

    def _gen_key_value(self, input: torch.Tensor, mems: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        T, _, _ = input.shape
        summary_length = mems.size(0) + 1
        right_ctx_utterance_block = input[: T - summary_length]
        mems_right_ctx_utterance_block = torch.cat([mems, right_ctx_utterance_block])
        key, value = self.emb_to_key_value(mems_right_ctx_utterance_block).chunk(chunks=2, dim=2)
        return key, value

    def _gen_attention_probs(
        self,
        attention_weights: torch.Tensor,
        attention_mask: torch.Tensor,
        padding_mask: Optional[torch.Tensor],
    ) -> torch.Tensor:
        attention_weights_float = attention_weights.float()
        attention_weights_float = attention_weights_float.masked_fill(attention_mask.unsqueeze(0), self.negative_inf)
        T = attention_weights.size(1)
        B = attention_weights.size(0) // self.num_heads
        if padding_mask is not None:
            attention_weights_float = attention_weights_float.view(B, self.num_heads, T, -1)
            attention_weights_float = attention_weights_float.masked_fill(
                padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), self.negative_inf
            )
            attention_weights_float = attention_weights_float.view(B * self.num_heads, T, -1)
        attention_probs = torch.nn.functional.softmax(attention_weights_float, dim=-1).type_as(attention_weights)
        return torch.nn.functional.dropout(attention_probs, p=float(self.dropout), training=self.training)

    def _forward_impl(
        self,
        utterance: torch.Tensor,
        lengths: torch.Tensor,
        right_context: torch.Tensor,
        summary: torch.Tensor,
        mems: torch.Tensor,
        attention_mask: torch.Tensor,
        left_context_key: Optional[torch.Tensor] = None,
        left_context_val: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        B = utterance.size(1)
        T = right_context.size(0) + utterance.size(0) + summary.size(0)

        # Compute query with [right context, utterance, summary].
        query = self.emb_to_query(torch.cat([right_context, utterance, summary]))

        # Compute key and value with [mems, right context, utterance].
        key, value = self.emb_to_key_value(torch.cat([mems, right_context, utterance])).chunk(chunks=2, dim=2)

        # At inference time, splice the cached left-context key/value in after
        # the memory and right-context blocks.
        if left_context_key is not None and left_context_val is not None:
            right_context_blocks_length = T - torch.max(lengths).int() - summary.size(0)
            key = torch.cat(
                [
                    key[: mems.size(0) + right_context_blocks_length],
                    left_context_key,
                    key[mems.size(0) + right_context_blocks_length :],
                ]
            )
            value = torch.cat(
                [
                    value[: mems.size(0) + right_context_blocks_length],
                    left_context_val,
                    value[mems.size(0) + right_context_blocks_length :],
                ]
            )

        # Compute attention weights from query, key, and value.
        reshaped_query, reshaped_key, reshaped_value = [
            tensor.contiguous().view(-1, B * self.num_heads, self.input_dim // self.num_heads).transpose(0, 1)
            for tensor in [query, key, value]
        ]
        attention_weights = torch.bmm(reshaped_query * self.scaling, reshaped_key.transpose(1, 2))

        # Compute padding mask.
        padding_mask = _gen_padding_mask(utterance, right_context, summary, lengths, mems, left_context_key)

        # Compute attention probabilities.
        attention_probs = self._gen_attention_probs(attention_weights, attention_mask, padding_mask)

        # Compute attention.
        attention = torch.bmm(attention_probs, reshaped_value)
        assert attention.shape == (B * self.num_heads, T, self.input_dim // self.num_heads)
        attention = attention.transpose(0, 1).contiguous().view(T, B, self.input_dim)

        # Apply output projection.
        output_right_context_mems = self.out_proj(attention)

        summary_length = summary.size(0)
        output_right_context = output_right_context_mems[: T - summary_length]
        output_mems = output_right_context_mems[T - summary_length :]
        if self.tanh_on_mem:
            output_mems = torch.tanh(output_mems)
        else:
            output_mems = torch.clamp(output_mems, min=-10, max=10)

        return output_right_context, output_mems, key, value

    def forward(
        self,
        utterance: torch.Tensor,
        lengths: torch.Tensor,
        right_context: torch.Tensor,
        summary: torch.Tensor,
        mems: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        r"""Forward pass for training.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        S: number of summary elements;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            summary (torch.Tensor): summary elements, with shape `(S, B, D)`.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
            attention_mask (torch.Tensor): attention mask for underlying attention module.

        Returns:
            (Tensor, Tensor):
                Tensor
                    output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
        """
        output, output_mems, _, _ = self._forward_impl(utterance, lengths, right_context, summary, mems, attention_mask)
        return output, output_mems[:-1]

    @torch.jit.export
    def infer(
        self,
        utterance: torch.Tensor,
        lengths: torch.Tensor,
        right_context: torch.Tensor,
        summary: torch.Tensor,
        mems: torch.Tensor,
        left_context_key: torch.Tensor,
        left_context_val: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        r"""Forward pass for inference.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        S: number of summary elements;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            summary (torch.Tensor): summary elements, with shape `(S, B, D)`.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
            left_context_key (torch.Tensor): left context attention key computed from preceding invocation.
            left_context_val (torch.Tensor): left context attention value computed from preceding invocation.

        Returns:
            (Tensor, Tensor, Tensor, and Tensor):
                Tensor
                    output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
                Tensor
                    attention key computed for left context and utterance.
                Tensor
                    attention value computed for left context and utterance.
        """
        query_dim = right_context.size(0) + utterance.size(0) + summary.size(0)
        key_dim = right_context.size(0) + utterance.size(0) + mems.size(0) + left_context_key.size(0)
        attention_mask = torch.zeros(query_dim, key_dim).to(dtype=torch.bool, device=utterance.device)
        attention_mask[-1, : mems.size(0)] = True
        output, output_mems, key, value = self._forward_impl(
            utterance,
            lengths,
            right_context,
            summary,
            mems,
            attention_mask,
            left_context_key=left_context_key,
            left_context_val=left_context_val,
        )
        return (
            output,
            output_mems,
            key[mems.size(0) + right_context.size(0) :],
            value[mems.size(0) + right_context.size(0) :],
        )
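
# Illustrative shapes for _EmformerAttention.forward (comment-only example added
# for clarity; the sizes are made up). With D=512, 8 heads, B=2, T=10, R=2, S=3, M=2:
# >>> attention = _EmformerAttention(input_dim=512, num_heads=8)
# >>> output, output_mems = attention(
# ...     torch.rand(10, 2, 512),                 # utterance (T, B, D)
# ...     torch.tensor([10, 8]),                  # lengths (B,)
# ...     torch.rand(2, 2, 512),                  # right_context (R, B, D)
# ...     torch.rand(3, 2, 512),                  # summary (S, B, D)
# ...     torch.rand(2, 2, 512),                  # mems (M, B, D)
# ...     torch.zeros(15, 14, dtype=torch.bool),  # attention_mask (R+T+S, M+R+T)
# ... )
# output: (T + R, B, D) = (12, 2, 512); output_mems: (S - 1, B, D) = (2, 2, 512)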
  J%rD   c                       s  e Zd ZdZd'eeeeeeeeee eed fd	d
Z	eee
j ee
j dddZee
j ee
je
je
jf dddZe
je
jee
jee
j ee
j dddZe
je
je
je
jdddZe
je
jee
je
jf dddZe
je
je
jee
je
jf dddZe
je
je
je
jee
j ee
je
jf dddZe
je
je
je
jeee
j  ee
je
jee
j f dd d!Ze
je
je
je
je
jee
je
je
jf dd"d#Ze
jje
je
je
jeee
j  e
jee
je
jee
j e
jf d$d%d&Z  ZS )(_EmformerLayera$  Emformer layer that constitutes Emformer.

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads.
        ffn_dim (int): hidden layer dimension of feedforward network.
        segment_length (int): length of each input segment.
        dropout (float, optional): dropout probability. (Default: 0.0)
        activation (str, optional): activation function to use in feedforward network.
            Must be one of ("relu", "gelu", "silu"). (Default: "relu")
        left_context_length (int, optional): length of left context. (Default: 0)
        max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
        weight_init_gain (float or None, optional): scale factor to apply when initializing
            attention module parameters. (Default: ``None``)
        tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
        negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)
    """

    def __init__(
        self,
        input_dim: int,
        num_heads: int,
        ffn_dim: int,
        segment_length: int,
        dropout: float = 0.0,
        activation: str = "relu",
        left_context_length: int = 0,
        max_memory_size: int = 0,
        weight_init_gain: Optional[float] = None,
        tanh_on_mem: bool = False,
        negative_inf: float = -1e8,
    ):
        super().__init__()

        self.attention = _EmformerAttention(
            input_dim=input_dim,
            num_heads=num_heads,
            dropout=dropout,
            weight_init_gain=weight_init_gain,
            tanh_on_mem=tanh_on_mem,
            negative_inf=negative_inf,
        )
        self.dropout = torch.nn.Dropout(dropout)
        self.memory_op = torch.nn.AvgPool1d(kernel_size=segment_length, stride=segment_length, ceil_mode=True)

        activation_module = _get_activation_module(activation)
        self.pos_ff = torch.nn.Sequential(
            torch.nn.LayerNorm(input_dim),
            torch.nn.Linear(input_dim, ffn_dim),
            activation_module,
            torch.nn.Dropout(dropout),
            torch.nn.Linear(ffn_dim, input_dim),
            torch.nn.Dropout(dropout),
        )
        self.layer_norm_input = torch.nn.LayerNorm(input_dim)
        self.layer_norm_output = torch.nn.LayerNorm(input_dim)

        self.left_context_length = left_context_length
        self.segment_length = segment_length
        self.max_memory_size = max_memory_size
        self.input_dim = input_dim

        self.use_mem = max_memory_size > 0

    def _init_state(self, batch_size: int, device: Optional[torch.device]) -> List[torch.Tensor]:
        empty_memory = torch.zeros(self.max_memory_size, batch_size, self.input_dim, device=device)
        left_context_key = torch.zeros(self.left_context_length, batch_size, self.input_dim, device=device)
        left_context_val = torch.zeros(self.left_context_length, batch_size, self.input_dim, device=device)
        past_length = torch.zeros(1, batch_size, dtype=torch.int32, device=device)
        return [empty_memory, left_context_key, left_context_val, past_length]

    def _unpack_state(self, state: List[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        past_length = state[3][0][0].item()
        past_left_context_length = min(self.left_context_length, past_length)
        past_mem_length = min(self.max_memory_size, math.ceil(past_length / self.segment_length))
        pre_mems = state[0][self.max_memory_size - past_mem_length :]
        lc_key = state[1][self.left_context_length - past_left_context_length :]
        lc_val = state[2][self.left_context_length - past_left_context_length :]
        return pre_mems, lc_key, lc_val

    def _pack_state(
        self,
        next_k: torch.Tensor,
        next_v: torch.Tensor,
        update_length: int,
        mems: torch.Tensor,
        state: List[torch.Tensor],
    ) -> List[torch.Tensor]:
        new_k = torch.cat([state[1], next_k])
        new_v = torch.cat([state[2], next_v])
        state[0] = torch.cat([state[0], mems])[-self.max_memory_size :]
        state[1] = new_k[new_k.shape[0] - self.left_context_length :]
        state[2] = new_v[new_v.shape[0] - self.left_context_length :]
        state[3] = state[3] + update_length
        return state

    def _process_attention_output(
        self,
        rc_output: torch.Tensor,
        utterance: torch.Tensor,
        right_context: torch.Tensor,
    ) -> torch.Tensor:
        result = self.dropout(rc_output) + torch.cat([right_context, utterance])
        result = self.pos_ff(result) + result
        result = self.layer_norm_output(result)
        return result

    def _apply_pre_attention_layer_norm(
        self, utterance: torch.Tensor, right_context: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        layer_norm_input = self.layer_norm_input(torch.cat([right_context, utterance]))
        return (
            layer_norm_input[right_context.size(0) :],
            layer_norm_input[: right_context.size(0)],
        )

    def _apply_post_attention_ffn(
        self, rc_output: torch.Tensor, utterance: torch.Tensor, right_context: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        rc_output = self._process_attention_output(rc_output, utterance, right_context)
        return rc_output[right_context.size(0) :], rc_output[: right_context.size(0)]

    def _apply_attention_forward(
        self,
        utterance: torch.Tensor,
        lengths: torch.Tensor,
        right_context: torch.Tensor,
        mems: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        if attention_mask is None:
            raise ValueError("attention_mask must be not None when for_inference is False")

        if self.use_mem:
            summary = self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)
        else:
            summary = torch.empty(0).to(dtype=utterance.dtype, device=utterance.device)
        rc_output, next_m = self.attention(
            utterance=utterance,
            lengths=lengths,
            right_context=right_context,
            summary=summary,
            mems=mems,
            attention_mask=attention_mask,
        )
        return rc_output, next_m

    def _apply_attention_infer(
        self,
        utterance: torch.Tensor,
        lengths: torch.Tensor,
        right_context: torch.Tensor,
        mems: torch.Tensor,
        state: Optional[List[torch.Tensor]],
    ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]:
        if state is None:
            state = self._init_state(utterance.size(1), device=utterance.device)
        pre_mems, lc_key, lc_val = self._unpack_state(state)
        if self.use_mem:
            summary = self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)
            summary = summary[:1]
        else:
            summary = torch.empty(0).to(dtype=utterance.dtype, device=utterance.device)
        rc_output, next_m, next_k, next_v = self.attention.infer(
            utterance=utterance,
            lengths=lengths,
            right_context=right_context,
            summary=summary,
            mems=pre_mems,
            left_context_key=lc_key,
            left_context_val=lc_val,
        )
        state = self._pack_state(next_k, next_v, utterance.size(0), mems, state)
        return rc_output, next_m, state

    def forward(
        self,
        utterance: torch.Tensor,
        lengths: torch.Tensor,
        right_context: torch.Tensor,
        mems: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        r"""Forward pass for training.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
            attention_mask (torch.Tensor): attention mask for underlying attention module.

        Returns:
            (Tensor, Tensor, Tensor):
                Tensor
                    encoded utterance frames, with shape `(T, B, D)`.
                Tensor
                    updated right context frames, with shape `(R, B, D)`.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
        """
        (
            layer_norm_utterance,
            layer_norm_right_context,
        ) = self._apply_pre_attention_layer_norm(utterance, right_context)
        rc_output, output_mems = self._apply_attention_forward(
            layer_norm_utterance, lengths, layer_norm_right_context, mems, attention_mask
        )
        output_utterance, output_right_context = self._apply_post_attention_ffn(rc_output, utterance, right_context)
        return output_utterance, output_right_context, output_mems

    @torch.jit.export
    def infer(
        self,
        utterance: torch.Tensor,
        lengths: torch.Tensor,
        right_context: torch.Tensor,
        state: Optional[List[torch.Tensor]],
        mems: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], torch.Tensor]:
        r"""Forward pass for inference.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            state (List[torch.Tensor] or None): list of tensors representing layer internal state
                generated in preceding invocation of ``infer``.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.

        Returns:
            (Tensor, Tensor, List[torch.Tensor], Tensor):
                Tensor
                    encoded utterance frames, with shape `(T, B, D)`.
                Tensor
                    updated right context frames, with shape `(R, B, D)`.
                List[Tensor]
                    list of tensors representing layer internal state
                    generated in current invocation of ``infer``.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
        """
        (
            layer_norm_utterance,
            layer_norm_right_context,
        ) = self._apply_pre_attention_layer_norm(utterance, right_context)
        rc_output, output_mems, output_state = self._apply_attention_infer(
            layer_norm_utterance, lengths, layer_norm_right_context, mems, state
        )
        output_utterance, output_right_context = self._apply_post_attention_ffn(rc_output, utterance, right_context)
        return output_utterance, output_right_context, output_state, output_mems


class _EmformerImpl(torch.nn.Module):
    def __init__(
        self,
        emformer_layers: torch.nn.ModuleList,
        segment_length: int,
        left_context_length: int = 0,
        right_context_length: int = 0,
        max_memory_size: int = 0,
    ):
        super().__init__()

        self.use_mem = max_memory_size > 0
        self.memory_op = torch.nn.AvgPool1d(kernel_size=segment_length, stride=segment_length, ceil_mode=True)
        self.emformer_layers = emformer_layers
        self.left_context_length = left_context_length
        self.right_context_length = right_context_length
        self.segment_length = segment_length
        self.max_memory_size = max_memory_size

    def _gen_right_context(self, input: torch.Tensor) -> torch.Tensor:
        T = input.shape[0]
        num_segs = math.ceil((T - self.right_context_length) / self.segment_length)
        right_context_blocks = []
        for seg_idx in range(num_segs - 1):
            start = (seg_idx + 1) * self.segment_length
            end = start + self.right_context_length
            right_context_blocks.append(input[start:end])
        right_context_blocks.append(input[T - self.right_context_length :])
        return torch.cat(right_context_blocks)

    def _gen_attention_mask_col_widths(self, seg_idx: int, utterance_length: int) -> List[int]:
        num_segs = math.ceil(utterance_length / self.segment_length)
        rc = self.right_context_length
        lc = self.left_context_length
        rc_start = seg_idx * rc
        rc_end = rc_start + rc
        seg_start = max(seg_idx * self.segment_length - lc, 0)
        seg_end = min((seg_idx + 1) * self.segment_length, utterance_length)
        rc_length = self.right_context_length * num_segs

        if self.use_mem:
            m_start = max(seg_idx - self.max_memory_size, 0)
            mem_length = num_segs - 1
            col_widths = [
                m_start,  # before memory
                seg_idx - m_start,  # memory
                mem_length - seg_idx,  # after memory
                rc_start,  # before right context
                rc,  # right context
                rc_length - rc_end,  # after right context
                seg_start,  # before query segment
                seg_end - seg_start,  # query segment
                utterance_length - seg_end,  # after query segment
            ]
        else:
            col_widths = [
                rc_start,  # before right context
                rc,  # right context
                rc_length - rc_end,  # after right context
                seg_start,  # before query segment
                seg_end - seg_start,  # query segment
                utterance_length - seg_end,  # after query segment
            ]

        return col_widths

    def _gen_attention_mask(self, input: torch.Tensor) -> torch.Tensor:
        utterance_length = input.size(0)
        num_segs = math.ceil(utterance_length / self.segment_length)

        rc_mask = []
        query_mask = []
        summary_mask = []

        if self.use_mem:
            num_cols = 9
            # memory, right context, query segment
            rc_q_cols_mask = [idx in [1, 4, 7] for idx in range(num_cols)]
            # right context, query segment
            s_cols_mask = [idx in [4, 7] for idx in range(num_cols)]
            masks_to_concat = [rc_mask, query_mask, summary_mask]
        else:
            num_cols = 6
            # right context, query segment
            rc_q_cols_mask = [idx in [1, 4] for idx in range(num_cols)]
            s_cols_mask = None
            masks_to_concat = [rc_mask, query_mask]

        for seg_idx in range(num_segs):
            col_widths = self._gen_attention_mask_col_widths(seg_idx, utterance_length)

            rc_mask_block = _gen_attention_mask_block(
                col_widths, rc_q_cols_mask, self.right_context_length, input.device
            )
            rc_mask.append(rc_mask_block)

            query_mask_block = _gen_attention_mask_block(
                col_widths,
                rc_q_cols_mask,
                min(self.segment_length, utterance_length - seg_idx * self.segment_length),
                input.device,
            )
            query_mask.append(query_mask_block)

            if s_cols_mask is not None:
                summary_mask_block = _gen_attention_mask_block(col_widths, s_cols_mask, 1, input.device)
                summary_mask.append(summary_mask_block)

        attention_mask = (1 - torch.cat([torch.cat(mask) for mask in masks_to_concat])).to(torch.bool)
        return attention_mask

    def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        r"""Forward pass for training and non-streaming inference.

        B: batch size;
        T: max number of input frames in batch;
        D: feature dimension of each frame.

        Args:
            input (torch.Tensor): utterance frames right-padded with right context frames, with
                shape `(B, T + right_context_length, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid utterance frames for i-th batch element in ``input``.

        Returns:
            (Tensor, Tensor):
                Tensor
                    output frames, with shape `(B, T, D)`.
                Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid frames for i-th batch element in output frames.
        """
        input = input.permute(1, 0, 2)
        right_context = self._gen_right_context(input)
        utterance = input[: input.size(0) - self.right_context_length]
        attention_mask = self._gen_attention_mask(utterance)
        mems = (
            self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)[:-1]
            if self.use_mem
            else torch.empty(0).to(dtype=input.dtype, device=input.device)
        )
        output = utterance
        for layer in self.emformer_layers:
            output, right_context, mems = layer(output, lengths, right_context, mems, attention_mask)
        return output.permute(1, 0, 2), lengths

    @torch.jit.export
    def infer(
        self,
        input: torch.Tensor,
        lengths: torch.Tensor,
        states: Optional[List[List[torch.Tensor]]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        r"""Forward pass for streaming inference.

        B: batch size;
        D: feature dimension of each frame.

        Args:
            input (torch.Tensor): utterance frames right-padded with right context frames, with
                shape `(B, segment_length + right_context_length, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.
            states (List[List[torch.Tensor]] or None, optional): list of lists of tensors
                representing internal state generated in preceding invocation of ``infer``. (Default: ``None``)

        Returns:
            (Tensor, Tensor, List[List[Tensor]]):
                Tensor
                    output frames, with shape `(B, segment_length, D)`.
                Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid frames for i-th batch element in output frames.
                List[List[Tensor]]
                    output states; list of lists of tensors representing internal state
                    generated in current invocation of ``infer``.
        """
        assert input.size(1) == self.segment_length + self.right_context_length, (
            "Per configured segment_length and right_context_length"
            f", expected size of {self.segment_length + self.right_context_length} for dimension 1 of input"
            f", but got {input.size(1)}."
        )
        input = input.permute(1, 0, 2)
        right_context_start_idx = input.size(0) - self.right_context_length
        right_context = input[right_context_start_idx:]
        utterance = input[:right_context_start_idx]
        output_lengths = torch.clamp(lengths - self.right_context_length, min=0)
        mems = (
            self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)
            if self.use_mem
            else torch.empty(0).to(dtype=input.dtype, device=input.device)
        )
        output = utterance
        output_states: List[List[torch.Tensor]] = []
        for layer_idx, layer in enumerate(self.emformer_layers):
            output, right_context, output_state, mems = layer.infer(
                output, lengths, right_context, None if states is None else states[layer_idx], mems
            )
            output_states.append(output_state)

        return output.permute(1, 0, 2), output_lengths, output_states
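
# Illustrative streaming loop over _EmformerImpl.infer (comment-only sketch
# added for clarity; ``stream`` is a hypothetical iterable of (chunk, lengths)
# pairs with chunk shape (B, segment_length + right_context_length, D)):
# >>> states = None
# >>> for chunk, chunk_lengths in stream:
# ...     output, output_lengths, states = model.infer(chunk, chunk_lengths, states)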


class Emformer(_EmformerImpl):
    r"""Implements the Emformer architecture introduced in
    *Emformer: Efficient Memory Transformer Based Acoustic Model for Low Latency Streaming Speech Recognition*
    [:footcite:`shi2021emformer`].

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads in each Emformer layer.
        ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network.
        num_layers (int): number of Emformer layers to instantiate.
        segment_length (int): length of each input segment.
        dropout (float, optional): dropout probability. (Default: 0.0)
        activation (str, optional): activation function to use in each Emformer layer's
            feedforward network. Must be one of ("relu", "gelu", "silu"). (Default: "relu")
        left_context_length (int, optional): length of left context. (Default: 0)
        right_context_length (int, optional): length of right context. (Default: 0)
        max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
        weight_init_scale_strategy (str or None, optional): per-layer weight initialization scaling
            strategy. Must be one of ("depthwise", "constant", ``None``). (Default: "depthwise")
        tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
        negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)

    Examples:
        >>> emformer = Emformer(512, 8, 2048, 20, 4, right_context_length=1)
        >>> input = torch.rand(128, 400, 512)  # batch, num_frames, feature_dim
        >>> lengths = torch.randint(1, 200, (128,))  # batch
        >>> output, lengths = emformer(input, lengths)
        >>> input = torch.rand(128, 5, 512)
        >>> lengths = torch.ones(128) * 5
        >>> output, lengths, states = emformer.infer(input, lengths, None)
    """

    def __init__(
        self,
        input_dim: int,
        num_heads: int,
        ffn_dim: int,
        num_layers: int,
        segment_length: int,
        dropout: float = 0.0,
        activation: str = "relu",
        left_context_length: int = 0,
        right_context_length: int = 0,
        max_memory_size: int = 0,
        weight_init_scale_strategy: Optional[str] = "depthwise",
        tanh_on_mem: bool = False,
        negative_inf: float = -1e8,
    ):
        weight_init_gains = _get_weight_init_gains(weight_init_scale_strategy, num_layers)
        emformer_layers = torch.nn.ModuleList(
            [
                _EmformerLayer(
                    input_dim,
                    num_heads,
                    ffn_dim,
                    segment_length,
                    dropout=dropout,
                    activation=activation,
                    left_context_length=left_context_length,
                    max_memory_size=max_memory_size,
                    weight_init_gain=weight_init_gains[layer_idx],
                    tanh_on_mem=tanh_on_mem,
                    negative_inf=negative_inf,
                )
                for layer_idx in range(num_layers)
            ]
        )
        super().__init__(
            emformer_layers,
            segment_length,
            left_context_length=left_context_length,
            right_context_length=right_context_length,
            max_memory_size=max_memory_size,
        )