import torch
from torch import nn
import torch.nn.functional as nnF
import torch.nn.quantized as nnq

from torch import Tensor
from typing import Optional, Tuple

import warnings


class MultiheadAttention(nn.MultiheadAttention):
    _FLOAT_MODULE = nn.MultiheadAttention

    __constants__ = ['batch_first']

    def __init__(self, embed_dim: int, num_heads: int,
                 dropout: float = 0., bias: bool = True,
                 add_bias_kv: bool = False, add_zero_attn: bool = False,
                 kdim: int = None, vdim: int = None, batch_first: bool = False,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super(MultiheadAttention, self).__init__(embed_dim, num_heads, dropout,
                                                 bias, add_bias_kv, add_zero_attn,
                                                 kdim, vdim, batch_first,
                                                 **factory_kwargs)
        # Explicit linear projections, so that each of them can be observed
        # and quantized separately (the float module fuses them).
        self.linear_Q = nn.Linear(self.embed_dim, self.embed_dim, bias=bias, **factory_kwargs)
        self.linear_K = nn.Linear(self.kdim, self.embed_dim, bias=bias, **factory_kwargs)
        self.linear_V = nn.Linear(self.vdim, self.embed_dim, bias=bias, **factory_kwargs)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=bias, **factory_kwargs)

        # Functionals
        self.q_scaling_product = nnq.FloatFunctional()

        # Quant/Dequant stubs
        self.quant_attn_output = torch.ao.quantization.QuantStub()
        self.quant_attn_output_weights = torch.ao.quantization.QuantStub()
        self.dequant_q = torch.ao.quantization.DeQuantStub()
        self.dequant_k = torch.ao.quantization.DeQuantStub()
        self.dequant_v = torch.ao.quantization.DeQuantStub()

    def _get_name(self):
        return 'QuantizableMultiheadAttention'

    @classmethod
    def from_float(cls, other):
        assert type(other) == cls._FLOAT_MODULE
        assert hasattr(other, 'qconfig'), "The float module must have 'qconfig'"
        observed = cls(other.embed_dim, other.num_heads, other.dropout,
                       (other.in_proj_bias is not None),
                       (other.bias_k is not None),
                       other.add_zero_attn, other.kdim, other.vdim,
                       other.batch_first)
        observed.bias_k = other.bias_k
        observed.bias_v = other.bias_v
        observed.qconfig = other.qconfig

        # Set the linear weights
        observed.out_proj.weight = other.out_proj.weight
        observed.out_proj.bias = other.out_proj.bias
        if other._qkv_same_embed_dim:
            # Split the fused in_proj parameters into separate Q/K/V params
            bias = other.in_proj_bias
            _start = 0
            _end = _start + other.embed_dim
            weight = other.in_proj_weight[_start:_end, :]
            if bias is not None:
                bias = torch.nn.Parameter(bias[_start:_end], bias.requires_grad)
            observed.linear_Q.weight = torch.nn.Parameter(weight, weight.requires_grad)
            observed.linear_Q.bias = bias

            bias = other.in_proj_bias
            _start = _end
            _end = _start + other.embed_dim
            weight = other.in_proj_weight[_start:_end, :]
            if bias is not None:
                bias = torch.nn.Parameter(bias[_start:_end], bias.requires_grad)
            observed.linear_K.weight = torch.nn.Parameter(weight, weight.requires_grad)
            observed.linear_K.bias = bias

            bias = other.in_proj_bias
            _start = _end
            weight = other.in_proj_weight[_start:, :]
            if bias is not None:
                bias = torch.nn.Parameter(bias[_start:], bias.requires_grad)
            observed.linear_V.weight = torch.nn.Parameter(weight, weight.requires_grad)
            observed.linear_V.bias = bias
        else:
            observed.linear_Q.weight = nn.Parameter(other.q_proj_weight)
            observed.linear_K.weight = nn.Parameter(other.k_proj_weight)
            observed.linear_V.weight = nn.Parameter(other.v_proj_weight)
            if other.in_proj_bias is None:
                observed.linear_Q.bias = None
                observed.linear_K.bias = None
                observed.linear_V.bias = None
            else:
                observed.linear_Q.bias = nn.Parameter(other.in_proj_bias[0:other.embed_dim])
                observed.linear_K.bias = nn.Parameter(other.in_proj_bias[other.embed_dim:(other.embed_dim * 2)])
                observed.linear_V.bias = nn.Parameter(other.in_proj_bias[(other.embed_dim * 2):])
        observed.eval()
        # Explicit prepare
        observed = torch.ao.quantization.prepare(observed, inplace=True)
        return observed

    @torch.jit.unused
    def dequantize(self):
        r"""Utility to convert the quantized MHA back to float.

        The motivation for this is that it is not trivial to convert the
        weights from the format used in the quantized version back to the
        float one.
        """
        fp = self._FLOAT_MODULE(self.embed_dim, self.num_heads, self.dropout,
                                (self.in_proj_bias is not None),
                                (self.bias_k is not None),
                                self.add_zero_attn, self.kdim, self.vdim,
                                self.batch_first)
        assert fp._qkv_same_embed_dim == self._qkv_same_embed_dim
        if self.bias_k is not None:
            fp.bias_k = nn.Parameter(self.bias_k.dequantize())
        if self.bias_v is not None:
            fp.bias_v = nn.Parameter(self.bias_v.dequantize())

        # Set the linear weights
        w, b = self.out_proj._weight_bias()
        fp.out_proj.weight = nn.Parameter(w.dequantize())
        if b is not None:
            fp.out_proj.bias = nn.Parameter(b)

        wQ, bQ = self.linear_Q._weight_bias()
        wQ = wQ.dequantize()
        wK, bK = self.linear_K._weight_bias()
        wK = wK.dequantize()
        wV, bV = self.linear_V._weight_bias()
        wV = wV.dequantize()
        if fp._qkv_same_embed_dim:
            # Write the separate Q/K/V params back into the fused in_proj
            _start = 0
            _end = _start + fp.embed_dim
            fp.in_proj_weight[_start:_end, :] = wQ
            if fp.in_proj_bias is not None:
                assert all(bQ == 0)
                fp.in_proj_bias[_start:_end] = bQ

            _start = _end
            _end = _start + fp.embed_dim
            fp.in_proj_weight[_start:_end, :] = wK
            if fp.in_proj_bias is not None:
                assert all(bK == 0)
                fp.in_proj_bias[_start:_end] = bK

            _start = _end
            fp.in_proj_weight[_start:, :] = wV
            if fp.in_proj_bias is not None:
                assert all(bV == 0)
                fp.in_proj_bias[_start:] = bV
        else:
            fp.q_proj_weight = nn.Parameter(wQ)
            fp.k_proj_weight = nn.Parameter(wK)
            fp.v_proj_weight = nn.Parameter(wV)
            if fp.in_proj_bias is None:
                self.linear_Q.bias = None
                self.linear_K.bias = None
                self.linear_V.bias = None
            else:
                fp.in_proj_bias[0:fp.embed_dim] = bQ
                fp.in_proj_bias[fp.embed_dim:(fp.embed_dim * 2)] = bK
                fp.in_proj_bias[(fp.embed_dim * 2):] = bV
        return fp

    @classmethod
    def from_observed(cls, other):
        converted = torch.ao.quantization.convert(other, mapping=None,
                                                  inplace=False,
                                                  remove_qconfig=True,
                                                  convert_custom_config_dict=None)
        # Remove the bias_k and bias_v parameters so they can be quantized.
        if converted.bias_k is not None:
            bias_k = converted._parameters.pop('bias_k')
            sc, zp = torch._choose_qparams_per_tensor(bias_k, reduce_range=False)
            bias_k = torch.quantize_per_tensor(bias_k, sc, zp, torch.quint8)
            setattr(converted, 'bias_k', bias_k)

        if converted.bias_v is not None:
            bias_v = converted._parameters.pop('bias_v')
            sc, zp = torch._choose_qparams_per_tensor(bias_v, reduce_range=False)
            bias_v = torch.quantize_per_tensor(bias_v, sc, zp, torch.quint8)
            setattr(converted, 'bias_v', bias_v)

        return converted

    def forward(self,
                query: Tensor,
                key: Tensor,
                value: Tensor,
                key_padding_mask: Optional[Tensor] = None,
                need_weights: bool = True,
                attn_mask: Optional[Tensor] = None,
                average_attn_weights: bool = True) -> Tuple[Tensor, Optional[Tensor]]:
        r"""
    Note::
        Please refer to :func:`~torch.nn.MultiheadAttention.forward` for more
        information.

    Args:
        query, key, value: map a query and a set of key-value pairs to an output.
            See "Attention Is All You Need" for more details.
        key_padding_mask: if provided, specified padding elements in the key will
            be ignored by the attention. When given a binary mask and a value is True,
            the corresponding value on the attention layer will be ignored. When given
            a byte mask and a value is non-zero, the corresponding value on the attention
            layer will be ignored.
        need_weights: output attn_output_weights.
        attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcast across all
            the batches, while a 3D mask allows a different mask to be specified for each entry in the batch.

    Shape:
        - Inputs:
        - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
          the embedding dimension. :math:`(N, L, E)` if ``batch_first`` is ``True``.
        - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
          the embedding dimension. :math:`(N, S, E)` if ``batch_first`` is ``True``.
        - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
          the embedding dimension. :math:`(N, S, E)` if ``batch_first`` is ``True``.
        - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
          If a ByteTensor is provided, the non-zero positions will be ignored while the positions
          with zero values will be unchanged. If a BoolTensor is provided, the positions with the
          value of ``True`` will be ignored while the positions with the value of ``False`` will be unchanged.
        - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
          3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
          S is the source sequence length. attn_mask ensures that position i is allowed to attend to the unmasked
          positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
          while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
          are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
          is provided, it will be added to the attention weight.
        - average_attn_weights: If ``True``, indicates that the returned ``attn_weights`` should be averaged across
          heads. Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an
          effect when ``need_weights=True``. Default: ``True`` (i.e. average weights across heads)

        - Outputs:
        - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
          E is the embedding dimension. :math:`(N, L, E)` if ``batch_first`` is ``True``.
        - attn_output_weights: If ``average_attn_weights=True``, returns attention weights averaged
          across heads of shape :math:`(N, L, S)`, where N is the batch size, L is the target sequence length,
          S is the source sequence length. If ``average_attn_weights=False``, returns attention weights per
          head of shape :math:`(N, num_heads, L, S)`.
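
        Example (illustrative sketch only; the layer sizes and tensor shapes
        below are arbitrary example values, not defaults)::

            >>> mha = MultiheadAttention(embed_dim=8, num_heads=2)
            >>> q = k = v = torch.rand(5, 3, 8)  # (L, N, E) since batch_first=False
            >>> attn_output, attn_weights = mha(q, k, v)
            >>> attn_output.shape
            torch.Size([5, 3, 8])
            >>> attn_weights.shape  # (N, L, S), averaged across heads
            torch.Size([3, 5, 5])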
        )Ú_forward_impl)r$   rN   rO   rP   rQ   rR   rS   rT   r'   r'   r(   Úforwardù   s
    8  ÿzMultiheadAttention.forwardc                 C   sØ  d }d }	| j r(dd„ |||fD ƒ\}}}| ¡ \}
}}| j|ksDt‚| d¡| d¡krl| d¡| d¡kspt‚| j| j }|| j | jks”tdƒ‚t|ƒd }|  |¡}|  |¡}|  |¡}| j	 
||¡}|d k	rê|jtjks,|jtjks,|jtjks,|jtjks,|jtjks,td |j¡ƒ‚|jtjkrPt d¡ | tj¡}| ¡ d	kr–| d¡}t| ¡ ƒd| d¡| d¡gkrêtd
ƒ‚nT| ¡ dkrØt| ¡ ƒ|| j | d¡| d¡gkrêtdƒ‚ntd | ¡ ¡ƒ‚|d k	r|jtjkrt d¡ | tj¡}| jd k	rð| jd k	rð|d krÊ|	d krÊ| j}|d k	sXt‚| j}|d k	slt‚t || d|d¡g¡}t || d|d¡g¡}|d k	r²t |d¡}|d k	rît |d¡}n$|d ksÜtdƒ‚|	d kstdƒ‚n | jd ks t‚| jd kst‚|  ¡  !|
|| j |¡ "dd¡}|d k	rZ|  ¡  !d|| j |¡ "dd¡}|d k	r„|  ¡  !d|| j |¡ "dd¡}|d k	rÀ| d¡|| j ks¨t‚| d	¡|ks¼t‚|}|	d k	rü|	 d¡|| j ksät‚|	 d	¡|ksøt‚|	}| d¡}|d k	r8| d¡|ks$t‚| d¡|ks8t‚| j#r$|d7 }t $| d¡df| ¡ d	d …  ¡}|j%rŽt &|| '¡ | (¡ |j¡}tj||gdd}t $| d¡df| ¡ d	d …  ¡}|j%ræt &|| '¡ | (¡ |j¡}tj||gdd}|d k	rt |d¡}|d k	r$t |d¡}|  )|¡}|  *|¡}|  +|¡}t ,|| "dd	¡¡}t| ¡ ƒ|| j |
|gksxt‚|d k	rª|jtjkr¢| -|tdƒ¡ n||7 }|d k	rö| !|| j|
|¡}| .| d¡ d	¡tdƒ¡}| !|| j |
|¡}tj/|dd}tj0|| j0| j1d}t ,||¡}t| ¡ ƒ|| j |
|gksFt‚| j r`| !||
| j¡}n| "dd¡  ¡  !|
|| j¡}|  2|¡}|  3|¡}|  4|¡}|rÌ| !|| j|
|¡}|rÄ|j5dd}||fS |d fS d S )Nc                 S   s   g | ]}|  d d¡‘qS ©r   é   )Ú	transpose)Ú.0Úxr'   r'   r(   Ú
<listcomp>E  s     z4MultiheadAttention._forward_impl.<locals>.<listcomp>r   rX   z(embed_dim must be divisible by num_headsg      à¿zDOnly float, byte, and bool types are supported for attn_mask, not {}zZByte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.r+   z,The size of the 2D attn_mask is not correct.é   z,The size of the 3D attn_mask is not correct.z)attn_mask's dimension {} is not supportedzaByte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.rW   z#bias cannot be added to static key.z%bias cannot be added to static value.éÿÿÿÿ)Údimz-inf)ÚpÚtraining)6r   Úsizer	   r/   r
   Úfloatr   r   r   r   Z
mul_scalarr   r   Zfloat32Zfloat64Zfloat16Zuint8ÚboolÚformatÚwarningsÚwarnÚtor_   Z	unsqueezeÚlistÚRuntimeErrorr2   r3   ÚcatÚrepeatÚnnFÚpadÚ
contiguousÚviewrY   r   ÚzerosZis_quantizedrK   Zq_scaleZq_zero_pointr!   r"   r#   ZbmmZmasked_fill_Zmasked_fillZsoftmaxr   ra   r   r   r    Zmean)r$   rN   rO   rP   rQ   rR   rS   rT   Zstatic_kZstatic_vZtgt_lenZbszZembed_dim_to_checkZhead_dimZscalingÚqÚkÚvr2   r3   Zsrc_lenZk_zerosZv_zerosZattn_output_weightsZattn_outputr'   r'   r(   rU   4  sð    ,




ÿ
ÿ
ÿ
þ

$
*



 
 
 



$$




"

þ ÿ"


z MultiheadAttention._forward_impl)	r   TFFNNFNN)NTNT)NTNT)Ú__name__Ú
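

# Hedged usage sketch (illustrative only, not part of the library API): the
# block below shows the intended float -> observed calibration flow for the
# class above. The layer sizes, tensor shapes, and qconfig choice are
# arbitrary example values.
if __name__ == "__main__":
    embed_dim, num_heads, seq_len, batch_size = 8, 2, 5, 3

    # Start from a regular float MultiheadAttention and attach a qconfig;
    # `from_float` requires the attribute to be present.
    float_mha = nn.MultiheadAttention(embed_dim, num_heads)
    float_mha.qconfig = torch.ao.quantization.default_qconfig

    # Swap in the quantizable implementation and insert observers.
    observed = MultiheadAttention.from_float(float_mha)

    # Run a calibration pass with representative data (random here).
    query = torch.rand(seq_len, batch_size, embed_dim)
    key = torch.rand(seq_len, batch_size, embed_dim)
    value = torch.rand(seq_len, batch_size, embed_dim)
    attn_output, attn_weights = observed(query, key, value)
    print(attn_output.shape, attn_weights.shape)  # (L, N, E) and (N, L, S)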