# torchvision/models/vision_transformer.py
import math
from collections import OrderedDict
from functools import partial
from typing import Any, Callable, Dict, List, NamedTuple, Optional

import torch
import torch.nn as nn

from ..ops.misc import Conv2dNormActivation, MLP
from ..transforms._presets import ImageClassification, InterpolationMode
from ..utils import _log_api_usage_once
from ._api import Weights, WeightsEnum
from ._meta import _IMAGENET_CATEGORIES
from ._utils import _ovewrite_named_param, handle_legacy_interface


__all__ = [
    "VisionTransformer",
    "ViT_B_16_Weights",
    "ViT_B_32_Weights",
    "ViT_L_16_Weights",
    "ViT_L_32_Weights",
    "ViT_H_14_Weights",
    "vit_b_16",
    "vit_b_32",
    "vit_l_16",
    "vit_l_32",
    "vit_h_14",
]


class ConvStemConfig(NamedTuple):
    out_channels: int
    kernel_size: int
    stride: int
    norm_layer: Callable[..., nn.Module] = nn.BatchNorm2d
    activation_layer: Callable[..., nn.Module] = nn.ReLU


class MLPBlock(MLP):
    """Transformer MLP block."""

    _version = 2

    def __init__(self, in_dim: int, mlp_dim: int, dropout: float):
        super().__init__(in_dim, [mlp_dim, in_dim], activation_layer=nn.GELU, inplace=None, dropout=dropout)

        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.normal_(m.bias, std=1e-6)

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        version = local_metadata.get("version", None)

        if version is None or version < 2:
            # Remap the legacy ``linear_1``/``linear_2`` parameter names to the MLP layout
            # (the Linear layers now live at indices 0 and 3 of the sequential MLP).
            for i in range(2):
                for type in ["weight", "bias"]:
                    old_key = f"{prefix}linear_{i+1}.{type}"
                    new_key = f"{prefix}{3*i}.{type}"
                    if old_key in state_dict:
                        state_dict[new_key] = state_dict.pop(old_key)

        super()._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
        )


class EncoderBlock(nn.Module):
    """Transformer encoder block."""

    def __init__(
        self,
        num_heads: int,
        hidden_dim: int,
        mlp_dim: int,
        dropout: float,
        attention_dropout: float,
        norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6),
    ):
        super().__init__()
        self.num_heads = num_heads

        # Attention block
        self.ln_1 = norm_layer(hidden_dim)
        self.self_attention = nn.MultiheadAttention(hidden_dim, num_heads, dropout=attention_dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

        # MLP block
        self.ln_2 = norm_layer(hidden_dim)
        self.mlp = MLPBlock(hidden_dim, mlp_dim, dropout)

    def forward(self, input: torch.Tensor):
        torch._assert(input.dim() == 3, f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}")
        x = self.ln_1(input)
        x, _ = self.self_attention(query=x, key=x, value=x, need_weights=False)
        x = self.dropout(x)
        x = x + input

        y = self.ln_2(x)
        y = self.mlp(y)
        return x + y


class Encoder(nn.Module):
    """Transformer Model Encoder for sequence to sequence translation."""

    def __init__(
        self,
        seq_length: int,
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        mlp_dim: int,
        dropout: float,
        attention_dropout: float,
        norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6),
    ):
        super().__init__()
        # Learnable position embedding, initialized with std=0.02 as in BERT.
        self.pos_embedding = nn.Parameter(torch.empty(1, seq_length, hidden_dim).normal_(std=0.02))
        self.dropout = nn.Dropout(dropout)
        layers: OrderedDict[str, nn.Module] = OrderedDict()
        for i in range(num_layers):
            layers[f"encoder_layer_{i}"] = EncoderBlock(
                num_heads, hidden_dim, mlp_dim, dropout, attention_dropout, norm_layer
            )
        self.layers = nn.Sequential(layers)
        self.ln = norm_layer(hidden_dim)

    def forward(self, input: torch.Tensor):
        torch._assert(input.dim() == 3, f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}")
        input = input + self.pos_embedding
        return self.ln(self.layers(self.dropout(input)))


class VisionTransformer(nn.Module):
    """Vision Transformer as per https://arxiv.org/abs/2010.11929."""

    def __init__(
        self,
        image_size: int,
        patch_size: int,
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        mlp_dim: int,
        dropout: float = 0.0,
        attention_dropout: float = 0.0,
        num_classes: int = 1000,
        representation_size: Optional[int] = None,
        norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6),
        conv_stem_configs: Optional[List[ConvStemConfig]] = None,
    ):
        super().__init__()
        _log_api_usage_once(self)
        torch._assert(image_size % patch_size == 0, "Input shape indivisible by patch size!")
        self.image_size = image_size
        self.patch_size = patch_size
        self.hidden_dim = hidden_dim
        self.mlp_dim = mlp_dim
        self.attention_dropout = attention_dropout
        self.dropout = dropout
        self.num_classes = num_classes
        self.representation_size = representation_size
        self.norm_layer = norm_layer

        if conv_stem_configs is not None:
            # Convolutional stem as per https://arxiv.org/abs/2106.14881
            seq_proj = nn.Sequential()
            prev_channels = 3
            for i, conv_stem_layer_config in enumerate(conv_stem_configs):
                seq_proj.add_module(
                    f"conv_bn_relu_{i}",
                    Conv2dNormActivation(
                        in_channels=prev_channels,
                        out_channels=conv_stem_layer_config.out_channels,
                        kernel_size=conv_stem_layer_config.kernel_size,
                        stride=conv_stem_layer_config.stride,
                        norm_layer=conv_stem_layer_config.norm_layer,
                        activation_layer=conv_stem_layer_config.activation_layer,
                    ),
                )
                prev_channels = conv_stem_layer_config.out_channels
            seq_proj.add_module(
                "conv_last", nn.Conv2d(in_channels=prev_channels, out_channels=hidden_dim, kernel_size=1)
            )
            self.conv_proj: nn.Module = seq_proj
        else:
            self.conv_proj = nn.Conv2d(
                in_channels=3, out_channels=hidden_dim, kernel_size=patch_size, stride=patch_size
            )

        seq_length = (image_size // patch_size) ** 2

        # Add a class token
        self.class_token = nn.Parameter(torch.zeros(1, 1, hidden_dim))
        seq_length += 1

        self.encoder = Encoder(
            seq_length, num_layers, num_heads, hidden_dim, mlp_dim, dropout, attention_dropout, norm_layer
        )
        self.seq_length = seq_length

        heads_layers: OrderedDict[str, nn.Module] = OrderedDict()
        if representation_size is None:
            heads_layers["head"] = nn.Linear(hidden_dim, num_classes)
        else:
            heads_layers["pre_logits"] = nn.Linear(hidden_dim, representation_size)
            heads_layers["act"] = nn.Tanh()
            heads_layers["head"] = nn.Linear(representation_size, num_classes)
        self.heads = nn.Sequential(heads_layers)

        if isinstance(self.conv_proj, nn.Conv2d):
            # Init the patchify stem
            fan_in = self.conv_proj.in_channels * self.conv_proj.kernel_size[0] * self.conv_proj.kernel_size[1]
            nn.init.trunc_normal_(self.conv_proj.weight, std=math.sqrt(1 / fan_in))
            if self.conv_proj.bias is not None:
                nn.init.zeros_(self.conv_proj.bias)
        elif self.conv_proj.conv_last is not None and isinstance(self.conv_proj.conv_last, nn.Conv2d):
            # Init the last 1x1 conv of the conv stem
            nn.init.normal_(
                self.conv_proj.conv_last.weight, mean=0.0, std=math.sqrt(2.0 / self.conv_proj.conv_last.out_channels)
            )
            if self.conv_proj.conv_last.bias is not None:
                nn.init.zeros_(self.conv_proj.conv_last.bias)

        if hasattr(self.heads, "pre_logits") and isinstance(self.heads.pre_logits, nn.Linear):
            fan_in = self.heads.pre_logits.in_features
            nn.init.trunc_normal_(self.heads.pre_logits.weight, std=math.sqrt(1 / fan_in))
            nn.init.zeros_(self.heads.pre_logits.bias)

        if isinstance(self.heads.head, nn.Linear):
            nn.init.zeros_(self.heads.head.weight)
            nn.init.zeros_(self.heads.head.bias)

    def _process_input(self, x: torch.Tensor) -> torch.Tensor:
        n, c, h, w = x.shape
        p = self.patch_size
        torch._assert(h == self.image_size, "Wrong image height!")
        torch._assert(w == self.image_size, "Wrong image width!")
        n_h = h // p
        n_w = w // p

        # (n, c, h, w) -> (n, hidden_dim, n_h, n_w) -> (n, hidden_dim, n_h * n_w) -> (n, n_h * n_w, hidden_dim)
        # The self-attention layer expects inputs in the format (N, S, E), where S is the
        # sequence length and E is the embedding dimension.
        x = self.conv_proj(x)
        x = x.reshape(n, self.hidden_dim, n_h * n_w)
        x = x.permute(0, 2, 1)

        return x

    def forward(self, x: torch.Tensor):
        # Reshape and permute the input tensor into a sequence of patch embeddings
        x = self._process_input(x)
        n = x.shape[0]

        # Expand the class token to the full batch and prepend it to the sequence
        batch_class_token = self.class_token.expand(n, -1, -1)
        x = torch.cat([batch_class_token, x], dim=1)

        x = self.encoder(x)

        # Classifier "token" as used by standard language architectures
        x = x[:, 0]

        return self.heads(x)


def _vision_transformer(
    patch_size: int, num_layers: int, num_heads: int, hidden_dim: int, mlp_dim: int,
    weights: Optional[WeightsEnum], progress: bool, **kwargs: Any,
) -> VisionTransformer:
    if weights is not None:
        _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"]))
        assert weights.meta["min_size"][0] == weights.meta["min_size"][1]
        _ovewrite_named_param(kwargs, "image_size", weights.meta["min_size"][0])
    image_size = kwargs.pop("image_size", 224)

    model = VisionTransformer(
        image_size=image_size, patch_size=patch_size, num_layers=num_layers,
        num_heads=num_heads, hidden_dim=hidden_dim, mlp_dim=mlp_dim, **kwargs,
    )

    if weights:
        model.load_state_dict(weights.get_state_dict(progress=progress))

    return model


_COMMON_META: Dict[str, Any] = {
    "categories": _IMAGENET_CATEGORIES,
}

_COMMON_SWAG_META = {
    **_COMMON_META,
    "recipe": "https://github.com/facebookresearch/SWAG",
    "license": "https://github.com/facebookresearch/SWAG/blob/main/LICENSE",
}


class ViT_B_16_Weights(WeightsEnum):
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/vit_b_16-c867db91.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "num_params": 86567656,
            "min_size": (224, 224),
            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_b_16",
            "_metrics": {"ImageNet-1K": {"acc@1": 81.072, "acc@5": 95.318}},
            "_docs": "These weights were trained from scratch by using a modified version of "
            "`DeIT <https://arxiv.org/abs/2012.12877>`_'s training recipe.",
        },
    )
    IMAGENET1K_SWAG_E2E_V1 = Weights(
        url="https://download.pytorch.org/models/vit_b_16_swag-9ac1b537.pth",
        transforms=partial(
            ImageClassification, crop_size=384, resize_size=384, interpolation=InterpolationMode.BICUBIC
        ),
        meta={
            **_COMMON_SWAG_META,
            "num_params": 86859496,
            "min_size": (384, 384),
            "_metrics": {"ImageNet-1K": {"acc@1": 85.304, "acc@5": 97.650}},
            "_docs": "These weights are learnt via transfer learning by end-to-end fine-tuning the original "
            "`SWAG <https://arxiv.org/abs/2201.08371>`_ weights on ImageNet-1K data.",
        },
    )
    IMAGENET1K_SWAG_LINEAR_V1 = Weights(
        url="https://download.pytorch.org/models/vit_b_16_lc_swag-4e70ced5.pth",
        transforms=partial(
            ImageClassification, crop_size=224, resize_size=224, interpolation=InterpolationMode.BICUBIC
        ),
        meta={
            **_COMMON_SWAG_META,
            "recipe": "https://github.com/pytorch/vision/pull/5793",
            "num_params": 86567656,
            "min_size": (224, 224),
            "_metrics": {"ImageNet-1K": {"acc@1": 81.886, "acc@5": 96.180}},
            "_docs": "These weights are composed of the original frozen `SWAG <https://arxiv.org/abs/2201.08371>`_ "
            "trunk weights and a linear classifier learnt on top of them trained on ImageNet-1K data.",
        },
    )
    DEFAULT = IMAGENET1K_V1


class ViT_B_32_Weights(WeightsEnum):
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/vit_b_32-d86f8d99.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "num_params": 88224232,
            "min_size": (224, 224),
            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_b_32",
            "_metrics": {"ImageNet-1K": {"acc@1": 75.912, "acc@5": 92.466}},
            "_docs": "These weights were trained from scratch by using a modified version of "
            "`DeIT <https://arxiv.org/abs/2012.12877>`_'s training recipe.",
        },
    )
    DEFAULT = IMAGENET1K_V1


class ViT_L_16_Weights(WeightsEnum):
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/vit_l_16-852ce7e3.pth",
        transforms=partial(ImageClassification, crop_size=224, resize_size=242),
        meta={
            **_COMMON_META,
            "num_params": 304326632,
            "min_size": (224, 224),
            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_l_16",
            "_metrics": {"ImageNet-1K": {"acc@1": 79.662, "acc@5": 94.638}},
            "_docs": "These weights were trained from scratch by using a modified version of TorchVision's "
            "`new training recipe "
            "<https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.",
        },
    )
    IMAGENET1K_SWAG_E2E_V1 = Weights(
        url="https://download.pytorch.org/models/vit_l_16_swag-4f3808c9.pth",
        transforms=partial(
            ImageClassification, crop_size=512, resize_size=512, interpolation=InterpolationMode.BICUBIC
        ),
        meta={
            **_COMMON_SWAG_META,
            "num_params": 305174504,
            "min_size": (512, 512),
            "_metrics": {"ImageNet-1K": {"acc@1": 88.064, "acc@5": 98.512}},
            "_docs": "These weights are learnt via transfer learning by end-to-end fine-tuning the original "
            "`SWAG <https://arxiv.org/abs/2201.08371>`_ weights on ImageNet-1K data.",
        },
    )
    IMAGENET1K_SWAG_LINEAR_V1 = Weights(
        url="https://download.pytorch.org/models/vit_l_16_lc_swag-4d563306.pth",
        transforms=partial(
            ImageClassification, crop_size=224, resize_size=224, interpolation=InterpolationMode.BICUBIC
        ),
        meta={
            **_COMMON_SWAG_META,
            "recipe": "https://github.com/pytorch/vision/pull/5793",
            "num_params": 304326632,
            "min_size": (224, 224),
            "_metrics": {"ImageNet-1K": {"acc@1": 85.146, "acc@5": 97.422}},
            "_docs": "These weights are composed of the original frozen `SWAG <https://arxiv.org/abs/2201.08371>`_ "
            "trunk weights and a linear classifier learnt on top of them trained on ImageNet-1K data.",
        },
    )
    DEFAULT = IMAGENET1K_V1


class ViT_L_32_Weights(WeightsEnum):
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/vit_l_32-c7638314.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "num_params": 306535400,
            "min_size": (224, 224),
            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#vit_l_32",
            "_metrics": {"ImageNet-1K": {"acc@1": 76.972, "acc@5": 93.07}},
            "_docs": "These weights were trained from scratch by using a modified version of "
            "`DeIT <https://arxiv.org/abs/2012.12877>`_'s training recipe.",
        },
    )
    DEFAULT = IMAGENET1K_V1


class ViT_H_14_Weights(WeightsEnum):
    IMAGENET1K_SWAG_E2E_V1 = Weights(
        url="https://download.pytorch.org/models/vit_h_14_swag-80465313.pth",
        transforms=partial(
            ImageClassification, crop_size=518, resize_size=518, interpolation=InterpolationMode.BICUBIC
        ),
        meta={
            **_COMMON_SWAG_META,
            "num_params": 633470440,
            "min_size": (518, 518),
            "_metrics": {"ImageNet-1K": {"acc@1": 88.552, "acc@5": 98.694}},
            "_docs": "These weights are learnt via transfer learning by end-to-end fine-tuning the original "
            "`SWAG <https://arxiv.org/abs/2201.08371>`_ weights on ImageNet-1K data.",
        },
    )
    IMAGENET1K_SWAG_LINEAR_V1 = Weights(
        url="https://download.pytorch.org/models/vit_h_14_lc_swag-c1eb923e.pth",
        transforms=partial(
            ImageClassification, crop_size=224, resize_size=224, interpolation=InterpolationMode.BICUBIC
        ),
        meta={
            **_COMMON_SWAG_META,
            "recipe": "https://github.com/pytorch/vision/pull/5793",
            "num_params": 632045800,
            "min_size": (224, 224),
            "_metrics": {"ImageNet-1K": {"acc@1": 85.708, "acc@5": 97.730}},
            "_docs": "These weights are composed of the original frozen `SWAG <https://arxiv.org/abs/2201.08371>`_ "
            "trunk weights and a linear classifier learnt on top of them trained on ImageNet-1K data.",
        },
    )
    DEFAULT = IMAGENET1K_SWAG_E2E_V1


@handle_legacy_interface(weights=("pretrained", ViT_B_16_Weights.IMAGENET1K_V1))
def vit_b_16(*, weights: Optional[ViT_B_16_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
    """Constructs a vit_b_16 architecture from
    `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`_.

    Args:
        weights (:class:`~torchvision.models.ViT_B_16_Weights`, optional): The pretrained weights to use.
            By default, no pre-trained weights are used.
        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.vision_transformer.VisionTransformer`` base class.

    .. autoclass:: torchvision.models.ViT_B_16_Weights
        :members:
    """
    weights = ViT_B_16_Weights.verify(weights)

    return _vision_transformer(
        patch_size=16, num_layers=12, num_heads=12, hidden_dim=768, mlp_dim=3072,
        weights=weights, progress=progress, **kwargs,
    )


@handle_legacy_interface(weights=("pretrained", ViT_B_32_Weights.IMAGENET1K_V1))
def vit_b_32(*, weights: Optional[ViT_B_32_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
    """Constructs a vit_b_32 architecture from
    `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`_.

    Args:
        weights (:class:`~torchvision.models.ViT_B_32_Weights`, optional): The pretrained weights to use.
            By default, no pre-trained weights are used.
        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.vision_transformer.VisionTransformer`` base class.

    .. autoclass:: torchvision.models.ViT_B_32_Weights
        :members:
    """
    weights = ViT_B_32_Weights.verify(weights)

    return _vision_transformer(
        patch_size=32, num_layers=12, num_heads=12, hidden_dim=768, mlp_dim=3072,
        weights=weights, progress=progress, **kwargs,
    )


@handle_legacy_interface(weights=("pretrained", ViT_L_16_Weights.IMAGENET1K_V1))
def vit_l_16(*, weights: Optional[ViT_L_16_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
    """Constructs a vit_l_16 architecture from
    `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`_.

    Args:
        weights (:class:`~torchvision.models.ViT_L_16_Weights`, optional): The pretrained weights to use.
            By default, no pre-trained weights are used.
        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.vision_transformer.VisionTransformer`` base class.

    .. autoclass:: torchvision.models.ViT_L_16_Weights
        :members:
    """
    weights = ViT_L_16_Weights.verify(weights)

    return _vision_transformer(
        patch_size=16, num_layers=24, num_heads=16, hidden_dim=1024, mlp_dim=4096,
        weights=weights, progress=progress, **kwargs,
    )


@handle_legacy_interface(weights=("pretrained", ViT_L_32_Weights.IMAGENET1K_V1))
def vit_l_32(*, weights: Optional[ViT_L_32_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
    """Constructs a vit_l_32 architecture from
    `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`_.

    Args:
        weights (:class:`~torchvision.models.ViT_L_32_Weights`, optional): The pretrained weights to use.
            By default, no pre-trained weights are used.
        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.vision_transformer.VisionTransformer`` base class.

    .. autoclass:: torchvision.models.ViT_L_32_Weights
        :members:
    """
    weights = ViT_L_32_Weights.verify(weights)

    return _vision_transformer(
        patch_size=32, num_layers=24, num_heads=16, hidden_dim=1024, mlp_dim=4096,
        weights=weights, progress=progress, **kwargs,
    )


def vit_h_14(*, weights: Optional[ViT_H_14_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
    """Constructs a vit_h_14 architecture from
    `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`_.

    Args:
        weights (:class:`~torchvision.models.ViT_H_14_Weights`, optional): The pretrained weights to use.
            By default, no pre-trained weights are used.
        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.vision_transformer.VisionTransformer`` base class.

    .. autoclass:: torchvision.models.ViT_H_14_Weights
        :members:
    """
    weights = ViT_H_14_Weights.verify(weights)

    return _vision_transformer(
        patch_size=14, num_layers=32, num_heads=16, hidden_dim=1280, mlp_dim=5120,
        weights=weights, progress=progress, **kwargs,
    )


def interpolate_embeddings(
    image_size: int,
    patch_size: int,
    model_state: "OrderedDict[str, torch.Tensor]",
    interpolation_mode: str = "bicubic",
    reset_heads: bool = False,
) -> "OrderedDict[str, torch.Tensor]":
    """This function helps interpolating positional embeddings during checkpoint loading,
    especially when you want to apply a pre-trained model on images with different resolution.

    Args:
        image_size (int): Image size of the new model.
        patch_size (int): Patch size of the new model.
        model_state (OrderedDict[str, torch.Tensor]): State dict of the pre-trained model.
        interpolation_mode (str): The algorithm used for upsampling. Default: bicubic.
        reset_heads (bool): If true, not copying the state of heads. Default: False.

    Returns:
        OrderedDict[str, torch.Tensor]: A state dict which can be loaded into the new model.
    """
    # Shape of pos_embedding is (1, seq_length, hidden_dim)
    pos_embedding = model_state["encoder.pos_embedding"]
    n, seq_length, hidden_dim = pos_embedding.shape
    if n != 1:
        raise ValueError(f"Unexpected position embedding shape: {pos_embedding.shape}")

    new_seq_length = (image_size // patch_size) ** 2 + 1

    # Interpolate the position embedding by reshaping it to a 2d grid, interpolating
    # in (h, w) space and flattening back to a 1d sequence.
    if new_seq_length != seq_length:
        # The class token embedding shouldn't be interpolated, so we split it off.
        seq_length -= 1
        new_seq_length -= 1
        pos_embedding_token = pos_embedding[:, :1, :]
        pos_embedding_img = pos_embedding[:, 1:, :]

        # (1, seq_length, hidden_dim) -> (1, hidden_dim, seq_length)
        pos_embedding_img = pos_embedding_img.permute(0, 2, 1)
        seq_length_1d = int(math.sqrt(seq_length))
        if seq_length_1d * seq_length_1d != seq_length:
            raise ValueError(
                f"seq_length is not a perfect square! Instead got seq_length_1d * seq_length_1d = "
                f"{seq_length_1d * seq_length_1d} and seq_length = {seq_length}"
            )

        # (1, hidden_dim, seq_length) -> (1, hidden_dim, seq_length_1d, seq_length_1d)
        pos_embedding_img = pos_embedding_img.reshape(1, hidden_dim, seq_length_1d, seq_length_1d)
        new_seq_length_1d = image_size // patch_size

        # (1, hidden_dim, seq_length_1d, seq_length_1d) -> (1, hidden_dim, new_seq_length_1d, new_seq_length_1d)
        new_pos_embedding_img = nn.functional.interpolate(
            pos_embedding_img, size=new_seq_length_1d, mode=interpolation_mode, align_corners=True
        )

        # (1, hidden_dim, new_seq_length_1d, new_seq_length_1d) -> (1, new_seq_length, hidden_dim)
        new_pos_embedding_img = new_pos_embedding_img.reshape(1, hidden_dim, new_seq_length)
        new_pos_embedding_img = new_pos_embedding_img.permute(0, 2, 1)
        new_pos_embedding = torch.cat([pos_embedding_token, new_pos_embedding_img], dim=1)

        model_state["encoder.pos_embedding"] = new_pos_embedding

        if reset_heads:
            # Drop the classifier head parameters so they are re-initialized by the new model.
            model_state_copy: "OrderedDict[str, torch.Tensor]" = OrderedDict()
            for k, v in model_state.items():
                if not k.startswith("heads"):
                    model_state_copy[k] = v
            model_state = model_state_copy

    return model_state


# Legacy ``model_urls`` mapping kept for backward compatibility.
from ._utils import _ModelURLs

model_urls = _ModelURLs(
    {
        "vit_b_16": ViT_B_16_Weights.IMAGENET1K_V1.url,
        "vit_b_32": ViT_B_32_Weights.IMAGENET1K_V1.url,
        "vit_l_16": ViT_L_16_Weights.IMAGENET1K_V1.url,
        "vit_l_32": ViT_L_32_Weights.IMAGENET1K_V1.url,
    }
)
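

def _example_inference() -> None:
    """Usage sketch, illustrative only and not part of the upstream module: build a
    pretrained ViT-B/16 via the builders defined above, preprocess an input with the
    weights' bundled transforms, and read off the top-1 ImageNet category. It assumes
    the IMAGENET1K_V1 checkpoint can be downloaded and uses a random tensor in place
    of a real image, so the predicted label is arbitrary."""
    weights = ViT_B_16_Weights.IMAGENET1K_V1
    model = vit_b_16(weights=weights).eval()

    # The bundled preset resizes/center-crops to 224x224 and applies ImageNet normalization.
    preprocess = weights.transforms()
    batch = preprocess(torch.rand(3, 500, 400)).unsqueeze(0)

    with torch.no_grad():
        logits = model(batch)
    print(weights.meta["categories"][logits.squeeze(0).argmax().item()])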