U
    (dZ                     @   s,  d dl mZ d dlmZmZmZmZ d dlZd dlm	  m
Z d dlm	Z	mZ ddlmZmZ ddlmZ ddlmZmZ dd	lmZ d
dlmZmZ d
dlmZ d
dlmZ dddddddgZdd Zej !d G dd de	j"Z#d/eeeeee$ e$ee$ e%e%ee ee dddZ&ej !d G dd de	j"Z'G dd  d e	j"Z(G d!d de	j"Z)ee$ e$ee$ ee$ ee$ e%ee e*ee)d"
d#d$Z+d%eiZ,G d&d deZ-G d'd deZ.G d(d deZ/dd)d*ee- e*ee)d+d,dZ0dd)d*ee. e*ee)d+d-dZ1dd)d*ee/ e*ee)d+d.dZ2dS )0    )partial)OptionalCallableListAnyN)nnTensor   )MLPPermute)StochasticDepth)ImageClassificationInterpolationMode)_log_api_usage_once   )WeightsEnumWeights)_IMAGENET_CATEGORIES)_ovewrite_named_paramSwinTransformerSwin_T_WeightsSwin_S_WeightsSwin_B_Weightsswin_tswin_sswin_bc              
   C   s8   | j dd  \}}}t| ddd|d d|d f} | S )Nr   r	   )shapeFpad)xHW_ r$   G/tmp/pip-unpacked-wheel-vx7f76es/torchvision/models/swin_transformer.py_patch_merging_pad   s     r&   c                       sF   e Zd ZdZejfeedejf d fddZ	e
dddZ  ZS )	PatchMergingzPatch Merging Layer.
    Args:
        dim (int): Number of input channels.
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
    .)dim
norm_layerc                    sD   t    t|  || _tjd| d| dd| _|d| | _d S )N   r	   Fbias)super__init__r   r(   r   Linear	reductionnorm)selfr(   r)   	__class__r$   r%   r.   ,   s
    
zPatchMerging.__init__r    c                 C   s   t |}|dddddddddf }|dddddddddf }|dddddddddf }|dddddddddf }t||||gd}| |}| |}|S )z
        Args:
            x (Tensor): input tensor with expected layout of [..., H, W, C]
        Returns:
            Tensor with layout of [..., H/2, W/2, 2*C]
        .r   Nr	   r   )r&   torchcatr1   r0   )r2   r    Zx0x1Zx2Zx3r$   r$   r%   forward3   s        

zPatchMerging.forward)__name__
__module____qualname____doc__r   	LayerNormintr   Moduler.   r   r:   __classcell__r$   r$   r3   r%   r'   %   s   $r'           )input
qkv_weightproj_weightrelative_position_biaswindow_size	num_heads
shift_sizeattention_dropoutdropoutqkv_bias	proj_biasc           !   	   C   sP  | j \}}}}|d ||d   |d  }|d ||d   |d  }t| ddd|d|f}|j \}}}}|d |krd|d< |d |krd|d< t|dkrtj||d  |d  fdd}||d  ||d   }||||d  |d ||d  |d |}|dddddd|| |d |d  |}t	|||	}||
d|
dd||| ddddd}|d |d |d   }}}||| d	  }||d
d}|| }t|dkrR|||f}d|d  f|d  |d  f|d  dff}d|d  f|d  |d  f|d  dff}d}|D ]>}|D ]2} |||d |d | d | d f< |d7 }q>q6|||d  |d ||d  |d }|dddd||d |d  }|d|d }||dktd|dktd}||
d| |||
d|
d}||dd }|d||
d|
d}tj|dd}tj||d}||dd|
d|
d|}t	|||
}tj||d}||||d  ||d  |d |d |}|dddddd||||}t|dkr(tj||d |d fdd}|ddd|d|ddf  }|S )aE  
    Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.
    Args:
        input (Tensor[N, H, W, C]): The input tensor or 4-dimensions.
        qkv_weight (Tensor[in_dim, out_dim]): The weight tensor of query, key, value.
        proj_weight (Tensor[out_dim, out_dim]): The weight tensor of projection.
        relative_position_bias (Tensor): The learned relative position bias added to attention.
        window_size (List[int]): Window size.
        num_heads (int): Number of attention heads.
        shift_size (List[int]): Shift size for shifted window attention.
        attention_dropout (float): Dropout ratio of attention weight. Default: 0.0.
        dropout (float): Dropout ratio of output. Default: 0.0.
        qkv_bias (Tensor[out_dim], optional): The bias tensor of query, key, value. Default: None.
        proj_bias (Tensor[out_dim], optional): The bias tensor of projection. Default: None.
    Returns:
        Tensor[N, H, W, C]: The output tensor after shifted window attention.
    r   r   )r   r	   )ZshiftsZdims   r	   r*      g      r6   Ng      YrC   )r(   )p)r   r   r   sumr7   ZrollviewpermuteZreshapeZlinearsizematmulZ	transposeZ	new_zeros	unsqueezeZmasked_fillfloatZsoftmaxrL   
contiguous)!rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   Br!   r"   CZpad_rZpad_br    r#   Zpad_HZpad_WZnum_windowsqkvqkvattnZ	attn_maskZh_slicesZw_slicescounthwr$   r$   r%   shifted_window_attentionG   s^     ,.0..$($$((, $re   c                
       sL   e Zd ZdZd
eee ee eeeeed fddZe	ddd	Z
  ZS )ShiftedWindowAttentionz/
    See :func:`shifted_window_attention`.
    TrC   )r(   rH   rJ   rI   rM   rN   rK   rL   c	                    s  t    t|dks"t|dkr*td|| _|| _|| _|| _|| _t	j
||d |d| _t	j
|||d| _t	td|d  d d|d  d  || _t| jd }	t| jd }
ttj|	|
dd}t|d}|d d d d d f |d d d d d f  }|ddd }|d d d d df  | jd d 7  < |d d d d df  | jd d 7  < |d d d d df  d| jd  d 9  < |d	d	}| d
| t	jj| jdd d S )Nr	   z.window_size and shift_size must be of length 2rO   r+   r   r   Zij)Zindexingr6   relative_position_index{Gz?Zstd)r-   r.   len
ValueErrorrH   rJ   rI   rK   rL   r   r/   r]   proj	Parameterr7   zerosrelative_position_bias_tableZarangestackZmeshgridflattenrU   rZ   rS   rT   Zregister_bufferinittrunc_normal_)r2   r(   rH   rJ   rI   rM   rN   rK   rL   Zcoords_hZcoords_wZcoordsZcoords_flattenZrelative_coordsrg   r3   r$   r%   r.      s2    
&,((,zShiftedWindowAttention.__init__r5   c                 C   s~   | j d | j d  }| j| j }|||d}|ddd d}t|| jj	| j
j	|| j | j| j| j| j| jj| j
jdS )z
        Args:
            x (Tensor): Tensor with layout of [B, H, W, C]
        Returns:
            Tensor with same layout as input, i.e. [B, H, W, C]
        r   r   r6   r	   )rJ   rK   rL   rM   rN   )rH   ro   rg   rT   rU   rZ   rX   re   r]   weightrl   rI   rJ   rK   rL   r,   )r2   r    NrG   r$   r$   r%   r:      s"    zShiftedWindowAttention.forward)TTrC   rC   )r;   r<   r=   r>   r@   r   boolrY   r.   r   r:   rB   r$   r$   r3   r%   rf      s    
    +rf   c                       sr   e Zd ZdZddddejefeeee ee e	e	e	e	e
dejf e
dejf d
 fddZedd	d
Z  ZS )SwinTransformerBlocka  
    Swin Transformer Block.
    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (List[int]): Window size.
        shift_size (List[int]): Shift size for shifted window attention.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
        dropout (float): Dropout rate. Default: 0.0.
        attention_dropout (float): Attention dropout rate. Default: 0.0.
        stochastic_depth_prob: (float): Stochastic depth rate. Default: 0.0.
        norm_layer (nn.Module): Normalization layer.  Default: nn.LayerNorm.
        attn_layer (nn.Module): Attention layer. Default: ShiftedWindowAttention
          @rC   .)
r(   rI   rH   rJ   	mlp_ratiorL   rK   stochastic_depth_probr)   
attn_layerc                    s   t    t|  |	|| _|
||||||d| _t|d| _|	|| _t|t	|| |gt
jd |d| _| j D ]:}t|t
jrtt
j|j |jd k	rtt
jj|jdd qtd S )N)rK   rL   row)Zactivation_layerZinplacerL   gư>ri   )r-   r.   r   norm1ra   r   stochastic_depthnorm2r
   r@   r   ZGELUmlpmodules
isinstancer/   rr   Zxavier_uniform_rt   r,   Znormal_)r2   r(   rI   rH   rJ   ry   rL   rK   rz   r)   r{   mr3   r$   r%   r.   
  s&    


"
zSwinTransformerBlock.__init__r5   c                 C   s8   ||  | | | }||  | | | }|S )N)r~   ra   r}   r   r   r2   r    r$   r$   r%   r:   -  s    zSwinTransformerBlock.forward)r;   r<   r=   r>   r   r?   rf   r@   r   rY   r   rA   r.   r   r:   rB   r$   r$   r3   r%   rw      s(   #rw   c                       sr   e Zd ZdZdee eee ee ee eeeeeeede	j
f  eede	j
f  d fdd	Zd
d Z  ZS )r   a  
    Implements Swin Transformer from the `"Swin Transformer: Hierarchical Vision Transformer using
    Shifted Windows" <https://arxiv.org/pdf/2103.14030>`_ paper.
    Args:
        patch_size (List[int]): Patch size.
        embed_dim (int): Patch embedding dimension.
        depths (List(int)): Depth of each Swin Transformer layer.
        num_heads (List(int)): Number of attention heads in different layers.
        window_size (List[int]): Window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
        dropout (float): Dropout rate. Default: 0.0.
        attention_dropout (float): Attention dropout rate. Default: 0.0.
        stochastic_depth_prob (float): Stochastic depth rate. Default: 0.0.
        num_classes (int): Number of classes for classification head. Default: 1000.
        block (nn.Module, optional): SwinTransformer Block. Default: None.
        norm_layer (nn.Module, optional): Normalization layer. Default: None.
    rx   rC     N.)
patch_size	embed_dimdepthsrI   rH   ry   rL   rK   rz   num_classesr)   blockc                    s  t    t|  |
| _|d kr$t}|d kr:ttjdd}g }|t	tj
d||d |d f|d |d fdtddddg|| t|}d}tt|D ]}g }|d|  }t|| D ]R |	t| |d  }||||| | fdd	|D |||||d
	 |d7 }q|tj	|  |t|d k r|t|| qtj	| | _|dt|d   }||| _td| _t||
| _|  D ]@}t|tjrtjj|jdd |jd k	rtj|j qd S )Ngh㈵>)ZepsrO   r   r   )Zkernel_sizeZstrider	   c                    s$   g | ]} d  dkrdn|d  qS )r	   r   r$   ).0rd   Zi_layerr$   r%   
<listcomp>y  s     z,SwinTransformer.__init__.<locals>.<listcomp>)rH   rJ   ry   rL   rK   rz   r)   rh   ri   )r-   r.   r   r   rw   r   r   r?   appendZ
SequentialZConv2dr   rS   rangerj   rY   r'   featuresr1   ZAdaptiveAvgPool2davgpoolr/   headr   r   rr   rs   rt   r,   Zzeros_)r2   r   r   r   rI   rH   ry   rL   rK   rz   r   r)   r   ZlayersZtotal_stage_blocksZstage_block_idZi_stageZstager(   Zsd_probZnum_featuresr   r3   r   r%   r.   F  sl    
   


zSwinTransformer.__init__c                 C   sH   |  |}| |}|dddd}| |}t|d}| |}|S )Nr   rO   r   r	   )r   r1   rU   r   r7   rq   r   r   r$   r$   r%   r:     s    



zSwinTransformer.forward)rx   rC   rC   rC   r   NN)r;   r<   r=   r>   r   r@   rY   r   r   r   rA   r.   r:   rB   r$   r$   r3   r%   r   3  s.          M)
r   r   r   rI   rH   rz   weightsprogresskwargsreturnc           
   	   K   sX   |d k	rt |dt|jd  tf | |||||d|}	|d k	rT|	|j|d |	S )Nr   
categories)r   r   r   rI   rH   rz   )r   )r   rj   metar   Zload_state_dictZget_state_dict)
r   r   r   rI   rH   rz   r   r   r   modelr$   r$   r%   _swin_transformer  s    
r   r   c                   @   sF   e Zd Zedeeddejdeddddd	d
didddZ	e	Z
dS )r   z7https://download.pytorch.org/models/swin_t-704ceda3.pth      Z	crop_sizeZresize_sizeinterpolationibr   r   Uhttps://github.com/pytorch/vision/tree/main/references/classification#swintransformerImageNet-1KguV^T@glW@zacc@1zacc@5YThese weights reproduce closely the results of the paper using a similar training recipe.Z
num_paramsZmin_sizeZrecipeZ_metricsZ_docsurlZ
transformsr   Nr;   r<   r=   r   r   r   r   ZBICUBIC_COMMON_METAZIMAGENET1K_V1DEFAULTr$   r$   r$   r%   r     s*      c                   @   sF   e Zd Zedeeddejdeddddd	d
didddZ	e	Z
dS )r   z7https://download.pytorch.org/models/swin_s-5e29d889.pthr      r   irr   r   r   gCT@gףp=
X@r   r   r   r   Nr   r$   r$   r$   r%   r     s*      c                   @   sF   e Zd Zedeeddejdeddddd	d
didddZ	e	Z
dS )r   z7https://download.pytorch.org/models/swin_b-68c6b09e.pthr      r   i<;r   r   r   gh|?T@g)\(X@r   r   r   r   Nr   r$   r$   r$   r%   r     s*      T)r   r   )r   r   r   r   c                 K   sB   t | } tf ddgdddddgddddgddgd	| |d
|S )a  
    Constructs a swin_tiny architecture from
    `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows <https://arxiv.org/pdf/2103.14030>`_.

    Args:
        weights (:class:`~torchvision.models.Swin_T_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.Swin_T_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.swin_transformer.SwinTransformer``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.Swin_T_Weights
        :members:
    r*   `   r	      rO            g?r   r   r   rI   rH   rz   r   r   )r   verifyr   r   r   r   r$   r$   r%   r     s    


	c                 K   sB   t | } tf ddgdddddgddddgd	d	gd
| |d|S )a  
    Constructs a swin_small architecture from
    `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows <https://arxiv.org/pdf/2103.14030>`_.

    Args:
        weights (:class:`~torchvision.models.Swin_S_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.Swin_S_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.swin_transformer.SwinTransformer``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.Swin_S_Weights
        :members:
    r*   r   r	      rO   r   r   r   r   g333333?r   )r   r   r   r   r$   r$   r%   r   )  s    


	c                 K   sB   t | } tf ddgdddddgddddgddgd	| |d
|S )a  
    Constructs a swin_base architecture from
    `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows <https://arxiv.org/pdf/2103.14030>`_.

    Args:
        weights (:class:`~torchvision.models.Swin_B_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.Swin_B_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.swin_transformer.SwinTransformer``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.Swin_B_Weights
        :members:
    r*      r	   r             r   g      ?r   )r   r   r   r   r$   r$   r%   r   M  s    


	)rC   rC   NN)3	functoolsr   typingr   r   r   r   r7   Ztorch.nn.functionalr   Z
functionalr   r   Zops.miscr
   r   Zops.stochastic_depthr   Ztransforms._presetsr   r   utilsr   Z_apir   r   Z_metar   _utilsr   __all__r&   ZfxwraprA   r'   r@   rY   re   rf   rw   r   rv   r   r   r   r   r   r   r   r   r$   r$   r$   r%   <module>   s~   *    dL9k  $ $