U
    (d?                  	   @   sx  d dl mZ d dlmZmZmZmZmZmZm	Z	m
Z
 d dlmZ d dlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZ dddddddgZG dd dejZG dd dejZG dd dejZ G dd dej!Z"G dd dej!Z#G dd dejZ$G dd  d ejZ%G d!d dej!Z&ee
e"e#f  eee
ee ef   ee' ed"ej!f ee e(e	e&d#d$d%Z)d&ed'd(d)Z*G d*d deZ+G d+d deZ,G d,d deZ-ed-e+j.fd.dd/d0ee+ e(e	e&d1d2dZ/ed-e,j.fd.dd/d0ee, e(e	e&d1d3dZ0ed-e-j.fd.dd/d0ee- e(e	e&d1d4dZ1dd5lm2Z2 e2e+j.j3e,j.j3e-j.j3d6Z4dS )7    )partial)TupleOptionalCallableListSequenceTypeAnyUnionN)Tensor   )VideoClassification)_log_api_usage_once   )WeightsEnumWeights)_KINETICS400_CATEGORIES)handle_legacy_interface_ovewrite_named_paramVideoResNetR3D_18_WeightsMC3_18_WeightsR2Plus1D_18_Weightsr3d_18mc3_18r2plus1d_18c                       sP   e Zd Zd	eeee eedd fddZeeeeeef dddZ  Z	S )
Conv3DSimpleN   	in_planes
out_planes	midplanesstridepaddingreturnc                    s   t  j||d||dd d S )N)r   r   r   FZin_channelsZout_channelskernel_sizer"   r#   biassuper__init__selfr   r    r!   r"   r#   	__class__ C/tmp/pip-unpacked-wheel-vx7f76es/torchvision/models/video/resnet.pyr*      s    zConv3DSimple.__init__r"   r$   c                 C   s
   | | | fS Nr/   r"   r/   r/   r0   get_downsample_stride'   s    z"Conv3DSimple.get_downsample_stride)Nr   r   
__name__
__module____qualname__intr   r*   staticmethodr   r4   __classcell__r/   r/   r-   r0   r      s            r   c                       sL   e Zd Zd	eeeeedd fddZeeeeeef dddZ  ZS )
Conv2Plus1Dr   Nr   c                    s`   t  tj||dd||fd||fddt|tjddtj||d|ddf|ddfdd d S )	Nr   r   r   r   r   Fr&   r"   r#   r'   TZinplacer   r   r   r)   r*   nnConv3dBatchNorm3dReLUr+   r-   r/   r0   r*   -   s(    
     zConv2Plus1D.__init__r1   c                 C   s
   | | | fS r2   r/   r3   r/   r/   r0   r4   >   s    z!Conv2Plus1D.get_downsample_stride)r   r   )	r6   r7   r8   r9   r*   r:   r   r4   r;   r/   r/   r-   r0   r<   ,   s   r<   c                       sP   e Zd Zd	eeee eedd fddZeeeeeef dddZ  Z	S )
Conv3DNoTemporalNr   r   c                    s(   t  j||dd||fd||fdd d S )Nr=   r   r   Fr%   r(   r+   r-   r/   r0   r*   D   s    zConv3DNoTemporal.__init__r1   c                 C   s
   d| | fS Nr   r/   r3   r/   r/   r0   r4   Q   s    z&Conv3DNoTemporal.get_downsample_stride)Nr   r   r5   r/   r/   r-   r0   rF   C   s            rF   c                       sR   e Zd ZdZd
eeedejf eeej dd fddZ	e
e
ddd	Z  ZS )
BasicBlockr   N.inplanesplanesconv_builderr"   
downsampler$   c                    s   || d d d |d d d|   }t    t|||||t|tjdd| _t||||t|| _tjdd| _|| _	|| _
d S )Nr   Tr?   )r)   r*   rB   
SequentialrD   rE   conv1conv2relurM   r"   r,   rJ   rK   rL   r"   rM   r!   r-   r/   r0   r*   Z   s    (
  
zBasicBlock.__init__xr$   c                 C   sB   |}|  |}| |}| jd k	r,| |}||7 }| |}|S r2   )rO   rP   rM   rQ   r,   rT   Zresidualoutr/   r/   r0   forwardm   s    




zBasicBlock.forward)r   Nr6   r7   r8   	expansionr9   r   rB   Moduler   r*   r   rW   r;   r/   r/   r-   r0   rH   V   s     rH   c                       sR   e Zd ZdZdeeedejf eeej dd fddZ	e
e
dd	d
Z  ZS )
Bottleneck   r   N.rI   c                    s   t    || d d d |d d d|   }ttj||dddt|tjdd| _t|||||t|tjdd| _ttj||| j	 dddt|| j	 | _
tjdd| _|| _|| _d S )Nr   r   F)r&   r'   Tr?   )r)   r*   rB   rN   rC   rD   rE   rO   rP   rY   conv3rQ   rM   r"   rR   r-   r/   r0   r*   ~   s&    	
(  
  
zBottleneck.__init__rS   c                 C   sL   |}|  |}| |}| |}| jd k	r6| |}||7 }| |}|S r2   )rO   rP   r]   rM   rQ   rU   r/   r/   r0   rW      s    





zBottleneck.forward)r   NrX   r/   r/   r-   r0   r[   {   s     r[   c                       s&   e Zd ZdZdd fddZ  ZS )	BasicStemz$The default conv-batchnorm-relu stemNr$   c              
      s4   t  tjdddddddtdtjdd	 d S )
Nr   @   )r      ra   r   r   r   r=   Fr>   Tr?   rA   r,   r-   r/   r0   r*      s
    
zBasicStem.__init__r6   r7   r8   __doc__r*   r;   r/   r/   r-   r0   r^      s   r^   c                       s&   e Zd ZdZdd fddZ  ZS )R2Plus1dStemzRR(2+1)D stem is different than the default one as it uses separated 3D convolutionNr_   c                    sZ   t  tjdddddddtdtjdd	tjdd
dddddtd
tjdd	 d S )Nr   -   )r   ra   ra   rb   )r   r   r   Fr>   Tr?   r`   r@   r   r   r   )r   r   r   rA   rc   r-   r/   r0   r*      s    

zR2Plus1dStem.__init__rd   r/   r/   r-   r0   rf      s   rf   c                	       s   e Zd Zdeeeef  eeeee	e
f   ee edejf eedd fddZeedd	d
Zdeeeef  eeee	e
f  eeeejdddZ  ZS )r     F.N)blockconv_makerslayersstemnum_classeszero_init_residualr$   c                    s  t    t|  d| _| | _| j||d d|d dd| _| j||d d|d dd| _| j||d d|d dd| _| j||d d	|d dd| _	t
d
| _t
d	|j || _|  D ]}t|t
jrt
jj|jddd |jdk	rbt
j|jd qt|t
jr4t
j|jd t
j|jd qt|t
jrt
j|jdd t
j|jd q|r|  D ]$}t|trrt
j|jjd qrdS )a^  Generic resnet video generator.

        Args:
            block (Type[Union[BasicBlock, Bottleneck]]): resnet building block
            conv_makers (List[Type[Union[Conv3DSimple, Conv3DNoTemporal, Conv2Plus1D]]]): generator
                function for each layer
            layers (List[int]): number of blocks per layer
            stem (Callable[..., nn.Module]): module specifying the ResNet stem.
            num_classes (int, optional): Dimension of the final FC layer. Defaults to 400.
            zero_init_residual (bool, optional): Zero init bottleneck residual BN. Defaults to False.
        r`   r   r   r3      r      r   i   rh   Zfan_outrQ   )modeZnonlinearityNg{Gz?)r)   r*   r   rJ   rm   _make_layerlayer1layer2layer3layer4rB   ZAdaptiveAvgPool3davgpoolZLinearrY   fcmodules
isinstancerC   initZkaiming_normal_Zweightr'   Z	constant_rD   Znormal_r[   Zbn3)r,   rj   rk   rl   rm   rn   ro   mr-   r/   r0   r*      s2    
zVideoResNet.__init__rS   c                 C   sT   |  |}| |}| |}| |}| |}| |}|d}| |}|S rG   )rm   rt   ru   rv   rw   rx   flattenry   )r,   rT   r/   r/   r0   rW      s    







zVideoResNet.forwardr   )rj   rL   rK   blocksr"   r$   c           
   	   C   s   d }|dks| j ||j krV||}ttj| j ||j d|ddt||j }g }||| j |||| ||j | _ td|D ]}	||| j || qtj| S )Nr   F)r&   r"   r'   )	rJ   rY   r4   rB   rN   rC   rD   appendrange)
r,   rj   rL   rK   r   r"   rM   Z	ds_striderl   ir/   r/   r0   rs   	  s    
zVideoResNet._make_layer)ri   F)r   )r6   r7   r8   r   r
   rH   r[   r   r   rF   r<   r   r9   r   rB   rZ   boolr*   r   rW   rN   rs   r;   r/   r/   r-   r0   r      s*     4 .)rj   rk   rl   rm   weightsprogresskwargsr$   c                 K   sN   |d k	rt |dt|jd  t| |||f|}|d k	rJ||j|d |S )Nrn   
categories)r   )r   lenmetar   Zload_state_dictZget_state_dict)rj   rk   rl   rm   r   r   r   modelr/   r/   r0   _video_resnet#  s    	r   )r   r   zKhttps://github.com/pytorch/vision/tree/main/references/video_classificationzSThese weights reproduce closely the accuracy of the paper for 16-frame clip inputs.)Zmin_sizer   ZrecipeZ_docsc                	   @   s<   e Zd Zedeedddeddddd	id
dZeZdS )r   z7https://download.pytorch.org/models/r3d_18-b3b3357e.pthp   r   rp      Z	crop_sizeZresize_sizeiP5Kinetics-400g     `J@gR@zacc@1zacc@5Z
num_paramsZ_metricsurlZ
transformsr   N	r6   r7   r8   r   r   r   _COMMON_METAKINETICS400_V1DEFAULTr/   r/   r/   r0   r   ?  s   c                	   @   s<   e Zd Zedeedddeddddd	id
dZeZdS )r   z7https://download.pytorch.org/models/mc3_18-a90a0ba3.pthr   r   r   iPu r   g33333J@g(\S@r   r   r   Nr   r/   r/   r/   r0   r   Q  s   c                	   @   s<   e Zd Zedeedddeddddd	id
dZeZdS )r   z<https://download.pytorch.org/models/r2plus1d_18-91a641e6.pthr   r   r   ir   g     L@gp=
׳S@r   r   r   Nr   r/   r/   r/   r0   r   c  s   Z
pretrained)r   T)r   r   )r   r   r   r$   c                 K   s.   t | } tttgd ddddgt| |f|S )a  Construct 18 layer Resnet3D model.

    .. betastatus:: video module

    Reference: `A Closer Look at Spatiotemporal Convolutions for Action Recognition <https://arxiv.org/abs/1711.11248>`__.

    Args:
        weights (:class:`~torchvision.models.video.R3D_18_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.R3D_18_Weights`
            below for more details, and possible values. By default, no
            pre-trained weights are used.
        progress (bool): If True, displays a progress bar of the download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.resnet.VideoResNet`` base class.
            Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/resnet.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.R3D_18_Weights
        :members:
    r\   r   )r   verifyr   rH   r   r^   r   r   r   r/   r/   r0   r   u  s    

c                 K   s4   t | } tttgtgd  ddddgt| |f|S )a  Construct 18 layer Mixed Convolution network as in

    .. betastatus:: video module

    Reference: `A Closer Look at Spatiotemporal Convolutions for Action Recognition <https://arxiv.org/abs/1711.11248>`__.

    Args:
        weights (:class:`~torchvision.models.video.MC3_18_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.MC3_18_Weights`
            below for more details, and possible values. By default, no
            pre-trained weights are used.
        progress (bool): If True, displays a progress bar of the download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.resnet.VideoResNet`` base class.
            Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/resnet.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.MC3_18_Weights
        :members:
    r   r   )r   r   r   rH   r   rF   r^   r   r/   r/   r0   r     s    

c                 K   s.   t | } tttgd ddddgt| |f|S )a  Construct 18 layer deep R(2+1)D network as in

    .. betastatus:: video module

    Reference: `A Closer Look at Spatiotemporal Convolutions for Action Recognition <https://arxiv.org/abs/1711.11248>`__.

    Args:
        weights (:class:`~torchvision.models.video.R2Plus1D_18_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.video.R2Plus1D_18_Weights`
            below for more details, and possible values. By default, no
            pre-trained weights are used.
        progress (bool): If True, displays a progress bar of the download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.video.resnet.VideoResNet`` base class.
            Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/video/resnet.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.video.R2Plus1D_18_Weights
        :members:
    r\   r   )r   r   r   rH   r<   rf   r   r/   r/   r0   r     s    

)
_ModelURLs)r   r   r   )5	functoolsr   typingr   r   r   r   r   r   r	   r
   Ztorch.nnrB   Ztorchr   Ztransforms._presetsr   utilsr   Z_apir   r   Z_metar   _utilsr   r   __all__rC   r   rN   r<   rF   rZ   rH   r[   r^   rf   r   r9   r   r   r   r   r   r   r   r   r   r   r   r   Z
model_urlsr/   r/   r/   r0   <module>   sl   (%1_"#"#"$