from typing import Optional, Any

import torch
from torch import Tensor
from torch.nn.parameter import Parameter, UninitializedParameter, UninitializedBuffer

from .. import functional as F
from .. import init
from ._functions import SyncBatchNorm as sync_batch_norm
from .lazy import LazyModuleMixin
from .module import Module


class _NormBase(Module):
    """Common base of _InstanceNorm and _BatchNorm"""

    _version = 2
    __constants__ = ["track_running_stats", "momentum", "eps", "num_features", "affine"]
    num_features: int
    eps: float
    momentum: float
    affine: bool
    track_running_stats: bool

    def __init__(self, num_features: int, eps: float = 1e-5, momentum: float = 0.1,
                 affine: bool = True, track_running_stats: bool = True,
                 device=None, dtype=None) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super(_NormBase, self).__init__()
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum
        self.affine = affine
        self.track_running_stats = track_running_stats
        if self.affine:
            self.weight = Parameter(torch.empty(num_features, **factory_kwargs))
            self.bias = Parameter(torch.empty(num_features, **factory_kwargs))
        else:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)
        if self.track_running_stats:
            self.register_buffer("running_mean", torch.zeros(num_features, **factory_kwargs))
            self.register_buffer("running_var", torch.ones(num_features, **factory_kwargs))
            self.running_mean: Optional[Tensor]
            self.running_var: Optional[Tensor]
            self.register_buffer(
                "num_batches_tracked",
                torch.tensor(0, dtype=torch.long,
                             **{k: v for k, v in factory_kwargs.items() if k != "dtype"}),
            )
        else:
            self.register_buffer("running_mean", None)
            self.register_buffer("running_var", None)
            self.register_buffer("num_batches_tracked", None)
        self.reset_parameters()

    def reset_running_stats(self) -> None:
        if self.track_running_stats:
            # running_mean/running_var/num_batches_tracked are only registered
            # when track_running_stats is on
            self.running_mean.zero_()  # type: ignore[union-attr]
            self.running_var.fill_(1)  # type: ignore[union-attr]
            self.num_batches_tracked.zero_()  # type: ignore[union-attr,operator]

    def reset_parameters(self) -> None:
        self.reset_running_stats()
        if self.affine:
            init.ones_(self.weight)
            init.zeros_(self.bias)

    def _check_input_dim(self, input):
        raise NotImplementedError

    def extra_repr(self):
        return (
            "{num_features}, eps={eps}, momentum={momentum}, affine={affine}, "
            "track_running_stats={track_running_stats}".format(**self.__dict__)
        )

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        version = local_metadata.get("version", None)

        if (version is None or version < 2) and self.track_running_stats:
            # at version 2: added num_batches_tracked buffer
            #               this should have a default value of 0
            num_batches_tracked_key = prefix + "num_batches_tracked"
            if num_batches_tracked_key not in state_dict:
                state_dict[num_batches_tracked_key] = torch.tensor(0, dtype=torch.long)

        super(_NormBase, self)._load_from_state_dict(
            state_dict, prefix, local_metadata, strict,
            missing_keys, unexpected_keys, error_msgs)
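

# Hedged illustration (editorial addition, not part of the upstream file): the
# `_load_from_state_dict` override above back-fills `num_batches_tracked` with a zero
# tensor when a pre-version-2 checkpoint that lacks this buffer is loaded, so older
# BatchNorm state dicts keep loading without errors.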


class _BatchNorm(_NormBase):
    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True,
                 track_running_stats=True, device=None, dtype=None):
        factory_kwargs = {"device": device, "dtype": dtype}
        super(_BatchNorm, self).__init__(
            num_features, eps, momentum, affine, track_running_stats, **factory_kwargs
        )

    def forward(self, input: Tensor) -> Tensor:
        self._check_input_dim(input)

        # exponential_average_factor is set to self.momentum
        # (when it is available) only so that it gets updated
        # in ONNX graph when this node is exported to ONNX.
        if self.momentum is None:
            exponential_average_factor = 0.0
        else:
            exponential_average_factor = self.momentum

        if self.training and self.track_running_stats:
            # the if statement is only here to tell the jit to skip emitting this when it is None
            if self.num_batches_tracked is not None:  # type: ignore[has-type]
                self.num_batches_tracked.add_(1)  # type: ignore[has-type]
                if self.momentum is None:  # use cumulative moving average
                    exponential_average_factor = 1.0 / float(self.num_batches_tracked)
                else:  # use exponential moving average
                    exponential_average_factor = self.momentum

        # Decide whether the mini-batch stats should be used for normalization rather
        # than the buffers. Mini-batch stats are used in training mode, and in eval
        # mode when buffers are None.
        if self.training:
            bn_training = True
        else:
            bn_training = (self.running_mean is None) and (self.running_var is None)

        # Buffers are only updated if they are to be tracked and we are in training
        # mode. Thus they only need to be passed when the update should occur (i.e. in
        # training mode when they are tracked), or when buffer stats are used for
        # normalization (i.e. in eval mode when buffers are not None).
        return F.batch_norm(
            input,
            # If buffers are not to be tracked, ensure that they won't be updated
            self.running_mean if not self.training or self.track_running_stats else None,
            self.running_var if not self.training or self.track_running_stats else None,
            self.weight,
            self.bias,
            bn_training,
            exponential_average_factor,
            self.eps,
        )
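

# Hedged summary (editorial addition, not part of the upstream file): how
# `_BatchNorm.forward` above picks its statistics.
#
#     training=True,  track_running_stats=True   -> normalize with batch stats, update buffers
#     training=True,  track_running_stats=False  -> normalize with batch stats, no buffers
#     training=False, track_running_stats=True   -> normalize with running_mean/running_var
#     training=False, track_running_stats=False  -> buffers are None, batch stats are used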
dZddddZ  ZS )_LazyNormBaser   r   r   r   TNr7   c                    s   ||d}t t| jd||ddf| || _|| _| jrPtf || _tf || _| jrtf || _	tf || _
tjddtjidd | D | _d S )Nr   r   Fr   c                 S   s   i | ]\}}|d kr||qS r!   r"   r#   r"   r"   r'   r(      s       z*_LazyNormBase.__init__.<locals>.<dictcomp>)r   )r)   r^   r*   r   r   r   r   r   r   r   r   r+   r.   r/   r0   r    )r3   r   r   r   r   r   r   r4   r5   r"   r'   r*      s2    


 z_LazyNormBase.__init__c                    s    |   s| jdkrt   d S )Nr   )has_uninitialized_paramsr   r)   r1   r9   r5   r"   r'   r1      s    z_LazyNormBase.reset_parametersc                 C   s   |   r|jd | _| jrZt| jts*tt| jts:t| j	| jf | j	| jf | j
r| j	| jf | j	| jf |   d S r8   )r_   shaper   r   
isinstancer   r   AssertionErrorr   Zmaterializer   r   r   r1   r=   r"   r"   r'   initialize_parameters   s    z#_LazyNormBase.initialize_parameters)r   r   TTNN)	rH   rI   rJ   r   rN   r*   r1   rc   rQ   r"   r"   r5   r'   r^      s   
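

# Illustrative sketch (editorial addition, not part of the upstream file): the running-stat
# update convention documented by the classes below. With a numeric ``momentum`` the buffers
# follow an exponential moving average; with ``momentum=None`` they follow a cumulative
# (simple) average, mirroring ``exponential_average_factor`` in ``_BatchNorm.forward``.
def _example_running_stat_update(running: Tensor, batch_stat: Tensor,
                                 momentum: Optional[float],
                                 num_batches_tracked: int) -> Tensor:
    if momentum is None:
        factor = 1.0 / float(num_batches_tracked)  # cumulative moving average
    else:
        factor = momentum  # exponential moving average
    return (1 - factor) * running + factor * batch_stat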


class BatchNorm1d(_BatchNorm):
    r"""Applies Batch Normalization over a 2D or 3D input as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the number of features or channels of the input). By default, the
    elements of :math:`\gamma` are set to 1 and the elements of :math:`\beta` are set to 0. The
    standard-deviation is calculated via the biased estimator, equivalent to `torch.var(input, unbiased=False)`.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, L)` slices, it's common terminology to call this Temporal Batch Normalization.

    Args:
        num_features: number of features or channels :math:`C` of the input
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C)` or :math:`(N, C, L)`, where :math:`N` is the batch size,
          :math:`C` is the number of features or channels, and :math:`L` is the sequence length
        - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm1d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm1d(100, affine=False)
        >>> input = torch.randn(20, 100)
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        if input.dim() != 2 and input.dim() != 3:
            raise ValueError(
                "expected 2D or 3D input (got {}D input)".format(input.dim())
            )
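

# Hedged usage sketch (editorial addition, not part of the upstream file): in training mode
# with ``affine=False``, BatchNorm1d normalizes with the biased variance, matching
# ``torch.var(input, unbiased=False)`` as stated in the docstring above:
#
#     m = BatchNorm1d(100, affine=False)
#     x = torch.randn(20, 100)
#     manual = (x - x.mean(0)) / torch.sqrt(x.var(0, unbiased=False) + m.eps)
#     torch.testing.assert_close(m(x), manual)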


class LazyBatchNorm1d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm1d` module with lazy initialization of
    the ``num_features`` argument of the :class:`BatchNorm1d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
    """

    cls_to_become = BatchNorm1d  # type: ignore[assignment]

    def _check_input_dim(self, input):
        if input.dim() != 2 and input.dim() != 3:
            raise ValueError(
                "expected 2D or 3D input (got {}D input)".format(input.dim())
            )


class BatchNorm2d(_BatchNorm):
    r"""Applies Batch Normalization over a 4D input (a mini-batch of 2D inputs
    with additional channel dimension) as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
    to 1 and the elements of :math:`\beta` are set to 0. The standard-deviation is calculated
    via the biased estimator, equivalent to `torch.var(input, unbiased=False)`.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, H, W)` slices, it's common terminology to call this Spatial Batch Normalization.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C, H, W)`
        - Output: :math:`(N, C, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm2d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm2d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45)
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        if input.dim() != 4:
            raise ValueError(
                "expected 4D input (got {}D input)".format(input.dim())
            )


class LazyBatchNorm2d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm2d` module with lazy initialization of
    the ``num_features`` argument of the :class:`BatchNorm2d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
    """

    cls_to_become = BatchNorm2d  # type: ignore[assignment]

    def _check_input_dim(self, input):
        if input.dim() != 4:
            raise ValueError(
                "expected 4D input (got {}D input)".format(input.dim())
            )


class BatchNorm3d(_BatchNorm):
    r"""Applies Batch Normalization over a 5D input (a mini-batch of 3D inputs
    with additional channel dimension) as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
    to 1 and the elements of :math:`\beta` are set to 0. The standard-deviation is calculated
    via the biased estimator, equivalent to `torch.var(input, unbiased=False)`.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, D, H, W)` slices, it's common terminology to call this Volumetric Batch Normalization
    or Spatio-temporal Batch Normalization.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, D, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C, D, H, W)`
        - Output: :math:`(N, C, D, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm3d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm3d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        if input.dim() != 5:
            raise ValueError(
                "expected 5D input (got {}D input)".format(input.dim())
            )


class LazyBatchNorm3d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm3d` module with lazy initialization of
    the ``num_features`` argument of the :class:`BatchNorm3d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
    """

    cls_to_become = BatchNorm3d  # type: ignore[assignment]

    def _check_input_dim(self, input):
        if input.dim() != 5:
            raise ValueError(
                "expected 5D input (got {}D input)".format(input.dim())
            )


class SyncBatchNorm(_BatchNorm):
    r"""Applies Batch Normalization over a N-Dimensional input (a mini-batch of [N-2]D inputs
    with additional channel dimension) as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over all
    mini-batches of the same process groups. :math:`\gamma` and :math:`\beta`
    are learnable parameter vectors of size `C` (where `C` is the input size).
    By default, the elements of :math:`\gamma` are sampled from
    :math:`\mathcal{U}(0, 1)` and the elements of :math:`\beta` are set to 0.
    The standard-deviation is calculated via the biased estimator, equivalent to
    `torch.var(input, unbiased=False)`.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done for each channel in the ``C`` dimension, computing
    statistics on ``(N, +)`` slices, it's common terminology to call this Volumetric Batch
    Normalization or Spatio-temporal Batch Normalization.

    Currently :class:`SyncBatchNorm` only supports
    :class:`~torch.nn.DistributedDataParallel` (DDP) with single GPU per process. Use
    :meth:`torch.nn.SyncBatchNorm.convert_sync_batchnorm()` to convert
    :attr:`BatchNorm*D` layer to :class:`SyncBatchNorm` before wrapping
    Network with DDP.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, +)`
        eps: a value added to the denominator for numerical stability.
            Default: ``1e-5``
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
        process_group: synchronization of stats happens within each process group
            individually. Default behavior is synchronization across the whole
            world

    Shape:
        - Input: :math:`(N, C, +)`
        - Output: :math:`(N, C, +)` (same shape as input)

    .. note::
        Synchronization of batchnorm statistics occurs only while training, i.e.
        synchronization is disabled when ``model.eval()`` is set or if
        ``self.training`` is otherwise ``False``.

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.SyncBatchNorm(100)
        >>> # creating process group (optional)
        >>> # ranks is a list of int identifying rank ids.
        >>> ranks = list(range(8))
        >>> r1, r2 = ranks[:4], ranks[4:]
        >>> # Note: every rank calls into new_group for every
        >>> # process group created, even if that rank is not
        >>> # part of the group.
        >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
        >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
        >>> # Without Learnable Parameters
        >>> m = nn.SyncBatchNorm(100, affine=False, process_group=process_group)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)

        >>> # network is nn.BatchNorm layer
        >>> sync_bn_network = nn.SyncBatchNorm.convert_sync_batchnorm(network, process_group)
        >>> # only single gpu per process is currently supported
        >>> ddp_sync_bn_network = torch.nn.parallel.DistributedDataParallel(
        >>>                         sync_bn_network,
        >>>                         device_ids=[args.local_rank],
        >>>                         output_device=args.local_rank)
    """

    def __init__(self, num_features: int, eps: float = 1e-5, momentum: float = 0.1,
                 affine: bool = True, track_running_stats: bool = True,
                 process_group: Optional[Any] = None, device=None, dtype=None) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super(SyncBatchNorm, self).__init__(
            num_features, eps, momentum, affine, track_running_stats, **factory_kwargs
        )
        self.process_group = process_group

    def _check_input_dim(self, input):
        if input.dim() < 2:
            raise ValueError(
                "expected at least 2D input (got {}D input)".format(input.dim())
            )

    def _check_non_zero_input_channels(self, input):
        if input.size(1) == 0:
            raise ValueError(
                "SyncBatchNorm number of input channels should be non-zero"
            )

    def forward(self, input: Tensor) -> Tensor:
        # currently only GPU input is supported
        if not input.is_cuda:
            raise ValueError("SyncBatchNorm expected input tensor to be on GPU")

        self._check_input_dim(input)
        self._check_non_zero_input_channels(input)

        # exponential_average_factor is set to self.momentum
        # (when it is available) only so that it gets updated
        # in ONNX graph when this node is exported to ONNX.
        if self.momentum is None:
            exponential_average_factor = 0.0
        else:
            exponential_average_factor = self.momentum

        if self.training and self.track_running_stats:
            assert self.num_batches_tracked is not None
            self.num_batches_tracked.add_(1)
            if self.momentum is None:  # use cumulative moving average
                exponential_average_factor = 1.0 / self.num_batches_tracked.item()
            else:  # use exponential moving average
                exponential_average_factor = self.momentum

        # Decide whether the mini-batch stats should be used for normalization rather
        # than the buffers. Mini-batch stats are used in training mode, and in eval
        # mode when buffers are None.
        if self.training:
            bn_training = True
        else:
            bn_training = (self.running_mean is None) and (self.running_var is None)

        # Buffers are only updated if they are to be tracked and we are in training
        # mode. Thus they only need to be passed when the update should occur (i.e. in
        # training mode when they are tracked), or when buffer stats are used for
        # normalization (i.e. in eval mode when buffers are not None).
        # If buffers are not to be tracked, ensure that they won't be updated.
        running_mean = (
            self.running_mean if not self.training or self.track_running_stats else None
        )
        running_var = (
            self.running_var if not self.training or self.track_running_stats else None
        )

        # Don't sync batchnorm stats in inference mode (model.eval()).
        need_sync = bn_training and self.training
        if need_sync:
            process_group = torch.distributed.group.WORLD
            if self.process_group:
                process_group = self.process_group
            world_size = torch.distributed.get_world_size(process_group)
            need_sync = world_size > 1

        # fallback to framework BN when synchronization is not necessary
        if not need_sync:
            return F.batch_norm(
                input, running_mean, running_var, self.weight, self.bias,
                bn_training, exponential_average_factor, self.eps,
            )
        else:
            assert bn_training
            return sync_batch_norm.apply(
                input, self.weight, self.bias, running_mean, running_var,
                self.eps, exponential_average_factor, process_group, world_size,
            )

    @classmethod
    def convert_sync_batchnorm(cls, module, process_group=None):
        r"""Helper function to convert all :attr:`BatchNorm*D` layers in the model to
        :class:`torch.nn.SyncBatchNorm` layers.

        Args:
            module (nn.Module): module containing one or more :attr:`BatchNorm*D` layers
            process_group (optional): process group to scope synchronization,
                default is the whole world

        Returns:
            The original :attr:`module` with the converted :class:`torch.nn.SyncBatchNorm`
            layers. If the original :attr:`module` is a :attr:`BatchNorm*D` layer,
            a new :class:`torch.nn.SyncBatchNorm` layer object will be returned
            instead.

        Example::

            >>> # Network with nn.BatchNorm layer
            >>> module = torch.nn.Sequential(
            >>>            torch.nn.Linear(20, 100),
            >>>            torch.nn.BatchNorm1d(100),
            >>>          ).cuda()
            >>> # creating process group (optional)
            >>> # ranks is a list of int identifying rank ids.
            >>> ranks = list(range(8))
            >>> r1, r2 = ranks[:4], ranks[4:]
            >>> # Note: every rank calls into new_group for every
            >>> # process group created, even if that rank is not
            >>> # part of the group.
            >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
            >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
            >>> sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group)

        """
        module_output = module
        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
            module_output = torch.nn.SyncBatchNorm(
                module.num_features,
                module.eps,
                module.momentum,
                module.affine,
                module.track_running_stats,
                process_group,
            )
            if module.affine:
                with torch.no_grad():
                    module_output.weight = module.weight
                    module_output.bias = module.bias
            module_output.running_mean = module.running_mean
            module_output.running_var = module.running_var
            module_output.num_batches_tracked = module.num_batches_tracked
            if hasattr(module, "qconfig"):
                module_output.qconfig = module.qconfig
        for name, child in module.named_children():
            module_output.add_module(
                name, cls.convert_sync_batchnorm(child, process_group)
            )
        del module
        return module_output
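

# Hedged usage sketch (editorial addition, not part of the upstream file): these classes are
# normally reached through ``torch.nn``. Copied into a standalone script, a minimal check of
# the eager and lazy paths could look like this:
#
#     import torch
#     import torch.nn as nn
#
#     x = torch.randn(8, 3, 5)
#     bn = nn.BatchNorm1d(3)
#     lazy_bn = nn.LazyBatchNorm1d()          # num_features inferred on first forward
#     assert bn(x).shape == lazy_bn(x).shape == x.shape
#     assert lazy_bn.num_features == 3        # inferred from input.size(1)
#     bn.eval()                               # eval mode uses the running statistics
#     _ = bn(x)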