import warnings
import torch
from torch.cuda import nccl
from torch._utils import (_take_tensors, _flatten_dense_tensors,
                          _unflatten_dense_tensors, _reorder_tensors_as,
                          _get_device_index, _handle_complex)
from typing import List


def broadcast(tensor, devices=None, *, out=None):
    """Broadcasts a tensor to specified GPU devices.

    Args:
        tensor (Tensor): tensor to broadcast. Can be on CPU or GPU.
        devices (Iterable[torch.device, str or int], optional): an iterable of
          GPU devices, among which to broadcast.
        out (Sequence[Tensor], optional, keyword-only): the GPU tensors to
          store output results.

    .. note::
        Exactly one of :attr:`devices` and :attr:`out` must be specified.

    Returns:
        - If :attr:`devices` is specified,
            a tuple containing copies of :attr:`tensor`, placed on
            :attr:`devices`.
        - If :attr:`out` is specified,
            a tuple containing :attr:`out` tensors, each containing a copy of
            :attr:`tensor`.
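
    Example (an illustrative usage sketch, assuming at least two CUDA devices
    are available)::

        >>> # hypothetical setup: cuda:0 and cuda:1 both exist
        >>> x = torch.arange(4., device='cuda:0')
        >>> copies = broadcast(x, devices=[0, 1])
        >>> # copies[0] is a copy on cuda:0, copies[1] a copy on cuda:1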
    """
    tensor = _handle_complex(tensor)
    if not ((devices is None) ^ (out is None)):
        raise RuntimeError(
            "Exactly one of 'devices' and 'out' must be specified, but got "
            "devices={} and out={}".format(devices, out))
    if devices is not None:
        devices = [_get_device_index(d) for d in devices]
        return torch._C._broadcast(tensor, devices)
    else:
        return torch._C._broadcast_out(tensor, out)


def broadcast_coalesced(tensors, devices, buffer_size=10485760):
    """Broadcasts a sequence of tensors to the specified GPUs.
    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Args:
        tensors (sequence): tensors to broadcast. Must be on the same device,
          either CPU or GPU.
        devices (Iterable[torch.device, str or int]): an iterable of GPU
          devices, among which to broadcast.
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple containing copies of :attr:`tensors`, placed on :attr:`devices`.
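
    Example (an illustrative usage sketch, assuming at least two CUDA devices
    are available)::

        >>> # hypothetical setup: many small tensors on cuda:0
        >>> params = [torch.randn(8, device='cuda:0') for _ in range(4)]
        >>> per_device = broadcast_coalesced(params, devices=[0, 1])
        >>> # per_device[1] holds copies of all four tensors on cuda:1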
    """
    devices = [_get_device_index(d) for d in devices]
    tensors = [_handle_complex(t) for t in tensors]
    return torch._C._broadcast_coalesced(tensors, devices, buffer_size)


def reduce_add(inputs, destination=None):
    """Sums tensors from multiple GPUs.

    All inputs should have matching shapes, dtype, and layout. The output tensor
    will be of the same shape, dtype, and layout.

    Args:
        inputs (Iterable[Tensor]): an iterable of tensors to add.
        destination (int, optional): a device on which the output will be
            placed (default: current device).

    Returns:
        A tensor containing an elementwise sum of all inputs, placed on the
        :attr:`destination` device.
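
    Example (an illustrative usage sketch, assuming at least two CUDA devices
    are available)::

        >>> # hypothetical setup: one tensor per GPU, summed onto cuda:0
        >>> a = torch.ones(3, device='cuda:0')
        >>> b = torch.ones(3, device='cuda:1')
        >>> total = reduce_add([a, b], destination=0)
        >>> # total is a tensor of 2s living on cuda:0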
    """
    destination = _get_device_index(destination, optional=True)
    input_size = inputs[0].size()
    root_index = None  # index of the input that already lives on `destination`
    for i, inp in enumerate(inputs):
        assert inp.device.type != "cpu", "reduce_add expects all inputs to be on GPUs"
        if inp.get_device() == destination:
            root_index = i
        if inp.size() != input_size:
            got = 'x'.join(str(x) for x in inp.size())
            expected = 'x'.join(str(x) for x in input_size)
            raise ValueError("input {} has invalid size: got {}, but expected "
                             "{}".format(i, got, expected))
    if root_index is None:
        raise RuntimeError("reduce_add expects destination to be on the same GPU "
                           "with one of the tensors")

    if len(inputs) == 1:
        return inputs[0]

    if nccl.is_available(inputs):
        result = torch.empty_like(inputs[root_index])
        nccl.reduce(inputs, output=result, root=root_index)
    else:
        destination_device = torch.device(inputs[root_index].device.type, destination)
        nonroot = [t for i, t in enumerate(inputs) if i != root_index]
        # accumulate onto a fresh tensor on the destination device
        result = inputs[root_index] + nonroot[0].to(device=destination_device, non_blocking=True)
        for other in nonroot[1:]:
            result.add_(other.to(device=destination_device, non_blocking=True))
    return result


def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Args:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
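
    Example (an illustrative usage sketch, assuming at least two CUDA devices
    are available)::

        >>> # hypothetical setup: per-device gradient lists with matching shapes
        >>> grads0 = [torch.ones(4, device='cuda:0'), torch.ones(2, device='cuda:0')]
        >>> grads1 = [torch.ones(4, device='cuda:1'), torch.ones(2, device='cuda:1')]
        >>> summed = reduce_add_coalesced([grads0, grads1], destination=0)
        >>> # summed is a tuple of two tensors on cuda:0, each filled with 2.0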
    """
    dense_tensors: List[List] = [[] for _ in inputs]  # shape (num_gpus, num_tensors)
    output = []
    ref_order = []
    # process sparse tensors first, since they may have different sizes on different GPUs
    for tensor_at_gpus in zip(*inputs):
        if all(t.is_sparse for t in tensor_at_gpus):
            result = reduce_add(tensor_at_gpus, destination)  # this will also be sparse
            output.append(result)
            ref_order.append(tensor_at_gpus[0])
        else:
            for coll, t in zip(dense_tensors, tensor_at_gpus):
                coll.append(t.to_dense() if t.is_sparse else t)
            ref_order.append(dense_tensors[0][-1])
    itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors]
    # now the dense tensors, which have consistent sizes across devices
    for chunks in zip(*itrs):
        flat_tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]  # (num_gpus,)
        flat_result = reduce_add(flat_tensors, destination)
        for t in _unflatten_dense_tensors(flat_result, chunks[0]):
            # the unflattened tensors do not share storage with the flat buffer,
            # so hand out `.data` with fresh version counters
            output.append(t.data)
    return tuple(_reorder_tensors_as(output, ref_order))


def scatter(tensor, devices=None, chunk_sizes=None, dim=0, streams=None, *, out=None):
    """Scatters tensor across multiple GPUs.

    Args:
        tensor (Tensor): tensor to scatter. Can be on CPU or GPU.
        devices (Iterable[torch.device, str or int], optional): an iterable of
          GPU devices, among which to scatter.
        chunk_sizes (Iterable[int], optional): sizes of chunks to be placed on
          each device. It should match :attr:`devices` in length and sum to
          ``tensor.size(dim)``. If not specified, :attr:`tensor` will be divided
          into equal chunks.
        dim (int, optional): a dimension along which to chunk :attr:`tensor`.
          Default: ``0``.
        streams (Iterable[Stream], optional): an iterable of Streams, among
          which to execute the scatter. If not specified, the default stream will
          be utilized.
        out (Sequence[Tensor], optional, keyword-only): the GPU tensors to
          store output results. Sizes of these tensors must match that of
          :attr:`tensor`, except for :attr:`dim`, where the total size must
          sum to ``tensor.size(dim)``.

    .. note::
        Exactly one of :attr:`devices` and :attr:`out` must be specified. When
        :attr:`out` is specified, :attr:`chunk_sizes` must not be specified and
        will be inferred from sizes of :attr:`out`.

    Returns:
        - If :attr:`devices` is specified,
            a tuple containing chunks of :attr:`tensor`, placed on
            :attr:`devices`.
        - If :attr:`out` is specified,
            a tuple containing :attr:`out` tensors, each containing a chunk of
            :attr:`tensor`.
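
    Example (an illustrative usage sketch, assuming at least two CUDA devices
    are available)::

        >>> # hypothetical setup: split a CPU tensor 4/2 along dim 0
        >>> x = torch.arange(6.).reshape(6, 1)
        >>> chunks = scatter(x, devices=[0, 1], chunk_sizes=[4, 2], dim=0)
        >>> # chunks[0] is x[:4] on cuda:0, chunks[1] is x[4:] on cuda:1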
    """
    tensor = _handle_complex(tensor)
    if out is None:
        devices = [_get_device_index(d) for d in devices]
        return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
    else:
        if devices is not None:
            raise RuntimeError(
                "'devices' must not be specified when 'out' is specified, but "
                "got devices={}".format(devices))
        if chunk_sizes is not None:
            raise RuntimeError(
                "'chunk_sizes' must not be specified when 'out' is specified, "
                "but got chunk_sizes={}".format(chunk_sizes))
        return tuple(torch._C._scatter_out(tensor, out, dim, streams))


def gather(tensors, dim=0, destination=None, *, out=None):
    """Gathers tensors from multiple GPU devices.

    Args:
        tensors (Iterable[Tensor]): an iterable of tensors to gather.
          Tensor sizes in all dimensions other than :attr:`dim` have to match.
        dim (int, optional): a dimension along which the tensors will be
          concatenated. Default: ``0``.
        destination (torch.device, str, or int, optional): the output device.
          Can be CPU or CUDA. Default: the current CUDA device.
        out (Tensor, optional, keyword-only): the tensor to store gather result.
          Its sizes must match those of :attr:`tensors`, except for :attr:`dim`,
          where the size must equal ``sum(tensor.size(dim) for tensor in tensors)``.
          Can be on CPU or CUDA.

    .. note::
        :attr:`destination` must not be specified when :attr:`out` is specified.

    Returns:
        - If :attr:`destination` is specified,
            a tensor located on :attr:`destination` device, that is a result of
            concatenating :attr:`tensors` along :attr:`dim`.
        - If :attr:`out` is specified,
            the :attr:`out` tensor, now containing results of concatenating
            :attr:`tensors` along :attr:`dim`.
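
    Example (an illustrative usage sketch, assuming at least two CUDA devices
    are available)::

        >>> # hypothetical setup: gather onto the CPU
        >>> a = torch.randn(2, 3, device='cuda:0')
        >>> b = torch.randn(4, 3, device='cuda:1')
        >>> merged = gather([a, b], dim=0, destination='cpu')
        >>> # merged has shape (6, 3) and lives on the CPU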
    """
    tensors = [_handle_complex(t) for t in tensors]
    if out is None:
        if destination == -1:
            warnings.warn(
                'Using -1 to represent CPU tensor is deprecated. Please use a '
                'device object or string instead, e.g., "cpu".')
        destination = _get_device_index(destination, allow_cpu=True, optional=True)
        return torch._C._gather(tensors, dim, destination)
    else:
        if destination is not None:
            raise RuntimeError(
                "'destination' must not be specified when 'out' is specified, but "
                "got destination={}".format(destination))
        return torch._C._gather_out(tensors, out, dim)