U
    d8                     @   s   U d dl Z d dlZddlmZ d dlmZ d dlmZ d dlm	Z	m
Z
 G dd deZG d	d
 d
eZG dd deZG dd deZdae
e	e
ejj   ed< edddZdS )    N   )comm)Function_get_device_index)ListOptionalc                   @   s$   e Zd Zedd Zedd ZdS )	Broadcastc                 G   s   t dd |D stddd |D }|| _t|dkr@t S t|| _|d  | _t	|| j}g }t
| jdd  D ]$\}}|s||D ]}|||  qq|| j|  tdd |D S )	Nc                 s   s   | ]}|j jd kV  qdS cpuNdevicetype.0i r   @/tmp/pip-unpacked-wheel-ua33x9lu/torch/nn/parallel/_functions.py	<genexpr>   s     z$Broadcast.forward.<locals>.<genexpr>z2Broadcast function not implemented for CPU tensorsc                 S   s   g | ]}t |d qS Tr   r   xr   r   r   
<listcomp>   s     z%Broadcast.forward.<locals>.<listcomp>r   r   c                 S   s   g | ]}|D ]}|qqS r   r   )r   Ztensorstr   r   r   r      s       )allAssertionErrortarget_gpuslentuple
num_inputs
get_deviceinput_devicer   Zbroadcast_coalesced	enumerateZneeds_input_gradappendZmark_non_differentiable)ctxr   inputsoutputsZnon_differentiablesidxZinput_requires_gradoutputr   r   r   forward   s"    

zBroadcast.forwardc                 G   s   dt j| j| jf|  S )NN)ReduceAddCoalescedapplyr!   r   r$   Zgrad_outputsr   r   r   backward    s    zBroadcast.backwardN__name__
__module____qualname__staticmethodr)   r.   r   r   r   r   r	   
   s   
r	   c                   @   s$   e Zd Zedd Zedd ZdS )r+   c                    sL    fddt dt D | _ fddt dt D }t||S )Nc                    s   g | ]} |   qS r   r    r   )gradsr   r   r   )   s     z.ReduceAddCoalesced.forward.<locals>.<listcomp>r   c                    s   g | ]} ||  qS r   r   r   r5   r   r   r   r   +   s   )ranger   r   r   Zreduce_add_coalesced)r$   Zdestinationr   r5   Zgrads_r   r6   r   r)   '   s
     zReduceAddCoalesced.forwardc                 G   s   dt j| jf|  S )NNN)r	   r,   r   r-   r   r   r   r.   /   s    zReduceAddCoalesced.backwardNr/   r   r   r   r   r+   %   s   
r+   c                   @   s$   e Zd Zedd Zedd ZdS )Gatherc                    s   t dd |D std|dkr*d _nt|d}| _| _tdd |D  _t dd |D r|dkrtd	d |D }td
 d _	nd _	t fdd|D  _
t| j jS )Nc                 s   s   | ]}|j jd kV  qdS r
   r   r   r   r   r   r   8   s     z!Gather.forward.<locals>.<genexpr>z/Gather function not implemented for CPU tensorsr   Tc                 s   s   | ]}|  V  qd S r*   r4   r   r   r   r   r   A   s     c                 s   s   | ]}|  d kV  qdS r   N)dimr   r   r   r   r   r   B   s     r   c                 s   s   | ]}| d V  qdS )r   N)viewr<   r   r   r   r   C   s     zvWas asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.Fc                 3   s   | ]}|  jV  qd S r*   )sizer;   r   r$   r   r   r   J   s     )r   r   target_devicer   r;   r   
input_gpuswarningswarnunsqueezed_scalarinput_sizesr   Zgather)r$   r@   r;   r%   r   r?   r   r)   6   s     

zGather.forwardc                 C   s6   t | j| j| j|}| jr.tdd |D }d| S )Nc                 s   s   | ]}|d  V  qdS r:   r   )r   gr   r   r   r   Q   s     z"Gather.backward.<locals>.<genexpr>r8   )Scatterr,   rA   rE   r;   rD   r   )r$   grad_outputZscattered_gradsr   r   r   r.   M   s    zGather.backwardNr/   r   r   r   r   r9   4   s   
r9   c                   @   s$   e Zd Zedd Zedd ZdS )rG   c           
   
   C   s   dd |D }|| _ |jjdkr(| nd| _d }tj rT| jdkrTdd |D }t	|||| j |}|d k	rt
|D ]F\}}tj|| ( tj }	|	||  ||	 W 5 Q R X qx|S )Nc                 S   s   g | ]}t |d qS r   r   r   r   r   r   r   Y   s     z#Scatter.forward.<locals>.<listcomp>r   c                 S   s   g | ]}t |qS r   )_get_stream)r   r   r   r   r   r   _   s     )r;   r   r   r    r!   torchcudaZis_availabler   Zscatterr"   Zcurrent_streamZwait_streamZrecord_stream)
r$   r   Zchunk_sizesr;   inputZstreamsr&   r   r(   Zmain_streamr   r   r   r)   W   s    
zScatter.forwardc                 G   s   d d d t j| j| jf| fS r*   )r9   r,   r!   r;   )r$   rH   r   r   r   r.   j   s    zScatter.backwardNr/   r   r   r   r   rG   U   s   
rG   _streamsr   c                 C   sH   | dkrdS t dkr$dgtj  a t |  dkr@tj| t | < t |  S )z8Gets a background stream for copying between CPU and GPUrI   N)rN   rK   rL   Zdevice_countStreamrO   r   r   r   rJ   s   s    rJ   )rB   rK    r   Ztorch.autogradr   Ztorch._utilsr   typingr   r   r	   r+   r9   rG   rN   rL   rP   __annotations__intrJ   r   r   r   r   <module>   s    !