U
    ,dK"                     @   s   d dl mZ d dlmZ d dlmZ d dlmZm	Z	m
Z
 G dd deZG dd de
ZG d	d
 d
e	ZG dd deZdZG dd dejZdZG dd dejZdS )    )cuda)array)deviceufunc)UFuncMechanismGenerializedUFuncGUFuncCallStepsc                   @   sL   e Zd ZdZdd Zedd Zejdd Zdd Zdd
dZ	dd Z
dS )CUDAUFuncDispatcherzD
    Invoke the CUDA ufunc specialization for the given inputs.
    c                 C   s   || _ d| _d S )Nr   )	functions_maxblocksize)selfZtypes_to_retty_kernels r   :/tmp/pip-unpacked-wheel-eu7e0c37/numba/cuda/vectorizers.py__init__   s    zCUDAUFuncDispatcher.__init__c                 C   s   | j S N)r
   r   r   r   r   max_blocksize   s    z!CUDAUFuncDispatcher.max_blocksizec                 C   s
   || _ d S r   )Z_max_blocksize)r   Zblkszr   r   r   r      s    c                 O   s   t | j||S )a  
        *args: numpy arrays or DeviceArrayBase (created by cuda.to_device).
               Cannot mix the two types in one call.

        **kws:
            stream -- cuda stream; when defined, asynchronous mode is used.
            out    -- output array. Can be a numpy array or DeviceArrayBase
                      depending on the input arguments.  Type must match
                      the input arguments.
        )CUDAUFuncMechanismcallr	   )r   argskwsr   r   r   __call__   s    zCUDAUFuncDispatcher.__call__r   c              	   C   s   t t| j d dks"td|jdks4td|jd }g }|dkrTtdn|dkrd|d S |pnt	 }|
 P tjj|r|}nt||}| |||}td|jd}|j||d	 W 5 Q R X |d S )
Nr      zmust be a binary ufunc   zmust use 1d arrayzReduction on an empty array.)r   )dtypestream)lenlistr	   keysAssertionErrorndimshape	TypeErrorr   r   Zauto_synchronizecudadrvdevicearrayis_cuda_ndarray	to_device_CUDAUFuncDispatcher__reducenp_arrayr   copy_to_host)r   argr   ngpu_memsmemoutbufr   r   r   reduce&   s"    "


zCUDAUFuncDispatcher.reducec           
      C   s   |j d }|d dkrd||d \}}|| || | |||}|| | ||||dS ||d \}}	|| ||	 | ||	||d |d dkr| |||S |S d S )Nr   r   r   )r.   r   )r!   splitappendr'   )
r   r-   r,   r   r+   ZfatcutZthincutr.   leftrightr   r   r   Z__reduceC   s    





zCUDAUFuncDispatcher.__reduceN)r   )__name__
__module____qualname____doc__r   propertyr   setterr   r0   r'   r   r   r   r   r      s   


r   c                   @   sJ   e Zd ZdgZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dS )_CUDAGUFuncCallSteps_streamc                 C   s
   t |S r   r   Zis_cuda_arrayr   objr   r   r   is_device_array`   s    z$_CUDAGUFuncCallSteps.is_device_arrayc                 C   s   t jj|r|S t |S r   r   r#   r$   r%   Zas_cuda_arrayr>   r   r   r   as_device_arrayc   s    z$_CUDAGUFuncCallSteps.as_device_arrayc                 C   s   t j|| jdS Nr   )r   r&   r<   )r   hostaryr   r   r   r&   m   s    z_CUDAGUFuncCallSteps.to_devicec                 C   s   |j || jd}|S rC   )r)   r<   )r   devaryrD   r.   r   r   r   to_hostp   s    z_CUDAGUFuncCallSteps.to_hostc                 C   s   t j||| jdS N)r!   r   r   )r   device_arrayr<   )r   r!   r   r   r   r   rH   t   s    z!_CUDAGUFuncCallSteps.device_arrayc                 C   s   | j dd| _d S )Nr   r   )kwargsgetr<   r   r   r   r   prepare_inputsw   s    z#_CUDAGUFuncCallSteps.prepare_inputsc                 C   s   |j || jd|  d S rC   )forallr<   )r   kernelZnelemr   r   r   r   launch_kernelz   s    z"_CUDAGUFuncCallSteps.launch_kernelN)r5   r6   r7   	__slots__r@   rB   r&   rF   rH   rK   rN   r   r   r   r   r;   [   s   
r;   c                   @   s(   e Zd Zedd Zdd Zdd ZdS )CUDAGenerializedUFuncc                 C   s   t S r   )r;   r   r   r   r   _call_steps   s    z!CUDAGenerializedUFunc._call_stepsc                 C   s   t jjj|d|j|jdS N)r   r!   stridesr   gpu_data)r   r#   r$   DeviceNDArrayr   rU   )r   aryr!   r   r   r   _broadcast_scalar_input   s
    
z-CUDAGenerializedUFunc._broadcast_scalar_inputc                 C   s:   t |t |j }d| |j }tjjj|||j|jdS rR   )	r   r!   rT   r   r#   r$   rV   r   rU   )r   rW   ZnewshapeZnewaxZ
newstridesr   r   r   _broadcast_add_axis   s    
z)CUDAGenerializedUFunc._broadcast_add_axisN)r5   r6   r7   r9   rQ   rX   rY   r   r   r   r   rP   ~   s   
rP   c                   @   sL   e Zd ZdZdZdd Zdd Zdd Zd	d
 Zdd Z	dd Z
dd ZdS )r   z%
    Provide CUDA specialization
    r   c                 C   s   |j ||d|  d S rC   )rL   )r   funccountr   r   r   r   r   launch   s    zCUDAUFuncMechanism.launchc                 C   s
   t |S r   r=   r>   r   r   r   r@      s    z"CUDAUFuncMechanism.is_device_arrayc                 C   s   t jj|r|S t |S r   rA   r>   r   r   r   rB      s    z"CUDAUFuncMechanism.as_device_arrayc                 C   s   t j||dS rC   )r   r&   )r   rD   r   r   r   r   r&      s    zCUDAUFuncMechanism.to_devicec                 C   s   |j |dS rC   )r)   )r   rE   r   r   r   r   rF      s    zCUDAUFuncMechanism.to_hostc                 C   s   t j|||dS rG   )r   rH   )r   r!   r   r   r   r   r   rH      s    zCUDAUFuncMechanism.device_arrayc                    sn    fddt tD }tt j }dg| t j }|D ]}d||< qFtjjj| j	 j
dS )Nc                    s,   g | ]$}| j ks$ j| | kr|qS r   )r    r!   ).0axrW   r!   r   r   
<listcomp>   s    
z7CUDAUFuncMechanism.broadcast_device.<locals>.<listcomp>r   rS   )ranger   r!   r   rT   r   r#   r$   rV   r   rU   )r   rW   r!   Z
ax_differsZ
missingdimrT   r^   r   r_   r   broadcast_device   s    

z#CUDAUFuncMechanism.broadcast_deviceN)r5   r6   r7   r8   ZDEFAULT_STREAMr\   r@   rB   r&   rF   rH   rb   r   r   r   r   r      s   
r   z
def __vectorized_{name}({args}, __out__):
    __tid__ = __cuda__.grid(1)
    if __tid__ < __out__.shape[0]:
        __out__[__tid__] = __core__({argitems})
c                   @   s8   e Zd Zdd Zdd Zdd Zdd Zed	d
 ZdS )CUDAVectorizec                 C   s*   t j|ddd| j}||j|j jjfS )NT)deviceinline)r   jitpyfuncZ	overloadsr   	signaturereturn_type)r   sigZcudevfnr   r   r   _compile_core   s    zCUDAVectorize._compile_corec                 C   s    | j j }|t|d |S )NZ__cuda__Z__core__)rg   __globals__copyupdater   )r   corefnZglblr   r   r   _get_globals   s
    zCUDAVectorize._get_globalsc                 C   s
   t |S r   r   rf   r   Zfnobjrj   r   r   r   _compile_kernel   s    zCUDAVectorize._compile_kernelc                 C   s
   t | jS r   )r   	kernelmapr   r   r   r   build_ufunc   s    zCUDAVectorize.build_ufuncc                 C   s   t S r   )vectorizer_stager_sourcer   r   r   r   _kernel_template   s    zCUDAVectorize._kernel_templateN)	r5   r6   r7   rk   rq   rt   rv   r9   rx   r   r   r   r   rc      s   rc   zy
def __gufunc_{name}({args}):
    __tid__ = __cuda__.grid(1)
    if __tid__ < {checkedarg}:
        __core__({argitems})
c                   @   s0   e Zd Zdd Zdd Zedd Zdd Zd	S )
CUDAGUFuncVectorizec                 C   s   t | j| j}t| j|dS )N)ru   engine)r   ZGUFuncEngineZinputsigZ	outputsigrP   ru   )r   rz   r   r   r   rv      s    zCUDAGUFuncVectorize.build_ufuncc                 C   s   t ||S r   rr   rs   r   r   r   rt      s    z#CUDAGUFuncVectorize._compile_kernelc                 C   s   t S r   )_gufunc_stager_sourcer   r   r   r   rx      s    z$CUDAGUFuncVectorize._kernel_templatec                 C   s4   t j|dd| j}| jj }|t |d |S )NT)rd   rl   )r   rf   rg   Zpy_funcrm   rn   ro   )r   rj   rp   Zglblsr   r   r   rq      s    z CUDAGUFuncVectorize._get_globalsN)r5   r6   r7   rv   rt   r9   rx   rq   r   r   r   r   ry      s
   
ry   N)Znumbar   Znumpyr   r(   Znumba.np.ufuncr   Znumba.np.ufunc.deviceufuncr   r   r   objectr   r;   rP   r   rw   ZDeviceVectorizerc   r{   ZDeviceGUFuncVectorizery   r   r   r   r   <module>   s   S#0