U
    ,dx                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlZddl	Z	ddl	m
Z
 ddlmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ zeeddZW n e k
r   dd ZY nX dd Z!dd Z"dd Z#G dd de
j$Z%G dd de%Z&edd Z'G dd de%Z(G dd de)Z*G d d! d!e%ej+Z,G d"d# d#e%ej+Z-d5d$d%Z.d6d&d'Z/d(d) Z0d*d+ Z1d,Z2d-d. Z3d7d1d2Z4d3d4 Z5dS )8z
A CUDA ND Array is recognized by checking the __cuda_memory__ attribute
on the object.  If it exists and evaluate to True, it must define shape,
strides, dtype and size attributes similar to a NumPy ndarray.
    N)c_void_p)_devicearray)devices)driver)typesconfig)to_fixed_tuple)
dummyarray)numpy_support)prepare_shape_strides_dtype)NumbaPerformanceWarning)warn	lru_cachec                 C   s   | S N )funcr   r   B/tmp/pip-unpacked-wheel-eu7e0c37/numba/cuda/cudadrv/devicearray.pyr      s    c                 C   s   t | ddS )z$Check if an object is a CUDA ndarray__cuda_ndarray__F)getattrobjr   r   r   is_cuda_ndarray#   s    r   c                    sB   t    fdd}|dt |dt |dtj |dt dS )z,Verify the CUDA ndarray interface for an objc                    s6   t  | st| tt | |s2td| |f d S )Nz%s must be of type %s)hasattrAttributeError
isinstancer   )attrtypr   r   r   requires_attr,   s    
z4verify_cuda_ndarray_interface.<locals>.requires_attrshapestridesdtypesizeN)require_cuda_ndarraytuplenpr    int)r   r   r   r   r   verify_cuda_ndarray_interface(   s    

r&   c                 C   s   t | stddS )z9Raises ValueError is is_cuda_ndarray(obj) evaluates Falsezrequire an cuda ndarray objectN)r   
ValueErrorr   r   r   r   r"   8   s    r"   c                   @   s   e Zd ZdZdZdZd%ddZedd Zd&d	d
Z	edd Z
d'ddZdd Zedd Zedd Zejd(ddZejd)ddZd*ddZdd Zdd Zd+dd Zd!d" Zed#d$ ZdS ),DeviceNDArrayBasez$A on GPU NDArray representation
    Tr   Nc                 C   s"  t |tr|f}t |tr |f}t|}t|| _t|| jkrJtdtj	d|||j
| _t|| _t|| _|| _tttj| jd| _| jdkr|dkrt| j| j| jj
| _t | j}nt|| _n6tjrtjd}ntd}tjt |dd}d| _|| _ || _!dS )a5  
        Args
        ----

        shape
            array shape.
        strides
            array strides.
        dtype
            data type as np.dtype coercible object.
        stream
            cuda stream.
        gpu_data
            user provided device memory for the ndarray data buffer
        zstrides not match ndimr      N)contextpointerr!   )"r   r%   r$   r    lenndimr'   r	   ArrayZ	from_descitemsize_dummyr#   r   r   	functoolsreduceoperatormulr!   _driverZmemory_size_from_info
alloc_sizer   get_contextZmemallocZdevice_memory_sizeUSE_NV_BINDINGbindingCUdeviceptrr   ZMemoryPointergpu_datastream)selfr   r   r    r<   r;   nullr   r   r   __init__D   sD    






  
 zDeviceNDArrayBase.__init__c                 C   s   t jr"| jd k	rt| j}q<d}n| jjd k	r8| jj}nd}t| jt| rPd nt| j|df| j	j
| jdkrxt| jnd ddS )Nr   F   )r   r   dataZtypestrr<   version)r5   r8   device_ctypes_pointerr%   valuer#   r   is_contiguousr   r    strr<   )r=   ptrr   r   r   __cuda_array_interface__w   s    

z*DeviceNDArrayBase.__cuda_array_interface__c                 C   s   t  | }||_|S )zBind a CUDA stream to this object so that all subsequent operation
        on this array defaults to the given stream.
        )copyr<   )r=   r<   cloner   r   r   bind   s    
zDeviceNDArrayBase.bindc                 C   s   |   S r   	transposer=   r   r   r   T   s    zDeviceNDArrayBase.Tc                 C   s|   |rt |t t| jkr| S | jdkr6d}t|nB|d k	rdt|tt| jkrdtd|f nddlm} || S d S )N   z2transposing a non-2D DeviceNDArray isn't supportedzinvalid axes list %rr   rL   )r#   ranger-   NotImplementedErrorsetr'   Znumba.cuda.kernels.transposerM   )r=   ZaxesmsgrM   r   r   r   rM      s    

zDeviceNDArrayBase.transposec                 C   s   |s
| j S |S r   r<   )r=   r<   r   r   r   _default_stream   s    z!DeviceNDArrayBase._default_streamc                 C   sR   d| j k}| jd r|sd}n| jd r2|s2d}nd}t| j}t|| j|S )n
        Magic attribute expected by Numba to get the numba type that
        represents this object.
        r   C_CONTIGUOUSCF_CONTIGUOUSFA)r   flagsr
   
from_dtyper    r   r.   r-   )r=   	broadcastZlayoutr    r   r   r   _numba_type_   s    
zDeviceNDArrayBase._numba_type_c                 C   s2   | j dkr&tjrtjdS tdS n| j jS dS )z:Returns the ctypes pointer to the GPU data buffer
        Nr   )r;   r5   r8   r9   r:   r   rC   rN   r   r   r   rC      s
    

z'DeviceNDArrayBase.device_ctypes_pointerc                 C   s   |j dkrdS t|  | |}t| t| }}t|rdt| t|| tj| || j|d nFt	j
||jd rxdndd|jd  d	}t|| tj| || j|d dS )
zCopy `ary` to `self`.

        If `ary` is a CUDA memory, perform a device-to-device transfer.
        Otherwise, perform a a host-to-device transfer.
        r   NrU   rX   rY   r[   TZ	WRITEABLE)ordersubokrI   )r!   sentry_contiguousrV   
array_corer5   is_device_memorycheck_array_compatibilitydevice_to_devicer6   r$   arrayr]   Zhost_to_device)r=   aryr<   Z	self_coreZary_corer   r   r   copy_to_device   s&    





z DeviceNDArrayBase.copy_to_devicec                 C   s   t dd | jD r(d}t|| j| jdks:td| |}|dkr`tj| jtj	d}nt
| | |}| jdkrtj|| | j|d |dkr| jdkrtj| j| j|d	}ntj| j| j| j|d
}|S )a^  Copy ``self`` to ``ary`` or create a new Numpy ndarray
        if ``ary`` is ``None``.

        If a CUDA ``stream`` is given, then the transfer will be made
        asynchronously as part as the given stream.  Otherwise, the transfer is
        synchronous: the function returns after the copy is finished.

        Always returns the host array.

        Example::

            import numpy as np
            from numba import cuda

            arr = np.arange(1000)
            d_arr = cuda.to_device(arr)

            my_kernel[100, 100](d_arr)

            result_array = d_arr.copy_to_host()
        c                 s   s   | ]}|d k V  qdS )r   Nr   ).0sr   r   r   	<genexpr>	  s     z1DeviceNDArrayBase.copy_to_host.<locals>.<genexpr>z2D->H copy not implemented for negative strides: {}r   zNegative memory sizeNr   r    rU   )r   r    buffer)r   r    r   ro   )anyr   rR   formatr6   AssertionErrorrV   r$   emptybyterf   r5   device_to_hostr!   ndarrayr   r    )r=   ri   r<   rT   hostaryr   r   r   copy_to_host   s.    



 zDeviceNDArrayBase.copy_to_hostc                 c   s   |  |}| jdkrtd| jd | jjkr6tdttt	| j
| }| j}| jj}t|D ]R}|| }t|| | j
}|| f}	| j|| || }
t|	|| j||
dV  qddS )zSplit the array into equal partition of the `section` size.
        If the array cannot be equally divided, the last section will be
        smaller.
        r)   zonly support 1d arrayr   zonly support unit strider    r<   r;   N)rV   r-   r'   r   r    r/   r%   mathceilfloatr!   rQ   minr;   viewDeviceNDArray)r=   sectionr<   Znsectr   r/   ibeginendr   r;   r   r   r   split!  s     


zDeviceNDArrayBase.splitc                 C   s   | j S )zEReturns a device memory object that is used as the argument.
        )r;   rN   r   r   r   as_cuda_arg6  s    zDeviceNDArrayBase.as_cuda_argc                 C   s0   t  | j}t| j| j| jd}t||dS )z
        Returns a *IpcArrayHandle* object that is safe to serialize and transfer
        to another process to share the local allocation.

        Note: this feature is only available on Linux.
        )r   r   r    )
ipc_handle
array_desc)	r   r7   get_ipc_handler;   dictr   r   r    IpcArrayHandle)r=   Zipchdescr   r   r   r   ;  s    z DeviceNDArrayBase.get_ipc_handlec                 C   s2   | j j|d\}}t|j|j| j| || jdS )a(  
        Remove axes of size one from the array shape.

        Parameters
        ----------
        axis : None or int or tuple of ints, optional
            Subset of dimensions to remove. A `ValueError` is raised if an axis
            with size greater than one is selected. If `None`, all axes with
            size one are removed.
        stream : cuda stream or 0, optional
            Default stream for the returned view of the array.

        Returns
        -------
        DeviceNDArray
            Squeezed view into the array.

        )axisr   r   r    r<   r;   )r0   squeezer   r   r   r    rV   r;   )r=   r   r<   Z	new_dummy_r   r   r   r   F  s    zDeviceNDArrayBase.squeezec                 C   s   t |}t| j}t| j}| jj|jkrv|  s<tdt|d | jj |j\|d< }|dkrltd|j|d< t	|||| j
| jdS )zeReturns a new object by reinterpretting the dtype without making a
        copy of the data.
        zHTo change to a dtype of a different size, the array must be C-contiguousr   zuWhen changing to a larger dtype, its size must be a divisor of the total size in bytes of the last axis of the array.r   )r$   r    listr   r   r/   is_c_contiguousr'   divmodr   r<   r;   )r=   r    r   r   remr   r   r   r~   b  s0    



zDeviceNDArrayBase.viewc                 C   s   | j j| j S r   )r    r/   r!   rN   r   r   r   nbytes  s    zDeviceNDArrayBase.nbytes)r   N)r   )N)r   )Nr   )r   )Nr   )__name__
__module____qualname____doc__Z__cuda_memory__r   r?   propertyrH   rK   rO   rM   rV   r`   rC   r   require_contextrj   rx   r   r   r   r   r~   r   r   r   r   r   r(   >   s4   
3





.

%r(   c                       s   e Zd ZdZd fdd	Zedd Zedd	 Zej	d
d Z
ej	dddZdddZej	dd Zej	dddZdddZ  ZS )DeviceRecordz
    An on-GPU record type
    r   Nc                    s$   d}d}t t| ||||| d S Nr   )superr   r?   )r=   r    r<   r;   r   r   	__class__r   r   r?     s
    zDeviceRecord.__init__c                 C   s   t | jjS z
        For `numpy.ndarray` compatibility. Ideally this would return a
        `np.core.multiarray.flagsobj`, but that needs to be constructed
        with an existing `numpy.ndarray` (as the C- and F- contiguous flags
        aren't writeable).
        r   r0   r]   rN   r   r   r   r]     s    zDeviceRecord.flagsc                 C   s   t | jS )rW   )r
   r^   r    rN   r   r   r   r`     s    zDeviceRecord._numba_type_c                 C   s
   |  |S r   _do_getitemr=   itemr   r   r   __getitem__  s    zDeviceRecord.__getitem__c                 C   s   |  ||S z0Do `__getitem__(item)` with CUDA stream
        r   r=   r   r<   r   r   r   getitem  s    zDeviceRecord.getitemc           
      C   s   |  |}| jj| \}}| j|}|jdkrr|jd k	rHt|||dS tj	d|d}t
j|||j|d |d S t|jd |jd d\}}}	t|||	||dS d S )	Nr   ry   r)   r    dstsrcr!   r<   r   rY   r   r   r    r;   r<   )rV   r    fieldsr;   r~   r   namesr   r$   rs   r5   ru   r/   r   Zsubdtyper   )
r=   r   r<   r   offsetnewdatarw   r   r   r    r   r   r   r     s2    


  zDeviceRecord._do_getitemc                 C   s   |  ||S r   _do_setitemr=   keyrD   r   r   r   __setitem__  s    zDeviceRecord.__setitem__c                 C   s   | j |||dS z6Do `__setitem__(key, value)` with CUDA stream
        rU   r   r=   r   rD   r<   r   r   r   setitem  s    zDeviceRecord.setitemc                 C   s   |  |}| }|r$t }| }| jj| \}}| j|}t| |||d}	t	|	j||d\}
}t
|	|
|
jj| |r|  d S )Nry   rU   )rV   r   r7   get_default_streamr    r   r;   r~   typeauto_devicer5   rg   r/   synchronize)r=   r   rD   r<   synchronousctxr   r   r   lhsrhsr   r   r   r   r     s    
zDeviceRecord._do_setitem)r   N)r   )r   )r   )r   )r   r   r   r   r?   r   r]   r`   r   r   r   r   r   r   r   r   __classcell__r   r   r   r   r     s    
	



r   c                    s>   ddl m  dkr& jdd }|S  j fdd}|S )z
    A separate method so we don't need to compile code every assignment (!).

    :param ndim: We need to have static array sizes for cuda.local.array, so
        bake in the number of dimensions into the kernel
    r   )cudac                 S   s   |d | d< d S r   r   )r   r   r   r   r   kernel  s    z_assign_kernel.<locals>.kernelc                    s     d}d}t| jD ]}|| j| 9 }q||kr8d S  jjdftjd}td ddD ]L}|| j|  |d|f< || j|  |j| dk |d|f< || j|  }q^|t|d  | t|d < d S )Nr)   rP   rn   r   r   )	ZgridrQ   r-   r   localrh   r   int64r   )r   r   location
n_elementsr   idxr   r-   r   r   r     s    
$)numbar   Zjit)r-   r   r   r   r   _assign_kernel  s    
r   c                   @   s   e Zd ZdZdd Zedd Zdd Zdd	d
Zdd Z	dd Z
d ddZejdd Zejd!ddZd"ddZejdd Zejd#ddZd$ddZdS )%r   z
    An on-GPU array type
    c                 C   s   | j jS )zA
        Return true if the array is Fortran-contiguous.
        )r0   Zis_f_contigrN   r   r   r   is_f_contiguous&  s    zDeviceNDArray.is_f_contiguousc                 C   s   t | jjS r   r   rN   r   r   r   r]   ,  s    zDeviceNDArray.flagsc                 C   s   | j jS )z;
        Return true if the array is C-contiguous.
        )r0   Zis_c_contigrN   r   r   r   r   6  s    zDeviceNDArray.is_c_contiguousNc                 C   s"   |r|   |S |    S dS )zE
        :return: an `numpy.ndarray`, so copies to the host.
        N)rx   	__array__)r=   r    r   r   r   r   <  s    zDeviceNDArray.__array__c                 C   s
   | j d S )Nr   )r   rN   r   r   r   __len__E  s    zDeviceNDArray.__len__c                 O   s   t |dkr&t|d ttfr&|d }t| }|| jkrP|| j| j| j| jdS | j	j
||\}}|| j	jgkr||j|j| j| jdS tddS )z
        Reshape the array without changing its contents, similarly to
        :meth:`numpy.ndarray.reshape`. Example::

            d_arr = d_arr.reshape(20, 50, order='F')
        r)   r   )r   r   r    r;   operation requires copyingN)r,   r   r#   r   r   r   r   r    r;   r0   reshapeextentrR   )r=   Znewshapekwsclsnewarrextentsr   r   r   r   H  s    

 
 zDeviceNDArray.reshaperY   r   c                 C   sX   |  |}t| }| jj|d\}}|| jjgkrL||j|j| j| j|dS t	ddS )zr
        Flatten the array without changing its contents, similar to
        :meth:`numpy.ndarray.ravel`.
        )ra   r   r   N)
rV   r   r0   ravelr   r   r   r    r;   rR   )r=   ra   r<   r   r   r   r   r   r   r   `  s    

 zDeviceNDArray.ravelc                 C   s
   |  |S r   r   r   r   r   r   r   q  s    zDeviceNDArray.__getitem__c                 C   s   |  ||S r   r   r   r   r   r   r   u  s    zDeviceNDArray.getitemc                 C   s   |  |}| j|}t| }t| }t|dkr| jj|d  }|j	s| j
jd k	rht| j
||dS tjd| j
d}tj||| jj|d |d S ||j|j| j
||dS n&| jj|j }||j|j| j
||dS d S )Nr)   r   ry   r   r   r   )rV   r0   r   r   Ziter_contiguous_extentr   r,   r;   r~   Zis_arrayr    r   r   r$   rs   r5   ru   r/   r   r   r   )r=   r   r<   arrr   r   r   rw   r   r   r   r   {  s8    

  
  zDeviceNDArray._do_getitemc                 C   s   |  ||S r   r   r   r   r   r   r     s    zDeviceNDArray.__setitem__c                 C   s   | j |||dS r   r   r   r   r   r   r     s    zDeviceNDArray.setitemc                 C   s\  |  |}| }|r$t }| }| j|}| jj|j }t	|t
jrTd}d}	n|j}|j}	t| ||	| j||d}
t||dd\}}|j|
jkrtd|j|
jf tj|
jtjd}|j||
j|j d < |j| }tt|
j|jD ].\}\}}|dkr||krtd|||f qttj|
jd}t|
jj||d	|
| |rX|  d S )
Nr   r   T)r<   user_explicitz$Can't assign %s-D array to %s-D selfr   r)   zCCan't copy sequence with size %d to array axis %d with dimension %drU   ) rV   r   r7   r   r0   r   r;   r~   r   r   r	   Elementr   r   r   r    r   r-   r'   r$   Zonesr   r   	enumeratezipr1   r2   r3   r4   r   forallr   )r=   r   rD   r<   r   r   r   r   r   r   r   r   r   Z	rhs_shaper   lrr   r   r   r   r     sJ    
	
zDeviceNDArray._do_setitem)N)rY   r   )r   )r   )r   )r   )r   r   r   r   r   r   r]   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   "  s&   
	
	



r   c                   @   s8   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d ZdS )r   a"  
    An IPC array handle that can be serialized and transfer to another process
    in the same machine for share a GPU allocation.

    On the destination process, use the *.open()* method to creates a new
    *DeviceNDArray* object that shares the allocation from the original process.
    To release the resources, call the *.close()* method.  After that, the
    destination can no longer use the shared array object.  (Note: the
    underlying weakref to the resource is now dead.)

    This object implements the context-manager interface that calls the
    *.open()* and *.close()* method automatically::

        with the_ipc_array_handle as ipc_array:
            # use ipc_array here as a normal gpu array object
            some_code(ipc_array)
        # ipc_array is dead at this point
    c                 C   s   || _ || _d S r   )_array_desc_ipc_handle)r=   r   r   r   r   r   r?     s    zIpcArrayHandle.__init__c                 C   s$   | j t }tf d|i| jS )z
        Returns a new *DeviceNDArray* that shares the allocation from the
        original process.  Must not be used on the original process.
        r;   )r   openr   r7   r   r   )r=   Zdptrr   r   r   r     s    zIpcArrayHandle.openc                 C   s   | j   dS )z5
        Closes the IPC handle to the array.
        N)r   closerN   r   r   r   r     s    zIpcArrayHandle.closec                 C   s   |   S r   )r   rN   r   r   r   	__enter__  s    zIpcArrayHandle.__enter__c                 C   s   |    d S r   )r   )r=   r   rD   	tracebackr   r   r   __exit__  s    zIpcArrayHandle.__exit__N)	r   r   r   r   r?   r   r   r   r   r   r   r   r   r     s   r   c                   @   s   e Zd ZdZdddZdS )MappedNDArrayz4
    A host array that uses CUDA mapped memory.
    r   c                 C   s   || _ || _d S r   r;   r<   r=   r;   r<   r   r   r   device_setup  s    zMappedNDArray.device_setupN)r   r   r   r   r   r   r   r   r   r   r     s   r   c                   @   s   e Zd ZdZdddZdS )ManagedNDArrayz5
    A host array that uses CUDA managed memory.
    r   c                 C   s   || _ || _d S r   r   r   r   r   r   r     s    zManagedNDArray.device_setupN)r   r   r   r   r   r   r     s   r   c                 C   s   t | j| j| j||dS )z/Create a DeviceNDArray object that is like ary.r<   r;   )r   r   r   r    )ri   r<   r;   r   r   r   from_array_like  s    r   c                 C   s   t | j||dS )z.Create a DeviceRecord object that is like rec.r   )r   r    )Zrecr<   r;   r   r   r   from_record_like   s    r   c                 C   sF   | j r| js| S g }| j D ]}||dkr.dntd q| t| S )aG  
    Extract the repeated core of a broadcast array.

    Broadcast arrays are by definition non-contiguous due to repeated
    dimensions, i.e., dimensions with stride 0. In order to ascertain memory
    contiguity and copy the underlying data from such arrays, we must create
    a view without the repeated dimensions.

    r   N)r   r!   appendslicer#   )ri   Z
core_indexstrider   r   r   rd   %  s    

rd   c                 C   sR   | j j}tt| jt| jD ].\}}|dkr|dkr||krD dS ||9 }qdS )z
    Returns True iff `ary` is C-style contiguous while ignoring
    broadcasted and 1-sized dimensions.
    As opposed to array_core(), it does not call require_context(),
    which can be quite expensive.
    r)   r   FT)r    r/   r   reversedr   r   )ri   r!   r   r   r   r   r   rE   7  s    
rE   zArray contains non-contiguous buffer and cannot be transferred as a single memory region. Please ensure contiguous buffer with numpy .ascontiguousarray()c                 C   s(   t | }|jd s$|jd s$ttd S )NrX   rZ   )rd   r]   r'   errmsg_contiguous_buffer)ri   corer   r   r   rc   M  s    rc   TFc                 C   s   t | r| dfS t| dr,tj| dfS t| tjrFt	| |d}n$tj
| ddd} t|  t| |d}|rtjr|st| tst| tjrd}tt| |j| |d |dfS dS )z
    Create a DeviceRecord or DeviceArray like obj and optionally copy data from
    host to device. If obj already represents device memory, it is returned and
    no copy is made.
    FrH   rU   T)rI   rb   zGHost array used in CUDA kernel will incur copy overhead to/from device.N)r5   re   r   r   r   Zas_cuda_arrayr   r$   voidr   rh   rc   r   r   ZCUDA_WARN_ON_IMPLICIT_COPYr   rv   r   r   rj   )r   r<   rI   r   ZdevobjrT   r   r   r   r   S  s2    


r   c                 C   s|   |   |   }}| j|jkr2td| j|jf |j|jkrRtd| j|jf | jrx|j|jkrxtd| j|jf d S )Nzincompatible dtype: %s vs. %szincompatible shape: %s vs. %szincompatible strides: %s vs. %s)r   r    	TypeErrorr   r'   r!   r   )Zary1Zary2Zary1sqZary2sqr   r   r   rf   z  s    


rf   )r   N)r   N)r   TF)6r   rz   r1   r3   rI   ctypesr   Znumpyr$   r   r   Znumba.cuda.cudadrvr   r   r5   Z
numba.corer   r   Znumba.np.unsafe.ndarrayr   Z
numba.miscr	   Znumba.npr
   Znumba.cuda.api_utilr   Znumba.core.errorsr   warningsr   r   r   r   r   r&   r"   ZDeviceArrayr(   r   r   r   objectr   rv   r   r   r   r   rd   rE   r   rc   r   rf   r   r   r   r   <module>   sV     Sg
+ 9,




'