U
    ,dE  ć                
   @   s  d dl mZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZmZ d dlmZ d dlmZmZ dd	lmZ d d
lmZ d dlmZmZmZ d dlmZmZmZ e
 Z e j!Z!e j"Z#e j$Z$dd Z%e#e &e”ddd Z'e#e &e”ddd Z(e#e &e”ddd Z)e#e &e”ddd Z*e#e &e”ddd Z+e#e &e”ddd  Z,e#ed!d"d# Z-e#ed$d%d& Z.e#ed'd(d) Z/e!ej0j1d*d+ Z2e!d,ed-d. Z3e!ej4ej5d/d0 Z6d1d2 Z7e!ej8ej5d3d4 Z9e!ej:j;ej<d5d6 Z=d a>d7d8 Z?e!ej@jAejBejCd9d: ZDe!ej@jAejEejCe!ej@jAejFejCd;d< ZGe!ejHjAejBejCd=d> ZIe!ejHjAejEejCe!ejHjAejFejCd?d@ ZJe!ejKdAdB ZLe!ejMejNdCdD ZOe!ejPejNdEdF ZQe!ejRejNdGdH ZSe!ejTdIdJ ZUe!ejVdKdL ZWe!ejXdMdN ZYe!ejZdOdP Z[e!ejZejNdQdR Z\e!ej]ejNejNejNejNejNe!ej]ejNejNej^ejNejNe!ej]ejNejNej_ejNejNe!ej]ejNejNej`ejNejNdSdT Zae!ejbejNejNejcdUdV Zde!ejeejNejNe!ejeejNej^e!ejeejNej_e!ejeejNej`dWdX Zfe!ejgejNejNe!ejgejNej^e!ejgejNej_e!ejgejNej`dYdZ Zhe!ejid[d\ Zje!ejkd]d^ Zle!ejmejCd_d` Zne!ejoejCejCejCdadb Zpdcdd Zqeejrejsdedf Zteejsejrdgdh Zudidj Zveejrejwdkdl ZxeejwejreejBejrdmdn Zydodp Zzezej{j|dq ezej{j}dr ezej{j~ds e!ej{jejrdtdu Ze!ej{jejrdvdw Ze!ej{jejrejrejrdxdy ZdzZd{d| Ze!ej{jejrejred} e!ej{jejrejred~ e!ej{jejrejred e!ej{jejrejred e!ej{jejrejred e!ej{jejrejred dd Zeej{jdd eej{jdd ejdejdiZe!ejeje!ejejdd Ze!ejejdd Ze!ejejdd Ze!ejejCdd Ze!ejejNe!ejejdd Ze!ejej^e!ejejdd Ze!ejejCejCejCdd Z e!e”ej_ej_dd Z¢e!e”ej`ej_e!e”ej_ej`e!e”ej`ej`dd Z£e!e¤ej_ej_dd Z„e!e¤ej`ej_e!e¤ej_ej`e!e¤ej`ej`dd Z¦e!e§ej_e!e§ej`dd  ZØe!e§ej_ejwe!e§ej`ejwd”d¢ Z©d£d¤ ZŖej«d„ Z¬d„ej« Z­e!ej®ej_eŖe¬ e!ej®ej`eŖe¬ e!ejÆej_eŖe­ e!ejÆej`eŖe­ d¦d§ Z°dØd© Z±e!ej²j³ej<ej“ejCe!ej²j³ej<ejFejCe!ej²j³ej<ejEejCe±dŖd« Zµe!ej²j¶ej<ej“ejCe!ej²j¶ej<ejFejCe!ej²j¶ej<ejEejCe±d¬d­ Z·e!ej²jøej<ej“ejCe!ej²jøej<ejFejCe!ej²jøej<ejEejCe±d®dÆ Z¹e!ej²jŗej<ej“ejCe!ej²jŗej<ejFejCe!ej²jŗej<ejEejCe±d°d± Z»d²d³ Z¼e¼ej²j½d“ e¼ej²j¾dµ e¼ej²jæd¶ e!ej²jĄej<ej“ejCe!ej²jĄej<ejFejCe!ej²jĄej<ejEejCe±d·dø ZĮe!ej²j”ej<ej“ejCe!ej²j”ej<ejEejCe!ej²j”ej<ejFejCe±d¹dŗ ZĀe!ej²j¤ej<ej“ejCe!ej²j¤ej<ejEejCe!ej²j¤ej<ejFejCe±d»d¼ ZĆe!ej²jÄej<ej“ejCe!ej²jÄej<ejEejCe!ej²jÄej<ejFejCe±d½d¾ ZÅe!ej²jĘej<ej“ejCe!ej²jĘej<ejEejCe!ej²jĘej<ejFejCe±dædĄ ZĒe!ej²jČej<ejCejCdĮdĀ ZÉe!ejŹejĖdĆdÄ ZĢdŹdĘdĒZĶe$edČdÉ ZĪdS )Ėé    )ŚreduceN)Śir)ŚRegistryŚ
lower_cast)Śparse_dtypeŚ	signature)Śmodels)ŚtypesŚcgutilsé   )Śnvvm)Ścuda)Ś	nvvmutilsŚstubsŚerrors)Śdim3Ś
grid_groupŚCUDADispatcherc                 C   sB   t  | d| ”}t  | d| ”}t  | d| ”}t | |||f”S )Nz%s.xz%s.yz%s.z)r   Ś	call_sregr
   Zpack_struct)ŚbuilderŚprefixŚxŚyŚz© r   ś7/tmp/pip-unpacked-wheel-eu7e0c37/numba/cuda/cudaimpl.pyŚinitialize_dim3   s    r   Z	threadIdxc                 C   s
   t |dS )NŚtid©r   ©Ścontextr   ŚsigŚargsr   r   r   Ścuda_threadIdx   s    r#   ZblockDimc                 C   s
   t |dS )NŚntidr   r   r   r   r   Ścuda_blockDim$   s    r%   ZblockIdxc                 C   s
   t |dS )NZctaidr   r   r   r   r   Ścuda_blockIdx)   s    r&   ZgridDimc                 C   s
   t |dS )NŚnctaidr   r   r   r   r   Ścuda_gridDim.   s    r(   Ślaneidc                 C   s   t  |d”S )Nr)   ©r   r   r   r   r   r   Ścuda_laneid3   s    r+   Śwarpsizec                 C   s   t  |d”S )Nr,   r*   r   r   r   r   Ścuda_warpsize8   s    r-   r   c                 C   s   |  |d”S ©Nr   ©Śextract_valuer   r   r   r   Śdim3_x=   s    r1   r   c                 C   s   |  |d”S ©Nr   r/   r   r   r   r   Śdim3_yB   s    r3   r   c                 C   s   |  |d”S )Né   r/   r   r   r   r   Śdim3_zG   s    r5   c                 C   s(   |   tjd”}|j}| t |”|f”S r2   )Śget_constantr	   Śint32ŚmoduleŚcallr   Z declare_cudaCGGetIntrinsicHandle)r    r   r!   r"   ZoneŚlmodr   r   r   Ścg_this_gridL   s    žr;   zGridGroup.syncc                 C   s,   |   tjd”}|j}| t |”||f”S r.   )r6   r	   r7   r8   r9   r   Zdeclare_cudaCGSynchronize)r    r   r!   r"   Śflagsr:   r   r   r   Śptx_sync_groupU   s    žr=   c                 C   sV   |j }|tjkrtj|ddS t|tjrFtj||jd}t 	||”S t
d| d S )Nr   )Śdimz(Unexpected return type %s from cuda.grid)Śreturn_typer	   r7   r   Zget_global_idŚ
isinstanceŚUniTupleŚcountr
   Ś
pack_arrayŚ
ValueError)r    r   r!   r"   ŚrestypeŚidsr   r   r   Ś	cuda_grid`   s    
rG   c                 C   s0   t  | d| ”}t  | d| ”}|  ||”S )Nzntid.znctaid.)r   r   Śmul)r   r>   r$   r'   r   r   r   Ś_nthreads_for_diml   s    rI   c                 C   s   |j }t|d}|tjkr|S t|tjrtt|d}|jdkrNt |||f”S |jdkrtt|d}t ||||f”S t	d| d S )Nr   r   r4   é   r   z*Unexpected return type %s of cuda.gridsize)
r?   rI   r	   r7   r@   rA   rB   r
   rC   rD   )r    r   r!   r"   rE   ZnxŚnyZnzr   r   r   Ścuda_gridsizer   s    





rL   c                 C   s   |d S r.   r   r   r   r   r   Ścuda_const_array_like   s    rM   c                 C   s   t d7 a d | t ”S )zĶDue to bug with NVVM invalid internalizing of shared memory in the
    PTX output.  We can't mark shared memory to be internal. We have to
    ensure unique name is generated for shared memory symbol.
    r   z{0}_{1})Ś_unique_smem_idŚformat©Śnamer   r   r   Ś_get_unique_smem_id   s    rR   c              	   C   s8   |j d j}t|j d }t| ||f|tdtjddS )Nr   r   Ś_cudapy_smemT©ŚshapeŚdtypeŚsymbol_nameŚ	addrspaceŚcan_dynsized)r"   Śliteral_valuer   Ś_generic_arrayrR   r   ŚADDRSPACE_SHARED©r    r   r!   r"   ŚlengthrV   r   r   r   Ścuda_shared_array_integer   s    żr_   c              	   C   s>   dd |j d D }t|j d }t| |||tdtjddS )Nc                 S   s   g | ]
}|j qS r   ©rZ   ©Ś.0Śsr   r   r   Ś
<listcomp>©   s     z+cuda_shared_array_tuple.<locals>.<listcomp>r   r   rS   TrT   )r"   r   r[   rR   r   r\   ©r    r   r!   r"   rU   rV   r   r   r   Ścuda_shared_array_tuple¦   s    
żrf   c              	   C   s4   |j d j}t|j d }t| ||f|dtjddS )Nr   r   Ś_cudapy_lmemFrT   )r"   rZ   r   r[   r   ŚADDRSPACE_LOCALr]   r   r   r   Ścuda_local_array_integer±   s    żri   c              	   C   s:   dd |j d D }t|j d }t| |||dtjddS )Nc                 S   s   g | ]
}|j qS r   r`   ra   r   r   r   rd   ¾   s     z(ptx_lmem_alloc_array.<locals>.<listcomp>r   r   rg   FrT   )r"   r   r[   r   rh   re   r   r   r   Śptx_lmem_alloc_array»   s    
żrj   c                 C   sD   |rt d}|j}t t ” d”}t |||”}| |d” |  ” S )Nzllvm.nvvm.barrier0r   ©	ŚAssertionErrorr8   r   ŚFunctionTypeŚVoidTyper
   Śget_or_insert_functionr9   Śget_dummy_value©r    r   r!   r"   Śfnamer:   ŚfntyŚsyncr   r   r   Śptx_syncthreadsĘ   s    ru   c                 C   s>   d}|j }t t d”t d”f”}t |||”}| ||”S )Nzllvm.nvvm.barrier0.popcé    ©r8   r   rm   ŚIntTyper
   ro   r9   rq   r   r   r   Śptx_syncthreads_countŃ   s
    ry   c                 C   s>   d}|j }t t d”t d”f”}t |||”}| ||”S )Nzllvm.nvvm.barrier0.andrv   rw   rq   r   r   r   Śptx_syncthreads_andŚ   s
    rz   c                 C   s>   d}|j }t t d”t d”f”}t |||”}| ||”S )Nzllvm.nvvm.barrier0.orrv   rw   rq   r   r   r   Śptx_syncthreads_orć   s
    r{   c                 C   sD   |rt d}|j}t t ” d”}t |||”}| |d” |  ” S )Nzllvm.nvvm.membar.ctar   rk   rq   r   r   r   Śptx_threadfence_blockģ   s    r|   c                 C   sD   |rt d}|j}t t ” d”}t |||”}| |d” |  ” S )Nzllvm.nvvm.membar.sysr   rk   rq   r   r   r   Śptx_threadfence_system÷   s    r}   c                 C   sD   |rt d}|j}t t ” d”}t |||”}| |d” |  ” S )Nzllvm.nvvm.membar.glr   rk   rq   r   r   r   Śptx_threadfence_device  s    r~   c                 C   s,   |   tjd”}ttjtj}t| |||gS )Nl   ’’ )r6   r	   r7   r   ŚnoneŚptx_syncwarp_mask)r    r   r!   r"   ŚmaskZmask_sigr   r   r   Śptx_syncwarp  s    r   c                 C   sD   d}|j }t t ” t d”f”}t |||”}| ||” |  ” S )Nzllvm.nvvm.bar.warp.syncrv   )	r8   r   rm   rn   rx   r
   ro   r9   rp   rq   r   r   r   r     s    r   c              
   C   sü  |\}}}}}|j d }	|	tjkr6| |t |	j””}d}
|j}t t 	t d”t d”f”t d”t d”t d”t d”t d”f”}t
 |||
”}|	jdkrü| ||||||f”}|	tjkrś| |d”}| |d”}| |t ” ”}t
 |||f”}nü| |t d””}| ||  tjd””}| |t d””}| ||||||f”}| ||||||f”}| |d”}| |d”}| |d”}| |t d””}| |t d””}| ||  tjd””}| ||”}|	tjkrč| |t ” ”}t
 |||f”}|S )a  
    The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic
    function supports both 32 and 64 bit ints and floats, so for feature parity,
    i64, f32, and f64 are implemented. Floats by way of bitcasting the float to
    an int, then shuffling, then bitcasting back. And 64-bit values by packing
    them into 2 32bit values, shuffling thoose, and then packing back together.
    r4   zllvm.nvvm.shfl.sync.i32rv   r   r   é@   )r"   r	   Śreal_domainŚbitcastr   rx   Śbitwidthr8   rm   ŚLiteralStructTyper
   ro   r9   Śfloat32r0   Ś	FloatTypeZmake_anonymous_structŚtruncZlshrr6   Śi8ŚzextZshlŚor_Śfloat64Ś
DoubleType)r    r   r!   r"   r   ŚmodeŚvalueŚindexŚclampZ
value_typerr   r:   rs   ŚfuncŚretŚrvŚpredZfvZvalue1Z
value_lshrZvalue2Zret1Zret2Zrv1Zrv2Zrv1_64Zrv2_64Zrv_shlr   r   r   Śptx_shfl_sync_i32  sJ    

 ’ž

r   c                 C   s^   d}|j }t t t d”t d”f”t d”t d”t d”f”}t |||”}| ||”S )Nzllvm.nvvm.vote.syncrv   r   )r8   r   rm   r   rx   r
   ro   r9   )r    r   r!   r"   rr   r:   rs   r   r   r   r   Śptx_vote_syncT  s    ’žr   c                 C   s   |\}}|j d j}|j d tjkr6| |t |””}d |”}|j}t 	t d”t d”t |”f”}	t
 ||	|”}
| |
||f”S )Nr   zllvm.nvvm.match.any.sync.i{}rv   )r"   r   r	   r   r   r   rx   rO   r8   rm   r
   ro   r9   ©r    r   r!   r"   r   r   Świdthrr   r:   rs   r   r   r   r   Śptx_match_any_sync_  s    
"r   c                 C   s   |\}}|j d j}|j d tjkr6| |t |””}d |”}|j}t 	t 
t d”t d”f”t d”t |”f”}	t ||	|”}
| |
||f”S )Nr   zllvm.nvvm.match.all.sync.i{}rv   )r"   r   r	   r   r   r   rx   rO   r8   rm   r   r
   ro   r9   r   r   r   r   Śptx_match_all_synco  s    
’žr   c                 C   s,   t jt  t  d”g ”dddd}| |g ”S )Nrv   zactivemask.b32 $0;ś=rT©Zside_effect©r   Ś	InlineAsmrm   rx   r9   ©r    r   r!   r"   Ś
activemaskr   r   r   Śptx_activemask  s      ’r¤   c                 C   s,   t jt  t  d”g ”dddd}| |g ”S )Nrv   zmov.u32 $0, %lanemask_lt;r   Tr   r    r¢   r   r   r   Śptx_lanemask_lt  s     žr„   c                 C   s   |  |d ”S r.   )Zctpopr   r   r   r   Śptx_popc  s    r¦   c                 C   s
   |j | S ©N)Śfmar   r   r   r   Śptx_fma  s    r©   c                 C   sD   ddd}z
||  W S  t k
r>   d|  d}t |”Y nX d S )N)Zf32Śf)Zf64Śd)rv   r   z$Conversion between float16 and floatś unsupported©ŚKeyErrorr   ZCudaLoweringError©r   ŚtypemapŚmsgr   r   r   Śfloat16_float_ty_constraint  s    

r²   c           	      C   sd   |j |j kr|S t|j \}}t |  |”t d”g”}t |d| dd| d”}| ||g”S )Né   zcvt.ś.f16 $0, $1;ś=ś,h)r   r²   r   rm   Śget_value_typerx   r”   r9   ©	r    r   ŚfromtyŚtotyŚvalŚtyŚ
constraintrs   Śasmr   r   r   Śfloat16_to_float_cast¤  s    ræ   c           	      C   sb   |j |j kr|S t|j \}}t t d”|  |”g”}t |d| dd| ”}| ||g”S )Nr³   ścvt.rn.f16.ś $0, $1;ś=h,)r   r²   r   rm   rx   r·   r”   r9   rø   r   r   r   Śfloat_to_float16_cast°  s    rĆ   c                 C   sH   ddddd}z
||  W S  t k
rB   d|  d}t |”Y nX d S )NŚcŚhŚrŚl)é   r³   rv   r   z"Conversion between float16 and intr¬   r­   rÆ   r   r   r   Śfloat16_int_constraint¼  s    
rÉ   c           
      C   sf   |j }t|}|jrdnd}t |  |”t d”g”}t |d| | dd| d”}	| |	|g”S )Nrc   Śur³   zcvt.rni.r“   rµ   r¶   )	r   rÉ   Śsignedr   rm   r·   rx   r”   r9   ©
r    r   r¹   rŗ   r»   r   r½   Z
signednessrs   r¾   r   r   r   Śfloat16_to_integer_castĘ  s    
žrĶ   c           
      C   sd   |j }t|}|jrdnd}t t d”|  |”g”}t |d| | dd| ”}	| |	|g”S )Nrc   rŹ   r³   rĄ   rĮ   rĀ   )	r   rÉ   rĖ   r   rm   rx   r·   r”   r9   rĢ   r   r   r   Śinteger_to_float16_castÓ  s    
’žrĪ   c                    s    t | tjtj fdd}d S )Nc                    sB   t  t  d”t  d”t  d”g”}t  |  dd”}| ||”S )Nr³   z.f16 $0,$1,$2;ś=h,h,h©r   rm   rx   r”   r9   ©r    r   r!   r"   rs   r¾   ©Śopr   r   Śptx_fp16_binaryć  s
    ’z*lower_fp16_binary.<locals>.ptx_fp16_binary©Ślowerr	   Śfloat16)ŚfnrÓ   rŌ   r   rŅ   r   Ślower_fp16_binaryā  s    rŁ   ŚaddŚsubrH   c                 C   s4   t  t  d”t  d”g”}t  |dd”}| ||”S )Nr³   zneg.f16 $0, $1;ś=h,hrŠ   rŃ   r   r   r   Śptx_fp16_hnegš  s    rŻ   c                 C   sL   t j ” dk rd}nd}t t d”t d”g”}t ||d”}| ||”S )N)é
   r4   zand.b16 $0, $1, 0x7FFF;zabs.f16 $0, $1;r³   rÜ   )r   ZruntimeŚget_versionr   rm   rx   r”   r9   )r    r   r!   r"   Śinstrs   r¾   r   r   r   Śptx_fp16_habs÷  s    rį   c                 C   sH   t  d”t  d”t  d”g}t  t  d”|”}t  |dd”}| ||”S )Nr³   zfma.rn.f16 $0,$1,$2,$3;z=h,h,h,h)r   rx   rm   r”   r9   )r    r   r!   r"   Zargtysrs   r¾   r   r   r   Śptx_hfma  s    rā   z{{
          .reg .pred __$$f16_cmp_tmp;
          setp.{op}.f16 __$$f16_cmp_tmp, $1, $2;
          selp.u16 $0, 1, 0, __$$f16_cmp_tmp;
        }}c                    s    fdd}|S )Nc           	         sr   t  t  d”t  d”t  d”g”}t  |tj dd”}| ||”}|  tj	d”}| 
|t  d””}| d||”S )Nr³   rŅ   rĻ   r   z!=)r   rm   rx   r”   Ś	_fp16_cmprO   r9   r6   r	   Zint16r   Zicmp_unsigned)	r    r   r!   r"   rs   r¾   ŚresultŚzeroZ
int_resultrŅ   r   r   Śptx_fp16_comparison  s    "z*_gen_fp16_cmp.<locals>.ptx_fp16_comparisonr   )rÓ   rę   r   rŅ   r   Ś_gen_fp16_cmp  s    rē   ŚeqŚneŚgeŚgtŚleŚltc                    s    t | tjtj fdd}d S )Nc                    s(   t  | |||}| ||d |d ”S )Nr   r   )rē   Śselect)r    r   r!   r"   ŚchoicerŅ   r   r   Śptx_fp16_minmax,  s    z*lower_fp16_minmax.<locals>.ptx_fp16_minmaxrÕ   )rŲ   rr   rÓ   rš   r   rŅ   r   Ślower_fp16_minmax+  s    rń   ŚmaxŚminZ
__nv_cbrtfZ	__nv_cbrtc           
      C   sF   |j }t| }|  |”}|j}t ||g”}t |||”}	| |	|”S r§   )	r?   Ś
cbrt_funcsr·   r8   r   rm   r
   ro   r9   )
r    r   r!   r"   r¼   rr   Zftyr:   rs   rŲ   r   r   r   Śptx_cbrt@  s    
rõ   c              	   C   s2   t  |jt t d”t d”f”d”}| ||”S )Nrv   Z	__nv_brev©r
   ro   r8   r   rm   rx   r9   ©r    r   r!   r"   rŲ   r   r   r   Śptx_brev_u4L  s    żrų   c              	   C   s2   t  |jt t d”t d”f”d”}| ||”S )Nr   Z__nv_brevllrö   r÷   r   r   r   Śptx_brev_u8X  s    żrł   c                 C   s   |  |d |  tjd””S r.   )Zctlzr6   r	   Śbooleanr   r   r   r   Śptx_clzd  s    žrū   c              	   C   s2   t  |jt t d”t d”f”d”}| ||”S )Nrv   Z__nv_ffsrö   r÷   r   r   r   Ś
ptx_ffs_32k  s    żrü   c              	   C   s2   t  |jt t d”t d”f”d”}| ||”S )Nrv   r   Z
__nv_ffsllrö   r÷   r   r   r   Ś
ptx_ffs_64u  s    żrż   c                 C   s   |\}}}|  |||”S r§   )rī   )r    r   r!   r"   ŚtestŚaŚbr   r   r   Śptx_selp  s    
r  c              	   C   s4   t  |jt t ” t ” t ” f”d”}| ||”S )NZ
__nv_fmaxf©r
   ro   r8   r   rm   r   r9   r÷   r   r   r   Ś
ptx_max_f4  s    žūr  c              
   C   sh   t  |jt t ” t ” t ” f”d”}| ||  ||d |jd t	j
”|  ||d |jd t	j
”g”S )NZ	__nv_fmaxr   r   ©r
   ro   r8   r   rm   r   r9   Ścastr"   r	   Śdoubler÷   r   r   r   Ś
ptx_max_f8  s    žūžr  c              	   C   s4   t  |jt t ” t ” t ” f”d”}| ||”S )NZ
__nv_fminfr  r÷   r   r   r   Ś
ptx_min_f4”  s    žūr  c              
   C   sh   t  |jt t ” t ” t ” f”d”}| ||  ||d |jd t	j
”|  ||d |jd t	j
”g”S )NZ	__nv_fminr   r   r  r÷   r   r   r   Ś
ptx_min_f8¬  s    žūžr	  c              	   C   sJ   t  |jt t d”t ” f”d”}| ||  ||d |j	d t
j”g”S )Nr   Z__nv_llrintr   )r
   ro   r8   r   rm   rx   r   r9   r  r"   r	   r  r÷   r   r   r   Ś	ptx_round½  s    žū’r
  c                 C   s   dd }|   ||||”S )Nc                 S   sĄ   t  | ”st  | ”r| S |dkrb|dkr:d|d  }d}nd| }d}| | | }t  |”rt| S nd|  }| | }t|}t  || ”dkrdt|d  }|dkr“|| | }n||9 }|S )Nr   é   g      $@gÕMĻšDg      š?g      ą?g       @)ŚmathŚisinfŚisnanŚroundŚfabs)r   ŚndigitsZpow1Zpow2r   r   r   r   r   Śround_ndigitsŅ  s(    

z$round_to_impl.<locals>.round_ndigits)Zcompile_internal)r    r   r!   r"   r  r   r   r   Śround_to_implĻ  s    !r  c                    s    fdd}|S )Nc                    s$   |j \}|  | ”}| ||d ”S r.   )r"   r6   Zfmul)r    r   r!   r"   ZargtyZfactor©Śconstr   r   Śimpl÷  s    zgen_deg_rad.<locals>.implr   )r  r  r   r  r   Śgen_deg_radö  s    r  g     f@c                    sV   |t jkr t j|dd}|g}ntj |t|d} fddt||D }||fS )z4
    Convert integer indices into tuple of intp
    r   )rV   rB   )rB   c                    s"   g | ]\}}   ||tj”qS r   )r  r	   Śintp)rb   ŚtŚi©r   r    r   r   rd     s   ’z&_normalize_indices.<locals>.<listcomp>)r	   Zinteger_domainrA   r
   Zunpack_tupleŚlenŚzip)r    r   ŚindtyŚindsŚindicesr   r  r   Ś_normalize_indices  s    
’r!  c                    s    fdd}|S )Nc                    s¢   |j \}}}|\}}}	|j}
t| |||\}}|
|krFtd|
|f |jt|krjtd|jt|f |  |”| ||}tj| ||||dd} | ||
||	S )Nzexpect %s but got %sz#indexing %d-D array with %d-D indexT)Z
wraparound)	r"   rV   r!  Ś	TypeErrorŚndimr  Ś
make_arrayr
   Śget_item_pointer)r    r   r!   r"   Śarytyr  ŚvaltyŚaryr  r»   rV   r   ŚlaryŚptr©Śdispatch_fnr   r   Śimp  s    
’’z_atomic_dispatcher.<locals>.impr   )r,  r-  r   r+  r   Ś_atomic_dispatcher  s    r.  c                 C   s`   |t jkr&|j}| t |”||f”S |t jkrL|j}| t |”||f”S | d||d”S d S )NrŚ   Ś	monotonic)	r	   r   r8   r9   r   Zdeclare_atomic_add_float32r   Zdeclare_atomic_add_float64Ś
atomic_rmw©r    r   rV   r*  r»   r:   r   r   r   Śptx_atomic_add_tuple,  s    
’
’r2  c                 C   s`   |t jkr&|j}| t |”||f”S |t jkrL|j}| t |”||f”S | d||d”S d S )NrŪ   r/  )	r	   r   r8   r9   r   Zdeclare_atomic_sub_float32r   Zdeclare_atomic_sub_float64r0  r1  r   r   r   Śptx_atomic_sub=  s    
’
’r3  c                 C   sP   |t jjkr<|j}|j}ttd| }| ||||f”S td| dd S )NZdeclare_atomic_inc_intzUnimplemented atomic inc with ś array©	r   ŚcudadeclZunsigned_int_numba_typesr   r8   Śgetattrr   r9   r"  ©r    r   rV   r*  r»   Zbwr:   rŲ   r   r   r   Śptx_atomic_incN  s    r9  c                 C   sP   |t jjkr<|j}|j}ttd| }| ||||f”S td| dd S )NZdeclare_atomic_dec_intzUnimplemented atomic dec with r4  r5  r8  r   r   r   Śptx_atomic_dec\  s    r:  c                    s@   t  fdd}tjtjtjfD ]}t| tj|tj| q d S )Nc                    s6   |t jjkr|  ||d”S td  d| dd S )Nr/  zUnimplemented atomic z with r4  ©r   r6  Śinteger_numba_typesr0  r"  ©r    r   rV   r*  r»   rŅ   r   r   Śimpl_ptx_atomick  s    z+ptx_atomic_bitwise.<locals>.impl_ptx_atomic)r.  r	   r  rA   ŚTuplerÖ   ŚArrayŚAny)ZstubrÓ   r>  r¼   r   rŅ   r   Śptx_atomic_bitwisej  s    rB  ŚandŚorŚxorc                 C   s0   |t jjkr| d||d”S td| dd S )NZxchgr/  zUnimplemented atomic exch with r4  r;  r=  r   r   r   Śptx_atomic_exch{  s    rF  c                 C   s   |j }|tjkr&| t |”||f”S |tjkrF| t |”||f”S |tjtj	fkrh|j
d||ddS |tjtjfkr|j
d||ddS td| d S ©Nrņ   r/  ©ZorderingZumaxz&Unimplemented atomic max with %s array)r8   r	   r   r9   r   Zdeclare_atomic_max_float64r   Zdeclare_atomic_max_float32r7   Śint64r0  Śuint32Śuint64r"  r1  r   r   r   Śptx_atomic_max  s    
’
’rL  c                 C   s   |j }|tjkr&| t |”||f”S |tjkrF| t |”||f”S |tjtj	fkrh|j
d||ddS |tjtjfkr|j
d||ddS td| d S ©Nró   r/  rH  Zuminz&Unimplemented atomic min with %s array)r8   r	   r   r9   r   Zdeclare_atomic_min_float64r   Zdeclare_atomic_min_float32r7   rI  r0  rJ  rK  r"  r1  r   r   r   Śptx_atomic_min  s    
’
’rN  c                 C   s   |j }|tjkr&| t |”||f”S |tjkrF| t |”||f”S |tjtj	fkrh|j
d||ddS |tjtjfkr|j
d||ddS td| d S rG  )r8   r	   r   r9   r   Zdeclare_atomic_nanmax_float64r   Zdeclare_atomic_nanmax_float32r7   rI  r0  rJ  rK  r"  r1  r   r   r   Śptx_atomic_nanmax®  s    
’
’rO  c                 C   s   |j }|tjkr&| t |”||f”S |tjkrF| t |”||f”S |tjtj	fkrh|j
d||ddS |tjtjfkr|j
d||ddS td| d S rM  )r8   r	   r   r9   r   Zdeclare_atomic_nanmin_float64r   Zdeclare_atomic_nanmin_float32r7   rI  r0  rJ  rK  r"  r1  r   r   r   Śptx_atomic_nanminĀ  s    
’
’rP  c                 C   s   |j \}}}|\}}}	|j}
|  |”| ||}|  tjd”}t | ||||f”}|jtj	j
kr|j}|jj}t ||||||	”S td|
 d S )Nr   z3Unimplemented atomic compare_and_swap with %s array)r"   rV   r$  r6   r	   r  r
   r%  r   r6  r<  r8   r   r   Zatomic_cmpxchgr"  )r    r   r!   r"   r&  Zoldtyr'  r(  Śoldr»   rV   r)  rå   r*  r:   r   r   r   r   Śptx_atomic_cas_tupleÖ  s    
’rR  c                 C   s@   t jt  t  ” t  d”g”dddd}|d }| ||g” d S )Nrv   znanosleep.u32 $0;rĘ   Tr   r   )r   r”   rm   rn   rx   r9   )r    r   r!   r"   Ś	nanosleepŚnsr   r   r   Śptx_nanosleepė  s      ’rU  Fc           "   	      s  t tj|d}|dko$|o$t|dk}|dkr:|s:td j| }	t|tjtj	fpjt|	t
jpj|tjk}
|tjkr|
std|   |”}t ||”}|tjkrøtj|||d}n|j}t ||||”}  |”}d|d  ” > |_|röd|_nt |tj”|_t  |t !d”|”}| "t #t !d”|””}| $||g”}t% &tj'”}  |”}| (|”}|}g }t)t*|D ]\}}| +|” ||9 }qtdd	 t*|D } fd
d	|D }|rtj,t -t !d”g ”dddd}| .| $|g ”t !d””}  /tj0|”}| 1||”g}n fdd	|D }t|}tj2||dd}   3| ” |}! j4|!| "||!j5j6”||  /tj0|”d d |! 7” S )Nr   r   zarray length <= 0zunsupported type: %srP   ZexternalrČ   c                 S   s   g | ]}|qS r   r   ra   r   r   r   rd   8  s     z"_generic_array.<locals>.<listcomp>c                    s   g | ]}   tj|”qS r   ©r6   r	   r  ra   ©r    r   r   rd   9  s     rv   zmov.u32 $0, %dynamic_smem_size;r   Tr   r   c                    s   g | ]}   tj|”qS r   rV  ra   rW  r   r   rd   K  s     ŚC)rV   r#  Zlayout)ŚdatarU   ŚstridesŚitemsizeZmeminfo)8r   ŚoperatorrH   r  rD   Zdata_model_managerr@   r	   ZRecordŚBooleanr   ZStructModelr×   Znumber_domainr"  Zget_data_typer   Z	ArrayTyper   rh   r
   Zalloca_oncer8   Zadd_global_variableZget_abi_sizeofŚ
bit_lengthŚalignŚlinkageŚConstantŚ	UndefinedZinitializerr   Zinsert_addrspace_convrx   r   ZPointerTyper9   ŚllZcreate_target_dataZdata_layoutZget_abi_sizeŚ	enumerateŚreversedŚappendr”   rm   r   r6   r  Zudivr@  r$  Zpopulate_arrayrY  ŚtypeZ	_getvalue)"r    r   rU   rV   rW   rX   rY   Z	elemcountZdynamic_smemZ
data_modelZother_supported_typeZlldtypeZlarytyZdataptrr:   Zgvmemr_  ŚconvZaddrspaceptrZ
targetdatar[  Z
laststrideZrstridesr  ZlastsizerZ  ZkstridesZget_dynshared_sizeZdynsmem_sizeZ	kitemsizeZkshaper#  r&  r(  r   rW  r   r[   ö  sz    

’ż


’



 ž’ūr[   c                 C   s   |   ” S r§   )rp   )r    r   r¼   Zpyvalr   r   r   Ścuda_dispatcher_const[  s    ri  )F)ĻŚ	functoolsr   r\  r  Zllvmliter   Zllvmlite.bindingZbindingrc  Znumba.core.imputilsr   r   Znumba.core.typing.npydeclr   r   Znumba.core.datamodelr   Z
numba.corer	   r
   Zcudadrvr   Znumbar   Z
numba.cudar   r   r   Znumba.cuda.typesr   r   r   ŚregistryrÖ   Zlower_getattrZ
lower_attrZlower_constantr   ŚModuler#   r%   r&   r(   r+   r-   r1   r3   r5   ZcgZ	this_gridr;   r=   Zgridr7   rG   rI   ZgridsizerL   r  Z
array_liker@  rM   rN   rR   ZsharedŚarrayZIntegerLiteralrA  r_   r?  rA   rf   Ślocalri   rj   Zsyncthreadsru   Zsyncthreads_countŚi4ry   Zsyncthreads_andrz   Zsyncthreads_orr{   Zthreadfence_blockr|   Zthreadfence_systemr}   Zthreadfencer~   Zsyncwarpr   r   Zshfl_sync_intrinsicr   Zf4Zf8r   Zvote_sync_intrinsicrś   r   Zmatch_any_syncr   Zmatch_all_syncr   r£   r¤   Zlanemask_ltr„   Zpopcr¦   rØ   r©   r²   r×   ZFloatræ   rĆ   rÉ   ZIntegerrĶ   rĪ   rŁ   Zfp16ZhaddZhsubZhmulZhnegrŻ   Zhabsrį   Zhfmarā   rć   rē   ZheqŚhneZhgeZhgtŚhleZhltrń   ZhmaxZhminr   r   rō   Zcbrtrõ   ZbrevZu4rų   Śu8rł   Zclzrū   Zffsrü   rż   Zselpr  rņ   r  r  ró   r  r	  r  r
  r  r  ŚpiZ_deg2radZ_rad2degŚradiansŚdegreesr!  r.  ZatomicrŚ   r  r2  rŪ   r3  Śincr9  Śdecr:  rB  Śand_r   rE  ZexchrF  rL  rN  ZnanmaxrO  ZnanminrP  Zcompare_and_swaprR  rS  rJ  rU  r[   ri  r   r   r   r   Ś<module>   sī  

















		
		












	’’’’.










	


  ž










%





 ’
e