o
    ç0 iä–  ã                
   @   sz  d dl mZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d
dlmZ d dlmZ d dlmZmZmZ d dlm Z m!Z! e
ƒ Z"e"j#Z#e"j$Z%e"j&Z&dd„ Z'e%e (e¡dƒdd„ ƒZ)e%e (e¡dƒdd„ ƒZ*e%e (e¡dƒdd„ ƒZ+e%e (e¡dƒdd„ ƒZ,e%e (e¡dƒdd„ ƒZ-e%e d ƒd!d"„ ƒZ.e%e d#ƒd$d%„ ƒZ/e%e d&ƒd'd(„ ƒZ0e#ej1j2ej3ƒd)d*„ ƒZ4d a5d+d,„ Z6e#ej7j8ej9ej:ƒd-d.„ ƒZ;e#ej7j8ej<ej:ƒe#ej7j8ej=ej:ƒd/d0„ ƒƒZ>e#ej?j8ej9ej:ƒd1d2„ ƒZ@e#ej?j8ej<ej:ƒe#ej?j8ej=ej:ƒd3d4„ ƒƒZAe#ejBƒd5d6„ ƒZCe#ejDƒd7d8„ ƒZEe#ejFƒd9d:„ ƒZGe#ejHƒd;d<„ ƒZIe#ejHejJƒd=d>„ ƒZKe#ejLejJejJejJejJejJƒe#ejLejJejJejMejJejJƒe#ejLejJejJejNejJejJƒe#ejLejJejJejOejJejJƒd?d@„ ƒƒƒƒZPe#ejQejJejJejRƒdAdB„ ƒZSe#ejTejJejJƒe#ejTejJejMƒe#ejTejJejNƒe#ejTejJejOƒdCdD„ ƒƒƒƒZUe#ejVejJejJƒe#ejVejJejMƒe#ejVejJejNƒe#ejVejJejOƒdEdF„ ƒƒƒƒZWe#ejXƒdGdH„ ƒZYe#ejZƒdIdJ„ ƒZ[e#ej\ej:ƒdKdL„ ƒZ]e#ej^ej:ej:ej:ƒdMdN„ ƒZ_dOdP„ Z`eejaejbƒdQdR„ ƒZceejbejaƒdSdT„ ƒZddUdV„ ZeeejaejfƒdWdX„ ƒZgeejfejaƒeej9ejaƒdYdZ„ ƒƒZhd[d\„ Zieiejjjkd]ƒ eiejld]ƒ eiejmd]ƒ eiejjjnd^ƒ eiejod^ƒ eiejpd^ƒ eiejjjqd_ƒ eiejrd_ƒ eiejsd_ƒ e#ejjjtejaƒd`da„ ƒZue#ejvejaƒdbdc„ ƒZwe#ejjjxejaƒddde„ ƒZye#ezejaƒdfdg„ ƒZ{e#ejjj|ejaejaejaƒdhdi„ ƒZ}e#ej~ejaejaƒe#ejejaejaƒdjdk„ ƒƒZ€dlZdmdn„ Z‚e#ejjjƒejaejaƒe‚doƒƒ e#ej„ejaejaƒe‚doƒƒ e#ejjj…ejaejaƒe‚dpƒƒ e#ej†ejaejaƒe‚dpƒƒ e#ejjj‡ejaejaƒe‚dqƒƒ e#ejˆejaejaƒe‚dqƒƒ e#ejjj‰ejaejaƒe‚drƒƒ e#ejŠejaejaƒe‚drƒƒ e#ejjj‹ejaejaƒe‚dsƒƒ e#ejŒejaejaƒe‚dsƒƒ e#ejjjejaejaƒe‚dtƒƒ e#ejŽejaejaƒe‚dtƒƒ dudv„ Zeejjjdwdrƒ eejjj‘dxdtƒ ej’dyej“dziZ”e#ej•ej’ƒe#ej•ej“ƒd{d|„ ƒƒZ–e#ej—ej˜ƒd}d~„ ƒZ™e#ej—ejšƒdd€„ ƒZ›e#ejœej:ƒdd‚„ ƒZe#ejžejJƒe#ejžej˜ƒdƒd„„ ƒƒZŸe#ejžejMƒe#ejžejšƒd…d†„ ƒƒZ e#ej¡ej:ej:ej:ƒd‡dˆ„ ƒZ¢e#e£ejNejNƒd‰dŠ„ ƒZ¤e#e£ejOejNƒe#e£ejNejOƒe#e£ejOejOƒd‹dŒ„ ƒƒƒZ¥e#e¦ejNejNƒddŽ„ ƒZ§e#e¦ejOejNƒe#e¦ejNejOƒe#e¦ejOejOƒdd„ ƒƒƒZ¨e#e©ejNƒe#e©ejOƒd‘d’„ ƒƒZªe#e©ejNejfƒe#e©ejOejfƒd“d”„ ƒƒZ«d•d–„ Z¬ej­d— Z®d—ej­ Z¯e#ej°ejNƒe¬e®ƒƒ e#ej°ejOƒe¬e®ƒƒ e#ej±ejNƒe¬e¯ƒƒ e#ej±ejOƒe¬e¯ƒƒ d˜d™„ Z²dšd›„ Z³e#ej´jlej3ejµej:ƒe#ej´jlej3ej=ej:ƒe#ej´jlej3ej<ej:ƒe³dœd„ ƒƒƒƒZ¶e#ej´joej3ejµej:ƒe#ej´joej3ej=ej:ƒe#ej´joej3ej<ej:ƒe³dždŸ„ ƒƒƒƒZ·e#ej´j¸ej3ejµej:ƒe#ej´j¸ej3ej=ej:ƒe#ej´j¸ej3ej<ej:ƒe³d d¡„ ƒƒƒƒZ¹e#ej´jºej3ejµej:ƒe#ej´jºej3ej=ej:ƒe#ej´jºej3ej<ej:ƒe³d¢d£„ ƒƒƒƒZ»d¤d¥„ Z¼e¼ej´j½d¦ƒ e¼ej´j¾d§ƒ e¼ej´j¿d¨ƒ e#ej´jÀej3ejµej:ƒe#ej´jÀej3ej=ej:ƒe#ej´jÀej3ej<ej:ƒe³d©dª„ ƒƒƒƒZÁe#ej´j£ej3ejµej:ƒe#ej´j£ej3ej<ej:ƒe#ej´j£ej3ej=ej:ƒe³d«d¬„ ƒƒƒƒZÂe#ej´j¦ej3ejµej:ƒe#ej´j¦ej3ej<ej:ƒe#ej´j¦ej3ej=ej:ƒe³d­d®„ ƒƒƒƒZÃe#ej´jÄej3ejµej:ƒe#ej´jÄej3ej<ej:ƒe#ej´jÄej3ej=ej:ƒe³d¯d°„ ƒƒƒƒZÅe#ej´jÆej3ejµej:ƒe#ej´jÆej3ej<ej:ƒe#ej´jÆej3ej=ej:ƒe³d±d²„ ƒƒƒƒZÇe#ej´jÈej3ej:ej:ƒd³d´„ ƒZÉe#ej´jÊej3ejµej:ej:ƒe#ej´jÊej3ej<ej:ej:ƒe#ej´jÊej3ej=ej:ej:ƒdµd¶„ ƒƒƒZËe#ejÌejÍƒd·d¸„ ƒZÎ	¹d¾dºd»„ZÏe&e!ƒd¼d½„ ƒZÐee Ñ¡ e#ƒ dS )¿é    )ÚreduceN)Úir)ÚRegistryÚ
lower_cast)Úparse_dtype)Úmodels)ÚtypesÚcgutils)Úufunc_db)Úregister_ufuncsé   )Únvvm)Úcuda)Ú	nvvmutilsÚstubsÚerrors)Údim3ÚCUDADispatcherc                 C   sB   t  | d| ¡}t  | d| ¡}t  | d| ¡}t | |||f¡S )Nz%s.xz%s.yz%s.z)r   Ú	call_sregr	   Zpack_struct)ÚbuilderÚprefixÚxÚyÚz© r   ú_/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/numba/cuda/cudaimpl.pyÚinitialize_dim3   s   r   Z	threadIdxc                 C   ó
   t |dƒS )NÚtid©r   ©Úcontextr   ÚsigÚargsr   r   r   Úcuda_threadIdx    ó   
r$   ZblockDimc                 C   r   )NZntidr   r    r   r   r   Úcuda_blockDim%   r%   r&   ZblockIdxc                 C   r   )NZctaidr   r    r   r   r   Úcuda_blockIdx*   r%   r'   ZgridDimc                 C   r   )NZnctaidr   r    r   r   r   Úcuda_gridDim/   r%   r(   Úlaneidc                 C   s   t  |d¡S )Nr)   )r   r   r    r   r   r   Úcuda_laneid4   ó   r*   r   c                 C   ó   |  |d¡S ©Nr   ©Úextract_valuer    r   r   r   Údim3_x9   r+   r0   r   c                 C   r,   )Nr   r.   r    r   r   r   Údim3_y>   r+   r1   r   c                 C   r,   )Né   r.   r    r   r   r   Údim3_zC   r+   r3   c                 C   s   |d S r-   r   r    r   r   r   Úcuda_const_array_likeJ   s   r4   c                 C   s   t d7 a d | t ¡S )zÍDue to bug with NVVM invalid internalizing of shared memory in the
    PTX output.  We can't mark shared memory to be internal. We have to
    ensure unique name is generated for shared memory symbol.
    r   z{0}_{1})Ú_unique_smem_idÚformat©Únamer   r   r   Ú_get_unique_smem_idT   s   r9   c              	   C   s8   |j d j}t|j d ƒ}t| ||f|tdƒtjddS )Nr   r   Ú_cudapy_smemT©ÚshapeÚdtypeÚsymbol_nameÚ	addrspaceÚcan_dynsized)r#   Úliteral_valuer   Ú_generic_arrayr9   r   ÚADDRSPACE_SHARED©r!   r   r"   r#   Úlengthr=   r   r   r   Úcuda_shared_array_integer^   s   ýrF   c              	   C   s>   dd„ |j d D ƒ}t|j d ƒ}t| |||tdƒtjddS )Nc                 S   ó   g | ]}|j ‘qS r   ©rA   ©Ú.0Úsr   r   r   Ú
<listcomp>k   ó    z+cuda_shared_array_tuple.<locals>.<listcomp>r   r   r:   Tr;   )r#   r   rB   r9   r   rC   ©r!   r   r"   r#   r<   r=   r   r   r   Úcuda_shared_array_tupleh   s   
ýrO   c              	   C   s4   |j d j}t|j d ƒ}t| ||f|dtjddS )Nr   r   Ú_cudapy_lmemFr;   )r#   rA   r   rB   r   ÚADDRSPACE_LOCALrD   r   r   r   Úcuda_local_array_integers   s   ýrR   c              	   C   s:   dd„ |j d D ƒ}t|j d ƒ}t| |||dtjddS )Nc                 S   rG   r   rH   rI   r   r   r   rL   €   rM   z(ptx_lmem_alloc_array.<locals>.<listcomp>r   r   rP   Fr;   )r#   r   rB   r   rQ   rN   r   r   r   Úptx_lmem_alloc_array}   s   
ýrS   c                 C   óD   |rJ ‚d}|j }t t ¡ d¡}t |||¡}| |d¡ |  ¡ S )Nzllvm.nvvm.membar.ctar   ©Úmoduler   ÚFunctionTypeÚVoidTyper	   Úget_or_insert_functionÚcallÚget_dummy_value©r!   r   r"   r#   ÚfnameÚlmodÚfntyÚsyncr   r   r   Úptx_threadfence_blockˆ   ó   ra   c                 C   rT   )Nzllvm.nvvm.membar.sysr   rU   r\   r   r   r   Úptx_threadfence_system“   rb   rc   c                 C   rT   )Nzllvm.nvvm.membar.glr   rU   r\   r   r   r   Úptx_threadfence_devicež   rb   rd   c                 C   s*   |   tjd¡}t tj¡}t| |||gƒS )Nl   ÿÿ )Úget_constantr   Úint32ÚnoneÚptx_syncwarp_mask)r!   r   r"   r#   ÚmaskZmask_sigr   r   r   Úptx_syncwarp©   s   rj   c                 C   sD   d}|j }t t ¡ t d¡f¡}t |||¡}| ||¡ |  ¡ S )Nzllvm.nvvm.bar.warp.syncé    )	rV   r   rW   rX   ÚIntTyper	   rY   rZ   r[   r\   r   r   r   rh   °   s   rh   c              
   C   sü  |\}}}}}|j d }	|	tjv r| |t |	j¡¡}d}
|j}t t 	t d¡t d¡f¡t d¡t d¡t d¡t d¡t d¡f¡}t
 |||
¡}|	jdkr| ||||||f¡}|	tjkr}| |d¡}| |d¡}| |t ¡ ¡}t
 |||f¡}|S | |t d¡¡}| ||  tjd¡¡}| |t d¡¡}| ||||||f¡}| ||||||f¡}| |d¡}| |d¡}| |d¡}| |t d¡¡}| |t d¡¡}| ||  tjd¡¡}| ||¡}|	tjkrô| |t ¡ ¡}t
 |||f¡}|S )a  
    The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic
    function supports both 32 and 64 bit ints and floats, so for feature parity,
    i64, f32, and f64 are implemented. Floats by way of bitcasting the float to
    an int, then shuffling, then bitcasting back. And 64-bit values by packing
    them into 2 32bit values, shuffling thoose, and then packing back together.
    r2   zllvm.nvvm.shfl.sync.i32rk   r   r   é@   )r#   r   Úreal_domainÚbitcastr   rl   ÚbitwidthrV   rW   ÚLiteralStructTyper	   rY   rZ   Úfloat32r/   Ú	FloatTypeZmake_anonymous_structÚtruncZlshrre   Úi8ÚzextZshlÚor_Úfloat64Ú
DoubleType)r!   r   r"   r#   ri   ÚmodeÚvalueÚindexÚclampÚ
value_typer]   r^   r_   ÚfuncÚretÚrvÚpredZfvZvalue1Z
value_lshrZvalue2Zret1Zret2Zrv1Zrv2Zrv1_64Zrv2_64Zrv_shlr   r   r   Úptx_shfl_sync_i32º   sJ   

ÿþ

ñ
rƒ   c                 C   s^   d}|j }t t t d¡t d¡f¡t d¡t d¡t d¡f¡}t |||¡}| ||¡S )Nzllvm.nvvm.vote.syncrk   r   )rV   r   rW   rq   rl   r	   rY   rZ   )r!   r   r"   r#   r]   r^   r_   r   r   r   r   Úptx_vote_syncð   s   ÿþr„   c                 C   s†   |\}}|j d j}|j d tjv r| |t |¡¡}d |¡}|j}t 	t d¡t d¡t |¡f¡}	t
 ||	|¡}
| |
||f¡S )Nr   zllvm.nvvm.match.any.sync.i{}rk   )r#   rp   r   rn   ro   r   rl   r6   rV   rW   r	   rY   rZ   ©r!   r   r"   r#   ri   r{   Úwidthr]   r^   r_   r   r   r   r   Úptx_match_any_syncû   s   
"r‡   c                 C   s–   |\}}|j d j}|j d tjv r| |t |¡¡}d |¡}|j}t 	t 
t d¡t d¡f¡t d¡t |¡f¡}	t ||	|¡}
| |
||f¡S )Nr   zllvm.nvvm.match.all.sync.i{}rk   )r#   rp   r   rn   ro   r   rl   r6   rV   rW   rq   r	   rY   rZ   r…   r   r   r   Úptx_match_all_sync  s   
ÿþrˆ   c                 C   ó,   t jt  t  d¡g ¡dddd}| |g ¡S )Nrk   zactivemask.b32 $0;ú=rT©Zside_effect©r   Ú	InlineAsmrW   rl   rZ   ©r!   r   r"   r#   Ú
activemaskr   r   r   Úptx_activemask  s   ÿr   c                 C   r‰   )Nrk   zmov.u32 $0, %lanemask_lt;rŠ   Tr‹   rŒ   rŽ   r   r   r   Úptx_lanemask_lt$  s
   þr‘   c                 C   s   |  |d ¡S r-   )Zctpopr    r   r   r   Úptx_popc,  ó   r’   c                 C   s
   |j |Ž S ©N)Úfmar    r   r   r   Úptx_fma1  r%   r–   c                 C   s:   dddœ}z||  W S  t y   d| › d}t |¡‚w )N)Zf32Úf)Zf64Úd)rk   rm   z$Conversion between float16 and floatú unsupported©ÚKeyErrorr   ZCudaLoweringError©rp   ÚtypemapÚmsgr   r   r   Úfloat16_float_ty_constraint6  s   


þrŸ   c           	      C   sd   |j |j kr|S t|j ƒ\}}t |  |¡t d¡g¡}t |d|› dd|› d¡}| ||g¡S )Né   zcvt.ú.f16 $0, $1;ú=ú,h)rp   rŸ   r   rW   Úget_value_typerl   r   rZ   ©	r!   r   ÚfromtyÚtotyÚvalÚtyÚ
constraintr_   Úasmr   r   r   Úfloat16_to_float_cast@  s   r¬   c           	      C   sb   |j |j kr|S t|j ƒ\}}t t d¡|  |¡g¡}t |d|› dd|› ¡}| ||g¡S )Nr    úcvt.rn.f16.ú $0, $1;ú=h,)rp   rŸ   r   rW   rl   r¤   r   rZ   r¥   r   r   r   Úfloat_to_float16_castL  s   r°   c                 C   s>   dddddœ}z||  W S  t y   d| › d}t |¡‚w )NÚcÚhÚrÚl)é   r    rk   rm   z"Conversion between float16 and intr™   rš   rœ   r   r   r   Úfloat16_int_constraintX  s   

þr¶   c           
      C   sf   |j }t|ƒ}|jrdnd}t |  |¡t d¡g¡}t |d|› |› dd|› d¡}	| |	|g¡S )NrK   Úur    zcvt.rni.r¡   r¢   r£   )	rp   r¶   Úsignedr   rW   r¤   rl   r   rZ   ©
r!   r   r¦   r§   r¨   rp   rª   Z
signednessr_   r«   r   r   r   Úfloat16_to_integer_castb  s   
þrº   c           
      C   sd   |j }t|ƒ}|jrdnd}t t d¡|  |¡g¡}t |d|› |› dd|› ¡}	| |	|g¡S )NrK   r·   r    r­   r®   r¯   )	rp   r¶   r¸   r   rW   rl   r¤   r   rZ   r¹   r   r   r   Úinteger_to_float16_casto  s   
ÿþr»   c                    s    t | tjtjƒ‡ fdd„ƒ}d S )Nc                    sB   t  t  d¡t  d¡t  d¡g¡}t  |ˆ › dd¡}| ||¡S )Nr    z.f16 $0,$1,$2;ú=h,h,h©r   rW   rl   r   rZ   ©r!   r   r"   r#   r_   r«   ©Úopr   r   Úptx_fp16_binary  s
   ÿz*lower_fp16_binary.<locals>.ptx_fp16_binary©Úlowerr   Úfloat16)ÚfnrÀ   rÁ   r   r¿   r   Úlower_fp16_binary~  ó   rÆ   ÚaddÚsubÚmulc                 C   ó4   t  t  d¡t  d¡g¡}t  |dd¡}| ||¡S )Nr    zneg.f16 $0, $1;ú=h,hr½   r¾   r   r   r   Úptx_fp16_hneg’  ó   rÍ   c                 C   ó   t | |||ƒS r”   )rÍ   r    r   r   r   Úoperator_hneg™  r“   rÐ   c                 C   rË   )Nr    zabs.f16 $0, $1;rÌ   r½   r¾   r   r   r   Úptx_fp16_habsž  rÎ   rÑ   c                 C   rÏ   r”   )rÑ   r    r   r   r   Úoperator_habs¥  r“   rÒ   c                 C   sH   t  d¡t  d¡t  d¡g}t  t  d¡|¡}t  |dd¡}| ||¡S )Nr    zfma.rn.f16 $0,$1,$2,$3;z=h,h,h,h)r   rl   rW   r   rZ   )r!   r   r"   r#   Zargtysr_   r«   r   r   r   Úptx_hfmaª  s   rÓ   c                 C   ó   dd„ }|   ||||¡S )Nc                 S   s   t j | |¡S r”   )r   Úfp16Zhdiv)r   r   r   r   r   Úfp16_divµ  s   zfp16_div_impl.<locals>.fp16_div©Zcompile_internal)r!   r   r"   r#   rÖ   r   r   r   Úfp16_div_impl²  s   rØ   z’{{
          .reg .pred __$$f16_cmp_tmp;
          setp.{op}.f16 __$$f16_cmp_tmp, $1, $2;
          selp.u16 $0, 1, 0, __$$f16_cmp_tmp;
        }}c                    ó   ‡ fdd„}|S )Nc           	         sr   t  t  d¡t  d¡t  d¡g¡}t  |tjˆ dd¡}| ||¡}|  tj	d¡}| 
|t  d¡¡}| d||¡S )Nr    r¿   r¼   r   z!=)r   rW   rl   r   Ú	_fp16_cmpr6   rZ   re   r   Zint16ro   Zicmp_unsigned)	r!   r   r"   r#   r_   r«   ÚresultÚzeroZ
int_resultr¿   r   r   Úptx_fp16_comparisonÃ  s   "z*_gen_fp16_cmp.<locals>.ptx_fp16_comparisonr   )rÀ   rÝ   r   r¿   r   Ú_gen_fp16_cmpÂ  s   rÞ   ÚeqÚneÚgeÚgtÚleÚltc                    s    t | tjtjƒ‡ fdd„ƒ}d S )Nc                    s(   t ˆ ƒ| |||ƒ}| ||d |d ¡S )Nr   r   )rÞ   Úselect)r!   r   r"   r#   Úchoicer¿   r   r   Úptx_fp16_minmaxÝ  s   z*lower_fp16_minmax.<locals>.ptx_fp16_minmaxrÂ   )rÅ   r]   rÀ   rç   r   r¿   r   Úlower_fp16_minmaxÜ  rÇ   rè   ÚmaxÚminZ
__nv_cbrtfZ	__nv_cbrtc           
      C   sF   |j }t| }|  |¡}|j}t ||g¡}t |||¡}	| |	|¡S r”   )	Úreturn_typeÚ
cbrt_funcsr¤   rV   r   rW   r	   rY   rZ   )
r!   r   r"   r#   r©   r]   Zftyr^   r_   rÅ   r   r   r   Úptx_cbrtñ  s   
rí   c              	   C   ó2   t  |jt t d¡t d¡f¡d¡}| ||¡S )Nrk   Z	__nv_brev©r	   rY   rV   r   rW   rl   rZ   ©r!   r   r"   r#   rÅ   r   r   r   Úptx_brev_u4ý  ó   ýrñ   c              	   C   rî   )Nrm   Z__nv_brevllrï   rð   r   r   r   Úptx_brev_u8	  rò   ró   c                 C   s   |  |d |  tjd¡¡S r-   )Zctlzre   r   Úbooleanr    r   r   r   Úptx_clz  s   þrõ   c              	   C   rî   )Nrk   Z__nv_ffsrï   rð   r   r   r   Ú
ptx_ffs_32  ó   ýrö   c              	   C   s2   t  |jt t d¡t d¡f¡d¡}| ||¡S )Nrk   rm   Z
__nv_ffsllrï   rð   r   r   r   Ú
ptx_ffs_64&  r÷   rø   c                 C   s   |\}}}|  |||¡S r”   )rå   )r!   r   r"   r#   ÚtestÚaÚbr   r   r   Úptx_selp0  s   
rü   c              	   C   ó4   t  |jt t ¡ t ¡ t ¡ f¡d¡}| ||¡S )NZ
__nv_fmaxf©r	   rY   rV   r   rW   rs   rZ   rð   r   r   r   Ú
ptx_max_f46  ó   þûrÿ   c              
   C   óh   t  |jt t ¡ t ¡ t ¡ f¡d¡}| ||  ||d |jd t	j
¡|  ||d |jd t	j
¡g¡S )NZ	__nv_fmaxr   r   ©r	   rY   rV   r   rW   ry   rZ   Úcastr#   r   Údoublerð   r   r   r   Ú
ptx_max_f8A  ó   þûþr  c              	   C   rý   )NZ
__nv_fminfrþ   rð   r   r   r   Ú
ptx_min_f4R  r   r  c              
   C   r  )NZ	__nv_fminr   r   r  rð   r   r   r   Ú
ptx_min_f8]  r  r  c              	   C   sJ   t  |jt t d¡t ¡ f¡d¡}| ||  ||d |j	d t
j¡g¡S )Nrm   Z__nv_llrintr   )r	   rY   rV   r   rW   rl   ry   rZ   r  r#   r   r  rð   r   r   r   Ú	ptx_roundn  s   þûÿr	  c                 C   rÔ   )Nc                 S   sÂ   t  | ¡s
t  | ¡r| S |dkr1|dkrd|d  }d}nd| }d}| | | }t  |¡r0| S n	d|  }| | }t|ƒ}t  || ¡dkrOdt|d ƒ }|dkr[|| | }|S ||9 }|S )Nr   é   g      $@g’ÕMÏð€Dg      ð?g      à?g       @)ÚmathÚisinfÚisnanÚroundÚfabs)r   ÚndigitsZpow1Zpow2r   r   r   r   r   Úround_ndigitsƒ  s,   
ÿ
þz$round_to_impl.<locals>.round_ndigitsr×   )r!   r   r"   r#   r  r   r   r   Úround_to_impl€  s   !r  c                    rÙ   )Nc                    s$   |j \}|  |ˆ ¡}| ||d ¡S r-   )r#   re   Zfmul)r!   r   r"   r#   ZargtyÚfactor©Úconstr   r   Úimpl¨  s   zgen_deg_rad.<locals>.implr   )r  r  r   r  r   Úgen_deg_rad§  s   r  g     €f@c                    s˜   |t jv rt j|dd}|g}n
tjˆ |t|ƒd}‡ ‡fdd„t||ƒD ƒ}|j}||kr6td||f ƒ‚|j	t|ƒkrHtd|j	t|ƒf ƒ‚||fS )z4
    Convert integer indices into tuple of intp
    r   )r=   Úcount)r  c                    s"   g | ]\}}ˆ  ˆ ||tj¡‘qS r   )r  r   Úintp)rJ   ÚtÚi©r   r!   r   r   rL   À  s    ÿz&_normalize_indices.<locals>.<listcomp>zexpect %s but got %sz#indexing %d-D array with %d-D index)
r   Zinteger_domainÚUniTupler	   Zunpack_tupleÚlenÚzipr=   Ú	TypeErrorÚndim)r!   r   ÚindtyÚindsÚarytyÚvaltyÚindicesr=   r   r  r   Ú_normalize_indices·  s   
ÿÿr'  c                    rÙ   )Nc                    sj   |j \}}}|\}}}	|j}
t| |||||ƒ\}}|  |¡| ||ƒ}tj| ||||dd}ˆ | ||
||	ƒS )NT©Z
wraparound)r#   r=   r'  Ú
make_arrayr	   Úget_item_pointer)r!   r   r"   r#   r$  r"  r%  Úaryr#  r¨   r=   r&  ÚlaryÚptr©Údispatch_fnr   r   ÚimpÏ  s   

ÿÿz_atomic_dispatcher.<locals>.impr   )r/  r0  r   r.  r   Ú_atomic_dispatcherÎ  s   r1  c                 C   ó\   |t jkr|j}| t |¡||f¡S |t jkr&|j}| t |¡||f¡S | d||d¡S )NrÈ   Ú	monotonic)	r   rr   rV   rZ   r   Zdeclare_atomic_add_float32rx   Zdeclare_atomic_add_float64Ú
atomic_rmw©r!   r   r=   r-  r¨   r^   r   r   r   Úptx_atomic_add_tupleà  ó   
ÿ
ÿr6  c                 C   r2  )NrÉ   r3  )	r   rr   rV   rZ   r   Zdeclare_atomic_sub_float32rx   Zdeclare_atomic_sub_float64r4  r5  r   r   r   Úptx_atomic_subñ  r7  r8  c                 C   óL   |t jjv r|j}|j}ttd|› ƒ}| ||ƒ||f¡S td|› dƒ‚)NZdeclare_atomic_inc_intzUnimplemented atomic inc with ú array©	r   ÚcudadeclZunsigned_int_numba_typesrp   rV   Úgetattrr   rZ   r   ©r!   r   r=   r-  r¨   Úbwr^   rÅ   r   r   r   Úptx_atomic_inc  ó   r@  c                 C   r9  )NZdeclare_atomic_dec_intzUnimplemented atomic dec with r:  r;  r>  r   r   r   Úptx_atomic_dec  rA  rB  c                    s@   t ‡ fdd„ƒ}tjtjtjfD ]}t| tj|tjƒ|ƒ qd S )Nc                    s2   |t jjv r| ˆ ||d¡S tdˆ › d|› dƒ‚)Nr3  zUnimplemented atomic z with r:  ©r   r<  Úinteger_numba_typesr4  r   ©r!   r   r=   r-  r¨   r¿   r   r   Úimpl_ptx_atomic  s   z+ptx_atomic_bitwise.<locals>.impl_ptx_atomic)r1  r   r  r  ÚTuplerÃ   ÚArrayÚAny)ZstubrÀ   rF  r©   r   r¿   r   Úptx_atomic_bitwise  s
   ÿrJ  ÚandÚorÚxorc                 C   s,   |t jjv r| d||d¡S td|› dƒ‚)NZxchgr3  zUnimplemented atomic exch with r:  rC  rE  r   r   r   Úptx_atomic_exch/  s   rN  c                 C   ó–   |j }|tjkr| t |¡||f¡S |tjkr#| t |¡||f¡S |tjtj	fv r4|j
d||ddS |tjtjfv rE|j
d||ddS td| ƒ‚©Nré   r3  ©ZorderingZumaxz&Unimplemented atomic max with %s array)rV   r   rx   rZ   r   Zdeclare_atomic_max_float64rr   Zdeclare_atomic_max_float32rf   Úint64r4  Úuint32Úuint64r   r5  r   r   r   Úptx_atomic_max:  ó   
ÿ
ÿrU  c                 C   rO  ©Nrê   r3  rQ  Zuminz&Unimplemented atomic min with %s array)rV   r   rx   rZ   r   Zdeclare_atomic_min_float64rr   Zdeclare_atomic_min_float32rf   rR  r4  rS  rT  r   r5  r   r   r   Úptx_atomic_minN  rV  rX  c                 C   rO  rP  )rV   r   rx   rZ   r   Zdeclare_atomic_nanmax_float64rr   Zdeclare_atomic_nanmax_float32rf   rR  r4  rS  rT  r   r5  r   r   r   Úptx_atomic_nanmaxb  rV  rY  c                 C   rO  rW  )rV   r   rx   rZ   r   Zdeclare_atomic_nanmin_float64rr   Zdeclare_atomic_nanmin_float32rf   rR  r4  rS  rT  r   r5  r   r   r   Úptx_atomic_nanminv  rV  rZ  c                 C   sT   |  |jd tj|jd |jd ¡}|d |  tjd¡|d |d f}t| |||ƒS )Nr   r   r2   )rë   r#   r   r  re   Úptx_atomic_casr    r   r   r   Úptx_atomic_compare_and_swapŠ  s   $"r\  c                 C   s–   |j \}}}}|\}}	}
}t| |||	||ƒ\}}|  |¡| ||ƒ}tj| ||||dd}|jtjjv rD|j	}|jj
}t |||||
|¡S td|j ƒ‚)NTr(  z&Unimplemented atomic cas with %s array)r#   r'  r)  r	   r*  r=   r   r<  rD  rV   rp   r   Zatomic_cmpxchgr   )r!   r   r"   r#   r$  r"  Zoldtyr%  r+  r#  Úoldr¨   r&  r,  r-  r^   rp   r   r   r   r[  ‘  s   ÿÿr[  c                 C   s@   t jt  t  ¡ t  d¡g¡dddd}|d }| ||g¡ d S )Nrk   znanosleep.u32 $0;r³   Tr‹   r   )r   r   rW   rX   rl   rZ   )r!   r   r"   r#   Ú	nanosleepÚnsr   r   r   Úptx_nanosleep©  s
   ÿr`  Fc               	      sb  t tj|dƒ}|dko|ot|ƒdk}|dkr|stdƒ‚ˆ j| }	t|tjtj	fƒp5t|	t
jƒp5|tjk}
|tjvrC|
sCtd| ƒ‚ˆ  |¡}t ||¡}|tjkr\tj|||d}n4|j}t ||||¡}ˆ  |¡}d|d  ¡ > |_|r{d|_nt |tj¡|_| |t  t !d¡¡d¡}t" #t $¡ j%¡}ˆ  |¡}| &|¡}|}g }t't(|ƒƒD ]\}}| )|¡ ||9 }q¬d	d
„ t(|ƒD ƒ}‡ fdd
„|D ƒ}|rútj*t +t !d¡g ¡dddd}| ,| -|g ¡t !d¡¡}ˆ  .tj/|¡}| 0||¡g}n	‡ fdd
„|D ƒ}t|ƒ}tj1||dd}ˆ  2|¡ˆ |ƒ}ˆ j3|| 4||j5j6¡||ˆ  .tj/|¡d d | 7¡ S )Nr   r   zarray length <= 0zunsupported type: %sr7   Zexternalrµ   Zgenericc                 S   s   g | ]}|‘qS r   r   rI   r   r   r   rL   õ  s    z"_generic_array.<locals>.<listcomp>c                    ó   g | ]	}ˆ   tj|¡‘qS r   ©re   r   r  rI   ©r!   r   r   rL   ö  ó    rk   zmov.u32 $0, %dynamic_smem_size;rŠ   Tr‹   rm   c                    ra  r   rb  rI   rc  r   r   rL     rd  ÚC)r=   r!  Zlayout)Údatar<   ÚstridesÚitemsizeZmeminfo)8r   ÚoperatorrÊ   r  Ú
ValueErrorZdata_model_managerÚ
isinstancer   ZRecordÚBooleanr   ZStructModelrÄ   Znumber_domainr   Zget_data_typer   Ú	ArrayTyper   rQ   r	   Zalloca_oncerV   Zadd_global_variableZget_abi_sizeofÚ
bit_lengthÚalignÚlinkageÚConstantÚ	UndefinedZinitializerZaddrspacecastZPointerTyperl   ÚllZcreate_target_dataZNVVMZdata_layoutZget_abi_sizeÚ	enumerateÚreversedÚappendr   rW   rv   rZ   re   r  ZudivrH  r)  Zpopulate_arrayro   rf  ÚtypeZ	_getvalue) r!   r   r<   r=   r>   r?   r@   Z	elemcountZdynamic_smemZ
data_modelZother_supported_typeZlldtypeZlarytyZdataptrr^   Zgvmemro  Z
targetdatarh  Z
laststrideZrstridesr  Zlastsizerg  ZkstridesZget_dynshared_sizeZdynsmem_sizeZ	kitemsizeZkshaper!  r$  r+  r   rc  r   rB   ´  sx   

ÿý


ÿ
ÿ



þÿûrB   c                 C   s   |   ¡ S r”   )r[   )r!   r   r©   Zpyvalr   r   r   Úcuda_dispatcher_const  s   rx  )F)ÒÚ	functoolsr   ri  r  Zllvmliter   Zllvmlite.bindingZbindingrs  Znumba.core.imputilsr   r   Znumba.core.typing.npydeclr   Znumba.core.datamodelr   Z
numba.corer   r	   Znumba.npr
   Znumba.np.npyimplr   Zcudadrvr   Znumbar   Z
numba.cudar   r   r   Znumba.cuda.typesr   r   ÚregistryrÃ   Zlower_getattrZ
lower_attrZlower_constantr   ÚModuler$   r&   r'   r(   r*   r0   r1   r3   r  Z
array_likerH  r4   r5   r9   ZsharedÚarrayZIntegerLiteralrI  rF   rG  r  rO   ÚlocalrR   rS   Zthreadfence_blockra   Zthreadfence_systemrc   Zthreadfencerd   Zsyncwarprj   Úi4rh   Zshfl_sync_intrinsicru   Zf4Zf8rƒ   Zvote_sync_intrinsicrô   r„   Zmatch_any_syncr‡   Zmatch_all_syncrˆ   r   r   Zlanemask_ltr‘   Zpopcr’   r•   r–   rŸ   rÄ   ÚFloatr¬   r°   r¶   ÚIntegerrº   r»   rÆ   rÕ   ZhaddrÈ   ÚiaddZhsubrÉ   ÚisubZhmulrÊ   ÚimulZhnegrÍ   ÚnegrÐ   ZhabsrÑ   ÚabsrÒ   ZhfmarÓ   ÚtruedivÚitruedivrØ   rÚ   rÞ   Zheqrß   Úhnerà   Zhgerá   Zhgtrâ   Úhlerã   Zhlträ   rè   ZhmaxZhminrr   rx   rì   Zcbrtrí   ZbrevZu4rñ   Úu8ró   Zclzrõ   Zffsrö   rø   Zselprü   ré   rÿ   r  rê   r  r  r  r	  r  r  ÚpiZ_deg2radZ_rad2degÚradiansÚdegreesr'  r1  Zatomicr  r6  r8  Úincr@  ÚdecrB  rJ  Úand_rw   rM  ZexchrN  rU  rX  ZnanmaxrY  ZnanminrZ  Zcompare_and_swapr\  Zcasr[  r^  rS  r`  rB   rx  Z
get_ufuncsr   r   r   r   Ú<module>   sú   










		
		







	ÿÿÿÿ.










	





þ










%






ÿd
