o
    0 i6                     @   s  d dl mZ d dlmZ d dlmZ d dlmZ d dlm	Z
 d dlmZ d dlmZ g dZd	d
dZdd ZG dd dejZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZedZedZe Ze Ze Ze Z d S )!    )runtime)_compile)_cuda_types)BuiltinFunc)Constant)Data)wraps_class_method)	this_gridthis_thread_blocksyncwait
wait_priormemcpy_asyncz[#include <cuda/barrier>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
z,#include <cooperative_groups/memcpy_async.h>)cgcg_memcpy_asyncc                 C   sH   t | jd| }|du r"| jjt|  t| jd| d d S d S )Ninclude_FT)getattr	generatedcodesappend_header_to_codesetattr)envheaderflag r   X/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/cupyx/jit/cg.py_check_include   s
   r   c                   @   s,   e Zd ZdZdZdd Zdd Zdd ZdS )	_ThreadGroupz( Base class for all cooperative groups. Nc                 C   s   t N)NotImplementedErrorselfr   r   r   __init__&   s   z_ThreadGroup.__init__c                 C   s   | j  S r   
child_typer!   r   r   r   __str__)   s   z_ThreadGroup.__str__c                 C      t |d t|j dtjS )Nr   z.sync()r   _Datacoder   voidr"   r   instancer   r   r   _sync,   s   
z_ThreadGroup._sync)__name__
__module____qualname____doc__r%   r#   r&   r.   r   r   r   r   r   !   s    r   c                       s   e Zd ZdZdd Zedd Ze fddZedd	 Zed
d Z	edd Z
edd Zedd Zedd Zedd Zedd Z  ZS )
_GridGroupa  A handle to the current grid group. Must be created via :func:`this_grid`.

    .. seealso:: `CUDA Grid Group API`_, :class:`numba.cuda.cg.GridGroup`

    .. _CUDA Grid Group API:
        https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#grid-group-cg
    c                 C   
   d| _ d S )Nzcg::grid_groupr$   r!   r   r   r   r#   :      
z_GridGroup.__init__c                 C   r'   )zU
        is_valid()

        Returns whether the grid_group can synchronize.
        r   z.is_valid())r   r)   r*   r   Zbool_r,   r   r   r   is_valid=      
z_GridGroup.is_validc                    s   d|j _t ||S )z
        sync()

        Synchronize the threads named in the group.

        .. seealso:: :meth:`numba.cuda.cg.GridGroup.sync`
        T)r   Z	enable_cgsuperr.   r,   	__class__r   r   r   G   s   z_GridGroup.syncc                 C   r'   z`
        thread_rank()

        Rank of the calling thread within ``[0, num_threads)``.
        r   z.thread_rank()r   r)   r*   r   uint64r,   r   r   r   thread_rankU   r7   z_GridGroup.thread_rankc                 C   2   t  dk r
tdt|d t|j dtjS )z]
        block_rank()

        Rank of the calling block within ``[0, num_blocks)``.
        4+  z'block_rank() is supported on CUDA 11.6+r   z.block_rank()_runtime_getLocalRuntimeVersionRuntimeErrorr   r)   r*   r   r=   r,   r   r   r   
block_rank_      
z_GridGroup.block_rankc                 C   r?   zN
        num_threads()

        Total number of threads in the group.
        r@   z(num_threads() is supported on CUDA 11.6+r   z.num_threads()rA   r,   r   r   r   num_threadsk   rF   z_GridGroup.num_threadsc                 C   r?   )zL
        num_blocks()

        Total number of blocks in the group.
        r@   z'num_blocks() is supported on CUDA 11.6+r   z.num_blocks()rA   r,   r   r   r   
num_blocksw   rF   z_GridGroup.num_blocksc                 C   r?   )z[
        dim_blocks()

        Dimensions of the launched grid in units of blocks.
        r@   z'dim_blocks() is supported on CUDA 11.6+r   z.dim_blocks()rB   rC   rD   r   r)   r*   r   dim3r,   r   r   r   
dim_blocks   rF   z_GridGroup.dim_blocksc                 C   r?   )zc
        block_index()

        3-Dimensional index of the block within the launched grid.
        r@   z(block_index() is supported on CUDA 11.6+r   z.block_index()rJ   r,   r   r   r   block_index   rF   z_GridGroup.block_indexc                 C   r'   zG
        size()

        Total number of threads in the group.
        r   z.size()r<   r,   r   r   r   size      
z_GridGroup.sizec                 C   r'   )zZ
        group_dim()

        Dimensions of the launched grid in units of blocks.
        r   .group_dim()r   r)   r*   r   rK   r,   r   r   r   	group_dim   rP   z_GridGroup.group_dim)r/   r0   r1   r2   r#   _wraps_class_methodr6   r   r>   rE   rH   rI   rL   rM   rO   rS   __classcell__r   r   r9   r   r3   1   s.    
	
	






r3   c                       s   e Zd ZdZdd Ze fddZedd Zedd	 Zed
d Z	edd Z
edd Zedd Zedd Z  ZS )_ThreadBlockGroupa  A handle to the current thread block group. Must be
    created via :func:`this_thread_block`.

    .. seealso:: `CUDA Thread Block Group API`_

    .. _CUDA Thread Block Group API:
        https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#thread-block-group-cg
    c                 C   r4   )Nzcg::thread_blockr$   r!   r   r   r   r#      r5   z_ThreadBlockGroup.__init__c                    s   t  ||S )zM
        sync()

        Synchronize the threads named in the group.
        )r8   r.   r,   r9   r   r   r      s   z_ThreadBlockGroup.syncc                 C   r'   r;   r   r)   r*   r   uint32r,   r   r   r   r>      r7   z_ThreadBlockGroup.thread_rankc                 C   r'   )zc
        group_index()

        3-Dimensional index of the block within the launched grid.
        r   z.group_index()rR   r,   r   r   r   group_index   r7   z_ThreadBlockGroup.group_indexc                 C   r'   )zf
        thread_index()

        3-Dimensional index of the thread within the launched block.
        r   z.thread_index()rR   r,   r   r   r   thread_index   r7   z_ThreadBlockGroup.thread_indexc                 C   r?   )z^
        dim_threads()

        Dimensions of the launched block in units of threads.
        r@   z(dim_threads() is supported on CUDA 11.6+r   z.dim_threads()rJ   r,   r   r   r   dim_threads   rF   z_ThreadBlockGroup.dim_threadsc                 C   r?   rG   )rB   rC   rD   r   r)   r*   r   rX   r,   r   r   r   rH      rF   z_ThreadBlockGroup.num_threadsc                 C   r'   rN   rW   r,   r   r   r   rO      rP   z_ThreadBlockGroup.sizec                 C   r'   )z\
        group_dim()

        Dimensions of the launched block in units of threads.
        r   rQ   rR   r,   r   r   r   rS   	  rP   z_ThreadBlockGroup.group_dim)r/   r0   r1   r2   r#   rT   r   r>   rY   rZ   r[   rH   rO   rS   rU   r   r   r9   r   rV      s&    	
	
	
	



rV   c                       s,   e Zd Zdd Z fddZdd Z  ZS )_ThisCgGroupc                 C   sf   |dkr	d}d}n|dkrd}d}nt || _d| d| d	| d
| _|dkr1|  jd7  _d S d S )Ngridz
grid groupr3   thread_blockzthread block grouprV   z
        Returns the current z (:class:`~cupyx.jit.cg.z/`).

        .. seealso:: :class:`cupyx.jit.cg.`z!, :func:`numba.cuda.cg.this_grid`)r    
group_typer2   )r"   r`   nametypenamer   r   r   r#     s$   
z_ThisCgGroup.__init__c                    s   t    d S r   r8   __call__r!   r9   r   r   rd   (  s   z_ThisCgGroup.__call__c                 C   sD   t jrtd| jdkrt }n| jdkrt }td| j d|S )Nz)cooperative group is not supported on HIPr]   r^   z	cg::this_z())rB   Zis_hiprD   r`   r3   rV   r)   )r"   r   Zcg_typer   r   r   
call_const+  s   

z_ThisCgGroup.call_const)r/   r0   r1   r#   rd   re   rU   r   r   r9   r   r\     s    r\   c                       $   e Zd Z fddZdd Z  ZS )_Syncc                       t    dS )a  Calls ``cg::sync()``.

        Args:
            group: a valid cooperative group

        .. seealso:: `cg::sync`_

        .. _cg::sync:
            https://docs.nvidia.com/cuda/archive/11.6.0/cuda-c-programming-guide/index.html#collectives-cg-sync
        Nrc   r"   groupr9   r   r   rd   7     z_Sync.__call__c                 C   s4   t |jts
tdt|d td|j dtjS )Nz'group must be a valid cooperative groupr   z	cg::sync())	
isinstancectyper   
ValueErrorr   r)   r*   r   r+   r"   r   rj   r   r   r   callD  s   
z
_Sync.callr/   r0   r1   rd   rq   rU   r   r   r9   r   rg   5      rg   c                       s0   e Zd Zdd fdd
ZddddZ  ZS )_MemcpySyncN)aligned_sizec                   rh   )a  Calls ``cg::memcpy_sync()``.

        Args:
            group: a valid cooperative group
            dst: the destination array that can be viewed as a 1D
                C-contiguous array
            dst_idx: the start index of the destination array element
            src: the source array that can be viewed as a 1D C-contiguous
                array
            src_idx: the start index of the source array element
            size (int): the number of bytes to be copied from
                ``src[src_index]`` to ``dst[dst_idx]``
            aligned_size (int): Use ``cuda::aligned_size_t<N>`` to guarantee
                the compiler that ``src``/``dst`` are at least N-bytes aligned.
                The behavior is undefined if the guarantee is not held.

        .. seealso:: `cg::memcpy_sync`_

        .. _cg::memcpy_sync:
            https://docs.nvidia.com/cuda/archive/11.6.0/cuda-c-programming-guide/index.html#collectives-cg-memcpy-async
        Nrc   )r"   rj   dstdst_idxsrcsrc_idxrO   ru   r9   r   r   rd   M  s   z_MemcpySync.__call__c             
   C   s   t |d t |d t||}t||}||fD ]}	t|	jtjtjfs*tdqt	
|||}t	
|||}t	|tjd|}t||}|j }
|rat|tsWtdd|j d|
 d}
td	|j d
|j d|j d|
 d	tjS )Nr   r   zdst/src must be of array type.Z	same_kindz,aligned_size must be a compile-time constantzcuda::aligned_size_t<>(rl   zcg::memcpy_async(z, &(z), &(z), )r   r)   initrm   rn   r   ZCArrayZPtr	TypeErrorr   Z	_indexingZ_astype_scalarrX   r*   	_Constantro   objr+   )r"   r   rj   rv   rw   rx   ry   rO   ru   ZarrZ	size_coder   r   r   rq   f  s@   




z_MemcpySync.callrr   r   r   r9   r   rt   K  s
    rt   c                       rf   )_Waitc                    rh   )a  Calls ``cg::wait()``.

        Args:
            group: a valid cooperative group

        .. seealso: `cg::wait`_

        .. _cg::wait:
            https://docs.nvidia.com/cuda/archive/11.6.0/cuda-c-programming-guide/index.html#collectives-cg-wait
        Nrc   ri   r9   r   r   rd     rk   z_Wait.__call__c                 C   s    t |d td|j dtjS )Nr   z	cg::wait(rl   r(   rp   r   r   r   rq     s   
z
_Wait.callrr   r   r   r9   r   r     rs   r   c                       rf   )
_WaitPriorc                    rh   )aX  Calls ``cg::wait_prior<N>()``.

        Args:
            group: a valid cooperative group
            step (int): wait for the first ``N`` steps to finish

        .. seealso: `cg::wait_prior`_

        .. _cg::wait_prior:
            https://docs.nvidia.com/cuda/archive/11.6.0/cuda-c-programming-guide/index.html#collectives-cg-wait
        Nrc   ri   r9   r   r   rd     s   z_WaitPrior.__call__c                 C   s:   t |d t|tstdtd|j d|j dtjS )Nr   z$step must be a compile-time constantzcg::wait_prior<rz   rl   )	r   rm   r}   ro   r)   r~   r*   r   r+   )r"   r   rj   stepr   r   r   rq     s   

z_WaitPrior.callrr   r   r   r9   r   r     s    r   r]   r^   N)!Z	cupy.cudar   rB   Z	cupyx.jitr   r   Zcupyx.jit._internal_typesr   Z_BuiltinFuncr   r}   r   r)   r   rT   __all__r   r   ZTypeBaser   r3   rV   r\   rg   rt   r   r   r	   r
   r   r   r   r   r   r   r   r   <module>   s6     c :
