o
    0 iy                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlZddl	Z	ddl	m
Z
 ddlmZmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ z	eeddZ W n e!yv   dd Z Y nw dd Z"dd Z#dd Z$G dd de
j%Z&G dd de&Z'e dd Z(G dd de&Z)G dd de*Z+G d d! d!e&ej,Z-G d"d# d#e&ej,Z.d5d$d%Z/d5d&d'Z0d(d) Z1d*d+ Z2d,Z3d-d. Z4d6d1d2Z5d3d4 Z6dS )7z
A CUDA ND Array is recognized by checking the __cuda_memory__ attribute
on the object.  If it exists and evaluate to True, it must define shape,
strides, dtype and size attributes similar to a NumPy ndarray.
    N)c_void_p)_devicearray)devices
dummyarray)driver)typesconfig)to_fixed_tuple)numpy_version)numpy_support)prepare_shape_strides_dtype)NumbaPerformanceWarning)warn	lru_cachec                 C   s   | S N )funcr   r   j/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/numba/cuda/cudadrv/devicearray.pyr      s   c                 C   s   t | ddS )z$Check if an object is a CUDA ndarray__cuda_ndarray__F)getattrobjr   r   r   is_cuda_ndarray#      r   c                    sB   t    fdd}|dt |dt |dtj |dt dS )z,Verify the CUDA ndarray interface for an objc                    s6   t  | s	t| tt | |std| |f d S )Nz%s must be of type %s)hasattrAttributeError
isinstancer   )attrtypr   r   r   requires_attr,   s
   
z4verify_cuda_ndarray_interface.<locals>.requires_attrshapestridesdtypesizeN)require_cuda_ndarraytuplenpr"   int)r   r   r   r   r   verify_cuda_ndarray_interface(   s   

r(   c                 C   s   t | stddS )z9Raises ValueError is is_cuda_ndarray(obj) evaluates Falsezrequire an cuda ndarray objectN)r   
ValueErrorr   r   r   r   r$   8   s   r$   c                   @   s   e Zd ZdZdZdZd%ddZedd Zd&d	d
Z	edd Z
d'ddZdd Zedd Zedd Zejd&ddZejd(ddZd&ddZdd Zdd Zd(dd Zd!d" Zed#d$ ZdS ))DeviceNDArrayBasez$A on GPU NDArray representation
    Tr   Nc                 C   s"  t |tr|f}t |tr|f}t|}t|| _t|| jkr%tdtj	d|||j
| _t|| _t|| _|| _tttj| jd| _| jdkrn|du rgt| j| j| jj
| _t | j}n"t|| _ntjrxtjd}ntd}tjt |dd}d| _|| _ || _!dS )a5  
        Args
        ----

        shape
            array shape.
        strides
            array strides.
        dtype
            data type as np.dtype coercible object.
        stream
            cuda stream.
        gpu_data
            user provided device memory for the ndarray data buffer
        zstrides not match ndimr      N)contextZpointerr#   )"r   r'   r&   r"   lenndimr)   r   ArrayZ	from_descitemsize_dummyr%   r    r!   	functoolsreduceoperatormulr#   _driverZmemory_size_from_info
alloc_sizer   get_contextZmemallocZdevice_memory_sizeUSE_NV_BINDINGbindingCUdeviceptrr   ZMemoryPointergpu_datastream)selfr    r!   r"   r=   r<   nullr   r   r   __init__D   s>   








zDeviceNDArrayBase.__init__c                 C   s   t jr| jd urt| j}nd}n| jjd ur| jj}nd}t| jt| r(d nt| j|df| j	j
| jdkr?t| jddS d ddS )Nr   F   )r    r!   dataZtypestrr=   version)r6   r9   device_ctypes_pointerr'   valuer%   r    is_contiguousr!   r"   strr=   )r>   Zptrr   r   r   __cuda_array_interface__w   s"   

z*DeviceNDArrayBase.__cuda_array_interface__c                 C   s   t  | }||_|S )zBind a CUDA stream to this object so that all subsequent operation
        on this array defaults to the given stream.
        )copyr=   )r>   r=   cloner   r   r   bind   s   
zDeviceNDArrayBase.bindc                 C      |   S r   	transposer>   r   r   r   T   s   zDeviceNDArrayBase.Tc                 C   st   |rt |t t| jkr| S | jdkrd}t||d ur0t|tt| jkr0td|f ddlm} || S )N   z2transposing a non-2D DeviceNDArray isn't supportedzinvalid axes list %rr   rM   )r%   ranger.   NotImplementedErrorsetr)   Znumba.cuda.kernels.transposerN   )r>   ZaxesmsgrN   r   r   r   rN      s   
zDeviceNDArrayBase.transposec                 C   s   |s| j S |S r   r=   )r>   r=   r   r   r   _default_stream   s   z!DeviceNDArrayBase._default_streamc                 C   sR   d| j v }| jd r|sd}n| jd r|sd}nd}t| j}t|| j|S )n
        Magic attribute expected by Numba to get the numba type that
        represents this object.
        r   C_CONTIGUOUSCF_CONTIGUOUSFA)r!   flagsr   
from_dtyper"   r   r/   r.   )r>   	broadcastZlayoutr"   r   r   r   _numba_type_   s   
zDeviceNDArrayBase._numba_type_c                 C   s,   | j du rtjrtjdS tdS | j jS )z:Returns the ctypes pointer to the GPU data buffer
        Nr   )r<   r6   r9   r:   r;   r   rD   rO   r   r   r   rD      s
   
z'DeviceNDArrayBase.device_ctypes_pointerc                 C   s   |j dkrdS t|  | |}t| t|}}t|r3t| t|| tj| || j|d dS t	j
||jd r=dnddtdk rI|jd	  ndd
}t|| tj| || j|d dS )zCopy `ary` to `self`.

        If `ary` is a CUDA memory, perform a device-to-device transfer.
        Otherwise, perform a a host-to-device transfer.
        r   NrV   rY   rZ   r\   TrQ   r   Z	WRITEABLE)ordersubokrI   )r#   sentry_contiguousrW   
array_corer6   is_device_memorycheck_array_compatibilitydevice_to_devicer7   r&   arrayr^   r
   Zhost_to_device)r>   aryr=   Z	self_coreZary_corer   r   r   copy_to_device   s*   





z DeviceNDArrayBase.copy_to_devicec                 C   s   t dd | jD rd}t|| j| jdksJ d| |}|du r0tj| jtjd}nt	| | |}| jdkrFt
j|| | j|d |du rg| jdkr[tj| j| j|d	}|S tj| j| j| j|d
}|S )a^  Copy ``self`` to ``ary`` or create a new Numpy ndarray
        if ``ary`` is ``None``.

        If a CUDA ``stream`` is given, then the transfer will be made
        asynchronously as part as the given stream.  Otherwise, the transfer is
        synchronous: the function returns after the copy is finished.

        Always returns the host array.

        Example::

            import numpy as np
            from numba import cuda

            arr = np.arange(1000)
            d_arr = cuda.to_device(arr)

            my_kernel[100, 100](d_arr)

            result_array = d_arr.copy_to_host()
        c                 s   s    | ]}|d k V  qdS r   Nr   ).0sr   r   r   	<genexpr>
  s    z1DeviceNDArrayBase.copy_to_host.<locals>.<genexpr>z2D->H copy not implemented for negative strides: {}r   zNegative memory sizeNr    r"   rV   )r    r"   buffer)r    r"   r!   rr   )anyr!   rS   formatr7   rW   r&   emptybyterh   r6   device_to_hostr#   ndarrayr    r"   )r>   rk   r=   rU   hostaryr   r   r   copy_to_host   s.   



zDeviceNDArrayBase.copy_to_hostc                 c   s    |  |}| jdkrtd| jd | jjkrtdttt	| j
| }| j}| jj}t|D ])}|| }t|| | j
}|| f}	| j|| || }
t|	|| j||
dV  q3dS )zSplit the array into equal partition of the `section` size.
        If the array cannot be equally divided, the last section will be
        smaller.
        r+   zonly support 1d arrayr   zonly support unit strider"   r=   r<   N)rW   r.   r)   r!   r"   r0   r'   mathceilfloatr#   rR   minr<   viewDeviceNDArray)r>   sectionr=   Znsectr!   r0   ibeginendr    r<   r   r   r   split"  s$   



zDeviceNDArrayBase.splitc                 C   s   | j S )zEReturns a device memory object that is used as the argument.
        )r<   rO   r   r   r   as_cuda_arg7  s   zDeviceNDArrayBase.as_cuda_argc                 C   s0   t  | j}t| j| j| jd}t||dS )z
        Returns a *IpcArrayHandle* object that is safe to serialize and transfer
        to another process to share the local allocation.

        Note: this feature is only available on Linux.
        )r    r!   r"   )
ipc_handle
array_desc)	r   r8   get_ipc_handler<   dictr    r!   r"   IpcArrayHandle)r>   ZipchZdescr   r   r   r   <  s   z DeviceNDArrayBase.get_ipc_handlec                 C   s2   | j j|d\}}t|j|j| j| || jdS )a(  
        Remove axes of size one from the array shape.

        Parameters
        ----------
        axis : None or int or tuple of ints, optional
            Subset of dimensions to remove. A `ValueError` is raised if an axis
            with size greater than one is selected. If `None`, all axes with
            size one are removed.
        stream : cuda stream or 0, optional
            Default stream for the returned view of the array.

        Returns
        -------
        DeviceNDArray
            Squeezed view into the array.

        )axisr    r!   r"   r=   r<   )r1   squeezer   r    r!   r"   rW   r<   )r>   r   r=   Z	new_dummy_r   r   r   r   G  s   zDeviceNDArrayBase.squeezec                 C   s   t |}t| j}t| j}| jj|jkr;|  stdt|d | jj |j\|d< }|dkr6td|j|d< t	|||| j
| jdS )zeReturns a new object by reinterpretting the dtype without making a
        copy of the data.
        zHTo change to a dtype of a different size, the array must be C-contiguousr   zuWhen changing to a larger dtype, its size must be a divisor of the total size in bytes of the last axis of the array.r   )r&   r"   listr    r!   r0   is_c_contiguousr)   divmodr   r=   r<   )r>   r"   r    r!   remr   r   r   r   c  s0   



zDeviceNDArrayBase.viewc                 C   s   | j j| j S r   )r"   r0   r#   rO   r   r   r   nbytes  s   zDeviceNDArrayBase.nbytesrm   r   r   Nr   )__name__
__module____qualname____doc__Z__cuda_memory__r   r@   propertyrH   rK   rP   rN   rW   ra   rD   r   require_contextrl   rz   r   r   r   r   r   r   r   r   r   r   r*   >   s6    
3






.
%r*   c                       s   e Zd ZdZd fdd	Zedd Zedd	 Zej	d
d Z
ej	dddZdddZej	dd Zej	dddZdddZ  ZS )DeviceRecordz
    An on-GPU record type
    r   Nc                    s$   d}d}t t| ||||| d S Nr   )superr   r@   )r>   r"   r=   r<   r    r!   	__class__r   r   r@     s
   zDeviceRecord.__init__c                 C      t | jjS z
        For `numpy.ndarray` compatibility. Ideally this would return a
        `np.core.multiarray.flagsobj`, but that needs to be constructed
        with an existing `numpy.ndarray` (as the C- and F- contiguous flags
        aren't writeable).
        r   r1   r^   rO   r   r   r   r^        zDeviceRecord.flagsc                 C   s   t | jS )rX   )r   r_   r"   rO   r   r   r   ra     s   zDeviceRecord._numba_type_c                 C   
   |  |S r   _do_getitemr>   itemr   r   r   __getitem__     
zDeviceRecord.__getitem__c                 C      |  ||S z0Do `__getitem__(item)` with CUDA stream
        r   r>   r   r=   r   r   r   getitem     zDeviceRecord.getitemc           
      C   s   |  |}| jj| \}}| j|}|jdkr9|jd ur$t|||dS tj	d|d}t
j|||j|d |d S t|jd |jd d\}}}	t|||	||dS )	Nr   r{   r+   r"   dstsrcr#   r=   r   rZ   r    r!   r"   r<   r=   )rW   r"   fieldsr<   r   r    namesr   r&   ru   r6   rw   r0   r   Zsubdtyper   )
r>   r   r=   r   offsetnewdatary   r    r!   r"   r   r   r   r     s.   



zDeviceRecord._do_getitemc                 C   r   r   _do_setitemr>   keyrE   r   r   r   __setitem__  r   zDeviceRecord.__setitem__c                 C      | j |||dS z6Do `__setitem__(key, value)` with CUDA stream
        rV   r   r>   r   rE   r=   r   r   r   setitem     zDeviceRecord.setitemc                 C   s   |  |}| }|rt }| }| jj| \}}| j|}t| |||d}	t	|	j||d\}
}t
|	|
|
jj| |rG|  d S d S )Nr{   rV   )rW   r   r8   get_default_streamr"   r   r<   r   typeauto_devicer6   ri   r0   synchronize)r>   r   rE   r=   synchronousctxr   r   r   lhsrhsr   r   r   r   r     s   
zDeviceRecord._do_setitemrm   r   )r   r   r   r   r@   r   r^   ra   r   r   r   r   r   r   r   r   __classcell__r   r   r   r   r     s"    
	



r   c                    s>   ddl m  dkr jdd }|S  j fdd}|S )z
    A separate method so we don't need to compile code every assignment (!).

    :param ndim: We need to have static array sizes for cuda.local.array, so
        bake in the number of dimensions into the kernel
    r   )cudac                 S   s   |d | d< d S r   r   )r   r   r   r   r   kernel     z_assign_kernel.<locals>.kernelc                    s     d}d}t| jD ]	}|| j| 9 }q||krd S  jjdftjd}td ddD ]&}|| j|  |d|f< || j|  |j| dk |d|f< || j|  }q/|t|d  | t|d < d S )Nr+   rQ   rq   r   r   )	gridrR   r.   r    localrj   r   int64r	   )r   r   location
n_elementsr   idxr   r.   r   r   r     s   
$$)numbar   Zjit)r.   r   r   r   r   _assign_kernel  s   
r   c                   @   s   e Zd ZdZdd Zedd Zdd Zdd	d
Zdd Z	dd Z
d ddZejdd Zejd!ddZd!ddZejdd Zejd!ddZd!ddZdS )"r   z
    An on-GPU array type
    c                 C      | j jS )zA
        Return true if the array is Fortran-contiguous.
        )r1   Zis_f_contigrO   r   r   r   is_f_contiguous'     zDeviceNDArray.is_f_contiguousc                 C   r   r   r   rO   r   r   r   r^   -  r   zDeviceNDArray.flagsc                 C   r   )z;
        Return true if the array is C-contiguous.
        )r1   Zis_c_contigrO   r   r   r   r   7  r   zDeviceNDArray.is_c_contiguousNc                 C   s   |r	|   |S |    S )zE
        :return: an `numpy.ndarray`, so copies to the host.
        )rz   	__array__)r>   r"   r   r   r   r   =  s   zDeviceNDArray.__array__c                 C   s
   | j d S r   )r    rO   r   r   r   __len__F  s   
zDeviceNDArray.__len__c                 O   s   t |dkrt|d ttfr|d }t| }|| jkr(|| j| j| j| jdS | j	j
|i |\}}|| j	jgkrF||j|j| j| jdS td)z
        Reshape the array without changing its contents, similarly to
        :meth:`numpy.ndarray.reshape`. Example::

            d_arr = d_arr.reshape(20, 50, order='F')
        r+   r   )r    r!   r"   r<   operation requires copying)r-   r   r%   r   r   r    r!   r"   r<   r1   reshapeextentrS   )r>   Znewshapekwsclsnewarrextentsr   r   r   r   I  s   


zDeviceNDArray.reshaperZ   r   c                 C   sT   |  |}t| }| jj|d\}}|| jjgkr&||j|j| j| j|dS t	d)z
        Flattens a contiguous array without changing its contents, similar to
        :meth:`numpy.ndarray.ravel`. If the array is not contiguous, raises an
        exception.
        )rc   r   r   )
rW   r   r1   ravelr   r    r!   r"   r<   rS   )r>   rc   r=   r   r   r   r   r   r   r   a  s   

zDeviceNDArray.ravelc                 C   r   r   r   r   r   r   r   r   s  r   zDeviceNDArray.__getitem__c                 C   r   r   r   r   r   r   r   r   w  r   zDeviceNDArray.getitemc                 C   s   |  |}| j|}t| }t| }t|dkrW| jj|d  }|j	sK| j
jd ur4t| j
||dS tjd| j
d}tj||| jj|d |d S ||j|j| j
||dS | jj|j }||j|j| j
||dS )Nr+   r   r{   r   r   r   )rW   r1   r   r   Ziter_contiguous_extentr   r-   r<   r   Zis_arrayr"   r   r   r&   ru   r6   rw   r0   r    r!   r   )r>   r   r=   arrr   r   r   ry   r   r   r   r   }  s0   


zDeviceNDArray._do_getitemc                 C   r   r   r   r   r   r   r   r     r   zDeviceNDArray.__setitem__c                 C   r   r   r   r   r   r   r   r     r   zDeviceNDArray.setitemc                 C   s^  |  |}| }|rt }| }| j|}| jj|j }t	|t
jr*d}d}	n|j}|j}	t| ||	| j||d}
t||dd\}}|j|
jkrUtd|j|
jf tj|
jtjd}|j||
j|j d < |j| }tt|
j|jD ]\}\}}|dkr||krtd|||f qwttj|
jd}t|
jj||d	|
| |r|  d S d S )
Nr   r   T)r=   user_explicitz$Can't assign %s-D array to %s-D selfr   r+   zCCan't copy sequence with size %d to array axis %d with dimension %drV   ) rW   r   r8   r   r1   r   r<   r   r   r   r   Elementr    r!   r   r"   r   r.   r)   r&   Zonesr   r   	enumeratezipr2   r3   r4   r5   r   forallr   )r>   r   rE   r=   r   r   r   r   r    r!   r   r   r   Z	rhs_shaper   lrr   r   r   r   r     sN   
	
zDeviceNDArray._do_setitemr   )rZ   r   r   )r   r   r   r   r   r   r^   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   #  s(    
	
	



r   c                   @   s8   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d ZdS )r   a"  
    An IPC array handle that can be serialized and transfer to another process
    in the same machine for share a GPU allocation.

    On the destination process, use the *.open()* method to creates a new
    *DeviceNDArray* object that shares the allocation from the original process.
    To release the resources, call the *.close()* method.  After that, the
    destination can no longer use the shared array object.  (Note: the
    underlying weakref to the resource is now dead.)

    This object implements the context-manager interface that calls the
    *.open()* and *.close()* method automatically::

        with the_ipc_array_handle as ipc_array:
            # use ipc_array here as a normal gpu array object
            some_code(ipc_array)
        # ipc_array is dead at this point
    c                 C   s   || _ || _d S r   )_array_desc_ipc_handle)r>   r   r   r   r   r   r@        
zIpcArrayHandle.__init__c                 C   s$   | j t }tdd|i| jS )z
        Returns a new *DeviceNDArray* that shares the allocation from the
        original process.  Must not be used on the original process.
        r<   Nr   )r   openr   r8   r   r   )r>   Zdptrr   r   r   r     s   zIpcArrayHandle.openc                 C   s   | j   dS )z5
        Closes the IPC handle to the array.
        N)r   closerO   r   r   r   r     s   zIpcArrayHandle.closec                 C   rL   r   )r   rO   r   r   r   	__enter__  s   zIpcArrayHandle.__enter__c                 C   s   |    d S r   )r   )r>   r   rE   	tracebackr   r   r   __exit__  s   zIpcArrayHandle.__exit__N)	r   r   r   r   r@   r   r   r   r   r   r   r   r   r     s    r   c                   @      e Zd ZdZdddZdS )MappedNDArrayz4
    A host array that uses CUDA mapped memory.
    r   c                 C      || _ || _d S r   r<   r=   r>   r<   r=   r   r   r   device_setup  r   zMappedNDArray.device_setupNr   r   r   r   r   r   r   r   r   r   r         r   c                   @   r   )ManagedNDArrayz5
    A host array that uses CUDA managed memory.
    r   c                 C   r   r   r   r   r   r   r   r     r   zManagedNDArray.device_setupNr   r   r   r   r   r   r    r   r  c                 C   s   t | j| j| j||dS )z/Create a DeviceNDArray object that is like ary.r=   r<   )r   r    r!   r"   )rk   r=   r<   r   r   r   from_array_like  s   r  c                 C   s   t | j||dS )z.Create a DeviceRecord object that is like rec.r  )r   r"   )Zrecr=   r<   r   r   r   from_record_like"  r   r  c                 C   sF   | j r| js| S g }| j D ]}||dkrdntd q| t| S )aG  
    Extract the repeated core of a broadcast array.

    Broadcast arrays are by definition non-contiguous due to repeated
    dimensions, i.e., dimensions with stride 0. In order to ascertain memory
    contiguity and copy the underlying data from such arrays, we must create
    a view without the repeated dimensions.

    r   N)r!   r#   appendslicer%   )rk   Z
core_indexstrider   r   r   rf   '  s   

rf   c                 C   sR   | j j}tt| jt| jD ]\}}|dkr&|dkr&||kr" dS ||9 }qdS )z
    Returns True iff `ary` is C-style contiguous while ignoring
    broadcasted and 1-sized dimensions.
    As opposed to array_core(), it does not call require_context(),
    which can be quite expensive.
    r+   r   FT)r"   r0   r   reversedr    r!   )rk   r#   r    r  r   r   r   rF   9  s   rF   zArray contains non-contiguous buffer and cannot be transferred as a single memory region. Please ensure contiguous buffer with numpy .ascontiguousarray()c                 C   s,   t | }|jd s|jd sttd S d S )NrY   r[   )rf   r^   r)   errmsg_contiguous_buffer)rk   corer   r   r   re   O  s   re   TFc                 C   s   t | r	| dfS t| drtj| dfS t| tjr#t	| |d}ntj
| tdk r,dnddd} t|  t| |d}|r\tjrU|sUt| tsUt| tjrUd}tt| |j| |d |dfS )	z
    Create a DeviceRecord or DeviceArray like obj and optionally copy data from
    host to device. If obj already represents device memory, it is returned and
    no copy is made.
    FrH   rV   rb   NT)rI   rd   zGHost array used in CUDA kernel will incur copy overhead to/from device.)r6   rg   r   r   r   Zas_cuda_arrayr   r&   voidr  rj   r
   re   r  r   ZCUDA_WARN_ON_IMPLICIT_COPYr   rx   r   r   rl   )r   r=   rI   r   ZdevobjrU   r   r   r   r   U  s2   


r   c                 C   s   |   |  }}| j|jkrtd| j|jf |j|jkr)td| j|jf | jr<|j|jkr>td| j|jf d S d S )Nzincompatible dtype: %s vs. %szincompatible shape: %s vs. %szincompatible strides: %s vs. %s)r   r"   	TypeErrorr    r)   r#   r!   )Zary1Zary2Zary1sqZary2sqr   r   r   rh   |  s   


rh   rm   )r   TF)7r   r|   r2   r4   rI   ctypesr   numpyr&   r   r   Znumba.cuda.cudadrvr   r   r   r6   Z
numba.corer   r   Znumba.np.unsafe.ndarrayr	   Znumba.np.numpy_supportr
   Znumba.npr   Znumba.cuda.api_utilr   Znumba.core.errorsr   warningsr   r   r   r   r   r(   r$   ZDeviceArrayr*   r   r   r   objectr   rx   r   r  r  r  rf   rF   r	  re   r   rh   r   r   r   r   <module>   sZ      Tg
+ :,




'