o
    )i
+                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	 d dl
Z
d dlmZ dee fddZdZzd d	lmZmZmZ d d
lmZ edZe ZdZW n eyc   dZdZdZdZdZdZY nw eeeeef Ze jG dd dZdeddfddZdeddfddZdeegef deeegdf de
j j!j"fddZ#edeegef deeegdf ddfddZ$G dd dZ%dS )    N)contextmanager)AnyCallableOptionalUnion)is_pin_memory_availablereturnc                 C   s   d}t d}|D ]
}| |v r|} nq	W d   n1 sw   Y  |du r)dS |d}||d  }|dd }|dd | sQJ d| d|  |S )	a)  
    According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
    the file `/proc/self/maps` contains the memory maps of the process, which includes the
    shared libraries loaded by the process. We can use this file to find the path of the
    a loaded library.
    Nz/proc/self/maps/z.sor   zUnexpected filename: z for library )openindexstripsplit
rpartition
startswith)lib_nameZ
found_lineflinestartpathfilename r   g/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/device_allocator/cumem.pyfind_loaded_library   s$   

r   F)init_modulepython_create_and_mappython_unmap_and_release)CudaRTLibraryZcumem_allocatorTc                   @   s0   e Zd ZU eed< eed< dZeej	 ed< dS )AllocationDatahandletagNcpu_backup_tensor)
__name__
__module____qualname__
HandleType__annotations__strr!   r   torchZTensorr   r   r   r   r   F   s   
 r   allocation_handlec                 C      t |   d S N)r   r)   r   r   r   create_and_mapM      r-   c                 C   r*   r+   )r   r,   r   r   r   unmap_and_releaseQ   r.   r/   python_malloc_fnpython_free_funcc                 C   s    t | | tjjtdd}|S )NZ	my_mallocZmy_free)r   r(   cudamemoryCUDAPluggableAllocatorr   )r0   r1   	new_allocr   r   r   get_pluggable_allocatorU   s
   
r6   c                 c   sZ    t | |}tjj|j}tjj| ||fV  W d    d S 1 s&w   Y  d S r+   )r6   r(   r2   r3   ZMemPoolZ
_allocatorZuse_mem_pool)r0   r1   r5   Zmem_poolr   r   r   use_memory_pool_with_allocator`   s   
"r7   c                   @   s   e Zd ZU dZdZd ed< dZeed< edddZ	d	d
 Z
deddfddZdedefddZ	ddeeeedf ef  ddfddZddeee  ddfddZeddee fddZdefddZdS )CuMemAllocatora'  
    A singleton class that manages a memory pool for CUDA tensors.
    The memory in this pool can be offloaded or discarded when the
    allocator sleeps.

    Inside the `use_memory_pool(tag)` context, all tensors created will
    be allocated in the memory pool, and has the same tag as the
    tag passed to the context.

    When we call `sleep`, all tensors with the specified tag will be
    offloaded to CPU memory, and the rest of the tensors will be discarded.
    When we call `wake_up`, all tensors that are previously offloaded
    will be loaded back to GPU memory, and the rest of the tensors will
    have empty memory.

    Why it needs to be a singleton?
    When allocated tensors are garbage collected, PyTorch will call
    the free callback, which will call the `python_free_callback` method.
    The C-extension uses a global variable to store the function of an
    instance of this class. If we create multiple instances of this class,
    the global variable will be overwritten and the free callback will
    not work as expected.
    Ninstancedefaultdefault_tagr   c                   C   s$   t sJ dtjdu rt t_tjS )z
        CuMemAllocator is a singleton class.
        We cannot call the constructor directly.
        Call this method to get the instance.
        z cumem allocator is not availableN)cumem_availabler8   r9   r   r   r   r   get_instance   s   
zCuMemAllocator.get_instancec                 C   s6   t jdd}d|vsJ di | _tj| _i | _d S )NZPYTORCH_CUDA_ALLOC_CONF zexpandable_segments:TruezExpandable segments are not compatible with memory pool. Please track https://github.com/pytorch/pytorch/issues/147851 for the latest updates.)osenvirongetpointer_to_datar8   r;   current_tagallocator_and_pools)selfconfr   r   r   __init__   s   

zCuMemAllocator.__init__r)   c                 C   s   |d }t || j| j|< dS )zj
        Internal method to store the allocation data
        when memory is allocated in the memory pool.   N)r   rC   rB   )rE   r)   Zpy_d_memr   r   r   python_malloc_callback   s
   
z%CuMemAllocator.python_malloc_callbackptrc                 C   s"   | j |}|jdurd|_|jS )zh
        Internal method to look up the allocation data
        when memory is freed in the memory pool.N)rB   popr!   r   )rE   rJ   datar   r   r   python_free_callback   s   
z#CuMemAllocator.python_free_callbackoffload_tags.c                 C   s   |du r	t jf}nt|tr|f}t|tsJ | j D ]-\}}|j}|j|v rF|d }t	j
|t	jdt d}| }t||| ||_t| qt  t	j  dS )aS  
        Put the allocator in sleep mode.
        All data in the memory allocation with the specified tag will be
        offloaded to CPU memory, and others will be discarded.

        :param offload_tags: The tags of the memory allocation that will be
            offloaded. The rest of the memory allocation will be discarded.
        N   cpu)ZdtypeZdeviceZ
pin_memory)r8   r;   
isinstancer'   tuplerB   itemsr   r    r(   emptyZuint8r   data_ptr	libcudart
cudaMemcpyr!   r/   gcZcollectr2   Zempty_cache)rE   rN   rJ   rL   r   size_in_bytesr!   cpu_ptrr   r   r   sleep   s*   



zCuMemAllocator.sleeptagsc                 C   s|   | j  D ]6\}}|du s|j|v r;|j}t| |jdur;|j}|dur;| |  }| }t	
||| d|_qdS )a  
        Wake up the allocator from sleep mode.
        All data that is previously offloaded will be loaded back to GPU 
        memory, and the rest of the data will have empty memory.
        
        :param tags: The tags of the memory allocation that will be loaded
            back to GPU memory. If None, all memory allocation will be loaded
            back to GPU memory.
        N)rB   rS   r    r   r-   r!   ZnumelZelement_sizerU   rV   rW   )rE   r\   rJ   rL   r   r!   rY   rZ   r   r   r   wake_up   s   

zCuMemAllocator.wake_upr    c                 c   st    |du rt j}t|tsJ | j}|| _t| j| j}|| j|< dV  || _W d   dS 1 s3w   Y  dS )a'  
        A context manager to use the memory pool.
        All memory allocation created inside the context will be allocated
        in the memory pool, and has the specified tag.

        :param tag: The tag of the memory allocation. If None, the default tag
            will be used.
        N)	r8   r;   rQ   r'   rC   r7   rI   rM   rD   )rE   r    Zold_tagrL   r   r   r   use_memory_pool   s   

"zCuMemAllocator.use_memory_poolc                 C   s.   d}| j  D ]\}}|j}||d 7 }q|S )zM
        Get the total number of bytes allocated in the memory pool.
        r   rO   )rB   rS   r   )rE   Z	sum_bytesrJ   rL   r   r   r   r   get_current_usage  s
   z CuMemAllocator.get_current_usage)r   r8   r+   )r"   r#   r$   __doc__r9   r&   r;   r'   staticmethodr=   rG   r%   rI   intrM   r   r   rR   r[   listr]   r   r^   r_   r   r   r   r   r8   j   s*   
 	
&%r8   )&dataclassesrX   r?   
contextlibr   typingr   r   r   r   r(   Z
vllm.utilsr   r'   r   r<   Zvllm.cumem_allocatorr   r   r   Z2vllm.distributed.device_communicators.cuda_wrapperr   r   rV   ModuleNotFoundErrorrR   rb   r%   	dataclassr   r-   r/   r2   r3   r4   r6   r7   r8   r   r   r   r   <module>   s`   


	