o
    * iSK                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZmZmZ e s"e r-d dlm	Z
 dd ZndZ
dd Zg dZd aG d	d
 d
Z	dddZdd Zdd Zdd Zdd Zdd Zdd ZdS )    N)core)	CUDAPlaceis_compiled_with_cudais_compiled_with_rocm)	CUDAGraphc                   C      dS )NT r   r   r   e/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/paddle/device/cuda/graphs.pyis_cuda_graph_supported      r
   c                   C   r   )NFr   r   r   r   r	   r
   #   r   )globalthread_localrelaxedc                   @   s@   e Zd ZdddZdd Zdd Zd	d
 Zdd ZdddZdS )r   Nr   c                 C   s`   t d usJ dd | _|d u rttjdd}t|}|| _|tv s%J t	|| _
|| _d S )NzFCUDA Graph is only supported on PaddlePaddle compiled with NVIDIA GPU.ZFLAGS_selected_gpusr   )CoreCUDAGraph_graphintosenvirongetr   _place	ALL_MODESindex_mode_pool_id)selfZplacemodeZpool_idZ	device_idr   r   r	   __init__,   s   

zCUDAGraph.__init__c                 C   s   t | j| j| j d S N)r   Zbegin_capture_with_pool_idr   r   r   r   r   r   r	   capture_begin:   s   zCUDAGraph.capture_beginc                 C   s   t  | _d S r   )r   end_capturer   r   r   r   r	   capture_end?      zCUDAGraph.capture_endc                 C      | j   d S r   )r   replayr   r   r   r	   r$   B   r"   zCUDAGraph.replayc                 C   r#   r   )r   resetr   r   r   r	   r%   E   r"   zCUDAGraph.resetc                 C   s\   t |ttfs
|j}tj|dd tj|sJ d| d|d u r%d}| j	|| d S )NT)nameexist_okzThe dirname z should be a directoryi  )

isinstancestrbytesr&   r   makedirspathisdirr   print_to_dot_files)r   dirnameflagsr   r   r	   r.   H   s   
zCUDAGraph.print_to_dot_files)Nr   Nr   )	__name__
__module____qualname__r   r   r!   r$   r%   r.   r   r   r   r	   r   +   s    
r   r   defaultc           
         s  |t v sJ t sCddlm} tt}td7 a|dkrd}n|dkr(t }nt	d| ||d t| d |  fdd	S dd
l
m} ddlm} | }t |r\|j}	n|}	||	_|dkrjd|	_|S |dkrut |	_|S t||r|jj|	_|S |j|	_|S )Nr   )_cuda_graph_guard   r4   newzMmemory_pool should be one of default or new under static graph mode, but got ;c                     s    | i |S r   r   )argskwargsfunctionr   r	   <lambda>g   s    z!wrap_cuda_graph.<locals>.<lambda>)	to_static)Layer)r   paddleZin_dynamic_modeZpaddle.base.frameworkr5   r)   cuda_graph_idr   Zgen_new_memory_pool_id
ValueErrorZ
paddle.jitr>   Z	paddle.nnr?   r(   forwardZ_cuda_graph_capture_modeZ_cuda_graph_pool_id)
r<   r   Zmemory_poolr5   Zgraph_idmemory_pool_idr>   r?   Znew_functionZ	mock_funcr   r;   r	   wrap_cuda_graphT   sH   





rE   c                 C   sX   |  |j | |j | |j | |j | |j	 | 
|j | |j dS )z
    copy var desc from src to dst

    :param dst: framework.VarDesc(cpp), dst var desc, cpp VarDesc instance
    :param src: framework.VarDesc(cpp), src var desc, cpp VarDesc instance
    :return: no return
    N)	set_shapeshapeZ	set_dtypeZdtypeZset_lod_levelZ	lod_levelset_typetypeZset_persistablepersistableZset_is_parameterZis_parameterZset_stop_gradientstop_gradient)dstsrcr   r   r	   copy_var_desc   s   rN   c                 C   sD   g }t | jD ]\}}||krq|jD ]}|| qqtt|S )a1  
    find all inputs of ops after an idx, used to determine the logical output of a cuda graph section

    :param block: framework.Block, the original block
    :param begin_idx: int, from which idx (not include) to find the later ins
    :return: a list of inputs names for all ops behind begin_idx
    )	enumerateopsinput_arg_namesappendlistset)blockZ	begin_idxinsidxopin_namer   r   r	   all_inputs_of_later_op   s   
rZ   c                 C   sN  t j }| }| }g }g }tj }t||d }	| D ]}}
|
jD ]=}|	|}|j
	|jd}t|| ||dkrN||dkrN|| q$|	|dkra||dkra|| q$|
jD ]"}|	|}|j
	|jd}t|| ||dkr|| qe|j
 }||
j
 |||
| q|  |||gfS )a  
    1. Construct a new program for corresponding section
    2. Find all the logical inputs and outputs of a program section

    :param section: list, one cuda graph section, list of ops
    :param origin_program: framework.Program, origin program
    :param section_idx: list, the section ops' idx corresponding to the cuda graph section, a list of idx
    :return: a new program for the cuda graph section
             the logical ins and outs of the cuda graph section
    asciir   )r@   ZstaticZProgramglobal_blockr   op_proto_and_checker_makerkOpRoleAttrNamerZ   rQ   varZdescr&   encoderN   countrR   removeZoutput_arg_namesZ	append_opZ	copy_fromZ	_set_attrattrZ_sync_with_cpp)sectionorigin_programsection_idxprogramrU   origin_blockrV   outsop_role_attr_nameZ	later_insrX   rY   r`   Znew_var_descZout_nameZnew_op_descr   r   r	   #construct_program_and_find_ins_outs   s:   










rl   c                 C   s  |   }g }g }d}g }g }g }g }d}	tj }
ttjjj}ttjjj}||B }t|j	D ]\}}|j
dks?|j
dkrH|jdu sHJ d|drT|drTd}|jdur-t|jtsdJ d	|jd
}t|dkstJ dt|d }||	krt|dkrt|t|ksJ d|D ]H}t||
|kpt||
|k}|j
dkp|j
dk}|s|rg }g }t|t|ksJ dt|dkr|| || g }g } nqtt|D ]}|||  |||  qg }g }|| || q0g }g }|}	t|t|ksJ dt|dkr&|| || |g}|g}q0|| || q0t|t|ksEJ dt|dkrV|| || |||fS )a  
    get all sections that should run under cuda graph and the corresponding idx

    :param program: framework.Program, the original program
    :return: A list of cuda graph sections and the corresponding ops' idx in the block.
             The program is under is test or not.
    Fr[   Zconditional_blockwhileNz9Cuda graph not support conditional block op and while op.is_testTzcuda_graph_attr should be a strr8      zccuda graph attr should have three fields: cuda graph mode, cuda graph memory pool id, cuda graph id   r   z@len of internal section should be equal with len of internal idxz-num of section's op is not equal with the idxz0num of section's op is not equal with num of idx)r]   r   r^   r_   r   ZOpRoleZLossZBackwardrO   rP   rI   _cuda_graph_attrZhas_attrrd   r(   r)   splitlenrR   range)rh   rU   cuda_graph_sectionssections_idxrn   Zinternal_sectionZinternal_idxZcurrent_sectionZcurrent_idxZcurrent_cuda_graph_idrk   Zloss_op_roleZbackward_op_roleZloss_grad_op_rolerW   rX   Zcuda_graph_attrsZlocal_cuda_graph_idZinternal_opZloss_relatedZsub_block_relatedir   r   r	   get_cuda_graph_sections   s   











rx   c                 C   s2  | d }| d }|d }	|  }
t|D ]	}|
j|dd qd}d}|D ]}|jdur=|jd}|d }t|d } nq$|durF|dusJJ d|
jdt| tj	j
jd	d	d
}|
jdt| tj	j
jd	d	d
}tj|| }|
j|	dd|i|||d|  dt|  j||||d|  |  d
d dS )a  
    Use section_program and ins_and_outs to initialize a run_program_op,
    and replace the section_idx marks ops in the origin program.

    :param ins_and_outs: list, the logical ins and outs of the section program
    :param section_program: framework.Program, the partial program need to run under cuda graph
    :param section_idx: list, the idx need to be removed from origin program
    :param origin_program: framework.Program, the origin program
    :param cuda_graph_section: list, the ops in current sections, used to get the mode, memory pool id and is_test
    :param order: int, the order of current section, used to create unique cuda graph var
    :param is_test: bool, the program is running under is_test or not
    :return: no return
    r   r6   F)syncNr8   z>mode and memory pool id should be specified in cuda graph attrZcuda_graph_T)r&   rI   rJ   rK   Zprogram_out_scope_Zrun_programX)ZOutZOutScoper   )
r]   Zstart_op_indexZend_op_indexrn   
program_idZcuda_graph_capture_modeZcuda_graph_pool_idZuse_interpretorcoreZforward_global_blockZbackward_global_block)rI   ZinputsZoutputsattrs)r]   reversedZ
_remove_oprq   rr   r   Z
create_varr)   r   ZVarDescZVarTypeZRAWZSTEP_SCOPESr@   utilsZ_hash_with_idZ
_insert_oprs   rP   )ins_and_outssection_programrg   rf   Zcuda_graph_sectionorderrn   rV   rj   Z
insert_idxri   rW   r   rD   rX   r|   Zcuda_graph_varZout_scope_varr{   r   r   r	   replace_cuda_graph_sectionR  sf   



r   c           	   
   C   s   t | jdkrtd t| \}}}t |t |ksJ dg }g }tt |D ]}t|| | || \}}|| || q)t |t |ksNJ dttt |D ]}t	|| || || | || ||d qV|S )z
    replace the ops marked with cuda_graph_attr to run_program_op to use cuda graph

    :param program: framework.Program, the program to be transformed
    :return: the cuda graph section program, user should hold these programs!
    r6   zSub block(s) has been detected in the program. Cuda graph not support op with sub block, and it will only handle the global block.z@num of cuda graph sections is not equal with num of idx sectionszJthe num of cuda graph sections should be equal with the num of new program)r   rn   )
rs   blockswarningswarnrx   rt   rl   rR   r}   r   )	rh   ru   rv   rn   r   Zsection_programsrw   r   Zins_outsr   r   r	   cuda_graph_transform  s@   

r   )r   r4   )r   r   r@   Zpaddle.baser   Zpaddle.base.corer   r   r   r   r   r
   r   rA   rE   rN   rZ   rl   rx   r   r   r   r   r   r	   <module>   s(   

),4|Y