o
    * i`                     @  s  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	Z	d dl	m
Z
 d dlmZ d dlmZ d dlmZ erEd dlmZ ejjd	ejjd
ejjd
ejjdejjdejjd	ejjd	ejjd	ejjd
ejjdejjdejjd	ejj dejj!diZ"g dZ#de$d< ddgZ%de$d< g Z&de$d< g dZ'de$d< g dZ(de$d< dgZ)de$d< dZ*dZ+dZ,dd Z-G d d! d!Z.G d"d# d#Z/	dTdUd1d2Z0dVd6d7Z1d8d9 Z2d:d; Z3i di fd<d=Z4d>d? Z5d@dA Z6dBdC Z7dDdE Z8dFdG Z9dHdI Z:dJdK Z;dLdM Z<dNdO Z=dPdQ Z>dRdS Z?dS )W    )annotationsN)TYPE_CHECKING)pir)backward_utils)core)in_cinn_debug_mode)Sequence               )Mpd_op.full_int_array
pd_op.fullzpd_op.dividezpd_op.subtractz	pd_op.addzpd_op.multiplyzpd_op.elementwise_powzpd_op.rsqrtzpd_op.reshapezpd_op.full_likezpd_op.assignzpd_op.expandzpd_op.scalez	pd_op.expz	pd_op.sinz	pd_op.coszpd_op.add_nz
pd_op.castzpd_op.concatzpd_op.full_with_tensorzpd_op.gather_ndzpd_op.logical_andzpd_op.logical_notzpd_op.wherez	pd_op.powzpd_op.shapezpd_op.shape64zpd_op.slicezpd_op.squeezezpd_op.unsqueezezpd_op.transposez	pd_op.logzpd_op.log1pzpd_op.logitzpd_op.expand_aszpd_op.splitzpd_op.arangezpd_op.put_along_axisz
pd_op.tanhz
pd_op.atanzpd_op.atanhz
pd_op.sinhz
pd_op.asinzpd_op.asinhz
pd_op.coshz
pd_op.acoszpd_op.acoshz	pd_op.absz
pd_op.signzpd_op.expm1z	pd_op.erfzpd_op.erfinvz
pd_op.ceilzpd_op.floorz
pd_op.fraczpd_op.roundzpd_op.trunczpd_op.anglezpd_op.as_complexzpd_op.as_realzpd_op.complexz
pd_op.realz
pd_op.imagz
pd_op.conjzpd_op.greater_equalzpd_op.greater_thanzpd_op.not_equalzpd_op.equalzpd_op.less_equalzpd_op.less_thanzpd_op.bitwise_andzpd_op.bitwise_orzpd_op.bitwise_xorzpd_op.bitwise_notzpd_op.isinfzpd_op.isnanzpd_op.sigmoidz	list[str]DEFAULT_RECOMPUTABLE_OPSr   r   TENDING_TO_RECOMPUTE_OPSVIEW_OPS)zpd_op.randintzpd_op.uniformzpd_op.dropout
RANDOM_OPS)zpd_op.matmulzpd_op.conv2dzpd_op.layer_normzpd_op.batchnormzpd_op.softmaxzpd_op.all_reduce_zpd_op.c_broadcast_zpd_op.reduce_COMPUTE_INTENSIVE_OPSzcf.stack_create
IGNORE_OPSF   g?c                  G  s8   t d}|rt| dv rt| ddi d S d S d S )NZ FLAGS_print_auto_recompute_debug)1trueflushT)osgetenvstrlowerprint)argsflag r!   j/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/paddle/decomposition/recompute.py
DebugPrint   s   
r#   c                   @  s$   e Zd Zdd Zdd Zdd ZdS )JudgeFusionLoopc                 C  sB   |  j| _|| _dd | jD | _dd | jD | _|   d S )Nc                 S     i | ]}|t  qS r!   set.0opr!   r!   r"   
<dictcomp>       z,JudgeFusionLoop.__init__.<locals>.<dictcomp>c                 S  r%   r!   r&   r(   r!   r!   r"   r+      r,   )global_blockopsunrecomputable_ops!downstream_unrecomputable_ops_mapupstream_unrecomputable_ops_map_set_has_unfusible_on_path_map)selfprogramr/   r!   r!   r"   __init__   s
   zJudgeFusionLoop.__init__c                   s   fddfddfdddd  fd	d
} fdd}j D ]}j|  ||O  < q'tj D ]}j|  ||O  < q:d S )Nc                   s   t  }g } |||  |S Nr&   )r*   defined_valuesused_values_get_used_external_value_implr!   r"   _get_used_external_value   s   zPJudgeFusionLoop._set_has_unfusible_on_path_map.<locals>._get_used_external_valuec           	        s   |  D ]}|| vr|| | | q| D ]}| D ]}| | q| D ]	\}}| | q+q| D ]}|jD ]} | || q?q:| D ]}| | qMd S r6   )operands_sourceappendaddblocksr   kwargsr.   results)	r7   r8   r*   operandblockvalue_Zinner_opZresult_valuer9   r!   r"   r:      s(   


zUJudgeFusionLoop._set_has_unfusible_on_path_map.<locals>._get_used_external_value_implc                   sH   t  } | D ]}| d u rq| }| |  kr!|| q|S r6   )r'   get_defining_opZget_parent_blockr>   )r*   Z	producersrB   Z	source_op)r;   r!   r"   _get_producer_ops   s   
zIJudgeFusionLoop._set_has_unfusible_on_path_map.<locals>._get_producer_opsc                 S  s8   t  }|  D ]}| D ]}|d ur|| qq|S r6   )r'   rA   all_used_ops_in_same_blockr>   )r*   Z	consumersresultZ	parent_opr!   r!   r"   _get_consumer_ops   s   
zIJudgeFusionLoop._set_has_unfusible_on_path_map.<locals>._get_consumer_opsc                   >   t  } | D ]	}|j| O }q|  jv r||  |S r6   )r'   r1   namer/   r>   )curZupstream_unrecomputable_opsnew_op)rG   r3   r!   r"   _get_upstream_ops_recursively      
zUJudgeFusionLoop._set_has_unfusible_on_path_map.<locals>._get_upstream_ops_recursivelyc                   rK   r6   )r'   r0   rL   r/   r>   )rM   Zdownstream_unrecomputable_opsrN   )rJ   r3   r!   r"   _get_downstream_ops_recursively   rP   zWJudgeFusionLoop._set_has_unfusible_on_path_map.<locals>._get_downstream_ops_recursively)r.   r1   reversedr0   )r3   rO   rQ   r*   r!   )rJ   rG   r;   r:   r3   r"   r2      s   





z.JudgeFusionLoop._set_has_unfusible_on_path_mapc                 C  sR   t | j| | j| @ dkot | j| | j| @ dk}|d ur'|d ur'| S dS )Nr   F)lenr0   r1   )r3   Zop1Zop2Zno_unfusible_op_on_pathr!   r!   r"   _has_unfusible_op_on_any_path   s(   z-JudgeFusionLoop._has_unfusible_op_on_any_pathN)__name__
__module____qualname__r5   r2   rT   r!   r!   r!   r"   r$      s    Hr$   c                   @  s   e Zd Zdd Zdd ZdS )	Op2IdxMapc                 C  s,   i | _ t| jD ]	\}}|| j |< q
d S r6   )op_to_idx_map	enumerater-   r.   )r3   r4   idxZop_iterr!   r!   r"   r5     s   zOp2IdxMap.__init__c                 C  s    | j |d r| j | S td)Nzop not found in program)rY   getRuntimeError)r3   r*   r!   r!   r"   get_idx  s   
zOp2IdxMap.get_idxN)rU   rV   rW   r5   r^   r!   r!   r!   r"   rX     s    rX   r4   paddle.static.PrograminputsSequence[pir.Value]outputsgrad_outputsfwd_op_end_idxintbackward_op_start_idxrecomputable_opsSequence[str] | Nonereturn!tuple[paddle.static.Program, int]c           &        s  t d|  ddl}t }t| ||\
}	}
t|	dks%|t|  jkr)| |fS |  j}t|
t}t	|7 }	durCt
	nt
|	t}t}t|| 	t
|B t|}t|}||B }i }| t| }t
|  jd|d  dd }|| |fdd  
fd	d
 fddfdd}	fdd}
|	B |
B D ]}| sq|  dkrq|  tv rqt| dkr| d  dv rq||	v rt d|jddd j|jd dtjd |||j< q||v r"t ddd|jd jd|jd tjd |||j< ||rF|
v rFt ddd|jd jd|jd tjd |||j< |||}j|jd |jd |d |||j< t|d}|D ]}t d|jd|jd j|jd |jd tjd qg| D ];}|v r|| |rt ddd|jd jd|jd tjd t d|jddd j|jd dtjd qq|dd\}}t d | |\}t
 }fd!d"|D D ]\}|fd#d"|D  qt }|D ]!\}} |dd$ | dd% ksJ ||dd$  }|| q|}!|}!t | |!||||\}"}#t d&|" t }$t! rQt"#d'}%|%$t"j% |%&d(|$|  d) |"|#fS )*a  
    Considering the compiler fuse strategy, we model the pir graph.
    Convert the pir calculation graph into a networkx calculation
    graph. Find the cut point through the min-cut algorithm,
    which is the value to be saved in pir forward calculation graph.

    Recompute the forward computation graph to replace intermediate
    variables in the forward graph held by the backward graph.

    .. warning::
        This API is experimental and likely to change.

    Args:
        program (Program): The program to be recomputed.
        inputs:(list[Value]|tuple(Value)): The input Values
            of the forward graph.
        outputs:(list[Value]|tuple(Value)): The out Values
            of the forward graph.
        grad_outputs:(list[Value]|tuple(Value)): initial gradient values
            of `outputs` .
        forward_op_end_idx(int): The index of the last forward op.
        backward_op_start_idx(int): The index of the start backward op.
        recomputable_ops(list[str]|tuple(str)|None): The op names that can
            be recomputed. If 'recompute_ops' is None, we will use the
            default recomputable_ops. Default None.
    Returns:
        recomputed_program(Program): The recomputed program.
        fwd_op_end_idx(int): The index of the last forward op in recomputed program.

    Examples:
        .. code-block:: python

        >>> import numpy as np
        >>> import paddle
        >>> from paddle.autograd.ir_backward import grad as ir_grad
        >>> from paddle.base import core
        >>> from paddle.decomposition import decompose
        >>> def forward(x):
        ...     y = paddle.sin(x)
        ...     z = paddle.cos(y)
        ...     return z

        >>> np_x = np.random.random(size=[4096, 4096]).astype("float32")
        >>> paddle.enable_static()
        >>> core._set_prim_all_enabled(True)
        >>> main_program = paddle.static.Program()
        >>> with paddle.static.program_guard(main_program):
        >>>     x = paddle.static.data(
        >>>         name="x", shape=[4096, 4096], dtype="float32"
        >>>     )
        >>>     x.stop_gradient = False
        >>>     out = forward(x)
        >>>     out_grad = paddle.full(
        >>>         shape=out.shape, fill_value=3, dtype="float32"
        >>>     )
        >>>     [out] = decompose(main_program, [out])
        >>>     [dx] = ir_grad(out, [x], out_grad)
        >>>     main_program, _ = paddle.decomposition.auto_recompute(
        >>>         main_program,
        >>>         [x],
        >>>         [out],
        >>>         grad_outputs=[out_grad],
        >>>         fwd_op_end_idx=2,
        >>>         backward_op_start_idx=4
        >>>     )
        >>>     exe = paddle.static.Executor(paddle.CUDAPlace(0))
        >>>     res = exe.run(
        >>>         feed={'x': np_x},
        >>>         fetch_list=[dx],
        >>>     )
        >>>     print(main_program)
        {
            (%0) = "pd_op.data" () {dtype:(pd_op.DataType)float32,name:"x",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,4096],stop_gradient:[false]} : () -> pd_op.tensor<4096x4096xf32>
            (%1) = "pd_op.sin" (%0) {stop_gradient:[false]} : (pd_op.tensor<4096x4096xf32>) -> pd_op.tensor<4096x4096xf32>
            (%2) = "pd_op.cos" (%1) {stop_gradient:[false]} : (pd_op.tensor<4096x4096xf32>) -> pd_op.tensor<4096x4096xf32>
            (%3) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,4096],stop_gradient:[true],value:(Float)3} : () -> pd_op.tensor<4096x4096xf32>
            (%4) = "pd_op.sin" (%0) {stop_gradient:[false]} : (pd_op.tensor<4096x4096xf32>) -> pd_op.tensor<4096x4096xf32>
            (%5) = "pd_op.sin" (%4) {stop_gradient:[false]} : (pd_op.tensor<4096x4096xf32>) -> pd_op.tensor<4096x4096xf32>
            (%6) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)-1} : () -> pd_op.tensor<1xf32>
            (%7) = "pd_op.scale" (%5, %6) {bias:(Float)0,bias_after_scale:true,stop_gradient:[false]} : (pd_op.tensor<4096x4096xf32>, pd_op.tensor<1xf32>) -> pd_op.tensor<4096x4096xf32>
            (%8) = "pd_op.multiply" (%7, %3) {stop_gradient:[false]} : (pd_op.tensor<4096x4096xf32>, pd_op.tensor<4096x4096xf32>) -> pd_op.tensor<4096x4096xf32>
            (%9) = "pd_op.cos" (%0) {stop_gradient:[false]} : (pd_op.tensor<4096x4096xf32>) -> pd_op.tensor<4096x4096xf32>
            (%10) = "pd_op.multiply" (%9, %8) {stop_gradient:[false]} : (pd_op.tensor<4096x4096xf32>, pd_op.tensor<4096x4096xf32>) -> pd_op.tensor<4096x4096xf32>
            (%11) = "pd_op.fetch" (%10) {col:(Int32)0,is_persistable:[true],name:"fetch0",stop_gradient:[false]} : (pd_op.tensor<4096x4096xf32>) -> pd_op.tensor<4096x4096xf32>
        }
    zprogram before recompute:r   Nr	   c                 S  sb   t  }t  }|  j|d  D ]}| D ]}|| ||r$q|| qq|| }|S r6   )r   ValueSetr-   r.   r<   r>   Zis_no_need_buffer)r4   rf   Zneed_buffer_valuesZ
all_valuesr*   Zop_operand_sourcebw_no_need_buffer_valuesr!   r!   r"   _get_bw_no_need_buffer_values  s   

z5auto_recompute.<locals>._get_bw_no_need_buffer_valuesc                   s    |     v o|    v S r6   )rF   rL   )Zvalue_node1Zvalue_node2)fusible_opsr!   r"   _is_fusible  s   z#auto_recompute.<locals>._is_fusiblec                   s   t  }||  t|dkr@| }t|d}|D ]}|vr* ||s* dS |vr9t|v r9|| qt|dksdS )Nr   TF)r   rk   r>   rS   popfind_value_node_usersget_real_define_op_name)
value_nodeZcur_value_nodesZcur_value_nodeusersuser)ro   rl   forward_opsrequired_fw_value_nodesview_opsr!   r"   _is_materialized_backwards  s$   


z2auto_recompute.<locals>._is_materialized_backwardsc                   s4    |v rdS t  d}t fdd|D  S )NTc                 3  s    | ]} |V  qd S r6   r!   )r)   ru   )ro   rs   r!   r"   	<genexpr>  s    z;auto_recompute.<locals>._is_materialized.<locals>.<genexpr>)rq   all)rs   placeholder_value_nodesrt   )ro   rl   rv   rs   r"   _is_materialized  s   z(auto_recompute.<locals>._is_materializedc                   sZ   t | }|   v r|dkrtS t|dtt|  dd  } | |r)|S |d S )Nr   g?d   r	   r
   )cal_value_node_sizerF   rL   MINIMUM_WEIGHTre   maxmin)rs   r|   Zmem_sz)r~   dist_from_bwtending_to_recompute_opsr!   r"   _get_node_weight  s   
z(auto_recompute.<locals>._get_node_weightc                   s   t r
|   v S |   v rdS |   vrdS  | r$dS |  tkr,dS t| }t| }tdd |D }|d |k S )NFTc                 s  s    | ]}t |V  qd S r6   )r   r)   ir!   r!   r"   rz         z=auto_recompute.<locals>._ban_recomputation.<locals>.<genexpr>r   )AGGRESSIVE_RECOMPUTATIONrF   rL   MAX_DIST_FROM_BWr   get_real_input_nodessum)rs   Zoutput_sizer`   Zinputs_size)ry   r   rg   r   r/   r!   r"   _ban_recomputation  s   z*auto_recompute.<locals>._ban_recomputationbuiltin.combinezbuiltin.splitzbuiltin.slicezadd edge link from: z -> Zsinkz (inf) Z_in)capacityz source z (inf)sourcez(inf)Z_outTz sink z
Cut Value:c                 3  s    | ]	}| | fV  qd S r6   r!   )r)   n)nx_graphr!   r"   rz     s    z!auto_recompute.<locals>.<genexpr>c                 3  s     | ]}| v r|fV  qd S r6   r!   r)   v)non_reachableur!   r"   rz     s    zprogram after recompute:zauto-recomputez(Time of auto recompute program: ***** [ z ] ***** seconds.)'r#   Znetworkxtimeclassify_value_noderS   r-   r.    cal_value_nodes_dist_to_backwardr   r   r'   r   r   r   r   rk   ZDiGraphr$   ZinitializedrF   rL   r   rH   idZadd_edgemathinfrq   rT   Zminimum_cutupdater>   partition_joint_graphr   logging	getLoggersetLevelINFOinfo)&r4   r`   rb   rc   rd   rf   rg   nx
start_timerequired_bw_value_nodesZunclaimed_value_nodesall_opsZdefault_recomputable_opsZ
random_opsZcompute_intensive_opsr|   Zvalue_id_dictZjudge_fusion_looprm   r   r   rs   weightrt   ru   Z	cut_value	partitionZ	reachableZcutsetZnbrsZcut_value_nodesZvalue_node_inZvalue_node_outsaved_valuesZprogram_after_recomputeZfwd_op_end_idx_after_recomputeend_timeloggerr!   )ro   r~   ry   rl   r   rv   rn   r   r   rg   rw   r   r   r/   rx   r"   auto_recompute  sf  
_

















r   r   list[pir.Value]rl   c           
      C  s   t |}t |}t| ||||||}td tdd |D  td tdd |D  d}|D ]}	|t|	7 }q2td|d d d d	 t| ||||\} }| |fS )
a  
    Partition the joint graph, recompute the intermediate values
    by saved values to save memory.
    Args:
        program(Program): The program to be recomputed.
        saved_values(list[valueiable]): The saved values
            of forward graph which used by backward graph.
        inputs:(list[Value]|tuple(Value)): The input Values
            of the forward graph.
        outputs(list[valueiable]): The out values
            of the forward graph.
        forward_op_end_idx(int): The index of the last forward op.
        backward_op_start_idx(int): The index of the start backward op.
    Returns:
        recomputed_program(Program): The recomputed program.
        fwd_op_end_idx(int): The index of the last forward op in
            recomputed program.
    zsaved values: c                 S  &   g | ]}d | d|    dqS (z, )rF   r   r   r!   r!   r"   
<listcomp>     & z)partition_joint_graph.<locals>.<listcomp>zmid values: c                 S  r   r   r   r   r!   r!   r"   r     r   r   zSaved Memory is: i   GB)r   rk   analyze_mid_hold_valuesr#   r   (replace_mid_values_with_forward_subgraph)
r4   r   r`   rb   rl   rd   rf   mid_hold_valuesZmemmidr!   r!   r"   r     s6   

	r   c                 C  s  dd }t | }t|  jd |d  }t|  j|d  }|  j| }	|||}
|
d }|
d }|
d }t| |||	||\}}}|D ]}|dd qF|D ]}|d	d qQt }|D ]}||}|	|| |
| q`t|D ]:}||d }| D ]#}| D ]}||d
r|d u s||| ||k r|| }qq|d usJ |||< qw|D ]	}|||  q| |fS )Nc                   s`    fdd t  }t }|}|D ]
} || ||g  qtdt| td| |||d}|S )Nc           	        s   t |}||  |  }||v s|d u rd S | dv r(| |vr&||  d S | }t|dkrG| dvrGtd|  d|  d| |D ]}||v rY||vrX|| qI ||||| qI|| d S )N)zbuiltin.parameterz
pd_op.datar   )r   r   zEvery path to recompute value zr must have saved value or starting point of the path is one of op in [pd_op.full, pd_op.full_int_array], but find z op, op ir is )listr=   rF   rL   r>   r<   rS   	Exception)	recompute_valuer   Zmarked_recompute_opsZneeded_saved_valueschainZ	new_chain	define_opZ	op_inputsop_input_find_recompute_opsr!   r"   r     s8   



zreplace_mid_values_with_forward_subgraph.<locals>._extract_forward_recompute_subgraph_for_backward.<locals>._find_recompute_opszRecompute Ops: )r`   recompute_opsrb   )r'   r   rk   r#   rS   )r   
mid_valuesZrecompute_subgraph_opsZrecompute_subgraph_inputsZ*recompute_subgraph_outputs_backward_neededr   Zrecompute_subgraphr!   r   r"   0_extract_forward_recompute_subgraph_for_backward  s&   +
zbreplace_mid_values_with_forward_subgraph.<locals>._extract_forward_recompute_subgraph_for_backwardr	   r   r`   rb   Zis_recompute_opTZis_recompute_bw_opr   )rX   r'   r-   r.   clone_graphZset_bool_attrr   rk   Zlook_upZreplace_grad_users_withr>   rR   r\   rA   rH   r^   Zmove_before)r4   r   r   rd   rf   r   op_2_id_maprv   backward_opsZfirst_backward_opZrecompute_forward_subgraph
origin_opsZorigin_subgraph_inputsZorigin_subgraph_outputs
cloned_ops	value_mapcloned_op_first_grad_user_mapZ	origin_opZ	cloned_opZcloned_subgraph_outputsZorigin_valueZcloned_valuer*   Zfirst_subgraph_grad_userZ
op_outputschildr!   r!   r"   r     s`   D
	

r   c           	      C  s|   |   j}t|d |d  }ttd|d }t|   |}tt|d t|}t|   |}||t fS )Nr	   r   )	r-   r.   r'   r   ranger   rk   Zget_values_by_op_idxrS   )	r4   rc   rd   r   Zrequired_fw_opsZrequired_fw_op_idxsrw   Zrequired_bw_op_idxsr   r!   r!   r"   r   q  s   
r   c                   s$  t  }|  }|r| |v r fdd|D }|D ]v}| dkr_| d }| D ]2}| }	|	D ])}
t|
 dkrW|
 d  dv rW|
 d  }|t |O }q3||
 q3q+q| }	|	D ])}
t|
 dkr|
 d  dv r|
 d  }|t |O }qe||
 qeq|S )zP
    Find all the value nodes which use the same value node to be computed.
    c                   s   g | ]}| v r|qS r!   r!   r(   rv   r!   r"   r         z)find_value_node_users.<locals>.<listcomp>r   r   r	   r   )r   rk   rH   rL   rA   rS   r>   )rs   rl   Zwithout_no_need_bufferrv   rt   r.   r*   Zcombine_resultZcombine_res_used_oprA   rI   Zsplit_resultsr!   r   r"   rq     sb   	rq   c                 C  s   t  }|  }| dv r| d }| }| }n| }|D ]}| r=|  dkr=|t |  O }q#|| q#|S )Nr   r   r   )r   rk   rF   rL   r<   r>   )Zoutput_value_nodeZreal_input_nodesr   r   Zreal_define_opZinput_value_nodesZinput_value_noder!   r!   r"   r     s    

r   c                 C  s4   |   }| dv r| d }|   S | S )Nr   r   )rF   rL   r<   )rs   r   r   r!   r!   r"   rr     s
   rr   c                 C  s$   zd| j v W S    td|  d)Nz!value node not found in program:  )shape
ValueErrorr}   r!   r!   r"   is_dynamic_value_node  s   r   c                 C  s*   z	|    d uW S    td|  d)Nzvalue node illegal: r   )typeas_vec_typer   r}   r!   r!   r"   is_vector_value_node  s   r   c                 C  s<   t | rdd | jD }n| j}tdd |dt| j  S )Nc                 S  s   g | ]}|d kr|qS )r   r!   r   r!   r!   r"   r     r   z,cal_value_node_size_impl.<locals>.<listcomp>c                 S  s   | | S r6   r!   )xyr!   r!   r"   <lambda>  s    z*cal_value_node_size_impl.<locals>.<lambda>r	   )r   r   	functoolsreduce_PADDLE_DTYPE_2_NBYTESZdtype)rs   Zvalue_node_shaper!   r!   r"   cal_value_node_size_impl  s   r   c                 C  s>   t | r|    }d}|D ]}|t|7 }q|S t| S Nr   )r   r   r   as_listr   )rs   Z	value_vecZsum_res
child_noder!   r!   r"   r     s   r   c           	      C  s   t  }t| D ]J}| dkrq| }|D ]:}| }t|dkr,|d  dv r,qt|}||vr9d||< qtd||< |D ]}t	|| || d ||< qAqq|S )Nr   r	   r   r   g    eA)
r   Z	ValueDictrR   rL   rA   rH   rS   rq   re   r   )	r   rw   r   r*   Z
op_results	op_resultZused_opsZ
real_usersru   r!   r!   r"   r     s(   

r   c                 C  s   dd }t t|| S )Nc                 S  s*   |   dkrt| d dkrdS dS )Nr   r   FT)rL   rS   rI   rH   )r*   r!   r!   r"   filter_unused_combine)  s   z;all_used_op_consider_combine.<locals>.filter_unused_combine)r   filterrH   )r4   rD   r   r!   r!   r"   all_used_op_consider_combine(  s   r   c                   s   t |  jd |d  }t |  j|d   t }|D ]4}	|	 D ]-}
t| |
}t fdd|D rQ|
|vrQ|
|vrQ|
|vrQ|
|vrQ|	 t	vrQ|
|
 q$q|S )Nr	   c                 3  s    | ]}| v V  qd S r6   r!   )r)   Zused_opr   r!   r"   rz   F  r   z*analyze_mid_hold_values.<locals>.<genexpr>)r'   r-   r.   r   rk   rA   r   anyrL   r   r>   )r4   r   r`   rb   Zno_need_buffer_valuesrd   rf   rv   r   r*   rI   Zall_used_opsr!   r   r"   r   6  s"   	

r   c                 C  sF   d }|   d  D ]}||v r |d u s||||k r |}q
|S r   )rA   rH   r^   )Zfwd_opr   r   first_backward_use_opZuser_opr!   r!   r"   get_first_backward_use_opQ  s   r   c              	   C  s   t | |  j}tj  }t|}g }i }	|D ]}
||
|
 q|D ]A}||v re||tj 	ddd}t
|||}|d urX|drX|drX|d|j |d|j || |d ure||	|< q$t |   |||	fS )NFTop_rolechunk_id)r   Zset_insertion_pointr-   r.   paddleZ	IrMappingr'   r>   cloneZCloneOptionsr   Zhas_attrZset_int_attrr   r   r=   Z set_insertion_point_to_block_end)r4   r   Zgraph_inputsZclone_insertion_opr   r   r   r   r   r   Zinput_valuer*   rN   r   r!   r!   r"   r   ]  s:   




r   r6   )r4   r_   r`   ra   rb   ra   rc   ra   rd   re   rf   re   rg   rh   ri   rj   )r4   r_   r   r   r`   r   rb   r   rl   r   rd   re   rf   re   ri   rj   )@
__future__r   r   r   r   r   r   typingr   r   r   Zpaddle.autogradr   Zpaddle.baser   Zpaddle.base.frameworkr   collections.abcr   ZDataTypeZBOOLZFLOAT16ZBFLOAT16ZFLOAT32ZFLOAT64ZFLOAT8_E4M3FNZFLOAT8_E5M2ZINT8ZINT16ZINT32ZINT64ZUINT8Z	COMPLEX64Z
COMPLEX128r   r   __annotations__r   r   r   r   r   r   r   r   r#   r$   rX   r   r   r   r   rq   r   rr   r   r   r   r   r   r   r   r   r   r!   r!   r!   r"   <module>   s   Xd   
> '
6	
