o
    * iz                     @  sX  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZm	Z	m
Z
 d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d	d
lmZ erpd dlmZmZ d dlmZ d dlm Z  G dd de
Z!g Z"dd Z#dd Z$dd Z%G dd dZ&e& Z'ej(			d*ddZ)G dd deZ*	d+ddZ+dd  Z,d,d(d)Z-dS )-    )annotationsN)TYPE_CHECKINGAny	TypedDict)	framework)PyLayer)EagerParamBase)get_rng_state_tracker)corein_dynamic_mode   )logger)CallableSequence)NotRequired)
Sequentialc                   @  s"   e Zd ZU dZded< ded< dS )_Ctx   intsegmentszNotRequired[bool]preserve_rng_stateN)__name__
__module____qualname__r   __annotations__ r   r   x/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/paddle/distributed/fleet/recompute/recompute.pyr   ,   s   
 r   c                 C  s:   t | j}td| j| j| j| jd|}| | |S )N)shapedtype	trainablenamer   )	copydeepcopy__dict__r   r   r   r   r    _share_buffer_to)paramstate	new_paramr   r   r   _varbase_help4   s   
r(   c                 C  s   g }| D ]p}t |tjjs"t|tust |d tjjs"|| qt |tr/|t| qt|tu rgg }|D ]%}t |tjjsDJ t |trQ|t| q9|	 }|j
|_
|| q9|t| q|	 }|j
|_
|| qt|S )Nr   )
isinstancer
   eagerTensortypetupleappendr   r(   detachstop_gradient)inputsoutinpZ
detach_inpiZtmp_ixr   r   r   detach_variableA   s0   


r6   c                 C  sp   g }| D ]&}t |tjr||j qt|tu r*|D ]}t |tjr)||j qqt|r6t	d d S d S )Nz[Recompute]: None of the inputs to current recompute block need grad, therefore there is NO need to recompute this block in backward !)
r)   paddler+   r.   r0   r,   r-   allr   warning)r1   Znecessary_for_each_inputZinput_r4   r   r   r   check_recompute_necessarye   s   r:   c                   @  s(   e Zd Zd Zdd Zdd Zdd ZdS )CustomStatesManagerc                 C  s   d| _ d| _dS )__init__N)custom_get_state_funccustom_set_state_func)selfr   r   r   r<   y   s   
zCustomStatesManager.__init__c                 C      d}| j d u sJ ||| _ d S Nz=The custom_state_manager does not support duplicate settings.)r=   )r?   r=   
assert_msgr   r   r   set_custom_get_state_func~      
z-CustomStatesManager.set_custom_get_state_funcc                 C  r@   rA   )r>   )r?   r>   rB   r   r   r   set_custom_set_state_func   rD   z-CustomStatesManager.set_custom_set_state_funcN)r   r   r   __doc__r<   rC   rE   r   r   r   r   r;   v   s
    r;   c              	   c  s   t  }t  }t |  t | tj }	t	 }
tj
| t| |d ur@|d us3J |d us9J | }|| z&d V  W t | t | tj
|	 t|
 |d ure|| d S d S t | t | tj
|	 t|
 |d ur|| w w N)r7   get_rng_stater	   get_states_trackerZset_rng_stateZset_states_trackernprandom	get_stategetstateZ	set_statesetstate)Z	rng_statetrackerZnumpy_stateZrandom_stateZcustom_stater=   r>   Zorig_rng_stateZorig_rng_trackerZorig_numpy_stateZorig_random_stateZorig_custom_stater   r   r   switch_rng_state_tracker   s<   









rP   c                   @  s$   e Zd Zedd Zedd ZdS )RecomputeFunctionc                 O  s  || _ || _|| _|| _| jr/t | _t  | _	t
j | _t | _| | _|| _|| _t }|jtjjkr<dnd| _|jtjjkrJd| _n|jtjjtjjfv rYd| _ntd|j |jdkrjd| _n|jdv rsd| _ntd	|j |  \| _!| _"t#  ||i |}	W d    n1 sw   Y  g | _$g | _%d
d t&t'|D | _(g }
t)|D ]\}}t*|r|| jv rt+ r|, n|- }|.| |
/| | j%/| | j$/d  qt0|t1u rB|| jvsJ d| dd |D }t2|r2dd |D }t2|st3|rtd|
/| | j%/| d| j(|< | j$/d  qt3|r;td| j$/| q| j$/| q| j4|
  |	S )NFTO2O1zunsupported amp level: float16bfloat16Zfloat32rV   zunsupported amp dtype: c                 S  s   g | ]}d qS )Fr   ).0_r   r   r   
<listcomp>   s    z-RecomputeFunction.forward.<locals>.<listcomp>z;offload_indices should not contain tensor tuple in positionc                 S  s   g | ]}t |qS r   )r7   	is_tensorrW   ar   r   r   rY     s    c                 S  s   g | ]}|j qS r   r0   r[   r   r   r   rY     s    zJRecompute receive a tuple containing tensor holds different stop gradient.zHRecompute receive a tuple containing tensor and non-tensor at same time.)5run_functionr   offload_indiceskwargsr7   rH   fw_rng_stater	   rI   fwd_rng_state_trackerrJ   rK   rL   fwd_numpy_staterM   fwd_random_statefwd_custom_stater=   r>   r   _dygraph_tracer
_amp_levelr
   AmpLevelO0is_fw_autocastrR   	amp_levelrS   
ValueError
_amp_dtype	amp_dtype_get_amp_op_listamp_white_listamp_black_listZno_gradr1   tensor_indicesrangelenduplicate_tensor	enumeraterZ   Zis_compiled_with_cudaZ
pin_memorycpur$   r.   r,   r-   r8   anyZsave_for_backward)ctxr^   r   r_   r=   r>   argsr`   traceroutputsZtensor_inputsr4   argZcpu_argZ
is_tensorsZtensors_stop_gradientr   r   r   forward   s   














zRecomputeFunction.forwardc              
   G  s:  t jj  t| j}| j}| j}|  }t	|D ]'\}}|| j
v r/|| t jj n|| ||< || j
v rB|| j|| _qt }d|_| jrt| j| j| j| j| j| j| j7 t jj| j| j| j| j| j d t!t"|}	| j#|	i | j$}
W d    n1 sw   Y  W d    n1 sw   Y  n/t jj| j| j| j| j| j d t!t"|}	| j#|	i | j$}
W d    n1 sw   Y  t%|
t&j'j(r|
f}
t)|
t)|ksJ g }g }t*t)|
D ] }t%|
| t&j'j(r|
| js|+|
|  |+||  qt)|dkrt,dt jjdd t j-.|| W d    n	1 s1w   Y  g }t	|	D ]?\}}t%|t&j'j(rQ|+|/  q<t0|t"u rz|| rzt1dd |D rn|+d  q<|+t"d	d |D  q<t2 rt"|}nt|}|W  d    S 1 sw   Y  d S )
NTenableZcustom_white_listZcustom_black_listlevelr   r   zHnone of output has requires_grad=True, this recompute() is not necessaryF)r   c                 s  s    | ]}|j V  qd S rG   r]   rW   r4   r   r   r   	<genexpr>  s    z-RecomputeFunction.backward.<locals>.<genexpr>c                 s  s    | ]}|  V  qd S rG   )
_grad_ivarr   r   r   r   r     s    )3r7   baseZdygraphguardlistr1   rr   ru   Zsaved_tensorrv   r_   tor   Z_current_expected_placer0   rf   	_has_gradr   rP   ra   rb   rc   rd   re   r=   r>   amp	auto_castrj   rp   rq   rk   rn   r6   r-   r^   r`   r)   r
   r*   r+   rt   rs   r.   RuntimeErrorautogradbackwardr   r,   r8   r   )ry   rz   r1   rr   ru   Ztensorsr4   idxr{   Zdetached_inputsr|   Zforward_outputs_with_gradZbackward_inputs_with_gradZgradsr3   r   r   r   r   #  s   




 

&zRecomputeFunction.backwardN)r   r   r   staticmethodr~   r   r   r   r   r   rQ      s
    
lrQ   Tc                   s  rLt  }d|v rt  	n,d|v rt  	n#d|v r!t  	n|dd t j v r3t |	ntd| dt 	 
t
j t  t }|jtjjkrYdnd	|jtjjkred
n|jtjjtjjfv rrd|jdkrzdn|jdv rd| \G dd d t g  fdd}	
fdd}	t j||	 i }
W d   |
S 1 sw   Y  |
S )z
    recompute without reentrant, that means use hook to implement the recompute function rather than re-entrant autograd.
    zgpu:rw   zxpu::r   z;Recompute with RNG preserve is not support current device: .FTrR   rS   rT   rU   rV   c                   @  s   e Zd ZdS )z9_recompute_without_reentrant.<locals>.Intermediate_HolderN)r   r   r   r   r   r   r   Intermediate_Holder  s    r   c                   s     } t| |S rG   )r.   weakrefref)r5   res)r   holder_listr   r   pack  s   z*_recompute_without_reentrant.<locals>.packc                   s  d t dkr͇ fdd}dd }rt	
V tdA tjjd& tj|| i  W d    n1 sLw   Y  W d    n1 s[w   Y  W d    n1 sjw   Y  W d    n1 syw   Y  nNtdA tjjd& tj|| i  W d    n1 sw   Y  W d    n1 sw   Y  W d    n1 sw   Y  | vrtd	| S )	Nr   c                   s   d7  d   d u rd S | d u rd  d   < d S t | ds(| jd ur3|  d   < d S |  r>tj| }ntj| j| j| jd tj	j
j| j}| | | d   < d S )Nr   Z	main_gradcpy)hasattrZgradZis_distr
   r*   r+   r   r   r    ZVarDescZVarTypeZDENSE_TENSORZpersistableZ_unsafe_share_buffer_to)inner_xZ
tmp_tensor)r   storageunpack_counterr   r   
inner_pack  s*   
z@_recompute_without_reentrant.<locals>.unpack.<locals>.inner_packc                 S  s   t d)Nz*An unexpected backward called on a tensor!)	Exception)r   r   r   r   inner_unpack  s   zB_recompute_without_reentrant.<locals>.unpack.<locals>.inner_unpackTr   zaNot supported to retrieve a tensor saved by autograd multiple times that is no need to recompute.)
rt   rP   r7   Zset_grad_enabledr   r   r   saved_tensors_hooksr   pop)r5   r   r   )rq   rn   rk   rp   rz   r=   r>   functionfw_cuda_rng_statefwd_cuda_rng_state_trackerre   rc   rd   r   rj   r`   r   r   )r   r   unpack  s~   
   	  
z,_recompute_without_reentrant.<locals>.unpackN)r7   Z
get_deviceZget_cuda_rng_staterH   splitZdeviceZget_all_custom_device_typer   r	   rI   rJ   rK   rL   rM   r   rf   rg   r
   rh   ri   rR   rS   rm   ro   r   WeakKeyDictionaryr   r   )r   r=   r>   r   rz   r`   Z
cur_devicer{   r   r   r|   r   )r   rq   rn   rk   rp   rz   r=   r>   r   r   r   re   rc   rd   r   rj   r`   r   r   r   _recompute_without_reentrant  sT   






.P
r   c                 O  s  | dd}| dd}tjdu r#tjdu sJ ddd}ddd}ntj}tj}t s;dd	lm} || |i |S t j	rQt
|}|t
|  t| |r| d
g }	g }
t| tjjrit| j}nt| }|j|i |}|  t|j |j D ]:\}}|j|jkr|
| q|j|j|jfv r|
| q|j|jkr|
|  q|j|j krt!dt!dt"j#| ||	||g|
R  S t$| |||g|R i |S )a   
    recompute intermediate activations to save then memory.

    Parameters:
        function(paddle.nn.Layer): layer of sequence of layers that describes part of forward pass of the model
              whose intermediate activations will be released to save memory in forward stage and will be recomputed
              in backward stage for gradient calculation.
        *args(Tensor): inputs to the function.
        **kwargs(Dict): Kwargs should only contain two kinds of key-value params, the one is part of function's key-value params,
                        and the other contains 'preserve_rng_state' and 'use_reentrant'. the key-value pair of preserve_rng_state,
                        which is used to indicate whether to save the forward rng. If it is True, then the last forward rng value
                        will be restored when the forward recalculation of backpropagation is performed, its default value is True.
                        the key-value pair of use_reentrant is used to indicate which implementation of recompute you will be used.
                        'use_reentrant=True' means to use the PyLayer implementation of recompute, 'use_reentrant=False' means to
                        use the Hook implementation of recompute, its default value is True.
    Returns:
        Output of function on args.

    Examples:
        .. code-block:: python

            >>> # doctest: +REQUIRES(env:DISTRIBUTED, env:GPU)
            >>> import paddle
            >>> from paddle.distributed.fleet.utils import recompute
            >>> import random
            >>> paddle.seed(2023)
            >>> def get_fc_block(block_idx, input_size, is_last=False):
            ...     block_name = "block_" + str(block_idx)
            ...     block = paddle.nn.Sequential(
            ...         (block_name + "_fc_0", paddle.nn.Linear(input_size, input_size, bias_attr=False)),
            ...         (block_name + "_dropout", paddle.nn.Dropout(p=0.5)),
            ...         (block_name + "_relu_1", paddle.nn.ReLU()),
            ...         (block_name + "_fc_1", paddle.nn.Linear(input_size, input_size, bias_attr=False)),
            ...         (block_name + "_relu_2", paddle.nn.ReLU()),
            ...     )
            ...     if is_last:
            ...         block.add_sublayer(
            ...             block_name + "_fc_2",
            ...             paddle.nn.Linear(
            ...                 input_size, 1, bias_attr=False
            ...             )
            ...         )
            ...     else:
            ...         block.add_sublayer(
            ...             block_name + "_fc_2",
            ...             paddle.nn.Linear(input_size, input_size, bias_attr=False)
            ...         )
            ...     return block

            >>> class Naive_fc_net(paddle.nn.Layer):
            ...     def __init__(self, input_size=10,
            ...                 recompute_blocks=[1, 3],
            ...                 recompute_kwargs={}):
            ...         super().__init__()
            ...         self.recompute_blocks = recompute_blocks
            ...         self.recompute_kwargs = recompute_kwargs
            ...         self.runfunc0 = get_fc_block(0, input_size, is_last=False)
            ...         self.runfunc1 = get_fc_block(1, input_size, is_last=False)
            ...         self.runfunc2 = get_fc_block(2, input_size, is_last=False)
            ...         self.runfunc3 = get_fc_block(3, input_size, is_last=False)
            ...         self.runfunc4 = get_fc_block(4, input_size, is_last=True)
            ...         self.total_func = [self.runfunc0, self.runfunc1, self.runfunc2, self.runfunc3, self.runfunc4]
            ...     def forward(self, inputs):
            ...         nums = len(self.total_func)
            ...         for i in range(nums):
            ...             if i in self.recompute_blocks:
            ...                 inputs = recompute(self.total_func[i], inputs, **{"preserve_rng_state": True})
            ...             else:
            ...                 inputs = self.total_func[i](inputs)
            ...         return inputs

            >>> def run_model(cuda_state, recompute_block=[], recompute_kwargs={}):
            ...     gen = paddle.seed(10)
            ...     gen.manual_seed(10)
            ...     random.seed(10)
            ...     if cuda_state:
            ...         paddle.set_cuda_rng_state(cuda_state)
            ...     batch_size, input_size = 1, 10
            ...     model = Naive_fc_net(
            ...         input_size,
            ...         recompute_blocks=recompute_block,
            ...         recompute_kwargs=recompute_kwargs)
            ...     optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
            ...     loss_ = []
            ...     param_ = []
            ...     grad_ = []
            ...     for _ in range(5):
            ...         x = paddle.rand(shape=[batch_size, input_size], dtype="float32")
            ...         y_pred = model(x)
            ...         loss = y_pred.mean()
            ...         loss_.append(loss.item())
            ...         loss.backward()
            ...         optimizer.step()
            ...         param_.append(model.parameters()[9])
            ...         grad_.append(model.parameters()[3]._grad_ivar())
            ...         optimizer.clear_grad()
            ...     return loss_, param_, grad_

            >>> cuda_state = paddle.get_cuda_rng_state()
            >>> # without recompute
            >>> loss_ref, param_ref, grad_ref = run_model(
            ...     cuda_state, recompute_block=[]
            ... )

            >>> loss, param, grad = run_model(cuda_state, recompute_block=[1, 2])
            >>> print("normal_loss: {}, recompute_loss: {}".format(loss_ref, loss))
            >>> # The result of the recompute_loss should be the same as the normal_loss.
            normal_loss: [0.0018744759727269411, 0.0, 0.035971127450466156, 0.0, 0.0], recompute_loss: [0.0018744759727269411, 0.0, 0.035971127450466156, 0.0, 0.0]

    r   Tuse_reentrantNc                 S     d S rG   r   r5   r   r   r   <lambda>      zrecompute.<locals>.<lambda>c                 S  r   rG   r   r   r   r   r   r     r   r   )	recomputer_   zCurrently, keyword-only arguments are not supported when you want to send kwargs(dict parameter) to function with use_reentrant=True.zUnknown parameter kind.rG   )%r   custom_state_managerr=   r>   r   Z*paddle.distributed.auto_parallel.interfacer   r   rf   r   r   extendvaluesr:   r)   r7   nnZLayerinspect	signaturer~   bindapply_defaultszip	arguments
parameterskindVAR_POSITIONALPOSITIONAL_ONLYPOSITIONAL_OR_KEYWORDr.   VAR_KEYWORDKEYWORD_ONLYrl   rQ   applyr   )r   rz   r`   preserver   r=   r>   Zstatic_auto_recomputeZ
check_argsr_   Z
input_argsZ
dyfunc_sigZ
bound_argsr}   r%   r   r   r   r   #  sv   p



	r   ry   	functions)Sequential | Sequence[Callable[..., Any]]rz   r   r`   returnc           
      O  s   |  dd}|  dd}dd }t|tjjrt| }t|| }d}td||d  |D ]}	|	| d }t	||	||g|R d|i|}q/||d t|d || S )	a  
    recompute intermediate activations to save the memory for 'Sequential' models. use 'ctx' to transmit some context params, it is similar to 'recompute_hybrid' API.

    Parameters:
        ctx(dict): include 'segments' and  'preserve_rng_state' keys, the key 'segments' (int, default 1), represents the number of chunks to create in the model,
                   the key 'preserve_rng_state' (bool, optional, default=True) indicate whether to save the forward rng. If it is True, then the last forward rng value will be
                   restored when the forward recalculation of backpropagation is performed.
        functions(paddle.nn.Sequential): layer of sequence of layers that describes part of forward pass of the model
              whose intermediate activations will be released to save memory in forward stage and will be recomputed
              in backward stage for gradient calculation.
        *args(Tensor): inputs(tuple) to the function.
        **kwargs(Dict): inputs(dict) to the function.

    Returns:
        Output of function on args and kwargs.

    Examples:
        .. code-block:: python

            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
            >>> import paddle
            >>> from paddle.incubate.distributed.fleet import recompute_sequential
            >>> input = paddle.ones(shape=[8, 10])
            >>> model = paddle.nn.Sequential(paddle.nn.Linear(10, 10), paddle.nn.Linear(10, 2))
            >>> output = recompute_sequential({'segments' : 1}, model, input)

    r   r   r   Tc                   s    fdd}|S )Nc                   s$   t  d D ]}| | } q| S )Nr   )rs   )inputr4   beginendfuncsr   r   do_run  s   z7recompute_sequential.<locals>._run_func.<locals>.do_runr   )r   r   r   r   r   r   r   	_run_func  s   z'recompute_sequential.<locals>._run_funcr   )
getr)   r7   r   r   r   childrenrt   rs   r   )
ry   r   rz   r`   r   r   r   Zsegment_sizer   r   r   r   r   recompute_sequential  s&   !
r   )NNN)T)
ry   r   r   r   rz   r   r`   r   r   r   ).
__future__r   
contextlibr!   r   rK   r   typingr   r   r   numpyrJ   r7   r   Zpaddle.autogradr   Zpaddle.base.frameworkr   Z=paddle.distributed.fleet.meta_parallel.parallel_layers.randomr	   Zpaddle.frameworkr
   r   Zutils.log_utilr   collections.abcr   r   Ztyping_extensionsr   Z	paddle.nnr   r   __all__r(   r6   r:   r;   r   contextmanagerrP   rQ   r   r   r   r   r   r   r   <module>   sN   $$ `
  <