from __future__ import annotations

import warnings
from typing import TYPE_CHECKING

import paddle
from paddle import _C_ops, pir
from paddle.framework import in_dynamic_or_pir_mode
from paddle.regularizer import L2Decay

from ..base import core, framework
from .optimizer import Optimizer

if TYPE_CHECKING:
    from collections.abc import Sequence

    from typing_extensions import NotRequired

    from paddle import Tensor
    from paddle.nn.clip import GradientClipBase
    from paddle.regularizer import WeightDecayRegularizer

    from .lr import LRScheduler
    from .optimizer import _ParameterConfig

__all__ = []


class _MomentumParameterConfig(_ParameterConfig):
    momentum: NotRequired[float]
    use_nesterov: NotRequired[bool]
    rescale_grad: NotRequired[float]
    regularization_method: NotRequired[str]
    regularization_coeff: NotRequired[float]


class Momentum(Optimizer):
    r"""

    Simple Momentum optimizer with velocity state

    This optimizer has a flag for Nesterov Momentum.

    The update equations are as follows:

    .. math::

        & velocity = mu * velocity + gradient

        & if (use\_nesterov):

        &\quad   param = param - (gradient + mu * velocity) * learning\_rate

        & else:

        &\quad   param = param - learning\_rate * velocity

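    A single step of the plain (non-Nesterov) branch above, written with
    Python floats purely for illustration (the values are made up; in
    practice the update runs inside a fused operator):

    .. code-block:: python

        >>> mu, lr = 0.5, 0.25
        >>> velocity, gradient, param = 0.0, 2.0, 1.0
        >>> velocity = mu * velocity + gradient   # velocity -> 2.0
        >>> param = param - lr * velocity         # else-branch update
        >>> param
        0.5
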
    Parameters:

        learning_rate (float|Tensor|LRScheduler, optional): The learning rate used to update ``Parameter``.
            It can be a float value, a ``Tensor`` with a float type or a LRScheduler. The default value is 0.001.
        momentum (float, optional): Momentum factor. The default value is 0.9.
        parameters (list|tuple|None, optional): List|Tuple of ``Tensor`` to update to minimize ``loss``. \
            This parameter is required in dygraph mode. And you can specify different options for \
            different parameter groups such as the learning rate, weight decay, etc, \
            then the parameters are list of dict. Note that the learning_rate in parameter groups \
            represents the scale of base learning_rate. \
            The default value is None in static graph mode, at this time all parameters will be updated.
        use_nesterov (bool, optional): Enables Nesterov momentum. The default value is False.
        weight_decay (int|float|WeightDecayRegularizer|None, optional): The strategy of regularization. \
            It can be an int or float value as the coeff of L2 regularization or \
            :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
            If a parameter has already set a regularizer using :ref:`api_paddle_ParamAttr`, \
            the regularization setting here in the optimizer will be ignored for this parameter. \
            Otherwise, the regularization setting here in the optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
        multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is False.
        rescale_grad (float, optional): Multiply the gradient with ``rescale_grad`` before updating. \
            Often chosen to be ``1.0/batch_size``.
        use_multi_tensor (bool, optional): Whether to use the multi-tensor strategy to update all parameters at once. Default is False.
        name (str|None, optional): The default value is None. Normally there is no need for the user
                to set this property. For more information, please refer to
                :ref:`api_guide_Name` .

    Examples:
        .. code-block:: python

            >>> import paddle

            >>> inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1)
            >>> linear = paddle.nn.Linear(10, 10)
            >>> inp = paddle.to_tensor(inp)
            >>> out = linear(inp)
            >>> loss = paddle.mean(out)
            >>> momentum = paddle.optimizer.Momentum(
            ...     learning_rate=0.1,
            ...     parameters=linear.parameters(),
            ...     weight_decay=0.01
            ... )
            >>> out.backward()
            >>> momentum.step()
            >>> momentum.clear_grad()

            >>> # Note that the learning_rate of linear_2 is 0.01.
            >>> linear_1 = paddle.nn.Linear(10, 10)
            >>> linear_2 = paddle.nn.Linear(10, 10)
            >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
            >>> out = linear_1(inp)
            >>> out = linear_2(out)
            >>> loss = paddle.mean(out)
            >>> momentum = paddle.optimizer.Momentum(
            ...     learning_rate=0.1,
            ...     parameters=[{ # type: ignore
            ...         'params': linear_1.parameters()
            ...     }, {
            ...         'params': linear_2.parameters(),
            ...         'weight_decay': 0.001,
            ...         'learning_rate': 0.1
            ...     }],
            ...     weight_decay=0.01,
            ...     momentum=0.9
            ... )
            >>> out.backward()
            >>> momentum.step()
            >>> momentum.clear_grad()
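
            >>> # Multi-tensor variant (an illustrative sketch reusing the
            >>> # layers above): use_multi_tensor=True asks the optimizer to
            >>> # fuse the per-parameter updates into one merged kernel call.
            >>> momentum = paddle.optimizer.Momentum(
            ...     learning_rate=0.1,
            ...     parameters=linear_1.parameters() + linear_2.parameters(),
            ...     use_multi_tensor=True
            ... )
            >>> out = linear_2(linear_1(inp))
            >>> loss = paddle.mean(out)
            >>> loss.backward()
            >>> momentum.step()
            >>> momentum.clear_grad()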

    """

    _velocity_acc_str = "velocity"

    def __init__(
        self,
        learning_rate: float | Tensor | LRScheduler = 0.001,
        momentum: float = 0.9,
        parameters: (
            Sequence[Tensor] | Sequence[_MomentumParameterConfig] | None
        ) = None,
        use_nesterov: bool = False,
        weight_decay: float | WeightDecayRegularizer | None = None,
        grad_clip: GradientClipBase | None = None,
        multi_precision: bool = False,
        rescale_grad: float = 1.0,
        use_multi_tensor: bool = False,
        name: str | None = None,
    ) -> None:
        if learning_rate is None:
            raise ValueError("learning_rate is not set")
        if momentum is None:
            raise ValueError("momentum is not set")
        if isinstance(weight_decay, int):
            weight_decay = float(weight_decay)
        predicate = lambda regular: isinstance(regular, (L2Decay, float))
        if isinstance(parameters, list) and isinstance(parameters[0], dict):
            for param_group in parameters:
                decay = (
                    param_group['weight_decay']
                    if 'weight_decay' in param_group
                    else weight_decay
                )
                reg_method, reg_coeff = self._update_regularization(decay)
                param_group['regularization_method'] = reg_method
                param_group['regularization_coeff'] = reg_coeff
                py_regular = None if predicate(decay) else decay
                param_group['weight_decay'] = py_regular

        py_regular = None if predicate(weight_decay) else weight_decay
        super().__init__(
            learning_rate=learning_rate,
            parameters=parameters,
            weight_decay=py_regular,
            grad_clip=grad_clip,
            name=name,
        )
        self.type = "momentum"
        self._momentum = momentum
        self._use_nesterov = bool(use_nesterov)
        (
            self._regularization_method,
            self._regularization_coeff,
        ) = self._update_regularization(weight_decay)
        self._multi_precision = multi_precision
        self._rescale_grad = rescale_grad
        self._master_weights = {}
        self._default_dict = {
            'momentum': momentum,
            'use_nesterov': use_nesterov,
            'rescale_grad': rescale_grad,
            'regularization_method': self._regularization_method,
            'regularization_coeff': self._regularization_coeff,
        }
        self._use_multi_tensor = use_multi_tensor
        if self._use_multi_tensor:
            self._param_dict = self._create_multi_tensor_dict()
            self._velocity_dict = self._create_multi_tensor_dict()
            self._master_weight_dict = self._create_multi_tensor_dict()
            self._master_weight_dict['FP32_DenseTensor'] = None
            self._regularization_method_dict = self._create_multi_tensor_dict()
            self._regularization_coeff_dict = self._create_multi_tensor_dict()

    def _update_regularization(self, weight_decay):
        reg_method = ""
        reg_coeff = 0.0
        if isinstance(weight_decay, L2Decay):
            reg_method = "l2_decay"
            reg_coeff = weight_decay._coeff
        if isinstance(weight_decay, float):
            reg_method = "l2_decay"
            reg_coeff = weight_decay
        return reg_method, reg_coeff
zMomentum._update_regularizationc                 C  s   t |tjtjjfsJ t |tr| |}|D ]B}|j| jv r!q| j	r>| 
|jr>| |}| | j| | j|j q| 
|jrL| j	sLtd | | j| | j|j qdS )zD
        if framework.in_dynamic_mode():
            return
        zAccumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence.Consider using multi_precision=True option of the Momentum optimizer.N)r6   r
   Blockpaddler   r<   _update_param_groupr1   Z_already_create_accumulatorrE   _is_dtype_fp16_or_bf16dtypeZ_create_master_weightZ_add_accumulator_velocity_acc_straddwarningswarn)rN   blockr(   pZmaster_pr   r   r   _create_accumulators   s*   



zMomentum._create_accumulatorsc                   s*   t |drt|jtr|S t |||S )zpCreate and add backward regularization Operators

        Function helper of append_regularization_ops.
    def _append_optimize_op(self, block, param_and_grad):
        if not isinstance(block, (framework.Block, pir.Block)):
            raise TypeError("block is not instance of Block.")

        if isinstance(param_and_grad, dict):
            param_and_grad = self._update_param_group(param_and_grad)

        velocity_acc = self._get_accumulator_master(
            self._velocity_acc_str, param_and_grad[0]
        )
        lr = self._create_param_lr(param_and_grad)

        # For the fusion of momentum and L2Decay.
        param = param_and_grad[0]
        regularization_method = self._regularization_method
        regularization_coeff = self._regularization_coeff
        if hasattr(param, 'regularizer'):
            # L2Decay
            if isinstance(param.regularizer, L2Decay):
                regularization_method = "l2_decay"
                regularization_coeff = param.regularizer._coeff
            # L1Decay
            elif param.regularizer is not None:
                regularization_method = ""
                regularization_coeff = 0.0

        find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
            param_and_grad[0].dtype
        )
        master_weight = (
            self._master_weights[param_and_grad[0].name]
            if find_master
            else None
        )

        if in_dynamic_or_pir_mode():
            if isinstance(param_and_grad, dict):
                self._update_regularization(param_and_grad['weight_decay'])
            return _C_ops.momentum_(
                param_and_grad[0],
                param_and_grad[1],
                velocity_acc,
                lr,
                master_weight,
                self._momentum,
                self._use_nesterov,
                regularization_method,
                regularization_coeff,
                find_master,
                self._rescale_grad,
            )

        attrs = {
            "mu": self._momentum,
            "use_nesterov": self._use_nesterov,
            "regularization_method": regularization_method,
            "regularization_coeff": regularization_coeff,
            "multi_precision": find_master,
            "rescale_grad": self._rescale_grad,
        }
        inputs = {
            "Param": [param_and_grad[0]],
            "Grad": [param_and_grad[1]],
            "Velocity": [velocity_acc],
            "LearningRate": [lr],
        }
        outputs = {
            "ParamOut": [param_and_grad[0]],
            "VelocityOut": [velocity_acc],
        }
        if find_master:
            inputs["MasterParam"] = master_weight
            outputs["MasterParamOut"] = master_weight

        # Create the static-graph momentum op.
        momentum_op = block.append_op(
            type=self.type,
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
            stop_gradient=True,
        )
        return momentum_op
zMomentum._append_optimize_opc                 C  s^  |  || |D ]}| | j|}| j}| j}t|dr2t|jtr)d}|jj	}n	|jdur2d}d}|j
tjkra| jd | | | jd | | | jd | | | jd | | q| |j
r| jd | | | jd | | | jr| jd | | j|j  nd| jd |< | jd | | | jd | | qtddS )	a  
        All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, bf16, float32).
        This function will be overridden in the corresponding optimizer file.

        Args:
            target_block: the block in which the loss tensor is present
            parameters: list of parameter tensors for the optimizer
        rd   rV   NrT   rU   r8   FP16_DenseTensorz^Now multi_tensor_momentum only support fp32, fp16 or bf16 parameters and grad is DENSE_TENSOR.)rc   ru   r]   rC   rD   re   r6   rd   r   rW   r\   rY   float32rI   appendrJ   rL   rM   r[   rE   rK   rG   r1   r9   )rN   target_blockr(   param_group_idxrg   ry   r   r   r   r   r   _multi_tensor_initl  sr   	




zMomentum._multi_tensor_initc                 C  s  t |tjsJ g g d}g g d}t |trz|D ]_}|d du r"q|d jdu rx|d jtjkrQ|d jt	j
jjkrQ|d |d  | |}|d | q| |d jrx|d jt	j
jjkrx|d |d  | |}|d | qn{|d D ]v}|d du rq~|d jdu ri }||d< |d	d
 | D  | |}|d jtjkr|d jt	j
jjkr|d |d  | |}|d | q~| |d jr|d jt	j
jjkr|d |d  | |}|d | q~ddg}	|	D ]}
t| j|
 | dkr| jo|
dk}| j|
 }|dur|| nd}t r| d}|r@t |t	jjtjjfr?| dd qt |t	jjtjjfrR| dd t| j|
 | ||
 | j |
 | ||
 || j!| j"| j#|
 | | j$|
 | || j%\}}}q| j|
 | ||
 | j |
 | ||
 d}| j|
 | | j |
 | d}| j!| j"| j#|
 | | j$|
 | d}|r| j|
 | |d< | j|
 | |d< ||d< |j&d|||dd qdS )zM
        For Multi Tensor, append optimize merged_operator to block.
        )r8   r~   r   Nr   Fr8   r~   paramsc                 S  s   i | ]\}}|d kr||qS )r   r   ).0kvr   r   r   
<dictcomp>  s
    z=Momentum._append_optimize_multi_tensor_op.<locals>.<dictcomp>	found_infTri   rl   )rh   r   r   r   rm   rn   r/   Zmerged_momentumro   )'r6   r
   rX   r;   rs   r\   rY   r   r@   r	   ZVarDescZVarTypeZDENSE_TENSORr   rv   r[   updateitemsrZ   lenrI   rE   rK   r   Z_get_auxiliary_vareagerr   r   ValueZ_set_auxiliary_varr   Zmerged_momentum_rJ   rA   rB   rL   rM   rF   rw   )rN   r   Zparameters_and_gradsr   Z	grad_dictZlr_dictrx   rz   Zparam_grad_dictZmulti_tensor_listkeyr{   r|   r   _rp   rq   rr   r   r   r    _append_optimize_multi_tensor_op  s  	











z)Momentum._append_optimize_multi_tensor_opc                 C  sr   | d| jd | _| d| jd | _| d| jd | _| d| jd | _| d| jd | _| d}|S )Nr   r   r   r   r   r   )getrH   rA   rB   rF   rC   rD   )rN   r(   r   r   r   rZ   A  s"   





zMomentum._update_param_group)
r"   r#   NFNNFr$   FN)r%   r&   r   r'   r(   r)   r   r*   r+   r,   r-   r.   r/   r*   r   r'   r0   r*   r1   r2   r3   r4   r5   )r   r   r   __doc__r]   r?   r=   rc   rf   r}   r   r   rZ   __classcell__r   r   rR   r   r    3   s,    _HW@ r    )
__future__r   r_   typingr   rY   r   r   Zpaddle.frameworkr   Zpaddle.regularizerr   baser	   r
   Z	optimizerr   collections.abcr   Ztyping_extensionsr   r   Zpaddle.nn.clipr   r   rz   r   r   r   __all__r    r   r   r   r   <module>   s(   