from __future__ import annotations

from typing import TYPE_CHECKING

from paddle import _C_ops, pir
from paddle.base.executor import global_scope

from ..base import core, framework
from ..base.framework import Variable
from .optimizer import Optimizer

if TYPE_CHECKING:
    from collections.abc import Callable, Sequence

    from typing_extensions import NotRequired

    from paddle import Tensor
    from paddle.nn.clip import GradientClipBase

    from .optimizer import _ParameterConfig

    class _LambParameterConfig(_ParameterConfig):
        beta1: NotRequired[float | Tensor]
        beta2: NotRequired[float | Tensor]
        epsilon: NotRequired[float | Tensor]
        lamb_weight_decay: NotRequired[float]
        exclude_from_weight_decay_fn: NotRequired[
            Callable[[Tensor], bool] | None
        ]


__all__ = []


class Lamb(Optimizer):
    r"""
    LAMB (Layer-wise Adaptive Moments optimizer for Batch training) Optimizer.

    LAMB Optimizer is designed to scale up the batch size of training without losing
    accuracy. It supports adaptive element-wise updating and accurate layer-wise
    correction. For more information, please refer to `Large Batch Optimization for
    Deep Learning: Training BERT in 76 minutes <https://arxiv.org/abs/1904.00962>`_ .

    The updating of parameters follows:

    ..  math::

        m_t &= \beta_1 m_{t - 1}+ (1 - \beta_1)g_t

        v_t &= \beta_2 v_{t - 1}  + (1 - \beta_2)g_t^2

        m_t &= \frac{m_t}{1 - \beta_1^t}

        v_t &= \frac{v_t}{1 - \beta_2^t}

        r_t &= \frac{m_t}{\sqrt{v_t}+\epsilon}

        w_t &= w_{t-1} -\eta_t \frac{\left \| w_{t-1}\right \|}{\left \| r_t + \lambda w_{t-1}\right \|} (r_t + \lambda w_{t-1})


    where :math:`m` is the 1st moment, :math:`v` the 2nd moment, :math:`\eta` the
    learning rate, and :math:`\lambda` the LAMB weight decay rate. The norm ratio
    :math:`\left \| w_{t-1}\right \| / \left \| r_t + \lambda w_{t-1}\right \|` is the
    layer-wise trust ratio that rescales the Adam-style step for each weight tensor.

    Args:
        learning_rate (float|Tensor, optional): the learning rate used to update parameters. \
            Can be a float value or a Variable with data type float32. Default 0.001.
        lamb_weight_decay (float, optional): The LAMB weight decay rate. Default 0.01. Note that the
            generic ``weight_decay`` option is not used by Lamb and should stay None.
        beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
            Default 0.9.
        beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates.
            Default 0.999.
        epsilon (float|Tensor, optional): A small float value for numerical stability. Default 1e-6.
        parameters (list|tuple|None, optional): Iterable of ``Tensor`` to update to minimize ``loss``. \
            This parameter is required in dygraph mode. And you can specify different options for \
            different parameter groups such as the learning rate, weight decay, etc, \
            then the parameters are list of dict. Note that the learning_rate in parameter groups \
            represents the scale of base learning_rate. \
            The default value is None in static graph mode, at this time all parameters will be updated.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_paddle_base_clip_ClipGradByGlobalNorm` , :ref:`api_paddle_base_clip_ClipGradByNorm` ,
            :ref:`api_paddle_base_clip_ClipGradByValue` ). If you want better convergence, it is recommended
            to use :ref:`api_paddle_base_clip_ClipGradByGlobalNorm` . Default None, meaning there is no gradient clipping.
        exclude_from_weight_decay_fn (Callable|None, optional): A function that takes a parameter ``Tensor``
            and returns True if weight decay should be skipped for that parameter (see the second
            code example below). Default None.
        multi_precision (bool, optional): Whether to keep a float32 master copy of each parameter
            and perform the update in multi-precision. Default False.
        always_adapt (bool, optional): Whether to always apply layer-wise LR adaptation. By default,
            adaptation is skipped for parameters that are excluded from weight decay; if always_adapt
            is True, it is always enabled. Default False.
        name (str|None, optional): For detailed information, please refer to
            :ref:`api_guide_Name` . Generally there is no need to set this; it is None by default.
    Examples:
        .. code-block:: python

            >>> import paddle

            >>> inp = paddle.uniform(shape=[10, 10], dtype='float32', min=-0.1, max=0.1)
            >>> linear = paddle.nn.Linear(10, 10)
            >>> out = linear(inp)
            >>> loss = paddle.mean(out)
            >>> beta1 = paddle.to_tensor([0.9], dtype="float32")
            >>> beta2 = paddle.to_tensor([0.85], dtype="float32")
            >>> lamb = paddle.optimizer.Lamb(
            ...     learning_rate=0.002,
            ...     beta1=beta1,
            ...     beta2=beta2,
            ...     parameters=linear.parameters(),
            ...     lamb_weight_decay=0.01
            ... )
            >>> out.backward()
            >>> lamb.step()
            >>> lamb.clear_grad()
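
            >>> # A second, illustrative snippet (not part of the upstream example): parameter
            >>> # groups with a per-group option, plus a hypothetical rule that skips weight
            >>> # decay for bias parameters via exclude_from_weight_decay_fn.
            >>> linear_a = paddle.nn.Linear(10, 10)
            >>> linear_b = paddle.nn.Linear(10, 10)
            >>> lamb_grouped = paddle.optimizer.Lamb(
            ...     learning_rate=0.002,
            ...     parameters=[
            ...         {'params': linear_a.parameters()},
            ...         {'params': linear_b.parameters(), 'lamb_weight_decay': 0.0},
            ...     ],
            ...     lamb_weight_decay=0.01,
            ...     exclude_from_weight_decay_fn=lambda p: 'bias' in p.name
            ... )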

    moment1moment2beta1_pow_accbeta2_pow_accMbP?{Gz??+?ư>NFlearning_ratefloat | Tensorr   floatr   r   r   
parameters8Sequence[Tensor] | Sequence[_LambParameterConfig] | None	grad_clipGradientClipBase | Noner   Callable[[Tensor], bool] | Nonemulti_precisionboolalways_adaptname
str | NonereturnNonec                   s   |d usJ |d usJ |d usJ |d usJ t  j||d ||d d| _|| _|| _|| _|| _|| _|||||d| _i | _	i | _
|	| _|
| _d S )N)r)   r,   weight_decayr.   r4   Zlamb)r   r   r   r   r   )super__init__type_beta1_beta2_epsilon_lamb_weight_decay_exclude_from_weight_decay_fn_default_dict_master_weights_used_master_weights_multi_precisionr3   )selfr)   r   r   r   r   r,   r.   r   r1   r3   r4   	__class__r   r   r:      s6   
zLamb.__init__c                 C  sz   |d u rt  }|| }| j|}|d ur7|| }| | ks)J | | ks3J ||fS d }||fS N)r   Zfind_varZ
get_tensorrC   getZ_dtypeshape)rE   r4   scopeZp_tZmaster_nameZ
master_p_tr   r   r   _get_parameter   s   zLamb._get_parameterc                 C  s   t |tjtjfstdt |tr| |}|D ]0}|j| jv r"q| j	r=| 
|jr=| |}| | | j|j q| | | j|j qd S )Nblock is not instance of Block.)
isinstancer	   Blockr   	TypeErrordict_update_param_groupr4   Z_already_create_accumulatorrD   _is_dtype_fp16_or_bf16dtypeZ_create_master_weight_add_moments_powsadd)rE   blockr,   pZmaster_pr   r   r   _create_accumulators   s   




zLamb._create_accumulatorsc              	   C  s   |j }| |rtjjj}| j| j||d | j| j||d | j| j	||t
| jtr-dn| jdgtjjjdd | j| j||t
| jtrGdn| jdgtjjjdd d S )N)rT   r&   r   cpu)r4   paramrT   Z
fill_valuerJ   r;   Zdevicer'   )rT   rS   r   ZVarDescZVarTypeZFP32Z_add_accumulator_moment1_acc_str_moment2_acc_str_beta1_pow_acc_strrN   r<   r
   ZDENSE_TENSOR_beta2_pow_acc_strr=   )rE   rX   Z	acc_dtyper   r   r   rU      s.   


zLamb._add_moments_powsc                 C  s  t |tjtjfstdt |tr| |}d|j_| 	| j
|d }| 	| j|d }| 	| j|d }| 	| j|d }| jd urN| |d rNd}n| j}| |}| jo`| |d j}	|d j}
|	rt| j|
 }|j| j|
< nd }t rt|d |d ||||||d || j| j| j| j|	 d S |d |d |||||d}|d ||||d}| j| j| j|| j|	d}|	r||d	< ||d
< | d}|r||d< |j| j |||dd}|S )NrM   Tr   g        r   )ParamZGradZLearningRateZMoment1ZMoment2ZBeta1PowZBeta2Pow)ZParamOutZ
Moment1OutZ
Moment2OutZBeta1PowOutZBeta2PowOut)r   r   r   r8   r3   r1   ZMasterParamZMasterParamOut	found_infZ
SkipUpdate)r;   inputsoutputsattrsZstop_gradient)!rN   r	   rO   r   rP   rQ   rR   programZ	_use_lambZ_get_accumulator_masterr\   r]   r^   r_   r@   r?   Z_create_param_lrrD   rS   rT   r4   rB   rC   Zin_dynamic_or_pir_moder   Zlamb_r<   r=   r>   r3   Z_get_auxiliary_varZ	append_opr;   )rE   rW   Zparam_and_gradr    r!   r"   r#   r8   lrZfind_masterZp_nameZmaster_weightrb   rc   rd   ra   Zlamb_opr   r   r   _append_optimize_op   s   











	
zLamb._append_optimize_opc                 C  sr   | d| jd | _| d| jd | _| d| jd | _| d| jd | _| d| jd | _| d}|S )Nr   r   r   r   r   params)rI   rA   r<   r=   r>   r?   r@   )rE   r,   r   r   r   rR   [  s   

zLamb._update_param_group)r$   r%   r&   r'   r(   NNNFFN)r)   r*   r   r+   r   r*   r   r*   r   r*   r,   r-   r.   r/   r   r0   r1   r2   r3   r2   r4   r5   r6   r7   rH   )r   r   r   __doc__r\   r]   r^   r_   r:   rL   rY   rU   rg   rR   __classcell__r   r   rF   r   r   2   s.    N
.hr   N)