o
    * i%                     @   sX   d dl Z d dlmZmZmZ d dlmZ d dlmZm	Z	 d dl
mZ G dd deZdS )    N)_C_ops_legacy_C_opspir)	framework)in_dynamic_modein_pir_mode)	Optimizerc                       sJ   e Zd ZdZdZ										d fd	d
	Zdd Zdd Z  ZS )LarsMomentumOptimizera  
    Momentum optimizer with LARS support

    The update equations are as follows:

    .. math::

        & local\_learning\_rate = learning\_rate * lars\_coeff * \\
          \\frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||}

        & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param + epsilon)

        & param = param - velocity

    Parameters:
        learning_rate (float|Variable): The learning rate used to update parameters. \
            Can be a float value or a Variable with one float value as data element. \
            momentum (float): momentum factor
        lars_coeff (float): Defines how much we trust the layer to change its weights.
        lars_weight_decay (float): Weight decay coefficient for decaying using LARS.
        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static graph mode, at this time all parameters will be updated.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
             :ref:`api_paddle_regularizer_L1Decay` , :ref:`api_paddle_regularizer_L2Decay` . If a parameter has set \
            regularizer using :ref:`api_paddle_ParamAttr` already, the regularization setting here in optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect.  \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.
        exclude_from_weight_decay (list[str], optional): Name string of layers which will be exclude from lars weight decay. Default is None.
        epsilon (float, optional): Epsilon to avoid Division by Zero when calculate local lr. Default is 0.
        multi_precision (bool, optional): Whether to use multi-precision during weight updating.
        rescale_grad (float, optional): Multiply the gradient with `rescale_grad` \
            before updating. Often choose to be `1.0/batch_size`.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> import numpy as np

            >>> paddle.enable_static()
            >>> np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
            >>> inp = paddle.static.data(
            ...     name="inp", shape=[2, 2], dtype='float32')
            >>> out = paddle.static.nn.fc(inp, size=3)
            >>> out = paddle.sum(out)
            >>> optimizer = paddle.incubate.optimizer.LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9)
            >>> optimizer.minimize(out)

            >>> exe = paddle.static.Executor(paddle.CPUPlace())
            >>> exe.run(paddle.static.default_startup_program())
            >>> exe.run(
            ...     feed={"inp": np_inp},
            ...     fetch_list=[out.name])
    velocityMbP?Mb@?Nr   F      ?c                    s   |d usJ |d usJ t  j|||||d d| _|| _t|| _t|| _t|
| _|	d u r4g | _n|	| _|| _	t|| _
i | _d S )N)learning_rate
parametersZweight_decay	grad_clipnamelars_momentum)super__init__type	_momentumfloat_lars_coeff_lars_weight_decay_epsilon_exclude_from_weight_decay_multi_precision_rescale_grad_master_weights)selfr   Zmomentum
lars_coefflars_weight_decayZparameter_listZregularizationr   r   Zexclude_from_weight_decayepsilonmulti_precisionrescale_grad	__class__ s/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/paddle/incubate/optimizer/lars_momentum.pyr   Z   s(   




zLarsMomentumOptimizer.__init__c                 C   s~   t |tjtjfstd|D ]-}| jr'| |jr'| |}| 	| j
| q| |jr5| js5td | 	| j
| qd S )Nblock is not instance of Block.zAccumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence.Consider using multi_precision=True option of the Lars optimizer.)
isinstancer   Blockr   	TypeErrorr   _is_dtype_fp16_or_bf16dtypeZ_create_master_weightZ_add_accumulator_velocity_acc_strwarningswarn)r   blockr   pZmaster_pr'   r'   r(   _create_accumulators   s    

z*LarsMomentumOptimizer._create_accumulatorsc                 C   s  t |tjtjfstd| j}|d j}t| jdkr*| jD ]
}||v r)d} nq| 	| j
|d }| |}| joB| |d j}|rM| j|d j nd }	| j| j|g|| j| jd}
|d |d ||d}|d |d}|ry|	|d< |	|d	< t rt|d g|d g|g|g|d g|gd
| jd| jd|gd|d| jd| j\}}d S t rt |	tjr|	g}	t|d g|d g|g|g|	| j| j|g| j|| j\}}}d S |j| j|||
dd}|S )Nr)   r   g        )mur    r!   r#   r"   r$      )ParamZGradVelocityZLearningRate)ZParamOutZVelocityOutZMasterParamZMasterParamOutr5   r    r!   r#   r"   r$   T)r   inputsoutputsattrsZstop_gradient)r*   r   r+   r   r,   r   r   lenr   Z_get_accumulator_masterr/   Z_create_param_lrr   r-   r.   r   r   r   r   r   r   r   r   r   Valuer   Zlars_momentum_Z	append_opr   )r   r2   Zparam_and_gradr   
param_namer   Zvelocity_acclrZfind_masterZmaster_weightr;   r9   r:   tmpZtmp2_Zmomentum_opr'   r'   r(   _append_optimize_op   s   






z)LarsMomentumOptimizer._append_optimize_op)
r   r   NNNNNr   Fr   )	__name__
__module____qualname____doc__r/   r   r4   rB   __classcell__r'   r'   r%   r(   r	      s     >%r	   )r0   Zpaddler   r   r   Zpaddle.baser   Zpaddle.frameworkr   r   Zpaddle.optimizerr   r	   r'   r'   r'   r(   <module>   s   