from __future__ import annotations

import warnings
from typing import TYPE_CHECKING

import paddle
from paddle import _C_ops
from paddle.base.framework import in_dynamic_or_pir_mode

from ..base import framework
from ..base.dygraph import no_grad
from .optimizer import Optimizer

if TYPE_CHECKING:
    from collections.abc import Sequence

    from typing_extensions import NotRequired

    from paddle import Tensor
    from paddle.nn.clip import GradientClipBase
    from paddle.regularizer import WeightDecayRegularizer

    from .lr import LRScheduler
    from .optimizer import _ParameterConfig

    # Extra per-parameter-group options accepted by Adadelta on top of the
    # common ``_ParameterConfig`` keys.
    class _AdadeltaParameterConfig(_ParameterConfig):
        epsilon: NotRequired[float]
        rho: NotRequired[float]


__all__ = []


class Adadelta(Optimizer):
    r"""
    **Notes: This API does not support sparse parameter optimization.**

    Adadelta Optimizer. Please refer to the following paper for details:
    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD <https://arxiv.org/abs/1212.5701>`_.

    The update is done as follows:

    .. math::

        E(g_t^2) &= \rho * E(g_{t-1}^2) + (1-\rho) * g^2

        learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \epsilon ) / ( E(g_t^2) + \epsilon ) }

        E(dx_t^2) &= \rho * E(dx_{t-1}^2) + (1-\rho) * (-g*learning\_rate)^2
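
    The following is a minimal scalar sketch of this update rule for a single
    parameter (illustrative only; the helper ``adadelta_update`` and its
    variable names are not part of the API):

    .. code-block:: python

        >>> def adadelta_update(param, grad, avg_sq_grad, avg_sq_update, rho=0.95, epsilon=1.0e-6):
        ...     # E(g_t^2) = rho * E(g_{t-1}^2) + (1 - rho) * g^2
        ...     avg_sq_grad = rho * avg_sq_grad + (1 - rho) * grad ** 2
        ...     # learning_rate = sqrt((E(dx_{t-1}^2) + eps) / (E(g_t^2) + eps))
        ...     lr = ((avg_sq_update + epsilon) / (avg_sq_grad + epsilon)) ** 0.5
        ...     delta = -lr * grad
        ...     # E(dx_t^2) = rho * E(dx_{t-1}^2) + (1 - rho) * delta^2
        ...     avg_sq_update = rho * avg_sq_update + (1 - rho) * delta ** 2
        ...     return param + delta, avg_sq_grad, avg_sq_update
        >>> new_param, _, _ = adadelta_update(1.0, 0.5, 0.0, 0.0)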

    Args:
        learning_rate (float|Tensor|LRScheduler, optional): The learning rate used to update ``Parameter``.
            It can be a float value, a ``Tensor`` with a float type, or an ``LRScheduler`` instance. The default value is 0.001.
        epsilon (float, optional): A small float added for numerical stability. Default 1.0e-6.
        rho (float, optional): The decay rate of the moving averages of squared gradients and squared updates. Default 0.95.
        parameters (list|tuple|None, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
            This parameter is required in dygraph mode. You can also specify different options \
            (such as the learning rate or weight decay) for different parameter groups by passing \
            a list of dicts. Note that the learning_rate in a parameter group \
            is a scale factor applied to the base learning_rate. \
            The default value is None in static graph mode, in which case all parameters will be updated.
        weight_decay (int|float|WeightDecayRegularizer|None, optional): The regularization strategy. \
            It can be an int or float value used as the coefficient of L2 regularization, or \
            :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
            If a parameter already has a regularizer set via :ref:`api_paddle_ParamAttr`, \
            the regularization setting here in the optimizer is ignored for that parameter. \
            Otherwise, the regularization setting here in the optimizer takes effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase|None, optional): Gradient clipping strategy; it must be an instance of
            a class derived from ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
        name (str|None, optional): The default value is None. Normally there is no need for the user
            to set this property. For more information, please refer to
            :ref:`api_guide_Name` .

    Examples:
        .. code-block:: python

            >>> import paddle

            >>> inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1)
            >>> linear = paddle.nn.Linear(10, 10)
            >>> out = linear(inp)
            >>> loss = paddle.mean(out)
            >>> adadelta = paddle.optimizer.Adadelta(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
            >>> out.backward()
            >>> adadelta.step()
            >>> adadelta.clear_grad()

            >>> # Note that the learning_rate of linear_2 is 0.01.
            >>> linear_1 = paddle.nn.Linear(10, 10)
            >>> linear_2 = paddle.nn.Linear(10, 10)
            >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
            >>> out = linear_1(inp)
            >>> out = linear_2(out)
            >>> loss = paddle.mean(out)
            >>> adadelta = paddle.optimizer.Adadelta(
            ...     learning_rate=0.1,
            ...     parameters=[{  # type: ignore
            ...         'params': linear_1.parameters()
            ...     }, {
            ...         'params': linear_2.parameters(),
            ...         'weight_decay': 0.001,
            ...         'learning_rate': 0.1,
            ...     }],
            ...     weight_decay=0.01)
            >>> out.backward()
            >>> adadelta.step()
            >>> adadelta.clear_grad()

    """

    type: str
    # Names of the two per-parameter state accumulators: the running average
    # of squared gradients and the running average of squared updates.
    _avg_squared_grad_acc_str = "_avg_squared_grad"
    _avg_squared_update_acc_str = "_avg_squared_update"

    def __init__(
        self,
        learning_rate: float | Tensor | LRScheduler = 0.001,
        epsilon: float = 1.0e-6,
        rho: float = 0.95,
        parameters: (
            Sequence[Tensor] | Sequence[_AdadeltaParameterConfig] | None
        ) = None,
        weight_decay: float | WeightDecayRegularizer | None = None,
        grad_clip: GradientClipBase | None = None,
        name: str | None = None,
    ) -> None:
        if learning_rate is None:
            raise ValueError("learning_rate is not set.")
        if epsilon is None:
            raise ValueError("epsilon is not set.")
        if rho is None:
            raise ValueError("rho is not set.")
        super().__init__(
            learning_rate=learning_rate,
            parameters=parameters,
            weight_decay=weight_decay,
            grad_clip=grad_clip,
            name=name,
        )
        self._multi_precision = False
        self._master_weights = {}
        self.type = "adadelta"
        self._epsilon = epsilon
        self._rho = rho
        self._default_dict = {
            'epsilon': epsilon,
            'rho': rho,
        }

    def _create_accumulators(self, block, parameters):
        if not isinstance(block, (framework.Block, paddle.pir.Block)):
            raise TypeError("block is not instance of framework.Block.")

        if isinstance(parameters, dict):
            parameters = parameters.get('params')

        for p in parameters:
            if p.name in self._already_create_accumulator:
                continue
            if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
                # Keep an FP32 master copy of the parameter and attach the
                # accumulators to it instead of the low-precision parameter.
                master_p = self._create_master_weight(p)
                self._add_accumulator(self._avg_squared_grad_acc_str, master_p)
                self._add_accumulator(
                    self._avg_squared_update_acc_str, master_p
                )
                self._already_create_accumulator.add(p.name)
                continue
            if (
                self._is_dtype_fp16_or_bf16(p.dtype)
                and not self._multi_precision
            ):
                warnings.warn(
                    "Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence. "
                    "Consider using multi_precision=True option of the Adadelta optimizer."
                )
            self._add_accumulator(self._avg_squared_grad_acc_str, p)
            self._add_accumulator(self._avg_squared_update_acc_str, p)
            self._already_create_accumulator.add(p.name)

    def _append_optimize_op(self, block, param_and_grad):
        if isinstance(param_and_grad, dict):
            param_and_grad = self._update_param_group(param_and_grad)

        avg_squared_grad_acc = self._get_accumulator_master(
            self._avg_squared_grad_acc_str, param_and_grad[0]
        )
        avg_squared_update_acc = self._get_accumulator_master(
            self._avg_squared_update_acc_str, param_and_grad[0]
        )
        find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
            param_and_grad[0].dtype
        )
        master_weight = (
            self._master_weights[param_and_grad[0].name]
            if find_master
            else None
        )

        if in_dynamic_or_pir_mode():
            # Dygraph / PIR mode: call the fused C++ kernel in place.
            with no_grad():
                _C_ops.adadelta_(
                    param_and_grad[0],
                    param_and_grad[1],
                    avg_squared_grad_acc,
                    avg_squared_update_acc,
                    self._create_param_lr(param_and_grad),
                    master_weight,
                    self._rho,
                    self._epsilon,
                    find_master,
                )
            return None
        else:
            # Static graph mode: append an adadelta operator to the block.
            if not isinstance(block, (framework.Block, paddle.pir.Block)):
                raise TypeError("block is not instance of framework.Block.")

            inputs = {
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "AvgSquaredGrad": avg_squared_grad_acc,
                "AvgSquaredUpdate": avg_squared_update_acc,
                "LearningRate": self._create_param_lr(param_and_grad),
            }
            outputs = {
                "ParamOut": param_and_grad[0],
                "AvgSquaredGradOut": avg_squared_grad_acc,
                "AvgSquaredUpdateOut": avg_squared_update_acc,
            }
            if find_master:
                inputs["MasterParam"] = master_weight
                outputs["MasterParamOut"] = master_weight

            adadelta_op = block.append_op(
                type=self.type,
                inputs=inputs,
                outputs=outputs,
                attrs={
                    "epsilon": self._epsilon,
                    "rho": self._rho,
                    "multi_precision": find_master,
                },
                stop_gradient=True,
            )

            return adadelta_op

    def _update_param_group(self, parameters):
        # Per-group overrides for epsilon / rho fall back to the values given
        # to the constructor.
        self._epsilon = parameters.get(
            'epsilon', self._default_dict['epsilon']
        )
        self._rho = parameters.get('rho', self._default_dict['rho'])
        parameters = parameters.get('params')
        return parameters