from __future__ import annotations

from collections import defaultdict
from functools import reduce
from typing import TYPE_CHECKING, Any, Literal, TypeVar

import paddle
from paddle.optimizer import Optimizer
from paddle.utils import deprecated

from .line_search_dygraph import _strong_wolfe

if TYPE_CHECKING:
    from collections.abc import Callable, Sequence

    from paddle import Tensor
    from paddle.nn.clip import GradientClipBase
    from paddle.optimizer.optimizer import _ParameterConfig
    from paddle.regularizer import WeightDecayRegularizer

_T_co = TypeVar("_T_co", covariant=True)


@deprecated(
    since="2.5.0",
    update_to="paddle.optimizer.LBFGS",
    level=1,
)
class LBFGS(Optimizer):
    r"""
    The L-BFGS is a quasi-Newton method for solving an unconstrained optimization problem over a differentiable function.
    Closely related is the Newton method for minimization. Consider the iterate update formula:

    .. math::
        x_{k+1} = x_{k} - H_k \nabla f_k

    If :math:`H_k` is the inverse Hessian of :math:`f` at :math:`x_k`, this is exactly the
    Newton method. If :math:`H_k` is instead a symmetric, positive-definite approximation
    of the inverse Hessian, the iteration is a quasi-Newton method. In practice, such
    approximations are built from gradients alone, using either the whole search history
    (BFGS) or only a bounded window of the most recent iterations (L-BFGS).
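
    A standard way to realize such an approximation is the BFGS update of the inverse
    Hessian, stated here for reference (L-BFGS never forms :math:`H_k` explicitly and
    instead replays the last ``history_size`` curvature pairs):

    .. math::
        H_{k+1} = (I - \rho_k s_k y_k^T) H_k (I - \rho_k y_k s_k^T) + \rho_k s_k s_k^T,
        \quad \rho_k = \frac{1}{y_k^T s_k}

    where :math:`s_k = x_{k+1} - x_k` and :math:`y_k = \nabla f_{k+1} - \nabla f_k`.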

    Reference:
        Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp179: Algorithm 7.5 (L-BFGS).

    Args:
        learning_rate (float, optional): learning rate. The default value is 1.0.
        max_iter (int, optional): maximal number of iterations per optimization step.
            The default value is 20.
        max_eval (int, optional): maximal number of function evaluations per optimization
            step. The default value is max_iter * 1.25.
        tolerance_grad (float, optional): termination tolerance on first-order optimality.
            The default value is 1e-07.
        tolerance_change (float, optional): termination tolerance on function
            value/parameter changes. The default value is 1e-9.
        history_size (int, optional): update history size. The default value is 100.
        line_search_fn (str, optional): either 'strong_wolfe' or None. With None, no line
            search is performed and a fixed step derived from ``learning_rate`` is used
            (see Notes below). The default value is None.
        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
            This parameter is required in dygraph mode. The default value is None.
        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
            It can be a float value as the coefficient of L2 regularization or \
            :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
            If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already, \
            the regularization setting here in optimizer will be ignored for this parameter. \
            Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of \
            some derived class of ``GradientClipBase`` . There are three clipping strategies \
            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , \
            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.
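
    Notes:
        When ``line_search_fn='strong_wolfe'``, each step size :math:`\alpha_k` along the
        search direction :math:`d_k` is chosen to satisfy the strong Wolfe conditions,
        sketched below with the customary constants :math:`c_1 = 10^{-4}` and
        :math:`c_2 = 0.9` (these values are illustrative and may differ from the ones
        used internally by the line search):

        .. math::
            f(x_k + \alpha_k d_k) \le f(x_k) + c_1 \alpha_k \nabla f_k^T d_k

        .. math::
            |\nabla f(x_k + \alpha_k d_k)^T d_k| \le c_2 |\nabla f_k^T d_k|

        With ``line_search_fn=None``, a fixed step derived from ``learning_rate`` is taken
        instead.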

    Return:
        loss (Tensor): the final loss of closure.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> import numpy as np
            >>> from paddle.incubate.optimizer import LBFGS

            >>> paddle.disable_static()
            >>> np.random.seed(0)
            >>> np_w = np.random.rand(1).astype(np.float32)
            >>> np_x = np.random.rand(1).astype(np.float32)

            >>> inputs = [np.random.rand(1).astype(np.float32) for i in range(10)]
            >>> # y = 2x
            >>> targets = [2 * x for x in inputs]

            >>> class Net(paddle.nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...         w = paddle.to_tensor(np_w)
            ...         self.w = paddle.create_parameter(shape=w.shape, dtype=w.dtype, default_initializer=paddle.nn.initializer.Assign(w))
            ...     def forward(self, x):
            ...         return self.w * x

            >>> net = Net()
            >>> opt = LBFGS(learning_rate=1, max_iter=1, max_eval=None, tolerance_grad=1e-07, tolerance_change=1e-09, history_size=100, line_search_fn='strong_wolfe', parameters=net.parameters())
            >>> def train_step(inputs, targets):
            ...     def closure():
            ...         outputs = net(inputs)
            ...         loss = paddle.nn.functional.mse_loss(outputs, targets)
            ...         print('loss: ', loss.item())
            ...         opt.clear_grad()
            ...         loss.backward()
            ...         return loss
            ...     opt.step(closure)

            >>> for input, target in zip(inputs, targets):
            ...     input_tensor = paddle.to_tensor(input)
            ...     target_tensor = paddle.to_tensor(target)
            ...     train_step(input_tensor, target_tensor)

    """

    learning_rate: float
    max_iter: int
    max_eval: int
    tolerance_grad: float
    tolerance_change: float
    history_size: int
    line_search_fn: Literal['strong_wolfe'] | None
    state: dict[str, dict[str, Any]]

    def __init__(
        self,
        learning_rate: float = 1.0,
        max_iter: int = 20,
        max_eval: int | None = None,
        tolerance_grad: float = 1e-07,
        tolerance_change: float = 1e-09,
        history_size: int = 100,
        line_search_fn: Literal['strong_wolfe'] | None = None,
        parameters: Sequence[Tensor] | Sequence[_ParameterConfig] | None = None,
        weight_decay: float | WeightDecayRegularizer | None = None,
        grad_clip: GradientClipBase | None = None,
        name: str | None = None,
    ) -> None:
        if max_eval is None:
            max_eval = max_iter * 5 // 4

        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.max_eval = max_eval
        self.tolerance_grad = tolerance_grad
        self.tolerance_change = tolerance_change
        self.history_size = history_size
        self.line_search_fn = line_search_fn

        if isinstance(parameters, paddle.Tensor):
            raise TypeError(
                "parameters argument given to the optimizer should be an "
                f"iterable of Tensors or dicts, but got {type(parameters)}"
            )

        self.state = defaultdict(dict)

        super().__init__(
            learning_rate=learning_rate,
            parameters=parameters,
            weight_decay=weight_decay,
            grad_clip=grad_clip,
            name=name,
        )

        if not isinstance(self._parameter_list[0], dict):
            self._params = self._parameter_list
        else:
            for idx, param_group in enumerate(self._param_groups):
                self._params = param_group['params']

        self._numel_cache = None

    def state_dict(self) -> dict[str, dict[str, Any]]:
        """Returns the state of the optimizer as a :class:`dict`.

        Return:
            state, a dict holding current optimization state. Its content
                differs between optimizer classes.
        """
        packed_state = {}
        for k, v in self.state.items():
            packed_state.update({k: v})

        return {'state': packed_state}

    def _numel(self):
        # Total number of elements across all trainable parameters (cached).
        if self._numel_cache is None:
            self._numel_cache = reduce(
                lambda total, p: total + p.numel(), self._params, 0
            )
        return self._numel_cache

    def _gather_flat_grad(self):
        # Concatenate all parameter gradients into one flat vector.
        views = []
        for p in self._params:
            if p.grad is None:
                view = paddle.zeros_like(p).reshape([-1])
            else:
                view = p.grad.reshape([-1])
            views.append(view)
        return paddle.concat(views, axis=0)

    def _add_grad(self, alpha, direction):
        # In-place update: p <- p + alpha * direction (sliced per parameter).
        offset = 0
        for p in self._params:
            numel = reduce(lambda x, y: x * y, p.shape)
            paddle.assign(
                p.add(
                    alpha * direction[offset : offset + numel].reshape(p.shape)
                ),
                p,
            )
            offset += numel
        assert offset == self._numel()

    def _clone_param(self):
        return [p.clone() for p in self._params]

    def _set_param(self, params_data):
        for p, pdata in zip(self._params, params_data):
            paddle.assign(pdata, p)

    def _directional_evaluate(self, closure, x, alpha, d):
        # Evaluate loss and gradient at x + alpha * d, then restore x.
        self._add_grad(alpha, d)
        loss = float(closure())
        flat_grad = self._gather_flat_grad()
        self._set_param(x)
        return loss, flat_grad

    def step(self, closure: Callable[[], _T_co]) -> _T_co:
        """
        Performs a single optimization step.

        Args:
            closure (callable): A closure that reevaluates the model
                and returns the loss.
        """
        with paddle.no_grad():
            # Make sure the closure is always called with grad enabled.
            closure = paddle.enable_grad()(closure)

            learning_rate = self.learning_rate
            max_iter = self.max_iter
            max_eval = self.max_eval
            tolerance_grad = self.tolerance_grad
            tolerance_change = self.tolerance_change
            line_search_fn = self.line_search_fn
            history_size = self.history_size
            state = self.state

            state.setdefault('func_evals', 0)
            state.setdefault('n_iter', 0)

            # evaluate initial f(x) and df/dx
            orig_loss = closure()
            loss = float(orig_loss)

            current_evals = 1
            state['func_evals'] += 1

            flat_grad = self._gather_flat_grad()
            opt_cond = flat_grad.abs().max() <= tolerance_grad

            # optimal condition
            if opt_cond:
                return orig_loss

            # tensors cached in state
            d = state.get('d')
            alpha = state.get('alpha')
            old_yk = state.get('old_yk')
            old_sk = state.get('old_sk')
            ro = state.get('ro')
            H_diag = state.get('H_diag')
            prev_flat_grad = state.get('prev_flat_grad')
            prev_loss = state.get('prev_loss')

            n_iter = 0
            # optimize for at most max_iter iterations
            while n_iter < max_iter:
                n_iter += 1
                state['n_iter'] += 1

                # compute gradient descent direction
                if state['n_iter'] == 1:
                    d = flat_grad.neg()
                    old_yk = []
                    old_sk = []
                    ro = []
                    H_diag = paddle.to_tensor(1.0, dtype=orig_loss.dtype)
                else:
                    # do L-BFGS update (update memory)
                    y = flat_grad.subtract(prev_flat_grad)
                    s = d.multiply(paddle.to_tensor(alpha, dtype=d.dtype))
                    ys = y.dot(s)
                    if ys > 1e-10:
                        # update memory, keeping at most history_size pairs
                        if len(old_yk) == history_size:
                            old_yk.pop(0)
                            old_sk.pop(0)
                            ro.pop(0)

                        old_yk.append(y)
                        old_sk.append(s)
                        ro.append(1.0 / ys)

                        # update scale of initial Hessian approximation
                        H_diag = ys / y.dot(y)

                    # compute the approximate (L-BFGS) inverse Hessian
                    # multiplied by the gradient (two-loop recursion)
                    num_old = len(old_yk)

                    if 'al' not in state:
                        state['al'] = [None] * history_size
                    al = state['al']

                    q = flat_grad.neg()
                    for i in range(num_old - 1, -1, -1):
                        al[i] = old_sk[i].dot(q) * ro[i]
                        paddle.assign(q.add(old_yk[i] * (-al[i])), q)

                    # multiply by initial Hessian; r/d is the final direction
                    d = r = paddle.multiply(q, H_diag)
                    for i in range(num_old):
                        be_i = old_yk[i].dot(r) * ro[i]
                        paddle.assign(r.add(old_sk[i] * (al[i] - be_i)), r)

                if prev_flat_grad is None:
                    prev_flat_grad = flat_grad.clone()
                else:
                    paddle.assign(flat_grad, prev_flat_grad)
                prev_loss = loss

                # compute step length: reset initial guess for step size
                if state['n_iter'] == 1:
                    alpha = (
                        min(1.0, 1.0 / float(flat_grad.abs().sum()))
                        * learning_rate
                    )
                else:
                    alpha = learning_rate

                # directional derivative
                gtd = flat_grad.dot(d)

                # directional derivative is below tolerance
                if gtd > -tolerance_change:
                    break

                # optional line search: user function
                ls_func_evals = 0
                if line_search_fn is not None:
                    if line_search_fn != 'strong_wolfe':
                        raise RuntimeError("only 'strong_wolfe' is supported")
                    else:
                        x_init = self._clone_param()

                        def obj_func(x, alpha, d):
                            return self._directional_evaluate(
                                closure, x, alpha, d
                            )

                        loss, flat_grad, alpha, ls_func_evals = _strong_wolfe(
                            obj_func, x_init, alpha, d, loss, flat_grad, gtd
                        )
                    self._add_grad(alpha, d)
                    opt_cond = flat_grad.abs().max() <= tolerance_grad
                else:
                    # no line search, simply move with fixed step
                    self._add_grad(alpha, d)
                    if n_iter != max_iter:
                        with paddle.enable_grad():
                            loss = float(closure())
                        flat_grad = self._gather_flat_grad()
                        opt_cond = flat_grad.abs().max() <= tolerance_grad
                        ls_func_evals = 1

                # update func eval counters
                current_evals += ls_func_evals
                state['func_evals'] += ls_func_evals

                # check stopping conditions
                if opt_cond:
                    break

                if (
                    d.multiply(paddle.to_tensor(alpha, dtype=d.dtype))
                    .abs()
                    .max()
                    <= tolerance_change
                ):
                    break

                if abs(loss - prev_loss) < tolerance_change:
                    break

                if current_evals >= max_eval:
                    break

                if n_iter == max_iter:
                    break

            state['d'] = d
            state['alpha'] = alpha
            state['old_yk'] = old_yk
            state['old_sk'] = old_sk
            state['ro'] = ro
            state['H_diag'] = H_diag
            state['prev_flat_grad'] = prev_flat_grad
            state['prev_loss'] = prev_loss

        return orig_loss