o
    + i'y                     @  s  d dl mZ d dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
mZ d dlmZ d dlZddlmZ d	d
lmZ e	rZd dlmZ d dlmZ d dlmZ d dlmZ d	dlmZ g ZG dd deZG dd deZdd Zdd Zd"ddZ 				d#ddZ!G d d! d!eZ"dS )$    )annotationsN)defaultdict)reduce)TYPE_CHECKINGNoReturn	TypedDict)NotRequired   )	framework   )	Optimizer)Sequence)Tensor)GradientClipBase)WeightDecayRegularizer)_ParameterConfigc                   @  sf   e Zd ZU ded< ded< ded< ded< ded< ded	< ded
< ded< ded< ded< ded< dS )_LbfgsStateint
func_evalsn_iterr   dalphazlist[Tensor]old_ykold_skroH_diagprev_flat_gradfloat	prev_losszNotRequired[list[Tensor]]alN__name__
__module____qualname____annotations__ r%   r%   b/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/paddle/optimizer/lbfgs.pyr   *   s   
 r   c                   @  s   e Zd ZU ded< dS )_LbfgsStateDictr   stateNr    r%   r%   r%   r&   r'   8   s   
 r'   c                   C  s.   t j rtddkrtd dS dS dS )z-Check and warn about TF32 acceleration statusZNVIDIA_TF32_OVERRIDE0zWarning! TF32 Tensor Cores are enabled by default on some NVIDIA GPUs for faster computation, but may compromise numerical precision in specific cases, particularly with the L-BFGS optimizer.To disable it, set: NVIDIA_TF32_OVERRIDE=0N)paddleZdeviceZis_compiled_with_cudaosgetenvwarningswarnr%   r%   r%   r&   check_tf32_override<   s   r/   c                 C  s   | | j ddS )z
    NOTE: This is a temporary workaround for unstable result computed by `paddle.dot`,
    which will be reverted when the problem is fixed."
    Zaxis)sumxyr%   r%   r&   dotI   s   r6   c                 C  s   |dur	|\}}n| |kr| |fn|| f\}}|| d||  | |   }	|	d ||  }
|
dkrj|
  }| |krN|||  || |	 || d|     }n| | | || |	 || d|     }tt|||S || d S )a]  Cubic interpolation between (x1, f1, g1) and (x2, f2, g2).
        Use two points and their gradient to determine a cubic function and get the minimum point
        between them in the cubic curve.

    Reference:
        Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006.
        pp59: formula 3.59

    Args:
        x1, f1, g1: point1's position, value and gradient.
        x2, f2, g2: point2's position, value and gradient.
        bounds: bounds of interpolation area

    Returns:
        min_pos: the minimum point between the specified points in the cubic curve.
    N   r	   r   g       @)sqrtminmax)x1f1g1Zx2f2g2boundsZ
xmin_boundZ
xmax_boundZd1Z	d2_squareZd2Zmin_posr%   r%   r&   _cubic_interpolateQ   s   
*(rA   -C6??&.>   c           !   	   C  s  |   }| }| |||\}}d}t||}d|||f\}}}}d}d}||
k r|||| |  ks=|dkrP||krP||g}||g}|| g}||g}njt || | kre|g}|g}|g}d}nU|dkr|||g}||g}|| g}||g}n>|d||   }|d }|}t||||||||fd}|}|}| }|}| |||\}}|d7 }t||}|d7 }||
k s+||
krd|g}||g}||g}d}|d |d krd	nd
\}}|s||
k rt |d |d  | |	k rnt|d |d |d |d |d |d }dt|t|  } tt|| |t| | k rU|s1|t|ks1|t|krRt |t| t |t| k rIt||  }nt||  }d}nd}nd}| |||\}}|d7 }t||}|d7 }|||| |  ks~||| kr|||< |||< | ||< |||< |d |d krd	nd
\}}nDt || | krd}n%||| ||   dkr|| ||< || ||< || ||< || ||< |||< |||< | ||< |||< |s||
k s|| }|| }|| }||||fS )ag  Implements of line search algorithm that satisfies the strong Wolfe conditions using double zoom.

    Reference:
        Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006.
        pp60: Algorithm 3.5 (Line Search Algorithm).

    Args:
        obj_func: the objective function to minimize. ```` accepts a multivariate input and returns a scalar.
        xk (Tensor): the starting point of the iterates.
        alpha (Scalar): the initial step size.
        d (Tensor): search direction.
        loss (scalar): the initial loss
        grad (Tensor): the initial grad
        c1 (Scalar): parameter for sufficient decrease condition.
        c2 (Scalar): parameter for curvature condition.
        tolerance_change (Scalar): terminates if the change of function value/position/parameter between
            two iterations is smaller than this value.
        max_ls(int): max iteration of line search.
        alpha_max (float): max step length.

    Returns:
        loss_new (Scaler): loss of obj_func at final alpha.
        grad_new, (Tensor): derivative of obj_func at final alpha.
        alpha(Tensor): optimal step length, or 0. if the line search algorithm did not converge.
        ls_func_evals (Scaler): number of objective function called in line search process.

    Following summarizes the essentials of the strong Wolfe line search algorithm.
    Some notations used in the description:

        - `func` denotes the objective function.
        - `obi_func` is a function of step size alpha, restricting `obj_func` on a line.

            obi_func = func(xk + alpha * d),
            where xk is the position of k'th iterate, d is the line search direction(decent direction),
            and a is the step size.
        - alpha : substitute of alpha
        - a1 is alpha of last iteration, which is alpha_(i-1).
        - a2 is alpha of current iteration, which is alpha_i.
        - a_lo is alpha in left position when calls zoom, which is alpha_low.
        - a_hi is alpha in right position when calls zoom, which is alpha_high.

    Line Search Algorithm:
        repeat
            Compute obi_func(a2) and derphi(a2).
            1. If obi_func(a2) > obi_func(0) + c_1 * a2 * obi_func'(0) or [obi_func(a2) >= obi_func(a1) and i > 1],
                alpha= zoom(a1, a2) and stop;

            2. If |obi_func'(a2)| <= -c_2 * obi_func'(0),
                alpha= a2 and stop;

            3. If obi_func'(a2) >= 0,
                alpha= zoom(a2, a1) and stop;

            a1 = a2
            a2 = min(2 * a2, a2)
            i = i + 1
        end(repeat)

    zoom(a_lo, a_hi) Algorithm:
        repeat
            aj = cubic_interpolation(a_lo, a_hi)
            Compute obi_func(aj) and derphi(aj).
            1. If obi_func(aj) > obi_func(0) + c_1 * aj * obi_func'(0) or obi_func(aj) >= obi_func(a_lo),
                then a_hi <- aj;
            2.
                2.1. If |obi_func'(aj)| <= -c_2 * obi_func'(0), then alpha= a2 and stop;

                2.2. If obi_func'(aj) * (a2 - a1) >= 0, then a_hi = a_lo

                a_lo = aj;
        end(repeat)

    reference: https://github.com/pytorch/pytorch
    r   r   FTg{Gz?
   )r@   r0   )r   r   )r   r   g?)absr:   cloner6   rA   r9   )!obj_funcZxkr   r   lossgradgtdc1c2tolerance_changeZmax_lsZd_normZloss_newZgrad_newls_func_evalsZgtd_newZt_prevZf_prevZg_prevZgtd_prevdoneZls_iterZbracketZ	bracket_fZ	bracket_gZbracket_gtdZmin_stepZmax_steptmpZinsuf_progressZlow_posZhigh_posZepsr%   r%   r&   _strong_wolfeu   s   X

3 ""
IrS   c                      s   e Zd ZdZ											d3d4 fddZd5dd Zd6d!d"Zd#d$ Zd%d& Zd'd( Z	d)d* Z
d+d, Zejd7d.d/Z	d8d9d1d2Z  ZS ):LBFGSaJ  
    The L-BFGS is a quasi-Newton method for solving an unconstrained optimization problem over a differentiable function.
    Closely related is the Newton method for minimization. Consider the iterate update formula:

    .. math::
        x_{k+1} = x_{k} + H_k \nabla{f_k}

    If :math:`H_k` is the inverse Hessian of :math:`f` at :math:`x_k`, then it's the Newton method.
    If :math:`H_k` is symmetric and positive definite, used as an approximation of the inverse Hessian, then
    it's a quasi-Newton. In practice, the approximated Hessians are obtained
    by only using the gradients, over either whole or part of the search
    history, the former is BFGS, the latter is L-BFGS.

    Reference:
        Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp179: Algorithm 7.5 (L-BFGS).

    Args:
        learning_rate (float, optional): learning rate .The default value is 1.
        max_iter (int, optional): maximal number of iterations per optimization step.
            The default value is 20.
        max_eval (int|None, optional): maximal number of function evaluations per optimization
            step. The default value is max_iter * 1.25.
        tolerance_grad (float, optional): termination tolerance on first order optimality
            The default value is 1e-5.
        tolerance_change (float, optional): termination tolerance on function
            value/parameter changes. The default value is 1e-9.
        history_size (int, optional): update history size. The default value is 100.
        line_search_fn (string|None, optional): either 'strong_wolfe' or None. The default value is strong_wolfe.
        parameters (list|tuple|None, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. The default value is None.
        weight_decay (int|float|WeightDecayRegularizer|None, optional): The strategy of regularization. \
            It can be a int or float value as coeff of L2 regularization or \
            :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
            If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already, \
            the regularization setting here in optimizer will be ignored for this parameter. \
            Otherwise, the regularization setting here in optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of \
            some derived class of ``GradientClipBase`` . There are three clipping strategies \
            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , \
            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
        name (str|None, optional): Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.

    Return:
        loss (Tensor): the final loss of closure.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> import numpy as np

            >>> paddle.disable_static()
            >>> np.random.seed(0)
            >>> np_w = np.random.rand(1).astype(np.float32)
            >>> np_x = np.random.rand(1).astype(np.float32)

            >>> inputs = [np.random.rand(1).astype(np.float32) for i in range(10)]
            >>> # y = 2x
            >>> targets = [2 * x for x in inputs]

            >>> class Net(paddle.nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...         w = paddle.to_tensor(np_w)
            ...         self.w = paddle.create_parameter(shape=w.shape, dtype=w.dtype, default_initializer=paddle.nn.initializer.Assign(w))
            ...
            ...     def forward(self, x):
            ...         return self.w * x
            ...
            >>> net = Net()
            >>> opt = paddle.optimizer.LBFGS(learning_rate=1, max_iter=1, max_eval=None, tolerance_grad=1e-07, tolerance_change=1e-09, history_size=100, line_search_fn='strong_wolfe', parameters=net.parameters())
            >>> def train_step(inputs, targets):
            ...     def closure():
            ...         outputs = net(inputs)
            ...         loss = paddle.nn.functional.mse_loss(outputs, targets)
            ...         print('loss: ', loss.item())
            ...         opt.clear_grad()
            ...         loss.backward()
            ...         return loss
            ...     opt.step(closure)
            ...
            >>> for input_np, target_np in zip(inputs, targets):
            ...     input = paddle.to_tensor(input_np)
            ...     target = paddle.to_tensor(target_np)
            ...     train_step(input, target)
          ?   NHz>rD   d   learning_rater   max_iterr   max_eval
int | Nonetolerance_gradrO   history_sizeline_search_fn
str | None
parameters4Sequence[Tensor] | Sequence[_ParameterConfig] | Noneweight_decay%float | WeightDecayRegularizer | None	grad_clipGradientClipBase | NonenamereturnNonec                   s   t   |d u r|d d }|| _|| _|| _|| _|| _|| _|| _t|t	j
r0tdt| tt| _t jd||	|
|d t| jd tsM| j| _nt| jD ]	\}}|d | _qRd | _d S )N      z^parameters argument given to the optimizer should be an iterable of Tensors or dicts, but got rU   )rY   ra   rc   re   rg   r   params)r/   rY   rZ   r[   r]   rO   r^   r_   
isinstancer*   r   	TypeErrortyper   dictr(   super__init__Z_parameter_list_params	enumerateZ_param_groups_numel_cache)selfrY   rZ   r[   r]   rO   r^   r_   ra   rc   re   rg   idxZparam_group	__class__r%   r&   rr     s:   


zLBFGS.__init__r'   c                 C  s.   i }| j  D ]\}}|||i qd|iS )ap  Returns the state of the optimizer as a :class:`dict`.

        Return:
            state, a dict holding current optimization state. Its content
            differs between optimizer classes.

        Examples:
            .. code-block:: python

                >>> import paddle

                >>> paddle.disable_static()

                >>> net = paddle.nn.Linear(10, 10)
                >>> opt = paddle.optimizer.LBFGS(
                ...     learning_rate=1,
                ...     max_iter=1,
                ...     max_eval=None,
                ...     tolerance_grad=1e-07,
                ...     tolerance_change=1e-09,
                ...     history_size=100,
                ...     line_search_fn='strong_wolfe',
                ...     parameters=net.parameters(),
                >>> )

                >>> def train_step(inputs, targets):
                ...     def closure():
                ...         outputs = net(inputs)
                ...         loss = paddle.nn.functional.mse_loss(outputs, targets)
                ...         opt.clear_grad()
                ...         loss.backward()
                ...         return loss
                ...
                ...     opt.step(closure)
                ...
                >>> inputs = paddle.rand([10, 10], dtype="float32")
                >>> targets = paddle.to_tensor([2 * x for x in inputs])

                >>> n_iter = 0
                >>> while n_iter < 20:
                ...     loss = train_step(inputs, targets)
                ...     n_iter = opt.state_dict()["state"]["func_evals"]
                ...     print("n_iter:", n_iter)
        r(   )r(   itemsupdate)rv   Zpacked_statekvr%   r%   r&   
state_dict  s   .zLBFGS.state_dictc                 C  s$   | j d u rtdd | jd| _ | j S )Nc                 S  s   | |   S N)numel)totalpr%   r%   r&   <lambda>+  s    zLBFGS._numel.<locals>.<lambda>r   )ru   r   rs   rv   r%   r%   r&   _numel'  s
   
zLBFGS._numelc                 C  sT   g }| j D ]}|jd u rt|dg}n|jdg}|| qtj|ddS )Nr0   r   r1   )rs   rK   r*   Z
zeros_likereshapeappendconcat)rv   Zviewsr   viewr%   r%   r&   _gather_flat_grad0  s   

zLBFGS._gather_flat_gradc              	   C  st   d}| j D ]*}|jg krtdd |jnd}t|||||  |j| |}||7 }q||  ks8J d S )Nr   c                 S  s   | | S r   r%   r3   r%   r%   r&   r   >  s    z!LBFGS._add_grad.<locals>.<lambda>r   )rs   shaper   r*   assignaddr   r   )rv   r   	directionoffsetr   r   r%   r%   r&   	_add_grad;  s   

zLBFGS._add_gradc                 C  s   dd | j D S )Nc                 S  s   g | ]}|  qS r%   )rH   ).0r   r%   r%   r&   
<listcomp>I  s    z&LBFGS._clone_param.<locals>.<listcomp>)rs   r   r%   r%   r&   _clone_paramH  s   zLBFGS._clone_paramc                 C  s&   t | j|D ]
\}}t|| qd S r   )ziprs   r*   r   )rv   Zparams_datar   Zpdatar%   r%   r&   
_set_paramK  s   zLBFGS._set_paramc                 C  s0   |  || t| }|  }| | ||fS r   )r   r   r   r   )rv   closurer4   r   r   rJ   	flat_gradr%   r%   r&   _directional_evaluateO  s
   

zLBFGS._directional_evaluater   c           %   	     s  t  X t    j}j}j}j}j}j}j	}j
}	|	dd |	dd   }
t|
}d}|	d  d7  <  }|  |k}|rX|
W  d   S |	d}|	d}|	d}|	d	}|	d
}|	d}|	d}|	d}d}||k r5|d7 }|	d  d7  < |	d dkr| }g }g }g }t jd|
jd}n||}|t j||jd}t||}|dkrt||kr|d |d |d || || |d|  |t|| }t|}d|	vrdg| |	d< |	d }| }t|d ddD ] }t|| |||  ||< t ||| ||   | qt || }}t|D ]}t|| |||  } t ||| || |   | q?|du ri| }nt || |}|	d dkrtdd|   | }n|}t||}!|!| krnd}"|dur|dkrt d! }# fdd}$t"|$|#|||||!\}}}}"#|| |  |k}n3#|| ||krt   t  }W d   n	1 sw   Y   }|  |k}d}"||"7 }|	d  |"7  < |rn&||   |krnt|| |k r%n||kr+n
||kr1n||k s||	d< ||	d< ||	d< ||	d	< ||	d
< ||	d< ||	d< ||	d< W d   |
S 1 saw   Y  |
S )a  Performs a single optimization step.

        Args:
            closure (callable): A closure that reevaluates the model
            and returns the loss.

        Examples:
            .. code-block:: python

                >>> import paddle

                >>> paddle.disable_static()

                >>> inputs = paddle.rand([10, 10], dtype="float32")
                >>> targets = paddle.to_tensor([2 * x for x in inputs])

                >>> net = paddle.nn.Linear(10, 10)
                >>> opt = paddle.optimizer.LBFGS(
                ...     learning_rate=1,
                ...     max_iter=1,
                ...     max_eval=None,
                ...     tolerance_grad=1e-07,
                ...     tolerance_change=1e-09,
                ...     history_size=100,
                ...     line_search_fn='strong_wolfe',
                ...     parameters=net.parameters(),
                >>> )

                >>> def closure():
                ...     outputs = net(inputs)
                ...     loss = paddle.nn.functional.mse_loss(outputs, targets)
                ...     print("loss:", loss.item())
                ...     opt.clear_grad()
                ...     loss.backward()
                ...     return loss
                ...
                >>> opt.step(closure)
        r   r   r   r   Nr   r   r   r   r   r   r   r   rU   )dtypeg|=r   r0   Zstrong_wolfez only 'strong_wolfe' is supportedc                   s     | ||S r   )r   )r4   r   r   r   rv   r%   r&   rI     s   zLBFGS.step.<locals>.obj_func)$r*   Zno_gradZenable_gradrY   rZ   r[   r]   rO   r_   r^   r(   
setdefaultr   r   rG   r:   getnegZ	to_tensorr   subtractmultiplyr6   lenpopr   ranger   r   rH   r9   r2   RuntimeErrorr   rS   r   )%rv   r   rY   rZ   r[   r]   rO   r_   r^   r(   Z	orig_lossrJ   Zcurrent_evalsr   Zopt_condr   r   r   r   r   r   r   r   r   r5   sZysZnum_oldr   qirZbe_irL   rP   Zx_initrI   r%   r   r&   stepV  s   )
















$&








  
 
 6  6z
LBFGS.stepr   c                 C  s   t d)z}Empty method. LBFGS optimizer does not use this way to minimize ``loss``. Please refer 'Examples' of LBFGS() above for usage.zeLBFGS optimizer does not use this way to minimize loss. Please refer 'Examples' of LBFGS() for usage.)NotImplementedError)rv   rJ   Zstartup_programra   Zno_grad_setr%   r%   r&   minimize6  s   zLBFGS.minimize)rU   rV   NrW   rD   rX   NNNNN)rY   r   rZ   r   r[   r\   r]   r   rO   r   r^   r   r_   r`   ra   rb   rc   rd   re   rf   rg   r`   rh   ri   )rh   r'   )rh   r   )rh   r   )NNN)rh   r   )r!   r"   r#   __doc__rr   r~   r   r   r   r   r   r   r
   Znon_static_onlyr   r   __classcell__r%   r%   rx   r&   rT   e  s4    \
3
4	 arT   r   )rB   rC   rD   rE   )#
__future__r   r+   r-   collectionsr   	functoolsr   typingr   r   r   Ztyping_extensionsr   r*   baser
   Z	optimizerr   collections.abcr   r   Zpaddle.nn.clipr   Zpaddle.regularizerr   r   __all__r   r'   r/   r6   rA   rS   rT   r%   r%   r%   r&   <module>   s:   
,
 q