o
    pi<4                     @  s   d dl mZ d dlmZ d dlZd dlmZmZ d dlm	Z
 d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ erPd dlmZ d dlmZ d dlmZ g ZG dd deZdS )    )annotations)TYPE_CHECKINGN)	frameworkunique_name)base)Variable)LayerHelper)in_pir_mode)	Optimizer)create_parameter)Tensor)Operator)Programc                      s   e Zd ZU dZded< ded< ded< ded	< d
ed< dZ			d+d, fddZ fddZej	e
jd-ddZdd Zdd Zdd Ze
j			d.d/d)d*Z  ZS )0	LookAheada  
    This implements the Lookahead optimizer of the
    paper : https://arxiv.org/abs/1907.08610.

    Lookahead keeps two sets of params: the fast_params and
    the slow_params. inner_optimizer update fast_params every
    training step. Lookahead updates the slow_params and fast_params
    every k training steps as follows:

    .. math::

        slow\_param_t &= slow\_param_{t-1} + \\alpha * (fast\_param_{t-1} - slow\_param_{t-1})

        fast\_param_t &=  slow\_param_t

    Args:
        inner_optimizer (Optimizer): The optimizer that update fast params step by step.
        alpha (float, optional): The learning rate of Lookahead. The default value is 0.5.
        k (int, optional): The slow params is updated every k steps. The default value is 5.
        name (str, optional): Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.
            The default value is None.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> import paddle
            >>> import paddle.nn as nn

            >>> BATCH_SIZE = 16
            >>> BATCH_NUM = 4
            >>> EPOCH_NUM = 4

            >>> IMAGE_SIZE = 784
            >>> CLASS_NUM = 10
            >>> # define a random dataset
            >>> class RandomDataset(paddle.io.Dataset): # type: ignore[type-arg]
            ...     def __init__(self, num_samples):
            ...         self.num_samples = num_samples
            ...     def __getitem__(self, idx):
            ...         image = np.random.random([IMAGE_SIZE]).astype('float32')
            ...         label = np.random.randint(0, CLASS_NUM - 1,
            ...                                 (1, )).astype('int64')
            ...         return image, label
            ...     def __len__(self):
            ...         return self.num_samples

            >>> class LinearNet(nn.Layer):
            ...     def __init__(self):
            ...         super().__init__()
            ...         self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)
            ...         self.bias = self._linear.bias
            ...     @paddle.jit.to_static
            ...     def forward(self, x):
            ...         return self._linear(x)

            >>> def train(layer, loader, loss_fn, opt):
            ...     for epoch_id in range(EPOCH_NUM):
            ...         for batch_id, (image, label) in enumerate(loader()):
            ...             out = layer(image)
            ...             loss = loss_fn(out, label)
            ...             loss.backward()
            ...             opt.step()
            ...             opt.clear_grad()
            ...             print("Train Epoch {} batch {}: loss = {}".format(
            ...                 epoch_id, batch_id, np.mean(loss.numpy())))
            >>> layer = LinearNet()
            >>> loss_fn = nn.CrossEntropyLoss()
            >>> optimizer = paddle.optimizer.SGD(learning_rate=0.1, parameters=layer.parameters())
            >>> lookahead = paddle.incubate.LookAhead(optimizer, alpha=0.2, k=5)

            >>> # create data loader
            >>> dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
            >>> loader = paddle.io.DataLoader(
            ...     dataset,
            ...     batch_size=BATCH_SIZE,
            ...     shuffle=True,
            ...     drop_last=True,
            ...     num_workers=2)

            >>> # doctest: +SKIP('The run time is too long to pass the CI check.')
            >>> train(layer, loader, loss_fn, lookahead)

    r
   inner_optimizerfloatalphaintkstrtyper   helperZslow      ?   Nname
str | NonereturnNonec                   s   |d usJ dd|  krdksJ d J dt |tr#|dks'J d|| _| jjd u r:tj   }n| jj}t	 j
||d d |d || _|| _d| _t| jj| _d | _d | _d S )	Nzinner optimizer can not be None              ?zBalpha should be larger or equal to 0.0, and less or equal than 1.0r   zk should be a positive integer)Zlearning_rate
parametersZweight_decayZ	grad_clipr   Z	lookahead)
isinstancer   r   _parameter_listpaddlestaticZdefault_main_programZglobal_blockZall_parameterssuper__init__r   r   r   r   	__class____name__r   _global_step_varZ_k_var)selfr   r   r   r   r    r'    j/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/paddle/incubate/optimizer/lookahead.pyr&      s6   
zLookAhead.__init__c                   s    t  || | j|| d S N)r%   _set_auxiliary_varr   )r*   keyvalr+   r,   r-   r/      s   zLookAhead._set_auxiliary_varc                 C  s`   | j   |   g }| jD ]}|jsq| dur%| }|||f q| jdd|d dS )a  
        Execute the optimizer and update parameters once.

        Returns:
            None

        Examples:

            .. code-block:: python

                >>> import paddle
                >>> inp = paddle.rand([1,10], dtype="float32")
                >>> linear = paddle.nn.Linear(10, 1)
                >>> out = linear(inp)
                >>> loss = paddle.mean(out)
                >>> sgd = paddle.optimizer.SGD(learning_rate=0.1,parameters=linear.parameters())
                >>> lookahead = paddle.incubate.LookAhead(sgd, alpha=0.2, k=5)
                >>> loss.backward()
                >>> lookahead.step()
                >>> lookahead.clear_grad()

        N)lossstartup_programparams_grads)r   step_increment_global_varr"   	trainableZ
_grad_ivarappend_apply_optimize)r*   r4   paramZgrad_varr,   r,   r-   r5      s   


zLookAhead.stepc                 C  s4   t |tjtjjfsJ |D ]	}| | j| qd S r.   )r!   r   ZBlockr#   pirZ_add_accumulator	_slow_str)r*   blockr    pr,   r,   r-   _create_accumulators   s   zLookAhead._create_accumulatorsc              	   C  s   t  r'| jd u rtddgtddtjjjdddd| _t	| jd| _d S | jd u r<tj
jtddgd	dd
d| _| jjdd| jgid| jgiddid d S )Nint32   Zlookahead_stepFr   valueZ	force_cpudtypeshaper   r7   initializerr   r   Tr   rF   rC   rE   Zpersistable	incrementXZOutr5   )r   ZinputsZoutputsattrs)r	   r)   r   r   generater#   nnrG   ConstantInitializerrI   r$   create_global_varr   Z	append_op)r*   r,   r,   r-   r6      s4   
	



zLookAhead._increment_global_varc              	   C  s\  t jdgddd}t jdgddd}t r-tddgtddt jjj	t
| jddd	}nt jjtddg| jdd
d}t | j|}t | j|}t j|dd}t ||}t j|dd}| | j|d }	||d  d| |	  }
t |
|	 | j|d  d| j |	  }
||
 d| |d   }t ||d  ||
 d| |	  }t ||	 d S )NrA   r@   Zlookahead_ones)rF   rE   r   Zlookahead_zerosZlookahead_kFrB   rD   TrH   Zfloat32)rE   r   r   )r#   ZonesZzerosr	   r   r   rL   rM   rG   rN   r   r   r$   rO   	remainderr)   equalcastZ_get_accumulatorr<   Zassignr   )r*   r=   Zparam_and_gradZone_varZzero_varZk_varmodZcond_1Zcond_2Zslow_varZtmp_varZ	tmp_var_1r,   r,   r-   _append_optimize_op   sD   

zLookAhead._append_optimize_opr2   r   r3   Program | Noner    list[Tensor] | list[str] | Noneno_grad_setset[Tensor] | set[str] | None2tuple[list[Operator], list[tuple[Tensor, Tensor]]]c                 C  sR   t |ttjjfsJ d| jj||||d\}}|   | j|||d}||fS )a  
        Add operations to minimize ``loss`` by updating ``parameters``.

        Args:
            loss (Tensor): A ``Tensor`` containing the value to minimize.
            startup_program (Program, optional): :ref:`api_paddle_static_Program` for
                initializing parameters in ``parameters``. The default value
                is None, at this time :ref:`api_paddle_static_default_startup_program` will be used.
            parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update
                to minimize ``loss``. The default value is None, at this time all parameters
                will be updated.
            no_grad_set (set, optional): Set of ``Tensor``  or ``Tensor.name`` that don't need
                to be updated. The default value is None.

        Returns:
            tuple: tuple (optimize_ops, params_grads), A list of operators appended
            by minimize and a list of (param, grad) tensor pairs, param is
            ``Parameter``, grad is the gradient value corresponding to the parameter.
            In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
            indicate program pruning. If so, the program will be pruned by ``feed`` and
            ``fetch_list`` before run, see details in ``Executor``.

        Examples:

            .. code-block:: python

                >>> import paddle

                >>> inp = paddle.rand([1, 10], dtype="float32")
                >>> linear = paddle.nn.Linear(10, 1)
                >>> out = linear(inp)
                >>> loss = paddle.mean(out)
                >>> sgd = paddle.optimizer.SGD(learning_rate=0.1,parameters=linear.parameters())
                >>> lookahead = paddle.incubate.LookAhead(sgd, alpha=0.2, k=5)
                >>> loss.backward()
                >>> lookahead.minimize(loss)
                >>> lookahead.clear_grad()

        zThe loss should be an Tensor.)r3   r    rW   )r3   r4   )	r!   r   r#   r;   Valuer   minimizer6   r9   )r*   r2   r3   r    rW   Zoptimize_opsr4   _r,   r,   r-   r[   &  s   /
zLookAhead.minimize)r   r   N)
r   r
   r   r   r   r   r   r   r   r   )r   r   )NNN)
r2   r   r3   rU   r    rV   rW   rX   r   rY   )r(   
__module____qualname____doc____annotations__r<   r&   r/   r   Zdygraph_onlyimperative_baseZno_gradr5   r?   r6   rT   r[   __classcell__r,   r,   r+   r-   r   $   s0   
 W&&,r   )
__future__r   typingr   r#   Zpaddle.baser   r   Zpaddle.base.dygraphr   ra   Zpaddle.base.frameworkr   Zpaddle.base.layer_helperr   Zpaddle.frameworkr	   Zpaddle.optimizerr
   Zpaddle.pir.corer   r   r   Zpaddle.staticr   __all__r   r,   r,   r,   r-   <module>   s    