o
    * i                     @  s  d dl mZ d dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlZd dlZd dlmZmZ d dlmZmZ d dlmZ d d	lmZmZmZ d d
lmZ ddlmZ erpd dlmZ d dlmZ d dl m!Z! G dd de
Z"G dd deZ#dd Z$G dd dZ%G dd de%Z&dS )    )annotationsN)defaultdict)Enum)TYPE_CHECKINGAny	TypedDict)_C_ops_legacy_C_ops)coreunique_name)
check_type)Operator_dygraph_tracerin_pir_mode)in_dynamic_mode   )amp_global_state)Tensor)OptimizerWithMixedPrecision)	Optimizerc                   @  sN   e Zd ZU ded< ded< ded< ded< ded< ded	< ded
< ded< dS )_ScaleStateDictr   scalefloat
incr_ratio
decr_ratiointincr_every_n_stepsdecr_every_n_nan_or_inf
incr_count
decr_countbooluse_dynamic_loss_scalingN)__name__
__module____qualname____annotations__ r&   r&   b/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/paddle/amp/grad_scaler.pyr   )   s   
 r   c                   @  s   e Zd ZdZdZdZdS )OptimizerStater   r      N)r"   r#   r$   INITUNSCALEDSTEPPEDr&   r&   r&   r'   r(   4   s    r(   c                   C  s
   dt jiS )Nstate)r(   r*   r&   r&   r&   r'   _refresh_optimizer_state:   s   
r.   c                   @  s   e Zd ZdZ							dJdKddZdLddZdMd d!Zd"d# Zd$d% ZdNd&d'Z	dNd(d)Z
dOd*d+ZdPd-d.ZdOd/d0ZdQd2d3ZdOd4d5ZdRd7d8ZdSd9d:ZdTd<d=ZdSd>d?ZdUdAdBZdVdDdEZdWdGdHZdIS )X	AmpScalera	  
    AmpScaler is used for Auto-Mixed-Precision training/inferring in imperative
    mode. It controls the scaling of loss, helps avoiding numerical overflow.
    The object of this class has seventeen methods `scale()`, `unscale_()`, `minimize()` and `get`/`set` api of parameters.

    `scale()` is used to multiply the loss by a scale ratio.
    `unscale_()` is used to unscale the gradients of parameters, multiplies the gradients of parameters by 1/(scale ratio)
    `minimize()` is similar as `optimizer.minimize()`, performs parameters updating, and it will update the loss_scaling.

    Commonly, it is used together with `amp_guard` to achieve Auto-Mixed-Precision in
    imperative mode.

    Args:
        enable(bool, optional): Enable loss scaling or not. Default is True.
        init_loss_scaling (float, optional): The initial loss scaling factor. Default is 2**15.
        incr_ratio(float, optional): The multiplier to use when increasing the loss
                        scaling. Default is 2.0.
        decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing
                        the loss scaling. Default is 0.5.
        incr_every_n_steps(int, optional): Increases loss scaling every n consecutive
                                steps with finite gradients. Default is 1000.
        decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n
                                    accumulated steps with nan or inf gradients. Default is 2.
        use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True.
    Returns:
        An AmpScaler object.

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> import paddle

            >>> data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
            >>> model = paddle.nn.Conv2D(3, 2, 3)
            >>> optimizer = paddle.optimizer.SGD(
            ...         learning_rate=0.01, parameters=model.parameters())
            >>> scaler = paddle.amp.AmpScaler(init_loss_scaling=1024)
            >>> data = paddle.to_tensor(data)
            >>> with paddle.amp.amp_guard():
            ...     conv = model(data)
            ...     loss = paddle.mean(conv)
            ...     scaled = scaler.scale(loss)
            ...     scaled.backward()
            ...     scaler.minimize(optimizer, scaled)
    T      @       @      ?  r   enabler    init_loss_scalingr   r   r   r   r   r   r!   returnNonec           	      C  s  t  r)t }|std|r)|j s)|j s)|j s)td|j d d}|| _	d| _
d| _d | _| j	r|dks@J d|dk sHJ d|| _|| _|| _|| _|| _d| _d| _|| _
t r|tjjjd	d
gtdtjjj| jdd| _d S ttdg tj!| _"ttdg tj!| _#ttdg tj!| _$ttdg tj!| _%ttdg tj!| _&tt| jg tj'| _d | _(t)t*| _+d S d S )Nz;current_tracer is None, maybe it is not in imperative mode.zWAmpScaler can only be enabled on CUDAPlace, XPUPlace and CustomPlace, current place is z, so it makes no effect.F      ?zThe incr_ratio must be > 1.0.zThe decr_ratio must be < 1.0.r   float32r   Zloss_scaling)value)dtypeshapenameinitializer),r   r   
ValueErrorZ_expected_placeZis_gpu_placeZis_xpu_placeZis_custom_placewarningswarn_enable_use_dynamic_loss_scaling_init_loss_scaling_scale_incr_ratio_decr_ratio_incr_every_n_steps_decr_every_n_nan_or_inf_incr_count_decr_countr   paddlepirr
   Zcreate_persistable_valuer   generatennr>   ZConstantInitializer	to_tensornparrayastypeZbool_
_found_inf_temp_found_inf_value_false_temp_found_inf_fp16_temp_found_inf_bf16_temp_found_inf_fp32r9   _cache_found_infr   r.   _optimizer_states)	selfr4   r5   r   r   r   r   r!   Ztracerr&   r&   r'   __init__o   s|   
	zAmpScaler.__init__varr   c                 C  s   t |dtjtjjfd | jr,t jdkr,| jr,d| _d| _d| _	t
dt j d t rn|jtjjkr;|d}| js@|S tj|| j}| }| }|jrl|jrltjjj|jj|j |j |jj|_|S | jru| sw|S || j S )	al  
        Multiplies a Tensor by the scale factor and returns scaled outputs.
        If this instance of :class:`AmpScaler` is not enabled, output are returned unmodified.

        Args:
            var (Tensor):  The Tensor to scale.
        Returns:
            The scaled Tensor or original Tensor.

        Examples:

            .. code-block:: python

                >>> import numpy as np
                >>> import paddle

                >>> data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
                >>> model = paddle.nn.Conv2D(3, 2, 3)
                >>> optimizer = paddle.optimizer.SGD(
                ...         learning_rate=0.01, parameters=model.parameters())
                >>> scaler = paddle.amp.AmpScaler(init_loss_scaling=1024)
                >>> data = paddle.to_tensor(data)
                >>> with paddle.amp.amp_guard():
                ...     conv = model(data)
                ...     loss = paddle.mean(conv)
                ...     scaled = scaler.scale(loss)
                ...     scaled.backward()
                ...     scaler.minimize(optimizer, scaled)
        r]   zAmpScaler.scale()float16Fr8   z6It is not recommended to use dynamic loss scaling for z&, so GradScaler is disable by default.r9   )r   rL   r   rM   ValuerB   r   Z	amp_dtyperC   rD   r@   rA   r   r;   r
   ZDataTypeZFLOAT32rS   r   multiplyrE   Zget_defining_opZ	dist_attrbaseZ	libpaddleZcreate_op_dist_attributeZprocess_meshZoperandsresultsZchunk_idZ_is_initialized)r[   r]   Z	scale_outZmultiply_opZ
src_var_opr&   r&   r'   r      sJ   


zAmpScaler.scale	optimizer'Optimizer | OptimizerWithMixedPrecisionargsr   kwargs2tuple[list[Operator], list[tuple[Tensor, Tensor]]]c                 O  sF  t  r@t|tjjjjsJ | j|_| j|_| j	|_
|d |_| jr8| j|_| j|_| j|_| j|_d|_d|_|j|i |S | jsK|j|i |S | jt| }|d tju r^| | d\}}t|dr|d| j |j|i |\}}|d| _n| jrd| _n|j|i |\}}d| _| jr|   tt | _||fS )	a  
        This function is similar as `Optimizer.minimize()`, which performs parameters updating.

        If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped.
        Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters.

        Finally, the loss scaling ratio is updated.

        Args:
            optimizer(Optimizer):  The optimizer used to update parameters.
            args:  Arguments, which will be forward to `Optimizer.minimize()`.
            kwargs: Keyword arguments, which will be forward to `Optimizer.minimize()`.

        Examples:

            .. code-block:: python

                >>> import numpy as np
                >>> import paddle

                >>> data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
                >>> model = paddle.nn.Conv2D(3, 2, 3)
                >>> optimizer = paddle.optimizer.SGD(
                ...     learning_rate=0.01,
                ...     parameters=model.parameters()
                ... )
                >>> scaler = paddle.amp.AmpScaler(init_loss_scaling=1024)
                >>> data = paddle.to_tensor(data)
                >>> with paddle.amp.amp_guard():
                ...     conv = model(data)
                ...     loss = paddle.mean(conv)
                ...     scaled = scaler.scale(loss)
                ...     scaled.backward()
                ...     scaler.minimize(optimizer, scaled)
        r   Nr-   )NN_set_auxiliary_var	found_infTF)!r   
isinstancerL   Zstaticamp	decoratorr   rC   rD   rE   Z_loss_scalingZ_scaled_lossrH   rI   rF   rG   Z_num_good_stepsZ_num_bad_stepsminimizerB   rZ   idr(   r*   _unscalehasattrrh   rT   _get_auxiliary_varrY   _updater   r.   )r[   rc   re   rf   optimizer_stateZoptimize_opsZparams_gradsr&   r&   r'   rm     sH   *




zAmpScaler.minimizec           	      C  s  | j sdS | jt| }|d tju rtd|d tju r"tdt|ddrzt|j	d t
rzg }g }g }g }|j	D ]=}|d D ]6}| durw||  | jtjkr`||  qA| jtjkrp||  qA||  qAq;n+t rtj|j\}}}ndd	 |jD }d
d	 |D }dd	 |D }dd	 |D }| j| _t|rt|| j|| j t| j| j| _t|rt|| j|| j t| j| j| _t|rt|| j|| j  t| j| j | _tj|d< dS )a  
        Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio).
        If this instance of :class:`GradScaler` is not enabled, output are returned unmodified.
        Args:
            optimizer(Optimizer):  The optimizer used to update parameters.
        Returns:
            The unscaled parameters or original parameters.
        Nr-   zMunscale_() has already been called on this optimizer since the last update().z(unscale_() is being called after step()._param_groupsr   paramsc                 S  s    g | ]}|  d ur|  qS N)
_grad_ivar.0paramr&   r&   r'   
<listcomp>  s
    z&AmpScaler._unscale.<locals>.<listcomp>c                 S     g | ]
}|j tjkr|qS r&   )r;   rL   r^   rx   r&   r&   r'   r{     
    c                 S  r|   r&   )r;   rL   bfloat16rx   r&   r&   r'   r{     r}   c                 S  r|   r&   )r;   rL   r9   rx   r&   r&   r'   r{     r}   )!rB   rZ   rn   r(   r+   RuntimeErrorr,   getattrrj   rt   dictrw   appendr;   rL   r^   r~   r   r
   eagerZget_grads_listsZ_parameter_listrU   rT   lenr	   Zcheck_finite_and_unscalerE   rV   r   Z
bitwise_orrW   rX   )	r[   rc   rs   Zparam_gradsZparam_grads_fp16Zparam_grads_bf16Zparam_grads_fp32grouprz   r&   r&   r'   ro   b  s   	

zAmpScaler._unscalec                 C  s   | j sdS | jr9d| _| jd | _| j| jkr7tdt| j dt| j dt| j  | j| j | _d| _dS d| _| jd | _| j| j	krR| j| j
 | _d| _dS )z+
        Updates the loss_scaling.
        Nr   r   z$Found inf or nan, current scale is: z, decrease to: *)rB   rY   rJ   rK   rI   printr   rE   rG   rH   rF   r[   r&   r&   r'   rr     s$   &zAmpScaler._updatec                 C     | j S )z
        Enable loss scaling or not.

        Returns:
            bool: enable loss scaling return True else return False.
        )rB   r   r&   r&   r'   	is_enable     zAmpScaler.is_enablec                 C  r   )z
        Whether to use dynamic loss scaling.

        Returns:
            bool: if fixed loss_scaling is used return False, if the loss scaling is updated dynamically return true.
        )rC   r   r&   r&   r'   is_use_dynamic_loss_scaling  r   z%AmpScaler.is_use_dynamic_loss_scalingc                 C  r   )z
        Return the initial loss scaling factor.

        Returns:
            float:  the initial loss scaling factor.
        )rD   r   r&   r&   r'   get_init_loss_scaling  r   zAmpScaler.get_init_loss_scalingnew_init_loss_scalingc                 C  s(   || _ tt| j gtj| _dS )z
        Set the initial loss scaling factor by `new_init_loss_scaling`.

        Args:
            new_init_loss_scaling(int):  The new_init_loss_scaling used to update initial loss scaling factor.s
        N)rD   rL   rP   rQ   rR   rS   r9   rE   r[   r   r&   r&   r'   set_init_loss_scaling  s   
zAmpScaler.set_init_loss_scalingc                 C  r   )z
        Return the multiplier to use when increasing the loss scaling.

        Returns:
            float:  the multiplier to use when increasing the loss scaling.
        rF   r   r&   r&   r'   get_incr_ratio
  r   zAmpScaler.get_incr_rationew_incr_ratioc                 C  s   |dksJ d|| _ dS )a  
        Set the multiplier to use when increasing the loss scaling by `new_incr_ratio`, `new_incr_ratio` should > 1.0.

        Args:
            new_incr_ratio(float):  The new_incr_ratio used to update the multiplier to use when increasing the loss scaling.
        r8   z!The new_incr_ratio must be > 1.0.Nr   r[   r   r&   r&   r'   set_incr_ratio     
zAmpScaler.set_incr_ratioc                 C  r   )z
        Get the less-than-one-multiplier to use when decreasing the loss scaling.

        Returns:
            float:  the less-than-one-multiplier to use when decreasing the loss scaling.
        rG   r   r&   r&   r'   get_decr_ratio  r   zAmpScaler.get_decr_rationew_decr_ratioc                 C  s   |dk sJ d|| _ dS )a)  
        Set the less-than-one-multiplier to use when decreasing the loss scaling by `new_incr_ratio`, `new_decr_ratio` should < 1.0.

        Args:
            new_decr_ratio(float):  The new_decr_ratio used to update the less-than-one-multiplier to use when decreasing the loss scaling.
        r8   z!The new_decr_ratio must be < 1.0.Nr   r[   r   r&   r&   r'   set_decr_ratio&  r   zAmpScaler.set_decr_ratioc                 C  r   )a  
        Return the num `n`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients.

        Returns:
            int:  the num `n`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients.
        rH   r   r&   r&   r'   get_incr_every_n_steps0  r   z AmpScaler.get_incr_every_n_stepsnew_incr_every_n_stepsc                 C  
   || _ dS )a^  
        Set the num `n` by `new_incr_every_n_steps`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients.

        Args:
            new_incr_every_n_steps(int):  The new_incr_every_n_steps used to update the num `n`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients.
        Nr   r[   r   r&   r&   r'   set_incr_every_n_steps9  s   
z AmpScaler.set_incr_every_n_stepsc                 C  r   )a  
        Return the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients.

        Returns:
            int:  the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients.
        rI   r   r&   r&   r'   get_decr_every_n_nan_or_infB  r   z%AmpScaler.get_decr_every_n_nan_or_infnew_decr_every_n_nan_or_infc                 C  r   )au  
        Set the num `n` by `new_decr_every_n_nan_or_inf`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients.

        Args:
            new_decr_every_n_nan_or_inf(int):  The new_decr_every_n_nan_or_inf used to update the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients.
        Nr   r[   r   r&   r&   r'   set_decr_every_n_nan_or_infK  s   
	z%AmpScaler.set_decr_every_n_nan_or_infr   c              	   C  s4   | j r| j | j| j| j| j| j| j| j	dS i S )a  
        Returns the state of the scaler as a `dict`, If this instance is not enabled, returns an empty dict.

        Returns:
            A dict of scaler includes:
            scale (tensor): The loss scaling factor.
            incr_ratio(float): The multiplier to use when increasing the loss scaling.
            decr_ratio(float): The less-than-one-multiplier to use when decreasing the loss scaling.
            incr_every_n_steps(int): Increases loss scaling every n consecutive steps with finite gradients.
            decr_every_n_nan_or_inf(int): Decreases loss scaling every n accumulated steps with nan or inf gradients.
            incr_count(int): The number of recent consecutive unskipped steps.
            decr_count(int): The number of recent consecutive skipped steps.
            use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True.
        )r   r   r   r   r   r   r   r!   )
rB   rE   numpyrF   rG   rH   rI   rJ   rK   rC   r   r&   r&   r'   
state_dictV  s   zAmpScaler.state_dictr   c                 C  s   | j sdS t|dkrtd|d d | _tt| jgtj	| _
|d | _|d | _|d | _|d | _|d	 | _|d
 | _|d | _dS )z
        Loads the scaler state.

        Args:
           state_dict(dict): scaler state. Should be an object returned from a call to `AmpScaler.state_dict()`.
        Nr   zdThe input state dict is empty, possibly because it was saved from a disabled instance of GradScaler.r   r   r   r   r   r   r   r!   )rB   r   r   rD   rL   rP   rQ   rR   rS   r9   rE   rF   rG   rH   rI   rJ   rK   rC   r[   r   r&   r&   r'   load_state_dictt  s"   





zAmpScaler.load_state_dictN)Tr0   r1   r2   r3   r   Tr4   r    r5   r   r   r   r   r   r   r   r   r   r!   r    r6   r7   r]   r   r6   r   rc   rd   re   r   rf   r   r6   rg   r6   r    r6   r   r   r   r6   r7   r   r   r6   r7   r   r   r6   r7   r6   r   r   r   r6   r7   r   r   r6   r7   r6   r   r   r   r6   r7   )r"   r#   r$   __doc__r\   r   rm   ro   rr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r&   r&   r&   r'   r/   >   s8    2
L
J]h

	
	
	

	


	


	
	
	
r/   c                      s2  e Zd ZdZ							dLdM fddZdN fddZdO fd d!ZdPd#d$ZdQd%d&Z fd'd(Z	dR fd)d*Z
dR fd+d,ZdS fd-d.ZdT fd0d1ZdS fd2d3ZdU fd5d6ZdS fd7d8ZdV fd:d;ZdW fd<d=ZdX fd?d@ZdW fdAdBZdY fdDdEZdZ fdGdHZd[ fdJdKZ  ZS )\
GradScaleram
  
    GradScaler is used for Auto-Mixed-Precision training in dynamic graph mode.
    It controls the scaling of loss, helps avoiding numerical overflow.
    The object of this class has nineteen methods `scale()`, `unscale_()`, `minimize()`, `step()`, `update()` and `get`/`set` api of parameters.

    `scale()` is used to multiply the loss by a scale ratio.
    `unscale_()` is used to unscale the gradients of parameters, multiplies the gradients of parameters by 1/(scale ratio)
    `minimize()` is similar as `optimizer.minimize()`, performs parameters updating, and it will update the loss_scaling, it equal to `step()` + `update()`.
    `step()` is similar as `optimizer.step()`, which performs parameters updating.
    `update` is used to update the loss_scaling.


    Commonly, it is used together with `paddle.amp.auto_cast` to achieve Auto-Mixed-Precision in
    dynamic graph mode.

    Args:
        enable(bool, optional): Enable loss scaling or not. Default is True.
        init_loss_scaling (float, optional): The initial loss scaling factor. Default is 65536.0.
        incr_ratio(float, optional): The multiplier to use when increasing the loss
                        scaling. Default is 2.0.
        decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing
                        the loss scaling. Default is 0.5.
        incr_every_n_steps(int, optional): Increases loss scaling every n consecutive
                                steps with finite gradients. Default is 2000.
        decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n
                                    accumulated steps with nan or inf gradients. Default is 1.
        use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True.
    Returns:
        An GradScaler object.

    Examples:

        .. code-block:: python

            >>> import paddle

            >>> model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
            >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
            >>> scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
            >>> data = paddle.rand([10, 3, 32, 32])

            >>> with paddle.amp.auto_cast():
            ...     conv = model(data)
            ...     loss = paddle.mean(conv)

            >>> scaled = scaler.scale(loss)  # scale the loss
            >>> scaled.backward()            # do backward
            >>> scaler.minimize(optimizer, scaled)  # update parameters
            >>> optimizer.clear_grad()
    T      @r1   r2     r   r4   r    r5   r   r   r   r   r   r   r!   r6   r7   c              	     s   t  ||||||| d S rv   )superr\   )r[   r4   r5   r   r   r   r   r!   	__class__r&   r'   r\     s   
zGradScaler.__init__r]   r   c                      t  |S )aJ  
        Multiplies a Tensor by the scale factor and returns scaled outputs.
        If this instance of :class:`GradScaler` is not enabled, output are returned unmodified.

        Args:
            var (Tensor):  The tensor to scale.
        Returns:
            The scaled tensor or original tensor.

        Examples:

            .. code-block:: python

                >>> import paddle

                >>> model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
                >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
                >>> scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
                >>> data = paddle.rand([10, 3, 32, 32])

                >>> with paddle.amp.auto_cast():
                ...     conv = model(data)
                ...     loss = paddle.mean(conv)

                >>> scaled = scaler.scale(loss)  # scale the loss
                >>> scaled.backward()            # do backward
                >>> scaler.minimize(optimizer, scaled)  # update parameters
                >>> optimizer.clear_grad()
        )r   r   )r[   r]   r   r&   r'   r     s   zGradScaler.scalerc   rd   re   r   rf   rg   c                   s   t  j|g|R i |S )a  
        This function is similar as `optimizer.minimize()`, which performs parameters updating.

        If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped.
        Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters.

        Finally, the loss scaling ratio is updated.

        Args:
            optimizer(Optimizer):  The optimizer used to update parameters.
            args:  Arguments, which will be forward to `optimizer.minimize()`.
            kwargs: Keyword arguments, which will be forward to `optimizer.minimize()`.

        Examples:

            .. code-block:: python

                >>> import paddle

                >>> model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
                >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
                >>> scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
                >>> data = paddle.rand([10, 3, 32, 32])

                >>> with paddle.amp.auto_cast():
                ...     conv = model(data)
                ...     loss = paddle.mean(conv)

                >>> scaled = scaler.scale(loss)  # scale the loss
                >>> scaled.backward()            # do backward
                >>> scaler.minimize(optimizer, scaled)  # update parameters
                >>> optimizer.clear_grad()
        )r   rm   )r[   rc   re   rf   r   r&   r'   rm     s   'zGradScaler.minimizer   c                 C  s   | j s| S | jt| }|d tju rtd|d tju r%| | t	|dr<|
d| j |  |d| _n| jrCd| _n|  d| _tj|d< | jsYtt| _dS dS )at  
        This function is similar as `optimizer.step()`, which performs parameters updating.

        If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped.
        Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters.

        Args:
            optimizer(Optimizer):  The optimizer used to update parameters.

        Examples:

            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU)
                >>> import paddle
                >>> paddle.device.set_device('gpu')

                >>> model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
                >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
                >>> scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
                >>> data = paddle.rand([10, 3, 32, 32])
                >>> with paddle.amp.auto_cast():
                ...     conv = model(data)
                ...     loss = paddle.mean(conv)
                >>> scaled = scaler.scale(loss)  # scale the loss
                >>> scaled.backward()            # do backward
                >>> scaler.step(optimizer)       # update parameters
                >>> scaler.update()              # update the loss scaling ratio
                >>> optimizer.clear_grad()
        r-   z7step() has already been called since the last update().rh   ri   TFN)rB   steprZ   rn   r(   r,   r   r*   ro   rp   rh   rT   rq   rY   rC   r   r.   )r[   rc   rs   r&   r&   r'   r   "  s*   


zGradScaler.stepc                 C  s&   | j sdS | jr|   tt| _dS )a  
        Updates the loss_scaling.

        Examples:

            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU)
                >>> import paddle

                >>> paddle.device.set_device('gpu')
                >>> model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
                >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
                >>> scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
                >>> data = paddle.rand([10, 3, 32, 32])
                >>> with paddle.amp.auto_cast():
                ...     conv = model(data)
                ...     loss = paddle.mean(conv)
                >>> scaled = scaler.scale(loss)     # scale the loss
                >>> scaled.backward()               # do backward
                >>> scaler.step(optimizer)          # update parameters
                >>> scaler.update()                 # update the loss scaling ratio
                >>> optimizer.clear_grad()
        N)rB   rC   rr   r   r.   rZ   r   r&   r&   r'   update^  s   
zGradScaler.updatec                   r   )aE  
        Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio).
        If this instance of :class:`GradScaler` is not enabled, output are returned unmodified.

        Args:
            optimizer(Optimizer):  The optimizer used to update parameters.

        Returns:
            The unscaled parameters or original parameters.

        Examples:

            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU)
                >>> import paddle

                >>> paddle.device.set_device('gpu')
                >>> model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
                >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
                >>> scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
                >>> data = paddle.rand([10, 3, 32, 32])
                >>> with paddle.amp.auto_cast():
                ...     conv = model(data)
                ...     loss = paddle.mean(conv)
                >>> scaled = scaler.scale(loss)  # scale the loss
                >>> scaled.backward()            # do backward
                >>> scaler.unscale_(optimizer)    # unscale the parameter
                >>> scaler.step(optimizer)
                >>> scaler.update()
                >>> optimizer.clear_grad()
        )r   ro   )r[   rc   r   r&   r'   unscale_~  s   !zGradScaler.unscale_c                   
   t   S )a  
        Enable loss scaling or not.

        Returns:
            bool: enable loss scaling return True else return False.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> enable = scaler.is_enable()
                >>> print(enable)
                True
        )r   r   r   r   r&   r'   r        
zGradScaler.is_enablec                   r   )ax  
        Whether to use dynamic loss scaling.

        Returns:
            bool: if fixed loss_scaling is used return False, if the loss scaling is updated dynamically return True.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> use_dynamic_loss_scaling = scaler.is_use_dynamic_loss_scaling()
                >>> print(use_dynamic_loss_scaling)
                True
        )r   r   r   r   r&   r'   r     r   z&GradScaler.is_use_dynamic_loss_scalingc                   r   )a&  
        Return the initial loss scaling factor.

        Returns:
            float:  the initial loss scaling factor.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> init_loss_scaling = scaler.get_init_loss_scaling()
                >>> print(init_loss_scaling)
                1024
        )r   r   r   r   r&   r'   r     r   z GradScaler.get_init_loss_scalingr   c                      t  | dS )a  
        Set the initial loss scaling factor by `new_init_loss_scaling`.

        Args:
            new_init_loss_scaling(float):  The new_init_loss_scaling used to update initial loss scaling factor.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> print(scaler.get_init_loss_scaling())
                1024
                >>> new_init_loss_scaling = 1000
                >>> scaler.set_init_loss_scaling(new_init_loss_scaling)
                >>> print(scaler.get_init_loss_scaling())
                1000
        N)r   r   r   r   r&   r'   r        z GradScaler.set_init_loss_scalingc                   r   )a>  
        Return the multiplier to use when increasing the loss scaling.

        Returns:
            float:  the multiplier to use when increasing the loss scaling.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> incr_ratio = scaler.get_incr_ratio()
                >>> print(incr_ratio)
                2.0
        )r   r   r   r   r&   r'   r     r   zGradScaler.get_incr_ratior   c                   r   )a  
        Set the multiplier to use when increasing the loss scaling by `new_incr_ratio`, `new_incr_ratio` should > 1.0.

        Args:
            new_incr_ratio(float):  The new_incr_ratio used to update the multiplier to use when increasing the loss scaling.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> print(scaler.get_incr_ratio())
                2.0
                >>> new_incr_ratio = 3.0
                >>> scaler.set_incr_ratio(new_incr_ratio)
                >>> print(scaler.get_incr_ratio())
                3.0
        N)r   r   r   r   r&   r'   r   +  r   zGradScaler.set_incr_ratioc                   r   )aW  
        Get the less-than-one-multiplier to use when decreasing the loss scaling.

        Returns:
            float:  the less-than-one-multiplier to use when decreasing the loss scaling.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> decr_ratio = scaler.get_decr_ratio()
                >>> print(decr_ratio)
                0.5
        )r   r   r   r   r&   r'   r   I  r   zGradScaler.get_decr_ratior   c                   r   )a7  
        Set the less-than-one-multiplier to use when decreasing the loss scaling by `new_incr_ratio`, `new_decr_ratio` should < 1.0.

        Args:
            new_decr_ratio(float):  The new_decr_ratio used to update the less-than-one-multiplier to use when decreasing the loss scaling.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> print(scaler.get_decr_ratio())
                0.5
                >>> new_decr_ratio = 0.1
                >>> scaler.set_decr_ratio(new_decr_ratio)
                >>> print(scaler.get_decr_ratio())
                0.1
        N)r   r   r   r   r&   r'   r   d  r   zGradScaler.set_decr_ratioc                   r   )a  
        Return the num `n`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients.

        Returns:
            int:  the num `n`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> incr_every_n_steps = scaler.get_incr_every_n_steps()
                >>> print(incr_every_n_steps)
                1000
        )r   r   r   r   r&   r'   r     r   z!GradScaler.get_incr_every_n_stepsr   c                   r   )a  
        Set the num `n` by `new_incr_every_n_steps`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients.

        Args:
            new_incr_every_n_steps(int):  The new_incr_every_n_steps used to update the num `n`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> print(scaler.get_incr_every_n_steps())
                1000
                >>> new_incr_every_n_steps = 2000
                >>> scaler.set_incr_every_n_steps(new_incr_every_n_steps)
                >>> print(scaler.get_incr_every_n_steps())
                2000
        N)r   r   r   r   r&   r'   r     r   z!GradScaler.set_incr_every_n_stepsc                   r   )a  
        Return the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients.

        Returns:
            int: the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> decr_every_n_nan_or_inf = scaler.get_decr_every_n_nan_or_inf()
                >>> print(decr_every_n_nan_or_inf)
                2
        )r   r   r   r   r&   r'   r     r   z&GradScaler.get_decr_every_n_nan_or_infr   c                   r   )a  
        Set the num `n` by `new_decr_every_n_nan_or_inf`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients.

        Args:
            new_decr_every_n_nan_or_inf(int):  The new_decr_every_n_nan_or_inf used to update the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients.

        Examples:
            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle
                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> print(scaler.get_decr_every_n_nan_or_inf())
                2
                >>> new_decr_every_n_nan_or_inf = 3
                >>> scaler.set_decr_every_n_nan_or_inf(new_decr_every_n_nan_or_inf)
                >>> print(scaler.get_decr_every_n_nan_or_inf())
                3
        N)r   r   r   r   r&   r'   r     s   z&GradScaler.set_decr_every_n_nan_or_infr   c                   r   )a0  
        Returns the state of the scaler as a `dict`, If this instance is not enabled, returns an empty dict.

        Returns:
            A dict of scaler includes:
            scale (tensor): The loss scaling factor.
            incr_ratio(float): The multiplier to use when increasing the loss scaling.
            decr_ratio(float): The less-than-one-multiplier to use when decreasing the loss scaling.
            incr_every_n_steps(int): Increases loss scaling every n consecutive steps with finite gradients.
            decr_every_n_nan_or_inf(int): Decreases loss scaling every n accumulated steps with nan or inf gradients.
            incr_count(int): The number of recent consecutive unskipped steps.
            decr_count(int): The number of recent consecutive skipped steps.
            use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True.


        Examples:

            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle

                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> scaler_state = scaler.state_dict()
        )r   r   r   r   r&   r'   r     s   
"zGradScaler.state_dictr   c                   r   )a:  
        Loads the scaler state.

        Args:
            state_dict(dict): scaler state. Should be an object returned from a call to `GradScaler.state_dict()`.

        Examples:

            .. code-block:: python

                >>> # doctest: +REQUIRES(env:GPU, env:XPU)
                >>> import paddle

                >>> scaler = paddle.amp.GradScaler(
                ...     enable=True,
                ...     init_loss_scaling=1024,
                ...     incr_ratio=2.0,
                ...     decr_ratio=0.5,
                ...     incr_every_n_steps=1000,
                ...     decr_every_n_nan_or_inf=2,
                ...     use_dynamic_loss_scaling=True
                ... )
                >>> scaler_state = scaler.state_dict()
                >>> scaler.load_state_dict(scaler_state)
        N)r   r   r   r   r&   r'   r     s   zGradScaler.load_state_dict)Tr   r1   r2   r   r   Tr   r   r   )rc   r   r6   r7   )r6   r7   r   r   r   r   r   r   r   r   r   r   )r"   r#   r$   r   r\   r   rm   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   __classcell__r&   r&   r   r'   r     s:    5 
)
< # $r   )'
__future__r   r@   collectionsr   enumr   typingr   r   r   r   rQ   rL   r   r	   Zpaddle.baser
   r   Zpaddle.base.data_feederr   Zpaddle.base.frameworkr   r   r   Zpaddle.frameworkr   Z	auto_castr   r   Zpaddle.static.amp.decoratorr   Z!python.paddle.optimizer.optimizerr   r   r(   r.   r/   r   r&   r&   r&   r'   <module>   s4       W