from __future__ import annotations

import copy
import warnings
from typing import TYPE_CHECKING

import paddle
import paddle.autograd as imperative_base
import paddle.distributed as dist
from paddle import _C_ops
from paddle.base import core, framework, unique_name
from paddle.base.data_feeder import check_variable_and_dtype
from paddle.base.libpaddle import DataType
from paddle.common_ops_import import (
    Variable,
    check_type,
    default_main_program,
)
from paddle.distributed.utils.moe_utils import get_complete_pp_mesh
from paddle.framework import (
    LayerHelper,
    in_dynamic_mode,
    in_dynamic_or_pir_mode,
    in_pir_mode,
)

if TYPE_CHECKING:
    from paddle import Tensor

__all__ = []


def clip_by_norm(x, max_norm, name=None):
    r"""

    Limits the L2 norm of the input :math:`x` within :math:`max\_norm`.
    If the L2 norm of :math:`x` is less than or equal to :math:`max\_norm`, :math:`out` will be
    the same as :math:`x`. If the L2 norm of :math:`x` is greater than :math:`max\_norm`, :math:`x` will
    be linearly scaled to make the L2 norm of :math:`out` equal to :math:`max\_norm`, as
    shown in the following formula:

    .. math::

        out = \frac{max\_norm * x}{norm(x)}

    where :math:`norm(x)` represents the L2 norm of :math:`x`.

    Args:
        x(Tensor): The input of clip_by_norm, whose data type is float32.
            The number of dimensions must be in the range [1, 9].
        max_norm(float): The maximum norm value.
        name(str, optional): For detailed information, please refer
            to :ref:`api_guide_Name`. Usually the name does not need to be set and
            is None by default.

    Returns:
        Tensor: The output of clip_by_norm, with the same shape as the input.
            The data type is float32.

    Examples:

        .. code-block:: python

            >>> import paddle
            >>> from paddle.nn import clip

            >>> input = paddle.to_tensor([[2.0, 2.0], [2.0, 2.0]], dtype='float32')
            >>> reward = clip.clip_by_norm(x=input, max_norm=1.0)
            >>> print(reward)
            Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
            [[0.50000000, 0.50000000],
             [0.50000000, 0.50000000]])
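
            >>> # The L2 norm of `input` is sqrt(4 * 2.0**2) = 4.0 > max_norm,
            >>> # so each entry is scaled by max_norm / 4.0 = 0.25: 2.0 -> 0.5.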
    """
    if in_dynamic_mode():
        return _C_ops.clip_by_norm(x, max_norm)

    helper = LayerHelper("clip_by_norm", **locals())
    check_variable_and_dtype(
        x, 'X', ['float16', 'float32', 'uint16'], 'clip_by_norm'
    )
    check_type(max_norm, 'max_norm', (float), 'clip_by_norm')
    if name is None:
        name = unique_name.generate_with_ignorable_key(
            '.'.join([helper.name, 'tmp'])
        )

    out = helper.create_variable(
        type=x.type, name=name, dtype=x.dtype, persistable=False
    )

    helper.append_op(
        type="clip_by_norm",
        inputs={"X": x},
        attrs={"max_norm": max_norm},
        outputs={"Out": out},
    )

    return out


def merge_selected_rows(x, name=None):
    """
    Merge by adding duplicated rows in the input SelectedRows object.

    Args:
        x(Tensor): The input selected rows to be merged.
        name(basestring|None): Name of the output.

    Returns:
        Tensor, merged output.

    Examples:

        .. code-block:: python

            >>> import paddle
            >>> import paddle.base as base

            >>> b = paddle.static.default_main_program().global_block()
            >>> var = b.create_var(
            ...     name="X", dtype="float32", persistable=True,
            ...     type=base.core.VarDesc.VarType.SELECTED_ROWS)
            >>> y = paddle.nn.clip.merge_selected_rows(var)
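            >>> # If `var` held rows [0, 5, 5, 4, 19] with values
            >>> # [[1, 1], [2, 2], [2, 2], [3, 3], [6, 6]], the merged result
            >>> # would hold rows [0, 5, 4, 19], with the duplicated row-5
            >>> # entries summed into [4, 4].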
    """
    if in_dynamic_mode():
        return _C_ops.merge_selected_rows(x)

    helper = LayerHelper("merge_selected_rows", **locals())
    out = helper.create_variable_for_type_inference(dtype=x.dtype)
    helper.append_op(
        type="merge_selected_rows",
        inputs={"X": x},
        attrs={},
        outputs={"Out": out},
    )
    return out


def get_tensor_from_selected_rows(x, name=None):
    """
    Get tensor data from input with SelectedRows type, and outputs a Tensor.

    .. code-block:: text

        input x is SelectedRows:
           x.rows = [0, 5, 5, 4, 19]
           x.height = 20
           x.value = [[1, 1] [2, 2] [2, 2] [3, 3] [6, 6]]

        Output is DenseTensor:
           out.shape = [5, 2]
           out.data = [[1, 1],
                       [2, 2],
                       [2, 2],
                       [3, 3],
                       [6, 6]]

    Args:
        x(SelectedRows): Input with SelectedRows type. The data type is float32, float64, int32 or int64.
        name(str, optional): The default value is None. Normally there is no need for the user to set this property.
            For more information, please refer to :ref:`api_guide_Name` .

    Returns:
        Variable: DenseTensor transformed from SelectedRows. The data type is the same as that of the input.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> import paddle.base as base
            >>> from paddle.base import core
            >>> paddle.enable_static()
            >>> scope = core.Scope()
            >>> block = paddle.static.default_main_program().global_block()
            >>> x_rows = [0, 5, 5, 4, 19]
            >>> height = 20
            >>> x = scope.var('X').get_selected_rows()
            >>> x.set_rows(x_rows)
            >>> x.set_height(height)
            >>> x = block.create_var(name="X", dtype="float32", persistable=True, type=base.core.VarDesc.VarType.SELECTED_ROWS)
            >>> z = paddle.nn.clip.get_tensor_from_selected_rows(x)
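            >>> # `z` is the dense [5, 2] tensor shown in the diagram above.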
    """
    if in_dynamic_mode():
        return _C_ops.get_tensor_from_selected_rows(x)

    check_type(x, 'x', Variable, 'get_tensor_from_selected_rows')
    if x.type != core.VarDesc.VarType.SELECTED_ROWS:
        raise TypeError(
            "The type of 'x' in get_tensor_from_selected_rows must be SELECTED_ROWS."
        )
    helper = LayerHelper('get_tensor_from_selected_rows', **locals())
    out = helper.create_variable_for_type_inference(x.dtype)
    helper.append_op(
        type='get_tensor_from_selected_rows',
        inputs={'X': x},
        outputs={'Out': out},
        attrs={},
    )
    return out


_clip_by_global_norm_using_mp_type_flag = False


def _clip_by_global_norm_using_mp_type(*args):
    global _clip_by_global_norm_using_mp_type_flag
    assert len(args) <= 1
    if len(args) == 1:
        assert isinstance(args[0], bool)
        old_value = _clip_by_global_norm_using_mp_type_flag
        _clip_by_global_norm_using_mp_type_flag = args[0]
        return old_value
    else:
        return _clip_by_global_norm_using_mp_type_flag


def _cast_to_mp_type_if_enabled(x):
    if (
        x.dtype == core.VarDesc.VarType.FP16
        or x.dtype == core.VarDesc.VarType.BF16
    ) and _clip_by_global_norm_using_mp_type():
        return x.astype(core.VarDesc.VarType.FP32)
    elif (
        x.dtype == DataType.FLOAT16 or x.dtype == DataType.BFLOAT16
    ) and _clip_by_global_norm_using_mp_type():
        return x.astype(DataType.FLOAT32)
    else:
        return x


def _can_inplace_clip_grad(grad: Tensor, clip_input: Tensor) -> bool:
    # Only an initialized, dense, distributed grad paired with a 0-D clip
    # scale can safely be scaled in place.
    if not grad._is_initialized() or not grad.is_dist():
        return False
    if not grad.is_dense() or len(clip_input.shape) != 0:
        return False
    return True


def _squared_l2_norm(x):
    r"""
    Return the squared L2 norm of a tensor.
    """
    x = _cast_to_mp_type_if_enabled(x)

    if in_dynamic_or_pir_mode():
        return _C_ops.squared_l2_norm(x)

    op_type = 'squared_l2_norm'
    check_variable_and_dtype(x, 'x', ['float32', 'float64'], op_type)
    helper = LayerHelper(op_type, **locals())
    out = helper.create_variable_for_type_inference(x.dtype)

    inputs = {"X": x}
    outputs = {'Out': out}
    helper.append_op(type=op_type, inputs=inputs, outputs=outputs)
    return out


class BaseErrorClipAttr:
    def __str__(self):
        raise NotImplementedError

    def _append_clip_op(self, block, grad_name):
        raise NotImplementedError
class ErrorClipByValue(BaseErrorClipAttr):
    r"""
    Clip tensor values to the range [min, max].

    Given a tensor ``t`` (see Examples below), this operation clips its values \
    into the range [``min``, ``max``] in place.

    - Any values less than min are set to min.
    - Any values greater than max are set to max.

    Args:
        max (float): The maximum value to clip by.
        min (float, optional): The minimum value to clip by. If not set by the user, \
        it will be set to ``-max`` by the framework.

    Examples:
        .. code-block:: python

            >>> import paddle

            >>> paddle.enable_static()
            >>> BATCH_SIZE = 128
            >>> CLIP_MAX = 2e-6
            >>> CLIP_MIN = -1e-6
            >>> prog = paddle.static.Program()
            >>> with paddle.static.program_guard(main_program=prog):
            ...     image = paddle.static.data(name='x', shape=[None, 784], dtype='float32')
            ...     hidden1 = paddle.static.nn.fc(image, size=128, activation='relu')
            ...     hidden2 = paddle.static.nn.fc(hidden1, size=64, activation='relu')
            ...     predict = paddle.static.nn.fc(hidden2, size=10, activation='softmax')
            ...     label = paddle.static.data(name='y', shape=[1], dtype='int64')
            ...     cost = paddle.nn.functional.cross_entropy(input=predict, label=label)
            ...     avg_cost = paddle.mean(cost)
            >>> prog_clip = prog.clone()
            >>> prog_clip.block(0).var(hidden1.name)._set_error_clip(
            ...     paddle.nn.clip.ErrorClipByValue(
            ...         max=CLIP_MAX, min=CLIP_MIN))
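            >>> # With this setting, the gradient flowing back through `hidden1`
            >>> # in `prog_clip` is clipped into [CLIP_MIN, CLIP_MAX] during
            >>> # backpropagation.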
    """

    def __init__(self, max, min=None):
        max = float(max)
        if min is None:
            min = -max
        else:
            min = float(min)
        self.max = max
        self.min = min

    def __str__(self):
        return f"ByValue, min={self.min:f}, max={self.max:f}"

    def _append_clip_op(self, block, grad_name):
        clip_op_desc = block.desc.append_op()
        clip_op_desc.set_type("clip")
        clip_op_desc.set_input("X", [grad_name])
        clip_op_desc.set_output("Out", [grad_name])
        clip_op_desc._set_attr("min", self.min)
        clip_op_desc._set_attr("max", self.max)


def error_clip_callback(block, context):
    # the context is a grad_to_var map
    grad_to_var = context
    op_desc = block.desc.op(block.desc.op_size() - 1)
    for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]:
        fwd_var = block._var_recursive(grad_to_var[grad_n])
        error_clip = getattr(fwd_var, "error_clip", None)
        if not (
            error_clip is None or isinstance(error_clip, BaseErrorClipAttr)
        ):
            raise TypeError(
                "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
            )
        if error_clip is not None:
            error_clip._append_clip_op(block, grad_n)


class ClipGradBase:
    def __init__(self):
        super().__init__()

    def __str__(self):
        raise NotImplementedError

    @imperative_base.no_grad()
    def _dygraph_clip(self, params_grads):
        raise NotImplementedError

    def _pir_clip(self, params_grads):
        raise NotImplementedError

    def _static_clip(self, params_grads):
        raise NotImplementedError

    def __call__(
        self, params_grads: list[tuple[Tensor, Tensor]]
    ) -> list[tuple[Tensor, Tensor]]:
        if in_dynamic_mode():
            return self._dygraph_clip(params_grads)
        if in_pir_mode():
            return self._pir_clip(params_grads)
        for p, g in params_grads:
            if getattr(p, 'gradient_clip_attr', None) is not None:
                warnings.warn(
                    "'set_gradient_clip' will be ineffective, because you have "
                    "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' "
                    "is redundant and you can remove it."
                )
                break
        return self._static_clip(params_grads)

    def _process_context(self, context, param, grad):
        raise NotImplementedError

    def _create_operators(self, param, grad):
        raise NotImplementedError


class ClipGradByValue(ClipGradBase):
    r"""
    Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max].

    - Any values less than min are set to ``min``.

    - Any values greater than max are set to ``max``.
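
    That is, the applied rule is equivalent to the elementwise formula:

    .. math::

        Out = \min(\max(X, min), max)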

    The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of a specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

    Gradient clipping takes effect after being set in ``optimizer`` ; see the document of ``optimizer``
    (for example: :ref:`api_paddle_optimizer_SGD`).

    Note:
        ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.

    Args:
        max (float): The maximum value to clip by.
        min (float, optional): The minimum value to clip by. If not set by the user, it will be set to ``-max``
            automatically. In this case, ``max`` must be greater than :math:`0`.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            >>> linear = paddle.nn.Linear(in_features=10, out_features=10,
            ...                           weight_attr=paddle.ParamAttr(need_clip=True),
            ...                           bias_attr=paddle.ParamAttr(need_clip=False))
            >>> out = linear(x)
            >>> loss = paddle.mean(out)
            >>> loss.backward()

            >>> clip = paddle.nn.ClipGradByValue(min=-1, max=1)
            >>> sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            >>> sdg.step()
    """

    max: float
    min: float

    def __init__(self, max: float, min: float | None = None) -> None:
        super().__init__()
        if min is None:
            assert max > 0.0
            min = -max
        self.max = float(max)
        self.min = float(min)

    def __str__(self) -> str:
        return f"Clip Gradient By Value, min = {self.min:f}, max={self.max:f}"

    @imperative_base.no_grad()
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            new_grad = paddle.clip(x=g, min=self.min, max=self.max)
            params_and_grads.append((p, new_grad))
        return params_and_grads

    def _static_clip(self, params_grads):
        params_and_grads = []
        param_new_grad_name_dict = {}
        with framework.name_scope('gradient_clip'):
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, 'need_clip', True) is False:
                    params_and_grads.append((p, g))
                    continue

                with p.block.program._optimized_guard([p, g]):
                    new_grad = paddle.clip(x=g, min=self.min, max=self.max)
                params_and_grads.append((p, new_grad))
                param_new_grad_name_dict[p.name] = new_grad.name
        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
        return params_and_grads

    def _process_context(self, context, param, grad):
        pass

    def _create_operators(self, param, grad):
        new_grad = paddle.clip(x=grad, min=self.min, max=self.max)
        return param, new_grad


class ClipGradByNorm(ClipGradBase):
    r"""
    Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .

    - If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio.

    - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done.

    The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of a specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

    Gradient clipping takes effect after being set in ``optimizer`` ; see the document of ``optimizer``
    (for example: :ref:`api_paddle_optimizer_SGD`).

    The clipping formula is:

    .. math::
        Out =
        \left\{
            \begin{array}{ccl}
                X & & if (norm(X) \leq clip\_norm) \\
                \frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\
        \end{array}
        \right.


    where :math:`norm(X)` represents the L2 norm of :math:`X`.

    .. math::
        norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}}
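
    For example, with ``clip_norm = 1.0`` and :math:`norm(X) = 4`, every entry of
    :math:`X` is multiplied by :math:`1/4`.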

    Note:
        ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.

    Args:
        clip_norm(float): The maximum norm value.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            >>> linear = paddle.nn.Linear(in_features=10, out_features=10,
            ...                           weight_attr=paddle.ParamAttr(need_clip=True),
            ...                           bias_attr=paddle.ParamAttr(need_clip=False))
            >>> out = linear(x)
            >>> loss = paddle.mean(out)
            >>> loss.backward()

            >>> clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
            >>> sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            >>> sdg.step()
    """

    clip_norm: float

    def __init__(self, clip_norm: float) -> None:
        super().__init__()
        self.clip_norm = float(clip_norm)

    def __str__(self) -> str:
        return f"Gradient Clip By Norm, clip_norm={self.clip_norm:f}"

    def _clip_gradients(self, params_grads):
        params_and_grads = []
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            new_grad = clip_by_norm(x=g, max_norm=self.clip_norm)
            params_and_grads.append((p, new_grad))
        return params_and_grads

    @imperative_base.no_grad()
    def _dygraph_clip(self, params_grads):
        return self._clip_gradients(params_grads)

    def _pir_clip(self, params_grads):
        return self._clip_gradients(params_grads)

    def _static_clip(self, params_grads):
        params_and_grads = []
        with framework.name_scope('gradient_clip'):
            param_new_grad_name_dict = {}
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, 'need_clip', True) is False:
                    params_and_grads.append((p, g))
                    continue

                with p.block.program._optimized_guard([p, g]):
                    new_grad = clip_by_norm(x=g, max_norm=self.clip_norm)
                param_new_grad_name_dict[p.name] = new_grad.name
                params_and_grads.append((p, new_grad))
        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
        return params_and_grads

    def _process_context(self, context, param, grad):
        pass

    def _create_operators(self, param, grad):
        new_grad = clip_by_norm(x=grad, max_norm=self.clip_norm)
        return param, new_grad


_allow_pure_fp16_global_norm_clip_flag = False


def _allow_pure_fp16_global_norm_clip(*args):
    global _allow_pure_fp16_global_norm_clip_flag
    if len(args) == 0:
        return _allow_pure_fp16_global_norm_clip_flag
    else:
        assert len(args) == 1 and isinstance(args[0], bool)
        old_value = _allow_pure_fp16_global_norm_clip_flag
        _allow_pure_fp16_global_norm_clip_flag = args[0]
        return old_value


_allow_pure_bf16_global_norm_clip_flag = False


def _allow_pure_bf16_global_norm_clip(*args):
    global _allow_pure_bf16_global_norm_clip_flag
    if len(args) == 0:
        return _allow_pure_bf16_global_norm_clip_flag
    else:
        assert len(args) == 1 and isinstance(args[0], bool)
        old_value = _allow_pure_bf16_global_norm_clip_flag
        _allow_pure_bf16_global_norm_clip_flag = args[0]
        return old_value


class ClipGradByGlobalNorm(ClipGradBase):
    r"""
    Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
    :math:`t\_list` , and limit it to ``clip_norm`` .

    - If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio.

    - If the global norm is less than or equal to ``clip_norm`` , nothing will be done.

    The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of a specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

    Gradient clipping takes effect after being set in ``optimizer`` ; see the document of ``optimizer``
    (for example: :ref:`api_paddle_optimizer_SGD`).

    The clipping formula is:

    .. math::

        t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)}

    where:

    .. math::

        global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
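
    For example, two gradients with L2 norms :math:`3` and :math:`4` give
    :math:`global\_norm = \sqrt{3^2 + 4^2} = 5`; with ``clip_norm = 1.0``, every
    element of both tensors is then scaled by :math:`1/5`.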

    Note:
        ``need_clip`` of ``ClipGradByGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.

    Args:
        clip_norm (float): The maximum norm value.
        group_name (str, optional): The group name for this clip. Default value is ``default_group``.
        auto_skip_clip (bool, optional): Whether to skip clipping when the global norm does not exceed ``clip_norm``. Default value is ``False``.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            >>> linear = paddle.nn.Linear(in_features=10, out_features=10,
            ...                           weight_attr=paddle.ParamAttr(need_clip=True),
            ...                           bias_attr=paddle.ParamAttr(need_clip=False))
            >>> out = linear(x)
            >>> loss = paddle.mean(out)
            >>> loss.backward()

            >>> clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
            >>> sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            >>> sdg.step()
    """

    clip_norm: float
    group_name: str
    auto_skip_clip: bool

    def __init__(
        self,
        clip_norm: float,
        group_name: str = "default_group",
        auto_skip_clip: bool = False,
    ) -> None:
        super().__init__()
        self.clip_norm = float(clip_norm)
        self.group_name = group_name
        assert isinstance(auto_skip_clip, bool)
        self.auto_skip_clip = auto_skip_clip
        self._async_add_n = None
        self.should_comm_on_shard_dim = False

    def __str__(self) -> str:
        return f"Gradient Clip By GlobalNorm, global_norm={self.clip_norm:f}"

    @imperative_base.no_grad()
    def _dygraph_clip(self, params_grads):
        # NOTE(reconstruction): this keeps the recoverable single-device
        # algorithm; the original additionally reshards/all-reduces the
        # squared norms of auto-parallel (dist) tensors across their process
        # mesh (pipeline/sharding/mp groups) before taking the square root.
        def async_add_n(var_list):
            return paddle.stack(var_list).sum()

        params_and_grads = []
        sum_square_list = []
        sum_square_list_fp16 = []
        sum_square_list_fp32 = []
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                continue
            merge_grad = g
            if in_dynamic_mode() and g.is_selected_rows():
                merge_grad = merge_selected_rows(g)
                merge_grad = merge_grad._get_tensor_from_selected_rows()
            elif g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = merge_selected_rows(g)
                merge_grad = get_tensor_from_selected_rows(merge_grad)

            sum_square = _squared_l2_norm(merge_grad)
            if (
                sum_square.dtype == paddle.float16
                or sum_square.dtype == paddle.bfloat16
            ):
                sum_square_list_fp16.append(sum_square)
            elif sum_square.dtype == paddle.float32:
                sum_square_list_fp32.append(sum_square)
            else:
                sum_square_list.append(sum_square)

        # all parameters have been filtered out, so skip clipping
        if (
            len(sum_square_list)
            + len(sum_square_list_fp16)
            + len(sum_square_list_fp32)
            == 0
        ):
            return params_grads

        sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32"
        global_norm_var = []
        if len(sum_square_list_fp16) > 0:
            global_norm_var_fp16 = async_add_n(sum_square_list_fp16)
            global_norm_var.append(global_norm_var_fp16.astype(sum_dtype))
        if len(sum_square_list_fp32) > 0:
            global_norm_var_fp32 = async_add_n(sum_square_list_fp32)
            if sum_dtype == 'float32':
                global_norm_var.append(global_norm_var_fp32)
            else:
                global_norm_var.append(global_norm_var_fp32.astype(sum_dtype))
        if len(sum_square_list) > 0:
            global_norm_var_fp64 = async_add_n(sum_square_list)
            global_norm_var.append(global_norm_var_fp64)

        global_norm_var = async_add_n(global_norm_var)
        global_norm_var = paddle.sqrt(global_norm_var)
        max_global_norm = paddle.full(
            shape=[], dtype=sum_dtype, fill_value=self.clip_norm
        )

        need_clip = False
        if not self.auto_skip_clip:  # always apply clip
            need_clip = True
            clip_var = paddle.divide(
                x=max_global_norm,
                y=paddle.maximum(x=global_norm_var, y=max_global_norm),
            )
        elif global_norm_var > max_global_norm:
            # only clip when global_norm_var > max_global_norm
            need_clip = True
            clip_var = paddle.divide(x=max_global_norm, y=global_norm_var)

        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            if need_clip:
                clip_input = (
                    clip_var.astype(g.dtype)
                    if clip_var.dtype != g.dtype
                    else clip_var
                )
                if _can_inplace_clip_grad(g, clip_input):
                    g.multiply_(clip_input)
                    params_and_grads.append((p, g))
                else:
                    params_and_grads.append(
                        (p, paddle.multiply(g, clip_input))
                    )
            else:
                params_and_grads.append((p, g))

        return params_and_grads

    def _pir_clip(self, params_grads):
        # NOTE(reconstruction): same single-device algorithm as the dygraph
        # path; the original keeps separate accumulators for distributed and
        # non-distributed gradients and all-reduces the distributed part. The
        # ``auto_skip_clip`` fast path is likewise omitted here.
        params_and_grads = []
        sum_square_list = []
        sum_square_list_fp16 = []
        sum_square_list_fp32 = []
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                continue
            merge_grad = g
            if g.is_selected_row_type():
                merge_grad = merge_selected_rows(g)
                merge_grad = get_tensor_from_selected_rows(merge_grad)
            sum_square = _squared_l2_norm(merge_grad)
            if (
                sum_square.dtype == DataType.FLOAT16
                or sum_square.dtype == DataType.BFLOAT16
            ):
                sum_square_list_fp16.append(sum_square)
            elif sum_square.dtype == DataType.FLOAT32:
                sum_square_list_fp32.append(sum_square)
            else:
                sum_square_list.append(sum_square)

        if (
            len(sum_square_list)
            + len(sum_square_list_fp16)
            + len(sum_square_list_fp32)
            == 0
        ):
            return params_grads

        sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32"
        global_norm_var = []
        if len(sum_square_list_fp16) > 0:
            global_norm_var.append(
                paddle.add_n(sum_square_list_fp16).astype(sum_dtype)
            )
        if len(sum_square_list_fp32) > 0:
            global_norm_var_fp32 = paddle.add_n(sum_square_list_fp32)
            if sum_dtype == 'float32':
                global_norm_var.append(global_norm_var_fp32)
            else:
                global_norm_var.append(global_norm_var_fp32.astype(sum_dtype))
        if len(sum_square_list) > 0:
            global_norm_var.append(paddle.add_n(sum_square_list))

        global_norm_var = paddle.sqrt(paddle.add_n(global_norm_var))
        max_global_norm = paddle.full(
            shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm
        )
        clip_var = paddle.divide(
            x=max_global_norm,
            y=paddle.maximum(x=global_norm_var, y=max_global_norm),
        )
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            clip_input = (
                clip_var.astype(g.dtype)
                if clip_var.dtype != g.dtype
                else clip_var
            )
            params_and_grads.append((p, paddle.multiply(g, clip_input)))
        return params_and_grads

    def _static_clip(self, params_grads):
        def _add_n(var_list):
            if self._async_add_n:
                return paddle.stack(var_list).sum()
            return paddle.add_n(var_list)

        params_and_grads = []
        sum_square_list = []
        sum_square_list_fp16 = []
        sum_square_list_bf16 = []
        sum_square_list_fp32 = []
        with framework.name_scope('gradient_clip'):
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, 'need_clip', True) is False:
                    continue
                merge_grad = g
                with p.block.program._optimized_guard([p, g]):
                    if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                        merge_grad = merge_selected_rows(g)
                        merge_grad = get_tensor_from_selected_rows(merge_grad)
                    sum_square = _squared_l2_norm(merge_grad)
                    if sum_square.dtype == core.VarDesc.VarType.FP16:
                        sum_square_list_fp16.append(sum_square)
                    elif sum_square.dtype == core.VarDesc.VarType.BF16:
                        sum_square_list_bf16.append(sum_square)
                    elif sum_square.dtype == core.VarDesc.VarType.FP32:
                        sum_square_list_fp32.append(sum_square)
                    else:
                        sum_square_list.append(sum_square)

            if len(sum_square_list_fp16) > 0 and len(sum_square_list_bf16) > 0:
                raise NotImplementedError(
                    'FP16 and BF16 are not supported at the same time.'
                )

            # all parameters have been filtered out, so skip clipping
            if (
                len(sum_square_list)
                + len(sum_square_list_fp16)
                + len(sum_square_list_bf16)
                + len(sum_square_list_fp32)
                == 0
            ):
                return params_grads

            with p.block.program._optimized_guard([p, g]):
                sum_dtype = (
                    'float64' if len(sum_square_list) > 0 else "float32"
                )

                global_norm_var = []
                if len(sum_square_list_fp16) > 0:
                    global_norm_var_fp16 = _add_n(sum_square_list_fp16)
                    if (
                        sum_square_list_fp32
                        or sum_square_list
                        or not _allow_pure_fp16_global_norm_clip()
                    ):
                        global_norm_var.append(
                            global_norm_var_fp16.astype(sum_dtype)
                        )
                    else:
                        global_norm_var.append(global_norm_var_fp16)
                if len(sum_square_list_bf16) > 0:
                    global_norm_var_bf16 = _add_n(sum_square_list_bf16)
                    if (
                        sum_square_list_fp32
                        or sum_square_list
                        or not _allow_pure_bf16_global_norm_clip()
                    ):
                        global_norm_var.append(
                            global_norm_var_bf16.astype(sum_dtype)
                        )
                    else:
                        global_norm_var.append(global_norm_var_bf16)
                if len(sum_square_list_fp32) > 0:
                    global_norm_var_fp32 = _add_n(sum_square_list_fp32)
                    if sum_dtype == 'float32':
                        global_norm_var.append(global_norm_var_fp32)
                    else:
                        global_norm_var.append(
                            global_norm_var_fp32.astype(sum_dtype)
                        )
                if len(sum_square_list) > 0:
                    global_norm_var.append(_add_n(sum_square_list))

                global_norm_var = (
                    _add_n(global_norm_var)
                    if len(global_norm_var) > 1
                    else global_norm_var[0]
                )
                global_norm_var = paddle.sqrt(x=global_norm_var)
                max_global_norm = paddle.full(
                    shape=[1],
                    dtype=global_norm_var.dtype,
                    fill_value=self.clip_norm,
                )
                scale_var = paddle.divide(
                    x=max_global_norm,
                    y=paddle.maximum(x=max_global_norm, y=global_norm_var),
                )
            param_new_grad_name_dict = {}
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, 'need_clip', True) is False:
                    params_and_grads.append((p, g))
                    continue

                with p.block.program._optimized_guard([p, g]):
                    new_g = _cast_to_mp_type_if_enabled(g)
                    if (
                        new_g.dtype == core.VarDesc.VarType.FP16
                        and scale_var.dtype != core.VarDesc.VarType.FP16
                    ):
                        scale_input = scale_var.astype('float16')
                    elif (
                        new_g.dtype == core.VarDesc.VarType.BF16
                        and scale_var.dtype != core.VarDesc.VarType.BF16
                    ):
                        scale_input = scale_var.astype('bfloat16')
                    else:
                        scale_input = scale_var
                    block = default_main_program().current_block()
                    # inplace scale of the gradient
                    block.append_op(
                        type='elementwise_mul',
                        inputs={'X': new_g, 'Y': scale_input},
                        outputs={'Out': new_g},
                    )
                    if new_g is not g:
                        block.append_op(
                            type='cast',
                            inputs={'X': new_g},
                            outputs={'Out': g},
                            attrs={
                                'in_dtype': new_g.dtype,
                                'out_dtype': g.dtype,
                            },
                        )
                param_new_grad_name_dict[p.name] = g.name
                params_and_grads.append((p, g))
        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
        return params_and_grads

    def _process_context(self, context, param, grad):
        if self.group_name not in context:
            context[self.group_name] = []
            context[self.group_name + "_clip_value"] = self.clip_norm
            context[self.group_name + "_clip"] = paddle.full(
                shape=[1], dtype=grad.dtype, fill_value=self.clip_norm
            )
        else:
            if not self.clip_norm == context[self.group_name + "_clip_value"]:
                raise ValueError(
                    "All parameters' 'clip_norm' of a same group should be the same"
                )

        merge_grad = grad
        if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
            merge_grad = merge_selected_rows(grad)
            merge_grad = get_tensor_from_selected_rows(merge_grad)
        elif in_dynamic_mode() and grad.is_selected_rows():
            merge_grad = merge_selected_rows(grad)
            merge_grad = merge_grad._get_tensor_from_selected_rows()

        local_norm_var = _squared_l2_norm(merge_grad)
        context[self.group_name].append(local_norm_var)

        self.context = context

    def _create_operators(self, param, grad):
        def async_add_n(var_list):
            return paddle.stack(var_list).sum()

        group_scale_name = self.group_name + "_scale"
        if group_scale_name not in self.context:
            group_norm_var = async_add_n(self.context[self.group_name])
            group_norm_var = paddle.sqrt(x=group_norm_var)
            clip_var = self.context[self.group_name + "_clip"]
            group_scale_var = paddle.divide(
                x=clip_var,
                y=paddle.maximum(x=clip_var, y=group_norm_var),
            )
            assert group_scale_var.shape == (1,)
            self.context[group_scale_name] = group_scale_var

        if in_dynamic_mode():
            new_grad = paddle.multiply(grad, self.context[group_scale_name])
            return param, new_grad

        # inplace scale in static graph
        param.block.append_op(
            type='elementwise_mul',
            inputs={'X': grad, 'Y': self.context[group_scale_name]},
            outputs={'Out': grad},
        )

        return param, grad


@framework.dygraph_not_support
def set_gradient_clip(clip, param_list=None, program=None):
    """
    Warning:

        This API must be used after building network, and before ``minimize`` ,
        and it may be removed in future releases, so it is not recommended.
        It is recommended to set ``grad_clip`` when initializing the ``optimizer`` ,
        this is a better method to clip gradient. There are three clipping strategies:
         :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
         :ref:`api_paddle_nn_ClipGradByValue` .

    To specify parameters that require gradient clip.

    Args:
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
            :ref:`api_paddle_nn_ClipGradByValue` ). Default value: None, and there is no
            gradient clipping.
        param_list (list(Variable), optional): Parameters that require gradient clip.
                It can be a list of parameters or a list of parameter names.
                Default None, meaning that all parameters in the program will be included.
        program (Program, optional): The program where parameters are located.
                Default None, meaning that using :ref:`api_paddle_static_default_main_program` .

    Returns:
        None

    Examples:
        .. code-block:: python

            >>> import paddle

            >>> paddle.enable_static()

            >>> def network():
            ...     image = paddle.static.data(name='image', shape=[
            ...                        None, 28], dtype='float32')
            ...     param_attr1 = paddle.ParamAttr("fc1_param")
            ...     fc1 = paddle.static.nn.fc(image, size=10, weight_attr=param_attr1)
            ...     param_attr2 = paddle.ParamAttr("fc2_param")
            ...     fc2 = paddle.static.nn.fc(fc1, size=10, weight_attr=param_attr2)
            ...     loss = paddle.mean(fc2)
            ...     return loss


            >>> # network 1: clip all parameter gradient
            >>> with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()):
            ...     loss = network()
            ...     paddle.nn.clip.set_gradient_clip(
            ...         paddle.nn.ClipGradByGlobalNorm(clip_norm=2.0))
            ...     sgd = paddle.optimizer.SGD(learning_rate=1e-3)
            ...     sgd.minimize(loss)

            >>> # network 2: clip parameter gradient by name
            >>> with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()):
            ...     loss = network()
            ...     paddle.nn.clip.set_gradient_clip(
            ...         paddle.nn.ClipGradByValue(min=-1.0, max=1.0),
            ...         param_list=["fc1_param", "fc2_param"])
            ...     sgd = paddle.optimizer.SGD(learning_rate=1e-3)
            ...     sgd.minimize(loss)

            >>> # network 3: clip parameter gradient by value
            >>> with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()):
            ...     loss = network()
            ...     param_var1 = paddle.static.default_main_program().global_block().var("fc1_param")
            ...     param_var2 = paddle.static.default_main_program().global_block().var("fc2_param")
            ...     paddle.nn.clip.set_gradient_clip(
            ...         paddle.nn.ClipGradByValue(min=-1.0, max=1.0),
            ...         param_list=[param_var1, param_var2])
            ...     sgd = paddle.optimizer.SGD(learning_rate=1e-3)
            ...     sgd.minimize(loss)

            >>> # network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together
            >>> with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()):
            ...     loss = network()
            ...     clip1 = paddle.nn.ClipGradByValue(min=-1.0, max=1.0)
            ...     clip2 = paddle.nn.ClipGradByNorm(clip_norm=1.0)
            ...     # Set the gradient clipping strategy: clip1
            ...     paddle.nn.clip.set_gradient_clip(clip1)
            ...     # Set the gradient clipping strategy: clip2
            ...     sgd = paddle.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2)
            ...     sgd.minimize(loss)
            ...     # 'set_gradient_clip' will not take effect when setting has a conflict,
            ...     # and the gradient clipping strategy will be 'clip2'

    """
    warnings.warn(
        "Caution! 'set_gradient_clip' is not recommended "
        "and may be deprecated in future! "
        "We recommend a new strategy: set 'grad_clip' "
        "when initializing the 'optimizer'. "
        "This method can reduce the mistakes, please "
        "refer to documentation of 'optimizer'."
    )

    if not isinstance(clip, ClipGradBase):
        raise TypeError(
            "'clip' should be an instance of ClipGradBase's derived class"
        )
    if program is None:
        program = default_main_program()

    for op in program.block(0).ops:
        if 'op_namescope' in op.all_attrs() and "optimizer" in op.attr(
            "op_namescope"
        ):
            warnings.warn(
                "'minimize' has been invoked before, this will make "
                "'set_gradient_clip' be ineffective! Please invoke "
                "'set_gradient_clip' before 'minimize'."
            )
            break

    if param_list is None:
        param_list = program.block(0).all_parameters()
    if all(isinstance(elem, str) for elem in param_list):
        param_list = [program.block(0).var(elem) for elem in param_list]
    if not all(isinstance(elem, framework.Parameter) for elem in param_list):
        raise TypeError(
            "'param_list' should be a list of Parameter or basestring(parameter's name)."
        )

    for param in param_list:
        param.gradient_clip_attr = copy.deepcopy(clip)


def append_gradient_clip_ops(param_grads):
    context = {}
    for p, g in param_grads:
        if g is None:
            continue
        with p.block.program._optimized_guard([p, g]), framework.name_scope(
            'gradient_clip'
        ):
            clip_attr = getattr(p, 'gradient_clip_attr', None)
            if clip_attr is None:
                return param_grads
            if not isinstance(clip_attr, ClipGradBase):
                raise TypeError(
                    "clip attribute should be an instance of GradientClipBase"
                )

            clip_attr._process_context(context=context, param=p, grad=g)

    res = []
    param_new_grad_name_dict = {}
    for p, g in param_grads:
        if g is None:
            continue
        with p.block.program._optimized_guard([p, g]), framework.name_scope(
            'gradient_clip'
        ):
            param, new_grad = clip_attr._create_operators(param=p, grad=g)
            param_new_grad_name_dict[param.name] = new_grad.name
            res.append([param, new_grad])
    _correct_clip_op_role_var(res, param_new_grad_name_dict)
    return res


def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
    """Update 'op_role_var' of the clip ops so that each parameter is mapped
    to the name of its new (clipped) gradient."""
    block_id_list = []
    if len(param_new_grad_name_dict) == 0:
        return
    for param, grad in params_grads:
        if grad is None:
            continue
        block_id = param.block.idx
        if block_id in block_id_list:
            continue
        block_id_list.append(block_id)
        for op in param.block.program.global_block().ops:
            if (
                op.has_attr("op_namescope")
                and "gradient_clip" in op.attr("op_namescope")
                and op.attr('op_role_var')
            ):
                param_name = op.attr('op_role_var')[0]
                if param_name in param_new_grad_name_dict:
                    correct_p_g = [
                        param_name,
                        param_new_grad_name_dict[param_name],
                    ]
                    op._set_attr('op_role_var', correct_p_g)


GradientClipBase = ClipGradBase
GradientClipByValue = ClipGradByValue
GradientClipByNorm = ClipGradByNorm
GradientClipByGlobalNorm = ClipGradByGlobalNorm