from __future__ import annotations

from typing import TYPE_CHECKING, Literal

import paddle
from paddle import _C_ops
from paddle.base.data_feeder import check_dtype
from paddle.device import is_compiled_with_cuda, is_compiled_with_rocm
from paddle.device.cuda import get_device_capability
from paddle.framework import LayerHelper, in_dynamic_or_pir_mode

if TYPE_CHECKING:
    from typing_extensions import TypeAlias

    from paddle import Tensor
    from paddle._typing import DTypeLike

_Algo: TypeAlias = Literal["weight_only_int8", "weight_only_int4", "llm.int8"]
_GroupSize: TypeAlias = Literal[-1, 64, 128]


def _get_arch_info():
    if is_compiled_with_cuda() or is_compiled_with_rocm():
        cuda_version = paddle.version.cuda()
        if (
            cuda_version is not None and cuda_version != 'False'
        ) or is_compiled_with_rocm():
            major, minor = get_device_capability()
            arch = int(major * 10 + minor)
            return arch
    raise ValueError(
        "Paddle is not compiled with CUDA, we cannot get SMVersion from device, "
        "please try to compile Paddle with CUDA"
    )
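

# A reading note on _get_arch_info (assuming a CUDA build): the device reports
# a compute capability pair, e.g. (8, 0) on an A100, and the function folds it
# into a single SM number, 8 * 10 + 0 == 80, which the SM70/75/80/86/89/90
# checks below compare against.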

def weight_quantize(
    x: Tensor,
    algo: _Algo = "weight_only_int8",
    arch: int | None = None,
    group_size: _GroupSize = -1,
) -> tuple[Tensor, Tensor]:
    """
    Quantization function for weight_only and llm.int8's weight.

    Args:
        x (Tensor): The input Tensor to be quantized, the data type is float16 or bfloat16.
        algo (str): The quantization algorithm to apply to x, must be one of
            'weight_only_int8', 'weight_only_int4', 'llm.int8', 'w4a8' and 'w4afp8',
            default: 'weight_only_int8'.
        arch (int): The compute arch for target device. For example, A100 is 80, V100 is 70. If you do not assign arch, we will get it from your device, default: None.
        group_size (int): The group size for weight quantization. -1 stands for the default per-channel mode. Currently only -1, 64 and 128 are supported.

    Returns:
        out (Tensor): The quantized result Tensor, the data type is int8, the shape is the transposition of x.
        scale (Tensor): The per-channel scale Tensor, the data type is float32.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import weight_quantize

            >>> paddle.seed(2023)
            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
            >>> out, scale = weight_quantize(x, algo='weight_only_int8')
            >>> print(out.shape)
            [32, 64]
            >>> print(scale.shape)
            [32]
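            >>> # A sketch of group-wise mode (assuming the device supports it):
            >>> # with group_size=64, scale holds one value per group of 64
            >>> # input rows for each output channel instead of a single value.
            >>> out_g, scale_g = weight_quantize(x, algo='weight_only_int8', group_size=64)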
    """
    if arch is None:
        arch = _get_arch_info()

    assert (
        is_compiled_with_rocm()
        or arch == 70
        or arch == 75
        or arch == 80
        or arch == 86
        or arch == 89
        or arch == 90
    ), f"Currently weight_quantize only support SM70/75/80/86/89/90. but got {arch} "

    assert (
        group_size == -1 or group_size == 64 or group_size == 128
    ), f"Currently group_size only support -1/64/128. but got {group_size} "

    if in_dynamic_or_pir_mode():
        return _C_ops.weight_quantize(x, algo, arch, group_size)
    else:
        type = "weight_quantize"
        helper = LayerHelper(type, **locals())
        out = helper.create_variable_for_type_inference('int8')
        scale = helper.create_variable_for_type_inference('float')

        helper.append_op(
            type=type,
            inputs={"x": x},
            outputs={'out': out, "scale": scale},
            attrs={"algo": algo, "arch": arch, "group_size": group_size},
        )
        return (out, scale)
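

# The pair weight_quantize / weight_dequantize is the intended round trip:
# weight_quantize emits the weight in a transposed (and possibly
# architecture-specific) int8 layout plus per-channel scales, and
# weight_dequantize folds the scales back in to recover an approximation of
# the original float16/bfloat16 weight.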
def weight_dequantize(
    x: Tensor,
    scale: Tensor,
    algo: _Algo = "weight_only_int8",
    out_dtype: DTypeLike = 'float16',
    group_size: _GroupSize = -1,
) -> Tensor:
    """
    Dequantization function for weight_only and llm.int8's weight.

    Args:
        x (Tensor): The input Tensor to be dequantized, the data type is int8.
        scale (Tensor): The scale Tensor which is the output of weight_quantize, the data type is float32.
        algo (str): The algorithm that was used to quantize x, must be one of
            'weight_only_int8', 'weight_only_int4' and 'llm.int8', default: 'weight_only_int8'.
        out_dtype (str|np.dtype): [Deprecated][Not used] The output Tensor's data type, must be one of 'float16' and 'bfloat16', default: 'float16'.
        group_size (int): The group size that was used when quantizing x. -1 stands for the default per-channel mode. Currently only -1, 64 and 128 are supported.

    Returns:
        out (Tensor): The dequantized result Tensor, the data type is float16 or bfloat16, the shape is the transposition of x.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import weight_quantize, weight_dequantize

            >>> paddle.seed(2023)
            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
            >>> out, scale = weight_quantize(x, algo='weight_only_int8')
            >>> x_dequant = weight_dequantize(out, scale)
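            >>> # x_dequant approximately reconstructs x; the gap is the
            >>> # per-channel quantization error (a sketch, not an exact bound):
            >>> max_err = (x - x_dequant).abs().max()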
    """
    assert (
        group_size == -1 or group_size == 64 or group_size == 128
    ), f"Currently group_size only support -1/64/128. but got {group_size} "

    if in_dynamic_or_pir_mode():
        return _C_ops.weight_dequantize(x, scale, algo, group_size)
    else:
        type = "weight_dequantize"
        helper = LayerHelper(type, **locals())
        out = helper.create_variable_for_type_inference(x.dtype)

        helper.append_op(
            type=type,
            inputs={"x": x, "scale": scale},
            outputs={'out': out},
            attrs={"algo": algo, "group_size": group_size},
        )
        return out

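
# Layout reminder for the linear ops below: for a logical out = x @ W + bias
# with x of shape [..., in_features], the quantized `weight` argument is the
# transposed [out_features, in_features] tensor produced by weight_quantize,
# and `weight_scale` carries one entry per output channel.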
def weight_only_linear(
    x: Tensor,
    weight: Tensor,
    bias: Tensor | None = None,
    weight_scale: Tensor | None = None,
    weight_dtype: Literal["int8", "int4"] = "int8",
    arch: int | None = None,
    group_size: _GroupSize = -1,
) -> Tensor:
    """
    Applies matrix multiplication of two tensors and then bias addition if provided.
    This method requires CUDA version >= 11.2.

    Args:
        x (Tensor): The first input Tensor to be multiplied, the data type is float16 or bfloat16.
        weight (Tensor): The second input Tensor to be multiplied. Its rank must be 2.
        bias (Tensor|None): The input bias Tensor. If it is None, no bias addition would
            be performed. Otherwise, the bias is added to the matrix multiplication result.
        weight_scale (Tensor|None): The input scale Tensor provided to weight for dequantization. Its rank must be 1.
        weight_dtype (str): The dtype of the weight Tensor, must be one of 'int8' and 'int4', default: 'int8'.
        arch (int): The compute arch for target device. For example, A100 is 80, V100 is 70. If you do not assign arch, we will get it from your device, default: None.
        group_size (int): The group size for weight quantization. -1 stands for the default per-channel mode. Currently only -1, 64 and 128 are supported.
    Returns:
        Tensor: the output Tensor, the data type is the same as that of x.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import weight_only_linear

            >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
            >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
            >>> scale = paddle.randn([32], dtype='float32')
            >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
            >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
            ...    out = weight_only_linear(x, weight, bias=bias, weight_scale=scale, weight_dtype='int8')
            ...    print(out.shape)
            [1, 2, 32]
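            >>> # A sketch of the full weight-only pipeline on the same x
            >>> # (still assuming an SM80+ device as guarded above):
            >>> W = paddle.cast(paddle.randn([64, 32]), dtype='float16')
            >>> qweight, wscale = weight_quantize(W, algo='weight_only_int8')
            >>> y = weight_only_linear(x, qweight, weight_scale=wscale, weight_dtype='int8')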
    """
    if arch is None:
        arch = _get_arch_info()

    assert (
        is_compiled_with_rocm()
        or arch == 70
        or arch == 75
        or arch == 80
        or arch == 86
        or arch == 89
        or arch == 90
    ), f"Currently weight_quantize only support SM70/75/80/86/89/90. but got {arch} "
    assert (
        group_size == -1 or group_size == 64 or group_size == 128
    ), f"Currently weight_quantize only support group size of -1, 64 or 128. but got {group_size}"

    if in_dynamic_or_pir_mode():
        out = _C_ops.weight_only_linear(
            x, weight, bias, weight_scale, weight_dtype, arch, group_size
        )
        return out
    else:
        check_dtype(
            weight_dtype, 'weight_dtype', ['int8', 'int4'], 'weight_only_linear'
        )
        type = "weight_only_linear"
        helper = LayerHelper(type, **locals())
        dtype = x.dtype

        inputs = {
            'x': [x],
            'weight': [weight],
            'weight_scale': [weight_scale],
        }
        if bias is not None:
            inputs["bias"] = [bias]
        attrs = {
            'weight_dtype': weight_dtype,
            'arch': arch,
            'group_size': group_size,
        }

        out = helper.create_variable_for_type_inference(dtype)

        helper.append_op(
            type=type,
            inputs=inputs,
            outputs={'out': out},
            attrs=attrs,
        )
        return out
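

# llm.int8 background (informal): activations are processed in int8 except for
# "outlier" feature channels whose magnitude exceeds `threshold`; those
# channels are handled in the floating-point dtype of x by a separate matmul,
# which preserves accuracy on large language models.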
def llm_int8_linear(
    x: Tensor,
    weight: Tensor,
    bias: Tensor | None = None,
    weight_scale: Tensor | None = None,
    threshold: float = 6.0,
) -> Tensor:
    """
    Applies matrix multiplication of two tensors and then bias addition if provided.
    This method requires CUDA version >= 11.2.

    Args:
        x (Tensor): the first input Tensor to be multiplied, the data type is float16 or bfloat16.
        weight (Tensor): the second input Tensor to be multiplied. Its rank must be 2.
        bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would
            be performed. Otherwise, the bias is added to the matrix multiplication result.
        weight_scale (Tensor|None): the input scale Tensor provided to weight for dequantization. Its rank must be 1.
        threshold (float): The magnitude above which an activation value is treated as an outlier; outlier channels are multiplied in x.dtype instead of being quantized to int8.

    Returns:
        Tensor: the output Tensor, the data type is the same as that of x.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import llm_int8_linear

            >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
            >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
            >>> scale = paddle.randn([32], dtype='float32')
            >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
            >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
            ...    out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0)
            ...    print(out.shape)
            [1, 2, 32]
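            >>> # threshold=6.0 mirrors the LLM.int8() paper's default; raising it
            >>> # treats fewer channels as outliers (faster, possibly less accurate).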
    """
    if in_dynamic_or_pir_mode():
        out = _C_ops.llm_int8_linear(x, weight, bias, weight_scale, threshold)
        return out
    else:
        type = "llm_int8_linear"
        helper = LayerHelper(type, **locals())
        dtype = x.dtype

        inputs = {
            'x': [x],
            'weight': [weight],
            'weight_scale': [weight_scale],
        }
        if bias:
            inputs["bias"] = [bias]
        attrs = {'threshold': threshold}

        out = helper.create_variable_for_type_inference(dtype)

        helper.append_op(
            type=type,
            inputs=inputs,
            outputs={'out': out},
            attrs=attrs,
        )
        return out

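
# apply_per_channel_scale implements the activation pre-scaling step used by
# SmoothQuant-style recipes: each feature channel of x is rescaled by the
# matching entry of `scales` before the quantized matmul. Informally,
# out[..., j] = x[..., j] * scales[j] (a sketch of the intended semantics;
# the exact arithmetic lives in the apply_per_channel_scale kernel).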
def apply_per_channel_scale(x: Tensor, scales: Tensor) -> Tensor:
    """
    Apply pre-quant per channel scale on activations

    Args:
        x (Tensor): Input tensor representing the activations, the data type can be float16 or bfloat16.
        scales(Tensor): Per-channel scale factors for pre-quantization. Data type should be compatible with x.

    Returns:
        out (Tensor): The Tensor holding the pre-quantization scaling result, the data type is compatible with x.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import apply_per_channel_scale

            >>> paddle.seed(2023)
            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
            >>> scales = paddle.rand(shape=[32], dtype=paddle.float16)
            >>> out = apply_per_channel_scale(x, scales)
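            >>> # out keeps the shape of x, since the rescale is elementwise
            >>> # per channel:
            >>> print(out.shape)
            [64, 32]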
    """
    if in_dynamic_or_pir_mode():
        return _C_ops.apply_per_channel_scale(x, scales)
    else:
        type = "apply_per_channel_scale"
        helper = LayerHelper(type, **locals())
        out = helper.create_variable_for_type_inference(x.dtype)

        helper.append_op(
            type=type,
            inputs={"x": x, "scales": scales},
            outputs={"out": out},
        )
        return out