from __future__ import annotations

from typing import TYPE_CHECKING, Literal

import paddle
from paddle import _C_ops
from paddle.base.data_feeder import check_dtype
from paddle.device import is_compiled_with_cuda, is_compiled_with_rocm
from paddle.device.cuda import get_device_capability
from paddle.framework import LayerHelper, in_dynamic_or_pir_mode

if TYPE_CHECKING:
    from typing_extensions import TypeAlias

    from paddle import Tensor
    from paddle._typing import DTypeLike

_Algo: TypeAlias = Literal["weight_only_int8", "weight_only_int4", "llm.int8"]
_GroupSize: TypeAlias = Literal[-1, 64, 128]


def _get_arch_info():
    if is_compiled_with_cuda() or is_compiled_with_rocm():
        cuda_version = paddle.version.cuda()
        if (
            cuda_version is not None and cuda_version != 'False'
        ) or is_compiled_with_rocm():
            major, minor = get_device_capability()
            arch = int(major * 10 + minor)
            return arch
    raise ValueError(
        "Paddle is not compiled with CUDA, we cannot get SMVersion from device, "
        "please try to compile Paddle with CUDA"
    )
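

# A reading note on _get_arch_info (assuming a CUDA build): the device reports
# a compute capability pair, e.g. (8, 0) on an A100, and the function folds it
# into a single SM number, 8 * 10 + 0 == 80, which the SM70/75/80/86/89/90
# checks below compare against.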

def weight_quantize(
    x: Tensor,
    algo: _Algo = "weight_only_int8",
    arch: int | None = None,
    group_size: _GroupSize = -1,
) -> tuple[Tensor, Tensor]:
    """
    Quantization function for weight_only and llm.int8's weight.

    Args:
        x (Tensor): The input Tensor to be quantized, the data type is float16 or bfloat16.
        algo (str): The quantization algorithm to apply to x, must be one of
            'weight_only_int8', 'weight_only_int4', 'llm.int8', 'w4a8' and 'w4afp8',
            default: 'weight_only_int8'.
        arch (int): The compute arch for target device. For example, A100 is 80, V100 is 70. If you do not assign arch, we will get it from your device, default: None.
        group_size (int): The group size for weight quantization. -1 stands for the default per-channel mode. Currently only -1, 64 and 128 are supported.

    Returns:
        out (Tensor): The quantized result Tensor, the data type is int8, the shape is the transposition of x.
        scale (Tensor): The per-channel scale Tensor, the data type is float32.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import weight_quantize

            >>> paddle.seed(2023)
            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
            >>> out, scale = weight_quantize(x, algo='weight_only_int8')
            >>> print(out.shape)
            [32, 64]
            >>> print(scale.shape)
            [32]
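            >>> # A sketch of group-wise mode (assuming the device supports it):
            >>> # with group_size=64, scale holds one value per group of 64
            >>> # input rows for each output channel instead of a single value.
            >>> out_g, scale_g = weight_quantize(x, algo='weight_only_int8', group_size=64)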
    """
    if arch is None:
        arch = _get_arch_info()

    assert (
        is_compiled_with_rocm()
        or arch == 70
        or arch == 75
        or arch == 80
        or arch == 86
        or arch == 89
        or arch == 90
    ), f"Currently weight_quantize only support SM70/75/80/86/89/90. but got {arch} "

    assert (
        group_size == -1 or group_size == 64 or group_size == 128
    ), f"Currently group_size only support -1/64/128. but got {group_size} "

    if in_dynamic_or_pir_mode():
        return _C_ops.weight_quantize(x, algo, arch, group_size)
    else:
        type = "weight_quantize"
        helper = LayerHelper(type, **locals())
        out = helper.create_variable_for_type_inference('int8')
        scale = helper.create_variable_for_type_inference('float')

        helper.append_op(
            type=type,
            inputs={"x": x},
            outputs={'out': out, "scale": scale},
            attrs={"algo": algo, "arch": arch, "group_size": group_size},
        )
        return (out, scale)
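

# The pair weight_quantize / weight_dequantize is the intended round trip:
# weight_quantize emits the weight in a transposed (and possibly
# architecture-specific) int8 layout plus per-channel scales, and
# weight_dequantize folds the scales back in to recover an approximation of
# the original float16/bfloat16 weight.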
def weight_dequantize(
    x: Tensor,
    scale: Tensor,
    algo: _Algo = "weight_only_int8",
    out_dtype: DTypeLike = 'float16',
    group_size: _GroupSize = -1,
) -> Tensor:
    """
    Dequantization function for weight_only and llm.int8's weight.

    Args:
        x (Tensor): The input Tensor to be dequantized, the data type is int8.
        scale (Tensor): The scale Tensor which is the output of weight_quantize, the data type is float32.
        algo (str): The algorithm that was used to quantize x, must be one of
            'weight_only_int8', 'weight_only_int4' and 'llm.int8', default: 'weight_only_int8'.
        out_dtype (str|np.dtype): [Deprecated][Not used] The output Tensor's data type, must be one of 'float16' and 'bfloat16', default: 'float16'.
        group_size (int): The group size that was used when quantizing x. -1 stands for the default per-channel mode. Currently only -1, 64 and 128 are supported.

    Returns:
        out (Tensor): The dequantized result Tensor, the data type is float16 or bfloat16, the shape is the transposition of x.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import weight_quantize, weight_dequantize

            >>> paddle.seed(2023)
            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
            >>> out, scale = weight_quantize(x, algo='weight_only_int8')
            >>> x_dequant = weight_dequantize(out, scale)
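            >>> # x_dequant approximately reconstructs x; the gap is the
            >>> # per-channel quantization error (a sketch, not an exact bound):
            >>> max_err = (x - x_dequant).abs().max()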
    """
    assert (
        group_size == -1 or group_size == 64 or group_size == 128
    ), f"Currently group_size only support -1/64/128. but got {group_size} "

    if in_dynamic_or_pir_mode():
        return _C_ops.weight_dequantize(x, scale, algo, group_size)
    else:
        type = "weight_dequantize"
        helper = LayerHelper(type, **locals())
        out = helper.create_variable_for_type_inference(x.dtype)

        helper.append_op(
            type=type,
            inputs={"x": x, "scale": scale},
            outputs={'out': out},
            attrs={"algo": algo, "group_size": group_size},
        )
        return out

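
# Layout reminder for the linear ops below: for a logical out = x @ W + bias
# with x of shape [..., in_features], the quantized `weight` argument is the
# transposed [out_features, in_features] tensor produced by weight_quantize,
# and `weight_scale` carries one entry per output channel.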
def weight_only_linear(
    x: Tensor,
    weight: Tensor,
    bias: Tensor | None = None,
    weight_scale: Tensor | None = None,
    weight_dtype: Literal["int8", "int4"] = "int8",
    arch: int | None = None,
    group_size: _GroupSize = -1,
) -> Tensor:
    """
    Applies matrix multiplication of two tensors and then bias addition if provided.
    This method requires CUDA version >= 11.2.

    Args:
        x (Tensor): The first input Tensor to be multiplied, the data type is float16 or bfloat16.
        weight (Tensor): The second input Tensor to be multiplied. Its rank must be 2.
        bias (Tensor|None): The input bias Tensor. If it is None, no bias addition would
            be performed. Otherwise, the bias is added to the matrix multiplication result.
        weight_scale (Tensor|None): The input scale Tensor provided to weight for dequantization. Its rank must be 1.
        weight_dtype (str): The dtype of the weight Tensor, must be one of 'int8' and 'int4', default: 'int8'.
        arch (int): The compute arch for target device. For example, A100 is 80, V100 is 70. If you do not assign arch, we will get it from your device, default: None.
        group_size (int): The group size for weight quantization. -1 stands for the default per-channel mode. Currently only -1, 64 and 128 are supported.
    Returns:
        Tensor: the output Tensor, the data type is the same as that of x.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import weight_only_linear

            >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
            >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
            >>> scale = paddle.randn([32], dtype='float32')
            >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
            >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
            ...    out = weight_only_linear(x, weight, bias=bias, weight_scale=scale, weight_dtype='int8')
            ...    print(out.shape)
            [1, 2, 32]
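            >>> # A sketch of the full weight-only pipeline on the same x
            >>> # (still assuming an SM80+ device as guarded above):
            >>> W = paddle.cast(paddle.randn([64, 32]), dtype='float16')
            >>> qweight, wscale = weight_quantize(W, algo='weight_only_int8')
            >>> y = weight_only_linear(x, qweight, weight_scale=wscale, weight_dtype='int8')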
    """
    if arch is None:
        arch = _get_arch_info()

    assert (
        is_compiled_with_rocm()
        or arch == 70
        or arch == 75
        or arch == 80
        or arch == 86
        or arch == 89
        or arch == 90
    ), f"Currently weight_quantize only support SM70/75/80/86/89/90. but got {arch} "
    assert (
        group_size == -1 or group_size == 64 or group_size == 128
    ), f"Currently weight_quantize only support group size of -1, 64 or 128. but got {group_size}"

    if in_dynamic_or_pir_mode():
        out = _C_ops.weight_only_linear(
            x, weight, bias, weight_scale, weight_dtype, arch, group_size
        )
        return out
    else:
        check_dtype(
            weight_dtype, 'weight_dtype', ['int8', 'int4'], 'weight_only_linear'
        )
        type = "weight_only_linear"
        helper = LayerHelper(type, **locals())
        dtype = x.dtype

        inputs = {
            'x': [x],
            'weight': [weight],
            'weight_scale': [weight_scale],
        }
        if bias is not None:
            inputs["bias"] = [bias]
        attrs = {
            'weight_dtype': weight_dtype,
            'arch': arch,
            'group_size': group_size,
        }

        out = helper.create_variable_for_type_inference(dtype)

        helper.append_op(
            type=type,
            inputs=inputs,
            outputs={'out': out},
            attrs=attrs,
        )
        return out
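

# llm.int8 background (informal): activations are processed in int8 except for
# "outlier" feature channels whose magnitude exceeds `threshold`; those
# channels are handled in the floating-point dtype of x by a separate matmul,
# which preserves accuracy on large language models.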
def llm_int8_linear(
    x: Tensor,
    weight: Tensor,
    bias: Tensor | None = None,
    weight_scale: Tensor | None = None,
    threshold: float = 6.0,
) -> Tensor:
    """
    Applies matrix multiplication of two tensors and then bias addition if provided.
    This method requires CUDA version >= 11.2.

    Args:
        x (Tensor): the first input Tensor to be multiplied, the data type is float16 or bfloat16.
        weight (Tensor): the second input Tensor to be multiplied. Its rank must be 2.
        bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would
            be performed. Otherwise, the bias is added to the matrix multiplication result.
        weight_scale (Tensor|None): the input scale Tensor provided to weight for dequantization. Its rank must be 1.
        threshold (float): The magnitude above which an activation value is treated as an outlier; outlier channels are multiplied in x.dtype instead of being quantized to int8.

    Returns:
        Tensor: the output Tensor, the data type is the same as that of x.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import llm_int8_linear

            >>> x = paddle.cast(paddle.randn([1, 2, 64]), dtype='float16')
            >>> weight = paddle.cast(paddle.randint(0, 127, [32, 64]), dtype='int8')
            >>> scale = paddle.randn([32], dtype='float32')
            >>> bias = paddle.cast(paddle.randn([32]), dtype='float16')
            >>> if paddle.device.cuda.get_device_capability()[0] >= 8:
            ...    out = llm_int8_linear(x, weight, bias=bias, weight_scale=scale, threshold=6.0)
            ...    print(out.shape)
            [1, 2, 32]
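            >>> # threshold=6.0 mirrors the LLM.int8() paper's default; raising it
            >>> # treats fewer channels as outliers (faster, possibly less accurate).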
    """
    if in_dynamic_or_pir_mode():
        out = _C_ops.llm_int8_linear(x, weight, bias, weight_scale, threshold)
        return out
    else:
        type = "llm_int8_linear"
        helper = LayerHelper(type, **locals())
        dtype = x.dtype

        inputs = {
            'x': [x],
            'weight': [weight],
            'weight_scale': [weight_scale],
        }
        if bias:
            inputs["bias"] = [bias]
        attrs = {'threshold': threshold}

        out = helper.create_variable_for_type_inference(dtype)

        helper.append_op(
            type=type,
            inputs=inputs,
            outputs={'out': out},
            attrs=attrs,
        )
        return out

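
# apply_per_channel_scale implements the activation pre-scaling step used by
# SmoothQuant-style recipes: each feature channel of x is rescaled by the
# matching entry of `scales` before the quantized matmul. Informally,
# out[..., j] = x[..., j] * scales[j] (a sketch of the intended semantics;
# the exact arithmetic lives in the apply_per_channel_scale kernel).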
def apply_per_channel_scale(x: Tensor, scales: Tensor) -> Tensor:
    """
    Apply pre-quant per channel scale on activations

    Args:
        x (Tensor): Input tensor representing the activations, the data type can be float16 or bfloat16.
        scales(Tensor): Per-channel scale factors for pre-quantization. Data type should be compatible with x.

    Returns:
        out (Tensor): The Tensor holding the pre-quantization scaling result, the data type is compatible with x.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('No testing required')
            >>> import paddle
            >>> from paddle.nn.quant import apply_per_channel_scale

            >>> paddle.seed(2023)
            >>> x = paddle.rand(shape=[64, 32], dtype=paddle.float16)
            >>> scales = paddle.rand(shape=[32], dtype=paddle.float16)
            >>> out = apply_per_channel_scale(x, scales)
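            >>> # out keeps the shape of x, since the rescale is elementwise
            >>> # per channel:
            >>> print(out.shape)
            [64, 32]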
    """
    if in_dynamic_or_pir_mode():
        return _C_ops.apply_per_channel_scale(x, scales)
    else:
        type = "apply_per_channel_scale"
        helper = LayerHelper(type, **locals())
        out = helper.create_variable_for_type_inference(x.dtype)

        helper.append_op(
            type=type,
            inputs={"x": x, "scales": scales},
            outputs={"out": out},
        )
        return out