o
    0 iK                     @   s   d dl mZmZ ddlmZ erddlmZ ddlmZm	Z	m
Z
mZmZ ddlmZ e
 r1d dlZeeZdZG d	d
 d
eZdS )    )TYPE_CHECKINGOptional   )HfQuantizer   )PreTrainedModel)is_accelerate_availableis_kernels_availableis_torch_availableis_triton_availablelogging)get_module_from_nameNc                       s*  e Zd ZdZdZdZdgZ fddZdd Zd	d
 Z	d6ddZ
dddedefddZdddddeddfddZd7ddZdddee dee fdd Z	!d8ddd"eee  fd#d$Zd%ee d&edee fd'd(Zd)d* Zd+d, Zdedefd-d.Zd9d/efd0d1Zd8d2d3Zedefd4d5Z  ZS ):Mxfp4HfQuantizerz/
    FP4 quantization using fbgemm kernels
    TFZ
acceleratec                    s$   t  j|fi | || _d | _d S N)super__init__quantization_configtriton_kernels_hub)selfr   kwargs	__class__ s/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/transformers/quantizers/quantizer_mxfp4.pyr   1   s   
zMxfp4HfQuantizer.__init__c                 C   sF   | j du r zddlm} |d| _ W | j S  ty   tdw | j S )z3Lazy import and initialize kernels only when neededNr   )
get_kernelz kernels-community/triton_kernelsz2kernels package is required for MXFP4 quantization)r   Zkernelsr   ImportError)r   r   r   r   r   _lazy_import_kernels6   s   
z%Mxfp4HfQuantizer._lazy_import_kernelsc                 O   sh  t  std| jjrd S tj s)tj s)| jr%t	
d d| j_d S tdt s0tdtj r?d}tdo=t }ntj }|dk}tdoNt }| jrm|s_t	
d	 d| j_d S |slt	
d
 d| j_d S n|sstd|sytd| js|   |d}|d u rt	
d d S |d ur| jst|trd| v sd| v rtdd S d S d S d S )NzqUsing mxfp4 quantization requires torchPlease install the latest version of torch ( pip install --upgrade torch )z^Using MXFP4 quantized models requires a GPU, we will default to dequantizing the model to bf16Tz-Quantizing a model using MXFP4 requires a GPUz9Using mxfp4 requires Accelerate: `pip install accelerate`z3.5.0)      z3.4.0u   MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series) We will default to dequantizing the model to bf16.zMXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, XPU requires Triton >= 3.5.0, we will default to dequantizing the model to bf16u   MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series) zuMXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, XPU requires Triton >= 3.5.0
device_mapzYou have loaded an FP4 model on CPU and have a CUDA/XPU device available, make sure to set your model on a GPU/XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or device_map = 'xpu'. cpuZdiskzYou are attempting to load an FP4 model with a device_map that contains a CPU or disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the CPU or disk device from the device_map.)r
   r   r   
dequantizetorchcudais_availablexpupre_quantizedloggerwarning_onceRuntimeErrorr   r   r	   Zget_device_capability
ValueErrorr   get
isinstancedictvalues)r   argsr   Zgpu_is_supportedZkernels_availableZcompute_capabilityr   r   r   r   validate_environmentA   s~   


z%Mxfp4HfQuantizer.validate_environmentdtypetorch.dtypereturnc                 C   s   |d u rt j}td| |S )NzOverriding dtype=%s with `dtype=torch.bfloat16` due to requirements of `fbgemm-gpu` to enable model loading in fp4. Pass your own dtype to specify the dtype of the remaining non-linear layers or pass dtype=torch.bfloat16 to remove this warning.)r"   Zbfloat16r'   info)r   r1   r   r   r   update_dtype   s   zMxfp4HfQuantizer.update_dtypemodelr   
param_namec                 K   s   ddl m} ddlm} | jjr'd|v sd|v r't||d td  \}}nt||\}}t||s<t||rD| jjrD|dv rBdS d	S dS )
Nr   Mxfp4GptOssExpertsGptOssExpertsblocksscales_blocks)Zdown_proj_biasZgate_up_proj_biasFT)	integrationsr9   models.gpt_oss.modeling_gpt_ossr;   r   r!   r   lenr,   )r   r6   r7   r   r9   r;   moduleZtensor_namer   r   r   param_needs_quantization   s   
z)Mxfp4HfQuantizer.param_needs_quantizationparam_valueztorch.Tensortarget_deviceztorch.devicec              	   K   s.  ddl m}m}m}m}	m}
 ddlm} | js| 	 }t
||\}}t|b t||r|	||\}}|jj|jj|jj}}}|
|||\}}d|v rPdnd}t||| t|| d|||| dd t|| d	 t|| d
 W d    d S W d    d S 1 sw   Y  d S |d}|d}|d}|d}|d}d|v sd|v r| jjrt
||d td	  \}}nt
||\}}||||||d}t||st||r| jjr| jjr|d td	  }||||||fi | d S |||||| 	 fi | d S d S d S )Nr   )r9   r!   load_and_swizzle_mxfp4quantize_to_mxfp4swizzle_mxfp4r:   gate_up_proj	down_projZ_precision_config)Zrhs_data)weight_scaleZflex_ctxr>   _scalesempty_paramcasting_dtypeto_contiguousrankdevice_meshr<   r=   )rM   rN   rO   rP   rQ   r6   )r?   r9   r!   rF   rG   rH   r@   r;   r&   r   r   r"   Zdevicer,   Z
matmul_ogsPrecisionConfigFlexCtx
InFlexDatasetattrdelattrr+   r   rA   )r   r6   rD   r7   rE   r   r9   r!   rF   rG   rH   r;   r   rB   _Ztriton_weight_tensorrK   rR   rS   rT   ZprojrM   rN   rO   rP   rQ   Zshard_kwargsZdq_param_namer   r   r   create_quantized_param   sx   

"





	

z'Mxfp4HfQuantizer.create_quantized_paramc                 K   sF   | j jr	| | tj rtj  d S tj r!tj  d S d S r   )r   r!   Zremove_quantization_configr"   r#   r$   Zempty_cacher%   )r   r6   r   r   r   r   #_process_model_after_weight_loading  s   


z4Mxfp4HfQuantizer._process_model_after_weight_loadingexpected_keyscheckpoint_keysc                 C   s  g }|D ]|}| dr#|d td  }||d  ||d  q| dr@|d td  }||d  ||d  q| js{| d	rY|d td  }||d  q| d
ro|d td  }||d  q| druq|| q|| q|S )Nz.mlp.experts.gate_up_projrI   gate_up_proj_blocksZgate_up_proj_scalesz.mlp.experts.down_projrJ   down_proj_blocksZdown_proj_scalesz.mlp.experts.down_proj_blocksz .mlp.experts.gate_up_proj_blocksr=   )endswithrA   appendr&   )r   r6   rZ   r[   Znew_expected_keyskeybaser   r   r   update_expected_keys  s,   




z%Mxfp4HfQuantizer.update_expected_keysNkeep_in_fp32_modulesc                 K   sj   ddl m} | || jj|| _|dd}|r!td d| j_|j	}||| j| j|d}| j|j	_d S )Nr   )replace_with_mxfp4_linearuse_kernelsFzYou are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=FalseT)modules_to_not_convertr   config)
r?   rd   Zget_modules_to_not_convertr   rf   r+   r'   r(   r!   rg   )r   r6   rc   r   rd   re   rg   r   r   r   $_process_model_before_weight_loading(  s$   
z5Mxfp4HfQuantizer._process_model_before_weight_loadingmissing_keysprefixc                    s   ddl m} g  | D ]*\}}t||r6|D ]}||v s&|| d| v r5|ds5|ds5 | qq fdd|D S )Nr   r8   .z.weightz.biasc                    s   g | ]}| vr|qS r   r   ).0kZnot_missing_keysr   r   
<listcomp>T  s    z8Mxfp4HfQuantizer.update_missing_keys.<locals>.<listcomp>)r?   r9   named_modulesr,   r^   r_   )r   r6   ri   rj   r9   namerB   missingr   rn   r   update_missing_keysG  s   

z$Mxfp4HfQuantizer.update_missing_keysc                 C   6   d|j jv rt|dd d ur|jddddd |S )NGptOssConfigbase_model_tp_plangrouped_gemmz(layers.*.mlp.experts.gate_up_proj_blocksz(layers.*.mlp.experts.gate_up_proj_scalesz%layers.*.mlp.experts.down_proj_blocksz%layers.*.mlp.experts.down_proj_scales)r   __name__getattrrv   updater   rg   r   r   r   update_tp_planV     zMxfp4HfQuantizer.update_tp_planc                 C   rt   )Nru   base_model_ep_planrw   rx   )r   ry   rz   r   r{   r|   r   r   r   update_ep_planc  r~   zMxfp4HfQuantizer.update_ep_planc                 C   sj   | j jrd|v r|ddS d|v r|ddS |S | js3|dr(|ddS |dr3|ddS |S )Nr>    rL   rI   r\   rJ   r]   )r   r!   replacer&   r^   )r   r7   r   r   r   get_param_namep  s   

zMxfp4HfQuantizer.get_param_namesafe_serializationc                 C   s  ddl m} | }| D ]s\}}t||rt|drt|dr|jjj	|jjj
dddddd	|| d
< |jjjj	|jjjj
dd|| d< |jjj	|jjj
dddddd|| d< |jjjj	|jjjj
dd|| d< qi }||fS )Nr   r8   rI   rJ       Z      z.gate_up_proj_blocksz.gate_up_proj_scalesi@  z.down_proj_blocksz.down_proj_scales)r?   r9   
state_dictrp   r,   hasattrrI   ZstorageZlayoutZunswizzle_datadataZ	transposeZreshapeZgate_up_proj_precision_configrK   rJ   Zdown_proj_precision_config)r   r6   r   r9   r   rq   rB   metadatar   r   r   get_state_dict_and_metadata}  s<   

z,Mxfp4HfQuantizer.get_state_dict_and_metadatac                 C   s   dS )NTr   )r   r   r   r   r   is_serializable  s   z Mxfp4HfQuantizer.is_serializablec                 C   s   t d dS )NzMXFP4 quantization don't support training, please consider dequantizing the model first by passing quantization_config=Mxfp4Config(dequantize=True) to .from_pretrained()F)r'   r(   )r   r   r   r   is_trainable  s   zMxfp4HfQuantizer.is_trainable)r1   r2   r3   r2   )r6   r   r   )F)ry   
__module____qualname____doc__Z requires_parameters_quantizationZrequires_calibrationZrequired_packagesr   r   r0   r5   strboolrC   rX   rY   listrb   r   rh   rs   r}   r   r   r   r   propertyr   __classcell__r   r   r   r   r   '   sF    
O

T
 


#r   )typingr   r   ra   r   Zmodeling_utilsr   utilsr   r	   r
   r   r   Zquantizers_utilsr   r"   Z
get_loggerry   r'   r   r   r   r   r   r   <module>   s   
