o
    0 iH                     @   s
  d dl mZmZmZ e rddlZddlmZ e r ddlmZ ddlZddl	m
Z
 eeZg dZe
dd	 Zd
d Zdd ZejdddejdedejfddZG dd dejZdd Zdd Zdd Zdd Zdd  Z				!	d&d"d#Z				d'd$d%ZdS )(   )is_accelerate_availableis_torch_availablelogging    N)nn)init_empty_weights)contextmanager)g        g      ?g      ?g      ?g       @g      @g      @g      @g       g      g      g      g       g      g      g      c                 c   s    t  redd l}t| |jr| j} n
t| tr|| } t| dd }|dkrA|j|  d V  	 W d    d S 1 s<w   Y  |dkret|dre|j	|  d V  	 W d    d S 1 s`w   Y  d V  d S )Nr   typecudaxpu)
r   torch
isinstanceTensordevicestrgetattrr
   hasattrr   )devr   Zdev_type r   k/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/transformers/integrations/mxfp4.py	on_device3   s&   

  
r   c                 C   s.   |j jj}|| tjtjdd\} }| |fS )N   )Zaxis)Znumerics_detailsZmxfpdowncast_to_mxfp_torchtor   bfloat16uint8)wtriton_kernels_hubr   w_scaler   r   r   quantize_to_mxfp4J   s   
r   c           
      C   sn   |j j|j j|j j}}}|jj}|jjj}|jdd\}}	||| |d|fi |	} ||||}| |fS )zE
    Changes the layout of the tensors depending on the hardware
    r   )Zmx_axisdtype)tensorFP4convert_layoutwrap_torch_tensorZtensor_detailslayoutStridedLayoutZ"make_default_matmul_mxfp4_w_layout)
r   r   r   r#   r$   r%   r&   r'   Zvalue_layoutZvalue_layout_optsr   r   r   swizzle_mxfp4P   s   

r(   i   )r!   rows_per_chunkr!   r)   returnc                C   s  ddl }| jstj r|  } | }|tjd }| jdd |jks6J d| jdd d|jtjt	|| j
d}| j^ }}}||| }	| |	|} ||	d}tj|	|d	 || j
d}
td|	|D ]R}t|| |	}| || }||| }|d
@ tj}|d? tj}|
|| }|| |ddddd	f< || |ddddd	f< tj|||d ~~~~~qk|
jg |||d	 R  jg ||| d	 R  }
~ ~~|
dd	 S )zw
    Convert the mxfp4 weights again, dequantizing and makes them compatible with the forward
    pass of GPT_OSS.
    r   N   zblocks.shape[:-1]=z does not match scales.shape=)r!   r   r   r         )out)mathZis_cudar   r
   is_availabler   int32shaper"   
FP4_VALUESr   prodreshapeemptyrangeminlongldexpview	transpose
contiguous)blocksscalesr!   r)   r0   ZlutZprefix_shapeGBZ
rows_totalr/   Zr0r1ZblkexpZidx_loZidx_hisubr   r   r   convert_moe_packed_tensorsd   s4   44rF   c                       s2   e Zd Z fddZdejdejfddZ  ZS )Mxfp4GptOssExpertsc                    sR  t    |j| _|j| _|j| _tjtj	| jd| j | jd dtj
ddd| _tjtj	| jd| j | jd tj
ddd| _tjtj	| jd| j tjddd| _tjtj	| j| j| jd dftj
ddd| _tjtj	| j| j| jd tj
ddd| _tjtj	| j| jtjddd| _d| _t|dd	| _d | _d | _t|dd	| _d S )
Nr          r    FZrequires_gradgZd;?Zswiglu_limitg      @)super__init__Znum_local_expertsZnum_expertsintermediate_sizehidden_sizer   	Parameterr   Zzerosr   Zgate_up_proj_blocksZgate_up_proj_scalesfloat32gate_up_proj_biasZdown_proj_blocksZdown_proj_scalesdown_proj_biasalphar   limitgate_up_proj_precision_configdown_proj_precision_config)selfconfig	__class__r   r   rL      s>   
"  zMxfp4GptOssExperts.__init__hidden_statesr*   c                 C   s   t jjt jjt jj}}}t jj}t|j= ||d|d| j| j	fd}	||| j
| jtj||| jd |	d}
||
| j| jtj||| j|jd}W d    |S 1 sWw   Y  |S )Nswiglu)rS   rT   r   )gather_indxprecision_configgammasZfused_activation)scatter_indxr^   r_   )r   
matmul_ogsFnSpecsFusedActivationr\   	swiglu_fnr   r   rS   rT   gate_up_projrQ   r   r   rP   rU   	down_projrR   rV   	gate_scal)rW   r[   routing_data
gather_idxscatter_idxrb   rc   ra   rd   ZactZintermediate_cache1Zintermediate_cache3r   r   r   forward   s<   

zMxfp4GptOssExperts.forward)__name__
__module____qualname__rL   r   r   rk   __classcell__r   r   rY   r   rG      s    $rG   c                 C   s
  dd l }tjjtjjtjjtjjf\}}}}t| j t	j
 }t|jdd}d}	| jd }
| jd }|| }|| }|d | }|
| }dd }|| |\}}t	j|dd}t	j|dd\}}t	|d|}|d}t	j|||d d	|| }|dt	j}d
}t	||k ||}t	j|ddt	j}t	|t	j}t	||k ||	}t	||k||	}t	||	k|	|}|| }t	|| |	k|	|}|| | d}|| | d}||||}|}W d    n1 sw   Y  ||||||||fS )Nr   Z
LOCAL_RANK0r,   r   c                 S   sF   t j|  dddd d d |f }| }t j| |dd}|| fS )Nr   T)dimstablerq   )r   argsortr:   Ztake_along_dimint)valskZtk_indxZtk_valr   r   r   topk   s   "z routing_torch_dist.<locals>.topkrs   )Zbinsmaxi  T)rr   )Zsrc_indxZdst_indx)osr   routing
GatherIndxRoutingDataScatterIndxcompute_expt_data_torchr   r   r   distributedZget_world_sizeru   environgetr3   Zsoftmaxsortgatherr6   Zhistcr<   r   r2   wherert   )ZlogitsZn_expts_actrz   r|   r}   r~   r   Z
world_sizerankZreplace_valueZn_tokensZn_expts_totZn_local_expertsZlocal_expert_startZlocal_expert_endZn_gates_padrx   Z	expt_scalZ	expt_indxZsort_indiceshistvarZ	topk_indxZ	gate_indxrg   r]   r`   Z	expt_dataZhit_expertsr   r   r   routing_torch_dist   sN   



4r   c           
      C   s   dd l m} | r| rt| drt}ntjj}|jd }|	d| j
j}tj|| j
j| j
j}t|j ||| j
j\}}}W d    n1 sMw   Y  | ||||}	|		|d| j
j}	|	|fS )Nr   Z
_is_hookedr,   )Ztorch.distributedr   r1   Zis_initializedr   r   r   r{   r3   r6   ZrouterZ
hidden_dimr   Z
functionalZlinearweightZbiasr   r   Ztop_kZexperts)
rW   r[   distr{   Z
batch_sizeZrouter_logitsrh   ri   rj   Z
routed_outr   r   r   mlp_forward'  s   
r   c                    s(   d |  t fdd|D sdS dS )N.c                 3   s0    | ]}t | d  pt |  V  qdS )z\.N)rematch).0keyZcurrent_key_name_strr   r   	<genexpr>=  s     
z(should_convert_module.<locals>.<genexpr>TF)joinany)current_key_namepatternsr   r   r   should_convert_module;  s   
r   c              
   K   s  ddl m} |d}|d}|d}	|d}
|d}|d}d	D ]e}||v r|d ur;||||||	|
||}| d
}| d}t| |ddd | t| |rt| |rtt| |t| |}|dkrttj	
 rttj	  t| |tj|| t| | t| | q&d S )Nr   shard_and_distribute_modulemodelempty_paramcasting_dtypeto_contiguousr   device_mesh)re   rf   _blocks_scalesr   r   cpu)integrations.tensor_parallelr   r   setattrrsplitr   rF   r   r   r
   r1   Zempty_cacher   rO   r   delattr)module
param_nameparam_valuetarget_deviceZdq_param_namekwargsr   r   r   r   r   r   r   projblocks_attrscales_attrZdequantizedr   r   r   
dequantizeD  s@   











r   c              	   K   sl  |j j|j j|j j}}}ddlm}	 |d}
|d}|d}|d}|d}|d}d	|v rB|d
d dd }d|v rR|d
d dd }|durb|	|
||||||| nt| |	d
dd t
jj|dd | d}| d}t| |}t| |}|jjdkr2|jjdkr4|d}|dkr||| jd d}n
||d| jd }t|d|dkrd}|| }|| }t| t|dd|dd|\}}W d   n1 sw   Y  |dkrt
|| j| jd g|_nt
|| j| jg|_t| || t| | d|||| dd t| | t| | ~dS dS dS )zq
    This transforms the weights obtained using `convert_gpt_oss.py` to load them into `Mxfp4GptOssExperts`.
    r   r   r   r   r   r   r   r   r?   r   r,   r   r   r@   r   Nr   FrJ   metare   r	   r   r
   Z_precision_config)Zrhs_data)weight_scaleZflex_ctx)ra   PrecisionConfigFlexCtx
InFlexDatar   r   r   splitr   r   r   r   rO   r   r   r	   sizer6   rM   r   r>   r   r(   r=   SizerN   r3   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r?   r@   Zlocal_expertsZtriton_weight_tensorr   r   r   r   load_and_swizzle_mxfp4g  sf   






$









r   Fc           
   	   C   s   |d u rg }|   D ]i\}}|| t||s|d q
|jjdkrC|jsCt  t|| j	|< d}W d    n1 s>w   Y  |jjdkrX|jsXddl
m} |t||_tt| dkrnt||||||d\}	}|d q
| |fS )Nr,   ZGptOssExpertsTZ	GptOssMLPr   )
MethodType)has_been_replacedrX   )Znamed_childrenappendr   poprZ   rl   r   r   rG   Z_modulestypesr   r   rk   lenlistchildren_replace_with_mxfp4_linear)
r   modules_to_not_convertr   quantization_configr   rX   namer   r   _r   r   r   r     s4   



r   c                 C   sz   |j r| S ddlm} |da|d u rdgn|}|jd ur#||j tt|}t| ||||d\} }|s;t	
d | S )Nr   )
get_kernelz kernels-community/triton_kernelsZlm_head)rX   zYou are loading your model using mixed-precision FP4 quantization but no linear modules were found in your model. Please double check your model architecture, or submit an issue on github if you think this is a bug.)r   Zkernelsr   r   r   extendr   setr   loggerwarning)r   r   r   r   rX   r   r   r   r   r   replace_with_mxfp4_linear  s(   

r   )NNNFN)NNNN) utilsr   r   r   r   r   Z
accelerater   r   
contextlibr   Z
get_loggerrl   r   r4   r   r   r(   r   r!   ru   r   rF   ModulerG   r   r   r   r   r   r   r   r   r   r   r   <module>   sP   


6ID	#E
'