o
    )i                     @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlZd dlm	  m
Z d dlmZ d dlmZ dZedZg d	ZejZd
d ZeeeejejdgddgdZejdejdddiZdefddZdefddZejjrse d dS eeeed eeeed dS )    N)nullcontext)partial)Any)	benchmark)benchmark_main_helperg      ?cuda))i$     
  )iX  r   r	   )x  r   r	   )r
   r   i   )   i   i   )r   i    i V  c                  k   s6    |   }|  }tj| D ]
}tt||V  qd S )N)keysvalues	itertoolsproductdictzip)kwargsr   valsinstance r   p/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/xformers/benchmarks/benchmark_swiglu.pyproduct_dict)   s   r   autocast_halfTF)shapedtypebiaszb16   zf16   zf16.acr   c              	   c   s(   |dkrt jt jd}}}n||d}}}t j| d d t|d}tj| d | d |dt|}t||}|r@dnd	}	| d
| d  d| d  d| d  d|	 	}
|	 }|r`dnd}t
j| d||ttjtdddtj|
dV  t
j| d||ttjtjdddd|
dV  d S )Nr   TF   devicer      Zin_featuresZhidden_featuresr   r   nobi B=r   , I=, H= z3with torch.autocast("cuda", dtype=torch.half):
     zfn(x, *args))op)xargsfnZ	swiglu_fwstmtglobalslabeldescription	sub_labeleager)torchfloatrandnr   xswSwiGLUto	DTYPE2STRget_ordered_paramsr   Timerr   swigluOPNAMESwiGLUEagerOp)r   r   r   	inp_dtypemodel_dtypeautocastr(   module	dtype_strbstrr0   paramsZPREFIXr   r   r   benchmark_swiglu?   sD   ,rG   c              	   c   s   |dkrt jt j}}tt jjddt jd}n||}}t}t j| d d t|d}|	  t
j| d | d |dt|}t||}|rKd	nd
}	| d| d  d| d  d| d  d|	 	}
| }|  t
j|g|R dti}W d    n1 sw   Y  t |}tjd||ddtj|
dV  ~|  t
j|g|R dt
ji}W d    n1 sw   Y  tjd||ddd|
dV  d S )Nr   r   T)enabledr   r   r   r   r    r   r!   r"   r   r#   r$   r%   r'   z%out.backward(grad, retain_graph=True))outgradZ	swiglu_bwr+   r1   )r2   r3   r   amprB   Zfloat16r   r4   r   Zrequires_grad_r5   r6   r7   r8   r9   r:   r<   r=   Z
zeros_liker   r;   r>   r?   )r   r   r   r@   rA   cmr(   rC   rD   rE   r0   rF   rI   rJ   r   r   r   benchmark_swiglu_bwk   sT   
,

rM   z)This benchmark could not be done on ROCM!)min_run_time)!r   
contextlibr   	functoolsr   typingr   r2   Zxformers.ops.swiglu_opopsZ	swiglu_opr5   Ztorch.utilsr   Zxformers.benchmarks.utilsr   rN   r   ZSHAPESZSwiGLUPackedFusedOpr=   r   listZbfloat16ZhalfZCASESr8   boolrG   rM   versionZhipprintr   r   r   r   <module>   s<   
	,4