o
    )ij                     @   sX  U d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZmZmZ d dlZddlmZmZmZmZ eG dd deZeG dd	 d	eZeG d
d deZeG dd deZdeeeef fddZe Zddd eD ZeG dd deZeG dd deZdefddZ 	dlddZ!dmdej"fddZ#dmdej"fdd Z$dmdej"fd!d"Z%dmdej"fd#d$Z&dmdej"fd%d&Z'dej"fd'd(Z(e j)d)d* Z*d+d, Z+i ej,j-j.e+ej,j-j/e+ej,j-j0e(ej,j-j1e!ej,j-j2e!ej,j-j3e!ej,j-j4ee!d-d.ej,j-j5e!ej,j-j6ee!d/d.ej,j-j7ee!d-d.ej,j-j8ee!d/d.ej,j-j9e#ej,j-j:e#ej,j-j;e&ej,j-j<e'ej,j-j=e%Z>ej,j-j.e+ej,j-j/e+ej,j-j0e(ej,j-j;e&ej,j-j<e'ej,j-j9e#ej,j-j:e#ej,j-j?e$ej,j-j=e%i	Z@G d0d1 d1ej"ZAG d2d3 d3eAZBi ZCe	eeeeeDejEef ef eFd4< ejGHd5d6d7kZId8e
e d9ej"d:ej"d;eej" d<edefd=d>ZJejKjLd?dd@gdAd8e
e d9ej"d:ej"d;eej" d<edej"fdBdCZMejKNd?d8e
e d9ej"d:ej"d;eej" d<edej"fdDdEZOG dFdG dGeAZPejQReP ejQReB dHZSdIZTdJZUdKZVdLZWdMZXdNej"dOeDdPeDdeAfdQdRZYG dSdT dTejZj[Z\G dUdV dVejZj[Z]G dWdX dXejZj[Z^edYedZef d[Z_d\e_de_fd]d^ZReRd_eSeVfdNej"dOeDd`eDdPeDdeAf
dadbZ`eRd_eVdcdcfdNej"dOeDdPeDddeadeeadeAfdfdgZbeReSd_dfdNej"dhej"d`eDdPeDdiee deAfdjdkZcdS )n    N)partial)AnyCallablecastDictListOptionalTupleTypeVar   )BaseOperatorget_operatorget_xformers_operatorregister_operatorc                   @      e Zd ZedZdZdZdS )SparsifyBothWaysZsparse24_sparsify_both_wayssp24N__name__
__module____qualname__r   OPERATOROPERATOR_CATEGORYNAME r   r   ]/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/xformers/ops/sp24.pyr          r   c                   @   r   )SparsifyApplyZsparse24_applyr   Nr   r   r   r   r   r      r   r   c                   @   r   )SparsifyApplyDenseOutputZsparse24_apply_dense_outputr   Nr   r   r   r   r   r      r   r   c                   @   r   )Sp24GemmZ_sparse24_gemmr   Nr   r   r   r   r   r   &   r   r   returnc                  C   sF   t jj sdS t jj } | du rdS | d d | d d | d fS )zJ
    Returns the version of the cusparselt.so library used by pytorch
    )r   r   r   Ni'  d   )torchbackends
cusparseltis_availableversion)r&   r   r   r   _get_cusparselt_torch_version-   s   r'   .c                 c   s    | ]}t |V  qd S N)str).0vr   r   r   	<genexpr>:   s    r-   c                   @   $   e Zd ZeddZdZde ZdS )Sp24GemmCuspltSearchatenZ_cslt_sparse_mm_searchr   z_cslt_sparse_mm_search@Nr   r   r   r   r   r   _cusplt_version_strr   r   r   r   r   r/   =       
r/   c                   @   r.   )Sp24GemmCuspltr0   Z_cslt_sparse_mmr   z_cslt_sparse_mm@Nr1   r   r   r   r   r4   D   r3   r4   c                  C   sB   t dk} | sdS d}tj rtjd}t dk r|dkrdS | S )N)r      r   F)r   r   cuda)   r   r   )	   r   )_cusplt_versionr"   r6   r%   Zget_device_capability)	availableZcompute_capabilityr   r   r   _has_cusparseLtK   s   
r;   r   c           	      C   s8  d }|D ]	}t |tr|}q|d usJ g }t|D ]]\}}t |tjrrt |tsG||v r3t||}ntd| j d| j d| dt	| |j
d u se|j
d u se|j
 |j
 kse|j
 |j
 krrtd| j d| j d|| qt |tsJ dt|j| dd |D  |j| d	d |D  |j|j
S )
Nz
Operation r(   zL on Sparse24Tensor requires all operands to be Sparse24Tensors, but operand z is a z] on Sparse24Tensor requires all operands to be Sparse24Tensors with the same sparsity patternz$Only implemented for CUTLASS tensorsc                 S       g | ]}t |tr|jn|qS r   )
isinstanceSparse24Tensorpackedr+   xr   r   r   
<listcomp>~   s     z)sparse24_pointwise_op.<locals>.<listcomp>c                 S   r<   r   )r=   r>   packed_tr@   r   r   r   rB      s    )r=   r>   	enumerater"   Tensorsparsify24_like
ValueErrorr   r   typethreads_masksZdata_ptrstrideappendSparse24TensorCutlassshapemetameta_t)	functypesargskwargsallow_sparsify_args_listselftensorZargs_updatedir   r   r   sparse24_pointwise_opZ   s\   



rX   c                 C   st   t |dksJ |\}}|jdks|jdkrtdt|tr$||S | }t|ts/J |j| dd S )N   8`Sparse24Tensor` matmul: Broadcasting is not implementedT)prefer_col_major_output)lenndimNotImplementedErrorr=   r>   _mmt)rP   rQ   rR   rS   ABB_tr   r   r   sparse24_mm   s   

rd   c                 C   s   t |dksJ |\}}}|jdks|jdkrtd|jdkr(td|j t|tr1td| }t|ts<J |j| |dd S )	N   rY   rZ   r   z:`Sparse24Tensor` matmul: only bias dim=1 supported. Shape=z@`Sparse24Tensor` matmul: only operand B of `addmm` can be sparseTbiasr[   )r\   r]   r^   rM   r=   r>   r`   r_   )rP   rQ   rR   rS   rg   ra   rb   rc   r   r   r   sparse24_addmm   s"   



rh   c                 C   sd   t |dv sJ |d d \}}t |dkr|d nd }|d u r&||  S td d ||| gdS )N)rY   re   rY   re   )rP   rQ   rR   )r\   r`   rh   )rP   rQ   rR   rS   ra   rb   rg   r   r   r   sparse24_linear   s   ri   c              
   C   sp   t |dksJ |d }t|tsJ t |jdksJ |j|jd |jd f|j|j|j|j|j	
dddS )Nr   r   rY   r?   rN   rC   rO   rI   )r\   r=   r>   rM   	__class__rC   rO   r?   rN   rI   Z	transposerP   rQ   rR   rS   rU   r   r   r   
sparse24_t   s   rn   c                 C   s:   t |dksJ |\}}t||jkrtd| d|S )NrY   zO`view` is not implemented for Sparse24Tensor, except for the dummy case (shape=))r\   tuplerM   r^   )rP   rQ   rR   rS   rU   rM   r   r   r   sparse24_view   s   
rq   c              	   C   s<   t |dksJ |d }|j|j|j|j|j|j|jddS )Nr   r   F)rM   r?   rN   rC   rO   rI   requires_grad)r\   rl   rM   r?   rN   rC   rO   rI   rm   r   r   r   sparse24_detach   s   rs   c                  c   s     t j } zd V  W ~ d S ~ w r)   )r"   _CZ_DisableTorchDispatch)guardr   r   r   no_dispatch   s
   

rv   c                 C   s2   t   | | W  d    S 1 sw   Y  d S r)   )rv   )rP   rQ   rR   rS   r   r   r   fallback_dispatcher   s   $rw   )r   r   )rT   )r   c                   @   s   e Zd ZU ejed< ejed< ejed< ejed< ejed< g dZedddejdejdejdejdejf
d	d
Zdd Z	dejfddZ
ddddejdedeej dejfddZejjZdd Zedd ZdS )r>   r?   rN   rC   rO   rI   rk   Frr   c          	      C   sL   t |tjsJ tjj| ||j|j|d}||_||_||_||_	||_
|S )N)devicedtyperr   )r=   r"   rE   Z_make_wrapper_subclassry   rz   r?   rN   rC   rO   rI   )	clsrM   r?   rN   rC   rO   rI   rr   rV   r   r   r   __new__+  s   zSparse24Tensor.__new__c                 C   s   | j j d| j dS )Nz(shape=ro   )rl   r   rM   rU   r   r   r   __repr__F  s   zSparse24Tensor.__repr__r    c                 C   s*   t j| jd | jd | j| jd}| | S )Nr   )ry   rz   )r"   eyerM   ry   rz   )rU   er   r   r   _sp24_to_denseI  s   zSparse24Tensor._sp24_to_denseNr[   rg   rb   r[   rg   c                C   s   t  r)   )r^   )rU   rb   r[   rg   r   r   r   r_   Q  s   zSparse24Tensor._mmc                 C   s   | j | j| jffS r)   )	__slots__rM   rr   r}   r   r   r   __tensor_flatten__\     z!Sparse24Tensor.__tensor_flatten__c                 C   s    |\}}| |fi |d|iS )Nrr   r   )r{   Zinner_tensorsZflatten_specZ
outer_sizeZouter_striderM   rr   r   r   r   __tensor_unflatten___  s   z#Sparse24Tensor.__tensor_unflatten__)r   r   r   r"   rE   __annotations__r   staticmethodr|   r~   r   boolr   r_   rt   Z_disabled_torch_function_implZ__torch_function__r   classmethodr   r   r   r   r   r>   !  sJ   
 






	r>   c                	   @   sF   e Zd Zddddejdeej dedejfdd	ZedddZ	dS )rL   NFrf   rb   rg   r[   r    c                C   s   t |tr	td|d urtdt dt d| jdks"|jdkr,td| jj d| j	d |j	d	 krXtd| jj d
| j	d	  d| j	d  d|j	d	  d|j	d  dt
| j|| jd | j	d	  S )NB`Sparse24Tensor @ Sparse24Tensor` is not supported by the hardwarez`Sparse24Tensor` with backend='zF' does not support matmul with bias. Remove the bias, or use backend=''rY   `)` matmul: Broadcasting is not implementedr   r   ` matmul: invalid shapes     (, ) @ (ro   )r=   r>   rG   r^   BACKEND_CUTLASSBACKEND_CUSPARSELTr]   rl   r   rM   r   r   r?   rN   )rU   rb   rg   r[   r   r   r   r_   l  s6   
 zSparse24TensorCutlass._mmr   c                 C   6   |j tvrt| j d|j dt|j  ||||S NzI only supports a specific set of operations, can't perform requested op (ro   )_overloadpacketSPARSE24_DISPATCH_CUTLASSr^   r   r{   rP   rQ   rR   rS   r   r   r   __torch_dispatch__     
z(Sparse24TensorCutlass.__torch_dispatch__r   N)
r   r   r   r"   rE   r   r   r_   r   r   r   r   r   r   rL   k  s    
rL   _CUSPLT_ALG_CACHEZXFORMERS_CUSPARSELT_TUNE01rM   r?   rb   rg   transpose_resultc              
   C   s.  t sdS | \}}|jd }d}||ddkrdnd7 }||r!dnd7 }|||||j|duf}	|	tv r7t|	 S d}
g }tdD ]G}d	}t|
D ](}ztj|||||d
 W n tya   d}Y  nw |dkrot	j
  t }qG|rt nt	j
  t | }|||f q?|  |d d t|	< t|	 S )a  
    cuSPARSELt has multiple algorithms (that correspond to different kernels)
    to run a given GEMM, because the optimal kernel depends on the GEMM dimensions.
    This function attempts to find the most efficient one by benchmarking all
    of them.
    NOTE: cuSPARSELt also provides a function to search the best algorithm
    (exposed via `aten:_cslt_sparse_mm_search`) but it often fails to find the best
    algorithm, so we need this workaround.
    r   r   rrj   cN
   F   Frg   r   alg_idT)_CUSPLT_TUNErM   rJ   rz   r   ranger4   r   RuntimeErrorr"   r6   Zsynchronizetime	monotonicrK   sort)rM   r?   rb   rg   r   MKNfmthREPEATZ	TIME_ALGOalgoZ	has_errorrW   r`   dtr   r   r   _cusplt_find_alg  sF   




r   zxformers::_cusplt_mmr6   )Zmutates_argsZdevice_typesc                 C   s&   t | ||||d}tj|||||dS )z
    This operator wraps find_algo + gemm. This is because we don't want find_algo
    to be visible by torch compile, otherwise it will remove it from the graph.
    rg   r   r   )r   r4   r   )rM   r?   rb   rg   r   r   r   r   r   
_cusplt_mm  s   

r   c                 C   sF   | \}}|j d }|rtj||g|j|jdS tj||g|j|jdS )Nr   )rz   ry   )rM   r"   emptyrz   ry   )rM   r?   rb   rg   r   r   r   r   r   r   r   _cusplt_mm_meta  s
   
r   c                	   @   sF   e Zd Zddddejdedeej dejfdd	ZedddZ	dS )Sparse24TensorCuSparseLtFNr   rb   r[   rg   r    c                C   s  t |tr	td| jdks|jdkrtd| jj d| jd |jd krItd| jj d| jd  d| jd  d	|jd  d|jd  d
|jd d dkrhtd| jj dt| j dt|j d|j	| j	krtd| jj dt| j dt|j d| j	 d|j	 d|d ur|j	| j	krtd| jj dt| j dt|j d| j	 d|j	 dt
 sJ tjjj| j| j|||d}|r| }|d | jd  S )Nr   rY   r   r   r   r   r   r   r   ro      z` matmul: trying to do `A=z @ B=zD`. The dense matrix B should have the second dimension aligned to 8.z`, with A.dtype=z and B.dtype=zH. This operation is only supported when A and B have the same data type.z + C`, with A.dtype=B.dtype=z and C.dtype=zK. This operation is only supported when A, B and C have the same data type.r   )r=   r>   rG   r]   r^   rl   r   rM   rp   rz   r;   r"   opsZxformersr   r?   r`   )rU   rb   r[   rg   outr   r   r   r_     sf   
&$$
zSparse24TensorCuSparseLt._mmr   c                 C   r   r   )r   SPARSE24_DISPATCH_CUSPARSELTr^   r   r   r   r   r   r   #  r   z+Sparse24TensorCuSparseLt.__torch_dispatch__r   )
r   r   r   r"   rE   r   r   r_   r   r   r   r   r   r   r     s    
1r   Z24sparseZ24denseZsteZcutlassr$   ZdenserA   r   backendc          	   	   C   s|   |t tfv sJ d| t| tr| jd u rtd| S tj| ||d\}}}}}|t kr0tnt	}|| j
|||||ddS )NInvalid backend: z'Input to `sparsify24` is already sparse)	algorithmr   F)r?   rN   rC   rO   rI   rr   )r   r   r=   r>   rI   rG   r   r   rL   r   rM   )	rA   r   r   r?   rN   rC   rO   rI   r{   r   r   r   _sparsify24_forward;  s4   

r   c                	   @   s@   e Zd ZedejdededefddZedejfdd	Zd
S )_Sparsify24FuncrA   r   gradientr   c              	   C   sf   |t ttfvrtd| dt  dt dt t|||d}|j| _|j| _|j| _|j| _|| _	|S )NzInvalid gradient type: 'z'. Expected 'z' or 'r   r   )
GRADIENT_SP24GRADIENT_DENSEGRADIENT_STErG   r   rI   rN   rO   rz   r   )ctxrA   r   r   r   r   r   r   r   forwardY  s"   z_Sparsify24Func.forwardgrad_outc              	   C   s   t |ts
| jtkr|d d d fS t |trJ |j| jksJ | jtkr@t|| j\}}}}t	|j
|| j|| j| j|jd}n| jtkrT| j sLJ t|| j}nJ d| j |d d d fS )Nrx   FzUnsupported gradient type: )r=   r>   r   r   rz   r   r   r   rI   rL   rM   rN   rO   rr   r   is_contiguousr   )r   r   r?   _rC   grad_inr   r   r   backwardh  s0   

	z_Sparsify24Func.backwardN)	r   r   r   r   r"   rE   r*   r   r   r   r   r   r   r   X  s
    r   c                   @   sD   e Zd Zedejdedededef
ddZedejfd	d
Z	dS )_Sparsify24STEFuncrA   r   r   bw_mul0bw_mul1c                 C   s&   t |||d}|j| _|| _|| _|S )Nr   )r   rI   r   r   )r   rA   r   r   r   r   r   r   r   r   r     s
   	z_Sparsify24STEFunc.forwardr   c                 C   sN   t |trJ | jdkr| jdkr|}ntj|| j| j| jd}|d d d d fS )N      ?)Zmul0Zmul1)r=   r>   r   r   r   r   rI   )r   r   r   r   r   r   r     s   z_Sparsify24STEFunc.backwardN)
r   r   r   r   r"   rE   r*   floatr   r   r   r   r   r   r     s    r   c                	   @   s@   e Zd ZedejdededefddZedejfdd	Z	d
S )_Sparsify24LikeFuncrA   patternr   r   c           	   	   C   s  t |ts	td|j std|tttfvr!td| d|j| _|j	| _	|j
| _
|j| _|| _|tkrF| j s?J t|| jS tj|| j|d\}}}}|tkrgt|j|| j	|| j
| j|jdS |tksrJ d| ||j	 ||j
 t|j||||| j|jdS )Nz4`sparsify24_like`: `pattern` must be a sparse tensorzA`sparsify24_like` is not implemented when `pattern` is transposedz*`sparsify24_like`: invalid gradient type ""r   rx   r   )r=   r>   r^   rI   r   r   r   r   rG   rN   rO   rz   r   BACKEND_DENSEr   r   r   r   rL   rM   rr   r   Zcopy_r   )	r   rA   r   r   r   r?   rN   rC   rO   r   r   r   r     sV   

	z_Sparsify24LikeFunc.forwardr   c              	   C   s   | j tks
t|tr|d d d fS t|trJ |j| jksJ | j tkr6| j s+J t	|| jd d d fS | j t
ks=J tj	|| jtd\}}}}t|j|| j|| j| j|jdd d d fS )Nr   rx   )r   r   r=   r>   rz   r   rI   r   r   r   r   r   r   rL   rM   rN   rO   rr   )r   r   r?   r   rC   r   r   r   r     s8   
	z_Sparsify24LikeFunc.backwardN)
r   r   r   r   r"   rE   r>   r*   r   r   r   r   r   r   r     s
    -r   F.)boundrP   c                 C   s   t ttj| S r)   )r   r   r"   _dynamoallow_in_graph)rP   r   r   r   r     r   r    r   c                 C   s   t | |||S r)   )r   apply)rA   r   r   r   r   r   r   
sparsify24  s   r   r   r   r   c                 C   s   t | ||||S )a  
    2:4 sparsification, with Straight Through Estimator for the
    backward pass (eg the gradient is *not* sparsified).
    Optionally, `bw_mul[0-1]` provide the option to rescale the gradient
    differently for pruned (`bw_mul0`) and kept values (`bw_mul1`).
    )r   r   )rA   r   r   r   r   r   r   r   sparsify24_ste  s   r   r   	out_densec                 C   s|   |d ur|rt }|dkrt|trtnt}t|ts#tdt| |j	 s6t
|  | || S t
| |||S )Nr   z/`pattern` must be a `Sparse24Tensor` but got a )r   r=   r   r   r   r>   rG   rH   rI   r   r   r   r`   )rA   r   r   r   r   r   r   r   rF      s   

rF   )r   Nr   r   )d
contextlibosr   	functoolsr   typingr   r   r   r   r   r   r	   r
   r"   commonr   r   r   r   r   r   r   r   intr'   r9   joinr2   r/   r4   r   r;   rX   rE   rd   rh   ri   rn   rq   rs   contextmanagerrv   rw   r   r0   Zis_same_sizeZdetach_detachZreluZgeluZsilumuladdZgelu_backwardZsilu_backwardZthreshold_backwardmmmatmulr`   viewZlinearr   Zaddmmr   r>   rL   r   r*   rz   r   environgetr   r   ZlibraryZ	custom_opr   Zregister_faker   r   r   r   r   r   r   r   r   r   r   ZautogradFunctionr   r   r   r   r   r   r   rF   r   r   r   r   <module>   s  
(
2

















 
$







J&(
4
>."V	