o
    )iU                  
   @   s  d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZmZmZmZmZ d dlZddlmZ ddlmZ d	d
lmZmZmZmZmZmZmZmZ dedefddZdeeej ef  deej gej f deeej ef  fddZ!G dd dej Z"dej deej e#f de"fddZ$eG dd dZ%eG dd dZ&eG dd dZ'G dd  d eZ(G d!d" d"e(Z)G d#d$ d$e(Z*eeee)  eee*  f Z+d%e,dej fd&d'Z-d(e	e. d)e.dej d*e,ddf
d+d,Z/dS )-    N)	dataclass)partial)
AnyCallableIterableListMappingOptionalSetTupleTypeUnion   )_built_with_cuda   )BaseOperator   )AttentionBiasAttentionBiasSubTensorBlockDiagonalGappyKeysMaskBlockDiagonalMaskBlockDiagonalPaddedKeysMaskLowerTriangularMaskPagedBlockDiagonalGappyKeysMask PagedBlockDiagonalPaddedKeysMaskattn_bias_typereturnc                 C   s$   t d | rdS | ttjfv rdS dS )NTF)
isinstancer   torchTensor)r    r    d/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/xformers/ops/fmha/common.py_is_bias_type_supported_in_BMK&   s
   
r"   	attn_biasopc                 C   s"   t | tjr| jdkr|| S | S )Nr   )r   r   r   ndim)r#   r$   r    r    r!   _attn_bias_apply/   s   r&   c                   @   s   e Zd Zg dZejjZ	ddejdejde	ejejgejf dej
dedd fd	d
ZdejfddZdeejejf fddZdd ZdS )ScaledTensor)scaledequant_funcoriginal_dtypeFdatar(   r)   r*   require_gradr   c                 C   s(   t jj| ||d}||_||_||_|S )a  
        Creates a new ScaledTensor subclass instance.

        Parameters:
        - data: The underlying quantized tensor (e.g., int8, int4).
        - scale: The scale tensor or scalar to be used for dequantization.
        - dequant_func: A callable that applies dequantization, which takes both the data and scale as input.
        - original_dtype: The data type before quantization (e.g., float32, float16).
        - require_grad: Whether or not to track gradients (default: False for inference use).
        )r,   )r   r   Z_make_subclassr(   r)   r*   )clsr+   r(   r)   r*   r,   instancer    r    r!   __new__>   s
   zScaledTensor.__new__c                 C   s(   t |  }| || j}|| jS )z
        Applies the custom dequantization function provided at the tensor's creation.
        After dequantization, the data is cast back to its original data type.
        )r   r   floatr)   r(   tor*   )selfr+   Zdequantized_datar    r    r!   
dequantize]   s   zScaledTensor.dequantizec                 C   s   | j | jfS )z
        Unpacks the ScaledTensor by returning its data and scale as a tuple.
        Returns:
        - A tuple of (data, scale), both of which are torch.Tensor objects.
        )r+   r(   r2   r    r    r!   unpackk   s   zScaledTensor.unpackc                 C   s   d| j  d| j d| j dS )z@
        Custom string representation for ScaledTensor.
        zScaledTensor(data=z, scale=z, original_dtype=))r+   r(   r*   r4   r    r    r!   __repr__s   s   zScaledTensor.__repr__NF)__name__
__module____qualname__	__slots__r   Z_CZ_disabled_torch_function_implZ__torch_function__r   r   dtypeboolr/   r3   r   r5   r7   r    r    r    r!   r'   8   s(    
r'   xr(   c                 C   s4   t |trtj|g| jd}dd }t| |||dS )z;
    Pack a tensor into a tensorwise fp8 ScaledTensor.
    )devicec                 S   s   | |d d d d d d f  S Nr    )r?   r(   r    r    r!   r)      s   z2pack_fp8_tensorwise_per_head.<locals>.dequant_func)r+   r(   r)   r*   )r   r0   r   tensorr@   r'   )r?   r(   r*   r)   r    r    r!   pack_fp8_tensorwise_per_headz   s   
rC   c                   @   s
  e Zd ZU dZejed< ejed< ejed< dZee	eje
f  ed< dZeed< dZee ed	< dZeej ed
< dZeed< edejfddZedefddZdeejejejf fddZdeedf fddZdddZdejfddZedefddZdS )InputszE
    Stores inputs to the `memory_efficient_attention` operators
    querykeyvalueNr#           pr(   output_dtypeF
is_partialr   c                 C   s   | j jS rA   )rE   r@   r4   r    r    r!   r@      s   zInputs.devicec                 C   s    | j d u r| jjd d S | j S )Ng      )r(   rE   shaper4   r    r    r!   scale_float   s    zInputs.scale_floatc                 C   s   | j jdkr| j | j| jfS | j jdkr%| j d| jd| jdfS | jjdkrQ| j d d d d d d f | jd d d d d d f | jd d d d d d f fS J )N      r   r   )rE   r%   rF   rG   	unsqueezer4   r    r    r!   get_qkv_in_bmghk   s   


zInputs.get_qkv_in_bmghk.c                 C   s   | j jdvrtd| j j d| jjtjkrt| j j}n| j jd d | jjd f }| j jdkrT| j 	d| _ | j
	d| _
| j	d| _t| jttj	dd| _|S )	Nr   rP   rO   zInvalid shape for query: z|. Expected shape [batch, seqlen, head_groups, num_heads_per_group, K], [batch, seqlen, num_heads, K], or [batch, seqlen, K].rL   r   r   r   )dim)rE   r%   
ValueErrorrM   rG   r=   r   int32tuplerQ   rF   r&   r#   r   )r2   Zoutput_shaper    r    r!   normalize_bmhk   s   zInputs.normalize_bmhkc                    s   j  j jf} j jdvst fdd|D r,td j j d jj d jj t fdd|D r;tdt jt	t
tttfr_ jjjj}| j jkr_td	 j j d
| d jj jj  komtjkn  }t fdd|D }|s|std j j d jj d jj  j jdkrtt jstdt jj dd }t jtr jjr jj}n
t jtjrƈ j} j jdkr|d ur j jd  j jd  j jd  jjd f}|j|krtd|j d| d j j d jj d jj 
t jt	r,tdd |D r,td j j d jj d jj  jdk s8 jdkr@td j  j jd d \}} j jd }	 jjd d \}}
 jjd } jjtjk}|rl|n|	}d } j jdkr j j|||	fko jj||
|	fko jj||
|fk} j jd! } j jdkr j j||||	fko jj||
||fko jj||
||fk} j jd } j jd"kr j j|||||	fko jj||
|||fko jj||
|||fk}|std# j j d jj d jj d$d S )%NrS   c                 3       | ]
}|j  jj kV  qd S rA   )r%   rE   .0r?   r4   r    r!   	<genexpr>   s    
z)Inputs.validate_inputs.<locals>.<genexpr>zIQuery/Key/Value should all have BMGHK, BMHK or BMK shape.
  query.shape: z
  key.shape  : z
  value.shape: c                 3   rY   rA   )r@   rE   rZ   r4   r    r!   r\          z0Query/Key/Value should all be on the same devicezPAttention bias and Query/Key/Value should be on the same device
  query.device: z
  attn_bias   : 
c                 3   s    | ]
}|j  jj kV  qd S rA   )r=   rE   rZ   r4   r    r!   r\      r]   zQuery/Key/Value should either all have the same dtype, or (in the quantized case) Key/Value should have dtype torch.int32
  query.dtype: z
  key.dtype  : z
  value.dtype: r   zKPlease provide inputs in BMHK format rather than BMK when using bias type ``rP   r   r   r   z"Invalid shape for attention bias: z (expected z)
  query.shape: c                 s   s    | ]
}|j d  dkV  qdS )r   r   N)rM   rZ   r    r    r!   r\     r]   zDExpected batch_size=1 when using block-diagonal bias
  query.shape: rH   g      ?zInvalid dropout probability: p=rL   TrO   z9Incompatible shapes for attention inputs:
  query.shape: z}
HINT: We don't support broadcasting, please use `expand` yourself before calling `memory_efficient_attention` if you need to)rE   rF   rG   r%   anyrU   rM   r   r#   r   r   r   r   r   Z	q_seqinfoZseqstartr@   r=   r   rV   allr"   typer9   r   ZHOLDS_DENSE_TENSORZ
_subtensorr   rI   )r2   ZqkvZbias_deviceZquantized_dtypesZnon_quantized_dtypesZattn_bias_tZexpected_shapeBMqKMkvKvZquantized_kv_cacheZkey_embed_dimZvalid_shapesHGr    r4   r!   validate_inputs   s   
"





zInputs.validate_inputsc                 C   s2   | j d u r| jr| jjtjurtjS | jjS | j S rA   )rJ   rK   rE   r=   r   Zfloat64Zfloat32r4   r    r    r!   get_output_dtypeF  s
   
zInputs.get_output_dtypec                 C   s   t dd | j| j| jfD S )zP
        Number of bytes in the input, not counting the attention bias.
        c                 s   s    | ]	}|   V  qd S rA   )Zuntyped_storagenbytesrZ   r    r    r!   r\   R  s    
z Inputs.nbytes.<locals>.<genexpr>)sumrE   rF   rG   r4   r    r    r!   rm   M  s   zInputs.nbytes)r   N)r9   r:   r;   __doc__r   r   __annotations__r#   r	   r   r   rI   r0   r(   rJ   r=   rK   r>   propertyr@   rN   r   rR   intrX   rk   rl   rm   r    r    r    r!   rD      s(   
 



xrD   c                   @   sl   e Zd ZU ejed< ejed< dZeed  ed< dZ	ee
 ed< dZeed< dd	ed
edejfddZdS )ContextlseoutNAttentionBwOpBaseop_bw	rng_stateFqkv_share_storagepad_toforce_pad_infr   c                 C   s   || j jd |  | }| j }|dkr@|r1|d d d d d | jjd f }||jd |  | }tjjj|d|gtjd}|S |rb| jjd |jd krb|d d d d | jjd d f 	tj |S )Nr   r   r   )rG   )
rt   rM   ru   r   nnZ
functionalpadmathinfZfill_)r2   rz   r{   Z
pad_amountrt   r    r    r!   get_padded_lsea  s   "*zContext.get_padded_lser8   )r9   r:   r;   r   r   rp   rw   r	   r   rx   r   ry   r>   rr   r   r    r    r    r!   rs   W  s   
 

rs   c                   @   s>   e Zd ZU ejed< ejed< ejed< dZeej ed< dS )	GradientsZdqZdkZdvNdb)r9   r:   r;   r   r   rp   r   r	   r    r    r    r!   r   n  s
   
 


r   c                   @   sX  e Zd ZU dZeed< ee ed< dZe	e
e
f ed< eej ed< eed< dZe
ed	< ed
fZee ed< eed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< eed< dZdZeed< ddgZee
 ed< ddgZee
 ed< ededefd d!Z ed"e
d#e
d$e
d%e
dee f
d&d'Z!ededee fd(d)Z"d
S )*AttentionOpBaseaI  Base class for any attention operator in xFormers

    See:

    - :attr:`xformers.ops.fmha.cutlass.FwOp`
    - :attr:`xformers.ops.fmha.cutlass.BwOp`
    - :attr:`xformers.ops.fmha.flash.FwOp`
    - :attr:`xformers.ops.fmha.flash.BwOp`
    - :attr:`xformers.ops.fmha.triton.FwOp`
    - :attr:`xformers.ops.fmha.triton.BwOp`
    ZOPERATORSUPPORTED_DEVICES)rO   r   CUDA_MINIMUM_COMPUTE_CAPABILITYSUPPORTED_DTYPESSUPPORTED_MAX_Kr   SUPPORTED_MIN_KNSUPPORTED_ATTN_BIAS_TYPESSUPPORTS_DROPOUTFSUPPORTS_CUSTOM_SCALESUPPORTS_DIFFERENT_VALUE_EMBEDSUPPORTS_OUTPUT_DTYPESUPPORTS_PARTIALTIS_DETERMINISTICSUPPORTS_BMGHKNAMEZmemory_efficient_attentionVARLEN_LSE_PACKEDr   i,  _TEST_BATCH_SIZES       _TEST_Kdr   c                 C   s   |  | S rA   )not_supported_reasons)r-   r   r    r    r!   supports  s   zAttentionOpBase.supportsre   rg   rf   rh   c                 C   sd   g }| j s||kr|d t||| jkr|d| j  t||| jk r0|d| j  |S )Nz"query.shape[-1] != value.shape[-1]z(max(query.shape[-1], value.shape[-1]) > z(min(query.shape[-1], value.shape[-1]) < )r   appendmaxr   minr   )r-   re   rg   rf   rh   reasonsr    r    r!   shape_not_supported_reasons  s   


z+AttentionOpBase.shape_not_supported_reasonsc                 C   s"  |j j}| j|d |jjd |d |jjtjkr|d n|jjd d}|j jj	}|j j}|| j
vr?|d| d| j
 d |dkrPtsPtjjdu rP|d	 |dkrstjjdu rstj|j}|| jk rs|d
| j d| d || jvr|d| d| j d t	|j| jvr|dt	|j  | js|jdur|j|ur|d |jr| js|d |jdkr| js|d |jdur| js|d |tju r|drtj|j jd dk r|d |  s|d | j st! r|d | j"s|j j#dkr|d |S )z
        Returns a list of reasons why this is not supported.
        The kernel can run these inputs only if the returned list is empty
        r   rL   )re   rg   rf   rh   zdevice=z (supported: r6   cudaNz'xFormers wasn't build with CUDA supportz"requires device with capability > z but your GPU has capability z
 (too old)zdtype=zattn_bias type is z!Custom output dtype not supportedzPartial attention not supportedrH   zdropout > 0.0zhas custom scaler      z$bf16 is only supported on A100+ GPUszCoperator wasn't built - see `python -m xformers.info` for more infozNoperator is non-deterministic, but `torch.use_deterministic_algorithms` is setrO   z&operator does not support BMGHK format)$rE   rM   r   rF   rG   r=   r   rV   r@   rc   r   r   r   versionZhipr   Zget_device_capabilityr   r   r#   r   r   rJ   rK   r   rI   r   r(   r   bfloat16
startswithZis_availabler   Z$are_deterministic_algorithms_enabledr   r%   )r-   r   Zquery_shaper   Zdevice_typer=   Zdevice_capabilityr    r    r!   r     sj   
 












z%AttentionOpBase.not_supported_reasons)#r9   r:   r;   ro   r   rp   r
   strr   r   rr   r   r=   r0   r   rc   r   r   r>   r   r   r   r   r   r   ZOPERATOR_CATEGORYr   r   r   r   classmethodrD   r   r   r   r    r    r    r!   r   w  sJ   
 r   c                
   @   s   e Zd ZU ejdejdejdiZeej	ef e
d< ejdejdejdiZeej	ef e
d< ed	ed
edeejee f fddZdS )AttentionFwOpBasega2U0*3?gMbp?{Gz?
ERROR_ATOLgh㈵>g-C6:?g{Gzt?
ERROR_RTOLinpneeds_gradientr   c                 C      t  rA   NotImplementedError)r-   r   r   r    r    r!   apply  s   zAttentionFwOpBase.applyN)r9   r:   r;   r   r0   halfr   r   r   r=   rp   r   r   rD   r>   r   r   r	   rs   r   r    r    r    r!   r     s"   
 r   c                	       s   e Zd ZU ejdejdejdiZeej	ef e
d< ejdejdejdiZeej	ef e
d< d	Zd
Zededee f fddZedededejdefddZ  ZS )rv   gH}M?g?g?r   g-C6?r   g?r   FTr   r   c                    s:   t t| |}t|jtjr|jjr| js|	d |S )NzMComputing the bias gradient is not supported (attn_bias.requires_grad = True))
superrv   r   r   r#   r   r   Zrequires_gradSUPPORTS_ATTN_BIAS_GRADr   )r-   r   r   	__class__r    r!   r     s   z'AttentionBwOpBase.not_supported_reasonsctxr   gradc                 C   r   rA   r   )r-   r   r   r   r    r    r!   r   &  s   zAttentionBwOpBase.apply)r9   r:   r;   r   r0   r   r   r   r   r=   rp   r   r   r   r   rD   r   r   r   rs   r   r   r   __classcell__r    r    r   r!   rv     s   
 &rv   	num_headsc                 C   s<   | j dkr| S | | jd | || jd | jd gdS )NrP   r   r   r   )r   r   r   r   )r%   ZreshaperM   Zpermute)rB   r   r    r    r!   bmk2bmhk0  s   
 r   r   name	alignmentc              
   C   s   |j d | dkr| | d| d n|d| dkr1| | d| d| d|  d	 |dd
krJ| | d| d|  d d S d S )NrL   r   z.shape[-1] % z != 0r`   z.stride(-2) % z != 0 (z.stride() = r6   r   z.stride(-1) > 1 (z0) - you should call `.contiguous()` on the input)rM   r   Zstride)r   r   r?   r   r    r    r!   check_lastdim_alignment_stride18  s   r   )0r~   dataclassesr   	functoolsr   typingr   r   r   r   r   r	   r
   r   r   r   r   Z_cpp_libr   commonr   r#   r   r   r   r   r   r   r   r   r>   r"   r   r&   r'   r0   rC   rD   rs   r   r   r   rv   ZAttentionOprr   r   r   r   r    r    r    r!   <module>   sb   0(	
	B
 I{&