o
    )iD/                  %   @   s   d dl mZmZ d dlZd dlmZ ddlmZ dddd	d
dddddddddejdejdejdejdejdede	de	de
de	de	de	de	deej deej deej de
d ef$d!d"ZdS )#    )OptionalTupleN)+BlockDiagonalCausalWithOffsetPaddedKeysMask   )_is_triton_availableg     @g      ?Fg      @g      0@g      @@T )thetalinear_scaleuse_dynamic_scalingdynamic_old_context_lendynamic_scale_factordynamic_low_freq_factordynamic_high_freq_factorout_qfirst_seqposseqpos	adjacentsinternal_dtypexqxkxvcache_kcache_v	attn_biasr   r	   r
   r   r   r   r   r   r   r   r   r   c          4   
   C   s  t  r| js|js|js|js|js|durtdt s J ddl}ddlm} |jj	d }|j
j	d }| j}|dvrAtd|  }| }| }| }| }|j}|j}|d	 }|}|d	 dkrid}|} |dkry|d	 dkry|d	 } |d
kr| j\}!}"}#}$|"|ksJ |d|||$fkrtd| dd|||$f |jd|||$fkrtd|j dd|||$f |d|| |$fkrtd|jd|| |$fkrtdd}%d|#|$ |$df}&n| j\}!}"}%}#}$|"|ksJ |d||%||$fkrtd| dd||%||$f |jd||%||$fkrtd|j dd||%||$f |d||%| |$fkr7td| dd||%| |$f |jd||%| |$fkrRtd|j dd||%| |$f d|#|$ |% |#|$ |$df}&|!dkrhtd|d dkrstd|d dkr~td|d dkrtd|d dkrtd|d dkrtd|#d|  }'|'| }(|#})|du r| | j}n|j| jkrtd| }&|&d dkrtd|dusJ t|jj	d }*|dur|durtdd}+|dur|j|*fkrt|j},td|, d|* d|d}+n!|dur3|j|fkr.t|j},td|, d| d|d}+d |   }-t|-||$}.t|.d!}.t|.d"}.tt|.d# dd$}/| j}0|jj}1|j
j}2|j
j}3|1j|0ksv|2j|0ksv|3j|0krztd%|d&v sJ t j| j ||jj|*|'|% f g | ||||||1|2|3||||r|	nd|r|
nd|r|nd|r|nd|||)|(|%|$|d |d'kr|d nd|d	 |d |d'kr|d nd|d	 |d |d'kr|d nd|d	 |d |d'kr|d nd|d	 |d |d'kr-|d nd|d	 |1d|2d|3d|&d |d'krO|&d nd|&d	 |+|R d(dd|.||/d) W d   |S 1 spw   Y  |S )*u  
    Performs RoPE (rotary embeddings) and kv-cache emplacement for a heterogeneous
    batch for inference in the style given by
    BlockDiagonalCausalWithOffsetPaddedKeysMask.
    The batch is concatenated along the sequence dimension, so the
    actual dim-0 length of all tensors is 1.

    xq, xk and xv should be (1, slen, n_heads, dim), where
    xq's n_heads can differ from xk and xv.

    This function places the roped xk in the right place in cache_k, and
    xv (unmodified) in the right place in cache_v, and returns out_q
    (the roped xq) such that things are ready to call

    xformers.ops.memory_efficient_attention(
        out_q, cache_k, cache_v, attn_bias=attn_bias
    )

    This functionality is experimental. Its API might be changed without warnings.
    Use it at your own risk.

    Arguments:
        xq: tensor of queries to apply rope to
        xk: tensor of keys to apply rope to
        xv: tensor of values to copy into cache_v
        cache_k: cache of keys, MODIFIED IN PLACE
        cache_v: cache of values, MODIFIED IN PLACE
        attn_bias: details the layout of caches.
                Used to determine frequencies for the
                RoPE calculation as well as the locations in cache_k and cache_v
                to write to. Must be on the device.
        first_seqpos: Optionally a tensor containing the sequence position of the
                    beginning of the cache for each batch element.
                    Providing a tensor of zeros is the same as providing None.
                    This affects the numerical calculation but not which memory
                    locations are read or written.
        seqpos: Optionally a 1D tensor containing the sequence position of each
                    query. This should have length equal to xq.shape[1] .
                    This affects the numerical calculation but not which memory
                    locations are read or written.
        adjacents: If True, the inputs are in adjacent pairs along the final dim axis.
                  This is like the released LLaMA model.
                  If False, the dim axis is split in two equal pieces.
                   I.e. the features are ordered with all the real parts before all
                   the imaginary parts. This matches HuggingFace, e.g.
                   https://github.com/huggingface/transformers/blob/
                   f143037789288ba532dada934a118e648e715738/
                   src/transformers/models/llama/modeling_llama.py#L126-L130
        linear_scale: A scaling factor to apply to the sequence ids when computing
                      the RoPE frequencies.  When set to K, all sequence indices
                      are divided by K.
        use_dynamic_scaling: If true, dynamic scaling in use, using a scaling like
            “YaRN: Efficient Context Window Extension of Large Language Models”
        dynamic_old_context_len: used with use_dynamic_scaling
        dynamic_scale_factor: used with use_dynamic_scaling
        dynamic_low_freq_factor: used with use_dynamic_scaling
        dynamic_high_freq_factor: used with use_dynamic_scaling
        internal_dtype: set to "f32" or "f64" to enforce dtype in the calculation
    NzGradients not supported.r      )_rope_padded_kernel)      zUnexpected xq dimensionr   zunexpected k shape z: expected zunexpected v shape zunexpected cache_k shapezunexpected cache_v shapezunexpected cache_k shape zunexpected cache_v shape zHExpected batch size dimension to be 1 as batches should be concatenated.zEach q head must be contiguouszEach k head must be contiguouszEach v head must be contiguousz$Each cache_k head must be contiguousz$Each cache_v head must be contiguousr   zUnexpected shape of out_qz"Each out_q head must be contiguousz0seqpos and first_seqpos may not both be providedzfirst_seqpos.shape z but (z,) expected.zseqpos.shape i      i         z:`attn_bias` must be on the same device as the other inputs)r   Zf32Zf64r   F)Zconst_batch_stridesZcache_padding_lengthZseqlenk_shift
BLOCK_SIZEr   	num_warps)torchZis_grad_enabledZrequires_grad
ValueErrorr   tritonZ_triton.rope_padded_kernelsr   Z	q_seqinfoZseqstart_pyZ	k_seqinfondimZstrideshapeZ	new_emptylentupleZelement_sizeminZnext_power_of_2maxdeviceZseqstartZseqlencudaZ
max_seqlen)4r   r   r   r   r   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r'   r   Zn_total_queriesZcache_lengthr(   Z	xq_strideZ	xk_strideZ	xv_strideZcache_k_strideZcache_v_strideZcache_k_shapeZxk_shapeZ
n_kv_headsZexpected_kv_headsZexpected_cache_headsZbszZq_lenZ	n_q_headsdimZn_groupsZout_q_strideZn_total_headsZv_startZk_startZlogical_bszZstride_seqposr)   ZMAX_FUSED_SIZEr#   r$   r.   Z	seqstartqZ	seqstartkZseqlenk r1   d/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/xformers/ops/rope_padded.pyrope_padded   s  P











	
 !"#$%&'()*+,-./0

88r3   )typingr   r   r%   Zxformers.ops.fmha.attn_biasr   r   r   ZTensorfloatboolstrr3   r1   r1   r1   r2   <module>   sj   	
