o
    )i                     @   s  d dl Z d dlmZ d dlmZmZmZmZ d dlZd dl	m
Z
 d dlm
  mZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZmZ d dlmZ d d	lmZmZmZm Z m!Z!m"Z" d d
l#m$Z$ d dl%m&Z& d dl'm(Z( eryd dl)m*Z* de
j+dej,fddZ-dd Z.eG dd deZ/G dd de
j+Z0G dd de0Z1G dd de0Z2G dd de2Z3G dd de2Z4G d d! d!e4Z5G d"d# d#e4Z6G d$d% d%e5Z7G d&d' d'e0Z8G d(d) d)e2Z9G d*d+ d+e0Z:dS ),    N)	dataclass)TYPE_CHECKINGOptionalUnioncast)PretrainedConfig)AdapterMapping)
LoRAConfig)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_sizesplit_tensor_along_last_dim tensor_model_parallel_all_gather tensor_model_parallel_all_reduce)divide)ColumnParallelLinear
LinearBaseMergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)VocabParallelEmbedding)current_platform)PunicaWrapperBase
base_layerreturnc                 C   sh   t | dr	| jjS t | dr| jjS t | dr| jjS t | dr$| jjS t | dr-| jjS td|  )z7Returns the device for where to place the LoRA tensors.weightweight_packedqweightBW_qUnsupported base layer: )hasattrr   devicer   r   r   r    
ValueError)r    r%   \/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/lora/layers.py_get_lora_device'   s   




r'   c                    s    fdd}|S )zv
    decorator which adds the condition of not using fully sharded loras
    intended to wrap can_replace_layer()
    c                     s<   d|v r	| dnd}|r|d j nd} | i |o|S )NdecorateTlora_config)popfully_sharded_loras)argskwargsr(   	conditioncan_replacer%   r&   decC   s   z+_not_fully_sharded_can_replace.<locals>.decr%   )r0   r1   r%   r/   r&   _not_fully_sharded_can_replace=   s   r2   c                   @   s   e Zd ZU dZeed< dS )LoRAMappingF
is_prefillN)__name__
__module____qualname__r4   bool__annotations__r%   r%   r%   r&   r3   L   s   
 r3   c                   @   s"  e Zd Zdeejeeejdf  f deejeeejdf  f fddZdeejeeejdf  f deejeeejdf  f fddZ	dd	e	d
e
dee ddfddZde	fddZ	dde	dejdejdeej deej f
ddZdd Zedejd
e
dedee def
ddZdS )BaseLayerWithLoRAlora_aNr   c                 C      dS )z1Slice lora a if splitting for tensor parallelism.Nr%   selfr;   r%   r%   r&   slice_lora_aS      zBaseLayerWithLoRA.slice_lora_alora_bc                 C   r<   )z2Slice lora b if splitting with tensor parallelism.Nr%   r>   rA   r%   r%   r&   slice_lora_bY   r@   zBaseLayerWithLoRA.slice_lora_b	max_lorasr)   model_configc                 C   r<   )zInitializes lora matrices.Nr%   r>   rD   r)   rE   r%   r%   r&   create_lora_weights_   s   z%BaseLayerWithLoRA.create_lora_weightsindexc                 C   r<   )z+Resets the lora weights at index back to 0.Nr%   r>   rH   r%   r%   r&   
reset_lorah   s   zBaseLayerWithLoRA.reset_loraembeddings_tensorbiasc                 C   r<   )z!Overwrites lora tensors at index.Nr%   r>   rH   r;   rA   rK   rL   r%   r%   r&   set_loral      	zBaseLayerWithLoRA.set_lorac                 C   s
   || _ d S N)punica_wrapper)r>   rQ   r%   r%   r&   set_mappingw   s   
zBaseLayerWithLoRA.set_mappingsource_layerpacked_modules_listc                 C   s   t )z=Returns True if the layer can be replaced by this LoRA layer.)NotImplementedErrorclsrS   r)   rT   rE   r%   r%   r&   can_replace_layer}   rO   z#BaseLayerWithLoRA.can_replace_layerrP   )r5   r6   r7   r   torchTensorlistr?   rC   intr	   r   r   rG   rJ   rN   rR   classmethodnnModuler8   rX   r%   r%   r%   r&   r:   Q   s^    



	

r:   c                       s   e Zd Zdeddf fddZ	ddededee ddfd	d
Z	defddZ
	ddedejdejdeej deej f
ddZdejdejfddZedejdededee def
ddZedd Z  ZS )VocabParallelEmbeddingWithLoRAr   r   Nc                    s   t    || _|  |  d S rP   )super__init__r   r>   r   	__class__r%   r&   rb      s   
z'VocabParallelEmbeddingWithLoRA.__init__rD   r)   rE   c                 C   s.  | j jdkr:| j jj| j j| j j| j j  | _| j jj| j j | j jj	| j j f| _
| j jj| j jd  d nd | _
d | _tj||j| j jf| j jj| j jjd| _tj|| j j|j |jf|j| j jjd| _tj|d| j j|jf|j| j jjd| _| j| jjd | jjd  | jjd | _d S )Nr   dtyper#         )r   Z"num_added_embeddings_per_partitionr   dataZ num_org_embeddings_per_partitionembeddings_weightsZshard_indicesZadded_vocab_start_indexorg_vocab_sizeZadded_vocab_end_indexembeddings_sliceZfill_rY   zeroslora_extra_vocab_sizeZembedding_dimrg   r#   embeddings_tensorsmax_lora_rank
lora_dtypelora_a_stackedlora_b_stackedviewshapelora_a_stacked_2drF   r%   r%   r&   rG      sf   

	



z2VocabParallelEmbeddingWithLoRA.create_lora_weightsrH   c                 C   s"   d| j |< d| j|< d| j|< d S )Nr   )rs   rt   rp   rI   r%   r%   r&   rJ      s   

z)VocabParallelEmbeddingWithLoRA.reset_lorar;   rA   rK   rL   c                 C   s  |  | | j|d |jd d |jd f j|dd | j|dd |jd d |jd f j|jdd |d ur| j|d |jd d |jd f j|dd | jd ur| j| jjd | jjd  | jjd | jd | jd  }| j	d us{J | j	d |jd  | d S d S d S )Nr   rh   TZnon_blockingri   )
rJ   rs   rv   copy_rt   Trp   rm   ru   rk   )r>   rH   r;   rA   rK   rL   Z
embeddingsr%   r%   r&   rN      s@   
$



z'VocabParallelEmbeddingWithLoRA.set_loraxc           
      C   s   t || jjd kdd}|jd }| jjd d | }| jjd d | }t|| | j	}| j
|||  }|}|jdkrN||jd |jd  d}|jdkra||jd |jd  d}| jj||| jdd}	t sr|	}||S )Nrh   r      T)Z	add_input)rY   wherer   rl   rv   rQ   Z_embeddings_indicesFZ	embeddingrw   forwardndimru   Zadd_lora_embeddingrt   r   can_update_inplaceZview_as)
r>   r{   Zadded_tokens_maskZ
num_tokensZ	indices_1Z	indices_0Zfull_lora_a_embeddingsZfull_outputZfull_output_orglora_outputr%   r%   r&   r      sF   



z&VocabParallelEmbeddingWithLoRA.forwardrS   rT   c                 C      t |tu S rP   )typer   rV   r%   r%   r&   rX        z0VocabParallelEmbeddingWithLoRA.can_replace_layerc                 C      | j jS rP   )r   r   r>   r%   r%   r&   r         z%VocabParallelEmbeddingWithLoRA.weightrP   )r5   r6   r7   r   rb   r\   r	   r   r   rG   rJ   rY   rZ   rN   r   r]   r^   r_   r[   r8   rX   propertyr   __classcell__r%   r%   rd   r&   r`      sP    

9
(	r`   c                       s   e Zd Zdef fddZ	ddededee ddfd	d
Z	defddZ
	ddedejdejdeej deej f
ddZ	ddejdeej dejfddZedejfddZedeej fddZ  ZS )BaseLinearLayerWithLoRAr   c                    s@   t    || _| jj| _t| j| _d | _|  |  |  |  d S rP   )ra   rb   r   
input_sizer'   r#   lora_bias_stackedrc   rd   r%   r&   rb   '  s   

z BaseLinearLayerWithLoRA.__init__NrD   r)   rE   r   c                    s  _ tjtrj jn0tjtr'jsjntjj	 jntjt
r>j js6jntjj	ntt fddtjD _tfddtjD _jr{tfddtjD _jd jd f_d S )Nc              	   3   ,    | ]}t jd  jjjdV  qdS rh   rf   NrY   rn   r   rr   r#   .0_)lora_a_out_sizer)   rD   r>   r%   r&   	<genexpr>M      
z>BaseLinearLayerWithLoRA.create_lora_weights.<locals>.<genexpr>c              	   3   s,    | ]}t jd  jjjdV  qdS r   rY   rn   rq   rr   r#   r   )lora_b_out_sizer)   rD   r>   r%   r&   r   V  r   c                 3   s(    | ]}t jd  jjdV  qdS r   rY   rn   rr   r#   r   )lora_bias_out_sizer)   rD   r>   r%   r&   r   a      
r   ri   )r)   
isinstancer   r   rq   output_sizer   r+   r   tp_sizer   rU   tuplerangen_slicesrs   rt   bias_enabledr   rv   output_slicesrF   r%   )r   r   r   r)   rD   r>   r&   rG   3  sB   
	
	
z+BaseLinearLayerWithLoRA.create_lora_weightsrH   c                 C   s^   t | jD ]'}d| j| |< d| j| |< | jjr,tttj	df | j
| _
d| j
| |< qd S )Nr   .)r   r   rs   rt   r)   r   r   r   rY   rZ   r   )r>   rH   Zs_indexr%   r%   r&   rJ   k  s   z"BaseLinearLayerWithLoRA.reset_lorar;   rA   rK   	lora_biasc                 C   s:  t | jt | j  kr| j  krdksJ  J | | | jdkr7| |}| |}|d ur7| |}| jd |dd |j	d d |j	d f j
|jdd | jd |dd |j	d d |j	d f j
|jdd |d urtttjdf | j| _t | jsJ | jd |dd |j	d f j
|jdd d S d S Nrh   r   Trx   .)lenrs   rt   r   rJ   r   r?   rC   
slice_biasrv   ry   rz   r   r   rY   rZ   r   )r>   rH   r;   rA   rK   r   r%   r%   r&   rN   u  s@   







z BaseLinearLayerWithLoRA.set_lorar{   rL   c              	   C   sp   | j j| j ||}|jdkr |jdkr |dd}|dd}| j||| j| j| j	d| j
}t s6|}|S )Nr|   r   rh         ?)r   quant_methodapplyr   flattenrQ   Zadd_lora_linearrs   rt   r   r   r   r   )r>   r{   rL   outputr   r%   r%   r&   r     s   
zBaseLinearLayerWithLoRA.applyc                 C   st   t | jdr
| jjS t | jdr| jjS t | jdr| jjS t | jdr(| jjS t | jdr2| jjS td| j )Nr   r   r   r   r    r!   )r"   r   r   r   r   r   r    r$   r   r%   r%   r&   r     s   zBaseLinearLayerWithLoRA.weightc                 C   s   t | jdr
| jjS d S )NrL   )r"   r   rL   r   r%   r%   r&   rL     s   zBaseLinearLayerWithLoRA.biasrP   )r5   r6   r7   r   rb   r\   r	   r   r   rG   rJ   rY   rZ   rN   r   r   r   rL   r   r%   r%   rd   r&   r   %  sJ    
8
&
r   c                       s|   e Zd Zdeddf fddZdejdeejeeje	ej f f fddZ
ed	ejd
edede	e def
ddZ  ZS )ReplicatedLinearWithLoRAr   r   Nc                    s&   t  | d| _| jj| _d| _d S Nrh   )ra   rb   r   r   r   r   rc   rd   r%   r&   rb     s   

z!ReplicatedLinearWithLoRA.__init__input_c                 C   sH   | j js| j jnd}| ||}| j jr| j jnd}| j js |S ||fS )zForward of ReplicatedLinearWithLoRA

        Args:
            input_: Tensor whose last dimension is `input_size`.

        Returns:
            - output
            - bias
        N)r   skip_bias_addrL   r   return_bias)r>   r   rL   r   output_biasr%   r%   r&   r     s   

z ReplicatedLinearWithLoRA.forwardrS   r)   rT   rE   c                 C   r   rP   )r   r   rV   r%   r%   r&   rX     r   z*ReplicatedLinearWithLoRA.can_replace_layer)r5   r6   r7   r   rb   rY   rZ   r   r   r   r   r]   r^   r_   r	   r[   r   r8   rX   r   r%   r%   rd   r&   r     s&    
r   c                       s   e Zd ZdZdeddf fddZdejdejfdd	Zd
ejdejfddZ	dejdejfddZ
dejdeejeejeej f f fddZeedejdededee def
ddZ  ZS )ColumnParallelLinearWithLoRAa$  
    LoRA on top of ColumnParallelLinear layer.
    LoRA B is sliced for tensor parallelism.
    There are two types for the `base_layer`:
    1. ColumnParallelLinear, e.g.`dense_h_to_4h` in `FalconForCausalLM`.
    2. MergedColumnParallelLinear, e.g.`gate_up_proj` in `Phi3ForCausalLM`.
    r   r   Nc                    s6   t  | t|tu | _t | _| jj| _	d| _
d S r   )ra   rb   r   r   is_merged_col_linearr   r   r   Zoutput_size_per_partitionr   r   rc   rd   r%   r&   rb     s   

z%ColumnParallelLinearWithLoRA.__init__r;   c                 C      |S rP   r%   r=   r%   r%   r&   r?        z)ColumnParallelLinearWithLoRA.slice_lora_arA   c           
      C   s   | j rAt }| jd }|jd d }|d d || |d | f }|d d |||  ||d |  f }tj||gdd}|S t }| j}|| }|d | }	|d d ||	f }|S )Nri   r}   rh   dim)r   r
   r   rv   rY   cat)
r>   rA   tp_rank
shard_sizeoffsetZleft_weightZright_weighttensor_model_parallel_rank	start_idxend_idxr%   r%   r&   rC     s$   



	z)ColumnParallelLinearWithLoRA.slice_lora_brL   c                 C   s<   |d u r|S t  }| j}|| }|d | }||| }|S r   )r
   r   )r>   rL   r   r   r   r   r%   r%   r&   r   *  s   z'ColumnParallelLinearWithLoRA.slice_biasr   c                 C   s^   | j js| j jnd}| ||}| j jrt|}n|}| j js!|S | j jr)| j jnd}||fS )zForward of ColumnParallelLinear

        Args:
            input_: Tensor whose last dimension is `input_size`.

        Returns:
            - output
            - bias
        N)r   r   rL   r   Zgather_outputr   r   )r>   r   rL   output_parallelr   r   r%   r%   r&   r   5  s   


z$ColumnParallelLinearWithLoRA.forwardrS   r)   rT   rE   c                 C   s$   t |tu pt |tu ot|dkS r   )r   r   r   r   rV   r%   r%   r&   rX   S  s   	
z.ColumnParallelLinearWithLoRA.can_replace_layer)r5   r6   r7   __doc__r   rb   rY   rZ   r?   rC   r   r   r   r   r   r]   r2   r^   r_   r	   r[   r   r8   rX   r   r%   r%   rd   r&   r     s0    
r   c                       s<  e Zd ZdZdeeef ddf fddZ	ddede	d	e
e ddfd
dZdeeejdf  deeejdf  fddZdeeejdf  deeejdf  fddZdeeejdf  deeejdf  fddZ	ddedejdejde
ej de
ej f
ddZeedejde	ded	e
e def
ddZ  ZS )"MergedColumnParallelLinearWithLoRAzColumnParallelLinear layer that is composed of 2 sublayers (slices)
    packed together (eg. gate_proj + up_proj -> gate_up_proj).

    This means we have 2 LoRAs, each applied to one half of the layer.

    Both slices must have the same size.
    r   r   Nc                    s\   t  | t  _t  _ jj}t fdd|D  _	t
 j	 _ jf j  _d S )Nc                 3   s    | ]	}t | jV  qd S rP   )r   r   r   r   r   r%   r&   r   t  s    
z>MergedColumnParallelLinearWithLoRA.__init__.<locals>.<genexpr>)ra   rb   r   r   r
   r   r   output_sizesr   r   r   r   
output_ids)r>   r   r   rd   r   r&   rb   j  s   
z+MergedColumnParallelLinearWithLoRA.__init__rD   r)   rE   c                    s   _ js	jntjj t fddtjD _tfddj	D _
jrEtfddj	D _dS dS )zl
        The main reason for overriding this function is to enhance  code 
        maintainability.
        c              	   3   r   r   r   r   Z lora_a_output_size_per_partitionr)   rD   r>   r%   r&   r     r   zIMergedColumnParallelLinearWithLoRA.create_lora_weights.<locals>.<genexpr>c              	   3   s,    | ]}t jd | j jjdV  qdS r   r   r   r)   rD   r>   r%   r&   r     r   c                 3   s(    | ]}t jd | jjdV  qdS r   r   r   r   r%   r&   r     r   N)r)   r+   rq   r   r   r   r   r   rs   r   rt   r   r   rF   r%   r   r&   rG   y  s   

	
	z6MergedColumnParallelLinearWithLoRA.create_lora_weightsr;   c                 C   r   rP   r%   r=   r%   r%   r&   r?     s   z/MergedColumnParallelLinearWithLoRA.slice_lora_arA   c                 C   sd   d g| j  }tt| j| jD ] \}\}}||  }d ur/|d d || ||d  f ||< q|S r   )r   	enumeratezipr   r   )r>   rA   Zsliced_lora_bishard_idr   lora_b_ir%   r%   r&   rC     s   
z/MergedColumnParallelLinearWithLoRA.slice_lora_brL   c                 C   sP   t t| j| jD ]\}\}}||  }d ur%||| ||d   ||< q	|S r   )r   r   r   r   )r>   rL   r   r   r   Zbias_ir%   r%   r&   r     s   
z-MergedColumnParallelLinearWithLoRA.slice_biasrH   rK   r   c           
      C   sH  |  | | jdkr| |}| |}|d ur| |}t| jD ]H}||  }d urG| j| |dd |jd d |jd f j	|j
dd ||  }d urj| j| |dd |jd d |jd f j	|j
dd q"|d urtttjdf | j| _t| jD ]!}||  }	d ur| j| |dd |	jd f j	|	j
dd qd S d S r   )rJ   r   r?   rC   r   r   r   rs   rv   ry   rz   rt   r   r   rY   rZ   r   )
r>   rH   r;   rA   rK   r   r   Zlora_a_ir   Zlora_bias_ir%   r%   r&   rN     sL   





z+MergedColumnParallelLinearWithLoRA.set_lorarS   rT   c                 C      t |tu ot|dkS )Nri   )r   r   r   rV   r%   r%   r&   rX        	
z4MergedColumnParallelLinearWithLoRA.can_replace_layerrP   )r5   r6   r7   r   r   r   r   rb   r\   r	   r   r   rG   r[   rY   rZ   r?   rC   r   rN   r]   r2   r^   r_   r8   rX   r   r%   r%   rd   r&   r   a  sz    
,



$r   c                       s   e Zd ZdZdeddf fddZdejdejfdd	Zd
ejdejfddZ	e
edejdededee def
ddZ  ZS )QKVParallelLinearWithLoRAa  
    ColumnParallelLinear layer that is specifically designed for
    qkv_proj. Certain models, such as chatglm3 and baichuan-7b,
    only contains a single LoRA within their qkv_proj layer.

    During inference with Tensor Parallel, the weights of lora_b
    must be accurately partitioned according to the respective ranks.

    Q slice may have different shape than K and V slices (which both have
    the same shape).
    r   r   Nc                    s^   t  | | jj| jj | _| jj| jj | _| jj| jj | _	| jj
| jj | _d| _d S r   )ra   rb   r   Ztotal_num_heads	head_sizeq_proj_total_size	num_headsq_proj_shard_sizenum_kv_headskv_proj_shard_sizeZtotal_num_kv_headskv_proj_total_sizer   rc   rd   r%   r&   rb     s   
z"QKVParallelLinearWithLoRA.__init__rA   c                 C   s   t  }|| _|| jj | _|d d | j| j | j| jd  f }| j}|d d || j| j  || j| jd   f }|| j }|d d || j| j  || j| jd   f }t	j
|||gdd}|S Nrh   r   )r
   
q_shard_idr   num_kv_head_replicaskv_shard_idr   r   r   r   rY   r   )r>   rA   r   Zlora_b_qk_offsetZlora_b_kv_offsetZlora_b_vr%   r%   r&   rC     s8   




z&QKVParallelLinearWithLoRA.slice_lora_brL   c                 C   s   || j | j | j | jd   }| j}||| j| j  || j| jd    }|| j }||| j| j  || j| jd    }tj|||gdd}|S r   )r   r   r   r   r   r   rY   r   )r>   rL   Zbias_qr   Zbias_kr   Zbias_vr%   r%   r&   r     s2   


z$QKVParallelLinearWithLoRA.slice_biasrS   r)   rT   rE   c                 C   r   r   r   r   r   rV   r%   r%   r&   rX   ,  s
   z+QKVParallelLinearWithLoRA.can_replace_layer)r5   r6   r7   r   r   rb   rY   rZ   rC   r   r]   r2   r^   r_   r	   r[   r   r   r8   rX   r   r%   r%   rd   r&   r     s     r   c                       s|   e Zd ZdZdeddf fddZ	ddeded	ee	 ddf fd
dZ
eedejdeded	ee	 def
ddZ  ZS )MergedQKVParallelLinearWithLoRAaK  MergedColumnParallelLinear layer that is composed of 3 sublayers (slices)
    packed together in qkv proj fashion
    (q_proj + k_proj + v_proj -> qkv_proj).

    This means we have 3 LoRAs, each applied to one slice of the layer.

    Q slice may have different shape than K and V slices (which both have
    the same shape).
    r   r   Nc                    s   t  | t| jj| _t | _t | _	| jj
| jj | _| jj| jj | _| j	| _| j	| jj | _| j| j| jf| _| j| j| jf| _d S rP   )ra   rb   r   r   r   r   r   r   r
   r   r   r   r   r   r   r   r   r   r   r   rc   rd   r%   r&   rb   @  s(   
z(MergedQKVParallelLinearWithLoRA.__init__rD   r)   rE   c                    s   t  ||| dS )z
        The main reason for overloading this function is to handle inconsistent 
        weight dimensions in qkv lora.
        N)ra   rG   rF   rd   r%   r&   rG   Y  s   
z3MergedQKVParallelLinearWithLoRA.create_lora_weightsrS   rT   c                 C   r   )Nr|   r   rV   r%   r%   r&   rX   e  r   z1MergedQKVParallelLinearWithLoRA.can_replace_layerrP   )r5   r6   r7   r   r   rb   r\   r	   r   r   rG   r]   r2   r^   r_   r[   r8   rX   r   r%   r%   rd   r&   r   5  s4    
r   c                   @   s   e Zd ZdS )QKVCrossParallelLinearWithLoRAN)r5   r6   r7   r%   r%   r%   r&   r   s  s    r   c                       s   e Zd Zdeddf fddZdejdejfddZd	ejdejfd
dZdejdejfddZ	dejde
ejeejeej f f fddZeedejdededee def
ddZ  ZS )RowParallelLinearWithLoRAr   r   Nc                    s:   t  | t | _| jj| _| jj| _t | _	d| _
d S r   )ra   rb   r   r   r   Zinput_size_per_partitionr   r   r
   r   r   rc   rd   r%   r&   rb   y  s   


z"RowParallelLinearWithLoRA.__init__r;   c                 C   s6   | j }| j| }| jd | }|||d d f }|S r   )r   r   )r>   r;   r   r   r   r%   r%   r&   r?     s
   
z&RowParallelLinearWithLoRA.slice_lora_arA   c                 C   r   rP   r%   rB   r%   r%   r&   rC     r   z&RowParallelLinearWithLoRA.slice_lora_brL   c                 C   r   rP   r%   )r>   rL   r%   r%   r&   r     r   z$RowParallelLinearWithLoRA.slice_biasr   c                 C   s   | j jr|}nt|| j jd}|| j  }| |}| j jr*| j jdkr*t|}n|}| j j	sA| j j
dur<|| j j
 n|}d}n|}| j j
}| j jsM|S ||fS )a*  Forward of RowParallelLinear

        Args:
            input_: tensor whose last dimension is `input_size`. If
                    `input_is_parallel` is set, then the last dimension
                    is `input_size // tp_size`.

        Returns:
            - output
            - bias
        )Znum_partitionsrh   N)r   Zinput_is_parallelr   r   r   
contiguousr   Zreduce_resultsr   r   rL   r   )r>   r   Zinput_parallelZsplitted_inputr   Zoutput_r   r   r%   r%   r&   r     s*   

z!RowParallelLinearWithLoRA.forwardrS   r)   rT   rE   c                 C   r   rP   )r   r   rV   r%   r%   r&   rX     s   	z+RowParallelLinearWithLoRA.can_replace_layer)r5   r6   r7   r   rb   rY   rZ   r?   rC   r   r   r   r   r   r]   r2   r^   r_   r	   r[   r   r8   rX   r   r%   r%   rd   r&   r   w  s.    
+r   c                       sd  e Zd ZdZdededejdejde	e
e  ddf fd	d
Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Z	d4dedede	e ddfddZd efd!d"Z	d4d ed#ejd$ejd%e	ej d&e	ej f
d'd(Z	d4d)ejd*ed+e	ej de	ej fd,d-Zd.d/ Zed0ej ded1e
de	e de!f
d2d3Z"  Z#S )5LogitsProcessorWithLoRAa  
    LoRA wrapper for LogitsProcessor, with extra logic to handle the
    application of the LoRA adapter and added LoRA vocabulary.

    Args:
        base_layer: LogitsProcessor layer
        hidden_size: hidden size of the model
        dtype: data type of the model
        device: device of the model
        sharded_to_full_mapping: index mapping from sharded vocab to full vocab
            received from base_layer.get_sharded_to_full_mapping(). If None,
            no reindexing will be done.
    r   hidden_sizerg   r#   sharded_to_full_mappingr   Nc                    s<   t    || _|| _|| _|| _t | _t | _	|| _
d S rP   )ra   rb   r   r   rg   r#   r   r   r
   r   r   )r>   r   r   rg   r#   r   rd   r%   r&   rb     s   

z LogitsProcessorWithLoRA.__init__c                 C   r   rP   )r   logits_as_inputr   r%   r%   r&   r     r   z'LogitsProcessorWithLoRA.logits_as_inputc                 C   r   rP   )r   
vocab_sizer   r%   r%   r&   r     r   z"LogitsProcessorWithLoRA.vocab_sizec                 C   r   rP   )r   scaler   r%   r%   r&   r     r   zLogitsProcessorWithLoRA.scalec                 C   r   rP   )r   soft_capr   r%   r%   r&   r     r   z LogitsProcessorWithLoRA.soft_capc                 C   r   rP   )r   use_all_gatherr   r%   r%   r&   r     r   z&LogitsProcessorWithLoRA.use_all_gatherc                 C   r   rP   )r   rl   r   r%   r%   r&   rl     r   z&LogitsProcessorWithLoRA.org_vocab_sizec                 C   r   rP   )r   include_gpu_probs_tensorr   r%   r%   r&   r     r   z0LogitsProcessorWithLoRA.include_gpu_probs_tensorc                 C   r   rP   )r   "should_modify_greedy_probs_inplacer   r%   r%   r&   r     r   z:LogitsProcessorWithLoRA.should_modify_greedy_probs_inplacerD   r)   rE   c                 C   s   d| j j  k rdkrtd tj|d|j| jf|j| jd| _	tj|dt
| j j|j |j |jf|j| jd| _tj||j| jftd| j| jd| _| jd urbtj| j| jtjd| _d S d | _d S )	Ni }  i  zAWhen using LoRA, vocab size must be 32000 >= vocab_size <= 257024rh   rf   -inf)Z
fill_valuerg   r#   )r#   rg   )r   r   r$   rY   rn   rq   r   rr   r#   rs   mathceilZlora_vocab_padding_sizert   fullro   floatrg   rp   r   Ztensorlongsharded_to_full_mapping_gpurF   r%   r%   r&   rG     sL   

	

z+LogitsProcessorWithLoRA.create_lora_weightsrH   c                 C   s&   d| j |< d| j|< td| j|< d S )Nr   r   )rs   rt   r   rp   rI   r%   r%   r&   rJ   4  s   

z"LogitsProcessorWithLoRA.reset_lorar;   rA   rK   rL   c                 C   s   |  | | j|dd |jd d |jd f j|jdd | j|dd |jd d |jd f j|jdd |d urO|| j|d |jd d |jd f< d S d S )Nr   rh   Trx   )rJ   rs   rv   ry   rz   rt   rp   rM   r%   r%   r&   rN   9  s,   
z LogitsProcessorWithLoRA.set_lorahidden_stateslm_headembedding_biasc           
      C   s  |j ||}|d ur||7 }| j|}|d u rd S | jd ur)|d d | jf }tj| jjd d | jjd |jd | jj	| jj
d}tj| j|j|d d d t|j	\}}||d< |j}| jj}t rr|d |d }||jd |jd  |jd d|j|||d}||d d | jj| jj|jd  f< | j||| j| jd}	t s|	}|d d d | jjf }|S )	Nr   rh   rf   r}   )outri   )nanZposinfZneginfr   )r   r   r   Z_gather_logitsr   rY   emptyrp   rv   rg   r#   matmulrz   r   Zget_infinity_valuesZmTrQ   Zsampler_indices_paddedZis_tpusizeZreshapeZindex_selectZnan_to_num_rl   Zadd_lora_logitsrs   rt   r   r   )
r>   r   r   r   ZlogitsZlora_logitsZneg_infZpos_infZindices_paddedr   r%   r%   r&   _get_logitsO  sd   


	z#LogitsProcessorWithLoRA._get_logitsc                 O   s   t | jj| g|R i |S rP   )r   r   r   )r>   r,   r-   r%   r%   r&   r     s   zLogitsProcessorWithLoRA.forwardrS   rT   c                 C   r<   )NFr%   rV   r%   r%   r&   rX     rO   z)LogitsProcessorWithLoRA.can_replace_layerrP   )$r5   r6   r7   r   r   r\   rY   rg   r#   r   r[   rb   r   r   r   r   r   r   rl   r   r   r	   r   rG   rJ   rZ   rN   r   r   r   r]   r^   r_   r8   rX   r   r%   r%   rd   r&   r     s    










/

Pr   );r   dataclassesr   typingr   r   r   r   rY   Ztorch.nnr^   Ztorch.nn.functionalZ
functionalr   Ztransformersr   Zvllm.adapter_commons.layersr   Zvllm.configr	   Zvllm.distributedr
   r   r   r   r   Zvllm.distributed.utilsr   Z!vllm.model_executor.layers.linearr   r   r   r   r   r   Z+vllm.model_executor.layers.logits_processorr   Z3vllm.model_executor.layers.vocab_parallel_embeddingr   Zvllm.platformsr   Zvllm.lora.punica_wrapperr   r_   r#   r'   r2   r3   r:   r`   r   r   r   r   r   r   r   r   r   r%   r%   r%   r&   <module>   sF    8  '0f D>S