from collections import OrderedDict
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_xla.distributed.spmd as xs
from torch.nn.parameter import Parameter

from vllm.logger import init_logger
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               QKVParallelLinear,
                                               RowParallelLinear)

logger = init_logger(__name__)


class XlaQKVParallelLinear(nn.Module):

    def __init__(self,
                 qkv_linear: nn.Module,
                 mesh: Optional["xs.Mesh"] = None):
        super().__init__()
        assert isinstance(qkv_linear, QKVParallelLinear)
        self.skip_bias_add = qkv_linear.skip_bias_add
        self.return_bias = qkv_linear.return_bias
        assert qkv_linear.tp_size == 1, \
            "TP > 1 is only supported under SPMD."
        self._load_weights_from_qkv_linear(qkv_linear)
        if mesh is not None:
            self._shard_weight(mesh)
zXlaQKVParallelLinear.__init__c                 C   s  t | jddd| _t | jddd| _t | jddd| _t| j|d t| j|d t| j|d | jd ur| jd urH| j	d usLJ dt | jddd| _t| j|d t | jddd| _t| j|d t | j	ddd| _	t| j	|d d S d S )NZxlaFZrequires_gradxNz:QKVParallelLinear should have q, k, and v biases together.)r   )
r   q_weighttok_weightv_weightxsmark_shardingq_biask_biasv_bias)r   r   r   r   r   r   )   s"   
z"XlaQKVParallelLinear._shard_weightc                 C   s$  |j \}}}|jj }t|d | dd}t||||  dd}t||| d  dd}| d| | d| | d| |jd ur~t|jd | dd}	t|j|||  dd}
t|j|| d  dd}| d|	 | d|
 | d| d S | dd  | dd  | dd  d S )	NFr   r   r   r    r#   r$   r%   )Zoutput_sizesweightdatacpur   Zregister_parameterZbias)r   r
   Zq_proj_sizeZk_proj_size_Z
qkv_weightr   r   r    r#   r$   r%   r   r   r   r   :   s<   


    def forward(self, input):
        q_bias = self.q_bias if not self.skip_bias_add else None
        k_bias = self.k_bias if not self.skip_bias_add else None
        v_bias = self.v_bias if not self.skip_bias_add else None
        q_proj = F.linear(input, self.q_weight, q_bias)
        k_proj = F.linear(input, self.k_weight, k_bias)
        v_proj = F.linear(input, self.v_weight, v_bias)
        # Concatenate so the output matches the fused layout produced by
        # QKVParallelLinear; the attention layer splits it again downstream.
        qkv_proj = torch.cat([q_proj, k_proj, v_proj], dim=-1)
        output_bias = torch.cat([self.q_bias, self.k_bias, self.v_bias],
                                dim=-1) if self.skip_bias_add else None
        if not self.return_bias:
            return qkv_proj
        return qkv_proj, output_bias

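# A minimal sanity-check sketch (illustrative only, not part of this module):
# with mesh=None the wrapper should reproduce the fused layer's output
# exactly. It assumes vLLM's distributed state is initialized with tp_size=1,
# and the constructor arguments below are hypothetical sizes:
#
#     layer = QKVParallelLinear(hidden_size=64, head_size=16,
#                               total_num_heads=4, bias=True)
#     xla_layer = XlaQKVParallelLinear(layer)  # no mesh: split, unsharded
#     x = torch.randn(2, 64)
#     out_ref, _ = layer(x)
#     out_new, _ = xla_layer(x)
#     torch.testing.assert_close(out_ref, out_new)
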

def partition_column_parallel_linear(layer: torch.nn.Module,
                                     mesh: "xs.Mesh") -> torch.nn.Module:
    assert isinstance(layer, ColumnParallelLinear)
    # Column-parallel: shard the output dimension (dim 0 of the weight).
    xs.mark_sharding(layer.weight, mesh, ('x', None))
    logger.debug("Applied column-parallel sharding to %s", layer)
    return layer


def partition_row_parallel_linear(layer: torch.nn.Module,
                                  mesh: "xs.Mesh") -> torch.nn.Module:
    assert isinstance(layer, RowParallelLinear)
    # Row-parallel: shard the input dimension (dim 1 of the weight).
    xs.mark_sharding(layer.weight, mesh, (None, 'x'))
    logger.debug("Applied row-parallel sharding to %s", layer)
    return layer


def partition_qkv_parallel_linear(layer: torch.nn.Module,
                                  mesh: "xs.Mesh") -> torch.nn.Module:
    assert isinstance(layer, QKVParallelLinear)
    # Replace the fused layer so q, k, and v can be sharded independently.
    xla_layer = XlaQKVParallelLinear(layer, mesh)
    logger.debug("Applied qkv parallel sharding to %s", layer)
    return xla_layer


MODULE_TYPE_TO_WRAPPING_FUNC = OrderedDict([
    ("QKVParallelLinear", partition_qkv_parallel_linear),
    ("ColumnParallelLinear", partition_column_parallel_linear),
    ("RowParallelLinear", partition_row_parallel_linear),
])


def get_fqn(module):
    # Match modules by class name against MODULE_TYPE_TO_WRAPPING_FUNC.
    return module.__class__.__qualname__


def shard_model(model: torch.nn.Module, mesh: "xs.Mesh") -> None:
    """
    Recursively check a PyTorch model and apply appropriate sharding based on 
    the MODULE_TYPE_TO_WRAPPING_FUNC mapping.
    
    Args:
        model: torch.nn.Module to process
        mesh: An XLA SPMD mesh object used for sharding
    """

    def _process_module(module, name=None, parent=None):
        for module_type, wrapping_func in MODULE_TYPE_TO_WRAPPING_FUNC.items():
            if get_fqn(module) == module_type:
                wrapped_module = wrapping_func(module, mesh)
                assert parent is not None and name is not None, \
                    "Top Level module is not expected to be wrapped."
                if wrapped_module is not module:
                    logger.debug("replace %s with %s", module, wrapped_module)
                    setattr(parent, name, wrapped_module)
                break

        for child_name, child_module in list(module.named_children()):
            _process_module(child_module, child_name, module)

    _process_module(model)
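
# A minimal usage sketch (an assumption about the intended call pattern, not
# an API guarantee): build a 1D device mesh named 'x' under the SPMD runtime,
# then shard every supported layer in place. `model` is a placeholder
# supplied by the caller.
#
#     import numpy as np
#     import torch_xla.runtime as xr
#
#     xr.use_spmd()
#     num_devices = xr.global_runtime_device_count()
#     mesh = xs.Mesh(np.array(range(num_devices)), (num_devices,), ('x',))
#     shard_model(model, mesh)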