o
    )i"                     @   s   d Z ddlZdejdedeejejf fddZdejdedeejejejf fd	d
Zdejdedededef
ddZdejdededededeejejejf fddZdgZ	dS )aA  
Expert parallelism load balancer (EPLB) for vLLM.

This module implements the core rearrangement algorithm.

The rearrangement algorithm is adapted from
[DeepSeek EPLB](https://github.com/deepseek-ai/eplb).

Please find at [#12](https://github.com/deepseek-ai/EPLB/issues/12) an example
on how the EPLB algorithm works.
    Nweight	num_packsreturnc                    sL  | j \}}|| dksJ ||   dkr2tj| dtj| jd| j }tj| tjd}||fS |  j	dddj
 }tj| dtjdd	}tj|dd
}t|D ]N}dg| }dg| || D ]=}	t fddt|D |jd}
|
  k s~J |
|||	f< |
 |||	f< ||
  | ||	f 7  < |
  d7  < qcqS||fS )a  
    Pack n weighted objects to m packs, such that each bin contains exactly
    n/m objects and the weights of all packs are as balanced as possible.

    Parameters:
        weight: [X, n], the weight of each item
        num_packs: number of packs

    Returns:
        pack_index: [X, n], the pack index of each item
        rank_in_pack: [X, n], the rank of the item in the pack
    r      dtypedevice)r   T)Z
descendingcpu)
fill_valuer   r	   )r   c                 3   s     | ]}|  k r|V  qd S )N ).0iZgroups_per_packZ
pack_itemsr   p/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/distributed/eplb/rebalance_algo.py	<genexpr>6   s    z#balanced_packing.<locals>.<genexpr>)key)shapetorcharangesizeint64r	   expandZ
zeros_likefloatsortindicesr
   Z	full_likerangemin__getitem__)r   r   
num_layers
num_groups
pack_indexrank_in_packr   r   Zpack_weightsgrouppackr   r   r   balanced_packing   sH   


r%   num_phyc                 C   s   | j \}}|| }|dksJ | j}tj|tj|d|d}tj||tj|d}tj||tj|d}tj|tj|d}	t||D ])}
| | j	ddj
}||dd|
f< ||	|f |dd|
f< ||	|f  d7  < qA|||fS )a  
    Replicate `num_log` experts to `num_phy` replicas, such that the maximum
    load of all replicas is minimized.

    Parameters:
        weight: [X, num_log]
        num_phy: total number of experts after replication

    Returns:
        phy2log: [X, num_phy], logical expert id of each physical expert
        rank: [X, num_phy], the replica rank
        logcnt: [X, num_log], number of replicas for each logical expert
    r   r   r   r   )dimN)r   r	   r   r   r   repeatZzerosZonesr   maxr   )r   r&   nZnum_logZnum_redundantr	   phy2logZranklogcntZarangenr   Zredundant_indicesr   r   r   replicate_expertsB   s$   


r-   num_physical_expertsr    	num_nodesnum_gpusc                 C   s  | j \}}|| dksJ || }|| dksJ || }|| dks%J || dks-J || }	dtjdtjfdd}
| d||fd}t||\}}|| | | dtj|tj|j	d 
d}|
|}| d|d|| }t||| \}}}|| d|}t||| \}}||	 | }|
|}|d|}|||dtjd||| |j	d	d
dd
 
d}|d|}|d||d}||dd|}|||fS )aK  
    Parameters:
        weight: [num_moe_layers, num_logical_experts]
        num_physical_experts: number of physical experts after replication
        num_groups: number of expert groups
        num_nodes: number of server nodes, where the intra-node network
        (e.g, NVLink) is faster
        num_gpus: number of GPUs, must be a multiple of `num_nodes`

    Returns:
        physical_to_logical_map: [num_moe_layers, num_physical_experts]
        logical_to_physical_map: [num_moe_layers, num_logical_experts, X]
        logical_count: [num_moe_layers, num_logical_experts]
    r   permr   c              	   S   s:   t | }|d| t j| dt j| jd| j |S )Nr   r   )	r   Z
empty_likescatter_r   r   r   r	   r   r   )r1   invr   r   r   inverse   s   
z/rebalance_experts_hierarchical.<locals>.inverser   r   )r	   r   )r   r   TensorZ	unflattensumr%   Z	unsqueezer   r   r	   flattengatherviewr-   )r   r.   r    r/   r0   r   num_logical_expertsZ
group_sizeZgroups_per_nodeZphy_experts_per_gpur4   Ztokens_per_groupZgroup_pack_indexZgroup_rank_in_packZlog2mlogZmlog2logZtokens_per_mlogZphy2mlogphyrankZmlogcntZtokens_per_phyr!   r"   Zphy2pphyZpphy2phyZ	pphy2mlogZpphy2logZpphyrankr,   r   r   r   rebalance_experts_hierarchicalc   sl   




r=   num_replicasc              	   C   s   | j \}}|   } || dkrt| ||||\}}}	nt| |dd|\}}}	|| }
|
d }tj|||fdtj|	jd}||d	d|| | tj
|tj|jd|d |||	fS )aJ  
    Entry point for expert-parallelism load balancer.

    Parameters:
        weight: [layers, num_logical_experts], the load statistics for all
            logical experts
        num_replicas: number of physical experts, must be a multiple of
            `num_gpus`
        num_groups: number of expert groups
        num_nodes: number of server nodes, where the intra-node network
            (e.g, NVLink) is faster
        num_gpus: number of GPUs, must be a multiple of `num_nodes`

    Returns:
        physical_to_logical_map: [layers, num_replicas], the expert index of
            each replica
        logical_to_physical_map: [layers, num_logical_experts, X], the replica
            indices for each expert
        expert_count: [layers, num_logical_experts], number of physical
            replicas for each logical expert
    r   r   r   r   )r   r   r
   r=   r   fullr   r	   r:   r2   r   r   )r   r>   r    r/   r0   r   r;   r+   r<   r,   Znum_redundant_expertsZ	maxlogcntZlog2phyr   r   r   rebalance_experts   s4   






r@   )
__doc__r   r6   inttupler%   r-   r=   r@   __all__r   r   r   r   <module>   sN   
0
!
P

7