o
    * ir                     @  s6  d dl mZ d dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZmZ d dlZd dlmZ d dlmZ dd	lmZ e
rLd dlmZ d d
lmZ ddgZdaejdejj Ze ejdd Z!		d$d%ddZ"	d&d'ddZ#G dd dZ$G dd dZ%G dd dZ&G d d! d!e&Z'G d"d# d#Z(dS )(    )annotationsN)reduce)product)TYPE_CHECKINGAnyLiteral)
NCCLConfig)check_nccl_version_for_p2p   )logger)GroupCommunicateTopologyHybridCommunicateGroupZPADDLE_USE_FOUR_DIRECTIONS_P2PZ$FLAGS_pipeline_nccl_comm_init_optionmessage0NCCLConfig_Message | dict[str, int | str] | Nonedefault_name
str | Nonereturnr   c                 C  s   t jjjdkr	d S t| ttfsd S ddlm} ddl	m
} t| tr&| }n|| dd}d|vr8|d ur8||d< |jjdi |S )	NZncclr   )MessageToDict)coreT)Zpreserving_proto_field_nameZcommName )paddledistributed
collectiveZ_default_backend
isinstanceNCCLConfig_MessagedictZgoogle.protobuf.json_formatr   Zpaddle.baser   r   create)r   r   r   r   Zret_dictr   r   r/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/paddle/distributed/fleet/base/topology.pymessage2nccl_config/   s   
r   nccl_configdict[str, int | str] | NoneNCCLConfig | Nonec                 C  s
   t | dS )a  

    Function that creates nccl config.

    Args:
        nccl_config (dict[str, int | str] | None): None or a dict containing the following keys:
            commName (str): name of the process group. ll_buffsize (int): buffer size of ll protocol.
            ll128_buffsize (int): buffer size of ll128 protocol. simple_buffsize (int): buffer size of
            simple protocol. buffsize_align (int): alignment unit of the total buffer size.
            nchannels (int): max number of channels. algoStr (str): communication algorithm.
            protoStr (str): communication protocol.

    Returns:
        NCCLConfig (NCCLConfig | None): an object containing the information,
        which can be used as an argument of new_group().

    Examples:
        .. code-block:: python

            >>> # doctest: +REQUIRES(env: DISTRIBUTED)
            >>> import paddle
            >>> import paddle.distributed as dist
            >>> from typing import Union
            >>> dist.init_parallel_env()
            >>> nccl_config: dict[str, Union[int, str]] = {"commName":"tp_comm","ll_buffsize":0,"ll128_buffsize":0,"simple_buffsize":1024,"buffsize_align":1024,"nchannels":4,"algoStr":"Ring","protoStr":"Simple",}
            >>> ranks=[0,1,2,3,4,5,6,7]
            >>> nccl_config=dist.create_nccl_config(nccl_config)
            >>> pg=dist.new_group(ranks, nccl_config=nccl_config)
            >>> m, n = 4096, 8192
            >>> local_rank = dist.get_rank(pg)
            >>> num_local_ranks = dist.get_world_size(pg)
            >>> x = paddle.ones(shape=[m, n], dtype=paddle.float32) * (local_rank + 1)
            >>> dist.all_reduce(x, group=pg)

    N)r   r    r   r   r   create_nccl_configE   s   
&r$   c                   @  s$   e Zd ZdZdZdZdZdZdZdS )ParallelModea  

    There are all the parallel modes currently supported:

        - DATA_PARALLEL: Distribute input data to different devices.
        - TENSOR_PARALLEL: Shards tensors in the network to different devices.
        - PIPELINE_PARALLEL: Place different layers of the network on different devices.
        - SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states corresponding to the parameters to each device.

    Examples:
        .. code-block:: python

            >>> # doctest: +REQUIRES(env: DISTRIBUTED)
            >>> import paddle
            >>> parallel_mode = paddle.distributed.ParallelMode
            >>> print(parallel_mode.DATA_PARALLEL)
            0

    r      r
         N)	__name__
__module____qualname____doc__DATA_PARALLELTENSOR_PARALLELPIPELINE_PARALLELSHARDING_PARALLELSEGMENT_PARALLELr   r   r   r   r%   n   s    r%   c                   @  s   e Zd Zg dg dfd+d	d
Zd,ddZd-ddZd.ddZd/ddZd0ddZd1ddZ	d-ddZ
d2d"d#Zd3d$d%Zd4d(d)Zd*S )5r   )datapipeshardingsepcontextmodel)r&   r&   r&   r&   r&   r&   hybrid_group_names	list[str]dims	list[int]r   Nonec                   s   | _ | _td j  _tdd  jd _dd  jD } fddt| D }tt	|t
t| _tt	 j  j  _d S )NZ
Coordinatec                 S  s   | | S Nr   )xyr   r   r   <lambda>       z.CommunicateTopology.__init__.<locals>.<lambda>r&   c                 S  s   g | ]}t |qS r   )range).0dr   r   r   
<listcomp>       z0CommunicateTopology.__init__.<locals>.<listcomp>c                   s   g | ]} j | qS r   )
coordinate)rC   r>   selfr   r   rE      s    )_parallel_names_dimscollections
namedtuplerG   r   _world_sizer   r   ziprB   len_coord2rankvalueskeys_rank2coord)rI   r8   r:   rangesZall_coordinater   rH   r   __init__   s   
zCommunicateTopology.__init__c                 C     | j S r=   )rJ   rH   r   r   r   get_hybrid_group_names      z*CommunicateTopology.get_hybrid_group_names	axis_namestrintc                 C  s   | j | j| S r=   )rK   rJ   indexrI   rZ   r   r   r   get_dim   s   zCommunicateTopology.get_dimc                 C  rW   r=   )rN   rH   r   r   r   
world_size   rY   zCommunicateTopology.world_sizeargsr   c                 K  sB   t |t | jksJ | jdi |}|| j v sJ | j| S Nr   )rP   rK   rG   rQ   rS   )rI   ra   keyr   r   r   get_rank   s   
zCommunicateTopology.get_rankrankc                 C  s*   || j k sJ || j v sJ | j| S r=   )rN   rT   rS   )rI   re   r   r   r   	get_coord   s   
zCommunicateTopology.get_coordr]   c                   s4   j |  fddj D }|  |S )Nc                   s"   g | ]}|  krj | qS r   )rQ   )rC   coordZaxisr]   rI   r   r   rE      s
    z5CommunicateTopology.get_axis_list.<locals>.<listcomp>)rJ   r]   rQ   rS   sort)rI   rZ   r]   ranksr   rh   r   get_axis_list   s   z!CommunicateTopology.get_axis_listc                 C  s    || j v sJ | j| j | S r=   )rJ   rK   r]   r^   r   r   r   get_dim_size   s   z CommunicateTopology.get_dim_size
fused_axislist[list[int]]c              	   C  s   t t| j|}g }|D ]}|t| j| j|  qg }|D ]}|t| j| j|  q#g }t| D ]>}i }g }	t	|D ]
\}
}||||
 < qDt| D ]}t	|D ]
\}
}||||
 < qY|	| j
| jdi |  qS||	 q:|S rb   )listsetrJ   
differenceappendrB   rK   r]   r   	enumeraterQ   rG   )rI   rm   Znon_fused_axisZnon_fused_rangesrZ   Zfused_rangesZ	rank_listZnon_fused_ranksZ
coord_dictrj   iZnon_fused_rankZfused_ranksZ
fused_rankr   r   r   get_fused_ranks   s.   z#CommunicateTopology.get_fused_ranksc              	     s    | j v sJ  fdd| j D }g }|D ]}| |}|t| qg }t| D ]6}i }|D ]}	|||	 ||	< q0g }
td|  D ]}|| < |
| j| jdi |  qF||
 q*|S )Nc                   s   g | ]}| kr|qS r   r   rC   namerZ   r   r   rE      s    z5CommunicateTopology.get_comm_list.<locals>.<listcomp>r   r   )rJ   rl   rr   rB   r   r]   rQ   rG   )rI   rZ   Zother_axis_namesrU   rw   Zdim_numZ
all_resultr>   Z	key_coordZ
other_nameresultrt   r   rx   r   get_comm_list   s&   

z!CommunicateTopology.get_comm_listglobal_rankkwargsc                 K  s.   |  |}|jdi | }| jdi |S rb   )rf   _replace_asdictrd   )rI   r{   r|   rg   tfr   r   r   get_rank_from_stage   s   
z'CommunicateTopology.get_rank_from_stageN)r8   r9   r:   r;   r   r<   )r   r9   )rZ   r[   r   r\   r   r\   )ra   r   r   r\   )re   r\   r   r   )rZ   r[   r]   r\   r   r;   )rm   r;   r   rn   )rZ   r[   r   rn   )r{   r\   r|   r   r   r\   )r)   r*   r+   rV   rX   r_   r`   rd   rf   rk   rl   ru   rz   r   r   r   r   r   r      s    









c                   @  s  e Zd Z	drdsdd	ZdtddZduddZdvddZ			dwdxddZ			dwdxddZdyddZ	dyd d!Z
dvd"d#Zdvd$d%Zdzd&d'Zdyd(d)Zdyd*d+Zdyd,d-Zdyd.d/Zd{d1d2Zdyd3d4Zd|d5d6Zdyd7d8Zdyd9d:Zd{d;d<Zdyd=d>Zdyd?d@ZdydAdBZdydCdDZdydEdFZdydGdHZdydIdJZd{dKdLZdydMdNZ d{dOdPZ!d}dRdSZ"dydTdUZ#dydVdWZ$dydXdYZ%d{dZd[Z&dyd\d]Z'd~dd`daZ(ddedfZ)d{dgdhZ*d{didjZ+dydkdlZ,	drddpdqZ-dS )r   Ntopologyr   hybrid_configsNCCLConfig_Message | Noner   r<   c                 C  s  t j | _t j | _|| _| jd| _| jd| _	| jd| _
| jd| _| jd| _|  | _|  | _|  | _|  | _|  | _|  skJ d| j d| j	 d| j d	| j
 d
| j d| j | jd|d urzt|d jdnd d\| _| _t jjt jdgddt jjj | jd d}t !|| r| jjdkr| jj"j#|d urt|d j$dnd d | jd|d urt|d j%dnd d\| _&| _'| jd|d urt|d j%dnd d\| _(| _)| jd|d urt|d j%dnd d\| _*| _+d | _,| jdkr| jd|d urt|d j%dnd d\| _,| _-| j.d|d ur.t|d j/dnd d\| _0| _1| jdkrT| j.d|d urLt|d j/dnd d\| _2| _3| jdkr| j4ddg|d urlt|d j%dnd d\| _5| _6| j4ddg|d urt|d j%d nd d\| _7| _8| jd!k| _9| j| j
d k| _:| j
dkrt j;j<= rt>  | ?  t@r| A  d"| j d#| j	 d$| j d%| j
 d&| j d'| j }|d(| j( d)| j* d*| j d+| j& d,| j, d-| j0 7 }tBC| | aDd S ).Nr2   r7   r3   r4   r5   znranks: z
, mp_num: z, sharding_num: z
, pp_num: z
, dp_num: z, sep_num: 
pp_configspp_collr#   r&   int32Zdtypeopgroup$FLAGS_eager_communication_connectionpp_p2p
dp_configsdp
mp_configstpsharding_configssep_configsZdp_checkZsharding_checkZdp_sep_configsZdp_sepZpp_tp_configsZpp_tpr   HybridParallelInfo: rank_id: , mp_degree: , sharding_degree: , pp_degree: , dp_degree: , sep_degree: , mp_group: ,  sharding_group: , pp_group: , dp_group: z, sep:group: , check/clip group: )Er   r   get_world_sizenranksrd   r{   _topor_   
_dp_degree
_mp_degree
_pp_degree_sharding_degree_sep_degree_get_data_parallel_id_data_parallel_id_get_model_parallel_id_model_parallel_id_get_sharding_parallel_id_sharding_parallel_id_get_sep_parallel_id_sep_parallel_id_get_pipe_parallel_idstage_id_check_valid_topo_set_comm_groupr   coll_nccl_config	_pp_group_pp_comm_group
all_reducezerosReduceOpSUM	get_flagsprocess_groupeager_connect_ring_exchangep2p_nccl_configr    	_dp_group_dp_comm_group	_mp_group_mp_comm_group_sharding_group_sharding_comm_group
_sep_group_sep_comm_group_set_check_groupcheck_nccl_config_check_group_check_comm_groupsharding_check_groupsharding_check_comm_groupcreate_fuse_groupZ_dp_sep_group_dp_sep_comm_groupZ_pp_mp_group_pp_mp_comm_groupis_first_stageis_last_stage	frameworkr   is_compiled_with_ncclr	   _set_p2p_prev_next_use_four_directions_set_four_directions_p2p_groupr   info_HYBRID_PARALLEL_GROUP)rI   r   r   env_name	debug_strr   r   r   rV     s  





2













8
zHybridCommunicateGroup.__init__Literal[0, 1, 2, 3, 4]c                 C  s   | j dkr| jdkr| jdkr| jdkr| jdkrtjS | j dkr3| jdkr3| jdkr3| jdkr3tjS | j dkrE| jdkrE| jdkrEtjS | j dkrR| jdkrRtj	S | j dkrZtj
S d S Nr&   )r   r   r   r   r   r%   r-   r0   r1   r.   r/   rH   r   r   r   get_parallel_mode  s(   












z(HybridCommunicateGroup.get_parallel_modeboolc                 C  s$   | j | j | j | j | j | jkS r=   )r   r   r   r   r   r   rH   r   r   r   r     s   z(HybridCommunicateGroup._check_valid_topoc                 C     | j dks	J dd S )Nr&   zsep not existr   rH   r   r   r   _check_sep_exist     z'HybridCommunicateGroup._check_sep_existr2   parallel_methodr[   topor    r"   tuple[list[int], Group]c           
      C  s   g }d }|d u r| j }||}|dkrtnd}|D ]}tjj|||d}	| j|v r.|}|	}qt|dks7J |d us=J t	dt| d| d ||fS )Nr3   r   rj   Znccl_comm_init_optionr    Total  z# comm group(s) create successfully!)
r   rz    g_pipeline_nccl_comm_init_optionr   r   	new_groupr{   rP   r   r   )
rI   r   r   r    parallel_groupparallel_comm_groupparallel_groupsgroup_nccl_comm_init_optionr   
comm_groupr   r   r   r     s2   

z&HybridCommunicateGroup._set_comm_groupc           
      C  s   g }d }|d u r| j }||}t|D ]}| j ||}tjj||d}	| j|v r.|}|	}qt|dks7J |d us=J ||fS )Nrj   r    r   )	r   r_   rB   rk   r   r   r   r{   rP   )
rI   r   r   r    r   r   Zparallel_sizeidxr   r   r   r   r   r     s"   

z'HybridCommunicateGroup._set_check_groupr\   c                 C     t | ds	J d| jS )N	next_rankznext_rank has not been inited)hasattrr   rH   r   r   r   _get_p2p_next_rank,     z)HybridCommunicateGroup._get_p2p_next_rankc                 C  r   )N	prev_rankzprev_rank has not been inited)r   r   rH   r   r   r   _get_p2p_prev_rank0  r   z)HybridCommunicateGroup._get_p2p_prev_rankc                 C  s|   | j d}|D ]3}t|| jksJ t|D ]#\}}|}||d | j  }||d | j  }| j|kr:|| _|| _qqd S )Nr3   r&   )r   rz   rP   r   rs   r{   r   r   )rI   
comm_lists
comm_ranksr   re   	curr_rankr   r   r   r   r   r   4  s   
z)HybridCommunicateGroup._set_p2p_prev_nextc           
      C  s  | j d}d | _d | _d | _d | _|D ]\}t|| jksJ t|D ]L\}}|}||d | j  }||d | j  }t	j
j||gd}| j|krM|| _n| j|krU|| _t	j
j||gd}	| j|krg|	| _q#| j|kro|	| _q#q| jd usxJ | jd usJ | jd usJ | jd usJ d S )Nr3   r&   )rj   )r   rz   send_next_groupsend_prev_grouprecv_next_grouprecv_prev_grouprP   r   rs   r   r   r   r{   )
rI   r   r   r   re   r   r   r   Z
next_groupZ
prev_groupr   r   r   r   B  s>   



z5HybridCommunicateGroup._set_four_directions_p2p_groupc                 C  rW   r=   )r   rH   r   r   r   r   g  rY   zHybridCommunicateGroup.topologyc                 C  rW   r=   )r{   rH   r   r   r   get_global_rankj  rY   z&HybridCommunicateGroup.get_global_rankc                 C     | j | jjS r=   )r   rf   r{   r2   rH   r   r   r   r   n     z,HybridCommunicateGroup._get_data_parallel_idc                 C  rW   r=   )r   rH   r   r   r   get_data_parallel_rankq  rY   z-HybridCommunicateGroup.get_data_parallel_rankc                 C  rW   r=   )r   rH   r   r   r   get_data_parallel_world_sizet  rY   z3HybridCommunicateGroup.get_data_parallel_world_sizer   c                 C  rW   r=   )r   rH   r   r   r   get_data_parallel_groupw  rY   z.HybridCommunicateGroup.get_data_parallel_groupc                 C     | j jd S Nr   )r   rj   rH   r   r   r    get_data_parallel_group_src_rankz     z7HybridCommunicateGroup.get_data_parallel_group_src_rankc                 C  r   r=   )r   rf   r{   r7   rH   r   r   r   r   ~  r  z-HybridCommunicateGroup._get_model_parallel_idc                 C  rW   r=   )r   rH   r   r   r   get_model_parallel_rank  rY   z.HybridCommunicateGroup.get_model_parallel_rankc                 C  rW   r=   )r   rH   r   r   r   get_model_parallel_world_size  rY   z4HybridCommunicateGroup.get_model_parallel_world_sizec                 C  rW   r=   )r   rH   r   r   r   get_model_parallel_group  rY   z/HybridCommunicateGroup.get_model_parallel_groupc                 C  r  r  )r   rj   rH   r   r   r   !get_model_parallel_group_src_rank  r  z8HybridCommunicateGroup.get_model_parallel_group_src_rankc                 C  r   r=   )r   rf   r{   r3   rH   r   r   r   r     r  z,HybridCommunicateGroup._get_pipe_parallel_idc                 C  rW   r=   )r   rH   r   r   r   get_stage_id  rY   z#HybridCommunicateGroup.get_stage_idc                 C  rW   r=   )r   rH   r   r   r   get_pipe_parallel_world_size  rY   z3HybridCommunicateGroup.get_pipe_parallel_world_sizec                 C  r   r=   )r   rf   r{   r5   rH   r   r   r   r     r  z+HybridCommunicateGroup._get_sep_parallel_idc                 C  rW   r=   )r   rH   r   r   r   get_sep_parallel_rank  rY   z,HybridCommunicateGroup.get_sep_parallel_rankc                 C  rW   r=   r   rH   r   r   r   get_sep_parallel_world_size  rY   z2HybridCommunicateGroup.get_sep_parallel_world_sizec                 C     |    | jS r=   )r   r   rH   r   r   r   get_sep_parallel_group     z-HybridCommunicateGroup.get_sep_parallel_groupc                 C     |    | jjd S r  )r   r   rj   rH   r   r   r   get_sep_parallel_group_src_rank     z6HybridCommunicateGroup.get_sep_parallel_group_src_rankc                 C  rW   r=   )r   rH   r   r   r   get_pipe_parallel_group  rY   z.HybridCommunicateGroup.get_pipe_parallel_group!tuple[Group, Group, Group, Group]c                 C  s    t sJ d| j| j| j| jfS )NzrIf you want to use four directions p2p group, set the environment variable PADDLE_USE_FOUR_DIRECTIONS_P2P to True.)r   r   r   r   r   rH   r   r   r   get_p2p_groups  s   z%HybridCommunicateGroup.get_p2p_groupsc                 C  r   r=   )r   rf   r{   r4   rH   r   r   r   r     r  z0HybridCommunicateGroup._get_sharding_parallel_idc                 C  rW   r=   )r   rH   r   r   r   get_sharding_parallel_rank  rY   z1HybridCommunicateGroup.get_sharding_parallel_rankc                 C  rW   r=   )r   rH   r   r   r    get_sharding_parallel_world_size  rY   z7HybridCommunicateGroup.get_sharding_parallel_world_sizec                 C  rW   r=   )r   rH   r   r   r   get_sharding_parallel_group  rY   z2HybridCommunicateGroup.get_sharding_parallel_groupc                 C  r  r  )r   rj   rH   r   r   r   $get_sharding_parallel_group_src_rank  s   z;HybridCommunicateGroup.get_sharding_parallel_group_src_rankFr4   c                 C     |r| j S | jS r=   )r   r   )rI   r4   r   r   r   get_check_parallel_group     z/HybridCommunicateGroup.get_check_parallel_groupr   r|   r   c                 K  s   | j j| jfd|i|S )Nr3   )r   r   r{   )rI   r   r|   r   r   r   r     s   z*HybridCommunicateGroup.get_rank_from_stagec                 C  r  r=   )r   r   rH   r   r   r   get_dp_sep_parallel_group  r  z0HybridCommunicateGroup.get_dp_sep_parallel_groupc                 C  r  r=   )r   r   rH   r   r   r   get_pp_mp_parallel_group  r  z/HybridCommunicateGroup.get_pp_mp_parallel_groupc                 C  s   dS r  r   rH   r   r   r   $get_moe_sharding_parallel_world_size  s   z;HybridCommunicateGroup.get_moe_sharding_parallel_world_sizefused_strategy_listr9   =tuple[list[list[int]], list[Group]] | tuple[list[int], Group]c                 C  s   t |dks
J dg }g }| j|}|  |D ]}tjj||d}| j|v r3|| || qt |dks<J t |dksDJ t	
dt | d| d t |dkr\||fS |d |d fS )Nr   z9the length of fused_strategy_list must be greater than 0.r   r   z comm group(s) of fused z create successfully!r&   )rP   r   ru   ri   r   r   r   r{   rr   r   r   )rI   r$  r    r   r   r   r   r   r   r   r   r     s.   


z(HybridCommunicateGroup.create_fuse_groupr=   )r   r   r   r   r   r<   )r   r   r   r   r   r<   )r2   NN)r   r[   r   r   r    r"   r   r   r   )r   r   r   r   )r   r[   )r   r  F)r4   r   r   r   )r   r\   r|   r   r   r\   )r$  r9   r    r"   r   r%  ).r)   r*   r+   rV   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r   r	  r
  r  r  r   r  r  r   r  r  r  r  r  r  r   r  r  r  r  r  r   r!  r"  r#  r   r   r   r   r   r     sj     
;
&

%




%





























c                   @  sj  e Zd Zg dg ddfdUddZdVddZdWddZdXddZdd Zdd Zdd Z	dXddZ
dXddZ	dXdYd d!Zd"d# Zd$d% Zd&d' Zd(d) ZdZd+d,Zd-d. ZdZd/d0ZdZd1d2Zd[d4d5ZdZd6d7Zd[d8d9ZdZd:d;Zd[d<d=ZdZd>d?ZdZd@dAZdZdBdCZd[dDdEZdZdFdGZdZdHdIZdZdJdKZ d[dLdMZ!dZdNdOZ"	Pd\dZdQdRZ#d\dZdSdTZ$dS )]EPHybridCommunicateGroup)r3   moe_shardingexpertr2   r4   r5   r6   r7   )r&   r&   r&   r&   r&   r&   r&   r&   Nr8   r9   r:   r;   r   r   r   r<   c                   s@  t j | _t j | _tt||  dd| _	 dd| _
 dd| _ dd| _ dd| _ dd| _ dd| _ dd| _d	 vrSd d	<  d	d| _g }g }t||D ]\}}|d
v ru|| || qcd|v r~d|v sJ t||| _ d  d   d< dd |D }|d}	|d|dkr||	d d ||	d n||	d d ||	d d  fdd|D }
|d|dk sJ dt||
| _ d  d	   d< g d} fdd|D }t||| _| | j_| | j_| | j_| j| _| | jd| _| | jd| _|  | _| | jd| _ | | jd	| _!| j"d| _#| | jd| _$| | jd| _%| | jd| _&| j| jksgJ d| j d| j d| jj'| jj'ksJ d| jj' d| jj' d| jdkr| jdksJ d| j d| j d| j(d| j|d urt)|d j*dnd d\| _+| _,t jj-t j.dgdd t jj/j0| j,d! d"}t 1|| r| j,jdkr| j,j2j3|d urt)|d j4d#nd d | j(d| j|d urt)|d$ j5d%nd d\| _6| _7| j(d| j|d urt)|d& j5dnd d\| _8| _9| j(d| j|d ur1t)|d' j5d(nd d\| _:| _;| j(d| j|d urKt)|d) j5dnd d\| _<| _=| j(d| j|d uret)|d* j5d+nd d\| _>| _?| j@| j|d ur~t)|d, j5dnd d\| _A| _B| jC| j|d urt)|d- j5d	nd d\| _D| _Ed | _Fd | _G| jdkr| jH| j|d urt)|d. j5d/nd d\| _F| _G| jI| j|d urt)|d0 j5dnd d\| _J| _K| L | _M| jNd| j|d urt)|d' jOd1nd d\| _P| _Q| jNd| j|d urt)|d& jOd2nd d\| _R| _S| j$d3k| _T| j$| jd k| _U| jdkr@t jVjWX r5tY  | Z  t[r@| \  d4| j d5| j d6| j d7| j d8| j d9| j d:| j d;| j	 d<| j
 }|d=| j> d>| jA d?| j+ d@| j: dA| j< dB| jD dC| jJ dD| jF dE| jP dF| j6 dG| j8 d7 }t]^| | a_d S )HNr,  r&   r+  r3   r2   r7   r4   r5   r6   )r3   r+  r,  dense_shardingc                 S  s   g | ]}|d vr|qS ))r+  r4   r,  r6   r   rv   r   r   r   rE   1  s
    z5EPHybridCommunicateGroup.__init__.<locals>.<listcomp>r
   c                      g | ]} | qS r   r   rv   Zdim_dictr   r   rE   @  rF   z%moe_sharding must be before sharding.cp_sharding)r0  r3   r6   r7   c                   r.  r   r   rv   r/  r   r   rE   P  rF   zMismatch moe_pp_degree:z, pp_degree:.zMismatch world_size:z, moe_world_size:zsep_degree z and dp_degree z must be 1 in MoE.r   r   r#   r   r   r   r   r   Z
ep_configsepZmoe_sharding_configsr   r   r   r   r   r   Z
cp_configsZcp_mp_configsZcp_mpZcp_sharding_configsZ
data_checkZmoe_sharding_checkr   r   r   r   r   r   r   z, cp_degree: z, ep_degree: z, moe_sharding_degree: r   r   r   r   z, sep_group: z, cp_group: z, cp_sharding_group: z, cp_mp_group: r   z, ep_group: z, moe_sharding_group: )`r   r   r   r   rd   r{   r   rO   get
_ep_degree_moe_sharding_degreeZ_moe_pp_degreer   r   r   r   r   
_cp_degreerr   r   Z	_moe_topor]   insert_dense_topoZ_cp_topoZ_parent_hcgr   _get_parallel_idr   r   r   r   r   _cp_parallel_idr_   _cp_sharding_degreer   _expert_parallel_id_moe_sharding_parallel_idrN   r   r   r   r   r   r   r   r   r   r   r   r   r   r    Z	_ep_group_ep_comm_groupZ_moe_sharding_group_moe_sharding_comm_groupr   r   r   r   r   r   build_sharding_groupr   r   build_context_group	_cp_group_cp_comm_groupZ_cp_mp_group_cp_mp_comm_groupbuild_cp_mp_fuse_groupbuild_context_sharding_group_cp_sharding_group_cp_sharding_comm_group_get_cp_sharding_parallel_id_cp_sharding_parallel_idr   r   r   r   r   r   r   r   r   r   r   r	   r   r   r   r   r   r   )rI   r8   r:   r   Zmoe_hybrid_group_namesZmoe_dimsrw   dimZdense_group_namesZpipe_idxZ
dense_dimsZcp_group_namesZcp_dimsr   r   r   r/  r   rV     s  






























b
z!EPHybridCommunicateGroup.__init__r   c                 C  s8   | j | j | j | j | j | jko| jdkp| jdkS r   )r   r   r   r   r   r   r6  rH   r   r   r   r   G  s   z*EPHybridCommunicateGroup._check_valid_topoc                 C  r   )Nr&   zcp not existr6  rH   r   r   r   _check_cp_existQ  r   z(EPHybridCommunicateGroup._check_cp_existc           	      C  s   g }d }|  |dd}d}|D ]}tjj|||d}| j|v r#|}|}qt|dks,J |d us2J tdt| d ||fS )Nr+  r-  r   r   r   z, sharding comm group(s) create successfully!)merge_inner_comm_listr   r   r   r{   rP   r   r   )	rI   r   r    r   r   r   r   r   r   r   r   r   r@  T  s,   
z-EPHybridCommunicateGroup.build_sharding_groupc                 C  s   |  |dd}g }|D ]4}t|| j | jks$J dt| d| j t| jD ]}||| j |d | j  }|| q)q|S )Nr+  r-  sharding comm list ) size must divided by cp_sharding_degree r&   rN  rP   r;  r6  rB   rr   rI   r   sharding_comm_listZcontext_comm_listrj   rt   Z	sub_ranksr   r   r   split_context_comm_listp  s   z0EPHybridCommunicateGroup.split_context_comm_listc                 C  sx   |  |dd}g }|D ].}t|| j | jks$J dt| d| j t| jD ]}||d | j }|| q)q|S )Nr+  r-  rO  rP  rQ  rR  r   r   r    split_context_sharding_comm_list  s   z9EPHybridCommunicateGroup.split_context_sharding_comm_listc                 C  s   | d}| |}G dd d}| }|| D ]}t|dkr6|d }tdt|D ]
}||||  q+q| }	|	D ]}
|
  q=|	jdd d |	S )	Nr7   c                   @  s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
zREPHybridCommunicateGroup.fuse_context_tensor_parallel_comm_list.<locals>.UnionFindc                 S  s   i | _ i | _d S r=   )parentre   rH   r   r   r   rV     s   
z[EPHybridCommunicateGroup.fuse_context_tensor_parallel_comm_list.<locals>.UnionFind.__init__c                 S  sP   || j vr|| j |< d| j|< |S | j | |kr#| | j | | j |< | j | S r  )rV  re   find)rI   r>   r   r   r   rW    s   



zWEPHybridCommunicateGroup.fuse_context_tensor_parallel_comm_list.<locals>.UnionFind.findc                 S  sx   |  ||  |}}||krd S | j| | j| k r ||}}|| j|< | j| | j| kr:| j|  d7  < d S d S r   )rW  re   rV  )rI   r>   r?   Zpxpyr   r   r   union  s   

zXEPHybridCommunicateGroup.fuse_context_tensor_parallel_comm_list.<locals>.UnionFind.unionc                 S  sD   i }| j D ]}| |}||vrg ||< || | qt| S r=   )rV  rW  rr   ro   rR   )rI   
componentsnoderootr   r   r   get_components  s   

zaEPHybridCommunicateGroup.fuse_context_tensor_parallel_comm_list.<locals>.UnionFind.get_componentsN)r)   r*   r+   rV   rW  rY  r]  r   r   r   r   	UnionFind  s
    
r^  r&   r   c                 S  s   | d S r  r   )r>   r   r   r   r@     rA   zQEPHybridCommunicateGroup.fuse_context_tensor_parallel_comm_list.<locals>.<lambda>)rc   )rz   rT  rP   rB   rY  r]  ri   )rI   r   Zmp_comm_listZcp_comm_listr^  Zufr   firstrt   Zcp_tp_comm_list	componentr   r   r   &fuse_context_tensor_parallel_comm_list  s   

$
z?EPHybridCommunicateGroup.fuse_context_tensor_parallel_comm_listc           	      C  t   d}|  |}|D ]}tjj|||d}| j|v r|}|}q	t|dks&J |d us,J td| j d ||fS )Nr   r   r   z4 context parallel comm group(s) create successfully!)	rT  r   r   r   r{   rP   r   r   r6  	rI   r   r    r   r   r   r   r   r   r   r   r   rA    $   

z,EPHybridCommunicateGroup.build_context_groupc           	      C  rb  )Nr   r   r   z= context sharding parallel comm group(s) create successfully!)	rU  r   r   r   r{   rP   r   r   r;  rc  r   r   r   rF    rd  z5EPHybridCommunicateGroup.build_context_sharding_groupr%  c           	      C  sN   d}|  |}|D ]}tjj|||d}| j|v r|}|}q	td ||fS )Nr   r   z9Fused context & model parallel group create successfully!)ra  r   r   r   r{   r   r   rc  r   r   r   rE    s   


z/EPHybridCommunicateGroup.build_cp_mp_fuse_groupc                 C  s   |j |}|j |}||}t||j|  }t|j|d d |j|  }|dkr3|dks5J g }	t|D ]9}
g }t|j| D ](}|
||  t|k sdJ d|
 d| d| dt| |||
||   7 }qF|	| q;|	S )a$  
        merge all inner communication list whose rank-id are in
        the same outer communication list. E.g.:
          outer_comm_list: [[0, 4], [1, 5]]
          inner_comm_list: [[0, 2], [1, 3], [4, 6], [5, 7]]
          => merged_inner_comm_list: [[0, 2, 4, 6], [1, 3, 5, 7]]
        r&   Nr   z+Unexpected error in merge_inner_comm_list, z, )	rJ   r]   rz   rP   rK   mathprodrB   rr   )rI   r   Z
outer_nameZ
inner_nameZ
inner_axisZ
outer_axisZinner_comm_listZnum_merged_groupsintervalZmerged_comm_listrt   Zcommjr   r   r   rN    s"   
 z.EPHybridCommunicateGroup.merge_inner_comm_listc                 C  sv   t |}t |d }|d }d}|dkr9||k r9|| | }||kr$|S ||k r-|d7 }n|d8 }|dkr9||k sd S )Nr   r&   )rP   )rI   	comm_listr{   rowscolsrccurrentr   r   r   find_col_idx  s   
	z%EPHybridCommunicateGroup.find_col_idxc                 C  s(   | |}| || j}|d usJ |S r=   )rz   ro  r{   )rI   r   Zparallel_typeri  parallel_idr   r   r   r9  0  s   
z)EPHybridCommunicateGroup._get_parallel_idc                 C  s.   |  | jdd}| || j}|d usJ |S )Nr+  r-  )rN  r8  ro  r{   )rI   rS  rp  r   r   r   r   6  s   z2EPHybridCommunicateGroup._get_sharding_parallel_idr\   c                 C     | j | jS r=   )rB  r]   r{   rH   r   r   r   _get_context_parallel_id>     z1EPHybridCommunicateGroup._get_context_parallel_idc                 C  rq  r=   )rG  r]   r{   rH   r   r   r   rI  A  rs  z5EPHybridCommunicateGroup._get_cp_sharding_parallel_idc                 C  rW   r=   )r:  rH   r   r   r   get_context_parallel_rankD  rY   z2EPHybridCommunicateGroup.get_context_parallel_rankc                 C  rW   r=   rL  rH   r   r   r   get_context_parallel_world_sizeG  rY   z8EPHybridCommunicateGroup.get_context_parallel_world_sizer   c                 C  r  r=   )rM  rC  rH   r   r   r   get_context_parallel_groupJ  r  z3EPHybridCommunicateGroup.get_context_parallel_groupc                 C  r  r  )rM  rC  rj   rH   r   r   r   #get_context_parallel_group_src_rankN  r  z<EPHybridCommunicateGroup.get_context_parallel_group_src_rankc                 C  r  r=   )rM  rH  rH   r   r   r   get_cp_sharding_parallel_groupR  r  z7EPHybridCommunicateGroup.get_cp_sharding_parallel_groupc                 C  r  r  )rM  rH  rj   rH   r   r   r   'get_cp_sharding_parallel_group_src_rankV  r  z@EPHybridCommunicateGroup.get_cp_sharding_parallel_group_src_rankc                 C  r  r=   )rM  rD  rH   r   r   r   get_cp_mp_parallel_groupZ  r  z1EPHybridCommunicateGroup.get_cp_mp_parallel_groupc                 C  r  r  )rM  rD  rj   rH   r   r   r   !get_cp_mp_parallel_group_src_rank^  r  z:EPHybridCommunicateGroup.get_cp_mp_parallel_group_src_rankc                 C  rW   r=   )r<  rH   r   r   r   get_expert_parallel_rankb  rY   z1EPHybridCommunicateGroup.get_expert_parallel_rankc                 C  rW   r=   )r4  rH   r   r   r   get_expert_parallel_world_sizee  rY   z7EPHybridCommunicateGroup.get_expert_parallel_world_sizec                 C  rW   r=   )r>  rH   r   r   r   get_expert_parallel_grouph  rY   z2EPHybridCommunicateGroup.get_expert_parallel_groupc                 C  r  r  )r>  rj   rH   r   r   r   "get_expert_parallel_group_src_rankk  r  z;EPHybridCommunicateGroup.get_expert_parallel_group_src_rankc                 C  rW   r=   )r=  rH   r   r   r   get_moe_sharding_parallel_rankn  rY   z7EPHybridCommunicateGroup.get_moe_sharding_parallel_rankc                 C  rW   r=   )r5  rH   r   r   r   r#  q  rY   z=EPHybridCommunicateGroup.get_moe_sharding_parallel_world_sizec                 C  rW   r=   )r?  rH   r   r   r   get_moe_sharding_parallel_groupt  rY   z8EPHybridCommunicateGroup.get_moe_sharding_parallel_groupc                 C  r  r  )r?  rj   rH   r   r   r   (get_moe_sharding_parallel_group_src_rankw  r  zAEPHybridCommunicateGroup.get_moe_sharding_parallel_group_src_rankFc                 C  r  r=   )r;  r   rI   Zwith_context_parallelr   r   r   r  z  s   z9EPHybridCommunicateGroup.get_sharding_parallel_world_sizec                 C  r  r=   )rJ  r   r  r   r   r   r    r   z3EPHybridCommunicateGroup.get_sharding_parallel_rank)r8   r9   r:   r;   r   r   r   r<   r&  r'  r=   )r   r%  r   r(  r)  )%r)   r*   r+   rV   r   rM  r@  rT  rU  ra  rA  rF  rE  rN  ro  r9  r   rr  rI  rt  ru  rv  rw  rx  ry  rz  r{  r|  r}  r~  r  r  r#  r  r  r  r  r   r   r   r   r*    sT    
  
G



7

















r*  c                   @  s@   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dS )_CommunicateGroupztmp for staticc                 C  s   | a i | _d S r=   )r   groupsrH   r   r   r   rV     s   
z_CommunicateGroup.__init__c                 C  s    t jj|||}|| j|< d S r=   )r   r   r   r   r  )rI   
group_nameZ
group_rankZ
group_sizeZring_idZgroup_ranksr   r   r   r   set_comm_group  s   z _CommunicateGroup.set_comm_groupc                 C  s   || j v sJ | j | S r=   )r  )rI   r  r   r   r   	get_group  s   
z_CommunicateGroup.get_groupc                 C  s
   |  dS Nr7   )r  rH   r   r   r   r    s   
z*_CommunicateGroup.get_model_parallel_groupc                 C     |  djS r  )r  r   rH   r   r   r   r
    r  z/_CommunicateGroup.get_model_parallel_world_sizec                 C  r  r  )r  re   rH   r   r   r   r	    r  z)_CommunicateGroup.get_model_parallel_rankN)
r)   r*   r+   r,   rV   r  r  r  r
  r	  r   r   r   r   r    s    r  )NN)r   r   r   r   r   r   r=   )r    r!   r   r"   ))
__future__r   rL   re  os	functoolsr   	itertoolsr   typingr   r   r   r   Z7paddle.distributed.fleet.proto.distributed_strategy_pb2r   r   Z#paddle.distributed.utils.nccl_utilsr	   Zutils.log_utilr   Zpaddle.base.libpaddleZpaddle.distributed.collectiver   __all__r   environr3  baser   Zis_compiled_with_xpur   r\   r   r   r$   r%   r   r   r*  r  r   r   r   r   <module>   sR   )x         