o
    0 i                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 z
d dlmZ dZW n ey9   dZY nw ejrfejejejejejejejejejejejejejd	Zejejejejd
Zni Zi ZdddZG dd deZ G dd dZ!dd Z"dd Z#G dd dZ$dS )    N)nccl)_store)_Backend)sparse)MPITF)bBiIlLqQefdFD)sumprodmaxminc                 C   sT   | j j}|tvrtd| j  dt| }|d u r| j}|dv r&|d| fS ||fS )NUnknown dtype 	 for NCCLFD   )dtypechar_nccl_dtypes	TypeErrorsize)arraycountr   Z
nccl_dtype r#   h/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/cupyx/distributed/_nccl_comm.py_get_nccl_dtype_and_count.   s   r%   c                       s   e Zd ZdZejejdf fdd	Zdd Zdd Z	d	d
 Z
dd Zdd Zdd Zd,ddZd-ddZd.ddZ	d,ddZd/ddZd/ddZd/d d!Zd/d"d#Zd.d$d%Zd.d&d'Zd/d(d)Zd*d+ Z  ZS )0NCCLBackenda  Interface that uses NVIDIA's NCCL to perform communications.

    Args:
        n_devices (int): Total number of devices that will be used in the
            distributed execution.
        rank (int): Unique id of the GPU that the communicator is associated to
            its value needs to be `0 <= rank < n_devices`.
        host (str, optional): host address for the process rendezvous on
            initialization. Defaults to `"127.0.0.1"`.
        port (int, optional): port used for the process rendezvous on
            initialization. Defaults to `13333`.
        use_mpi(bool, optional): switch between MPI and use the included TCP
            server for initialization & synchronization. Defaults to `False`.
    Fc                    sF   t  |||| to|| _| jr| || d S | |||| d S N)super__init___mpi_available_use_mpi_init_with_mpi_init_with_tcp_store)self	n_devicesrankhostportZuse_mpi	__class__r#   r$   r)   J   s
   
zNCCLBackend.__init__c                 C   sX   t j| _| j | _| j  d }| jdkrt }| jj|dd}t	|||| _
d S )Nr   root)r   Z
COMM_WORLD	_mpi_commZGet_rankZ	_mpi_rankBarrierr   get_unique_idbcastNcclCommunicator_comm)r.   r/   r0   nccl_idr#   r#   r$   r,   T   s   

zNCCLBackend._init_with_mpic                 C   s   d }|dkr%| j || t }tdd |D }|| jd< | j  n| j  | jd }tdd |D }t|||| _	d S )Nr   c                 S   s   g | ]}|d  qS    r#   .0r   r#   r#   r$   
<listcomp>i   s    z4NCCLBackend._init_with_tcp_store.<locals>.<listcomp>r=   c                 S   s   g | ]}t |d  qS r>   )intr@   r#   r#   r$   rB   o   s    )
r   runr   r9   bytes_store_proxybarriertupler;   r<   )r.   r/   r0   r1   r2   r=   Zshifted_nccl_idr#   r#   r$   r-   a   s   


z NCCLBackend._init_with_tcp_storec                 C   s    |j js|j jstdd S d S )Nz4NCCL requires arrays to be either c- or f-contiguous)flagsc_contiguousf_contiguousRuntimeError)r.   r!   r#   r#   r$   _check_contiguousr   s
   zNCCLBackend._check_contiguousc                 C   s   |d u r
t jj }|jS r'   )cupycudastreamZget_current_streamptr)r.   rP   r#   r#   r$   _get_streamw   s   zNCCLBackend._get_streamc                 C   s8   |t vrtd| d|dv r|dkrtdt | S )NzUnknown op r   r   r   z-Only nccl.SUM is supported for complex arrays)	_nccl_opsrL   
ValueError)r.   opr   r#   r#   r$   _get_op|   s   zNCCLBackend._get_opc                 C   sT   t }t|d ttfrt|d d st|d rt}t||| g|R   d S Nr   )_DenseNCCLCommunicator
isinstancelistrH   r   issparse_SparseNCCLCommunicatorgetattr)r.   functionargsZ
comm_classr#   r#   r$   _dispatch_arg_type   s   zNCCLBackend._dispatch_arg_typer   Nc                 C      |  d||||f dS )a  Performs an all reduce operation.

        Args:
            in_array (cupy.ndarray): array to be sent.
            out_array (cupy.ndarray): array where the result with be stored.
            op (str): reduction operation, can be one of
                ('sum', 'prod', 'min' 'max'), arrays of complex type only
                support `'sum'`. Defaults to `'sum'`.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        
all_reduceNr`   )r.   in_array	out_arrayrU   rP   r#   r#   r$   rb      s   zNCCLBackend.all_reducer   c                 C      |  d|||||f dS )a  Performs a reduce operation.

        Args:
            in_array (cupy.ndarray): array to be sent.
            out_array (cupy.ndarray): array where the result with be stored.
                will only be modified by the `root` process.
            root (int, optional): rank of the process that will perform the
                reduction. Defaults to `0`.
            op (str): reduction operation, can be one of
                ('sum', 'prod', 'min' 'max'), arrays of complex type only
                support `'sum'`. Defaults to `'sum'`.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        reduceNrc   )r.   rd   re   r6   rU   rP   r#   r#   r$   rg      s   zNCCLBackend.reducec                 C      |  d|||f dS )a  Performs a broadcast operation.

        Args:
            in_out_array (cupy.ndarray): array to be sent for `root` rank.
                Other ranks will receive the broadcast data here.
            root (int, optional): rank of the process that will send the
                broadcast. Defaults to `0`.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        	broadcastNrc   )r.   in_out_arrayr6   rP   r#   r#   r$   ri      s   
zNCCLBackend.broadcastc                 C   rf   )a/  Performs a reduce scatter operation.

        Args:
            in_array (cupy.ndarray): array to be sent.
            out_array (cupy.ndarray): array where the result with be stored.
            count (int): Number of elements to send to each rank.
            op (str): reduction operation, can be one of
                ('sum', 'prod', 'min' 'max'), arrays of complex type only
                support `'sum'`. Defaults to `'sum'`.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        reduce_scatterNrc   )r.   rd   re   r"   rU   rP   r#   r#   r$   rk      s   zNCCLBackend.reduce_scatterc                 C   ra   )as  Performs an all gather operation.

        Args:
            in_array (cupy.ndarray): array to be sent.
            out_array (cupy.ndarray): array where the result with be stored.
            count (int): Number of elements to send to each rank.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        
all_gatherNrc   )r.   rd   re   r"   rP   r#   r#   r$   rl      s   
zNCCLBackend.all_gatherc                 C   rh   )a  Performs a send operation.

        Args:
            array (cupy.ndarray): array to be sent.
            peer (int): rank of the process `array` will be sent to.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        sendNrc   )r.   r!   peerrP   r#   r#   r$   rm         	zNCCLBackend.sendc                 C   rh   )a2  Performs a receive operation.

        Args:
            array (cupy.ndarray): array used to receive data.
            peer (int): rank of the process `array` will be received from.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        recvNrc   )r.   re   rn   rP   r#   r#   r$   rp      ro   zNCCLBackend.recvc                 C   ra   )a  Performs a send and receive operation.

        Args:
            in_array (cupy.ndarray): array to be sent.
            out_array (cupy.ndarray): array used to receive data.
            peer (int): rank of the process to send `in_array` and receive
                `out_array`.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        	send_recvNrc   )r.   rd   re   rn   rP   r#   r#   r$   rq         zNCCLBackend.send_recvc                 C   ra   )a  Performs a scatter operation.

        Args:
            in_array (cupy.ndarray): array to be sent. Its shape must be
                `(total_ranks, ...)`.
            out_array (cupy.ndarray): array where the result with be stored.
            root (int): rank that will send the `in_array` to other ranks.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        scatterNrc   r.   rd   re   r6   rP   r#   r#   r$   rs      rr   zNCCLBackend.scatterc                 C   ra   )a  Performs a gather operation.

        Args:
            in_array (cupy.ndarray): array to be sent.
            out_array (cupy.ndarray): array where the result with be stored.
                Its shape must be `(total_ranks, ...)`.
            root (int): rank that will receive `in_array` from other ranks.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        gatherNrc   rt   r#   r#   r$   ru     rr   zNCCLBackend.gatherc                 C   rh   )a  Performs an all to all operation.

        Args:
            in_array (cupy.ndarray): array to be sent. Its shape must be
                `(total_ranks, ...)`.
            out_array (cupy.ndarray): array where the result with be stored.
                Its shape must be `(total_ranks, ...)`.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        
all_to_allNrc   )r.   rd   re   rP   r#   r#   r$   rv     s   
zNCCLBackend.all_to_allc                 C   s"   | j r
| j  dS | j  dS )zPerforms a barrier operation.

        The barrier is done in the cpu and is a explicit synchronization
        mechanism that halts the thread progression.
        N)r+   r7   r8   rF   rG   )r.   r#   r#   r$   rG   *  s   zNCCLBackend.barrierr   Nr   r   Nr   Nr'   )__name__
__module____qualname____doc__r   Z_DEFAULT_HOSTZ_DEFAULT_PORTr)   r,   r-   rM   rR   rV   r`   rb   rg   ri   rk   rl   rm   rp   rq   rs   ru   rv   rG   __classcell__r#   r#   r3   r$   r&   :   s0    












r&   c                   @   s   e Zd ZedddZedddZed dd	Ze	dd
dZed!ddZed!ddZ	ed!ddZ
ed!ddZed!ddZed!ddZed ddZed ddZed!ddZdS )"rX   r   Nc                 C   s\   | | | | ||}t|\}}|||jj}|j|jj	|jj	|||| d S r'   )
rM   rR   r%   rV   r   r   r<   Z	allReducedatarQ   )clscommrd   re   rU   rP   r   r"   r#   r#   r$   rb   :  s   


z!_DenseNCCLCommunicator.all_reducer   c           	   	   C   sh   | | |j|kr| | ||}t|\}}|||jj}|j|j	j
|j	j
||||| d S r'   )rM   r0   rR   r%   rV   r   r   r<   rg   r   rQ   )	r   r   rd   re   r6   rU   rP   r   r"   r#   r#   r$   rg   D  s   




z_DenseNCCLCommunicator.reducec                 C   sB   | | ||}t|\}}|j|jj|jj|||| d S r'   )rM   rR   r%   r<   ri   r   rQ   )r   r   rj   r6   rP   r   r"   r#   r#   r$   ri   P  s   

z _DenseNCCLCommunicator.broadcastc                 C   s^   | | | | ||}t||\}}|||jj}|j|jj	|jj	|||| d S r'   )
rM   rR   r%   rV   r   r   r<   ZreduceScatterr   rQ   )r   r   rd   re   r"   rU   rP   r   r#   r#   r$   rk   Y  s   


z%_DenseNCCLCommunicator.reduce_scatterc                 C   sL   | | | | ||}t||\}}|j|jj|jj||| d S r'   )rM   rR   r%   r<   Z	allGatherr   rQ   )r   r   rd   re   r"   rP   r   r#   r#   r$   rl   d  s   


z!_DenseNCCLCommunicator.all_gatherc                 C   8   | | ||}t|\}}| |||||| d S r'   )rM   rR   r%   _send)r   r   r!   rn   rP   r   r"   r#   r#   r$   rm   m     

z_DenseNCCLCommunicator.sendc                 C      |j |jj|||| d S r'   )r<   rm   r   rQ   r   r   r!   rn   r   r"   rP   r#   r#   r$   r   t     z_DenseNCCLCommunicator._sendc                 C   r   r'   )rM   rR   r%   _recv)r   r   re   rn   rP   r   r"   r#   r#   r$   rp   x  r   z_DenseNCCLCommunicator.recvc                 C   r   r'   )r<   rp   r   rQ   r   r   re   rn   r   r"   rP   r#   r#   r$   r     r   z_DenseNCCLCommunicator._recvc           
      C   sr   | | | | ||}t|\}}t|\}}	t  | |||||| | |||||	| t  d S r'   )rM   rR   r%   r   
groupStartr   r   groupEnd)
r   r   rd   re   rn   rP   idtypeicountodtypeocountr#   r#   r$   rq     s   


z _DenseNCCLCommunicator.send_recvc              	   C   s   |j d |jkrtd|j d|j  || || ||}t  ||jkrHt|jD ]}|| }t	|\}}	| 
|||||	| q1t	|\}
}| ||||
|| t  d S )Nr   z"scatter requires in_array to have 'elements in its first dimension, found )shape
_n_devicesrL   rM   rR   r   r   r0   ranger%   r   r   r   )r   r   rd   re   r6   rP   r	   r!   r   r   r   r"   r#   r#   r$   rs     s$   




z_DenseNCCLCommunicator.scatterc              	   C   s   |j d |jkrtd|j d|j  || || ||}t  ||jkrHt|jD ]}|| }t	|\}}	| 
|||||	| q1t	|\}
}| ||||
|| t  d S )Nr   z"gather requires out_array to have r   )r   r   rL   rM   rR   r   r   r0   r   r%   r   r   r   )r   r   rd   re   r6   rP   r	   r!   r   r   r   r"   r#   r#   r$   ru     s$   




z_DenseNCCLCommunicator.gatherc           
   	   C   s   |j d |jkrtd|j d|j  |j d |jkr(td|j d|j  || || ||}t|d \}}t|d \}}t  t|jD ]}	| 	|||	 |	||| | 
|||	 |	||| qPt  d S )Nr   %all_to_all requires in_array to have r   z&all_to_all requires out_array to have )r   r   rL   rM   rR   r%   r   r   r   r   r   r   )
r   r   rd   re   rP   r   r   r   r   r	   r#   r#   r$   rv     s,   




z!_DenseNCCLCommunicator.all_to_allrw   rx   ry   r'   )rz   r{   r|   classmethodrb   rg   ri   rk   rl   rm   r   rp   r   rq   rs   ru   rv   r#   r#   r#   r$   rX   8  s8    	
rX   c                 C   s   t d| }t dd}t dd}|dkr tj|||fddS |dkr.tj|||fddS |dkr=tj|||ffddS td)	N   r	   csr)r   r   )r   csccoo4NCCL is not supported for this type of sparse matrix)rN   emptyr   Z
csr_matrixZ
csc_matrixZ
coo_matrixr   )r   Zsparse_typer   ar   r#   r#   r$   _make_sparse_empty  s   r   c                 C   s2   t | rdS t | rdS t | rdS td)Nr   r   r   r   )r   isspmatrix_cooisspmatrix_csrisspmatrix_cscr   )matrixr#   r#   r$   _get_sparse_type  s   


r   c                   @   s   e Zd Zedd Zedd Zedd Zdd Zed&ddZed'ddZ	ed(ddZ
e	
d&ddZed)ddZed)ddZed)ddZed)ddZed)ddZed)ddZed(d d!Zed(d"d#Zed)d$d%Zd
S )*r\   c                 C   sN   t |r|  |j|j|jfS t |st |r#|j|j|j	fS t
d)Nr   )r   r   Zsum_duplicatesr   rowcolr   r   indptrindicesr   )r   r!   r#   r#   r$   _get_internal_arrays  s   
z,_SparseNCCLCommunicator._get_internal_arraysc                 C   s   |t dd |D  }|S )Nc                 s   s    | ]}|j V  qd S r'   )r    )rA   r   r#   r#   r$   	<genexpr>  s    z?_SparseNCCLCommunicator._get_shape_and_sizes.<locals>.<genexpr>)rH   )r   arraysr   sizes_shaper#   r#   r$   _get_shape_and_sizes  s   z,_SparseNCCLCommunicator._get_shape_and_sizesc                 C   sz  |j r|dkrtj|dd}|jj||dd d S |dkr/tjddd}|jj||dd |S |d	krQ|j|kr@tj|dd}ntjddd}|jj||d
 |S |dkrptj|dd}tj|j	dgdd}|j
||| |S |dkrtj|dd}tj|j	dgdd}|j|| |S tdtd |dkrtj|dd}| ||||jd| d S |dkrtjddd}| ||||jd| t|S |d	kr|j|krtj|dd}ntjddd}tj||||d t|S |dkrtj|dd}tj|j	dfdd}tj|||||d t|S |dkr9tj|dd}tj|j	dfdd}tj||||d t|S td)Nrm   r   r   r   )desttagrp      )sourcer   r:   r5   ru   alltoallzUnsupported methodzUsing NCCL for transferring sparse arrays metadata. This will cause device synchronization and a huge performance degradation. Please install MPI and `mpi4py` in order to avoid this issue.)r6   rP   )rP   )r+   numpyr!   r7   ZSendr   ZRecvr0   ZBcastr   ZGatherZAlltoallrL   warningswarnrN   r   r   r   ZasnumpyrX   ri   ru   rv   )r   r   rn   r   methodrP   Zrecv_bufr#   r#   r$   _exchange_shape_and_sizes  s   








z1_SparseNCCLCommunicator._exchange_shape_and_sizesc                 C   s~   t | r|d | _|d | _|d | _t|| _d S t | s%t | r;|d | _|d | _	|d | _
t|| _d S td)Nr   r   r   r   )r   r   r   r   r   rH   _shaper   r   r   r   r   )r   r   r   r#   r#   r$   _assign_arraysH  s   






z&_SparseNCCLCommunicator._assign_arraysr   Nc                 C   s,   d}|  |||||| | |||| d S rW   )rg   ri   )r   r   rd   re   rU   rP   r6   r#   r#   r$   rb   W  s   z"_SparseNCCLCommunicator.all_reducer   c              
   C   sf  |  |}| ||j}| |||d|}|j|krt|t|kr&td|}	t|jt|}
t	|D ]V\}}t
|dd }|dd  }dd t||D }||krt  |D ]}| ||||j|j| qZt  | |
|| |dkr}|	|
 }	q4|dkr|	|
 }	q4td	q4| ||  |	|	j d S t  |D ]}| ||||j|j| qt  d S )
Nru   z.in_array and out_array must be the same formatr   r   c                 S       g | ]\}}t j||jd qS r   rN   r   r   rA   sr   r#   r#   r$   rB   q      z2_SparseNCCLCommunicator.reduce.<locals>.<listcomp>r   r   z.Sparse matrix only supports sum/prod reduction)r   r   r   r   r0   r   rT   r   r   	enumeraterH   zipr   r   r   r    r   r   r   )r   r   rd   re   r6   rU   rP   r   shape_and_sizesresultpartialrn   ssr   sizesr   r#   r#   r$   rg   _  sV   





z_SparseNCCLCommunicator.reducec           
      C   s   |  |}|j|kr| ||j}nd}| |||d|}t|dd }|dd  }|j|kr:dd t||D }t  |D ]
}	t	
||	|| q@t  | ||| d S )Nr#   r:   r   r   c                 S   r   r   r   r   r#   r#   r$   rB     r   z5_SparseNCCLCommunicator.broadcast.<locals>.<listcomp>)r   r0   r   r   r   rH   r   r   r   rX   ri   r   r   )
r   r   rj   r6   rP   r   r   r   r   r   r#   r#   r$   ri     s(   



z!_SparseNCCLCommunicator.broadcastc              	   C   sl   d}g }t |ttfstd|D ]}	t|	jt|	}
| ||	|
||| ||
 q| 	||||| d S )Nr   z5in_array must be a list or a tuple of sparse matrices)
rY   rZ   rH   rT   r   r   r   rg   appendrs   )r   r   rd   re   r"   rU   rP   r6   Zreduce_out_arrayss_mZpartial_out_arrayr#   r#   r$   rk     s   
z&_SparseNCCLCommunicator.reduce_scatterc           	         sd   d}g }|  | ||| |j|kr fddt|jD }|D ]}| |||| || q d S )Nr   c                    s   g | ]
}t  jt qS r#   )r   r   r   )rA   _rd   r#   r$   rB     s    z6_SparseNCCLCommunicator.all_gather.<locals>.<listcomp>)ru   r0   r   r   ri   r   )	r   r   rd   re   r"   rP   r6   Zgather_out_arraysZarrr#   r   r$   rl     s   

z"_SparseNCCLCommunicator.all_gatherc              	   C   s`   |  |}| ||j}| |||d| t  |D ]}| ||||j|j| qt	  d S )Nrm   )
r   r   r   r   r   r   r   r   r    r   )r   r   r!   rn   rP   r   r   r   r#   r#   r$   rm     s   

z_SparseNCCLCommunicator.sendc                 C   sT   |j j}|tvrtd|j  dt|\}}||}|j|jj	|||| d S Nr   r   )
r   r   r   r   r%   rR   r<   rm   r   rQ   r   r#   r#   r$   r     s   
z_SparseNCCLCommunicator._sendc              	   C   s   |  ||dd|}| |}t|dd }|dd  }dd t||D }	t  |	D ]}
| ||
||
j|
j| q,t	  | 
||	| d S )Nr#   rp   r   r   c                 S   r   r   r   r   r#   r#   r$   rB     s     z0_SparseNCCLCommunicator.recv.<locals>.<listcomp>)r   r   rH   r   r   r   r   r   r    r   r   )r   r   re   rn   rP   r   r   r   r   Zarrsr   r#   r#   r$   rp     s   

z_SparseNCCLCommunicator.recvc                 C   sR   |j }|tvrtd|j dt|\}}||}|j|jj	|||| d S r   )
r   r   r   r   r%   rR   r<   rp   r   rQ   r   r#   r#   r$   r     s   
z_SparseNCCLCommunicator._recvc                 C   s4   t   | |||| | |||| t   d S r'   )r   r   rm   rp   r   )r   r   rd   re   rn   rP   r#   r#   r$   rq     s   z!_SparseNCCLCommunicator.send_recvc                 C   sz   |j |kr3t  t|D ]\}}||kr| |||| qt  | || || || j d S | 	|||| d S r'   )
r0   r   r   r   rm   r   r   r   r   rp   )r   r   rd   re   r6   rP   rn   Zs_ar#   r#   r$   rs     s   
z_SparseNCCLCommunicator.scatterc                 C   s|   |j |kr4t|jD ]'}t|jt|}||kr!| |||| n| || ||j	 |
| q
d S | |||| d S r'   )r0   r   r   r   r   r   rp   r   r   r   r   rm   )r   r   rd   re   r6   rP   rn   resr#   r#   r$   ru     s   

z_SparseNCCLCommunicator.gatherc              
   C   sP  t ||jkrtd|j dt | g }g }t|D ]\}}| |}	|| |	|j q| |||d|}t	|jD ]g}t
|| dd }
|| dd  }| || }dd t||D }t  |D ]}| ||||j|j| qi|D ]}| ||||j|j| qzt  |t|| jt||  | || ||
 q>d S )Nr   zelements, found r   r   r   c                 S   r   r   r   r   r#   r#   r$   rB   >  r   z6_SparseNCCLCommunicator.all_to_all.<locals>.<listcomp>)lenr   rL   r   r   r   r   r   r   r   rH   r   r   r   r   r   r    r   r   r   r   r   )r   r   rd   re   rP   r   Zrecv_shape_and_sizesr	   r   r   r   r   Zs_arraysZr_arraysr#   r#   r$   rv   &  sB   



z"_SparseNCCLCommunicator.all_to_allrw   rx   ry   r'   )rz   r{   r|   r   r   r   r   r   rb   rg   ri   rk   rl   rm   r   rp   r   rq   rs   ru   rv   r#   r#   r#   r$   r\     sF    


K-r\   r'   )%r   r   rN   Z	cupy.cudar   Zcupyx.distributedr   Zcupyx.distributed._commr   Zcupyx.scipyr   Zmpi4pyr   r*   ImportError	availableZ	NCCL_INT8Z
NCCL_UINT8Z
NCCL_INT32ZNCCL_UINT32Z
NCCL_INT64ZNCCL_UINT64ZNCCL_FLOAT16ZNCCL_FLOAT32ZNCCL_FLOAT64r   ZNCCL_SUMZ	NCCL_PRODZNCCL_MAXZNCCL_MINrS   r%   r&   rX   r   r   r\   r#   r#   r#   r$   <module>   sV    
  