o
    * i\@                     @   s0  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZ ddlmZ eddZd	d
 Zdd ZG dd dZG dd dZG dd dZG dd dZG dd dZdd Zdd Zdd Zdd Zdd  Zd-d!d"ZG d#d$ d$Z	d-d%d&Zd'd( Zd)d* Z d+d, Z!dS ).    N)closing)get_backend_by_compile_flag)	strtobool   )
get_loggerINFOrootc                    s   dd | j dD }| j}||}td| d| d|  d }| js@t|dkr@| jd u r@t	t|}|d ur?t
|}nd}| jd urJ| j}t
t||t| }g }|D ] | fd	d|D  qYt||||S )
Nc                 S      g | ]}|  qS  strip.0xr
   r
   q/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/paddle/distributed/utils/launch_utils.py
<listcomp>!       z)get_cluster_from_args.<locals>.<listcomp>,zparsed from args:node_ips:z	 node_ip:z node_rank:   i  c                    s   g | ]	}  d | qS ):r
   )r   portipr
   r   r   =   s    )Zcluster_node_ipssplitnode_ipindexloggerdebugZuse_paddlecloudlenstarted_portfind_free_portslistrangeappendget_cluster)argsselected_gpusnode_ipsr   	node_rankZ
free_portsr   trainer_endpointsr
   r   r   get_cluster_from_args    s2   


r*   c                    s   | d u rddl m} | }dd td|D }|S td}|d u s'|dkr3dd | dD }|S |d | dD ]}| v sNJ d	| d
| dq= fdd| dD }td|  d| d   |S )Nr   corec                 S      g | ]}t |qS r
   strr   r
   r
   r   r   F   r   zget_gpus.<locals>.<listcomp>ZCUDA_VISIBLE_DEVICES c                 S   r	   r
   r   r   r
   r
   r   r   J   r   r   zCan't find your selected_gpus z in CUDA_VISIBLE_DEVICES[z].c                    s   g | ]	}  | qS r
   )r   r   r   Zcuda_visible_devices_listr
   r   r   U   s    z1Change selected_gpus into relative values. --ips:z will change into relative_ips:z( according to your CUDA_VISIBLE_DEVICES:)	paddle.frameworkr,   Zget_cuda_device_countr"   osgetenvr   r   info)r&   r,   Zgpus_numgpusZcuda_visible_devicesr   r
   r1   r   get_gpusA   s:   



r7   c                   @   4   e Zd Zdd Zdd Zdd Zdd Zd	d
 ZdS )Hdfsc                 C   s   d | _ d | _d | _d S NZhdfs_ugiZ	hdfs_nameZ	hdfs_pathselfr
   r
   r   __init__c      
zHdfs.__init__c                 C   s   | j d uo| jd uo| jd uS r:   r;   r<   r
   r
   r   is_validh   s
   
zHdfs.is_validc                 C      d| j  d| j d| j S )Nz	hdfs_ugi:z hdfs_name:z
 hdfs_pathr;   r<   r
   r
   r   __str__o      zHdfs.__str__c                 C   s$   | j |j ko| j|jko| j|jkS r:   r;   r=   nr
   r
   r   __eq__r   s
   

zHdfs.__eq__c                 C   
   | |k S r:   r
   rD   r
   r
   r   __ne__y      
zHdfs.__ne__N)__name__
__module____qualname__r>   r@   rB   rF   rH   r
   r
   r
   r   r9   b   s    r9   c                   @   s\   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd ZdS )Clusterc                 C   s   d | _ g | _d | _d | _d S r:   )
job_serverpodshdfsjob_stage_flag)r=   rP   r
   r
   r   r>   ~   s   
zCluster.__init__c                 C   s.   d| j  ddd | jD  d| j d| j S )Nzjob_server:z pods:c                 S   r-   r
   r.   )r   podr
   r
   r   r      r   z#Cluster.__str__.<locals>.<listcomp>z job_stage_flag:z hdfs:)rN   rO   rQ   rP   r<   r
   r
   r   rB      s   .zCluster.__str__c                 C   sR   t | jt |jkrdS t| j|jD ]\}}||kr dS q| j|jkr'dS dS NFT)r   rO   ziprQ   )r=   clusterabr
   r
   r   rF      s   zCluster.__eq__c                 C   s   |  | S r:   )rF   r=   rU   r
   r
   r   rH         zCluster.__ne__c                 C   s   t  |j| _d S r:   )copyrO   rX   r
   r
   r   update_pods   s   zCluster.update_podsc                 C   s   t |  S r:   )r   trainers_endpointsr<   r
   r
   r   trainers_nranks   rY   zCluster.trainers_nranksc                 C   s
   t | jS r:   )r   rO   r<   r
   r
   r   pods_nranks   rI   zCluster.pods_nranksc                 C   s,   g }| j D ]}|jD ]}||j q
q|S r:   )rO   trainersr#   endpoint)r=   rrR   tr
   r
   r   r\      s   

zCluster.trainers_endpointsc                 C   sR   g }| j D ]!}|j d|j }|jd ur|jd us!J | d|| q|S )Nr   z not a valid endpoint)rO   addrr   r#   )r=   ra   rR   epr
   r
   r   pods_endpoints   s   
zCluster.pods_endpointsc                 C   s*   | j D ]}t|t|jkr|  S qd S r:   )rO   r/   id)r=   Zpod_idrR   r
   r
   r   get_pod_by_id   s
   
zCluster.get_pod_by_idN)rJ   rK   rL   r>   rB   rF   rH   r[   r]   r^   r\   re   rg   r
   r
   r
   r   rM   }   s    rM   c                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
	JobServerc                 C   s
   d | _ d S r:   r`   r<   r
   r
   r   r>      rI   zJobServer.__init__c                 C   s   | j  S r:   ri   r<   r
   r
   r   rB      s   zJobServer.__str__c                 C   s   | j |j kS r:   ri   r=   jr
   r
   r   rF      rY   zJobServer.__eq__c                 C   rG   r:   r
   rj   r
   r
   r   rH      rI   zJobServer.__ne__N)rJ   rK   rL   r>   rB   rF   rH   r
   r
   r
   r   rh      s
    rh   c                   @   r8   )Trainerc                 C   s   g | _ d | _d | _d S r:   r6   r`   rankr<   r
   r
   r   r>      r?   zTrainer.__init__c                 C   rA   )Nzgpu:z
 endpoint:z rank:rm   r<   r
   r
   r   rB      rC   zTrainer.__str__c                 C   s^   t | jt |jkrdS | j|jks| j|jkrdS t| j|jD ]\}}||kr, dS q!dS rS   )r   r6   r`   rn   rT   )r=   rb   rV   rW   r
   r
   r   rF      s   zTrainer.__eq__c                 C   rG   r:   r
   )r=   rb   r
   r
   r   rH      rI   zTrainer.__ne__c                 C   s   | j S r:   )rn   r<   r
   r
   r   get_rank   s   zTrainer.get_rankN)rJ   rK   rL   r>   rB   rF   rH   ro   r
   r
   r
   r   rl      s    rl   c                   @   s<   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd ZdS )Podc                 C   s(   d | _ d | _d | _d | _g | _g | _d S r:   )rn   rf   rc   r   r_   r6   r<   r
   r
   r   r>         
zPod.__init__c                 C   s>   d| j  d| j d| j d| j d| j ddd | jD  S )	Nzrank:z id:z addr:z port:z visible_gpu:z
 trainers:c                 S   r-   r
   r.   )r   rb   r
   r
   r   r      r   zPod.__str__.<locals>.<listcomp>)rn   rf   rc   r   r6   r_   r<   r
   r
   r   rB      s   >zPod.__str__c                 C   s   | j |j ks| j|jks| j|jks| j|jkr%td|  d|  dS t| jt|jkr>td| j d|j  dS tt| jD ] }| j| |j| kretd| j|  d|j|    dS qEdS )Nzpod z != Fz	trainers ztrainer T)	rn   rf   rc   r   r   r   r   r_   r"   )r=   rR   ir
   r
   r   rF      s   "z
Pod.__eq__c                 C   rG   r:   r
   )r=   rR   r
   r
   r   rH     rI   z
Pod.__ne__c                 C   s   d S r:   r
   )r=   Zres_podsr
   r
   r   parse_response	  s   zPod.parse_responsec                 C   sF   d}| j D ]	}|| d7 }q|dksJ d|  d|d d }|S )Nr0   r   z	this pod z can't see any gpus)r6   )r=   ra   gr
   r
   r   get_visible_gpus  s   
zPod.get_visible_gpusN)	rJ   rK   rL   r>   rB   rF   rH   rs   rv   r
   r
   r
   r   rp      s    rp   c                 C   s   t |tu s
J dtd d}d}t| D ]J\}}t }||_||_|| }	t|	t|ks2J dtt|D ] }
t	 }|j
||
  |	|
  |_||_|d7 }|j| q8|j| q| |}||j| fS )Nztrainer_endpoints must be list)rP   r   zOcurrent trainer_endpoints size should be greater equal than selected_gpus size.r   )typer!   rM   	enumeraterp   rn   rc   r   r"   rl   r6   r#   r`   r_   rO   r   )r'   r   r)   r&   rU   Ztrainer_rankr(   r   rR   Zcur_node_endpointsrr   trainerZpod_rankr
   r
   r   r$     s*   

r$   c                 C   s   | D ] }|j  d u r"|j   |jr|j  td|j j  qt	d t
ddD ]*}d}| D ]}|j  d u rGt|j jtj d}q3|sRtd  d S t	d q-td td	 d S )
Nzterminate process id:   r   2   FTzterminate all the procszcan't kill all process and exitr   )procpoll	terminatelog_fncloser   r   pidtimesleepr"   r3   killsignalSIGKILLr5   fatalsysexit)procspstepaliver
   r
   r   terminate_local_procs2  s*   




r   c                  C   s*   zt  } t | }| |fW S    Y d S r:   )socketgethostnamegethostbyname)Z	host_namehost_ipr
   r
   r   get_host_name_ipM  s   

r   c                 K   s6   |t krtn|}|jd|  f|||d d| dS )ab  Add argparse's argument.
    Examples:
        .. code-block:: python

            >>> import argparse
            >>> from paddle.distributed.utils import launch_utils
            >>> parser = argparse.ArgumentParser()
            >>> launch_utils.add_arguments("name", str, "Jonh", "User name.", parser)
            >>> args = parser.parse_args()

    z--z Default: %(default)s.)defaultrw   helpN)boolr   add_argument)argnamerw   r   r   Z	argparserkwargsr
   r
   r   add_argumentsV  s   
r   c                 C   sZ   dd }t  }d}	 | }||vr|| t|| kr|S |d7 }|dkr,td d S q
)Nc                  S   sN   t ttjtj} | d |  d W  d    S 1 s w   Y  d S )N)r0   r   r   )r   r   AF_INETSOCK_STREAMbindgetsockname)sr
   r
   r   __free_portm  s   

$z$find_free_ports.<locals>.__free_portr   Tr   d   z@can't find available port and use the specified static port now!)setaddr   print)numr   Zport_setr   r   r
   r
   r   r    l  s    
r    c                 C   sn  |d u rt  }|dkr0dddd |jD t|jt|jt|  d|  d}|S |dkrYdddd |jD t|jt|jt|  d|  d	}|S |d
krvt|jt|jt|  d|  |d}|S |dkrddl	m
} | d }d| ddddd |jD dt|jdt|jdt|  dd|  i}|S td)NZbkclz{}r   c                 S   r-   r
   r.   r   ru   r
   r
   r   r     r   z(_prepare_trainer_env.<locals>.<listcomp>)ZFLAGS_selected_xpusPADDLE_TRAINER_IDPADDLE_CURRENT_ENDPOINTPADDLE_TRAINERS_NUMPADDLE_TRAINER_ENDPOINTSZncclc                 S   r-   r
   r.   r   r
   r
   r   r     r   )ZFLAGS_selected_gpusr   r   r   r   Zgloo)r   r   r   r   ZPADDLE_DISTRI_BACKENDZxcclr   r+   ZFLAGS_selected_r   c                 S   r-   r
   r.   r   r
   r
   r   r     r   r   r   r   r   z)backend must be one of 'gloo, nccl, bkcl')r   formatjoinr6   r/   rn   r`   r]   r\   r2   r,   Zget_all_custom_device_type
ValueError)rU   ry   backendproc_envr,   Zcustom_device_namer
   r
   r   _prepare_trainer_env  sV   
,
"


r   c                   @   s   e Zd Zdd ZdS )TrainerProcc                 C   s(   d | _ d | _d | _d | _d | _d | _d S r:   )r|   r   
log_offsetrn   
local_rankcmdr<   r
   r
   r   r>     rq   zTrainerProc.__init__N)rJ   rK   rL   r>   r
   r
   r
   r   r     s    r   c                 C   s   t  tj  }|dd  |dd  g }t|jD ]r\}}t| |}	||	 t	d|  t
jd|g|}
td|
 d|	  d }|d uretj|dd t| d	| d
}tj|
|||d}ntj|
|d}t }||_|j|_||_||_|r| nd |_|
|_|| q|S )NZ
http_proxyZhttps_proxyztrainer proc env:z-uzstart trainer proc:z env:T)exist_okz/workerlog.rV   )envstdoutstderr)r   )rZ   r3   environpoprx   r_   r   updater   r   r   
executabler5   makedirsopen
subprocessPopenr   r|   rn   r   r   tellr   r   r#   )rU   rR   Ztraining_scriptZtraining_script_argsZlog_dirZcurrent_envr   idxrb   r   r   fnr|   tpr
   r
   r   start_local_trainers  s2   

r   c              
   C   s   | j rKt| j jd7}|| jd |D ]}ztj| W q ty3   tjd| j j d Y qw |	 | _W d    d S 1 sDw   Y  d S d S )Nra   r   zOUnicodeEncodeError occurs at this line. Please refer to the original log file "z"
)
r   r   nameseekr   r   r   writeUnicodeEncodeErrorr   )r   Zfinliner
   r
   r   pull_worker_log  s    "r   c              
   C   s   z@d}g }d}| D ]&}|j r|jdkrt| |j }|d u r#d}q	|dkr/d}||j q	|r>t|  t	d W |S W |S  t
yQ   td t|    tyh   td| d| d t|      td| d| d t|   )	NFr   Tr   zKeyboardInterrupt, exitzABORT!!! Out of all z) trainers, the trainer process with rank=z# was aborted. Please check its log.)r   r   r   r|   r}   r#   rn   r   r   r   KeyboardInterruptr   warning
SystemExiterror)r   Znranksr   Z
error_rankr   r   retr
   r
   r   watch_local_trainers  sH   

r   c                 C   s@   t d tt|  D ]\}}t | d|  qt d d S )Nz0-----------  Configuration Arguments -----------z: z0------------------------------------------------)r   sortedvarsitems)r%   argvaluer
   r
   r   _print_arguments&  s   r   r:   )"rZ   r3   r   r   r   r   r   
contextlibr   Z%paddle.distributed.fleet.launch_utilsr   Zpaddle.utilsr   Zutils.log_utilsr   r   r*   r7   r9   rM   rh   rl   rp   r$   r   r   r   r    r   r   r   r   r   r   r
   r
   r
   r   <module>   s>   
!!=2	
3
+)