o
    * i(                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 ddl
mZ ddlmZ G dd	 d	ZG d
d dZG dd deZdS )    N)	Container)Job)Pod   )Master)Watcherc                   @   s   e Zd ZdZdZdZdZdS )ControllerModeZ
collectiveZpsZipuZrpcN)__name__
__module____qualname__Z
COLLECTIVEZPSZIPUZRPC r   r   |/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/paddle/distributed/launch/controllers/controller.pyr      s
    r   c                   @   sV   e Zd Zdd Zdd Zdd Zdefdd	ZdddZdddZ	dd Z
dd Zd
S )ControllerBasec                 C   s   t  t j| j t  t j| j t  t j| j | r3|js.t  t j| j t 	|j
 nt 	d || _t| j| _t| j| _t| jjj| jjj| jjjd| _t | _| jd| jji d | _d S )Nr   )nnodesmodeZjidZPOD_NAME)signalSIGTERMsignal_handlerSIGABRTSIGINTZis_auto_tuner_modeZrun_bestSIGALRMnot_exit_signal_handleralarmZmax_time_per_taskctxr   factorymasterr   watcherr   argsr   Zrun_modeZjob_idjobr   podZset_envsnameZjoin_server)selfr   r   r   r   __init__$   s&   

zControllerBase.__init__c                 C   s   t | jjt | jj dksJ d| jjd| j  t | jjdkr0| jj| jjd  t | jjdkrC| jj| jjd  |   | jj	
  | j  d S )Nr   No container in the podzRun )lenr   
containersinit_containersr   loggerinfodebugsave_pod_envstatusrunZdeployr!   r   r   r   
deploy_pod@   s   zControllerBase.deploy_podc                 C   s$   |    |   |   |   d S N)	build_job	build_podr.   watchr-   r   r   r   r,   O   s   zControllerBase.runreturnc                 C   s  | j jd| j  | j j s| jjdd}| j  || j jjkrG| j j	  | j
| | j r;	 | j s5| j jd|  dS || j jjkr| j j  | j
| | j
  | j }| j jd|  | j jd|d   | j jd |d   | j jjdkr| jjd	d dS | jjd
d dS | j j r| j
 | j jjkr| j jjdkr| jjd	d dS | jjd
d dS | j j rdS dS )zA
        watch self and peer status, return true to exit
        z	Watching    timeoutzPod TzContainer failed !!!
r   zD------------------------- ERROR LOG DETAIL -------------------------      FN)r   r'   r(   r   r+   Zis_doner2   ZlogsZ	COMPLETEDZcompleter   Z
set_statusZFAILEDZfailZrestart_peerZfailed_containererrortailr   Zelastic_levelstopZis_restartingZ
get_status)r!   r+   Zfcr   r   r   r2   W   sJ   





zControllerBase.watchNc                 C   s4   | j jd | j  | j  | jjdd d S )NzController stopr8   r5   )r   r'   r)   r   r<   r   r   )r!   sigintr   r   r   r<      s   

zControllerBase.stopTc                 C   sF   | j   | j  | jjd| j j  |r!t	| j j d S d S )Nz
Exit code )
r   joinr   r<   r   r'   r(   	exit_codesysexit)r!   rA   r   r   r   finalize   s   

zControllerBase.finalizec                 C   s   t | dr| jjd | jjdd t| | jjd|  || _| jj	
  | j|d | jjd|  t| d S Nr=   zForce quit in 10 seconds...
   r5   zTerminating with signal )r=   zExit with signal )hasattrr   r'   r(   r   r<   r@   rA   r=   r+   doner!   r=   framer   r   r   r      s   

zControllerBase.signal_handlerc                 C   sp   t | dr| jjd | jjdd | jjd|  || _| jj  | j|d | jjd|  d S rC   )	rE   r   r'   r(   r   r<   r=   r+   rF   rG   r   r   r   r      s   
z&ControllerBase.not_exit_signal_handlerr/   )T)r	   r
   r   r"   r.   r,   boolr2   r<   rB   r   r   r   r   r   r   r   #   s    
>
r   c                   @   s   e Zd ZdZdd ZdefddZdd Zdd
dZd	i dd	d	fddZ	d	d	i d	dfddZ
dd Zdd Zdd ZdddZd	S )
Controllerz*
    Controller API for customization
    c                 C   s   | j j| j dS )z.
        build job fill the job info.
        N)r   r'   r(   r   r-   r   r   r   r0      s   zController.build_jobr3   c                 C   s   t )z]
        build pod includes creating containers etc.

        Return True if succeed
        )NotImplementedErrorr-   r   r   r   r1      s   zController.build_podc                 C   s   | j jjdr)tjddkrtjdddddd	| j jjg}n!tjd| j jjg}n| j jjd
r:tj| j jjg}n| j jjg}|	| j jj
 |S )Nz.pyZWITH_COVERAGEONz-uz-mZcoverager,   z--branchz-pz.pyxes)r   r   Ztraining_scriptendswithosenvirongetr@   
executableextendZtraining_script_args)r!   
entrypointr   r   r   _get_entrypoint   s(   zController._get_entrypointNc                 C   sX   |r| j jjdkrtj| j jj|}|r&| j jjdkr&tj| j jj|}||p*|fS )N )r   r   log_dirrN   pathr>   )r!   outerrr   r   r   _get_out_err_file   s
   zController._get_out_err_fileTc                 C   sL   t |p|  |r| j ni | jjjd}| ||\|_|_|	| |S )N)rS   envZoverwrite_log)
r   rT   r   Zget_envsr   Zlog_overwriterZ   outfileZerrfileZ
update_env)r!   rS   envsZuse_ctx_envrX   rY   cr   r   r   new_container   s   

zController.new_containerFc                 C   s^   |st |}ttj| jjj|d< | j	||||d}|r'| j
| d S | j
| d S )NZPADDLE_LOG_DIR)rS   r]   rX   rY   )copydeepcopystrrN   rW   abspathr   r   rV   r_   r   Zadd_init_containeradd_container)r!   	containerrS   r]   Zlog_fileis_initr   r   r   rd     s   
zController.add_containerc                 C   sB   | j jjrt| j jjS | j jjrt| j jjdS | j jjj	S )zA
        how many process/container should be run in pod
        ,)
r   r   Znproc_per_nodeintZdevicesr$   splitnodeZdevicecountr-   r   r   r   pod_replicas  s
   

zController.pod_replicasc              
   C   s   | j jjsdS tj| j jj| jj d| jj	 d}zDtj
tj|dd t|d)}| dkr@|ttj |d |t| |d W d   W dS 1 sXw   Y  W dS  ty| } z| j jd	|  W Y d}~dS d}~ww )
zH
        save_pod_log append *info* to the log file of pod.name
        N.z.logTexist_okza+r   
zsave log failed because )r   r   rV   rN   rW   r>   r   idr   r    makedirsdirnameopentellwriterb   rO   	Exceptionr'   r:   )r!   r(   ffder   r   r   save_pod_log!  s&   

&"zController.save_pod_logc                 C   sj   t | jjt | jj dksJ d| jjjsd S | jjD ]	}| j|dd q| jjD ]}| | q+d S )Nr   r#   T)rf   )r$   r   r%   r&   r   r   rV   _save_container_env)r!   r^   r   r   r   r*   7  s   
zController.save_pod_envc              
   C   s   t j| jjj|rd|j nd|j }z5t jt j|dd t	||j
}|dd t|j D  W d    W d S 1 sDw   Y  W d S  tyh } z| jjd|  W Y d }~d S d }~ww )Nzenvlog.init.zenvlog.Trn   c                 s   s$    | ]\}}| d | dV  qdS )=rp   Nr   ).0kvr   r   r   	<genexpr>Q  s    
z1Controller._save_container_env.<locals>.<genexpr>z save pod env log failed because )rN   rW   r>   r   r   rV   Zrankrr   rs   rt   Zlog_mode
writelinessortedr[   itemsrw   r'   r:   )r!   re   rf   rx   ry   rz   r   r   r   r|   E  s"   


&"zController._save_container_env)NN)F)r	   r
   r   __doc__r0   rI   r1   rT   rZ   r_   rd   rl   r{   r*   r|   r   r   r   r   rJ      s$    



rJ   )r`   rN   r   r@   Z'paddle.distributed.launch.job.containerr   Z!paddle.distributed.launch.job.jobr   Z!paddle.distributed.launch.job.podr   r   r   r   r   r   r   rJ   r   r   r   r   <module>   s    