o
    W+ iUJ                     @   s2  d Z ddlZddlZddlZddlZddlZddlZddlZddl	m
Z
mZ ddlmZ dZdd Zdd	 Zd
d Zd2ddZdd ZG dd dZdd Z		d3ddZdd Zdd Zdd Z					d4ddZd d! Zd"d# Z		d3d$d%Zd5d&d'Zd5d(d)Zd5d*d+Z d5d,d-Z!d5d.d/Z"d0d1 Z#dS )6z'Utilities for logging and serialization    N)mpuprint_rank_0)FP16_Optimizerrunsc                 C   s   t j|t| S N)ospathjoinSUMMARY_WRITER_DIR_NAME)namebase r   l/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/modelscope/models/nlp/mglm/utils.pyget_log_dir   s   r   c                  C   s*   dg} t j| dd}|d d }|S )Nzhostname -ITshellutf-8r   )
subprocesscheck_outputdecodesplit)Zhostname_cmdresultZmaster_addrr   r   r   get_hostname"   s   r   c                 C   s   t j dkr0tjdgdd}t| }|| jkr(tjdgdd}t| }t j	|g}nt j	dg}t j
|d | }|S )Nr   zshuf -n 1 -i 10000-65535Tr   )torchdistributedget_rankr   r   intstripZmaster_portcudaZ
LongTensor	broadcastitem)argsportr   r   r   get_spare_port)   s   
r#   Tc           
   	   C   s<  |r%t ddd t| D ]}ddt|  }t d||t| |dd q|durtj|d}t|d	}t	j
t| |dd
 W d   n1 sJw   Y  | jr| jdurt| jdd}t	|}W d   n1 snw   Y  tj|d}	t|	d	}t	
|| W d   dS 1 sw   Y  dS dS dS dS )zPrint arguments.z
arguments:Tflush.   z
  {} {} {}Nzconfig.jsonw)	sort_keysr   encodingzconfig_gpt_large.json)printvarslenformatgetattrr   r   r	   openjsondump	deepspeeddeepspeed_configload)
r!   verboseZlog_dirargdotsZ	json_fileoutputfiler5   Zdeepspeed_json_filer   r   r   print_and_save_args:   s0   "r<   c              
   C   s   d}t j }d}| }t| tr| j}|jD ]2}|d D ]+}|d7 }|j }|j	 }	|j
 }
|d|||t|j7 }|d||	|
7 }qqt|dd d	S )
z+Print min, max, and norm of all parameters.r   z6iteration, rank, index, model-parallel,min, max, norm
params   z{:7d}, {:4d}, {:4d}, {:2d}, z{:.6E}, {:.6E}, {:.6E}
Tr$   N)r   r   r   
isinstancer   	optimizerZparam_groupsdataminmaxnormr/   r   Zmodel_parallelr,   )r@   	iterationindexZrankstringZ
optimizer_Zparam_groupparamZmin_Zmax_rD   r   r   r   print_params_min_max_normO   s$   





rI   c                   @   s8   e Zd ZdZG dd dZdd Zdd Zdd
dZdS )TimerszGroup of timers.c                   @   s:   e Zd ZdZdd Zdd Zdd Zdd	 ZdddZdS )zTimers.TimerzTimer.c                 C   s    || _ d| _d| _t | _d S )N        F)Zname_elapsed_started_time
start_timeselfr   r   r   r   __init__i   s   zTimers.Timer.__init__c                 C   s,   | j rJ dtj  t | _d| _ dS )zStart the timer.ztimer has already been startedTN)rM   r   r   synchronizerN   rO   rQ   r   r   r   starto   s   


zTimers.Timer.startc                 C   s:   | j sJ dtj  |  jt | j 7  _d| _ dS )zStop the timer.ztimer is not startedFN)rM   r   r   rS   rL   rN   rO   rT   r   r   r   stopv   s   

zTimers.Timer.stopc                 C   s   d| _ d| _dS )zReset timer.rK   FN)rL   rM   rT   r   r   r   reset}   s   
zTimers.Timer.resetTc                 C   s6   | j }| j r
|   | j}|r|   |r|   |S )zCalculate the elapsed time.)rM   rV   rL   rW   rU   )rQ   rW   rM   rL   r   r   r   elapsed   s   zTimers.Timer.elapsedN)T)	__name__
__module____qualname____doc__rR   rU   rV   rW   rX   r   r   r   r   Timerf   s    r]   c                 C   s
   i | _ d S r   )timersrT   r   r   r   rR      s   
zTimers.__init__c                 C   s$   || j vr| || j |< | j | S r   )r^   r]   rP   r   r   r   __call__   s   

zTimers.__call__      ?Tc                 C   sP   |dksJ d}|D ]}| j | j|dd | }|d||7 }q
t| dS )zLog a group of timers.rK   z	time (ms))rW   g     @@z | {}: {:.2f}N)r^   rX   r/   r   )rQ   namesZ
normalizerrW   rG   r   Zelapsed_timer   r   r   log   s   
z
Timers.logN)r`   T)rY   rZ   r[   r\   r]   rR   r_   rb   r   r   r   r   rJ   c   s    ,rJ   c                 C   sx   d}| d }|d tj | 7 }|d tj | 7 }|d tj | 7 }|d tj | 7 }t| dS )zSimple GPU memory report.g      0Az memory (MB)z | allocated: {}z | max allocated: {}z | cached: {}z | max cached: {}N)r/   r   r   Zmemory_allocatedZmax_memory_allocatedZmemory_cachedZmemory_reservedr   )r   Z
mega_bytesrG   r   r   r   report_memory   s   rc   Fc                 C   sH   |rd}nd |}|rt }|d |7 }tj| |d t S )Nreleasez{}z_zero_dp_rank_{}zmp_rank_{:02d}_model_states.pt)r/   r   get_data_parallel_rankr   r   r	   Zget_model_parallel_rank)checkpoints_pathrE   rd   zerodZdp_rankr   r   r   get_checkpoint_name   s   
ri   c                 C   s.   t j| }t j|st j|dd d S d S )NT)exist_ok)r   r   dirnameexistsmakedirs)filenamerk   r   r   r   ensure_directory_exists   s   ro   c                 C   s   t j| dS )Nz!latest_checkpointed_iteration.txt)r   r   r	   )rf   r   r   r   get_checkpoint_tracker_filename   s   rp   c                 C   sD   ||  d}t| j|dd}t| t|| td| d S )N)rE   Zoptimizer_state_dictT)rg     successfully saved {})
state_dictri   savero   r   r,   r/   )r!   rE   r@   Zzero_sdZzero_checkpoint_namer   r   r   save_zero_checkpoint   s   rt   c
                    s  |du rt | }|jr|st| ||||d nt dkrt|j|}
tdt	j
 | |
 d| i}|jr9|j}| }|rZi  | D ]	\}}|j |< qE fdd| D }||d< |jsw|	sw|durm| |d	< |durw| |d
< |jst |d< tj |d< t	 |d< t	j |d< t  |d< t|
 t	||
 td|
 |rt	j
  t	j
 dkrt|j}t|d}|| W d   dS 1 sw   Y  dS dS )Save a model checkpoint.N)tagr   z<global rank {} is saving checkpoint at iteration {:7d} to {}rE   c                    s   i | ]\}} | r||qS r   r   ).0keyvalueZrequires_grad_dictr   r   
<dictcomp>   s    z#save_checkpoint.<locals>.<dictcomp>moduler@   lr_schedulerrandom_rng_statenp_rng_statetorch_rng_statecuda_rng_staterng_tracker_statesrq   r(   ) strr4   save_ds_checkpointr   re   ri   rs   r,   r/   r   r   r   r|   rr   named_parametersZrequires_graditemsno_save_optimno_save_rngrandomgetstatenp	get_stateget_rng_stater   get_cuda_rng_tracker
get_statesro   barrierrp   r1   write)rE   modelr@   r}   r!   rv   r   Zonly_changed_parametersno_deepspeedr   checkpoint_namesdrr   r   Z	parametertracker_filenamefr   rz   r   save_checkpoint   s`   




"r   c                 C   s   i }| |d< |dur|  |d< |js5t |d< tj |d< t |d< tj |d< t	
  |d< |j|j||d	 dS )
ru   rE   Nclient_lr_schedulerr~   r   r   r   r   )Zclient_state)rr   r   r   r   r   r   r   r   r   r   r   r   r   rs   )rE   r   r}   r!   rv   r   r   r   r   r     s   r   c                 C   s   t | }tj|s9td| tj| r/tj| }tj|\}}td ||ddfS td | dddfS t	|ddd	}|
  }|d
k}W d    n1 sUw   Y  | ||dfS )Nz-WARNING: could not find the metadata file {} z6Try to directly load the checkpoint from the directoryFTz<    will not load any checkpoints and will start from randomr   rr   r*   rd   )rp   r   r   isfiler   r/   isdirnormpathr   r1   readr   )Z	load_pathr   r   load_dirrv   r   Z
metastringrd   r   r   r   get_checkpoint_iteration/  s&   
r   c                 C   s|  t |j\}}}}	|	sdS |jrG|sG| j|||j o| |j d\}
}|js6d|v r6||d  td |
du rFt	 dkrDt
d |S npt|||}
t	 dkr^t
dtj |
 tj|
dd	}|jrk| j} | j|d
 dd\}}|sz|rtd| d|  |s|js|js|sz|dur||d  |dur||d  W n ty   td|
 Y nw |js|rd}n)z|d }W n" ty   z|d }W n ty   td|
 d}Y nw Y nw |s.|js.|js.z)t|d  tj|d  t|d  tj|d  t |d  W n ty-   td|
 Y nw t	 dkr<t
d|
 |S )zLoad a model checkpoint.r   )Zload_optimizer_statesZload_lr_scheduler_statesr   zLoad lr scheduler stateNzUnable to load checkpoint.z'global rank {} is loading checkpoint {}cpu)Zmap_locationr|   F)strictzMissing keys z, unexpected keys r@   r}   zUnable to load optimizer from checkpoint {}, exiting. Specify --no-load-optim or --finetune to prevent attempting to load the optimizer state.rE   Ztotal_iterszbA metadata file exists but Unable to load iteration  from checkpoint {}, starting from 0 iterationr~   r   r   r   r   zUnable to load random state from checkpoint {}, exiting. Specify --no-load-rng or --finetune to prevent attempting to load the random state.z  successfully loaded {})r   r6   r4   load_checkpointno_load_optimZno_load_lr_schedulerZload_state_dictr   r   re   r,   ri   r/   r   r   r   r|   ZfinetuneKeyErrorZno_load_rngr   setstater   Z	set_stateZset_rng_stater   r   Z
set_states)r   r@   r}   r!   r   r   r   rv   rd   successr   r   Zmissing_keysZunexpected_keysrE   r   r   r   r   P  s   




r   c                 C   sr   dt t| v }|  D ]*\}}|r|j| j}|j}n	|j}|j| j}|r1d|v r1|  }|| qdS )z
    Loads weights from src to dst via in place copy.
    src is a huggingface gpt2model, while dst is one of our models.
    dst2src=True loads parameters from our models into huggingface's.
    ^dst2src is still untested
    ZConv1DweightN)r   typer   _parametersrA   t
contiguousZcopy_)srcdstdst2srcZ
conv_layernprA   r6   r   r   r   load_weights  s   r   c                 C   $   t |j| j| t |j| j| d S r   )r   Zc_fcZdense_h_to_4hc_projZdense_4h_to_houroair   r   r   r   load_mlp     r   c                 C   r   r   )r   Zc_attnZquery_key_valuer   Zdenser   r   r   r   load_attention  r   r   c                 C   sD   t |j| j| t |j| j| t| j|j| t| j|j	| d S r   )
r   Zln_1Zinput_layernormZln_2Zpost_attention_layernormr   Zmlpr   Z	attentionZattnr   r   r   r   load_transformer_layer  s   r   c                 C   sd   |j }t|j| j j| t|j| j| t|j| j| t| j j	|j j
D ]
\}}t||| q%dS )z
    Loads weights from `oai` to `our` via in place copy.
    `oai` is a huggingface gpt2model, while `our` is one of our models.
    dst2src=True loads parameters from our models into huggingface's.
    ^dst2src=True is still untested
    N)Ztransformerr   Zln_fZfinal_layernormZwteZword_embeddingsZwpeZposition_embeddingszipZlayershr   )r   r   r   Ztransformer_modelZ	our_layerZ	oai_layerr   r   r   move_weights  s   	r   c                 C   sT  | d | d }}| d | d | d }}}g }||   }	t|| d |	  D ] \}
}||}|dkrFd|| d|
f    d	}|| q+td
| g }t|	|dD ]}
|| |
 rj||
 q]t| t|	|| |   t
|jdkrt|	|| |   nt|	||   t|| d d |f  d S )Ntokens
target_idsattention_mask
logit_maskposition_idsz[MASK][r   ]    )r    	enumeratetolistZ	IdToTokenappendr,   r	   rangesizeZ	DecodeIdsr.   shape)
local_varsZbatch_idZ	tokenizerr   r   r   r   r   Zoutput_tokenssepitokenZtarget_positionsr   r   r   debug_finetune_data  s:   
 

r   )TN)FF)NTFFF)F)$r\   r   r   r   rN   r2   numpyr   r   Zmegatron_utilr   r   Zmegatron_util.fp16r   r
   r   r   r#   r<   rI   rJ   rc   ri   ro   rp   rt   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   sR   
B

E%

e



