o
    * iZ~                     @   s  d dl Z d dlZd dlZd dlZedZg Zg Zdd Zg fddZ	g fddZ
d	d
 Zdd Zeg fddZeg fddZedd Zeg fddZeg g fddZeg fddZeg g fddZeg fddZeg g fddZeg fdd Zd!d" Zeg g fd#d$Zeg fd%d&Zeg fd'd(Zeg g fd)d*Zd+d, Zeg fd-d.Zeg fd/d0Zeg g fd1d2Zeg g fd3d4Z dS )5    NZ
auto_tunerc                 C   s"  d | d | d | d | d | d | d | d | d	 | d
 	}d|v rE|d D ]}ddd |dD }|t| | 7 }|d | }q'd|v rk|d D ]}ddd |dD }|t| | 7 }|d | }qMzddlm} |jd| d|  W n   Y td| d|  d S )NzIDP{}_MP{}_PP{}_VPP{}_Sharding{}_Stage{}_MBS{}_Recompute_{}_Granularity_{}	dp_degree	mp_degree	pp_degree
vpp_degreesharding_degreesharding_stagemicro_batch_sizeuse_recomputerecompute_granularityrefined_recompute c                 s       | ]}|  V  qd S N
capitalize.0i r   o/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/paddle/distributed/auto_tuner/prune.py	<genexpr>'       z"log_pruned_info.<locals>.<genexpr>_custom_search_dimc                 s   r   r   r   r   r   r   r   r   -   r   r   )ctxz	Strategy z has been pruned that )formatjoinsplitstrZpaddle.distributed.launch.mainr   loggerinfo)cur_cfgpruned_reason	tuner_cfgZpruned_strategykeystrategyr   r   r   r   log_pruned_info   s>   r&   c                 C   sf   g }d}|D ]*}|D ]}|| v rq
||vs!|| || kr%|dvr%d} nq
|r.| | qd}q|S )z
    Compare the current configuration with the history configuration,
    and obtain the same configurations as the current configuration except for the given attr.
    T)estimated_memory_usageF)append)attrsr!   history_cfgsresultssamecfgr$   r   r   r   same_cfgs_beside>   s    r.   c                 C   sN   d }|D ] }g d}d}|D ]}|| || krd} nq|r$|} |S q|S )N)r   r   r   r   r   r	   r
   r   TFr   )r#   r!   r*   resultr-   keysr,   r$   r   r   r   !same_cfgs_beside_sharding_overlapX   s   
r1   c                        fdd}t | |S )Nc                         | i |S r   r   argskwargsfuncr   r   wrapperq      zregister_prune.<locals>.wrapper)_PRUNE_FUNCr(   r8   r9   r   r7   r   register_prunep      
r=   c                    r2   )Nc                     r3   r   r   r4   r7   r   r   r9   y   r:   z'register_prune_history.<locals>.wrapper)_PRUNE_HISTORY_FUNCr(   r<   r   r7   r   register_prune_historyx   r>   r@   c           
      C   s   | dd}| d  dd}| d  dd}| d  dd}| d  dd}|  dd	}|du r2d	S |r<|| d
kr<dS |rF|| d
krFdS |rP|| d
krPdS |r\|| d
kr\|r\dS |  dd}	|	dkrl| d d }	|	rt||	vrtdS d	S )z
    Prune by mp, the rules are:
    1. MP degree should be evenly divided by hidden size and vocab size
    2. MP degree should be in the candidates of user defined.
    3. MP degree should be less than 8 if no candidates.
    r   N	model_cfghidden_size
vocab_sizenum_attention_heads
seq_lengthuse_sequence_parallelFr   Tauto
candidatesget)
r#   r!   r*   r   rB   rC   rD   rE   rF   Zmp_degree_candidatesr   r   r   prune_by_mp   s2   rK   c                 C   s   | dd}| d  dd}d|v r|d n|  dd}|du r"dS |r,|| dkr,d	S |  dd}|d
kr<| d d }|rF||vrDd	S dS |dkrP||krPd	S dS )a  
    Prune by pp (pipeline-parallelism), the rules are:
    1. PP degree should be evenly divided by number of layers.
    2. PP degree should be in the candidates of user defined.
    3. If no candidates, PP degree should be less than or equal to the number of nodes.
    r   NrA   
num_layersnodes   Fr   TrG   rH   rI   )r#   r!   r*   r   rL   Z	num_nodesZpp_degree_candidatesr   r   r   prune_by_pp   s&   rO   c           
   
   C   s   | dd }| dd }| dd }|d u s|d u s|d u r dS t|}|| tddg||}|rt|D ]=}|ss|d |d  || krs|d |krs| ddkrsd| d| d	|d  d
|d  d	}	t||	|  d|d<  dS q6dS )Nr   r   	recomputeFmax_mem_usageOOMz
mp_degree z, pp_degree  may cause oom because z,  already oom.TrJ   copydeepcopyextendr.   r&   )
r#   r!   r*   pruned_cfgsr   r   r	   cfgsr-   r"   r   r   r   prune_by_mp_pp_history   s*   

&r[   c           	      C   s  | dd}| dd}| d  dd}|du rdS |du r dS |rjd|v r*|d n| d  dd}||d  |d	  |d
  }|dkrL|| dkrLdS |||  dkrVdS |dkr`|dkr`dS |dkrj|dkrjdS |  dd}|dkrz| d d }|r||vrdS dS )z
    Prune by vpp (virtual pipeline parallelism), the rules are:
    1. VPP degree should be evenly divided by number of layers.
    2. VPP degree should be in the candidates of user defined.
    r   Nr   rA   rL   Fglobal_batch_sizer   r   r   rN   r   T   rG   rH   rI   )	r#   r!   r*   r   r   rL   r\   	acc_stepsZvpp_degree_candidatesr   r   r   prune_by_vpp   sF   r_   c                 C   s   | dd }|d u rdS t|}|| td||}|rH|D ]'}|d |krG| ddkrGd| d|d  d}t|||  d|d<  dS q dS )	Nr   FrQ   rR   zvpp_degree rS   rT   TrU   )r#   r!   r*   rY   r   rZ   r-   r"   r   r   r   prune_by_vpp_history  s    

r`   c           
      C   s  | dd}d|v r|d n| d  dd}|dkr|d }|r0||d  |d  }|dkr0d	S |  dd}|dkr@| d
 d }|du rFdS |r~|| dkrPd	S || }| dd}|durd||k rdd	S | dd}	|	dur~|	dkr~|dur~|| dkr~d	S |r||vrd	S dS )a(  
    Prune by mbs (micro batch size), the rules are:
    1. Micro batch size should be evenly divided by the local batch size.
    2. Micro batch size should be in the candidates of user defined.
    3. Prune if a similar configuration with a larger micro batch size resulted in a valid run.
    r   Nr\   rA   rG   r   r   r   TrH   Fr   r   rN   rI   )
r#   r!   r*   r   r\   Zlocal_batch_sizeZmbs_candidatesr^   r   r   r   r   r   prune_by_mbs2  sL   ra   c                 C   s   | dd }|d u rdS t|}|| tddg||}|rr|D ]O}|d |krL| dddkrLd| d|d  d	}t|||  |d |d<  d
S |d |k rq| ddkrqd| d|d  d}t|||  d|d<  d
S q"dS )Nr   Fr^   timer   zmicro_batch_size  may be slower because  has been already runnable.TrQ   rR   rS   rT   rU   )r#   r!   r*   rY   r   rZ   r-   r"   r   r   r   prune_by_mbs_historyh  s0   


rf   c           	      C   s   | dd}| dd}| dd}|sdS |sdS |  dd}|dkr*| d d }|  dd}|dkr:| d d }|rB||vrBdS |rJ||vrJdS |rZ|d	krZ|d	krZ|d	krZdS |d	krhtd||}|rhdS dS )
a  
    Prune by sharding parameters, the rules are:
    1. Sharding stage and sharding degree should be specified.
    2. Sharding stage and degree should be in the candidates of user defined.
    3. If PP (pipeline-parallelism) degree is not 1, sharding stage must be 1.
    4. Prune if a similar configuration with a lower sharding stage resulted in a valid run.
    5. If sharding degree is 1, sharding stage is invalid.
    r   Nr   r   FrG   rH   TrN   )rJ   r.   )	r#   r!   r*   r   r   r   Zsharding_stage_candidatesZsharding_degree_candidatesrZ   r   r   r   prune_by_sharding  s<   
rg   c           	      C   s   | dd }|d u rdS | dd }|d u rdS t|}|| td||}|r||D ]O}|d |k rV| dddkrVd| d|d  d	}t|||  |d |d<  d
S |d |kr{| ddkr{d| d|d  d}t|||  d|d<  d
S q,dS )Nr   Fr   rb   rc   r   zsharding_stage rd   re   TrQ   rR   rS   rT   rU   )	r#   r!   r*   rY   r   r   rZ   r-   r"   r   r   r   prune_by_sharding_history  s2   

rh   c           
      C   s   | dd}| dd}t|}|du rdS | d  dd}| d  dd}|r.||vr.dS |r8|r8||vr8dS |sX|dkr@dS tddg||}|rX|D ]}	|t|	krW dS qLdS )a  
    Prune by recompute parameters, the rules are:
    1. If recompute is not used, return False directly.
    2. Usage of recompute and recompute granularity should be in the candidates of user defined.
    3. If recompute is not used, but recompute granularity is set, return True for pruning.
    4. Prune if a similar configuration without using recompute resulted in a valid run.
    5. If recompute is false, prune redundant recompute granularity
    r
   Nr	   FrH   Tfull)rJ   get_config_recompute_levelr.   )
r#   r!   r*   r
   r	   recompute_levelZ recompute_granularity_candidatesZuse_recompute_candidatesrZ   r-   r   r   r   prune_by_recompute  s:   

rl   c                 C   s@   dddd}|  dd }|  dd }|d u rd S |sdS || S )N   r]   rN   )ri   	full_attn	core_attnr	   r
   r   rI   )r-   Zrecompute_granularity_levelr	   r
   r   r   r   rj     s   rj   c                 C   s   t |}|d u r
dS t|}|| tddg||}|rp|D ]O}t ||d< |d |k rM|dddkrMd|d  d	}t|||  |d |d<  d
S |d |kro|ddkrod|d  d}t|||  d|d<  d
S q dS )NFr	   r
   rk   rb   rc   r   z$use_recompute may be slower because re   TrQ   rR   z$use_recompute may cause oom because rT   )rj   rV   rW   rX   r.   rJ   r&   )r#   r!   r*   rY   rk   rZ   r-   r"   r   r   r   prune_by_recompute_history"  s2   


rp   c                 C   sf   d|v r|d n|  d}| dd}| dd}| dd}| dd}|| | | |kr1dS dS )	Nnum_gpusr   rN   r   r   r   TFrI   )r#   r!   r*   rq   r   r   r   r   r   r   r   prune_by_num_gpusK  s   rr   c                 C   sT  |  dd }|  dd }| d }|d u rdS tj|s#td| |d u r+td|d }|d }|d	 }|d
 }	|d }
|d }|d }|d }|d }d|dt|dt|dt|dt|	dt|
dt|dt|dt|dt|g}| dd }|d ur|dt|g | dd }|d ur|dt|g | dd }|d ur|dt|g | d d }|d ur|d!t|g | d"d }|d ur|d#t|g | d$d }|d ur|d%t|g tj|d&d&d'}|j	d(kr"t
tt|jd)}||d*< d+| d,| d-}||d. k}|r|d/7 }t| |S td0|j )1Nmemory_estimation_toolrQ   rA   Fz7memory_estimation_tool should be a valid path, but got z=max_mem_usage should be set when using memory estimation toolr   r   r   r   r   r   r	   r   r
   pythonz--dp_degreez--mp_degreez--pp_degreez--vpp_degreez--sharding_degreez--sharding_stagez--use_recomputez--micro_batch_sizez--recompute_granularityrB   z--hidden_sizerD   z--num_attention_headsrL   z--num_layersmax_sequence_lengthz--max_sequence_lengthrC   z--vocab_sizeintermediate_sizez--intermediate_sizeT)capture_outputtextr   r]   r'   z
Estimated z memory usage: z MBi   z, it will be pruned!z*memory_estimation_tool failed with error: )rJ   ospathexists
ValueErrorr   rX   
subprocessrun
returncodeintroundfloatstdoutr   r    stderr)r#   r!   r*   rs   Zmax_memory_usagerA   r   r   r   r   r   r   r	   r   r
   Zmemory_estimation_cmdrB   rD   rL   ru   rC   rv   r/   Zcur_memory_usagemsgZmemory_exceededr   r   r   prune_by_memory_estimation\  s   




r   c                 C   s4   d|v rt | ||}|sdS || d d  sdS dS )z2Prune by sharding overlap for single dp estimationZsharding_overlapTZ
metric_cfgnameF)r1   )r#   r!   r*   rY   r/   r   r   r   prune_by_sharding_overlap  s   r   c              
   C   s:  ddddddddd	d
	}dddd}i }|D ]}|||| < q|D ]}t |ts*J |d}d}|D ]}	d }
|D ]}|	|rD|}
 nq9|
r|	t|
 }|
dv rv|dkrd| ||
  dkrc|d7 }q3q3t|}| ||
  |kru|d7 }q3q3|
dkr|dkr| ||
  r|d7 }q3q3tt|}| ||
  |kr|d7 }q3q3|
dkr|dkr| |d  dkr|d7 }q3q3t|}| ||
  |kr|d7 }q3q3|
dkr|dkr|d7 }q3t|}| ||
  |kr|d7 }q3q3|
d	kr|dkr| |d  r|d7 }q3q3t|}|| }| ||
  |kr|d7 }q3q3q!|t|krdS dS )NdpmpppvppZmbsshardingZstagerP   granularity)	r   r   r   r   r   r   r   r	   r
   ri   rn   ro   )r   rN   r]   r   r   )r   r   r   r   r   *rN   r	   TF)
isinstancer   r   
startswithlenr   bool)r!   invalid_strategymappingZgranularity_mappingZreversed_mappingr$   r%   dimsZhas_matcheddimZmatchedvaluer   r   r   r   
is_invalid  s   


;r   c                 C   s4   |  dd r| d }t|tsJ t||rdS dS )Nr   TF)rJ   r   listr   )r#   r!   r*   r   r   r   r   prune_by_invalid_strategy+  s   
r   c           
         s   |  dd r|  d} d } d } d } fdd|D }|r3|r3|dkr3|dt|kr3d	S |d
krB|dt|krBd	S | d d | dkrNd	S | d d | } |d  |kr`d	S d
}	|	t|k r ||	  |ks ||	d
   |kr ||	  dkrd	S |	d
7 }	|	t|k shdS )Nr   r   r	   r
   c                    s   g | ]} | qS r   r   )r   itemr!   r   r   
<listcomp>=  s    z.prune_by_refined_recompute.<locals>.<listcomp>ri   r   TrN   rA   rL   F)rJ   countr   )
r#   r!   r*   rrr   rP   r
   compareZ	max_valuer   r   r   r   prune_by_refined_recompute6  s2   
$r   c           
      C   s  |  dd rt|}|| |  d}t|}|d t|||}|D ]}|r|D ]}|d sU| dddkrU| d||  d}	t||	|  |d |d<   dS || || kr| dddkr|d r|d r| d||  d	||  d
}	t||	|  |d |d<   dS || || k r| ddkr|d r|d r| d||  d||  d}	t||	|  d|d<   dS q-q'dS )Nr   r	   rb   rc   r    z? may be slower because not recompute has been already runnable.Trd   re   rQ   rR   rS   rT   F)rJ   rV   rW   rX   r(   r.   r&   )
r#   r!   r*   rY   r   r   rZ   r   r-   r"   r   r   r   "prune_by_refined_recompute_historyT  sJ   




r   c                 C   s  t |}| dd }g }i }|d ur2| D ]\}}|d r1|| dd t|d D ||< q|D ]U}|| t|||}	||d }
|
d u rN dS |	r|	D ]6}|| }|| | || |
 k r|ddd	kr| | d
| |
 d}t|||  |d |d<   dS qRq4dS )Nr   Zprunec                 S   s   i | ]\}}||qS r   r   )r   r   r$   r   r   r   
<dictcomp>  s    
z6prune_by_custom_search_dim_history.<locals>.<dictcomp>r   Frb   rc   r   rd   re   T)	rV   rW   rJ   itemsr(   	enumeraterX   r.   r&   )r#   r!   r*   rY   r   Zprune_custom_search_dimZcustom_dim_levelr$   r   rZ   Z	cur_valuer-   Z	cfg_valuer"   r   r   r   "prune_by_custom_search_dim_history  s>   






r   )!rV   loggingry   r}   	getLoggerr   r;   r?   r&   r.   r1   r=   r@   rK   rO   r[   r_   r`   ra   rf   rg   rh   rl   rj   rp   rr   r   r   r   r   r   r   r   r   r   r   r   <module>   sn   
%+ 
/5!3&.(iU
+