o
    ñpi%¿  ã                   @   s.  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlZ	d dl
Z
d dlmZ d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZ d dlm Z  d dl!m"Z# d dl$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ ddd„Z,dd„ Z-G dd„ de%ƒZ.dS )é    N)ÚOrderedDict)Úreduce)Úproduct)Úcore)Úno_grad)Úpir)Úfleet)Únew_process_group)Úget_1D_sub_process_mesh)Ú
split_mesh)ÚOpRole)ÚalignÚget_current_device_type)ÚAutoParallelStreamType)Ú_current_expected_place_)Ú	Optimizeré   )Ú_dtensor_from_local)Úcopy_op_attr_with_new_member)ÚStrategyc                 C   sˆ   d}|d u r	| j }|D ]}t|tjƒr|dksJ dƒ‚| ¡ }qd }t| jƒD ]}||kr4t |¡} nq't |¡}|d urB|||< |S )NéÿÿÿÿzUThe parameter can't be shard twice with sharding strategy even in different mesh now.)	Ú
placementsÚ
isinstanceÚdistÚShardÚget_dimÚrangeÚndimÚcopyÚdeepcopy)ÚparamZsharding_axisZparam_placementsZ
shard_axisZ	placementZplacement_with_shardingÚdimZnew_placements© r"   úp/home/app/PaddleOCR-VL/.venv_paddleocr/lib/python3.10/site-packages/paddle/distributed/auto_parallel/sharding.pyÚget_placement_with_sharding1   s(   
ÿ€
þ
r$   c           
      C   s²   || j v sJ ‚| j  |¡}g }| jD ]	}| t|ƒ¡ qdg||< g }t|Ž D ]/}g }td|  |¡ƒD ]}g |d|… ¢|‘||d d … ¢R }	| | j|	 ¡ q3| |¡ q'|S )Nr   r   )Ú	dim_namesÚindexÚ_shapeÚappendr   r   Úget_dim_sizeÚmesh)
r*   Z	axis_nameZ
axis_indexÚrangesZdim_numZ
all_resultÚxÚresultÚiZcoordr"   r"   r#   Úget_mesh_comm_listL   s   

&r/   c                   @   sÐ   e Zd ZdZd.dd„Zdd„ Zdd„ Zd	d
„ Zdd„ Zdd„ Z		d/dd„Z
dd„ Zdd„ Zeƒ dd„ ƒZdd„ Zeƒ dd„ ƒZdd„ Zdd„ Zeƒ d d!„ ƒZd"d#„ Zd$d%„ Zd&d'„ Zd(d)„ Zd*d+„ Zd,d-„ ZdS )0ÚShardingOptimizerStage1z4
    .. ZeRO: https://arxiv.org/abs/1910.02054

    Nc                 C   sö  |d usJ dƒ‚t |tjjtjjfƒsJ dƒ‚|| jd< || _|p#tƒ | _g | _	d | _
t ¡  | jjd u r;tj ¡ }n| jj}t|dƒ}|D ]}tt|ƒƒ}t ¡ |v rW|| _qFd | _d|jv ryt|dƒ}|D ]}tt|ƒƒ}t ¡ |v rx|| _qgtƒ | _d|jv r¨| dt ¡ ¡}	td| d¡ƒD ]}
| j |jd|
d¡ q’|jd|	d}n| j |¡ |j d¡| _ |j!| j  | _"d	| _#d
| _$d|jv rõ|j d¡| _#|j!| j# | _$tƒ }| jD ]}| |¡ t%|| j#dD ]}| |¡ qéqÛ|| _t &¡  d S )Nz)The argument `optimizer` cannot be empty.zR`paddle.distributed.ShardOptimizer` only supports AdamW and SGD optimizer for now.Ú
_inner_optÚdpÚmpÚppr   )r&   r   r   )Zglobal_meshZsub_mesh_dim)'r   ÚpaddleÚ	optimizerZAdamWZSGDÚ__dict__Ú	_shard_fnr   Ú	_strategyÚ_slice_param_group_infoÚ_dy_shard_groupZenable_staticÚ_meshr   Úauto_parallelÚget_meshr/   r	   ÚsortedÚget_rankÚ_sharding_groupÚ	_mp_groupZ
_dim_namesÚsetÚ	pp_meshesr%   Zget_rank_by_dim_and_process_idr   r)   ÚaddZget_mesh_with_dimr&   Ú_sharding_axisr'   Ú_sharding_degreeÚ_mp_mesh_axisÚ
_mp_degreer   Zdisable_static)Úselfr6   Zshard_fnÚstrategyr*   Z	dp_groupsÚgroupÚ
comm_groupZ	mp_groupsZpp_rankÚidxrD   Zpp_meshZsub_pp_meshr"   r"   r#   Ú__init__e   sn   
ÿÿý

€

€




ÿýz ShardingOptimizerStage1.__init__c           E      C   sþ  t ƒ }t|tjjƒrtj tj ¡ j¡}tjj	 
¡ | _| j |¡ | jjj}|dk r,d}i }i }d}d}g }|D ]4\}	}
|
d u rBq8|	 ¡ }|
 ¡ }|d usZJ d|	j› d|	› dƒ‚|d usjJ d|	j› d|
› dƒ‚|j|jksJ d	|	j› d
|	› d|
› dƒ‚| j|jvr| |	|
f¡ |	jd u r—ddi|	_nd|	jd< q8|	jd u r¨ddi|	_nd|	jd< |j| jv sÄJ d|	j› d|j› d| j› dƒ‚t ¡ |jjv rít|j| jƒ}t|jƒ| jjksíJ d|	j› d|j› d| jj› ƒ‚|jtƒ ksÿJ d|	j› d|	› dƒ‚|j |j ksJ d|	j› d
|	› d|
› dƒ‚|	j!|
j!ks+J d|	j› d
|	› d|
› dƒ‚|	j"|
j"ksAJ d|	j› d
|	› d|
› dƒ‚| j#dkrT| j$|j v rTd|	_%d}nd|	_%d}| &|jg ¡ |	¡ | &|jg ¡ |
¡ q8tj' (¡ }| )¡ }|j*d }|d d }g }| +¡ D ]N\}}|| }t,ƒ }t- .|||g¡}t ¡ |jv rª|  /||¡ t0|ƒD ]&\}}g }g }|D ]}| &|g ¡ || j¡ | || ¡ | || ¡ q¹| jjj1rã|  2||¡ |  3||¡\}}}} |d j4}!t5j6j7j8t9ƒ  t:|d j4  }"|"| j; t< =|!¡ t< =|d j4¡ }"t ¡ |jv r#|  >||||"¡ | jjj?sctj@ A||!ddddd|"dg g ¡\}#}$| jjBjCsJ|D ]}
d|
_DqCd|$_Dtj- E|$ F¡ | j"¡}%|$ Gt- H|%|$ ¡ ¡¡ n¢d }&d }'|D ]}
|
 I¡ }(|j* J|(¡}|'d u s||'k rƒ|}'|(}&qit- K|&¡ tj@ L| j"|!| j¡}$t- M|dgi ¡})|$ Gt- H|$ F¡ |)¡¡ |$ I¡  Nd¡}*|* Gt- H|* F¡ |)¡¡ d}+|D ]G}
|
 I¡ }(tO P|
j"¡},t- K|(¡ tj@ Q|$|+|+|, ¡}-tj@ R|-|
j"¡}-t- S|(¡ tj@ T|
|-g¡ |+|,t< =|!¡ |" d |" |" t< =|!¡ 7 }+q½| jjj1st- U¡  |$j"d | j; }.| jj Jt ¡ ¡}/|/|. }0|0|. }1tj@ Q|$|0|1¡}2tj@ V|$| jjW| j;¡}3| jjj1rJ|3 I¡  XtYjZj[¡ t- U¡  tj@ T|2|3g¡ g }4| +¡ D ]
\}5}6|4 |5¡ q\| |4|| f¡ | +¡ D ]`\}5}6|6\}}7}8tj@ Q|3|7|8¡}9||  ¡ j\}:|: ]| j¡ t- M|9jdg|:¡};|9 Gt- H|9 F¡ |;¡¡ |9 I¡ j ^¡ }<|<d  _¡ |<d< t- M|9jdg|:¡|<d< t`|9 I¡ j|<d|9 I¡ _| |5|9f¡ qsq®qˆ| jajbd urúd| jajb_c| j| jajb_d| je| jajb_f|| jajb_g|| jajb_h| ja i|¡ t- U¡  |D ]i\}4}=}>| jjj1r[d }?d }|4d  j¡ D ]}@|j* J|@¡}A|?d u s-|A|?kr1|A}?|@}qtj@ k| ^¡ d ¡}B|B I¡  XtYjZj[¡ tj@ l|=| jjW| j;¡}C|C I¡  XtYjZj[¡ ntj@ l|=| jjW| j;¡}Ctj@ T|>|Cg¡ q|j* J|¡d }D|j*|Dd … S )Nr   é   Fz5parameter dist attribute must not None. but received z : Ú.z4gradient dist attribute must not None. but received z grad : zDParameter and grad should have same process_mesh. but received name:z, parameter:z, grad: Z	no_fusionTzAparameter mesh mush be in pp_meshes. but received parameter name:z, mesh:z, pp_meshes: z? all parameter must have the same sharding group. but received z sharding group is : z, global sharding group is: z?Sharding fusion do not support partial parameter. but received zDParameter and grad should have same dims_mapping. but received name:zDParameter and grad should have same global shape. but received name:zCParameter and grad should have same local shape. but received name:r   r   i   ç        )Znew_results)mÚ_get_devicer   r5   Z	frameworkZ	CUDAPlaceÚdistributedZParallelEnvZdev_idÚbaseZ	libpaddleZPlaceÚ_placeZ	set_placer9   ZshardingÚcomm_buffer_size_MBÚ	dist_attrÚnameÚprocess_meshrF   Zpartial_dimsr(   Úoptimize_attrrD   r   r@   Zprocess_idsr
   r?   rA   ÚranksrC   Zdims_mappingÚshapeÚ_local_shaperI   rH   Úis_distributedÚ
setdefaultÚstaticÚdefault_main_programÚglobal_blockÚopsÚitemsr   r   Zassign_value_group_by_sizeÚ_cache_slice_param_group_infoÚ	enumerateZenable_overlapÚ_reduce_scatter_overlapÚ_fuse_group_paramÚdtyper   ÚutilsÚtensor_fusion_helperÚ	alignmentr   r   rG   r   Úsize_of_dtypeÚ!_cache_slice_param_range_and_sizeZrelease_gradientsÚ_C_opsÚcoalesce_tensor_ZpipelineÚenableÚpersistableÚcreate_shaped_typeÚtypeÚset_typeÚcvt_to_dist_typeÚget_defining_opr&   Zset_insertion_pointÚemptyÚcreate_tensor_dist_attributeÚoperand_sourceÚnpÚprodÚ
view_sliceZ
view_shapeÚset_insertion_point_afterZ	share_varZreset_insertion_point_to_endZreduce_scatterÚidZset_execution_streamr   ZSHARDING_STREAMÚvalueZpartial_statusÚpopÚresultsZas_tensor_dist_attrr   r1   Z
_grad_clipZshould_comm_on_shard_dimZsharding_grouprB   Zmp_groupÚhas_dist_paramÚhas_not_dist_paramÚapply_gradientsZall_used_opsZnopÚ
all_gather)ErJ   Úparams_gradsZplacerW   Zparameters_dictZ
grads_dictr„   r…   Znew_params_gradsr    ÚgradZparam_dist_attrZgrad_dist_attrZsub_meshÚmain_programÚtarget_blockZlast_opÚ
group_sizeZall_gather_param_info_listr*   Ú
parametersZgradsZ
var_groupsÚgroup_indicesÚ	group_idxÚindicesÚgroup_param_listÚgroup_grad_listr&   Úslice_param_dictÚpadded_size_dictÚmain_shard_fused_paramÚmain_fused_paramrj   Ú
align_sizeÚ_Z
fused_gradÚ
fused_typeZfirst_grad_opZfirst_indexÚgrad_oprX   Zprev_varZ
grad_beginÚsizeZgrad_bufferÚ
shard_sizeÚrankÚ
rank_beginZrank_endZview_shard_fused_gradZshard_fused_gradZslice_param_listÚslice_paramÚ
param_infoÚparam_beginÚ	param_endZ
slice_gradZpartail_statusZslice_grad_dist_attrZslice_grad_out_dist_attrZshard_paramÚfused_paramZlast_idxÚoprN   ÚtmpZallgather_valueÚstart_indexr"   r"   r#   r†   £   sV  
ÿ

ÿ
ÿþ



ÿÿþÿþÿÿÿ

ÿÿ
û
ÿýÿÿþýÿüõ

ÿÿ€
ýÿÿ

ÿÿ
ÿþûø	÷ÿÿÿÿÿýÿ
ÿÿ
ÿÿÿÿÿþ
ÿÿþÿÜ æ @


ü€ÿÿÿÿz'ShardingOptimizerStage1.apply_gradientsc                 C   sº   dd„ t t|ƒƒD ƒ| _t|ƒD ]J\}}|D ]C}|| }i | j| |j< |j| j| |j d< d| j| |j d< d| j| |j d< |j| j| |j d< |j| j| |j d< qqd S )	Nc                 S   s   g | ]}i ‘qS r"   r"   ©Ú.0r˜   r"   r"   r#   Ú
<listcomp>û  ó    zIShardingOptimizerStage1._cache_slice_param_group_info.<locals>.<listcomp>r]   r   Úparam_startr¢   r   rZ   )r   Úlenr:   rg   rY   r]   r   rZ   )rJ   r   rŽ   r   r   r&   r    r"   r"   r#   rf   ú  s2   ÿþÿþÿþÿþÿñÿz5ShardingOptimizerStage1._cache_slice_param_group_infoc                 C   s    |  ¡ D ]"\}}|j dd¡}|\}}	}
|	| j| | d< |
| j| | d< q|  ¡ D ]\}}|| j| | d< q+| j|   ¡ D ]\}}|| j| | d< q@d S )Núslice@Ú r«   r¢   Úpadded_sizer—   )re   rY   Úreplacer:   )rJ   r   r“   r”   r—   rŸ   r    Úslice_param_namer˜   r¡   r¢   rY   r¯   r"   r"   r#   ro     s$   
þÿþÿÿÿÿz9ShardingOptimizerStage1._cache_slice_param_range_and_sizec                 C   s@  dddœ}|D ]‡}|  ¡ g}d}g }t|ƒdkrL| ¡ }|jttjƒkr'|}n%| ¡ dkrE| | 	d¡  ¡ ¡ |jttjƒkrD| |¡ nnt|ƒdks|durŽ|j
 |¡d }	|D ]}|j
 |¡}
|	|
krl| ||	¡ qZ|d du sy|	|d krŽ|	|d< t|ƒdkrŠ|d |d< q||d< q|d duržt |d ¡ dS dS )a!  
        In order to overlap computation and reduce_scatter communication, we need to:
          a. place reduce_scatter in communication stream
          b. place reduce_scatter op and its producer ops after the last grad define op
        This function will complete the item b.
        N)rN   r¤   r   r   rN   r   r¤   )rx   r¬   r‚   Zop_roleÚintr   ZBackwardZnum_operandsr(   r{   rd   r&   Zmove_opr   r   )rJ   r’   r‹   Zinsertion_infor‰   Ústackrš   Zadvance_opsr¤   Znew_idxZold_idxr"   r"   r#   rh   '  sD   


€ö€€ÿz/ShardingOptimizerStage1._reduce_scatter_overlapc           "      C   sv  t j ¡ }t j ¡ }t j |¡› dd„ }g }d}|D ]}|||jƒ}	| |	¡ |d |j }q|d j}
tj	j
jtƒ  t|
  }|| j }t j ||
ddddd|d	g g ¡\}}d}|D ] }t |j¡t |
¡ }|| d
 | | t |
¡ }||7 }q]t j | ¡ |g¡}t |d jd	gi ¡}| t ||¡¡ d|_t j ||¡ |  ¡  !|| ¡ ¡}| j"|_#d|_|| j }| j$j% &t' (¡ ¡}|| }t j )|||| ¡}d|_t j |d| ¡ |  ¡  !d| | ¡ ¡}| j"|_#d|_d}i }i }t*|ƒD ]¨\}}t |j¡t |
¡ }|| d
 | | t |
¡ }|||j< t+|| dƒ}||7 }t,|| |ƒ}||k r¤t j )|||¡}d|_d|j } t j -|| ¡ | .|¡ t j |¡< t /¡  t j 0| ¡}!d|!_|! | ¡ ¡ |j1|!_1|j2|!_2|j3|!_3|j4|!_4|j5|!_5|j6|!_6|j7|!_7|j8|!_8W d   ƒ n	1 s˜w   Y  |||f||!< qüW d   ƒ n	1 s°w   Y  ||||fS )Nc                 S   sN   |   ¡ jD ]}| ¡ dkr|| ¡ d kr| d¡ ¡   S qtd|› dƒ‚)Nzbuiltin.set_parameterZparameter_namer   zcan't find param (z) in startup program)rc   rd   rY   ÚattrsÚoperandÚsourceÚ
ValueError)ZstartuprY   r¤   r"   r"   r#   Úget_param_from_startup[  s   €
ÿzIShardingOptimizerStage1._fuse_group_param.<locals>.get_param_from_startupzfused@ú-r   TFrR   r   r   zshard@r­   )9r5   ra   Zdefault_startup_programrb   Zprogram_guardrY   r(   rj   r   rk   rl   rm   r   r   rG   rp   rq   r|   r}   r^   r   rn   r   rt   ru   rz   rZ   rv   rw   rs   Z_pir_opsZset_persistable_valuerc   Z	add_kwargrV   Z
place_attrrA   r\   r&   r   r@   r~   rg   ÚmaxÚminZset_parameterZset_parameters_fromZreset_insertion_point_to_startZ	parameterZ	trainableZstop_gradientr[   ZregularizerZdo_model_averageZ	need_clipr_   Zis_parameter)"rJ   Zgroup_indexr‘   Ústartup_programrŠ   r¸   Zstartup_param_listZfuse_param_namer    Zstartup_paramrj   r—   r˜   r£   rŒ   r›   r¯   r™   rX   r–   rœ   r   rž   Zshard_fused_paramr•   Ztotal_buffer_sizer“   r”   r&   r¡   r¢   Zinit_slice_paramr±   rŸ   r"   r"   r#   ri   V  sü   

ÿ

ÿýÿ
õÿþÿ

ÿÿÿ

ÿÿÿÿþÿ

ÿ
ÿ
ÿ
òý€Úª üz)ShardingOptimizerStage1._fuse_group_paramr   c                 C   s
   |   |¡S ©N)r†   )rJ   Zlossr¼   rˆ   Zparam_group_idxr"   r"   r#   Ú_apply_optimizeá  s   
z'ShardingOptimizerStage1._apply_optimizec                 C   s0   d| j v r|dkr| j | S t| j d |ƒS t‚)Nr1   )r7   ÚgetattrÚAttributeError)rJ   Úitemr"   r"   r#   Ú__getattr__æ  s
   

z#ShardingOptimizerStage1.__getattr__c                 C   s.   |dkrt | ƒj› d}t|ƒ‚t| j||ƒS )Nr1   z._inner_opt is READ ONLY)ru   Ú__name__rÀ   Úsetattrr1   )rJ   rÁ   r   Úmsgr"   r"   r#   Ú__setattr__î  s   z#ShardingOptimizerStage1.__setattr__c           	      C   sð   g }g }g }g }|  ¡ D ]1\}}| ¡ sqd|vrqd|v r$| |¡ qd|v r.| |¡ qd|v r8| |¡ q| |¡ q|D ]}||= q@tjj ¡  | jd u rU|  ¡  | j	D ]}|  
|||¡ |  |||¡ |  |||¡ tjj ¡  qXd S )Nr­   Ú_momentÚ_pow_accÚ_master)re   Úis_distr(   r5   ÚdeviceÚcudaÚempty_cacher;   Ú_create_dy_sharding_groupr:   Ú_all_gather_master_opt_paramsÚ_all_gather_moment_opt_paramsÚ_broadcast_pow_acc_opt_params)	rJ   Ú
state_dictÚmaster_opt_param_namesÚmoment_opt_param_namesÚpow_acc_opt_param_namesZslice_param_namesrY   ÚtensorÚ
group_infor"   r"   r#   Ú.convert_state_dict_without_tensor_fusion_paramô  sB   

ÿÿÿôzFShardingOptimizerStage1.convert_state_dict_without_tensor_fusion_paramc                 C   sR   | j j}|d u rtj ¡ }t|dƒ}|D ]}t t|ƒ¡}t ¡ |v r&|| _	qd S )Nr2   )
r8   r<   r   r=   r>   r/   Z	new_groupr?   r@   r;   )rJ   r*   Zshard_groupsrL   rM   r"   r"   r#   rÎ   !  s   

€ýz1ShardingOptimizerStage1._create_dy_sharding_groupc                 C   s  g }g }g }|  ¡ D ].}d|v r| | d¡d ¡ q
d|v r*| | d¡d ¡ q
d|v r8| | d¡d ¡ q
tt|ƒƒ}tt|ƒƒ}tt|ƒƒ}| jd u rT|  ¡  | jD ]2}d}| ¡ D ]\}}	t	||	d ƒ}q_|  
||¡}
|  |||
|¡ |  |||
|¡ |  |||
|¡ qWd S )NrÇ   ú.distr   rÈ   rÉ   r   r¢   )Úkeysr(   Úsplitr?   rC   r;   rÎ   r:   re   rº   Ú_bucket_tensors_with_group_sizeÚ_re_slicing_opt_paramÚ_remove_pow_acc_opt_params)rJ   rÒ   Zmoment_suffixsÚpow_acc_suffixsZmaster_suffixsrY   r×   rŒ   Ú
param_namer    Úbucket_infor"   r"   r#   Ú+convert_state_dict_with_tensor_fusion_param,  sB   €

ÿÿÿÿñzCShardingOptimizerStage1.convert_state_dict_with_tensor_fusion_paramc                 C   sp   |\}}| j j t ¡ ¡}t| ¡ ƒD ]"\}\}	}
|D ]}||| v r/||	|  |d|	 | < ||	| = qqd S )Nr­   )rA   r\   r&   r   r@   rg   re   )rJ   rÒ   r×   rá   rß   Úgroup_rank_mappingÚsize_mappingÚcur_rankrN   rà   r    Úpow_acc_suffixr"   r"   r#   rÞ   T  s   
ÿûÿz2ShardingOptimizerStage1._remove_pow_acc_opt_paramsc                 C   sø  |\}}| j j t ¡ ¡}|D ]ê}g }	t| ¡ ƒD ]Y\}
\}}|||  }g }tj|| ¡  	¡ | j
d |j| j  ¡ }tj||d}| dg¡}|	 |¡ |jd |d k rj|	 tj|d |jd  g|jd¡ ~~tjj ¡  qtj|	dd}~	tjj ¡  d}t| ¡ ƒD ]h\}
\}}|||
 v rà|||  }|}t||
 ƒD ]\}}||kr­ n	|||
 | 7 }q£||||d  |d  … }d	d
„ tt|jjƒƒD ƒ}t||j|ƒ}||d| | < ||d 7 }||| = tjj ¡  q‰~tjj ¡  qd S )N©rL   ©Zaxisr   r   r¯   )rj   r¢   r«   c                 S   ó   g | ]}t  ¡ ‘qS r"   ©r   Z	Replicater§   r"   r"   r#   r©   £  s    ÿÿzAShardingOptimizerStage1._re_slicing_opt_param.<locals>.<listcomp>r­   )rA   r\   r&   r   r@   rg   re   r‡   Ú_local_valueÚ
contiguousr;   r   rF   r   r5   ÚconcatÚviewr(   r]   Úzerosrj   rË   rÌ   rÍ   r   r¬   rZ   r   )rJ   rÒ   r×   rá   Zparam_suffixsrã   rä   rå   Zparam_suffixÚopt_param_listrN   rà   r    Z	opt_paramZ
param_listÚparam_sharding_axisZglobal_opt_paramÚfused_opt_paramÚparam_indexZcur_rank_start_indexr.   Zrank_idÚshard_opt_paramZshard_opt_param_placementsr"   r"   r#   rÝ   b  sŠ   
ýÿþÿ
ÿÿûÿ
ÿþÿþýÿ
²z-ShardingOptimizerStage1._re_slicing_opt_paramc                 C   sn  g }|  ¡ D ]\}}d| | }||vrq||vrq| ||  ¡  ¡ ¡ qt|ƒdkr.d S tj|dd}	g }
tj|
|	| j	d dd„ |
D ƒ}
tj|
dd}	|  ¡ D ]\}}d| | }||vraqR||vrfqR||  ¡ }||= qRtj
j ¡  d}|  ¡ D ]²\}}d| | }t |d ¡}| jd ur´|d | j }t|tjƒr´| ¡ }||  | j  < t|| ƒ||< ttj|d	ƒ}|	||| … }| |¡}|d
 }t|| j|d ƒ}td ƒgt|jƒ }| jj  t !¡ ¡}|| j  ¡ }|| j" |j|  }||j| | j"  }tt|ƒt|ƒƒ}|||< |t#|ƒ }t$| ¡ |||jƒ}|||| < |d }||7 }q|tj
j ¡  d S )Nr­   r   rè   rç   c                 S   s   g | ]}|  ¡ ‘qS r"   )Úcpu)r¨   rÁ   r"   r"   r#   r©   Ò  s    zBShardingOptimizerStage1._all_gather_opt_params.<locals>.<listcomp>r]   r   r   rZ   r¯   )%re   r(   rë   Úcloner¬   r5   rí   r   r‡   r;   rË   rÌ   rÍ   r   r   rB   rH   r   r   r   rI   r²   r   ÚoperatorÚmulZreshaper$   rF   Úslicer]   rA   r\   r&   r@   rG   Útupler   )rJ   rÒ   r×   Zopt_param_namesZ
opt_suffixrð   rà   r    Zopt_param_namerò   Zfused_opt_param_listZlocal_tensorró   Zglobal_shapeZmp_placementZparam_tensor_parallel_axisZglobal_sizeZglobal_paramrô   Zopt_param_meshZopt_param_placementsZshard_indexr   rñ   Zshard_slice_start_idxZshard_slice_end_idxZshard_slicer¯   r"   r"   r#   Ú_all_gather_opt_params¸  s    ÿÿ
ÿ
ÿ
ÿÿþþÿÿÿÿü
z.ShardingOptimizerStage1._all_gather_opt_paramsc                 C   s€   t |ƒdkrd S i }|D ]}| d¡d }||vrg ||< ||  |¡ qtt| ¡ ƒƒ}| ¡ D ]\}}|  ||||¡ q1d S ©Nr   rÙ   r   )r¬   rÛ   r(   Údictr?   re   rû   )rJ   rÒ   r×   rÔ   ZmomentsrY   Zmoment_suffixZmoment_namesr"   r"   r#   rÐ     s   ÿÿz5ShardingOptimizerStage1._all_gather_moment_opt_paramsc                 C   s6   t |ƒdkrd S |d  d¡d }|  ||||¡ d S rü   )r¬   rÛ   rû   )rJ   rÒ   r×   rÓ   Zmaster_suffixr"   r"   r#   rÏ   2  s   üz5ShardingOptimizerStage1._all_gather_master_opt_paramsc                 C   sp  t |ƒdkrd S g }|D ]}| d¡d }| |¡ qtt|ƒƒ}d}| ¡ D ]\}}	t||	d ƒ}q'|  ||¡\}
}| jj	 
t ¡ ¡}t| ¡ ƒD ]k\}\}}	|
| d }|D ]\}d| | }||kr…|| }| ¡ }tj|| jj	| | jd |||| < | |¡ qX|	d }dd	„ tt |jƒƒD ƒ}t d
g¡}tj|| jj	| | jd t|||ƒ}|||| < qXqJd S )Nr   rÙ   r   r¢   r­   )ÚsrcrL   rZ   c                 S   ré   r"   rê   r§   r"   r"   r#   r©   g  s    ÿzIShardingOptimizerStage1._broadcast_pow_acc_opt_params.<locals>.<listcomp>r   )r¬   rÛ   r(   r?   rC   re   rº   rÜ   rA   r\   r&   r   r@   rg   rë   Ú	broadcastr;   r‚   r   r]   r5   rï   r   )rJ   rÒ   r×   rÕ   rß   rY   ræ   rŒ   rà   r    rã   r˜   rå   rN   Z	root_rankZpow_acc_nameZpow_acc_tensorZpow_acc_local_tensorZtmp_meshZtmp_placementsZtmp_datar"   r"   r#   rÑ   @  sX   ÿ
ýÿ
ýÿåþz5ShardingOptimizerStage1._broadcast_pow_acc_opt_paramsc                 C   sÔ   dd„ |D ƒ}dd„ |D ƒ}d}d}t | ¡ ƒD ]M\}}|d }	|	dkre|| }
|	|
krA||  |¡ ||  |	¡ ||	7 }d}	n |
dkr[||  |¡ ||  |
¡ |	|
8 }	||
7 }|d7 }d}|	dks$q||fS )Nc                 S   ó   g | ]}g ‘qS r"   r"   r§   r"   r"   r#   r©   w  rª   zKShardingOptimizerStage1._bucket_tensors_with_group_size.<locals>.<listcomp>c                 S   r   r"   r"   r§   r"   r"   r#   r©   x  rª   r   r¯   r   )rg   Úvaluesr(   )rJ   r×   rŒ   Zgroup_mappingrä   Zcurrent_sizeZcurrent_bucket_indexrN   r    Ztensor_sizeZavailable_spacer"   r"   r#   rÜ   v  s.   ï€z7ShardingOptimizerStage1._bucket_tensors_with_group_sizec                 C   sv   t  ¡ }t| ¡ ƒ}|D ],}|| }| ¡ sqd|vrqd|v s(d|v s(d|v r5|› d|› }|| ||< ||= qd S )Nr­   rÇ   rÈ   rÉ   Ú_rank)r   r@   ÚlistrÚ   rÊ   )rJ   rÒ   rå   Útensor_namesrY   rÖ   Z	rank_namer"   r"   r#   Ú(convert_state_dict_with_rank_unique_name”  s   õz@ShardingOptimizerStage1.convert_state_dict_with_rank_unique_namec                 C   sJ   t | ¡ ƒ}t | ¡ ƒD ]}d|v r"| d¡d }|| ||< ||= qd S )Nr  r   )r  rÚ   rÛ   )rJ   rÒ   r  rY   Zno_rank_namer"   r"   r#   Ú#convert_state_dict_with_origin_name¥  s   €üz;ShardingOptimizerStage1.convert_state_dict_with_origin_name)NN)r   )rÃ   Ú
__module__Ú__qualname__Ú__doc__rO   r†   rf   ro   rh   ri   r¾   rÂ   rÆ   r   rØ   rÎ   râ   rÞ   rÝ   rû   rÐ   rÏ   rÑ   rÜ   r  r  r"   r"   r"   r#   r0   _   s<    
>  Y/ 
ÿ
,
'V
f6r0   r½   )/r   r÷   Úcollectionsr   Ú	functoolsr   Ú	itertoolsr   Únumpyr|   r5   Zpaddle.distributedrT   r   r   Zpaddle.autogradr   Zpaddle.base.libpaddler   r   Z5paddle.distributed.auto_parallel.static.process_groupr	   ZJpaddle.distributed.auto_parallel.static.reshard_funcs.nd_mesh_reshard_funcr
   Z-paddle.distributed.auto_parallel.static.utilsr   Z/paddle.distributed.fleet.meta_optimizers.commonr   Z3paddle.distributed.fleet.utils.tensor_fusion_helperr   r   Z$paddle.distributed.passes.pass_utilsr   Zpaddle.frameworkr   rS   Zpaddle.optimizerr   Z	moe_utilsr   Z&static.reshard_funcs.base_reshard_funcr   rK   r   r$   r/   r0   r"   r"   r"   r#   Ú<module>   s4   
