o
    )iK/                     @   s  d dl mZmZmZ d dlZd dlmZ ddlmZm	Z	m
Z
mZ ddlmZmZmZmZ ddlmZmZ ejjdd	d
ddejdeej dededeej f
ddZejddejdeej dededeej f
ddZejjdd	d
ddejdeej deej dededeejeej f fddZejddejdeej deej dededeejeej f fddZdd Zdd Zejjdeed d ejd!eej ded"ej j!deej f
d#d$Z"ejjd%d	d
dd&ejd'ejdededejf
d(d)Z#ejd%d&ejd'ejdededejf
d*d+Z$ejjd,d	d
dd&ejd'ejd-ejdededeejejf fd.d/Z%ejd,d&ejd'ejd-ejdededeejejf fd0d1Z&d2d3 Z'd4d5 Z(ejjd%e(e'd d ejd6ejded"ej j!dejf
d7d8Z)dS )9    )CallableListTupleN)_resolve_process_group   )gather_along_first_dimgather_along_first_dim_asyncreduce_scatter_along_first_dim$reduce_scatter_along_first_dim_async)fused_allgather_and_anythingfused_allgather_and_linear fused_anything_and_reducescatterfused_linear_and_reducescatter)tiled_matmultiled_matmul_outz5xformers_python::sequence_parallel_leading_matmul_fwd cuda)Zmutates_argsZdevice_typesscattered_inputweightsfuseprocess_group_namereturnc                 C   sT   t |}|rt| dd |D |d}|S t| |d}t|ggdd |D g\}|S )Nc                 S   s   g | ]}|  qS r   t.0wr   r   _/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/xformers/ops/seqpar.py
<listcomp>*       z8sequence_parallel_leading_matmul_fwd.<locals>.<listcomp>groupprocess_groupc                 S      g | ]}|qS r   r   r   r   r   r   r   2       )r   r   r   r   )r   r   r   r   r#   Zgathered_outputsgathered_inputr   r   r   $sequence_parallel_leading_matmul_fwd   s   r'   c                    s    t |   fdd|D S )Nc                    s*   g | ]} jd    |jd fqS )r   r   )	new_emptyshaper   mp_sizer   r   r   r   ?   s    z=sequence_parallel_leading_matmul_fwd_fake.<locals>.<listcomp>)r   size)r   r   r   r   r   r*   r   )sequence_parallel_leading_matmul_fwd_fake7   s   r-   z5xformers_python::sequence_parallel_leading_matmul_bwdgrad_gathered_outputsc                    sz  t |}| dd |D }|rtt| }dd D fdd|D dttj dtdtg tjj	f dd ffd	d
}t
||g|d dd D  dttj dtdtg tjj	f dd f fdd}t| g||d |fS t| |d\}	}
tdd |D gdd D \\}|
d ur|
  t||d\}}
tdd |D |	gg}|
d ur|
  dd |D |fS )Nc                 S   s.   g | ]}t d d | D r| n|qS )c                 s       | ]}|d kV  qdS r   Nr   r   sr   r   r   	<genexpr>W       zBsequence_parallel_leading_matmul_bwd.<locals>.<listcomp>.<genexpr>)anystridecloner   Zgrad_gor   r   r   r   V   s     z8sequence_parallel_leading_matmul_bwd.<locals>.<listcomp>c                 S      g | ]}t |qS r   )torch
zeros_liker   r   r   r   r   ]       c                    s   g | ]	}|j  d dqS )r   dim)tensor_splitr8   )r+   r   r   r   _   s    grad_gathered_inputsdst_rankstream_factoryr   c                    sf   | \}t j|  t fddD gdd D |ggd W d    d S 1 s,w   Y  d S )Nc                    s   g | ]}|  qS r   r   )r   grad_gosrA   r   r   r   k   r   zNsequence_parallel_leading_matmul_bwd.<locals>.my_si_matmul.<locals>.<listcomp>c                 S      g | ]}|  gqS r   r   r   r   r   r   r   l   r<   out)r:   r   streamr   )r@   rA   rB   Zgrad_gi)grad_gathered_outputssr   rD   r   my_si_matmulc   s   "z:sequence_parallel_leading_matmul_bwd.<locals>.my_si_matmulr    c                 S   s   g | ]}t j qS r   )r:   r   Event)r   _r   r   r   r   y   r<   gathered_inputs_shardsrc_rankc              	      sx   | \}t  D ]0\}}}tj|  |  | ||  | |  W d    n1 s4w   Y  q	d S N)zipr:   r   rH   waitr   addmm_record)rM   rN   rB   Zgi_shardrC   grad_wevent)eventsrI   grad_weightsr   r   my_w_matmul{   s   
z9sequence_parallel_leading_matmul_bwd.<locals>.my_w_matmulr"   c                 S   r$   r   r   r8   r   r   r   r      r%   c                 S   rE   r   r   r   r   r   r   r      r<   c                 S   rE   r   r   r8   r   r   r   r      r<   c                 S   s   g | ]\}|  qS r   r   )r   rT   r   r   r   r      r<   )r   r,   r:   
empty_liker   Tensorintr   r   Streamr   r   r   r   rQ   r
   )r   r   r.   r   r   r#   grad_scattered_inputrJ   rX   r&   handlegrad_gathered_inputZgrad_weights_tuplesr   )rV   rI   rW   r+   r   r   $sequence_parallel_leading_matmul_bwdE   sz   

	

r`   c                 C   s   t | dd |D fS )Nc                 S   r9   r   r:   rY   r   r   r   r   r      r<   z=sequence_parallel_leading_matmul_bwd_fake.<locals>.<listcomp>ra   )r   r   r.   r   r   r   r   r   )sequence_parallel_leading_matmul_bwd_fake   s   rb   c                 C   s.   |\}}}}| j |g|R   || _|| _d S rO   Zsave_for_backwardr   r   )ctxinputsoutputr   r   r   r   r   r   r   .sequence_parallel_leading_matmul_setup_context   s   
rg   c                 C   s6   | j ^}}t|t|t|| j| j\}}||d d fS rO   )saved_tensorsr`   listr   r   )rd   r.   r   r   r]   rW   r   r   r   +sequence_parallel_leading_matmul_bwd_bridge   s   
rj   )Zsetup_contextxwsr#   c                   s0   t  dd|||j} fddt||D S )Nr   c                    s6   g | ]\}}|j d g jdd  |jd R  qS )r   )viewr)   )r   or   rk   r   r   r      s   6 z4sequence_parallel_leading_matmul.<locals>.<listcomp>)r'   flatten
group_namerP   )rk   rl   r   r#   osr   rq   r    sequence_parallel_leading_matmul   s   ru   z6xformers_python::sequence_parallel_trailing_matmul_fwdr&   weightc                 C   s>   t |}|rt| | |d}|S t| |}t||d}|S )Nr    r"   )r   r   r   r:   matmulr	   )r&   rv   r   r   r#   Zscattered_outputZgathered_outputr   r   r   %sequence_parallel_trailing_matmul_fwd   s   
rx   c                 C   s*   t | }| | jd | |jd fS )Nr   r   )r   r,   r(   r)   )r&   rv   r   r   r+   r   r   r   *sequence_parallel_trailing_matmul_fwd_fake   s   ry   z6xformers_python::sequence_parallel_trailing_matmul_bwdgrad_scattered_outputc           
         s   t |}| }tdd | D r| }|rXt| }t| j|dd |j|dddt	tj
 dtdtg tjjf dd f fd	d
}t|g||d |fS t||d}	t|	 }t|	 |  |fS )Nc                 s   r/   r0   r   r1   r   r   r   r3     r4   z8sequence_parallel_trailing_matmul_bwd.<locals>.<genexpr>r   r=   grad_gathered_outputs_shardrN   rB   r   c                    s   | \}t j|  t j| | d W d    n1 s!w   Y  t j|   |  |  W d    d S 1 sEw   Y  d S )NrF   )r:   r   rH   rw   r   rR   )r{   rN   rB   Zgrad_go_shardZgathered_inputsr@   grad_weightrv   r   r   my_gi_and_w_matmul  s   "zAsequence_parallel_trailing_matmul_bwd.<locals>.my_gi_and_w_matmulr    r"   )r   r,   r5   r6   r7   r:   rY   r;   r?   r   rZ   r[   r   r   r\   r   r   rw   r   )
r&   rv   rz   r   r   r#   r+   r_   r~   Zgrad_gathered_outputr   r|   r   %sequence_parallel_trailing_matmul_bwd  s<   

r   c                 C   s   t | t |fS rO   ra   )r&   rv   rz   r   r   r   r   r   *sequence_parallel_trailing_matmul_bwd_fake;  s   r   c                 C   s(   |\}}}}|  || || _|| _d S rO   rc   )rd   re   rf   r&   rv   r   r   r   r   r   /sequence_parallel_trailing_matmul_setup_contextF  s   
r   c                 C   s.   | j \}}t|||| j| j\}}||d d fS rO   )rh   r   r   r   )rd   rz   r&   rv   r_   r}   r   r   r   ,sequence_parallel_trailing_matmul_bwd_bridgeM  s   
r   r   c                C   s>   t | dd|||j}|jdg| jdd |jd R  S )Nr   rm   rn   r   )rx   rr   rs   ro   r)   )rk   r   r   r#   rp   r   r   r   !sequence_parallel_trailing_matmulc  s   &r   )*typingr   r   r   r:   Z"torch.distributed.distributed_c10dr   Zdifferentiable_collectivesr   r   r	   r
   Zsequence_parallel_fused_opsr   r   r   r   r   r   ZlibraryZ	custom_oprZ   boolstrr'   Zregister_faker-   r`   rb   rg   rj   Zregister_autograddistributedZProcessGroupru   rx   ry   r   r   r   r   r   r   r   r   r   <module>   sD  

_



2
