o
    )i"                     @   s   d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZ d dlZG dd dZdded	ed
efddZded
efddZdejjjd
ejjjfddZeG dd dZdS )    N)defaultdict)	dataclass)AnycastDictListOptionalSequencec                   @   s$   e Zd ZdejjjddfddZdS )FakeKinetoEventereturnNc                 C   s6   t |D ]}|drqt| |t|| q|| _d S )N_)dir
startswithsetattrgetattrZ_kineto_event)selfr   attr r   n/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/xformers/profiler/profile_analyzer.py__init__   s
   

zFakeKinetoEvent.__init__)__name__
__module____qualname__torch_C	_autograd_KinetoEventr   r   r   r   r   r
      s    r
   BHMKcausalfmtr   c                 C   s  t |tsJ |dv sJ |dkrdd | |fD \} }| ^ }}}|^ }}}|rjd| t|| d | dt|| d t|| d |  }	|	dt|| t|| | dt|| t|| |  d 7 }	nd| | | d| | |  }	|D ]}
|	|
9 }	q|t|	S )N)BMHKr   r!   c                 S   s(   g | ]}|d  |d |d |d gqS )r            r   ).0xr   r   r   
<listcomp>   s   ( z$_attention_flops.<locals>.<listcomp>r"   r   )
isinstanceboolmaxminint)Zqueriesvaluesr   r    BNKZNvZKvflopsbr   r   r   _attention_flops   s    >6 
r3   	arg_namesc                 G   sB   t | jjjD ]\}}|j|v r|  S qtd| d| jj )NzNo such argument z
 found in )	enumeratedefaultZ_schema	argumentsname
ValueError)opr4   iargr   r   r   _get_arg_idx,   s
   
r=   r   c                    s  |   jdkr	| S |  }d tdd}dd tjjddi ftjjd	d|ftjjd
d|ftjjdd|ftjjddi ftjjddi ftjjdd|ftjjdd|ftjjdd|ftjjddi ff
D }|| v r|| \}}}| 	 }| 
 }z
|t|dd }	W n ty   |t|d dk}	Y nw t|t|d |t|d |	fi | |r d d   durt| }
 fdd|
_ttjjj|
} | S )z
    Adds a flops amount for operators that don't have this information in Kineto already
    This mostly applies for the attention for now, as GEMMs are already calculated by Kineto
    and other operations are negligible.
    CPUNr!   )r    c                 S   s<   i | ]\}}}}t ||rt||j t||||fqS r   )hasattrr   r6   r8   )r%   libr:   is_bwdkwargsr   r   r   
<dictcomp>A   s    
z&_replace_if_needed.<locals>.<dictcomp>Zscaled_dot_product_attentionFZ	flash_fwdZ#efficient_attention_forward_cutlassZ_efficient_attention_forwardZ,_scaled_dot_product_flash_attention_backwardTZ0_scaled_dot_product_efficient_attention_backwardZ	flash_bwdZ$efficient_attention_backward_cutlassZ_efficient_attention_backwardZ,_scaled_dot_product_cudnn_attention_backwardr   	is_causalZcustom_mask_typer   queryvalue   r"   c                      s    S Nr   r   r1   r   r   <lambda>r   s    z$_replace_if_needed.<locals>.<lambda>)device_typer8   dictr   opsZatenZxformers_flashZxformerskeysshapesconcrete_inputsr=   r9   r3   r
   r1   r   r   r   r   )r   Zop_nameZFMT_BMHKZATTN_OPSr:   rA   rB   rO   rP   rD   Znew_er   rI   r   _replace_if_needed3   sj   
rQ   c                	   @   s   e Zd ZU eejef ed< eejef ed< eed< 	ddejdededefd	d
Z	deejef defddZ
deejef defddZedeejjj deejjj fddZedeejjj dd fddZdS )AnalyzedTraceoperations_per_dtype_fwoperations_per_dtype_bwtotal_time_sTdtypefwbwr   c                 C   s4   d}|r|| j |d7 }|r|| j|d7 }|S N        )rS   getrT   )r   rV   rW   rX   rM   r   r   r   compute_num_ops}   s   zAnalyzedTrace.compute_num_opshardware_flopsc                 C   s2   d}|  D ]\}}|| || 7 }q|| j S rY   )itemsr\   rU   r   r]   Zhfu_secondsrV   Zhw_flopsr   r   r   compute_hfu   s   
zAnalyzedTrace.compute_hfuc                 C   sF   d}|  D ]\}}|td| j|dd | || 7 }q|| j S )NrZ   r$   F)rX   )r^   r+   r\   rU   r_   r   r   r   compute_mfu   s   
zAnalyzedTrace.compute_mfu
all_eventsc                 C   s   dd | D }t t}|D ]}|| | f | qg }| D ](}|jdd d d }|D ]}|d u sC| | |  krJ|}|| q1q#|S )Nc                 S   s:   g | ]}|  jd kr| s| r| dkr|qS )r>   r   )rK   r8   dtypesrO   r1   r%   r   r   r   r   r'      s    zBAnalyzedTrace._find_all_root_events_with_flops.<locals>.<listcomp>c                 S   s   |   |   fS rH   )start_nsduration_ns)r   r   r   r   rJ      s    z@AnalyzedTrace._find_all_root_events_with_flops.<locals>.<lambda>)key)	r   liststart_thread_idrK   appendr-   sortre   rf   )rb   Zall_ops_with_flopsZevents_per_groupr   Zroot_eventseventsZcurrent_rootr   r   r    _find_all_root_events_with_flops   s*   

z.AnalyzedTrace._find_all_root_events_with_flopsrl   c                 C   s2  dd | D } t | }tt}tt}dd | D }dtjfdtjfdtjfdtjfd	tjfg}tj	d
}}|D ]5}d }	|D ]\}
}|
|
 v rM|}	 nq?|	d u rSq9| |v rd||	  | 7  < q9||	  | 7  < q9| D ]}| jdkr{qqt|| }t|| |  }qqt |||| d dS )Nc                 S   s   g | ]}t |qS r   )rQ   rd   r   r   r   r'      s    z.AnalyzedTrace.from_profile.<locals>.<listcomp>c                 S   s    h | ]}|  d kr| qS )r   )Zfwd_thread_idri   rd   r   r   r   	<setcomp>   s     z-AnalyzedTrace.from_profile.<locals>.<setcomp>doublefloatz	c10::Halfzc10::BFloat16z	c10::Int8r   CUDAi ʚ;)rS   rT   rU   )rR   rm   r   rp   r   Zfloat64Zfloat16Zint8mathinfrc   ri   r1   rK   r8   r+   re   r*   rf   )rl   Zroot_opsrS   rT   Zall_bw_threadsZATEN_DTYPESZbegin_nsZend_nsr:   rV   Z
aten_dtypeZtorch_dtyper   r   r   from_profile   sD   
	
zAnalyzedTrace.from_profileN)TT)r   r   r   r   r   rV   rp   __annotations__r)   r\   r`   ra   staticmethodr	   r   r   r   rm   rt   r   r   r   r   rR   w   s8   
 

#rR   )r   )rr   collectionsr   dataclassesr   typingr   r   r   r   r   r	   r   r
   r)   strr,   r3   r=   r   r   r   rQ   rR   r   r   r   r   <module>   s    	
D