o
    )itb                     @   s  d dl Z d dlZd dlmZ d dlmZmZ d dlmZm	Z	m
Z
mZmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZmZ er_d dlm Z  ne!Z ee"Z#G dd dZ$G dd de j%Z&eeG dd dZ'eeG dd dZ(dS )    N)Counter)asdictfield)TYPE_CHECKINGAnyCallableClassVarOptionalUnion)TypeAdapterfield_validator)	dataclass)CallableInductorPassInductorPass)config)init_logger)is_torch_equal_or_newerresolve_obj_by_qualname)
VllmConfigc                   @   s   e Zd ZdZdZdZdZdS )CompilationLevelr            N)__name__
__module____qualname__NO_COMPILATIONDYNAMO_AS_ISDYNAMO_ONCE	PIECEWISE r    r    c/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/config/compilation.pyr      s
    r   c                   @   st   e Zd ZdZdZdZdZeefZeefZdddZ	ddd	Z
defd
dZdddZdefddZdefddZdS )CUDAGraphModez Constants for the cudagraph mode in CompilationConfig.
    Meanwhile, the subset enum `NONE`, `PIECEWISE` and `FULL` are also
    treated as concrete runtime mode for cudagraph runtime dispatching.
    r   r   r   returnc                 C      |   rt| jd S | S )Nr   separate_routiner"   valueselfr    r    r!   decode_mode.      zCUDAGraphMode.decode_modec                 C   r$   )Nr   r%   r(   r    r    r!   
mixed_mode2   r+   zCUDAGraphMode.mixed_modec                 C   s   |   tjkp|  tjkS N)r*   r"   r   r,   r(   r    r    r!   requires_piecewise_compilation6   s   z,CUDAGraphMode.requires_piecewise_compilationc                 C   s   |   rtt| jS | S r-   )r&   r"   maxr'   r(   r    r    r!   max_cudagraph_mode:   s   z CUDAGraphMode.max_cudagraph_modec                 C   s   |   tjkS r-   )r0   r"   FULLr(   r    r    r!   has_full_cudagraphs>   s   z!CUDAGraphMode.has_full_cudagraphsc                 C   s   t | jtS r-   )
isinstancer'   tupler(   r    r    r!   r&   A   s   zCUDAGraphMode.separate_routineN)r#   r"   )r   r   r   __doc__NONEr   r1   ZFULL_DECODE_ONLYZFULL_AND_PIECEWISEr*   r,   boolr.   r0   r2   r&   r    r    r    r!   r"   #   s    


r"   c                   @   s   e Zd ZU dZedd dZeed< 	 dZeed< 	 edd dZ	eed	< 	 dZ
eed
< 	 dZeed< 	 dZeed< 	 dZeed< 	 dd ZdddZdS )
PassConfigzConfiguration for custom Inductor passes.

    This is separate from general `CompilationConfig` so that inductor passes
    don't all have access to full configuration - that would create a cycle as
    the `PassManager` is set as a property of config.c                   C      t j S r-   envsZVLLM_USE_V1r    r    r    r!   <lambda>N       zPassConfig.<lambda>default_factoryenable_fusionFenable_attn_fusionc                   C   r9   r-   r:   r    r    r    r!   r<   R   r=   enable_noopenable_sequence_parallelismenable_async_tpenable_fi_allreduce_fusioni @  !fi_allreduce_fusion_max_token_numc                 C   s   t t| S )z
        Produces a hash unique to the pass configuration.
        Any new fields that affect compilation should be added to the hash.
        Any future fields that don't affect compilation should be excluded.
        )r   Z	hash_dictr   r(   r    r    r!   uuid_   s   zPassConfig.uuidr#   Nc                 C   s2   | j s| jrtd | jrtd d S d S d S )NzdFusion enabled but reshape elimination disabled. RMSNorm/SiluMul + quant (fp8) fusion might not workz^Fusion enabled but reshape elimination disabled. Attention + quant (fp8) fusion might not work)rB   r@   loggerwarning_oncerA   r(   r    r    r!   __post_init__g   s   zPassConfig.__post_init__r#   N)r   r   r   r5   r   r@   r7   __annotations__rA   rB   rC   rD   rE   rF   intrG   rJ   r    r    r    r!   r8   E   s$   
 r8   c                   @   s  e Zd ZU dZdZee ed< 	 dZe	ed< 	 dZ
e	ed< 	 dZe	ed< 	 eedZee	 ed	< 	 dZeee	  ed
< 	 dZeed< 	 dZeeeee	f   ed< 	 eedZeed< 	 eedZee	e	f ed< 	 dZee ed< 	 dZeed< 	 dZeed< 	 dZeee  ed< 	 dZeed< 	 dZee ed< 	 eedZeed< 	 edddZ eed< 	 edddZ!e	ed< 	 edddZ"ee ed< 	 ee#ddZ$e#e	 ed< 	 ee#ddZ%e#e	 ed< 	 ee&ddZ'e&e	 ed < 	 ed!ddZ(e)ed"< 	 eeddZ*ee	e+f ed#< 	 g d$Z,e-ee	  ed%< d&e	fd'd(Z.d&e	fd)d*Z/e/Z0e1dd+d,e2d-e+d&e+fd.d/Z3d<d0d1Z4d2d3d&ee	e5f fd4d5Z6dee d&dfd6d7Z7d8d9 Z8d&efd:d;Z9dS )=CompilationConfiga)  Configuration for compilation. It has three parts:

    - Top-level Compilation control:
        - [`level`][vllm.config.CompilationConfig.level]
        - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path]
        - [`cache_dir`][vllm.config.CompilationConfig.cache_dir]
        - [`backend`][vllm.config.CompilationConfig.backend]
        - [`custom_ops`][vllm.config.CompilationConfig.custom_ops]
        - [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops]
    - CudaGraph capture:
        - [`use_cudagraph`][vllm.config.CompilationConfig.use_cudagraph]
        - [`cudagraph_mode`][vllm.config.CompilationConfig.cudagraph_mode]
        - [`cudagraph_capture_sizes`]
        [vllm.config.CompilationConfig.cudagraph_capture_sizes]
        - [`cudagraph_num_of_warmups`]
        [vllm.config.CompilationConfig.cudagraph_num_of_warmups]
        - [`cudagraph_copy_inputs`]
        [vllm.config.CompilationConfig.cudagraph_copy_inputs]
        - [`full_cuda_graph`][vllm.config.CompilationConfig.full_cuda_graph]
    - Inductor compilation:
        - [`use_inductor`][vllm.config.CompilationConfig.use_inductor]
        - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
        - [`inductor_compile_config`]
        [vllm.config.CompilationConfig.inductor_compile_config]
        - [`inductor_passes`][vllm.config.CompilationConfig.inductor_passes]
        - custom inductor passes

    Why we have different sizes for cudagraph and inductor:
    - cudagraph: a cudagraph captured for a specific size can only be used
        for the same size. We need to capture all the sizes we want to use.
    - inductor: a graph compiled by inductor for a general shape can be used
        for different sizes. Inductor can also compile for specific sizes,
        where it can have more information to optimize the graph with fully
        static shapes. However, we find the general shape compilation is
        sufficient for most cases. It might be beneficial to compile for
        certain small batchsizes, where inductor is good at optimizing.
    Nlevel debug_dump_path	cache_dirbackendr>   
custom_opssplitting_opsTuse_inductorcompile_sizesinductor_compile_configinductor_passescudagraph_modeuse_cudagraphr   cudagraph_num_of_warmupscudagraph_capture_sizesFcudagraph_copy_inputsfull_cuda_graphpass_config)defaultinitmax_capture_sizelocal_cache_dirbs_to_padded_graph_size)r?   rb   enabled_custom_opsdisabled_custom_opstraced_filesg        compilation_timestatic_forward_context)zvllm.unified_attentionz"vllm.unified_attention_with_outputzvllm.mamba_mixer2_attention_opsr#   c                 C   s~   g }| | j | | j | | j | | j | | j | | j | | j | | j	  t
t|  S )a  
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        )appendrO   rS   rT   rU   rV   rX   rY   r`   rG   hashlibsha256strencode	hexdigest)r)   Zfactorsr    r    r!   compute_hashU  s   zCompilationConfig.compute_hashc                 C   st   ddddddddid}i }t t  D ]\}}t| j||kr%d||< q|r,||d< tttj| |dd	 S )NTZpost_grad_custom_post_pass)rj   rf   rg   ri   re   rh   rX   r`   )excludeZexclude_unset)
varsr8   itemsgetattrr`   ro   r   rN   Z	dump_jsondecode)r)   rs   Zpass_config_excludeattrZdefault_valr    r    r!   __repr__l  s0   zCompilationConfig.__repr__before)moder'   c                 C   s   t |trt|  S |S )zI
        enable parse the `cudagraph_mode` enum type from string
        )r3   ro   r"   upper)clsr'   r    r    r!   validate_cudagraph_mode_before  s   
z0CompilationConfig.validate_cudagraph_mode_beforec           
      C   sh  | j d}| j d}|| dksJ dtdr&d}|| jvr&d| j|< | j D ]K\}}t|tsOt|s@J d| d	t|t	rG|nt
|| j|< q+|d
}d
|d d }|d }t|j| }	t|	t	ro|	nt
|	| j|< q+t| jtrtdi | j| _| jstd | jd urtdtj| _| jrtd | jd urtdtj| _d S d S )Nnoneallr   z Can only specify 'none' or 'all'z2.6Zenable_auto_functionalized_v2Fzpass z' should be callable or a qualified name.z=use_cudagraph is deprecated, use cudagraph_mode=NONE instead.zquse_cudagraph and cudagraph_mode are mutually exclusive, prefer cudagraph_mode since use_cudagraph is deprecated.z?full_cuda_graph is deprecated, use cudagraph_mode=FULL instead.zufull_cuda_graph and cudagraph_mode are mutually exclusive, prefer cudagraph_mode since full_cuda_graph is deprecated.r    )rT   countr   rX   rY   ru   r3   ro   callabler   r   splitjoin
__import____dict__r`   dictr8   r[   rH   warningrZ   
ValueErrorr"   r6   r_   r1   )
r)   Z
count_noneZ	count_allZKEYkvnamesmodule	func_namefuncr    r    r!   rJ     sT   











zCompilationConfig.__post_init__vllm_configr   c                 C   s   | j tjkr
tdddlm} |t d}| j tjtjfv r3| j	dkr&dS | j	|v r.| j	S t
| j	S | j tjks;J ddlm} ||S )NzNo compilation level is set.r   )list_backends)exclude_tagsrP   eager)VllmBackend)rO   r   r   r   Ztorch._dynamo.backends.registryr   r4   r   r   rS   r   r   Zvllm.compilation.backendsr   )r)   r   r   Ztorch_backendsr   r    r    r!   init_backend  s   


zCompilationConfig.init_backendc                 C   s`  | j du r	|| _ ntt| j }t|t| j k r td|| || _ g }| jdur[tt| j| _| jD ]%}t|trN|dksGJ d| |	| j  q5t|t
sUJ || q5|| _| j jdd | j rm| j d nd| _dd	 t| jd
 D | _t| j | j d
d dg D ]\}}t||D ]}||kr|| j|< q|| j|< qq| j| j| j< dS )zVTo complete the initialization of config,
        we need to know the cudagraph sizes.NzGcudagraph sizes specified by model runner %s is overridden by config %sr]   zOUnrecognized size type in compile_sizes, expect 'cudagraph_capture_sizes', got T)reverser   c                 S   s   g | ]}d qS )r   r    ).0ir    r    r!   
<listcomp>  s    z?CompilationConfig.init_with_cudagraph_sizes.<locals>.<listcomp>r   )r]   listsetlenrH   inforW   r3   ro   extendrM   rl   sortrc   rangere   zip)r)   r]   Zdedup_sizesZcomputed_compile_sizesxendstartbsr    r    r!   init_with_cudagraph_sizes  sX   




z+CompilationConfig.init_with_cudagraph_sizesc                 C   sn   | j tjks
J d| jd u r| j| _d S t| jdkr5td | jt	jkr0td t	j
| _g | _d S d S )NzWset_splitting_ops_for_v1 should only be called when level is CompilationLevel.PIECEWISEr   z5Using piecewise compilation with empty splitting_ops.a  When compilation level is piecewise with empty splitting_ops, PIECEWISE cudagraph_mode will be treated as FULL cudagraph_mode. Please ensure you are using attention backends that support cudagraph or set cudagraph_mode to NONE explicitly if encountering any problems.)rO   r   r   rU   rk   r   rH   rI   rZ   r"   r1   r(   r    r    r!   set_splitting_ops_for_v1  s   


z*CompilationConfig.set_splitting_ops_for_v1c                    s"    j d uot fdd jD S )Nc                 3   s    | ]}| j v V  qd S r-   )rU   )r   opr(   r    r!   	<genexpr>3  s    

zDCompilationConfig.splitting_ops_contain_attention.<locals>.<genexpr>)rU   r   rk   r(   r    r(   r!   splitting_ops_contain_attention2  s   z1CompilationConfig.splitting_ops_contain_attentionrK   ):r   r   r   r5   rO   r	   rM   rL   rQ   ro   rR   rS   r   r   rT   rU   rV   r7   rW   r
   r   rX   rY   rZ   r"   r[   r\   r]   r^   r_   r8   r`   rc   rd   re   r   rf   rg   r   rh   ri   floatrj   r   rk   r   rr   ry   __str__r   classmethodr~   rJ   r   r   r   r   r   r    r    r    r!   rN   s   s   
 &	#

8
2rN   ))enumrm   collectionsr   dataclassesr   r   typingr   r   r   r   r	   r
   Zpydanticr   r   Zpydantic.dataclassesr   Z	vllm.envsr;   Zvllm.compilation.inductor_passr   r   Zvllm.config.utilsr   Zvllm.loggerr   Z
vllm.utilsr   r   Zvllm.configr   objectr   rH   r   Enumr"   r8   rN   r    r    r    r!   <module>   s0    ",