o
    )i                     @   s   d dl mZmZmZmZ d dlZd dlmZ d dlm	Z	m
Z
 d dlmZ d dlmZmZ d dlmZ dd	lmZmZmZ erOd d
lmZmZmZ d dlmZ ndZdZdZdZeeZG dd deZzd dlmZ  e ZW dS  e!y}   e"d Y dS w )    )TYPE_CHECKINGOptionalUnioncastN)device)ProcessorInputs
PromptType)init_logger)SamplingParamsSamplingType)DEFAULT_MAX_NUM_BATCHED_TOKENS   )PlatformPlatformEnum_Backend)	BlockSizeModelConfig
VllmConfig)PoolingParamsc                   @   s  e Zd ZU ejZdZeed< dZ	eed< dZ
eed< dZeed< dZeed	< d
Zeed< dZeed< g dZee ed< ddgZee ed< edededejdee dedededefddZedejddfdd ZedGd"edefd#d$ZedGd"edefd%d&Zed'ee defd(d)Zedefd*d+Zedejde e!e!f fd,d-Z"ed.d/ Z#edefd0d1Z$ed2d3 Z%ed4e&ddfd5d6Z'ed7d8 Z(edefd9d:Z)edefd;d<Z*ed=e+defd>d?Z,ed@e-dAe.e/e0f dBe1ddfdCdDZ2ededefdEdFZ3dS )HTpuPlatformtpudevice_namedevice_typeZXLAdispatch_keyTPUray_device_keyZgloodist_backendZTPU_VISIBLE_CHIPSdevice_control_env_varopenxlasimple_compile_backend)Zfp8Ztpu_int8zcompressed-tensorssupported_quantizationZTPU_CHIPS_PER_HOST_BOUNDSZTPU_HOST_BOUNDSadditional_env_varsselected_backend	head_sizedtypekv_cache_dtype
block_sizeuse_v1use_mlareturnc	           	      C   s:   |t jkr|t jkrtd| |stdtd dS )NzCannot use %s backend on TPU.zTPU backend only supports V1.zUsing Pallas V1 backend.z8vllm.v1.attention.backends.pallas.PallasAttentionBackend)r   ZPALLASZPALLAS_VLLM_V1loggerinfo
ValueError)	clsr"   r#   r$   r%   r&   r'   r(   Zhas_sink r.   ^/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/vllm/platforms/tpu.pyget_attn_backend_cls.   s   


z TpuPlatform.get_attn_backend_clsr   Nc                 C   s   t j| dS )z:
        Set the device for the current platform.
        N)torchr   
set_device)r-   r   r.   r.   r/   r2   <   s   zTpuPlatform.set_devicer   	device_idc                 C   s   t  \}}d|j S )NzTPU )r   Zget_local_chipsname)r-   r3   Z	chip_type_r.   r.   r/   get_device_nameC   s   zTpuPlatform.get_device_namec                 C   s   t N)NotImplementedError)r-   r3   r.   r.   r/   get_device_total_memoryH      z#TpuPlatform.get_device_total_memoryenforce_eagerc                 C      dS NFr.   )r-   r;   r.   r.   r/   is_async_output_supportedL   r:   z%TpuPlatform.is_async_output_supportedc                 C   r<   )Nz4vllm.lora.punica_wrapper.punica_tpu.PunicaWrapperTPUr.   r-   r.   r.   r/   get_punica_wrapperP   r:   zTpuPlatform.get_punica_wrapperc                 C   s   t |jt |jfS r7   )r1   Zfinfominmax)r-   r$   r.   r.   r/   get_infinity_valuesT   s   zTpuPlatform.get_infinity_valuesc                 C   r<   r=   r.   r?   r.   r.   r/   can_update_inplaceX   r:   zTpuPlatform.can_update_inplacec                 C   r<   )Nr   r.   r?   r.   r.   r/   get_lora_vocab_padding_size\   r:   z'TpuPlatform.get_lora_vocab_padding_sizec                 C   s   t  S r7   )r1   Zno_gradr?   r.   r.   r/   inference_mode`   s   zTpuPlatform.inference_modevllm_configc           
      C   sx  ddl m}m} |j}|r|jd u rttd|_|j}|j|j	kr*t
d |j	|_|jd u s7|j |jkr@t
d |j|_|jdkrHd|_|jd u sQJ d|j}|d url|jtjtjfv rlt
d	|j tj|_dd
lm} |||_|j}|j}	|jdkrd|_|jrJ d|	jr|	jst
d d|	_|r|j rt
d d|j_!d|j_"t#|jj$t%|j_&d S d S d S )Nr   )CompilationLevelCUDAGraphMode   zE[TPU] Forcing DYNAMO_ONCE compilation level, and disabling cudagraph.z?[TPU] CUDA graph is not supported on TPU, disabling cudagraphs. r   z)TPU does not support speculative decodingzFThe TPU backend currently does not support %s. Using bfloat16 instead.)PallasAttentionBackendautoz#vllm.v1.worker.tpu_worker.TPUWorkerz9Speculative decoding is not yet supported for TPU backendzTPU does not support running Multimodal models without setting `--disable_chunked_mm_input`. Forcing --disable_chunked_mm_input.Tz`MLA is enabled on a non-GPU platform; forcing chunked prefill and prefix caching to be disabled.F)'vllm.configrH   rI   cache_configr&   r   r   compilation_configlevelZDYNAMO_ONCEr*   r+   Zcudagraph_modeZmax_cudagraph_modeNONEbackendZspeculative_configmodel_configr$   r1   Zfloat16Zfloat32warningZbfloat16Z!vllm.v1.attention.backends.pallasrL   Zget_page_sizeparallel_configscheduler_configZ
worker_clsZis_multimodal_modelZdisable_chunked_mm_inputr(   Zenable_chunked_prefillZchunked_prefill_enabledrB   Zmax_model_lenr   Zmax_num_batched_tokens)
r-   rG   rH   rI   rO   rP   rT   rL   rV   rW   r.   r.   r/   check_and_update_configd   sn   






z#TpuPlatform.check_and_update_configc                 C   s   t d dS )Nz#Pin memory is not supported on TPU.F)r*   rU   r?   r.   r.   r/   is_pin_memory_available   s   
z#TpuPlatform.is_pin_memory_availablec                 C   r<   )NzFvllm.distributed.device_communicators.tpu_communicator.TpuCommunicatorr.   r?   r.   r.   r/   get_device_communicator_cls   r:   z'TpuPlatform.get_device_communicator_clsc                 C   r<   NTr.   r?   r.   r.   r/   use_all_gather   r:   zTpuPlatform.use_all_gatherrT   c                 C   r<   r[   r.   )r-   rT   r.   r.   r/   supports_v1   s   zTpuPlatform.supports_v1promptparamsprocessed_inputsc                 C   s&   t |tr|jtjkrtddS dS )z6Raises if this request is unsupported on this platformz,Torch XLA does not support per-request seed.N)
isinstancer
   Zsampling_typer   ZRANDOM_SEEDr,   )r-   r^   r_   r`   r.   r.   r/   validate_request   s
   
zTpuPlatform.validate_requestc                 C   r<   r[   r.   )r-   r%   r.   r.   r/   is_kv_cache_dtype_supported   r:   z'TpuPlatform.is_kv_cache_dtype_supported)r   )4__name__
__module____qualname__r   r   Z_enumr   str__annotations__r   r   r   r   r   r   r    listr!   classmethodr   intr1   r$   r   boolr0   r   r2   r6   r9   r>   r@   tuplefloatrC   rD   rE   rF   r   rX   rY   rZ   r\   r   r]   r   r   r
   r   r   rb   rc   r.   r.   r.   r/   r      s   
 


A

r   )r   z/tpu_commons not found, using vLLM's TpuPlatform)#typingr   r   r   r   r1   Ztpu_infor   Zvllm.inputsr   r   Zvllm.loggerr	   Zvllm.sampling_paramsr
   r   Z
vllm.utilsr   Z	interfacer   r   r   rN   r   r   r   Zvllm.pooling_paramsr   rd   r*   r   Ztpu_commons.platformsZTpuCommonsPlatformImportErrorr+   r.   r.   r.   r/   <module>   s2    .

