o
    1 i*
  ã                   @   s\   d dl mZ d dlmZ d dlmZ eddG dd„ deƒƒZeddG dd	„ d	eƒƒZd
S )é    )Ú	LLMServer)ÚPDProxyServer)Ú	PublicAPIÚalpha)Z	stabilityc                   @   ó   e Zd ZdZdS )r   a¤  The implementation of the vLLM engine deployment.

    To build a Deployment object you should use `build_llm_deployment` function.
    We also expose a lower level API for more control over the deployment class
    through `serve.deployment` function.

    Examples:
        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.llm import LLMConfig
            from ray.serve.llm.deployment import LLMServer

            # Configure the model
            llm_config = LLMConfig(
                model_loading_config=dict(
                    served_model_name="llama-3.1-8b",
                    model_source="meta-llama/Llama-3.1-8b-instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1,
                        max_replicas=8,
                    )
                ),
            )

            # Build the deployment directly
            serve_options = LLMServer.get_deployment_options(llm_config)
            llm_app = serve.deployment(LLMServer).options(
                **serve_options).bind(llm_config)

            model_handle = serve.run(llm_app)

            # Query the model via `chat` api
            from ray.serve.llm.openai_api_models import ChatCompletionRequest
            request = ChatCompletionRequest(
                model="llama-3.1-8b",
                messages=[
                    {
                        "role": "user",
                        "content": "Hello, world!"
                    }
                ]
            )
            response = ray.get(model_handle.chat(request))
            print(response)
    N©Ú__name__Ú
__module__Ú__qualname__Ú__doc__© r   r   úd/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/ray/serve/llm/deployment.pyr      s    2r   c                   @   r   )r   a£  A proxy server for prefill-decode disaggregation.

    This server acts as a proxy in a prefill-decode disaggregated system.
    For chat and completions, proxy sends the request to the prefill server
    with max_tokens=1 and then sends the returned metadata to the decode server.

    Args:
        prefill_server: The prefill server deployment handle.
        decode_server: The decode server deployment handle.
    Nr   r   r   r   r   r   E   s    r   N)Z.ray.llm._internal.serve.core.server.llm_serverr   ZInternalLLMServerZAray.llm._internal.serve.serving_patterns.prefill_decode.pd_serverr   Z_PDProxyServerZray.util.annotationsr   r   r   r   r   Ú<module>   s    6