o
    ưia/                     @  s   d Z ddlmZ ddlZddlmZmZ ddlmZm	Z	m
Z
mZmZmZmZ ddlZddlmZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZm Z  erbddlm!Z! G dd deZ"dS )z
Base RAG Ingestion class.

Provides abstract methods for:
- OCR
- Chunking
- Embedding
- Vector Store operations

Providers can inherit and override methods as needed.
    )annotationsN)ABCabstractmethod)TYPE_CHECKINGAnyDictListOptionalTuplecast)verbose_logger)uuid4)DEFAULT_CHUNK_OVERLAPDEFAULT_CHUNK_SIZE)get_async_httpx_clienthttpxSpecialProvider)extract_text_from_pdf)RecursiveCharacterTextSplitter)RAGIngestOptionsRAGIngestResponse)Routerc                   @  s   e Zd ZdZ	d/d0ddZd1ddZed2ddZ			d3d4ddZd5ddZ	d6d!d"Z
d7d%d&Zed8d*d+Z			d3d9d-d.ZdS ):BaseRAGIngestiona  
    Base class for RAG ingestion.

    Providers should inherit from this class and override methods as needed.
    For example, OpenAI handles embedding internally when attaching files to
    vector stores, so it overrides the embedding step to be a no-op.
    Ningest_optionsr   routerOptional['Router']c                 C  s   || _ || _dt  | _|d| _tttt	f |dp!ddi| _
|d| _tttt	f |dp6i | _|d| _|   d S )	NZingest_ocrchunking_strategytypeauto	embeddingZvector_storename)r   r   r   	ingest_idget
ocr_configr   r   strr   r   embedding_configvector_store_configZingest_name_load_credentials_from_config)selfr   r    r)   [/home/app/Keep/.python/lib/python3.10/site-packages/litellm/rag/ingestion/base_ingestion.py__init__,   s   
zBaseRAGIngestion.__init__returnNonec                 C  s^   ddl m} | jd}|r)tjr+||}| D ]\}}|| jvr(|| j|< qdS dS dS )z
        Load credentials from litellm_credential_name if provided in vector_store config.

        This allows users to specify a credential name in the vector_store config
        which will be resolved from litellm.credential_list.
        r   )CredentialAccessorZlitellm_credential_nameN)Z.litellm.litellm_core_utils.credential_accessorr.   r&   r"   litellmZcredential_listZget_credential_valuesitems)r(   r.   Zcredential_nameZcredential_valueskeyvaluer)   r)   r*   r'   D   s   



z.BaseRAGIngestion._load_credentials_from_configr$   c                 C  s   | j ddS )zGet the vector store provider.custom_llm_providerZopenai)r&   r"   )r(   r)   r)   r*   r3   U   s   z$BaseRAGIngestion.custom_llm_provider	file_data Optional[Tuple[str, bytes, str]]file_urlOptional[str]file_idCTuple[Optional[str], Optional[bytes], Optional[str], Optional[str]]c           	        s   |r|\}}}|||dfS |r;t tjd}||I dH }|  |j}|dd p-d}|jdd}|||dfS |rCddd|fS td)	aG  
        Upload / prepare file for ingestion.

        Args:
            file_data: Tuple of (filename, content_bytes, content_type)
            file_url: URL to fetch file from
            file_id: Existing file ID to use

        Returns:
            Tuple of (filename, file_content, content_type, existing_file_id)
        N)Zllm_provider/documentzcontent-typezapplication/octet-streamz,Must provide file_data, file_url, or file_id)	r   r   ZRAGr"   raise_for_statuscontentsplitheaders
ValueError)	r(   r4   r6   r8   filenamefile_contentcontent_typehttp_clientresponser)   r)   r*   uploadZ   s   
zBaseRAGIngestion.uploadrC   Optional[bytes]rD   c           	        s   | j r|sdS | j dd}|rd|v rd\}}nd\}}t|d}d| d	| }| jdurC| jj|d
|||idI dH }ntj|d
|||idI dH }t|drd|j	rdd
dd |j	D S dS )z
        Perform OCR on file content to extract text.

        Args:
            file_content: Raw file bytes
            content_type: MIME type of the file

        Returns:
            Extracted text or None if OCR not configured/needed
        Nmodelzmistral/mistral-ocr-latestimage)	image_urlrK   )document_urlrL   utf-8zdata:z;base64,r   )rI   r<   pagesz

c                 s  s     | ]}t |d r|jV  qdS )markdownN)hasattrrO   ).0pager)   r)   r*   	<genexpr>   s    

z'BaseRAGIngestion.ocr.<locals>.<genexpr>)r#   r"   base64	b64encodedecoder   Zaocrr/   rP   rN   join)	r(   rC   rD   Z	ocr_modelZdoc_typeZurl_keyZb64_contentZdata_urlZocr_responser)   r)   r*   r   }   s.   





zBaseRAGIngestion.ocrtextocr_was_usedbool	List[str]c                 C  s   d}|r|}n8|r?|s?z| d}W n, ty>   |dr3td t|}|s2td g  Y S n	td g  Y S Y nw |sCg S | jpGi }|dt}|dt	}|d	d}||d
}	|re||	d	< t
di |	}
|
|S )a  
        Split text into chunks using RecursiveCharacterTextSplitter.

        Args:
            text: Text from OCR (if used)
            file_content: Raw file content bytes
            ocr_was_used: Whether OCR was performed

        Returns:
            List of text chunks
        NrM   s   %PDFz(PDF detected, attempting text extractionzkPDF text extraction failed. Install 'pypdf' or 'PyPDF2' for PDF support, or enable OCR with a vision model.z,Binary file detected, skipping text chunking
chunk_sizechunk_overlap
separators)r\   r]   r)   )rV   UnicodeDecodeError
startswithr   debugr   r   r"   r   r   r   Z
split_text)r(   rX   rC   rY   Ztext_to_chunkZsplitter_argsr\   r]   r^   Zsplitter_kwargsZtext_splitterr)   r)   r*   chunk   sB   




zBaseRAGIngestion.chunkchunksOptional[List[List[float]]]c                   sd   | j r|sdS | j dd}| jdur | jj||dI dH }n
tj||dI dH }dd |jD S )z
        Generate embeddings for text chunks.

        Args:
            chunks: List of text chunks

        Returns:
            List of embeddings or None
        NrI   ztext-embedding-3-small)rI   inputc                 S  s   g | ]}|d  qS )r   r)   )rQ   itemr)   r)   r*   
<listcomp>  s    z*BaseRAGIngestion.embed.<locals>.<listcomp>)r%   r"   r   Z
aembeddingr/   data)r(   rc   Zembedding_modelrF   r)   r)   r*   embed   s   

zBaseRAGIngestion.embedrB   
embeddings#Tuple[Optional[str], Optional[str]]c                   s   dS )a  
        Store content in vector store.

        This method must be implemented by provider-specific subclasses.

        Args:
            file_content: Raw file bytes
            filename: Name of the file
            content_type: MIME type
            chunks: Text chunks (if chunking was done locally)
            embeddings: Embeddings (if embedding was done locally)

        Returns:
            Tuple of (vector_store_id, file_id)
        Nr)   )r(   rC   rB   rD   rc   rj   r)   r)   r*   store  s   zBaseRAGIngestion.storer   c              
     s   | j |||dI dH \}}}}z;| j||dI dH }| j||| jdud}	| j|	dI dH }
| j||||	|
dI dH \}}t| jd|pEd|pH|d	W S  tyq } zt	
d
|  t| jdddt|dW  Y d}~S d}~ww )as  
        Execute the full ingestion pipeline.

        Args:
            file_data: Tuple of (filename, content_bytes, content_type)
            file_url: URL to fetch file from
            file_id: Existing file ID to use

        Returns:
            RAGIngestResponse with status and IDs

        Raises:
            ValueError: If no input source is provided
        )r4   r6   r8   N)rC   rD   )rX   rC   rY   )rc   )rC   rB   rD   rc   rj   	completed )idstatusvector_store_idr8   zRAG Pipeline failed: failed)ro   rp   rq   r8   error)rG   r   rb   r#   ri   rl   r   r!   	Exceptionr   	exceptionr$   )r(   r4   r6   r8   rB   rC   rD   Zexisting_file_idZextracted_textrc   rj   rq   Zresult_file_ider)   r)   r*   ingest  sR   zBaseRAGIngestion.ingest)N)r   r   r   r   )r,   r-   )r,   r$   )NNN)r4   r5   r6   r7   r8   r7   r,   r9   )rC   rH   rD   r7   r,   r7   )rX   r7   rC   rH   rY   rZ   r,   r[   )rc   r[   r,   rd   )rC   rH   rB   r7   rD   r7   rc   r[   rj   rd   r,   rk   )r4   r5   r6   r7   r8   r7   r,   r   )__name__
__module____qualname____doc__r+   r'   propertyr3   rG   r   rb   ri   r   rl   rw   r)   r)   r)   r*   r   #   s(    

#
2
<r   )#r{   
__future__r   rT   abcr   r   typingr   r   r   r   r	   r
   r   r/   Zlitellm._loggingr   Zlitellm._uuidr   Zlitellm.constantsr   r   Z&litellm.llms.custom_httpx.http_handlerr   r   Z"litellm.rag.ingestion.file_parsersr   Zlitellm.rag.text_splittersr   Zlitellm.types.ragr   r   r   r   r)   r)   r)   r*   <module>   s     $