o
    1 i                     @   s   d dl Z d dlmZ d dlmZmZmZ d dlZd dl	m
Z
mZmZmZmZmZmZmZmZ er6d dlmZ e eZdedee
 fdd	Zdedee
 fd
dZdedee
 fddZeG dd dZ	ddddeee  defddZdS )    N)	dataclass)TYPE_CHECKINGListOptional)	AggregateFnV2ApproximateQuantileCountMaxMeanMinMissingValuePercentageStdZeroPercentage)Datasetcolumnreturnc                 C   sV   t | ddt| ddt| ddt| ddt| dddt| dgdt| dt| ddgS )	a  Generate default metrics for numerical columns.

    This function returns a list of aggregators that compute the following metrics:
    - count
    - mean
    - min
    - max
    - std
    - approximate_quantile
    - missing_value_percentage
    - zero_percentage

    Args:
        column: The name of the numerical column to compute metrics for.

    Returns:
        A list of AggregateFnV2 instances that can be used with Dataset.aggregate()
    Fonignore_nullsTr   )r   r   Zddofg      ?)r   Z	quantilesr   )r   r
   r   r	   r   r   r   r   r    r   Z/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/ray/data/stats.pynumerical_aggregators   s   




r   c                 C      t | ddt| dgS )ag  Generate default metrics for string columns.

    This function returns a list of aggregators that compute the following metrics:
    - count
    - MissingValuePercentage

    Args:
        column: The name of the categorical column to compute metrics for.

    Returns:
        A list of AggregateFnV2 instances that can be used with Dataset.aggregate()
    Fr   r   r   r   r   r   r   r   categorical_aggregators9   s   
r   c                 C   r   )zGenerate default metrics for vector columns.

    This function returns a list of aggregators that compute the following metrics:
    - count
    - MissingValuePercentage
    Fr   r   r   r   r   r   r   vector_aggregatorsL   s   
r   c                   @   sB   e Zd ZU dZee ed< ee ed< ee ed< ee ed< dS )FeatureAggregatorsz8Container for categorized columns and their aggregators.numerical_columnsstr_columnsvector_columnsaggregatorsN)__name__
__module____qualname____doc__r   str__annotations__r   r   r   r   r   r   Y   s   
 r   datasetr   columnsc                 C   sr  |   }|s
td|du r|j}t|t|j }|r$td| dg }g }g }g }|j}|j}	tt||	}
|D ]u}||
vrBq;|
| }t|tj	sYt
d| d| d q;tj|sqtj|sqtj|sqtj|r~|| |t| q;tj|r|| |t| q;tj|r|| |t| q;t
d| d| d q;t||||d	S )
a1  Generate aggregators for all columns in a dataset.

    Args:
        dataset: A Ray Dataset instance
        columns: A list of columns to include in the summary. If None, all columns will be included.
    Returns:
        FeatureAggregators containing categorized column names and their aggregators
    z9Dataset must have a schema to determine numerical columnsNzColumns z not found in dataset schemazSkipping field z: type z is not a PyArrow DataTypez not supported)r   r    r!   r"   )schema
ValueErrornamessettypesdictzip
isinstancepaZDataTypeloggerwarning
is_integerZis_floatingZ
is_decimalZ
is_booleanappendextendr   Z	is_stringr   Zis_listr   r   )r)   r*   r+   Zmissing_colsr   r    r!   Zall_aggsZcolumn_namesZcolumn_typesZname_to_typenameZftyper   r   r   feature_aggregators_for_datasetc   s^   






r:   )N)loggingdataclassesr   typingr   r   r   Zpyarrowr3   Zray.data.aggregater   r   r   r	   r
   r   r   r   r   Zray.datar   	getLoggerr#   r4   r'   r   r   r   r   r:   r   r   r   r   <module>   s*    ,


