o
    1 i                     @   s   d dl mZ d dlmZ d dlmZmZmZmZm	Z	m
Z
 d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ erDd d	lmZ ed
dG dd deZdddee deegef deee
eef f fddZdS )    Counter)Number)TYPE_CHECKINGCallableDictListOptionalUnionN)is_categorical_dtype)Mean)Preprocessor)	PublicAPI)Datasetalpha)Z	stabilityc                       s   e Zd ZdZg dZ		ddddee dedeeee	f  d	eee  f fd
dZ
dddefddZdejfddZdd Zdd Z  ZS )SimpleImputera  Replace missing values with imputed values. If the column is missing from a
    batch, it will be filled with the imputed value.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import SimpleImputer
        >>> df = pd.DataFrame({"X": [0, None, 3, 3], "Y": [None, "b", "c", "c"]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> ds.to_pandas()  # doctest: +SKIP
             X     Y
        0  0.0  None
        1  NaN     b
        2  3.0     c
        3  3.0     c

        The `"mean"` strategy imputes missing values with the mean of non-missing
        values. This strategy doesn't work with categorical data.

        >>> preprocessor = SimpleImputer(columns=["X"], strategy="mean")
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
             X     Y
        0  0.0  None
        1  2.0     b
        2  3.0     c
        3  3.0     c

        The `"most_frequent"` strategy imputes missing values with the most frequent
        value in each column.

        >>> preprocessor = SimpleImputer(columns=["X", "Y"], strategy="most_frequent")
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
             X  Y
        0  0.0  c
        1  3.0  b
        2  3.0  c
        3  3.0  c

        The `"constant"` strategy imputes missing values with the value specified by
        `fill_value`.

        >>> preprocessor = SimpleImputer(
        ...     columns=["Y"],
        ...     strategy="constant",
        ...     fill_value="?",
        ... )
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
             X  Y
        0  0.0  ?
        1  NaN  b
        2  3.0  c
        3  3.0  c

        :class:`SimpleImputer` can also be used in append mode by providing the
        name of the output_columns that should hold the imputed values.

        >>> preprocessor = SimpleImputer(columns=["X"], output_columns=["X_imputed"], strategy="mean")
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
             X     Y  X_imputed
        0  0.0  None        0.0
        1  NaN     b        2.0
        2  3.0     c        3.0
        3  3.0     c        3.0

    Args:
        columns: The columns to apply imputation to.
        strategy: How imputed values are chosen.

            * ``"mean"``: The mean of non-missing values. This strategy only works with numeric columns.
            * ``"most_frequent"``: The most common value.
            * ``"constant"``: The value passed to ``fill_value``.

        fill_value: The value to use when ``strategy`` is ``"constant"``.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    Raises:
        ValueError: if ``strategy`` is not ``"mean"``, ``"most_frequent"``, or
            ``"constant"``.
    )meanmost_frequentconstantr   N)output_columnscolumnsstrategy
fill_valuer   c                   sl   t    || _|| _|| _|| jvrtd| d| j |dkr-d| _|d u r-tdt	||| _
d S )N	Strategy z( is not supported.Supported values are: r   Fz8`fill_value` must be set when using "constant" strategy.)super__init__r   r   r   _valid_strategies
ValueErrorZ_is_fittabler   Z#_derive_and_validate_output_columnsr   )selfr   r   r   r   	__class__ j/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/ray/data/preprocessors/imputer.pyr   h   s&   


zSimpleImputer.__init__datasetr   returnc                    sP   j dkrjjtjd S j dkr&jj fdddd jd S )Nr   )Zaggregator_fnr   r   c                    s   t  j| dS )N)r#   r   key_gen)_get_most_frequent_valuesr   )r%   r#   r   r!   r"   <lambda>   s
    z$SimpleImputer._fit.<locals>.<lambda>c                 S   s   d|  dS )Nmost_frequent()r!   )colr!   r!   r"   r(      s    )Zstat_fnZstat_key_fnr   )r   Zstat_computation_planZadd_aggregatorr   r   Zadd_callable_stat)r   r#   r!   r'   r"   _fit   s   


zSimpleImputer._fitdfc                 C   s   t | j| jD ]X\}}| |}|d u rtd| d||jvr&|||< qt|j| r8|| j|g||< ||ksLt	|| j
tjrV|| j
jjsV|| jdd||< |j||idd q|S )NzColumn zA has no fill value. Check the data used to fit the SimpleImputer.T)deep)Zinplace)zipr   r   _get_fill_valuer   r   ZdtypescatZadd_categories
isinstancevaluesnpZndarrayflagsZ	writeablecopyZfillna)r   r-   columnZoutput_columnvaluer!   r!   r"   _transform_pandas   s$   



zSimpleImputer._transform_pandasc                 C   sZ   | j dkr| jd| d S | j dkr| jd| d S | j dkr$| jS td| j  d)	Nr   zmean(r*   r   r)   r   r   zA is not supported. Supported values are: {self._valid_strategies})r   Zstats_r   r   )r   r7   r!   r!   r"   r0      s   


zSimpleImputer._get_fill_valuec              
   C   s.   | j j d| jd| jd| jd| jd
S )Nz	(columns=z, strategy=z, fill_value=z, output_columns=r*   )r    __name__r   r   r   r   )r   r!   r!   r"   __repr__   s   zSimpleImputer.__repr__)r   N)r:   
__module____qualname____doc__r   r   strr	   r
   r   r   r   r,   pd	DataFramer9   r0   r;   __classcell__r!   r!   r   r"   r      s(    S
 r   r#   r   r   r%   r$   c           	         s   dt jdtttt f f fdd}| j|dd}dd  D |jd d	D ]}| D ]\}}|D ]
}|  |7  < q2q,q&fd
d D S )Nr-   r$   c                    s    fddD S )Nc                    s$   i | ]}|t  |   gqS r!   )r   value_countsto_dict.0r+   r-   r!   r"   
<dictcomp>   s   $ zJ_get_most_frequent_values.<locals>.get_pd_value_counts.<locals>.<dictcomp>r!   rG   )r   rG   r"   get_pd_value_counts   s   z6_get_most_frequent_values.<locals>.get_pd_value_countspandas)Zbatch_formatc                 S   s   i | ]}|t  qS r!   r   rE   r!   r!   r"   rH      s    z-_get_most_frequent_values.<locals>.<dictcomp>)Z
batch_sizec                    s(   i | ]}| |  d d d qS )   r   )most_common)rF   r7   )final_countersr%   r!   r"   rH      s    )	r@   rA   r   r?   r   r   Zmap_batchesZiter_batchesitems)	r#   r   r%   rI   rC   batchr+   Zcounterscounterr!   )r   rM   r%   r"   r&      s   $r&   )collectionsr   numbersr   typingr   r   r   r   r	   r
   numpyr4   rJ   r@   Zpandas.api.typesr   Zray.data.aggregater   Zray.data.preprocessorr   Zray.util.annotationsr   Zray.data.datasetr   r   r?   r&   r!   r!   r!   r"   <module>   s.      =