o
    1 i}                     @   s  d dl mZ d dlmZ d dlmZmZmZmZm	Z	m
Z
mZmZ d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZ erNd d	lmZ ed
dG dd deZed
dG dd deZed
dG dd deZed
dG dd deZ ed
dG dd deZ!dddddde
e" dede#deee"e$f  f
ddZ%d+d!e#d"eegee"e$f f fd#d$Z&d%ej'de"d"dfd&d'Z(d(ej)d"e#fd)d*Z*dS ),    )Counter)partial)TYPE_CHECKINGAnyCallableDictHashableListOptionalSetN)BatchFormat)PreprocessorPreprocessorNotFittedException)make_post_processor)	PublicAPI)Datasetalpha)Z	stabilityc                	       sl   e Zd ZdZddddee dedeee  f fdd	Zd
dde	fddZ
dejfddZdd Z  ZS )OrdinalEncodera  Encode values within columns as ordered integer values.

    :class:`OrdinalEncoder` encodes categorical features as integers that range from
    :math:`0` to :math:`n - 1`, where :math:`n` is the number of categories.

    If you transform a value that isn't in the fitted datset, then the value is encoded
    as ``float("nan")``.

    Columns must contain either hashable values or lists of hashable values. Also, you
    can't have both scalars and lists in the same column.

    Examples:
        Use :class:`OrdinalEncoder` to encode categorical features as integers.

        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import OrdinalEncoder
        >>> df = pd.DataFrame({
        ...     "sex": ["male", "female", "male", "female"],
        ...     "level": ["L4", "L5", "L3", "L4"],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder = OrdinalEncoder(columns=["sex", "level"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           sex  level
        0    1      1
        1    0      2
        2    1      0
        3    0      1

        :class:`OrdinalEncoder` can also be used in append mode by providing the
        name of the output_columns that should hold the encoded values.

        >>> encoder = OrdinalEncoder(columns=["sex", "level"], output_columns=["sex_encoded", "level_encoded"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
              sex level  sex_encoded  level_encoded
        0    male    L4            1              1
        1  female    L5            0              2
        2    male    L3            1              0
        3  female    L4            0              1


        If you transform a value not present in the original dataset, then the value
        is encoded as ``float("nan")``.

        >>> df = pd.DataFrame({"sex": ["female"], "level": ["L6"]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder.transform(ds).to_pandas()  # doctest: +SKIP
           sex  level
        0    0    NaN

        :class:`OrdinalEncoder` can also encode categories in a list.

        >>> df = pd.DataFrame({
        ...     "name": ["Shaolin Soccer", "Moana", "The Smartest Guys in the Room"],
        ...     "genre": [
        ...         ["comedy", "action", "sports"],
        ...         ["animation", "comedy",  "action"],
        ...         ["documentary"],
        ...     ],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder = OrdinalEncoder(columns=["genre"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
                                    name      genre
        0                 Shaolin Soccer  [2, 0, 4]
        1                          Moana  [1, 2, 0]
        2  The Smartest Guys in the Room        [3]

    Args:
        columns: The columns to separately encode.
        encode_lists: If ``True``, encode list elements.  If ``False``, encode
            whole lists (i.e., replace each list with an integer). ``True``
            by default.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    .. seealso::

        :class:`OneHotEncoder`
            Another preprocessor that encodes categorical data.
    TN)encode_listsoutput_columnscolumnsr   r   c                   s(   t    || _|| _t||| _d S N)super__init__r   r   r   #_derive_and_validate_output_columnsr   )selfr   r   r   	__class__ j/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/ray/data/preprocessors/encoder.pyr   i   s   

zOrdinalEncoder.__init__datasetr   returnc                    0   j j fddt dd dd jd S )Nc                    s   t  jj| dS )N)r    r   r   key_gen)compute_unique_value_indicesr   r   r#   r    r   r   r   <lambda>z   s    z%OrdinalEncoder._fit.<locals>.<lambda>c                 S      d|  dS Nzunique()r   colr   r   r   r'          c                 S   r(   Nunique_values(r*   r   r+   r   r   r   r'      r-   Zstat_fnZpost_process_fnZstat_key_fnZpost_key_fnr   stat_computation_planadd_callable_statunique_post_fnr   r   r    r   r&   r   _fitx   s   zOrdinalEncoder._fitdfc                    sX   t |gjR   dtdtffdd dtjf fdd}|j ||j< |S )Nelementnamec                   s    fdd| D S )Nc                    s$   g | ]}j d   d |qS )r/   r*   )stats_get.0x)r9   r   r   r   
<listcomp>   s   $ zIOrdinalEncoder._transform_pandas.<locals>.encode_list.<locals>.<listcomp>r   )r8   r9   r   r9   r   encode_list   s   z5OrdinalEncoder._transform_pandas.<locals>.encode_listsc                    sX   t  rjr t jdS  fdd} |S jd j d } |S )NrA   c                    s"   t | }jd j d |S r.   )tupler:   r9   r;   )r8   key)rC   r   r   r   list_as_category   s   zZOrdinalEncoder._transform_pandas.<locals>.column_ordinal_encoder.<locals>.list_as_categoryr/   r*   )_is_series_composed_of_listsr   mapr   r9   applyr:   )rC   rF   s_valuesrB   r   )rC   r   column_ordinal_encoder   s   

z@OrdinalEncoder._transform_pandas.<locals>.column_ordinal_encoder)_validate_dfr   liststrpdSeriesrI   r   )r   r7   rL   r   rK   r   _transform_pandas   s
   z OrdinalEncoder._transform_pandasc                 C   &   | j j d| jd| jd| jdS )N	(columns=z, encode_lists=, output_columns=r*   )r   __name__r   r   r   r@   r   r   r   __repr__      zOrdinalEncoder.__repr__)rV   
__module____qualname____doc__r	   rO   boolr
   r   r   r6   rP   	DataFramerR   rW   __classcell__r   r   r   r   r      s    Y
r   c                	       s   e Zd ZdZddddee deeeef  deee  f fddZ	d	d
de
fddZdedeeef fddZdejfddZdd Z  ZS )OneHotEncodera-  `One-hot encode <https://en.wikipedia.org/wiki/One-hot#Machine_learning_and_statistics>`_
    categorical data.

    This preprocessor transforms each specified column into a one-hot encoded vector.
    Each element in the vector corresponds to a unique category in the column, with a
    value of 1 if the category matches and 0 otherwise.

    If a category is infrequent (based on ``max_categories``) or not present in the
    fitted dataset, it is encoded as all 0s.

    Columns must contain hashable objects or lists of hashable objects.

    .. note::
        Lists are treated as categories. If you want to encode individual list
        elements, use :class:`MultiHotEncoder`.

    Example:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import OneHotEncoder
        >>>
        >>> df = pd.DataFrame({"color": ["red", "green", "red", "red", "blue", "green"]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder = OneHotEncoder(columns=["color"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
               color
        0  [0, 0, 1]
        1  [0, 1, 0]
        2  [0, 0, 1]
        3  [0, 0, 1]
        4  [1, 0, 0]
        5  [0, 1, 0]

        OneHotEncoder can also be used in append mode by providing the
        name of the output_columns that should hold the encoded values.

        >>> encoder = OneHotEncoder(columns=["color"], output_columns=["color_encoded"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           color color_encoded
        0    red     [0, 0, 1]
        1  green     [0, 1, 0]
        2    red     [0, 0, 1]
        3    red     [0, 0, 1]
        4   blue     [1, 0, 0]
        5  green     [0, 1, 0]

        If you one-hot encode a value that isn't in the fitted dataset, then the
        value is encoded with zeros.

        >>> df = pd.DataFrame({"color": ["yellow"]})
        >>> batch = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder.transform(batch).to_pandas()  # doctest: +SKIP
            color color_encoded
        0  yellow     [0, 0, 0]

        Likewise, if you one-hot encode an infrequent value, then the value is encoded
        with zeros.

        >>> encoder = OneHotEncoder(columns=["color"], max_categories={"color": 2})
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
            color
        0  [1, 0]
        1  [0, 1]
        2  [1, 0]
        3  [1, 0]
        4  [0, 0]
        5  [0, 1]

    Args:
        columns: The columns to separately encode.
        max_categories: The maximum number of features to create for each column.
            If a value isn't specified for a column, then a feature is created
            for every category in that column.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    .. seealso::

        :class:`MultiHotEncoder`
            If you want to encode individual list elements, use
            :class:`MultiHotEncoder`.

        :class:`OrdinalEncoder`
            If your categories are ordered, you may want to use
            :class:`OrdinalEncoder`.
    Nmax_categoriesr   r   ra   r   c                   ,   t    || _|pi | _t||| _d S r   r   r   r   ra   r   r   r   r   r   ra   r   r   r   r   r        


zOneHotEncoder.__init__r    r   r!   c                    r"   )Nc                       t  jd| jdS )NFr    r   r   r#   ra   r$   r   ra   r%   r&   r   r   r'         z$OneHotEncoder._fit.<locals>.<lambda>c                 S   r(   r)   r   r+   r   r   r   r'     r-   c                 S   r(   r.   r   r+   r   r   r   r'     r-   r0   r1   r5   r   r&   r   r6        zOneHotEncoder._fitvstatsc                 C   s2   t |ttjfrt|}t |tr||dS dS )N)
isinstancerN   npndarrayrD   r   r;   )r   rk   rl   r   r   r   safe_get   s
   
zOneHotEncoder.safe_getr7   c           	         s   t |g jR   t j jD ]D\}} jd| d t}tjt||ftjd}|| 	 fdd
 }|dk}t|d }d|||| f< | ||< q|S )	Nr/   r*   )dtypec                    s     | S r   )rq   )rk   r   rl   r   r   r'   1  r-   z1OneHotEncoder._transform_pandas.<locals>.<lambda>rm   r      )rM   r   zipr   r:   lenro   ZzerosZuint8rI   Zto_numpyZnonzerotolist)	r   r7   columnoutput_columnZnum_categoriesZone_hotcodesZvalid_category_maskZnon_zero_indicesr   rs   r   rR   (  s    zOneHotEncoder._transform_pandasc                 C   rS   NrT   z, max_categories=rU   r*   r   rV   r   ra   r   r@   r   r   r   rW   ?  rX   zOneHotEncoder.__repr__)rV   rY   rZ   r[   r	   rO   r
   r   intr   r   r6   r   rq   rP   r]   rR   rW   r^   r   r   r   r   r_      s    ]
r_   c                	       sx   e Zd ZdZddddee deeeef  deee  f fddZ	d	d
de
fddZdejfddZdd Z  ZS )MultiHotEncodera  Multi-hot encode categorical data.

    This preprocessor replaces each list of categories with an :math:`m`-length binary
    list, where :math:`m` is the number of unique categories in the column or the value
    specified in ``max_categories``. The :math:`i\\text{-th}` element of the binary list
    is :math:`1` if category :math:`i` is in the input list and :math:`0` otherwise.

    Columns must contain hashable objects or lists of hashable objects.
    Also, you can't have both types in the same column.

    .. note::
        The logic is similar to scikit-learn's [MultiLabelBinarizer][1]

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import MultiHotEncoder
        >>>
        >>> df = pd.DataFrame({
        ...     "name": ["Shaolin Soccer", "Moana", "The Smartest Guys in the Room"],
        ...     "genre": [
        ...         ["comedy", "action", "sports"],
        ...         ["animation", "comedy",  "action"],
        ...         ["documentary"],
        ...     ],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>>
        >>> encoder = MultiHotEncoder(columns=["genre"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
                                    name            genre
        0                 Shaolin Soccer  [1, 0, 1, 0, 1]
        1                          Moana  [1, 1, 1, 0, 0]
        2  The Smartest Guys in the Room  [0, 0, 0, 1, 0]

        :class:`MultiHotEncoder` can also be used in append mode by providing the
        name of the output_columns that should hold the encoded values.

        >>> encoder = MultiHotEncoder(columns=["genre"], output_columns=["genre_encoded"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
                                    name                        genre    genre_encoded
        0                 Shaolin Soccer     [comedy, action, sports]  [1, 0, 1, 0, 1]
        1                          Moana  [animation, comedy, action]  [1, 1, 1, 0, 0]
        2  The Smartest Guys in the Room                [documentary]  [0, 0, 0, 1, 0]

        If you specify ``max_categories``, then :class:`MultiHotEncoder`
        creates features for only the most frequent categories.

        >>> encoder = MultiHotEncoder(columns=["genre"], max_categories={"genre": 3})
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
                                    name      genre
        0                 Shaolin Soccer  [1, 1, 1]
        1                          Moana  [1, 1, 0]
        2  The Smartest Guys in the Room  [0, 0, 0]
        >>> encoder.stats_  # doctest: +SKIP
        OrderedDict([('unique_values(genre)', {'comedy': 0, 'action': 1, 'sports': 2})])

    Args:
        columns: The columns to separately encode.
        max_categories: The maximum number of features to create for each column.
            If a value isn't specified for a column, then a feature is created
            for every unique category in that column.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    .. seealso::

        :class:`OneHotEncoder`
            If you're encoding individual categories instead of lists of
            categories, use :class:`OneHotEncoder`.

        :class:`OrdinalEncoder`
            If your categories are ordered, you may want to use
            :class:`OrdinalEncoder`.

    [1]: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html
    Nr`   r   ra   r   c                   rb   r   rc   rd   r   r   r   r     re   zMultiHotEncoder.__init__r    r   r!   c                    r"   )Nc                    rf   )NTrg   rh   r%   r&   r   r   r'     ri   z&MultiHotEncoder._fit.<locals>.<lambda>c                 S   r(   r)   r   r+   r   r   r   r'     r-   c                 S   r(   r.   r   r+   r   r   r   r'     r-   r0   r1   r5   r   r&   r   r6     rj   zMultiHotEncoder._fitr7   c                    s^   t |g jR   dtdtf fdd}t j jD ]\}}|| t||d||< q|S )Nr8   r9   c                   sR   t | tjr|  } nt | ts| g} jd| d }t|   fdd|D S )Nr/   r*   c                    s   g | ]}  |d qS )r   )r;   r<   counterr   r   r?         zJMultiHotEncoder._transform_pandas.<locals>.encode_list.<locals>.<listcomp>)rn   ro   rp   rw   rN   r:   r   )r8   r9   rl   r@   r   r   rB     s   

z6MultiHotEncoder._transform_pandas.<locals>.encode_listrA   )rM   r   rN   rO   ru   r   rH   r   )r   r7   rB   rx   ry   r   r@   r   rR     s
   	z!MultiHotEncoder._transform_pandasc                 C   s&   | j j d| jd| jd| j dS r{   r|   r@   r   r   r   rW     rX   zMultiHotEncoder.__repr__)rV   rY   rZ   r[   r	   rO   r
   r   r}   r   r   r6   rP   r]   rR   rW   r^   r   r   r   r   r~   G  s    T
r~   c                       sx   e Zd ZdZdddedee f fddZdd	d
efddZde	j
fddZdddZde	j
fddZdd Z  ZS )LabelEncodera
  Encode labels as integer targets.

    :class:`LabelEncoder` encodes labels as integer targets that range from
    :math:`0` to :math:`n - 1`, where :math:`n` is the number of unique labels.

    If you transform a label that isn't in the fitted datset, then the label is encoded
    as ``float("nan")``.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> df = pd.DataFrame({
        ...     "sepal_width": [5.1, 7, 4.9, 6.2],
        ...     "sepal_height": [3.5, 3.2, 3, 3.4],
        ...     "species": ["setosa", "versicolor", "setosa", "virginica"]
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>>
        >>> from ray.data.preprocessors import LabelEncoder
        >>> encoder = LabelEncoder(label_column="species")
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           sepal_width  sepal_height  species
        0          5.1           3.5        0
        1          7.0           3.2        1
        2          4.9           3.0        0
        3          6.2           3.4        2

        You can also provide the name of the output column that should hold the encoded
        labels if you want to use :class:`LabelEncoder` in append mode.

        >>> encoder = LabelEncoder(label_column="species", output_column="species_encoded")
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           sepal_width  sepal_height     species  species_encoded
        0          5.1           3.5      setosa                0
        1          7.0           3.2  versicolor                1
        2          4.9           3.0      setosa                0
        3          6.2           3.4   virginica                2

        If you transform a label not present in the original dataset, then the new
        label is encoded as ``float("nan")``.

        >>> df = pd.DataFrame({
        ...     "sepal_width": [4.2],
        ...     "sepal_height": [2.7],
        ...     "species": ["bracteata"]
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder.transform(ds).to_pandas()  # doctest: +SKIP
           sepal_width  sepal_height  species
        0          4.2           2.7      NaN

    Args:
        label_column: A column containing labels that you want to encode.
        output_column: The name of the column that will contain the encoded
            labels. If None, the output column will have the same name as the
            input column.

    .. seealso::

        :class:`OrdinalEncoder`
            If you're encoding ordered features, use :class:`OrdinalEncoder` instead of
            :class:`LabelEncoder`.
    N)ry   label_columnry   c                   s   t    || _|p|| _d S r   )r   r   r   ry   )r   r   ry   r   r   r   r     s   
zLabelEncoder.__init__r    r   r!   c                    s2   j j fddt dd dd jgd S )Nc                    s   t  jg| dS N)r    r   r#   )r$   r   r%   r&   r   r   r'     s
    z#LabelEncoder._fit.<locals>.<lambda>c                 S   r(   r)   r   r+   r   r   r   r'      r-   c                 S   r(   r.   r   r+   r   r   r   r'   !  r-   r0   )r2   r3   r4   r   r5   r   r&   r   r6     s   zLabelEncoder._fitr7   c                    s:   t | j dtjf fdd}| j || j< |S )NrC   c                    s    j d| j d }| |S r.   )r:   r9   rH   )rC   rJ   r@   r   r   column_label_encoder)  s   
z<LabelEncoder._transform_pandas.<locals>.column_label_encoder)rM   r   rP   rQ   	transformry   )r   r7   r   r   r@   r   rR   &  s   zLabelEncoder._transform_pandasdsc                 C   sF   |   }|tjjtjjfv rtd|  }|j| jfdt	j
i|S )a/  Inverse transform the given dataset.

        Args:
            ds: Input Dataset that has been fitted and/or transformed.

        Returns:
            ray.data.Dataset: The inverse transformed Dataset.

        Raises:
            PreprocessorNotFittedException: if ``fit`` is not called yet.
        z1`fit` must be called before `inverse_transform`, batch_format)
fit_statusr   Z	FitStatusZPARTIALLY_FITTEDZ
NOT_FITTEDr   Z_get_transform_configmap_batches_inverse_transform_pandasr   ZPANDAS)r   r   r   kwargsr   r   r   inverse_transform0  s    zLabelEncoder.inverse_transformc                    s.   dt jf fdd}| j || j< |S )NrC   c                    s,   dd  j d j d  D }| |S )Nc                 S      i | ]\}}||qS r   r   )r=   rE   valuer   r   r   
<dictcomp>O  s    zXLabelEncoder._inverse_transform_pandas.<locals>.column_label_decoder.<locals>.<dictcomp>r/   r*   )r:   r   itemsrH   )rC   Zinverse_valuesr@   r   r   column_label_decoderN  s   
zDLabelEncoder._inverse_transform_pandas.<locals>.column_label_decoder)rP   rQ   ry   r   r   )r   r7   r   r   r@   r   r   M  s   	z&LabelEncoder._inverse_transform_pandasc                 C   s   | j j d| jd| jdS )Nz(label_column=z, output_column=r*   )r   rV   r   ry   r@   r   r   r   rW   Z  s   zLabelEncoder.__repr__)r   r   r!   r   )rV   rY   rZ   r[   rO   r
   r   r   r6   rP   r]   rR   r   r   rW   r^   r   r   r   r   r     s     @

r   c                	       sx   e Zd ZdZ		ddee deeeej	f  deee  f fddZ
dd	d
efddZdejfddZdd Z  ZS )Categorizera^
  Convert columns to ``pd.CategoricalDtype``.

    Use this preprocessor with frameworks that have built-in support for
    ``pd.CategoricalDtype`` like LightGBM.

    .. warning::

        If you don't specify ``dtypes``, fit this preprocessor before splitting
        your dataset into train and test splits. This ensures categories are
        consistent across splits.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import Categorizer
        >>>
        >>> df = pd.DataFrame(
        ... {
        ...     "sex": ["male", "female", "male", "female"],
        ...     "level": ["L4", "L5", "L3", "L4"],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> categorizer = Categorizer(columns=["sex", "level"])
        >>> categorizer.fit_transform(ds).schema().types  # doctest: +SKIP
        [CategoricalDtype(categories=['female', 'male'], ordered=False), CategoricalDtype(categories=['L3', 'L4', 'L5'], ordered=False)]

        :class:`Categorizer` can also be used in append mode by providing the
        name of the output_columns that should hold the categorized values.

        >>> categorizer = Categorizer(columns=["sex", "level"], output_columns=["sex_cat", "level_cat"])
        >>> categorizer.fit_transform(ds).to_pandas()  # doctest: +SKIP
              sex level sex_cat level_cat
        0    male    L4    male        L4
        1  female    L5  female        L5
        2    male    L3    male        L3
        3  female    L4  female        L4

        If you know the categories in advance, you can specify the categories with the
        ``dtypes`` parameter.

        >>> categorizer = Categorizer(
        ...     columns=["sex", "level"],
        ...     dtypes={"level": pd.CategoricalDtype(["L3", "L4", "L5", "L6"], ordered=True)},
        ... )
        >>> categorizer.fit_transform(ds).schema().types  # doctest: +SKIP
        [CategoricalDtype(categories=['female', 'male'], ordered=False), CategoricalDtype(categories=['L3', 'L4', 'L5', 'L6'], ordered=True)]

    Args:
        columns: The columns to convert to ``pd.CategoricalDtype``.
        dtypes: An optional dictionary that maps columns to ``pd.CategoricalDtype``
            objects. If you don't include a column in ``dtypes``, the categories
            are inferred.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    Nr   dtypesr   c                    s0   t    |s	i }|| _|| _t||| _d S r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r     s   

zCategorizer.__init__r    r   r!   c                    s   fddj D   jjO  _ sS dtttf dtjfdd}jj fddt	t
d	d
|gddd dd  d S )Nc                    s   g | ]	}| j vr|qS r   )r   r=   rx   r@   r   r   r?     s    z$Categorizer._fit.<locals>.<listcomp>unique_indicesr!   c                 S   s   t |  S r   )rP   CategoricalDtypekeys)r   r   r   r   callback  s   z"Categorizer._fit.<locals>.callbackc                    s   t  | dS r   )r$   r%   )columns_to_getr    r   r   r'     s
    z"Categorizer._fit.<locals>.<lambda>Tdrop_na_values)Zbase_fn	callbacksc                 S   r(   r)   r   r+   r   r   r   r'     r-   c                 S   s   | S r   r   r+   r   r   r   r'     s    r0   )r   r:   r   r   rO   rP   r   r2   r3   r   r4   )r   r    r   r   )r   r    r   r   r6     s$   
zCategorizer._fitr7   c                 C   s   || j  | j|| j< |S r   )r   Zastyper:   r   )r   r7   r   r   r   rR     s   zCategorizer._transform_pandasc                 C   rS   )NrT   z	, dtypes=rU   r*   )r   rV   r   r   r   r@   r   r   r   rW     s   zCategorizer.__repr__)NN)rV   rY   rZ   r[   r	   rO   r
   r   rP   r   r   r   r6   r]   rR   rW   r^   r   r   r   r   r   ^  s    >
r   T)r   ra   r    r   r   r#   r   ra   c                    s  |d u ri }t  }|D ]}||vrtd| d  dqdtjdtffdddtjdtttt f f fd	d
}| j|dd}fdd D }	|j	d dD ]3}
|

 D ],\}}|D ]%}dd |
 D }||v rytt||| }|	| |  q_qYqS|	S )NzYou set `max_categories` for z, which is not present in .r,   r!   c                    sN   t | rrt   fdd}| |  S | dd } t| jdd S )Nc                    s     |  | S r   )update)r8   r   r   r   update_counter  s   
z\compute_unique_value_indices.<locals>.get_pd_value_counts_per_column.<locals>.update_counterc                 S   s   t | S r   )rD   )r>   r   r   r   r'     s    zVcompute_unique_value_indices.<locals>.get_pd_value_counts_per_column.<locals>.<lambda>F)Zdropna)rG   r   rH   Zvalue_countsto_dict)r,   r   )r   r   r   get_pd_value_counts_per_column  s   
zDcompute_unique_value_indices.<locals>.get_pd_value_counts_per_columnr7   c                    sJ   | j  }i } D ]}||v r| | g||< q	td| d| |S )NzColumn 'z2' does not exist in DataFrame, which has columns: )r   rw   
ValueError)r7   Z
df_columnsresultr,   )r   r   r   r   get_pd_value_counts  s   
z9compute_unique_value_indices.<locals>.get_pd_value_countspandas)r   c                    s   i | ]} |t  qS r   )set)r=   r,   r%   r   r   r     r   z0compute_unique_value_indices.<locals>.<dictcomp>)Z
batch_sizec                 S   s   i | ]\}}|d ur||qS r   r   )r=   krk   r   r   r   r     s    )r   r   rP   rQ   r   r]   rO   r	   r   Ziter_batchesr   dictr   most_commonr   r   )r    r   r#   r   ra   Zcolumns_setrx   r   Zvalue_counts_dsZunique_values_by_colbatchr,   Zcountersr   r   )r   r   r   r#   r   r$     s:   &r$   Fr   r!   c                    s"   dt dtttf f fdd}|S )a  
    Returns a post-processing function that generates an encoding map by
    sorting the unique values produced during aggregation or stats computation.

    :param drop_na_values: If True, NA/null values will be silently dropped from the encoding map.
                           If False, raises an error if any NA/null values are present.
    :return: A callable that takes a set of unique values and returns a dictionary
             mapping each value to a unique integer index.
    valuesr!   c                    sD    r
dd | D } nt dd | D rtddd tt| D S )Nc                 S   s   h | ]	}t |s|qS r   rP   isnullr=   r   r   r   r   	<setcomp>   s    z:unique_post_fn.<locals>.gen_value_index.<locals>.<setcomp>c                 s   s    | ]}t |V  qd S r   r   r   r   r   r   	<genexpr>"  s    z:unique_post_fn.<locals>.gen_value_index.<locals>.<genexpr>z]Unable to fit column because it contains null values. Consider imputing missing values first.c                 S   r   r   r   )r=   jr   r   r   r   r   '  s    z;unique_post_fn.<locals>.gen_value_index.<locals>.<dictcomp>)anyr   	enumeratesorted)r   r   r   r   gen_value_index  s   z'unique_post_fn.<locals>.gen_value_index)r   r   rO   r}   )r   r   r   r   r   r4     s   r4   r7   c                    s*    fdd|D }|rt d| dd S )Nc                    s"   g | ]} |   j r|qS r   )r   r   r   r   r7   r   r   r?   -  s   " z _validate_df.<locals>.<listcomp>zUnable to transform columns zJ because they contain null values. Consider imputing missing values first.)r   )r7   r   Znull_columnsr   r   r   rM   ,  s   
rM   seriesc                 C   s4   t dd | D d }tjj| jot|ttj	fS )Nc                 s   s    | ]	}|d ur|V  qd S r   r   )r=   r8   r   r   r   r   8  s    z/_is_series_composed_of_lists.<locals>.<genexpr>)
nextr   apitypesZis_object_dtyperr   rn   rN   ro   rp   )r   Zfirst_not_none_elementr   r   r   rG   5  s   
rG   )F)+collectionsr   	functoolsr   typingr   r   r   r   r   r	   r
   r   numpyro   r   rP   Zpandas.api.typesZ"ray.air.util.data_batch_conversionr   Zray.data.preprocessorr   r   Zray.data.preprocessors.utilsr   Zray.util.annotationsr   Zray.data.datasetr   r   r_   r~   r   r   rO   r\   r}   r$   r4   r]   rM   rQ   rG   r   r   r   r   <module>   sT    (  ! 
 w
&B	