o
    * i<                     @  s   d dl mZ d dlZd dlmZmZmZmZ d dlZ	ddl
mZ ddlmZmZmZ G dd deee  ZG d	d
 d
eed  ZG dd deZdS )    )annotationsN)IterableIteratorSequenceSized   )IterableDataset)RandomSamplerSamplerSequenceSamplerc                   @  s\   e Zd ZU dZded< ded< ded< ded< 					
		
ddddZdddZdddZd	S )BatchSamplera  
    A base implement of batch sampler used by `paddle.io.DataLoader`
    which yield mini-batch indices(a list/tuple with length as
    mini-batch size and holds sample indices) iterably.

    Batch sampler used by :code:`paddle.io.DataLoader` should be a subclass
    of :code:`paddle.io.BatchSampler`, BatchSampler subclasses should
    implement following methods:

    :code:`__iter__`: return mini-batch indices iterably.

    :code:`__len__`: get mini-batch number in an epoch.


    Args:
        dataset(Dataset, optional): this should be an instance of a subclass of :ref:`api_paddle_io_Dataset` or
                :ref:`api_paddle_io_IterableDataset` or other python object which implemented
                :code:`__len__` for BatchSampler to get indices as the
                range of :attr:`dataset` length. Default None, disabled.
        sampler (Sampler, Iterable, optional): this should be a :ref:`api_paddle_io_Sample` or Iterable
                instance which implemented :code:`__iter__` to generate
                sample indices. :attr:`sampler` and :attr:`dataset`
                can not be set in the same time.  If :attr:`sampler`
                is set, :attr:`dataset` should not be set. Default None, disabled.
        shuffle(bool, optional): whether to shuffle indices order before generating
                batch indices. Default False, don't shuffle indices before generating batch indices.
        batch_size(int, optional): sample indice number in a mini-batch indices. default 1, each mini-batch includes 1 sample.
        drop_last(bool, optional): whether drop the last incomplete (less than 1 mini-batch) batch dataset. Default False, keep it.
    see :ref:`api_paddle_io_DataLoader`

    Returns:
        BatchSampler: an iterable object for indices iterating

    Examples:

        .. code-block:: python

            >>> import numpy as np
            >>> from paddle.io import RandomSampler, BatchSampler, Dataset

            >>> np.random.seed(2023)
            >>> # init with dataset
            >>> class RandomDataset(Dataset):  # type: ignore[type-arg]
            ...     def __init__(self, num_samples):
            ...         self.num_samples = num_samples
            ...
            ...     def __getitem__(self, idx):
            ...         image = np.random.random([784]).astype('float32')
            ...         label = np.random.randint(0, 9, (1, )).astype('int64')
            ...         return image, label
            ...
            ...     def __len__(self):
            ...         return self.num_samples
            ...
            >>> bs = BatchSampler(dataset=RandomDataset(100),
            ...                     shuffle=False,
            ...                     batch_size=16,
            ...                     drop_last=False)
            ...
            >>> for batch_indices in bs:
            ...     print(batch_indices)
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
            ...
            [96, 97, 98, 99]
            >>> # init with sampler
            >>> sampler = RandomSampler(RandomDataset(100))
            >>> bs = BatchSampler(sampler=sampler,
            ...                     batch_size=8,
            ...                     drop_last=True)
            ...
            >>> for batch_indices in bs:
            ...     print(batch_indices)
            [56, 12, 68, 0, 82, 66, 91, 44]
            ...
            [53, 17, 22, 86, 52, 3, 92, 33]
    zSampler[int] | Iterable[int]samplerint
batch_sizeboolshuffle	drop_lastNFr   datasetSized | NoneSampler | Iterable[int] | NonereturnNonec                 C  s   |d u r&|d usJ dt |ttfsJ dt| |r"J d|| _n,t |tr/J d|d u s7J dt |tsEJ dt| |rMt|| _nt|| _t |t	r[|dksbJ d| || _
|| _t |tsvJ d	t| || _d
| _d S )Nz'either dataset or sampler should be setz@sampler should be either paddle.io.Sampler or Iterable, but got z+shuffle should be False when sampler is setz1dataset should not be a paddle.io.IterableDatasetz'should not set both dataset and samplerz+shuffle should be a boolean value, but got r   z1batch_size should be a positive integer, but got z-drop_last should be a boolean value, but got r   )
isinstancer
   r   typer   r   r   r	   r   r   r   r   r   
_acc_steps)selfr   r   r   r   r    r   n/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/paddle/io/dataloader/batch_sampler.py__init__m   s:   


zBatchSampler.__init__Iterator[list[int]]c                 c  sb    | j | j }g }| jD ]}|| t||kr|V  g }q| js-t|dkr/|V  d S d S d S )Nr   )r   r   r   appendlenr   )r   local_batch_sizebatch_indicesidxr   r   r   __iter__   s   


zBatchSampler.__iter__c                 C  s6   | j | j }t| j}|t| j |d  7 }|| S Nr   )r   r   r!   r   r   r   r   r"   num_samplesr   r   r   __len__   s   
zBatchSampler.__len__)NNFr   F)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )__name__
__module____qualname____doc____annotations__r   r%   r)   r   r   r   r   r      s   
 M
,r   c                   @  s4   e Zd ZU ded< ded< dddd	ZdddZdS )_InfiniteIterableSamplerr   r   r   r   r   r   r   c                 C  s"   t |ts	J d|| _|| _d S )Nz:dataset should be an instance of paddle.io.IterableDataset)r   r   r   r   )r   r   r   r   r   r   r      s
   
z!_InfiniteIterableSampler.__init__Iterator[list[None]]c                 c  s    	 d g| j  V  q)N)r   r   r   r   r   r%      s   z!_InfiniteIterableSampler.__iter__N)r   )r   r   r   r   r   r   )r   r2   )r,   r-   r.   r0   r   r%   r   r   r   r   r1      s
   
 r1   c                   @  s   e Zd ZU dZded< ded< ded< ded< ded	< ded
< ded< ded< 				ddddZd ddZd!ddZd"ddZdS )#DistributedBatchSampleray	  Sampler that restricts data loading to a subset of the dataset.

    In such case, each process can pass a DistributedBatchSampler instance
    as a DataLoader sampler, and load a subset of the original dataset that
    is exclusive to it.

    .. note::
        Dataset is assumed to be of constant size.

    Args:
        dataset(Dataset): this could be an instance of subclass of :ref:`api_paddle_io_Dataset`
                     or other python object which implemented
                     `__len__` for BatchSampler to get indices of samples.
        batch_size(int): sample size of each mini-batch.
        num_replicas(int, optional): process number in distributed training.
            If :attr:`num_replicas` is None, :attr:`num_replicas` will be
            retrieved from :ref:`api_paddle_distributed_ParallelEnv` .
            Default None.
        rank(int, optional): the rank of the current process among :attr:`num_replicas`
            processes. If :attr:`rank` is None, :attr:`rank` is retrieved from
            :ref:`api_paddle_distributed_ParallelEnv`. Default None.
        shuffle(bool, optional): whether to shuffle indices order before generating
            batch indices. Default False.
        drop_last(bool, optional): whether drop the last incomplete(less than a mini-batch) batch dataset size.
            Default False.

    Returns:
        DistributedBatchSampler, return an iterable object for indices iterating.

    Examples:
        .. code-block:: python

            >>> import numpy as np

            >>> from paddle.io import Dataset, DistributedBatchSampler

            >>> # init with dataset
            >>> class RandomDataset(Dataset):  # type: ignore[type-arg]
            ...     def __init__(self, num_samples):
            ...         self.num_samples = num_samples
            ...
            ...     def __getitem__(self, idx):
            ...         image = np.random.random([784]).astype('float32')
            ...         label = np.random.randint(0, 9, (1, )).astype('int64')
            ...         return image, label
            ...
            ...     def __len__(self):
            ...         return self.num_samples
            ...
            >>> dataset = RandomDataset(100)
            >>> sampler = DistributedBatchSampler(dataset, batch_size=64)

            >>> for data in sampler:
            ...     # do something
            ...     break
    r   r   r   r   r   r   nranksepoch
local_rankr(   
total_sizeNFnum_replicas
int | Nonerankr   r   r   c                 C  s  || _ t|tr|dksJ d|| _t|tsJ d|| _t|ts(J dddlm} |d urCt|tr;|dks?J d|| _n| j| _|d ur]t|trU|dksYJ d|| _	n| j	| _	|| _
d| _ttt| j d | j | _| j| j | _d	| _d S )
Nr   z'batch_size should be a positive integerz!shuffle should be a boolean valuez$drop_last should be a boolean number)ParallelEnvz)num_replicas should be a positive integerz%rank should be a non-negative integerg      ?r   )r   r   r   r   r   r   Zpaddle.distributedr<   r5   r7   r   r6   mathceilr!   r(   r8   r   )r   r   r   r9   r;   r   r   r<   r   r   r   r      s8   	

 
z DistributedBatchSampler.__init__r   c           	      #  s8    j  j }t j}t| } jt| }|t|kr)||d | 7 }n||t	|t|  d | 7 }t| jksCJ  j
rWtj j
|   jd7  _ fdd} jdkrf||}t| jksoJ t|}g }|D ]}|| t||kr|V  g }qw jst|dkr|V  d S d S d S )Nr   c                   s   g } j  j j  }| j dksJ | j }t j j t| |  j j D ]}|| || j   q+| t| | d  } ||  j|  jd |   |S )Nr   r   )r8   r   r5   ranger7   r!   extend)indicesZsubsampled_indicesZlast_batch_sizeZlast_local_batch_sizeir3   r   r   _get_indices_by_batch_sizeA  s(   



	zDDistributedBatchSampler.__iter__.<locals>._get_indices_by_batch_sizer   )r   r   r!   r   npZarangetolistr8   r=   r>   r   randomZRandomStater6   r5   r(   iterr    r   )	r   r"   r(   rA   Zpadding_sizerC   Z_sample_iterr#   r$   r   r3   r   r%   .  s:   



z DistributedBatchSampler.__iter__c                 C  s2   | j | j }| j}|t| j |d  7 }|| S r&   )r   r   r(   r   r   r'   r   r   r   r)   i  s   zDistributedBatchSampler.__len__c                 C  s
   || _ dS )a  
        Sets the epoch number. When :attr:`shuffle=True`, this number is used
        as seeds of random numbers. By default, users may not set this, all
        replicas (workers) use a different random ordering for each epoch.
        If set same number at each epoch, this sampler will yield the same
        ordering at all epochs.

        Arguments:
            epoch (int): Epoch number.

        Examples:
            .. code-block:: python

                >>> import numpy as np

                >>> from paddle.io import Dataset, DistributedBatchSampler

                >>> # init with dataset
                >>> class RandomDataset(Dataset):  # type: ignore[type-arg]
                ...     def __init__(self, num_samples):
                ...         self.num_samples = num_samples
                ...
                ...     def __getitem__(self, idx):
                ...         image = np.random.random([784]).astype('float32')
                ...         label = np.random.randint(0, 9, (1, )).astype('int64')
                ...         return image, label
                ...
                ...     def __len__(self):
                ...         return self.num_samples
                ...
                >>> dataset = RandomDataset(100)
                >>> sampler = DistributedBatchSampler(dataset, batch_size=64)

                >>> for epoch in range(10):
                ...     sampler.set_epoch(epoch)
        N)r6   )r   r6   r   r   r   	set_epocho  s   
%z!DistributedBatchSampler.set_epoch)NNFF)r   r   r   r   r9   r:   r;   r:   r   r   r   r   r   r   r*   r+   )r6   r   r   r   )	r,   r-   r.   r/   r0   r   r%   r)   rH   r   r   r   r   r4      s$   
 9
0
;r4   )
__future__r   r=   collections.abcr   r   r   r   numpyrD   r   r   r   r	   r
   r   r   r   r1   r4   r   r   r   r   <module>   s    