o
    `)iS                     @   s   d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
mZ d dlmZ ddlmZmZ ddlmZ ddlmZ G d	d
 d
eZdS )    N)Path)AnyCallableDictListOptionalTupleUnion)Tensor   )find_classesmake_dataset)
VideoClips)VisionDatasetc                !       s  e Zd ZdZdZdddZdZdZ									
	
	
	
	d)dee	e
f de	dededee dededee deee	ef  dededededede	ddf  fddZedee	ef fddZd ee	 d!e	dededee f
d"d#Zdefd$d%Zd&edeeeef fd'd(Z  ZS )*HMDB51a  
    `HMDB51 <https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/>`_
    dataset.

    HMDB51 is an action recognition video dataset.
    This dataset consider every video as a collection of video clips of fixed size, specified
    by ``frames_per_clip``, where the step in frames between each clip is given by
    ``step_between_clips``.

    To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
    and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
    elements will come from video 1, and the next three elements from video 2.
    Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all
    frames in a video might be present.

    Internally, it uses a VideoClips object to handle clip creation.

    Args:
        root (str or ``pathlib.Path``): Root directory of the HMDB51 Dataset.
        annotation_path (str): Path to the folder containing the split files.
        frames_per_clip (int): Number of frames in a clip.
        step_between_clips (int): Number of frames between each clip.
        fold (int, optional): Which fold to use. Should be between 1 and 3.
        train (bool, optional): If ``True``, creates a dataset from the train split,
            otherwise from the ``test`` split.
        transform (callable, optional): A function/transform that takes in a TxHxWxC video
            and returns a transformed version.
        output_format (str, optional): The format of the output video tensors (before transforms).
            Can be either "THWC" (default) or "TCHW".

    Returns:
        tuple: A 3-tuple with the following entries:

            - video (Tensor[T, H, W, C] or Tensor[T, C, H, W]): The `T` video frames
            - audio(Tensor[K, L]): the audio frames, where `K` is the number of channels
              and `L` is the number of points
            - label (int): class of the video clip
    zJhttps://serre-lab.clps.brown.edu/wp-content/uploads/2013/10/hmdb51_org.rarzQhttps://serre-lab.clps.brown.edu/wp-content/uploads/2013/10/test_train_splits.rarZ 15e67781e70dcfbdce2d7dbb9b3344b5)urlmd5r      NTr   THWCrootannotation_pathframes_per_clipstep_between_clips
frame_ratefoldtrain	transform_precomputed_metadatanum_workers_video_width_video_height_video_min_dimension_audio_samplesoutput_formatreturnc                    s   t  | |dvrtd| d}t| j\| _}t| j||| _dd | jD }t|||||	|
|||||d}|| _	|| _
|| _| ||||| _|| j| _|| _d S )N)r   r      z$fold should be between 1 and 3, got )Zavic                 S   s   g | ]\}}|qS  r&   ).0path_r&   r&   g/home/app/PaddleOCR-VL-test/.venv_paddleocr/lib/python3.10/site-packages/torchvision/datasets/hmdb51.py
<listcomp>[   s    z#HMDB51.__init__.<locals>.<listcomp>)r   r   r    r!   r"   r#   )super__init__
ValueErrorr   r   classesr   samplesr   full_video_clipsr   r   _select_foldindicesZsubsetvideo_clipsr   )selfr   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   
extensionsZclass_to_idxZvideo_pathsr4   	__class__r&   r*   r-   =   s<   
zHMDB51.__init__c                 C   s   | j jS N)r1   metadatar5   r&   r&   r*   r:   s   s   zHMDB51.metadata
video_listannotations_dirc              	   C   s   |r| j n| j}d| d}tj||}t|}t }	|D ]2}
t|
}| }W d    n1 s4w   Y  |D ]}|	 \}}t
|}||krP|	| q;qg }t|D ]\}}tj||	v ri|| qX|S )Nz*test_splitz.txt)	TRAIN_TAGTEST_TAGosr(   joinglobsetopen	readlinessplitintadd	enumeratebasenameappend)r5   r<   r=   r   r   Z
target_tagZsplit_pattern_nameZsplit_pattern_pathZannotation_pathsZselected_filesfilepathZfidlineslineZvideo_filenameZ
tag_stringtagr3   Zvideo_indexZ
video_pathr&   r&   r*   r2   w   s,   




zHMDB51._select_foldc                 C   s
   | j  S r9   )r4   Z	num_clipsr;   r&   r&   r*   __len__   s   
zHMDB51.__len__idxc                 C   sJ   | j |\}}}}| j| }| j| \}}| jd ur | |}|||fS r9   )r4   Zget_clipr3   r0   r   )r5   rQ   ZvideoZaudior)   Z	video_idxZsample_indexZclass_indexr&   r&   r*   __getitem__   s   



zHMDB51.__getitem__)r   Nr   TNNr   r   r   r   r   r   )__name__
__module____qualname____doc__Zdata_urlZsplitsr>   r?   r	   strr   rG   r   boolr   r   r   r-   propertyr:   r   r2   rP   r   r
   rR   __classcell__r&   r&   r7   r*   r      st    '
	
6&$r   )rB   r@   pathlibr   typingr   r   r   r   r   r   r	   Ztorchr
   folderr   r   Zvideo_utilsr   Zvisionr   r   r&   r&   r&   r*   <module>   s    $