import csv
import os
import time
import urllib.parse
import warnings
from functools import partial
from multiprocessing import Pool
from os import path
from typing import Any, Callable, Dict, Optional, Tuple

from torch import Tensor

from .folder import find_classes, make_dataset
from .utils import download_and_extract_archive, download_url, verify_str_arg, check_integrity
from .video_utils import VideoClips
from .vision import VisionDataset


def _dl_wrap(tarpath: str, videopath: str, line: str) -> None:
    # Download one archive and extract it; used as the worker function for the download pool.
    download_and_extract_archive(line, tarpath, videopath)


class Kinetics(VisionDataset):
    """`Generic Kinetics <https://www.deepmind.com/open-source/kinetics>`_
    dataset.

    Kinetics-400/600/700 are action recognition video datasets.
    This dataset considers every video as a collection of video clips of fixed size, specified
    by ``frames_per_clip``, where the step in frames between each clip is given by
    ``step_between_clips``.

    To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
    and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
    elements will come from video 1, and the next three elements from video 2.
    Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all
    frames in a video might be present.
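
    More generally, with the default ``frame_rate`` (no resampling), a video with ``n`` frames
    contributes ``(n - frames_per_clip) // step_between_clips + 1`` clips when
    ``n >= frames_per_clip``, and no clips otherwise.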

    Args:
        root (string): Root directory of the Kinetics Dataset.
            Directory should be structured as follows:
            .. code::

                root/
                ├── split
                │   ├──  class1
                │   │   ├──  clip1.mp4
                │   │   ├──  clip2.mp4
                │   │   ├──  clip3.mp4
                │   │   ├──  ...
                │   ├──  class2
                │   │   ├──   clipx.mp4
                │   │    └── ...

            Note: split is appended automatically using the split argument.
        frames_per_clip (int): number of frames in a clip
        num_classes (str): select between Kinetics-400 (``"400"``, default), Kinetics-600 (``"600"``),
            and Kinetics-700 (``"700"``)
        split (str): split of the dataset to consider; supports ``"train"`` (default), ``"val"``, and ``"test"``
        frame_rate (float): If omitted, each clip is kept at the video's original frame rate.
        step_between_clips (int): number of frames between each clip
        transform (callable, optional): A function/transform that takes in a TxHxWxC video
            and returns a transformed version.
        download (bool): Download the official version of the dataset to the root folder.
        num_workers (int): Use multiple workers for VideoClips creation.
        num_download_workers (int): Use multiprocessing in order to speed up download.
        output_format (str, optional): The format of the output video tensors (before transforms).
            Can be either "THWC" or "TCHW" (default).
            Note that in most other utils and datasets, the default is actually "THWC".

    Returns:
        tuple: A 3-tuple with the following entries:

            - video (Tensor[T, C, H, W] or Tensor[T, H, W, C]): the `T` video frames as a torch.uint8 tensor
            - audio (Tensor[K, L]): the audio frames, where `K` is the number of channels
              and `L` is the number of points, as a torch.float tensor
            - label (int): class of the video clip
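
    Example:
        A minimal usage sketch, assuming the clips are already downloaded and arranged
        under ``root/<split>/<class>/`` as shown above (the root path below is purely
        illustrative)::

            from torchvision.datasets import Kinetics

            dataset = Kinetics(
                "path/to/kinetics",  # hypothetical root directory
                frames_per_clip=16,
                num_classes="400",
                split="train",
            )
            video, audio, label = dataset[0]  # video: Tensor[T, C, H, W] by default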

    Raises:
        RuntimeError: If ``download is True`` and the video archives are already extracted.
    """

    _TAR_URLS = {
        "400": "https://s3.amazonaws.com/kinetics/400/{split}/k400_{split}_path.txt",
        "600": "https://s3.amazonaws.com/kinetics/600/{split}/k600_{split}_path.txt",
        "700": "https://s3.amazonaws.com/kinetics/700_2020/{split}/k700_2020_{split}_path.txt",
    }
    _ANNOTATION_URLS = {
        "400": "https://s3.amazonaws.com/kinetics/400/annotations/{split}.csv",
        "600": "https://s3.amazonaws.com/kinetics/600/annotations/{split}.csv",
        "700": "https://s3.amazonaws.com/kinetics/700_2020/annotations/{split}.csv",
    }

    def __init__(
        self,
        root: str,
        frames_per_clip: int,
        num_classes: str = "400",
        split: str = "train",
        frame_rate: Optional[int] = None,
        step_between_clips: int = 1,
        transform: Optional[Callable] = None,
        extensions: Tuple[str, ...] = ("avi", "mp4"),
        download: bool = False,
        num_download_workers: int = 1,
        num_workers: int = 1,
        _precomputed_metadata: Optional[Dict[str, Any]] = None,
        _video_width: int = 0,
        _video_height: int = 0,
        _video_min_dimension: int = 0,
        _audio_samples: int = 0,
        _audio_channels: int = 0,
        _legacy: bool = False,
        output_format: str = "TCHW",
    ) -> None:
        self.num_classes = verify_str_arg(num_classes, arg="num_classes", valid_values=["400", "600", "700"])
        self.extensions = extensions
        self.num_download_workers = num_download_workers
        self.root = root
        self._legacy = _legacy

        if _legacy:
            # Legacy layout: clips sit directly under root/class and video is returned as THWC.
            print("Using legacy structure")
            self.split_folder = root
            self.split = "unknown"
            output_format = "THWC"
            if download:
                raise ValueError("Cannot download the videos using legacy_structure.")
        else:
            self.split_folder = path.join(root, split)
            self.split = verify_str_arg(split, arg="split", valid_values=["train", "val", "test"])

        if download:
            self.download_and_process_videos()

        super().__init__(self.root)

        self.classes, class_to_idx = find_classes(self.split_folder)
        self.samples = make_dataset(self.split_folder, class_to_idx, extensions, is_valid_file=None)
        video_list = [x[0] for x in self.samples]
        self.video_clips = VideoClips(
            video_list,
            frames_per_clip,
            step_between_clips,
            frame_rate,
            _precomputed_metadata,
            num_workers=num_workers,
            _video_width=_video_width,
            _video_height=_video_height,
            _video_min_dimension=_video_min_dimension,
            _audio_samples=_audio_samples,
            _audio_channels=_audio_channels,
            output_format=output_format,
        )
        self.transform = transform

    def download_and_process_videos(self) -> None:
        """Downloads all the videos to the _root_ folder in the expected format."""
        tic = time.time()
        self._download_videos()
        toc = time.time()
        print("Elapsed time for downloading in mins ", (toc - tic) / 60)
        self._make_ds_structure()
        toc2 = time.time()
        print("Elapsed time for processing in mins ", (toc2 - toc) / 60)
        print("Elapsed time overall in mins ", (toc2 - tic) / 60)

    def _download_videos(self) -> None:
        """download tarballs containing the video to "tars" folder and extract them into the _split_ folder where
        split is one of the official dataset splits.
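
        The file list fetched from ``_TAR_URLS`` is assumed to contain one archive URL per
        line; each archive is downloaded into ``root/tars`` and extracted into the split
        folder (in parallel when ``num_download_workers`` is greater than 1).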

        Raises:
            RuntimeError: if download folder exists, break to prevent downloading entire dataset again.
        """
        if path.exists(self.split_folder):
            raise RuntimeError(
                f"The directory {self.split_folder} already exists. "
                f"If you want to re-download or re-extract the images, delete the directory."
            )
        tar_path = path.join(self.root, "tars")
        file_list_path = path.join(self.root, "files")

        split_url = self._TAR_URLS[self.num_classes].format(split=self.split)
        split_url_filepath = path.join(file_list_path, path.basename(split_url))
        if not check_integrity(split_url_filepath):
            download_url(split_url, file_list_path)
        with open(split_url_filepath) as file:
            # One archive URL per line; quote so the URLs are safe to request.
            list_video_urls = [urllib.parse.quote(line, safe="/,:") for line in file.read().splitlines()]

        if self.num_download_workers == 1:
            for line in list_video_urls:
                download_and_extract_archive(line, tar_path, self.split_folder)
        else:
            # Fan the downloads out over a multiprocessing pool.
            part = partial(_dl_wrap, tar_path, self.split_folder)
            poolproc = Pool(self.num_download_workers)
            poolproc.map(part, list_video_urls)

    def _make_ds_structure(self) -> None:
        """move videos from
        split_folder/
            ├── clip1.avi
            ├── clip2.avi

        to the correct format as described below:
        split_folder/
            ├── class1
            │   ├── clip1.avi

        """
        annotation_path = path.join(self.root, "annotations")
        if not check_integrity(path.join(annotation_path, f"{self.split}.csv")):
            download_url(self._ANNOTATION_URLS[self.num_classes].format(split=self.split), annotation_path)
        annotations = path.join(annotation_path, f"{self.split}.csv")

        # Each annotation row provides youtube_id, time_start, time_end and label for one clip.
        file_fmtstr = "{ytid}_{start:06}_{end:06}.mp4"
        with open(annotations) as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                f = file_fmtstr.format(
                    ytid=row["youtube_id"],
                    start=int(row["time_start"]),
                    end=int(row["time_end"]),
                )
                label = row["label"].replace(" ", "_").replace("'", "").replace("(", "").replace(")", "")
                os.makedirs(path.join(self.split_folder, label), exist_ok=True)
                downloaded_file = path.join(self.split_folder, f)
                if path.isfile(downloaded_file):
                    # Move the downloaded clip into its class folder.
                    os.replace(
                        downloaded_file,
                        path.join(self.split_folder, label, f),
                    )

    @property
    def metadata(self) -> Dict[str, Any]:
        return self.video_clips.metadata

    def __len__(self) -> int:
        return self.video_clips.num_clips()

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor, int]:
        video, audio, info, video_idx = self.video_clips.get_clip(idx)
        label = self.samples[video_idx][1]

        if self.transform is not None:
            video = self.transform(video)

        return video, audio, label


class Kinetics400(Kinetics):
    """
    `Kinetics-400 <https://deepmind.com/research/open-source/open-source-datasets/kinetics/>`_
    dataset.

    .. warning::
        This class was deprecated in ``0.12`` and will be removed in ``0.14``. Please use
        ``Kinetics(..., num_classes='400')`` instead.
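        For example, a call such as ``Kinetics400(root, frames_per_clip=16)`` becomes
        ``Kinetics(root, frames_per_clip=16, num_classes="400")``; note that the new class
        expects the ``root/split/class/clip`` layout documented on ``Kinetics`` and returns
        video as ``Tensor[T, C, H, W]`` by default.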

    Kinetics-400 is an action recognition video dataset.
    This dataset considers every video as a collection of video clips of fixed size, specified
    by ``frames_per_clip``, where the step in frames between each clip is given by
    ``step_between_clips``.

    To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
    and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
    elements will come from video 1, and the next three elements from video 2.
    Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all
    frames in a video might be present.

    Internally, it uses a VideoClips object to handle clip creation.

    Args:
        root (string): Root directory of the Kinetics-400 Dataset. Should be structured as follows:

            .. code::

                root/
                ├── class1
                │   ├── clip1.avi
                │   ├── clip2.avi
                │   ├── clip3.mp4
                │   └── ...
                └── class2
                    ├── clipx.avi
                    └── ...

        frames_per_clip (int): number of frames in a clip
        step_between_clips (int): number of frames between each clip
        transform (callable, optional): A function/transform that  takes in a TxHxWxC video
            and returns a transformed version.

    Returns:
        tuple: A 3-tuple with the following entries:

            - video (Tensor[T, H, W, C]): the `T` video frames
            - audio (Tensor[K, L]): the audio frames, where `K` is the number of channels
              and `L` is the number of points
            - label (int): class of the video clip
    """

    def __init__(
        self,
        root: str,
        frames_per_clip: int,
        num_classes: Any = None,
        split: Any = None,
        download: Any = None,
        num_download_workers: Any = None,
        **kwargs: Any,
    ) -> None:
        warnings.warn(
            "The Kinetics400 class is deprecated since 0.12 and will be removed in 0.14. "
            "Please use Kinetics(..., num_classes='400') instead. "
            "Note that Kinetics(..., num_classes='400') returns video in a Tensor[T, C, H, W] format."
        )
        if any(value is not None for value in (num_classes, split, download, num_download_workers)):
            raise RuntimeError(
                "Usage of 'num_classes', 'split', 'download', or 'num_download_workers' is not supported in "
                "Kinetics400. Please use Kinetics instead."
            )
        super().__init__(root=root, frames_per_clip=frames_per_clip, _legacy=True, **kwargs)