U
    %d                     @   s   d dl Z d dlmZ d dlmZmZmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d	Zd
ZdZeeeef  dddZG dd deZdS )    N)Path)ListTupleUnion)Tensor)download_url_to_file)Dataset)load_librispeech_item)extract_archiveZlibrispeech_finetuningzIhttps://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgzZ@5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342afreturnc                 C   s   |dkr&dd t | d| D }n^|dkrtdd t | d| D }|dkr|d	d t | d
| D 7 }ntd| dt|dd d}|S )zGet the file names and the corresponding file paths without `speaker_id`
    and `chapter_id` directories.
    The format of path is like:
        {root}/{_ARCHIVE_NAME}/1h/[0-5]/[clean, other] or
        {root}/{_ARCHIVE_NAME}/9h/[clean, other]
    10minc                 S   s.   g | ]&}t jt j|d d t|jfqS z..ospathjoindirnamestrstem.0p r   J/tmp/pip-unpacked-wheel-lbdmvq91/torchaudio/datasets/librilight_limited.py
<listcomp>   s   z&_get_fileids_paths.<locals>.<listcomp>z1h/0/*/*/*/*)1h10hc                 S   s.   g | ]&}t jt j|d d t|jfqS r   r   r   r   r   r   r      s   z1h/*/*/*/*/*r   c                 S   s.   g | ]&}t jt j|d d t|jfqS r   r   r   r   r   r   r   #   s   z
9h/*/*/*/*z Unsupported subset value. Found .c                 S   s   | d | d  S )Nr      r   )xr   r   r   <lambda>)       z$_get_fileids_paths.<locals>.<lambda>)key)r   glob
ValueErrorsorted)r   subset
_ext_audioZfiles_pathsr   r   r   _get_fileids_paths   s    
r)   c                   @   sd   e Zd ZdZdZdZdeeef ee	dddd	Z
eeeeeeeef d
ddZedddZdS )LibriLightLimiteda  Create a Dataset for LibriLightLimited, which is the supervised subset of
        LibriLight dataset.

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        subset (str, optional): The subset to use. Options: [``10min``, ``1h``, ``10h``]
            (Default: ``10min``).
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (default: ``False``).
    z
.trans.txtz.flacr   FN)rootr'   downloadr   c                 C   s   |dkst dt|}tj|t| _tj|t d}tj| jsz|sXtdtj	|srt
t|td t| t| j|| j| _d S )N)r   r   r   z.`subset` must be one of ['10min', '1h', '10h']z.tgzz9Dataset not found. Please use `download=True` to download)Zhash_prefix)AssertionErrorr   fspathr   r   _ARCHIVE_NAME_pathisdirRuntimeErrorisfiler   _URL	_CHECKSUMr
   r)   r(   _fileids_paths)selfr+   r'   r,   archiver   r   r   __init__<   s    
zLibriLightLimited.__init__)nr   c                 C   s    | j | \}}t||| j| jS )a  Load the n-th sample from the dataset.
        Args:
            n (int): The index of the sample to be loaded
        Returns:
            (Tensor, int, str, int, int, int):
            ``(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)``
        )r6   r	   r(   _ext_txt)r7   r:   	file_pathZfileidr   r   r   __getitem__O   s    zLibriLightLimited.__getitem__r   c                 C   s
   t | jS )N)lenr6   )r7   r   r   r   __len__Z   s    zLibriLightLimited.__len__)r   F)__name__
__module____qualname____doc__r;   r(   r   r   r   boolr9   intr   r   r=   r?   r   r   r   r   r*   -   s     
 r*   )r   pathlibr   typingr   r   r   Ztorchr   Z	torch.hubr   Ztorch.utils.datar   Ztorchaudio.datasets.librispeechr	   Ztorchaudio.datasets.utilsr
   r/   r4   r5   r   r)   r*   r   r   r   r   <module>   s   