import json
import math
from abc import ABC, abstractmethod
from dataclasses import dataclass
from functools import partial
from typing import Callable, List, Tuple

import torch
import torchaudio
from torchaudio._internal import module_utils
from torchaudio.models import emformer_rnnt_base, RNNT, RNNTBeamSearch


__all__ = []

_decibel = 2 * 20 * math.log10(torch.iinfo(torch.int16).max)
_gain = pow(10, 0.05 * _decibel)


def _piecewise_linear_log(x):
    x[x > math.e] = torch.log(x[x > math.e])
    x[x <= math.e] = x[x <= math.e] / math.e
    return x


class _FunctionalModule(torch.nn.Module):
    def __init__(self, functional):
        super().__init__()
        self.functional = functional

    def forward(self, input):
        return self.functional(input)


class _GlobalStatsNormalization(torch.nn.Module):
    def __init__(self, global_stats_path):
        super().__init__()

        with open(global_stats_path) as f:
            blob = json.loads(f.read())

        self.register_buffer("mean", torch.tensor(blob["mean"]))
        self.register_buffer("invstddev", torch.tensor(blob["invstddev"]))

    def forward(self, input):
        return (input - self.mean) * self.invstddev


class _FeatureExtractor(ABC):
    @abstractmethod
    def __call__(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Generates features and length output from the given input tensor.

        Args:
            input (torch.Tensor): input tensor.

        Returns:
            (torch.Tensor, torch.Tensor):
            torch.Tensor:
                Features, with shape `(length, *)`.
            torch.Tensor:
                Length, with shape `(1,)`.
        """


class _TokenProcessor(ABC):
    @abstractmethod
    def __call__(self, tokens: List[int], **kwargs) -> str:
        """Decodes given list of tokens to text sequence.

        Args:
            tokens (List[int]): list of tokens to decode.

        Returns:
            str:
                Decoded text sequence.
        """


class _ModuleFeatureExtractor(torch.nn.Module, _FeatureExtractor):
    """``torch.nn.Module``-based feature extraction pipeline.

    Args:
        pipeline (torch.nn.Module): module that implements feature extraction logic.
    N)pipeliner3   c                    s   t    || _d S r   )r   r   r=   )r   r=   r   r   r   r   W   s    
    def forward(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Generates features and length output from the given input tensor.

        Args:
            input (torch.Tensor): input tensor.

        Returns:
            (torch.Tensor, torch.Tensor):
            torch.Tensor:
                Features, with shape `(length, *)`.
            torch.Tensor:
                Length, with shape `(1,)`.
        """
        features = self.pipeline(input)
        length = torch.tensor([features.shape[0]])
        return features, length
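

# Illustrative usage (a minimal sketch; the MelSpectrogram settings here are arbitrary example
# values, not the bundle's): any ``torch.nn.Module`` that maps a waveform to a ``(frames, *)``
# feature tensor can be wrapped, e.g.:
#
#     extractor = _ModuleFeatureExtractor(
#         torch.nn.Sequential(
#             torchaudio.transforms.MelSpectrogram(sample_rate=16000),
#             _FunctionalModule(lambda x: x.transpose(1, 0)),
#         )
#     )
#     features, length = extractor(torch.randn(16000))  # length == tensor([frames])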
z_ModuleFeatureExtractor.forward)r$   r%   r&   __doc__r   nnModuler   r6   r   r"   r'   r   r   r   r   r<   P   s   r<   c                   @   s8   e Zd ZdZeddddZd
ee eeddd	Z	dS )_SentencePieceTokenProcessorztSentencePiece-model-based token processor.

    Args:
        sp_model_path (str): path to SentencePiece model.
    N)sp_model_pathr3   c                 C   sJ   t dstddd l}|j|d| _| j | j | j h| _	d S )Nsentencepiecez2SentencePiece is not available. Please install it.r   )Z
model_file)
r	   Zis_module_availableRuntimeErrorrF   ZSentencePieceProcessorsp_modelZunk_idZeos_idZpad_idpost_process_remove_list)r   rE   Zspmr   r   r   r   t   s    
z%_SentencePieceTokenProcessor.__init__T)r8   lstripr3   c                    sH    fdd|dd D }d  j|dd}|r@| S |S dS )aX  Decodes given list of tokens to text sequence.

        Args:
            tokens (List[int]): list of tokens to decode.
            lstrip (bool, optional): if ``True``, returns text sequence with leading whitespace
                removed. (Default: ``True``).

        Returns:
            str:
                Decoded text sequence.
        """
        filtered_hypo_tokens = [
            token_index for token_index in tokens[1:] if token_index not in self.post_process_remove_list
        ]
        output_string = "".join(self.sp_model.id_to_piece(filtered_hypo_tokens)).replace("\u2581", " ")

        if lstrip:
            return output_string.lstrip()
        else:
            return output_string
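

# Illustrative usage (a minimal sketch; the model filename and token ids below are hypothetical):
# the processor drops the leading token of a hypothesis (``tokens[1:]``), filters out the
# unk/eos/pad ids collected in ``post_process_remove_list``, joins the remaining SentencePiece
# pieces, and converts the "\u2581" word-boundary marker into spaces, e.g.:
#
#     token_processor = _SentencePieceTokenProcessor("spm_bpe_4096_librispeech.model")
#     text = token_processor([4096, 17, 289, 34], lstrip=True)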


@dataclass
class RNNTBundle:
    """torchaudio.pipelines.RNNTBundle()

    Dataclass that bundles components for performing automatic speech recognition (ASR, speech-to-text)
    inference with an RNN-T model.

    More specifically, the class provides methods that produce the featurization pipeline,
    decoder wrapping the specified RNN-T model, and output token post-processor that together
    constitute a complete end-to-end ASR inference pipeline that produces a text sequence
    given a raw waveform.

    It can support non-streaming (full-context) inference as well as streaming inference.

    Users should not directly instantiate objects of this class; rather, users should use the
    instances (representing pre-trained models) that exist within the module,
    e.g. :py:obj:`EMFORMER_RNNT_BASE_LIBRISPEECH`.

    Example
        >>> import torchaudio
        >>> from torchaudio.pipelines import EMFORMER_RNNT_BASE_LIBRISPEECH
        >>> import torch
        >>>
        >>> # Non-streaming inference.
        >>> # Build feature extractor, decoder with RNN-T model, and token processor.
        >>> feature_extractor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_feature_extractor()
        100%|███████████████████████████████| 3.81k/3.81k [00:00<00:00, 4.22MB/s]
        >>> decoder = EMFORMER_RNNT_BASE_LIBRISPEECH.get_decoder()
        Downloading: "https://download.pytorch.org/torchaudio/models/emformer_rnnt_base_librispeech.pt"
        100%|███████████████████████████████| 293M/293M [00:07<00:00, 42.1MB/s]
        >>> token_processor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_token_processor()
        100%|███████████████████████████████| 295k/295k [00:00<00:00, 25.4MB/s]
        >>>
        >>> # Instantiate LibriSpeech dataset; retrieve waveform for first sample.
        >>> dataset = torchaudio.datasets.LIBRISPEECH("/home/librispeech", url="test-clean")
        >>> waveform = next(iter(dataset))[0].squeeze()
        >>>
        >>> with torch.no_grad():
        >>>     # Produce mel-scale spectrogram features.
        >>>     features, length = feature_extractor(waveform)
        >>>
        >>>     # Generate top-10 hypotheses.
        >>>     hypotheses = decoder(features, length, 10)
        >>>
        >>> # For top hypothesis, convert predicted tokens to text.
        >>> text = token_processor(hypotheses[0][0])
        >>> print(text)
        he hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to [...]
        >>>
        >>>
        >>> # Streaming inference.
        >>> hop_length = EMFORMER_RNNT_BASE_LIBRISPEECH.hop_length
        >>> num_samples_segment = EMFORMER_RNNT_BASE_LIBRISPEECH.segment_length * hop_length
        >>> num_samples_segment_right_context = (
        >>>     num_samples_segment + EMFORMER_RNNT_BASE_LIBRISPEECH.right_context_length * hop_length
        >>> )
        >>>
        >>> # Build streaming inference feature extractor.
        >>> streaming_feature_extractor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_streaming_feature_extractor()
        >>>
        >>> # Process same waveform as before, this time sequentially across overlapping segments
        >>> # to simulate streaming inference. Note the usage of ``streaming_feature_extractor`` and ``decoder.infer``.
        >>> state, hypothesis = None, None
        >>> for idx in range(0, len(waveform), num_samples_segment):
        >>>     segment = waveform[idx: idx + num_samples_segment_right_context]
        >>>     segment = torch.nn.functional.pad(segment, (0, num_samples_segment_right_context - len(segment)))
        >>>     with torch.no_grad():
        >>>         features, length = streaming_feature_extractor(segment)
        >>>         hypotheses, state = decoder.infer(features, length, 10, state=state, hypothesis=hypothesis)
        >>>     hypothesis = hypotheses[0]
        >>>     transcript = token_processor(hypothesis[0])
        >>>     if transcript:
        >>>         print(transcript, end=" ", flush=True)
        he hoped there would be stew for dinner turn ips and car rots and bru 'd oes and fat mut ton pieces to [...]
    """

    class FeatureExtractor(_FeatureExtractor):
        pass

    class TokenProcessor(_TokenProcessor):
        pass

    _rnnt_path: str
    _rnnt_factory_func: Callable[[], RNNT]
    _global_stats_path: str
    _sp_model_path: str
    _right_padding: int
    _blank: int
    _sample_rate: int
    _n_fft: int
    _n_mels: int
    _hop_length: int
    _segment_length: int
    _right_context_length: int

    def _get_model(self) -> RNNT:
        model = self._rnnt_factory_func()
        path = torchaudio.utils.download_asset(self._rnnt_path)
        state_dict = torch.load(path)
        model.load_state_dict(state_dict)
        model.eval()
        return model

    @property
    def sample_rate(self) -> int:
        """Sample rate (in cycles per second) of input waveforms.

        :type: int
        """
        return self._sample_rate

    @property
    def n_fft(self) -> int:
        """Size of FFT window to use.

        :type: int
        """
        return self._n_fft

    @property
    def n_mels(self) -> int:
        """Number of mel spectrogram features to extract from input waveforms.

        :type: int
        """
        return self._n_mels

    @property
    def hop_length(self) -> int:
        """Number of samples between successive frames in input expected by model.

        :type: int
        """
        return self._hop_length

    @property
    def segment_length(self) -> int:
        """Number of frames in segment in input expected by model.

        :type: int
        """
        return self._segment_length

    @property
    def right_context_length(self) -> int:
        """Number of frames in right contextual block in input expected by model.

        :type: int
        """
        return self._right_context_length

    def get_decoder(self) -> RNNTBeamSearch:
        """Constructs RNN-T decoder.

        Returns:
            RNNTBeamSearch
        """
        model = self._get_model()
        return RNNTBeamSearch(model, self._blank)

    def get_feature_extractor(self) -> FeatureExtractor:
        """Constructs feature extractor for non-streaming (full-context) ASR.

        Returns:
            FeatureExtractor
        """
        local_path = torchaudio.utils.download_asset(self._global_stats_path)
        return _ModuleFeatureExtractor(
            torch.nn.Sequential(
                torchaudio.transforms.MelSpectrogram(
                    sample_rate=self.sample_rate, n_fft=self.n_fft, n_mels=self.n_mels, hop_length=self.hop_length
                ),
                _FunctionalModule(lambda x: x.transpose(1, 0)),
                _FunctionalModule(lambda x: _piecewise_linear_log(x * _gain)),
                _GlobalStatsNormalization(local_path),
                _FunctionalModule(lambda x: torch.nn.functional.pad(x, (0, 0, 0, self._right_padding))),
            )
        )

    def get_streaming_feature_extractor(self) -> FeatureExtractor:
        """Constructs feature extractor for streaming (simultaneous) ASR.

        Returns:
            FeatureExtractor
        """
        local_path = torchaudio.utils.download_asset(self._global_stats_path)
        return _ModuleFeatureExtractor(
            torch.nn.Sequential(
                torchaudio.transforms.MelSpectrogram(
                    sample_rate=self.sample_rate, n_fft=self.n_fft, n_mels=self.n_mels, hop_length=self.hop_length
                ),
                _FunctionalModule(lambda x: x.transpose(1, 0)),
                _FunctionalModule(lambda x: _piecewise_linear_log(x * _gain)),
                _GlobalStatsNormalization(local_path),
            )
        )

    def get_token_processor(self) -> TokenProcessor:
        """Constructs token processor.

        Returns:
            TokenProcessor
        """
        local_path = torchaudio.utils.download_asset(self._sp_model_path)
        return _SentencePieceTokenProcessor(local_path)


EMFORMER_RNNT_BASE_LIBRISPEECH = RNNTBundle(
    _rnnt_path="models/emformer_rnnt_base_librispeech.pt",
    _rnnt_factory_func=partial(emformer_rnnt_base, num_symbols=4097),
    _global_stats_path="pipeline-assets/global_stats_rnnt_librispeech.json",
    _sp_model_path="pipeline-assets/spm_bpe_4096_librispeech.model",
    _right_padding=4,
    _blank=4096,
    _sample_rate=16000,
    _n_fft=400,
    _n_mels=80,
    _hop_length=160,
    _segment_length=16,
    _right_context_length=4,
)
EMFORMER_RNNT_BASE_LIBRISPEECH.__doc__ = """Pre-trained Emformer-RNNT-based ASR pipeline capable of performing both streaming and non-streaming inference.

    The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base`
    and utilizes weights trained on LibriSpeech using training script ``train.py``
    `here <https://github.com/pytorch/audio/tree/main/examples/asr/emformer_rnnt>`__ with default arguments.

    Please refer to :py:class:`RNNTBundle` for usage instructions.
    """
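
# Streaming segment arithmetic for this bundle (derived from the constants above): one segment
# spans segment_length * hop_length = 16 * 160 = 2560 samples (160 ms at 16 kHz), with
# right_context_length * hop_length = 4 * 160 = 640 samples of lookahead appended, matching the
# streaming example in the RNNTBundle docstring.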