import math
import warnings
import numbers
from typing import List, Tuple, Optional, overload

import torch
from torch import Tensor
from .module import Module
from ..parameter import Parameter
from ..utils.rnn import PackedSequence
from .. import init
from ... import _VF

_rnn_impls = {
    'RNN_TANH': _VF.rnn_tanh,
    'RNN_RELU': _VF.rnn_relu,
}


def apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor:
    return tensor.index_select(dim, permutation)


class RNNBase(Module):
    __constants__ = ['mode', 'input_size', 'hidden_size', 'num_layers', 'bias',
                     'batch_first', 'dropout', 'bidirectional', 'proj_size']
    __jit_unused_properties__ = ['all_weights']

    mode: str
    input_size: int
    hidden_size: int
    num_layers: int
    bias: bool
    batch_first: bool
    dropout: float
    bidirectional: bool
    proj_size: int

    def __init__(self, mode: str, input_size: int, hidden_size: int,
                 num_layers: int = 1, bias: bool = True, batch_first: bool = False,
                 dropout: float = 0., bidirectional: bool = False, proj_size: int = 0,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super(RNNBase, self).__init__()
        self.mode = mode
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bias = bias
        self.batch_first = batch_first
        self.dropout = float(dropout)
        self.bidirectional = bidirectional
        self.proj_size = proj_size
        num_directions = 2 if bidirectional else 1

        if not isinstance(dropout, numbers.Number) or not 0 <= dropout <= 1 or \
                isinstance(dropout, bool):
            raise ValueError("dropout should be a number in range [0, 1] "
                             "representing the probability of an element being "
                             "zeroed")
        if dropout > 0 and num_layers == 1:
            warnings.warn("dropout option adds dropout after all but last "
                          "recurrent layer, so non-zero dropout expects "
                          "num_layers greater than 1, but got dropout={} and "
                          "num_layers={}".format(dropout, num_layers))

        if proj_size < 0:
            raise ValueError("proj_size should be a positive integer or zero to disable projections")
        if proj_size >= hidden_size:
            raise ValueError("proj_size has to be smaller than hidden_size")

        if mode == 'LSTM':
            gate_size = 4 * hidden_size
        elif mode == 'GRU':
            gate_size = 3 * hidden_size
        elif mode == 'RNN_TANH':
            gate_size = hidden_size
        elif mode == 'RNN_RELU':
            gate_size = hidden_size
        else:
            raise ValueError("Unrecognized RNN mode: " + mode)

        self._flat_weights_names = []
        self._all_weights = []
        for layer in range(num_layers):
            for direction in range(num_directions):
                real_hidden_size = proj_size if proj_size > 0 else hidden_size
                layer_input_size = input_size if layer == 0 else real_hidden_size * num_directions

                w_ih = Parameter(torch.empty((gate_size, layer_input_size), **factory_kwargs))
                w_hh = Parameter(torch.empty((gate_size, real_hidden_size), **factory_kwargs))
                b_ih = Parameter(torch.empty(gate_size, **factory_kwargs))
                # Second bias vector kept for cuDNN compatibility; only one is
                # needed in the standard definition.
                b_hh = Parameter(torch.empty(gate_size, **factory_kwargs))
                if self.proj_size == 0:
                    layer_params = (w_ih, w_hh, b_ih, b_hh) if bias else (w_ih, w_hh)
                else:
                    w_hr = Parameter(torch.empty((proj_size, hidden_size), **factory_kwargs))
                    layer_params = (w_ih, w_hh, b_ih, b_hh, w_hr) if bias else (w_ih, w_hh, w_hr)

                suffix = '_reverse' if direction == 1 else ''
                param_names = ['weight_ih_l{}{}', 'weight_hh_l{}{}']
                if bias:
                    param_names += ['bias_ih_l{}{}', 'bias_hh_l{}{}']
                if self.proj_size > 0:
                    param_names += ['weight_hr_l{}{}']
                param_names = [x.format(layer, suffix) for x in param_names]

                for name, param in zip(param_names, layer_params):
                    setattr(self, name, param)
                self._flat_weights_names.extend(param_names)
                self._all_weights.append(param_names)

        self._flat_weights = [(lambda wn: getattr(self, wn) if hasattr(self, wn) else None)(wn)
                              for wn in self._flat_weights_names]
        self.flatten_parameters()
        self.reset_parameters()

    def __setattr__(self, attr, value):
        if hasattr(self, "_flat_weights_names") and attr in self._flat_weights_names:
            # Keep self._flat_weights in sync when a weight is reassigned directly.
            idx = self._flat_weights_names.index(attr)
            self._flat_weights[idx] = value
        super(RNNBase, self).__setattr__(attr, value)

    def flatten_parameters(self) -> None:
        """Resets parameter data pointer so that they can use faster code paths.

        Right now, this works only if the module is on the GPU and cuDNN is enabled.
        Otherwise, it's a no-op.
        """
        # Short-circuits if _flat_weights is only partially instantiated.
        if len(self._flat_weights) != len(self._flat_weights_names):
            return
        for w in self._flat_weights:
            if not isinstance(w, Tensor):
                return
        # Short-circuits if any tensor in self._flat_weights is not acceptable
        # to cuDNN or if the tensors are of different dtypes.
        first_fw = self._flat_weights[0]
        dtype = first_fw.dtype
        for fw in self._flat_weights:
            if (not isinstance(fw.data, Tensor) or not (fw.data.dtype == dtype) or
                    not fw.data.is_cuda or
                    not torch.backends.cudnn.is_acceptable(fw.data)):
                return

        # If any parameters alias, fall back to the slower, copying code path.
        unique_data_ptrs = set(p.data_ptr() for p in self._flat_weights)
        if len(unique_data_ptrs) != len(self._flat_weights):
            return

        with torch.cuda.device_of(first_fw):
            import torch.backends.cudnn.rnn as rnn

            # no_grad() is necessary since _cudnn_rnn_flatten_weight is an
            # in-place operation on self._flat_weights.
            with torch.no_grad():
                if torch._use_cudnn_rnn_flatten_weight():
                    num_weights = 4 if self.bias else 2
                    if self.proj_size > 0:
                        num_weights += 1
                    torch._cudnn_rnn_flatten_weight(
                        self._flat_weights, num_weights,
                        self.input_size, rnn.get_cudnn_mode(self.mode),
                        self.hidden_size, self.proj_size, self.num_layers,
                        self.batch_first, bool(self.bidirectional))

    def _apply(self, fn):
        ret = super(RNNBase, self)._apply(fn)
        # Rebuild _flat_weights and re-flatten after parameters were moved or cast.
        self._flat_weights = [(lambda wn: getattr(self, wn) if hasattr(self, wn) else None)(wn)
                              for wn in self._flat_weights_names]
        self.flatten_parameters()
        return ret

    def reset_parameters(self) -> None:
        stdv = 1.0 / math.sqrt(self.hidden_size) if self.hidden_size > 0 else 0
        for weight in self.parameters():
            init.uniform_(weight, -stdv, stdv)

    def check_input(self, input: Tensor, batch_sizes: Optional[Tensor]) -> None:
        expected_input_dim = 2 if batch_sizes is not None else 3
        if input.dim() != expected_input_dim:
            raise RuntimeError(
                'input must have {} dimensions, got {}'.format(
                    expected_input_dim, input.dim()))
        if self.input_size != input.size(-1):
            raise RuntimeError(
                'input.size(-1) must be equal to input_size. Expected {}, got {}'.format(
                    self.input_size, input.size(-1)))

    def get_expected_hidden_size(self, input: Tensor,
                                 batch_sizes: Optional[Tensor]) -> Tuple[int, int, int]:
        if batch_sizes is not None:
            mini_batch = int(batch_sizes[0])
        else:
            mini_batch = input.size(0) if self.batch_first else input.size(1)
        num_directions = 2 if self.bidirectional else 1
        if self.proj_size > 0:
            return (self.num_layers * num_directions, mini_batch, self.proj_size)
        return (self.num_layers * num_directions, mini_batch, self.hidden_size)

    def check_hidden_size(self, hx: Tensor, expected_hidden_size: Tuple[int, int, int],
                          msg: str = 'Expected hidden size {}, got {}') -> None:
        if hx.size() != expected_hidden_size:
            raise RuntimeError(msg.format(expected_hidden_size, list(hx.size())))

    def check_forward_args(self, input: Tensor, hidden: Tensor, batch_sizes: Optional[Tensor]):
        self.check_input(input, batch_sizes)
        expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes)
        self.check_hidden_size(hidden, expected_hidden_size)

    def permute_hidden(self, hx: Tensor, permutation: Optional[Tensor]):
        if permutation is None:
            return hx
        return apply_permutation(hx, permutation)

    def extra_repr(self) -> str:
        s = '{input_size}, {hidden_size}'
        if self.proj_size != 0:
            s += ', proj_size={proj_size}'
        if self.num_layers != 1:
            s += ', num_layers={num_layers}'
        if self.bias is not True:
            s += ', bias={bias}'
        if self.batch_first is not False:
            s += ', batch_first={batch_first}'
        if self.dropout != 0:
            s += ', dropout={dropout}'
        if self.bidirectional is not False:
            s += ', bidirectional={bidirectional}'
        return s.format(**self.__dict__)

    def __setstate__(self, d):
        super(RNNBase, self).__setstate__(d)
        if 'all_weights' in d:
            self._all_weights = d['all_weights']
        # proj_size was added later; default it to 0 when loading older checkpoints.
        if 'proj_size' not in d:
            self.proj_size = 0

        if isinstance(self._all_weights[0][0], str):
            return
        # Older serialization stored parameter objects; rebuild the name lists.
        num_layers = self.num_layers
        num_directions = 2 if self.bidirectional else 1
        self._flat_weights_names = []
        self._all_weights = []
        for layer in range(num_layers):
            for direction in range(num_directions):
                suffix = '_reverse' if direction == 1 else ''
                weights = ['weight_ih_l{}{}', 'weight_hh_l{}{}', 'bias_ih_l{}{}',
                           'bias_hh_l{}{}', 'weight_hr_l{}{}']
                weights = [x.format(layer, suffix) for x in weights]
                if self.bias:
                    names = weights if self.proj_size > 0 else weights[:4]
                else:
                    names = weights[:2] + weights[-1:] if self.proj_size > 0 else weights[:2]
                self._all_weights += [names]
                self._flat_weights_names.extend(names)
        self._flat_weights = [(lambda wn: getattr(self, wn) if hasattr(self, wn) else None)(wn)
                              for wn in self._flat_weights_names]

    @property
    def all_weights(self) -> List[List[Parameter]]:
        return [[getattr(self, weight) for weight in weights] for weights in self._all_weights]

    def _replicate_for_data_parallel(self):
        replica = super(RNNBase, self)._replicate_for_data_parallel()
        # Copy the caches so the replica does not share the flat weight lists.
        replica._flat_weights = replica._flat_weights[:]
        replica._flat_weights_names = replica._flat_weights_names[:]
        return replica


class RNN(RNNBase):
    r"""Applies a multi-layer Elman RNN with :math:`\tanh` or :math:`\text{ReLU}` non-linearity to an
    input sequence.


    For each element in the input sequence, each layer computes the following
    function:

    .. math::
        h_t = \tanh(x_t W_{ih}^T + b_{ih} + h_{t-1}W_{hh}^T + b_{hh})

    where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is
    the input at time `t`, and :math:`h_{(t-1)}` is the hidden state of the
    previous layer at time `t-1` or the initial hidden state at time `0`.
    If :attr:`nonlinearity` is ``'relu'``, then :math:`\text{ReLU}` is used instead of :math:`\tanh`.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
            would mean stacking two RNNs together to form a `stacked RNN`,
            with the second RNN taking in outputs of the first RNN and
            computing the final results. Default: 1
        nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'``
        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
            Default: ``True``
        batch_first: If ``True``, then the input and output tensors are provided
            as `(batch, seq, feature)` instead of `(seq, batch, feature)`.
            Note that this does not apply to hidden or cell states. See the
            Inputs/Outputs sections below for details.  Default: ``False``
        dropout: If non-zero, introduces a `Dropout` layer on the outputs of each
            RNN layer except the last layer, with dropout probability equal to
            :attr:`dropout`. Default: 0
        bidirectional: If ``True``, becomes a bidirectional RNN. Default: ``False``

    Inputs: input, h_0
        * **input**: tensor of shape :math:`(L, H_{in})` for unbatched input,
          :math:`(L, N, H_{in})` when ``batch_first=False`` or
          :math:`(N, L, H_{in})` when ``batch_first=True`` containing the features of
          the input sequence.  The input can also be a packed variable length sequence.
          See :func:`torch.nn.utils.rnn.pack_padded_sequence` or
          :func:`torch.nn.utils.rnn.pack_sequence` for details.
        * **h_0**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or
          :math:`(D * \text{num\_layers}, N, H_{out})` containing the initial hidden
          state for the input sequence batch. Defaults to zeros if not provided.

        where:

        .. math::
            \begin{aligned}
                N ={} & \text{batch size} \\
                L ={} & \text{sequence length} \\
                D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\
                H_{in} ={} & \text{input\_size} \\
                H_{out} ={} & \text{hidden\_size}
            \end{aligned}

    Outputs: output, h_n
        * **output**: tensor of shape :math:`(L, D * H_{out})` for unbatched input,
          :math:`(L, N, D * H_{out})` when ``batch_first=False`` or
          :math:`(N, L, D * H_{out})` when ``batch_first=True`` containing the output features
          `(h_t)` from the last layer of the RNN, for each `t`. If a
          :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output
          will also be a packed sequence.
        * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or
          :math:`(D * \text{num\_layers}, N, H_{out})` containing the final hidden state
          for each element in the batch.

    Attributes:
        weight_ih_l[k]: the learnable input-hidden weights of the k-th layer,
            of shape `(hidden_size, input_size)` for `k = 0`. Otherwise, the shape is
            `(hidden_size, num_directions * hidden_size)`
        weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer,
            of shape `(hidden_size, hidden_size)`
        bias_ih_l[k]: the learnable input-hidden bias of the k-th layer,
            of shape `(hidden_size)`
        bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer,
            of shape `(hidden_size)`

    .. note::
        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`

    .. note::
        For bidirectional RNNs, forward and backward are directions 0 and 1 respectively.
        Example of splitting the output layers when ``batch_first=False``:
        ``output.view(seq_len, batch, num_directions, hidden_size)``.

    .. note::
        ``batch_first`` argument is ignored for unbatched inputs.

    .. include:: ../cudnn_rnn_determinism.rst

    .. include:: ../cudnn_persistent_rnn.rst

    Examples::

        >>> rnn = nn.RNN(10, 20, 2)
        >>> input = torch.randn(5, 3, 10)
        >>> h0 = torch.randn(2, 3, 20)
        >>> output, hn = rnn(input, h0)
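
    A packed variable-length batch can be passed in the same way (a minimal
    sketch; the lengths below are only illustrative)::

        >>> from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
        >>> packed = pack_padded_sequence(input, lengths=[5, 3, 1])
        >>> packed_output, hn = rnn(packed, h0)
        >>> output, lengths = pad_packed_sequence(packed_output)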
    """

    def __init__(self, *args, **kwargs):
        if 'proj_size' in kwargs:
            raise ValueError("proj_size argument is only supported for LSTM, not RNN or GRU")
        self.nonlinearity = kwargs.pop('nonlinearity', 'tanh')
        if self.nonlinearity == 'tanh':
            mode = 'RNN_TANH'
        elif self.nonlinearity == 'relu':
            mode = 'RNN_RELU'
        else:
            raise ValueError("Unknown nonlinearity '{}'".format(self.nonlinearity))
        super(RNN, self).__init__(mode, *args, **kwargs)

    @overload
    @torch._jit_internal._overload_method  # noqa: F811
    def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
        pass

    @overload
    @torch._jit_internal._overload_method  # noqa: F811
    def forward(self, input: PackedSequence, hx: Optional[Tensor] = None) -> Tuple[PackedSequence, Tensor]:
        pass

    def forward(self, input, hx=None):  # noqa: F811
        orig_input = input

        if isinstance(orig_input, PackedSequence):
            input, batch_sizes, sorted_indices, unsorted_indices = input
            max_batch_size = int(batch_sizes[0])
        else:
            batch_sizes = None
            is_batched = input.dim() == 3
            batch_dim = 0 if self.batch_first else 1
            if not is_batched:
                input = input.unsqueeze(batch_dim)
                if hx is not None:
                    if hx.dim() != 2:
                        raise RuntimeError(
                            f"For unbatched 2-D input, hx should also be 2-D but got {hx.dim()}-D tensor")
                    hx = hx.unsqueeze(1)
            else:
                if hx is not None and hx.dim() != 3:
                    raise RuntimeError(
                        f"For batched 3-D input, hx should also be 3-D but got {hx.dim()}-D tensor")
            max_batch_size = input.size(0) if self.batch_first else input.size(1)
            sorted_indices = None
            unsorted_indices = None

        if hx is None:
            num_directions = 2 if self.bidirectional else 1
            hx = torch.zeros(self.num_layers * num_directions,
                             max_batch_size, self.hidden_size,
                             dtype=input.dtype, device=input.device)
        else:
            # Each batch of the hidden state should match the input sequence
            # the user believes is being passed in.
            hx = self.permute_hidden(hx, sorted_indices)

        assert hx is not None
        self.check_forward_args(input, hx, batch_sizes)
        assert self.mode == 'RNN_TANH' or self.mode == 'RNN_RELU'
        if batch_sizes is None:
            if self.mode == 'RNN_TANH':
                result = _VF.rnn_tanh(input, hx, self._flat_weights, self.bias, self.num_layers,
                                      self.dropout, self.training, self.bidirectional,
                                      self.batch_first)
            else:
                result = _VF.rnn_relu(input, hx, self._flat_weights, self.bias, self.num_layers,
                                      self.dropout, self.training, self.bidirectional,
                                      self.batch_first)
        else:
            if self.mode == 'RNN_TANH':
                result = _VF.rnn_tanh(input, batch_sizes, hx, self._flat_weights, self.bias,
                                      self.num_layers, self.dropout, self.training,
                                      self.bidirectional)
            else:
                result = _VF.rnn_relu(input, batch_sizes, hx, self._flat_weights, self.bias,
                                      self.num_layers, self.dropout, self.training,
                                      self.bidirectional)

        output = result[0]
        hidden = result[1]

        if isinstance(orig_input, PackedSequence):
            output_packed = PackedSequence(output, batch_sizes, sorted_indices, unsorted_indices)
            return output_packed, self.permute_hidden(hidden, unsorted_indices)

        if not is_batched:
            output = output.squeeze(batch_dim)
            hidden = hidden.squeeze(1)
        return output, self.permute_hidden(hidden, unsorted_indices)


class LSTM(RNNBase):
    r"""Applies a multi-layer long short-term memory (LSTM) RNN to an input
    sequence.


    For each element in the input sequence, each layer computes the following
    function:

    .. math::
        \begin{array}{ll} \\
            i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{t-1} + b_{hi}) \\
            f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{t-1} + b_{hf}) \\
            g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{t-1} + b_{hg}) \\
            o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{t-1} + b_{ho}) \\
            c_t = f_t \odot c_{t-1} + i_t \odot g_t \\
            h_t = o_t \odot \tanh(c_t) \\
        \end{array}

    where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell
    state at time `t`, :math:`x_t` is the input at time `t`, :math:`h_{t-1}`
    is the hidden state of the layer at time `t-1` or the initial hidden
    state at time `0`, and :math:`i_t`, :math:`f_t`, :math:`g_t`,
    :math:`o_t` are the input, forget, cell, and output gates, respectively.
    :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product.

    In a multilayer LSTM, the input :math:`x^{(l)}_t` of the :math:`l` -th layer
    (:math:`l >= 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by
    dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random
    variable which is :math:`0` with probability :attr:`dropout`.

    If ``proj_size > 0`` is specified, LSTM with projections will be used. This changes
    the LSTM cell in the following way. First, the dimension of :math:`h_t` will be changed from
    ``hidden_size`` to ``proj_size`` (dimensions of :math:`W_{hi}` will be changed accordingly).
    Second, the output hidden state of each layer will be multiplied by a learnable projection
    matrix: :math:`h_t = W_{hr}h_t`. Note that as a consequence of this, the output
    of LSTM network will be of different shape as well. See Inputs/Outputs sections below for exact
    dimensions of all variables. You can find more details in https://arxiv.org/abs/1402.1128.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
            would mean stacking two LSTMs together to form a `stacked LSTM`,
            with the second LSTM taking in outputs of the first LSTM and
            computing the final results. Default: 1
        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
            Default: ``True``
        batch_first: If ``True``, then the input and output tensors are provided
            as `(batch, seq, feature)` instead of `(seq, batch, feature)`.
            Note that this does not apply to hidden or cell states. See the
            Inputs/Outputs sections below for details.  Default: ``False``
        dropout: If non-zero, introduces a `Dropout` layer on the outputs of each
            LSTM layer except the last layer, with dropout probability equal to
            :attr:`dropout`. Default: 0
        bidirectional: If ``True``, becomes a bidirectional LSTM. Default: ``False``
        proj_size: If ``> 0``, will use LSTM with projections of corresponding size. Default: 0

    Inputs: input, (h_0, c_0)
        * **input**: tensor of shape :math:`(L, H_{in})` for unbatched input,
          :math:`(L, N, H_{in})` when ``batch_first=False`` or
          :math:`(N, L, H_{in})` when ``batch_first=True`` containing the features of
          the input sequence.  The input can also be a packed variable length sequence.
          See :func:`torch.nn.utils.rnn.pack_padded_sequence` or
          :func:`torch.nn.utils.rnn.pack_sequence` for details.
        * **h_0**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or
          :math:`(D * \text{num\_layers}, N, H_{out})` containing the
          initial hidden state for each element in the input sequence.
          Defaults to zeros if (h_0, c_0) is not provided.
        * **c_0**: tensor of shape :math:`(D * \text{num\_layers}, H_{cell})` for unbatched input or
          :math:`(D * \text{num\_layers}, N, H_{cell})` containing the
          initial cell state for each element in the input sequence.
          Defaults to zeros if (h_0, c_0) is not provided.

        where:

        .. math::
            \begin{aligned}
                N ={} & \text{batch size} \\
                L ={} & \text{sequence length} \\
                D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\
                H_{in} ={} & \text{input\_size} \\
                H_{cell} ={} & \text{hidden\_size} \\
                H_{out} ={} & \text{proj\_size if } \text{proj\_size}>0 \text{ otherwise hidden\_size} \\
            \end{aligned}

    Outputs: output, (h_n, c_n)
        * **output**: tensor of shape :math:`(L, D * H_{out})` for unbatched input,
          :math:`(L, N, D * H_{out})` when ``batch_first=False`` or
          :math:`(N, L, D * H_{out})` when ``batch_first=True`` containing the output features
          `(h_t)` from the last layer of the LSTM, for each `t`. If a
          :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output
          will also be a packed sequence. When ``bidirectional=True``, `output` will contain
          a concatenation of the forward and reverse hidden states at each time step in the sequence.
        * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` for unbatched input or
          :math:`(D * \text{num\_layers}, N, H_{out})` containing the
          final hidden state for each element in the sequence. When ``bidirectional=True``,
          `h_n` will contain a concatenation of the final forward and reverse hidden states, respectively.
        * **c_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{cell})` for unbatched input or
          :math:`(D * \text{num\_layers}, N, H_{cell})` containing the
          final cell state for each element in the sequence. When ``bidirectional=True``,
          `c_n` will contain a concatenation of the final forward and reverse cell states, respectively.

    Attributes:
        weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer
            `(W_ii|W_if|W_ig|W_io)`, of shape `(4*hidden_size, input_size)` for `k = 0`.
            Otherwise, the shape is `(4*hidden_size, num_directions * hidden_size)`. If
            ``proj_size > 0`` was specified, the shape will be
            `(4*hidden_size, num_directions * proj_size)` for `k > 0`
        weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer
            `(W_hi|W_hf|W_hg|W_ho)`, of shape `(4*hidden_size, hidden_size)`. If ``proj_size > 0``
            was specified, the shape will be `(4*hidden_size, proj_size)`.
        bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer
            `(b_ii|b_if|b_ig|b_io)`, of shape `(4*hidden_size)`
        bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer
            `(b_hi|b_hf|b_hg|b_ho)`, of shape `(4*hidden_size)`
        weight_hr_l[k] : the learnable projection weights of the :math:`\text{k}^{th}` layer
            of shape `(proj_size, hidden_size)`. Only present when ``proj_size > 0`` was
            specified.
        weight_ih_l[k]_reverse: Analogous to `weight_ih_l[k]` for the reverse direction.
            Only present when ``bidirectional=True``.
        weight_hh_l[k]_reverse:  Analogous to `weight_hh_l[k]` for the reverse direction.
            Only present when ``bidirectional=True``.
        bias_ih_l[k]_reverse:  Analogous to `bias_ih_l[k]` for the reverse direction.
            Only present when ``bidirectional=True``.
        bias_hh_l[k]_reverse:  Analogous to `bias_hh_l[k]` for the reverse direction.
            Only present when ``bidirectional=True``.
        weight_hr_l[k]_reverse:  Analogous to `weight_hr_l[k]` for the reverse direction.
            Only present when ``bidirectional=True`` and ``proj_size > 0`` was specified.

    .. note::
        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`

    .. note::
        For bidirectional LSTMs, forward and backward are directions 0 and 1 respectively.
        Example of splitting the output layers when ``batch_first=False``:
        ``output.view(seq_len, batch, num_directions, hidden_size)``.

    .. note::
        For bidirectional LSTMs, `h_n` is not equivalent to the last element of `output`; the
        former contains the final forward and reverse hidden states, while the latter contains the
        final forward hidden state and the initial reverse hidden state.

    .. note::
        ``batch_first`` argument is ignored for unbatched inputs.

    .. include:: ../cudnn_rnn_determinism.rst

    .. include:: ../cudnn_persistent_rnn.rst

    Examples::

        >>> rnn = nn.LSTM(10, 20, 2)
        >>> input = torch.randn(5, 3, 10)
        >>> h0 = torch.randn(2, 3, 20)
        >>> c0 = torch.randn(2, 3, 20)
        >>> output, (hn, cn) = rnn(input, (h0, c0))
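
    With projections enabled (``proj_size > 0``), the hidden state (but not the
    cell state) has ``proj_size`` features (a minimal sketch; sizes are only
    illustrative)::

        >>> rnn_proj = nn.LSTM(10, 20, 2, proj_size=15)
        >>> h0 = torch.randn(2, 3, 15)
        >>> c0 = torch.randn(2, 3, 20)
        >>> output, (hn, cn) = rnn_proj(input, (h0, c0))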
    """

    def __init__(self, *args, **kwargs):
        super(LSTM, self).__init__('LSTM', *args, **kwargs)

    def get_expected_cell_size(self, input: Tensor,
                               batch_sizes: Optional[Tensor]) -> Tuple[int, int, int]:
        if batch_sizes is not None:
            mini_batch = int(batch_sizes[0])
        else:
            mini_batch = input.size(0) if self.batch_first else input.size(1)
        num_directions = 2 if self.bidirectional else 1
        return (self.num_layers * num_directions, mini_batch, self.hidden_size)

    def check_forward_args(self, input: Tensor, hidden: Tuple[Tensor, Tensor],
                           batch_sizes: Optional[Tensor]):
        self.check_input(input, batch_sizes)
        self.check_hidden_size(hidden[0], self.get_expected_hidden_size(input, batch_sizes),
                               'Expected hidden[0] size {}, got {}')
        self.check_hidden_size(hidden[1], self.get_expected_cell_size(input, batch_sizes),
                               'Expected hidden[1] size {}, got {}')

    def permute_hidden(self, hx: Tuple[Tensor, Tensor],
                       permutation: Optional[Tensor]) -> Tuple[Tensor, Tensor]:
        if permutation is None:
            return hx
        return apply_permutation(hx[0], permutation), apply_permutation(hx[1], permutation)

    @overload
    @torch._jit_internal._overload_method  # noqa: F811
    def forward(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None
                ) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
        pass

    @overload
    @torch._jit_internal._overload_method  # noqa: F811
    def forward(self, input: PackedSequence, hx: Optional[Tuple[Tensor, Tensor]] = None
                ) -> Tuple[PackedSequence, Tuple[Tensor, Tensor]]:
        pass

    def forward(self, input, hx=None):  # noqa: F811
        orig_input = input
        if isinstance(orig_input, PackedSequence):
            input, batch_sizes, sorted_indices, unsorted_indices = input
            max_batch_size = int(batch_sizes[0])
        else:
            batch_sizes = None
            is_batched = input.dim() == 3
            batch_dim = 0 if self.batch_first else 1
            if not is_batched:
                input = input.unsqueeze(batch_dim)
            max_batch_size = input.size(0) if self.batch_first else input.size(1)
            sorted_indices = None
            unsorted_indices = None

        if hx is None:
            num_directions = 2 if self.bidirectional else 1
            real_hidden_size = self.proj_size if self.proj_size > 0 else self.hidden_size
            h_zeros = torch.zeros(self.num_layers * num_directions,
                                  max_batch_size, real_hidden_size,
                                  dtype=input.dtype, device=input.device)
            c_zeros = torch.zeros(self.num_layers * num_directions,
                                  max_batch_size, self.hidden_size,
                                  dtype=input.dtype, device=input.device)
            hx = (h_zeros, c_zeros)
        else:
            if batch_sizes is None:  # unpacked input
                if is_batched:
                    if hx[0].dim() != 3 or hx[1].dim() != 3:
                        raise RuntimeError(
                            "For batched 3-D input, hx and cx should also be 3-D but got "
                            f"({hx[0].dim()}-D, {hx[1].dim()}-D) tensors")
                else:
                    if hx[0].dim() != 2 or hx[1].dim() != 2:
                        raise RuntimeError(
                            "For unbatched 2-D input, hx and cx should also be 2-D but got "
                            f"({hx[0].dim()}-D, {hx[1].dim()}-D) tensors")
                    hx = (hx[0].unsqueeze(1), hx[1].unsqueeze(1))
            # Each batch of the hidden state should match the input sequence
            # the user believes is being passed in.
            hx = self.permute_hidden(hx, sorted_indices)

        self.check_forward_args(input, hx, batch_sizes)
        if batch_sizes is None:
            result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
                              self.dropout, self.training, self.bidirectional, self.batch_first)
        else:
            result = _VF.lstm(input, batch_sizes, hx, self._flat_weights, self.bias,
                              self.num_layers, self.dropout, self.training, self.bidirectional)
        output = result[0]
        hidden = result[1:]

        if isinstance(orig_input, PackedSequence):
            output_packed = PackedSequence(output, batch_sizes, sorted_indices, unsorted_indices)
            return output_packed, self.permute_hidden(hidden, unsorted_indices)
        else:
            if not is_batched:
                output = output.squeeze(batch_dim)
                hidden = (hidden[0].squeeze(1), hidden[1].squeeze(1))
            return output, self.permute_hidden(hidden, unsorted_indices)


class GRU(RNNBase):
    r"""Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence.


    For each element in the input sequence, each layer computes the following
    function:

    .. math::
        \begin{array}{ll}
            r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
            z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
            n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\
            h_t = (1 - z_t) * n_t + z_t * h_{(t-1)}
        \end{array}

    where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the input
    at time `t`, :math:`h_{(t-1)}` is the hidden state of the layer
    at time `t-1` or the initial hidden state at time `0`, and :math:`r_t`,
    :math:`z_t`, :math:`n_t` are the reset, update, and new gates, respectively.
    :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product.

    In a multilayer GRU, the input :math:`x^{(l)}_t` of the :math:`l` -th layer
    (:math:`l >= 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by
    dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random
    variable which is :math:`0` with probability :attr:`dropout`.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
            would mean stacking two GRUs together to form a `stacked GRU`,
            with the second GRU taking in outputs of the first GRU and
            computing the final results. Default: 1
        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
            Default: ``True``
        batch_first: If ``True``, then the input and output tensors are provided
            as `(batch, seq, feature)` instead of `(seq, batch, feature)`.
            Note that this does not apply to hidden or cell states. See the
            Inputs/Outputs sections below for details.  Default: ``False``
        dropout: If non-zero, introduces a `Dropout` layer on the outputs of each
            GRU layer except the last layer, with dropout probability equal to
            :attr:`dropout`. Default: 0
        bidirectional: If ``True``, becomes a bidirectional GRU. Default: ``False``

    Inputs: input, h_0
        * **input**: tensor of shape :math:`(L, H_{in})` for unbatched input,
          :math:`(L, N, H_{in})` when ``batch_first=False`` or
          :math:`(N, L, H_{in})` when ``batch_first=True`` containing the features of
          the input sequence.  The input can also be a packed variable length sequence.
          See :func:`torch.nn.utils.rnn.pack_padded_sequence` or
          :func:`torch.nn.utils.rnn.pack_sequence` for details.
        * **h_0**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` or
          :math:`(D * \text{num\_layers}, N, H_{out})`
          containing the initial hidden state for the input sequence. Defaults to zeros if not provided.

        where:

        .. math::
            \begin{aligned}
                N ={} & \text{batch size} \\
                L ={} & \text{sequence length} \\
                D ={} & 2 \text{ if bidirectional=True otherwise } 1 \\
                H_{in} ={} & \text{input\_size} \\
                H_{out} ={} & \text{hidden\_size}
            \end{aligned}

    Outputs: output, h_n
        * **output**: tensor of shape :math:`(L, D * H_{out})` for unbatched input,
          :math:`(L, N, D * H_{out})` when ``batch_first=False`` or
          :math:`(N, L, D * H_{out})` when ``batch_first=True`` containing the output features
          `(h_t)` from the last layer of the GRU, for each `t`. If a
          :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output
          will also be a packed sequence.
        * **h_n**: tensor of shape :math:`(D * \text{num\_layers}, H_{out})` or
          :math:`(D * \text{num\_layers}, N, H_{out})` containing the final hidden state
          for the input sequence.

    Attributes:
        weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer
            (W_ir|W_iz|W_in), of shape `(3*hidden_size, input_size)` for `k = 0`.
            Otherwise, the shape is `(3*hidden_size, num_directions * hidden_size)`
        weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer
            (W_hr|W_hz|W_hn), of shape `(3*hidden_size, hidden_size)`
        bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer
            (b_ir|b_iz|b_in), of shape `(3*hidden_size)`
        bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer
            (b_hr|b_hz|b_hn), of shape `(3*hidden_size)`

    .. note::
        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`

    .. note::
        For bidirectional GRUs, forward and backward are directions 0 and 1 respectively.
        Example of splitting the output layers when ``batch_first=False``:
        ``output.view(seq_len, batch, num_directions, hidden_size)``.

    .. note::
        ``batch_first`` argument is ignored for unbatched inputs.

    .. include:: ../cudnn_persistent_rnn.rst

    Examples::

        >>> rnn = nn.GRU(10, 20, 2)
        >>> input = torch.randn(5, 3, 10)
        >>> h0 = torch.randn(2, 3, 20)
        >>> output, hn = rnn(input, h0)
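
    With ``batch_first=True`` the input and output use the `(batch, seq, feature)`
    layout while the hidden state keeps its shape (a minimal sketch)::

        >>> rnn_bf = nn.GRU(10, 20, 2, batch_first=True)
        >>> input_bf = torch.randn(3, 5, 10)
        >>> output_bf, hn_bf = rnn_bf(input_bf, h0)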
    """

    def __init__(self, *args, **kwargs):
        if 'proj_size' in kwargs:
            raise ValueError("proj_size argument is only supported for LSTM, not RNN or GRU")
        super(GRU, self).__init__('GRU', *args, **kwargs)

    @overload
    @torch._jit_internal._overload_method  # noqa: F811
    def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]:
        pass

    @overload
    @torch._jit_internal._overload_method  # noqa: F811
    def forward(self, input: PackedSequence, hx: Optional[Tensor] = None) -> Tuple[PackedSequence, Tensor]:
        pass

    def forward(self, input, hx=None):  # noqa: F811
        orig_input = input
        if isinstance(orig_input, PackedSequence):
            input, batch_sizes, sorted_indices, unsorted_indices = input
            max_batch_size = int(batch_sizes[0])
        else:
            batch_sizes = None
            is_batched = input.dim() == 3
            batch_dim = 0 if self.batch_first else 1
            if not is_batched:
                input = input.unsqueeze(batch_dim)
                if hx is not None:
                    if hx.dim() != 2:
                        raise RuntimeError(
                            f"For unbatched 2-D input, hx should also be 2-D but got {hx.dim()}-D tensor")
                    hx = hx.unsqueeze(1)
            else:
                if hx is not None and hx.dim() != 3:
                    raise RuntimeError(
                        f"For batched 3-D input, hx should also be 3-D but got {hx.dim()}-D tensor")
            max_batch_size = input.size(0) if self.batch_first else input.size(1)
            sorted_indices = None
            unsorted_indices = None

        if hx is None:
            num_directions = 2 if self.bidirectional else 1
            hx = torch.zeros(self.num_layers * num_directions,
                             max_batch_size, self.hidden_size,
                             dtype=input.dtype, device=input.device)
        else:
            # Each batch of the hidden state should match the input sequence
            # the user believes is being passed in.
            hx = self.permute_hidden(hx, sorted_indices)

        self.check_forward_args(input, hx, batch_sizes)
        if batch_sizes is None:
            result = _VF.gru(input, hx, self._flat_weights, self.bias, self.num_layers,
                             self.dropout, self.training, self.bidirectional, self.batch_first)
        else:
            result = _VF.gru(input, batch_sizes, hx, self._flat_weights, self.bias,
                             self.num_layers, self.dropout, self.training, self.bidirectional)
        output = result[0]
        hidden = result[1]

        if isinstance(orig_input, PackedSequence):
            output_packed = PackedSequence(output, batch_sizes, sorted_indices, unsorted_indices)
            return output_packed, self.permute_hidden(hidden, unsorted_indices)
        else:
            if not is_batched:
                output = output.squeeze(batch_dim)
                hidden = hidden.squeeze(1)
            return output, self.permute_hidden(hidden, unsorted_indices)


class RNNCellBase(Module):
    __constants__ = ['input_size', 'hidden_size', 'bias']

    input_size: int
    hidden_size: int
    bias: bool
    weight_ih: Tensor
    weight_hh: Tensor

    def __init__(self, input_size: int, hidden_size: int, bias: bool, num_chunks: int,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super(RNNCellBase, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        self.weight_ih = Parameter(torch.empty((num_chunks * hidden_size, input_size), **factory_kwargs))
        self.weight_hh = Parameter(torch.empty((num_chunks * hidden_size, hidden_size), **factory_kwargs))
        if bias:
            self.bias_ih = Parameter(torch.empty(num_chunks * hidden_size, **factory_kwargs))
            self.bias_hh = Parameter(torch.empty(num_chunks * hidden_size, **factory_kwargs))
        else:
            self.register_parameter('bias_ih', None)
            self.register_parameter('bias_hh', None)

        self.reset_parameters()

    def extra_repr(self) -> str:
        s = '{input_size}, {hidden_size}'
        if 'bias' in self.__dict__ and self.bias is not True:
            s += ', bias={bias}'
        if 'nonlinearity' in self.__dict__ and self.nonlinearity != "tanh":
            s += ', nonlinearity={nonlinearity}'
        return s.format(**self.__dict__)

    def reset_parameters(self) -> None:
        stdv = 1.0 / math.sqrt(self.hidden_size) if self.hidden_size > 0 else 0
        for weight in self.parameters():
            init.uniform_(weight, -stdv, stdv)


class RNNCell(RNNCellBase):
    r"""An Elman RNN cell with tanh or ReLU non-linearity.

    .. math::

        h' = \tanh(W_{ih} x + b_{ih}  +  W_{hh} h + b_{hh})

    If :attr:`nonlinearity` is `'relu'`, then ReLU is used in place of tanh.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
            Default: ``True``
        nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'``

    Inputs: input, hidden
        - **input**: tensor containing input features
        - **hidden**: tensor containing the initial hidden state
          Defaults to zero if not provided.

    Outputs: h'
        - **h'** of shape `(batch, hidden_size)`: tensor containing the next hidden state
          for each element in the batch

    Shape:
        - input: :math:`(N, H_{in})` or :math:`(H_{in})` tensor containing input features where
          :math:`H_{in}` = `input_size`.
        - hidden: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the initial hidden
          state where :math:`H_{out}` = `hidden_size`. Defaults to zero if not provided.
        - output: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the next hidden state.

    Attributes:
        weight_ih: the learnable input-hidden weights, of shape
            `(hidden_size, input_size)`
        weight_hh: the learnable hidden-hidden weights, of shape
            `(hidden_size, hidden_size)`
        bias_ih: the learnable input-hidden bias, of shape `(hidden_size)`
        bias_hh: the learnable hidden-hidden bias, of shape `(hidden_size)`

    .. note::
        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`

    Examples::

        >>> rnn = nn.RNNCell(10, 20)
        >>> input = torch.randn(6, 3, 10)
        >>> hx = torch.randn(3, 20)
        >>> output = []
        >>> for i in range(6):
                hx = rnn(input[i], hx)
                output.append(hx)
    """

    __constants__ = ['input_size', 'hidden_size', 'bias', 'nonlinearity']
    nonlinearity: str

    def __init__(self, input_size: int, hidden_size: int, bias: bool = True,
                 nonlinearity: str = "tanh", device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super(RNNCell, self).__init__(input_size, hidden_size, bias, num_chunks=1, **factory_kwargs)
        self.nonlinearity = nonlinearity

    def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor:
        assert input.dim() in (1, 2), \
            f"RNNCell: Expected input to be 1-D or 2-D but received {input.dim()}-D tensor"
        is_batched = input.dim() == 2
        if not is_batched:
            input = input.unsqueeze(0)

        if hx is None:
            hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)
        else:
            hx = hx.unsqueeze(0) if not is_batched else hx

        if self.nonlinearity == "tanh":
            ret = _VF.rnn_tanh_cell(
                input, hx,
                self.weight_ih, self.weight_hh,
                self.bias_ih, self.bias_hh,
            )
        elif self.nonlinearity == "relu":
            ret = _VF.rnn_relu_cell(
                input, hx,
                self.weight_ih, self.weight_hh,
                self.bias_ih, self.bias_hh,
            )
        else:
            ret = input  # placeholder so ret is defined before raising
            raise RuntimeError("Unknown nonlinearity: {}".format(self.nonlinearity))

        if not is_batched:
            ret = ret.squeeze(0)

        return ret


class LSTMCell(RNNCellBase):
    r"""A long short-term memory (LSTM) cell.

    .. math::

        \begin{array}{ll}
        i = \sigma(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\
        f = \sigma(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\
        g = \tanh(W_{ig} x + b_{ig} + W_{hg} h + b_{hg}) \\
        o = \sigma(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\
        c' = f * c + i * g \\
        h' = o * \tanh(c') \\
        \end{array}

    where :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        bias: If ``False``, then the layer does not use bias weights `b_ih` and
            `b_hh`. Default: ``True``

    Inputs: input, (h_0, c_0)
        - **input** of shape `(batch, input_size)` or `(input_size)`: tensor containing input features
        - **h_0** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the initial hidden state
        - **c_0** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the initial cell state

          If `(h_0, c_0)` is not provided, both **h_0** and **c_0** default to zero.

    Outputs: (h_1, c_1)
        - **h_1** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the next hidden state
        - **c_1** of shape `(batch, hidden_size)` or `(hidden_size)`: tensor containing the next cell state

    Attributes:
        weight_ih: the learnable input-hidden weights, of shape
            `(4*hidden_size, input_size)`
        weight_hh: the learnable hidden-hidden weights, of shape
            `(4*hidden_size, hidden_size)`
        bias_ih: the learnable input-hidden bias, of shape `(4*hidden_size)`
        bias_hh: the learnable hidden-hidden bias, of shape `(4*hidden_size)`

    .. note::
        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`

    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

    Examples::

        >>> rnn = nn.LSTMCell(10, 20) # (input_size, hidden_size)
        >>> input = torch.randn(2, 3, 10) # (time_steps, batch, input_size)
        >>> hx = torch.randn(3, 20) # (batch, hidden_size)
        >>> cx = torch.randn(3, 20)
        >>> output = []
        >>> for i in range(input.size()[0]):
                hx, cx = rnn(input[i], (hx, cx))
                output.append(hx)
        >>> output = torch.stack(output, dim=0)
    """

    def __init__(self, input_size: int, hidden_size: int, bias: bool = True,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super(LSTMCell, self).__init__(input_size, hidden_size, bias, num_chunks=4, **factory_kwargs)

    def forward(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tensor]:
        assert input.dim() in (1, 2), \
            f"LSTMCell: Expected input to be 1-D or 2-D but received {input.dim()}-D tensor"
        is_batched = input.dim() == 2
        if not is_batched:
            input = input.unsqueeze(0)

        if hx is None:
            zeros = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)
            hx = (zeros, zeros)
        else:
            hx = (hx[0].unsqueeze(0), hx[1].unsqueeze(0)) if not is_batched else hx

        ret = _VF.lstm_cell(
            input, hx,
            self.weight_ih, self.weight_hh,
            self.bias_ih, self.bias_hh,
        )

        if not is_batched:
            ret = (ret[0].squeeze(0), ret[1].squeeze(0))
        return ret


class GRUCell(RNNCellBase):
    r"""A gated recurrent unit (GRU) cell

    .. math::

        \begin{array}{ll}
        r = \sigma(W_{ir} x + b_{ir} + W_{hr} h + b_{hr}) \\
        z = \sigma(W_{iz} x + b_{iz} + W_{hz} h + b_{hz}) \\
        n = \tanh(W_{in} x + b_{in} + r * (W_{hn} h + b_{hn})) \\
        h' = (1 - z) * n + z * h
        \end{array}

    where :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        bias: If ``False``, then the layer does not use bias weights `b_ih` and
            `b_hh`. Default: ``True``

    Inputs: input, hidden
        - **input** : tensor containing input features
        - **hidden** : tensor containing the initial hidden
          state for each element in the batch.
          Defaults to zero if not provided.

    Outputs: h'
        - **h'** : tensor containing the next hidden state
          for each element in the batch

    Shape:
        - input: :math:`(N, H_{in})` or :math:`(H_{in})` tensor containing input features where
          :math:`H_{in}` = `input_size`.
        - hidden: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the initial hidden
          state where :math:`H_{out}` = `hidden_size`. Defaults to zero if not provided.
        - output: :math:`(N, H_{out})` or :math:`(H_{out})` tensor containing the next hidden state.

    Attributes:
        weight_ih: the learnable input-hidden weights, of shape
            `(3*hidden_size, input_size)`
        weight_hh: the learnable hidden-hidden weights, of shape
            `(3*hidden_size, hidden_size)`
        bias_ih: the learnable input-hidden bias, of shape `(3*hidden_size)`
        bias_hh: the learnable hidden-hidden bias, of shape `(3*hidden_size)`

    .. note::
        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`

    On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.

    Examples::

        >>> rnn = nn.GRUCell(10, 20)
        >>> input = torch.randn(6, 3, 10)
        >>> hx = torch.randn(3, 20)
        >>> output = []
        >>> for i in range(6):
                hx = rnn(input[i], hx)
                output.append(hx)
    """

    def __init__(self, input_size: int, hidden_size: int, bias: bool = True,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super(GRUCell, self).__init__(input_size, hidden_size, bias, num_chunks=3, **factory_kwargs)

    def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor:
        assert input.dim() in (1, 2), \
            f"GRUCell: Expected input to be 1-D or 2-D but received {input.dim()}-D tensor"
        is_batched = input.dim() == 2
        if not is_batched:
            input = input.unsqueeze(0)

        if hx is None:
            hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)
        else:
            hx = hx.unsqueeze(0) if not is_batched else hx

        ret = _VF.gru_cell(
            input, hx,
            self.weight_ih, self.weight_hh,
            self.bias_ih, self.bias_hh,
        )

        if not is_batched:
            ret = ret.squeeze(0)

        return ret