U
    d.                     @   s   d dl Z d dl mZ ddlmZ d dlmZmZ G dd deZdee ee ee ee ee ee	e	e	e	e	edd	d
Z
ee ee ee ee ee e	e	e	e	e	edddZee ee ee ee ee e	e	e	e	e	edddZdS )    N)Tensor   )	Optimizer)ListOptionalc                       sJ   e Zd ZdZdee d fd	d
Z fddZe	 dddZ
  ZS )RMSpropa  Implements RMSprop algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \alpha \text{ (alpha)},\: \gamma \text{ (lr)},
                \: \theta_0 \text{ (params)}, \: f(\theta) \text{ (objective)}                   \\
            &\hspace{13mm}   \lambda \text{ (weight decay)},\: \mu \text{ (momentum)},\: centered\\
            &\textbf{initialize} : v_0 \leftarrow 0 \text{ (square average)}, \:
                \textbf{b}_0 \leftarrow 0 \text{ (buffer)}, \: g^{ave}_0 \leftarrow 0     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})           \\
            &\hspace{5mm}if \: \lambda \neq 0                                                    \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda  \theta_{t-1}                            \\
            &\hspace{5mm}v_t           \leftarrow   \alpha v_{t-1} + (1 - \alpha) g^2_t
                \hspace{8mm}                                                                     \\
            &\hspace{5mm} \tilde{v_t} \leftarrow v_t                                             \\
            &\hspace{5mm}if \: centered                                                          \\
            &\hspace{10mm} g^{ave}_t \leftarrow g^{ave}_{t-1} \alpha + (1-\alpha) g_t            \\
            &\hspace{10mm} \tilde{v_t} \leftarrow \tilde{v_t} -  \big(g^{ave}_{t} \big)^2        \\
            &\hspace{5mm}if \: \mu > 0                                                           \\
            &\hspace{10mm} \textbf{b}_t\leftarrow \mu \textbf{b}_{t-1} +
                g_t/ \big(\sqrt{\tilde{v_t}} +  \epsilon \big)                                   \\
            &\hspace{10mm} \theta_t \leftarrow \theta_{t-1} - \gamma \textbf{b}_t                \\
            &\hspace{5mm} else                                                                   \\
            &\hspace{10mm}\theta_t      \leftarrow   \theta_{t-1} -
                \gamma  g_t/ \big(\sqrt{\tilde{v_t}} + \epsilon \big)  \hspace{3mm}              \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to
    `lecture notes <https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_ by G. Hinton.
    and centered version `Generating Sequences
    With Recurrent Neural Networks <https://arxiv.org/pdf/1308.0850v5.pdf>`_.
    The implementation here takes the square root of the gradient average before
    adding epsilon (note that TensorFlow interchanges these two operations). The effective
    learning rate is thus :math:`\gamma/(\sqrt{v} + \epsilon)` where :math:`\gamma`
    is the scheduled learning rate and :math:`v` is the weighted moving average
    of the squared gradient.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-2)
        momentum (float, optional): momentum factor (default: 0)
        alpha (float, optional): smoothing constant (default: 0.99)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        centered (bool, optional) : if ``True``, compute the centered RMSProp,
            the gradient is normalized by an estimation of its variance
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        foreach (bool, optional): whether foreach implementation of optimizer
            is used (default: None)

    {Gz?Gz?:0yE>r   FN)foreachc	           
   	      s   d|kst d|d|ks,t d|d|ksBt d|d|ksXt d|d|ksnt d|t|||||||d}	tt| ||	 d S )Ng        zInvalid learning rate: {}zInvalid epsilon value: {}zInvalid momentum value: {}zInvalid weight_decay value: {}zInvalid alpha value: {})lrmomentumalphaepscenteredweight_decayr   )
ValueErrorformatdictsuperr   __init__)
selfparamsr   r   r   r   r   r   r   defaults	__class__ 7/tmp/pip-unpacked-wheel-ua33x9lu/torch/optim/rmsprop.pyr   C   s     zRMSprop.__init__c                    s@   t  | | jD ](}|dd |dd |dd  qd S )Nr   r   r   Fr   )r   __setstate__param_groups
setdefault)r   stategroupr   r   r   r   T   s
    
zRMSprop.__setstate__c                 C   s  d}|dk	r&t   | }W 5 Q R X | jD ]T}g }g }g }g }g }|d D ]}	|	jdkr^qN||	 |	jjrxtd||	j | j|	 }
t|
dkrd|
d< t j	|	t j
d|
d< |d dkrt j	|	t j
d|
d	< |d
 rt j	|	t j
d|
d< ||
d  |d dkr||
d	  |d
 r4||
d  |
d  d7  < qNt||||||d |d |d |d |d |d
 |d d q,|S )zPerforms a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr   z)RMSprop does not support sparse gradientsr   step)Zmemory_format
square_avgr   Zmomentum_bufferr   grad_avgr   r   r   r   r   r   )r   r   r   r   r   r   r   )torchZenable_gradr   gradappendZ	is_sparseRuntimeErrorr!   lenZ
zeros_likeZpreserve_formatrmsprop)r   closureZlossr"   Zparams_with_gradgradssquare_avgs	grad_avgsmomentum_buffer_listpr!   r   r   r   r#   [   sZ    




zRMSprop.step)r   r	   r
   r   r   FN)N)__name__
__module____qualname____doc__r   boolr   r   r&   Zno_gradr#   __classcell__r   r   r   r   r      s   ;    r   )r   r-   r.   r/   r0   r   r   r   r   r   r   r   c                C   s\   |dkrd}|r"t j r"td|r6t j s6t}nt}|| ||||||||	|
|d dS )zsFunctional API that performs rmsprop algorithm computation.
    See :class:`~torch.optim.RMSProp` for details.
    NFz6torch.jit.script not supported with foreach optimizers)r   r   r   r   r   r   )r&   ZjitZis_scriptingr)   _multi_tensor_rmsprop_single_tensor_rmsprop)r   r-   r.   r/   r0   r   r   r   r   r   r   r   funcr   r   r   r+      s&    r+   )r   r-   r.   r/   r0   r   r   r   r   r   r   c                C   s   t | D ]\}}|| }|| }|dkr6|j||d}||j||d| d |
r|| }||j|d| d |j||dd |}n| |}|	dkr|| }||	|| |j|| d q|j||| d qd S Nr   )r   r   )value)		enumerateaddZmul_Zaddcmul_Zadd_ZaddcmulZsqrt_sqrtZaddcdiv_)r   r-   r.   r/   r0   r   r   r   r   r   r   iparamr'   r$   r%   avgbufr   r   r   r9      s     r9   c                C   s   t | dkrd S |dkr(tj|| |d t|| tj|||d| d |
rt|| tj||d| d tj|||dd}t| t|| nt|}t|| |	dkrt||	 t||| tj| || d ntj| ||| d d S r;   )	r*   r&   Z_foreach_add_Z_foreach_mul_Z_foreach_addcmul_Z_foreach_addcmulZ_foreach_sqrt_Z_foreach_sqrtZ_foreach_addcdiv_)r   r-   r.   r/   r0   r   r   r   r   r   r   rC   r   r   r   r8      s&    

r8   )N)r&   r   Z	optimizerr   typingr   r   r   r6   floatr+   r9   r8   r   r   r   r   <module>   sV     ,%