import math
import torch
from torch import Tensor
from .optimizer import Optimizer
from typing import List, Optional


class Adam(Optimizer):
    r"""Implements Adam algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{ (lr)}, \beta_1, \beta_2
                \text{ (betas)},\theta_0 \text{ (params)},f(\theta) \text{ (objective)}          \\
            &\hspace{13mm}      \lambda \text{ (weight decay)},  \: \textit{amsgrad},
                \:\textit{maximize}                                                              \\
            &\textbf{initialize} :  m_0 \leftarrow 0 \text{ ( first moment)},
                v_0\leftarrow 0 \text{ (second moment)},\: \widehat{v_0}^{max}\leftarrow 0\\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\

            &\hspace{5mm}\textbf{if} \: \textit{maximize}:                                       \\
            &\hspace{10mm}g_t           \leftarrow   -\nabla_{\theta} f_t (\theta_{t-1})         \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})          \\
            &\hspace{5mm}\textbf{if} \: \lambda \neq 0                                           \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda  \theta_{t-1}                            \\
            &\hspace{5mm}m_t           \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t          \\
            &\hspace{5mm}v_t           \leftarrow   \beta_2 v_{t-1} + (1-\beta_2) g^2_t          \\
            &\hspace{5mm}\widehat{m_t} \leftarrow   m_t/\big(1-\beta_1^t \big)                   \\
            &\hspace{5mm}\widehat{v_t} \leftarrow   v_t/\big(1-\beta_2^t \big)                   \\
            &\hspace{5mm}\textbf{if} \: amsgrad                                                  \\
            &\hspace{10mm}\widehat{v_t}^{max} \leftarrow \mathrm{max}(\widehat{v_t}^{max},
                \widehat{v_t})                                                                   \\
            &\hspace{10mm}\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}^{max}} + \epsilon \big)                                 \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}} + \epsilon \big)                                       \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Adam: A Method for Stochastic Optimization`_.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)
        foreach (bool, optional): whether the foreach (multi-tensor)
            implementation of the optimizer is used; if None, the for-loop
            implementation is used (default: None)
        maximize (bool, optional): maximize the params based on the objective, instead of
            minimizing (default: False)
        capturable (bool, optional): whether this instance is safe to capture in a CUDA graph.
            Passing True can impair ungraphed performance, so if you don't intend to
            graph capture this instance, leave it False (default: False)
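
    Example (an illustrative usage sketch; ``model``, ``loss_fn``, ``input`` and
    ``target`` are placeholder names supplied by the caller, not objects defined
    in this module):
        >>> optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()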

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False, *, foreach: Optional[bool] = None,
                 maximize: bool = False, capturable: bool = False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
                        amsgrad=amsgrad, maximize=maximize, foreach=foreach,
                        capturable=capturable)
        super(Adam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super().__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)
            group.setdefault('maximize', False)
            group.setdefault('foreach', None)
            group.setdefault('capturable', False)
        # Older checkpoints store `step` as a plain Python number; convert it to a
        # singleton float tensor so the rest of the code can assume a tensor.
        state_values = list(self.state.values())
        step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]['step'])
        if not step_is_tensor:
            for s in state_values:
                s['step'] = torch.tensor(float(s['step']))

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
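
        Example (an illustrative sketch of the closure form, where ``optimizer``
        is this optimizer instance and ``model``, ``loss_fn``, ``input`` and
        ``target`` are placeholder names supplied by the caller):
            >>> def closure():
            ...     optimizer.zero_grad()
            ...     loss = loss_fn(model(input), target)
            ...     loss.backward()
            ...     return loss
            >>> optimizer.step(closure)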
        """
        self._cuda_graph_capture_health_check()

        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            params_with_grad = []
            grads = []
            exp_avgs = []
            exp_avg_sqs = []
            max_exp_avg_sqs = []
            state_steps = []
            beta1, beta2 = group['betas']

            for p in group['params']:
                if p.grad is not None:
                    params_with_grad.append(p)
                    if p.grad.is_sparse:
                        raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                    grads.append(p.grad)

                    state = self.state[p]
                    # Lazy state initialization
                    if len(state) == 0:
                        state['step'] = torch.zeros((1,), dtype=torch.float, device=p.device) \
                            if self.defaults['capturable'] else torch.tensor(0.)
                        # Exponential moving average of gradient values
                        state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                        # Exponential moving average of squared gradient values
                        state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                        if group['amsgrad']:
                            # Maintains max of all exp. moving avg. of sq. grad. values
                            state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)

                    exp_avgs.append(state['exp_avg'])
                    exp_avg_sqs.append(state['exp_avg_sq'])
                    if group['amsgrad']:
                        max_exp_avg_sqs.append(state['max_exp_avg_sq'])
                    state_steps.append(state['step'])

            adam(params_with_grad, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs,
                 state_steps, amsgrad=group['amsgrad'], beta1=beta1, beta2=beta2,
                 lr=group['lr'], weight_decay=group['weight_decay'], eps=group['eps'],
                 maximize=group['maximize'], foreach=group['foreach'],
                 capturable=group['capturable'])

        return loss


def adam(params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor],
         exp_avg_sqs: List[Tensor], max_exp_avg_sqs: List[Tensor],
         state_steps: List[Tensor],
         # foreach and capturable are ordinary keyword args (not keyword-only)
         # because kwonly args with defaults are not supported by the functional
         # API compilation
         foreach: Optional[bool] = None, capturable: bool = False, *,
         amsgrad: bool, beta1: float, beta2: float, lr: float,
         weight_decay: float, eps: float, maximize: bool):
    r"""Functional API that performs Adam algorithm computation.
    See :class:`~torch.optim.Adam` for details.
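
    Example (a minimal sketch of one functional update on a single parameter;
    every tensor below is fabricated purely for illustration):
        >>> p, grad = torch.zeros(3), torch.ones(3)
        >>> exp_avg, exp_avg_sq = torch.zeros(3), torch.zeros(3)
        >>> adam([p], [grad], [exp_avg], [exp_avg_sq], [], [torch.tensor(0.)],
        ...      amsgrad=False, beta1=0.9, beta2=0.999, lr=1e-3, weight_decay=0.0,
        ...      eps=1e-8, maximize=False, foreach=False, capturable=False)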
    """
    if not all([isinstance(t, torch.Tensor) for t in state_steps]):
        raise RuntimeError("API has changed, `state_steps` argument must contain a list of singleton tensors")

    if foreach is None:
        # Placeholder for smarter dispatch logic; default to the for-loop path.
        foreach = False

    if foreach and torch.jit.is_scripting():
        raise RuntimeError('torch.jit.script not supported with foreach optimizers')

    if foreach and not torch.jit.is_scripting():
        func = _multi_tensor_adam
    else:
        func = _single_tensor_adam

    func(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps,
         amsgrad=amsgrad, beta1=beta1, beta2=beta2, lr=lr, weight_decay=weight_decay,
         eps=eps, maximize=maximize, capturable=capturable)


def _single_tensor_adam(params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor],
                        exp_avg_sqs: List[Tensor], max_exp_avg_sqs: List[Tensor],
                        state_steps: List[Tensor], *, amsgrad: bool, beta1: float,
                        beta2: float, lr: float, weight_decay: float, eps: float,
                        maximize: bool, capturable: bool):

    for i, param in enumerate(params):

        grad = grads[i] if not maximize else -grads[i]
        exp_avg = exp_avgs[i]
        exp_avg_sq = exp_avg_sqs[i]
        step_t = state_steps[i]

        if capturable:
            assert param.is_cuda and step_t.is_cuda, "If capturable=True, params and state_steps must be CUDA tensors."

        # update step
        step_t += 1

        if weight_decay != 0:
            grad = grad.add(param, alpha=weight_decay)

        # Decay the first and second moment running average coefficient
        exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad.conj(), value=1 - beta2)

        if capturable:
            step = step_t

            # Keep the bias corrections as tensors so the update stays CUDA-graph safe
            bias_correction1 = 1 - torch.pow(beta1, step)
            bias_correction2 = 1 - torch.pow(beta2, step)

            step_size = lr / bias_correction1
            step_size_neg = step_size.neg()

            bias_correction2_sqrt = bias_correction2.sqrt()

            if amsgrad:
                # Maintains the maximum of all 2nd moment running avg. till now
                torch.maximum(max_exp_avg_sqs[i], exp_avg_sq, out=max_exp_avg_sqs[i])
                # Fold the tensor-valued step size into the denominator so the whole
                # update is applied with a single addcdiv_
                denom = (max_exp_avg_sqs[i].sqrt() / (bias_correction2_sqrt * step_size_neg)).add_(eps / step_size_neg)
            else:
                denom = (exp_avg_sq.sqrt() / (bias_correction2_sqrt * step_size_neg)).add_(eps / step_size_neg)

            param.addcdiv_(exp_avg, denom)
        else:
            step = step_t.item()

            bias_correction1 = 1 - beta1 ** step
            bias_correction2 = 1 - beta2 ** step

            step_size = lr / bias_correction1
            bias_correction2_sqrt = math.sqrt(bias_correction2)

            if amsgrad:
                # Maintains the maximum of all 2nd moment running avg. till now
                torch.maximum(max_exp_avg_sqs[i], exp_avg_sq, out=max_exp_avg_sqs[i])
                # Use the max. for normalizing the running avg. of the gradient
                denom = (max_exp_avg_sqs[i].sqrt() / bias_correction2_sqrt).add_(eps)
            else:
                denom = (exp_avg_sq.sqrt() / bias_correction2_sqrt).add_(eps)

            param.addcdiv_(exp_avg, denom, value=-step_size)


def _multi_tensor_adam(params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor],
                       exp_avg_sqs: List[Tensor], max_exp_avg_sqs: List[Tensor],
                       state_steps: List[Tensor], *, amsgrad: bool, beta1: float,
                       beta2: float, lr: float, weight_decay: float, eps: float,
                       maximize: bool, capturable: bool):

    if len(params) == 0:
        return

    if capturable:
        assert all(p.is_cuda and step.is_cuda for p, step in zip(params, state_steps)), \
            "If capturable=True, params and state_steps must be CUDA tensors."

    if maximize:
        grads = torch._foreach_neg(tuple(grads))

    # update steps
    torch._foreach_add_(state_steps, 1)

    if weight_decay != 0:
        torch._foreach_add_(grads, params, alpha=weight_decay)

    # Decay the first and second moment running average coefficient
    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sqs, beta2)
    torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2)

    if capturable:
        # Keep the bias corrections as tensors so the update stays CUDA-graph safe
        bias_correction1 = [torch.pow(beta1, step) for step in state_steps]
        bias_correction2 = [torch.pow(beta2, step) for step in state_steps]
        # foreach_sub doesn't allow a scalar as the first arg
        torch._foreach_sub_(bias_correction1, 1)
        torch._foreach_sub_(bias_correction2, 1)
        torch._foreach_neg_(bias_correction1)
        torch._foreach_neg_(bias_correction2)

        # foreach_div doesn't allow a scalar as the first arg
        step_size = torch._foreach_div(bias_correction1, lr)
        torch._foreach_reciprocal_(step_size)
        torch._foreach_neg_(step_size)

        bias_correction2_sqrt = torch._foreach_sqrt(bias_correction2)

        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now
            max_exp_avg_sqs = torch._foreach_maximum(max_exp_avg_sqs, exp_avg_sqs)

            # Use the max. for normalizing the running avg. of the gradient,
            # folding the per-tensor step size into the denominator
            max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
            torch._foreach_div_(max_exp_avg_sq_sqrt, torch._foreach_mul(bias_correction2_sqrt, step_size))
            eps_over_step_size = torch._foreach_div(step_size, eps)
            torch._foreach_reciprocal_(eps_over_step_size)
            denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps_over_step_size)
        else:
            exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
            torch._foreach_div_(exp_avg_sq_sqrt, torch._foreach_mul(bias_correction2_sqrt, step_size))
            eps_over_step_size = torch._foreach_div(step_size, eps)
            torch._foreach_reciprocal_(eps_over_step_size)
            denom = torch._foreach_add(exp_avg_sq_sqrt, eps_over_step_size)

        torch._foreach_addcdiv_(params, exp_avgs, denom)
    else:
        bias_correction1 = [1 - beta1 ** step.item() for step in state_steps]
        bias_correction2 = [1 - beta2 ** step.item() for step in state_steps]

        step_size = [(lr / bc) * -1 for bc in bias_correction1]
        bias_correction2_sqrt = [math.sqrt(bc) for bc in bias_correction2]

        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now
            max_exp_avg_sqs = torch._foreach_maximum(max_exp_avg_sqs, exp_avg_sqs)

            # Use the max. for normalizing the running avg. of the gradient
            max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
            torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction2_sqrt)
            denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps)
        else:
            exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
            torch._foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt)
            denom = torch._foreach_add(exp_avg_sq_sqrt, eps)

        torch._foreach_addcdiv_(params, exp_avgs, denom, step_size)