U
    2de                     @   s6   d Z ddlZddlmZ ddlmZ G dd dZdS )zA
Loss functions for linear models with raw_prediction = X @ coef
    N)sparse   )squared_normc                   @   sl   e Zd ZdZdd ZdddZdd Zd	d
 Zdd ZdddZ	dddZ
dddZdddZdddZdS )LinearModelLossa  General class for loss functions with raw_prediction = X @ coef + intercept.

    Note that raw_prediction is also known as linear predictor.

    The loss is the sum of per sample losses and includes a term for L2
    regularization::

        loss = sum_i s_i loss(y_i, X_i @ coef + intercept)
               + 1/2 * l2_reg_strength * ||coef||_2^2

    with sample weights s_i=1 if sample_weight=None.

    Gradient and hessian, for simplicity without intercept, are::

        gradient = X.T @ loss.gradient + l2_reg_strength * coef
        hessian = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity

    Conventions:
        if fit_intercept:
            n_dof =  n_features + 1
        else:
            n_dof = n_features

        if base_loss.is_multiclass:
            coef.shape = (n_classes, n_dof) or ravelled (n_classes * n_dof,)
        else:
            coef.shape = (n_dof,)

        The intercept term is at the end of the coef array:
        if base_loss.is_multiclass:
            if coef.shape (n_classes, n_dof):
                intercept = coef[:, -1]
            if coef.shape (n_classes * n_dof,)
                intercept = coef[n_features::n_dof] = coef[(n_dof-1)::n_dof]
            intercept.shape = (n_classes,)
        else:
            intercept = coef[-1]

    Note: If coef has shape (n_classes * n_dof,), the 2d-array can be reconstructed as

        coef.reshape((n_classes, -1), order="F")

    The option order="F" makes coef[:, i] contiguous. This, in turn, makes the
    coefficients without intercept, coef[:, :-1], contiguous and speeds up
    matrix-vector computations.

    Note: If the average loss per sample is wanted instead of the sum of the loss per
    sample, one can simply use a rescaled sample_weight such that
    sum(sample_weight) = 1.

    Parameters
    ----------
    base_loss : instance of class BaseLoss from sklearn._loss.
    fit_intercept : bool
    c                 C   s   || _ || _d S )N)	base_lossfit_intercept)selfr   r    r	   E/tmp/pip-unpacked-wheel-zrfo1fqw/sklearn/linear_model/_linear_loss.py__init__B   s    zLinearModelLoss.__init__Nc                 C   sZ   |j d }| jj}| jr"|d }n|}| jjrFtj|||f|dd}ntj|||d}|S )a  Allocate coef of correct shape with zeros.

        Parameters:
        -----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        dtype : data-type, default=None
            Overrides the data type of coef. With dtype=None, coef will have the same
            dtype as X.

        Returns
        -------
        coef : ndarray of shape (n_dof,) or (n_classes, n_dof)
            Coefficients of a linear model.
           F)shapedtypeorderr   r   )r   r   	n_classesr   is_multiclassnpZ
zeros_like)r   Xr   
n_featuresr   n_dofcoefr	   r	   r
   init_zero_coefF   s    

zLinearModelLoss.init_zero_coefc                 C   s   | j js.| jr$|d }|dd }qd}|}nV|jdkrP|j| j jdfdd}n|}| jr|dddf }|ddddf }nd}||fS )a  Helper function to get coefficients and intercept.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").

        Returns
        -------
        weights : ndarray of shape (n_features,) or (n_classes, n_features)
            Coefficients without intercept term.
        intercept : float or ndarray of shape (n_classes,)
            Intercept terms.
        N        r   r   r   )r   r   r   ndimreshaper   )r   r   	interceptweightsr	   r	   r
   weight_interceptb   s    
z LinearModelLoss.weight_interceptc                 C   s<   |  |\}}| jjs$|| | }n||j | }|||fS )ai  Helper function to get coefficients, intercept and raw_prediction.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        Returns
        -------
        weights : ndarray of shape (n_features,) or (n_classes, n_features)
            Coefficients without intercept term.
        intercept : float or ndarray of shape (n_classes,)
            Intercept terms.
        raw_prediction : ndarray of shape (n_samples,) or             (n_samples, n_classes)
        )r!   r   r   T)r   r   r   r    r   raw_predictionr	   r	   r
   weight_intercept_raw   s
    z$LinearModelLoss.weight_intercept_rawc                 C   s&   |j dkr|| nt|}d| | S )z5Compute L2 penalty term l2_reg_strength/2 *||w||_2^2.r   g      ?)r   r   )r   r    l2_reg_strengthZnorm2_wr	   r	   r
   
l2_penalty   s    zLinearModelLoss.l2_penaltyr   r   c                 C   sV   |dkr|  ||\}}	}n| |\}}	| jj||||d}
|
 }
|
| || S )a  Compute the loss as sum over point-wise losses.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        loss : float
            Sum of losses per sample plus penalty.
        NZy_truer#   sample_weight	n_threads)r$   r!   r   losssumr&   )r   r   r   yr(   r%   r)   r#   r    r   r*   r	   r	   r
   r*      s    'zLinearModelLoss.lossc                 C   s:  |j d | jj }}	|t| j }
|dkr>| ||\}}}n| |\}}| jj||||d\}}| }|| 	||7 }| jj
stj||jd}|j| ||  |d|< | jr| |d< nptj|	|
f|jdd}|j| ||  |ddd|f< | jr|jdd	|dddf< |jdkr2|jdd
}||fS )aN  Computes the sum of loss and gradient w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        loss : float
            Sum of losses per sample plus penalty.

        gradient : ndarray of shape coef.shape
             The gradient of the loss.
        r   Nr'   r   r   r   r   r   r   Zaxisr   )r   r   r   intr   r$   r!   loss_gradientr+   r&   r   r   
empty_liker   r"   emptyr   ravel)r   r   r   r,   r(   r%   r)   r#   r   r   r   r    r   r*   grad_pointwisegradr	   r	   r
   r1      s2    *
"zLinearModelLoss.loss_gradientc                 C   s   |j d | jj }}	|t| j }
|dkr>| ||\}}}n| |\}}| jj||||d}| jjst	j
||jd}|j| ||  |d|< | jr| |d< |S t	j|	|
f|jdd}|j| ||  |ddd|f< | jr |jdd	|dddf< |jdkr|jdd
S |S dS )a  Computes the gradient w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.
        r   Nr'   r-   r   r   r.   r   r/   r   )r   r   r   r0   r   r$   r!   gradientr   r   r2   r   r"   r+   r3   r   r4   )r   r   r   r,   r(   r%   r)   r#   r   r   r   r    r   r5   r6   r	   r	   r
   r7   /  s0    '"zLinearModelLoss.gradientc
                 C   s  |j \}
}|t| j }|	dkr4| ||\}}}	n| |\}}| jj||	||d\}}t|dkdk}t	|}| jj
s|dkrtj||jd}n|}|j| ||  |d|< | jr| |d< |dkrtj||f|jd}n|}|r|||fS t|r<|jtj|df|
|
fd |  |d|d|f< n2|dddf | }t|j||d|d|f< |dkr|dd|| |d	   |7  < | jr|j| }||dddf< ||dddf< | |d
< nt|||fS )a  Computes gradient and hessian w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        gradient_out : None or ndarray of shape coef.shape
            A location into which the gradient is stored. If None, a new array
            might be created.
        hessian_out : None or ndarray
            A location into which the hessian is stored. If None, a new array
            might be created.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.

        hessian : ndarray
            Hessian matrix.

        hessian_warning : bool
            True if pointwise hessian has more than half of its elements non-positive.
        Nr'   r   g      ?r-   r   r   r   r   )r   r   )r   r0   r   r$   r!   r   gradient_hessianr   Zmeanabsr   r2   r   r"   r+   r3   r   issparse
dia_matrixZtoarraydotr   NotImplementedError)r   r   r   r,   r(   r%   r)   Zgradient_outZhessian_outr#   	n_samplesr   r   r    r   r5   hess_pointwiseZhessian_warningr6   ZhessZWXZXhr	   r	   r
   r9   v  sf    5





 


 
z LinearModelLoss.gradient_hessianc              
      s   j jj \}tj  \}}	jjsjj||	
|d\}
}tj	j
d} j|
   |d< jr|
 |d< | t rtj|df||fd  n|ddtjf   jr ttjddt fdd	}njj||	
|d\}
	tjfj
d
d}|
j    |dddf< jr|
jdd|dddf<  	
f
dd	}jdkr|jd
d|fS ||fS )a  Computes gradient and hessp (hessian product function) w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.

        hessp : callable
            Function that takes in a vector input of shape of gradient and
            and returns matrix-vector product with hessian.
        r'   r-   Nr   r   r8   r/   c                    s   t | }t r4 j| d    |d < n$t j j| d  g|d < |d   | d   7  < jr|d   | d  7  < | d   | d   |d< |S )Nr   )r   r2   r   r;   r"   ZlinalgZ	multi_dotr   )sret)r   hXhX_sumhessian_sumr%   r   r   r	   r
   hesspD  s    

 $  z7LinearModelLoss.gradient_hessian_product.<locals>.hesspr   r.   c                    s  | j dfdd} jr>| d d df }| d d d df } nd} | j | }| | jddd d tjf 7 }|9 }d k	r|d d tjf 9 }tjf	jdd}|j  |   |d d d f< jr|jdd|d d df< jdkr|j	ddS |S d S )Nr   r   r   r   r   r/   r.   )
r   r   r"   r+   r   newaxisr3   r   r   r4   )rA   Zs_intercepttmpZ	hess_prod)
r   r   r%   r   r   r   probar(   r   r    r	   r
   rF   w  s"    $"r   r   )r   r   r   r0   r   r$   r   r9   r   r2   r   r"   r+   r   r;   r<   rG   ZsqueezeZasarrayZ
atleast_1dZgradient_probar3   r   r4   )r   r   r   r,   r(   r%   r)   r?   r   r#   r5   r@   r6   rF   r	   )r   r   rC   rD   rE   r%   r   r   r   rI   r(   r   r    r
   gradient_hessian_product  sN     




"z(LinearModelLoss.gradient_hessian_product)N)Nr   r   N)Nr   r   N)Nr   r   N)Nr   r   NNN)Nr   r   )__name__
__module____qualname____doc__r   r   r!   r$   r&   r*   r1   r7   r9   rJ   r	   r	   r	   r
   r   	   sB   8
' 
    
;    
P    
L      
 
     r   )rN   Znumpyr   Zscipyr   Zutils.extmathr   r   r	   r	   r	   r
   <module>   s   