"""
This module contains loss classes suitable for fitting.

It is not part of the public API.
Specific losses are used for regression, binary classification or multiclass
classification.
"""
import numbers

import numpy as np
from scipy.special import xlogy

from ._loss import (
    CyHalfSquaredError,
    CyAbsoluteError,
    CyPinballLoss,
    CyHalfPoissonLoss,
    CyHalfGammaLoss,
    CyHalfTweedieLoss,
    CyHalfTweedieLossIdentity,
    CyHalfBinomialLoss,
    CyHalfMultinomialLoss,
)
from .link import (
    Interval,
    IdentityLink,
    LogLink,
    LogitLink,
    MultinomialLogit,
)
from ..utils import check_scalar
from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper
from ..utils.stats import _weighted_percentile


class BaseLoss:
    """Base class for a loss function of 1-dimensional targets.

    Conventions:

        - y_true.shape = sample_weight.shape = (n_samples,)
        - y_pred.shape = raw_prediction.shape = (n_samples,)
        - If is_multiclass is true (multiclass classification), then
          y_pred.shape = raw_prediction.shape = (n_samples, n_classes)
          Note that this corresponds to the return value of decision_function.

    y_true, y_pred, sample_weight and raw_prediction must either be all float64
    or all float32.
    gradient and hessian must be either both float64 or both float32.

    Note that y_pred = link.inverse(raw_prediction).

    Specific loss classes can inherit specific link classes to satisfy
    BaseLink's abstractmethods.

    Parameters
    ----------
    sample_weight : {None, ndarray}
        If sample_weight is None, the hessian might be constant.
    n_classes : {None, int}
        The number of classes for classification, else None.

    Attributes
    ----------
    closs : CyLossFunction
    link : BaseLink
    interval_y_true : Interval
        Valid interval for y_true
    interval_y_pred : Interval
        Valid interval for y_pred
    differentiable : bool
        Indicates whether or not the loss function is differentiable in
        raw_prediction everywhere.
    need_update_leaves_values : bool
        Indicates whether decision trees in gradient boosting need to update
        leaf values after having been fit to the (negative) gradients.
    approx_hessian : bool
        Indicates whether the hessian is approximated or exact. If
        approximated, it should be larger than or equal to the exact one.
    constant_hessian : bool
        Indicates whether the hessian is one for this loss.
    is_multiclass : bool
        Indicates whether n_classes > 2 is allowed.
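
    A minimal usage sketch with the concrete ``HalfSquaredError`` subclass
    defined below (the numbers are illustrative only)::

        import numpy as np

        loss = HalfSquaredError(sample_weight=None)
        y_true = np.array([0.0, 1.0, 2.0])
        raw_prediction = np.array([0.5, 0.5, 0.5])
        gradient, hessian = loss.init_gradient_and_hessian(n_samples=3)
        loss.gradient(
            y_true=y_true, raw_prediction=raw_prediction, gradient_out=gradient
        )
        # gradient now holds raw_prediction - y_true, i.e. [0.5, -0.5, -1.5]
        # hessian has shape (1,) because this loss has a constant hessian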
    """

    differentiable = True
    need_update_leaves_values = False
    is_multiclass = False

    def __init__(self, closs, link, n_classes=None):
        self.closs = closs
        self.link = link
        self.approx_hessian = False
        self.constant_hessian = False
        self.n_classes = n_classes
        self.interval_y_true = Interval(-np.inf, np.inf, False, False)
        self.interval_y_pred = self.link.interval_y_pred

    def in_y_true_range(self, y):
        """Return True if y is in the valid range of y_true.

        Parameters
        ----------
        y : ndarray
        """
        return self.interval_y_true.includes(y)

    def in_y_pred_range(self, y):
        """Return True if y is in the valid range of y_pred.

        Parameters
        ----------
        y : ndarray
        """
        return self.interval_y_pred.includes(y)

    def loss(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        loss_out=None,
        n_threads=1,
    ):
        """Compute the pointwise loss value for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        loss_out : None or C-contiguous array of shape (n_samples,)
            A location into which the result is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : array of shape (n_samples,)
            Element-wise loss function.
        """
        if loss_out is None:
            loss_out = np.empty_like(y_true)
        # Be graceful to shape (n_samples, 1) -> (n_samples,)
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)

        y_true = ReadonlyArrayWrapper(y_true)
        raw_prediction = ReadonlyArrayWrapper(raw_prediction)
        if sample_weight is not None:
            sample_weight = ReadonlyArrayWrapper(sample_weight)
        return self.closs.loss(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            loss_out=loss_out,
            n_threads=n_threads,
        )

    def loss_gradient(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        loss_out=None,
        gradient_out=None,
        n_threads=1,
    ):
        """Compute loss and gradient w.r.t. raw_prediction for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        loss_out : None or C-contiguous array of shape (n_samples,)
            A location into which the loss is stored. If None, a new array
            might be created.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array \
            of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : array of shape (n_samples,)
            Element-wise loss function.

        gradient : array of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.
        """
        if loss_out is None:
            if gradient_out is None:
                loss_out = np.empty_like(y_true)
                gradient_out = np.empty_like(raw_prediction)
            else:
                loss_out = np.empty_like(y_true, dtype=gradient_out.dtype)
        elif gradient_out is None:
            gradient_out = np.empty_like(raw_prediction, dtype=loss_out.dtype)

        # Be graceful to shape (n_samples, 1) -> (n_samples,)
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
            gradient_out = gradient_out.squeeze(1)

        y_true = ReadonlyArrayWrapper(y_true)
        raw_prediction = ReadonlyArrayWrapper(raw_prediction)
        if sample_weight is not None:
            sample_weight = ReadonlyArrayWrapper(sample_weight)
        return self.closs.loss_gradient(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            loss_out=loss_out,
            gradient_out=gradient_out,
            n_threads=n_threads,
        )

    def gradient(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        gradient_out=None,
        n_threads=1,
    ):
        """Compute gradient of loss w.r.t raw_prediction for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array \
            of shape (n_samples, n_classes)
            A location into which the result is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : array of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.
        """
        if gradient_out is None:
            gradient_out = np.empty_like(raw_prediction)

        # Be graceful to shape (n_samples, 1) -> (n_samples,)
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
            gradient_out = gradient_out.squeeze(1)

        y_true = ReadonlyArrayWrapper(y_true)
        raw_prediction = ReadonlyArrayWrapper(raw_prediction)
        if sample_weight is not None:
            sample_weight = ReadonlyArrayWrapper(sample_weight)
        return self.closs.gradient(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            gradient_out=gradient_out,
            n_threads=n_threads,
        )

    def gradient_hessian(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        gradient_out=None,
        hessian_out=None,
        n_threads=1,
    ):
        """Compute gradient and hessian of loss w.r.t raw_prediction.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array \
            of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        hessian_out : None or C-contiguous array of shape (n_samples,) or array \
            of shape (n_samples, n_classes)
            A location into which the hessian is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : arrays of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.

        hessian : arrays of shape (n_samples,) or (n_samples, n_classes)
            Element-wise hessians.
        """
        if gradient_out is None:
            if hessian_out is None:
                gradient_out = np.empty_like(raw_prediction)
                hessian_out = np.empty_like(raw_prediction)
            else:
                gradient_out = np.empty_like(hessian_out)
        elif hessian_out is None:
            hessian_out = np.empty_like(gradient_out)

        # Be graceful to shape (n_samples, 1) -> (n_samples,)
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
            gradient_out = gradient_out.squeeze(1)
        if hessian_out.ndim == 2 and hessian_out.shape[1] == 1:
            hessian_out = hessian_out.squeeze(1)

        y_true = ReadonlyArrayWrapper(y_true)
        raw_prediction = ReadonlyArrayWrapper(raw_prediction)
        if sample_weight is not None:
            sample_weight = ReadonlyArrayWrapper(sample_weight)
        return self.closs.gradient_hessian(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            gradient_out=gradient_out,
            hessian_out=hessian_out,
            n_threads=n_threads,
        )

    def __call__(self, y_true, raw_prediction, sample_weight=None, n_threads=1):
        """Compute the weighted average loss.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : float
            Mean or averaged loss function.
        """
        return np.average(
            self.loss(
                y_true=y_true,
                raw_prediction=raw_prediction,
                sample_weight=None,
                loss_out=None,
                n_threads=n_threads,
            ),
            weights=sample_weight,
        )

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This can be used as initial estimates of predictions, i.e. before the
        first iteration in fit.

        Parameters
        ----------
        y_true : array-like of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or array of shape (n_samples,)
            Sample weights.

        Returns
        -------
        raw_prediction : numpy scalar or array of shape (n_classes,)
            Raw predictions of an intercept-only model.
        """
        # Take the weighted average of the target over the samples (axis=0) and
        # map it into link space, clipping to the valid range of y_pred.
        y_pred = np.average(y_true, weights=sample_weight, axis=0)
        eps = 10 * np.finfo(y_pred.dtype).eps

        if self.interval_y_pred.low == -np.inf:
            a_min = None
        elif self.interval_y_pred.low_inclusive:
            a_min = self.interval_y_pred.low
        else:
            a_min = self.interval_y_pred.low + eps

        if self.interval_y_pred.high == np.inf:
            a_max = None
        elif self.interval_y_pred.high_inclusive:
            a_max = self.interval_y_pred.high
        else:
            a_max = self.interval_y_pred.high - eps

        if a_min is None and a_max is None:
            return self.link.link(y_pred)
        else:
            return self.link.link(np.clip(y_pred, a_min, a_max))

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        """Calculate term dropped in loss.

        With this term added, the loss of perfect predictions is zero.
        """
        return np.zeros_like(y_true)

    def init_gradient_and_hessian(self, n_samples, dtype=np.float64, order="F"):
        """Initialize arrays for gradients and hessians.

        Unless hessians are constant, arrays are initialized with undefined values.

        Parameters
        ----------
        n_samples : int
            The number of samples, usually passed to `fit()`.
        dtype : {np.float64, np.float32}, default=np.float64
            The dtype of the arrays gradient and hessian.
        order : {'C', 'F'}, default='F'
            Order of the arrays gradient and hessian. The default 'F' makes the arrays
            contiguous along samples.

        Returns
        -------
        gradient : C-contiguous array of shape (n_samples,) or array of shape \
            (n_samples, n_classes)
            Empty array (allocated but not initialized) to be used as argument
            gradient_out.
        hessian : C-contiguous array of shape (n_samples,), array of shape
            (n_samples, n_classes) or shape (1,)
            Empty (allocated but not initialized) array to be used as argument
            hessian_out.
            If constant_hessian is True (e.g. `HalfSquaredError`), the array is
            initialized to ``1``.
        """
        if dtype not in (np.float32, np.float64):
            raise ValueError(
                "Valid options for 'dtype' are np.float32 and np.float64. "
                f"Got dtype={dtype} instead."
            )

        if self.is_multiclass:
            shape = (n_samples, self.n_classes)
        else:
            shape = (n_samples,)
        gradient = np.empty(shape=shape, dtype=dtype, order=order)

        if self.constant_hessian:
            # If the hessian is constant, a single-element array is enough.
            hessian = np.ones(shape=(1,), dtype=dtype)
        else:
            hessian = np.empty(shape=shape, dtype=dtype, order=order)

        return gradient, hessian


class HalfSquaredError(BaseLoss):
    """Half squared error with identity link, for regression.

    Domain:
    y_true and y_pred all real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, half squared error is defined as::

        loss(x_i) = 0.5 * (y_true_i - raw_prediction_i)**2

    The factor of 0.5 simplifies the computation of gradients and results in a
    unit hessian (and is consistent with what is done in LightGBM). It is also
    half the Normal distribution deviance.
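
    A small illustrative sketch (values chosen for readability only)::

        import numpy as np

        loss = HalfSquaredError()
        loss.loss(
            y_true=np.array([1.0, 2.0, 3.0]),
            raw_prediction=np.array([0.5, 1.5, 2.5]),
        )
        # -> array([0.125, 0.125, 0.125]), i.e. 0.5 * 0.5**2 per sample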
    """

    def __init__(self, sample_weight=None):
        super().__init__(closs=CyHalfSquaredError(), link=IdentityLink())
        self.constant_hessian = sample_weight is None


class AbsoluteError(BaseLoss):
    """Absolute error with identity link, for regression.

    Domain:
    y_true and y_pred all real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the absolute error is defined as::

        loss(x_i) = |y_true_i - raw_prediction_i|
    """

    differentiable = False
    need_update_leaves_values = True

    def __init__(self, sample_weight=None):
        super().__init__(closs=CyAbsoluteError(), link=IdentityLink())
        self.approx_hessian = True
        self.constant_hessian = sample_weight is None

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This is the weighted median of the target, i.e. over the samples
        axis=0.
        """
        if sample_weight is None:
            return np.median(y_true, axis=0)
        else:
            return _weighted_percentile(y_true, sample_weight, 50)


class PinballLoss(BaseLoss):
    """Quantile loss aka pinball loss, for regression.

    Domain:
    y_true and y_pred all real numbers
    quantile in (0, 1)

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the pinball loss is defined as::

        loss(x_i) = rho_{quantile}(y_true_i - raw_prediction_i)

        rho_{quantile}(u) = u * (quantile - 1_{u<0})
                          = -u * (1 - quantile)  if u < 0
                             u * quantile       if u >= 0

    Note: 2 * PinballLoss(quantile=0.5) equals AbsoluteError().
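
    For instance, with ``quantile=0.8`` an under-prediction is penalized four
    times as much as an over-prediction of the same size (the numbers below
    are illustrative only)::

        import numpy as np

        loss = PinballLoss(quantile=0.8)
        loss.loss(
            y_true=np.array([1.0, 1.0]),
            raw_prediction=np.array([0.0, 2.0]),  # under- and over-prediction
        )
        # -> array([0.8, 0.2]) (up to floating point rounding)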

    Additional Attributes
    ---------------------
    quantile : float
        The quantile to be estimated. Must be in range (0, 1).
    """

    differentiable = False
    need_update_leaves_values = True

    def __init__(self, sample_weight=None, quantile=0.5):
        check_scalar(
            quantile,
            "quantile",
            target_type=numbers.Real,
            min_val=0,
            max_val=1,
            include_boundaries="neither",
        )
        super().__init__(
            closs=CyPinballLoss(quantile=float(quantile)),
            link=IdentityLink(),
        )
        self.approx_hessian = True
        self.constant_hessian = sample_weight is None

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This is the weighted quantile of the target at level `quantile`,
        i.e. over the samples axis=0.
        """
        if sample_weight is None:
            return np.percentile(y_true, 100 * self.closs.quantile, axis=0)
        else:
            return _weighted_percentile(
                y_true, sample_weight, 100 * self.closs.quantile
            )


class HalfPoissonLoss(BaseLoss):
    """Half Poisson deviance loss with log-link, for regression.

    Domain:
    y_true in non-negative real numbers
    y_pred in positive real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half the Poisson deviance is defined as::

        loss(x_i) = y_true_i * log(y_true_i/exp(raw_prediction_i))
                    - y_true_i + exp(raw_prediction_i)
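
    For instance (illustrative numbers, with ``y_pred = exp(raw_prediction)``)::

        import numpy as np

        loss = HalfPoissonLoss()
        loss.loss(
            y_true=np.array([0.0, 1.0]),
            raw_prediction=np.array([0.0, 0.0]),  # y_pred = [1, 1]
        )
        # -> array([1., 0.]); the second sample is predicted perfectly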

    Half the Poisson deviance is actually the negative log-likelihood up to
    constant terms (not involving raw_prediction) and simplifies the
    computation of the gradients.
    We also skip the constant term `y_true_i * log(y_true_i) - y_true_i`.
    """

    def __init__(self, sample_weight=None):
        super().__init__(closs=CyHalfPoissonLoss(), link=LogLink())
        self.interval_y_true = Interval(0, np.inf, True, False)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        term = xlogy(y_true, y_true) - y_true
        if sample_weight is not None:
            term *= sample_weight
        return term


class HalfGammaLoss(BaseLoss):
    """Half Gamma deviance loss with log-link, for regression.

    Domain:
    y_true and y_pred in positive real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half Gamma deviance loss is defined as::

        loss(x_i) = log(exp(raw_prediction_i)/y_true_i)
                    + y_true/exp(raw_prediction_i) - 1

    Half the Gamma deviance is actually proportional to the negative log-
    likelihood up to constant terms (not involving raw_prediction) and
    simplifies the computation of the gradients.
    We also skip the constant term `-log(y_true_i) - 1`.
    """

    def __init__(self, sample_weight=None):
        super().__init__(closs=CyHalfGammaLoss(), link=LogLink())
        self.interval_y_true = Interval(0, np.inf, False, False)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        term = -np.log(y_true) - 1
        if sample_weight is not None:
            term *= sample_weight
        return term


class HalfTweedieLoss(BaseLoss):
    """Half Tweedie deviance loss with log-link, for regression.

    Domain:
    y_true in real numbers for power <= 0
    y_true in non-negative real numbers for 0 < power < 2
    y_true in positive real numbers for 2 <= power
    y_pred in positive real numbers
    power in real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half Tweedie deviance loss with p=power is defined
    as::

        loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
                    - y_true_i * exp(raw_prediction_i)**(1-p) / (1-p)
                    + exp(raw_prediction_i)**(2-p) / (2-p)

    Taking the limits for p=0, 1, 2 gives HalfSquaredError with a log link,
    HalfPoissonLoss and HalfGammaLoss.

    We also skip constant terms, but those are different for p=0, 1, 2.
    Therefore, the loss is not continuous in `power`.
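
    For instance, with ``power=1.5`` (illustrative numbers)::

        import numpy as np

        loss = HalfTweedieLoss(power=1.5)
        loss.loss(
            y_true=np.array([1.0, 4.0]),
            raw_prediction=np.array([0.0, 0.0]),  # y_pred = [1, 1]
        )
        # -> array([0., 2.]) up to floating point rounding; zero where
        # y_true equals y_pred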

    Note furthermore that although no Tweedie distribution exists for
    0 < power < 1, it still gives a strictly consistent scoring function for
    the expectation.
    """

    def __init__(self, sample_weight=None, power=1.5):
        super().__init__(
            closs=CyHalfTweedieLoss(power=float(power)),
            link=LogLink(),
        )
        if self.closs.power <= 0:
            self.interval_y_true = Interval(-np.inf, np.inf, False, False)
        elif self.closs.power < 2:
            self.interval_y_true = Interval(0, np.inf, True, False)
        else:
            self.interval_y_true = Interval(0, np.inf, False, False)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        if self.closs.power == 0:
            return HalfSquaredError().constant_to_optimal_zero(
                y_true=y_true, sample_weight=sample_weight
            )
        elif self.closs.power == 1:
            return HalfPoissonLoss().constant_to_optimal_zero(
                y_true=y_true, sample_weight=sample_weight
            )
        elif self.closs.power == 2:
            return HalfGammaLoss().constant_to_optimal_zero(
                y_true=y_true, sample_weight=sample_weight
            )
        else:
            p = self.closs.power
            term = np.power(np.maximum(y_true, 0), 2 - p) / (1 - p) / (2 - p)
            if sample_weight is not None:
                term *= sample_weight
            return term


class HalfTweedieLossIdentity(BaseLoss):
    """Half Tweedie deviance loss with identity link, for regression.

    Domain:
    y_true in real numbers for power <= 0
    y_true in non-negative real numbers for 0 < power < 2
    y_true in positive real numbers for 2 <= power
    y_pred in positive real numbers for power != 0
    y_pred in real numbers for power = 0
    power in real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, half Tweedie deviance loss with p=power is defined
    as::

        loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
                    - y_true_i * raw_prediction_i**(1-p) / (1-p)
                    + raw_prediction_i**(2-p) / (2-p)

    Note that the minimum value of this loss is 0.

    Note furthermore that although no Tweedie distribution exists for
    0 < power < 1, it still gives a strictly consistent scoring function for
    the expectation.
    """

    def __init__(self, sample_weight=None, power=1.5):
        super().__init__(
            closs=CyHalfTweedieLossIdentity(power=float(power)),
            link=IdentityLink(),
        )
        if self.closs.power <= 0:
            self.interval_y_true = Interval(-np.inf, np.inf, False, False)
        elif self.closs.power < 2:
            self.interval_y_true = Interval(0, np.inf, True, False)
        else:
            self.interval_y_true = Interval(0, np.inf, False, False)

        if self.closs.power == 0:
            self.interval_y_pred = Interval(-np.inf, np.inf, False, False)
        else:
            self.interval_y_pred = Interval(0, np.inf, False, False)


class HalfBinomialLoss(BaseLoss):
    """Half Binomial deviance loss with logit link, for binary classification.

    This is also known as binary cross entropy, log-loss and logistic loss.

    Domain:
    y_true in [0, 1], i.e. regression on the unit interval
    y_pred in (0, 1), i.e. boundaries excluded

    Link:
    y_pred = expit(raw_prediction)

    For a given sample x_i, half Binomial deviance is defined as the negative
    log-likelihood of the Binomial/Bernoulli distribution and can be expressed
    as::

        loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i

    See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman,
    section 4.4.1 (about logistic regression).
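
    For instance (illustrative numbers, with ``expit(0) = 0.5``)::

        import numpy as np

        loss = HalfBinomialLoss()
        raw_prediction = np.array([0.0, 0.0])   # predicted probability 0.5
        loss.loss(y_true=np.array([0.0, 1.0]), raw_prediction=raw_prediction)
        # -> array([0.693..., 0.693...]), i.e. log(2) for both samples
        loss.predict_proba(raw_prediction)
        # -> array([[0.5, 0.5],
        #           [0.5, 0.5]])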

    Note that the formulation works for classification, y = {0, 1}, as well as
    logistic regression, y = [0, 1].
    If you add `constant_to_optimal_zero` to the loss, you get half the
    Bernoulli/binomial deviance.
    """

    def __init__(self, sample_weight=None):
        super().__init__(
            closs=CyHalfBinomialLoss(),
            link=LogitLink(),
            n_classes=2,
        )
        self.interval_y_true = Interval(0, 1, True, True)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        # This term is non-zero only if y_true is neither 0 nor 1.
        term = xlogy(y_true, y_true) + xlogy(1 - y_true, 1 - y_true)
        if sample_weight is not None:
            term *= sample_weight
        return term

    def predict_proba(self, raw_prediction):
        """Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples,) or (n_samples, 1)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, 2)
            Element-wise class probabilities.
        """
        # Be graceful to shape (n_samples, 1) -> (n_samples,)
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        proba = np.empty((raw_prediction.shape[0], 2), dtype=raw_prediction.dtype)
        proba[:, 1] = self.link.inverse(raw_prediction)
        proba[:, 0] = 1 - proba[:, 1]
        return proba


class HalfMultinomialLoss(BaseLoss):
    """Categorical cross-entropy loss, for multiclass classification.

    Domain:
    y_true in {0, 1, 2, 3, .., n_classes - 1}
    y_pred has n_classes elements, each element in (0, 1)

    Link:
    y_pred = softmax(raw_prediction)

    Note: We assume y_true to be already label encoded. The inverse link is
    softmax. But the full link function is the symmetric multinomial logit
    function.

    For a given sample x_i, the categorical cross-entropy loss is defined as
    the negative log-likelihood of the multinomial distribution; it
    generalizes the binary cross-entropy to more than 2 classes::

        loss_i = log(sum(exp(raw_pred_{i, k}), k=0..n_classes-1))
                - sum(y_true_{i, k} * raw_pred_{i, k}, k=0..n_classes-1)

    See [1].

    Note that for the hessian, we calculate only the diagonal part in the
    classes: If the full hessian for classes k and l and sample i is H_i_k_l,
    we calculate H_i_k_k, i.e. k=l.
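
    A minimal sketch with ``n_classes=3`` (illustrative numbers)::

        import numpy as np

        loss = HalfMultinomialLoss(n_classes=3)
        y_true = np.array([0.0, 2.0])        # label encoded target classes
        raw_prediction = np.zeros((2, 3))    # uniform predicted probabilities
        loss.loss(y_true=y_true, raw_prediction=raw_prediction)
        # -> array([1.0986..., 1.0986...]), i.e. log(3) for both samples
        loss.predict_proba(raw_prediction)
        # -> array of shape (2, 3) filled with 1/3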

    References
    ----------
    .. [1] :arxiv:`Simon, Noah, J. Friedman and T. Hastie.
        "A Blockwise Descent Algorithm for Group-penalized Multiresponse and
        Multinomial Regression".
        <1311.6529>`
    """

    is_multiclass = True

    def __init__(self, sample_weight=None, n_classes=3):
        super().__init__(
            closs=CyHalfMultinomialLoss(),
            link=MultinomialLogit(),
            n_classes=n_classes,
        )
        self.interval_y_true = Interval(0, np.inf, True, False)
        self.interval_y_pred = Interval(0, 1, False, False)

    def in_y_true_range(self, y):
        """Return True if y is in the valid range of y_true.

        Parameters
        ----------
        y : ndarray
        """
        return self.interval_y_true.includes(y) and np.all(y.astype(int) == y)

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This is the softmax of the weighted average of the target, i.e. over
        the samples axis=0.
        """
        out = np.zeros(self.n_classes, dtype=y_true.dtype)
        eps = np.finfo(y_true.dtype).eps
        for k in range(self.n_classes):
            out[k] = np.average(y_true == k, weights=sample_weight, axis=0)
            out[k] = np.clip(out[k], eps, 1 - eps)
        return self.link.link(out[None, :]).reshape(-1)

    def predict_proba(self, raw_prediction):
        """Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples, n_classes)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, n_classes)
            Element-wise class probabilities.
        """
        return self.link.inverse(raw_prediction)

    def gradient_proba(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        gradient_out=None,
        proba_out=None,
        n_threads=1,
    ):
        """Compute gradient and class probabilities for raw_prediction.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : array of shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or array of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        proba_out : None or array of shape (n_samples, n_classes)
            A location into which the class probabilities are stored. If None,
            a new array might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : array of shape (n_samples, n_classes)
            Element-wise gradients.

        proba : array of shape (n_samples, n_classes)
            Element-wise class probabilities.
        """
        if gradient_out is None:
            if proba_out is None:
                gradient_out = np.empty_like(raw_prediction)
                proba_out = np.empty_like(raw_prediction)
            else:
                gradient_out = np.empty_like(proba_out)
        elif proba_out is None:
            proba_out = np.empty_like(gradient_out)

        y_true = ReadonlyArrayWrapper(y_true)
        raw_prediction = ReadonlyArrayWrapper(raw_prediction)
        if sample_weight is not None:
            sample_weight = ReadonlyArrayWrapper(sample_weight)
        return self.closs.gradient_proba(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            gradient_out=gradient_out,
            proba_out=proba_out,
            n_threads=n_threads,
        )


_LOSSES = {
    "squared_error": HalfSquaredError,
    "absolute_error": AbsoluteError,
    "pinball_loss": PinballLoss,
    "poisson_loss": HalfPoissonLoss,
    "gamma_loss": HalfGammaLoss,
    "tweedie_loss": HalfTweedieLoss,
    "binomial_loss": HalfBinomialLoss,
    "multinomial_loss": HalfMultinomialLoss,
}