U
    2d                     @   sj  d Z ddlZddlZddlmZmZ ddlmZm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZmZmZmZmZ dd
lmZmZ ddlmZ ddlmZmZ ddlmZ dd Z dd Z!dd Z"dd Z#dd Z$dddddZ%dddddZ&G dd  d eeZ'G d!d" d"e'Z(G d#d$ d$e'Z)G d%d& d&e'Z*G d'd( d(e'Z+G d)d* d*e'Z,G d+d, d,e'Z-dS )-zUnivariate features selection.    N)IntegralReal)specialstats)issparse   )BaseEstimator)LabelBinarizer)as_float_arraycheck_array	check_X_ysafe_sqr	safe_mask)safe_sparse_dot	row_norms)check_is_fitted)Interval
StrOptions   )SelectorMixinc                 C   s(   t | dd} t| jj| t| < | S )z
    Fixes Issue #1240: NaNs can't be properly compared, so change them to the
    smallest value of scores's dtype. -inf seems to be unreliable.
    T)copy)r
   npfinfodtypeminisnan)scores r   S/tmp/pip-unpacked-wheel-zrfo1fqw/sklearn/feature_selection/_univariate_selection.py_clean_nans   s    r   c                  G   sV  t | }dd | D } tdd | D }t|}tdd | D }dd | D }t|d }dd |D }||t|  }d	}	t| D ]\}
}|	||
 ||
  7 }	q|	|t| 8 }	||	 }|d
 }|| }|	t| }|t| }t|d	kd }t|d j|jkr*|jr*t	
d| t || }t| }t|||}||fS )a+  Perform a 1-way ANOVA.

    The one-way ANOVA tests the null hypothesis that 2 or more groups have
    the same population mean. The test is applied to samples from two or
    more groups, possibly with differing sizes.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    *args : {array-like, sparse matrix}
        Sample1, sample2... The sample measurements should be given as
        arguments.

    Returns
    -------
    f_statistic : float
        The computed F-value of the test.
    p_value : float
        The associated p-value from the F-distribution.

    Notes
    -----
    The ANOVA test has important assumptions that must be satisfied in order
    for the associated p-value to be valid.

    1. The samples are independent
    2. Each sample is from a normally distributed population
    3. The population standard deviations of the groups are all equal. This
       property is known as homoscedasticity.

    If these assumptions are not true for a given set of data, it may still be
    possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`_) although
    with some loss of power.

    The algorithm is from Heiman[2], pp.394-7.

    See ``scipy.stats.f_oneway`` that should give the same results while
    being less efficient.

    References
    ----------
    .. [1] Lowry, Richard.  "Concepts and Applications of Inferential
           Statistics". Chapter 14.
           http://faculty.vassar.edu/lowry/ch14pt1.html

    .. [2] Heiman, G.W.  Research Methods in Statistics. 2002.
    c                 S   s   g | ]}t |qS r   )r
   .0ar   r   r   
<listcomp>]   s     zf_oneway.<locals>.<listcomp>c                 S   s   g | ]}|j d  qS )r   )shaper    r   r   r   r#   ^   s     c                 s   s   | ]}t |jd dV  qdS )r   ZaxisN)r   sumr    r   r   r   	<genexpr>`   s     zf_oneway.<locals>.<genexpr>c                 S   s   g | ]}t |jd dqS )r   r%   )r   asarrayr&   r    r   r   r   r#   a   s     r   c                 S   s   g | ]}|d  qS )r   r   )r!   sr   r   r   r#   c   s             r   r   zFeatures %s are constant.)lenr   arrayr&   float	enumeratewhereZnonzerosizewarningswarnUserWarningr(   Zravelr   Zfdtrc)argsZ	n_classesZn_samples_per_class	n_samplesZ
ss_alldataZ	sums_argsZsquare_of_sums_alldataZsquare_of_sums_argsZsstotZssbnk_ZsswnZdfbnZdfwnZmsbZmswZconstant_features_idxfZprobr   r   r   f_oneway+   s2    1
 r9   c                    s:   t  dddgd\  fddtD }t| S )a  Compute the ANOVA F-value for the provided sample.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The set of regressors that will be tested sequentially.

    y : ndarray of shape (n_samples,)
        The target vector.

    Returns
    -------
    f_statistic : ndarray of shape (n_features,)
        F-statistic for each feature.

    p_values : ndarray of shape (n_features,)
        P-values associated with the F-statistic.

    See Also
    --------
    chi2 : Chi-squared stats of non-negative features for classification tasks.
    f_regression : F-value between label/feature for regression tasks.
    csrcsccoo)accept_sparsec                    s   g | ]} t  |k qS r   )r   )r!   r6   Xyr   r   r#      s     zf_classif.<locals>.<listcomp>)r   r   uniquer9   )r?   r@   r4   r   r>   r   	f_classifx   s    rB   c              	   C   sl   t j| t jd} t| }| }||8 }|dC }t jdd || }W 5 Q R X |jdd}|t|d |fS )zFast replacement for scipy.stats.chisquare.

    Version from https://github.com/scipy/scipy/pull/2525 with additional
    optimizations.
    r   r   ignore)invalidr   r%   r   )r   r(   float64r+   errstater&   r   Zchdtrc)Zf_obsZf_expr6   Zchisqr   r   r   
_chisquare   s    rH   c                 C   s   t | dtjtjfd} tt| r(| jn| dk r:tdtdd	|}|j
d dkrt| }tjd| |dd}t|j| }t|r| }| jdddd	}|jdddd	}t|j|}t||S )
a@  Compute chi-squared stats between each non-negative feature and class.

    This score can be used to select the `n_features` features with the
    highest values for the test chi-squared statistic from X, which must
    contain only **non-negative features** such as booleans or frequencies
    (e.g., term counts in document classification), relative to the classes.

    Recall that the chi-square test measures dependence between stochastic
    variables, so using this function "weeds out" the features that are the
    most likely to be independent of class and therefore irrelevant for
    classification.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Sample vectors.

    y : array-like of shape (n_samples,)
        Target vector (class labels).

    Returns
    -------
    chi2 : ndarray of shape (n_features,)
        Chi2 statistics for each feature.

    p_values : ndarray of shape (n_features,)
        P-values for each feature.

    See Also
    --------
    f_classif : ANOVA F-value between label/feature for classification tasks.
    f_regression : F-value between label/feature for regression tasks.

    Notes
    -----
    Complexity of this algorithm is O(n_classes * n_features).
    r:   r=   r   r   zInput X must be non-negative.T)Zsparse_outputr   r%   )r   r   rF   float32anyr   data
ValueErrorr	   Zfit_transformr$   Ztoarrayappendr   Tr&   ZreshapemeandotrH   )r?   r@   YZobservedZfeature_countZ
class_probexpectedr   r   r   chi2   s    -rU   Tcenterforce_finitec          	   	   C   s   t | |dddgtjd\} }| jd }|r|t| }t| rR| jdd }n| jdd}tt| j	dd||d	   }n
t| j	}t
|| }tjd
d
d || }|tj| }W 5 Q R X |rt| st|}d||< |S )a  Compute Pearson's r for each features and the target.

    Pearson's r is also known as the Pearson correlation coefficient.

    Linear model for testing the individual effect of each of many regressors.
    This is a scoring function to be used in a feature selection procedure, not
    a free standing feature selection procedure.

    The cross correlation between each regressor and the target is computed
    as::

        E[(X[:, i] - mean(X[:, i])) * (y - mean(y))] / (std(X[:, i]) * std(y))

    For more on usage see the :ref:`User Guide <univariate_feature_selection>`.

    .. versionadded:: 1.0

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The data matrix.

    y : array-like of shape (n_samples,)
        The target vector.

    center : bool, default=True
        Whether or not to center the data matrix `X` and the target vector `y`.
        By default, `X` and `y` will be centered.

    force_finite : bool, default=True
        Whether or not to force the Pearson's R correlation to be finite.
        In the particular case where some features in `X` or the target `y`
        are constant, the Pearson's R correlation is not defined. When
        `force_finite=False`, a correlation of `np.nan` is returned to
        acknowledge this case. When `force_finite=True`, this value will be
        forced to a minimal correlation of `0.0`.

        .. versionadded:: 1.1

    Returns
    -------
    correlation_coefficient : ndarray of shape (n_features,)
        Pearson's R correlation coefficients of features.

    See Also
    --------
    f_regression: Univariate linear regression tests returning f-statistic
        and p-values.
    mutual_info_regression: Mutual information for a continuous target.
    f_classif: ANOVA F-value between label/feature for classification tasks.
    chi2: Chi-squared stats of non-negative features for classification tasks.
    r:   r;   r<   rI   r   r%   T)Zsquaredr   rD   dividerE   r*   )r   r   rF   r$   rQ   r   ZgetA1sqrtr   rP   r   rG   ZlinalgZnormisfiniteallr   )	r?   r@   rW   rX   r5   ZX_meansZX_normscorrelation_coefficientZnan_maskr   r   r   r_regression   s"    5
"


r_   c             	   C   s   t | |||d}|j|rdnd }|d }tjddd& |d|  | }tj|d|}W 5 Q R X |rt| st	|}	t
|jj||	< t|}
d||
< d||
< ||fS )a  Univariate linear regression tests returning F-statistic and p-values.

    Quick linear model for testing the effect of a single regressor,
    sequentially for many regressors.

    This is done in 2 steps:

    1. The cross correlation between each regressor and the target is computed
       using :func:`r_regression` as::

           E[(X[:, i] - mean(X[:, i])) * (y - mean(y))] / (std(X[:, i]) * std(y))

    2. It is converted to an F score and then to a p-value.

    :func:`f_regression` is derived from :func:`r_regression` and will rank
    features in the same order if all the features are positively correlated
    with the target.

    Note however that contrary to :func:`f_regression`, :func:`r_regression`
    values lie in [-1, 1] and can thus be negative. :func:`f_regression` is
    therefore recommended as a feature selection criterion to identify
    potentially predictive feature for a downstream classifier, irrespective of
    the sign of the association with the target variable.

    Furthermore :func:`f_regression` returns p-values while
    :func:`r_regression` does not.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The data matrix.

    y : array-like of shape (n_samples,)
        The target vector.

    center : bool, default=True
        Whether or not to center the data matrix `X` and the target vector `y`.
        By default, `X` and `y` will be centered.

    force_finite : bool, default=True
        Whether or not to force the F-statistics and associated p-values to
        be finite. There are two cases where the F-statistic is expected to not
        be finite:

        - when the target `y` or some features in `X` are constant. In this
          case, the Pearson's R correlation is not defined leading to obtain
          `np.nan` values in the F-statistic and p-value. When
          `force_finite=True`, the F-statistic is set to `0.0` and the
          associated p-value is set to `1.0`.
        - when a feature in `X` is perfectly correlated (or
          anti-correlated) with the target `y`. In this case, the F-statistic
          is expected to be `np.inf`. When `force_finite=True`, the F-statistic
          is set to `np.finfo(dtype).max` and the associated p-value is set to
          `0.0`.

        .. versionadded:: 1.1

    Returns
    -------
    f_statistic : ndarray of shape (n_features,)
        F-statistic for each feature.

    p_values : ndarray of shape (n_features,)
        P-values associated with the F-statistic.

    See Also
    --------
    r_regression: Pearson's R between label/feature for regression tasks.
    f_classif: ANOVA F-value between label/feature for classification tasks.
    chi2: Chi-squared stats of non-negative features for classification tasks.
    SelectKBest: Select features based on the k highest scores.
    SelectFpr: Select features based on a false positive rate test.
    SelectFdr: Select features based on an estimated false discovery rate.
    SelectFwe: Select features based on family-wise error rate.
    SelectPercentile: Select features based on percentile of the highest
        scores.
    rV   r   r   rD   rY   r*   g      ?)r_   r0   r   rG   r   r8   Zsfr\   r]   isinfr   r   maxr   )r?   r@   rW   rX   r^   Zdeg_of_freedomZcorr_coef_squaredZf_statisticZp_valuesZmask_infZmask_nanr   r   r   f_regressionE  s$    P   

rb   c                   @   sD   e Zd ZU dZdegiZeed< dd Zdd Z	dd	 Z
d
d ZdS )_BaseFilterzInitialize the univariate feature selection.

    Parameters
    ----------
    score_func : callable
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues) or a single array with scores.
    
score_func_parameter_constraintsc                 C   s
   || _ d S Nrd   )selfrd   r   r   r   __init__  s    z_BaseFilter.__init__c                 C   s   |    | j||ddgdd\}}| || | ||}t|ttfrd|\| _| _t	
| j| _n|| _d| _t	
| j| _| S )a  Run score function on (X, y) and get the appropriate features.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.

        y : array-like of shape (n_samples,)
            The target values (class labels in classification, real numbers in
            regression).

        Returns
        -------
        self : object
            Returns the instance itself.
        r:   r;   T)r=   Zmulti_outputN)Z_validate_paramsZ_validate_data_check_paramsrd   
isinstancelisttuplescores_pvalues_r   r(   )rh   r?   r@   Zscore_func_retr   r   r   fit  s        
z_BaseFilter.fitc                 C   s   d S rf   r   rh   r?   r@   r   r   r   rj     s    z_BaseFilter._check_paramsc                 C   s   ddiS )NZ
requires_yTr   rh   r   r   r   
_more_tags  s    z_BaseFilter._more_tagsN)__name__
__module____qualname____doc__callablere   dict__annotations__ri   rp   rj   rs   r   r   r   r   rc     s   
	$rc   c                       sX   e Zd ZU dZejdeeddddgiZee	d< e
fdd	 fd
dZdd Z  ZS )SelectPercentilea	  Select features according to a percentile of the highest scores.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable, default=f_classif
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues) or a single array with scores.
        Default is f_classif (see below "See Also"). The default function only
        works with classification tasks.

        .. versionadded:: 0.18

    percentile : int, default=10
        Percent of features to keep.

    Attributes
    ----------
    scores_ : array-like of shape (n_features,)
        Scores of features.

    pvalues_ : array-like of shape (n_features,)
        p-values of feature scores, None if `score_func` returned only scores.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    f_classif : ANOVA F-value between label/feature for classification tasks.
    mutual_info_classif : Mutual information for a discrete target.
    chi2 : Chi-squared stats of non-negative features for classification tasks.
    f_regression : F-value between label/feature for regression tasks.
    mutual_info_regression : Mutual information for a continuous target.
    SelectKBest : Select features based on the k highest scores.
    SelectFpr : Select features based on a false positive rate test.
    SelectFdr : Select features based on an estimated false discovery rate.
    SelectFwe : Select features based on family-wise error rate.
    GenericUnivariateSelect : Univariate feature selector with configurable
        mode.

    Notes
    -----
    Ties between features with equal scores will be broken in an unspecified
    way.

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.feature_selection import SelectPercentile, chi2
    >>> X, y = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> X_new = SelectPercentile(chi2, percentile=10).fit_transform(X, y)
    >>> X_new.shape
    (1797, 7)
    
percentiler   d   bothclosedre   
   )r|   c                   s   t  j|d || _d S Nrg   )superri   r|   )rh   rd   r|   	__class__r   r   ri   7  s    zSelectPercentile.__init__c                 C   s   t |  | jdkr&tjt| jtdS | jdkrDtjt| jtdS t| j}t|d| j }||k}t	||kd }t|rt
t|| j d }|d ||   }d||< |S )Nr}   rC   r   T)r   r|   r   onesr+   rn   boolzerosr   r/   intr&   )rh   r   	thresholdmaskZtiesZ	max_featsZ	kept_tiesr   r   r   _get_support_mask;  s    


z"SelectPercentile._get_support_maskrt   ru   rv   rw   rc   re   r   r   ry   rz   rB   ri   r   __classcell__r   r   r   r   r{     s   
D r{   c                	       sh   e Zd ZU dZejdedheeddddgiZe	e
d< efd	d
 fddZdd Zdd Z  ZS )SelectKBesta]	  Select features according to the k highest scores.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable, default=f_classif
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues) or a single array with scores.
        Default is f_classif (see below "See Also"). The default function only
        works with classification tasks.

        .. versionadded:: 0.18

    k : int or "all", default=10
        Number of top features to select.
        The "all" option bypasses selection, for use in a parameter search.

    Attributes
    ----------
    scores_ : array-like of shape (n_features,)
        Scores of features.

    pvalues_ : array-like of shape (n_features,)
        p-values of feature scores, None if `score_func` returned only scores.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    f_classif: ANOVA F-value between label/feature for classification tasks.
    mutual_info_classif: Mutual information for a discrete target.
    chi2: Chi-squared stats of non-negative features for classification tasks.
    f_regression: F-value between label/feature for regression tasks.
    mutual_info_regression: Mutual information for a continuous target.
    SelectPercentile: Select features based on percentile of the highest
        scores.
    SelectFpr : Select features based on a false positive rate test.
    SelectFdr : Select features based on an estimated false discovery rate.
    SelectFwe : Select features based on family-wise error rate.
    GenericUnivariateSelect : Univariate feature selector with configurable
        mode.

    Notes
    -----
    Ties between features with equal scores will be broken in an unspecified
    way.

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.feature_selection import SelectKBest, chi2
    >>> X, y = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> X_new = SelectKBest(chi2, k=20).fit_transform(X, y)
    >>> X_new.shape
    (1797, 20)
    r6   r]   r   Nleftr   re   r   )r6   c                   s   t  j|d || _d S r   )r   ri   r6   )rh   rd   r6   r   r   r   ri     s    zSelectKBest.__init__c                 C   s>   t | jts:| j|jd kr:td|jd  d| j dd S )Nr   zk should be <= n_features = z; got z%. Use k='all' to return all features.)rk   r6   strr$   rN   rq   r   r   r   rj     s    zSelectKBest._check_paramsc                 C   s   t |  | jdkr$tj| jjtdS | jdkr@tj| jjtdS t| j}tj|jtd}d|tj	|dd| j d  < |S d S )Nr]   rC   r   r   Z	mergesort)kind)
r   r6   r   r   rn   r$   r   r   r   Zargsort)rh   r   r   r   r   r   r     s    


zSelectKBest._get_support_mask)rt   ru   rv   rw   rc   re   r   r   r   ry   rz   rB   ri   rj   r   r   r   r   r   r   r   O  s   
F r   c                       sX   e Zd ZU dZejdeeddddgiZee	d< e
fdd	 fd
dZdd Z  ZS )	SelectFpra  Filter: Select the pvalues below alpha based on a FPR test.

    FPR test stands for False Positive Rate test. It controls the total
    amount of false detections.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable, default=f_classif
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues).
        Default is f_classif (see below "See Also"). The default function only
        works with classification tasks.

    alpha : float, default=5e-2
        Features with p-values less than `alpha` are selected.

    Attributes
    ----------
    scores_ : array-like of shape (n_features,)
        Scores of features.

    pvalues_ : array-like of shape (n_features,)
        p-values of feature scores.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    f_classif : ANOVA F-value between label/feature for classification tasks.
    chi2 : Chi-squared stats of non-negative features for classification tasks.
    mutual_info_classif: Mutual information for a discrete target.
    f_regression : F-value between label/feature for regression tasks.
    mutual_info_regression : Mutual information for a continuous target.
    SelectPercentile : Select features based on percentile of the highest
        scores.
    SelectKBest : Select features based on the k highest scores.
    SelectFdr : Select features based on an estimated false discovery rate.
    SelectFwe : Select features based on family-wise error rate.
    GenericUnivariateSelect : Univariate feature selector with configurable
        mode.

    Examples
    --------
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.feature_selection import SelectFpr, chi2
    >>> X, y = load_breast_cancer(return_X_y=True)
    >>> X.shape
    (569, 30)
    >>> X_new = SelectFpr(chi2, alpha=0.01).fit_transform(X, y)
    >>> X_new.shape
    (569, 16)
    alphar   r   r~   r   re   皙?r   c                   s   t  j|d || _d S r   r   ri   r   rh   rd   r   r   r   r   ri     s    zSelectFpr.__init__c                 C   s   t |  | j| jk S rf   )r   ro   r   rr   r   r   r   r      s    zSelectFpr._get_support_maskr   r   r   r   r   r     s   
A r   c                       sX   e Zd ZU dZejdeeddddgiZee	d< e
fdd	 fd
dZdd Z  ZS )	SelectFdra6	  Filter: Select the p-values for an estimated false discovery rate.

    This uses the Benjamini-Hochberg procedure. ``alpha`` is an upper bound
    on the expected false discovery rate.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable, default=f_classif
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues).
        Default is f_classif (see below "See Also"). The default function only
        works with classification tasks.

    alpha : float, default=5e-2
        The highest uncorrected p-value for features to keep.

    Attributes
    ----------
    scores_ : array-like of shape (n_features,)
        Scores of features.

    pvalues_ : array-like of shape (n_features,)
        p-values of feature scores.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    f_classif : ANOVA F-value between label/feature for classification tasks.
    mutual_info_classif : Mutual information for a discrete target.
    chi2 : Chi-squared stats of non-negative features for classification tasks.
    f_regression : F-value between label/feature for regression tasks.
    mutual_info_regression : Mutual information for a continuous target.
    SelectPercentile : Select features based on percentile of the highest
        scores.
    SelectKBest : Select features based on the k highest scores.
    SelectFpr : Select features based on a false positive rate test.
    SelectFwe : Select features based on family-wise error rate.
    GenericUnivariateSelect : Univariate feature selector with configurable
        mode.

    References
    ----------
    https://en.wikipedia.org/wiki/False_discovery_rate

    Examples
    --------
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.feature_selection import SelectFdr, chi2
    >>> X, y = load_breast_cancer(return_X_y=True)
    >>> X.shape
    (569, 30)
    >>> X_new = SelectFdr(chi2, alpha=0.01).fit_transform(X, y)
    >>> X_new.shape
    (569, 16)
    r   r   r   r~   r   re   r   r   c                   s   t  j|d || _d S r   r   r   r   r   r   ri   P  s    zSelectFdr.__init__c                 C   sl   t |  t| j}t| j}||t| j| td|d  k }|jdkr^tj	| jt
dS | j| kS )Nr   r   rC   )r   r+   ro   r   sortr-   r   Zaranger0   Z
zeros_liker   ra   )rh   Z
n_featuressvselectedr   r   r   r   T  s    
 
zSelectFdr._get_support_maskr   r   r   r   r   r     s   
E r   c                       sX   e Zd ZU dZejdeeddddgiZee	d< e
fdd	 fd
dZdd Z  ZS )	SelectFwea  Filter: Select the p-values corresponding to Family-wise error rate.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable, default=f_classif
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues).
        Default is f_classif (see below "See Also"). The default function only
        works with classification tasks.

    alpha : float, default=5e-2
        The highest uncorrected p-value for features to keep.

    Attributes
    ----------
    scores_ : array-like of shape (n_features,)
        Scores of features.

    pvalues_ : array-like of shape (n_features,)
        p-values of feature scores.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    f_classif : ANOVA F-value between label/feature for classification tasks.
    chi2 : Chi-squared stats of non-negative features for classification tasks.
    f_regression : F-value between label/feature for regression tasks.
    SelectPercentile : Select features based on percentile of the highest
        scores.
    SelectKBest : Select features based on the k highest scores.
    SelectFpr : Select features based on a false positive rate test.
    SelectFdr : Select features based on an estimated false discovery rate.
    GenericUnivariateSelect : Univariate feature selector with configurable
        mode.

    Examples
    --------
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.feature_selection import SelectFwe, chi2
    >>> X, y = load_breast_cancer(return_X_y=True)
    >>> X.shape
    (569, 30)
    >>> X_new = SelectFwe(chi2, alpha=0.01).fit_transform(X, y)
    >>> X_new.shape
    (569, 15)
    r   r   r   r~   r   re   r   r   c                   s   t  j|d || _d S r   r   r   r   r   r   ri     s    zSelectFwe.__init__c                 C   s   t |  | j| jt| j k S rf   )r   ro   r   r+   rr   r   r   r   r     s    zSelectFwe._get_support_maskr   r   r   r   r   r   a  s   
< r   c                       s   e Zd ZU dZeeeeedZ	e
ed< ejeee	 geeddddedhgd	Ze
ed
< efddd	 fddZdd Zdd Zdd Zdd Z  ZS )GenericUnivariateSelecta
	  Univariate feature selector with configurable strategy.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable, default=f_classif
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues). For modes 'percentile' or 'kbest' it can return
        a single array scores.

    mode : {'percentile', 'k_best', 'fpr', 'fdr', 'fwe'}, default='percentile'
        Feature selection mode.

    param : "all", float or int, default=1e-5
        Parameter of the corresponding mode.

    Attributes
    ----------
    scores_ : array-like of shape (n_features,)
        Scores of features.

    pvalues_ : array-like of shape (n_features,)
        p-values of feature scores, None if `score_func` returned scores only.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    f_classif : ANOVA F-value between label/feature for classification tasks.
    mutual_info_classif : Mutual information for a discrete target.
    chi2 : Chi-squared stats of non-negative features for classification tasks.
    f_regression : F-value between label/feature for regression tasks.
    mutual_info_regression : Mutual information for a continuous target.
    SelectPercentile : Select features based on percentile of the highest
        scores.
    SelectKBest : Select features based on the k highest scores.
    SelectFpr : Select features based on a false positive rate test.
    SelectFdr : Select features based on an estimated false discovery rate.
    SelectFwe : Select features based on family-wise error rate.

    Examples
    --------
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.feature_selection import GenericUnivariateSelect, chi2
    >>> X, y = load_breast_cancer(return_X_y=True)
    >>> X.shape
    (569, 30)
    >>> transformer = GenericUnivariateSelect(chi2, mode='k_best', param=20)
    >>> X_new = transformer.fit_transform(X, y)
    >>> X_new.shape
    (569, 20)
    )r|   Zk_bestZfprZfdrZfwe_selection_modesr   Nr   r   r]   )modeparamre   r|   gh㈵>c                   s   t  j|d || _|| _d S r   )r   ri   r   r   )rh   rd   r   r   r   r   r   ri      s    z GenericUnivariateSelect.__init__c                 C   s@   | j | j | jd}| }|d |jf |d | ji |S )Nrg   rd   r   )r   r   rd   Z_get_param_namesremoveZ
set_paramsr   )rh   selectorZpossible_paramsr   r   r   _make_selector  s
    
z&GenericUnivariateSelect._make_selectorc                 C   s   dt jt jgiS )NZpreserves_dtype)r   rF   rK   rr   r   r   r   rs     s    z"GenericUnivariateSelect._more_tagsc                 C   s   |   || d S rf   )r   rj   rq   r   r   r   rj     s    z%GenericUnivariateSelect._check_paramsc                 C   s(   t |  |  }| j|_| j|_| S rf   )r   r   ro   rn   r   )rh   r   r   r   r   r     s
    z)GenericUnivariateSelect._get_support_mask)rt   ru   rv   rw   r{   r   r   r   r   r   ry   rz   rc   re   r   setkeysr   r   rB   ri   r   rs   rj   r   r   r   r   r   r   r     s    
@	r   ).rw   Znumpyr   r1   Znumbersr   r   Zscipyr   r   Zscipy.sparser   baser   Zpreprocessingr	   utilsr
   r   r   r   r   Zutils.extmathr   r   Zutils.validationr   Zutils._param_validationr   r   _baser   r   r9   rB   rH   rU   r_   rb   rc   r{   r   r   r   r   r   r   r   r   r   <module>   s4   MHSl=agP[Q