U
    3d1                  	   @   s   d Z ddlZddlmZ ddlZddlZddlmZmZ ddl	Z
ddlZddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ eddddZeddddZeeZdddddddddddZdddZdd ZdS ) zKDDCUP 99 dataset.

A classic dataset for anomaly detection.

The dataset page is available from UCI Machine Learning Repository

https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data.gz

    N)GzipFile)existsjoin   )_fetch_remote)_convert_data_dataframe)get_data_home)RemoteFileMetadata)
load_descr   )Bunch)check_random_state)shuffleZkddcup99_dataz.https://ndownloader.figshare.com/files/5976045Z@3b6c942aa0356c0ca35b7b595a26c89d343652c9db428893e7494f837b274292)filenameurlZchecksumZkddcup99_10_dataz.https://ndownloader.figshare.com/files/5976042Z@8045aca0d84e70e622d1148d7df782496f6333bf6eb979a1b0837c42a9fd9561FT)subset	data_homer   random_state	percent10download_if_missing
return_X_yas_framec                 C   s  t |d}t|||d}|j}	|j}
|j}|j}| dkr|
dk}t|}|	|ddf }|
| }|	|ddf }|
| }|jd }t	|}|
d|d}|| }|| }tj||f }	tj||f }
| dks| d	ks| d
kr"|	dddf dk}tj|	|ddf |	|ddf f }	|dd |dd  }|
| }
t|	dddf d jtdd|	dddf< t|	dddf d jtdd|	dddf< t|	dddf d jtdd|	dddf< | d	krB|	dddf dk}|	| }	|
| }
tj|	dddf |	dddf |	dddf f }	|d |d |d g}| d
kr|	dddf dk}|	| }	|
| }
tj|	dddf |	dddf |	dddf f }	|d |d |d g}| dkr"tj|	dddf |	dddf |	dddf |	dddf f }	|d |d |d |d g}|r:t|	|
|d\}	}
td}d}|rbtd|	|
||\}}	}
|rp|	|
fS t|	|
||||dS )a  Load the kddcup99 dataset (classification).

    Download it if necessary.

    =================   ====================================
    Classes                                               23
    Samples total                                    4898431
    Dimensionality                                        41
    Features            discrete (int) or continuous (float)
    =================   ====================================

    Read more in the :ref:`User Guide <kddcup99_dataset>`.

    .. versionadded:: 0.18

    Parameters
    ----------
    subset : {'SA', 'SF', 'http', 'smtp'}, default=None
        To return the corresponding classical subsets of kddcup 99.
        If None, return the entire kddcup 99 dataset.

    data_home : str, default=None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

        .. versionadded:: 0.19

    shuffle : bool, default=False
        Whether to shuffle dataset.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for dataset shuffling and for
        selection of abnormal samples if `subset='SA'`. Pass an int for
        reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    percent10 : bool, default=True
        Whether to load only 10 percent of the data.

    download_if_missing : bool, default=True
        If False, raise a IOError if the data is not locally available
        instead of trying to download the data from the source site.

    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object. See
        below for more information about the `data` and `target` object.

        .. versionadded:: 0.20

    as_frame : bool, default=False
        If `True`, returns a pandas Dataframe for the ``data`` and ``target``
        objects in the `Bunch` returned object; `Bunch` return object will also
        have a ``frame`` member.

        .. versionadded:: 0.24

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (494021, 41)
            The data matrix to learn. If `as_frame=True`, `data` will be a
            pandas DataFrame.
        target : {ndarray, series} of shape (494021,)
            The regression target for each sample. If `as_frame=True`, `target`
            will be a pandas Series.
        frame : dataframe of shape (494021, 42)
            Only present when `as_frame=True`. Contains `data` and `target`.
        DESCR : str
            The full description of the dataset.
        feature_names : list
            The names of the dataset columns
        target_names: list
            The names of the target columns

    (data, target) : tuple if ``return_X_y`` is True
        A tuple of two ndarray. The first containing a 2D array of
        shape (n_samples, n_features) with each row representing one
        sample and each column representing the features. The second
        ndarray of shape (n_samples,) containing the target samples.

        .. versionadded:: 0.20
    r   )r   r   r   ZSAs   normal.Nr   i1  ZSFhttpsmtp   r      g?F)copy      r   s   https   smtp)r   zkddcup99.rstfetch_kddcup99)datatargetframetarget_namesfeature_namesZDESCR)r   _fetch_brute_kddcup99r!   r"   r%   r$   npZlogical_notshaper   randintZr_Zc_logastypefloatshuffle_methodr
   r   r   )r   r   r   r   r   r   r   r   kddcup99r!   r"   r%   r$   stZnormal_samplesZnormal_targetsZabnormal_samplesZabnormal_targetsZn_samples_abnormalrZfdescrr#    r2   >/tmp/pip-unpacked-wheel-zrfo1fqw/sklearn/datasets/_kddcup99.pyr    1   s    _


&000
4
4
B    
r    c              *   C   s  t | d} d}|r&t| d| }t}nt| d| }t}t|d}t|d}t|}dtfdd	d
dtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfdtfd tfd!tfd"tfd#tfd$tfd%tfd&tfd'tfd(tfd)tfd*tfd+tfd,tfd-tfd.tfd/tfd0g*}	d1d2 |	D }
|
d3 }|
d4d3 }|rzt|}t|}W n: t	k
r } zt
d5t| d6|W 5 d4}~X Y nX n8|rt| td7|j  t||d8 t|	}td9 t||j}t|d:d;}g }| D ]&}| }||d<d=d> q&|  td? t| tj|td@}t dAD ],}|d4d4|f !|| |d4d4|f< q|d4d4d4d3f }|d4d4d3f }tj"||dBdC tj"||dBdC nt
dDt#||||gdES )Fa  Load the kddcup99 dataset, downloading it if necessary.

    Parameters
    ----------
    data_home : str, default=None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing : bool, default=True
        If False, raise a IOError if the data is not locally available
        instead of trying to download the data from the source site.

    percent10 : bool, default=True
        Whether to load only 10 percent of the data.

    Returns
    -------
    dataset : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : ndarray of shape (494021, 41)
            Each row corresponds to the 41 features in the dataset.
        target : ndarray of shape (494021,)
            Each value corresponds to one of the 21 attack types or to the
            label 'normal.'.
        feature_names : list
            The names of the dataset columns
        target_names: list
            The names of the target columns
        DESCR : str
            Description of the kddcup99 dataset.

    r   z-py3Zkddcup99_10r.   Zsamplestargetsduration)Zprotocol_typeZS4)ZserviceZS11)flagZS6Z	src_bytesZ	dst_bytesZlandZwrong_fragmentZurgentZhotZnum_failed_loginsZ	logged_inZnum_compromisedZ
root_shellZsu_attemptedZnum_rootZnum_file_creationsZ
num_shellsZnum_access_filesZnum_outbound_cmdsZis_host_loginZis_guest_logincountZ	srv_countZserror_rateZsrv_serror_rateZrerror_rateZsrv_rerror_rateZsame_srv_rateZdiff_srv_rateZsrv_diff_host_rateZdst_host_countZdst_host_srv_countZdst_host_same_srv_rateZdst_host_diff_srv_rateZdst_host_same_src_port_rateZdst_host_srv_diff_host_rateZdst_host_serror_rateZdst_host_srv_serror_rateZdst_host_rerror_rateZdst_host_srv_rerror_rate)labelsZS16c                 S   s   g | ]}|d  qS )r   r2   ).0cr2   r2   r3   
<listcomp>F  s     z)_fetch_brute_kddcup99.<locals>.<listcomp>Nz7The cache for fetch_kddcup99 is invalid, please delete z! and run the fetch_kddcup99 againzDownloading %s)dirnamezextracting archiver1   )r   mode
 ,zextraction done)dtype*   r   )compressz1Data not found and `download_if_missing` is False)r!   r"   r%   r$   )$r   r   ARCHIVE_10_PERCENTARCHIVEr   intr,   joblibload	ExceptionIOErrorstr_mkdirploggerinfor   r   r'   rB   debugr   r   	readlinesdecodeappendreplacesplitcloseosremoveZasarrayobjectranger+   dumpr   )r   r   r   Z
dir_suffixZ
kddcup_dirarchiveZsamples_pathZtargets_path	availabledtZcolumn_namesr$   r%   XyeZDTarchive_pathfile_ZXylinejr2   r2   r3   r&      s    #


-




*r&   c              
   C   sD   zt |  W n0 tk
r> } z|jtjkr. W 5 d}~X Y nX dS )zgEnsure directory d exists (like mkdir -p on Unix)
    No guarantee that the directory is writable.
    N)rW   makedirsOSErrorerrnoEEXIST)dra   r2   r2   r3   rM   {  s
    rM   )NTT) __doc__rh   gzipr   loggingrW   os.pathr   r   Znumpyr'   rH   _baser   r   r@   r   r	   r
   utilsr   r   r   r-   rF   rE   	getLogger__name__rN   r    r&   rM   r2   r2   r2   r3   <module>   sN   

 8
 