U
    3d%B                     @   s   d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	Z
ddlZddlmZ ddlmZ dd	lmZmZmZ eeed
ddZeee
jd
ddZdd ZdddZdd ZdddZdS )z9Implementation of ARFF parsers: via LIAC-ARFF and pandas.    N)OrderedDict)	Generator)List   )_arff)ArffSparseDataType)_chunk_generatorcheck_pandas_supportget_chunk_n_rows)	arff_datainclude_columnsreturnc                 C   s   t  t  t  f}dd t|D }t| d | d | d D ]@\}}}||kr:|d | |d | |d ||  q:|S )a  Obtains several columns from sparse ARFF representation. Additionally,
    the column indices are re-labelled, given the columns that are not
    included. (e.g., when including [1, 2, 3], the columns will be relabelled
    to [0, 1, 2]).

    Parameters
    ----------
    arff_data : tuple
        A tuple of three lists of equal size; first list indicating the value,
        second the x coordinate and the third the y coordinate.

    include_columns : list
        A list of columns to include.

    Returns
    -------
    arff_data_new : tuple
        Subset of arff data with only the include columns indicated by the
        include_columns argument.
    c                 S   s   i | ]\}}||qS  r   .0Z	array_idxZ
column_idxr   r   A/tmp/pip-unpacked-wheel-zrfo1fqw/sklearn/datasets/_arff_parser.py
<dictcomp>-   s     z)_split_sparse_columns.<locals>.<dictcomp>r      r   )list	enumeratezipappend)r   r   Zarff_data_newreindexed_columnsvalrow_idxcol_idxr   r   r   _split_sparse_columns   s    "r   c           	      C   s~   t | d d }|t|f}dd t|D }tj|tjd}t| d | d | d D ]"\}}}||krV||||| f< qV|S )Nr   c                 S   s   i | ]\}}||qS r   r   r   r   r   r   r   ?   s     z)_sparse_data_to_array.<locals>.<dictcomp>dtyper   r   )maxlenr   npemptyfloat64r   )	r   r   num_obsZy_shaper   yr   r   r   r   r   r   _sparse_data_to_array8   s    "r&   c                 C   sD   | | }t |dkr| | }nt |dkr8| |d  }nd}||fS )a  Post process a dataframe to select the desired columns in `X` and `y`.

    Parameters
    ----------
    frame : dataframe
        The dataframe to split into `X` and `y`.

    feature_names : list of str
        The list of feature names to populate `X`.

    target_names : list of str
        The list of target names to populate `y`.

    Returns
    -------
    X : dataframe
        The dataframe containing the features.

    y : {series, dataframe} or None
        The series or dataframe containing the target.
    r   r   r   N)r    )frameZfeature_namesZtarget_namesXr%   r   r   r   _post_process_frameJ   s    
r)   c           "         sh  dd }|| }|dkrt jnt j}|dk }	t j|||	d}
|| fdd|
d D  |dkrtd	}t|
d }t| }t|
d
 }|j	|g|d}|j
dd }t|}fdd|D }|| g}t|
d
 |D ]}||j	||d|  q|j|dd}~~i }|jD ]P}| d }| dkrFd||< n&| dkr^d||< n|j| ||< q||}t|||\}n|
d
 }fdd|D }fdd|D }t|tr@|dkrtd|d dkrd}n|d |d  }tjtj|d|d}|j| }|dd|f }|dd|f nt|trt||}t |d d }|t!|f} t"j#j$|d |d |d ff| tj%d }|& }t'||ntd!t(|  fd"d#|D }!|!sn<t)|!rt* fd$dt+|D nt,|!rtd%j-d dkr2d&nj-d dkrFd|dkr\||dfS |d fS )'a  ARFF parser using the LIAC-ARFF library coded purely in Python.

    This parser is quite slow but consumes a generator. Currently it is needed
    to parse sparse datasets. For dense datasets, it is recommended to instead
    use the pandas-based parser, although it does not always handles the
    dtypes exactly the same.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The file compressed to be read.

    output_arrays_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities ara:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected.

    target_names_to_select : list of str
        A list of the target names to be selected.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    c                 s   s   | D ]}| dV  qd S )Nutf-8)decode)	gzip_fileliner   r   r   _io_to_generator   s    z+_liac_arff_parser.<locals>._io_to_generatorsparsepandas)return_typeencode_nominalc                    s(   i | ] \}}t |tr| kr||qS r   )
isinstancer   )r   namecatcolumns_to_selectr   r   r      s
   
  z%_liac_arff_parser.<locals>.<dictcomp>
attributeszfetch_openml with as_frame=Truedata)columnsT)deepc                    s   g | ]}| kr|qS r   r   r   colr6   r   r   
<listcomp>   s      z%_liac_arff_parser.<locals>.<listcomp>)Zignore_index	data_typeintegerInt64nominalcategoryc                    s   g | ]}t  | d  qS indexintr   col_nameopenml_columns_infor   r   r>      s   c                    s   g | ]}t  | d  qS rD   rF   rH   rJ   r   r   r>      s   Nz6shape must be provided when arr['data'] is a Generatorr   r   r#   )r   countr   )shaper   z-Unexpected type for data obtained from arff: c                    s   h | ]}| kqS r   r   rH   )
categoriesr   r   	<setcomp>
  s    z$_liac_arff_parser.<locals>.<setcomp>c              
      sJ   g | ]B\}}t t j |d ddd||d f jtddqS )Or   Nr   F)copy)r!   ZtakeZasarraypopastyperG   )r   irI   )rO   r%   r   r   r>     s
    zAMix of nominal and non-nominal targets is not currently supported)rL   ).r   ZCOOZ	DENSE_GENloadr	   r   r   keysnextZ	DataFrameZmemory_usagesumr
   r   r   concatr:   lowerdtypesrT   r)   r3   r   
ValueErrorr!   Zfromiter	itertoolschainfrom_iterableZreshapetupler   r   r    spr/   Z
coo_matrixr#   Ztocsrr&   typeallZhstackr   anyrN   )"r,   output_arrays_typerK   feature_names_to_selecttarget_names_to_selectrN   r.   streamr1   r2   Zarff_containerpdZcolumns_infoZcolumns_namesZ	first_rowZfirst_dfZ	row_bytes	chunksizecolumns_to_keepdfsr9   r'   r\   r4   column_dtyper(   r   Zfeature_indices_to_selectZtarget_indices_to_selectrM   Zarff_data_Xr$   ZX_shapeZis_classificationr   )rO   r7   rK   r%   r   _liac_arff_parserj   s    7
  






  







	

ro   c              
      s`  ddl | D ]}|d dr q*qi }|D ]:}|| d }| dkrXd||< q2| dkr2d	||< q2j| dd
gdddd |D |dd}	||   fdd|	jD }
|	|
 }	tdfdd}fdd|	j	 D }|D ]}|	| j
||	|< qt|	||\}}|dkr*|||	dfS | |  }}fdd|	j	 D }||d|fS )a  ARFF parser using `pandas.read_csv`.

    This parser uses the metadata fetched directly from OpenML and skips the metadata
    headers of ARFF file itself. The data is loaded as a CSV file.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The GZip compressed file with the ARFF formatted payload.

    output_arrays_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities are:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected to build `X`.

    target_names_to_select : list of str
        A list of the target names to be selected to build `y`.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    r   Nr*   z@datar?   r@   rA   rB   rC   ?%"c                 S   s   g | ]}|qS r   r   )r   r4   r   r   r   r>   v  s     z'_pandas_arff_parser.<locals>.<listcomp>T)headerZ	na_valuescomment	quotecharnamesr   skipinitialspacec                    s   g | ]}| kr|qS r   r   r<   r6   r   r   r>   |  s      z^'(?P<contents>.*)'$c                    s"   t  | }|d kr| S |dS )Ncontents)researchgroup)Zinput_stringmatch)single_quote_patternr   r   strip_single_quotes  s    z0_pandas_arff_parser.<locals>.strip_single_quotesc                    s"   g | ]\}} j j|r|qS r   )apitypesis_categorical_dtyper   r4   r   rj   r   r   r>     s   r0   c                    s*   i | ]"\}} j j|r||j qS r   )r   r   r   rO   tolistr   r   r   r   r     s    z'_pandas_arff_parser.<locals>.<dictcomp>)r0   r+   r[   
startswithZread_csvr:   ry   compiler\   itemsr5   Zrename_categoriesr)   Zto_numpy)r,   rf   rK   rg   rh   r-   r\   r4   rn   r'   rl   r~   Zcategorical_columnsr=   r(   r%   rO   r   )r7   rj   r}   r   _pandas_arff_parser+  sN    3





r   c                 C   sF   |dkrt | |||||S |dkr2t| ||||S td| ddS )a  Load a compressed ARFF file using a given parser.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The file compressed to be read.

    parser : {"pandas", "liac-arff"}
        The parser used to parse the ARFF file. "pandas" is recommended
        but only supports loading dense datasets.

    output_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities ara:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected.

    target_names_to_select : list of str
        A list of the target names to be selected.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    z	liac-arffr0   zUnknown parser: 'z%'. Should be 'liac-arff' or 'pandas'.N)ro   r   r]   )r,   parseroutput_typerK   rg   rh   rN   r   r   r   load_arff_from_gzip_file  s(    6
r   )N)N)__doc__r^   ry   collectionsr   collections.abcr   typingr   Znumpyr!   Zscipyrb   Z	externalsr   Zexternals._arffr   utilsr   r	   r
   r   Zndarrayr&   r)   ro   r   r   r   r   r   r   <module>   s4    $ & 
 B  