U
    3d                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlZddlZddlZddlZddlmZ ddlmZmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlm Z m!Z!m"Z"m#Z# dZ$dZ%G dd dZ&eeddZdd Z'eej()ddddidddfdddddddfdddidddfdddddddfdddid d!dfdd"ddd d!dfd#dd#id$d%d&fd'dd'id(d)dfd'd*d+id(d)dfd,dd,id!d-dfd,d*d.id!d-dfd/dd/id0d$dfgej()d1d2d3gej()d4dd5gd6d7 Z*eej()ddddidddfdddddddfdddidddfdddddddfdddid d!dfdd"ddd d!dfd#dd#id$d%d&fd'dd'id(d)dfd'd*d+id(d)dfd,dd,id!d-dfd,d*d.id!d-dfgej()d1d2d3gd8d9 Z+eej()ddd'd/gd:d; Z,eej()d1d2d3gd<d= Z-eej()d1d2d3gd>d? Z.eej()d1d2d3gej()d@dAdAdBggdCdD Z/eej()ddddd#d'gej()d1d2d3gdEdF Z0eej()dddd#d'gej()d1d2d3gdGdH Z1edIdJ Z2ej3dKdLdMdN Z4ej3dKdLdOdP Z5eej()dQdRdSdTdUdVdWdXdYdZd[d\d]d^d_gej()d4dd5gd`da Z6ej(7dbej()dcd1ddidefdfddidgfgdhdi Z8ej()djddkdldkdkdld5d3dlgdmdn Z9dodp Z:ej(7dqej(7dbej()dcd1d3idrfdfdidsfd3ddtdsfgdudv Z;eej(7dqej()dwdxdygdzd{ Z<ed|d} Z=ej()d4dd5gd~d Z>ej()d4dd5gdd Z?ej()d4dd5gej()d1d2d3gdd Z@ej()d4dd5gej()dddiddddgdd ZAej()d4dd5gej()ddd*dieBdfddddgdeBdfd/d/d5deBdfdddddeBdfdddd5deBdfddddeCdfddddgdeCdfgej()d1d2d3gdd ZDej()dddddeBdfdddeBdfddddeBdfi eBdfgdd ZEej()d4dd5gdd ZFej()d4dd5gdd ZGej()d4dd5gdd ZHej()d4dd5gdd ZIej()ddd5gdd ZJdd ZKdd ZLej()d4dd5gdd ZMeej()dddddgdd ZNdd ZOej()d4dd5gej()d1ddd ZPdd ZQddÄ ZRddń ZSdS )zTest the openml loader.    N)partial)BytesIO	HTTPError)config_context)Bunchcheck_pandas_support)_open_binary)SkipTestassert_allcloseassert_array_equalfails_if_pypy)fetch_openml)_OPENML_PREFIX_open_openml_url_get_local_path_retry_with_clean_cachez"sklearn.datasets.tests.data.openmlTc                   @   sF   e Zd Zdd ZdddZdd Zdd	 Zd
d Zdd Zdd Z	dS )_MockHTTPResponsec                 C   s   || _ || _d S N)datais_gzip)selfr   r    r   F/tmp/pip-unpacked-wheel-zrfo1fqw/sklearn/datasets/tests/test_openml.py__init__(   s    z_MockHTTPResponse.__init__c                 C   s   | j |S r   )r   read)r   amtr   r   r   r   ,   s    z_MockHTTPResponse.readc                 C   s   | j   d S r   )r   closer   r   r   r   r   /   s    z_MockHTTPResponse.closec                 C   s   | j rddiS i S )NzContent-Encodinggzipr   r   r   r   r   info2   s    z_MockHTTPResponse.infoc                 C   s
   t | jS r   )iterr   r   r   r   r   __iter__7   s    z_MockHTTPResponse.__iter__c                 C   s   | S r   r   r   r   r   r   	__enter__:   s    z_MockHTTPResponse.__enter__c                 C   s   dS )NFr   )r   exc_typeexc_valexc_tbr   r   r   __exit__=   s    z_MockHTTPResponse.__exit__N)r   )
__name__
__module____qualname__r   r   r   r"   r$   r%   r)   r   r   r   r   r   '   s   
r   )	data_homec                    s   d
ddddt j	td d|  fdd	  	fd
d
fddfddfdd 	fdd
fdd}tr| tjjd| d S )Nz$https://openml.org/api/v1/json/data/z-https://openml.org/api/v1/json/data/features/zhttps://openml.org/data/v1/z)https://openml.org/api/v1/json/data/list/z.gz.id_c                    s~   t dd| tdd  |   }|dddddd	d
dddddddddddddddS )Nz\W-zhttps://openml.org/z-json-data-listz-jdlz-json-data-featuresz-jdfz-json-data-qualitiesz-jdqz
-json-dataz-jdz
-data_namez-dnz	-downloadz-dlz-limitz-lz-data_versionz-dvz-statusz-sz-deactivatedz-dactz-activez-act)resublenreplace)urlsuffixoutput)path_suffixr   r   
_file_nameV   sD             	 
 z4_monkey_patch_webbased_functions.<locals>._file_namec              
      s   |  |st | |}t|\}|rNrNt| }t|dW  5 Q R  S |d}t| }t|dW  5 Q R  S W 5 Q R X d S )NTrbF)
startswithAssertionErrorr	   r   r   r   )r5   has_gzip_headerexpected_prefixr6   data_file_nameffpdecompressed_f)r9   data_modulegzip_responseread_fnr   r   _mock_urlopen_sharedj   s    

z>_monkey_patch_webbased_functions.<locals>._mock_urlopen_sharedc                    s    | |ddS N.jsonr5   r=   r>   r6   r   r5   r=   )rF   url_prefix_data_descriptionr   r   _mock_urlopen_data_descriptionx   s    zH_monkey_patch_webbased_functions.<locals>._mock_urlopen_data_descriptionc                    s    | |ddS rG   r   rJ   )rF   url_prefix_data_featuresr   r   _mock_urlopen_data_features   s    zE_monkey_patch_webbased_functions.<locals>._mock_urlopen_data_featuresc                    s    | |ddS )Nz.arffrI   r   rJ   )rF   url_prefix_download_datar   r   _mock_urlopen_download_data   s    zE_monkey_patch_webbased_functions.<locals>._mock_urlopen_download_datac              
      s   |  st | d}t|(}|d}| d}t|}W 5 Q R X d|krjtd ddd d dt|X}|rt| }t	|dW  5 Q R  S |d}t| }t	|d	W  5 Q R  S W 5 Q R X d S )
NrH   r:   zutf-8error  Simulated mock errorr5   codemsghdrsrA   TF)
r;   r<   r	   r   decodejsonloadsr   r   r   )r5   r=   r?   r@   rB   Z	decoded_sZ	json_datarA   )r9   rC   rE   url_prefix_data_listr   r   _mock_urlopen_data_list   s*    

    
zA_monkey_patch_webbased_functions.<locals>._mock_urlopen_data_listc                    sv   |   }| ddk}|r*||S |r>||S |rR||S |rf ||S td| d S )NzAccept-encodingr    zUnknown mocking URL pattern: %s)get_full_url
get_headerr;   
ValueError)requestargskwargsr5   r=   )rL   rN   r\   rP   rK   rM   r[   rO   r   r   _mock_urlopen   s    







z7_monkey_patch_webbased_functions.<locals>._mock_urlopenurlopen)r    openOPENML_TEST_DATA_MODULEtest_offlinesetattrsklearndatasets_openml)contextdata_idrD   rc   r   )r9   rL   rN   r\   rP   rF   rC   rD   r8   rE   rK   rM   r[   rO   r    _monkey_patch_webbased_functionsH   s     rn   z9data_id, dataset_params, n_samples, n_features, n_targets=   rm            iris)nameversion      &   Zanneal1        cpu鍞     H      _  
      rt   zadult-census  M   ZMiceProtein  i  parser	liac-arffpandasrD   Fc           
      C   s
  t d}t| ||d tf dd|d|}	t|	jd |ksDtt|	tsRtt|	j	|j
sdt|	j	j||| fks|tt|	j|j
st|	jj||fkst|dkrt|	j|jst|	jj|fkstn&t|	j|j
st|	jj||fkst|	jdkstdS )	zCheck the behaviour of `fetch_openml` with `as_frame=True`.

    Fetch by ID and/or name (depending if the file was previously cached).
    r   rD   TFas_framecacher   idrr   N)pytestimportorskiprn   r   intdetailsr<   
isinstancer   frame	DataFrameshaper   targetSeries
categories)
monkeypatchrm   dataset_params	n_samples
n_features	n_targetsr   rD   pdbunchr   r   r   test_fetch_openml_as_frame_true   s*    )
r   c                 C   s   t d t| |dd tf dd|d|}t|jd |ksDtt|tsRt|j	dks`tt|j
tjsrt|j
j||fkstt|jtjst|dkr|jj|fkstn|jj||fkstt|jtstdS )	znCheck the behaviour of `fetch_openml` with `as_frame=False`.

    Fetch both by ID and/or name + version.
    r   Tr   Fr   r   Nrr   )r   r   rn   r   r   r   r<   r   r   r   r   npZndarrayr   r   r   dict)r   rm   r   r   r   r   r   r   r   r   r    test_fetch_openml_as_frame_false  s&    %
r   c           
         s   t dt| |dd t|dddd}t|dddd}|j|j }  fdd}||}j|  |j|j }j|j	   fd	d
}||}	j|	 dS )z:Check the consistency of the LIAC-ARFF and pandas parsers.r   Tr   Fr   rm   r   r   r   c                    s,    | j  }jj|r$| |jS | S d S r   )rt   apitypesis_numeric_dtypeastypedtypeZseriesZpandas_series)data_pandasr   r   r   convert_numerical_dtypesb  s    
zFtest_fetch_openml_consistency_parser.<locals>.convert_numerical_dtypesc                    sJ    | j  }jj|r$| |jS jj|rB| j|jj	S | S d S r   )
rt   r   r   r   r   r   Zis_categorical_dtypecatZrename_categoriesr   r   )frame_pandasr   r   r   (convert_numerical_and_categorical_dtypesv  s    
zVtest_fetch_openml_consistency_parser.<locals>.convert_numerical_and_categorical_dtypesN)
r   r   rn   r   r   applytestingassert_frame_equalr   feature_names)
r   rm   Z
bunch_liacbunch_pandasZ	data_liacr   Zdata_liac_with_fixed_dtypesZ
frame_liacr   Zframe_liac_with_fixed_dtypesr   )r   r   r   r   $test_fetch_openml_consistency_parserI  s2    


r   c                 C   s\   t d d}t| |dd t|dd|d}t|dd|d}t|j|j t|j|j dS )z^Check the equivalence of the dataset when using `as_frame=False` and
    `as_frame=True`.
    r   ro   Tr   Fr   N)r   r   rn   r   r   r   r   r   )r   r   rm   Zbunch_as_frame_trueZbunch_as_frame_falser   r   r   -test_fetch_openml_equivalence_array_dataframe  s"    
r   c                 C   s  t d}|jjj}d}d}d}d}|dddg}tjgd	 }	d
dddg}
d}t| |d t|dd|d}|j	}|j
}|j}t||jstt|j|	kst|j|kstt|j|
kstt|j|
kst|j|gkstt||jst|j|kst|j|kst|j|ks&t|jjs4tt||jsFt|j|ksVtt|j|	|g ksrt|jjstdS )z>Check fetching on a numerical only dataset with string labels.r   ro   rp   rq   )rp   )rp      zIris-setosazIris-versicolorzIris-virginicarq   sepallength
sepalwidthpetallength
petalwidthclassTFr   N)r   r   r   r   CategoricalDtyper   Zfloat64rn   r   r   r   r   r   r   r<   alldtypesr   columnsr   Ztarget_namesr   r   rt   indexZ	is_unique)r   r   r   r   rm   Z
data_shapeZtarget_shapeZframe_shapeZtarget_dtypeZdata_dtypesZ
data_namesZtarget_namer   r   r   r   r   r   r   test_fetch_openml_iris_pandas  sJ    

r   target_columnr   r   c                 C   s   t d}d}t| |d t|dd||d}t|dd|d}|j|j|j t|tr|j	|j
j|| |jjdkstn |j
j|kst|jjdkstd	S )
z@Check that we can force the target to not be the default target.r   ro   TF)rm   r   r   r   r   r   )rp      r   N)r   r   rn   r   r   r   r   r   listZassert_index_equalr   r   ZIndexr   r   r<   rt   )r   r   r   r   rm   Zbunch_forcing_targetZbunch_defaultr   r   r   !test_fetch_openml_forcing_targets  s2    

 r   c                 C   s   t d}t| |dd t|ddd|d}t|ddd|d\}}|j|j| t||jrn|j	|j
| n|j|j
| dS )z>Check the behaviour of `return_X_y=True` when `as_frame=True`.r   Tr   Frm   r   r   
return_X_yr   N)r   r   rn   r   r   r   r   r   r   assert_series_equalr   )r   rm   r   r   r   Xyr   r   r   .test_fetch_openml_equivalence_frame_return_X_y  s(    

r   c                 C   s\   t d t| |dd t|ddd|d}t|ddd|d\}}t|j| t|j| dS )z?Check the behaviour of `return_X_y=True` when `as_frame=False`.r   Tr   Fr   N)r   r   rn   r   r   r   r   )r   rm   r   r   r   r   r   r   r   .test_fetch_openml_equivalence_array_return_X_y!  s$    

r   c                 C   sf   t d d}t| |dd d}t||ddd}t||ddd}|jjjdksRt|jjd	ksbtd
S )z9Check the difference between liac-arff and pandas parser.r   r   Tr   Fr   r   r@   ON)r   r   rn   r   r   r   kindr<   )r   rm   r   Zbunch_liac_arffr   r   r   r   $test_fetch_openml_difference_parsers>  s$    
r   module)Zscopec                S   C   s  dddddgdddd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+dg'd,d-d.d/d0d1d2dgd3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_d`dadbdcdddedfdgdhdidjdkdldmdndodpdqdrdsdtdudvdwdxdydzd{d|d}d~ddgNdddddddddddddddgddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddgNddddddddddddddgdS )z+Returns the columns names for each dataset.r   r   r   r   r   familyzproduct-typeZsteelZcarbonZhardnesstemper_rolling	conditionformabilityZstrength
non-ageingsurface-finishzsurface-qualityenamelabilitybcbfbtbw%2Fmeblmchromphoscbondmarviexptlferrocorrblue%2Fbright%2Fvarn%2Fcleanlustrejurofmspr   Zthickwidthr3   oilZborepackingvendorZMYCTZMMINZMMAXZCACHZCHMINZCHMAXZ Mean_Acc1298_Mean_Mem40_CentroidZMean_Acc1298_Mean_Mem40_RolloffZMean_Acc1298_Mean_Mem40_FluxZMean_Acc1298_Mean_Mem40_MFCC_0ZMean_Acc1298_Mean_Mem40_MFCC_1ZMean_Acc1298_Mean_Mem40_MFCC_2ZMean_Acc1298_Mean_Mem40_MFCC_3ZMean_Acc1298_Mean_Mem40_MFCC_4ZMean_Acc1298_Mean_Mem40_MFCC_5ZMean_Acc1298_Mean_Mem40_MFCC_6ZMean_Acc1298_Mean_Mem40_MFCC_7ZMean_Acc1298_Mean_Mem40_MFCC_8ZMean_Acc1298_Mean_Mem40_MFCC_9ZMean_Acc1298_Mean_Mem40_MFCC_10ZMean_Acc1298_Mean_Mem40_MFCC_11ZMean_Acc1298_Mean_Mem40_MFCC_12ZMean_Acc1298_Std_Mem40_CentroidZMean_Acc1298_Std_Mem40_RolloffZMean_Acc1298_Std_Mem40_FluxZMean_Acc1298_Std_Mem40_MFCC_0ZMean_Acc1298_Std_Mem40_MFCC_1ZMean_Acc1298_Std_Mem40_MFCC_2ZMean_Acc1298_Std_Mem40_MFCC_3ZMean_Acc1298_Std_Mem40_MFCC_4ZMean_Acc1298_Std_Mem40_MFCC_5ZMean_Acc1298_Std_Mem40_MFCC_6ZMean_Acc1298_Std_Mem40_MFCC_7ZMean_Acc1298_Std_Mem40_MFCC_8ZMean_Acc1298_Std_Mem40_MFCC_9ZMean_Acc1298_Std_Mem40_MFCC_10ZMean_Acc1298_Std_Mem40_MFCC_11ZMean_Acc1298_Std_Mem40_MFCC_12ZStd_Acc1298_Mean_Mem40_CentroidZStd_Acc1298_Mean_Mem40_RolloffZStd_Acc1298_Mean_Mem40_FluxZStd_Acc1298_Mean_Mem40_MFCC_0ZStd_Acc1298_Mean_Mem40_MFCC_1ZStd_Acc1298_Mean_Mem40_MFCC_2ZStd_Acc1298_Mean_Mem40_MFCC_3ZStd_Acc1298_Mean_Mem40_MFCC_4ZStd_Acc1298_Mean_Mem40_MFCC_5ZStd_Acc1298_Mean_Mem40_MFCC_6ZStd_Acc1298_Mean_Mem40_MFCC_7ZStd_Acc1298_Mean_Mem40_MFCC_8ZStd_Acc1298_Mean_Mem40_MFCC_9ZStd_Acc1298_Mean_Mem40_MFCC_10ZStd_Acc1298_Mean_Mem40_MFCC_11ZStd_Acc1298_Mean_Mem40_MFCC_12ZStd_Acc1298_Std_Mem40_CentroidZStd_Acc1298_Std_Mem40_RolloffZStd_Acc1298_Std_Mem40_FluxZStd_Acc1298_Std_Mem40_MFCC_0ZStd_Acc1298_Std_Mem40_MFCC_1ZStd_Acc1298_Std_Mem40_MFCC_2ZStd_Acc1298_Std_Mem40_MFCC_3ZStd_Acc1298_Std_Mem40_MFCC_4ZStd_Acc1298_Std_Mem40_MFCC_5ZStd_Acc1298_Std_Mem40_MFCC_6ZStd_Acc1298_Std_Mem40_MFCC_7ZStd_Acc1298_Std_Mem40_MFCC_8ZStd_Acc1298_Std_Mem40_MFCC_9ZStd_Acc1298_Std_Mem40_MFCC_10ZStd_Acc1298_Std_Mem40_MFCC_11ZStd_Acc1298_Std_Mem40_MFCC_12ZBH_LowPeakAmpZBH_LowPeakBPMZBH_HighPeakAmpZBH_HighPeakBPMZBH_HighLowRatioZBHSUM1ZBHSUM2ZBHSUM3zamazed.suprisedzhappy.pleasedzrelaxing.calmzquiet.stillz
sad.lonelyzangry.aggresiveageZ	workclasszfnlwgt:z
education:zeducation-num:zmarital-status:zoccupation:zrelationship:zrace:zsex:zcapital-gain:zcapital-loss:zhours-per-week:znative-country:ZDYRK1A_NZITSN1_NZBDNF_NZNR1_NZNR2A_NZpAKT_NZpBRAF_NZ	pCAMKII_NZpCREB_NZpELK_NZpERK_NZpJNK_NZPKCA_NZpMEK_NZpNR1_NZpNR2A_NZpNR2B_NZpPKCAB_NZpRSK_NZAKT_NZBRAF_NZCAMKII_NZCREB_NZELK_NZERK_NZGSK3B_NZJNK_NZMEK_NZTRKA_NZRSK_NZAPP_NZ
Bcatenin_NZSOD1_NZMTOR_NZP38_NZpMTOR_NZDSCR1_NZAMPKA_NZNR2B_NZpNUMB_NZRAPTOR_NZTIAM1_NZpP70S6_NNUMB_NZP70S6_NZpGSK3B_NZpPKCG_NZCDK5_NZS6_NZADARB1_NZAcetylH3K9_NZRRP1_NZBAX_NZARC_NZERBB4_NZnNOS_NZTau_NZGFAP_NZGluR3_NZGluR4_NZIL1B_NZP3525_NZpCASP9_NZPSD95_NZSNCA_NZUbiquitin_NZpGSK3B_Tyr216_NZSHH_NZBAD_NBCL2_NZpS6_NZpCFOS_NZSYP_NZ	H3AcK18_NZEGR1_NZH3MeK4_NZCaNA_NZpclassZsurvivedrt   sexZsibspZparchZticketfarecabinembarkedboatbody	home.destro   rv   ry   r}   r   r   r   r   r   r   r   r   datasets_column_names^  s   )QQ r   c                   C   s`   i ddddddddddddddddddddddddddddi i i dd	id
ddddddddS )Nrw   	   rv   rq   r      )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r{   i  rr   i  i7  i  i4  )r   r   r   r   r   r   r   r   r   r   r   r   r   datasets_missing_valuesQ  sT    r   zJdata_id, parser, expected_n_categories, expected_n_floats, expected_n_ints)ro   r   rr   rq   r   )ro   r   rr   rq   r   )rv   r   !   r   r   )rv   r   r   rv   rq   )ry   r   rr   r{   r   )ry   r   rr   r   r{   )r}   r   r   r   r   )r}   r   r   E   r   )r   r   r   r   r   )r   r   r   r   r   )r   r   rr   L   r   )r   r   rr   r   r   )r   r   r   r   r   )r   r   r   r   r   c	                    s   t d}	|	jjj t| ||d t|dd|d}
|
j}t fdd|j	D }tdd |j	D }td	d |j	D }||kst
||kst
||kst
|j || kst
|   }| D ]$\}}|| |d
}||kst
qdS )zYCheck that `fetch_openml` infer the right number of categories, integers, and
    floats.r   r   TFr   c                    s   g | ]}t | r|qS r   )r   .0r   r   r   r   
<listcomp>  s     
 z5test_fetch_openml_types_inference.<locals>.<listcomp>c                 S   s   g | ]}|j d kr|qS )r@   r   r  r   r   r   r    s     
 c                 S   s   g | ]}|j d kr|qS )ir  r  r   r   r   r    s     
 r   N)r   r   r   r   r   rn   r   r   r3   r   r<   r   tolistisnasumZto_dictitemsget)r   rm   r   Zexpected_n_categoriesZexpected_n_floatsZexpected_n_intsrD   r   r   r   r   r   Zn_categoriesZn_floatsZn_intsZframe_feature_to_n_nanrt   Z	n_missingZexpected_missingr   r  r   !test_fetch_openml_types_inference  s.    +

r  z0ignore:The default value of `parser` will changezparams, err_msgunknownz`parser` must be one ofr   z`as_frame` must be one ofc              	   C   s@   d}t | |d tjt|d tf d|i| W 5 Q R X d S )Nr   Tmatchrm   )rn   r   raisesr_   r   r   paramserr_msgrm   r   r   r   &test_fetch_openml_validation_parameter  s    	r  r  autor   r   c                 C   sn   d}zt d W nP tk
r`   t| |d d}tjt|d tf d|i| W 5 Q R X Y n
X tddS )	z=Check that we raise the proper errors when we require pandas.r   !test_fetch_openml_requires_pandasTz;requires pandas to be installed. Alternatively, explicitelyr  rm   .This test requires pandas to not be installed.N)r   ImportErrorrn   r   r  r   r
   )r   r  rm   r  r   r   r   'test_fetch_openml_requires_pandas_error  s    
"r  c                 C   sx   ddd}d}zt d W nP tk
rj   t| |d d}tjt|d tf d	|i| W 5 Q R X Y n
X td
dS )zICheck that we raise a warning that pandas will be required in the future.Fr  r  r   r  TzGFrom version 1.4, `parser='auto'` with `as_frame=False` will use pandasr  rm   r  N)r   r  rn   r   warnsFutureWarningr   r
   )r   r  rm   warn_msgr   r   r   +test_fetch_openml_requires_pandas_in_future  s    
"r   z2ignore:Version 1 of dataset Australian is inactivez:Sparse ARFF datasets cannot be loaded with parser='pandas'z9Sparse ARFF datasets cannot be loaded with as_frame=True.)r   r   c              	   C   sL   t d d}t| |d t jt|d tf |dd| W 5 Q R X dS )ztCheck that we raise the expected error for sparse ARFF datasets and
    a wrong set of incompatible parameters.
    r   $  Tr  F)rm   r   N)r   r   rn   r  r_   r   r  r   r   r   #test_fetch_openml_sparse_arff_error	  s    
r"  zdata_id, data_type)ro   	dataframe)r!  sparsec                 C   sP   t d}t| |d t|dddd}|dkr4|jntjj}t|j	|sLt
dS )z&Check the auto mode of `fetch_openml`.r   Tr  F)rm   r   r   r   r#  N)r   r   rn   r   r   scipyr$  Z
csr_matrixr   r   r<   )r   rm   Z	data_typer   r   klassr   r   r   test_fetch_openml_auto_mode/  s
    
r'  c              
   C   sb   t d d}t| |d d}t jt|d, tdd t|ddd	d
 W 5 Q R X W 5 Q R X dS )z[Check that we raise a warning regarding the working memory when using
    LIAC-ARFF parser.r   r   Tz*Could not adhere to working_memory config.r  gư>)Zworking_memoryFr   r   N)r   r   rn   r  UserWarningr   r   )r   rm   rV   r   r   r   :test_convert_arff_data_dataframe_warning_low_memory_pandasD  s    
r)  c              	   C   sF   d}d}t | || d}tjt|d t|dddd W 5 Q R X dS )	z\Check that a warning is raised when multiple versions exist and no version is
    requested.ro   rs   zMultiple active versions of the dataset matching the name iris exist. Versions may be fundamentally different, returning version 1.r  Fr   )rt   r   r   r   Nrn   r   r  r(  r   )r   rD   rm   Z	data_namerV   r   r   r   ,test_fetch_openml_iris_warn_multiple_versionX  s    r+  c                 C   sT   d}d}d}d}t | || t||dddd}|jj||fksBt|jdksPtdS )z/Check that we can get a dataset without target.ro   Nrp   r   Fr   rm   r   r   r   r   )rn   r   r   r   r<   r   )r   rD   rm   r   Zexpected_observationsZexpected_featuresr   r   r   r   test_fetch_openml_no_targeto  s    r-  c                 C   sd   t d d}t| ||d t|dd|d}|jjd }|jd   sNtt	|j
dd	d
g dS )zRcheck that missing values in categories are compatible with pandas
    categoricalr   iY  r   FTrm   r   r   r   r   ZFEMALEZMALE_N)r   r   rn   r   r   r   r
  anyr<   r   r   )r   rD   r   rm   ZpenguinsZ	cat_dtyper   r   r   test_missing_values_pandas  s    
r1  r     glass2)rm   rt   ru   c              	   C   sj   d}t | || d}tjt|d tf dddd|}W 5 Q R X |jjdksTt|jd d	ksftd
S )z;Check that we raise a warning when the dataset is inactive.r2  z(Version 1 of dataset glass2 is inactive,r  Fr   )r   r   r   )   r   r   Z40675N)	rn   r   r  r(  r   r   r   r<   r   )r   rD   r   rm   rV   r3  r   r   r   test_fetch_openml_inactive  s    
  r5  z"data_id, params, err_type, err_msgzNo active dataset glass2 foundr   r   )rm   r   z1Can only handle homogeneous multi-target datasets)rm   r   zOSTRING attributes are not supported for array representation. Try as_frame=Truer   )rm   r   r   zTarget column 'family'Z	undefinedz(Could not find target_column='undefined'c              	   C   s\   t | || |dds |dkr*td tj||d tf d|d| W 5 Q R X d S )Nr   Tr   r  F)r   r   )rn   r  r   r   r  r   )r   rD   rm   r  err_typer  r   r   r   r   test_fetch_openml_error  s
    0
r7  zparams, err_type, err_msgr   ru   z?Dataset data_id=-1 and version=version passed, but you can onlyZnAmE)rm   rt   z9Dataset data_id=-1 and name=name passed, but you can onlyzFNeither name nor data_id are provided. Please provide name or data_id.c              	   C   s(   t j||d tf |  W 5 Q R X d S )Nr  )r   r  r   )r  r6  r  r   r   r   )test_fetch_openml_raises_illegal_argument  s    r8  c              	   C   s  d}d}d}t | || d}||}tjt|d t||dddd W 5 Q R X d	}||}tjt|d t||dddd W 5 Q R X d}||}tjt|d t||d
gdddd W 5 Q R X d	}||}tjt|d t||d
gdddd W 5 Q R X d S )Nr   z.target_column='{}' has flag is_row_identifier.z&target_column='{}' has flag is_ignore.ZMouseIDr  Fr   r,  ZGenotyper   )rn   formatr   r  r(  r   )r   rD   rm   Zexpected_row_id_msgZexpected_ignore_msgZ
target_colrV   r   r   r   test_warn_ignore_attribute  sX    



r:  c              	   C   sB   d}t | || d}tjt|d t|dddd W 5 Q R X d S )Nrr   zJOpenML registered a problem with the dataset. It might be unusable. Error:r  Fr   r.  r*  r   rD   rm   rV   r   r   r   test_dataset_with_openml_error5  s
    r<  c              	   C   sB   d}t | || d}tjt|d t|dddd W 5 Q R X d S )Nr   zFOpenML raised a warning on the dataset. It might be unusable. Warning:r  Fr   r.  r*  r;  r   r   r    test_dataset_with_openml_warning>  s
    r=  c           	      C   st   d}t | || tjjj|}t|d}t||}t	||}t
j|sRtt||}| | ksptd S )Nro   scikit_learn_data)rn   ri   rj   rk   
_DATA_FILEr9  strmkdirr   r   ospathisfiler<   r   )	r   rD   tmpdirrm   openml_pathcache_directoryZ	response1locationZ	response2r   r   r   test_open_openml_url_cacheK  s    


rI  write_to_diskc              	      s   d}t jjj|}t|d}t||  fdd}| t jjd| t	j
tdd t|| W 5 Q R X tj rtd S )Nro   r>  c              	      s0   r$t  d}|d W 5 Q R X tdd S )Nw Invalid request)re   writer_   )r`   ra   rb   r@   rH  rJ  r   r   rc   c  s    z>test_open_openml_url_unlinks_local_path.<locals>._mock_urlopenrd   rM  r  )ri   rj   rk   r?  r9  r@  rA  r   rh   r   r  r_   r   rB  rC  existsr<   )r   rE  rJ  rm   rF  rG  rc   r   rO  r   'test_open_openml_url_unlinks_local_path\  s    
rQ  c              	      s   d}t jjj|}t| d}t|| t	tj
  t d}|d W 5 Q R X t|| fdd}d}tjt|d | }W 5 Q R X |d	kstd S )
Nro   r>  rK  rL  c                      s   t j rtddS )NzFile exist!rr   )rB  rC  rP  	Exceptionr   rH  r   r   
_load_data{  s    z/test_retry_with_clean_cache.<locals>._load_dataz!Invalid cache, redownloading filer  rr   )ri   rj   rk   r?  r9  r@  rA  r   rB  makedirsrC  dirnamere   rN  r   r   r  RuntimeWarningr<   )rE  rm   rF  rG  r@   rT  r  resultr   rS  r   test_retry_with_clean_cacheq  s    
rY  c              	   C   s\   d}t jjj|}t| d}t||dd }d}tj	t
|d |  W 5 Q R X d S )Nro   r>  c                   S   s   t d ddd d dd S )NrR   rS   rT   r   r   r   r   r   rT    s        z:test_retry_with_clean_cache_http_error.<locals>._load_datarS   r  )ri   rj   rk   r?  r9  r@  rA  r   r   r  r   )rE  rm   rF  rG  rT  	error_msgr   r   r   &test_retry_with_clean_cache_http_error  s    
r[  c           
      C   s   dd }d}t |d}t| || t|d|dddd\}}| tjjd	| t|d|dddd\}}	tj	
|| tj	
||	 d S )
Nc                 _   s   t d|   d S )NzhThis mechanism intends to test correct cachehandling. As such, urlopen should never be accessed. URL: %s)r_   r]   r`   ra   rb   r   r   r   _mock_urlopen_raise  s
    z4test_fetch_openml_cache.<locals>._mock_urlopen_raisero   r>  TFr   )rm   r   r-   r   r   r   rd   )r@  rA  rn   r   rh   ri   rj   rk   r   r   r   )
r   rD   rE  r]  rm   rG  Z	X_fetchedZ	y_fetchedZX_cachedZy_cachedr   r   r   test_fetch_openml_cache  s.    
	
r^  zas_frame, parser)Tr   )Fr   )Tr   )Fr   c              	      s  |s|dkrt d d}t| |d td d|  }d}|d  t||.}t|d}	t|	 }
d	|
t	|
d
 < W 5 Q R X t
 d}||
 W 5 Q R X tjjj fdd}| tjjd| t t}tjj|d||d W 5 Q R X |dstdS )z/Check that the checksum is working as expected.r   rv   Tr.   r/   zdata-v1-dl-1666876.arff.gzztest_invalid_checksum.arffr:   %   rr   wbc              	      sL   |   }|dr@t d}| }W 5 Q R X tt|ddS | S d S )Nzdata/v1/download/1666876r:   Tr!   )r]   endswithre   r   r   r   )r`   ra   rb   r5   r@   Zcorrupted_dataZcorrupt_copy_pathZmocked_openml_urlr   r   swap_file_mock  s    
z9test_fetch_openml_verify_checksum.<locals>.swap_file_mockrd   Fr.  Z1666876N)r   r   rn   rf   r	   r    re   	bytearrayr   r3   GzipFilerN  ri   rj   rk   rd   rh   r  r_   r   r  r<   )r   r   r   rE  r   rm   Zoriginal_data_moduleZoriginal_data_file_nameZ	orig_fileZ	orig_gzipr   Zmodified_gziprc  excr   rb  r   !test_fetch_openml_verify_checksum  s0    

	   rg  c              
   C   s   dd }|  tjjd| d}tjttdt	|  dd>}tj
tdd t|d d	d
 W 5 Q R X t|dksxtW 5 Q R X d S )Nc                 _   s   t dddd d d S )NrL  i  Simulated network errorr   r\  r   r   r   _mock_urlopen_network_error  s    zPtest_open_openml_url_retry_on_network_error.<locals>._mock_urlopen_network_errorrd   zinvalid-urlz+A network error occurred while downloading z. Retrying...r  rh  r   )delayr   )rh   ri   rj   rk   r   r  r(  r1   escaper   r  r   r   r3   r<   )r   ri  Zinvalid_openml_urlrecordr   r   r   +test_open_openml_url_retry_on_network_error  s"      rm  )r   r   c                 C   sh   |dkrt d d}t| || tjj|dd|d}|dk	sBt|d jdksTtd|d	 ksdtdS )
zCheck that we can load the "zoo" dataset.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/14340
    r   >   Fr.  Nr   )e      Zanimalr   )r   r   rn   ri   rj   r   r<   r   )r   rD   r   rm   Zdatasetr   r   r   &test_fetch_openml_with_ignored_feature  s    
   rq  c                 C   s  t d}d}t| |dd dd|d}tf ddi|}tf ddi|}|j|j|j |jjd		 rtt
|jjd		 rt
tf dd
d|}tf dd
d|}|j|jd |jd  |jd jd		 rt
|jd jd		 rt
dS )zCheck that we strip the single quotes when used as a string delimiter.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/23381
    r   r   Frm   rD   Tr   r   rm   r   r   'r   )r   r   r   N)r   r   rn   r   r   r   r   r@  r;   r0  r<   ra  r   )r   r   rm   common_paramsZmice_pandasZmice_liac_arffr   r   r   test_fetch_openml_strip_quotes%  s,    
  rv  c                 C   sj   t d}d}t| |dd dd|d}tf ddi|}tf ddi|}|j|jd	 |jd	  d
S )zCheck that we can strip leading whitespace in pandas parser.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/25311
    r   i6  Frr  Trs  r   r   r   N)r   r   rn   r   r   r   r   )r   r   rm   ru  Zadult_pandasZadult_liac_arffr   r   r   $test_fetch_openml_leading_whitespaceB  s    
 rw  c              	   C   sH   t d d}t| |dd t jtdd tjj|d W 5 Q R X dS )	z?Check that we raise a deprecation warning for parser parameter.r   ro   Frr  z)The default value of `parser` will changer  )rm   N)r   r   rn   r  r  ri   rj   r   )r   rm   r   r   r   $test_fetch_openml_deprecation_parserX  s
    
rx  )T__doc__r    rY   rB  r1   	functoolsr   ior   urllib.errorr   Znumpyr   Zscipy.sparser%  r   ri   r   Zsklearn.utilsr   r   Zsklearn.utils.fixesr	   Zsklearn.utils._testingr
   r   r   r   Zsklearn.datasetsr   Zfetch_openml_origZsklearn.datasets._openmlr   r   r   r   rf   rg   r   rn   markZparametrizer   r   r   r   r   r   r   r   r   Zfixturer   r   r  filterwarningsr  r  r   r"  r'  r)  r+  r-  r1  r5  r_   KeyErrorr7  r8  r:  r<  r=  rI  rQ  rY  r[  r^  rg  rm  rq  rv  rw  rx  r   r   r   r   <module>   s  x-)A1!


 s

20












+




1




%	-