U
    2d=                    @   s  d dl Z d dlZd dlmZ d dlZd dlmZ d dlm	Z	 d dlm
Z
 d dlmZ d dlmZ d dlmZ d d	lmZ d
d Zejdddgdd Zdd Zejdddgdd Zejdejejejgejdejejejgdd Zejdejejejgdd Zdd Zdd Zdd  Zd!d" Zejjd#d$d%d&gd'd(d&gge d)d%d&gd*d(d&ggej d+d,d-gd.d/d-gge!d0ej d+d%d-gd.ej"d-gge!d0ej d+d%d-gd.e#d1d-gge!d0ej dd%d-gd.d(d-gge!d0ej dd%dgd.ej"dgge!d0ej dd%dgd.e#d1dgge!d0gd2d3d4d5d6d7d8d9gd:d;d< Z$ejdddgejd=d>d?gejd@ddAgdBdC Z%ejd=d>d?gejdDd(d&gd%d&gd(d&ggd d%d%gd d d gd d%d%ggfdEd.gdFd.gdGd+gdFd.ggd d d d d gd d d d d%gd d%d d d ggfgdHdI Z&dJdK Z'ejd@dLdAdgejdMdLdAdgdNdO Z(ejdPdQdRgejd#d%d(ge dSdTggdUdV Z)ejdPdQdRgdWdX Z*ejjdYd'd&gd$d&ggd'd$gd&ggej+fe d%d(gdZd(ggd%dZgd(ggej,fej d,d-gd/d-gge!d0d,d/gd-ggej+fe d,d-gd/d-ggd,d/gd-ggej-fe d%d(gej"d(ggd%ej"gd(ggej.fej d,ej"gdej"gge!d0d,dgej"ggej+fej d,e#d1gde#d1gge!d0d,dge#d1ggej+fgd2d3d4d[d\d]d^gd:d_d` Z/ejdddgejjdaej d.d+gge!d0j0ej d.dbgge!d0j0d.d+dcggej+fej d%d(ggddd0j0ej d%deggddd0j0d%d(dZggej1fej d.d+gge!d0j0ej d.dbgge!d0j0e d.d+dcggej+fej dd.gge!d0j0ej dd+gge!d0j0dd.dfgge!fej d.d+gge!d0j0ej d.ej"gge!d0j0d.d+dfgge!fej d.dgge!d0j0ej d.ej"gge!d0j0d.ddfgge!fej d.ej"gge!d0j0ej d.dgge!d0j0d.ej"dfgge!fgd4d3dgdhdidjdkgd:dldm Z2dndo Z3dpdq Z4drds Z5ejjdtdAdudvgfdLdudwdvgfdcd(d+gdxdygfgdAdzd{gd:d|d} Z6d~d Z7ejjd#d'd(d&gd$d%d&gge d)d(d&gdd%d&ggej d.d/d-gd+d,d-gge!d0gd2d3d4gd:dd Z8ejjdaej d.d+gge!d0j0ej d.dbgge!d0j0d.d+dcggej+fej d%d(ggddd0j0ej d%deggddd0j0d%d(dZggej1fej d.d+gge!d0j0ej d.dbgge!d0j0e d.d+dcggej+fgd4d3dgd:dd Z9dd Z:dd Z;ejde#e<gdd Z=dd Z>dd Z?dd Z@dd ZAdd ZBdd ZCejdej"de#d1gdd ZDejd@d'dZgd'dZdd.ggdd ZEejjdd?d>gddgd:ejjd@dAd.d(d+ggdAd{gd:dd ZFejdeegdd ZGejddd(iddiddid(dddeddgejddd.d+dcdbgggdd ZHejd@dLdAd+ggdd ZIejd@d.gdbggdd ZJejdddZiddiddiddiddidZdddeddgdd ZKejd@dAd+ggdd ZLejd@d.gdbggdd ZMdd ZNejddZd%dddeigddĄ ZOddƄ ZPddȄ ZQddʄ ZRdd̄ ZSdd΄ ZTejddd%dМgdd҄ ZUejdd(dZdМgddԄ ZVddք ZWejddddddddgejddddgdd ZXdd ZYejdej"dgdd ZZdd Z[ejdddgejdddgdd Z\ejdddgdd Z]ejdddgdd Z^ejdddgdd Z_dd Z`ejdej"dgdd Zaejdddgejdej"dgdd Zbejjdaej d.ej"gge!d0j0ej d.d+gge!d0j0ej d.ej"dbge!d0gej+fej d.ej"gge!d0j0ej d.d+gge!d0j0ej d.ej"dbge!d0gej+fej dej"ggejd0j0ej dSggejd0j0e ddTej"ggejfgdd dgd:dd Zcejde dej"dSggj0e dej"dggj0e dTggfe ddTdSggj0e dddggj0e ej"ggfej dcej"d+gge!d0j0e dej"dggj0ej dbgge!d0fej dcd.d+gge!d0j0e dddggj0ej ej"gge!d0fgdd Zdd	d
 Zedd Zfejddd/ggej dd/ggdd0ej dd/ggdd0gejdd,d/ggej d,d/ggdd0ej d,d/ggdd0gdd Zgdd Zhdd Zidd Zjejdd?d>gdd Zkejdej d.gdgge!d0d gej"gej"ggejldgdgdgge!d0fej ej"gdgd.gge!d0d gej"gej"ggejldgej"gej"gge!d0fgdd  Zmd!d" Znd#d$ Zod%d& ZpdS ('      Nsparse)NotFittedError)assert_array_equal)assert_allclose)_convert_container)is_scalar_nan)OneHotEncoder)OrdinalEncoderc                  C   s   t dddgdddgg} t }tdd}|| }|| }|jdksLt|jdksZtt|shtt|rvtt|	 dd	dd	d	gd	dd	dd	gg t|	 | d S )
N         r   Fsparse_outputr                    ?)
nparrayr	   fit_transformshapeAssertionErrorr   issparser   toarray)XZ
enc_sparseZ	enc_denseX_trans_sparseZX_trans_dense r   M/tmp/pip-unpacked-wheel-zrfo1fqw/sklearn/preprocessing/tests/test_encoders.py!test_one_hot_encoder_sparse_dense   s    


 r   handle_unknownignoreinfrequent_if_existc                 C   s   t dddgdddgdddgg}t dddgg}tdd}|| tjtdd	 || W 5 Q R X t| d}|| | }t	||
 t d
d
d
d
dd
d
gg t|| d S )Nr   r   r   r      errorr    Found unknown categoriesmatchr   r   )r   r   r	   fitpytestraises
ValueError	transformcopyr   r   r   r    r   X2ohZ	X2_passedr   r   r   #test_one_hot_encoder_handle_unknown(   s    "



r2   c               	   C   sL   t dgdgg} tddgd}d}tjt|d ||  W 5 Q R X d S )Nab
categorieszqThis OneHotEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.r'   )r   r   r	   r*   r+   r   r-   )r   encmsgr   r   r   test_one_hot_encoder_not_fitted@   s    r9   c              	   C   s   t ddddgd}t ddgd}t| d}|| | }t|| t ddddgdd	ddgg t|| d S )
NZ11111111Z22Z333Z4444)r   Z55555r%   r   r   )	r   r   reshaper	   r)   r.   r   r-   r   r/   r   r   r   +test_one_hot_encoder_handle_unknown_stringsL   s    

r<   output_dtypeinput_dtypec                 C   s   t jddgg| dj}t jddgddgg|d}td|d}t|| | t||| | td|dd}t||| t|||| d S )Nr   r   dtypeauto)r6   r@   F)r6   r@   r   )	r   asarrayTr	   r   r   r   r)   r-   )r>   r=   r   
X_expectedr1   r   r   r   test_one_hot_encoder_dtype_   s    rE   c                 C   s   t d}|ddgddgd}tjddddgddddgg| d}t| d}t|| | t|	|
| | t| d	d
}t||| t|	|
|| d S )Npandasr3   r4   r   r   ABr   r?   F)r@   r   )r*   importorskip	DataFramer   r   r	   r   r   r   r)   r-   )r=   pdX_dfrD   r1   r   r   r   !test_one_hot_encoder_dtype_pandasn   s    
"
rN   c                  C   s   t  } dddddgdddddgdd	d
ddgdddddgg}| | |  }tdddddddddddddddg| | ddd d!d"g}td#d$d%d&d'd(d)d*d+d,d-d.d/d0d1g| tjtd2d3 | ddg W 5 Q R X d S )4NMaler   Zgirlr   r   Female)   
   3   Zboy   [         Z	x0_FemaleZx0_MaleZx1_1Zx1_41Zx1_51Zx1_91Zx2_boyZx2_girlZx3_1Zx3_2Zx3_12Zx3_21Zx4_3Zx4_10Zx4_30onetwothreeZfourZfiveZ
one_FemaleZone_MaleZtwo_1Ztwo_41Ztwo_51Ztwo_91Z	three_boyZ
three_girlZfour_1Zfour_2Zfour_12Zfour_21Zfive_3Zfive_10Zfive_30z!input_features should have lengthr'   )r	   r)   get_feature_names_outr   r*   r+   r,   )r7   r   feature_namesZfeature_names2r   r   r   "test_one_hot_encoder_feature_names~   sb    
r]   c                  C   s\   t  } tjddggtdj}| | |  }tddg| | jdgd}tdd	g| d S )
Nu   c❤t1Zdat2r?   u	   x0_c❤t1Zx0_dat2u   n👍me)Zinput_featuresu   n👍me_c❤t1u   n👍me_dat2)r	   r   r   objectrC   r)   r[   r   )r7   r   r\   r   r   r   *test_one_hot_encoder_feature_names_unicode   s    
r_   c                  C   s   t ddggj} t }|jddddggd | d ddddggksLt||  j	dksdt|jdddddggd ||  j	d	kstd S )
Nr   r   r   r   r5   r6   )r   r#   r#   r   )
r   r   rC   r	   
set_params
get_paramsr   r   r   r   )r   r1   r   r   r   test_one_hot_encoder_set_params   s    rb   c                 C   sN   t dd}|| }t ddd}|| }t| | t|sFt| S )NrA   r5   Fr6   r   )r	   r   r   r   r   Zisspmatrix_csrr   )r   r7   ZXtr1ZXtr2r   r   r   check_categorical_onehot   s    


rd   r   defr   7   abcr   rR   r   r4   rH   catr3   rI   r?   nanmixednumericr^   z	mixed-nanzmixed-float-nanz
mixed-Nonezmixed-None-nanzmixed-None-float-nan)idsc                 C   s   t t| d d dgf }t|ddgddgg t t| d d ddgf }t|ddddgddddgg tdd| }t| dddddgdddddgg d S )Nr   r   rA   r5   )rd   r   r   r   r	   r   r   )r   Xtrr   r   r   test_one_hot_encoder   s    rn   sparse_FTdropfirstc              	   C   s  dddgdddgdddgg}t ||d}||}tj|td}t||| ddgddgddgg}t |d	|d
}||}t|}t||| |d krdddgdddgdddgg}t || ddgddgdddggd}||}tj|td}d |d< t||| ddgddgddgg}t |ddgddgg| d}||}tj|td}d |d< d |d d df< t||| tdddgdddgg}td}t	j
t|d || W 5 Q R X d S )Nrg   r   rf   re   r   r   r   rp   r?   rA   )r   r6   rp   6   8   )r   r    r6   )r   r   )r   r6   r    r   r   r   )Shape of the passed X data is not correctr'   )r	   r   r   r   r^   r   inverse_transformreescaper*   r+   r,   )r    ro   rp   r   r7   X_trexpr8   r   r   r   test_one_hot_encoder_inverse   sH    






r|   z
X, X_transrX   rY   rZ   c              	   C   sJ   t |d| }d}|r"t|d}tjt|d || W 5 Q R X dS )zCheck that `inverse_transform` raise an error with unknown samples, no
    dropped feature, and `handle_unknow="error`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/14934
    r   zqSamples \[(\d )*\d\] can not be inverted when drop=None and handle_unknown='error' because they contain all zerosr   r'   N)r	   r)   r   r*   r+   r,   rw   )r   X_transro   r7   r8   r   r   r   ?test_one_hot_encoder_inverse_transform_raise_error_with_unknown1  s    
r~   c                  C   sJ   t jddgddgddggtd} tddd	}|| }t|||  d S )
NrO   r   rP   r   r   r?   	if_binaryFrp   r   )r   r   r^   r	   r   r   rw   )r   oherz   r   r   r   &test_one_hot_encoder_inverse_if_binaryQ  s     
r   r   
reset_dropc                 C   s   t jddgddgddggtd}t| dd}|| ||}| }|j|d	 t|	|| t
||| t| | d S )
NrO   r   rP   r   r   r?   Fr   rp   )r   r   r^   r	   r)   r-   r[   r`   r   rw   r   )rp   r   r   r   rz   r\   r   r   r   test_one_hot_encoder_drop_resetX  s     

r   methodr)   r         @      @c              	   C   s6   t  }d}tjt|d t|||  W 5 Q R X d S )N'Expected 2D array, got 1D array insteadr'   )r	   r*   r+   r,   getattr)r   r   r1   r8   r   r   r   test_X_is_not_1Dg  s    r   c              	   C   sR   t d}|ddddg}t }d}t jt|d t|| | W 5 Q R X d S )NrF      r   r#   r   r'   )r*   rJ   Seriesr	   r+   r,   r   )r   rL   r   r1   r8   r   r   r   test_X_is_not_1D_pandasq  s    
r   zX, cat_exp, cat_dtyper   stringzmissing-floatzmissing-np.nan-objectzmissing-float-nan-objectc                 C   s   | | d d d fD ]}t dd}|| t|jts:tt|j|D ]l\}}| }t|d rt|d srt|d d |d d kstn| |kstt	
|j|sFtqFqd S )Nr:   rA   r5   )r	   r)   
isinstancecategories_listr   ziptolistr   r   
issubdtyper@   )r   Zcat_exp	cat_dtypeXir7   resr{   Zres_listr   r   r   test_one_hot_encoder_categories|  s    #

r   zX, X2, cats, cat_dtypedcint64r#   zzobject-stringzobject-string-nonezobject-string-nanzobject-None-and-nanzobject-nan-and-Nonec              	   C   s   t |d}tdddgdddgg}t||  | t|jd t|d ksVt|j	d 
 t|d kstt|j	d j|kstt |d}tjtdd || W 5 Q R X t ||d}tdddgdddgg}t||| | d S )Nr5   r   r   r   r&   r'   r6   r    )r	   r   r   r   r   r   r   r6   r   r   r   r@   r*   r+   r,   r)   r-   )r   r0   catsr   r    r7   r{   r   r   r   )test_one_hot_encoder_specified_categories  s    :

r   c               	   C   sD  t jddggtdj} tdddggd}t dddgdddgg}t|| |  | t|	|  | |j
d  dddgkstt |j
d jt jstt d	d
ggj} td
d	dggd}d}tjt|d |	|  W 5 Q R X t d	d
t jggj} td	t jd
ggd}tjt|d |	|  W 5 Q R X d S )Nr3   r4   r?   r   r5   r   r   r   r   r   r   z%Unsorted categories are not supportedr'   )r   r   r^   rC   r	   r   r)   r-   r   r   r   r   r   r   r@   object_r*   r+   r,   ri   )r   r7   r{   r8   r   r   r   (test_one_hot_encoder_unsorted_categories  s     r   c               	   C   s   t jddgddggtdj} tdddgdddggd}t d	d
d
d	d
d
gd
d	d
d
d
d	gg}t||  | |jd 	 dddgkst
t |jd jt jst
|jd 	 dddgkst
t |jd jt jst
d S )Nr3   r4   r   r   r?   r   r   r5   r   r   )r   r   r^   rC   r	   r   r   r   r   r   r   r   r@   r   r   r7   r{   r   r   r   7test_one_hot_encoder_specified_categories_mixed_columns  s    &r   c                  C   sL   t d} | ddgddgd}t|}t|ddddgddddgg d S )NrF   r3   r4   r   r   rG   r   )r*   rJ   rK   rd   r   )rL   rM   rm   r   r   r   test_one_hot_encoder_pandas   s    
r   zdrop, expected_namesx0_cZx2_bZx1_2x0_bZx2_abinaryZmanualc                 C   s>   dddgdddgg}t | d}|| | }t|| d S )Nr   r   r3   r4   r   )r	   r)   r[   r   )rp   Zexpected_namesr   r   r\   r   r   r   'test_one_hot_encoder_feature_names_drop)  s
    


r   c                  C   s   ddgddgddgg} t ddddgddddgddddgg}t d dg}td	d
d}|| }t|j| t|| ddgddgddgg} t ddgddgddgg}t dd g}td	d
d}|| }t|j| t|| d S )NrR   yes   norW   r   r   r   r   Fr   truer3   false)r   r   r	   r   r   	drop_idx_r   )r   expectedZexpected_drop_idxr   resultr   r   r   *test_one_hot_encoder_drop_equals_if_binary;  s      


r   r   c                 C   sX   t  }tjdddgdddggdd}t|| |d t dd}t|| | d S )Nr   r   r   r?   float64)r
   r   r   r   r   Zastyper   r   r   r   test_ordinal_encoderS  s
    

r   zobject-string-catc              	   C   s   t |d}tdgdgg}t|| | t|jd t|d ksJt|jd 	 t|d ksht|jd j
|ks|tt |d}tjtdd || W 5 Q R X d S )Nr5   r   r   r   r&   r'   )r
   r   r   r   r   r   r6   r   r   r   r@   r*   r+   r,   r)   )r   r0   r   r   r7   r{   r   r   r   )test_ordinal_encoder_specified_categoriesd  s    

r   c               	   C   s   dddgdddgg} t  }|| }tj| td}t||| tddddgddddgg}td}t	j
t|d	 || W 5 Q R X d S )
Nrg   r   rf   re   r   r?   r   rv   r'   )r
   r   r   r   r^   r   rw   rx   ry   r*   r+   r,   )r   r7   rz   r{   r8   r   r   r   test_ordinal_encoder_inverse  s    

r   c                  C   s   t ddd} tjddgddgdd	ggtd
}tjddgddgddggtd
}| | | |}tjddgddgddggdd
}t|| | |}tjdd gd dgddggtd
}t|| d S )Nuse_encoded_valuer    unknown_valuer3   xr4   yr   r   r?   ZxyZblar   r   r   r   )r
   r   r   r^   r)   r-   r   rw   )r7   X_fitr}   X_trans_encr{   X_trans_invinv_expr   r   r   +test_ordinal_encoder_handle_unknowns_string  s      

 

 r   r@   c                 C   s   t ddd}tjddgddgdd	gg| d
}tjddgddgddgg| d
}|| ||}tjddgddgddggdd
}t|| ||}tjdd gd dgddggtd
}t|| d S )Nr   r   r      r      r   	   r?   rT      r   r   )r
   r   r   r)   r-   r   rw   r^   )r@   r7   r   r}   r   r{   r   r   r   r   r   ,test_ordinal_encoder_handle_unknowns_numeric  s      

 

 r   c                  C   s`   t dtjd} tdgdgdgg}| | | dgdgdgg}t|dgdgtjgg d S )Nr   r   r   r   r   r#   r   )r
   r   ri   r   r)   r-   r   )r7   r   r}   r   r   r   (test_ordinal_encoder_handle_unknowns_nan  s
    
r   c               	   C   sN   t dtjtd} tdgdgdgg}tjtdd | | W 5 Q R X d S )Nr   )r    r   r@   r   r   r   z'dtype parameter should be a float dtyper'   )	r
   r   ri   intr   r*   r+   r,   r)   )r7   r   r   r   r   8test_ordinal_encoder_handle_unknowns_nan_non_float_dtype  s      r   c               	   C   s\   t jdddddggtdj} dddg}t|d}d}tjt|d ||  W 5 Q R X d S )NZLowZMediumZHighr?   r5   z*Shape mismatch: if categories is an array,r'   )	r   r   r^   rC   r
   r*   r+   r,   r)   )r   r   r7   r8   r   r   r   +test_ordinal_encoder_raise_categories_shape  s    

r   c                     s  t ddtjddddgddddggdd} tjddgd	d
ggddtjddgd	d
ggddtddgddggtddgddggtjddgd	dggddfD ]B   t fddtdD stt 	 |  qddgd	d
gg   tfddtdD stt 	 |  ddgd	dgg   tfddtdD sltt 	 |  d S )NrA   r5   r   r   r   r?   r   r   r   r#   r   r3   r4   r   r      a   b   c   dr^   c                    s   g | ]}j | j jkqS r   r   r@   .0ir   r7   r   r   
<listcomp>  s     z'test_encoder_dtypes.<locals>.<listcomp>c                    s"   g | ]}t  j| jt jqS r   )r   r   r   r@   integerr   r7   r   r   r     s     c                    s   g | ]} j | jd kqS )r^   r   r   r   r   r   r     s     )
r	   r   r   r)   allranger   r   r-   r   )r{   r   r   r   test_encoder_dtypes  s&    
"
 
 
 r   c                     s  t d} tddtjddddddgddddddggdd}| jdd	gd
dgddgddd}| tfddtd	D st	t
| | | dd	gddgddgd}|d j|d j|d jg | t fddtd
D st	t
| | d S )NrF   rA   r5   r   r   r   r?   r   r   r   r#   r   r   )rH   rI   Cr   c                    s   g | ]} j | jd kqS )r   r   r   r   r   r   r     s     z.test_encoder_dtypes_pandas.<locals>.<listcomp>r3   r4   r   r   rH   rI   r   c                    s    g | ]}j | j | kqS r   r   r   ZX_typer7   r   r   r   	  s     )r*   rJ   r	   r   r   rK   r)   r   r   r   r   r-   r   r@   )rL   r{   r   r   r   r   test_encoder_dtypes_pandas  s    

"

"r   c                  C   s*   t  } ddgddgg}tj| j| d S )NrO   r   rP   r   )r	   r   ZtestingZassert_no_warningsr   )r7   r   r   r   r   test_one_hot_encoder_warning  s    r   missing_valuec           	      C   s  dddd| g}t |d}ddddd	gddd
dd	gdddd| gg}|| }d
dd
d
d
gdd
dd
d
gdddddgg}t|| |j|kstdd t|j|jD }|	|}t
j|td}t|d rzt|d d |d d  t|d stt|d stt|d d d df |d d d df  t|dd df |dd df  t|d sftt|d stnt|| t|| d S )Nre   rT   r   rt   r   rg   r   rf   r3   r   r   c                 S   s   g | ]\}}|| qS r   r   )r   rh   Zfeaturer   r   r   r   !  s    z4test_one_hot_encoder_drop_manual.<locals>.<listcomp>r?   r:   )r:   r:   )r	   r   r   r   rp   r   r   r   r   rw   r   r   r^   r   )	r   Zcats_to_dropr7   r   Ztransr{   Zdropped_catsZX_inv_transZX_arrayr   r   r    test_one_hot_encoder_drop_manual  s2    
(

*"
r   rQ   c              	   C   sN   t | d}d}tjt|d( |dddgdddgdd	d
gg W 5 Q R X d S )Nr   z-`drop` should have length equal to the numberr'   rg   r   rf   re   r   r   ;   )r	   r*   r+   r,   r)   )rp   r7   err_msgr   r   r   test_invalid_drop_length8  s    
r   densityr   Zdensec                 C   s   t | d}t | |d}dddgdddgg}|| || t|j|j |dkrbt|jd	 n0t||j|jD ]\}}}|t| |ksrtqrt|jt	j
st|jjtkstd S )
Nr   rr   r   r   r3   r   r4   rq   r   )r	   r)   r   r   r   r   r   r   r   r   Zndarrayr@   r^   )r   rp   Zohe_baseZohe_testr   Zdrop_catZdrop_idxZcat_listr   r   r   test_categories@  s     


  r   Encoderc                 C   s   d|    d kstd S )NZcategoricalZX_types)Z	_get_tagsr   )r   r   r   r   "test_encoders_has_categorical_tagsT  s    r   kwargsmax_categoriesmin_frequency   g(\?r   )r   r   rT   r6   rA   c           
      C   s   t dgd dgd  dgd  dgd  gj}tf |d	d
d| |}t|jdddgg dgdgdgdgdgg}t ddgddgddgddgddgg}||}t|| dd dgdgd  D }|	|}t|| |
 }	tddg|	 dS )zpTest that different parameters for combine 'a', 'c', and 'd' into
    the infrequent category works as expected.r3   r   r4   r   r   rR   r   r   r"   F)r6   r    r   er   r   c                 S   s   g | ]
}|gqS r   r   r   colr   r   r   r   w  s     z2test_ohe_infrequent_two_levels.<locals>.<listcomp>infrequent_sklearnr#   r   x0_infrequent_sklearnNr   r   rC   r	   r)   r   infrequent_categories_r-   r   rw   r[   )
r   r6   X_trainr   X_testr   r}   expected_invX_invr\   r   r   r   test_ohe_infrequent_two_levelsY  s(    2(



r   c                 C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d|}t|jdg t dgdgg}||}tdgdgg| |	 }tdg| |
|}tdgdgg| dS )z3Test two levels and dropping the frequent category.r3   r   r4   r   r   rR   r   r   r"   Fr   r    r   r   rp   r   r   r   r   N)r   r   rC   r	   r)   r   r   r-   r   r[   rw   )rp   r   r   r   r}   r\   	X_inverser   r   r   ,test_ohe_infrequent_two_levels_drop_frequent  s"    2

r   c              	   C   sz   t dgd dgd  dgd  dgd  gj}td	d
d| d}d| d d}tjt|d || W 5 Q R X dS )z_Test two levels and dropping any infrequent category removes the
    whole infrequent category.r3   r   r4   r   r   rR   r   r   r"   Fr   r   Unable to drop category r   ( from feature 0 because it is infrequentr'   Nr   r   rC   r	   r*   r+   r,   r)   rp   r   r   r8   r   r   r   5test_ohe_infrequent_two_levels_drop_infrequent_errors  s    2r   r   gQ?g{Gz?r   c           	   	   C   s  t dgd dgd  dgd  dgd  gj}tf d	d
d| |}t|jddgg dgdgdgdgdgg}t dddgdddgdddgdddgdddgg}||}t|| dgdgdgdgdgg}|	|}t|| |
 }tdddg| dS )zkTest that different parameters for combing 'a', and 'd' into
    the infrequent category works as expected.r3   r   r4   r   r   rR   r   r   r"   Fr    r   r   r   r   r   r   r   r   Nr   )	r   r   r   r   r   r}   r   r   r\   r   r   r    test_ohe_infrequent_three_levels  s0    2 2



r  c              	   C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d|}t dgdgdgg}tddgddgddgg|| |jdd| d}tj	t
|d |dgdgg}W 5 Q R X tddgddgg| dS )z5Test three levels and dropping the frequent category.r3   r   r4   r   r   rR   r   r   r"   Fr   r   r   r!   r%   r&   r'   r   N)r   r   rC   r	   r)   r   r-   r`   r*   warnsUserWarning)rp   r   r   r   r8   r}   r   r   r   .test_ohe_infrequent_three_levels_drop_frequent  s     2"r  c              	   C   sz   t dgd dgd  dgd  dgd  gj}td	d
d| d}d| d d}tjt|d || W 5 Q R X dS )z7Test three levels and dropping the infrequent category.r3   r   r4   r   r   rR   r   r   r"   Fr   r   r   r   r'   Nr   r   r   r   r   7test_ohe_infrequent_three_levels_drop_infrequent_errors  s    2r  c               	   C   s   t dgd dgd  dgd  dgd  gj} td	d
dd| }t|jddgg dgdgdgdgg}t dddgdddgdddgdddgg}||}t|| dgg}d}t	j
t|d || W 5 Q R X dS )zmTest that different parameters for combining 'a', and 'd' into
    the infrequent category works as expected.r3   r   r4   r   r   rR   r   r   r$   F)r    r   r   r   r   badz.Found unknown categories \['bad'\] in column 0r'   N)r   r   rC   r	   r)   r   r   r-   r   r*   r+   r,   )r   r   r   r   r}   r8   r   r   r   (test_ohe_infrequent_handle_unknown_error  s"    2  *

r  c                 C   s   t jdgd dgd  gtdj}tf ddddggd	d
d| |}dgdgdgdgdgg}t ddgddgddgddgddgg}||}t|| dddgg}dgdgg}|D ].}|j|d| tdgdgg|| qdS )zG'a' is the only frequent category, all other categories are infrequent.r3   r   r   rW   r?   r   r   r4   Fr"   r6   r   r    r   r   rq   r   r   N)	r   r   r^   rC   r	   r)   r-   r   r`   )r   r   r   r   r   r}   Zdropsrp   r   r   r   5test_ohe_infrequent_two_levels_user_cats_one_frequent  s&    "(

r	  c                  C   s   t jdgd dgd  dgd  dgd  gtd	j} tddddggd
ddd| }t|jdddgg dgdgdgdgdgg}t ddgddgddgddgddgg}||}t	|| dd dgdgd  D }|
|}t|| dS )zFTest that the order of the categories provided by a user is respected.r3   r   r4   r   r   rR   r   r   r?   Fr"   r   r6   r   r    r   r   r   r   c                 S   s   g | ]
}|gqS r   r   r   r   r   r   r   E  s     z<test_ohe_infrequent_two_levels_user_cats.<locals>.<listcomp>r   r#   Nr   r   r^   rC   r	   r)   r   r   r-   r   rw   r   r   r   r   r}   r   r   r   r   r   (test_ohe_infrequent_two_levels_user_cats/  s(    ( (


r  c               	   C   s   t jdgd dgd  dgd  dgd  gtd	j} tddddggd
ddd| }t|jddgg dgdgdgdgdgg}t dddgdddgdddgdddgdddgg}||}t	|| dgdgdgdgdgg}|
|}t|| dS )zTest that the order of the categories provided by a user is respected.
    In this case 'c' is encoded as the first category and 'b' is encoded
    as the second one.r3   r   r4   r   r   rR   r   r   r?   Fr"   r
  r   r   r   r   Nr  r  r   r   r   *test_ohe_infrequent_three_levels_user_catsJ  s2    ( 2


r  c                  C   s   t jdddddddddg	dddddddddg	f } tdddd}||  ddgddgg}||}t|ddddgddddgg dS )	zaTest infrequent categories where feature 0 has infrequent categories,
    and feature 1 does not.r   r   r   r   r   F)r   rp   r   N)r   c_r	   r)   r-   r   )r   r   r   r}   r   r   r   test_ohe_infrequent_mixedn  s    2

r  c            	      C   s  t jdddddddddg	dddddddddg	dddddddddg	f } tdddd	}||  }t|jd ddg t|jd ddg t|jd d
 | }tddddddddg| ddddddddgddddddddgddddddddgddddddddgddddddddgddddddddgddddddddgddddddddgddddddddgg	}t|| dddgdddgg}|	|}ddddddddgddddddddgg}t||  |
|}t jddd
gddd
ggtd}t|| tdddd	| }tjtdd |	| W 5 Q R X dddgdddgg}|	|}ddddddddgddddddddgg}t||  |
|}t jdddgdddggtd}t|| d
S )z?Test infrequent categories with feature matrix with 3 features.r   r   r   r   r   rR   rA   r"   r6   r   r    NZx0_0Zx0_3r   Zx1_0Zx1_5Zx1_infrequent_sklearnZx2_0Zx2_1r#   r   r?   r$   r&   r'   )r   r  r	   r   r   r   r   r[   r   r-   rw   r   r^   r)   r*   r+   r,   )	r   r   r}   r\   r   r   X_test_transr   r   r   r   r   'test_ohe_infrequent_multiple_categories  s      


(
 
  
(
r  c            	      C   s(  t d} | jdddddddddg	dddd	d	d
dddg	dddgd}tdddd}|| }t|jd ddg t|jd ddd
g ddddddgddddddgddddddgddddddgddddddgddddddgddddddgddddddgddddddgg	}t|| | jddgdd
gdddgd}ddddddgddddddgg}|	|}t||  |
|}tjddgddggtd}t|| | jddgd
dgdddgd}|	| }ddddddgddddddgg}t|| |
|}tjddgddggtd}t|| dS )zHTest infrequent categories with a pandas dataframe with multiple dtypes.rF   r3   fr   r4   r   r   r   rR   rT   )strr   r  r   columnsrA   r"   r  r      r   r?   N)r*   rJ   rK   r	   r   r   r   r   r   r-   rw   r   r   r^   )	rL   r   r   r}   r   r   r  r   r   r   r   r   .test_ohe_infrequent_multiple_categories_dtypes  s\    
  	
  


  

 r  rV   )r   r   c                 C   sp   t dgd dgd  dgd  dgd  gj}tf d	d
d| }|| |dgg}t|dgg dS ),All user provided categories are infrequent.r3   r   r4   r   r   rR   r   r   r"   Fr   r   N)r   r   rC   r	   r)   r-   r   r   r   r   r}   r   r   r   $test_ohe_infrequent_one_level_errors  s    2 
r  c                 C   sf   t jdgd gtdj}tf ddddggdd	d
| |}|dgdgg}t|dgdgg dS )r  r   r   r?   r   r   r3   r4   Fr"   r  r   N)r   r   r^   rC   r	   r)   r-   r   r  r   r   r   5test_ohe_infrequent_user_cats_unknown_training_errors*  s    r  c               	   C   sH   ddgddgddgg} d}t jt|d tdd	|  W 5 Q R X d S )
NrO   r   rP   r   r   z'`sparse` was renamed to `sparse_output`r'   Fr   )r*   r  FutureWarningr	   r)   )r   r8   r   r   r   &test_one_hot_encoder_sparse_deprecated;  s    r  zinput_dtype, category_dtypeZOOZOUZUOZUUSOZSUZSS
array_typer   r   Z	dataframec           
      C   s   t jdgdgg| d}t jddg|dg}t|dd|}tdgdgdgdgg|| d}||}t ddgddgddgddgg}t|| t|d|}	|	|}t dgdgdgdgg}t|| d	S )
a"  Check that encoding work with object, unicode, and byte string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15616
    https://github.com/scikit-learn/scikit-learn/issues/15726
    https://github.com/scikit-learn/scikit-learn/issues/19677
    r4   r3   r?   Frc   r   r   r5   N)	r   r   r	   r)   r   r-   r   r
   r   )
r>   Zcategory_dtyper!  r   r6   r   r   r}   r   oer   r   r   test_encoders_string_categoriesD  s      
"

r#  c               	   C   sh   t jdgdggdd} t jddgddg}t|dd}td}tjt|d	 ||  W 5 Q R X d
S )zCheck that this mixture of predefined categories and X raises an error.

    Categories defined as bytes can not easily be compared to data that is
    a string.
    r4   r3   Ur?   SFrc   zjIn column 0, the predefined categories have type 'bytes' which is incompatible with values of type 'str_'.r'   N)	r   r   r	   rx   ry   r*   r+   r,   r)   )r   r6   r   r8   r   r   r   $test_mixed_string_bytes_categoricalsc  s    r&  c                 C   sP   t jdd| d| ggtdj}tddd|}| }t|ddd	|  g d S )
Nr3   r4   r?   Fr!   r   r    Zx0_ar   Zx0_)r   r   r^   rC   r	   r)   r[   r   )r   r   r   namesr   r   r   )test_ohe_missing_values_get_feature_namesx  s    r)  c                  C   s   t d} | jddd dgtjdddtjgtddd	d
gd}tdddddddgdddddddgdddddddgdddddddgg}t|}t|| d S )NrF   dogrh   r   r   r#   r?   )col1col2r+  r,  r  r   )	r*   rJ   rK   r   r   ri   floatrd   r   )rL   dfexpected_df_transrm   r   r   r   %test_ohe_missing_value_support_pandas  s     

	r0  pd_nan_typepd.NAznp.nanc              
   C   s   t d}| dkr|jntj}|d|jdd|ddgddi}td	d	d
d	gd
d	d	d	gd	d	d	d
gd	d
d	d	gd
d	d	d	gg}td|d}|	|}t
|| t|jd
kstt|jd	 d d dddg t|jd	 d std S )NrF   r2  r+  r   r3   r4   categoryr?   r   r   Fr'  r:   )r*   rJ   NAr   ri   rK   r   r   r	   r   r   lenr   r   r   isnan)r1  r    rL   pd_missing_valuer.  r/  r   df_transr   r   r   1test_ohe_missing_value_support_pandas_categorical  s*    
 







r9  c              	   C   s   ddgddgddgg}t dd| d}||}tdddgdddgdddgg}t|| d	d
gg}tdddgg}d}tjt|d ||}W 5 Q R X t|| |	|}t
|tjddggtd dS )zZCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during transform.r3   r   r4   r   r   rq   Frp   r   r    r   r   tFound unknown categories in columns \[0, 1\] during transform. These unknown categories will be encoded as all zerosr'   r?   Nr	   r   r   r   r   r*   r  r  r-   rw   r   r^   r    r   r   r}   rD   r   warn_msgr   r   r   r   /test_ohe_drop_first_handle_unknown_ignore_warns  s.      




r?  c              	   C   s   ddgddgddgg}t dd| d}||}tddddgddddgddddgg}t|| d	d
gg}tddddgg}d}tjt|d ||}W 5 Q R X t|| |	|}t
|tjddggtd dS )zDCheck drop='if_binary' and handle_unknown='ignore' during transform.r3   r   r4   r   r   r   Fr:  r   r   r;  r'   Nr?   r<  r=  r   r   r   3test_ohe_drop_if_binary_handle_unknown_ignore_warns  s.      







r@  c              	   C   s   ddgddgddgg}t dd| ddgddggd}|| d	dgg}tddgg}d
}tjt|d ||}W 5 Q R X t|| dS )znCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during fit with categories passed in.r3   r   r4   r   r   rq   F)rp   r   r    r6   r   zqFound unknown categories in columns \[0\] during transform. These unknown categories will be encoded as all zerosr'   N)	r	   r)   r   r   r*   r  r  r-   r   )r    r   r   r   rD   r>  r}   r   r   r   'test_ohe_drop_first_explicit_categories  s    

rA  c               	   C   sX   t t jdddggj} tt jd}dt j }tjt|d |	|  W 5 Q R X dS )zDTest ordinal encoder with nan passthrough fails when dtype=np.int32.r   r   r?   zdThere are missing values in features \[0\]. For OrdinalEncoder to encode missing values with dtype: r'   N)
r   r   ri   rC   r
   int32r*   r+   r,   r)   )r   r"  r8   r   r   r   Btest_ordinal_encoder_passthrough_missing_values_float_errors_dtype  s    
rC  encoded_missing_valuer   c                 C   s   t jt jdddggt jdj}t| d|}t|jdks@t	t
|jd ddt jg ||}t
|| gdgdgdgg ||}t
|| dS )	z.Test ordinal encoder with nan on float dtypes.r   r   r?   rD  r   r   r   N)r   r   ri   r   rC   r
   r)   r5  r   r   r   r-   rw   )rD  r   r"  r}   r   r   r   r   5test_ordinal_encoder_passthrough_missing_values_float,  s    

rF  c              	   C   s$  t d}| dkr|jntj}|d|jdd|ddgddi}t|d	|}t	|j
d
ksbtt|j
d dd dddg t|j
d d st||}t|dgdg|gdgdgg ||}|jdkstt|dddf ddg t|dddf ddg t|d s tdS )z0Check ordinal encoder is compatible with pandas.rF   r2  r+  r   r3   r4   r3  r?   rE  r   r   Nr   r:          @r   r   )r   r   r   ru   )r*   rJ   r4  r   ri   rK   r   r
   r)   r5  r   r   r   r6  r-   r   rw   r   )r1  rD  rL   r7  r.  r"  r8  r   r   r   r   =test_ordinal_encoder_missing_value_support_pandas_categorical>  s$    
 

rH  rG  zobject-None-missing-valuezobject-nan-missing_valueznumeric-missing-valuec              	   C   st   t |d}tdgtjgg}t|| | |jd j|ksBtt |d}t	j
tdd || W 5 Q R X dS )z.Test ordinal encoder for specified categories.r5   r   r   r&   r'   N)r
   r   r   ri   r   r   r   r@   r   r*   r+   r,   r)   )r   r0   r   r   r"  r{   r   r   r   =test_ordinal_encoder_specified_categories_missing_passthrough_  s    &

rI  zX, expected_X_trans, X_testr   r   c                 C   s8   t ddd}|| }t|| t||dgg dS )z>Test the interaction between missing values and handle_unknownr   r:   r   g      N)r
   r   r   r-   )r   Zexpected_X_transr   r"  r}   r   r   r   /test_ordinal_encoder_handle_missing_and_unknown  s    

rJ  c               	   C   s   t dddgdddgg} t| }t }d}tjt|d || W 5 Q R X tjt|d |	| W 5 Q R X |	| }t|}tjt|d |
| W 5 Q R X dS )zCheck that we raise proper error with sparse input in OrdinalEncoder.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19878
    r   r   r   r   z6A sparse matrix was passed, but dense data is requiredr'   N)r   r   r   Z
csr_matrixr
   r*   r+   	TypeErrorr)   r   rw   )r   ZX_sparseencoderr   r}   r   r   r   r   test_ordinal_encoder_sparse  s    


rM  c               	   C   s   t ddddddgddt jf } tdddggddd	}||  tdddggd
d}tjtdd ||  W 5 Q R X dS )zCheck OrdinalEncoder.fit works with unseen category when
    `handle_unknown="use_encoded_value"`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    r   r   r   r   Nr:   r   r   )r6   r    r   r$   r   r&   r'   )r   r   Znewaxisr
   r)   r*   r+   r,   )r   r"  r   r   r   -test_ordinal_encoder_fit_with_unseen_category  s    $
  
rN  r   ZAAOr$  r   c                 C   s4   t ddd}||  ||}t|ddgg dS )zChecks that `OrdinalEncoder` transforms string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    r   ir   r   N)r
   r)   r-   r   )r   r   r7   r}   r   r   r   1test_ordinal_encoder_handle_unknown_string_dtypes  s    

rP  c                  C   sf   t ddddgdd} t | }t|jt j| ddj |	| }t|dgd	gd
gdgg dS )zCheck that `OrdinalEncoder` accepts Python integers that are potentially
    larger than 64 bits.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20721
    l   	HP
1& l   	H]viel   	 :?i}Ga l   IRK2e6kr:   r   r   )Zaxisr   r   N)
r   r   r;   r
   r)   r   r   sortrC   r-   )r   rL  r}   r   r   r   #test_ordinal_encoder_python_integer  s     
rR  c                  C   sL   t d} dddg}| jdddgg|d}t |}| }t|| d	S )
z-Check feature names out is same as the input.rF   r4   r   r3   r   r   r   r  N)r*   rJ   rK   r
   r)   r[   r   )rL   r(  r   r7   Zfeature_names_outr   r   r   .test_ordinal_encoder_features_names_out_pandas  s    

rS  c                  C   s   t jdgdgt jggtd} tdt jdd| }|| }t|dgdgdgg t jd	gt jggtd}||}t|t jgdgg ||}|d d d
kst	t 
|d d st	d
S )zECheck interactions between encode_unknown and missing value encoding.r3   r4   r?   r   r    r   rD  r   r   r   N)r   r   ri   r^   r
   r)   r-   r   rw   r   r6  )r   r"  r}   r   r  X_roundtripr   r   r   0test_ordinal_encoder_unknown_missing_interaction  s     


rW  with_pandasc              	   C   s   t jddgddgdt jggtd}d}| rPtd}|j|d	d
gd}|d }n|d }tdd}tjt	|d |
| W 5 Q R X dS )zXCheck OrdinalEncoder errors when encoded_missing_value is used by
    an known category.r3   r*  r4   rh   r   r?   zTencoded_missing_value \(1\) is already used to encode a known category in features: rF   letterZpetr  z	\['pet'\]z\[1\]r   rE  r'   N)r   r   ri   r^   r*   rJ   rK   r
   r+   r,   r)   )rX  r   	error_msgrL   r"  r   r   r   0test_ordinal_encoder_encoded_missing_value_error8  s    "


r[  z4X_train, X_test_trans_expected, X_roundtrip_expected1c                 C   s   t dtjtjd| }tdgtjgdgg}||}t|| ||}|jd }t	|D ]V}||df }	||df }
|	dkr|
dkst
q`t|	rt|
st
q`|
|	ks`t
q`dS )znCheck transform when unknown_value and encoded_missing_value is nan.

    Non-regression test for #24082.
    r   rU  r\  r4   r   N)r
   r   ri   r)   r   r-   r   rw   r   r   r   r   r6  )r   ZX_test_trans_expectedZX_roundtrip_expectedr"  r   r  rV  Z	n_samplesr   Zexpected_valvalr   r   r   9test_ordinal_encoder_unknown_missing_interaction_both_nanR  s(    



r^  c               	   C   s   t d} | ddgddgd}t }|jdd d}t jt|d	 || W 5 Q R X td
djdd}td
djdd}||}||}t|	 | t
| |j dS )z*Check OneHotEncoder works with set_output.rF   r3   r4   r   r   rG   r-   z*Pandas output does not support sparse datar'   Fr   defaultN)r*   rJ   rK   r	   
set_outputr+   r,   r   r   to_numpyr   r[   r  )rL   rM   r   r(   Zohe_defaultZ
ohe_pandas	X_defaultX_pandasr   r   r   test_one_hot_encoder_set_output  s    


re  c                  C   st   t d} | ddgddgd}t jdd}t jdd}||}||}t| | t|	 |j
 d	S )
z+Check OrdinalEncoder works with set_output.rF   r3   r4   r   r   rG   r`  r_  N)r*   rJ   rK   r
   ra  r   r   rb  r   r[   r  )rL   rM   Zord_defaultZ
ord_pandasrc  rd  r   r   r   test_ordinal_set_output  s    


rf  c                  C   sz   dddddgddgg} t | d}|ddgg t| t|jksFtt|jD ]$\}}|jtksftt| | | qPd	S )
zjCheck that the categories_ dtype is `object` for string categories

    Regression test for gh-25171.
    asZmmasZeasZrasZacsr\  2r5   N)	r	   r)   r5  r   r   	enumerater@   r^   r   )r6   r7   nrh   r   r   r    test_predefined_categories_dtype  s    
rk  )qrx   Znumpyr   Zscipyr   r*   Zsklearn.exceptionsr   Zsklearn.utils._testingr   r   r   Zsklearn.utilsr   Zsklearn.preprocessingr	   r
   r   markZparametrizer2   r9   r<   rB  Zfloat32r   rE   rN   r]   r_   rb   rd   r   r^   ri   r-  rn   r|   r~   r   r   r   r   r   r   Zstr_Zfloat_r   rC   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r	  r  r  r  r  r  r  r  r  r#  r&  r)  r0  r9  r?  r@  rA  rC  rF  rH  rI  rJ  rM  rN  rP  rR  rS  rW  r[  rB   r^  re  rf  rk  r   r   r   r   <module>   s  


<
  
/0&


 &&* !





-8	
	
		





$







 
$[A

	 

%
$

		"

!