U
    3d                     @   s&  d dl mZ d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dlm
Z
 d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlZd dlmZ d dlmZ d dlm Z  d dl!m"Z"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*Z*d dl+m,Z, dZ-dZ.e-e. Z/dd Z0dd Z1dd  Z2d!d" Z3d#d$ Z4d%d& Z5ej67d'eefd(d) Z8d*d+ Z9d,d- Z:d.d/ Z;d0d1 Z<d2d3 Z=d4d5 Z>d6d7 Z?d8d9 Z@d:d; ZAd<d= ZBd>d? ZCd@dA ZDdBdC ZEdDdE ZFdFdG ZGdHdI ZHdJdK ZIdLdM ZJdNdO ZKdPdQ ZLdRdS ZMe#dTdU ZNdVdW ZOej67d'eefdXdY ZPdZd[ ZQd\d] ZRd^d_ ZSd`da ZTe#dbdc ZUej67d'eefddde ZVdfdg ZWdhdi ZXdjdk ZYe#dldm ZZdndo Z[dpdq Z\ej67drej]ej^ej_gdsdt Z`dudv Zadwdx Zbdydz Zcd{d| Zdd}d~ Zedd Zfdd Zgdd Zhe#dd Zidd Zjdd Zkdd Zlej67d'eeefdd Zmej67dejnejogdd Zpdd Zqej67dejrejodfejsejodfejnejndfejoejodfgdd Ztej67deddeddeddgdd Zudd Zve#dd Zwe%dd Zxe#ej67deeegdd Zyej67deeegej67ddezdfde{dfgdd Z|ej67deeej}ee#dgej67ddd dd gej67dddgdd Z~ej67deeegdd Zej67d'eeegej67dddgddddddddf	ddd dddddddf	ddd dddddddf	dddd dddd dddf	ddddddd dddf	dgddЄ Zej67dedddԜddd֜gfee-ffdd؄ Zddڄ Ze#dd܄ Zej67deeeegddބ ZdS )    )MappingN)sparse)
strip_tags)strip_accents_unicode)strip_accents_ascii)HashingVectorizer)CountVectorizer)TfidfTransformer)TfidfVectorizer)ENGLISH_STOP_WORDS)train_test_split)cross_val_score)GridSearchCV)Pipeline)	LinearSVC)clone)assert_array_almost_equal)assert_array_equal)IS_PYPY)assert_almost_equalfails_if_pypyassert_allclose_dense_sparseskip_if_32bit)defaultdict)partial)StringIO)zthe pizza pizza beer copyrightzthe pizza burger beer copyrightz!the the pizza beer beer copyrightzthe burger beer beer copyrightzthe coke burger coke copyrightzthe coke burger burger)zthe salad celeri copyrightz)the salad salad sparkling water copyrightzthe the celeri celeri copyrightzthe tomato tomato salad waterz the tomato salad water copyrightc                 C   s   t |  S N)r   uppers r    N/tmp/pip-unpacked-wheel-zrfo1fqw/sklearn/feature_extraction/tests/test_text.py	uppercase>   s    r"   c                 C   s   |  ddS )N   ée)replacer   r    r    r!   strip_eacuteB   s    r&   c                 C   s   |   S r   splitr   r    r    r!   split_tokenizeF   s    r)   c                 C   s   dgS )NZthe_ultimate_featurer    r   r    r    r!   lazy_analyzeJ   s    r*   c                  C   s   d} d}t | |kstd} d}t | |ks0td} d}t | |ksHtd} d}t | |ks`td	} d
}t | |ksxtd} d}t | |kstd} d
}t | |kstd S )N   àáâãäåçèéêëaaaaaaceeee   ìíîïñòóôõöùúûüýiiiinooooouuuuy   إu   ا   this is à testthis is a testu   öou   ̀́̂̃ u   ȫ)r   AssertionErroraexpectedr    r    r!   test_strip_accentsN   s*    r8   c                  C   sd   d} d}t | |kstd} d}t | |ks0td} d}t | |ksHtd} d}t | |ks`td S )	Nr+   r,   r-   r.   r/   r3   r0   r1   )r   r4   r5   r    r    r!   test_to_asciir   s    r9   
Vectorizerc              
   C   s  | dd  }d}dddddd	d
dddg
}|||ks:td}dddddddg}|||ks`t| dd  }td}dddddddg}|||kst| td  }d}ddd d!d"d#d$d%d&d'g
}|||kst| tdd(  }d}d)ddddd*d+ddd,g
}|||kstd S )-Nasciistrip_accents:   J'ai mangé du kangourou  ce midi, c'était pas très bon.aimangedu	kangouroucemidietaitpastresbonz0This is a test, really.

 I met Harry yesterday.thisistestZreallyZmetZharryZ	yesterdayfile)input'This is a test with a file-like object!withlikeobjectpreprocessoru;   J'ai mangé du kangourou  ce midi,  c'était pas très bon.ZAIZMANGEZDUZ	KANGOUROUZCEZMIDIZETAITZPASZTRESZBON)	tokenizerr=   zj'aizmidi,zc'etaitzbon.)build_analyzerr4   r   r"   r)   )r:   watextr7   r    r    r!   test_word_analyzer_unigrams   sb    rX   c                  C   sT   t dddd } d}dddd	d
ddddddddddddddg}| ||ksPtd S )Nwordunicode      analyzerr=   ngram_ranger>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   zai mangezmange duzdu kangourouzkangourou cezce midiz
midi etaitz	etait paszpas tresztres bon)r   rU   r4   )rV   rW   r7   r    r    r!   'test_word_analyzer_unigrams_and_bigrams   s6      
ra   c               	   C   sp   d} |  d}tddd }tt || W 5 Q R X tdddd }tt || W 5 Q R X d S )	Nr>   zutf-8r[   r;   )r`   encodingchar      )r_   r`   rb   )encoder   rU   pytestraisesUnicodeDecodeError)rW   Z
text_bytesrV   car    r    r!   test_unicode_decode_error   s    
  
rl   c                  C   s   t dddd } d}dddd	d
g}| |d d |ks<tdddddg}| |dd  |ksbtd}dddddg}| |d d |kstdddddg}| |dd  |kstt dddd } td}dddddg}| |d d |kstd S ) Nrc   rZ   rd   r^   u9   J'ai mangé du kangourou  ce midi, c'était pas très bonzj'az'aizai zi mz ma   zs tresz tres ztres bzres bozes bon1This 
	is a test, really.

 I met Harry yesterdaythihisis zs iz isz yesteyesteresterdsterdaterdayrL   rM   r_   r`   rN   r   rU   r4   r   cngarW   r7   r    r    r!   test_char_ngram_analyzer   s.      
  
r{   c                  C   s   t dddd } d}dddd	d
g}| |d d |ks<tdddddg}| |dd  |ksbtt dddd } td}ddddddg}| |d d |kstd S )NZchar_wbrZ   rd   r^   ro   z thrp   rq   rr   z thirm   rs   rt   ru   rv   zerday rn   rL   rw   zA test with a file-like object!z a z teZtesestzst z tesrf   rx   ry   r    r    r!   test_char_wb_ngram_analyzer  s$      
  
r}   c                  C   s   t dddd } d}dddg}| |d d	 |ks8td
ddg}| |dd  |ksZtt dddd }t|}||| |kstd S )NrY   rZ   rd   r^   ro   zthis is testzis test reallyztest really metre   ztest really met harry yesterdayzthis is test really met harryz"is test really met harry yesterdayrL   rw   rx   )rz   rW   r7   Z	cnga_filerL   r    r    r!   test_word_ngram_analyzer$  s(      

  
r   c                  C   s   ddd} t |  }ttttttfD ]}|| }t|d}|	t
 t|trb|j| ksttnt |j|kstt|t
}|jd t|kst|| }t|d}||}t||jd ks(tq(d S )Nr   r\   pizzabeer
vocabulary)setkeysdictlistiterr   r   intr   fitJUNK_FOOD_DOCS
isinstancer   vocabulary_r4   	transformshapeleninverse_transform)vocabtermstypvvectXinvr    r    r!   &test_countvectorizer_custom_vocabulary;  s    






r   c                  C   sd   ddg} t dt| dfdt fg}|t}t|jd jt| ksJt|j	d t
| ks`td S )Nr   r   countr   tfidfr\   )r   r   r	   fit_transformALL_FOOD_DOCSr   named_stepsr   r4   r   r   )Zwhat_we_likepiper   r    r    r!   /test_countvectorizer_custom_vocabulary_pipelineP  s    
r   c               	   C   sB   ddd} d}t jt|d t| d}|dg W 5 Q R X d S )Nr   r   z$Vocabulary contains repeated indicesmatchr   Zpasta_sizilianarh   ri   
ValueErrorr   r   )r   msgr   r    r    r!   7test_countvectorizer_custom_vocabulary_repeated_indices]  s
    

r   c               	   C   s>   ddd} t jtdd t| d}|dg W 5 Q R X d S )Nr\   r]   r   zdoesn't contain indexr   r   Zpasta_verdurar   r   r   r    r    r!   0test_countvectorizer_custom_vocabulary_gap_indexe  s    

r   c               	   C   s   t  } | jdd |  tks"t| jdd tt |   W 5 Q R X | jdd tt |   W 5 Q R X dddg}| j|d |  t|kstd S )Nenglish
stop_wordsZ_bad_str_stop_Z_bad_unicode_stop_Zsomeotherwords)	r   
set_paramsget_stop_wordsr   r4   rh   ri   r   r   )cvZstoplistr    r    r!   test_countvectorizer_stop_wordsl  s    
r   c               	   C   sj   t jtdd tg d} | dg W 5 Q R X t jtdd" tddd}|dd	d
g W 5 Q R X d S )Nzempty vocabularyr   r   foo      ?r   )max_dfr   zto be or not to bez
and me toozand so do your   )r   r   r    r    r!   %test_countvectorizer_empty_vocabulary{  s    
r   c                  C   sF   t  } | td d }| tdd  }|jd |jd ksBtd S )Nrm   r\   )r   r   r   r   r4   )r   ZX1X2r    r    r!   test_fit_countvectorizer_twice  s    r   c                  C   sD   ddddg} d}t |d}||  ddd	g}| }t|| d
S )zCheck `get_feature_names_out()` when a custom token pattern is passed.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    &This is the 1st document in my corpus. This document is the 2nd sample.And this is the 3rd one.Is this the 4th document?z'[0-9]{1,3}(?:st|nd|rd|th)\s\b(\w{2,})\btoken_patterndocumentZonesampleN)r   r   get_feature_names_outr   )corpusr   
vectorizerr7   feature_names_outr    r    r!   )test_countvectorizer_custom_token_pattern  s    


r   c               	   C   sF   ddddg} d}d}t |d}tjt|d ||  W 5 Q R X d	S )
zCheck that we raise an error if token pattern capture several groups.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    r   r   r   r   z)([0-9]{1,3}(?:st|nd|rd|th))\s\b(\w{2,})\bz,More than 1 capturing group in token patternr   r   Nr   rh   ri   r   r   )r   r   err_msgr   r    r    r!   <test_countvectorizer_custom_token_pattern_with_several_group  s    
r   c               	   C   sn   ddddg} d}t d| d}tjt|d ||  W 5 Q R X t  td	t ||  W 5 Q R X d S )
NZSampleZUpperZCaseZ
VocabularyzyUpper case characters found in vocabulary while 'lowercase' is True. These entries will not be matched with any documentsT)	lowercaser   r   error)	r   rh   warnsUserWarningr   warningscatch_warningssimplefilterr   )r   messager   r    r    r!   'test_countvectorizer_uppercase_in_vocab  s    
r   c                  C   sP   dddgdddgdddgg} t ddd| }dddg}||}t|| d	S )
z0Check get_feature_names_out for TfidfTransformerr\   r   Tl2
smooth_idfnormr6   cbN)r	   r   r   r   )r   trZfeature_names_inr   r    r    r!   %test_tf_transformer_feature_names_out  s
    

r   c                  C   s   dddgdddgdddgg} t ddd}||  }|dk sFtt|d jdddddg dddgdddgdddgg} t ddd}||  }|dk std S )	Nr\   r   Tr   r   r]   Zaxisr   )r	   r   toarrayallr4   r   sumr   r   r   r    r    r!   test_tf_idf_smoothing  s    r   c               	   C   s   dddgdddgdddgg} t ddd}||  }|dk sFtt|d jdddddg dddgdddgdddgg} t ddd}d	}tjt	|d
 ||   W 5 Q R X d S )Nr\   r   Fr   r   r]   r   r   zdivide by zeror   )
r	   r   r   r   r4   r   r   rh   r   RuntimeWarning)r   r   r   Zin_warning_messager    r    r!   test_tfidf_no_smoothing  s    r   c                  C   s   dgdgdgg} t ddd d}||  }|d dks<t|d |d ksPt|d |d ksdt|d dk stt|d dk std S )Nr\   r]   re   TF)sublinear_tfuse_idfr   r   )r	   r   r   r4   r   r    r    r!   test_sublinear_tf  s    r   c               	   C   sp  t td d } td g}ttd }tdd}|| }t|drL| }|d|jd f dksftt|jd	}||fD ]}|	|}t|dr| }|j}|d|d
 f dkst|d|d f dkst|d|d f dkstd|kstd|kst|d|d f dkst|d|d f dks6t|d|d f dksPt|d|d f dksztqzt
dd}	|	|	| }
t|	jt|jkst|
j|t|jfkst|		| }|jt|t|jfkstt
ddd}||	| }t|drtt
dd}tt |	| W 5 Q R X ttj|dddg|  t td d } tdd}|j|_||  }|jrtt|
| |	| }t|| td d	}tt |	|  W 5 Q R X |jddd | }d}t|}||}||kst|jdd d tt |  W 5 Q R X d |_tt |  W 5 Q R X d S )!Nr\         ?r   tocsrr   r   r]   r   saladtomatowaterthe	copyrightcokeburgerr   l1r   F)r   r   idf_Tr   r   r   r;   )r=   r   r>   Z_gabbledegook_)r=   rS   Z_invalid_analyzer_type_)r   r   r   r   r   hasattrr   r   r4   r   r	   r   r   r   r   rh   ri   r   r   npr   r
   r   fixed_vocabulary_r   build_preprocessorr   rU   )
train_data	test_dataZn_trainZv1Zcounts_trainZv2r   Zcounts_testr   t1r   Z
tfidf_testt2tft3tvZtfidf2Ztfidf_test2Zv3	processorrW   r7   resultr    r    r!   test_vectorizer   sv    











r  c                  C   s  d\} }}}t | |||d}|t |jj| ks6t|jj|ksFt|jj|ksVt|jj|ksftd|_d|_d|_d|_|jj| kst|jj|kst|jj|kst|jj|kst|t |jj|jkst|jj|jkst|jj|jkst|jj|jkstd S )N)r   FFF)r   r   r   r   r   T)	r
   r   r   _tfidfr   r4   r   r   r   )r   r   r   r   r   r    r    r!   test_tfidf_vectorizer_settersg  s2       

r  c                  C   s  t  } | t}|j}|jtt| jfks.t|j| jks>tt	
|jdksRtt	
|jdk sftt	|jdksztt	|jdk stt|jd D ]}tt	j|d jdd qt ddd} | t}|jtt| jfkst|j| jkst|j}||kst|d| k s tt	
|jdks6tt	|jdk sLtt|jd D ] }tt	j|d jdd qZd S )	Nr   r   r\   r]   r   r[   r   )r`   r   )r   r   r   nnzr   r   
n_featuresr4   dtyper   mindatamaxranger   Zlinalgr   )r   r   Z	token_nnziZ
ngrams_nnzr    r    r!   test_hashing_vectorizer  s,    

r  c               
   C   sN  t dd} tt |   W 5 Q R X | jr2t| t}|j	\}}t
| j|ksXt|  }t|tjspt|jtks~tt
||ksttddddddd	d
dg	| t|D ]\}}|| j|kstqddddddd	d
dg	}t |d} |  }tddddddd	d
dg	| | js tt|D ] \}}|| j|ks(tq(d S )Nr   r   r   r   celerir   r   r   	sparklingr   r   r   )r   rh   ri   r   r   r   r4   r   r   r   r   r   r   r   Zndarrayr  rQ   r   	enumerateget)r   r   Z	n_samplesr  Zfeature_namesidxnamer   r    r    r!   test_feature_names  sl    




r  c                 C   sX   ddddh}ddddd	d
dh}| ddd}| t t|j|ksFt|j|ksTtd S )Nr   r   r   r   r  r   r   r   r  r   r   g333333?   )r   max_features)r   r   r   r   r4   stop_words_)r:   Zexpected_vocabularyZexpected_stop_wordsr   r    r    r!   test_vectorizer_max_features  s    
r  c            	      C   s   t dd} t dd}t d d}| tjdd}|tjdd}|tjdd}|  }| }| }d| ks|td| kstd| kstd|t| kstd|t| kstd|t| kstd S )Nr\   r  re   r   r      r   )	r   r   r   r   r   r	  r4   r   Zargmax)	Zcv_1Zcv_3Zcv_NoneZcounts_1Zcounts_3Zcounts_NoneZ
features_1Z
features_3Zfeatures_Noner    r    r!   "test_count_vectorizer_max_features  s    


r  c                  C   s  dddg} t ddd}||  d|j ks2tt|j dksHtt|jd	ksZtd
|_||  d|j ks|tt|j dkstd|jkstt|jdkstd|_||  d|j kstt|j dkstd|jkstt|jdkstd S )Nabcdeaeatrc   r   r_   r   r6   rf   r   r   r  r]   r\   )r   r   r   r   r4   r   r  r   r   r   r    r    r!   test_vectorizer_max_df   s$    



r   c                  C   s  dddg} t ddd}||  d|j ks2tt|j dksHtt|jd	ksZtd
|_||  d|j ks|tt|j d
kstd|jkstt|jdkstd|_||  d|j kstt|j dkstd|jkstt|jdkstd S )Nr  r  r  rc   r\   )r_   min_dfr6   rf   r   r]   r   r  g?rm   )r   r   r   r   r4   r   r  r!  r  r    r    r!   test_vectorizer_min_df7  s$    



r"  c                  C   s   ddg} t ddd}||  }tdddd	d
g|  tdddddgdddddgg| t dddd}||  }tdddddgdddddgg| t dddtjd}|| }|jtjkstd S )Naaabcabbderc   r   r  r6   r   r   dr$   re   r\   r   r]   T)r_   r   binary)r_   r   r&  r  )	r   r   r   r   r   r   float32r  r4   )r   r   r   ZX_sparser    r    r!   test_count_binary_occurrencesN  s    ""
r(  c                  C   s   ddg} t ddd d}|| }t|dd jdks<tt|dd	 jd	ksXt|jtjkshtt ddd
d d}|| }t|jdkst|jtjkstt ddd
d tjd}|| }|jtjkstd S )Nr#  r$  Frc   )alternate_signr_   r   r   r\   re   r]   T)r_   r)  r&  r   )r_   r)  r&  r   r  )r   r   r   r	  r  r4   r  float64)r   r   r   r    r    r!   test_hashed_binary_occurrencesb  s0    
   
    
r+  c                 C   s  t }|  }||}||}t|ts,t| }t||D ]6\}}t	t
||}t	t
|}t|| q>t|st|jdkst| }	||	}
t||
D ]\}}tt	|t	| q| }||}t||D ]\}}tt	|t	| qd S )NZcsr)r   r   r   r   r   r4   rU   zipr   sortuniquer   r   issparseformatr   Ztocsc)r:   r  r   Ztransformed_dataZinversed_dataZanalyzedocZinversed_termsr   Ztransformed_data2Zinversed_data2Zterms2Ztransformed_data3Zinversed_data3Zterms3r    r    r!   !test_vectorizer_inverse_transform}  s(    



r2  c                  C   s   t t } dgtt  dgtt  }t| |ddd\}}}}tdt fdt fg}dd	gd
d}t||ddd}|||	|}	t
|	| |jdkst|jjd }
|
jdkstd S )Nr   r\   g?r   Z	test_sizerandom_stater   svcr\   r\   r[   ZhingeZsquared_hinge)vect__ngram_range	svc__lossre   )n_jobsr   r   )r   NOTJUNK_FOOD_DOCSr   r   r   r   r   r   r   predictr   best_score_r4   best_estimator_r   r`   r  targetr   r   Ztarget_trainZtarget_testpipeline
parametersZgrid_searchpredZbest_vectorizerr    r    r!   -test_count_vectorizer_pipeline_grid_selection  s$       
rD  c                  C   s   t t } dgtt  dgtt  }t| |ddd\}}}}tdt fdt fg}dd	gd
dd}t||dd}|||	|}	t
|	| |jdkst|jjd }
|
jdkst|
jdkst|
jrtd S )Nr   r\   g?r   r3  r   r5  r6  r[   )r   r   r7  )r8  Z
vect__normr9  )r:  r   r   )r   r;  r   r   r   r
   r   r   r   r<  r   r=  r4   r>  r   r`   r   r   r?  r    r    r!   'test_vectorizer_pipeline_grid_selection  s*       
rE  c                  C   s`   t t } dgtt  dgtt  }tdt fdt fg}t|| |dd}t|dddg d S )Nr   r\   r   r5  re   )r   r   )r   r;  r   r   r
   r   r   r   )r  r@  rA  Z	cv_scoresr    r    r!   )test_vectorizer_pipeline_cross_validation  s
    rF  c                  C   sx   d} t  }|| g}|jdks$ttd dd}|| g}|jdksJt|j|jksZttt	|j
t	|j
 d S )Nu   Машинное обучение — обширный подраздел искусственного интеллекта, изучающий методы построения алгоритмов, способных обучаться.)r\      F)r   r)  )r\   i   )r   r   r   r4   r   r   r  r   r   r-  r  )r   r   Z	X_countedZX_hashedr    r    r!   test_vectorizer_unicode  s    rH  c                  C   sF   ddg} t | d}|t}|t}t| |  |jsBtd S )Nr   r  r   )r
   r   r   r   r   r   r   r4   )r   r   ZX_1ZX_2r    r    r!   +test_tfidf_vectorizer_with_fixed_vocabulary  s    


rI  c                  C   s   t  t ddt ddt ddt ttdttdttdtttd	tt ttdt tg} | D ]f}t	|}t
|}t||jkst| | ksttrt|t rqjqjt|t|t qjd S )
Nr   r   T)r&  r[   r`   rR   )r_   r<   )r   r   r   r*   r   r   r&   r
   pickledumpsloadstype	__class__r4   
get_paramsr   r   r   r   )Z	instancesorigr   copyr    r    r!   test_pickling_vectorizer  s0    


rS  factoryc                 C   sB   t  }| |}d}tt|}||}||}||ks>tdS )z_Tokenizers cannot be pickled
    https://github.com/scikit-learn/scikit-learn/issues/12833
    r>   N)r   rK  rM  rL  r4   )rT  vecfunctionrW   Zroundtripped_functionr7   r   r    r    r!   test_pickling_built_processors6  s    rW  c                  C   s   t jd} t dddddddd	d
g	}tddD ]X}t| j|ddd}t|d}t	t
|}|t |t t| |  q2d S Nr   r   r   r  r   r   r   r  r   r   d   rm   F)sizer%   r   )r   randomRandomStatearrayr
  r   choicer   rK  rM  rL  r   r   r   r   )rngvocab_wordsxZ	vocab_setr   unpickled_cvr    r    r!   -test_countvectorizer_vocab_sets_when_picklingK  s.    


 rc  c                  C   s   t jd} t dddddddd	d
g	}tddD ]v}t }| j|ddd}tddD ]}|||| < qVt|d}t	t
|}|t |t t| |  q2d S rX  )r   r[  r\  r]  r
  r   r^  r   rK  rM  rL  r   r   r   r   )r_  r`  ra  Z
vocab_dictr   yr   rb  r    r    r!   .test_countvectorizer_vocab_dicts_when_picklingg  s4    


 re  c                  C   s   t  tttdtttdtf} | D ]R}|t }d |_|t }t	|d |t }t
|| t
|| q.d S )NrR   r<   r  )r
   r   r   r   r   r&   r   r   r  delattrr   )Zfitted_vectorizersr   Zvect_transformZstop_None_transformZstop_del_transformr    r    r!   test_stop_words_removal  s    


rg  c                  C   s`   t  t} t | }t|}t|}t||j	ks>t
t||  ||   d S r   )r   r   r   r	   r   rK  rL  rM  rN  rO  r4   r   r   )r   rQ  r   rR  r    r    r!   test_pickling_transformer  s    

rh  c                  C   sH   t  t} t | }t }|j|_t||  ||   d S r   )	r   r   r   r	   r   r   r   r   r   )r   rQ  rR  r    r    r!   test_transformer_idf_setter  s
    ri  c               	   C   s   t dd} | t t | jdd}| j|_t|t | t  t | jdd}d}tj	t
|d | j|_W 5 Q R X d S )NTr   r   r   Fz+`idf_` cannot be set when `user_idf=False`.r   )r
   r   r   r   r   r   r   r   rh   ri   r   )rQ  rR  r   r    r    r!   test_tfidf_vectorizer_setter  s    

rk  c               	   C   s`   t dd} | t t | jdd}t| j}dg|d  }tt t	|d| W 5 Q R X d S )NTr   rj  r   r\   r   )
r
   r   r   r   r   r   rh   ri   r   setattr)r   rR  Zexpected_idf_lenZinvalid_idfr    r    r!   %test_tfidfvectorizer_invalid_idf_attr  s    


rm  c               	   C   s<   dddddg} t | d}tt |g  W 5 Q R X d S )Nr6   r   r   r   r   r   r    r    r!   test_non_unique_vocab  s    
rn  c               	   C   s4   d} t }dd }tj|| d |  W 5 Q R X d S )Nz?np.nan is an invalid document, expected byte or unicode string.c                  S   s   t  } | dtjdg d S )Nhello worldhello hello)r   r   r   nan)Zhvr    r    r!   func  s    z0test_hashingvectorizer_nan_in_docs.<locals>.funcr   )r   rh   ri   )r   	exceptionrr  r    r    r!   "test_hashingvectorizer_nan_in_docs  s
    rt  c                  C   sl   t ddd d} | jst| ddg }t| ddddg | ddg }t| ddddg d S )NTF)r&  r   r   ro  rp  r\   r   )r
   r&  r4   r   r   r   Zravelr   )r   r   r   r    r    r!   test_tfidfvectorizer_binary  s    
ru  c                  C   s(   t dd} | t t| j| jj d S )NTr   )r
   r   r   r   r   r  )r   r    r    r!   test_tfidfvectorizer_export_idf  s    

rv  c                  C   s<   t dgd} t| }| t |t |j| jks8td S )Nr   r   )r
   r   r   r   r   r4   )Z
vect_vocabZvect_vocab_cloner    r    r!   test_vectorizer_vocab_clone  s
    

rw  c              	   C   s   d}|  }t jt|d |d W 5 Q R X t jt|d |d W 5 Q R X |ddg t jt|d |d W 5 Q R X d S )NzBIterable over raw text documents expected, string object received.r   zhello world!	some textzsome other text)rh   ri   r   r   r   r   )r:   r   rU  r    r    r!   &test_vectorizer_string_object_as_input  s    ry  X_dtypec                 C   s2   t jdd| dd}t |}|j|jks.td S N
   i N  *   )r  r4  )r   randr	   r   r  r4   )rz  r   ZX_transr    r    r!   test_tfidf_transformer_type  s    r  c                  C   s^   t jddtjdd} t | }t | }t |}t |}t|| |j	|j	ksZt
d S r{  )r   r~  r   r*  Z
csc_matrix
csr_matrixr	   r   r   r0  r4   )r   ZX_cscZX_csrZX_trans_cscZX_trans_csrr    r    r!   test_tfidf_transformer_sparse  s    


r  z0vectorizer_dtype, output_dtype, warning_expectedTFc              	   C   s   t dddg}t| d}d}|rHtjt|d ||}W 5 Q R X n*t  t	dt ||}W 5 Q R X |j
|kstd S )NnumpyscipyZsklearnr  z'dtype' should be used.r   r   )r   r]  r
   rh   r   r   r   r   r   r   r  r4   )Zvectorizer_dtypeZoutput_dtypeZwarning_expectedr   r   Zwarning_msg_matchZX_idfr    r    r!   test_tfidf_vectorizer_type  s    


r  rU  )r]   r\   rJ  c              	   C   s   | j }td| d}t| tr2tr2tjdd tjt	|d | 
dg W 5 Q R X tjt	|d | dg W 5 Q R X t| trtjt	|d | dg W 5 Q R X d S )NzInvalid value for ngram_range=z/ lower boundary larger than the upper boundary.*HashingVectorizer is not supported on PyPy)reasonr   zgood news everyone)r`   reescaper   r   r   rh   xfailri   r   r   r   r   )rU  Zinvalid_ranger   r    r    r!   $test_vectorizers_invalid_ngram_range0  s    

r  c                 C   s&   |   }|  }|  }| |||S r   )r   build_tokenizerr   _check_stop_words_consistency)Z	estimatorr   tokenize
preprocessr    r    r!   r  N  s    r  c               
   C   s   d} d|  }t  t t fD ]R}|jddddgd tjt|d |d	g W 5 Q R X |`t	|d
kst
qt  tdt |d	g W 5 Q R X t	|d kst
|jdddddgd tjt|d |d	g W 5 Q R X d S )Nz\['and', 'll', 've'\]z}Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens %s not in stop_words.you'veZyouyou'llANDr   r   ro  Fr   Zblah)r   r
   r   r   rh   r   r   r   Z_stop_words_idr  r4   r   r   r   )Zlstrr   rU  r    r    r!   'test_vectorizer_stop_words_inconsistentU  s$    
r  c                  C   s`   t jdtjd} tj}| j|| _| j|| _dddd}t | |}||jj	ks\t
dS )z
    Check that CountVectorizer._sort_features preserves the dtype of its sparse
    feature matrix.

    This test is skipped on 32bit platforms, see:
        https://github.com/scikit-learn/scikit-learn/pull/11295
    for more details.
    )rm   rm   r  r   r\   r]   )zscikit-learnrJ   zgreat!N)r   r  r   int64indicesZastypeZindptrr   Z_sort_featuresr  r4   )r   ZINDICES_DTYPEr   ZXsr    r    r!   7test_countvectorizer_sort_features_64bit_sparse_indicesq  s    r  	Estimatorc                 C   s   ddig}|  }t |dks t| dd dgd}t |dksBtt |d ksRt|| G d	d
 d
| }|dgd}t |dkst| dd dgd}t |dkstd S )NrW   rx  Tc                 S   s   | d S NrW   r    ra  r    r    r!   <lambda>      z?test_stop_word_validation_custom_preprocessor.<locals>.<lambda>and)rS   r   r   c                   @   s   e Zd Zdd ZdS )zFtest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimatorc                 S   s   dd S )Nc                 S   s   | d S r  r    r  r    r    r!   r    r  zktest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimator.build_preprocessor.<locals>.<lambda>r    )selfr    r    r!   r     s    zYtest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimator.build_preprocessorN)__name__
__module____qualname__r   r    r    r    r!   CustomEstimator  s   r  r   c                 S   s   t d| S )Nz\w{1,})r  compilefindallr1  r    r    r!   r    r  )rT   r   )r  r4   r   )r  r  rU  r  r    r    r!   -test_stop_word_validation_custom_preprocessor  s    

 r  zinput_type, err_type, err_msgfilenamer3   rL   z$'str' object has no attribute 'read'c              	   C   sR   t | trtrtd dg}tj||d | dd |d| W 5 Q R X d S )Nr  "this is text, not file or filenamer   c                 S   s   |   S r   r'   r  r    r    r!   r    r  z.test_callable_analyzer_error.<locals>.<lambda>r_   rM   )
issubclassr   r   rh   r  ri   r   )r  
input_typeZerr_typer   r  r    r    r!   test_callable_analyzer_error  s
    
r  )Zmarksr_   c                 C   s
   t | dS )Nr)openr  r    r    r!   r    r  r  c                 C   s   |   S r   )readr  r    r    r!   r    r  r  c              	   C   s6   dg}t ttf | ||d| W 5 Q R X d S )Nr  r  )rh   ri   FileNotFoundErrorAttributeErrorr   )r  r_   r  r  r    r    r!   &test_callable_analyzer_change_behavior  s    r  c              	   C   sf   dd }t |tr tr td | d}|d tjtdd ||dd		|g W 5 Q R X d S )
Nc                 S   s   t dd S )Ntesting)	Exceptionr  r    r    r!   r_     s    z6test_callable_analyzer_reraise_error.<locals>.analyzerr  zfile.txtzsample content
r  r   rL   r  )
r  r   r   rh   r  joinwriteri   r  r   )Ztmpdirr  r_   fr    r    r!   $test_callable_analyzer_reraise_error  s    


r  zjstop_words, tokenizer, preprocessor, ngram_range, token_pattern,analyzer, unused_name, ovrd_name, ovrd_msgr  r  r6  rc   z'stop_words'
'analyzer'	!= 'word'c                 C   s   |   S r   r'   r   r    r    r!   r    r  z'tokenizer'c                 C   s   |   S r   r'   r   r    r    r!   r    r  \w+rY   'token_pattern'zis not Nonec                 C   s   |   S r   r   r   r    r    r!   r    r  c                 C   s   |   S r   r  r   r    r    r!   r    r  z'preprocessor'zis callabler[   c                 C   s   |   S r   r  r   r    r    r!   r    r  z'ngram_range')	NNNr6  r  rc   r  r  r  c
              	   C   sV   t }
|  }|j||||||d d|||	f }tjt|d ||
 W 5 Q R X d S )N)r   rT   rS   r`   r   r_   z-The parameter %s will not be used since %s %sr   )r   r   rh   r   r   r   )r:   r   rT   rS   r`   r   r_   Zunused_nameZ	ovrd_nameZovrd_msgr   r   r   r    r    r!   test_unused_parameters_warn  s"    Xr  zVectorizer, Xr\   r]   )r   barre   )r   Zbazc                 C   s0   |  }t |drt|| t |dr,td S )NZn_features_in_)r   r4   r   )r:   r   r   r    r    r!   test_n_features_inI  s    	
r  c                  C   s:   t dd} | ddgj}| ddgj}||ks6td S )Nr\   r  ZhelloZworld)r   r   r   r4   )rU  Zvocab1Zvocab2r    r    r!   )test_tie_breaking_sample_order_invarianceX  s    
r  c                  C   s.   t ddd} | dgj}|d dks*td S )Ni@B )r]   re   )r  r`   z22pcs efuturer   )r   r   r  r4   )Zhashingr  r    r    r!   2test_nonnegative_hashing_vectorizer_result_indicesa  s    r  c                 C   s   |  }t |drtdS )z0Check that vectorizers do not define set_output.Z
set_outputN)r   r4   )r  r|   r    r    r!   'test_vectorizers_do_not_have_set_outputi  s    r  )collections.abcr   r  rh   r   r  r   Zsklearn.feature_extraction.textr   r   r   r   r   r	   r
   r   Zsklearn.model_selectionr   r   r   Zsklearn.pipeliner   Zsklearn.svmr   Zsklearn.baser   r  r   Znumpy.testingr   r   Zsklearn.utilsr   Zsklearn.utils._testingr   r   r   r   collectionsr   	functoolsr   rK  ior   r   r;  r   r"   r&   r)   r*   r8   r9   markZparametrizerX   ra   rl   r{   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r   r"  r(  r+  r2  rD  rE  rF  rH  rI  rS  rU   r   r  rW  rc  re  rg  rh  ri  rk  rm  rn  rt  ru  rv  rw  ry  r'  r*  r  r  Zint32r  r  r  r  r  r  r  r  r  r  paramr  r  r  r  r  r  r  r    r    r    r!   <module>   s  	$
=
g
&G


$'


	

 

	



    
 H!
	
 
