U
    3dg,                     @   s   d dl mZ d dlmZ d dlmZ d dlZddlm	Z	 dddd	d
Z
d!ddZG dd deZdd ZG dd deZdd Zdd ZddddZd"ddZG dd deZdd  ZdS )#    )suppress)Counter)
NamedTupleN   is_scalar_nanFreturn_inversereturn_countsc                C   s&   | j tkrt| ||dS t| ||dS )a  Helper function to find unique values with support for python objects.

    Uses pure python method for object dtype, and numpy method for
    all other dtypes.

    Parameters
    ----------
    values : ndarray
        Values to check for unknowns.

    return_inverse : bool, default=False
        If True, also return the indices of the unique values.

    return_counts : bool, default=False
        If True, also return the number of times each unique item appears in
        values.

    Returns
    -------
    unique : ndarray
        The sorted unique values.

    unique_inverse : ndarray
        The indices to reconstruct the original array from the unique array.
        Only provided if `return_inverse` is True.

    unique_counts : ndarray
        The number of times each of the unique values comes up in the original
        array. Only provided if `return_counts` is True.
    r   )dtypeobject_unique_python
_unique_np)valuesr	   r
    r   9/tmp/pip-unpacked-wheel-zrfo1fqw/sklearn/utils/_encode.py_unique	   s    
    r   c                 C   s   t j| ||d}d\}}|r&|^ }}|r4|^ }}|s<|rD|d }|jrt|d rt |t j}|d|d  }|r||||k< |rt ||d ||< |d|d  }|f}|r||f7 }|r||f7 }t|dkr|d S |S )zHelper function to find unique values for numpy arrays that correctly
    accounts for nans. See `_unique` documentation for details.r   )NNr   Nr   )npuniquesizer   searchsortednansumlen)r   r	   r
   uniquesZinversecountsZnan_idxretr   r   r   r   2   s4      



r   c                   @   s*   e Zd ZU dZeed< eed< dd ZdS )MissingValuesz'Data class for missing data informationr   nonec                 C   s*   g }| j r|d | jr&|tj |S )z3Convert tuple to a list where None is always first.N)r   appendr   r   )selfoutputr   r   r   to_lista   s    
zMissingValues.to_listN)__name__
__module____qualname____doc__bool__annotations__r#   r   r   r   r   r   [   s   
r   c                 C   sn   dd | D }|s"| t dddfS d|krRt|dkrDt ddd}q^t ddd}nt ddd}| | }||fS )a.  Extract missing values from `values`.

    Parameters
    ----------
    values: set
        Set of values to extract missing from.

    Returns
    -------
    output: set
        Set with missing values extracted.

    missing_values: MissingValues
        Object with missing value information.
    c                 S   s    h | ]}|d kst |r|qS Nr   .0valuer   r   r   	<setcomp>{   s      z#_extract_missing.<locals>.<setcomp>F)r   r   Nr   T)r   r   )r   Zmissing_values_setZoutput_missing_valuesr"   r   r   r   _extract_missingk   s    r/   c                       s(   e Zd ZdZ fddZdd Z  ZS )_nandictz!Dictionary with support for nans.c                    s4   t  | | D ]\}}t|r|| _ q0qd S r*   )super__init__itemsr   	nan_value)r!   mappingkeyr-   	__class__r   r   r2      s
    z_nandict.__init__c                 C   s$   t | drt|r| jS t|d S )Nr4   )hasattrr   r4   KeyErrorr!   r6   r   r   r   __missing__   s    z_nandict.__missing__)r$   r%   r&   r'   r2   r<   __classcell__r   r   r7   r   r0      s   r0   c                    s.   t dd t|D  t fdd| D S )z,Map values based on its position in uniques.c                 S   s   i | ]\}}||qS r   r   )r,   ivalr   r   r   
<dictcomp>   s      z#_map_to_integer.<locals>.<dictcomp>c                    s   g | ]} | qS r   r   r,   vtabler   r   
<listcomp>   s     z#_map_to_integer.<locals>.<listcomp>)r0   	enumerater   array)r   r   r   rC   r   _map_to_integer   s    rH   c                C   s   z>t | }t|\}}t|}||  tj|| jd}W nB tk
r   tdd t dd | D D }td| Y nX |f}|r|t	| |f7 }|r|t
| |f7 }t|dkr|d S |S )Nr   c                 s   s   | ]}|j V  qd S r*   )r&   )r,   tr   r   r   	<genexpr>   s     z!_unique_python.<locals>.<genexpr>c                 s   s   | ]}t |V  qd S r*   )typerA   r   r   r   rK      s     zEEncoders require their input to be uniformly strings or numbers. Got r   r   )setr/   sortedextendr#   r   rG   r   	TypeErrorrH   _get_countsr   )r   r	   r
   uniques_setZmissing_valuesr   typesr   r   r   r   r      s"     
r   T)check_unknownc             
   C   s   | j jdkrPzt| |W S  tk
rL } ztdt| W 5 d}~X Y qX n0|rtt| |}|rttdt| t|| S dS )a  Helper function to encode values into [0, n_uniques - 1].

    Uses pure python method for object dtype, and numpy method for
    all other dtypes.
    The numpy method has the limitation that the `uniques` need to
    be sorted. Importantly, this is not checked but assumed to already be
    the case. The calling method needs to ensure this for all non-object
    values.

    Parameters
    ----------
    values : ndarray
        Values to encode.
    uniques : ndarray
        The unique values in `values`. If the dtype is not object, then
        `uniques` needs to be sorted.
    check_unknown : bool, default=True
        If True, check for values in `values` that are not in `unique`
        and raise an error. This is ignored for object dtype, and treated as
        True in this case. This parameter is useful for
        _BaseEncoder._transform() to avoid calling _check_unknown()
        twice.

    Returns
    -------
    encoded : ndarray
        Encoded values
    OUSz%y contains previously unseen labels: N)	r   kindrH   r:   
ValueErrorstr_check_unknownr   r   )r   r   rT   ediffr   r   r   _encode   s    &
r\   c                    s~  d}| j jdkrt| }t|\}}t|t\| }|joLj }|joZj }fdd |r|sz|sz|rt fdd| D }ntjt	| t
d}t|}|r|d |r|tj nt| }	tj|	|dd	}|r|jrt| |}ntjt	| t
d}t| rdt|}
|
 rd|jrZ|rZt| }d
||< ||
  }t|}|rz||fS |S )a  
    Helper function to check for unknowns in values to be encoded.

    Uses pure python method for object dtype, and numpy method for
    all other dtypes.

    Parameters
    ----------
    values : array
        Values to check for unknowns.
    known_values : array
        Known values. Must be unique.
    return_mask : bool, default=False
        If True, return a mask of the same shape as `values` indicating
        the valid values.

    Returns
    -------
    diff : list
        The unique values present in `values` and not in `know_values`.
    valid_mask : boolean array
        Additionally returned if ``return_mask=True``.

    NrU   c                    s$   | kp" j r| d kp" jo"t| S r*   )r   r   r   )r-   )missing_in_uniquesrR   r   r   is_valid  s    z _check_unknown.<locals>.is_validc                    s   g | ]} |qS r   r   r+   )r^   r   r   rE     s     z"_check_unknown.<locals>.<listcomp>rI   TZassume_uniquer   )r   rV   rM   r/   r   r   r   rG   Zonesr   r(   listr    r   Z	setdiff1dr   Zin1disnanany)r   Zknown_valuesZreturn_maskZ
valid_maskZ
values_setZmissing_in_valuesr[   Znan_in_diffZnone_in_diffunique_valuesZdiff_is_nanis_nanr   )r^   r]   rR   r   rY      sH    	





rY   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )_NaNCounterz$Counter with support for nan values.c                    s   t  | | d S r*   )r1   r2   _generate_items)r!   r3   r7   r   r   r2   C  s    z_NaNCounter.__init__c                 c   s<   |D ]2}t |s|V  qt| ds(d| _|  jd7  _qdS )z>Generate items without nans. Stores the nan counts separately.	nan_countr   r   N)r   r9   rg   )r!   r3   itemr   r   r   rf   F  s    
z_NaNCounter._generate_itemsc                 C   s$   t | drt|r| jS t|d S )Nrg   )r9   r   rg   r:   r;   r   r   r   r<   P  s    z_NaNCounter.__missing__)r$   r%   r&   r'   r2   rf   r<   r=   r   r   r7   r   re   @  s   
re   c           
   
   C   s   | j jdkr^t| }tjt|tjd}t|D ](\}}tt	 || ||< W 5 Q R X q0|S t
| dd\}}tj||dd}t|d rt|d rd|d< t||| }	tj|tjd}||	 ||< |S )zGet the count of each of the `uniques` in `values`.

    The counts will use the order passed in by `uniques`. For non-object dtypes,
    `uniques` is assumed to be sorted and `np.nan` is at the end.
    ZOUrI   T)r
   r_   r   )r   rV   re   r   zerosr   Zint64rF   r   r:   r   isinra   r   Z
zeros_like)
r   r   counterr"   r>   rh   rc   r   Zuniques_in_valuesZunique_valid_indicesr   r   r   rQ   V  s    
rQ   )FF)F)
contextlibr   collectionsr   typingr   Znumpyr    r   r   r   r   r/   dictr0   rH   r   r\   rY   re   rQ   r   r   r   r   <module>   s   )
)&*
U