import os
import time
import warnings
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional, Union

import torch
from filelock import FileLock
from torch.utils.data import Dataset

from ...tokenization_utils_base import PreTrainedTokenizerBase
from ...utils import logging
from ..processors.glue import glue_convert_examples_to_features, glue_output_modes, glue_processors
from ..processors.utils import InputFeatures


logger = logging.get_logger(__name__)


@dataclass
class GlueDataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command
    line.
    """

    task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())})
    data_dir: str = field(
        metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."}
    )
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )

    def __post_init__(self):
        self.task_name = self.task_name.lower()


class Split(Enum):
    train = "train"
    dev = "dev"
    test = "test"


class GlueDataset(Dataset):
    """
    This will be superseded by a framework-agnostic approach soon.
    """

    args: GlueDataTrainingArguments
    output_mode: str
    features: List[InputFeatures]

    def __init__(
        self,
        args: GlueDataTrainingArguments,
        tokenizer: PreTrainedTokenizerBase,
        limit_length: Optional[int] = None,
        mode: Union[str, Split] = Split.train,
        cache_dir: Optional[str] = None,
    ):
        warnings.warn(
            "This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
            "library. You can have a look at this example script for pointers: "
            "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py",
            FutureWarning,
        )
        self.args = args
        self.processor = glue_processors[args.task_name]()
        self.output_mode = glue_output_modes[args.task_name]
        if isinstance(mode, str):
            try:
                mode = Split[mode]
            except KeyError:
                raise KeyError("mode is not a valid split name")
        # Load data features from cache or dataset file
        cached_features_file = os.path.join(
            cache_dir if cache_dir is not None else args.data_dir,
            f"cached_{mode.value}_{tokenizer.__class__.__name__}_{args.max_seq_length}_{args.task_name}",
        )
        label_list = self.processor.get_labels()
        if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__.__name__ in (
            "RobertaTokenizer",
            "RobertaTokenizerFast",
            "XLMRobertaTokenizer",
            "BartTokenizer",
            "BartTokenizerFast",
        ):
            # HACK: label indices are swapped in the RoBERTa pretrained model
            label_list[1], label_list[2] = label_list[2], label_list[1]
        self.label_list = label_list

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):
            if os.path.exists(cached_features_file) and not args.overwrite_cache:
                start = time.time()
                self.features = torch.load(cached_features_file)
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
                )
            else:
                logger.info(f"Creating features from dataset file at {args.data_dir}")

                if mode == Split.dev:
                    examples = self.processor.get_dev_examples(args.data_dir)
                elif mode == Split.test:
                    examples = self.processor.get_test_examples(args.data_dir)
                else:
                    examples = self.processor.get_train_examples(args.data_dir)
                if limit_length is not None:
                    examples = examples[:limit_length]
                self.features = glue_convert_examples_to_features(
                    examples,
                    tokenizer,
                    max_length=args.max_seq_length,
                    label_list=label_list,
                    output_mode=self.output_mode,
                )
                start = time.time()
                torch.save(self.features, cached_features_file)
                logger.info(
                    f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
                )

    def __len__(self):
        return len(self.features)

    def __getitem__(self, i) -> InputFeatures:
        return self.features[i]

    def get_labels(self):
        return self.label_list
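

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): it shows how a
# GlueDataset is typically built from GlueDataTrainingArguments plus a
# tokenizer. The "mrpc" task, the "/path/to/MRPC" data directory, and
# "bert-base-uncased" are placeholder assumptions; any task name listed in
# `glue_processors` and any PreTrainedTokenizerBase should work the same way.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    example_args = GlueDataTrainingArguments(
        task_name="mrpc",               # any key of glue_processors
        data_dir="/path/to/MRPC",       # directory containing the GLUE .tsv files
        max_seq_length=128,
        overwrite_cache=False,
    )
    example_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # mode accepts either a string ("train"/"dev"/"test") or a Split member.
    train_dataset = GlueDataset(example_args, tokenizer=example_tokenizer, mode="train")
    dev_dataset = GlueDataset(example_args, tokenizer=example_tokenizer, mode=Split.dev)

    # Each item is an InputFeatures instance; features are cached on disk under
    # data_dir (or cache_dir) so a second construction loads them directly.
    print(len(train_dataset), train_dataset.get_labels())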