In [1]:
%%capture
!pip install jiwer
!pip install librosa
!pip install datasets
!pip install torchaudio
!pip install transformers[torch]

In [1]:
# Import necessary libraries
import os
import pickle
import random
import librosa
import numpy as np
import pandas as pd
import torchaudio
import IPython.display as ipd
from datasets import ClassLabel, load_dataset, load_metric
from sklearn.model_selection import train_test_split
from IPython.display import Audio, Javascript, display, HTML, Image
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor, TrainingArguments, Wav2Vec2FeatureExtractor

In [2]:
train = load_dataset('csv', data_files='/var/www/html/Darija-Ai-Train/train.csv')
test = load_dataset('csv', data_files='/var/www/html/Darija-Ai-Train/test.csv')

darija_ai_train = train['train']
darija_ai_test = test['train']

In [3]:

print(darija_ai_train)
print(darija_ai_test)

Dataset({
    features: ['path', 'transcript'],
    num_rows: 16919
})
Dataset({
    features: ['path', 'transcript'],
    num_rows: 564
})


In [5]:
def show_random_elements(dataset, num_examples=5):
    assert num_examples <= len(dataset)
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

show_random_elements(darija_ai_train.remove_columns(["path"]), num_examples=20)

Unnamed: 0,transcript
0,الدوار هو النساء بمعنى اخر ان من حافظ على الهوية وعلى اللغة
1,ومامحتاجش تاشي نصيحة ديال شي حد الحمد الله واخا و هاد السيدة كتعرفها مزيان سولتي عليها مزيان
2,في فيسبةك يكتبها دبا اجي نكوليك علاش
3,وصافي ابهية متكونيش قاصحة انا راه قلبي
4,يالاه سير سير اسعدية
5,ايه راني عارف عارف راه ضاير واحد الفيروس غي الله يحفظ ويحضر السلامة
6,الخير صباح النور فقتي فقت وجديلي الفطور او متول
7,حنا وخا مكانعيشوش هنا فالمغرب كتعجبنا بزاف قناة دوزيم وكنتفرجو فيها
8,حماقتي رحيمو شوفي الشوار شوارك
9,مرحبا شحال خاصك


In [6]:
import re
chars_to_ignore_regex = '[\,\؟\.\!\-\;\:\'\"\☭\«\»\؛\—\ـ\_\،\“\%\‘\”\�]'
 
def remove_special_characters(batch):  
    batch["transcript"] = re.sub(chars_to_ignore_regex, '', batch["transcript"]).lower() + " "
    batch["transcript"] = re.sub('[a-z]','',batch["transcript"])
    batch["transcript"] = re.sub('\r\n',' ',batch["transcript"])
    batch["transcript"] = re.sub("[إأٱآا]", "ا", batch["transcript"])
    noise = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    batch["transcript"] = re.sub(noise, '', batch["transcript"])
    return batch


darija_ai_train = darija_ai_train.map(remove_special_characters)
darija_ai_test = darija_ai_test.map(remove_special_characters)   

Map:   0%|          | 0/16919 [00:00<?, ? examples/s]

In [7]:
darija_ai_train = darija_ai_train.map(remove_special_characters)
darija_ai_test = darija_ai_test.map(remove_special_characters)

Map:   0%|          | 0/16919 [00:00<?, ? examples/s]

Map:   0%|          | 0/564 [00:00<?, ? examples/s]

In [9]:
show_random_elements(darija_ai_train.remove_columns(["path"]), num_examples=20)

Unnamed: 0,transcript
0,بحالاش انا وبنت عمي وانت مالك والعفة
1,اكسيو كوبي ونحس براسي كنادي شي حاجة كنامن بيها شي حاجة كتخرج ليا من
2,و 6000 بدلت بها الزنك والفين ريال بدلت بها القفولة هاهية
3,واحده اسمها ليلى وحدة كايعيطوا لي الشطاعة
4,فالايميل ديالي ولا و فميساج ديالي وكانسطر عليهم و كانقولو شفتي هادي هاهوما و كانقولو خود وقتك
5,للمدام و الله حتى ندير ليك التمن ديال الخاوة جمعي راسك واش انا جايبك
6,ما تدمنيش على شان عونك دير خير ما يتراباش
7,الى اعلى قمة الجبل حين اخذه بين فكيه
8,الانسان فهاد الفترة هاذي راه الله يحسن العوان و الهرمونات و المسائل راه الانسان فهمتيني يقدر يبدا يتعصب ولا شي حاجة
9,صافي صافي اختي باهية الله يعطيك الستر غي خليها تمشي و انا غانكمل هاد الماعن كاملين


In [10]:
def extract_all_chars(batch):
  all_text = " ".join(batch["transcript"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [11]:
vocab_train = darija_ai_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=darija_ai_train.column_names)
vocab_test = darija_ai_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=darija_ai_test.column_names)

Map:   0%|          | 0/16919 [00:00<?, ? examples/s]

Map:   0%|          | 0/564 [00:00<?, ? examples/s]

In [12]:
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))

In [13]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'ي': 0,
 '^': 1,
 ')': 2,
 'ڢ': 3,
 'و': 4,
 'ر': 5,
 '/': 6,
 'ز': 7,
 'ض': 8,
 'غ': 9,
 'ث': 10,
 '2': 11,
 'ا': 12,
 'ة': 13,
 'ف': 14,
 '5': 15,
 'ك': 16,
 'ب': 17,
 '\\': 18,
 '£': 19,
 '9': 20,
 'ء': 21,
 '\u200f': 22,
 'خ': 23,
 'ه': 24,
 '(': 25,
 'ذ': 26,
 'ت': 27,
 '6': 28,
 'ج': 29,
 '3': 30,
 'ع': 31,
 '0': 32,
 'ح': 33,
 '$': 34,
 ' ': 35,
 'è': 36,
 'س': 37,
 'ل': 38,
 'ى': 39,
 'ئ': 40,
 'ط': 41,
 'ê': 42,
 '4': 43,
 '\xa0': 44,
 '8': 45,
 'ن': 46,
 'ç': 47,
 'ص': 48,
 'ش': 49,
 '’': 50,
 'ظ': 51,
 'د': 52,
 '1': 53,
 'ï': 54,
 'ؤ': 55,
 '?': 56,
 '7': 57,
 'î': 58,
 'à': 59,
 'é': 60,
 'ق': 61,
 'م': 62}

In [14]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [15]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

65

In [16]:
import json
with open("vocab.json", 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [17]:
tokenizer = Wav2Vec2CTCTokenizer("vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [18]:
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [19]:
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load("/var/www/html/Darija-Ai-Train/" + batch["path"])
    batch["speech"] = speech_array[0].numpy()
    batch["sampling_rate"] = sampling_rate
    batch["target_text"] = batch["transcript"]
    return batch

In [20]:
darija_ai_train = darija_ai_train.map(speech_file_to_array_fn, remove_columns=darija_ai_train.column_names)
darija_ai_test = darija_ai_test.map(speech_file_to_array_fn, remove_columns=darija_ai_test.column_names)

Map:   0%|          | 0/16919 [00:00<?, ? examples/s]

Map:   0%|          | 0/564 [00:00<?, ? examples/s]

In [21]:
rand_int = random.randint(0, len(darija_ai_train)-1)
print("Target text:", darija_ai_train[rand_int]["target_text"])
print("Input array shape:", np.asarray(darija_ai_train[rand_int]["speech"]).shape)
print("Sampling rate:", darija_ai_train[rand_int]["sampling_rate"])

ipd.Audio(data=np.asarray(darija_ai_train[rand_int]["speech"]), autoplay=True, rate=darija_ai_train[rand_int]["sampling_rate"])

Target text: واللهيتا مسموم  
Input array shape: (221231,)
Sampling rate: 44100


In [22]:
def resample(batch): 
    batch["speech"]  = librosa.resample(np.asarray(batch["speech"]), orig_sr=batch['sampling_rate'], target_sr=16_000)
    batch["sampling_rate"] = 16_000
    return batch

In [23]:
darija_ai_train = darija_ai_train.map(resample, num_proc=4)
darija_ai_test = darija_ai_test.map(resample, num_proc=4)

Map (num_proc=4):   0%|          | 0/16919 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/564 [00:00<?, ? examples/s]

In [24]:
rand_int = random.randint(0, len(darija_ai_train)-1)
print("Target text:", darija_ai_train[rand_int]["target_text"])
print("Input array shape:", np.asarray(darija_ai_train[rand_int]["speech"]).shape)
print("Sampling rate:", darija_ai_train[rand_int]["sampling_rate"])

ipd.Audio(data=np.asarray(darija_ai_train[rand_int]["speech"]), autoplay=True, rate=16000)

Target text: انها القناة المفظلة عندي من الطفولة ديالي وانا كانبغيها لانها كتقدم برامج  
Input array shape: (80266,)
Sampling rate: 16000


In [25]:
def prepare_dataset(batch):
    assert (len(set(batch["sampling_rate"])) == 1), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["target_text"]).input_ids
    return batch

In [26]:
darija_ai_train = darija_ai_train.map(prepare_dataset, remove_columns=darija_ai_train.column_names, batch_size=8, num_proc=4, batched=True)
darija_ai_test = darija_ai_test.map(prepare_dataset, remove_columns=darija_ai_test.column_names, batch_size=8, num_proc=4, batched=True)

Map (num_proc=4):   0%|          | 0/16919 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/564 [00:00<?, ? examples/s]



In [28]:
import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [29]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [30]:
wer_metric = load_metric("wer", cache_dir="/var/www/html/Darija-Ai-Train")

  wer_metric = load_metric("wer", cache_dir="/var/www/html/Darija-Ai-Train")


In [31]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
#     cer = cer_metric.compute(predictions=pred_str, references=label_str)
    
    return {"wer": wer}
#     return {"cer" : cer}

In [32]:
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53", 
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True, 
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
model.freeze_feature_extractor()



In [35]:
training_args = TrainingArguments(
  output_dir="/var/www/html/Darija-Ai-Train",
  group_by_length=True,
  per_device_train_batch_size=16,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=30,
  fp16=False,
  save_steps=400,
  eval_steps=400,
  logging_steps=400,
  learning_rate=3e-4,
  warmup_steps=500,
  save_total_limit=2,
)

In [36]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=darija_ai_train,
    eval_dataset=darija_ai_test,
    tokenizer=processor.feature_extractor,
)

In [39]:
trainer.train()

Step,Training Loss,Validation Loss


ValueError: one or more references are empty strings

In [None]:
# Evaluation
results = trainer.evaluate(darija_ai_test)
print("Results:", results)