In [None]:
%%capture
!pip install --upgrade jupyter ipywidgets

In [None]:
%%capture
!pip install torch
!pip install jiwer
!pip install pandas
!pip install librosa
!pip install datasets
!pip install torchaudio
!pip install transformers

In [43]:
!pip install transformers[torch]



# prepare dataset

In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
base_dataset_path = "/var/www/html/Darija-Ai-Train/dataset.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(base_dataset_path)

In [9]:
df.head()

Unnamed: 0,path,transcript,duration
0,audios/23ef5d5f-730a-4304-99bd-90b6c9031d7d.wav,عمرني ما نساهاذ اللحظه وانا هربانه وكانهش وقلب...,10.006
1,audios/44cf8c39-97e4-4c30-b5f5-6ad69c6202d7.wav,الدنيا ضلمات في عيني وحياتي اللي حسيت بها صافي...,10.006
2,audios/732a3996-92c0-4fbc-895b-ae3d1443b338.wav,انا بنت ال 2000 بنت الجيل اللي كبر مع,10.006
3,audios/9a90eefd-784f-40b6-86de-6a2ac43797a8.wav,انترنت وفيسبوك ويوتيوب انا الجيل ديال السمارت ...,10.006
4,audios/9ab62fa7-a82c-4016-ae8c-22409758d201.wav,حح,10.006


In [11]:
df.tail()

Unnamed: 0,path,transcript,duration
19127,audios/2aa35bf6-063c-4332-8ecb-90205a6c7e9b.wav,سيدي احمد العمامة,5.017
19128,audios/5e0f6189-4df3-4bc7-a487-d43fd3076f0f.wav,وسك الريح,5.017
19129,audios/d437d71f-3a15-4b9a-b6cf-d55c45627e2b.wav,والمطار شحيح خبار الناس اخويا,5.017
19130,audios/c6be5c02-5931-4e71-bb82-c35546692cde.wav,فالناس,5.017
19131,audios/e80eeb2a-f1b9-4b99-9cf8-d429041f11e9.wav,وا ها أنت,5.017


In [8]:
# Remove rows with NaN values
df = df.dropna()
# Reset the index
df = df.reset_index(drop=True)

In [16]:
# Remove rows with NaN, empty values, values equal to None, and 'text' column equal to 'None'
df = df.dropna()
df = df.dropna(subset=['transcript'], how='all')
df = df[df['transcript'].notna()]
df = df[df['transcript'] != 'None']
df = df[df['transcript'] != 'none']


# Reset the index
df = df.reset_index(drop=True)

In [18]:
# Calculate the total duration in seconds
total_duration = df["duration"].sum()
# Display the total duration in seconds
print("Total Duration in Seconds:", total_duration, "sec")
# Convert total duration to hours
total_duration_hours = total_duration / 3600  # There are 3600 seconds in an hour
# Display the total duration in hours
print("Total Duration in Hours:", total_duration_hours, "hours")

Total Duration in Seconds: 96595.195 sec
Total Duration in Hours: 26.831998611111114 hours


In [19]:
# Split the data into training, validation, and test sets
train_data, temp_data = train_test_split(df, test_size=0.1, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.3, random_state=42)

In [20]:
# Calculate the number of examples in each set
num_dataset_examples = df.shape[0]
num_train_examples = train_data.shape[0]
num_val_examples = val_data.shape[0]
num_test_examples = test_data.shape[0]

# Print the results
print("Number of Examples in Dataset Set:", num_dataset_examples)
print("Number of Examples in Training Set:", num_train_examples)
print("Number of Examples in Validation Set:", num_val_examples)
print("Number of Examples in Test Set:", num_test_examples)

Number of Examples in Dataset Set: 18799
Number of Examples in Training Set: 16919
Number of Examples in Validation Set: 1316
Number of Examples in Test Set: 564


In [21]:
# Select only the "path" and "transcript" columns
train_data = train_data[["path", "transcript"]]
val_data = val_data[["path", "transcript"]]
test_data = test_data[["path", "transcript"]]

# Save the modified DataFrames to CSV
train_data.to_csv("train.csv", index=False)
val_data.to_csv("dev.csv", index=False)
test_data.to_csv("test.csv", index=False)

# Load Dataset

In [44]:
# Import necessary libraries
import os
import torch
import librosa
import random
import torchaudio
import pandas as pd
import IPython.display as ipd
from IPython.display import display, HTML
from datasets import load_dataset, load_metric, ClassLabel
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor, TrainingArguments, Trainer  

In [3]:

train = load_dataset('csv', data_files=['/var/www/html/Darija-Ai-Train/train.csv','/var/www/html/Darija-Ai-Train/dev.csv'] ) 
test = load_dataset('csv', data_files=['/var/www/html/Darija-Ai-Train/test.csv'], )  

darija_ai_train = train['train']
darija_ai_test = test['train']

In [4]:
print(darija_ai_train)
print(darija_ai_test)

Dataset({
    features: ['path', 'transcript'],
    num_rows: 18235
})
Dataset({
    features: ['path', 'transcript'],
    num_rows: 564
})


In [5]:
def show_random_elements(dataset, num_examples=5):
    assert num_examples <= len(dataset)
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [6]:
show_random_elements(darija_ai_train.remove_columns(["path"]), num_examples=20)

Unnamed: 0,transcript
0,ياكما دراها
1,باش نقري ولدي باش منين يكبر حتى هو يخذم كيليفري بيتزا لولد ماريو
2,الدرب كامل شعل فيه الضوء
3,راه حنا لي خديناه والله ماباقين نعاودو ابابا غير سمحلينا
4,الضباب الكثيف و
5,ما نبغيش
6,لي معافرة مع الارض خداتها زروالة بنت عمر وورتوهالو
7,هناك الافلام الاولى بطبيعه الحال كانت لم
8,ما كاين والو
9,ولاخور السرح لهيه وجنان لهاد جيه وفرقو فريق العقالة


In [7]:
import re
chars_to_ignore_regex = '[\,\؟\.\!\-\;\:\'\"\☭\«\»\؛\—\ـ\_\،\“\%\‘\”\�]'
 
def remove_special_characters(batch):  
    batch["transcript"] = re.sub(chars_to_ignore_regex, '', batch["transcript"]).lower() + " "
    batch["transcript"] = re.sub('[a-z]','',batch["transcript"])
    batch["transcript"] = re.sub('\r\n',' ',batch["transcript"])
    batch["transcript"] = re.sub("[إأٱآا]", "ا", batch["transcript"])
    noise = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    batch["transcript"] = re.sub(noise, '', batch["transcript"])
    return batch


darija_ai_train = darija_ai_train.map(remove_special_characters)
darija_ai_test = darija_ai_test.map(remove_special_characters)   

In [8]:
show_random_elements(darija_ai_train.remove_columns(["path"]), num_examples=20)

Unnamed: 0,transcript
0,1
1,اللي عطاه الله اكلس اكلس هانتا غاتسخف شنو زعما مابغيتوش تخرجو
2,دراسة للاستاذ عمر الرويدي في التربية نقترح تجديد
3,ودرناه ماشي كدوب اش طالعلكم منو لقمة ديال الطعام
4,انا بعيدا جدا فوق التلة
5,نشوفو شنو غادي يدير سي جمال
6,ذكريات ايام هي لي جات في بالك تفكر شكون
7,انا غانسبقكم لدار نجمع حوايجي ونتوما خلطو عليا مع بهية واخا واخة
8,من اجل فن والدليل على انه الا شتو مثلا في المنظومة التربوية ديالنا راه باقي عندنا
9,افيغتيغ سانك دي كارت مارس ميل نف سو كاتخوفان ناف ديس ناف


In [9]:
def extract_all_chars(batch):
  all_text = " ".join(batch["transcript"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [10]:
vocab_train = darija_ai_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=darija_ai_train.column_names)
vocab_test = darija_ai_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=darija_ai_test.column_names)
 

Map:   0%|          | 0/18235 [00:00<?, ? examples/s]

Map:   0%|          | 0/564 [00:00<?, ? examples/s]

In [11]:
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))

In [12]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'ت': 0,
 'ل': 1,
 '$': 2,
 'ê': 3,
 'ج': 4,
 'ط': 5,
 'ر': 6,
 'ن': 7,
 'س': 8,
 'غ': 9,
 '3': 10,
 'ك': 11,
 'ء': 12,
 'و': 13,
 'ة': 14,
 'ز': 15,
 'ق': 16,
 'ض': 17,
 '6': 18,
 'ظ': 19,
 '×': 20,
 'ڢ': 21,
 '\u200f': 22,
 '\\': 23,
 '’': 24,
 '2': 25,
 'ث': 26,
 '^': 27,
 'ذ': 28,
 '9': 29,
 '?': 30,
 'ش': 31,
 'ص': 32,
 ')': 33,
 'م': 34,
 'è': 35,
 '7': 36,
 '1': 37,
 '/': 38,
 'ç': 39,
 '8': 40,
 '0': 41,
 'د': 42,
 'ئ': 43,
 '5': 44,
 'ب': 45,
 ' ': 46,
 'ه': 47,
 'î': 48,
 'ا': 49,
 '(': 50,
 'ؤ': 51,
 'ح': 52,
 'à': 53,
 'ï': 54,
 '\xa0': 55,
 'ف': 56,
 'é': 57,
 '4': 58,
 '£': 59,
 'ى': 60,
 'ع': 61,
 'خ': 62,
 'ي': 63}

In [13]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [14]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

66

In [15]:
import json
with open("vocab.json", 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [16]:
tokenizer = Wav2Vec2CTCTokenizer("vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [17]:
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [18]:
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load("/var/www/html/Darija-Ai-Train/" + batch["path"])
    batch["speech"] = speech_array[0].numpy()
    batch["sampling_rate"] = sampling_rate
    batch["target_text"] = batch["transcript"]
    return batch

In [19]:
darija_ai_train = darija_ai_train.map(speech_file_to_array_fn, remove_columns=darija_ai_train.column_names)
darija_ai_test = darija_ai_test.map(speech_file_to_array_fn, remove_columns=darija_ai_test.column_names)

In [23]:
rand_int = random.randint(0, len(darija_ai_train)-1)
print("Target text:", darija_ai_train[rand_int]["target_text"])
print("Input array shape:", np.asarray(darija_ai_train[rand_int]["speech"]).shape)
print("Sampling rate:", darija_ai_train[rand_int]["sampling_rate"])

ipd.Audio(data=np.asarray(darija_ai_train[rand_int]["speech"]), autoplay=True, rate=darija_ai_train[rand_int]["sampling_rate"])

Target text: كلشي هوا هداك ولكن بلاتي بلاتي 
Input array shape: (221231,)
Sampling rate: 44100


In [24]:
def resample(batch): 
    batch["speech"]  = librosa.resample(np.asarray(batch["speech"]), orig_sr=batch['sampling_rate'], target_sr=16_000)
    batch["sampling_rate"] = 16_000
    return batch

In [25]:
darija_ai_train = darija_ai_train.map(resample, num_proc=4)
darija_ai_test = darija_ai_test.map(resample, num_proc=4)

In [26]:
rand_int = random.randint(0, len(darija_ai_train)-1)
print("Target text:", darija_ai_train[rand_int]["target_text"])
print("Input array shape:", np.asarray(darija_ai_train[rand_int]["speech"]).shape)
print("Sampling rate:", darija_ai_train[rand_int]["sampling_rate"])

ipd.Audio(data=np.asarray(darija_ai_train[rand_int]["speech"]), autoplay=True, rate=16000)

Target text: واتي كاين شي قرقارة كدار فالصالون ديرها عا هنا وديرها ياك الشرجم ديال 
Input array shape: (80266,)
Sampling rate: 16000


In [27]:
def prepare_dataset(batch):
    assert (len(set(batch["sampling_rate"])) == 1), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["target_text"]).input_ids
    return batch

In [28]:
darija_ai_train = darija_ai_train.map(prepare_dataset, remove_columns=darija_ai_train.column_names, batch_size=8, num_proc=4, batched=True)
darija_ai_test = darija_ai_test.map(prepare_dataset, remove_columns=darija_ai_test.column_names, batch_size=8, num_proc=4, batched=True)

Map (num_proc=4):   0%|          | 0/18235 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/564 [00:00<?, ? examples/s]



In [29]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [30]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [35]:
wer_metric = load_metric("wer", cache_dir="/var/www/html/Darija-Ai-Train")

In [36]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
#     cer = cer_metric.compute(predictions=pred_str, references=label_str)
    
    return {"wer": wer}
#     return {"cer" : cer}

In [37]:
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53", 
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True, 
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
model.freeze_feature_extractor()

In [49]:
training_args = TrainingArguments(
  output_dir="/var/www/html/Darija-Ai-Train",
  group_by_length=True,
  per_device_train_batch_size=16,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=30,
  fp16=True,
  save_steps=400,
  eval_steps=400,
  logging_steps=400,
  learning_rate=3e-4,
  warmup_steps=500,
  save_total_limit=2,
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dvoice_train,
    eval_dataset=dvoice_test,
    tokenizer=processor.feature_extractor,
)

In [None]:
trainer.train()

In [None]:
# Define constants and paths
output_dir = "fine_tuned_model"
model_name = "facebook/wav2vec2-large-960h"
train_dataset_name = "your_training_dataset_name"
eval_dataset_name = "your_evaluation_dataset_name"
test_dataset_name = "your_test_dataset_name"

In [None]:
# Initialize model and processor
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

In [None]:
# Data Preprocessing
def prepare_dataset(batch):
    inputs = processor(batch["audio"], return_tensors="pt", padding="longest")
    input_dict = {
        "input_values": inputs["input_values"],
        "labels": inputs["input_ids"],
    }
    return input_dict

train_dataset = train_dataset.map(prepare_dataset, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(prepare_dataset, remove_columns=eval_dataset.column_names)
test_dataset = test_dataset.map(prepare_dataset, remove_columns=test_dataset.column_names)

In [None]:
# Training Configuration
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_total_limit=2,
    num_train_epochs=10,  # Adjust as needed
    learning_rate=1e-4,  # Adjust as needed
    push_to_hub=False,
    logging_dir="./logs",
    logging_steps=100,
    remove_unused_columns=False,
    load_best_model_at_end=True,
)


In [None]:
# Metric
wer_metric = load_metric("wer")

In [None]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    data_collator=None,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Fine-tuning
trainer.train()

In [None]:
# Evaluation
results = trainer.evaluate(test_dataset)
print("Results:", results)

In [None]:
# Save Fine-Tuned Model
model.save_pretrained(output_dir)
processor.save_pretrained(output_dir)