Skip to content

Commit

Permalink
changed to new reduced vocab for PSST
Browse files Browse the repository at this point in the history
  • Loading branch information
arunasrivastava committed Feb 19, 2025
1 parent 3a5b6c4 commit ee0156e
Showing 1 changed file with 258 additions and 13 deletions.
271 changes: 258 additions & 13 deletions notebooks/PSST_EDA.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,10 @@
"source": [
"ARPABET2IPA = {'AA':'ɑ','AE':'æ','AH':'ʌ','AH0':'ə','AO':'ɔ','AW':'aʊ','AY':'aɪ','EH':'ɛ','ER':'ɝ','ER0':'ɚ','EY':'eɪ','IH':'ɪ','IH0':'ɨ','IY':'i','OW':'oʊ','OY':'ɔɪ','UH':'ʊ','UW':'u','B':'b','CH':'tʃ','D':'d','DH':'ð','EL':'l̩ ','EM':'m̩','EN':'n̩','F':'f','G':'ɡ','HH':'h','JH':'dʒ','K':'k','L':'l','M':'m','N':'n','NG':'ŋ','P':'p','Q':'ʔ','R':'ɹ','S':'s','SH':'ʃ','T':'t','TH':'θ','V':'v','W':'w','WH':'ʍ','Y':'j','Z':'z','ZH':'ʒ'}\n",
"IPA_SUBSTITUTIONS = {\n",
" 'ɝ': 'ɹ', # Simplify rhotacized schwa to 'ɹ'\n",
" 'ɚ': 'ɹ', # Simplify rhotacized schwa to 'ɹ'\n",
" 'l̩': 'l', # Remove syllabic marker from 'l̩'\n",
" 'm̩': 'm', # Remove syllabic marker from 'm̩'\n",
" 'n̩': 'n', # Remove syllabic marker from 'n̩'\n",
" '̩': '', # Remove syllabic marker\n",
" 'ɨ': 'i', # Replace high central unrounded vowel with high front unrounded vowel\n",
" ' ': '', # Remove nasalization marker\n",
" \"ɝ\": \"ɜɹ\", # Expand rhotacized schwa\n",
" \"ɚ\": \"əɹ\", # Expand rhotacized schwa\n",
" \"\\u02de\": \"ɹ\", # Replace rhoticity marker (˞) with 'ɹ'\n",
" \"g\": \"ɡ\", # Replace ASCII 'g' with IPA 'ɡ'\n",
"}\n",
"for k in ARPABET2IPA.keys():\n",
" if ARPABET2IPA[k] in IPA_SUBSTITUTIONS:\n",
Expand Down Expand Up @@ -167,7 +163,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -237,7 +233,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -272,7 +268,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 13,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -353,7 +349,251 @@
"\n",
"Character comparison:\n",
"Ground Truth: k o ʊ m \n",
"Prediction: o w æ ɡ ə θ\n"
"Prediction: o w æ ɡ ə θ\n",
"\n",
"Evaluating model: vitouphy/wav2vec2-xls-r-300m-timit-phoneme\n",
"Progress: 652/652\n",
"Results for vitouphy/wav2vec2-xls-r-300m-timit-phoneme:\n",
"Average CER: 0.6444\n",
"Average Feature Distance: 20.1286\n",
"\n",
"Found 27 suspicious cases (CER > 1, Feature Distance < 28):\n",
"\n",
"Pattern Analysis:\n",
"\n",
"Top 5 Character Substitution Patterns:\n",
" ʌ → ɪ ə: 1 times\n",
" ʌ → ɛ: 1 times\n",
" ʌ → ə : 1 times\n",
" æ → eɪɑ: 1 times\n",
" t → dʊj: 1 times\n",
"\n",
"Average length difference: 3.56 characters\n",
"\n",
"Top 3 Common Additions:\n",
" Added ' ': 5 times\n",
" Added 's ɪɪŋg': 1 times\n",
" Added 'b ': 1 times\n",
"\n",
"Top 3 Common Deletions:\n",
" Deleted 'sʌjæ': 1 times\n",
" Deleted 'me': 1 times\n",
" Deleted 'pæ': 1 times\n",
"\n",
"Example Suspicious Cases:\n",
"\n",
"Case 1:\n",
"Ground Truth: jʌjɛsʌhʌjɛsʌjæ\n",
"Prediction: jɪ əjɛs ɛhə js ɪɪŋgɛ\n",
"CER: 1.0714\n",
"FER: 25.5625\n",
"\n",
"Character comparison:\n",
"Ground Truth: j ʌ j ɛ s ʌ h ʌ j ɛ s ʌ j æ \n",
"Prediction: j ɪ ə j ɛ s ɛ h ə j s ɪ ɪ ŋ g ɛ\n",
"\n",
"Case 2:\n",
"Ground Truth: bæ\n",
"Prediction: beɪɑ\n",
"CER: 1.5000\n",
"FER: 14.7500\n",
"\n",
"Character comparison:\n",
"Ground Truth: b æ \n",
"Prediction: b e ɪ ɑ\n",
"\n",
"Case 3:\n",
"Ground Truth: tuθbɹʌʃ\n",
"Prediction: dʊjufɝə ʤ\n",
"CER: 1.1429\n",
"FER: 17.2500\n",
"\n",
"Character comparison:\n",
"Ground Truth: t u θ b ɹ ʌ ʃ \n",
"Prediction: d ʊ j u f ɝ ə ʤ\n",
"\n",
"Evaluating model: speech31/wav2vec2-large-TIMIT-IPA\n",
"Progress: 652/652\n",
"Results for speech31/wav2vec2-large-TIMIT-IPA:\n",
"Average CER: 0.9337\n",
"Average Feature Distance: 36.2800\n",
"\n",
"Found 48 suspicious cases (CER > 1, Feature Distance < 28):\n",
"\n",
"Pattern Analysis:\n",
"\n",
"Top 5 Character Substitution Patterns:\n",
" ɑɡtʌpʊ → rakər bɔɪ: 1 times\n",
" koʊm → in lɪe: 1 times\n",
" ʌʌms → luʃ: 1 times\n",
" a → wne: 1 times\n",
" nnʌθɪŋ → o e ai: 1 times\n",
"\n",
"Average length difference: 3.38 characters\n",
"\n",
"Top 3 Common Additions:\n",
" Added 't': 2 times\n",
" Added 's': 2 times\n",
" Added 'b': 1 times\n",
"\n",
"Top 3 Common Deletions:\n",
" Deleted 'ŋk': 1 times\n",
" Deleted 'n': 1 times\n",
" Deleted 't': 1 times\n",
"\n",
"Example Suspicious Cases:\n",
"\n",
"Case 1:\n",
"Ground Truth: ɑɡtʌpʊs\n",
"Prediction: rakər bɔɪs\n",
"CER: 1.2857\n",
"FER: 22.7500\n",
"\n",
"Character comparison:\n",
"Ground Truth: ɑ ɡ t ʌ p ʊ s \n",
"Prediction: r a k ə r b ɔ ɪ s\n",
"\n",
"Case 2:\n",
"Ground Truth: koʊm\n",
"Prediction: in lɪe\n",
"CER: 1.5000\n",
"FER: 16.6250\n",
"\n",
"Character comparison:\n",
"Ground Truth: k o ʊ m \n",
"Prediction: i n l ɪ e\n",
"\n",
"Case 3:\n",
"Ground Truth: ʌʌmsiaɪ\n",
"Prediction: luʃi wneɪt\n",
"CER: 1.2857\n",
"FER: 25.6250\n",
"\n",
"Character comparison:\n",
"Ground Truth: ʌ ʌ m s i a ɪ \n",
"Prediction: l u ʃ i w n e ɪ t\n",
"\n",
"Evaluating model: speech31/wav2vec2-large-english-TIMIT-phoneme_v3\n",
"Progress: 652/652\n",
"Results for speech31/wav2vec2-large-english-TIMIT-phoneme_v3:\n",
"Average CER: 0.8897\n",
"Average Feature Distance: 39.5965\n",
"\n",
"Found 42 suspicious cases (CER > 1, Feature Distance < 28):\n",
"\n",
"Pattern Analysis:\n",
"\n",
"Top 5 Character Substitution Patterns:\n",
" bivɹ → du ju ər: 1 times\n",
" kʌnu → gəərb: 1 times\n",
" ʊɪ → rɛ: 1 times\n",
" ɪtɑ → ɛ: 1 times\n",
" ʌbɔɹd → ər əv wwər: 1 times\n",
"\n",
"Average length difference: 2.98 characters\n",
"\n",
"Top 3 Common Additions:\n",
" Added 'tɪ loʊvərm hɛ': 1 times\n",
" Added 'ɪz ': 1 times\n",
" Added 'da': 1 times\n",
"\n",
"Top 3 Common Deletions:\n",
" Deleted 't': 3 times\n",
" Deleted 'ʌlɪvɹɪŋɪt': 1 times\n",
" Deleted 'θɹo': 1 times\n",
"\n",
"Example Suspicious Cases:\n",
"\n",
"Case 1:\n",
"Ground Truth: bivɹ\n",
"Prediction: du ju ər\n",
"CER: 2.0000\n",
"FER: 20.3750\n",
"\n",
"Character comparison:\n",
"Ground Truth: b i v ɹ \n",
"Prediction: d u j u ə r\n",
"\n",
"Case 2:\n",
"Ground Truth: kʌnu\n",
"Prediction: gəərb\n",
"CER: 1.2500\n",
"FER: 13.1875\n",
"\n",
"Character comparison:\n",
"Ground Truth: k ʌ n u \n",
"Prediction: g ə ə r b\n",
"\n",
"Case 3:\n",
"Ground Truth: pʊɪnɪtɑnʌbɔɹd\n",
"Prediction: prɛnɛn ər əv wwər\n",
"CER: 1.1538\n",
"FER: 19.3125\n",
"\n",
"Character comparison:\n",
"Ground Truth: p ʊ ɪ n ɪ t ɑ n ʌ b ɔ ɹ d \n",
"Prediction: p r ɛ n ɛ n ə r ə v w w ə r\n",
"\n",
"Evaluating model: speech31/wav2vec2-large-TIMIT-IPA2\n",
"Progress: 652/652\n",
"Results for speech31/wav2vec2-large-TIMIT-IPA2:\n",
"Average CER: 1.5627\n",
"Average Feature Distance: 32.3273\n",
"\n",
"Found 278 suspicious cases (CER > 1, Feature Distance < 28):\n",
"\n",
"Pattern Analysis:\n",
"\n",
"Top 5 Character Substitution Patterns:\n",
" ɹ → r : 18 times\n",
" ʌ → ə : 13 times\n",
" ɹ → ə r: 6 times\n",
" ɪ → ə : 5 times\n",
" k → t : 4 times\n",
"\n",
"Average length difference: 7.87 characters\n",
"\n",
"Top 3 Common Additions:\n",
" Added ' ': 323 times\n",
" Added ' r ': 5 times\n",
" Added ' t ': 4 times\n",
"\n",
"Top 3 Common Deletions:\n",
" Deleted 'ŋ': 2 times\n",
" Deleted 'keɪnoʊ': 1 times\n",
" Deleted 'k': 1 times\n",
"\n",
"Example Suspicious Cases:\n",
"\n",
"Case 1:\n",
"Ground Truth: koʊm\n",
"Prediction: k ɔ i n\n",
"CER: 1.5000\n",
"FER: 1.3125\n",
"\n",
"Character comparison:\n",
"Ground Truth: k o ʊ m \n",
"Prediction: k ɔ i n\n",
"\n",
"Case 2:\n",
"Ground Truth: bɹʌʃ\n",
"Prediction: d ɪ ɪ b r ə ʃ\n",
"CER: 2.7500\n",
"FER: 23.6250\n",
"\n",
"Character comparison:\n",
"Ground Truth: b ɹ ʌ ʃ \n",
"Prediction: d ɪ ɪ b r ə ʃ\n",
"\n",
"Case 3:\n",
"Ground Truth: ɑɡtʌpʊs\n",
"Prediction: ɔ g d ə b ʊ s s\n",
"CER: 1.8571\n",
"FER: 8.3750\n",
"\n",
"Character comparison:\n",
"Ground Truth: ɑ ɡ t ʌ p ʊ s \n",
"Prediction: ɔ g d ə b ʊ s s\n"
]
}
],
Expand All @@ -371,10 +611,15 @@
"data = test # Use the PSSTDataset class\n",
"amount = len(data) # Get length from dataset\n",
"results = {}\n",
"DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
"\n",
"# List of all model IDs to evaluate\n",
"MODEL_IDS = [\n",
" \"KoelLabs/xlsr-timit-a1\",\n",
" \"vitouphy/wav2vec2-xls-r-300m-timit-phoneme\",\n",
" \"speech31/wav2vec2-large-TIMIT-IPA\",\n",
" \"speech31/wav2vec2-large-english-TIMIT-phoneme_v3\",\n",
" \"speech31/wav2vec2-large-TIMIT-IPA2\"\n",
"]\n",
"\n",
"def transcribe_batch(batch, model, processor, device):\n",
Expand Down Expand Up @@ -563,7 +808,7 @@
" torch.mps.empty_cache()\n",
"\n",
"def save_results():\n",
" with open('model_evaluation_detailed_results.txt', 'w') as f:\n",
" with open('timit_models_evaluation_detailed_results.txt', 'w') as f:\n",
" f.write(\"=== Model Evaluation Detailed Results ===\\n\\n\")\n",
" for model_id, result in results.items():\n",
" f.write(f\"{model_id}:\\n\")\n",
Expand Down

0 comments on commit ee0156e

Please sign in to comment.