Changed to the new reduced vocab for PSST

KoelLabs · Feb 19, 2025 · ee0156e · ee0156e
1 parent 3a5b6c4
commit ee0156e
Showing 1 changed file with 258 additions and 13 deletions.
diff --git a/notebooks/PSST_EDA.ipynb b/notebooks/PSST_EDA.ipynb
@@ -67,14 +67,10 @@
    "source": [
     "ARPABET2IPA = {'AA':'ɑ','AE':'æ','AH':'ʌ','AH0':'ə','AO':'ɔ','AW':'aʊ','AY':'aɪ','EH':'ɛ','ER':'ɝ','ER0':'ɚ','EY':'eɪ','IH':'ɪ','IH0':'ɨ','IY':'i','OW':'oʊ','OY':'ɔɪ','UH':'ʊ','UW':'u','B':'b','CH':'tʃ','D':'d','DH':'ð','EL':'l̩ ','EM':'m̩','EN':'n̩','F':'f','G':'ɡ','HH':'h','JH':'dʒ','K':'k','L':'l','M':'m','N':'n','NG':'ŋ','P':'p','Q':'ʔ','R':'ɹ','S':'s','SH':'ʃ','T':'t','TH':'θ','V':'v','W':'w','WH':'ʍ','Y':'j','Z':'z','ZH':'ʒ'}\n",
     "IPA_SUBSTITUTIONS = {\n",
-    "    'ɝ': 'ɹ',   # Simplify rhotacized schwa to 'ɹ'\n",
-    "    'ɚ': 'ɹ',   # Simplify rhotacized schwa to 'ɹ'\n",
-    "    'l̩': 'l',   # Remove syllabic marker from 'l̩'\n",
-    "    'm̩': 'm',   # Remove syllabic marker from 'm̩'\n",
-    "    'n̩': 'n',   # Remove syllabic marker from 'n̩'\n",
-    "    '̩': '',     # Remove syllabic marker\n",
-    "    'ɨ': 'i',    # Replace high central unrounded vowel with high front unrounded vowel\n",
-    "    ' ': '',     # Remove nasalization marker\n",
+    "    \"ɝ\": \"ɜɹ\",  # Expand r-colored open-mid central vowel\n",
+    "    \"ɚ\": \"əɹ\",  # Expand rhotacized schwa\n",
+    "    \"\\u02de\": \"ɹ\",  # Replace rhoticity marker (˞) with 'ɹ'\n",
+    "    \"g\": \"ɡ\",  # Replace ASCII 'g' with IPA 'ɡ'\n",
     "}\n",
     "for k in ARPABET2IPA.keys():\n",
     "    if ARPABET2IPA[k] in IPA_SUBSTITUTIONS:\n",
@@ -167,7 +163,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -237,7 +233,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -272,7 +268,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
@@ -353,7 +349,251 @@
       "\n",
       "Character comparison:\n",
       "Ground Truth: k o ʊ m    \n",
-      "Prediction:   o w æ ɡ ə θ\n"
+      "Prediction:   o w æ ɡ ə θ\n",
+      "\n",
+      "Evaluating model: vitouphy/wav2vec2-xls-r-300m-timit-phoneme\n",
+      "Progress: 652/652\n",
+      "Results for vitouphy/wav2vec2-xls-r-300m-timit-phoneme:\n",
+      "Average CER: 0.6444\n",
+      "Average Feature Distance: 20.1286\n",
+      "\n",
+      "Found 27 suspicious cases (CER > 1, Feature Distance < 28):\n",
+      "\n",
+      "Pattern Analysis:\n",
+      "\n",
+      "Top 5 Character Substitution Patterns:\n",
+      "  ʌ → ɪ ə: 1 times\n",
+      "  ʌ →   ɛ: 1 times\n",
+      "  ʌ → ə : 1 times\n",
+      "  æ → eɪɑ: 1 times\n",
+      "  t → dʊj: 1 times\n",
+      "\n",
+      "Average length difference: 3.56 characters\n",
+      "\n",
+      "Top 3 Common Additions:\n",
+      "  Added ' ': 5 times\n",
+      "  Added 's ɪɪŋg': 1 times\n",
+      "  Added 'b ': 1 times\n",
+      "\n",
+      "Top 3 Common Deletions:\n",
+      "  Deleted 'sʌjæ': 1 times\n",
+      "  Deleted 'me': 1 times\n",
+      "  Deleted 'pæ': 1 times\n",
+      "\n",
+      "Example Suspicious Cases:\n",
+      "\n",
+      "Case 1:\n",
+      "Ground Truth: jʌjɛsʌhʌjɛsʌjæ\n",
+      "Prediction:   jɪ əjɛs  ɛhə js ɪɪŋgɛ\n",
+      "CER: 1.0714\n",
+      "FER: 25.5625\n",
+      "\n",
+      "Character comparison:\n",
+      "Ground Truth: j ʌ j ɛ s ʌ h ʌ j ɛ s ʌ j æ              \n",
+      "Prediction:   j ɪ   ə j ɛ s     ɛ h ə   j s   ɪ ɪ ŋ g ɛ\n",
+      "\n",
+      "Case 2:\n",
+      "Ground Truth: bæ\n",
+      "Prediction:   beɪɑ\n",
+      "CER: 1.5000\n",
+      "FER: 14.7500\n",
+      "\n",
+      "Character comparison:\n",
+      "Ground Truth: b æ    \n",
+      "Prediction:   b e ɪ ɑ\n",
+      "\n",
+      "Case 3:\n",
+      "Ground Truth: tuθbɹʌʃ\n",
+      "Prediction:   dʊjufɝə ʤ\n",
+      "CER: 1.1429\n",
+      "FER: 17.2500\n",
+      "\n",
+      "Character comparison:\n",
+      "Ground Truth: t u θ b ɹ ʌ ʃ    \n",
+      "Prediction:   d ʊ j u f ɝ ə   ʤ\n",
+      "\n",
+      "Evaluating model: speech31/wav2vec2-large-TIMIT-IPA\n",
+      "Progress: 652/652\n",
+      "Results for speech31/wav2vec2-large-TIMIT-IPA:\n",
+      "Average CER: 0.9337\n",
+      "Average Feature Distance: 36.2800\n",
+      "\n",
+      "Found 48 suspicious cases (CER > 1, Feature Distance < 28):\n",
+      "\n",
+      "Pattern Analysis:\n",
+      "\n",
+      "Top 5 Character Substitution Patterns:\n",
+      "  ɑɡtʌpʊ → rakər bɔɪ: 1 times\n",
+      "  koʊm → in lɪe: 1 times\n",
+      "  ʌʌms → luʃ: 1 times\n",
+      "  a →  wne: 1 times\n",
+      "  nnʌθɪŋ → o e  ai: 1 times\n",
+      "\n",
+      "Average length difference: 3.38 characters\n",
+      "\n",
+      "Top 3 Common Additions:\n",
+      "  Added 't': 2 times\n",
+      "  Added 's': 2 times\n",
+      "  Added 'b': 1 times\n",
+      "\n",
+      "Top 3 Common Deletions:\n",
+      "  Deleted 'ŋk': 1 times\n",
+      "  Deleted 'n': 1 times\n",
+      "  Deleted 't': 1 times\n",
+      "\n",
+      "Example Suspicious Cases:\n",
+      "\n",
+      "Case 1:\n",
+      "Ground Truth: ɑɡtʌpʊs\n",
+      "Prediction:   rakər bɔɪs\n",
+      "CER: 1.2857\n",
+      "FER: 22.7500\n",
+      "\n",
+      "Character comparison:\n",
+      "Ground Truth: ɑ ɡ t ʌ p ʊ s      \n",
+      "Prediction:   r a k ə r   b ɔ ɪ s\n",
+      "\n",
+      "Case 2:\n",
+      "Ground Truth: koʊm\n",
+      "Prediction:   in lɪe\n",
+      "CER: 1.5000\n",
+      "FER: 16.6250\n",
+      "\n",
+      "Character comparison:\n",
+      "Ground Truth: k o ʊ m    \n",
+      "Prediction:   i n   l ɪ e\n",
+      "\n",
+      "Case 3:\n",
+      "Ground Truth: ʌʌmsiaɪ\n",
+      "Prediction:   luʃi wneɪt\n",
+      "CER: 1.2857\n",
+      "FER: 25.6250\n",
+      "\n",
+      "Character comparison:\n",
+      "Ground Truth: ʌ ʌ m s i a ɪ      \n",
+      "Prediction:   l u ʃ i   w n e ɪ t\n",
+      "\n",
+      "Evaluating model: speech31/wav2vec2-large-english-TIMIT-phoneme_v3\n",
+      "Progress: 652/652\n",
+      "Results for speech31/wav2vec2-large-english-TIMIT-phoneme_v3:\n",
+      "Average CER: 0.8897\n",
+      "Average Feature Distance: 39.5965\n",
+      "\n",
+      "Found 42 suspicious cases (CER > 1, Feature Distance < 28):\n",
+      "\n",
+      "Pattern Analysis:\n",
+      "\n",
+      "Top 5 Character Substitution Patterns:\n",
+      "  bivɹ → du ju ər: 1 times\n",
+      "  kʌnu → gəərb: 1 times\n",
+      "  ʊɪ → rɛ: 1 times\n",
+      "  ɪtɑ → ɛ: 1 times\n",
+      "  ʌbɔɹd →  ər əv wwər: 1 times\n",
+      "\n",
+      "Average length difference: 2.98 characters\n",
+      "\n",
+      "Top 3 Common Additions:\n",
+      "  Added 'tɪ loʊvərm hɛ': 1 times\n",
+      "  Added 'ɪz ': 1 times\n",
+      "  Added 'da': 1 times\n",
+      "\n",
+      "Top 3 Common Deletions:\n",
+      "  Deleted 't': 3 times\n",
+      "  Deleted 'ʌlɪvɹɪŋɪt': 1 times\n",
+      "  Deleted 'θɹo': 1 times\n",
+      "\n",
+      "Example Suspicious Cases:\n",
+      "\n",
+      "Case 1:\n",
+      "Ground Truth: bivɹ\n",
+      "Prediction:   du ju ər\n",
+      "CER: 2.0000\n",
+      "FER: 20.3750\n",
+      "\n",
+      "Character comparison:\n",
+      "Ground Truth: b i v ɹ        \n",
+      "Prediction:   d u   j u   ə r\n",
+      "\n",
+      "Case 2:\n",
+      "Ground Truth: kʌnu\n",
+      "Prediction:   gəərb\n",
+      "CER: 1.2500\n",
+      "FER: 13.1875\n",
+      "\n",
+      "Character comparison:\n",
+      "Ground Truth: k ʌ n u  \n",
+      "Prediction:   g ə ə r b\n",
+      "\n",
+      "Case 3:\n",
+      "Ground Truth: pʊɪnɪtɑnʌbɔɹd\n",
+      "Prediction:   prɛnɛn ər əv wwər\n",
+      "CER: 1.1538\n",
+      "FER: 19.3125\n",
+      "\n",
+      "Character comparison:\n",
+      "Ground Truth: p ʊ ɪ n ɪ t ɑ n ʌ b ɔ ɹ d        \n",
+      "Prediction:   p r ɛ n ɛ n   ə r   ə v   w w ə r\n",
+      "\n",
+      "Evaluating model: speech31/wav2vec2-large-TIMIT-IPA2\n",
+      "Progress: 652/652\n",
+      "Results for speech31/wav2vec2-large-TIMIT-IPA2:\n",
+      "Average CER: 1.5627\n",
+      "Average Feature Distance: 32.3273\n",
+      "\n",
+      "Found 278 suspicious cases (CER > 1, Feature Distance < 28):\n",
+      "\n",
+      "Pattern Analysis:\n",
+      "\n",
+      "Top 5 Character Substitution Patterns:\n",
+      "  ɹ →  r : 18 times\n",
+      "  ʌ →  ə : 13 times\n",
+      "  ɹ →  ə r: 6 times\n",
+      "  ɪ →  ə : 5 times\n",
+      "  k → t : 4 times\n",
+      "\n",
+      "Average length difference: 7.87 characters\n",
+      "\n",
+      "Top 3 Common Additions:\n",
+      "  Added ' ': 323 times\n",
+      "  Added ' r ': 5 times\n",
+      "  Added ' t ': 4 times\n",
+      "\n",
+      "Top 3 Common Deletions:\n",
+      "  Deleted 'ŋ': 2 times\n",
+      "  Deleted 'keɪnoʊ': 1 times\n",
+      "  Deleted 'k': 1 times\n",
+      "\n",
+      "Example Suspicious Cases:\n",
+      "\n",
+      "Case 1:\n",
+      "Ground Truth: koʊm\n",
+      "Prediction:   k ɔ i n\n",
+      "CER: 1.5000\n",
+      "FER: 1.3125\n",
+      "\n",
+      "Character comparison:\n",
+      "Ground Truth: k o ʊ m      \n",
+      "Prediction:   k   ɔ   i   n\n",
+      "\n",
+      "Case 2:\n",
+      "Ground Truth: bɹʌʃ\n",
+      "Prediction:   d ɪ ɪ b r ə ʃ\n",
+      "CER: 2.7500\n",
+      "FER: 23.6250\n",
+      "\n",
+      "Character comparison:\n",
+      "Ground Truth: b ɹ ʌ ʃ                  \n",
+      "Prediction:   d   ɪ   ɪ   b   r   ə   ʃ\n",
+      "\n",
+      "Case 3:\n",
+      "Ground Truth: ɑɡtʌpʊs\n",
+      "Prediction:   ɔ g d ə b ʊ s s\n",
+      "CER: 1.8571\n",
+      "FER: 8.3750\n",
+      "\n",
+      "Character comparison:\n",
+      "Ground Truth: ɑ ɡ t ʌ p ʊ s                \n",
+      "Prediction:   ɔ   g   d   ə   b   ʊ   s   s\n"
      ]
     }
    ],
@@ -371,10 +611,15 @@
     "data = test  # Use the PSSTDataset class\n",
     "amount = len(data)  # Get length from dataset\n",
     "results = {}\n",
+    "DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
     "\n",
     "# List of all model IDs to evaluate\n",
     "MODEL_IDS = [\n",
     "    \"KoelLabs/xlsr-timit-a1\",\n",
+    "    \"vitouphy/wav2vec2-xls-r-300m-timit-phoneme\",\n",
+    "    \"speech31/wav2vec2-large-TIMIT-IPA\",\n",
+    "    \"speech31/wav2vec2-large-english-TIMIT-phoneme_v3\",\n",
+    "    \"speech31/wav2vec2-large-TIMIT-IPA2\"\n",
     "]\n",
     "\n",
     "def transcribe_batch(batch, model, processor, device):\n",
@@ -563,7 +808,7 @@
     "            torch.mps.empty_cache()\n",
     "\n",
     "def save_results():\n",
-    "    with open('model_evaluation_detailed_results.txt', 'w') as f:\n",
+    "    with open('timit_models_evaluation_detailed_results.txt', 'w') as f:\n",
     "        f.write(\"=== Model Evaluation Detailed Results ===\\n\\n\")\n",
     "        for model_id, result in results.items():\n",
     "            f.write(f\"{model_id}:\\n\")\n",