processing a subset of dataset

drowe67 · Dec 13, 2024 · 67a669f · 67a669f
1 parent bd8afde
commit 67a669f
Show file tree

Hide file tree

Showing 2 changed files with 35 additions and 76 deletions.
diff --git a/asr_test.sh b/asr_test.sh
@@ -1,7 +1,9 @@
 #!/usr/bin/env bash
 # asr_test.sh
 #
-# Automatic Speech Recognition (ASR) testing for the  Radio Autoencoder
+# Automatic Speech Recognition (ASR) testing for the Radio Autoencoder. This script
+# takes the samples from a clean dataset (e.g. Librispeech test-clean), and generates
+# a dataset with channel simulations (RADE, SSB, etc.) applied.
 
 CODEC2_DEV=${CODEC2_DEV:-${HOME}/codec2-dev}
 PATH=${PATH}:${CODEC2_DEV}/build_linux/src:${CODEC2_DEV}/build_linux/misc:${PWD}/build/src
@@ -12,16 +14,19 @@ source utils.sh
 
 function print_help {
     echo
-    echo "Automated Speech Recognition (ASR) testing for the Radio Autoencoder"
+    echo "Automated Speech Recognition (ASR) dataset processing for Radio Autoencoder testing"
     echo
     echo "  usage ./asr_test.sh path/to/source dest [test option below]"
     echo "  usage ./ota_test.sh ~/.cache/LibriSpeech/test-clean  ~/.cache/LibriSpeech/test-awgn-2dB --awgn 2"
     echo
     echo "    --awgn SNRdB              AWGN channel simulation"
+    echo "    -n numSamples             number of dataset samples to process (default all)"
     echo "    -d                        verbose debug information"
     exit
 }
 
+n_samples=0
+
 POSITIONAL=()
 while [[ $# -gt 0 ]]
 do
@@ -32,6 +37,11 @@ case $key in
         shift
         shift
     ;;
+    -n)
+        n_samples="$2"
+        shift
+        shift
+    ;;
     -d)
         set -x;
         shift
@@ -54,29 +64,35 @@ fi
 source=$1
 dest=$2
 
-# cp translation files to new test directory
+# cp translation files to new dataset directory
 function cp_translation_files {
     pushd $source; trans=$(find . -name '*.txt'); popd
     for f in $trans
     do
-    d=$(dirname $f)
-    mkdir -p ${dest}/${d}
-    cp ${source}/${f} ${dest}/${f}
+        d=$(dirname $f)
+        mkdir -p ${dest}/${d}
+        cp ${source}/${f} ${dest}/${f}
     done
 }
 
-# process audio files and place in new test directory
+# process audio files and place in new dataset directory
 function process {
     pushd $source; flac=$(find . -name '*.flac'); popd
+    if [ $n_samples -ne 0 ]; then
+        flac=$(echo "$flac" | head -n $n_samples)
+    fi
+
+    n=$(echo "$flac" | wc -l)
+    printf "Processing %d samples in dataset\n" $n
+
     for f in $flac
     do
-    d=$(dirname $f)
-    mkdir -p ${dest}/${d}
-    sox ${source}/${f} -t .s16 -r 8000 - | ch - - --No -30 | sox -t .s16 -r 8000 -c 1 - -r 16000 ${dest}/${f}
+        d=$(dirname $f)
+        mkdir -p ${dest}/${d}
+        sox ${source}/${f} -t .s16 -r 8000 - | ch - - --No -30 | sox -t .s16 -r 8000 -c 1 - -r 16000 ${dest}/${f}
     done
-    pwd
 }
 
+#cp_translation_files
 process
-#mkidr -p ${test-clean}
 
diff --git a/asr_wer.py b/asr_wer.py
@@ -1,45 +1,18 @@
 # coding: utf-8
 
-# # Installing Whisper
-# 
-# The commands below will install the Python packages needed to use Whisper models and evaluate the transcription results.
-
-# In[1]:
-
-
-#get_ipython().system(' pip install git+/~https://github.com/openai/whisper.git')
-#get_ipython().system(' pip install jiwer')
-
-
-# # Loading the LibriSpeech dataset
-# 
-# The following will load the test-clean split of the LibriSpeech corpus using torchaudio.
-
-# In[2]:
-
+# derived from: /~https://github.com/openai/whisper/blob/main/notebooks/LibriSpeech.ipynb
 
 import os,argparse
 import numpy as np
-
-#try:
-#    import tensorflow  # required in Colab to avoid protobuf compatibility issues
-#except ImportError:
-#    pass
-
 import torch
 import pandas as pd
 import whisper
 import torchaudio
-
 from tqdm.notebook import tqdm
 
-
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 
-# In[3]:
-
-
 class LibriSpeech(torch.utils.data.Dataset):
     """
     A simple class to wrap LibriSpeech and trim/pad the audio to 30 seconds.
@@ -66,39 +39,25 @@ def __getitem__(self, item):
 
 
 parser = argparse.ArgumentParser()
-parser.add_argument('--test_name', default="test-clean", type=str, help='Librispeech test name')
+parser.add_argument('test_name', type=str, help='Librispeech dataset name (e.g. test-clean)')
+parser.add_argument('-n', type=str, help='Number of dataset entries to use (default all of them)')
 args = parser.parse_args()
 
-print("start");
 dataset = LibriSpeech(args.test_name)
-print("dataset")
-loader = torch.utils.data.DataLoader(dataset, batch_size=16)
-print("loader")
-
-# # Running inference on the dataset using a base Whisper model
-# 
-# The following will take a few minutes to transcribe all utterances in the dataset.
-
-# In[5]:
-
+if args.n:
+    dataset = torch.utils.data.Subset(dataset,list(range(0,int(args.n))))
+print("dataset length:", dataset.__len__())
 
+loader = torch.utils.data.DataLoader(dataset, batch_size=16)
 model = whisper.load_model("base.en")
 print(
     f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
     f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
 )
 
-
-# In[6]:
-
-
 # predict without timestamps for short-form transcription
 options = whisper.DecodingOptions(language="en", without_timestamps=True)
 
-
-# In[7]:
-
-
 hypotheses = []
 references = []
 
@@ -107,38 +66,22 @@ def __getitem__(self, item):
     hypotheses.extend([result.text for result in results])
     references.extend(texts)
 
-
-# In[8]:
-
-
 data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))
-data
 
 
 # # Calculating the word error rate
 # 
 # Now, we use our English normalizer implementation to standardize the transcription and calculate the WER.
 
-# In[9]:
-
-
 import jiwer
 from whisper.normalizers import EnglishTextNormalizer
 
 normalizer = EnglishTextNormalizer()
 
-
-# In[10]:
-
-
 data["hypothesis_clean"] = [normalizer(text) for text in data["hypothesis"]]
 data["reference_clean"] = [normalizer(text) for text in data["reference"]]
 print(data)
 
-
-# In[11]:
-
-
 wer = jiwer.wer(list(data["reference_clean"]), list(data["hypothesis_clean"]))
 
 print(f"WER: {wer * 100:.2f} %")