Skip to content

Commit

Permalink
processing a subset of dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
drowe67 committed Dec 13, 2024
1 parent bd8afde commit 67a669f
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 76 deletions.
40 changes: 28 additions & 12 deletions asr_test.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#!/usr/bin/env bash
# asr_test.sh
#
# Automatic Speech Recognition (ASR) testing for the Radio Autoencoder
# Automatic Speech Recognition (ASR) testing for the Radio Autoencoder. This script
# takes the samples from a clean dataset (e.g. Librispeech test-clean), and generates
# a dataset with channel simulations (RADE, SSB etc) applied.

CODEC2_DEV=${CODEC2_DEV:-${HOME}/codec2-dev}
PATH=${PATH}:${CODEC2_DEV}/build_linux/src:${CODEC2_DEV}/build_linux/misc:${PWD}/build/src
Expand All @@ -12,16 +14,19 @@ source utils.sh

# Print command-line usage for this script to stdout, then terminate the shell.
# Takes no arguments. `exit` is called without a status, so the script exits
# with the status of the last echo.
function print_help {
    echo
    echo "Automated Speech Recognition (ASR) dataset processing for Radio Autoencoder testing"
    echo
    echo " usage ./asr_test.sh path/to/source dest [test option below]"
    # Worked example; it must name this script (it previously said ota_test.sh).
    echo " usage ./asr_test.sh ~/.cache/LibriSpeech/test-clean ~/.cache/LibriSpeech/test-awgn-2dB --awgn 2"
    echo
    echo " --awgn SNRdB AWGN channel simulation"
    echo " -n numSamples number of dataset samples to process (default all)"
    echo " -d verbose debug information"
    exit
}

n_samples=0

POSITIONAL=()
while [[ $# -gt 0 ]]
do
Expand All @@ -32,6 +37,11 @@ case $key in
shift
shift
;;
-n)
n_samples="$2"
shift
shift
;;
-d)
set -x;
shift
Expand All @@ -54,29 +64,35 @@ fi
source=$1
dest=$2

# Copy the translation (*.txt) files into the new dataset directory.
#
# Every *.txt found under $source is copied to the same relative path under
# $dest, creating intermediate directories as needed.
# Reads the globals $source and $dest. Returns non-zero if $source cannot be
# entered.
function cp_translation_files {
    local trans f d

    # List paths relative to $source so they can be reused unchanged under $dest.
    pushd "$source" || return 1
    trans=$(find . -name '*.txt')
    popd || return 1

    # Read line by line (rather than word-splitting $trans) so paths that
    # contain spaces survive intact.
    while IFS= read -r f; do
        # An empty find result still yields one empty line from the here-string.
        [ -n "$f" ] || continue
        d=$(dirname "$f")
        mkdir -p "${dest}/${d}"
        cp "${source}/${f}" "${dest}/${f}"
    done <<< "$trans"
}

# Process audio files and place the results in the new dataset directory.
#
# Finds every *.flac under $source, optionally keeps only the first
# $n_samples of them (0 means all), and for each file writes a processed copy
# to the same relative path under $dest using the pipeline:
#   sox (to raw s16 at 8000 Hz) | ch - - --No -30 | sox (raw s16 8000 Hz mono in, 16000 Hz out)
# Reads the globals $source, $dest and $n_samples. Returns non-zero if $source
# cannot be entered.
function process {
    local flac f d n

    pushd "$source" || return 1
    # find's output order is unspecified; sort so that -n selects the same
    # subset on every run and on every filesystem.
    flac=$(find . -name '*.flac' | LC_ALL=C sort)
    popd || return 1

    if [ "$n_samples" -ne 0 ]; then
        flac=$(echo "$flac" | head -n "$n_samples")
    fi

    # Count non-empty lines: `wc -l` would report 1 for an empty file list.
    n=$(echo "$flac" | grep -c .)
    printf "Processing %d samples in dataset\n" "$n"

    while IFS= read -r f; do
        [ -n "$f" ] || continue
        d=$(dirname "$f")
        mkdir -p "${dest}/${d}"
        # The first sox gets /dev/null on stdin so it cannot consume the
        # file list that feeds this loop.
        sox "${source}/${f}" -t .s16 -r 8000 - < /dev/null | ch - - --No -30 | sox -t .s16 -r 8000 -c 1 - -r 16000 "${dest}/${f}"
    done <<< "$flac"
    pwd
}

#cp_translation_files
process
#mkdir -p ${test-clean}

71 changes: 7 additions & 64 deletions asr_wer.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,18 @@
# coding: utf-8

# # Installing Whisper
#
# The commands below will install the Python packages needed to use Whisper models and evaluate the transcription results.

# In[1]:


#get_ipython().system(' pip install git+/~https://github.com/openai/whisper.git')
#get_ipython().system(' pip install jiwer')


# # Loading the LibriSpeech dataset
#
# The following will load the test-clean split of the LibriSpeech corpus using torchaudio.

# In[2]:

# derived from: /~https://github.com/openai/whisper/blob/main/notebooks/LibriSpeech.ipynb

import os,argparse
import numpy as np

#try:
# import tensorflow # required in Colab to avoid protobuf compatibility issues
#except ImportError:
# pass

import torch
import pandas as pd
import whisper
import torchaudio

from tqdm.notebook import tqdm


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# In[3]:


class LibriSpeech(torch.utils.data.Dataset):
"""
A simple class to wrap LibriSpeech and trim/pad the audio to 30 seconds.
Expand All @@ -66,39 +39,25 @@ def __getitem__(self, item):


parser = argparse.ArgumentParser()
parser.add_argument('--test_name', default="test-clean", type=str, help='Librispeech test name')
parser.add_argument('test_name', type=str, help='Librispeech dataset name (e.g. test-clean)')
# Parse -n as an int so a malformed value is rejected by argparse with a usage
# message instead of failing later with a traceback.
parser.add_argument('-n', type=int, help='Number of dataset entries to use (default all of them)')
args = parser.parse_args()
if args.n is not None and args.n < 0:
    parser.error("-n must be a non-negative integer")

dataset = LibriSpeech(args.test_name)

if args.n:
    # Keep only the first n entries. Clamp to the dataset size so the Subset
    # never holds indices that would raise IndexError during iteration.
    n_entries = min(args.n, len(dataset))
    dataset = torch.utils.data.Subset(dataset, list(range(n_entries)))
print("dataset length:", len(dataset))

# Build the loader once, after any subsetting, so it iterates the final dataset.
loader = torch.utils.data.DataLoader(dataset, batch_size=16)
model = whisper.load_model("base.en")
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
)


# In[6]:


# predict without timestamps for short-form transcription
options = whisper.DecodingOptions(language="en", without_timestamps=True)


# In[7]:


hypotheses = []
references = []

Expand All @@ -107,38 +66,22 @@ def __getitem__(self, item):
hypotheses.extend([result.text for result in results])
references.extend(texts)


# In[8]:


# Pair each decoded transcript with its reference text, one row per utterance.
data = pd.DataFrame({"hypothesis": hypotheses, "reference": references})

# Word error rate: both sides are passed through Whisper's English text
# normalizer before scoring, and the normalized text is kept in "*_clean" columns.
import jiwer
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()

for column in ("hypothesis", "reference"):
    data[f"{column}_clean"] = [normalizer(entry) for entry in data[column]]
print(data)

reference_clean = data["reference_clean"].tolist()
hypothesis_clean = data["hypothesis_clean"].tolist()
wer = jiwer.wer(reference_clean, hypothesis_clean)

print(f"WER: {wer * 100:.2f} %")
Expand Down

0 comments on commit 67a669f

Please sign in to comment.