From bd8afdee52374ca63ea5e8595fb518075f31acf8 Mon Sep 17 00:00:00 2001
From: David
Date: Thu, 12 Dec 2024 13:15:28 +1030
Subject: [PATCH] wip ASR - building up test framework

---
 asr_test.sh |  82 +++++++++++++++++++++++++++++
 asr_wer.py  | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 227 insertions(+)
 create mode 100755 asr_test.sh
 create mode 100644 asr_wer.py

diff --git a/asr_test.sh b/asr_test.sh
new file mode 100755
index 0000000..368f1c8
--- /dev/null
+++ b/asr_test.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+# asr_test.sh
+#
+# Automatic Speech Recognition (ASR) testing for the Radio Autoencoder
+
+CODEC2_DEV=${CODEC2_DEV:-${HOME}/codec2-dev}
+PATH=${PATH}:${CODEC2_DEV}/build_linux/src:${CODEC2_DEV}/build_linux/misc:${PWD}/build/src
+
+which ch >/dev/null || { printf "\n**** Can't find ch - check CODEC2_DEV **** \n\n"; exit 1; }
+
+source utils.sh
+
+function print_help {
+    echo
+    echo "Automated Speech Recognition (ASR) testing for the Radio Autoencoder"
+    echo
+    echo "  usage: ./asr_test.sh path/to/source path/to/dest [options below]"
+    echo "  usage: ./asr_test.sh ~/.cache/LibriSpeech/test-clean ~/.cache/LibriSpeech/test-awgn-2dB --awgn 2"
+    echo
+    echo "    --awgn SNRdB      AWGN channel simulation"
+    echo "    -d                verbose debug information"
+    exit
+}
+
+POSITIONAL=()
+while [[ $# -gt 0 ]]
+do
+key="$1"
+case $key in
+    --awgn)
+        awgn_snr_dB="$2"  # note: not yet applied to the ch noise setting below
+        shift
+        shift
+    ;;
+    -d)
+        set -x
+        shift
+    ;;
+    -h)
+        print_help
+    ;;
+    *)
+        POSITIONAL+=("$1") # save it in an array for later
+        shift
+    ;;
+esac
+done
+set -- "${POSITIONAL[@]}" # restore positional parameters
+
+if [ $# -lt 2 ]; then
+    print_help
+fi
+
+source=$1
+dest=$2
+
+# copy LibriSpeech transcript (.txt) files to the new test directory
+function cp_translation_files {
+    pushd $source; trans=$(find . -name '*.txt'); popd
+    for f in $trans
+    do
+        d=$(dirname $f)
+        mkdir -p ${dest}/${d}
+        cp ${source}/${f} ${dest}/${f}
+    done
+}
+
+# process audio files through the channel simulator (8 kHz for ch, back to 16 kHz for ASR) and place in new test directory
+function process {
+    pushd $source; flac=$(find . -name '*.flac'); popd
+    for f in $flac
+    do
+        d=$(dirname $f)
+        mkdir -p ${dest}/${d}
+        sox ${source}/${f} -t .s16 -r 8000 - | ch - - --No -30 | sox -t .s16 -r 8000 -c 1 - -r 16000 ${dest}/${f}
+    done
+    pwd
+}
+
+process
+#mkidr -p ${test-clean}
+
diff --git a/asr_wer.py b/asr_wer.py
new file mode 100644
index 0000000..553f3ac
--- /dev/null
+++ b/asr_wer.py
@@ -0,0 +1,145 @@
+# coding: utf-8
+
+# # Installing Whisper
+#
+# The commands below will install the Python packages needed to use Whisper models and evaluate the transcription results.
+
+# In[1]:
+
+
+#get_ipython().system(' pip install git+/~https://github.com/openai/whisper.git')
+#get_ipython().system(' pip install jiwer')
+
+
+# # Loading the LibriSpeech dataset
+#
+# The following will load the requested LibriSpeech split (test-clean by default) using torchaudio.
+
+# In[2]:
+
+
+import os, argparse
+import numpy as np
+
+#try:
+#    import tensorflow  # required in Colab to avoid protobuf compatibility issues
+#except ImportError:
+#    pass
+
+import torch
+import pandas as pd
+import whisper
+import torchaudio
+
+from tqdm import tqdm
+
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+# In[3]:
+
+
+class LibriSpeech(torch.utils.data.Dataset):
+    """
+    A simple class to wrap LibriSpeech and trim/pad the audio to 30 seconds.
+    It will drop the last few seconds of a very small portion of the utterances.
+ """ + def __init__(self, split="test-clean", device=DEVICE): + self.dataset = torchaudio.datasets.LIBRISPEECH( + root=os.path.expanduser("~/.cache"), + url=split, + download=True, + ) + self.device = device + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, item): + audio, sample_rate, text, _, _, _ = self.dataset[item] + assert sample_rate == 16000 + audio = whisper.pad_or_trim(audio.flatten()).to(self.device) + mel = whisper.log_mel_spectrogram(audio) + + return (mel, text) + + +parser = argparse.ArgumentParser() +parser.add_argument('--test_name', default="test-clean", type=str, help='Librispeech test name') +args = parser.parse_args() + +print("start"); +dataset = LibriSpeech(args.test_name) +print("dataset") +loader = torch.utils.data.DataLoader(dataset, batch_size=16) +print("loader") + +# # Running inference on the dataset using a base Whisper model +# +# The following will take a few minutes to transcribe all utterances in the dataset. + +# In[5]: + + +model = whisper.load_model("base.en") +print( + f"Model is {'multilingual' if model.is_multilingual else 'English-only'} " + f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters." +) + + +# In[6]: + + +# predict without timestamps for short-form transcription +options = whisper.DecodingOptions(language="en", without_timestamps=True) + + +# In[7]: + + +hypotheses = [] +references = [] + +for mels, texts in loader: + results = model.decode(mels, options) + hypotheses.extend([result.text for result in results]) + references.extend(texts) + + +# In[8]: + + +data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references)) +data + + +# # Calculating the word error rate +# +# Now, we use our English normalizer implementation to standardize the transcription and calculate the WER. + +# In[9]: + + +import jiwer +from whisper.normalizers import EnglishTextNormalizer + +normalizer = EnglishTextNormalizer() + + +# In[10]: + + +data["hypothesis_clean"] = [normalizer(text) for text in data["hypothesis"]] +data["reference_clean"] = [normalizer(text) for text in data["reference"]] +print(data) + + +# In[11]: + + +wer = jiwer.wer(list(data["reference_clean"]), list(data["hypothesis_clean"])) + +print(f"WER: {wer * 100:.2f} %") +