[MXNET-1210] Gluon Audio - Example (apache#13325)

* Initialized the example
* Addressed PR comments, about existing synset.txt file - no overwrite
* RST - docstring issues fixed
* added README
* Addressed PR comments
* Addressed PR comments, checking Divide by 0
* Raising error if format is not supported.
* changed a line for ndarray of labels
* Trigger CI
* Trigger CI
* PR comments addressed around skip_header argument
* Addressed PR comments around librosa import
* PR Comments
* Passing lazy=lazy from argument
* Added PR comments, labels to README.MD
* Trigger CI
* Addressing PR Comments in README
* Modified README.md
* Added example under audio folder
* Retrigger CI
* Retrigger CI
1 parent 9921fcd commit d6ae538

Showing 7 changed files with 768 additions and 0 deletions.
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# coding: utf-8
# pylint: disable= arguments-differ
"""Audio transforms."""

import warnings
import numpy as np
try:
    import librosa
except ImportError:
    warnings.warn("librosa dependency could not be resolved or imported; "
                  "some or all transforms will not be available.")

from mxnet import ndarray as nd
from mxnet.gluon.block import Block

class MFCC(Block):
    """Extracts Mel-frequency cepstral coefficients (MFCCs) from an audio signal.
    More details: https://librosa.github.io/librosa/generated/librosa.feature.mfcc.html
    Attributes
    ----------
    sampling_rate: int, default 22050
        sampling rate of the input audio signal
    num_mfcc: int, default 20
        number of MFCCs to return
    Inputs:
        - **x**: input tensor (samples, ) shape.
    Outputs:
        - **out**: output NDArray with (num_mfcc, ) shape, containing the averaged MFCCs.
    """

    def __init__(self, sampling_rate=22050, num_mfcc=20):
        self._sampling_rate = sampling_rate
        self._num_mfcc = num_mfcc
        super(MFCC, self).__init__()

    def forward(self, x):
        if isinstance(x, np.ndarray):
            y = x
        elif isinstance(x, nd.NDArray):
            y = x.asnumpy()
        else:
            warnings.warn("MFCC - allowed datatypes are mx.nd.NDArray and numpy.ndarray")
            return x

        audio_tmp = np.mean(librosa.feature.mfcc(y=y, sr=self._sampling_rate, n_mfcc=self._num_mfcc).T, axis=0)
        return nd.array(audio_tmp)

class Scale(Block):
    """Scale an audio array from integer samples to floating point numbers between
    -1.0 and 1.0 by dividing by ``scale_factor``. The scale factor should match the
    sample resolution (bit depth); the default of 2**31 corresponds to 32-bit samples.
    Attributes
    ----------
    scale_factor : float
        The factor to divide the input samples by.
    Inputs:
        - **x**: input tensor (samples, ) shape.
    Outputs:
        - **out**: output array is a scaled NDArray with (samples, ) shape.
    Examples
    --------
    >>> scale = audio.transforms.Scale(scale_factor=2)
    >>> audio_samples = mx.nd.array([2,3,4])
    >>> scale(audio_samples)
    [1. 1.5 2. ]
    <NDArray 3 @cpu(0)>
    """

    def __init__(self, scale_factor=2**31):
        self.scale_factor = scale_factor
        super(Scale, self).__init__()

    def forward(self, x):
        if self.scale_factor == 0:
            warnings.warn("Scale factor cannot be 0.")
            return x
        if isinstance(x, np.ndarray):
            return nd.array(x / self.scale_factor)
        return x / self.scale_factor

class PadTrim(Block):
    """Pad or trim a 1-D NDArray or numpy.ndarray (signal or labels) to a fixed length.
    Attributes
    ----------
    max_len : int
        Length to which the array will be padded or trimmed.
    fill_value: int or float
        If padding is needed, the value to pad at the end of the input array.
    Inputs:
        - **x**: input tensor (samples, ) shape.
    Outputs:
        - **out**: output NDArray with (max_len, ) shape.
    Examples
    --------
    >>> padtrim = audio.transforms.PadTrim(max_len=9, fill_value=0)
    >>> audio_samples = mx.nd.array([1,2,3,4,5])
    >>> padtrim(audio_samples)
    [1. 2. 3. 4. 5. 0. 0. 0. 0.]
    <NDArray 9 @cpu(0)>
    """

    def __init__(self, max_len, fill_value=0):
        self._max_len = max_len
        self._fill_value = fill_value
        super(PadTrim, self).__init__()

    def forward(self, x):
        if isinstance(x, np.ndarray):
            x = nd.array(x)
        if self._max_len > x.size:
            pad = nd.ones((self._max_len - x.size,)) * self._fill_value
            x = nd.concat(x, pad, dim=0)
        elif self._max_len < x.size:
            x = x[:self._max_len]
        return x

class MEL(Block):
    """Create mel spectrograms from a raw audio signal. This is a relatively slow operation.
    Attributes
    ----------
    sampling_rate: int, default 22050
        sampling rate of the input audio signal
    num_fft: int, default 2048
        length of the Fast Fourier Transform window
    num_mels: int, default 20
        number of mel bands to generate
    hop_length: int, default 512
        number of samples between successive frames
    Inputs:
        - **x**: input tensor (samples, ) shape.
    Outputs:
        - **out**: output array of mel spectrograms, shape = (n_mels, 1)
    Usage (see librosa.feature.melspectrogram docs):
        MEL(sampling_rate=16000, num_fft=1600, hop_length=800, num_mels=64)
    Examples
    --------
    >>> mel = audio.transforms.MEL()
    >>> audio_samples = mx.nd.array([1,2,3,4,5])
    >>> mel(audio_samples)
    [[3.81801406e+04]
     [9.86858240e-29]
     [1.87405472e-29]
     [2.38637225e-29]
     [3.94043010e-29]
     [3.67071565e-29]
     [7.29390295e-29]
     [8.84324438e-30]...
    <NDArray 128x1 @cpu(0)>
    """

    def __init__(self, sampling_rate=22050, num_fft=2048, num_mels=20, hop_length=512):
        self._sampling_rate = sampling_rate
        self._num_fft = num_fft
        self._num_mels = num_mels
        self._hop_length = hop_length
        super(MEL, self).__init__()

    def forward(self, x):
        if isinstance(x, nd.NDArray):
            x = x.asnumpy()
        specs = librosa.feature.melspectrogram(x, sr=self._sampling_rate,
                                               n_fft=self._num_fft, n_mels=self._num_mels,
                                               hop_length=self._hop_length)
        return nd.array(specs)
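
To see how these transforms fit together, here is a minimal usage sketch. It is not part of this commit; it assumes the file above is importable as `transforms.py` and that `librosa` is installed, and the dummy signal and its length are arbitrary.

```python
# Minimal usage sketch (assumption: the file above is importable as transforms.py).
import numpy as np
from mxnet import nd

from transforms import MFCC, Scale, PadTrim, MEL

# A dummy one-second signal at 22050 Hz, already in floating point.
signal = nd.array(np.random.uniform(low=-1.0, high=1.0, size=(22050,)))

padded = PadTrim(max_len=30000, fill_value=0)(signal)   # pad with zeros to shape (30000,)
scaled = Scale(scale_factor=2 ** 31)(signal)            # divide every sample by 2**31
mfcc = MFCC(sampling_rate=22050, num_mfcc=20)(signal)   # averaged MFCCs, shape (20,)
mel = MEL(sampling_rate=22050)(signal)                  # mel spectrogram NDArray

print(padded.shape, scaled.shape, mfcc.shape, mel.shape)
```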
# Urban Sounds Classification in MXNet Gluon

This example provides an end-to-end pipeline for the Urban Sounds Classification practice competition hosted on Analytics Vidhya's DataHack platform.
Below is the link to the competition:
https://datahack.analyticsvidhya.com/contest/practice-problem-urban-sound-classification/

After logging in, the dataset can be downloaded.
The details of the dataset and the link to download it are given below.

## Urban Sounds Dataset:
### Description
The dataset contains 8732 wav files, which are audio samples (<= 4s) of street sounds such as engine_idling, car_horn, children_playing, dog_barking and so on.
The task is to classify these audio samples into one of the following 10 labels:
```
siren,
street_music,
drilling,
dog_bark,
children_playing,
gun_shot,
engine_idling,
air_conditioner,
jackhammer,
car_horn
```

To be able to run this example:

1. `pip install -r requirements.txt`

    If you are in the directory where the requirements.txt file lies,
    this step installs the required libraries to run the example.
    The main dependency required is Librosa.
    The version used to test the example is `0.6.2`.
    For more details, refer here:
    https://librosa.github.io/librosa/install.html

2. Download the dataset (train.zip, test.zip) required for this example from the location:
https://drive.google.com/drive/folders/0By0bAi7hOBAFUHVXd1JCN3MwTEU

3. Extract both the zip archives into the **current directory** - after unzipping you would get 2 new folders, namely
**Train** and **Test**, and two csv files - **train.csv**, **test.csv**

    Assuming you are in a directory *"UrbanSounds"*, after downloading and extracting train.zip, the folder structure should be:

```
UrbanSounds
    - Train
        - 0.wav, 1.wav ...
    - train.csv
    - train.py
    - predict.py ...
```

4. Install Apache MXNet. For instructions, go to the link: https://mxnet.incubator.apache.org/install/

For information on the current design of how the AudioFolderDataset is implemented, refer to:
https://cwiki.apache.org/confluence/display/MXNET/Gluon+-+Audio (a rough, illustrative sketch of the idea is shown below).
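
The following is a rough, hypothetical sketch of the folder-dataset idea used here - pairing the wav files in a folder with labels read from the csv file. The class name `SimpleAudioFolder` and the assumption that each csv row is a (wav file name, label) pair with a header row are illustrative only; this is not the actual AudioFolderDataset implementation, whose design is described on the wiki page above.

```python
# Hypothetical sketch only - the actual AudioFolderDataset may differ.
import os
import csv

import librosa
from mxnet.gluon.data import Dataset


class SimpleAudioFolder(Dataset):
    """Pairs wav files under `root` with labels read from a csv of (file name, label) rows."""

    def __init__(self, root, csv_file):
        self._root = root
        with open(csv_file) as f:
            rows = list(csv.reader(f))[1:]  # skip the header row
        self._items = [(row[0], row[1]) for row in rows]
        # Map string labels to integer class indices.
        classes = sorted({label for _, label in self._items})
        self._label_to_idx = {label: i for i, label in enumerate(classes)}

    def __getitem__(self, idx):
        file_name, label = self._items[idx]
        signal, _ = librosa.load(os.path.join(self._root, file_name))
        return signal, self._label_to_idx[label]

    def __len__(self):
        return len(self._items)
```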
### Usage

For training:

- Arguments
    - train : The folder/directory that contains the audio (wav) files locally. Default = "./Train"
    - csv : The file name of the csv file that contains the audio file name to label mapping. Default = "train.csv"
    - epochs : Number of epochs to train the model. Default = 30
    - batch_size : The batch size for training. Default = 32

###### To use the default arguments, use:
```
python train.py
```
or
###### To pass command-line arguments for the training data directory, epochs, batch_size and csv file name, use:
```
python train.py --train ./Train --csv train.csv --batch_size 32 --epochs 30
```

For prediction:

- Arguments
    - pred : The folder/directory that contains the audio (wav) files which are to be classified. Default = "./Test"

###### To use the default arguments, use:
```
python predict.py
```
or
###### To pass a command-line argument for the test data directory, use:
```
python predict.py --pred ./Test
```