[MXNET-1210] Gluon Audio - Example (apache#13325)
* Initialized the example

* Addressed PR comments, about existing synset.txt file - no overwrite

* RST - docstring issues fixed

* added README

* Addressed PR comments

* Addressed PR comments, checking Divide by 0

* Raising error if format is not supported.

* changed a line for ndarray of labels

* Trigger CI

* Trigger CI

* PR comments addressed around skip_header argument

* Addressed PR comments around librosa import

* PR Comments

* Passing lazy=lazy from argument

* Added PR comments, labels to README.MD

* Trigger CI

* Addressing PR Comments in README

* Modified README.md

* Added example under audio folder

* Retrigger CI

* Retrigger CI
gaurav-gireesh authored and zhaoyao73 committed Dec 9, 2018
1 parent 9921fcd commit d6ae538
Showing 7 changed files with 768 additions and 0 deletions.
205 changes: 205 additions & 0 deletions example/gluon/audio/transforms.py
@@ -0,0 +1,205 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# coding: utf-8
# pylint: disable= arguments-differ
"""Audio transforms."""

import warnings
import numpy as np
try:
    import librosa
except ImportError:
    warnings.warn("librosa could not be imported; some or all of the audio "
                  "transforms will be unavailable.")

from mxnet import ndarray as nd
from mxnet.gluon.block import Block

class MFCC(Block):
    """Extracts Mel-frequency cepstral coefficients (MFCCs) from an audio signal.
    More details: https://librosa.github.io/librosa/generated/librosa.feature.mfcc.html

    Attributes
    ----------
    sampling_rate: int, default 22050
        sampling rate of the input audio signal
    num_mfcc: int, default 20
        number of MFCCs to return

    Inputs:
        - **x**: input tensor with (samples, ) shape.

    Outputs:
        - **out**: output array with (num_mfcc, ) shape, holding the MFCCs
          averaged over time.
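
    Examples
    --------
    A hedged usage sketch (the output values depend on librosa's MFCC
    implementation, so none are shown; the input below is illustrative):

    >>> mfcc = audio.transforms.MFCC(sampling_rate=22050, num_mfcc=20)
    >>> audio_samples = mx.nd.random.uniform(shape=(22050,))
    >>> features = mfcc(audio_samples)  # NDArray with shape (20,)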
"""

    def __init__(self, sampling_rate=22050, num_mfcc=20):
        self._sampling_rate = sampling_rate
        self._num_mfcc = num_mfcc
        super(MFCC, self).__init__()

    def forward(self, x):
        if isinstance(x, np.ndarray):
            y = x
        elif isinstance(x, nd.NDArray):
            y = x.asnumpy()
        else:
            warnings.warn("MFCC - allowed datatypes are mx.nd.NDArray and numpy.ndarray")
            return x

        audio_tmp = np.mean(librosa.feature.mfcc(y=y, sr=self._sampling_rate,
                                                 n_mfcc=self._num_mfcc).T, axis=0)
        return nd.array(audio_tmp)


class Scale(Block):
    """Scale audio samples from integer PCM values to floating point numbers
    between -1.0 and 1.0 by dividing by ``scale_factor`` (the default,
    ``2**31``, corresponds to 32-bit signed PCM sample resolution).

    Attributes
    ----------
    scale_factor : float
        The factor to divide the input array by.

    Inputs:
        - **x**: input tensor with (samples, ) shape.

    Outputs:
        - **out**: output array is a scaled NDArray with (samples, ) shape.

    Examples
    --------
    >>> scale = audio.transforms.Scale(scale_factor=2)
    >>> audio_samples = mx.nd.array([2, 3, 4])
    >>> scale(audio_samples)
    [1. 1.5 2. ]
    <NDArray 3 @cpu(0)>
    """

    def __init__(self, scale_factor=2**31):
        self.scale_factor = scale_factor
        super(Scale, self).__init__()

    def forward(self, x):
        if self.scale_factor == 0:
            warnings.warn("Scale factor cannot be 0.")
            return x
        if isinstance(x, np.ndarray):
            return nd.array(x / self.scale_factor)
        return x / self.scale_factor


class PadTrim(Block):
    """Pad or trim a 1-D NDArray or NumPy array (signal or labels) to a fixed length.

    Attributes
    ----------
    max_len : int
        Length to which the array will be padded or trimmed.
    fill_value : int or float
        If padding is needed, the value to pad at the end of the input array.

    Inputs:
        - **x**: input tensor with (samples, ) shape.

    Outputs:
        - **out**: output array is a padded/trimmed NDArray with (max_len, ) shape.

    Examples
    --------
    >>> padtrim = audio.transforms.PadTrim(max_len=9, fill_value=0)
    >>> audio_samples = mx.nd.array([1, 2, 3, 4, 5])
    >>> padtrim(audio_samples)
    [1. 2. 3. 4. 5. 0. 0. 0. 0.]
    <NDArray 9 @cpu(0)>
    """

    def __init__(self, max_len, fill_value=0):
        self._max_len = max_len
        self._fill_value = fill_value
        super(PadTrim, self).__init__()

    def forward(self, x):
        if isinstance(x, np.ndarray):
            x = nd.array(x)
        if self._max_len > x.size:
            pad = nd.ones((self._max_len - x.size,)) * self._fill_value
            x = nd.concat(x, pad, dim=0)
        elif self._max_len < x.size:
            x = x[:self._max_len]
        return x


class MEL(Block):
    """Create MEL spectrograms from a raw audio signal. This transform is relatively slow.

    Attributes
    ----------
    sampling_rate: int, default 22050
        sampling rate of the input audio signal
    num_fft: int, default 2048
        length of the Fast Fourier Transform window
    num_mels: int, default 20
        number of mel bands to generate
    hop_length: int, default 512
        number of samples between successive frames

    Inputs:
        - **x**: input tensor with (samples, ) shape.

    Outputs:
        - **out**: output array of mel spectrograms with (num_mels, 1) shape.

    Usage (see the librosa.feature.melspectrogram docs):
        MEL(sampling_rate=16000, num_fft=1600, hop_length=800, num_mels=64)

    Examples
    --------
    >>> mel = audio.transforms.MEL()
    >>> audio_samples = mx.nd.array([1, 2, 3, 4, 5])
    >>> mel(audio_samples)
    [[3.81801406e+04]
     [9.86858240e-29]
     [1.87405472e-29]
     [2.38637225e-29]
     [3.94043010e-29]
     [3.67071565e-29]
     [7.29390295e-29]
     [8.84324438e-30]...
    <NDArray 128x1 @cpu(0)>
    """

    def __init__(self, sampling_rate=22050, num_fft=2048, num_mels=20, hop_length=512):
        self._sampling_rate = sampling_rate
        self._num_fft = num_fft
        self._num_mels = num_mels
        self._hop_length = hop_length
        super(MEL, self).__init__()

    def forward(self, x):
        if isinstance(x, nd.NDArray):
            x = x.asnumpy()
        specs = librosa.feature.melspectrogram(x, sr=self._sampling_rate,
                                               n_fft=self._num_fft,
                                               n_mels=self._num_mels,
                                               hop_length=self._hop_length)
        return nd.array(specs)
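
# A hedged end-to-end sketch of chaining these transforms (illustrative only;
# `raw` stands in for a 1-D NDArray of integer PCM samples):
#
#     raw = nd.array([...])                     # raw audio samples
#     scaled = Scale(scale_factor=2**31)(raw)   # floats in [-1.0, 1.0]
#     fixed = PadTrim(max_len=22050)(scaled)    # exactly one second at 22050 Hz
#     feats = MEL(num_mels=20)(fixed)           # (num_mels, frames) spectrogram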
100 changes: 100 additions & 0 deletions example/gluon/audio/urban_sounds/README.md
@@ -0,0 +1,100 @@
# Urban Sounds Classification in MXNet Gluon

This example provides an end-to-end pipeline for a common data-hack competition, the Urban Sounds Classification challenge.
Below is the link to the competition:
https://datahack.analyticsvidhya.com/contest/practice-problem-urban-sound-classification/

After logging in, the dataset can be downloaded.
The details of the dataset and the link to download it are given below.


## Urban Sounds Dataset:
### Description
The dataset contains 8732 wav files, which are audio samples (each <= 4 s) of street sounds such as engine_idling, car_horn, children_playing, dog_barking and so on.
The task is to classify these audio samples into one of the following 10 labels:
The task is to classify these audio samples into one of the following 10 labels:
```
siren,
street_music,
drilling,
dog_bark,
children_playing,
gun_shot,
engine_idling,
air_conditioner,
jackhammer,
car_horn
```

To be able to run this example:

1. `pip install -r requirements.txt`

If you are in the directory where the requirements.txt file lies,
this step installs the libraries required to run the example.
The main dependency is Librosa; the version used to test the example is `0.6.2`.
For more details, refer to:
https://librosa.github.io/librosa/install.html

2. Download the dataset (train.zip, test.zip) required for this example from:
https://drive.google.com/drive/folders/0By0bAi7hOBAFUHVXd1JCN3MwTEU

3. Extract both zip archives into the **current directory**; after unzipping you will get two new folders, namely
**Train** and **Test**, and two csv files, **train.csv** and **test.csv**.

Assuming you are in a directory *"UrbanSounds"*, after downloading and extracting train.zip the folder structure should be:

```
UrbanSounds
- Train
- 0.wav, 1.wav ...
- train.csv
- train.py
- predict.py ...
```
4. Ensure Apache MXNet is installed on the machine. For instructions, see: https://mxnet.incubator.apache.org/install/
For information on the current design of how the AudioFolderDataset is implemented, refer to:
https://cwiki.apache.org/confluence/display/MXNET/Gluon+-+Audio
A sketch of how the dataset and the transforms compose is shown below.
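
The following is a minimal sketch of that composition. It assumes the `AudioFolderDataset` from this example's `datasets.py` accepts a root folder together with `train_csv`, `file_format` and `skip_header` arguments (the commit notes reference `skip_header`); treat the exact signature and import paths as illustrative rather than authoritative:

```
from mxnet.gluon.data import DataLoader

from datasets import AudioFolderDataset  # provided by this example
from transforms import MFCC              # assumes transforms.py is importable

# Build the dataset from the extracted Train folder and its label csv.
dataset = AudioFolderDataset('./Train', train_csv='./train.csv',
                             file_format='.wav', skip_header=True)

# Extract MFCC features from each clip before batching.
data_loader = DataLoader(dataset.transform_first(MFCC()),
                         batch_size=32, shuffle=True)

for features, labels in data_loader:
    # features has shape (batch_size, num_mfcc); feed it to a classifier
    pass
```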
### Usage
For training:
- Arguments
    - train : The folder/directory that contains the audio (wav) files locally. Default = "./Train"
    - csv : The name of the csv file that contains the audio-file-name-to-label mapping. Default = "train.csv"
    - epochs : Number of epochs to train the model. Default = 30
    - batch_size : The batch size for training. Default = 32
###### To use the default arguments, use:
```
python train.py
```
or
###### To pass command-line arguments for the training data directory, epochs, batch size, and csv file name, use:
```
python train.py --train ./Train --csv train.csv --batch_size 32 --epochs 30
```
For prediction:
- Arguments
    - pred : The folder/directory that contains the audio (wav) files to be classified. Default = "./Test"
###### To use the default arguments, use:
```
python predict.py
```
or
###### To pass a command-line argument for the test data directory, use:
```
python predict.py --pred ./Test
```
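
For orientation, the training script follows the standard Gluon training pattern. Below is a hedged sketch of such a loop; the network architecture, optimizer, and epoch count are illustrative assumptions, not necessarily what train.py in this commit uses:

```
import mxnet as mx
from mxnet import gluon, autograd

# A small dense classifier over per-clip MFCC features; illustrative only.
net = gluon.nn.Sequential()
net.add(gluon.nn.Dense(256, activation='relu'),
        gluon.nn.Dense(10))  # 10 urban-sound labels
net.initialize(mx.init.Xavier())

trainer = gluon.Trainer(net.collect_params(), 'adam')
loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()

for epoch in range(30):
    for features, labels in data_loader:  # data_loader from the sketch above
        with autograd.record():
            loss = loss_fn(net(features), labels)
        loss.backward()
        trainer.step(features.shape[0])
```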