Skip to content

Commit

Permalink
[voicerss] add support for WAV audio format
Browse files Browse the repository at this point in the history
Signed-off-by: Andreas Brenk <mail@andreasbrenk.com>
  • Loading branch information
abrenk committed Dec 31, 2021
1 parent 3e4aa9d commit 14ad4a6
Show file tree
Hide file tree
Showing 6 changed files with 129 additions and 71 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,20 @@ public class VoiceRSSTTSService implements TTSService {

// API Key comes from ConfigAdmin
private static final String CONFIG_API_KEY = "apiKey";

/**
* Map from openHAB AudioFormat Codec to VoiceRSS API Audio Codec
*/
private static final Map<String, String> CODEC_MAP = Map.of(AudioFormat.CODEC_PCM_SIGNED, "WAV",
AudioFormat.CODEC_PCM_UNSIGNED, "WAV", AudioFormat.CODEC_PCM_ALAW, "WAV", AudioFormat.CODEC_PCM_ULAW, "WAV",
AudioFormat.CODEC_MP3, "MP3", AudioFormat.CODEC_VORBIS, "OGG", AudioFormat.CODEC_AAC, "AAC");

/**
* Map from openHAB AudioFormat Frequency to VoiceRSS API Audio Frequency
*/
private static final Map<Long, String> FREQUENCY_MAP = Map.of(8_000L, "8khz", 11_025L, "11khz", 12_000L, "12khz",
16_000L, "16khz", 22_050L, "22khz", 24_000L, "24khz", 32_000L, "32khz", 44_100L, "44khz", 48_000L, "48khz");

private String apiKey;

private final Logger logger = LoggerFactory.getLogger(VoiceRSSTTSService.class);
Expand Down Expand Up @@ -121,22 +135,12 @@ public AudioStream synthesize(String text, Voice voice, AudioFormat requestedFor
if (!voices.contains(voice)) {
throw new TTSException("The passed voice is unsupported");
}
boolean isAudioFormatSupported = false;
for (AudioFormat currentAudioFormat : audioFormats) {
if (currentAudioFormat.isCompatible(requestedFormat)) {
isAudioFormatSupported = true;
break;
}
}
if (!isAudioFormatSupported) {
throw new TTSException("The passed AudioFormat is unsupported");
}

// now create the input stream for given text, locale, format. There is
// only a default voice
// now create the input stream for given text, locale, voice, codec and format.
try {
File cacheAudioFile = voiceRssImpl.getTextToSpeechAsFile(apiKey, trimmedText,
voice.getLocale().toLanguageTag(), voice.getLabel(), getApiAudioFormat(requestedFormat));
voice.getLocale().toLanguageTag(), voice.getLabel(), getApiAudioCodec(requestedFormat),
getApiAudioFormat(requestedFormat));
if (cacheAudioFile == null) {
throw new TTSException("Could not read from VoiceRSS service");
}
Expand Down Expand Up @@ -169,46 +173,53 @@ private Set<Voice> initVoices() {
* @return The audio formats of this instance
*/
private Set<AudioFormat> initAudioFormats() {
Set<AudioFormat> audioFormats = new HashSet<>();
for (String format : voiceRssImpl.getAvailableAudioFormats()) {
audioFormats.add(getAudioFormat(format));
}
return audioFormats;
return voiceRssImpl.getAvailableAudioFormats();
}

private AudioFormat getAudioFormat(String apiFormat) {
Boolean bigEndian = null;
Integer bitDepth = 16;
Integer bitRate = null;
Long frequency = 44100L;

if ("MP3".equals(apiFormat)) {
// we use by default: MP3, 44khz_16bit_mono with bitrate 64 kbps
bitRate = 64000;
return new AudioFormat(AudioFormat.CONTAINER_NONE, AudioFormat.CODEC_MP3, bigEndian, bitDepth, bitRate,
frequency);
} else if ("OGG".equals(apiFormat)) {
// we use by default: OGG, 44khz_16bit_mono
return new AudioFormat(AudioFormat.CONTAINER_OGG, AudioFormat.CODEC_VORBIS, bigEndian, bitDepth, bitRate,
frequency);
} else if ("AAC".equals(apiFormat)) {
// we use by default: AAC, 44khz_16bit_mono
return new AudioFormat(AudioFormat.CONTAINER_NONE, AudioFormat.CODEC_AAC, bigEndian, bitDepth, bitRate,
frequency);
} else {
throw new IllegalArgumentException("Audio format " + apiFormat + " not yet supported");
/**
 * Translates the {@link AudioFormat#getCodec() codec} of an openHAB audio format into the codec
 * name used by the VoiceRSS API. A format that carries no codec is handled as signed PCM.
 *
 * @param format the requested audio format
 * @return the VoiceRSS API codec name looked up in {@code CODEC_MAP}
 * @throws TTSException if {@code format} is not supported
 */
private String getApiAudioCodec(AudioFormat format) throws TTSException {
    String codec = format.getCodec();
    if (codec == null) {
        // no codec requested: fall back to signed PCM
        codec = AudioFormat.CODEC_PCM_SIGNED;
    }

    final String mappedCodec = CODEC_MAP.get(codec);
    if (mappedCodec != null) {
        return mappedCodec;
    }
    throw new TTSException("Unsupported audio format: " + format);
}

private String getApiAudioFormat(AudioFormat format) {
if (format.getCodec().equals(AudioFormat.CODEC_MP3)) {
return "MP3";
} else if (format.getCodec().equals(AudioFormat.CODEC_VORBIS)) {
return "OGG";
} else if (format.getCodec().equals(AudioFormat.CODEC_AAC)) {
return "AAC";
} else {
throw new IllegalArgumentException("Audio format " + format.getCodec() + " not yet supported");
/**
 * Map {@link AudioFormat#getBitDepth() bit depth} and {@link AudioFormat#getFrequency() frequency} to VoiceRSS API
 * format.
 * <p>
 * A missing bit depth defaults to 16 and a missing frequency to 44.1 kHz, which yields the format
 * {@code "44khz_16bit_mono"}. A-law and u-law formats are named {@code "alaw_<freq>_mono"} and
 * {@code "ulaw_<freq>_mono"}; all other supported codecs are named {@code "<freq>_<depth>bit_mono"}.
 *
 * @param format the requested audio format
 * @return the VoiceRSS API format name
 * @throws TTSException if {@code format} is not supported (unknown frequency, bit depth other than 8 or 16,
 *             or unknown codec)
 */
private String getApiAudioFormat(AudioFormat format) throws TTSException {
    final int bitDepth = format.getBitDepth() != null ? format.getBitDepth() : 16;
    final Long frequency = format.getFrequency() != null ? format.getFrequency() : 44_100L;
    final String apiFrequency = FREQUENCY_MAP.get(frequency);

    if (apiFrequency == null || (bitDepth != 8 && bitDepth != 16)) {
        throw new TTSException("Unsupported audio format: " + format);
    }

    switch (format.getCodec() != null ? format.getCodec() : AudioFormat.CODEC_PCM_SIGNED) {
        case AudioFormat.CODEC_PCM_ALAW:
            return "alaw_" + apiFrequency + "_mono";
        case AudioFormat.CODEC_PCM_ULAW:
            return "ulaw_" + apiFrequency + "_mono";
        case AudioFormat.CODEC_PCM_SIGNED:
        case AudioFormat.CODEC_PCM_UNSIGNED:
        case AudioFormat.CODEC_MP3:
        case AudioFormat.CODEC_VORBIS:
        case AudioFormat.CODEC_AAC:
            // The "bit" token is required: the default format must read "44khz_16bit_mono" so that it
            // matches both the API naming and the default-format check used for cache file names.
            return apiFrequency + "_" + bitDepth + "bit_mono";
        default:
            throw new TTSException("Unsupported audio format: " + format);
    }
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Objects;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -55,17 +56,17 @@ public CachedVoiceRSSCloudImpl(String cacheFolderName) {
}
}

public File getTextToSpeechAsFile(String apiKey, String text, String locale, String voice, String audioFormat)
throws IOException {
String fileNameInCache = getUniqueFilenameForText(text, locale, voice);
public File getTextToSpeechAsFile(String apiKey, String text, String locale, String voice, String audioCodec,
String audioFormat) throws IOException {
String fileNameInCache = getUniqueFilenameForText(text, locale, voice, audioFormat);
// check if in cache
File audioFileInCache = new File(cacheFolder, fileNameInCache + "." + audioFormat.toLowerCase());
File audioFileInCache = new File(cacheFolder, fileNameInCache + "." + audioCodec.toLowerCase());
if (audioFileInCache.exists()) {
return audioFileInCache;
}

// if not in cache, get audio data and put to cache
try (InputStream is = super.getTextToSpeech(apiKey, text, locale, voice, audioFormat);
try (InputStream is = super.getTextToSpeech(apiKey, text, locale, voice, audioCodec, audioFormat);
FileOutputStream fos = new FileOutputStream(audioFileInCache)) {
copyStream(is, fos);
// write text to file for transparency too
Expand All @@ -85,11 +86,12 @@ public File getTextToSpeechAsFile(String apiKey, String text, String locale, Str

/**
* Gets a unique filename for a given text, by creating an MD5 hash of it. It
* will be preceded by the locale.
* will be preceded by the locale and suffixed by the format if it is not the
* default of "44khz_16bit_mono".
*
* Sample: "en-US_00a2653ac5f77063bc4ea2fee87318d3"
*/
private String getUniqueFilenameForText(String text, String locale, String voice) {
private String getUniqueFilenameForText(String text, String locale, String voice, String format) {
try {
byte[] bytesOfMessage = text.getBytes(StandardCharsets.UTF_8);
MessageDigest md = MessageDigest.getInstance("MD5");
Expand All @@ -106,6 +108,9 @@ private String getUniqueFilenameForText(String text, String locale, String voice
filename += voice + "_";
}
filename += hashtext;
if (!Objects.equals(format, "44khz_16bit_mono")) {
filename += "_" + format;
}
return filename;
} catch (NoSuchAlgorithmException ex) {
// should not happen
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public interface VoiceRSSCloudAPI {
*
* @return A set of all audio formats supported
*/
Set<String> getAvailableAudioFormats();
Set<AudioFormat> getAvailableAudioFormats();

/**
* Get all supported voices.
Expand Down Expand Up @@ -70,13 +70,15 @@ public interface VoiceRSSCloudAPI {
* the locale to use
* @param voice
* the voice to use, "default" for the default voice
* @param audioCodec
* the audio codec to use
* @param audioFormat
* the audio format to use
* @return an InputStream to the audio data in specified format
* @throws IOException
* will be raised if the audio data can not be retrieved from
* cloud service
*/
InputStream getTextToSpeech(String apiKey, String text, String locale, String voice, String audioFormat)
throws IOException;
InputStream getTextToSpeech(String apiKey, String text, String locale, String voice, String audioCodec,
String audioFormat) throws IOException;
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
*/
package org.openhab.voice.voicerss.internal.cloudapi;

import static java.util.stream.Collectors.toSet;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
Expand All @@ -28,8 +26,8 @@
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.stream.Stream;

import org.openhab.core.audio.AudioFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -41,21 +39,51 @@
* <ul>
* <li>All API languages supported</li>
* <li>Only default voice supported with good audio quality</li>
* <li>Only MP3, OGG and AAC audio formats supported</li>
* <li>MP3, OGG, AAC and WAV audio formats supported</li>
* <li>It uses HTTP and not HTTPS (for performance reasons)</li>
* </ul>
*
* @author Jochen Hiller - Initial contribution
* @author Laurent Garnier - add support for all API languages
* @author Laurent Garnier - add support for OGG and AAC audio formats
* @author Andreas Brenk - add support for WAV audio format
*/
public class VoiceRSSCloudImpl implements VoiceRSSCloudAPI {

public static final String DEFAULT_VOICE = "default";

private final Logger logger = LoggerFactory.getLogger(VoiceRSSCloudImpl.class);

private static final Set<String> SUPPORTED_AUDIO_FORMATS = Stream.of("MP3", "OGG", "AAC").collect(toSet());
/**
 * The audio formats returned by {@code getAvailableAudioFormats()}.
 * <p>
 * The constructor arguments are, in order: container, codec, big-endian flag, bit depth, bit rate and
 * frequency. For every WAVE entry below the bit rate equals frequency multiplied by bit depth.
 */
private static final Set<AudioFormat> SUPPORTED_AUDIO_FORMATS = Set.of(
// compressed formats: 16 bit at 44.1 kHz, no bit rate specified
new AudioFormat(AudioFormat.CONTAINER_NONE, AudioFormat.CODEC_MP3, null, 16, null, 44_100L),
new AudioFormat(AudioFormat.CONTAINER_OGG, AudioFormat.CODEC_VORBIS, null, 16, null, 44_100L),
new AudioFormat(AudioFormat.CONTAINER_NONE, AudioFormat.CODEC_AAC, null, 16, null, 44_100L),
// WAVE PCM: unsigned 8 bit and signed 16 bit for each frequency from 8 kHz to 48 kHz
// NOTE(review): the two 8 kHz entries pass null for the big-endian flag while all other PCM
// entries pass false - confirm whether this difference is intended
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_UNSIGNED, null, 8, 64_000, 8_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, null, 16, 128_000, 8_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_UNSIGNED, false, 8, 88_200, 11_025L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, 16, 176_400, 11_025L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_UNSIGNED, false, 8, 96_000, 12_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, 16, 192_000, 12_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_UNSIGNED, false, 8, 128_000, 16_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, 16, 256_000, 16_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_UNSIGNED, false, 8, 176_400, 22_050L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, 16, 352_800, 22_050L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_UNSIGNED, false, 8, 192_000, 24_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, 16, 384_000, 24_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_UNSIGNED, false, 8, 256_000, 32_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, 16, 512_000, 32_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_UNSIGNED, false, 8, 352_800, 44_100L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, 16, 705_600, 44_100L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_UNSIGNED, false, 8, 384_000, 48_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_SIGNED, false, 16, 768_000, 48_000L),
// WAVE A-law: 8 bit at 8, 11.025, 22.05 and 44.1 kHz
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_ALAW, null, 8, 64_000, 8_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_ALAW, null, 8, 88_200, 11_025L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_ALAW, null, 8, 176_400, 22_050L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_ALAW, null, 8, 352_800, 44_100L),
// WAVE u-law: 8 bit at 8, 11.025, 22.05 and 44.1 kHz
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_ULAW, null, 8, 64_000, 8_000L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_ULAW, null, 8, 88_200, 11_025L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_ULAW, null, 8, 176_400, 22_050L),
new AudioFormat(AudioFormat.CONTAINER_WAVE, AudioFormat.CODEC_PCM_ULAW, null, 8, 352_800, 44_100L));

private static final Set<Locale> SUPPORTED_LOCALES = new HashSet<>();
static {
Expand Down Expand Up @@ -164,7 +192,7 @@ public class VoiceRSSCloudImpl implements VoiceRSSCloudAPI {
}

@Override
public Set<String> getAvailableAudioFormats() {
public Set<AudioFormat> getAvailableAudioFormats() {
return SUPPORTED_AUDIO_FORMATS;
}

Expand Down Expand Up @@ -208,9 +236,9 @@ public Set<String> getAvailableVoices(Locale locale) {
* dependencies.
*/
@Override
public InputStream getTextToSpeech(String apiKey, String text, String locale, String voice, String audioFormat)
throws IOException {
String url = createURL(apiKey, text, locale, voice, audioFormat);
public InputStream getTextToSpeech(String apiKey, String text, String locale, String voice, String audioCodec,
String audioFormat) throws IOException {
String url = createURL(apiKey, text, locale, voice, audioCodec, audioFormat);
logger.debug("Call {}", url);
URLConnection connection = new URL(url).openConnection();

Expand Down Expand Up @@ -254,7 +282,8 @@ public InputStream getTextToSpeech(String apiKey, String text, String locale, St
*
* It is in package scope to be accessed by tests.
*/
private String createURL(String apiKey, String text, String locale, String voice, String audioFormat) {
private String createURL(String apiKey, String text, String locale, String voice, String audioCodec,
String audioFormat) {
String encodedMsg;
try {
encodedMsg = URLEncoder.encode(text, "UTF-8");
Expand All @@ -263,11 +292,12 @@ private String createURL(String apiKey, String text, String locale, String voice
// fall through and use msg un-encoded
encodedMsg = text;
}
String url = "http://api.voicerss.org/?key=" + apiKey + "&hl=" + locale + "&c=" + audioFormat;
String url = "http://api.voicerss.org/?key=" + apiKey + "&hl=" + locale + "&c=" + audioCodec + "&f="
+ audioFormat;
if (!DEFAULT_VOICE.equals(voice)) {
url += "&v=" + voice;
}
url += "&f=44khz_16bit_mono&src=" + encodedMsg;
url += "&src=" + encodedMsg;
return url;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ private void generateCacheForMessage(String apiKey, String cacheDir, String loca
return;
}
CachedVoiceRSSCloudImpl impl = new CachedVoiceRSSCloudImpl(cacheDir);
File cachedFile = impl.getTextToSpeechAsFile(apiKey, trimmedMsg, locale, voice, "MP3");
File cachedFile = impl.getTextToSpeechAsFile(apiKey, trimmedMsg, locale, voice, "MP3", null);
System.out.println(
"Created cached audio for locale='" + locale + "', msg='" + trimmedMsg + "' to file=" + cachedFile);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ public class VoiceRSSTTSServiceTest {
AudioFormat.CODEC_VORBIS, null, 16, null, 44_100L);
// AAC at 44.1 kHz / 16 bit. The codec must be AAC (not MP3), otherwise the AAC assertion only repeats the MP3 check.
private static final AudioFormat AAC_44KHZ_16BIT = new AudioFormat(AudioFormat.CONTAINER_NONE,
        AudioFormat.CODEC_AAC, null, 16, null, 44_100L);
private static final AudioFormat WAV_22KHZ_8BIT = new AudioFormat(AudioFormat.CONTAINER_WAVE,
AudioFormat.CODEC_PCM_UNSIGNED, null, 8, null, 22_050L);
private static final AudioFormat WAV_48KHZ_16BIT = new AudioFormat(AudioFormat.CONTAINER_WAVE,
AudioFormat.CODEC_PCM_SIGNED, false, 16, null, 48_000L);

/**
* The {@link VoiceRSSTTSService} under test.
Expand All @@ -58,13 +62,19 @@ public void testSupportedFormats() {

// check generic formats without any further constraints
assertThat(supportedFormats, hasItem(compatibleAudioFormat(MP3)));
assertThat(supportedFormats, hasItem(compatibleAudioFormat(WAV)));
assertThat(supportedFormats, hasItem(compatibleAudioFormat(OGG)));
assertThat(supportedFormats, hasItem(compatibleAudioFormat(AAC)));

// check specific formats with common constraints
assertThat(supportedFormats, hasItem(compatibleAudioFormat(MP3_44KHZ_16BIT)));
assertThat(supportedFormats, hasItem(compatibleAudioFormat(OGG_44KHZ_16BIT)));
assertThat(supportedFormats, hasItem(compatibleAudioFormat(AAC_44KHZ_16BIT)));
assertThat(supportedFormats, hasItem(compatibleAudioFormat(WAV_22KHZ_8BIT)));
assertThat(supportedFormats, hasItem(compatibleAudioFormat(WAV_48KHZ_16BIT)));

// check specific formats with additional constraints
assertThat(supportedFormats, hasItem(compatibleAudioFormat(bitRate(WAV, 705_600)))); // 44.1 kHz 16-bit

// check unsupported formats
assertThat(supportedFormats, not(hasItem(compatibleAudioFormat(bitDepth(WAV, 24)))));
Expand Down

0 comments on commit 14ad4a6

Please sign in to comment.