Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: speech service fix #512

Merged
merged 2 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions packages/client-discord/src/voice.ts
Original file line number Diff line number Diff line change
Expand Up @@ -416,11 +416,6 @@ export class VoiceManager extends EventEmitter {
ServiceType.TRANSCRIPTION
);

console.log(
"transcriptionService: ",
transcriptionService
);

if (!transcriptionService) {
throw new Error(
"Transcription generation service not found"
Expand Down
1 change: 1 addition & 0 deletions packages/plugin-node/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
"cldr-segmentation": "2.2.1",
"command-exists": "1.2.9",
"csv-writer": "1.6.0",
"echogarden": "^2.0.5",
"espeak-ng": "1.0.2",
"ffmpeg-static": "5.2.0",
"fluent-ffmpeg": "2.1.3",
Expand Down
342 changes: 226 additions & 116 deletions packages/plugin-node/src/services/speech.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,9 @@
import { PassThrough, Readable } from "stream";
import {
IAgentRuntime,
ISpeechService,
ITranscriptionService,
ServiceType,
} from "@ai16z/eliza";
import { IAgentRuntime, ISpeechService, ServiceType } from "@ai16z/eliza";
import { getWavHeader } from "./audioUtils.ts";
import { synthesize } from "../vendor/vits.ts";
import { Service } from "@ai16z/eliza";
import { validateNodeConfig } from "../enviroment.ts";
import * as Echogarden from "echogarden";

function prependWavHeader(
readable: Readable,
Expand Down Expand Up @@ -40,77 +35,141 @@ function prependWavHeader(

async function textToSpeech(runtime: IAgentRuntime, text: string) {
await validateNodeConfig(runtime);
const body = {
model_id: runtime.getSetting("ELEVENLABS_MODEL_ID"),
text: text,
voice_settings: {
similarity_boost: runtime.getSetting(
"ELEVENLABS_VOICE_SIMILARITY_BOOST"
),
stability: runtime.getSetting("ELEVENLABS_VOICE_STABILITY"),
style: runtime.getSetting("ELEVENLABS_VOICE_STYLE"),
use_speaker_boost: runtime.getSetting(
"ELEVENLABS_VOICE_USE_SPEAKER_BOOST"
),
},
};
const options = {
method: "POST",
headers: {
"Content-Type": "application/json",
"xi-api-key": runtime.getSetting("ELEVENLABS_XI_API_KEY"),
},
body: JSON.stringify(body),
};

const response = await fetch(
`https://api.elevenlabs.io/v1/text-to-speech/${runtime.getSetting("ELEVENLABS_VOICE_ID")}/stream?optimize_streaming_latency=${runtime.getSetting("ELEVENLABS_OPTIMIZE_STREAMING_LATENCY")}&output_format=${runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT")}`,
options
);

const status = response.status;
if (status != 200) {
console.log(`Received status ${status} from Eleven Labs API`);
const errorBodyString = await response.text();
throw new Error(
`Received status ${status} from Eleven Labs API: ${errorBodyString}`
try {
const response = await fetch(
`https://api.elevenlabs.io/v1/text-to-speech/${runtime.getSetting("ELEVENLABS_VOICE_ID")}/stream?optimize_streaming_latency=${runtime.getSetting("ELEVENLABS_OPTIMIZE_STREAMING_LATENCY")}&output_format=${runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT")}`,
{
method: "POST",
headers: {
"Content-Type": "application/json",
"xi-api-key": runtime.getSetting("ELEVENLABS_XI_API_KEY"),
},
body: JSON.stringify({
model_id: runtime.getSetting("ELEVENLABS_MODEL_ID"),
text: text,
voice_settings: {
similarity_boost: runtime.getSetting(
"ELEVENLABS_VOICE_SIMILARITY_BOOST"
),
stability: runtime.getSetting(
"ELEVENLABS_VOICE_STABILITY"
),
style: runtime.getSetting("ELEVENLABS_VOICE_STYLE"),
use_speaker_boost: runtime.getSetting(
"ELEVENLABS_VOICE_USE_SPEAKER_BOOST"
),
},
}),
}
);
}

if (response) {
const reader = response.body?.getReader();
const readable = new Readable({
read() {
reader &&
reader.read().then(({ done, value }) => {
if (done) {
this.push(null);
} else {
this.push(value);
}
});
},
});

if (runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT").startsWith("pcm_")) {
const sampleRate = parseInt(
runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT").substring(4)
);
const withHeader = prependWavHeader(
readable,
1024 * 1024 * 100,
sampleRate,
1,
16
const status = response.status;
if (status != 200) {
const errorBodyString = await response.text();
const errorBody = JSON.parse(errorBodyString);

// Check for quota exceeded error
if (
status === 401 &&
errorBody.detail?.status === "quota_exceeded"
) {
console.log("ElevenLabs quota exceeded, falling back to VITS");
throw new Error("QUOTA_EXCEEDED");
}

throw new Error(
`Received status ${status} from Eleven Labs API: ${errorBodyString}`
);
return withHeader;
}

if (response) {
const reader = response.body?.getReader();
const readable = new Readable({
read() {
reader &&
reader.read().then(({ done, value }) => {
if (done) {
this.push(null);
} else {
this.push(value);
}
});
},
});

if (
runtime
.getSetting("ELEVENLABS_OUTPUT_FORMAT")
.startsWith("pcm_")
) {
const sampleRate = parseInt(
runtime.getSetting("ELEVENLABS_OUTPUT_FORMAT").substring(4)
);
const withHeader = prependWavHeader(
readable,
1024 * 1024 * 100,
sampleRate,
1,
16
);
return withHeader;
} else {
return readable;
}
} else {
return readable;
return new Readable({
read() {},
});
}
} catch (error) {
if (error.message === "QUOTA_EXCEEDED") {
// Fall back to VITS
const { audio } = await Echogarden.synthesize(text, {
engine: "vits",
voice: "en_US-hfc_female-medium",
});

let wavStream: Readable;
if (audio instanceof Buffer) {
console.log("audio is a buffer");
wavStream = Readable.from(audio);
} else if ("audioChannels" in audio && "sampleRate" in audio) {
console.log("audio is a RawAudio");
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
console.log("buffer length: ", floatBuffer.length);

// Get the sample rate from the RawAudio object
const sampleRate = audio.sampleRate;

// Create a Float32Array view of the floatBuffer
const floatArray = new Float32Array(floatBuffer.buffer);

// Convert 32-bit float audio to 16-bit PCM
const pcmBuffer = new Int16Array(floatArray.length);
for (let i = 0; i < floatArray.length; i++) {
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
}

// Prepend WAV header to the buffer
const wavHeaderBuffer = getWavHeader(
pcmBuffer.length * 2,
sampleRate,
1,
16
);
const wavBuffer = Buffer.concat([
wavHeaderBuffer,
Buffer.from(pcmBuffer.buffer),
]);

wavStream = Readable.from(wavBuffer);
} else {
throw new Error("Unsupported audio format");
}
return wavStream;
}
} else {
return new Readable({
read() {},
});
throw error; // Re-throw other errors
}
}

Expand All @@ -124,53 +183,104 @@ export class SpeechService extends Service implements ISpeechService {
}

async generate(runtime: IAgentRuntime, text: string): Promise<Readable> {
// check for elevenlabs API key
if (runtime.getSetting("ELEVENLABS_XI_API_KEY")) {
return textToSpeech(runtime, text);
}
const { audio } = await synthesize(text, {
engine: "vits",
voice: "en_US-hfc_female-medium",
});

let wavStream: Readable;
if (audio instanceof Buffer) {
console.log("audio is a buffer");
wavStream = Readable.from(audio);
} else if ("audioChannels" in audio && "sampleRate" in audio) {
console.log("audio is a RawAudio");
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
console.log("buffer length: ", floatBuffer.length);

// Get the sample rate from the RawAudio object
const sampleRate = audio.sampleRate;

// Create a Float32Array view of the floatBuffer
const floatArray = new Float32Array(floatBuffer.buffer);

// Convert 32-bit float audio to 16-bit PCM
const pcmBuffer = new Int16Array(floatArray.length);
for (let i = 0; i < floatArray.length; i++) {
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
try {
// check for elevenlabs API key
if (runtime.getSetting("ELEVENLABS_XI_API_KEY")) {
return await textToSpeech(runtime, text);
}

// Prepend WAV header to the buffer
const wavHeaderBuffer = getWavHeader(
pcmBuffer.length * 2,
sampleRate,
1,
16
);
const wavBuffer = Buffer.concat([
wavHeaderBuffer,
Buffer.from(pcmBuffer.buffer),
]);
// Default to VITS if no ElevenLabs API key
const { audio } = await Echogarden.synthesize(text, {
engine: "vits",
voice: "en_US-hfc_female-medium",
});

wavStream = Readable.from(wavBuffer);
} else {
throw new Error("Unsupported audio format");
}
let wavStream: Readable;
if (audio instanceof Buffer) {
console.log("audio is a buffer");
wavStream = Readable.from(audio);
} else if ("audioChannels" in audio && "sampleRate" in audio) {
console.log("audio is a RawAudio");
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
console.log("buffer length: ", floatBuffer.length);

// Get the sample rate from the RawAudio object
const sampleRate = audio.sampleRate;

// Create a Float32Array view of the floatBuffer
const floatArray = new Float32Array(floatBuffer.buffer);

// Convert 32-bit float audio to 16-bit PCM
const pcmBuffer = new Int16Array(floatArray.length);
for (let i = 0; i < floatArray.length; i++) {
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
}

// Prepend WAV header to the buffer
const wavHeaderBuffer = getWavHeader(
pcmBuffer.length * 2,
sampleRate,
1,
16
);
const wavBuffer = Buffer.concat([
wavHeaderBuffer,
Buffer.from(pcmBuffer.buffer),
]);

wavStream = Readable.from(wavBuffer);
} else {
throw new Error("Unsupported audio format");
}

return wavStream;
return wavStream;
} catch (error) {
console.error("Speech generation error:", error);
// If ElevenLabs fails for any reason, fall back to VITS
const { audio } = await Echogarden.synthesize(text, {
engine: "vits",
voice: "en_US-hfc_female-medium",
});

let wavStream: Readable;
if (audio instanceof Buffer) {
console.log("audio is a buffer");
wavStream = Readable.from(audio);
} else if ("audioChannels" in audio && "sampleRate" in audio) {
console.log("audio is a RawAudio");
const floatBuffer = Buffer.from(audio.audioChannels[0].buffer);
console.log("buffer length: ", floatBuffer.length);

// Get the sample rate from the RawAudio object
const sampleRate = audio.sampleRate;

// Create a Float32Array view of the floatBuffer
const floatArray = new Float32Array(floatBuffer.buffer);

// Convert 32-bit float audio to 16-bit PCM
const pcmBuffer = new Int16Array(floatArray.length);
for (let i = 0; i < floatArray.length; i++) {
pcmBuffer[i] = Math.round(floatArray[i] * 32767);
}

// Prepend WAV header to the buffer
const wavHeaderBuffer = getWavHeader(
pcmBuffer.length * 2,
sampleRate,
1,
16
);
const wavBuffer = Buffer.concat([
wavHeaderBuffer,
Buffer.from(pcmBuffer.buffer),
]);

wavStream = Readable.from(wavBuffer);
} else {
throw new Error("Unsupported audio format");
}

return wavStream;
}
}
}
Loading
Loading