I am using an AI text-to-speech service which delivers the audio chunk by chunk. I’d like the user to be able to start listening before the audio has finished arriving. Any thoughts on how I could achieve this, please? Thanks!
I have this working now - took a lot of fiddling!
Basically:
- When the client clicks a button to start streaming, it calls a JavaScript function (see below) which repeatedly attempts to download and play the audio file.
- It also launches a server function.
The server function requests the audio from the source and steadily assembles an audio file piece by piece, storing it in a data table. When it’s finished, it sets another data table flag, “audio_complete”, to true.
Finally, the server has two HTTP endpoints. One simply reads the current version of the audio file from the data table and returns it as a BlobMedia, and the other just returns the value of “audio_complete”.
(Actually, to complicate things, I have a third HTTP endpoint which first converts the audio into webm and then serves that - this endpoint is used if the browser’s MediaSource can’t handle MPEG audio, which surprisingly is the case for Firefox.)
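Roughly, the server side looks like this - a simplified sketch rather than my exact code: the table name audio_stream, the columns audio and audio_complete, the endpoint paths and fetch_tts_chunks() are all placeholders for whatever your own setup uses, and the webm-transcoding endpoint is left out.

import anvil
import anvil.server
from anvil.tables import app_tables

def fetch_tts_chunks():
    # Placeholder: yield MP3 byte chunks from your TTS provider as they arrive.
    return iter(())   # replace with the real chunk stream

@anvil.server.callable
def start_generation(row_id):
    # Called from the client; kicks off the assembly in the background.
    anvil.server.launch_background_task('assemble_audio', row_id)

@anvil.server.background_task
def assemble_audio(row_id):
    # Steadily assemble the audio file, overwriting the stored copy with
    # everything received so far, so the endpoints always see the latest version.
    row = app_tables.audio_stream.get_by_id(row_id)
    audio_bytes = b""
    for chunk in fetch_tts_chunks():
        audio_bytes += chunk
        row['audio'] = anvil.BlobMedia('audio/mpeg', audio_bytes)
    row['audio_complete'] = True

@anvil.server.http_endpoint('/audio/:row_id')
def get_audio(row_id, **params):
    # Endpoint 1: return whatever audio has been assembled so far.
    return app_tables.audio_stream.get_by_id(row_id)['audio']

@anvil.server.http_endpoint('/audio-complete/:row_id')
def get_audio_complete(row_id, **params):
    # Endpoint 2: a bare JSON boolean, which is what the JavaScript below expects.
    return bool(app_tables.audio_stream.get_by_id(row_id)['audio_complete'])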
Here’s the JavaScript, which was the tricky bit:
function startStreamingAudio(mpegUrl, webmUrl, isAudioCompleteUrl) {
  console.log(mpegUrl);
  console.log(webmUrl);
  console.log(isAudioCompleteUrl);

  let delayBeforeGettingFirstAudio = 5000;   // give the server a head start
  let delayBetweenAttemptsToGetAudio = 500;  // polling interval

  let audio = new Audio();
  let mediaSource = new MediaSource();
  let sourceBuffer;
  let lastFetchPosition = 0;   // how many bytes we've already appended
  let isAudioComplete = false;
  let audioFormat = 'audio/mpeg';
  let audioDataUrl = mpegUrl;

  // Check browser support for audio formats
  console.log("Checking browser compatibility.");
  if (!MediaSource.isTypeSupported('audio/mpeg')) {
    console.log("Doesn't support mpeg - trying webm.");
    if (!MediaSource.isTypeSupported('audio/webm')) {
      console.error('Neither MPEG nor webm is supported by this browser.');
      return;
    }
    audioFormat = 'audio/webm';
    audioDataUrl = webmUrl;
  }

  mediaSource.addEventListener('sourceopen', function() {
    sourceBuffer = mediaSource.addSourceBuffer(audioFormat);
    // Introduce a delay before the first fetch, so the server has some audio ready.
    setTimeout(fetchAndAppendAudio, delayBeforeGettingFirstAudio);
  });

  function fetchAndAppendAudio() {
    console.log("Running fetchAndAppendAudio. audioDataUrl: " + audioDataUrl + ", completeUrl: " + isAudioCompleteUrl);
    if (isAudioComplete) {
      return;
    }

    // If the previous append is still being processed, try again shortly -
    // calling appendBuffer while the SourceBuffer is updating throws an error.
    if (sourceBuffer.updating) {
      setTimeout(fetchAndAppendAudio, delayBetweenAttemptsToGetAudio);
      return;
    }

    // Check whether the audio generation is complete. The endpoint returns a bare
    // JSON boolean. (This resolves asynchronously, so the flag may lag by one cycle.)
    console.log("Getting completion data from " + isAudioCompleteUrl);
    fetch(isAudioCompleteUrl).then(response => {
      return response.json();
    }).then(data => {
      isAudioComplete = data;
    }).catch(err => {
      console.error('Error checking audio completion:', err);
    });

    // Fetch the audio data and append only the bytes we haven't seen yet.
    fetch(audioDataUrl).then(response => {
      return response.arrayBuffer();
    }).then(data => {
      if (data.byteLength > lastFetchPosition) {
        let newData = data.slice(lastFetchPosition);
        sourceBuffer.appendBuffer(newData);
        lastFetchPosition = data.byteLength;
      }
      if (!isAudioComplete) {
        setTimeout(fetchAndAppendAudio, delayBetweenAttemptsToGetAudio);
      }
    }).catch(err => {
      console.error('Error fetching audio data:', err);
    });
  }

  audio.src = URL.createObjectURL(mediaSource);
  audio.play();
}
This code is inefficient, as it repeatedly re-downloads the entire audio file just to get the part that’s new since the last attempt. You could do better using a “start” parameter on the HTTP endpoint, so the server only returns the new bytes - but it seems that Anvil HTTP endpoints don’t support that, so this is the best I was able to do.
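For reference, the idea would be for the client to pass its lastFetchPosition up to the server, which then returns only the new bytes. A hypothetical sketch of such an endpoint (the path, table and column names are made up, and as noted above, this isn’t something I got working with Anvil’s HTTP endpoints):

import anvil
import anvil.server
from anvil.tables import app_tables

@anvil.server.http_endpoint('/audio-from/:row_id')
def get_audio_from(row_id, start='0', **params):
    # Hypothetical: return only the bytes from `start` onwards, so the client
    # (which remembers lastFetchPosition) never re-downloads what it already has.
    row = app_tables.audio_stream.get_by_id(row_id)
    audio_bytes = row['audio'].get_bytes()
    return anvil.BlobMedia('audio/mpeg', audio_bytes[int(start):])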
Actually, that code had various issues, most notably that MediaSource isn’t supported by various browsers, including mobile ones. Here’s an improved version. Took ages to write!
My audio stream is generated slowly, so there are various pauses in the code to wait for it. If your audio is produced faster, you’ll want to shorten or remove these pauses.
async function startStreamingAudio3(audioUrl, statusUrl) {
  let audioContext = new (window.AudioContext || window.webkitAudioContext)();
  let audioQueue = [];          // decoded segments waiting to be played
  let isPlaying = false;
  let lastBufferLength = 0;     // number of sample frames already queued
  let isComplete = false;       // whether the server has finished generating audio
  let fetchInterval;

  // The status endpoint returns "true" once the server has finished generating audio.
  async function fetchStatus(url) {
    const response = await fetch(url);
    const data = await response.text();
    return data.includes("true");
  }

  // Download the whole file as it currently stands and decode it into an AudioBuffer.
  async function fetchAudio(url) {
    const response = await fetch(url);
    const arrayBuffer = await response.arrayBuffer();
    const audioData = await audioContext.decodeAudioData(arrayBuffer);
    return audioData;
  }

  // Copy out only the sample frames we haven't queued yet, as a new AudioBuffer.
  function createNewAudioSegment(audioData) {
    const newLength = audioData.length - lastBufferLength;
    if (newLength <= 0) {
      return null;
    }
    const newSegment = audioContext.createBuffer(
      audioData.numberOfChannels,
      newLength,
      audioData.sampleRate
    );
    for (let channel = 0; channel < audioData.numberOfChannels; channel++) {
      const newData = audioData.getChannelData(channel).slice(lastBufferLength);
      newSegment.copyToChannel(newData, channel);
    }
    lastBufferLength = audioData.length;
    return newSegment;
  }

  async function playAudioQueue() {
    // Autoplay policies may leave the AudioContext suspended until the user interacts.
    if (audioContext.state === 'suspended') {
      try {
        await audioContext.resume();
      } catch (err) {
        alert('Press OK and then click anywhere on the page to start the audio.');
        await new Promise(resolve => document.addEventListener('click', resolve, { once: true }));
        await audioContext.resume();
      }
    }

    // Calculate the total duration of audio in the queue.
    const totalDuration = audioQueue.reduce((acc, buffer) => acc + buffer.duration, 0);

    // Only start playing once at least 8 seconds of audio is buffered,
    // or when the stream has finished (so the tail always gets played).
    if (totalDuration < 8 && !isComplete) {
      return;
    }
    if (isPlaying || audioQueue.length === 0) {
      return;
    }

    isPlaying = true;
    const audioBuffer = audioQueue.shift();
    const audioSource = audioContext.createBufferSource();
    audioSource.buffer = audioBuffer;
    audioSource.connect(audioContext.destination);
    audioSource.start(audioContext.currentTime);
    audioSource.onended = () => {
      // Play the next queued segment as soon as this one finishes.
      isPlaying = false;
      playAudioQueue();
    };
  }

  // Give the server a head start before the first fetch.
  await new Promise(resolve => setTimeout(resolve, 5000));

  try {
    const initialAudioData = await fetchAudio(audioUrl);
    const initialSegment = createNewAudioSegment(initialAudioData);
    if (initialSegment) {
      audioQueue.push(initialSegment);
      playAudioQueue();
    }

    // Poll every 5 seconds for newly generated audio.
    fetchInterval = setInterval(async () => {
      isComplete = await fetchStatus(statusUrl);
      if (isComplete) {
        clearInterval(fetchInterval);
        // Fetch one last time to pick up any remaining audio data.
        const finalAudioData = await fetchAudio(audioUrl);
        const finalSegment = createNewAudioSegment(finalAudioData);
        if (finalSegment) {
          audioQueue.push(finalSegment);
        }
        // Explicitly call playAudioQueue to make sure the final segment is played.
        playAudioQueue();
        return;
      }
      const newAudioData = await fetchAudio(audioUrl);
      const newSegment = createNewAudioSegment(newAudioData);
      if (newSegment) {
        audioQueue.push(newSegment);
        playAudioQueue();
      }
    }, 5000);
  } catch (error) {
    console.error(`Error streaming audio: ${error}`);
  }
}
You can use a ReadableStream as the response type from the backend, and then there’s no need to re-fetch anything: just plain new Audio(link), where link is the API endpoint returning your readable stream.
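For example, a minimal sketch of that idea, using Flask purely as a stand-in backend (this isn’t Anvil code; the endpoint path, the source of the audio bytes and the chunk size are just illustrative):

from flask import Flask, Response

app = Flask(__name__)

def tts_chunks():
    # Placeholder: in reality, yield MP3 bytes as the TTS service produces them.
    # Here we just stream an existing file in small chunks to illustrate.
    with open("speech.mp3", "rb") as f:
        while chunk := f.read(4096):
            yield chunk

@app.route("/stream-audio")
def stream_audio():
    # Returning a generator makes Flask send a chunked response, so the
    # browser can start playing before generation has finished.
    return Response(tts_chunks(), mimetype="audio/mpeg")

# On the client, playback is then just:  new Audio("/stream-audio").play()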