// Listing metadata: 648 lines, 20 KiB, TypeScript.
import { useCallback, useEffect, useRef, useState } from "react";
|
|
import config from "@/config/config";
|
|
import type { SpeechState } from "./GlobalChatbox.types";
|
|
|
|
/**
 * JSON body parsed from the `/api/generate-stream/start` response.
 * Every field is optional; `speak` supplies a fallback for each missing one.
 */
type AudioStreamStartResponse = Partial<{
  /** Identifier of the created stream; used to build the fallback URLs. */
  stream_id: string;
  /** URL or service-relative path of the PCM audio endpoint. */
  audio_url: string;
  /** URL or service-relative path of the status endpoint. */
  status_url: string;
  /** URL or service-relative path of the result endpoint. */
  result_url: string;
  /** Sample rate of the streamed audio. */
  sample_rate: number;
  /** Number of interleaved channels in the streamed audio. */
  channels: number;
  /** Error message reported by the service. */
  error: string;
}>;
|
|
|
|
/**
 * JSON body parsed from the stream status endpoint while polling.
 * All fields are optional; the poller checks both the boolean flags and `state`.
 */
type AudioStreamStatusResponse = Partial<{
  /** Lifecycle state reported by the service. */
  state: "starting" | "running" | "done" | "failed" | "closed";
  /** Treated like `state === "done"` by the poller. */
  ready: boolean;
  /** Treated like `state === "failed"` by the poller. */
  failed: boolean;
  /** Treated like `state === "closed"` by the poller. */
  closed: boolean;
  /** Status text; logged when a failure carries no `error`. */
  status_text: string;
  /** Error message reported by the service. */
  error: string;
}>;
|
|
|
|
/** JSON body parsed from the stream result endpoint. */
type AudioStreamResultResponse = Partial<{
  /** Run status string as reported by the service (not interpreted in this file). */
  run_status: string;
  /** Error message; when present the result is treated as a failure. */
  error: string;
}>;
|
|
|
|
// WebKit Speech Recognition compatibility

/** Minimal shape of the event passed to `SpeechRecognition.onresult`. */
interface SpeechRecognitionEvent extends Event {
  /** All results delivered so far in the session. */
  readonly results: SpeechRecognitionResultList;
  /** Index into `results` from which the handler in this file starts reading. */
  readonly resultIndex: number;
}
|
|
|
|
/** Minimal typing of the (WebKit-prefixed) speech recognition object used by this file. */
interface SpeechRecognition extends EventTarget {
  // Session control.
  start(): void;
  stop(): void;
  abort(): void;

  // Configuration set before a session is started.
  lang: string;
  continuous: boolean;
  interimResults: boolean;

  // Event handler slots.
  onresult: ((event: SpeechRecognitionEvent) => void) | null;
  onerror: ((event: Event) => void) | null;
  onend: (() => void) | null;
}
|
|
|
|
/** Constructor shape shared by the standard and the WebKit-prefixed recognition globals. */
type SpeechRecognitionConstructor = {
  new (): SpeechRecognition;
  prototype: SpeechRecognition;
};

declare global {
  /** Optional browser globals this module probes for at runtime. */
  interface Window {
    SpeechRecognition?: SpeechRecognitionConstructor;
    webkitSpeechRecognition?: SpeechRecognitionConstructor;
    webkitAudioContext?: typeof AudioContext;
  }
}
|
|
|
|
export function useSpeechSynthesis() {
|
|
// UI-facing state: playback status and the id of the message being spoken.
const [speechState, setSpeechState] = useState<SpeechState>("idle");
const [speakingMessageId, setSpeakingMessageId] = useState<string | null>(null);

// Web Audio / fetch resources of the playback in progress.
const audioContextRef = useRef<AudioContext | null>(null);
const streamAbortControllerRef = useRef<AbortController | null>(null);
const activeSourceNodesRef = useRef(new Set<AudioBufferSourceNode>());

// Identity and service endpoints of the stream in progress.
const streamIdRef = useRef<string | null>(null);
const closeUrlRef = useRef<string | null>(null);
const statusUrlRef = useRef<string | null>(null);
const resultUrlRef = useRef<string | null>(null);

// Handle of the pending status-poll timer, if any.
const statusPollTimeoutRef = useRef<number | null>(null);

// Incremented whenever playback is started or stopped; async work compares
// its captured value against this to detect that it has been superseded.
const playbackTokenRef = useRef<number>(0);
|
|
|
|
// Playback is available only in a browser that has FormData and at least one
// of the standard / WebKit-prefixed AudioContext constructors.
const isSupported =
  typeof window !== "undefined" &&
  typeof window.FormData !== "undefined" &&
  !(
    typeof window.AudioContext === "undefined" &&
    typeof window.webkitAudioContext === "undefined"
  );
|
|
|
|
// Removes every "/" at the end of `value`.
const trimTrailingSlash = useCallback((value: string) => {
  return value.replace(/\/+$/, "");
}, []);
|
|
|
|
// Joins the configured audio service base URL and `path` with exactly one "/"
// between them.
const buildServiceUrl = useCallback(
  (path: string) => {
    const base = trimTrailingSlash(config.AUDIO_SERVICE_URL);
    const suffix = path.startsWith("/") ? path : `/${path}`;
    return `${base}${suffix}`;
  },
  [trimTrailingSlash],
);
|
|
|
|
// Returns http(s) URLs untouched; anything else is treated as a path on the
// audio service.
const resolveServiceUrl = useCallback(
  (pathOrUrl: string) => {
    const isAbsolute = /^https?:\/\//i.test(pathOrUrl);
    return isAbsolute ? pathOrUrl : buildServiceUrl(pathOrUrl);
  },
  [buildServiceUrl],
);
|
|
|
|
/**
 * Returns `urlString` with every entry of `params` set as a query parameter
 * (existing parameters with the same name are overwritten).
 *
 * Relative inputs (which `buildServiceUrl` produces when the configured
 * service URL is empty or a path) are resolved against the current page URL;
 * without a base `new URL` would throw a TypeError for them. The result is
 * therefore always an absolute URL string.
 */
const withQueryParams = useCallback(
  (urlString: string, params: Record<string, string>) => {
    // `undefined` as base keeps the single-argument behaviour outside a browser.
    const base = typeof window !== "undefined" ? window.location.href : undefined;
    const url = new URL(urlString, base);
    for (const [key, value] of Object.entries(params)) {
      url.searchParams.set(key, value);
    }
    return url.toString();
  },
  [],
);
|
|
|
|
// Extracts a human-readable message from a failed response: the JSON body's
// `error`, else its `message`, else `fallback`. A body that cannot be parsed
// as JSON (or is not an object) also yields `fallback`.
const readErrorMessage = useCallback(async (response: Response, fallback: string) => {
  try {
    const { error, message } = (await response.json()) as {
      error?: string;
      message?: string;
    };
    return error || message || fallback;
  } catch {
    return fallback;
  }
}, []);
|
|
|
|
// POSTs to the stream's close endpoint. A non-OK response is only logged;
// a network failure rejects the returned promise.
const closeStream = useCallback(async (closeUrl: string) => {
  const { ok } = await fetch(closeUrl, { method: "POST" });
  if (ok) return;

  console.error("[GlobalChatbox] Failed to close audio stream:", closeUrl);
}, []);
|
|
|
|
// Cancels the pending status-poll timer, if there is one.
const stopStatusPolling = useCallback(() => {
  const pendingTimer = statusPollTimeoutRef.current;
  if (pendingTimer === null) return;

  window.clearTimeout(pendingTimer);
  statusPollTimeoutRef.current = null;
}, []);
|
|
|
|
// Fetches the stream result.
// Resolves to `false` while the service answers 202 (result not ready yet) and
// to `true` once a result without an `error` field is returned.
// Throws on any other non-OK status and when the result carries an `error`.
const fetchStreamResult = useCallback(
  async (resultUrl: string) => {
    const response = await fetch(resultUrl);

    if (response.status === 202) return false;

    if (!response.ok) {
      const message = await readErrorMessage(
        response,
        `Audio stream result failed with status ${response.status}`,
      );
      throw new Error(message);
    }

    const payload = (await response.json()) as AudioStreamResultResponse;
    if (payload.error) throw new Error(payload.error);

    return true;
  },
  [readErrorMessage],
);
|
|
|
|
// Releases every local audio resource of the current playback: aborts the
// audio fetch, stops and disconnects all scheduled source nodes, and closes
// the AudioContext. Refs are cleared before the corresponding resource is
// released. Does not touch stream URLs or React state.
const clearAudio = useCallback(async () => {
  const pendingFetch = streamAbortControllerRef.current;
  streamAbortControllerRef.current = null;
  pendingFetch?.abort();

  for (const node of activeSourceNodesRef.current) {
    try {
      // Drop the handler first so stopping does not run the onended bookkeeping.
      node.onended = null;
      node.stop();
    } catch {
      // ignore stop errors when source already ended
    }
    node.disconnect();
  }
  activeSourceNodesRef.current.clear();

  const context = audioContextRef.current;
  audioContextRef.current = null;
  if (context) {
    try {
      await context.close();
    } catch {
      // ignore close errors when context already closed
    }
  }
}, []);
|
|
|
|
/**
 * Streams 16-bit little-endian interleaved PCM from `audioUrl` and plays it
 * through a freshly created AudioContext.
 *
 * Resolves once the network stream has ended AND every scheduled chunk has
 * finished playing. Rejects with:
 *  - `Error` when AudioContext is unavailable, the response is not OK, or it
 *    has no body;
 *  - `DOMException("AbortError")` when `playbackToken` no longer matches
 *    `playbackTokenRef`, or when the abort controller registered in
 *    `streamAbortControllerRef` is aborted. This includes the final "waiting
 *    for queued audio to finish" phase, where the abort handler is the only
 *    thing that can settle the promise because `clearAudio` removes the
 *    `onended` handlers.
 *
 * Side effects: registers the abort controller, the AudioContext and every
 * source node in the hook's refs so `clearAudio` can tear them down.
 */
const playPcmStream = useCallback(
  async ({
    audioUrl,
    sampleRate,
    channels,
    playbackToken,
  }: {
    audioUrl: string;
    sampleRate: number;
    channels: number;
    playbackToken: number;
  }) => {
    const AudioContextCtor = window.AudioContext ?? window.webkitAudioContext;
    if (!AudioContextCtor) {
      throw new Error("WebAudio AudioContext is not available in this browser");
    }

    const createCancelError = () =>
      new DOMException("PCM stream playback cancelled", "AbortError");

    const abortController = new AbortController();
    streamAbortControllerRef.current = abortController;

    const response = await fetch(withQueryParams(audioUrl, { format: "pcm" }), {
      signal: abortController.signal,
    });
    if (!response.ok) {
      throw new Error(
        await readErrorMessage(response, `Audio stream failed with status ${response.status}`),
      );
    }
    if (!response.body) {
      throw new Error("Audio stream response body is missing");
    }

    // Do not create an AudioContext for a playback that was cancelled while
    // the request was in flight.
    if (abortController.signal.aborted || playbackToken !== playbackTokenRef.current) {
      throw createCancelError();
    }

    const audioContext = new AudioContextCtor({ sampleRate });
    audioContextRef.current = audioContext;

    const reader = response.body.getReader();
    // Whole number of channels, at least one; a fractional value would make
    // the frame size and the de-interleaving loop disagree.
    const channelCount = Math.max(1, Math.floor(channels) || 1);
    const bytesPerFrame = channelCount * 2; // 2 bytes per 16-bit sample
    // Bytes of an incomplete frame carried over to the next network chunk.
    let bufferedRemainder: Uint8Array = new Uint8Array(0);
    // Small lead so the first chunk is not scheduled in the past.
    let nextStartTime = audioContext.currentTime + 0.05;
    let activeSources = 0;
    let streamEnded = false;
    let resolvePlaybackDrain: (() => void) | null = null;

    const maybeResolvePlaybackDrain = () => {
      if (streamEnded && activeSources === 0) {
        resolvePlaybackDrain?.();
      }
    };

    // Converts one frame-aligned PCM chunk into an AudioBuffer and queues it
    // right after the previously queued chunk.
    const schedulePcmChunk = (pcmBytes: Uint8Array) => {
      const frameCount = pcmBytes.byteLength / bytesPerFrame;
      if (frameCount <= 0) return;

      const buffer = audioContext.createBuffer(channelCount, frameCount, sampleRate);
      const view = new DataView(pcmBytes.buffer, pcmBytes.byteOffset, pcmBytes.byteLength);
      for (let channel = 0; channel < channelCount; channel += 1) {
        // Fetch the channel's Float32Array once instead of once per sample.
        const samples = buffer.getChannelData(channel);
        for (let frame = 0; frame < frameCount; frame += 1) {
          const sampleIndex = frame * channelCount + channel;
          // Map signed 16-bit PCM to the [-1, 1) float range.
          samples[frame] = view.getInt16(sampleIndex * 2, true) / 32768;
        }
      }

      const source = audioContext.createBufferSource();
      source.buffer = buffer;
      source.connect(audioContext.destination);
      // Never schedule in the past: if the network fell behind, restart
      // slightly ahead of "now".
      const sourceStartTime = Math.max(nextStartTime, audioContext.currentTime + 0.01);
      nextStartTime = sourceStartTime + buffer.duration;

      activeSources += 1;
      activeSourceNodesRef.current.add(source);
      source.onended = () => {
        activeSources -= 1;
        activeSourceNodesRef.current.delete(source);
        source.disconnect();
        maybeResolvePlaybackDrain();
      };
      source.start(sourceStartTime);
    };

    const concatUint8Arrays = (a: Uint8Array, b: Uint8Array) => {
      if (a.byteLength === 0) return b;
      if (b.byteLength === 0) return a;
      const merged = new Uint8Array(a.byteLength + b.byteLength);
      merged.set(a);
      merged.set(b, a.byteLength);
      return merged;
    };

    while (true) {
      if (playbackToken !== playbackTokenRef.current) {
        throw createCancelError();
      }

      const { done, value } = await reader.read();
      if (done) break;
      if (!value || value.byteLength === 0) continue;

      const merged = concatUint8Arrays(bufferedRemainder, value);
      const alignedByteLength = merged.byteLength - (merged.byteLength % bytesPerFrame);
      if (alignedByteLength > 0) {
        // The samples are copied into the AudioBuffer synchronously, so a
        // view over `merged` is sufficient here.
        schedulePcmChunk(merged.subarray(0, alignedByteLength));
      }
      bufferedRemainder = merged.slice(alignedByteLength);
    }

    // Network stream finished; wait for the queued audio to play out. The
    // abort listener is attached only now so that a cancellation during the
    // read loop (which already rejects via reader.read) cannot produce a
    // second, unhandled rejection.
    streamEnded = true;
    await new Promise<void>((resolve, reject) => {
      const { signal } = abortController;
      const onAbort = () => reject(createCancelError());
      if (signal.aborted) {
        onAbort();
        return;
      }
      signal.addEventListener("abort", onAbort, { once: true });
      resolvePlaybackDrain = () => {
        signal.removeEventListener("abort", onAbort);
        resolve();
      };
      maybeResolvePlaybackDrain();
    });
  },
  [readErrorMessage, withQueryParams],
);
|
|
|
|
// Full teardown of the current playback: releases audio resources, cancels
// status polling, forgets the stream, resets the UI state to idle, and finally
// asks the service to close the stream (a failure there is only logged).
// Callers bump `playbackTokenRef` themselves before invoking this.
const stopPlayback = useCallback(async () => {
  await clearAudio();
  stopStatusPolling();

  const pendingCloseUrl = closeUrlRef.current;
  streamIdRef.current = null;
  closeUrlRef.current = null;
  statusUrlRef.current = null;
  resultUrlRef.current = null;
  setSpeechState("idle");
  setSpeakingMessageId(null);

  if (!pendingCloseUrl) return;

  await closeStream(pendingCloseUrl).catch((error: unknown) => {
    console.error("[GlobalChatbox] Failed to close audio stream:", error);
  });
}, [clearAudio, closeStream, stopStatusPolling]);
|
|
|
|
// Schedules one status check for the given stream one second from now,
// replacing any pending check. Each check re-schedules itself until the
// stream is reported closed, its result has been fetched, it is reported
// failed (which stops playback), or the playback it belongs to is no longer
// the current one.
const pollStreamStatus = useCallback(
  (playbackToken: number, statusUrl: string, resultUrl: string) => {
    stopStatusPolling();

    // True while this poll still belongs to the active playback and stream.
    const isStillCurrent = () =>
      playbackToken === playbackTokenRef.current &&
      statusUrlRef.current === statusUrl &&
      resultUrlRef.current === resultUrl;

    const runPoll = async () => {
      if (!isStillCurrent()) return;

      try {
        const response = await fetch(statusUrl);
        if (!response.ok) {
          const message = await readErrorMessage(
            response,
            `Audio stream status failed with status ${response.status}`,
          );
          throw new Error(message);
        }

        const payload = (await response.json()) as AudioStreamStatusResponse;
        // The playback may have changed while the request was in flight.
        if (!isStillCurrent()) return;

        const hasFailed = payload.failed || payload.state === "failed";
        if (hasFailed) {
          console.error(
            "[GlobalChatbox] Audio stream failed:",
            payload.error || payload.status_text || statusUrl,
          );
          playbackTokenRef.current += 1;
          void stopPlayback();
          return;
        }

        const isClosed = payload.closed || payload.state === "closed";
        if (isClosed) {
          stopStatusPolling();
          return;
        }

        const isDone = payload.ready || payload.state === "done";
        if (isDone) {
          try {
            if (await fetchStreamResult(resultUrl)) {
              stopStatusPolling();
              return;
            }
          } catch (error) {
            // A failed result fetch is logged; polling continues below.
            console.error("[GlobalChatbox] Failed to fetch audio stream result:", error);
          }
        }

        pollStreamStatus(playbackToken, statusUrl, resultUrl);
      } catch (error) {
        if (!isStillCurrent()) return;

        console.error("[GlobalChatbox] Failed to poll audio stream status:", error);
        pollStreamStatus(playbackToken, statusUrl, resultUrl);
      }
    };

    statusPollTimeoutRef.current = window.setTimeout(runPoll, 1000);
  },
  [fetchStreamResult, readErrorMessage, stopPlayback, stopStatusPolling],
);
|
|
|
|
// Public "stop" action: invalidates all in-flight async work by bumping the
// playback token, then tears the playback down without waiting for it.
const stop = useCallback(() => {
  playbackTokenRef.current = playbackTokenRef.current + 1;
  void stopPlayback();
}, [stopPlayback]);
|
|
|
|
/**
 * Starts speaking `text` for the message `messageId`, replacing any playback
 * in progress. Does nothing when playback is unsupported or the trimmed text
 * is empty.
 *
 * Flow: stop the previous playback, POST the text to the start endpoint,
 * derive the stream URLs, start status polling, stream and play the PCM audio,
 * then release audio resources, fetch the result and close the stream.
 *
 * Every call takes a fresh playback token. Once the token no longer matches
 * `playbackTokenRef` the call is "superseded" (stopped or replaced). A
 * superseded call never touches the shared refs, audio resources or React
 * state, because those now belong to whoever bumped the token, and that party
 * has already run `stopPlayback`.
 *
 * Never rejects: failures are logged and the state is reset to idle.
 */
const speak = useCallback(
  async (messageId: string, text: string) => {
    const normalizedText = text.trim();
    if (!isSupported || !normalizedText) return;

    const playbackToken = playbackTokenRef.current + 1;
    playbackTokenRef.current = playbackToken;
    const isSuperseded = () => playbackToken !== playbackTokenRef.current;

    await stopPlayback();
    // A stop()/speak() issued while the previous playback was being torn down
    // wins; do not flip the UI to "playing" for a request nobody wants anymore.
    if (isSuperseded()) return;

    setSpeakingMessageId(messageId);
    setSpeechState("playing");

    try {
      const formData = new FormData();
      formData.append("text", normalizedText);
      formData.append("demo_id", "demo-1");

      const response = await fetch(buildServiceUrl("/api/generate-stream/start"), {
        method: "POST",
        body: formData,
      });

      if (!response.ok) {
        throw new Error(
          await readErrorMessage(
            response,
            `Audio stream start failed with status ${response.status}`,
          ),
        );
      }

      const payload = (await response.json()) as AudioStreamStartResponse;
      const streamId = payload.stream_id;
      // Validate before deriving any URL from the id.
      if (!streamId) {
        throw new Error(payload.error || "Audio stream start response is missing stream_id");
      }

      const sampleRate =
        typeof payload.sample_rate === "number" && payload.sample_rate > 0
          ? payload.sample_rate
          : 24000;
      const channels =
        typeof payload.channels === "number" && payload.channels > 0 ? payload.channels : 1;

      const streamPath = `/api/generate-stream/${encodeURIComponent(streamId)}`;
      const audioUrl = payload.audio_url
        ? resolveServiceUrl(payload.audio_url)
        : buildServiceUrl(`${streamPath}/audio?format=pcm`);
      const rawStatusUrl = payload.status_url
        ? resolveServiceUrl(payload.status_url)
        : buildServiceUrl(`${streamPath}/status`);
      const statusUrl = withQueryParams(rawStatusUrl, { compact: "1" });
      const rawResultUrl = payload.result_url
        ? resolveServiceUrl(payload.result_url)
        : buildServiceUrl(`${streamPath}/result`);
      const resultUrl = withQueryParams(rawResultUrl, {
        compact: "1",
        include_audio: "0",
      });
      const closeUrl = buildServiceUrl(`${streamPath}/close`);

      if (isSuperseded()) {
        // The stream was created for a request that is no longer wanted and
        // was never registered in the refs, so nobody else will close it.
        try {
          await closeStream(closeUrl);
        } catch (closeError) {
          console.error("[GlobalChatbox] Failed to close audio stream:", closeError);
        }
        return;
      }

      streamIdRef.current = streamId;
      closeUrlRef.current = closeUrl;
      statusUrlRef.current = statusUrl;
      resultUrlRef.current = resultUrl;

      pollStreamStatus(playbackToken, statusUrl, resultUrl);
      await playPcmStream({
        audioUrl,
        sampleRate,
        channels,
        playbackToken,
      });

      if (isSuperseded()) {
        return;
      }

      await clearAudio();
      if (streamIdRef.current === streamId) {
        streamIdRef.current = null;
        closeUrlRef.current = null;
        statusUrlRef.current = null;
        resultUrlRef.current = null;
        setSpeechState("idle");
        setSpeakingMessageId(null);
        // Only cancel polling that belongs to this stream; if the refs were
        // changed meanwhile, whoever changed them manages the poll timer.
        stopStatusPolling();
      }
      await fetchStreamResult(resultUrl).catch((error: unknown) => {
        console.error("[GlobalChatbox] Failed to fetch audio stream result:", error);
      });
      // Playback already succeeded: a failing close is a cleanup problem, not
      // a playback failure, so report it as such.
      try {
        await closeStream(closeUrl);
      } catch (closeError) {
        console.error("[GlobalChatbox] Failed to close audio stream:", closeError);
      }
    } catch (error) {
      if (isSuperseded()) {
        // Shared refs, audio resources and UI state belong to a newer
        // playback (or were already reset by stop); leave them alone.
        // Cancellation itself is expected and not worth logging.
        if (!(error instanceof DOMException && error.name === "AbortError")) {
          console.error("[GlobalChatbox] Failed to play audio stream:", error);
        }
        return;
      }

      await clearAudio();
      if (error instanceof DOMException && error.name === "AbortError" && isSuperseded()) {
        // Superseded while the audio resources were being released.
        return;
      }
      const closeUrl = closeUrlRef.current;
      streamIdRef.current = null;
      closeUrlRef.current = null;
      statusUrlRef.current = null;
      resultUrlRef.current = null;
      setSpeechState("idle");
      setSpeakingMessageId(null);
      if (closeUrl) {
        try {
          await closeStream(closeUrl);
        } catch (closeError) {
          console.error("[GlobalChatbox] Failed to close audio stream:", closeError);
        }
      }
      console.error("[GlobalChatbox] Failed to play audio stream:", error);
    }
  },
  [
    buildServiceUrl,
    clearAudio,
    closeStream,
    fetchStreamResult,
    isSupported,
    playPcmStream,
    readErrorMessage,
    resolveServiceUrl,
    pollStreamStatus,
    stopPlayback,
    stopStatusPolling,
    withQueryParams,
  ],
);
|
|
|
|
/**
 * Pauses playback by suspending the current AudioContext and, once that
 * succeeded, switches the state to "paused". No-op when unsupported or when
 * nothing is playing.
 *
 * The state is only updated if the suspended context is still the current
 * one; otherwise playback was stopped or replaced while `suspend()` was
 * pending and "paused" would describe a playback that no longer exists.
 */
const pause = useCallback(() => {
  const audioContext = audioContextRef.current;
  if (!isSupported || !audioContext) return;

  void audioContext.suspend().then(
    () => {
      if (audioContextRef.current === audioContext) {
        setSpeechState("paused");
      }
    },
    (error: unknown) => {
      console.error("[GlobalChatbox] Failed to pause PCM playback:", error);
    },
  );
}, [isSupported]);
|
|
|
|
/**
 * Resumes a paused playback by resuming the current AudioContext and, once
 * that succeeded, switches the state back to "playing". If resuming fails the
 * playback is stopped. No-op when unsupported or when nothing is loaded.
 *
 * Both outcomes are applied only while the resumed context is still the
 * current one, so a late result cannot relabel or stop a playback that was
 * started after this call.
 */
const resume = useCallback(() => {
  const audioContext = audioContextRef.current;
  if (!isSupported || !audioContext) return;

  void audioContext.resume().then(
    () => {
      if (audioContextRef.current === audioContext) {
        setSpeechState("playing");
      }
    },
    (error: unknown) => {
      if (audioContextRef.current === audioContext) {
        playbackTokenRef.current += 1;
        void stopPlayback();
      }
      console.error("[GlobalChatbox] Failed to resume audio playback:", error);
    },
  );
}, [isSupported, stopPlayback]);
|
|
|
|
// Cleanup for this effect (runs on unmount and whenever `stopPlayback`
// changes): invalidate in-flight work and tear down the current playback.
useEffect(() => {
  const teardown = () => {
    playbackTokenRef.current += 1;
    void stopPlayback();
  };
  return teardown;
}, [stopPlayback]);
|
|
|
|
return { speechState, speakingMessageId, speak, pause, resume, stop, isSupported };
|
|
}
|
|
|
|
/**
 * React hook around the browser speech recognition API (standard or
 * WebKit-prefixed).
 *
 * @param onResult Called with the transcript of every final result. The
 *   latest callback is always used, without restarting a running session.
 * @param lang Recognition language tag. Defaults to "zh-CN".
 * @returns `isListening` (a session is running), `start` / `stop` controls,
 *   and `isSupported` (the browser exposes a recognition constructor).
 */
export function useSpeechRecognition(onResult: (text: string) => void, lang = "zh-CN") {
  const [isListening, setIsListening] = useState(false);
  const recognitionRef = useRef<SpeechRecognition | null>(null);
  const onResultRef = useRef(onResult);
  useEffect(() => {
    onResultRef.current = onResult;
  }, [onResult]);

  const isSupported =
    typeof window !== "undefined" &&
    ("SpeechRecognition" in window || "webkitSpeechRecognition" in window);

  const start = useCallback(() => {
    if (!isSupported || recognitionRef.current) return;
    const Ctor = window.SpeechRecognition ?? window.webkitSpeechRecognition;
    if (!Ctor) return;

    const recognition = new Ctor();
    recognition.lang = lang;
    recognition.continuous = true;
    recognition.interimResults = false;

    recognition.onresult = (event: SpeechRecognitionEvent) => {
      for (let i = event.resultIndex; i < event.results.length; i++) {
        const result = event.results[i];
        const bestAlternative = result?.isFinal ? result[0] : undefined;
        if (bestAlternative) {
          onResultRef.current(bestAlternative.transcript);
        }
      }
    };

    // Shared by onerror and onend. Ignored when it comes from a session that
    // is no longer the current one (already stopped, or replaced by a newer
    // start), so a late event cannot clear the newer session's ref.
    const handleSessionEnd = () => {
      if (recognitionRef.current !== recognition) return;
      recognitionRef.current = null;
      setIsListening(false);
    };
    recognition.onerror = handleSessionEnd;
    recognition.onend = handleSessionEnd;

    recognitionRef.current = recognition;
    try {
      recognition.start();
    } catch (error) {
      // Without this reset a failed start would leave the ref set and every
      // later start() call would return early.
      recognitionRef.current = null;
      console.error("[GlobalChatbox] Failed to start speech recognition:", error);
      return;
    }
    setIsListening(true);
  }, [isSupported, lang]);

  const stop = useCallback(() => {
    recognitionRef.current?.stop();
    recognitionRef.current = null;
    setIsListening(false);
  }, []);

  useEffect(() => {
    return () => {
      const recognition = recognitionRef.current;
      if (!recognition) return;
      recognitionRef.current = null;
      // Detach handlers so nothing calls back into the unmounted component.
      recognition.onresult = null;
      recognition.onerror = null;
      recognition.onend = null;
      recognition.abort();
    };
  }, []);

  return { isListening, start, stop, isSupported };
}
|