添加语音识别和朗读功能

This commit is contained in:
2026-04-02 15:24:05 +08:00
parent adc12c13f9
commit 295c959b52
+311 -3
View File
@@ -32,11 +32,40 @@ import StopRounded from "@mui/icons-material/StopRounded";
import AutoAwesome from "@mui/icons-material/AutoAwesome"; // Sparkle icon for AI
import ErrorOutlineRounded from "@mui/icons-material/ErrorOutlineRounded";
import AddCommentRounded from "@mui/icons-material/AddCommentRounded";
import VolumeUpRounded from "@mui/icons-material/VolumeUpRounded";
import PauseRounded from "@mui/icons-material/PauseRounded";
import PlayArrowRounded from "@mui/icons-material/PlayArrowRounded";
import MicRounded from "@mui/icons-material/MicRounded";
// Logic
import { streamCopilotChat } from "@/lib/chatStream";
import { parseAssistantMessageSections } from "./chatMessageSections";
// WebKit Speech Recognition compatibility
/**
 * Minimal local typing for the event passed to `SpeechRecognition.onresult`.
 * Only the two members read by the recognition hook in this file are declared.
 */
interface SpeechRecognitionEvent extends Event {
// Index from which the `onresult` handler in this file starts scanning `results`.
readonly resultIndex: number;
// Result list; each entry is checked for `isFinal` before its first alternative's transcript is used.
readonly results: SpeechRecognitionResultList;
}
/**
 * Minimal local typing for a speech recognizer instance, covering the
 * properties, callbacks and methods declared below.
 */
interface SpeechRecognition extends EventTarget {
// Recognition language tag (the hook in this file sets "zh-CN").
lang: string;
// The hook in this file sets this to true.
continuous: boolean;
// The hook in this file sets this to false and only consumes results flagged `isFinal`.
interimResults: boolean;
// Callback receiving recognition results.
onresult: ((event: SpeechRecognitionEvent) => void) | null;
// Callback invoked on a recognition error.
onerror: ((event: Event) => void) | null;
// Callback invoked when the recognition session ends.
onend: (() => void) | null;
start(): void;
stop(): void;
abort(): void;
}
// Augment `Window` with the optional standard and `webkit`-prefixed recognizer
// constructors. Both are optional, so callers must feature-detect before use.
declare global {
interface Window {
SpeechRecognition?: { new (): SpeechRecognition; prototype: SpeechRecognition };
webkitSpeechRecognition?: { new (): SpeechRecognition; prototype: SpeechRecognition };
}
}
// Types
type Message = {
id: string;
@@ -59,6 +88,26 @@ const normalizeThoughtTagToken = (token: string): string =>
closingSlash ? "</think>" : "<think>",
);
/** Text-to-speech playback state: nothing playing, currently speaking, or paused mid-utterance. */
type SpeechState = "idle" | "playing" | "paused";
/**
 * Converts Markdown into plain text suitable for text-to-speech.
 *
 * Fenced code blocks and images are dropped entirely; inline code, links and
 * emphasis keep only their visible text. Block-level markers (blockquote `>`,
 * `#` headings, `-`/`*`/`+` bullets, `1.` ordered items) are removed only when
 * they start a line, so ordinary prose such as "a - b", "x > 1", "C# is" or
 * "Node 18. It" is left intact.
 *
 * @param md Markdown source text.
 * @returns The text with Markdown/HTML markup removed, blank-line runs
 *   collapsed to a single newline, and surrounding whitespace trimmed.
 */
const stripMarkdown = (md: string): string =>
  md
    // Fenced code blocks are not worth reading aloud.
    .replace(/```[\s\S]*?```/g, "")
    .replace(/`([^`]+)`/g, "$1")
    // Images before links: the image syntax contains the link syntax.
    .replace(/!\[.*?\]\(.*?\)/g, "")
    .replace(/\[([^\]]+)\]\(.*?\)/g, "$1")
    // Block-level markers, anchored to line start. Blockquotes go first so a
    // quoted heading or list item ("> - item") is unwrapped and then handled.
    .replace(/^[ \t]*(?:>[ \t]?)+/gm, "")
    .replace(/^[ \t]*#{1,6}[ \t]+/gm, "")
    // Bullets are stripped before emphasis so a leading "* " is not mistaken
    // for the opening marker of an italic span.
    .replace(/^[ \t]*[-*+][ \t]+/gm, "")
    .replace(/^[ \t]*\d+\.[ \t]+/gm, "")
    // Inline emphasis, longest marker first.
    .replace(/\*\*\*(.+?)\*\*\*/g, "$1")
    .replace(/\*\*(.+?)\*\*/g, "$1")
    .replace(/\*(.+?)\*/g, "$1")
    .replace(/~~(.+?)~~/g, "$1")
    .replace(/<[^>]+>/g, "")
    // Collapse last so blank lines left behind by removed markup are merged too.
    .replace(/\n{2,}/g, "\n")
    .trim();
type PersistedChatState = {
messages: Message[];
conversationId?: string;
@@ -150,10 +199,16 @@ const Blob = ({ color, size, top, left, delay }: { color: string; size: number;
/** Props for a single rendered chat message, including its read-aloud controls. */
type ChatMessageItemProps = {
message: Message;
theme: Theme;
// Speech state for this specific message; the parent passes "idle" unless this message is the one being spoken.
messageSpeechState: SpeechState;
// Starts reading a message aloud; invoked with the message id and its markdown-stripped answer text.
onSpeak: (messageId: string, text: string) => void;
// Wired to the pause button shown while this message is "playing".
onPause: () => void;
// Wired to the resume button shown while this message is "paused".
onResume: () => void;
// Wired to the stop button shown in both the "playing" and "paused" states.
onStopSpeech: () => void;
// When false, the read-aloud controls are not rendered at all.
isTtsSupported: boolean;
};
const ChatMessageItem = React.memo(
({ message, theme }: ChatMessageItemProps) => {
({ message, theme, messageSpeechState, onSpeak, onPause, onResume, onStopSpeech, isTtsSupported }: ChatMessageItemProps) => {
const isUser = message.role === "user";
const isErrorMessage = Boolean(message.isError);
const parsedAssistantSections =
@@ -187,6 +242,7 @@ const ChatMessageItem = React.memo(
</Avatar>
)}
<Box>
<Paper
elevation={isUser ? 8 : isErrorMessage ? 1 : 2}
sx={{
@@ -278,12 +334,194 @@ const ChatMessageItem = React.memo(
<ReactMarkdown remarkPlugins={[remarkGfm]}>{answerContent || "..."}</ReactMarkdown>
</div>
</Paper>
{!isUser && !isErrorMessage && isTtsSupported && (
<Stack direction="row" spacing={0.5} sx={{ mt: 0.5, ml: 0.5 }}>
{messageSpeechState === "idle" && (
<IconButton
size="small"
onClick={() => onSpeak(message.id, stripMarkdown(answerContent))}
aria-label="朗读消息"
sx={{ color: "text.secondary", opacity: 0.6, "&:hover": { opacity: 1 }, p: 0.5 }}
>
<VolumeUpRounded sx={{ fontSize: 16 }} />
</IconButton>
)}
{messageSpeechState === "playing" && (
<>
<IconButton
size="small"
onClick={onPause}
aria-label="暂停朗读"
sx={{ color: "primary.main", p: 0.5 }}
>
<PauseRounded sx={{ fontSize: 16 }} />
</IconButton>
<IconButton
size="small"
onClick={onStopSpeech}
aria-label="停止朗读"
sx={{ color: "error.main", p: 0.5 }}
>
<StopRounded sx={{ fontSize: 16 }} />
</IconButton>
</>
)}
{messageSpeechState === "paused" && (
<>
<IconButton
size="small"
onClick={onResume}
aria-label="继续朗读"
sx={{ color: "primary.main", p: 0.5 }}
>
<PlayArrowRounded sx={{ fontSize: 16 }} />
</IconButton>
<IconButton
size="small"
onClick={onStopSpeech}
aria-label="停止朗读"
sx={{ color: "error.main", p: 0.5 }}
>
<StopRounded sx={{ fontSize: 16 }} />
</IconButton>
</>
)}
</Stack>
)}
</Box>
</motion.div>
);
},
);
ChatMessageItem.displayName = "ChatMessageItem";
// --- Voice Hooks ---
/**
 * React hook wrapping the browser `speechSynthesis` API for reading one chat
 * message aloud at a time.
 *
 * @returns
 *  - `speechState`: "idle" | "playing" | "paused" for the active utterance.
 *  - `speakingMessageId`: id passed to the last `speak` call that is still
 *    active, or `null` when nothing is being spoken.
 *  - `speak(messageId, text)`: cancels any current speech and starts `text`.
 *  - `pause` / `resume` / `stop`: control the active utterance.
 *  - `isSupported`: whether `window.speechSynthesis` exists.
 */
function useSpeechSynthesis() {
  const [speechState, setSpeechState] = useState<SpeechState>("idle");
  const [speakingMessageId, setSpeakingMessageId] = useState<string | null>(null);
  // The utterance whose events are allowed to drive state. Events from any
  // other (already cancelled) utterance are ignored.
  const utteranceRef = useRef<SpeechSynthesisUtterance | null>(null);

  const isSupported = typeof window !== "undefined" && "speechSynthesis" in window;

  const stop = useCallback(() => {
    if (!isSupported) return;
    // Detach before cancelling so the cancelled utterance's end/error event
    // cannot touch state that may by then belong to a newer utterance.
    utteranceRef.current = null;
    window.speechSynthesis.cancel();
    setSpeechState("idle");
    setSpeakingMessageId(null);
  }, [isSupported]);

  const speak = useCallback(
    (messageId: string, text: string) => {
      if (!isSupported || !text) return;

      // Drop whatever is playing. Its end/error event may be delivered after
      // the new utterance has started, hence the identity guards below.
      utteranceRef.current = null;
      window.speechSynthesis.cancel();

      const utterance = new SpeechSynthesisUtterance(text);
      utterance.lang = "zh-CN";
      utterance.rate = 1;

      const isCurrent = (): boolean => utteranceRef.current === utterance;
      const resetIfCurrent = (): void => {
        if (!isCurrent()) return;
        utteranceRef.current = null;
        setSpeechState("idle");
        setSpeakingMessageId(null);
      };

      utterance.onend = resetIfCurrent;
      utterance.onerror = resetIfCurrent;
      utterance.onpause = () => {
        if (isCurrent()) setSpeechState("paused");
      };
      utterance.onresume = () => {
        if (isCurrent()) setSpeechState("playing");
      };

      utteranceRef.current = utterance;
      setSpeakingMessageId(messageId);
      setSpeechState("playing");
      window.speechSynthesis.speak(utterance);
    },
    [isSupported],
  );

  const pause = useCallback(() => {
    if (!isSupported) return;
    window.speechSynthesis.pause();
  }, [isSupported]);

  const resume = useCallback(() => {
    if (!isSupported) return;
    window.speechSynthesis.resume();
  }, [isSupported]);

  // Silence any ongoing speech when the owning component unmounts.
  useEffect(() => {
    return () => {
      // Detach first so handlers fired by cancel() do not set state after unmount.
      utteranceRef.current = null;
      if (typeof window !== "undefined" && "speechSynthesis" in window) {
        window.speechSynthesis.cancel();
      }
    };
  }, []);

  return { speechState, speakingMessageId, speak, pause, resume, stop, isSupported };
}
/**
 * React hook wrapping the browser speech recognizer (`SpeechRecognition` or
 * the `webkit`-prefixed constructor) for dictating text.
 *
 * @param onResult Called with the transcript of every final recognition
 *   result. The latest callback is always used; changing it does not restart
 *   an active session.
 * @returns
 *  - `isListening`: whether a session started by this hook is active.
 *  - `start()`: begins a continuous "zh-CN" session; no-op when unsupported
 *    or when a session is already active.
 *  - `stop()`: ends the active session, if any.
 *  - `isSupported`: whether a recognizer constructor exists on `window`.
 */
function useSpeechRecognition(onResult: (text: string) => void) {
  const [isListening, setIsListening] = useState(false);
  const recognitionRef = useRef<SpeechRecognition | null>(null);
  // Keep the newest callback in a ref so `start` stays referentially stable.
  const onResultRef = useRef(onResult);

  useEffect(() => {
    onResultRef.current = onResult;
  }, [onResult]);

  const isSupported =
    typeof window !== "undefined" &&
    ("SpeechRecognition" in window || "webkitSpeechRecognition" in window);

  const start = useCallback(() => {
    if (!isSupported || recognitionRef.current) return;
    const Ctor = window.SpeechRecognition ?? window.webkitSpeechRecognition;
    if (!Ctor) return;

    const recognition = new Ctor();
    recognition.lang = "zh-CN";
    recognition.continuous = true;
    recognition.interimResults = false;

    recognition.onresult = (event: SpeechRecognitionEvent) => {
      for (let i = event.resultIndex; i < event.results.length; i++) {
        const result = event.results[i];
        if (!result?.isFinal) continue;
        const best = result[0];
        if (best) onResultRef.current(best.transcript);
      }
    };

    // Only the session currently held in the ref may reset shared state. A
    // late end/error event from a previously stopped session must not clear
    // the bookkeeping of a newer one.
    const releaseIfCurrent = (): void => {
      if (recognitionRef.current !== recognition) return;
      recognitionRef.current = null;
      setIsListening(false);
    };
    recognition.onerror = releaseIfCurrent;
    recognition.onend = releaseIfCurrent;

    recognitionRef.current = recognition;
    try {
      recognition.start();
    } catch (error: unknown) {
      // If start() throws, release the ref; otherwise every later start()
      // call would bail out on the "already active" guard above.
      recognitionRef.current = null;
      console.warn("Speech recognition failed to start", error);
      return;
    }
    setIsListening(true);
  }, [isSupported]);

  const stop = useCallback(() => {
    const recognition = recognitionRef.current;
    recognitionRef.current = null;
    setIsListening(false);
    // stop() (not abort()) so a result for audio captured so far can still arrive.
    recognition?.stop();
  }, []);

  // Discard the session on unmount. abort() is used instead of stop() so no
  // further result is delivered to a component that no longer exists.
  useEffect(() => {
    return () => {
      const recognition = recognitionRef.current;
      recognitionRef.current = null;
      recognition?.abort();
    };
  }, []);

  return { isListening, start, stop, isSupported };
}
export const GlobalChatbox: React.FC<Props> = ({ open, onClose }) => {
const initialChatStateRef = useRef<PersistedChatState | null>(null);
if (initialChatStateRef.current === null) {
@@ -304,6 +542,28 @@ export const GlobalChatbox: React.FC<Props> = ({ open, onClose }) => {
const inputRef = useRef<HTMLInputElement | null>(null);
const theme = useTheme();
// --- Voice Features ---
// Text-to-speech: hook members are renamed to handler-style names here.
const {
speechState,
speakingMessageId,
speak: handleSpeak,
pause: handlePauseSpeech,
resume: handleResumeSpeech,
stop: handleStopSpeech,
isSupported: isTtsSupported,
} = useSpeechSynthesis();
// Appends each recognized transcript to whatever is already in the input value.
const handleSpeechResult = useCallback((text: string) => {
setInput((prev) => prev + text);
}, []);
// Speech-to-text: recognized text is routed through `handleSpeechResult`.
const {
isListening,
start: startListening,
stop: stopListening,
isSupported: isSttSupported,
} = useSpeechRecognition(handleSpeechResult);
const canSend = useMemo(() => input.trim().length > 0 && !isStreaming, [input, isStreaming]);
const isHeaderMenuOpen = Boolean(headerMenuAnchorEl);
@@ -333,6 +593,7 @@ export const GlobalChatbox: React.FC<Props> = ({ open, onClose }) => {
const handleSend = async () => {
const prompt = input.trim();
if (!prompt || isStreaming) return;
stopListening();
const userId = createId();
const assistantId = createId();
@@ -422,6 +683,8 @@ export const GlobalChatbox: React.FC<Props> = ({ open, onClose }) => {
const handleNewConversation = useCallback(() => {
abortRef.current?.abort();
handleStopSpeech();
stopListening();
setMessages([]);
setConversationId(undefined);
setInput("");
@@ -431,7 +694,7 @@ export const GlobalChatbox: React.FC<Props> = ({ open, onClose }) => {
window.setTimeout(() => {
inputRef.current?.focus();
}, 0);
}, [handleHeaderMenuClose]);
}, [handleHeaderMenuClose, handleStopSpeech, stopListening]);
const handleMouseDown = useCallback((e: React.MouseEvent) => {
e.preventDefault();
@@ -469,9 +732,15 @@ export const GlobalChatbox: React.FC<Props> = ({ open, onClose }) => {
key={message.id}
message={message}
theme={theme}
messageSpeechState={speakingMessageId === message.id ? speechState : "idle"}
onSpeak={handleSpeak}
onPause={handlePauseSpeech}
onResume={handleResumeSpeech}
onStopSpeech={handleStopSpeech}
isTtsSupported={isTtsSupported}
/>
)),
[messages, theme],
[messages, theme, speechState, speakingMessageId, handleSpeak, handlePauseSpeech, handleResumeSpeech, handleStopSpeech, isTtsSupported],
);
@@ -756,6 +1025,45 @@ export const GlobalChatbox: React.FC<Props> = ({ open, onClose }) => {
}}
/>
{isSttSupported && (
<Box sx={{ display: "flex", alignItems: "center", mr: 1 }}>
{isListening ? (
<motion.div
animate={{ scale: [1, 1.15, 1] }}
transition={{ duration: 1.5, repeat: Infinity, ease: "easeInOut" }}
>
<IconButton
onClick={stopListening}
aria-label="停止语音输入"
sx={{
color: "error.main",
bgcolor: alpha(theme.palette.error.main, 0.1),
width: 44,
height: 44,
"&:hover": { bgcolor: alpha(theme.palette.error.main, 0.2) },
}}
>
<MicRounded />
</IconButton>
</motion.div>
) : (
<IconButton
onClick={startListening}
disabled={isStreaming}
aria-label="语音输入"
sx={{
color: "text.secondary",
width: 44,
height: 44,
"&:hover": { color: "primary.main" },
}}
>
<MicRounded />
</IconButton>
)}
</Box>
)}
<Box sx={{ pr: 0.5 }}>
<AnimatePresence mode="wait">
{isStreaming ? (