// WebKit Speech Recognition compatibility
//
// Minimal local typings for the Web Speech recognition API, covering only the
// members this file uses. Chromium-based browsers expose the constructor under
// the `webkit` prefix, so both names are declared on `Window`.

/** Event delivered to `onresult`; `resultIndex` is the first result that changed. */
interface SpeechRecognitionEvent extends Event {
  readonly resultIndex: number;
  readonly results: SpeechRecognitionResultList;
}

/**
 * Event delivered to `onerror`. `error` carries the failure code (for example
 * "no-speech", "aborted" or "not-allowed") so a handler can tell a benign
 * timeout apart from a permission problem.
 */
interface SpeechRecognitionErrorEvent extends Event {
  readonly error: string;
  readonly message: string;
}

/** The subset of the recognizer instance that the voice-input hook drives. */
interface SpeechRecognition extends EventTarget {
  lang: string;
  continuous: boolean;
  interimResults: boolean;
  onresult: ((event: SpeechRecognitionEvent) => void) | null;
  // Typed with the error event (not a bare Event) so handlers can read `error`;
  // handlers that take no argument or a plain Event remain assignable.
  onerror: ((event: SpeechRecognitionErrorEvent) => void) | null;
  onend: (() => void) | null;
  start(): void;
  stop(): void;
  abort(): void;
}

declare global {
  interface Window {
    SpeechRecognition?: { new (): SpeechRecognition; prototype: SpeechRecognition };
    webkitSpeechRecognition?: { new (): SpeechRecognition; prototype: SpeechRecognition };
  }
}
"" : "", ); +type SpeechState = "idle" | "playing" | "paused"; + +const stripMarkdown = (md: string): string => + md + .replace(/```[\s\S]*?```/g, "") + .replace(/`([^`]+)`/g, "$1") + .replace(/!\[.*?\]\(.*?\)/g, "") + .replace(/\[([^\]]+)\]\(.*?\)/g, "$1") + .replace(/#{1,6}\s+/g, "") + .replace(/\*\*\*(.+?)\*\*\*/g, "$1") + .replace(/\*\*(.+?)\*\*/g, "$1") + .replace(/\*(.+?)\*/g, "$1") + .replace(/~~(.+?)~~/g, "$1") + .replace(/>\s+/g, "") + .replace(/[-*+]\s+/g, "") + .replace(/\d+\.\s+/g, "") + .replace(/\n{2,}/g, "\n") + .replace(/<[^>]+>/g, "") + .trim(); + type PersistedChatState = { messages: Message[]; conversationId?: string; @@ -150,10 +199,16 @@ const Blob = ({ color, size, top, left, delay }: { color: string; size: number; type ChatMessageItemProps = { message: Message; theme: Theme; + messageSpeechState: SpeechState; + onSpeak: (messageId: string, text: string) => void; + onPause: () => void; + onResume: () => void; + onStopSpeech: () => void; + isTtsSupported: boolean; }; const ChatMessageItem = React.memo( - ({ message, theme }: ChatMessageItemProps) => { + ({ message, theme, messageSpeechState, onSpeak, onPause, onResume, onStopSpeech, isTtsSupported }: ChatMessageItemProps) => { const isUser = message.role === "user"; const isErrorMessage = Boolean(message.isError); const parsedAssistantSections = @@ -187,6 +242,7 @@ const ChatMessageItem = React.memo( )} + {answerContent || "..."} + {!isUser && !isErrorMessage && isTtsSupported && ( + + {messageSpeechState === "idle" && ( + onSpeak(message.id, stripMarkdown(answerContent))} + aria-label="朗读消息" + sx={{ color: "text.secondary", opacity: 0.6, "&:hover": { opacity: 1 }, p: 0.5 }} + > + + + )} + {messageSpeechState === "playing" && ( + <> + + + + + + + + )} + {messageSpeechState === "paused" && ( + <> + + + + + + + + )} + + )} + ); }, ); ChatMessageItem.displayName = "ChatMessageItem"; +// --- Voice Hooks --- + +function useSpeechSynthesis() { + const [speechState, setSpeechState] = useState("idle"); 
/**
 * Text-to-speech controller built on `window.speechSynthesis`.
 *
 * Tracks which message is being read and whether playback is playing, paused
 * or idle. Only one utterance is active at a time: `speak` cancels whatever
 * was playing before it queues the new text.
 *
 * @returns `speechState` and `speakingMessageId` for rendering, the
 *          `speak` / `pause` / `resume` / `stop` controls, and `isSupported`.
 */
function useSpeechSynthesis() {
  const [speechState, setSpeechState] = useState<SpeechState>("idle");
  const [speakingMessageId, setSpeakingMessageId] = useState<string | null>(null);
  // The utterance whose events are allowed to drive state. Events from any
  // other utterance (one that was cancelled or replaced) are ignored.
  const utteranceRef = useRef<SpeechSynthesisUtterance | null>(null);

  const isSupported = typeof window !== "undefined" && "speechSynthesis" in window;

  /** Cancel playback and return to the idle state. */
  const stop = useCallback(() => {
    if (!isSupported) return;
    // Detach first so the cancelled utterance's late end/error event is a no-op.
    utteranceRef.current = null;
    window.speechSynthesis.cancel();
    setSpeechState("idle");
    setSpeakingMessageId(null);
  }, [isSupported]);

  /**
   * Start reading `text` for the message `messageId`, replacing any playback
   * in progress. Does nothing when speech is unsupported or `text` is empty.
   */
  const speak = useCallback(
    (messageId: string, text: string) => {
      if (!isSupported || !text) return;
      // Detach the previous utterance before cancelling it: its end/error
      // event is delivered after this function returns and would otherwise
      // reset the state that is about to be set for the new utterance.
      utteranceRef.current = null;
      window.speechSynthesis.cancel();

      const utterance = new SpeechSynthesisUtterance(text);
      utterance.lang = "zh-CN";
      utterance.rate = 1;

      const isCurrent = () => utteranceRef.current === utterance;
      const resetIfCurrent = () => {
        if (!isCurrent()) return;
        utteranceRef.current = null;
        setSpeechState("idle");
        setSpeakingMessageId(null);
      };

      utterance.onend = resetIfCurrent;
      utterance.onerror = resetIfCurrent;
      utterance.onpause = () => {
        if (isCurrent()) setSpeechState("paused");
      };
      utterance.onresume = () => {
        if (isCurrent()) setSpeechState("playing");
      };

      utteranceRef.current = utterance;
      setSpeakingMessageId(messageId);
      setSpeechState("playing");
      window.speechSynthesis.speak(utterance);
    },
    [isSupported],
  );

  /** Pause playback; state changes when the utterance reports `pause`. */
  const pause = useCallback(() => {
    if (!isSupported) return;
    window.speechSynthesis.pause();
  }, [isSupported]);

  /** Resume paused playback; state changes when the utterance reports `resume`. */
  const resume = useCallback(() => {
    if (!isSupported) return;
    window.speechSynthesis.resume();
  }, [isSupported]);

  // Silence playback on unmount. The ref is cleared first so the resulting
  // end/error event does not set state on an unmounted component.
  useEffect(() => {
    return () => {
      if (typeof window !== "undefined" && "speechSynthesis" in window) {
        utteranceRef.current = null;
        window.speechSynthesis.cancel();
      }
    };
  }, []);

  return { speechState, speakingMessageId, speak, pause, resume, stop, isSupported };
}
"undefined" && + ("SpeechRecognition" in window || "webkitSpeechRecognition" in window); + + const start = useCallback(() => { + if (!isSupported || recognitionRef.current) return; + const Ctor = window.SpeechRecognition ?? window.webkitSpeechRecognition; + if (!Ctor) return; + + const recognition = new Ctor(); + recognition.lang = "zh-CN"; + recognition.continuous = true; + recognition.interimResults = false; + + recognition.onresult = (event: SpeechRecognitionEvent) => { + for (let i = event.resultIndex; i < event.results.length; i++) { + if (event.results[i].isFinal) { + onResultRef.current(event.results[i][0].transcript); + } + } + }; + + recognition.onerror = () => { + setIsListening(false); + recognitionRef.current = null; + }; + + recognition.onend = () => { + setIsListening(false); + recognitionRef.current = null; + }; + + recognitionRef.current = recognition; + recognition.start(); + setIsListening(true); + }, [isSupported]); + + const stop = useCallback(() => { + recognitionRef.current?.stop(); + recognitionRef.current = null; + setIsListening(false); + }, []); + + useEffect(() => { + return () => { + recognitionRef.current?.stop(); + }; + }, []); + + return { isListening, start, stop, isSupported }; +} + export const GlobalChatbox: React.FC = ({ open, onClose }) => { const initialChatStateRef = useRef(null); if (initialChatStateRef.current === null) { @@ -304,6 +542,28 @@ export const GlobalChatbox: React.FC = ({ open, onClose }) => { const inputRef = useRef(null); const theme = useTheme(); + // --- Voice Features --- + const { + speechState, + speakingMessageId, + speak: handleSpeak, + pause: handlePauseSpeech, + resume: handleResumeSpeech, + stop: handleStopSpeech, + isSupported: isTtsSupported, + } = useSpeechSynthesis(); + + const handleSpeechResult = useCallback((text: string) => { + setInput((prev) => prev + text); + }, []); + + const { + isListening, + start: startListening, + stop: stopListening, + isSupported: isSttSupported, + } = 
useSpeechRecognition(handleSpeechResult); + const canSend = useMemo(() => input.trim().length > 0 && !isStreaming, [input, isStreaming]); const isHeaderMenuOpen = Boolean(headerMenuAnchorEl); @@ -333,6 +593,7 @@ export const GlobalChatbox: React.FC = ({ open, onClose }) => { const handleSend = async () => { const prompt = input.trim(); if (!prompt || isStreaming) return; + stopListening(); const userId = createId(); const assistantId = createId(); @@ -422,6 +683,8 @@ export const GlobalChatbox: React.FC = ({ open, onClose }) => { const handleNewConversation = useCallback(() => { abortRef.current?.abort(); + handleStopSpeech(); + stopListening(); setMessages([]); setConversationId(undefined); setInput(""); @@ -431,7 +694,7 @@ export const GlobalChatbox: React.FC = ({ open, onClose }) => { window.setTimeout(() => { inputRef.current?.focus(); }, 0); - }, [handleHeaderMenuClose]); + }, [handleHeaderMenuClose, handleStopSpeech, stopListening]); const handleMouseDown = useCallback((e: React.MouseEvent) => { e.preventDefault(); @@ -469,9 +732,15 @@ export const GlobalChatbox: React.FC = ({ open, onClose }) => { key={message.id} message={message} theme={theme} + messageSpeechState={speakingMessageId === message.id ? speechState : "idle"} + onSpeak={handleSpeak} + onPause={handlePauseSpeech} + onResume={handleResumeSpeech} + onStopSpeech={handleStopSpeech} + isTtsSupported={isTtsSupported} /> )), - [messages, theme], + [messages, theme, speechState, speakingMessageId, handleSpeak, handlePauseSpeech, handleResumeSpeech, handleStopSpeech, isTtsSupported], ); @@ -756,6 +1025,45 @@ export const GlobalChatbox: React.FC = ({ open, onClose }) => { }} /> + {isSttSupported && ( + + {isListening ? ( + + + + + + ) : ( + + + + )} + + )} + {isStreaming ? (