添加语音识别和朗读功能
This commit is contained in:
@@ -32,11 +32,40 @@ import StopRounded from "@mui/icons-material/StopRounded";
|
||||
import AutoAwesome from "@mui/icons-material/AutoAwesome"; // Sparkle icon for AI
|
||||
import ErrorOutlineRounded from "@mui/icons-material/ErrorOutlineRounded";
|
||||
import AddCommentRounded from "@mui/icons-material/AddCommentRounded";
|
||||
import VolumeUpRounded from "@mui/icons-material/VolumeUpRounded";
|
||||
import PauseRounded from "@mui/icons-material/PauseRounded";
|
||||
import PlayArrowRounded from "@mui/icons-material/PlayArrowRounded";
|
||||
import MicRounded from "@mui/icons-material/MicRounded";
|
||||
|
||||
// Logic
|
||||
import { streamCopilotChat } from "@/lib/chatStream";
|
||||
import { parseAssistantMessageSections } from "./chatMessageSections";
|
||||
|
||||
// WebKit Speech Recognition compatibility
|
||||
/**
 * Minimal shape of the event passed to `SpeechRecognition.onresult`.
 * Only the members this file reads are declared.
 */
interface SpeechRecognitionEvent extends Event {
  /** Index into `results` from which the recognition hook starts reading. */
  readonly resultIndex: number;
  /** Results accumulated by the recognition session so far. */
  readonly results: SpeechRecognitionResultList;
}

/**
 * Minimal shape of a Web Speech API recognizer instance.
 * Declared locally because the constructor is looked up on `window` under
 * either the standard or the WebKit-prefixed name.
 */
interface SpeechRecognition extends EventTarget {
  /** BCP-47 language tag used for recognition (this file sets "zh-CN"). */
  lang: string;
  /** Whether the session keeps running after the first result. */
  continuous: boolean;
  /** Whether non-final (interim) results are reported. */
  interimResults: boolean;
  onresult: ((event: SpeechRecognitionEvent) => void) | null;
  onerror: ((event: Event) => void) | null;
  onend: (() => void) | null;
  start(): void;
  stop(): void;
  abort(): void;
}

/** Constructor shape shared by the standard and the WebKit-prefixed global. */
type SpeechRecognitionConstructor = {
  new (): SpeechRecognition;
  prototype: SpeechRecognition;
};

declare global {
  interface Window {
    /** Standard constructor; absent in browsers without support. */
    SpeechRecognition?: SpeechRecognitionConstructor;
    /** WebKit-prefixed constructor; absent in browsers without support. */
    webkitSpeechRecognition?: SpeechRecognitionConstructor;
  }
}
|
||||
|
||||
// Types
|
||||
type Message = {
|
||||
id: string;
|
||||
@@ -59,6 +88,26 @@ const normalizeThoughtTagToken = (token: string): string =>
|
||||
closingSlash ? "</think>" : "<think>",
|
||||
);
|
||||
|
||||
/** Text-to-speech playback state: nothing being read, reading aloud, or paused. */
type SpeechState = "idle" | "playing" | "paused";
|
||||
|
||||
/**
 * Converts Markdown into plain text suitable for speech synthesis.
 *
 * Fenced code blocks, images and HTML tags are dropped; inline code, links
 * and emphasis keep only their text. Block-level markers (blockquote `>`,
 * ATX headings, bullet and ordered list markers) are removed only when they
 * start a line, so ordinary prose such as "a - b", "1 + 2 > 0" or
 * "in 2023. Next" is left intact.
 *
 * @param md Markdown source text.
 * @returns The text with Markdown syntax removed, blank-line runs collapsed
 *          to a single newline, and surrounding whitespace trimmed.
 */
const stripMarkdown = (md: string): string =>
  md
    // Fenced code blocks are not worth reading aloud.
    .replace(/```[\s\S]*?```/g, "")
    // Inline code keeps its content.
    .replace(/`([^`]+)`/g, "$1")
    // Images are removed entirely; links keep their label.
    .replace(/!\[[^\]]*\]\([^)]*\)/g, "")
    .replace(/\[([^\]]+)\]\([^)]*\)/g, "$1")
    // HTML tags go before blockquote handling so the closing ">" of a tag is
    // never mistaken for a quote marker. Only tag-like "<name ...>" matches,
    // so comparisons such as "a < b" survive.
    .replace(/<\/?[A-Za-z][^>]*>/g, "")
    // Block-level markers, anchored to the start of a line.
    .replace(/^[ \t]*(?:>[ \t]*)+/gm, "")
    .replace(/^[ \t]*#{1,6}[ \t]+/gm, "")
    // Bullets are stripped before emphasis so a leading "* " cannot pair up
    // with a later "*" on the same line.
    .replace(/^[ \t]*[-*+][ \t]+/gm, "")
    .replace(/^[ \t]*\d+[.)][ \t]+/gm, "")
    // Emphasis, longest delimiter first.
    .replace(/\*\*\*(.+?)\*\*\*/g, "$1")
    .replace(/\*\*(.+?)\*\*/g, "$1")
    .replace(/\*(.+?)\*/g, "$1")
    .replace(/~~(.+?)~~/g, "$1")
    .replace(/\n{2,}/g, "\n")
    .trim();
|
||||
|
||||
type PersistedChatState = {
|
||||
messages: Message[];
|
||||
conversationId?: string;
|
||||
@@ -150,10 +199,16 @@ const Blob = ({ color, size, top, left, delay }: { color: string; size: number;
|
||||
/** Props accepted by `ChatMessageItem`. */
type ChatMessageItemProps = {
  /** The chat message to render. */
  message: Message;
  /** Theme object supplied by the parent (obtained there via `useTheme()`). */
  theme: Theme;

  // --- Read-aloud (text-to-speech) controls ---

  /**
   * Speech state for this particular message. The parent passes "idle"
   * unless this message is the one currently being spoken.
   */
  messageSpeechState: SpeechState;
  /** Starts reading; receives the message id and the plain text to speak. */
  onSpeak: (messageId: string, text: string) => void;
  /** Pauses the current read-aloud. */
  onPause: () => void;
  /** Resumes a paused read-aloud. */
  onResume: () => void;
  /** Stops the current read-aloud. */
  onStopSpeech: () => void;
  /** When false, no speech controls are rendered for the message. */
  isTtsSupported: boolean;
};
|
||||
|
||||
const ChatMessageItem = React.memo(
|
||||
({ message, theme }: ChatMessageItemProps) => {
|
||||
({ message, theme, messageSpeechState, onSpeak, onPause, onResume, onStopSpeech, isTtsSupported }: ChatMessageItemProps) => {
|
||||
const isUser = message.role === "user";
|
||||
const isErrorMessage = Boolean(message.isError);
|
||||
const parsedAssistantSections =
|
||||
@@ -187,6 +242,7 @@ const ChatMessageItem = React.memo(
|
||||
</Avatar>
|
||||
)}
|
||||
|
||||
<Box>
|
||||
<Paper
|
||||
elevation={isUser ? 8 : isErrorMessage ? 1 : 2}
|
||||
sx={{
|
||||
@@ -278,12 +334,194 @@ const ChatMessageItem = React.memo(
|
||||
<ReactMarkdown remarkPlugins={[remarkGfm]}>{answerContent || "..."}</ReactMarkdown>
|
||||
</div>
|
||||
</Paper>
|
||||
{!isUser && !isErrorMessage && isTtsSupported && (
|
||||
<Stack direction="row" spacing={0.5} sx={{ mt: 0.5, ml: 0.5 }}>
|
||||
{messageSpeechState === "idle" && (
|
||||
<IconButton
|
||||
size="small"
|
||||
onClick={() => onSpeak(message.id, stripMarkdown(answerContent))}
|
||||
aria-label="朗读消息"
|
||||
sx={{ color: "text.secondary", opacity: 0.6, "&:hover": { opacity: 1 }, p: 0.5 }}
|
||||
>
|
||||
<VolumeUpRounded sx={{ fontSize: 16 }} />
|
||||
</IconButton>
|
||||
)}
|
||||
{messageSpeechState === "playing" && (
|
||||
<>
|
||||
<IconButton
|
||||
size="small"
|
||||
onClick={onPause}
|
||||
aria-label="暂停朗读"
|
||||
sx={{ color: "primary.main", p: 0.5 }}
|
||||
>
|
||||
<PauseRounded sx={{ fontSize: 16 }} />
|
||||
</IconButton>
|
||||
<IconButton
|
||||
size="small"
|
||||
onClick={onStopSpeech}
|
||||
aria-label="停止朗读"
|
||||
sx={{ color: "error.main", p: 0.5 }}
|
||||
>
|
||||
<StopRounded sx={{ fontSize: 16 }} />
|
||||
</IconButton>
|
||||
</>
|
||||
)}
|
||||
{messageSpeechState === "paused" && (
|
||||
<>
|
||||
<IconButton
|
||||
size="small"
|
||||
onClick={onResume}
|
||||
aria-label="继续朗读"
|
||||
sx={{ color: "primary.main", p: 0.5 }}
|
||||
>
|
||||
<PlayArrowRounded sx={{ fontSize: 16 }} />
|
||||
</IconButton>
|
||||
<IconButton
|
||||
size="small"
|
||||
onClick={onStopSpeech}
|
||||
aria-label="停止朗读"
|
||||
sx={{ color: "error.main", p: 0.5 }}
|
||||
>
|
||||
<StopRounded sx={{ fontSize: 16 }} />
|
||||
</IconButton>
|
||||
</>
|
||||
)}
|
||||
</Stack>
|
||||
)}
|
||||
</Box>
|
||||
</motion.div>
|
||||
);
|
||||
},
|
||||
);
|
||||
ChatMessageItem.displayName = "ChatMessageItem";
|
||||
|
||||
// --- Voice Hooks ---
|
||||
|
||||
/**
 * React hook wrapping `window.speechSynthesis` for reading one chat message
 * aloud at a time.
 *
 * Only the most recently started utterance may update the hook's state.
 * Cancelling an utterance makes the browser deliver its `end`/`error` event
 * asynchronously; without the identity guard below, that late event would
 * reset the state to "idle" while a newer utterance is already playing.
 *
 * @returns The current speech state, the id of the message being spoken
 *          (or null), the `speak`/`pause`/`resume`/`stop` controls, and
 *          `isSupported`, which is false when speech synthesis is unavailable.
 */
function useSpeechSynthesis() {
  const [speechState, setSpeechState] = useState<SpeechState>("idle");
  const [speakingMessageId, setSpeakingMessageId] = useState<string | null>(null);
  // The utterance that currently "owns" the hook state; null when idle.
  const utteranceRef = useRef<SpeechSynthesisUtterance | null>(null);

  const isSupported = typeof window !== "undefined" && "speechSynthesis" in window;

  const stop = useCallback(() => {
    if (!isSupported) return;
    // Disown the utterance first so its late end/error event is ignored.
    utteranceRef.current = null;
    window.speechSynthesis.cancel();
    setSpeechState("idle");
    setSpeakingMessageId(null);
  }, [isSupported]);

  const speak = useCallback(
    (messageId: string, text: string) => {
      if (!isSupported || !text.trim()) return;

      // Disown and cancel whatever is playing before starting the new one.
      utteranceRef.current = null;
      window.speechSynthesis.cancel();

      const utterance = new SpeechSynthesisUtterance(text);
      utterance.lang = "zh-CN";
      utterance.rate = 1;

      const isCurrent = (): boolean => utteranceRef.current === utterance;
      const resetIfCurrent = (): void => {
        if (!isCurrent()) return;
        utteranceRef.current = null;
        setSpeechState("idle");
        setSpeakingMessageId(null);
      };

      utterance.onend = resetIfCurrent;
      utterance.onerror = resetIfCurrent;
      utterance.onpause = () => {
        if (isCurrent()) setSpeechState("paused");
      };
      utterance.onresume = () => {
        if (isCurrent()) setSpeechState("playing");
      };

      utteranceRef.current = utterance;
      setSpeakingMessageId(messageId);
      setSpeechState("playing");
      window.speechSynthesis.speak(utterance);
    },
    [isSupported],
  );

  const pause = useCallback(() => {
    if (!isSupported) return;
    window.speechSynthesis.pause();
  }, [isSupported]);

  const resume = useCallback(() => {
    if (!isSupported) return;
    window.speechSynthesis.resume();
  }, [isSupported]);

  // Silence any ongoing speech when the owning component unmounts.
  useEffect(() => {
    return () => {
      // Disown first so late events do not touch state after unmount.
      utteranceRef.current = null;
      if (typeof window !== "undefined" && "speechSynthesis" in window) {
        window.speechSynthesis.cancel();
      }
    };
  }, []);

  return { speechState, speakingMessageId, speak, pause, resume, stop, isSupported };
}
|
||||
|
||||
/**
 * React hook wrapping the (optionally WebKit-prefixed) `SpeechRecognition`
 * API for dictation.
 *
 * @param onResult Called with the transcript of every final recognition
 *                 result. The latest callback is always used, so it does not
 *                 need to be referentially stable.
 * @returns `isListening`, the `start`/`stop` controls, and `isSupported`,
 *          which is false when no recognition constructor exists on `window`.
 */
function useSpeechRecognition(onResult: (text: string) => void) {
  const [isListening, setIsListening] = useState(false);
  // The active recognition session; null when not listening.
  const recognitionRef = useRef<SpeechRecognition | null>(null);
  const onResultRef = useRef(onResult);
  useEffect(() => {
    onResultRef.current = onResult;
  }, [onResult]);

  const isSupported =
    typeof window !== "undefined" &&
    ("SpeechRecognition" in window || "webkitSpeechRecognition" in window);

  const start = useCallback(() => {
    if (!isSupported || recognitionRef.current) return;
    const Ctor = window.SpeechRecognition ?? window.webkitSpeechRecognition;
    if (!Ctor) return;

    const recognition = new Ctor();
    recognition.lang = "zh-CN";
    recognition.continuous = true;
    recognition.interimResults = false;

    recognition.onresult = (event: SpeechRecognitionEvent) => {
      for (let i = event.resultIndex; i < event.results.length; i++) {
        const result = event.results[i];
        if (!result?.isFinal) continue;
        const best = result[0];
        if (best) onResultRef.current(best.transcript);
      }
    };

    // Only the session that still owns the ref may reset the hook state.
    // A late end/error from an already stopped session must not clobber a
    // newer session started in the meantime.
    const release = (): void => {
      if (recognitionRef.current !== recognition) return;
      recognitionRef.current = null;
      setIsListening(false);
    };
    recognition.onerror = release;
    recognition.onend = release;

    recognitionRef.current = recognition;
    try {
      recognition.start();
    } catch {
      // start() can throw; release the ref so a later attempt is not blocked
      // forever by a session that never began.
      release();
      return;
    }
    setIsListening(true);
  }, [isSupported]);

  const stop = useCallback(() => {
    recognitionRef.current?.stop();
    recognitionRef.current = null;
    setIsListening(false);
  }, []);

  // Tear the session down on unmount. Handlers are detached and abort() is
  // used so nothing is delivered to an unmounted component.
  useEffect(() => {
    return () => {
      const active = recognitionRef.current;
      recognitionRef.current = null;
      if (!active) return;
      active.onresult = null;
      active.onerror = null;
      active.onend = null;
      active.abort();
    };
  }, []);

  return { isListening, start, stop, isSupported };
}
|
||||
|
||||
export const GlobalChatbox: React.FC<Props> = ({ open, onClose }) => {
|
||||
const initialChatStateRef = useRef<PersistedChatState | null>(null);
|
||||
if (initialChatStateRef.current === null) {
|
||||
@@ -304,6 +542,28 @@ export const GlobalChatbox: React.FC<Props> = ({ open, onClose }) => {
|
||||
const inputRef = useRef<HTMLInputElement | null>(null);
|
||||
const theme = useTheme();
|
||||
|
||||
// --- Voice Features ---
|
||||
const {
|
||||
speechState,
|
||||
speakingMessageId,
|
||||
speak: handleSpeak,
|
||||
pause: handlePauseSpeech,
|
||||
resume: handleResumeSpeech,
|
||||
stop: handleStopSpeech,
|
||||
isSupported: isTtsSupported,
|
||||
} = useSpeechSynthesis();
|
||||
|
||||
const handleSpeechResult = useCallback((text: string) => {
|
||||
setInput((prev) => prev + text);
|
||||
}, []);
|
||||
|
||||
const {
|
||||
isListening,
|
||||
start: startListening,
|
||||
stop: stopListening,
|
||||
isSupported: isSttSupported,
|
||||
} = useSpeechRecognition(handleSpeechResult);
|
||||
|
||||
const canSend = useMemo(() => input.trim().length > 0 && !isStreaming, [input, isStreaming]);
|
||||
const isHeaderMenuOpen = Boolean(headerMenuAnchorEl);
|
||||
|
||||
@@ -333,6 +593,7 @@ export const GlobalChatbox: React.FC<Props> = ({ open, onClose }) => {
|
||||
const handleSend = async () => {
|
||||
const prompt = input.trim();
|
||||
if (!prompt || isStreaming) return;
|
||||
stopListening();
|
||||
|
||||
const userId = createId();
|
||||
const assistantId = createId();
|
||||
@@ -422,6 +683,8 @@ export const GlobalChatbox: React.FC<Props> = ({ open, onClose }) => {
|
||||
|
||||
const handleNewConversation = useCallback(() => {
|
||||
abortRef.current?.abort();
|
||||
handleStopSpeech();
|
||||
stopListening();
|
||||
setMessages([]);
|
||||
setConversationId(undefined);
|
||||
setInput("");
|
||||
@@ -431,7 +694,7 @@ export const GlobalChatbox: React.FC<Props> = ({ open, onClose }) => {
|
||||
window.setTimeout(() => {
|
||||
inputRef.current?.focus();
|
||||
}, 0);
|
||||
}, [handleHeaderMenuClose]);
|
||||
}, [handleHeaderMenuClose, handleStopSpeech, stopListening]);
|
||||
|
||||
const handleMouseDown = useCallback((e: React.MouseEvent) => {
|
||||
e.preventDefault();
|
||||
@@ -469,9 +732,15 @@ export const GlobalChatbox: React.FC<Props> = ({ open, onClose }) => {
|
||||
key={message.id}
|
||||
message={message}
|
||||
theme={theme}
|
||||
messageSpeechState={speakingMessageId === message.id ? speechState : "idle"}
|
||||
onSpeak={handleSpeak}
|
||||
onPause={handlePauseSpeech}
|
||||
onResume={handleResumeSpeech}
|
||||
onStopSpeech={handleStopSpeech}
|
||||
isTtsSupported={isTtsSupported}
|
||||
/>
|
||||
)),
|
||||
[messages, theme],
|
||||
[messages, theme, speechState, speakingMessageId, handleSpeak, handlePauseSpeech, handleResumeSpeech, handleStopSpeech, isTtsSupported],
|
||||
);
|
||||
|
||||
|
||||
@@ -756,6 +1025,45 @@ export const GlobalChatbox: React.FC<Props> = ({ open, onClose }) => {
|
||||
}}
|
||||
/>
|
||||
|
||||
{isSttSupported && (
|
||||
<Box sx={{ display: "flex", alignItems: "center", mr: 1 }}>
|
||||
{isListening ? (
|
||||
<motion.div
|
||||
animate={{ scale: [1, 1.15, 1] }}
|
||||
transition={{ duration: 1.5, repeat: Infinity, ease: "easeInOut" }}
|
||||
>
|
||||
<IconButton
|
||||
onClick={stopListening}
|
||||
aria-label="停止语音输入"
|
||||
sx={{
|
||||
color: "error.main",
|
||||
bgcolor: alpha(theme.palette.error.main, 0.1),
|
||||
width: 44,
|
||||
height: 44,
|
||||
"&:hover": { bgcolor: alpha(theme.palette.error.main, 0.2) },
|
||||
}}
|
||||
>
|
||||
<MicRounded />
|
||||
</IconButton>
|
||||
</motion.div>
|
||||
) : (
|
||||
<IconButton
|
||||
onClick={startListening}
|
||||
disabled={isStreaming}
|
||||
aria-label="语音输入"
|
||||
sx={{
|
||||
color: "text.secondary",
|
||||
width: 44,
|
||||
height: 44,
|
||||
"&:hover": { color: "primary.main" },
|
||||
}}
|
||||
>
|
||||
<MicRounded />
|
||||
</IconButton>
|
||||
)}
|
||||
</Box>
|
||||
)}
|
||||
|
||||
<Box sx={{ pr: 0.5 }}>
|
||||
<AnimatePresence mode="wait">
|
||||
{isStreaming ? (
|
||||
|
||||
Reference in New Issue
Block a user