realtime speech capture

This commit is contained in:
宋居成 2025-07-27 12:11:13 +08:00
parent c95e6a2552
commit d808bbfe26
8 changed files with 1251 additions and 103 deletions

src/audio_processor.js Normal file

@@ -0,0 +1,322 @@
// 音频处理模块 - 提取自 new_app.js 的高级音频处理功能
class AudioProcessor {
constructor(options = {}) {
this.audioContext = null;
this.isRecording = false;
this.audioChunks = [];
// VAD相关属性
this.isSpeaking = false;
this.silenceThreshold = options.silenceThreshold || 0.01;
this.silenceTimeout = options.silenceTimeout || 1000;
this.minSpeechDuration = options.minSpeechDuration || 300;
this.silenceTimer = null;
this.speechStartTime = null;
this.audioBuffer = [];
// API配置
this.apiConfig = {
url: 'https://openspeech.bytedance.com/api/v3/auc/bigmodel/recognize/flash',
headers: {
'X-Api-App-Key': '1988591469',
'X-Api-Access-Key': 'mdEyhgZ59on1-NK3GXWAp3L4iLldSG0r',
'X-Api-Resource-Id': 'volc.bigasr.auc_turbo',
'X-Api-Request-Id': this.generateUUID(),
'X-Api-Sequence': '-1',
'Content-Type': 'application/json'
}
};
// 回调函数
this.onSpeechStart = options.onSpeechStart || (() => {});
this.onSpeechEnd = options.onSpeechEnd || (() => {});
this.onRecognitionResult = options.onRecognitionResult || (() => {});
this.onError = options.onError || (() => {});
this.onStatusUpdate = options.onStatusUpdate || (() => {});
}
// 生成UUID
generateUUID() {
return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) {
const r = Math.random() * 16 | 0;
const v = c == 'x' ? r : (r & 0x3 | 0x8);
return v.toString(16);
});
}
// 计算音频能量(音量)
calculateAudioLevel(audioData) {
let sum = 0;
for (let i = 0; i < audioData.length; i++) {
sum += audioData[i] * audioData[i];
}
return Math.sqrt(sum / audioData.length);
}
// 语音活动检测
detectVoiceActivity(audioData) {
const audioLevel = this.calculateAudioLevel(audioData);
const currentTime = Date.now();
if (audioLevel > this.silenceThreshold) {
if (!this.isSpeaking) {
this.isSpeaking = true;
this.speechStartTime = currentTime;
this.audioBuffer = [];
this.onSpeechStart();
this.onStatusUpdate('检测到语音,开始录音...', 'speaking');
console.log('开始说话');
}
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
return true;
} else {
if (this.isSpeaking && !this.silenceTimer) {
this.silenceTimer = setTimeout(() => {
this.handleSpeechEnd();
}, this.silenceTimeout);
}
return this.isSpeaking;
}
}
// 语音结束处理
async handleSpeechEnd() {
if (this.isSpeaking) {
const speechDuration = Date.now() - this.speechStartTime;
if (speechDuration >= this.minSpeechDuration) {
console.log(`语音结束,时长: ${speechDuration}ms`);
await this.processAudioBuffer();
this.onStatusUpdate('语音识别中...', 'processing');
} else {
console.log('说话时长太短,忽略');
this.onStatusUpdate('等待语音输入...', 'ready');
}
this.isSpeaking = false;
this.speechStartTime = null;
this.audioBuffer = [];
this.onSpeechEnd();
}
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
}
// 处理音频缓冲区并发送到API
async processAudioBuffer() {
if (this.audioBuffer.length === 0) {
return;
}
try {
// 合并所有音频数据
const totalLength = this.audioBuffer.reduce((sum, buffer) => sum + buffer.length, 0);
const combinedBuffer = new Float32Array(totalLength);
let offset = 0;
for (const buffer of this.audioBuffer) {
combinedBuffer.set(buffer, offset);
offset += buffer.length;
}
// 转换为WAV格式并编码为base64
const wavBuffer = this.encodeWAV(combinedBuffer, 16000);
const base64Audio = this.arrayBufferToBase64(wavBuffer);
// 调用ASR API
await this.callASRAPI(base64Audio);
} catch (error) {
console.error('处理音频数据失败:', error);
this.onError('处理音频数据失败: ' + error.message);
}
}
// 调用ASR API
async callASRAPI(base64AudioData) {
try {
const requestBody = {
user: {
uid: "1988591469"
},
audio: {
data: base64AudioData
},
request: {
model_name: "bigmodel"
}
};
const response = await fetch(this.apiConfig.url, {
method: 'POST',
headers: this.apiConfig.headers,
body: JSON.stringify(requestBody)
});
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}
const result = await response.json();
this.handleASRResponse(result);
} catch (error) {
console.error('ASR API调用失败:', error);
this.onError('ASR API调用失败: ' + error.message);
}
}
// 处理ASR响应
handleASRResponse(response) {
console.log('ASR响应:', response);
if (response && response.result) {
const recognizedText = response.result.text;
this.onRecognitionResult(recognizedText);
this.onStatusUpdate('识别完成', 'completed');
} else {
console.log('未识别到文字');
this.onStatusUpdate('未识别到文字', 'ready');
}
}
// 编码WAV格式
encodeWAV(samples, sampleRate) {
const length = samples.length;
const buffer = new ArrayBuffer(44 + length * 2);
const view = new DataView(buffer);
// WAV文件头
const writeString = (offset, string) => {
for (let i = 0; i < string.length; i++) {
view.setUint8(offset + i, string.charCodeAt(i));
}
};
writeString(0, 'RIFF');
view.setUint32(4, 36 + length * 2, true);
writeString(8, 'WAVE');
writeString(12, 'fmt ');
view.setUint32(16, 16, true);
view.setUint16(20, 1, true);
view.setUint16(22, 1, true);
view.setUint32(24, sampleRate, true);
view.setUint32(28, sampleRate * 2, true);
view.setUint16(32, 2, true);
view.setUint16(34, 16, true);
writeString(36, 'data');
view.setUint32(40, length * 2, true);
// 写入音频数据
let offset = 44;
for (let i = 0; i < length; i++) {
const sample = Math.max(-1, Math.min(1, samples[i]));
view.setInt16(offset, sample * 0x7FFF, true);
offset += 2;
}
return buffer;
}
// ArrayBuffer转Base64
arrayBufferToBase64(buffer) {
let binary = '';
const bytes = new Uint8Array(buffer);
for (let i = 0; i < bytes.byteLength; i++) {
binary += String.fromCharCode(bytes[i]);
}
return btoa(binary);
}
// 开始录音
async startRecording() {
try {
const stream = await navigator.mediaDevices.getUserMedia({
audio: {
sampleRate: 16000,
channelCount: 1,
echoCancellation: true,
noiseSuppression: true
}
});
this.audioContext = new (window.AudioContext || window.webkitAudioContext)({
sampleRate: 16000
});
const source = this.audioContext.createMediaStreamSource(stream);
const processor = this.audioContext.createScriptProcessor(4096, 1, 1);
processor.onaudioprocess = (event) => {
const inputBuffer = event.inputBuffer;
const inputData = inputBuffer.getChannelData(0);
// 语音活动检测
if (this.detectVoiceActivity(inputData)) {
// 如果检测到语音活动,缓存音频数据
this.audioBuffer.push(new Float32Array(inputData));
}
};
source.connect(processor);
processor.connect(this.audioContext.destination);
this.isRecording = true;
this.onStatusUpdate('等待语音输入...', 'ready');
return true;
} catch (error) {
console.error('启动录音失败:', error);
this.onError('启动录音失败: ' + error.message);
return false;
}
}
// 停止录音
stopRecording() {
if (this.audioContext) {
this.audioContext.close();
this.audioContext = null;
}
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
// 如果正在说话,处理最后的音频
if (this.isSpeaking) {
this.handleSpeechEnd();
}
this.isRecording = false;
this.isSpeaking = false;
this.audioBuffer = [];
this.onStatusUpdate('录音已停止', 'stopped');
console.log('录音已停止');
}
// 获取录音状态
getRecordingStatus() {
return {
isRecording: this.isRecording,
isSpeaking: this.isSpeaking,
hasAudioContext: !!this.audioContext
};
}
}
// 导出模块
export { AudioProcessor };
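
For reference, a minimal usage sketch of the new AudioProcessor module. This is not part of the commit; the element ids and wiring are assumptions, while the constructor options and methods follow the class above.

```js
import { AudioProcessor } from './audio_processor.js';

const statusEl = document.getElementById('status');     // assumed element id
const resultsEl = document.getElementById('results');   // assumed element id
const recordBtn = document.getElementById('recordBtn'); // assumed element id

const processor = new AudioProcessor({
  silenceThreshold: 0.01,  // RMS level treated as silence
  silenceTimeout: 1000,    // ms of silence that closes a speech segment
  minSpeechDuration: 300,  // segments shorter than this are ignored
  onRecognitionResult: (text) => {
    const item = document.createElement('div');
    item.textContent = text;
    resultsEl.appendChild(item);
  },
  onStatusUpdate: (message) => { statusEl.textContent = message; },
  onError: (err) => console.error(err),
});

// Toggle recording from a single button; startRecording() resolves to false on failure.
recordBtn.addEventListener('click', async () => {
  if (processor.getRecordingStatus().isRecording) {
    processor.stopRecording();
  } else {
    await processor.startRecording();
  }
});
```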


@@ -6,6 +6,9 @@ import { getLLMConfig, getMinimaxiConfig, getAudioConfig, validateConfig } from
// 防止重复播放的标志
let isPlaying = false;
// 音频播放队列
let audioQueue = [];
let isProcessingQueue = false;
async function chatWithAudioStream(userInput) {
// 验证配置
@@ -20,7 +23,48 @@ async function chatWithAudioStream(userInput) {
const minimaxiConfig = getMinimaxiConfig();
const audioConfig = getAudioConfig();
// 1. 请求大模型回答
// 清空音频队列
audioQueue = [];
// 定义段落处理函数
const handleSegment = async (segment) => {
console.log('\n=== 处理文本段落 ===');
console.log('段落内容:', segment);
try {
// 为每个段落生成音频
const audioResult = await requestMinimaxi({
apiKey: minimaxiConfig.apiKey,
groupId: minimaxiConfig.groupId,
body: {
model: audioConfig.model,
text: segment,
stream: audioConfig.stream,
language_boost: audioConfig.language_boost,
output_format: audioConfig.output_format,
voice_setting: audioConfig.voiceSetting,
audio_setting: audioConfig.audioSetting,
},
stream: true,
});
// 将音频添加到播放队列
if (audioResult && audioResult.data && audioResult.data.audio) {
audioQueue.push({
text: segment,
audioHex: audioResult.data.audio
});
console.log('音频已添加到队列,队列长度:', audioQueue.length);
// 开始处理队列
processAudioQueue();
}
} catch (error) {
console.error('生成音频失败:', error);
}
};
// 1. 请求大模型回答,并实时处理段落
console.log('\n=== 请求大模型回答 ===');
const llmResponse = await requestLLMStream({
apiKey: llmConfig.apiKey,
@@ -29,55 +73,45 @@ async function chatWithAudioStream(userInput) {
{ role: 'system', content: 'You are a helpful assistant.' },
{ role: 'user', content: userInput },
],
onSegment: handleSegment // 传入段落处理回调
});
// 提取大模型回答内容(现在直接返回内容)
const llmContent = llmResponse;
console.log('\n=== 大模型回答 ===');
console.log("llmResponse: ", llmContent);
// 2. 合成音频
console.log('\n=== 开始合成音频 ===');
const audioResult = await requestMinimaxi({
apiKey: minimaxiConfig.apiKey,
groupId: minimaxiConfig.groupId,
body: {
model: audioConfig.model,
text: llmContent,
stream: audioConfig.stream,
language_boost: audioConfig.language_boost,
output_format: audioConfig.output_format,
voice_setting: audioConfig.voiceSetting,
audio_setting: audioConfig.audioSetting,
},
stream: true,
});
// 3. 流式播放音频
console.log('\n=== 开始流式播放音频 ===');
// console.log('音频数据长度:', audioResult.data.audio.length);
await playAudioStream(audioResult.data.audio);
console.log('\n=== 大模型完整回答 ===');
console.log("llmResponse: ", llmResponse);
return {
userInput,
llmResponse: llmContent,
audioResult,
llmResponse,
audioQueue: audioQueue.map(item => ({ text: item.text, hasAudio: !!item.audioHex }))
};
}
// 处理音频播放队列
async function processAudioQueue() {
if (isProcessingQueue) return;
isProcessingQueue = true;
// while (audioQueue.length > 0) {
// const audioItem = audioQueue.shift();
// console.log('\n=== 播放队列中的音频 ===');
// console.log('文本:', audioItem.text);
// try {
// await playAudioStream(audioItem.audioHex);
// } catch (error) {
// console.error('播放音频失败:', error);
// }
// }
isProcessingQueue = false;
}
// 流式播放音频
async function playAudioStream(audioHex) {
if (isPlaying) {
console.log('音频正在播放中,跳过重复播放');
return;
}
console.log('=== 开始播放音频 ===');
console.log('音频数据长度:', audioHex.length);
isPlaying = true;
// 将hex转换为ArrayBuffer
const audioBuffer = hexToArrayBuffer(audioHex);
@@ -102,13 +136,11 @@ async function playAudioStream(audioHex) {
return new Promise((resolve) => {
source.onended = () => {
console.log('音频播放完成');
isPlaying = false;
resolve();
};
});
} catch (error) {
console.error('音频播放失败:', error);
isPlaying = false;
throw error;
}
}
@@ -175,4 +207,6 @@ async function playAudioStreamNode(audioHex) {
}
}
export { chatWithAudioStream, playAudioStream, playAudioStreamNode };
export { chatWithAudioStream, playAudioStream, playAudioStreamNode};
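
As a reference for callers, a minimal sketch of invoking the reworked pipeline (not part of the commit; the input text is illustrative, the return shape follows the code above):

```js
import { chatWithAudioStream } from './chat_with_audio.js';

(async () => {
  const result = await chatWithAudioStream('介绍一下这个项目');
  console.log('LLM reply:', result.llmResponse);
  // Each queue entry summarizes one punctuation-delimited segment that was sent to TTS.
  console.log('segments:', result.audioQueue);
})();
```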


@@ -16,11 +16,11 @@ export const config = {
audio: {
model: 'speech-02-hd',
voiceSetting: {
voice_id: 'yantu-qinggang',
voice_id: 'yantu-qinggang-2',
speed: 1,
vol: 1,
pitch: 0,
emotion: 'happy',
// emotion: 'happy',
},
audioSetting: {
sample_rate: 32000,

src/index - 副本.html Normal file

@@ -0,0 +1,139 @@
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>实时语音识别</title>
<style>
body {
font-family: Arial, sans-serif;
max-width: 800px;
margin: 0 auto;
padding: 20px;
background-color: #f5f5f5;
}
.container {
background: white;
padding: 30px;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
.controls {
text-align: center;
margin-bottom: 30px;
}
.record-btn {
background: #4CAF50;
color: white;
border: none;
padding: 15px 30px;
font-size: 18px;
border-radius: 50px;
cursor: pointer;
transition: all 0.3s;
}
.record-btn:hover {
background: #45a049;
}
.record-btn.recording {
background: #f44336;
animation: pulse 1s infinite;
}
@keyframes pulse {
0% { transform: scale(1); }
50% { transform: scale(1.05); }
100% { transform: scale(1); }
}
.status {
margin: 20px 0;
padding: 10px;
border-radius: 5px;
text-align: center;
font-weight: bold;
}
.status.connected {
background: #d4edda;
color: #155724;
border: 1px solid #c3e6cb;
}
.status.speaking {
background: #fff3cd;
color: #856404;
border: 1px solid #ffeaa7;
animation: speaking-pulse 0.5s infinite alternate;
}
.status.processing {
background: #cce7ff;
color: #004085;
border: 1px solid #99d6ff;
}
.status.disconnected {
background: #f8d7da;
color: #721c24;
border: 1px solid #f5c6cb;
}
@keyframes speaking-pulse {
0% { opacity: 0.7; }
100% { opacity: 1; }
}
.results {
max-height: 400px;
overflow-y: auto;
border: 1px solid #ddd;
border-radius: 5px;
padding: 15px;
background: #fafafa;
}
.result-item {
margin-bottom: 15px;
padding: 10px;
background: white;
border-radius: 5px;
border-left: 4px solid #4CAF50;
}
.timestamp {
font-size: 12px;
color: #666;
margin-bottom: 5px;
}
.text {
font-size: 16px;
line-height: 1.4;
}
.help {
margin-top: 20px;
padding: 15px;
background: #e3f2fd;
border-radius: 5px;
font-size: 14px;
color: #1565c0;
}
</style>
</head>
<body>
<div class="container">
<h1>实时语音识别</h1>
<div class="controls">
<button id="recordBtn" class="record-btn">开始录音</button>
</div>
<div id="status" class="status disconnected">未连接</div>
<div class="help">
<strong>使用说明:</strong><br>
1. 点击"开始录音"按钮开启麦克风<br>
2. 系统会自动检测您的语音,只有在检测到说话时才开始录音<br>
3. 说话结束后会自动发送音频进行识别<br>
4. 识别结果会显示在下方区域
</div>
<h3>识别结果:</h3>
<div id="results" class="results">
<!-- 识别结果将显示在这里 -->
</div>
</div>
<script src="new_app.js"></script>
</body>
</html>


@@ -1,5 +1,6 @@
// WebRTC 音视频通话应用
import { chatWithAudioStream } from './chat_with_audio.js';
import { AudioProcessor } from './audio_processor.js';
class WebRTCChat {
constructor() {
@@ -15,6 +16,30 @@ class WebRTCChat {
this.videoStreams = new Map(); // 存储不同视频的MediaStream
this.currentVideoStream = null;
// 初始化音频处理器
this.audioProcessor = new AudioProcessor({
onSpeechStart: () => {
this.voiceStatus.textContent = '检测到语音,开始录音...';
this.logMessage('检测到语音,开始录音...', 'info');
},
onSpeechEnd: () => {
// 语音结束回调
},
onRecognitionResult: (text) => {
// ASRTEXT = text;
this.voiceStatus.textContent = '识别完成';
this.logMessage(`语音识别结果: ${text}`, 'success');
this.handleVoiceInput(text);
},
onError: (error) => {
this.voiceStatus.textContent = '识别失败';
this.logMessage(error, 'error');
},
onStatusUpdate: (message, status) => {
this.voiceStatus.textContent = message;
}
});
this.initializeElements();
this.initializeSocket();
this.loadVideoMapping();
@@ -627,65 +652,34 @@ class WebRTCChat {
});
}
// 修改:使用音频处理器的语音录制功能
async startVoiceRecording() {
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
this.mediaRecorder = new MediaRecorder(stream);
this.audioChunks = [];
this.mediaRecorder.ondataavailable = (event) => {
this.audioChunks.push(event.data);
};
this.mediaRecorder.onstop = () => {
const audioBlob = new Blob(this.audioChunks, { type: 'audio/wav' });
this.processVoiceInput(audioBlob);
};
this.mediaRecorder.start();
this.isRecording = true;
const success = await this.audioProcessor.startRecording();
if (success) {
this.startVoiceButton.disabled = true;
this.stopVoiceButton.disabled = false;
this.voiceStatus.textContent = '正在录音...';
this.startVoiceButton.classList.add('recording');
this.logMessage('开始语音录制', 'info');
} catch (error) {
this.logMessage('无法访问麦克风: ' + error.message, 'error');
this.voiceStatus.textContent = '等待语音输入...';
this.logMessage('高级语音录制已启动', 'success');
} else {
this.voiceStatus.textContent = '录音启动失败';
}
}
// 修改:停止语音录制
stopVoiceRecording() {
if (this.mediaRecorder && this.isRecording) {
this.mediaRecorder.stop();
this.isRecording = false;
this.startVoiceButton.disabled = false;
this.stopVoiceButton.disabled = true;
this.voiceStatus.textContent = '点击开始语音输入';
this.startVoiceButton.classList.remove('recording');
this.logMessage('停止语音录制', 'info');
}
}
async processVoiceInput(audioBlob) {
// 这里可以集成语音识别API如Web Speech API或第三方服务
// 为了演示,我们使用一个简单的模拟识别
const mockText = this.simulateSpeechRecognition();
this.socket.emit('voice-input', {
audioData: audioBlob,
text: mockText
});
this.logMessage(`语音识别结果: ${mockText}`, 'info');
// 根据语音识别结果切换视频流
await this.handleVoiceInput(mockText);
this.audioProcessor.stopRecording();
this.startVoiceButton.disabled = false;
this.stopVoiceButton.disabled = true;
this.startVoiceButton.classList.remove('recording');
this.voiceStatus.textContent = '点击开始语音输入';
this.logMessage('语音录制已停止', 'info');
}
// 处理语音输入结果
async handleVoiceInput(text) {
// 根据文本查找对应视频
let videoFile = this.videoMapping['默认'] || this.defaultVideo;
@@ -705,8 +699,21 @@ class WebRTCChat {
type: 'voice',
text
});
// 调用大模型处理
try {
this.logMessage('正在处理语音输入,请稍候...', 'info');
const result = await chatWithAudioStream(text);
this.logMessage(`大模型回答: ${result.llmResponse}`, 'success');
} catch (error) {
this.logMessage(`处理语音输入失败: ${error.message}`, 'error');
console.error('chatWithAudioStream error:', error);
}
}
// 删除原有的简单音频处理方法
// processVoiceInput() 和 simulateSpeechRecognition() 方法已被移除
simulateSpeechRecognition() {
// 模拟语音识别,随机返回预设的文本
const texts = ['你好', '再见', '谢谢', 'hello', 'goodbye', 'thank you'];
@@ -776,4 +783,4 @@ class WebRTCChat {
// 页面加载完成后初始化应用
document.addEventListener('DOMContentLoaded', () => {
new WebRTCChat();
});
});


@@ -1,6 +1,6 @@
// 以流式方式请求LLM大模型接口并打印流式返回内容
async function requestLLMStream({ apiKey, model, messages }) {
async function requestLLMStream({ apiKey, model, messages, onSegment }) {
const response = await fetch('https://ark.cn-beijing.volces.com/api/v3/bots/chat/completions', {
method: 'POST',
headers: {
@@ -26,6 +26,10 @@ async function requestLLMStream({ apiKey, model, messages }) {
let done = false;
let buffer = '';
let content = '';
let pendingText = ''; // 待处理的文本片段
// 分段分隔符
const segmentDelimiters = /[,。:;!?,.:;!?]/;
while (!done) {
const { value, done: doneReading } = await reader.read();
@@ -47,6 +51,10 @@ async function requestLLMStream({ apiKey, model, messages }) {
if (jsonStr === '[DONE]') {
console.log('LLM SSE流结束');
// 处理最后的待处理文本
if (pendingText.trim() && onSegment) {
await onSegment(pendingText.trim());
}
continue;
}
@@ -55,7 +63,29 @@ async function requestLLMStream({ apiKey, model, messages }) {
if (obj.choices && obj.choices[0] && obj.choices[0].delta && obj.choices[0].delta.content) {
const deltaContent = obj.choices[0].delta.content;
content += deltaContent;
pendingText += deltaContent;
console.log('LLM内容片段:', deltaContent);
// 检查是否包含分段分隔符
if (segmentDelimiters.test(pendingText)) {
// 按分隔符分割文本
const segments = pendingText.split(segmentDelimiters);
// 处理完整的段落(除了最后一个,因为可能不完整)
for (let i = 0; i < segments.length - 1; i++) {
const segment = segments[i].trim();
if (segment && onSegment) {
// 找到对应的分隔符
const delimiterMatch = pendingText.match(segmentDelimiters);
const segmentWithDelimiter = segment + (delimiterMatch ? delimiterMatch[0] : '');
console.log('检测到完整段落:', segmentWithDelimiter);
await onSegment(segmentWithDelimiter);
}
}
// 保留最后一个不完整的段落
pendingText = segments[segments.length - 1] || '';
}
}
} catch (e) {
console.error('解析LLM SSE数据失败:', e, '原始数据:', jsonStr);
@@ -72,4 +102,4 @@ async function requestLLMStream({ apiKey, model, messages }) {
return content;
}
export { requestLLMStream };
export { requestLLMStream };
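
For context, a hedged sketch of the new onSegment hook added in this file. The API key, model id, and import path are placeholders, not values from this commit.

```js
import { requestLLMStream } from './llm_stream.js'; // assumed path: whichever file exports requestLLMStream

(async () => {
  const fullText = await requestLLMStream({
    apiKey: 'YOUR_API_KEY',        // placeholder
    model: 'YOUR_BOT_OR_MODEL_ID', // placeholder
    messages: [
      { role: 'system', content: 'You are a helpful assistant.' },
      { role: 'user', content: '你好' },
    ],
    // Invoked once per completed segment, i.e. text ending in ,。:;!? as split above.
    onSegment: async (segment) => {
      console.log('segment ready for TTS:', segment);
    },
  });
  console.log('full reply:', fullText);
})();
```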


@@ -1,5 +1,135 @@
// 以流式或非流式方式请求 minimaxi 大模型接口,并打印/返回内容
// 在文件顶部添加音频播放相关的变量和函数
let audioContext = null;
let audioQueue = []; // 音频队列
let isPlaying = false;
let isProcessingQueue = false; // 队列处理状态
let nextStartTime = 0; // 添加这行来声明 nextStartTime 变量
// 初始化音频上下文
function initAudioContext() {
if (!audioContext) {
audioContext = new (window.AudioContext || window.webkitAudioContext)();
}
return audioContext;
}
// 将hex字符串转换为ArrayBuffer
function hexToArrayBuffer(hex) {
const bytes = new Uint8Array(hex.length / 2);
for (let i = 0; i < hex.length; i += 2) {
bytes[i / 2] = parseInt(hex.substr(i, 2), 16);
}
return bytes.buffer;
}
// 将音频添加到队列(不等待播放)
async function addAudioToQueue(audioHex) {
if (!audioHex || audioHex.length === 0) return;
try {
const ctx = initAudioContext();
const audioBuffer = hexToArrayBuffer(audioHex);
const audioData = await ctx.decodeAudioData(audioBuffer);
// 将解码后的音频数据添加到队列
audioQueue.push({
audioData,
timestamp: Date.now()
});
console.log(`音频已添加到队列,队列长度: ${audioQueue.length}`);
// 启动队列处理器(如果还没有运行)
if (!isProcessingQueue) {
processAudioQueue();
}
} catch (error) {
console.error('音频解码失败:', error);
}
}
// 队列处理器 - 独立运行,按顺序播放音频
async function processAudioQueue() {
if (isProcessingQueue) return;
isProcessingQueue = true;
console.log('开始处理音频队列');
while (audioQueue.length > 0 || isPlaying) {
// 如果当前没有音频在播放,且队列中有音频
if (!isPlaying && audioQueue.length > 0) {
const audioItem = audioQueue.shift();
await playAudioData(audioItem.audioData);
} else {
// 等待一小段时间再检查
await new Promise(resolve => setTimeout(resolve, 50));
}
}
isProcessingQueue = false;
console.log('音频队列处理完成');
}
// 播放单个音频数据
function playAudioData(audioData) {
return new Promise((resolve) => {
try {
const ctx = initAudioContext();
const source = ctx.createBufferSource();
source.buffer = audioData;
source.connect(ctx.destination);
isPlaying = true;
source.onended = () => {
console.log('音频片段播放完成');
isPlaying = false;
resolve();
};
// 超时保护
setTimeout(() => {
if (isPlaying) {
console.log('音频播放超时,强制结束');
isPlaying = false;
resolve();
}
}, (audioData.duration + 0.5) * 1000);
source.start(0);
console.log(`开始播放音频片段,时长: ${audioData.duration}`);
} catch (error) {
console.error('播放音频失败:', error);
isPlaying = false;
resolve();
}
});
}
// 修改原来的playAudioChunk函数改为addAudioToQueue
const playAudioChunk = addAudioToQueue;
// 清空音频队列
function clearAudioQueue() {
audioQueue.length = 0;
console.log('音频队列已清空');
}
// 获取队列状态
function getQueueStatus() {
return {
queueLength: audioQueue.length,
isPlaying,
isProcessingQueue
};
}
// 移除waitForCurrentAudioToFinish函数不再需要
async function requestMinimaxi({ apiKey, groupId, body, stream = true }) {
const url = `https://api.minimaxi.com/v1/t2a_v2`;
const reqBody = { ...body, stream };
@@ -24,13 +154,19 @@ async function requestMinimaxi({ apiKey, groupId, body, stream = true }) {
console.log(JSON.stringify(result, null, 2));
return result;
} else {
// 流式解析每个chunk合并audio
// 流式解析每个chunk实时播放音频
const reader = response.body.getReader();
const decoder = new TextDecoder('utf-8');
let done = false;
let buffer = '';
let audioHex = '';
let lastFullResult = null;
// 重置播放状态
nextStartTime = 0;
if (audioContext) {
nextStartTime = audioContext.currentTime;
}
while (!done) {
const { value, done: doneReading } = await reader.read();
@@ -38,19 +174,16 @@ async function requestMinimaxi({ apiKey, groupId, body, stream = true }) {
if (value) {
const chunk = decoder.decode(value, { stream: true });
buffer += chunk;
// console.log('收到原始chunk:', chunk);
// 处理SSE格式的数据以\n分割
let lines = buffer.split('\n');
buffer = lines.pop(); // 最后一行可能是不完整的,留到下次
for (const line of lines) {
if (!line.trim()) continue;
// console.log('处理行:', line);
// 检查是否是SSE格式的数据行
if (line.startsWith('data:')) {
const jsonStr = line.substring(6); // 移除 'data: ' 前缀
// console.log('提取的JSON字符串:', jsonStr);
if (jsonStr.trim() === '[DONE]') {
console.log('SSE流结束');
@@ -59,17 +192,19 @@ async function requestMinimaxi({ apiKey, groupId, body, stream = true }) {
try {
const obj = JSON.parse(jsonStr);
// 流式解析每个chunk合并audio
if (obj.data && obj.data.audio) {
// 流式解析每个chunk实时播放音频
if (obj.data && obj.data.audio && obj.data.status === 1) {
console.log('收到音频数据片段!', obj.data.audio.length);
audioHex += obj.data.audio;
// 立即播放这个音频片段
await playAudioChunk(obj.data.audio);
}
// status=2为最后一个chunk记录完整结构
if (obj.data && obj.data.status === 2) {
lastFullResult = obj;
console.log('收到最终状态');
}
// 实时打印每个chunk
console.log('解析成功:', JSON.stringify(obj));
} catch (e) {
console.error('解析SSE数据失败:', e, '原始数据:', jsonStr);
}
@@ -83,7 +218,11 @@ async function requestMinimaxi({ apiKey, groupId, body, stream = true }) {
try {
const obj = JSON.parse(line);
if (obj.data && obj.data.audio) {
console.log('收到无data:音频数据!', obj.data.audio.length);
audioHex += obj.data.audio;
// 立即播放这个音频片段
await playAudioChunk(obj.data.audio);
}
if (obj.data && obj.data.status === 2) {
lastFullResult = obj;
@@ -109,4 +248,135 @@ async function requestMinimaxi({ apiKey, groupId, body, stream = true }) {
}
}
export { requestMinimaxi };
// 火山引擎TTS方法
async function requestVolcanTTS({
appId,
accessKey,
resourceId = 'volc.service_type.10029',
appKey = 'aGjiRDfUWi',
body,
stream = true
}) {
const url = 'https://openspeech.bytedance.com/api/v3/tts/unidirectional';
// 生成请求ID
const requestId = generateUUID();
const response = await fetch(url, {
method: 'POST',
headers: {
'X-Api-App-Id': appId,
'X-Api-Access-Key': accessKey,
'X-Api-Resource-Id': resourceId,
'X-Api-App-Key': appKey,
'X-Api-Request-Id': requestId,
'Content-Type': 'application/json',
'Accept': stream ? 'text/event-stream' : 'application/json',
'Cache-Control': 'no-cache',
},
body: JSON.stringify(body),
});
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}
if (!stream) {
// 非流式直接返回JSON
const result = await response.json();
console.log('火山引擎TTS非流式结果:', JSON.stringify(result, null, 2));
return result;
} else {
// 流式解析每个chunk合并audio
const reader = response.body.getReader();
const decoder = new TextDecoder('utf-8');
let done = false;
let buffer = '';
let audioBase64 = '';
let lastFullResult = null;
while (!done) {
const { value, done: doneReading } = await reader.read();
done = doneReading;
if (value) {
const chunk = decoder.decode(value, { stream: true });
buffer += chunk;
// 处理SSE格式的数据以\n分割
let lines = buffer.split('\n');
buffer = lines.pop(); // 最后一行可能是不完整的,留到下次
for (const line of lines) {
if (!line.trim()) continue;
// 检查是否是SSE格式的数据行
if (line.startsWith('data:')) {
const jsonStr = line.substring(6); // 移除 'data: ' 前缀
if (jsonStr.trim() === '[DONE]') {
console.log('火山引擎TTS流结束');
continue;
}
try {
const obj = JSON.parse(jsonStr);
// 流式解析每个chunk合并audio base64数据
if (obj.data) {
audioBase64 += obj.data;
lastFullResult = obj;
}
// 实时打印每个chunk
console.log('火山引擎TTS解析成功:', JSON.stringify(obj));
} catch (e) {
console.error('解析火山引擎TTS数据失败:', e, '原始数据:', jsonStr);
}
} else if (line.startsWith('event: ') || line.startsWith('id: ') || line.startsWith('retry: ')) {
// 忽略SSE的其他字段
console.log('忽略SSE字段:', line);
continue;
} else if (line.trim() && !line.startsWith('data:')) {
// 尝试直接解析兼容非SSE格式
try {
const obj = JSON.parse(line);
if (obj.data) {
audioBase64 += obj.data;
lastFullResult = obj;
}
console.log('火山引擎TTS直接解析成功:', JSON.stringify(obj));
} catch (e) {
console.error('解析火山引擎TTS chunk失败:', e, line);
}
}
}
}
}
// 合成最终结构
console.log('火山引擎TTS音频数据总长度:', audioBase64.length);
if (lastFullResult) {
// 更新最终结果的音频数据
lastFullResult.data = audioBase64;
console.log('火山引擎TTS最终合成结果:', JSON.stringify(lastFullResult, null, 2));
return lastFullResult;
} else {
// 没有完整结构返回合成的audio
return {
code: 0,
message: '',
data: audioBase64
};
}
}
}
// 生成UUID的辅助函数
function generateUUID() {
return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) {
const r = Math.random() * 16 | 0;
const v = c === 'x' ? r : (r & 0x3 | 0x8);
return v.toString(16);
});
}
export { requestMinimaxi, requestVolcanTTS };
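
A hedged, non-streaming usage sketch of requestMinimaxi. The body fields mirror those used in chat_with_audio.js; the credentials and import path are placeholders, not values from this commit.

```js
import { requestMinimaxi } from './minimaxi.js'; // assumed path: whichever file exports requestMinimaxi

(async () => {
  const result = await requestMinimaxi({
    apiKey: 'YOUR_MINIMAXI_API_KEY', // placeholder
    groupId: 'YOUR_GROUP_ID',        // placeholder
    stream: false,                   // with stream=true the chunks above are queued and played as they arrive
    body: {
      model: 'speech-02-hd',
      text: '你好,欢迎使用实时语音对话。',
      voice_setting: { voice_id: 'yantu-qinggang-2', speed: 1, vol: 1, pitch: 0 },
      audio_setting: { sample_rate: 32000 },
    },
  });
  console.log(JSON.stringify(result, null, 2));
})();
```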

src/new_app.js Normal file

@@ -0,0 +1,346 @@
let ASRTEXT = ''
class HttpASRRecognizer {
constructor() {
this.mediaRecorder = null;
this.audioContext = null;
this.isRecording = false;
this.audioChunks = [];
// VAD相关属性
this.isSpeaking = false;
this.silenceThreshold = 0.01;
this.silenceTimeout = 1000;
this.minSpeechDuration = 300;
this.silenceTimer = null;
this.speechStartTime = null;
this.audioBuffer = [];
// API配置
this.apiConfig = {
url: 'https://openspeech.bytedance.com/api/v3/auc/bigmodel/recognize/flash',
headers: {
'X-Api-App-Key': '1988591469',
'X-Api-Access-Key': 'mdEyhgZ59on1-NK3GXWAp3L4iLldSG0r',
'X-Api-Resource-Id': 'volc.bigasr.auc_turbo',
'X-Api-Request-Id': this.generateUUID(),
'X-Api-Sequence': '-1',
'Content-Type': 'application/json'
}
};
this.recordBtn = document.getElementById('startVoiceButton');
this.statusDiv = document.getElementById('status');
this.resultsDiv = document.getElementById('results');
this.initEventListeners();
}
initEventListeners() {
this.recordBtn.addEventListener('click', () => {
if (this.isRecording) {
this.stopRecording();
} else {
this.startRecording();
}
});
}
// 生成UUID
generateUUID() {
return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) {
const r = Math.random() * 16 | 0;
const v = c == 'x' ? r : (r & 0x3 | 0x8);
return v.toString(16);
});
}
// 计算音频能量(音量)
calculateAudioLevel(audioData) {
let sum = 0;
for (let i = 0; i < audioData.length; i++) {
sum += audioData[i] * audioData[i];
}
return Math.sqrt(sum / audioData.length);
}
// 语音活动检测
detectVoiceActivity(audioData) {
const audioLevel = this.calculateAudioLevel(audioData);
const currentTime = Date.now();
if (audioLevel > this.silenceThreshold) {
if (!this.isSpeaking) {
this.isSpeaking = true;
this.speechStartTime = currentTime;
this.audioBuffer = [];
this.updateStatus('检测到语音,开始录音...', 'speaking');
console.log('开始说话');
}
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
return true;
} else {
if (this.isSpeaking && !this.silenceTimer) {
this.silenceTimer = setTimeout(() => {
this.onSpeechEnd();
}, this.silenceTimeout);
}
return this.isSpeaking;
}
}
// 语音结束处理
async onSpeechEnd() {
if (this.isSpeaking) {
const speechDuration = Date.now() - this.speechStartTime;
if (speechDuration >= this.minSpeechDuration) {
console.log(`语音结束,时长: ${speechDuration}ms`);
await this.processAudioBuffer();
// this.updateStatus('语音识别中...', 'processing');
console.log('语音识别中')
} else {
console.log('说话时长太短,忽略');
// this.updateStatus('等待语音输入...', 'ready');
console.log('等待语音输入...')
}
this.isSpeaking = false;
this.speechStartTime = null;
this.audioBuffer = [];
}
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
}
// 处理音频缓冲区并发送到API
async processAudioBuffer() {
if (this.audioBuffer.length === 0) {
return;
}
try {
// 合并所有音频数据
const totalLength = this.audioBuffer.reduce((sum, buffer) => sum + buffer.length, 0);
const combinedBuffer = new Float32Array(totalLength);
let offset = 0;
for (const buffer of this.audioBuffer) {
combinedBuffer.set(buffer, offset);
offset += buffer.length;
}
// 转换为WAV格式并编码为base64
const wavBuffer = this.encodeWAV(combinedBuffer, 16000);
const base64Audio = this.arrayBufferToBase64(wavBuffer);
// 调用ASR API
await this.callASRAPI(base64Audio);
} catch (error) {
console.error('处理音频数据失败:', error);
this.updateStatus('识别失败', 'error');
}
}
// 调用ASR API
async callASRAPI(base64AudioData) {
try {
const requestBody = {
user: {
uid: "1988591469"
},
audio: {
data: base64AudioData
},
request: {
model_name: "bigmodel"
}
};
const response = await fetch(this.apiConfig.url, {
method: 'POST',
headers: this.apiConfig.headers,
body: JSON.stringify(requestBody)
});
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}
const result = await response.json();
this.handleASRResponse(result);
} catch (error) {
console.error('ASR API调用失败:', error);
this.updateStatus('API调用失败', 'error');
}
}
// 处理ASR响应
handleASRResponse(response) {
console.log('ASR响应:', response);
if (response && response.data && response.data.result) {
ASRTEXT = response.data.result;
// this.displayResult(text);
// this.updateStatus('识别完成', 'completed');
console.log('识别完成')
} else {
console.log('未识别到文字');
// this.updateStatus('未识别到文字', 'ready');
}
}
// 显示识别结果
displayResult(text) {
const resultElement = document.createElement('div');
resultElement.className = 'result-item';
resultElement.innerHTML = `
<span class="timestamp">${new Date().toLocaleTimeString()}</span>
<span class="text">${text}</span>
`;
this.resultsDiv.appendChild(resultElement);
this.resultsDiv.scrollTop = this.resultsDiv.scrollHeight;
}
// 更新状态显示
updateStatus(message, status) {
this.statusDiv.textContent = message;
this.statusDiv.className = `status ${status}`;
}
// 编码WAV格式
encodeWAV(samples, sampleRate) {
const length = samples.length;
const buffer = new ArrayBuffer(44 + length * 2);
const view = new DataView(buffer);
// WAV文件头
const writeString = (offset, string) => {
for (let i = 0; i < string.length; i++) {
view.setUint8(offset + i, string.charCodeAt(i));
}
};
writeString(0, 'RIFF');
view.setUint32(4, 36 + length * 2, true);
writeString(8, 'WAVE');
writeString(12, 'fmt ');
view.setUint32(16, 16, true);
view.setUint16(20, 1, true);
view.setUint16(22, 1, true);
view.setUint32(24, sampleRate, true);
view.setUint32(28, sampleRate * 2, true);
view.setUint16(32, 2, true);
view.setUint16(34, 16, true);
writeString(36, 'data');
view.setUint32(40, length * 2, true);
// 写入音频数据
let offset = 44;
for (let i = 0; i < length; i++) {
const sample = Math.max(-1, Math.min(1, samples[i]));
view.setInt16(offset, sample * 0x7FFF, true);
offset += 2;
}
return buffer;
}
// ArrayBuffer转Base64
arrayBufferToBase64(buffer) {
let binary = '';
const bytes = new Uint8Array(buffer);
for (let i = 0; i < bytes.byteLength; i++) {
binary += String.fromCharCode(bytes[i]);
}
return btoa(binary);
}
async startRecording() {
try {
const stream = await navigator.mediaDevices.getUserMedia({
audio: {
sampleRate: 16000,
channelCount: 1,
echoCancellation: true,
noiseSuppression: true
}
});
this.audioContext = new (window.AudioContext || window.webkitAudioContext)({
sampleRate: 16000
});
const source = this.audioContext.createMediaStreamSource(stream);
const processor = this.audioContext.createScriptProcessor(4096, 1, 1);
processor.onaudioprocess = (event) => {
const inputBuffer = event.inputBuffer;
const inputData = inputBuffer.getChannelData(0);
// 语音活动检测
if (this.detectVoiceActivity(inputData)) {
// 如果检测到语音活动,缓存音频数据
this.audioBuffer.push(new Float32Array(inputData));
}
};
source.connect(processor);
processor.connect(this.audioContext.destination);
this.isRecording = true;
this.recordBtn.textContent = '停止录音';
this.recordBtn.className = 'btn recording';
// this.updateStatus('等待语音输入...', 'ready');
} catch (error) {
console.error('启动录音失败:', error);
// this.updateStatus('录音启动失败', 'error');
}
}
stopRecording() {
if (this.audioContext) {
this.audioContext.close();
this.audioContext = null;
}
if (this.silenceTimer) {
clearTimeout(this.silenceTimer);
this.silenceTimer = null;
}
// 如果正在说话,处理最后的音频
if (this.isSpeaking) {
this.onSpeechEnd();
}
this.isRecording = false;
this.isSpeaking = false;
this.audioBuffer = [];
this.recordBtn.textContent = '开始录音';
this.recordBtn.className = 'btn';
console.log('录音已停止');
// this.updateStatus('录音已停止', 'stopped');
}
}
// 初始化应用
document.addEventListener('DOMContentLoaded', () => {
const asrRecognizer = new HttpASRRecognizer();
console.log('HTTP ASR识别器已初始化');
});
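
For quick manual testing, a hedged sketch of the flash-ASR HTTP call both recognizers make. The request shape follows the code above; the credentials are placeholders rather than the keys hard-coded in this commit.

```js
// Send a base64-encoded 16 kHz mono WAV to the Volcengine flash ASR endpoint.
async function recognizeWav(base64Wav) {
  const response = await fetch('https://openspeech.bytedance.com/api/v3/auc/bigmodel/recognize/flash', {
    method: 'POST',
    headers: {
      'X-Api-App-Key': 'YOUR_APP_KEY',       // placeholder
      'X-Api-Access-Key': 'YOUR_ACCESS_KEY', // placeholder
      'X-Api-Resource-Id': 'volc.bigasr.auc_turbo',
      'X-Api-Request-Id': crypto.randomUUID(),
      'X-Api-Sequence': '-1',
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      user: { uid: 'YOUR_UID' },             // placeholder
      audio: { data: base64Wav },
      request: { model_name: 'bigmodel' },
    }),
  });
  if (!response.ok) throw new Error(`HTTP error! status: ${response.status}`);
  // HttpASRRecognizer above reads the recognized text from data.result of this JSON.
  return response.json();
}
```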