feat: 增加 SILK 语音转 WAV 功能

新增 silk-wasm 依赖,实现语音附件检测与 SILK/AMR 格式转 WAV,并在网关中集成语音转换处理流程。
This commit is contained in:
sliverp
2026-02-09 18:25:24 +08:00
parent 26827d3e07
commit b529f71c7c
4 changed files with 211 additions and 7 deletions

10
package-lock.json generated
View File

@@ -11,6 +11,7 @@
"ws"
],
"dependencies": {
"silk-wasm": "^3.7.1",
"ws": "^8.18.0"
},
"bin": {
@@ -8937,6 +8938,15 @@
"url": "https://github.com/sponsors/isaacs"
}
},
"node_modules/silk-wasm": {
"version": "3.7.1",
"resolved": "https://registry.npmjs.org/silk-wasm/-/silk-wasm-3.7.1.tgz",
"integrity": "sha512-mXPwLRtZxrYV3TZx41jMAeKc80wvmyrcXIcs8HctFxK15Ahz2OJQENYhNgEPeCEOdI6Mbx1NxQsqxzwc3DKerw==",
"license": "MIT",
"engines": {
"node": ">=16.11.0"
}
},
"node_modules/simple-git": {
"version": "3.30.0",
"resolved": "https://registry.npmjs.org/simple-git/-/simple-git-3.30.0.tgz",

View File

@@ -19,13 +19,19 @@
"moltbot.plugin.json"
],
"clawdbot": {
"extensions": ["./index.ts"]
"extensions": [
"./index.ts"
]
},
"moltbot": {
"extensions": ["./index.ts"]
"extensions": [
"./index.ts"
]
},
"openclaw": {
"extensions": ["./index.ts"]
"extensions": [
"./index.ts"
]
},
"scripts": {
"build": "tsc || true",
@@ -33,9 +39,11 @@
"prepack": "npm install --omit=dev"
},
"dependencies": {
"silk-wasm": "^3.7.1",
"ws": "^8.18.0"
},
"bundledDependencies": [
"silk-wasm",
"ws"
],
"devDependencies": {
@@ -48,5 +56,9 @@
"moltbot": "*",
"openclaw": "*"
},
"homepage": "https://github.com/sliverp/qqbot"
"homepage": "https://github.com/sliverp/qqbot",
"bundleDependencies": [
"silk-wasm",
"ws"
]
}

View File

@@ -9,6 +9,7 @@ import { getQQBotRuntime } from "./runtime.js";
import { startImageServer, isImageServerRunning, downloadFile, type ImageServerConfig } from "./image-server.js";
import { getImageSize, formatQQBotMarkdownImage, hasQQBotImageSize, DEFAULT_IMAGE_SIZE } from "./utils/image-size.js";
import { parseQQBotPayload, encodePayloadForCron, isCronReminderPayload, isMediaPayload, type CronReminderPayload, type MediaPayload } from "./utils/payload.js";
import { convertSilkToWav, isVoiceAttachment, formatDuration } from "./utils/audio-convert.js";
// QQ Bot intents - 按权限级别分组
const INTENTS = {
@@ -539,9 +540,9 @@ openclaw cron add \\
const downloadDir = path.join(process.env.HOME || "/home/ubuntu", "clawd", "downloads");
if (event.attachments?.length) {
// ============ 接收图片的自然语言描述生成 ============
// 根据需求 4将图片信息转换为自然语言描述便于 AI 理解
// ============ 接收附件描述生成(图片 / 语音 / 其他) ============
const imageDescriptions: string[] = [];
const voiceDescriptions: string[] = [];
const otherAttachments: string[] = [];
for (const att of event.attachments) {
@@ -563,6 +564,46 @@ openclaw cron add \\
- 发送时间:${timestamp}
请根据图片内容进行回复。`);
} else if (isVoiceAttachment(att)) {
// ============ 语音消息处理(SILK → WAV) ============
log?.info(`[qqbot:${account.accountId}] Voice attachment detected: ${att.filename}, converting SILK to WAV...`);
try {
const result = await convertSilkToWav(localPath, downloadDir);
if (result) {
const durationStr = formatDuration(result.duration);
log?.info(`[qqbot:${account.accountId}] Voice converted: ${result.wavPath} (duration: ${durationStr})`);
const timestamp = new Date().toLocaleString("zh-CN", { timeZone: "Asia/Shanghai" });
voiceDescriptions.push(`
用户发送了一条语音消息:
- 语音文件:${result.wavPath}
- 语音时长:${durationStr}
- 原始文件:${localPath}
- 消息ID${event.messageId}
- 发送时间:${timestamp}
请使用语音文件进行语音转文字处理后回复用户。`);
} else {
// convertSilkToWav returned null (input file missing or not SILK data); keep the original file
log?.info(`[qqbot:${account.accountId}] Voice file is not SILK format, keeping original: ${localPath}`);
voiceDescriptions.push(`
用户发送了一条语音消息非SILK格式无法转换
- 语音文件:${localPath}
- 原始格式:${att.filename || "unknown"}
- 消息ID${event.messageId}
请告知用户该语音格式暂不支持解析。`);
}
} catch (convertErr) {
log?.error(`[qqbot:${account.accountId}] Voice conversion failed: ${convertErr}`);
voiceDescriptions.push(`
用户发送了一条语音消息(转换失败):
- 原始文件:${localPath}
- 错误信息:${convertErr}
- 消息ID${event.messageId}
请告知用户语音处理出现问题。`);
}
} else {
otherAttachments.push(`[附件: ${localPath}]`);
}
@@ -591,10 +632,13 @@ openclaw cron add \\
}
}
// 组合附件信息:先图片描述,后其他附件
// 组合附件信息:先图片描述,后语音描述,后其他附件
if (imageDescriptions.length > 0) {
attachmentInfo += "\n" + imageDescriptions.join("\n");
}
if (voiceDescriptions.length > 0) {
attachmentInfo += "\n" + voiceDescriptions.join("\n");
}
if (otherAttachments.length > 0) {
attachmentInfo += "\n" + otherAttachments.join("\n");
}

138
src/utils/audio-convert.ts Normal file
View File

@@ -0,0 +1,138 @@
import * as fs from "node:fs";
import * as path from "node:path";
import { decode, isSilk } from "silk-wasm";
/**
 * Report whether the file at `filePath` is recognised as SILK audio.
 *
 * The whole file is read synchronously and handed to silk-wasm's `isSilk`
 * for detection. Any error raised while reading or checking (missing file,
 * permission problem, ...) is treated as "not SILK".
 *
 * @param filePath Path of the file to inspect.
 * @returns The result of `isSilk` on the file contents, or `false` on any error.
 */
function isSilkFile(filePath: string): boolean {
  let looksLikeSilk = false;
  try {
    const { buffer, byteOffset, byteLength } = fs.readFileSync(filePath);
    // Wrap the same bytes in a plain Uint8Array view without copying.
    looksLikeSilk = isSilk(new Uint8Array(buffer, byteOffset, byteLength));
  } catch {
    // Best effort: a file that cannot be read or checked is reported as non-SILK.
  }
  return looksLikeSilk;
}
/**
 * Wrap raw PCM samples in a 44-byte RIFF/WAVE header.
 *
 * The header declares format tag 1 (PCM) and is followed by the sample bytes
 * copied verbatim.
 *
 * @param pcmData       Raw PCM bytes to embed as the `data` chunk.
 * @param sampleRate    Sample rate in Hz written to the header.
 * @param channels      Channel count written to the header (default 1).
 * @param bitsPerSample Bit depth written to the header (default 16).
 * @returns A new Buffer holding the header followed by the PCM bytes.
 */
function pcmToWav(pcmData: Uint8Array, sampleRate: number, channels: number = 1, bitsPerSample: number = 16): Buffer {
  const HEADER_BYTES = 44;
  const bytesPerSample = bitsPerSample / 8;
  const payloadBytes = pcmData.length;
  const totalBytes = HEADER_BYTES + payloadBytes;

  const wav = Buffer.alloc(totalBytes);
  // Running write cursor: every write returns how far it advanced.
  let cursor = 0;

  // RIFF container header; the size field excludes the 8 bytes of "RIFF" + size.
  cursor += wav.write("RIFF", cursor);
  cursor = wav.writeUInt32LE(totalBytes - 8, cursor);
  cursor += wav.write("WAVE", cursor);

  // "fmt " chunk: 16-byte body describing the sample format.
  cursor += wav.write("fmt ", cursor);
  cursor = wav.writeUInt32LE(16, cursor);
  cursor = wav.writeUInt16LE(1, cursor); // format tag 1 = PCM
  cursor = wav.writeUInt16LE(channels, cursor);
  cursor = wav.writeUInt32LE(sampleRate, cursor);
  cursor = wav.writeUInt32LE(sampleRate * channels * bytesPerSample, cursor); // byte rate
  cursor = wav.writeUInt16LE(channels * bytesPerSample, cursor); // block align
  cursor = wav.writeUInt16LE(bitsPerSample, cursor);

  // "data" chunk header followed by the samples.
  cursor += wav.write("data", cursor);
  cursor = wav.writeUInt32LE(payloadBytes, cursor);
  wav.set(pcmData, cursor);

  return wav;
}
/**
 * Drop a leading "#!AMR\n" marker (6 bytes) from a voice buffer, if present.
 *
 * The marker is only removed when at least one byte follows it; a buffer
 * that consists of the marker alone is returned untouched.
 *
 * @param buf Raw file contents.
 * @returns A view of `buf` starting after the marker, or `buf` itself when
 *          no marker is found.
 */
function stripAmrHeader(buf: Buffer): Buffer {
  const marker = "#!AMR\n";
  // latin1 maps bytes 1:1 to characters, so this is an exact byte comparison.
  const startsWithMarker =
    buf.length > marker.length && buf.toString("latin1", 0, marker.length) === marker;
  return startsWithMarker ? buf.subarray(marker.length) : buf;
}
/**
 * Convert a SILK voice file (optionally prefixed with "#!AMR\n") into a WAV file.
 *
 * Steps: read the input, strip a leading "#!AMR\n" marker if present, verify
 * the remainder with silk-wasm's `isSilk`, decode it to PCM, wrap the PCM in
 * a WAV header and write `<input base name>.wav` into the output directory.
 *
 * All file access is asynchronous so the caller's event loop is not blocked
 * while files are read or written.
 *
 * @param inputPath  Path of the voice file to convert.
 * @param outputDir  Directory for the WAV file; falls back to the directory of
 *                   `inputPath` when omitted or empty. Created if it does not exist.
 * @param sampleRate Sample rate in Hz passed to the decoder and written to the
 *                   WAV header. Defaults to 24000.
 * @returns The written WAV path and the duration reported by the decoder
 *          (the gateway passes it to `formatDuration`, which expects
 *          milliseconds), or `null` when the input file does not exist or its
 *          contents are not SILK data.
 * @throws Decoder errors and file-system errors other than a missing input
 *         file are propagated to the caller.
 */
export async function convertSilkToWav(
  inputPath: string,
  outputDir?: string,
  sampleRate: number = 24000,
): Promise<{ wavPath: string; duration: number } | null> {
  // Read directly instead of checking existence first: this avoids a
  // check-then-use race and a second file-system round trip.
  let fileBuf: Buffer;
  try {
    fileBuf = await fs.promises.readFile(inputPath);
  } catch (err: unknown) {
    const code = (err as NodeJS.ErrnoException | null)?.code;
    if (code === "ENOENT" || code === "ENOTDIR") {
      return null; // input file does not exist
    }
    throw err;
  }

  // Remove the optional AMR marker, then expose the bytes as a plain
  // Uint8Array view (no copy) for silk-wasm.
  const strippedBuf = stripAmrHeader(fileBuf);
  const rawData = new Uint8Array(strippedBuf.buffer, strippedBuf.byteOffset, strippedBuf.byteLength);

  if (!isSilk(rawData)) {
    return null;
  }

  // SILK -> PCM -> WAV
  const decoded = await decode(rawData, sampleRate);
  const wavBuffer = pcmToWav(decoded.data, sampleRate);

  // `recursive: true` makes mkdir a no-op when the directory already exists.
  const dir = outputDir || path.dirname(inputPath);
  await fs.promises.mkdir(dir, { recursive: true });

  const baseName = path.basename(inputPath, path.extname(inputPath));
  const wavPath = path.join(dir, `${baseName}.wav`);
  await fs.promises.writeFile(wavPath, wavBuffer);

  return { wavPath, duration: decoded.duration };
}
/**
 * Decide whether an attachment should be treated as a voice message.
 *
 * An attachment qualifies when its `content_type` is exactly "voice" or
 * starts with "audio/", or when its file name ends in `.amr`, `.silk` or
 * `.slk` (case-insensitive).
 *
 * @param att Attachment metadata; both fields are optional.
 * @returns `true` for voice attachments, `false` otherwise.
 */
export function isVoiceAttachment(att: { content_type?: string; filename?: string }): boolean {
  const { content_type: contentType, filename } = att;

  if (contentType === "voice") {
    return true;
  }
  if (contentType?.startsWith("audio/")) {
    return true;
  }
  if (!filename) {
    return false;
  }

  switch (path.extname(filename).toLowerCase()) {
    case ".amr":
    case ".silk":
    case ".slk":
      return true;
    default:
      return false;
  }
}
/**
 * Format a duration given in milliseconds as a short Chinese string.
 *
 * The value is rounded to whole seconds. Results look like "5秒" (under a
 * minute), "1分5秒" (minutes plus seconds) or "2分钟" (whole minutes).
 * Every number carries its unit so that, for example, 1 min 5 s cannot be
 * misread as "15".
 *
 * @param durationMs Duration in milliseconds. Non-finite or negative values
 *                   are treated as 0.
 * @returns The formatted duration.
 */
export function formatDuration(durationMs: number): string {
  // Guard against NaN/Infinity/negative input so the output is always sensible.
  const totalSeconds = Number.isFinite(durationMs) ? Math.max(0, Math.round(durationMs / 1000)) : 0;
  if (totalSeconds < 60) {
    return `${totalSeconds}秒`;
  }
  const minutes = Math.floor(totalSeconds / 60);
  const remainSeconds = totalSeconds % 60;
  return remainSeconds > 0 ? `${minutes}分${remainSeconds}秒` : `${minutes}分钟`;
}