commit 4d512d7d7bb099284c4ae1746f41da5a8411f631
parent 2cdace311ec8f454534f1dfd6ff3a0359125bd1a
Author: Michael Percival <m@michaelpercival.xyz>
Date: Tue, 14 Apr 2026 14:06:12 +0100
Add voice message transcription via faster-whisper
Telegram voice messages are downloaded, transcribed with faster-whisper,
and passed to Claude as plain text. Model is configurable via WHISPER_MODEL
(default: small).
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Diffstat:
5 files changed, 104 insertions(+), 7 deletions(-)
diff --git a/.env.example b/.env.example
@@ -1,3 +1,4 @@
TELEGRAM_BOT_TOKEN=your_telegram_bot_token_here
ANTHROPIC_API_KEY=your_anthropic_api_key_here
ALLOWED_USER_IDS=123456789
+WHISPER_MODEL=small
diff --git a/scripts/setup.sh b/scripts/setup.sh
@@ -34,9 +34,20 @@ echo "Installing npm dependencies..."
npm install
echo "✓ Dependencies installed"
-# Create data directory (gitignored)
-mkdir -p data/conversations
-echo "✓ data/conversations directory ready"
+# Create data directories (gitignored)
+mkdir -p data/conversations data/tmp
+echo "✓ data directories ready"
+
+# Check Python + faster-whisper for voice transcription.
+# This is advisory only: setup continues either way, and a warning is printed
+# when python3 or the faster_whisper module cannot be found.
+# The install hint uses "python3 -m pip" so the package is installed for the
+# same interpreter that is probed here, rather than whatever "pip" is on PATH.
+if ! command -v python3 &>/dev/null; then
+  echo "WARNING: python3 not found — voice messages will not be transcribed."
+  echo " Install Python 3 and run: python3 -m pip install faster-whisper"
+elif ! python3 -c "import faster_whisper" &>/dev/null; then
+  echo "WARNING: faster-whisper not installed — voice messages will not be transcribed."
+  echo " Run: python3 -m pip install faster-whisper"
+else
+  echo "✓ faster-whisper available"
+fi
# Check SSH key for git push
if ! ssh-add -l &>/dev/null && [ -z "$SSH_AUTH_SOCK" ]; then
diff --git a/scripts/transcribe.py b/scripts/transcribe.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+"""Transcribe an audio file using faster-whisper. Prints transcription to stdout."""
+
+import sys
+import os
+
+def main():
+    """Transcribe the audio file named on the command line and print the text.
+
+    Reads the audio path from ``sys.argv[1]`` and the model size from the
+    ``WHISPER_MODEL`` environment variable (falling back to ``"small"`` when
+    the variable is unset or empty). The stripped text of every segment is
+    joined with single spaces and written to stdout.
+
+    Exits with status 1, after printing a message to stderr, when no audio
+    path is given or the path does not refer to an existing file.
+    """
+    if len(sys.argv) < 2:
+        print("Usage: transcribe.py <audio_file>", file=sys.stderr)
+        sys.exit(1)
+
+    audio_path = sys.argv[1]
+    if not os.path.isfile(audio_path):
+        print(f"transcribe.py: audio file not found: {audio_path}", file=sys.stderr)
+        sys.exit(1)
+
+    # "or" instead of a .get() default: an empty WHISPER_MODEL= entry should
+    # fall back to "small" as well, not be passed on as an empty model name.
+    model_size = os.environ.get("WHISPER_MODEL") or "small"
+
+    # Emit UTF-8 regardless of the locale so non-ASCII transcriptions cannot
+    # fail with UnicodeEncodeError when stdout is a pipe.
+    if hasattr(sys.stdout, "reconfigure"):
+        sys.stdout.reconfigure(encoding="utf-8")
+
+    # Imported here rather than at module top, so the argument checks above
+    # run before the import is attempted.
+    from faster_whisper import WhisperModel
+    model = WhisperModel(model_size, device="cpu", compute_type="int8")
+    segments, _ = model.transcribe(audio_path)
+    print(" ".join(seg.text.strip() for seg in segments))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/bot.js b/src/bot.js
@@ -3,6 +3,7 @@ const claude = require('./claude');
const todo = require('./todo');
const git = require('./git');
const state = require('./state');
+const { transcribeVoice } = require('./transcribe');
const ALLOWED_USER_IDS = process.env.ALLOWED_USER_IDS
? process.env.ALLOWED_USER_IDS.split(',').map(id => parseInt(id.trim(), 10))
@@ -19,9 +20,9 @@ function createBot(token) {
bot.on('message', async (msg) => {
const userId = msg.from.id;
const chatId = msg.chat.id;
- const text = msg.text;
+ let text = msg.text;
- if (!text) return; // ignore stickers, photos, etc.
+ if (!text && !msg.voice) return; // ignore stickers, photos, etc.
if (!isAllowed(userId)) {
await bot.sendMessage(chatId, 'Unauthorized.');
@@ -29,10 +30,14 @@ function createBot(token) {
}
try {
- // Show typing indicator while Claude thinks
await bot.sendChatAction(chatId, 'typing');
- console.log(`[user:${userId}] ${text}`);
+ if (msg.voice) {
+ text = await transcribeVoice(bot, msg.voice.file_id);
+ console.log(`[user:${userId}] (voice) ${text}`);
+ } else {
+ console.log(`[user:${userId}] ${text}`);
+ }
const history = state.load(userId);
const currentTodo = todo.read();
diff --git a/src/transcribe.js b/src/transcribe.js
@@ -0,0 +1,59 @@
+const { spawn } = require('child_process');
+const fs = require('fs');
+const path = require('path');
+const https = require('https');
+const http = require('http');
+
+const SCRIPT = path.join(__dirname, '..', 'scripts', 'transcribe.py');
+const TMP_DIR = path.join(__dirname, '..', 'data', 'tmp');
+
+/**
+ * Download `url` to the local file `dest`.
+ *
+ * The parent directory of `dest` is created if needed. The destination file
+ * is only opened once the server has answered with HTTP 200, and a partially
+ * written file is removed again when the transfer fails.
+ *
+ * @param {string} url - http:// or https:// URL to fetch.
+ * @param {string} dest - Path of the file to write.
+ * @returns {Promise<void>} Resolves once the file has been written and closed.
+ *   Rejects on a request error, a non-200 response, a response stream error,
+ *   or a file write error.
+ */
+function downloadFile(url, dest) {
+  return new Promise((resolve, reject) => {
+    fs.mkdirSync(path.dirname(dest), { recursive: true });
+    const client = url.startsWith('https') ? https : http;
+
+    const request = client.get(url, (res) => {
+      if (res.statusCode !== 200) {
+        // Drain the body so the socket is released, and never save an error
+        // page under the audio file name.
+        res.resume();
+        reject(new Error(`Download failed: HTTP ${res.statusCode}`));
+        return;
+      }
+
+      const file = fs.createWriteStream(dest);
+      let failed = false;
+
+      const fail = (err) => {
+        if (failed) return;
+        failed = true;
+        // Remove the partial file only after the stream has closed, so the
+        // unlink cannot run before the file has actually been created.
+        file.once('close', () => fs.unlink(dest, () => {}));
+        file.destroy();
+        res.destroy();
+        reject(err);
+      };
+
+      res.on('error', fail);
+      file.on('error', fail);
+      file.on('finish', () => {
+        file.close((err) => (err ? fail(err) : resolve()));
+      });
+      res.pipe(file);
+    });
+
+    // Connection-level failures (DNS, refused connection, reset socket).
+    request.on('error', reject);
+  });
+}
+
+/**
+ * Run scripts/transcribe.py on an audio file and collect what it prints.
+ *
+ * @param {string} audioPath - Path of the audio file handed to the script.
+ * @returns {Promise<string>} Resolves with the script's trimmed stdout.
+ *   Rejects when python3 cannot be started, when the script exits with a
+ *   non-zero code, or when it is terminated by a signal; the error message
+ *   includes the script's trimmed stderr in the latter two cases.
+ */
+function runWhisper(audioPath) {
+  return new Promise((resolve, reject) => {
+    // The child receives the current environment, which is how WHISPER_MODEL
+    // reaches the script.
+    const proc = spawn('python3', [SCRIPT, audioPath], { env: process.env });
+
+    // Decode as UTF-8 at the stream level so a multi-byte character split
+    // across two chunks is not corrupted by per-chunk string conversion.
+    proc.stdout.setEncoding('utf8');
+    proc.stderr.setEncoding('utf8');
+
+    let stdout = '';
+    let stderr = '';
+    proc.stdout.on('data', (chunk) => { stdout += chunk; });
+    proc.stderr.on('data', (chunk) => { stderr += chunk; });
+
+    // Without this listener a failed spawn (e.g. python3 not installed)
+    // would surface as an unhandled 'error' event instead of a rejection.
+    proc.on('error', (err) => {
+      reject(new Error(`failed to start python3: ${err.message}`));
+    });
+
+    proc.on('close', (code, signal) => {
+      if (code === 0) {
+        resolve(stdout.trim());
+        return;
+      }
+      // code is null when the process was terminated by a signal.
+      const outcome = signal ? `was killed by ${signal}` : `exited ${code}`;
+      reject(new Error(`transcribe.py ${outcome}: ${stderr.trim()}`));
+    });
+  });
+}
+
+/**
+ * Fetch a voice message by its file id and return its transcription.
+ *
+ * The audio is saved as `<fileId>.ogg` inside TMP_DIR, transcribed with
+ * runWhisper, and the temporary file is removed afterwards whether or not
+ * the transcription succeeded.
+ *
+ * @param {object} bot - Bot object providing `getFileLink(fileId)`.
+ * @param {string} fileId - File id of the voice message.
+ * @returns {Promise<string>} The transcribed text.
+ */
+async function transcribeVoice(bot, fileId) {
+  const audioFile = path.join(TMP_DIR, `${fileId}.ogg`);
+  fs.mkdirSync(TMP_DIR, { recursive: true });
+
+  try {
+    const link = await bot.getFileLink(fileId);
+    await downloadFile(link, audioFile);
+    // "return await" keeps the cleanup below from running before the
+    // transcription has finished.
+    return await runWhisper(audioFile);
+  } finally {
+    // Best-effort cleanup: a failed unlink is deliberately ignored.
+    fs.unlink(audioFile, () => {});
+  }
+}
+
+module.exports = { transcribeVoice };