Add voice message transcription via faster-whisper - life-todo - Unnamed repository; edit this file 'description' to name the repository.

commit 4d512d7d7bb099284c4ae1746f41da5a8411f631
parent 2cdace311ec8f454534f1dfd6ff3a0359125bd1a
Author: Michael Percival <m@michaelpercival.xyz>
Date:   Tue, 14 Apr 2026 14:06:12 +0100

Add voice message transcription via faster-whisper

Telegram voice messages are downloaded, transcribed with faster-whisper,
and passed to Claude as plain text. Model is configurable via WHISPER_MODEL
(default: small).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Diffstat:
M .env.example  | 1 +
M scripts/setup.sh  | 17 ++++++++++++++---
A scripts/transcribe.py  | 21 +++++++++++++++++++++
M src/bot.js  | 13 +++++++++----
A src/transcribe.js  | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

5 files changed, 104 insertions(+), 7 deletions(-)
diff --git a/.env.example b/.env.example
@@ -1,3 +1,4 @@
 TELEGRAM_BOT_TOKEN=your_telegram_bot_token_here
 ANTHROPIC_API_KEY=your_anthropic_api_key_here
 ALLOWED_USER_IDS=123456789
+WHISPER_MODEL=small
diff --git a/scripts/setup.sh b/scripts/setup.sh
@@ -34,9 +34,20 @@ echo "Installing npm dependencies..."
 npm install
 echo "✓ Dependencies installed"
 
-# Create data directory (gitignored)
-mkdir -p data/conversations
-echo "✓ data/conversations directory ready"
+# Create data directories (gitignored)
+mkdir -p data/conversations data/tmp
+echo "✓ data directories ready"
+
+# Check Python + faster-whisper for voice transcription
+if ! command -v python3 &>/dev/null; then
+  echo "WARNING: python3 not found — voice messages will not be transcribed."
+  echo "  Install Python 3 and run: pip install faster-whisper"
+elif ! python3 -c "import faster_whisper" &>/dev/null; then
+  echo "WARNING: faster-whisper not installed — voice messages will not be transcribed."
+  echo "  Run: pip install faster-whisper"
+else
+  echo "✓ faster-whisper available"
+fi
 
 # Check SSH key for git push
 if ! ssh-add -l &>/dev/null && [ -z "$SSH_AUTH_SOCK" ]; then
diff --git a/scripts/transcribe.py b/scripts/transcribe.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+"""Transcribe an audio file using faster-whisper. Prints transcription to stdout."""
+
+import sys
+import os
+
+def main():
+    if len(sys.argv) < 2:
+        print("Usage: transcribe.py <audio_file>", file=sys.stderr)
+        sys.exit(1)
+
+    audio_path = sys.argv[1]
+    model_size = os.environ.get("WHISPER_MODEL", "small")
+
+    from faster_whisper import WhisperModel
+    model = WhisperModel(model_size, device="cpu", compute_type="int8")
+    segments, _ = model.transcribe(audio_path)
+    print(" ".join(seg.text.strip() for seg in segments))
+
+if __name__ == "__main__":
+    main()
diff --git a/src/bot.js b/src/bot.js
@@ -3,6 +3,7 @@ const claude = require('./claude');
 const todo = require('./todo');
 const git = require('./git');
 const state = require('./state');
+const { transcribeVoice } = require('./transcribe');
 
 const ALLOWED_USER_IDS = process.env.ALLOWED_USER_IDS
   ? process.env.ALLOWED_USER_IDS.split(',').map(id => parseInt(id.trim(), 10))
@@ -19,9 +20,9 @@ function createBot(token) {
   bot.on('message', async (msg) => {
     const userId = msg.from.id;
     const chatId = msg.chat.id;
-    const text = msg.text;
+    let text = msg.text;
 
-    if (!text) return; // ignore stickers, photos, etc.
+    if (!text && !msg.voice) return; // ignore stickers, photos, etc.
 
     if (!isAllowed(userId)) {
       await bot.sendMessage(chatId, 'Unauthorized.');
@@ -29,10 +30,14 @@ function createBot(token) {
     }
 
     try {
-      // Show typing indicator while Claude thinks
       await bot.sendChatAction(chatId, 'typing');
 
-      console.log(`[user:${userId}] ${text}`);
+      if (msg.voice) {
+        text = await transcribeVoice(bot, msg.voice.file_id);
+        console.log(`[user:${userId}] (voice) ${text}`);
+      } else {
+        console.log(`[user:${userId}] ${text}`);
+      }
 
       const history = state.load(userId);
       const currentTodo = todo.read();
diff --git a/src/transcribe.js b/src/transcribe.js
@@ -0,0 +1,59 @@
+const { spawn } = require('child_process');
+const fs = require('fs');
+const path = require('path');
+const https = require('https');
+const http = require('http');
+
+// Path to the Python transcription script, resolved relative to this module's directory.
+const SCRIPT = path.join(__dirname, '..', 'scripts', 'transcribe.py');
+// Directory (data/tmp, one level above this module) where downloaded voice files are stored temporarily.
+const TMP_DIR = path.join(__dirname, '..', 'data', 'tmp');
+
+function downloadFile(url, dest) {
+  return new Promise((resolve, reject) => {
+    fs.mkdirSync(path.dirname(dest), { recursive: true });
+    const file = fs.createWriteStream(dest);
+    const client = url.startsWith('https') ? https : http;
+    client.get(url, (res) => {
+      res.pipe(file);
+      file.on('finish', () => file.close(resolve));
+    }).on('error', (err) => {
+      fs.unlink(dest, () => {});
+      reject(err);
+    });
+  });
+}
+
+function runWhisper(audioPath) {
+  return new Promise((resolve, reject) => {
+    const env = { ...process.env };
+    const proc = spawn('python3', [SCRIPT, audioPath], { env });
+
+    let stdout = '';
+    let stderr = '';
+    proc.stdout.on('data', (d) => { stdout += d; });
+    proc.stderr.on('data', (d) => { stderr += d; });
+
+    proc.on('close', (code) => {
+      if (code !== 0) {
+        reject(new Error(`transcribe.py exited ${code}: ${stderr.trim()}`));
+      } else {
+        resolve(stdout.trim());
+      }
+    });
+  });
+}
+
+async function transcribeVoice(bot, fileId) {
+  fs.mkdirSync(TMP_DIR, { recursive: true });
+  const tmpPath = path.join(TMP_DIR, `${fileId}.ogg`);
+
+  try {
+    const fileUrl = await bot.getFileLink(fileId);
+    await downloadFile(fileUrl, tmpPath);
+    const text = await runWhisper(tmpPath);
+    return text;
+  } finally {
+    fs.unlink(tmpPath, () => {});
+  }
+}
+
+module.exports = { transcribeVoice };

	life-todo Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| README

M	.env.example	\|	1	+
M	scripts/setup.sh	\|	17	++++++++++++++---
A	scripts/transcribe.py	\|	21	+++++++++++++++++++++
M	src/bot.js	\|	13	+++++++++----
A	src/transcribe.js	\|	59	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++