# stt-typer/main.py (vertuzz/stt-typer): record speech from the microphone,
# transcribe it with Google Gemini, and type the resulting text.
import logging
import os
import signal
import subprocess
import sys
import wave
import pyaudio
import threading
from dotenv import load_dotenv
from google import genai
from google.genai import types

load_dotenv()


def feedback(event: str, message: str = ""):
    """Provide audio and visual feedback to user.

    Launches ``notify-send`` for a desktop notification and
    ``canberra-gtk-play`` for a themed sound. Both helpers are started
    without waiting for them to finish, and both are strictly best-effort:
    if a helper cannot be launched the failure is ignored so that feedback
    can never break the caller.

    Args:
        event: One of 'start', 'stop', 'done', 'error'. Any other value
            falls back to a generic "STT Typer" notification.
        message: Optional message to display in notification. ``None`` is
            treated as empty and non-string values are converted with
            ``str()``.
    """
    # Popen only accepts string arguments; normalise up front so a None or
    # an exception object passed by a caller cannot raise TypeError below.
    message = "" if message is None else str(message)

    notifications = {
        "start": ("🎙️ Recording Started", "Speak now...", "audio-input-microphone", "device-added"),
        "stop": ("⏹️ Recording Stopped", "Transcribing...", "audio-x-generic", "device-removed"),
        "done": ("✅ Text Typed", message[:100] if message else "Done!", "dialog-ok", "message-new-instant"),
        "error": ("❌ Error", message or "Something went wrong", "dialog-error", "dialog-error"),
    }

    title, body, icon, sound = notifications.get(event, ("STT Typer", message, "dialog-information", "bell"))

    commands = (
        # Visual notification via notify-send
        ["notify-send", "-i", icon, "-t", "2000", title, body],
        # Audio feedback via canberra-gtk-play (uses system sound theme)
        ["canberra-gtk-play", "-i", sound],
    )

    for command in commands:
        try:
            subprocess.Popen(
                command,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except OSError:
            # Helper missing (FileNotFoundError), not executable
            # (PermissionError) or otherwise unlaunchable: skip silently.
            pass

import pyautogui

# Gemini API key read from the environment; main() exits if it is missing.
api_key = os.getenv("GOOGLE_API_KEY")

# Module-wide logging: INFO and above, using the root logger configuration.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Audio recording parameters
FORMAT = pyaudio.paInt16  # sample format passed to the input stream and WAV header
CHANNELS = 1  # single-channel (mono) capture
RATE = 16000  # sample rate in Hz
# Fixed location of the temporary recording; removed by cleanup_audio_file().
AUDIO_FILE = "/tmp/stt_recording.wav"

# Global variables for recording control
# `recording` is polled by record_audio() and cleared by signal_handler().
recording = False
# Thread running record_audio(); assigned in main().
record_thread = None

def signal_handler(signum, frame):
    """Handle shutdown signals gracefully.

    Clears the module-level ``recording`` flag, which ends the capture loop
    in ``record_audio``, and emits the 'stop' feedback. Signals that arrive
    while nothing is being recorded (for example a repeated Ctrl+C, or one
    sent during transcription) are ignored so the user does not get
    duplicate "Recording Stopped" notifications.

    Args:
        signum: Number of the received signal (unused).
        frame: Interrupted stack frame (unused).
    """
    global recording
    if not recording:
        # Already stopped: nothing to do and nothing new to announce.
        return
    print("\nShutdown signal received. Stopping recording...")
    recording = False
    feedback("stop")

def cleanup_audio_file():
    """Delete the temporary recording at AUDIO_FILE if it is present.

    Any failure while checking for or removing the file is logged rather
    than raised.
    """
    try:
        if not os.path.exists(AUDIO_FILE):
            return
        os.remove(AUDIO_FILE)
    except Exception as err:
        logger.error(f"Error cleaning up audio file: {err}")

def record_audio():
    """Record audio continuously until stopped.

    Opens a PyAudio input stream, reads fixed-size chunks for as long as the
    module-level ``recording`` flag is true, then writes everything captured
    to ``AUDIO_FILE`` as a WAV file. Errors are logged rather than raised;
    when one occurs no file is written. The stream and the PyAudio instance
    are always released.
    """
    frames_per_chunk = 1024

    audio = pyaudio.PyAudio()
    # Sentinel so the cleanup below knows whether the stream was ever opened.
    stream = None

    try:
        stream = audio.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=RATE,
            input=True,
            frames_per_buffer=frames_per_chunk,
        )

        frames = []
        print("Recording... Press Ctrl+C to stop.")

        while recording:
            # An input overflow would otherwise raise and throw away the
            # whole recording; tolerate it and keep capturing instead.
            frames.append(
                stream.read(frames_per_chunk, exception_on_overflow=False)
            )

        # Save recorded audio to file
        with wave.open(AUDIO_FILE, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(audio.get_sample_size(FORMAT))
            wf.setframerate(RATE)
            wf.writeframes(b''.join(frames))

        print(f"Audio saved to {AUDIO_FILE}")

    except Exception as e:
        logger.error(f"Error during recording: {e}")
    finally:
        try:
            if stream is not None:
                stream.stop_stream()
                stream.close()
        finally:
            # Release PortAudio even if stopping/closing the stream failed.
            audio.terminate()

def transcribe_audio():
    """Send audio file to Google Gemini for transcription and type the result.

    Uploads ``AUDIO_FILE``, asks the model for an English transcript, types
    the transcript into the focused window with pyautogui and reports the
    outcome through ``feedback``. Failures are logged and surfaced as an
    'error' notification instead of being raised. The uploaded copy of the
    recording is deleted from the service afterwards on a best-effort basis.
    """
    if not os.path.exists(AUDIO_FILE):
        print("No audio file found to transcribe.")
        return

    client = None
    uploaded = None
    try:
        client = genai.Client(api_key=api_key)

        print("Uploading audio file for transcription...")
        uploaded = client.files.upload(file=AUDIO_FILE)

        response = client.models.generate_content(
            model="gemini-2.5-flash",
            contents=["Generate a transcript of the speech. Do not include any other text. Output only in grammatically correct english. IF YOU HEAR ANYTHING ELSE THAN ENGLISH, TRANSLATE IT TO ENGLISH.", uploaded]
        )

        # Collapse every run of whitespace to a single space. Newlines
        # anywhere in the text (not only at the end) would be typed as
        # Enter, which can submit a form or chat message mid-transcript.
        clean_text = " ".join((response.text or "").split())

        if clean_text:
            print(f"Transcription: {clean_text}")
            pyautogui.typewrite(clean_text)
            feedback("done", clean_text)
        else:
            print("No transcription received.")
            feedback("error", "No transcription received")

    except Exception as e:
        logger.error(f"Error during transcription: {e}")
        feedback("error", str(e))
    finally:
        # Do not leave the user's voice recording stored remotely.
        if uploaded is not None:
            try:
                client.files.delete(name=uploaded.name)
            except Exception as e:
                logger.warning(f"Could not delete uploaded audio file: {e}")


def main():
    """Run one record -> transcribe -> type cycle.

    Installs the shutdown handlers, verifies that an API key is configured,
    records on a background thread until a signal clears the ``recording``
    flag, transcribes the resulting file and finally removes it.
    """
    global recording, record_thread

    # SIGINT (Ctrl+C) and SIGTERM both end the recording gracefully.
    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, signal_handler)

    if not api_key:
        print("Error: GOOGLE_API_KEY not found in environment variables.")
        sys.exit(1)

    # Start from a clean slate: drop any recording left by a previous run.
    cleanup_audio_file()

    # Kick off the capture thread and tell the user it is live.
    recording = True
    worker = threading.Thread(target=record_audio)
    record_thread = worker
    worker.start()
    feedback("start")

    # Blocks until the signal handler flips `recording` and the thread exits.
    worker.join()

    # Only transcribe when the recorder actually produced a file.
    if os.path.exists(AUDIO_FILE):
        transcribe_audio()

    cleanup_audio_file()


# Run the recorder only when executed as a script, not when imported.
if __name__ == "__main__":
    main()