Feat: Integrated Local LLM (Llama 3.2 1B) for Intelligent Correction -- New Core: Added LLMEngine utilizing llama-cpp-python for local private text post-processing. -- Forensic Protocol: Engineered strict system prompts to prevent LLM refusals, censorship, or assistant chatter. -- Three Modes: Grammar, Standard, Rewrite. -- Start/Stop Logic: Consolidated conflicting recording methods. -- Hotkeys: Added dedicated F9 (Correct) vs F8 (Transcribe). -- UI: Updated Settings. -- Build: Updated portable_build.py. -- Docs: Updated README.

This commit is contained in:
Your Name
2026-01-31 01:02:24 +02:00
parent 3137770742
commit baa5e2e69e
10 changed files with 601 additions and 61 deletions

223
main.py
View File

@@ -44,6 +44,7 @@ from src.ui.bridge import UIBridge
from src.ui.tray import SystemTray
from src.core.audio_engine import AudioEngine
from src.core.transcriber import WhisperTranscriber
from src.core.llm_engine import LLMEngine
from src.core.hotkey_manager import HotkeyManager
from src.core.config import ConfigManager
from src.utils.injector import InputInjector
@@ -188,6 +189,69 @@ class DownloadWorker(QThread):
logging.error(f"Download failed: {e}")
self.error.emit(str(e))
class LLMDownloadWorker(QThread):
    """Background worker that downloads the local LLM model file (GGUF).

    Emits:
        progress(int): download percentage 0-100 (only when the server
            reports a content length).
        finished(): emitted once the file is fully written and moved into place.
        error(str): emitted with the exception text on any failure.
    """
    progress = Signal(int)
    finished = Signal()
    error = Signal(str)

    # Support one model for now (Llama 3.2 1B Instruct, Q4_K_M quantization).
    MODEL_URL = ("https://huggingface.co/hugging-quants/"
                 "Llama-3.2-1B-Instruct-Q4_K_M-GGUF/resolve/main/"
                 "llama-3.2-1b-instruct-q4_k_m.gguf?download=true")
    MODEL_FILENAME = "llama-3.2-1b-instruct-q4_k_m.gguf"

    def __init__(self, parent=None):
        super().__init__(parent)

    def run(self):
        try:
            import requests

            model_path = get_models_path() / "llm" / "llama-3.2-1b-instruct"
            model_path.mkdir(parents=True, exist_ok=True)
            dest_file = model_path / self.MODEL_FILENAME
            # Stream into a temp name first so an aborted transfer never
            # leaves a truncated file that looks like a valid model.
            tmp_file = dest_file.with_suffix(dest_file.suffix + ".part")

            with requests.Session() as s:
                # The GET response carries content-length itself, so the
                # extra HEAD round-trip is unnecessary. Timeouts prevent a
                # stalled connection from hanging this thread forever
                # (10s connect, 60s between received chunks).
                resp = s.get(self.MODEL_URL, stream=True, timeout=(10, 60))
                resp.raise_for_status()
                total_size = int(resp.headers.get('content-length', 0))

                downloaded = 0
                last_pct = -1
                with open(tmp_file, 'wb') as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        if not chunk:
                            continue
                        f.write(chunk)
                        downloaded += len(chunk)
                        if total_size > 0:
                            # Clamp to 100 and only emit on change to avoid
                            # flooding the UI thread with signals per chunk.
                            pct = min(100, int((downloaded / total_size) * 100))
                            if pct != last_pct:
                                last_pct = pct
                                self.progress.emit(pct)

            tmp_file.replace(dest_file)  # atomic move into final location
            self.finished.emit()
        except Exception as e:
            logging.error(f"LLM Download failed: {e}")
            self.error.emit(str(e))
class LLMWorker(QThread):
    """Runs LLM text post-processing off the GUI thread.

    Emits `finished` exactly once per run: with the corrected text on
    success, or with the untouched input text if the engine raises.
    """
    finished = Signal(str)

    def __init__(self, llm_engine, text, mode, parent=None):
        super().__init__(parent)
        self.llm_engine = llm_engine
        self.text = text
        self.mode = mode

    def run(self):
        # Fail safe: default to the original, uncorrected text.
        result = self.text
        try:
            result = self.llm_engine.correct_text(self.text, self.mode)
        except Exception as e:
            logging.error(f"LLMWorker crashed: {e}")
        self.finished.emit(result)
class TranscriptionWorker(QThread):
finished = Signal(str)
def __init__(self, transcriber, audio_data, is_file=False, parent=None, task_override=None):
@@ -229,6 +293,7 @@ class WhisperApp(QObject):
self.bridge.settingChanged.connect(self.on_settings_changed)
self.bridge.hotkeysEnabledChanged.connect(self.on_hotkeys_enabled_toggle)
self.bridge.downloadRequested.connect(self.on_download_requested)
self.bridge.llmDownloadRequested.connect(self.on_llm_download_requested)
self.engine.rootContext().setContextProperty("ui", self.bridge)
@@ -249,7 +314,9 @@ class WhisperApp(QObject):
# 3. Logic Components Placeholders
self.audio_engine = None
self.transcriber = None
self.llm_engine = None
self.hk_transcribe = None
self.hk_correct = None
self.hk_translate = None
self.overlay_root = None
@@ -344,14 +411,19 @@ class WhisperApp(QObject):
self.audio_engine.set_visualizer_callback(self.bridge.update_amplitude)
self.audio_engine.set_silence_callback(self.on_silence_detected)
self.transcriber = WhisperTranscriber()
self.llm_engine = LLMEngine()
# Dual Hotkey Managers
self.hk_transcribe = HotkeyManager(config_key="hotkey")
self.hk_transcribe.triggered.connect(lambda: self.toggle_recording(task_override="transcribe"))
self.hk_transcribe.triggered.connect(lambda: self.toggle_recording(task_override="transcribe", task_mode="standard"))
self.hk_transcribe.start()
self.hk_correct = HotkeyManager(config_key="hotkey_correct")
self.hk_correct.triggered.connect(lambda: self.toggle_recording(task_override="transcribe", task_mode="correct"))
self.hk_correct.start()
self.hk_translate = HotkeyManager(config_key="hotkey_translate")
self.hk_translate.triggered.connect(lambda: self.toggle_recording(task_override="translate"))
self.hk_translate.triggered.connect(lambda: self.toggle_recording(task_override="translate", task_mode="standard"))
self.hk_translate.start()
self.bridge.update_status("Ready")
@@ -359,6 +431,57 @@ class WhisperApp(QObject):
def run(self):
sys.exit(self.qt_app.exec())
@Slot()
@Slot(str)
@Slot(str, str)
def toggle_recording(self, task_override=None, task_mode="standard"):
    """Start or stop a recording session (single consolidated entry point).

    Args:
        task_override: 'transcribe' or 'translate' (passed to whisper);
            None falls back to the configured default task.
        task_mode: 'standard' (raw transcription) or 'correct' (chain the
            transcription through the local LLM afterwards).

    The @Slot() overload is required because this is also invoked arg-less
    via QMetaObject.invokeMethod elsewhere in the file.
    """
    if not self.audio_engine:
        # Engines not initialized yet (startup race) -- nothing to toggle.
        return

    if self.bridge.isRecording:
        # STOP RECORDING
        logging.info("Stopping recording...")
        # stop_recording returns the numpy array directly
        audio_data = self.audio_engine.stop_recording()
        self.bridge.isRecording = False
        self.bridge.update_status("Processing...")
        self.bridge.isProcessing = True
        # Save task override for processing
        self.last_task_override = task_override
        # NOTE: current_task_requires_llm is deliberately NOT touched here.
        # It was set when this session started; resetting it on a stop call
        # (e.g. the arg-less invoke with task_mode defaulting to "standard")
        # would silently drop a correction the user asked for at start.
        if audio_data is not None and len(audio_data) > 0:
            # Use the task that started this session, or the override if provided
            final_task = getattr(self, "current_recording_task", self.config.get("task"))
            if task_override:
                final_task = task_override
            self.worker = TranscriptionWorker(self.transcriber, audio_data, parent=self, task_override=final_task)
            self.worker.finished.connect(self.on_transcription_done)
            self.worker.start()
        else:
            # Nothing captured -- leave the processing state immediately.
            self.bridge.update_status("Ready")
            self.bridge.isProcessing = False
    else:
        # START RECORDING
        if self.bridge.isProcessing:
            logging.warning("Ignored toggle request: Transcription in progress.")
            return
        # The correction flag belongs to the session being started.
        self.current_task_requires_llm = (task_mode == "correct")
        intended_task = task_override if task_override else self.config.get("task")
        self.current_recording_task = intended_task
        logging.info(f"Starting recording... (Task: {intended_task}, Mode: {task_mode})")
        self.audio_engine.start_recording()
        self.bridge.isRecording = True
        self.bridge.update_status(f"Recording ({intended_task})...")
@Slot()
def quit_app(self):
logging.info("Shutting down...")
@@ -447,14 +570,16 @@ class WhisperApp(QObject):
print(f"Setting Changed: {key} = {value}")
# 1. Hotkey Reload
if key in ["hotkey", "hotkey_translate"]:
if key in ["hotkey", "hotkey_translate", "hotkey_correct"]:
if self.hk_transcribe: self.hk_transcribe.reload_hotkey()
if self.hk_correct: self.hk_correct.reload_hotkey()
if self.hk_translate: self.hk_translate.reload_hotkey()
if self.tray:
hk1 = self.format_hotkey(self.config.get("hotkey"))
hk3 = self.format_hotkey(self.config.get("hotkey_correct"))
hk2 = self.format_hotkey(self.config.get("hotkey_translate"))
self.tray.setToolTip(f"Whisper Voice\nTranscribe: {hk1}\nTranslate: {hk2}")
self.tray.setToolTip(f"Whisper Voice\nTranscribe: {hk1}\nCorrect: {hk3}\nTranslate: {hk2}")
# 2. AI Model Reload (Heavy)
if key in ["model_size", "compute_device", "compute_type"]:
@@ -571,40 +696,7 @@ class WhisperApp(QObject):
# Let's ensure toggle_recording handles no arg calls by stopping the CURRENT task.
QMetaObject.invokeMethod(self, "toggle_recording", Qt.QueuedConnection)
@Slot()  # Modified to allow lambda override
def toggle_recording(self, task_override=None):
    """Toggle recording on/off for the default or overridden whisper task.

    NOTE(review): this looks like the older start/stop handler; a
    consolidated toggle_recording(task_override, task_mode) also appears
    earlier in this file -- confirm which definition survives, as a later
    duplicate `def` silently shadows the earlier one.
    """
    # Guard: engines may not be initialized yet during startup.
    if not self.audio_engine: return
    # Prevent starting a new recording while we are still transcribing the last one
    if self.bridge.isProcessing:
        logging.warning("Ignored toggle request: Transcription in progress.")
        return
    # Determine which task we are entering
    if task_override:
        intended_task = task_override
    else:
        intended_task = self.config.get("task")
    if self.audio_engine.recording:
        # STOP RECORDING
        self.bridge.update_status("Thinking...")
        self.bridge.isRecording = False
        self.bridge.isProcessing = True  # Start Processing
        audio_data = self.audio_engine.stop_recording()
        # Use the task that started this session, or the override if provided (though usually override is for starting)
        final_task = getattr(self, "current_recording_task", self.config.get("task"))
        # Hand the audio off to a worker thread; UI state is released in
        # on_transcription_done when the worker emits `finished`.
        self.worker = TranscriptionWorker(self.transcriber, audio_data, parent=self, task_override=final_task)
        self.worker.finished.connect(self.on_transcription_done)
        self.worker.start()
    else:
        # START RECORDING
        self.current_recording_task = intended_task
        self.bridge.update_status(f"Recording ({intended_task})...")
        self.bridge.isRecording = True
        self.audio_engine.start_recording()
@Slot(bool)
def on_ui_toggle_request(self, state):
@@ -614,11 +706,53 @@ class WhisperApp(QObject):
@Slot(str)
def on_transcription_done(self, text: str):
    """Handle the raw transcription result from the whisper worker.

    Either chains the text into the LLM correction worker -- only when the
    session was started by the Correct hotkey AND the LLM safety switch is
    enabled in settings -- or injects the raw text immediately. The regular
    Transcribe hotkey therefore always yields raw, uncorrected output.
    """
    llm_enabled = self.config.get("llm_enabled")
    # Per-session opt-in flag set by toggle_recording's Correct mode.
    requires_llm = getattr(self, "current_task_requires_llm", False)

    if text and llm_enabled and requires_llm:
        # Chain to LLM. Keep isProcessing True and do NOT flash "Ready"
        # first: the busy state must hold until on_llm_done, otherwise a
        # new recording could start mid-chain.
        self.bridge.isProcessing = True
        self.bridge.update_status("Correcting...")
        mode = self.config.get("llm_mode")
        self.llm_worker = LLMWorker(self.llm_engine, text, mode, parent=self)
        self.llm_worker.finished.connect(self.on_llm_done)
        self.llm_worker.start()
        return

    # No correction requested (or empty text): release the busy state and
    # inject the raw transcription.
    self.bridge.update_status("Ready")
    self.bridge.isProcessing = False
    if text:
        method = self.config.get("input_method")
        speed = int(self.config.get("typing_speed"))
        InputInjector.inject_text(text, method, speed)
@Slot(str)
def on_llm_done(self, text: str):
    """Receive the (possibly corrected) text from the LLM worker, inject
    it, and release the busy state and the worker thread object."""
    self.bridge.update_status("Ready")
    self.bridge.isProcessing = False

    if text:
        InputInjector.inject_text(
            text,
            self.config.get("input_method"),
            int(self.config.get("typing_speed")),
        )

    # Cleanup: schedule the finished QThread for deletion and drop our ref.
    worker = getattr(self, 'llm_worker', None)
    if worker:
        worker.deleteLater()
        self.llm_worker = None
@Slot(bool)
def on_hotkeys_enabled_toggle(self, state):
@@ -638,6 +772,19 @@ class WhisperApp(QObject):
self.download_worker.error.connect(self.on_download_error)
self.download_worker.start()
@Slot()
def on_llm_download_requested(self):
    """Kick off the LLM model download on a background thread."""
    # Guard clause: only one download at a time.
    if self.bridge.isDownloading:
        return

    self.bridge.update_status("Downloading LLM...")
    self.bridge.isDownloading = True

    worker = LLMDownloadWorker(parent=self)
    # Keep a reference so the thread object isn't garbage-collected.
    self.llm_dl_worker = worker
    # Reuse the whisper-model download slots for progress/cleanup/errors.
    worker.progress.connect(self.on_loader_progress)
    worker.finished.connect(self.on_download_finished)
    worker.error.connect(self.on_download_error)
    worker.start()
def on_download_finished(self):
self.bridge.isDownloading = False
self.bridge.update_status("Ready")