Feat: Integrated Local LLM (Llama 3.2 1B) for Intelligent Correction -- New Core: Added LLMEngine utilizing llama-cpp-python for local private text post-processing. -- Forensic Protocol: Engineered strict system prompts to prevent LLM refusals, censorship, or assistant chatter. -- Three Modes: Grammar, Standard, Rewrite. -- Start/Stop Logic: Consolidated conflicting recording methods. -- Hotkeys: Added dedicated F9 (Correct) vs F8 (Transcribe). -- UI: Updated Settings. -- Build: Updated portable_build.py. -- Docs: Updated README.
This commit is contained in:
223
main.py
223
main.py
@@ -44,6 +44,7 @@ from src.ui.bridge import UIBridge
|
||||
from src.ui.tray import SystemTray
|
||||
from src.core.audio_engine import AudioEngine
|
||||
from src.core.transcriber import WhisperTranscriber
|
||||
from src.core.llm_engine import LLMEngine
|
||||
from src.core.hotkey_manager import HotkeyManager
|
||||
from src.core.config import ConfigManager
|
||||
from src.utils.injector import InputInjector
|
||||
@@ -188,6 +189,69 @@ class DownloadWorker(QThread):
|
||||
logging.error(f"Download failed: {e}")
|
||||
self.error.emit(str(e))
|
||||
|
||||
class LLMDownloadWorker(QThread):
    """Background thread that downloads the local LLM weights (GGUF file).

    The file is streamed into a temporary ``.part`` file next to the final
    destination and only moved into place once the transfer has completed,
    so an interrupted download never leaves a truncated model behind.

    Signals:
        progress (int): Download percentage. Only emitted when the server
            reports a content length, and only when the value changes.
        finished (): Emitted once the model file is fully in place.
        error (str): Emitted with the exception text if anything fails.
    """

    progress = Signal(int)
    finished = Signal()
    error = Signal(str)

    # Support one model for now
    MODEL_URL = "https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/resolve/main/llama-3.2-1b-instruct-q4_k_m.gguf?download=true"
    MODEL_FILENAME = "llama-3.2-1b-instruct-q4_k_m.gguf"
    MODEL_DIRNAME = "llama-3.2-1b-instruct"

    # 1 MiB chunks keep the Python-level loop (and signal traffic) cheap.
    CHUNK_SIZE = 1024 * 1024
    # (connect, read) timeouts in seconds; requests waits forever by default.
    REQUEST_TIMEOUT = (10, 60)

    def __init__(self, parent=None):
        super().__init__(parent)

    def run(self):
        """Download the model, reporting progress; never raises.

        Any failure is logged and reported through the ``error`` signal, and
        the partially written temporary file is removed.
        """
        import os

        part_file = None
        try:
            import requests

            model_path = get_models_path() / "llm" / self.MODEL_DIRNAME
            model_path.mkdir(parents=True, exist_ok=True)
            dest_file = model_path / self.MODEL_FILENAME
            part_file = model_path / (self.MODEL_FILENAME + ".part")

            # No "already downloaded" check: if the user clicked download,
            # we assume they want to (re-)download the file.
            downloaded = 0
            with requests.Session() as s:
                with s.get(self.MODEL_URL, stream=True, timeout=self.REQUEST_TIMEOUT) as resp:
                    resp.raise_for_status()
                    # Size of the final (post-redirect) response; 0 = unknown.
                    total_size = int(resp.headers.get("content-length") or 0)

                    last_pct = -1
                    with open(part_file, "wb") as f:
                        for chunk in resp.iter_content(chunk_size=self.CHUNK_SIZE):
                            if not chunk:
                                continue
                            f.write(chunk)
                            downloaded += len(chunk)
                            if total_size > 0:
                                pct = min(100, downloaded * 100 // total_size)
                                # Emit only on change to avoid flooding the UI thread.
                                if pct != last_pct:
                                    last_pct = pct
                                    self.progress.emit(pct)

            if total_size > 0 and downloaded < total_size:
                raise IOError(
                    f"Incomplete download: got {downloaded} of {total_size} bytes"
                )

            # Atomic move: the final path only ever holds a complete file.
            os.replace(part_file, dest_file)
            part_file = None

            self.finished.emit()

        except Exception as e:
            logging.error(f"LLM Download failed: {e}")
            if part_file is not None:
                try:
                    os.remove(part_file)
                except OSError:
                    pass  # Nothing was written, or it is already gone.
            self.error.emit(str(e))
|
||||
|
||||
class LLMWorker(QThread):
    """Thread that runs LLM post-processing on a piece of transcribed text.

    Emits ``finished`` with the corrected text, or with the original text
    unchanged when the engine raises.
    """

    finished = Signal(str)

    def __init__(self, llm_engine, text, mode, parent=None):
        super().__init__(parent)
        self.llm_engine, self.text, self.mode = llm_engine, text, mode

    def run(self):
        """Ask the engine to correct ``self.text`` using ``self.mode``."""
        try:
            self.finished.emit(
                self.llm_engine.correct_text(self.text, self.mode)
            )
        except Exception as exc:
            logging.error(f"LLMWorker crashed: {exc}")
            # Fail safe: hand back the untouched transcription.
            self.finished.emit(self.text)
|
||||
|
||||
|
||||
class TranscriptionWorker(QThread):
|
||||
finished = Signal(str)
|
||||
def __init__(self, transcriber, audio_data, is_file=False, parent=None, task_override=None):
|
||||
@@ -229,6 +293,7 @@ class WhisperApp(QObject):
|
||||
self.bridge.settingChanged.connect(self.on_settings_changed)
|
||||
self.bridge.hotkeysEnabledChanged.connect(self.on_hotkeys_enabled_toggle)
|
||||
self.bridge.downloadRequested.connect(self.on_download_requested)
|
||||
self.bridge.llmDownloadRequested.connect(self.on_llm_download_requested)
|
||||
|
||||
self.engine.rootContext().setContextProperty("ui", self.bridge)
|
||||
|
||||
@@ -249,7 +314,9 @@ class WhisperApp(QObject):
|
||||
# 3. Logic Components Placeholders
|
||||
self.audio_engine = None
|
||||
self.transcriber = None
|
||||
self.llm_engine = None
|
||||
self.hk_transcribe = None
|
||||
self.hk_correct = None
|
||||
self.hk_translate = None
|
||||
self.overlay_root = None
|
||||
|
||||
@@ -344,14 +411,19 @@ class WhisperApp(QObject):
|
||||
self.audio_engine.set_visualizer_callback(self.bridge.update_amplitude)
|
||||
self.audio_engine.set_silence_callback(self.on_silence_detected)
|
||||
self.transcriber = WhisperTranscriber()
|
||||
self.llm_engine = LLMEngine()
|
||||
|
||||
# Dual Hotkey Managers
|
||||
self.hk_transcribe = HotkeyManager(config_key="hotkey")
|
||||
self.hk_transcribe.triggered.connect(lambda: self.toggle_recording(task_override="transcribe"))
|
||||
self.hk_transcribe.triggered.connect(lambda: self.toggle_recording(task_override="transcribe", task_mode="standard"))
|
||||
self.hk_transcribe.start()
|
||||
|
||||
self.hk_correct = HotkeyManager(config_key="hotkey_correct")
|
||||
self.hk_correct.triggered.connect(lambda: self.toggle_recording(task_override="transcribe", task_mode="correct"))
|
||||
self.hk_correct.start()
|
||||
|
||||
self.hk_translate = HotkeyManager(config_key="hotkey_translate")
|
||||
self.hk_translate.triggered.connect(lambda: self.toggle_recording(task_override="translate"))
|
||||
self.hk_translate.triggered.connect(lambda: self.toggle_recording(task_override="translate", task_mode="standard"))
|
||||
self.hk_translate.start()
|
||||
|
||||
self.bridge.update_status("Ready")
|
||||
@@ -359,6 +431,57 @@ class WhisperApp(QObject):
|
||||
def run(self):
    """Run the Qt event loop and exit the process with its return code."""
    exit_code = self.qt_app.exec()
    sys.exit(exit_code)
|
||||
|
||||
@Slot(str, str)
@Slot(str)
@Slot()
def toggle_recording(self, task_override=None, task_mode="standard"):
    """
    Start a recording, or stop the running one and queue it for transcription.

    task_override: 'transcribe' or 'translate' (passed to whisper)
    task_mode: 'standard' or 'correct' (determines post-processing)

    The zero-argument slot overload exists so the method can be invoked by
    name without arguments (e.g. via QMetaObject.invokeMethod); such a call
    stops the CURRENT task and keeps the post-processing mode chosen when
    the recording was started.
    """
    # Logic components are None until initialisation has finished.
    if not self.audio_engine:
        return

    is_recording = self.bridge.isRecording

    # Reject the request BEFORE touching any state: an ignored key press
    # must not change how the in-flight transcription is post-processed.
    if not is_recording and self.bridge.isProcessing:
        logging.warning("Ignored toggle request: Transcription in progress.")
        return

    # An argument-less stop carries no intent of its own, so it must not
    # reset a correction that was requested when the recording started.
    argless_stop = is_recording and task_override is None
    if task_mode == "correct":
        self.current_task_requires_llm = True
    elif task_mode == "standard" and not argless_stop:
        self.current_task_requires_llm = False  # Explicit reset

    if is_recording:
        logging.info("Stopping recording...")
        # stop_recording returns the numpy array directly
        audio_data = self.audio_engine.stop_recording()

        self.bridge.isRecording = False
        self.bridge.update_status("Processing...")
        self.bridge.isProcessing = True

        # Save task override for processing
        self.last_task_override = task_override

        if audio_data is not None and len(audio_data) > 0:
            # Use the task that started this session, or the override if provided
            final_task = getattr(self, "current_recording_task", self.config.get("task"))
            if task_override:
                final_task = task_override

            self.worker = TranscriptionWorker(self.transcriber, audio_data, parent=self, task_override=final_task)
            self.worker.finished.connect(self.on_transcription_done)
            self.worker.start()
        else:
            # Nothing was captured: go straight back to idle.
            self.bridge.update_status("Ready")
            self.bridge.isProcessing = False
        return

    # START RECORDING
    intended_task = task_override if task_override else self.config.get("task")
    self.current_recording_task = intended_task

    logging.info(f"Starting recording... (Task: {intended_task}, Mode: {task_mode})")
    self.audio_engine.start_recording()
    self.bridge.isRecording = True
    self.bridge.update_status(f"Recording ({intended_task})...")
|
||||
|
||||
@Slot()
|
||||
def quit_app(self):
|
||||
logging.info("Shutting down...")
|
||||
@@ -447,14 +570,16 @@ class WhisperApp(QObject):
|
||||
print(f"Setting Changed: {key} = {value}")
|
||||
|
||||
# 1. Hotkey Reload
|
||||
if key in ["hotkey", "hotkey_translate"]:
|
||||
if key in ["hotkey", "hotkey_translate", "hotkey_correct"]:
|
||||
if self.hk_transcribe: self.hk_transcribe.reload_hotkey()
|
||||
if self.hk_correct: self.hk_correct.reload_hotkey()
|
||||
if self.hk_translate: self.hk_translate.reload_hotkey()
|
||||
|
||||
if self.tray:
|
||||
hk1 = self.format_hotkey(self.config.get("hotkey"))
|
||||
hk3 = self.format_hotkey(self.config.get("hotkey_correct"))
|
||||
hk2 = self.format_hotkey(self.config.get("hotkey_translate"))
|
||||
self.tray.setToolTip(f"Whisper Voice\nTranscribe: {hk1}\nTranslate: {hk2}")
|
||||
self.tray.setToolTip(f"Whisper Voice\nTranscribe: {hk1}\nCorrect: {hk3}\nTranslate: {hk2}")
|
||||
|
||||
# 2. AI Model Reload (Heavy)
|
||||
if key in ["model_size", "compute_device", "compute_type"]:
|
||||
@@ -571,40 +696,7 @@ class WhisperApp(QObject):
|
||||
# Let's ensure toggle_recording handles no arg calls by stopping the CURRENT task.
|
||||
QMetaObject.invokeMethod(self, "toggle_recording", Qt.QueuedConnection)
|
||||
|
||||
@Slot()  # Modified to allow lambda override
def toggle_recording(self, task_override=None):
    """Toggle between recording and transcribing.

    task_override: optional task name used instead of the configured
    "task" when a new recording is started.
    """
    engine = self.audio_engine
    if not engine:
        return

    # Prevent starting a new recording while we are still transcribing the last one
    if self.bridge.isProcessing:
        logging.warning("Ignored toggle request: Transcription in progress.")
        return

    # Determine which task we are entering
    intended_task = task_override if task_override else self.config.get("task")

    if not self.audio_engine.recording:
        # START RECORDING
        self.current_recording_task = intended_task
        self.bridge.update_status(f"Recording ({intended_task})...")
        self.bridge.isRecording = True
        self.audio_engine.start_recording()
        return

    # STOP RECORDING
    self.bridge.update_status("Thinking...")
    self.bridge.isRecording = False
    self.bridge.isProcessing = True  # Start Processing
    captured = self.audio_engine.stop_recording()

    # Use the task that started this session (the override is only meant for starting)
    session_task = getattr(self, "current_recording_task", self.config.get("task"))

    self.worker = TranscriptionWorker(self.transcriber, captured, parent=self, task_override=session_task)
    self.worker.finished.connect(self.on_transcription_done)
    self.worker.start()
|
||||
|
||||
@Slot(bool)
|
||||
def on_ui_toggle_request(self, state):
|
||||
@@ -614,11 +706,53 @@ class WhisperApp(QObject):
|
||||
@Slot(str)
def on_transcription_done(self, text: str):
    """Handle the transcription result: chain to the LLM or inject the text.

    The text is sent through the LLM only when all of the following hold:
      1. the transcription is non-empty,
      2. "llm_enabled" is on in the config (global safety switch),
      3. the current task asked for correction (Correct hotkey).
    Otherwise the raw transcription is injected as-is.
    """
    llm_enabled = self.config.get("llm_enabled")
    requires_llm = getattr(self, "current_task_requires_llm", False)

    if text and llm_enabled and requires_llm:
        # Chain to LLM. The app stays in the processing state throughout;
        # "Ready" is only published once the whole pipeline has finished.
        self.bridge.isProcessing = True
        self.bridge.update_status("Correcting...")
        mode = self.config.get("llm_mode")
        self.llm_worker = LLMWorker(self.llm_engine, text, mode, parent=self)
        self.llm_worker.finished.connect(self.on_llm_done)
        self.llm_worker.start()
        return

    self.bridge.update_status("Ready")
    self.bridge.isProcessing = False  # End Processing
    if text:
        method = self.config.get("input_method")
        speed = int(self.config.get("typing_speed"))
        InputInjector.inject_text(text, method, speed)
|
||||
|
||||
@Slot(str)
def on_llm_done(self, text: str):
    """Finish the pipeline after LLM correction: inject the text, free the worker."""
    bridge = self.bridge
    bridge.update_status("Ready")
    bridge.isProcessing = False

    if text:
        input_method = self.config.get("input_method")
        typing_speed = int(self.config.get("typing_speed"))
        InputInjector.inject_text(text, input_method, typing_speed)

    # Cleanup
    finished_worker = getattr(self, 'llm_worker', None)
    if finished_worker:
        finished_worker.deleteLater()
        self.llm_worker = None
|
||||
|
||||
@Slot(bool)
|
||||
def on_hotkeys_enabled_toggle(self, state):
|
||||
@@ -638,6 +772,19 @@ class WhisperApp(QObject):
|
||||
self.download_worker.error.connect(self.on_download_error)
|
||||
self.download_worker.start()
|
||||
|
||||
@Slot()
def on_llm_download_requested(self):
    """Start the LLM model download unless a download is already running."""
    bridge = self.bridge
    if bridge.isDownloading:
        return

    bridge.update_status("Downloading LLM...")
    bridge.isDownloading = True

    self.llm_dl_worker = LLMDownloadWorker(parent=self)
    downloader = self.llm_dl_worker
    # Progress, completion and error handling reuse the existing download slots.
    downloader.progress.connect(self.on_loader_progress)
    downloader.finished.connect(self.on_download_finished)
    downloader.error.connect(self.on_download_error)
    downloader.start()
|
||||
|
||||
def on_download_finished(self):
|
||||
self.bridge.isDownloading = False
|
||||
self.bridge.update_status("Ready")
|
||||
|
||||
Reference in New Issue
Block a user