Files
whisper_voice/src/core/transcriber.py
Your Name 0b2b5848e2 Fix: Translation Reliability, Click-Through, and Docs Sync
- Transcriber: Enforced 'beam_size=5' and prompt injection for robust translation.
- Transcriber: Removed conditioning on previous text to prevent language stickiness.
- Transcriber: Refactored kwargs to sanitize inputs.
- Overlay: Fixed click-through by toggling WS_EX_TRANSPARENT.
- UI: Added real download progress reporting.
- Docs: Refactored language list to table.
2026-01-24 19:05:43 +02:00

175 lines
6.5 KiB
Python

"""
Whisper Transcriber Module.
===========================
Transcriber Module.
===================
Handles audio transcription using faster-whisper.
Runs IN-PROCESS (no subprocess) to ensure stability on all systems.
"""
import os
import logging
from typing import Optional
import numpy as np
from src.core.config import ConfigManager
from src.core.paths import get_models_path
# Import directly - valid since we are now running in the full environment
from faster_whisper import WhisperModel
class WhisperTranscriber:
    """
    Manages the faster-whisper model and transcription process.

    The model is loaded lazily from a local directory when available
    (offline support) or from the Hugging Face hub otherwise, and
    transcription runs in-process.
    """

    # Tasks accepted by Whisper models.
    VALID_TASKS = ("transcribe", "translate")

    def __init__(self):
        """Initialize settings; the model itself is loaded on demand."""
        self.config = ConfigManager()
        self.model = None
        # Track what is currently loaded so load_model() can skip reloads.
        self.current_model_size = None
        self.current_compute_device = None
        self.current_compute_type = None

    def load_model(self):
        """
        Load the model specified in config.

        Safe to call multiple times: it is a no-op when the requested
        size/device/compute-type combination is already loaded. On failure
        the error is logged and ``self.model`` is left as None.
        """
        size = self.config.get("model_size")
        device = self.config.get("compute_device")
        compute = self.config.get("compute_type")
        # Skip the (expensive) reload if nothing changed since last load.
        if (self.model
                and self.current_model_size == size
                and self.current_compute_device == device
                and self.current_compute_type == compute):
            return
        logging.info("Loading Model: %s on %s (%s)...", size, device, compute)
        try:
            # Prefer the local model directory for offline support.
            new_path = get_models_path() / f"faster-whisper-{size}"
            model_input = str(new_path) if new_path.exists() else size
            # Force offline mode when the model exists locally to avoid
            # Hugging Face hub errors (e.g. no network access).
            local_only = new_path.exists()
            self.model = WhisperModel(
                model_input,
                device=device,
                compute_type=compute,
                download_root=str(get_models_path()),
                local_files_only=local_only,
            )
            self.current_model_size = size
            self.current_compute_device = device
            self.current_compute_type = compute
            logging.info("Model loaded successfully.")
        except Exception as e:
            logging.error(f"Failed to load model: {e}")
            self.model = None

    def _resolve_task(self, task: Optional[str]) -> str:
        """
        Normalize the task name (override first, then config).

        Lowercases/strips the value ("Transcribe" -> "transcribe") and
        falls back to "transcribe" when the value is missing or invalid.
        """
        raw_task = task if task else self.config.get("task")
        final_task = str(raw_task).strip().lower() if raw_task else "transcribe"
        if final_task not in self.VALID_TASKS:
            logging.warning(f"Invalid task '{final_task}' detected. Defaulting to 'transcribe'.")
            final_task = "transcribe"
        return final_task

    def _build_transcribe_options(self, is_file: bool, task: Optional[str]) -> dict:
        """
        Assemble keyword arguments for ``WhisperModel.transcribe``.

        Arguments are built dynamically so that unset values (language,
        initial_prompt) are omitted entirely rather than passed as None.
        """
        beam_size = int(self.config.get("beam_size"))
        best_of = int(self.config.get("best_of"))
        # VAD is only honoured for live (non-file) audio.
        vad = False if is_file else self.config.get("vad_filter")
        language = self.config.get("language")
        final_task = self._resolve_task(task)
        # "auto" means let the model detect the language.
        final_language = language if language != "auto" else None
        condition_prev = self.config.get("condition_on_previous_text")
        initial_prompt = self.config.get("initial_prompt")
        if final_task == "translate":
            # Anti-hallucination: conditioning on previous text makes the
            # model "stick" to the source language during translation.
            condition_prev = False
            # Translation requires more search breadth than greedy decoding
            # to find the English mapping, so force beam search.
            if beam_size < 5:
                logging.info("Forcing beam_size=5 for Translation task.")
                beam_size = 5
            # Inject a guidance prompt if the user has not set one.
            if not initial_prompt:
                initial_prompt = "Translate this to English."
        logging.info(f"Model Dispatch: Task='{final_task}', Language='{final_language}', ConditionPrev={condition_prev}, Beam={beam_size}")
        opts = {
            "beam_size": beam_size,
            "best_of": best_of,
            "vad_filter": vad,
            "task": final_task,
            "vad_parameters": dict(min_silence_duration_ms=500),
            "condition_on_previous_text": condition_prev,
            "without_timestamps": True,
        }
        if initial_prompt:
            opts["initial_prompt"] = initial_prompt
        # Only pass language when explicitly set (not None/"auto"); an
        # explicit None could confuse the model vs. omitting the argument.
        if final_language:
            opts["language"] = final_language
        return opts

    def transcribe(self, audio_data, is_file: bool = False, task: Optional[str] = None) -> str:
        """
        Transcribe (or translate) audio data.

        Args:
            audio_data: Audio samples or a file path, in any form accepted
                by faster-whisper's ``WhisperModel.transcribe``.
            is_file: True when ``audio_data`` is a file; disables the VAD
                filter in that case.
            task: Optional override ("transcribe"/"translate"); falls back
                to the configured task when not given.

        Returns:
            The aggregated transcription text, or an "Error: ..." string
            on failure (this method never raises).
        """
        logging.info("Starting transcription... (is_file=%s, task=%s)", is_file, task)
        # Ensure the model is loaded (lazy load on first use).
        if not self.model:
            self.load_model()
        if not self.model:
            return "Error: Model failed to load."
        try:
            opts = self._build_transcribe_options(is_file, task)
            segments, _info = self.model.transcribe(audio_data, **opts)
            # Join segment texts with single spaces; equivalent to the
            # previous append-then-strip loop but linear-time.
            return " ".join(segment.text for segment in segments).strip()
        except Exception as e:
            logging.error(f"Transcription failed: {e}")
            return f"Error: {str(e)}"

    def model_exists(self, size: str) -> bool:
        """Check whether a model of the given size is already downloaded."""
        new_path = get_models_path() / f"faster-whisper-{size}"
        if (new_path / "config.json").exists():
            return True
        # Legacy Hugging Face cache layout check.
        folder_name = f"models--Systran--faster-whisper-{size}"
        path = get_models_path() / folder_name / "snapshots"
        return path.exists() and any(path.iterdir())