add ffmpeg.rs, subtitles.rs, and fonts.rs

2026-02-19 01:59:21 +02:00
parent 4fd04f55c8
commit ffa37e6061
4 changed files with 2075 additions and 0 deletions
--- a/src-tauri/src/subtitles.rs
+++ b/src-tauri/src/subtitles.rs
@@ -0,0 +1,652 @@
+//! Subtitle handling: SRT-to-VTT conversion, sidecar discovery, storage,
+//! and embedded subtitle extraction via ffmpeg.
+
+use once_cell::sync::Lazy;
+use regex::Regex;
+use serde::{Deserialize, Serialize};
+use std::fs;
+use std::path::{Path, PathBuf};
+use std::process::Command;
+
+#[cfg(target_os = "windows")]
+use std::os::windows::process::CommandExt;
+
+// ---------------------------------------------------------------------------
+// Constants
+// ---------------------------------------------------------------------------
+
+/// Supported subtitle file extensions: lowercase, with the leading dot, for matching lowercased names.
+pub const SUB_EXTS: &[&str] = &[".srt", ".vtt"];
+
+/// Lowercase language suffixes treated as English; `auto_subtitle_sidecar` ranks these ahead of others.
+const ENGLISH_LANGS: &[&str] = &["en", "eng", "english"];
+
+/// Lowercase language suffixes that `strip_lang_suffix` recognizes at the end of a subtitle stem.
+const ALL_LANG_SUFFIXES: &[&str] = &[
+    "en", "eng", "english", "fr", "de", "es", "it", "pt", "ru", "ja", "ko", "zh",
+];
+
+/// Windows `CREATE_NO_WINDOW` process-creation flag, passed to `creation_flags` when spawning ffmpeg.
+#[cfg(target_os = "windows")]
+const CREATE_NO_WINDOW: u32 = 0x08000000;
+
+// ---------------------------------------------------------------------------
+// Structs
+// ---------------------------------------------------------------------------
+
+/// Result of storing or extracting a subtitle file for a video (serializable via serde).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SubtitleStored {
+    /// Path of the VTT file as `"{subs_dir name}/{file name}"`, e.g. `"subtitles/{fid}_{name}.vtt"`.
+    pub vtt: String,
+    /// Display label: the source filename for stored files, `"Embedded track N"` for extracted tracks.
+    pub label: String,
+}
+
+// ---------------------------------------------------------------------------
+// Compiled regex patterns
+// ---------------------------------------------------------------------------
+
+/// Matches a string made up solely of digits (an SRT cue index); callers pass the trimmed line.
+static CUE_INDEX_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^\d+$").unwrap());
+
+/// Matches any single character other than an ASCII letter, an ASCII digit, `.`, `_`, or `-`.
+static SANITIZE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[^a-zA-Z0-9._\-]").unwrap());
+
+/// Matches a run of hyphens, underscores, and/or whitespace; `normalize_stem`
+/// replaces each such run with a single space before subtitle stems are compared.
+static NORMALIZE_SEP_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[-_\s]+").unwrap());
+
+// ---------------------------------------------------------------------------
+// 1. srt_to_vtt
+// ---------------------------------------------------------------------------
+
+/// Converts SRT subtitle text into a WebVTT document.
+///
+/// The conversion:
+/// - drops every byte-order mark (`\u{FEFF}`) found in the input,
+/// - starts the output with the `WEBVTT` header followed by a blank line,
+/// - discards digit-only cue index lines,
+/// - rewrites commas as dots on timing lines (lines containing `-->`),
+/// - copies cue text verbatim up to (and including) the next blank line.
+///
+/// Lines outside a cue that are neither blank, a cue index, nor a timing line
+/// are ignored, as is the line directly after a cue index when it is not a
+/// timing line. The result is trimmed and ends with exactly one newline.
+pub fn srt_to_vtt(srt_text: &str) -> String {
+    let cleaned = srt_text.replace('\u{FEFF}', "");
+    let lines: Vec<&str> = cleaned.lines().collect();
+    let mut out: Vec<String> = vec!["WEBVTT".to_string(), String::new()];
+
+    let mut pos = 0;
+    while let Some(raw) = lines.get(pos) {
+        pos += 1;
+        let current = raw.trim_end_matches('\r');
+
+        // Blank lines are mirrored into the output as cue separators.
+        if current.trim().is_empty() {
+            out.push(String::new());
+            continue;
+        }
+
+        // A cue index is dropped and the line after it becomes the timing
+        // candidate; any other line is its own candidate.
+        let timing = if CUE_INDEX_RE.is_match(current.trim()) {
+            match lines.get(pos) {
+                Some(next) => {
+                    pos += 1;
+                    next.trim_end_matches('\r')
+                }
+                None => break,
+            }
+        } else {
+            current
+        };
+
+        if !timing.contains("-->") {
+            continue;
+        }
+        out.push(timing.replace(',', "."));
+
+        // Cue payload: everything up to the next blank line (or end of input).
+        while let Some(payload) = lines.get(pos) {
+            pos += 1;
+            let payload = payload.trim_end_matches('\r');
+            if payload.trim().is_empty() {
+                out.push(String::new());
+                break;
+            }
+            out.push(payload.to_string());
+        }
+    }
+
+    format!("{}\n", out.join("\n").trim())
+}
+
+// ---------------------------------------------------------------------------
+// 2. auto_subtitle_sidecar (helpers)
+// ---------------------------------------------------------------------------
+
+/// Produces the canonical form used for fuzzy subtitle matching.
+///
+/// The input is lowercased, every run of `-`, `_`, and whitespace becomes a
+/// single space, and leading/trailing whitespace is removed.
+fn normalize_stem(s: &str) -> String {
+    NORMALIZE_SEP_RE
+        .replace_all(&s.to_lowercase(), " ")
+        .trim()
+        .to_string()
+}
+
+/// Splits a known language suffix off the end of a subtitle stem.
+///
+/// The text after the last `.` is lowercased and looked up in
+/// `ALL_LANG_SUFFIXES`. On a hit, returns the text before that dot together
+/// with the lowercased suffix, e.g. `"video.EN"` -> `Some(("video", "en"))`.
+/// Returns `None` when the stem has no dot or the suffix is not a known
+/// language.
+fn strip_lang_suffix(stem: &str) -> Option<(String, String)> {
+    let (base, suffix) = stem.rsplit_once('.')?;
+    let lang = suffix.to_lowercase();
+    if ALL_LANG_SUFFIXES.contains(&lang.as_str()) {
+        Some((base.to_string(), lang))
+    } else {
+        None
+    }
+}
+
+/// Find the best subtitle sidecar file for the given video path.
+///
+/// Scans the video's directory for regular files whose (lowercased) name ends
+/// with one of [`SUB_EXTS`] and ranks each candidate against the video's file
+/// stem.
+///
+/// Priority (lower is better):
+/// - 0: Exact stem match (case-insensitive)
+/// - 1: Normalized stem match (see `normalize_stem`)
+/// - 2: English language suffix with exact base
+/// - 3: English language suffix with normalized base
+/// - 4: Other language suffix with exact base
+/// - 5: Other language suffix with normalized base
+///
+/// When several candidates share the best priority (for example `a.srt` and
+/// `a.vtt`), the smallest path wins, so the result does not depend on the
+/// order in which the directory is enumerated.
+///
+/// A bare relative path such as `video.mp4` is resolved against the current
+/// directory; matches are then returned as `./name.ext`.
+///
+/// Returns `None` when the path has no parent or file stem, when the
+/// directory cannot be read, or when no candidate matches.
+pub fn auto_subtitle_sidecar(video_path: &Path) -> Option<PathBuf> {
+    // `Path::parent` yields an empty path for a bare file name, and
+    // `read_dir("")` fails, so scan "." in that case.
+    let parent = match video_path.parent() {
+        Some(p) if p.as_os_str().is_empty() => Path::new("."),
+        Some(p) => p,
+        None => return None,
+    };
+    let video_stem = video_path.file_stem()?.to_string_lossy().to_string();
+    let video_stem_lower = video_stem.to_lowercase();
+    let video_stem_norm = normalize_stem(&video_stem);
+
+    let entries = fs::read_dir(parent).ok()?;
+
+    let mut best: Option<(u8, PathBuf)> = None;
+
+    // Entries that cannot be read are skipped rather than aborting the scan.
+    for entry in entries.flatten() {
+        let path = entry.path();
+        if !path.is_file() {
+            continue;
+        }
+
+        // Must end with a supported subtitle extension.
+        let fname_lower = match path.file_name() {
+            Some(n) => n.to_string_lossy().to_lowercase(),
+            None => continue,
+        };
+        if !SUB_EXTS.iter().any(|ext| fname_lower.ends_with(*ext)) {
+            continue;
+        }
+
+        // Stem without the subtitle extension, e.g. "lecture.en" for
+        // "lecture.en.srt".
+        let sub_stem = match path.file_stem() {
+            Some(s) => s.to_string_lossy().to_string(),
+            None => continue,
+        };
+
+        let priority: u8 = if sub_stem.to_lowercase() == video_stem_lower {
+            0
+        } else {
+            // A recognised language suffix whose base matches the video takes
+            // precedence over the plain normalized comparison below.
+            let suffixed = strip_lang_suffix(&sub_stem).and_then(|(base, lang)| {
+                let english = ENGLISH_LANGS.contains(&lang.as_str());
+                if base.to_lowercase() == video_stem_lower {
+                    Some(if english { 2 } else { 4 })
+                } else if normalize_stem(&base) == video_stem_norm {
+                    Some(if english { 3 } else { 5 })
+                } else {
+                    None
+                }
+            });
+            match suffixed {
+                Some(p) => p,
+                None if normalize_stem(&sub_stem) == video_stem_norm => 1,
+                None => continue,
+            }
+        };
+
+        // Lower priority wins; equal priorities fall back to path order so the
+        // choice is deterministic.
+        let better = match &best {
+            None => true,
+            Some((bp, bpath)) => priority < *bp || (priority == *bp && path < *bpath),
+        };
+        if better {
+            best = Some((priority, path));
+        }
+    }
+
+    best.map(|(_, p)| p)
+}
+
+// ---------------------------------------------------------------------------
+// 3. store_subtitle_for_fid
+// ---------------------------------------------------------------------------
+
+/// Makes a string safe to embed in a file name.
+///
+/// Every character other than an ASCII letter, an ASCII digit, `.`, `_`, or
+/// `-` is replaced with `_`, and the result is cut to at most 60 characters.
+fn sanitize_name(name: &str) -> String {
+    const MAX_LEN: usize = 60;
+
+    // After replacement the string is pure ASCII, so truncating at a byte
+    // offset always lands on a character boundary.
+    let mut cleaned = SANITIZE_RE.replace_all(name, "_").into_owned();
+    cleaned.truncate(MAX_LEN);
+    cleaned
+}
+
+/// Store a subtitle file for a given fid. Converts SRT->VTT if needed.
+///
+/// The output file is written as `{fid}_{sanitized_name}.vtt` inside
+/// `subs_dir` (created if missing), where `sanitized_name` is the source file
+/// stem passed through `sanitize_name`. An existing file of the same name is
+/// overwritten.
+///
+/// The source is decoded as UTF-8 leniently: invalid byte sequences (common
+/// in subtitle files saved with legacy encodings) become U+FFFD instead of
+/// causing the whole file to be rejected.
+///
+/// Returns `SubtitleStored` whose `vtt` is `"{subs_dir name}/{output name}"`
+/// and whose `label` is the source file name.
+///
+/// Returns `None` if the source extension is not in [`SUB_EXTS`], if
+/// `subs_dir` cannot be created, or if reading the source or writing the
+/// output fails.
+pub fn store_subtitle_for_fid(
+    fid: &str,
+    src_path: &Path,
+    subs_dir: &Path,
+) -> Option<SubtitleStored> {
+    let ext_lower = src_path
+        .extension()
+        .map(|e| format!(".{}", e.to_string_lossy().to_lowercase()))?;
+
+    if !SUB_EXTS.contains(&ext_lower.as_str()) {
+        return None;
+    }
+
+    let src_filename = src_path
+        .file_name()
+        .map(|n| n.to_string_lossy().to_string())
+        .unwrap_or_default();
+
+    let src_stem = src_path
+        .file_stem()
+        .map(|s| s.to_string_lossy().to_string())
+        .unwrap_or_else(|| "subtitle".to_string());
+
+    // NOTE(review): `fid` is used verbatim in the file name; this assumes
+    // callers pass an identifier without path separators - confirm.
+    let out_name = format!("{}_{}.vtt", fid, sanitize_name(&src_stem));
+
+    // Ensure subs_dir exists; without it the write below cannot succeed.
+    fs::create_dir_all(subs_dir).ok()?;
+
+    let out_path = subs_dir.join(&out_name);
+
+    // Read raw bytes and decode lossily so non-UTF-8 subtitle files are still
+    // usable rather than silently dropped.
+    let raw = fs::read(src_path).ok()?;
+    let content = String::from_utf8_lossy(&raw);
+
+    let vtt_content = if ext_lower == ".srt" {
+        srt_to_vtt(&content)
+    } else {
+        // Already VTT - stored without conversion.
+        content.into_owned()
+    };
+
+    fs::write(&out_path, vtt_content.as_bytes()).ok()?;
+
+    // Build relative path: "{subs_dir name}/{out_name}", falling back to
+    // "subtitles" when subs_dir has no final component.
+    let subs_dir_name = subs_dir
+        .file_name()
+        .map(|n| n.to_string_lossy().to_string())
+        .unwrap_or_else(|| "subtitles".to_string());
+
+    Some(SubtitleStored {
+        vtt: format!("{}/{}", subs_dir_name, out_name),
+        label: src_filename,
+    })
+}
+
+// ---------------------------------------------------------------------------
+// 4. extract_embedded_subtitle
+// ---------------------------------------------------------------------------
+
+/// Extract an embedded subtitle track from a video using ffmpeg.
+///
+/// Runs: `ffmpeg -y -i {video_path} -map 0:{track_index} -c:s webvtt {output_path}`
+///
+/// The output file is `{fid}_embedded_{track_index}.vtt` inside `subs_dir`
+/// (created if missing); `-y` lets ffmpeg overwrite an existing file. Paths
+/// are handed to ffmpeg as OS strings, so non-UTF-8 paths are passed through
+/// unmodified. On Windows, the process is created with `CREATE_NO_WINDOW`.
+///
+/// Returns `SubtitleStored` whose `vtt` is `"{subs_dir name}/{output name}"`
+/// and whose `label` is `"Embedded track {track_index}"`.
+///
+/// # Errors
+///
+/// Returns a message string when `subs_dir` cannot be created, when ffmpeg
+/// cannot be started, when it exits unsuccessfully (the message includes its
+/// stderr), or when it reports success but the output file does not exist.
+pub fn extract_embedded_subtitle(
+    video_path: &Path,
+    track_index: u32,
+    ffmpeg_path: &Path,
+    subs_dir: &Path,
+    fid: &str,
+) -> Result<SubtitleStored, String> {
+    // Fail early with a clear message instead of letting ffmpeg report an
+    // unwritable output path.
+    fs::create_dir_all(subs_dir).map_err(|e| {
+        format!(
+            "Failed to create subtitle directory {}: {}",
+            subs_dir.display(),
+            e
+        )
+    })?;
+
+    let out_name = format!("{}_embedded_{}.vtt", fid, track_index);
+    let out_path = subs_dir.join(&out_name);
+
+    // Pass paths as `OsStr` arguments; a lossy string conversion would corrupt
+    // any path that is not valid UTF-8.
+    let mut cmd = Command::new(ffmpeg_path);
+    cmd.arg("-y")
+        .arg("-i")
+        .arg(video_path)
+        .arg("-map")
+        .arg(format!("0:{}", track_index))
+        .args(["-c:s", "webvtt"])
+        .arg(&out_path);
+
+    #[cfg(target_os = "windows")]
+    {
+        cmd.creation_flags(CREATE_NO_WINDOW);
+    }
+
+    let output = cmd
+        .output()
+        .map_err(|e| format!("Failed to run ffmpeg: {}", e))?;
+
+    if !output.status.success() {
+        let stderr = String::from_utf8_lossy(&output.stderr);
+        return Err(format!(
+            "ffmpeg exited with status {}: {}",
+            output.status, stderr
+        ));
+    }
+
+    if !out_path.exists() {
+        return Err("ffmpeg did not produce an output file".to_string());
+    }
+
+    // Relative path: "{subs_dir name}/{out_name}", falling back to
+    // "subtitles" when subs_dir has no final component.
+    let subs_dir_name = subs_dir
+        .file_name()
+        .map(|n| n.to_string_lossy().to_string())
+        .unwrap_or_else(|| "subtitles".to_string());
+
+    Ok(SubtitleStored {
+        vtt: format!("{}/{}", subs_dir_name, out_name),
+        label: format!("Embedded track {}", track_index),
+    })
+}
+
+// ===========================================================================
+// Tests
+// ===========================================================================
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::fs;
+    use tempfile::TempDir;
+
+    // -- srt_to_vtt ----------------------------------------------------------
+
+    #[test]
+    fn test_srt_to_vtt_basic() {
+        let srt = "\
1
00:00:01,000 --> 00:00:04,000
Hello, world!

2
00:00:05,000 --> 00:00:08,000
This is a test.
";
+        let vtt = srt_to_vtt(srt);
+        assert!(vtt.starts_with("WEBVTT\n"));
+        assert!(vtt.contains("00:00:01.000 --> 00:00:04.000"));
+        assert!(vtt.contains("Hello, world!"));
+        assert!(vtt.contains("00:00:05.000 --> 00:00:08.000"));
+        assert!(vtt.contains("This is a test."));
+        // The comma-separated SRT timestamps must not survive (commas in cue text are kept).
+        assert!(!vtt.contains("00:00:01,000"));
+        assert!(!vtt.contains("00:00:04,000"));
+    }
+
+    #[test]
+    fn test_srt_to_vtt_bom() {
+        let srt = "\u{FEFF}1\n00:00:01,000 --> 00:00:02,000\nHello\n";
+        let vtt = srt_to_vtt(srt);
+        assert!(vtt.starts_with("WEBVTT"));
+        // The BOM must be stripped from the output entirely.
+        assert!(!vtt.contains('\u{FEFF}'));
+        assert!(vtt.contains("Hello"));
+    }
+
+    #[test]
+    fn test_srt_to_vtt_empty() {
+        let vtt = srt_to_vtt("");
+        assert!(vtt.starts_with("WEBVTT"));
+        // Empty input yields only the header (ignoring surrounding whitespace).
+        assert_eq!(vtt.trim(), "WEBVTT");
+    }
+
+    #[test]
+    fn test_srt_to_vtt_windows_line_endings() {
+        let srt = "1\r\n00:00:01,000 --> 00:00:02,000\r\nHello\r\n\r\n\
+                    2\r\n00:00:03,000 --> 00:00:04,000\r\nWorld\r\n";
+        let vtt = srt_to_vtt(srt);
+        assert!(vtt.starts_with("WEBVTT"));
+        assert!(vtt.contains("00:00:01.000 --> 00:00:02.000"));
+        assert!(vtt.contains("Hello"));
+        assert!(vtt.contains("00:00:03.000 --> 00:00:04.000"));
+        assert!(vtt.contains("World"));
+    }
+
+    #[test]
+    fn test_srt_to_vtt_no_cue_indices() {
+        // Some SRT files omit cue numbers entirely; timing lines must still be recognized.
+        let srt = "\
00:00:01,500 --> 00:00:03,500
First line

00:00:04,000 --> 00:00:06,000
Second line
";
+        let vtt = srt_to_vtt(srt);
+        assert!(vtt.starts_with("WEBVTT"));
+        assert!(vtt.contains("00:00:01.500 --> 00:00:03.500"));
+        assert!(vtt.contains("First line"));
+        assert!(vtt.contains("00:00:04.000 --> 00:00:06.000"));
+        assert!(vtt.contains("Second line"));
+    }
+
+    // -- auto_subtitle_sidecar -----------------------------------------------
+
+    #[test]
+    fn test_auto_subtitle_sidecar_exact_match() {
+        let dir = TempDir::new().unwrap();
+        let video = dir.path().join("lecture.mp4");
+        let sub = dir.path().join("lecture.srt");
+        fs::write(&video, b"video").unwrap();
+        fs::write(&sub, b"1\n00:00:00,000 --> 00:00:01,000\nhi\n").unwrap();
+
+        let result = auto_subtitle_sidecar(&video);
+        assert!(result.is_some());
+        assert_eq!(result.unwrap(), sub);
+    }
+
+    #[test]
+    fn test_auto_subtitle_sidecar_english_suffix() {
+        let dir = TempDir::new().unwrap();
+        let video = dir.path().join("lecture.mp4");
+        let sub = dir.path().join("lecture.en.srt");
+        fs::write(&video, b"video").unwrap();
+        fs::write(&sub, b"sub content").unwrap();
+
+        let result = auto_subtitle_sidecar(&video);
+        assert!(result.is_some());
+        assert_eq!(result.unwrap(), sub);
+    }
+
+    #[test]
+    fn test_auto_subtitle_sidecar_no_match() {
+        let dir = TempDir::new().unwrap();
+        let video = dir.path().join("lecture.mp4");
+        fs::write(&video, b"video").unwrap();
+        // The directory holds only the video, so there is no candidate to return.
+        let result = auto_subtitle_sidecar(&video);
+        assert!(result.is_none());
+    }
+
+    #[test]
+    fn test_auto_subtitle_sidecar_priority_order() {
+        let dir = TempDir::new().unwrap();
+        let video = dir.path().join("lecture.mp4");
+        fs::write(&video, b"video").unwrap();
+
+        // Priority 0: exact stem match.
+        let exact = dir.path().join("lecture.srt");
+        // Priority 2: English suffix with exact base.
+        let en_suffix = dir.path().join("lecture.en.srt");
+        // Priority 4: Other language suffix with exact base.
+        let fr_suffix = dir.path().join("lecture.fr.srt");
+
+        fs::write(&exact, b"exact").unwrap();
+        fs::write(&en_suffix, b"english").unwrap();
+        fs::write(&fr_suffix, b"french").unwrap();
+
+        let result = auto_subtitle_sidecar(&video);
+        assert!(result.is_some());
+        // Should pick priority 0 (exact match) over others.
+        assert_eq!(result.unwrap(), exact);
+
+        // Remove exact match -> should pick English suffix (priority 2).
+        fs::remove_file(&exact).unwrap();
+        let result = auto_subtitle_sidecar(&video);
+        assert!(result.is_some());
+        assert_eq!(result.unwrap(), en_suffix);
+
+        // Remove English suffix -> should pick French suffix (priority 4).
+        fs::remove_file(&en_suffix).unwrap();
+        let result = auto_subtitle_sidecar(&video);
+        assert!(result.is_some());
+        assert_eq!(result.unwrap(), fr_suffix);
+    }
+
+    // -- store_subtitle_for_fid ----------------------------------------------
+
+    #[test]
+    fn test_store_subtitle_srt_converts_to_vtt() {
+        let dir = TempDir::new().unwrap();
+        let subs_dir = dir.path().join("subtitles");
+        let src = dir.path().join("my_sub.srt");
+
+        let srt_content = "1\n00:00:01,000 --> 00:00:02,000\nHello\n";
+        fs::write(&src, srt_content).unwrap();
+
+        let result = store_subtitle_for_fid("abc123", &src, &subs_dir);
+        assert!(result.is_some());
+
+        let stored = result.unwrap();
+        assert!(stored.vtt.ends_with(".vtt"));
+        assert!(stored.vtt.starts_with("subtitles/"));
+        assert_eq!(stored.label, "my_sub.srt");
+
+        // Verify the output file exists under `{fid}_{stem}.vtt` and holds converted VTT text.
+        let out_path = subs_dir.join(format!("abc123_{}.vtt", "my_sub"));
+        assert!(out_path.exists());
+
+        let vtt_content = fs::read_to_string(&out_path).unwrap();
+        assert!(vtt_content.starts_with("WEBVTT"));
+        assert!(vtt_content.contains("00:00:01.000 --> 00:00:02.000"));
+        assert!(vtt_content.contains("Hello"));
+    }
+
+    #[test]
+    fn test_store_subtitle_vtt_copies() {
+        let dir = TempDir::new().unwrap();
+        let subs_dir = dir.path().join("subtitles");
+        let src = dir.path().join("my_sub.vtt");
+
+        let vtt_content = "WEBVTT\n\n00:00:01.000 --> 00:00:02.000\nHello\n";
+        fs::write(&src, vtt_content).unwrap();
+
+        let result = store_subtitle_for_fid("def456", &src, &subs_dir);
+        assert!(result.is_some());
+
+        let stored = result.unwrap();
+        assert!(stored.vtt.ends_with(".vtt"));
+        assert_eq!(stored.label, "my_sub.vtt");
+
+        // Verify the output file matches the source exactly (VTT input is not run through conversion).
+        let out_path = subs_dir.join("def456_my_sub.vtt");
+        assert!(out_path.exists());
+
+        let content = fs::read_to_string(&out_path).unwrap();
+        assert_eq!(content, vtt_content);
+    }
+
+    #[test]
+    fn test_store_subtitle_unsupported_ext() {
+        let dir = TempDir::new().unwrap();
+        let subs_dir = dir.path().join("subtitles");
+        let src = dir.path().join("notes.txt");
+        fs::write(&src, "Some notes").unwrap();
+
+        let result = store_subtitle_for_fid("xyz789", &src, &subs_dir);
+        assert!(result.is_none());
+    }
+}