//! Subtitle handling: SRT-to-VTT conversion, sidecar discovery, storage, //! and embedded subtitle extraction via ffmpeg. use once_cell::sync::Lazy; use regex::Regex; use serde::{Deserialize, Serialize}; use std::fs; use std::path::{Path, PathBuf}; use std::process::Command; #[cfg(target_os = "windows")] use std::os::windows::process::CommandExt; // --------------------------------------------------------------------------- // Constants // --------------------------------------------------------------------------- /// Supported subtitle file extensions. pub const SUB_EXTS: &[&str] = &[".srt", ".vtt"]; /// Languages considered "English" for sidecar priority. const ENGLISH_LANGS: &[&str] = &["en", "eng", "english"]; /// All language suffixes to strip when normalizing subtitle basenames. const ALL_LANG_SUFFIXES: &[&str] = &[ "en", "eng", "english", "fr", "de", "es", "it", "pt", "ru", "ja", "ko", "zh", ]; /// Windows CREATE_NO_WINDOW flag for subprocess creation. #[cfg(target_os = "windows")] const CREATE_NO_WINDOW: u32 = 0x08000000; // --------------------------------------------------------------------------- // Structs // --------------------------------------------------------------------------- /// Result of storing a subtitle file for a video. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SubtitleStored { /// Relative path like `"subtitles/{fid}_{name}.vtt"`. pub vtt: String, /// Display label (source filename). pub label: String, } // --------------------------------------------------------------------------- // Compiled regex patterns // --------------------------------------------------------------------------- /// Matches a line that is only digits (SRT cue index). static CUE_INDEX_RE: Lazy = Lazy::new(|| Regex::new(r"^\d+$").unwrap()); /// Matches characters that are NOT alphanumeric, dot, underscore, or hyphen. static SANITIZE_RE: Lazy = Lazy::new(|| Regex::new(r"[^a-zA-Z0-9._\-]").unwrap()); /// Collapses runs of whitespace and dash/underscore into a single space for /// normalized comparison of subtitle stems. static NORMALIZE_SEP_RE: Lazy = Lazy::new(|| Regex::new(r"[-_\s]+").unwrap()); // --------------------------------------------------------------------------- // 1. srt_to_vtt // --------------------------------------------------------------------------- /// Convert SRT subtitle text to WebVTT format string. /// /// - Removes BOM (`\u{FEFF}`) if present. /// - Adds the `WEBVTT` header. /// - Skips cue index numbers (lines that are just digits). /// - Converts timestamp separators: comma -> dot. /// - Collects subtitle text between timestamp lines and empty lines. pub fn srt_to_vtt(srt_text: &str) -> String { let text = srt_text.replace('\u{FEFF}', ""); let lines: Vec<&str> = text.lines().collect(); let mut out: Vec = vec!["WEBVTT".to_string(), String::new()]; let mut i = 0; while i < lines.len() { let line = lines[i].trim_end_matches('\r'); // Empty line -> blank line in output if line.trim().is_empty() { out.push(String::new()); i += 1; continue; } // Skip cue index (pure digit line) if CUE_INDEX_RE.is_match(line.trim()) { i += 1; if i >= lines.len() { break; } // Re-read the next line as a potential timestamp let line = lines[i].trim_end_matches('\r'); if line.contains("-->") { let ts_line = line.replace(',', "."); out.push(ts_line); i += 1; // Collect subtitle text until blank line while i < lines.len() { let t = lines[i].trim_end_matches('\r'); if t.trim().is_empty() { out.push(String::new()); i += 1; break; } out.push(t.to_string()); i += 1; } } else { i += 1; } } else if line.contains("-->") { // Timestamp line without preceding cue index let ts_line = line.replace(',', "."); out.push(ts_line); i += 1; while i < lines.len() { let t = lines[i].trim_end_matches('\r'); if t.trim().is_empty() { out.push(String::new()); i += 1; break; } out.push(t.to_string()); i += 1; } } else { i += 1; } } let joined = out.join("\n"); format!("{}\n", joined.trim()) } // --------------------------------------------------------------------------- // 2. auto_subtitle_sidecar (helpers) // --------------------------------------------------------------------------- /// Normalize a string for fuzzy subtitle matching: lowercase, replace `-` and /// `_` with space, collapse whitespace. fn normalize_stem(s: &str) -> String { let lower = s.to_lowercase(); let replaced = NORMALIZE_SEP_RE.replace_all(&lower, " "); replaced.trim().to_string() } /// Strip a trailing language suffix from a subtitle stem. /// /// For example, `"video.en"` -> `Some(("video", "en"))`. /// Returns `None` if no known language suffix is found. fn strip_lang_suffix(stem: &str) -> Option<(String, String)> { if let Some(dot_pos) = stem.rfind('.') { let base = &stem[..dot_pos]; let suffix = &stem[dot_pos + 1..]; let suffix_lower = suffix.to_lowercase(); if ALL_LANG_SUFFIXES.contains(&suffix_lower.as_str()) { return Some((base.to_string(), suffix_lower)); } } None } /// Find a subtitle sidecar file matching the given video path. /// /// Returns the best matching subtitle file path, or `None`. /// /// Priority (lower is better): /// - 0: Exact stem match (case-insensitive) /// - 1: Normalized exact match /// - 2: English language suffix with exact base /// - 3: English language suffix with normalized base /// - 4: Other language suffix with exact base /// - 5: Other/no language with normalized base pub fn auto_subtitle_sidecar(video_path: &Path) -> Option { let parent = video_path.parent()?; let video_stem = video_path.file_stem()?.to_string_lossy().to_string(); let video_stem_lower = video_stem.to_lowercase(); let video_stem_norm = normalize_stem(&video_stem); // Collect all subtitle files in the same directory. let entries = fs::read_dir(parent).ok()?; let mut best: Option<(u8, PathBuf)> = None; for entry in entries.flatten() { let path = entry.path(); if !path.is_file() { continue; } let fname = match path.file_name() { Some(n) => n.to_string_lossy().to_string(), None => continue, }; let fname_lower = fname.to_lowercase(); // Must end with a supported subtitle extension. let is_sub = SUB_EXTS .iter() .any(|ext| fname_lower.ends_with(ext)); if !is_sub { continue; } // Extract the stem (without the subtitle extension). let sub_stem = match path.file_stem() { Some(s) => s.to_string_lossy().to_string(), None => continue, }; let sub_stem_lower = sub_stem.to_lowercase(); // Priority 0: exact stem match (case-insensitive). if sub_stem_lower == video_stem_lower { let priority = 0u8; if best.as_ref().map_or(true, |(bp, _)| priority < *bp) { best = Some((priority, path.clone())); } continue; } // Check for language suffix. if let Some((base, lang)) = strip_lang_suffix(&sub_stem) { let base_lower = base.to_lowercase(); let base_norm = normalize_stem(&base); let is_english = ENGLISH_LANGS.contains(&lang.as_str()); if is_english { // Priority 2: English suffix, exact base. if base_lower == video_stem_lower { let priority = 2u8; if best.as_ref().map_or(true, |(bp, _)| priority < *bp) { best = Some((priority, path.clone())); } continue; } // Priority 3: English suffix, normalized base. if base_norm == video_stem_norm { let priority = 3u8; if best.as_ref().map_or(true, |(bp, _)| priority < *bp) { best = Some((priority, path.clone())); } continue; } } else { // Priority 4: Other language suffix, exact base. if base_lower == video_stem_lower { let priority = 4u8; if best.as_ref().map_or(true, |(bp, _)| priority < *bp) { best = Some((priority, path.clone())); } continue; } // Priority 5: Other language suffix, normalized base. if base_norm == video_stem_norm { let priority = 5u8; if best.as_ref().map_or(true, |(bp, _)| priority < *bp) { best = Some((priority, path.clone())); } continue; } } } // Priority 1: Normalized match (no language suffix). let sub_stem_norm = normalize_stem(&sub_stem); if sub_stem_norm == video_stem_norm { let priority = 1u8; if best.as_ref().map_or(true, |(bp, _)| priority < *bp) { best = Some((priority, path.clone())); } } // Priority 5 fallback: normalized match for subtitle files whose // language suffix was not recognised above (handled by the // strip_lang_suffix branch already for known languages). } best.map(|(_, p)| p) } // --------------------------------------------------------------------------- // 3. store_subtitle_for_fid // --------------------------------------------------------------------------- /// Sanitize a filename component: replace non-alphanumeric chars (except /// `._-`) with `_`, then truncate to 60 characters. fn sanitize_name(name: &str) -> String { let sanitized = SANITIZE_RE.replace_all(name, "_"); let s = sanitized.as_ref(); if s.len() > 60 { s[..60].to_string() } else { s.to_string() } } /// Store a subtitle file for a given fid. Converts SRT->VTT if needed. /// /// The output file is written as `{fid}_{sanitized_name}.vtt` inside /// `subs_dir`. Returns `SubtitleStored` with the relative path (from the /// parent of `subs_dir`) and a display label. /// /// Returns `None` if the source file extension is not supported or reading /// the source fails. pub fn store_subtitle_for_fid( fid: &str, src_path: &Path, subs_dir: &Path, ) -> Option { let ext_lower = src_path .extension() .map(|e| format!(".{}", e.to_string_lossy().to_lowercase()))?; if !SUB_EXTS.contains(&ext_lower.as_str()) { return None; } let src_filename = src_path .file_name() .map(|n| n.to_string_lossy().to_string()) .unwrap_or_default(); let src_stem = src_path .file_stem() .map(|s| s.to_string_lossy().to_string()) .unwrap_or_else(|| "subtitle".to_string()); let sanitized = sanitize_name(&src_stem); let out_name = format!("{}_{}.vtt", fid, sanitized); // Ensure subs_dir exists. let _ = fs::create_dir_all(subs_dir); let out_path = subs_dir.join(&out_name); let content = fs::read_to_string(src_path).ok()?; let vtt_content = if ext_lower == ".srt" { srt_to_vtt(&content) } else { // Already VTT - use as-is. content }; fs::write(&out_path, vtt_content.as_bytes()).ok()?; // Build relative path: "subtitles/{out_name}". let subs_dir_name = subs_dir .file_name() .map(|n| n.to_string_lossy().to_string()) .unwrap_or_else(|| "subtitles".to_string()); let vtt_rel = format!("{}/{}", subs_dir_name, out_name); Some(SubtitleStored { vtt: vtt_rel, label: src_filename, }) } // --------------------------------------------------------------------------- // 4. extract_embedded_subtitle // --------------------------------------------------------------------------- /// Extract an embedded subtitle track from a video using ffmpeg. /// /// Runs: `ffmpeg -y -i {video_path} -map 0:{track_index} -c:s webvtt {output_path}` /// /// The output file is `{fid}_embedded_{track_index}.vtt` inside `subs_dir`. /// On Windows, the process is created with `CREATE_NO_WINDOW`. /// /// Returns `SubtitleStored` on success, or an error message string. pub fn extract_embedded_subtitle( video_path: &Path, track_index: u32, ffmpeg_path: &Path, subs_dir: &Path, fid: &str, ) -> Result { let _ = fs::create_dir_all(subs_dir); let out_name = format!("{}_embedded_{}.vtt", fid, track_index); let out_path = subs_dir.join(&out_name); let mut cmd = Command::new(ffmpeg_path); cmd.args([ "-y", "-i", &video_path.to_string_lossy(), "-map", &format!("0:{}", track_index), "-c:s", "webvtt", &out_path.to_string_lossy(), ]); #[cfg(target_os = "windows")] { cmd.creation_flags(CREATE_NO_WINDOW); } let output = cmd .output() .map_err(|e| format!("Failed to run ffmpeg: {}", e))?; if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr); return Err(format!( "ffmpeg exited with status {}: {}", output.status, stderr )); } if !out_path.exists() { return Err("ffmpeg did not produce an output file".to_string()); } let subs_dir_name = subs_dir .file_name() .map(|n| n.to_string_lossy().to_string()) .unwrap_or_else(|| "subtitles".to_string()); let vtt_rel = format!("{}/{}", subs_dir_name, out_name); Ok(SubtitleStored { vtt: vtt_rel, label: format!("Embedded track {}", track_index), }) } // =========================================================================== // Tests // =========================================================================== #[cfg(test)] mod tests { use super::*; use std::fs; use tempfile::TempDir; // -- srt_to_vtt ---------------------------------------------------------- #[test] fn test_srt_to_vtt_basic() { let srt = "\ 1 00:00:01,000 --> 00:00:04,000 Hello, world! 2 00:00:05,000 --> 00:00:08,000 This is a test. "; let vtt = srt_to_vtt(srt); assert!(vtt.starts_with("WEBVTT\n")); assert!(vtt.contains("00:00:01.000 --> 00:00:04.000")); assert!(vtt.contains("Hello, world!")); assert!(vtt.contains("00:00:05.000 --> 00:00:08.000")); assert!(vtt.contains("This is a test.")); // Timestamp commas must be converted to dots. assert!(!vtt.contains("00:00:01,000")); assert!(!vtt.contains("00:00:04,000")); } #[test] fn test_srt_to_vtt_bom() { let srt = "\u{FEFF}1\n00:00:01,000 --> 00:00:02,000\nHello\n"; let vtt = srt_to_vtt(srt); assert!(vtt.starts_with("WEBVTT")); // BOM must be removed. assert!(!vtt.contains('\u{FEFF}')); assert!(vtt.contains("Hello")); } #[test] fn test_srt_to_vtt_empty() { let vtt = srt_to_vtt(""); assert!(vtt.starts_with("WEBVTT")); // Should be just the header. assert_eq!(vtt.trim(), "WEBVTT"); } #[test] fn test_srt_to_vtt_windows_line_endings() { let srt = "1\r\n00:00:01,000 --> 00:00:02,000\r\nHello\r\n\r\n\ 2\r\n00:00:03,000 --> 00:00:04,000\r\nWorld\r\n"; let vtt = srt_to_vtt(srt); assert!(vtt.starts_with("WEBVTT")); assert!(vtt.contains("00:00:01.000 --> 00:00:02.000")); assert!(vtt.contains("Hello")); assert!(vtt.contains("00:00:03.000 --> 00:00:04.000")); assert!(vtt.contains("World")); } #[test] fn test_srt_to_vtt_no_cue_indices() { // Some SRT files omit cue numbers entirely. let srt = "\ 00:00:01,500 --> 00:00:03,500 First line 00:00:04,000 --> 00:00:06,000 Second line "; let vtt = srt_to_vtt(srt); assert!(vtt.starts_with("WEBVTT")); assert!(vtt.contains("00:00:01.500 --> 00:00:03.500")); assert!(vtt.contains("First line")); assert!(vtt.contains("00:00:04.000 --> 00:00:06.000")); assert!(vtt.contains("Second line")); } // -- auto_subtitle_sidecar ----------------------------------------------- #[test] fn test_auto_subtitle_sidecar_exact_match() { let dir = TempDir::new().unwrap(); let video = dir.path().join("lecture.mp4"); let sub = dir.path().join("lecture.srt"); fs::write(&video, b"video").unwrap(); fs::write(&sub, b"1\n00:00:00,000 --> 00:00:01,000\nhi\n").unwrap(); let result = auto_subtitle_sidecar(&video); assert!(result.is_some()); assert_eq!(result.unwrap(), sub); } #[test] fn test_auto_subtitle_sidecar_english_suffix() { let dir = TempDir::new().unwrap(); let video = dir.path().join("lecture.mp4"); let sub = dir.path().join("lecture.en.srt"); fs::write(&video, b"video").unwrap(); fs::write(&sub, b"sub content").unwrap(); let result = auto_subtitle_sidecar(&video); assert!(result.is_some()); assert_eq!(result.unwrap(), sub); } #[test] fn test_auto_subtitle_sidecar_no_match() { let dir = TempDir::new().unwrap(); let video = dir.path().join("lecture.mp4"); fs::write(&video, b"video").unwrap(); // No subtitle files at all. let result = auto_subtitle_sidecar(&video); assert!(result.is_none()); } #[test] fn test_auto_subtitle_sidecar_priority_order() { let dir = TempDir::new().unwrap(); let video = dir.path().join("lecture.mp4"); fs::write(&video, b"video").unwrap(); // Priority 0: exact stem match. let exact = dir.path().join("lecture.srt"); // Priority 2: English suffix with exact base. let en_suffix = dir.path().join("lecture.en.srt"); // Priority 4: Other language suffix with exact base. let fr_suffix = dir.path().join("lecture.fr.srt"); fs::write(&exact, b"exact").unwrap(); fs::write(&en_suffix, b"english").unwrap(); fs::write(&fr_suffix, b"french").unwrap(); let result = auto_subtitle_sidecar(&video); assert!(result.is_some()); // Should pick priority 0 (exact match) over others. assert_eq!(result.unwrap(), exact); // Remove exact match -> should pick English suffix (priority 2). fs::remove_file(&exact).unwrap(); let result = auto_subtitle_sidecar(&video); assert!(result.is_some()); assert_eq!(result.unwrap(), en_suffix); // Remove English suffix -> should pick French suffix (priority 4). fs::remove_file(&en_suffix).unwrap(); let result = auto_subtitle_sidecar(&video); assert!(result.is_some()); assert_eq!(result.unwrap(), fr_suffix); } // -- store_subtitle_for_fid ---------------------------------------------- #[test] fn test_store_subtitle_srt_converts_to_vtt() { let dir = TempDir::new().unwrap(); let subs_dir = dir.path().join("subtitles"); let src = dir.path().join("my_sub.srt"); let srt_content = "1\n00:00:01,000 --> 00:00:02,000\nHello\n"; fs::write(&src, srt_content).unwrap(); let result = store_subtitle_for_fid("abc123", &src, &subs_dir); assert!(result.is_some()); let stored = result.unwrap(); assert!(stored.vtt.ends_with(".vtt")); assert!(stored.vtt.starts_with("subtitles/")); assert_eq!(stored.label, "my_sub.srt"); // Verify the VTT output file was actually created and converted. let out_path = subs_dir.join(format!("abc123_{}.vtt", "my_sub")); assert!(out_path.exists()); let vtt_content = fs::read_to_string(&out_path).unwrap(); assert!(vtt_content.starts_with("WEBVTT")); assert!(vtt_content.contains("00:00:01.000 --> 00:00:02.000")); assert!(vtt_content.contains("Hello")); } #[test] fn test_store_subtitle_vtt_copies() { let dir = TempDir::new().unwrap(); let subs_dir = dir.path().join("subtitles"); let src = dir.path().join("my_sub.vtt"); let vtt_content = "WEBVTT\n\n00:00:01.000 --> 00:00:02.000\nHello\n"; fs::write(&src, vtt_content).unwrap(); let result = store_subtitle_for_fid("def456", &src, &subs_dir); assert!(result.is_some()); let stored = result.unwrap(); assert!(stored.vtt.ends_with(".vtt")); assert_eq!(stored.label, "my_sub.vtt"); // Verify the output file has the same content (not SRT-converted). let out_path = subs_dir.join("def456_my_sub.vtt"); assert!(out_path.exists()); let content = fs::read_to_string(&out_path).unwrap(); assert_eq!(content, vtt_content); } #[test] fn test_store_subtitle_unsupported_ext() { let dir = TempDir::new().unwrap(); let subs_dir = dir.path().join("subtitles"); let src = dir.path().join("notes.txt"); fs::write(&src, "Some notes").unwrap(); let result = store_subtitle_for_fid("xyz789", &src, &subs_dir); assert!(result.is_none()); } }