// outlay/outlay-core/src/ocr.rs

use std::io::Write;
use std::path::PathBuf;
use std::process::Command;

/// Extract all monetary amounts from a receipt image using tesseract OCR.
///
/// The image bytes are written to a uniquely named temporary file, tesseract
/// is run on that file with its output directed to stdout, and the recognised
/// text is handed to `parse_all_amounts`. The temporary file is removed again
/// on every path once it has been created.
///
/// Returns each amount paired with the line of text it was found on (trimmed).
/// Results are sorted: lines containing "total" first, then by amount descending.
///
/// Returns `None` if tesseract is unavailable, the temporary file cannot be
/// created or written, tesseract cannot be started or exits unsuccessfully,
/// or no amounts are found.
pub fn extract_amounts_from_image(image_bytes: &[u8]) -> Option<Vec<(f64, String)>> {
    // Per-process counter so that concurrent calls never share a temp file.
    static NEXT_TMP_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);

    let tesseract = find_tesseract()?;

    // Create the temp file exclusively (`create_new`): an already existing
    // file or symlink with the same name is never reused or followed. A few
    // candidates are tried in case a stale file from an earlier run is in the way.
    let tmp_dir = std::env::temp_dir();
    let pid = std::process::id();
    let (tmp_path, mut file) = (0..16).find_map(|_| {
        let id = NEXT_TMP_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
        let candidate = tmp_dir.join(format!("outlay_ocr_{}_{}.png", pid, id));
        std::fs::OpenOptions::new()
            .write(true)
            .create_new(true)
            .open(&candidate)
            .ok()
            .map(|f| (candidate, f))
    })?;

    let written = file.write_all(image_bytes);
    // Close the handle before tesseract opens the file.
    drop(file);

    // Failures are collected into an Option instead of returning early so the
    // temp file is always cleaned up below.
    let output = written.ok().and_then(|()| {
        let mut cmd = Command::new(&tesseract);
        cmd.arg(&tmp_path).arg("stdout");

        // If using bundled tesseract, point TESSDATA_PREFIX to the directory
        // that holds the bundled tessdata. A bare "tesseract" (resolved via
        // PATH) has an empty parent and must not trigger this, otherwise a
        // `tessdata` directory in the current working directory would set an
        // empty prefix.
        // NOTE(review): the prefix is the parent of `tessdata`, not `tessdata`
        // itself — confirm this matches the bundled tesseract version.
        if let Some(parent) = tesseract.parent().filter(|p| !p.as_os_str().is_empty()) {
            if parent.join("tessdata").is_dir() {
                cmd.env("TESSDATA_PREFIX", parent);
            }
        }

        cmd.output().ok()
    });

    // Best-effort cleanup; a failure to delete is not worth failing the OCR.
    let _ = std::fs::remove_file(&tmp_path);

    let output = output.filter(|o| o.status.success())?;

    let text = String::from_utf8_lossy(&output.stdout);
    let results = parse_all_amounts(&text);
    if results.is_empty() {
        None
    } else {
        Some(results)
    }
}

/// Reports whether a tesseract executable can be located, either bundled
/// alongside the application or on the system `PATH`.
pub fn is_available() -> bool {
    let located = find_tesseract();
    located.is_some()
}

/// Locate the tesseract executable.
///
/// Lookup order:
/// 1. `tesseract` in the same directory as the running binary (AppImage layout);
/// 2. `../lib/tesseract` relative to that directory (AppImage usr/lib layout),
///    returned in canonicalized form;
/// 3. plain `tesseract`, if running `tesseract --version` succeeds.
///
/// Returns `None` when none of these is usable.
fn find_tesseract() -> Option<PathBuf> {
    let bundled = std::env::current_exe().ok().and_then(|exe| {
        let bin_dir = exe.parent()?;

        let sibling = bin_dir.join("tesseract");
        if sibling.is_file() {
            return Some(sibling);
        }

        bin_dir
            .join("../lib/tesseract")
            .canonicalize()
            .ok()
            .filter(|resolved| resolved.is_file())
    });
    if bundled.is_some() {
        return bundled;
    }

    // Nothing bundled: probe the system PATH by actually invoking the binary.
    match Command::new("tesseract").arg("--version").output() {
        Ok(probe) if probe.status.success() => Some(PathBuf::from("tesseract")),
        _ => None,
    }
}

/// Collect every distinct monetary amount found in OCR text.
///
/// Each non-empty line is trimmed and scanned with `extract_amounts_from_line`.
/// Every amount is returned together with the trimmed line it came from.
///
/// Amounts that differ by less than 0.001 are treated as duplicates and only
/// reported once. When a duplicate shows up on a line containing "total"
/// (case-insensitive) while the stored entry came from a non-"total" line, the
/// stored entry is switched to the "total" line so that the total is still
/// ranked first.
///
/// The result is ordered with "total" lines first, then by amount descending.
/// Note that the check is a plain substring match, so e.g. "Subtotal" counts
/// as a "total" line too.
fn parse_all_amounts(text: &str) -> Vec<(f64, String)> {
    // (amount, source line, source line mentions "total")
    let mut results: Vec<(f64, String, bool)> = Vec::new();

    for trimmed in text.lines().map(str::trim).filter(|l| !l.is_empty()) {
        let is_total = trimmed.to_lowercase().contains("total");

        for amt in extract_amounts_from_line(trimmed) {
            let existing = results
                .iter_mut()
                .find(|(known, _, _)| (*known - amt).abs() < 0.001);

            match existing {
                // Already known: only upgrade a non-total entry to the total line.
                Some(entry) => {
                    if is_total && !entry.2 {
                        entry.1 = trimmed.to_string();
                        entry.2 = true;
                    }
                }
                None => results.push((amt, trimmed.to_string(), is_total)),
            }
        }
    }

    // Sort: "total" lines first, then by amount descending. The sort is
    // stable, so equal keys keep their order of appearance.
    results.sort_by(|a, b| {
        b.2.cmp(&a.2)
            .then_with(|| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal))
    });

    results
        .into_iter()
        .map(|(amt, line, _)| (amt, line))
        .collect()
}

/// Scan one line of text for monetary amounts.
///
/// An amount is a run of ASCII digits, a decimal separator (`.` or `,`) and
/// exactly two further digits, e.g. `5.99` or `12,99`. Amounts written with
/// thousands separators are recognised as well, e.g. `1,250.00`, `1.250,00`
/// or `1,250,500.00` (see `match_grouped_amount`).
///
/// Only strictly positive values are returned, in order of appearance.
///
/// Known limitation: the scan has no notion of dates, so text such as
/// `12.05.2024` still yields `12.05`.
fn extract_amounts_from_line(line: &str) -> Vec<f64> {
    let chars: Vec<char> = line.chars().collect();
    let len = chars.len();
    let mut results = Vec::new();
    let mut i = 0;

    while i < len {
        if !chars[i].is_ascii_digit() {
            i += 1;
            continue;
        }

        // Try the thousands-grouped form first. It only matches when the whole
        // pattern is present; otherwise the plain scan below runs from the
        // same position, unaffected.
        if let Some((val, end)) = match_grouped_amount(&chars, i) {
            if val > 0.0 {
                results.push(val);
            }
            i = end;
            continue;
        }

        let start = i;
        // Consume integer part
        while i < len && chars[i].is_ascii_digit() {
            i += 1;
        }
        // Check for decimal separator followed by exactly 2 digits
        if i < len && (chars[i] == '.' || chars[i] == ',') {
            let sep = i;
            i += 1;
            let decimal_start = i;
            while i < len && chars[i].is_ascii_digit() {
                i += 1;
            }
            if i - decimal_start == 2 {
                let int_part: String = chars[start..sep].iter().collect();
                let dec_part: String = chars[decimal_start..i].iter().collect();
                // Normalise to a '.' separator so `,` decimals parse too.
                if let Ok(val) = format!("{}.{}", int_part, dec_part).parse::<f64>() {
                    if val > 0.0 {
                        results.push(val);
                    }
                }
            }
        }
    }

    results
}

/// Match an amount with thousands separators starting at `chars[start]`.
///
/// Accepted shape: 1-3 digits, one or more groups of "separator + exactly 3
/// digits" (all using the same separator), then the *other* separator and
/// exactly 2 digits. Returns the parsed value and the index just past the
/// match, or `None` if the text at `start` does not have this exact shape.
fn match_grouped_amount(chars: &[char], start: usize) -> Option<(f64, usize)> {
    let is_sep = |c: &char| *c == '.' || *c == ',';
    let digits_end = |from: usize| {
        let mut j = from;
        while j < chars.len() && chars[j].is_ascii_digit() {
            j += 1;
        }
        j
    };

    let mut pos = digits_end(start);
    if !(1..=3).contains(&(pos - start)) {
        return None;
    }
    // The first separator seen is, by definition, the grouping separator.
    let group_sep = chars.get(pos).copied().filter(is_sep)?;
    let mut int_part: String = chars[start..pos].iter().collect();

    loop {
        let sep = chars.get(pos).copied().filter(is_sep)?;
        let run_start = pos + 1;
        let run_end = digits_end(run_start);

        match (sep == group_sep, run_end - run_start) {
            // Another thousands group.
            (true, 3) => {
                int_part.extend(chars[run_start..run_end].iter());
                pos = run_end;
            }
            // Decimal part. Only reachable after at least one group, because
            // the first separator always equals `group_sep`.
            (false, 2) => {
                let dec_part: String = chars[run_start..run_end].iter().collect();
                let val = format!("{}.{}", int_part, dec_part).parse::<f64>().ok()?;
                return Some((val, run_end));
            }
            _ => return None,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the parser and keeps only the numeric amounts, in result order.
    fn amounts_of(text: &str) -> Vec<f64> {
        parse_all_amounts(text)
            .into_iter()
            .map(|(amount, _)| amount)
            .collect()
    }

    #[test]
    fn test_parse_all_returns_sorted() {
        let parsed = parse_all_amounts("Item 1    5.99\nItem 2    3.50\nTotal    9.49\n");
        // The "Total" line must be ranked ahead of the individual items.
        let (amount, line) = &parsed[0];
        assert_eq!(*amount, 9.49);
        assert!(line.contains("Total"));
        assert_eq!(parsed.len(), 3);
    }

    #[test]
    fn test_parse_comma_separator() {
        // A comma is accepted as the decimal separator.
        assert_eq!(amounts_of("Total: 12,99\n"), vec![12.99]);
    }

    #[test]
    fn test_no_total_sorts_by_amount() {
        // Without any "total" line the larger amount comes first.
        let amounts = amounts_of("Coffee    4.50\nSandwich    8.99\n");
        assert_eq!(amounts[..2], [8.99, 4.50]);
    }

    #[test]
    fn test_no_amounts() {
        assert!(parse_all_amounts("Hello world\nNo numbers here\n").is_empty());
    }

    #[test]
    fn test_total_case_insensitive() {
        let parsed = parse_all_amounts("Sub 5.00\nTOTAL 15.00\nChange 5.00\n");
        // The upper-case TOTAL line is still recognised and ranked first.
        let (amount, line) = &parsed[0];
        assert_eq!(*amount, 15.00);
        assert!(line.contains("TOTAL"));
    }

    #[test]
    fn test_deduplicates_amounts() {
        // The same amount on two lines is reported only once.
        assert_eq!(amounts_of("Subtotal    10.00\nTotal    10.00\n"), vec![10.00]);
    }

    #[test]
    fn test_large_amount() {
        assert_eq!(amounts_of("Grand Total    1250.00\n"), vec![1250.00]);
    }
}