// typogenie/src/utils/contentDetector.ts

/** The content categories that detection can report. */
export type ContentType = 'html' | 'markdown' | 'text';

/** Outcome of a content-type detection pass. */
export interface DetectionResult {
  /** The detected content category. */
  type: ContentType;
  /** User-facing message; within this file it is populated only when the content looks binary. */
  error?: string;
  /** Human-readable name of the detected binary format (e.g. 'PDF document'); set alongside `error`. */
  detectedFormat?: string;
}

/**
 * Leading "magic" strings of common binary formats, paired with a
 * human-readable name used in user-facing messages.
 *
 * NOTE(review): the high-byte signatures ('\x89PNG', '\xFF\xD8') only match
 * when the content was decoded one byte per character (e.g. latin1/binary).
 * If callers decode as UTF-8 those bytes presumably become U+FFFD and such
 * files fall back to the generic 'binary file' label — confirm against caller.
 */
const BINARY_SIGNATURES: ReadonlyArray<readonly [string, string]> = [
  ['%PDF', 'PDF document'],
  ['PK', 'Word document or ZIP archive'],
  ['\x89PNG', 'PNG image'],
  ['\xFF\xD8', 'JPEG image'],
  ['GIF8', 'GIF image'],
  ['RIFF', 'media file'],
  ['Rar!', 'RAR archive'],
];

/**
 * Decides whether `content` looks like binary data and, if so, names it.
 *
 * Content is treated as binary when it contains a NUL character anywhere, or
 * when more than 10% of its first 512 characters are control characters other
 * than tab, line feed and carriage return.
 *
 * Signatures are consulted only after the content has been judged binary, so
 * ordinary text that merely begins with e.g. "PK" or "RIFF" is not flagged.
 *
 * @param content - File content decoded as a string.
 * @returns The name of the matching known format, `'binary file'` when the
 *   content is binary but unrecognised, or `null` when it looks like text.
 */
function detectBinaryFormat(content: string): string | null {
  // Nothing to inspect; also avoids a 0/0 ratio below.
  if (content.length === 0) return null;

  let looksBinary = content.includes('\0');

  if (!looksBinary) {
    const sampleLength = Math.min(content.length, 512);
    let nonPrintable = 0;
    for (let i = 0; i < sampleLength; i++) {
      const code = content.charCodeAt(i);
      if (code < 32 && code !== 9 && code !== 10 && code !== 13) {
        nonPrintable++;
      }
    }
    looksBinary = nonPrintable / sampleLength > 0.1;
  }

  if (!looksBinary) return null;

  // Name the format on both detection paths, not just the NUL-byte one.
  const known = BINARY_SIGNATURES.find(([signature]) => content.startsWith(signature));
  return known ? known[1] : 'binary file';
}

/**
 * Removes every triple-backtick fenced section (fences included) from
 * `content`. An opening fence without a matching closing fence is left as is.
 */
function stripCodeBlocks(content: string): string {
  const outsideFences = content.split(/```[\s\S]*?```/);
  return outsideFences.join('');
}

/** Opening tags of block-level / structural HTML elements (case-insensitive). */
const STRUCTURAL_TAG_RE = /<(div|p|table|tr|td|th|thead|tbody|tfoot|ul|ol|li|h[1-6]|section|article|header|footer|nav|main|aside|form|blockquote|pre|dl|dt|dd|figure|figcaption|hr)\b[^>]*>/gi;

/** Returns how many structural opening tags occur in `content`. */
function countStructuralTags(content: string): number {
  const hits = content.match(STRUCTURAL_TAG_RE);
  return hits === null ? 0 : hits.length;
}

/** Opening tags of inline HTML elements (case-insensitive). */
const INLINE_TAG_RE = /<(span|b|i|u|strong|em|a|img|br|code|sub|sup|small|mark|del|ins|s|abbr)\b[^>]*>/gi;

/** Returns how many inline opening tags occur in `content`. */
function countInlineTags(content: string): number {
  const hits = content.match(INLINE_TAG_RE);
  return hits === null ? 0 : hits.length;
}

/**
 * Produces a weighted score of Markdown constructs found in `content`.
 *
 * Every line (trimmed) is tested against the line-level rules; a line can
 * satisfy several rules and collects the weight of each. Inline constructs
 * (bold, links, images) are counted only within the first 5000 characters.
 *
 * @param content - Text to score.
 * @returns The accumulated score; 0 when no Markdown syntax was seen.
 */
function countMarkdownSyntax(content: string): number {
  // [pattern tested against a trimmed line, points awarded on a match]
  const lineRules: ReadonlyArray<readonly [RegExp, number]> = [
    [/^#{1,6}\s/, 3], // ATX heading
    [/^[-*+]\s/, 2], // bullet list item
    [/^\d+\.\s/, 2], // numbered list item
    [/^>\s/, 2], // block quote
    [/^(---|\*\*\*|___)$/, 2], // horizontal rule
    [/^```/, 3], // code fence
  ];

  // [global pattern, points awarded per occurrence]
  const inlineRules: ReadonlyArray<readonly [RegExp, number]> = [
    [/\*\*[^*]+\*\*/g, 1], // bold
    [/\[([^\]]+)\]\(([^)]+)\)/g, 2], // link
    [/!\[([^\]]*)\]\(([^)]+)\)/g, 2], // image
  ];

  let total = 0;

  for (const rawLine of content.split('\n')) {
    const candidate = rawLine.trim();
    for (const [pattern, weight] of lineRules) {
      if (pattern.test(candidate)) {
        total += weight;
      }
    }
  }

  const head = content.substring(0, 5000);
  for (const [pattern, weight] of inlineRules) {
    const occurrences = head.match(pattern)?.length ?? 0;
    total += occurrences * weight;
  }

  return total;
}

/**
 * Classifies `content` as HTML, Markdown or plain text.
 *
 * Decision order:
 *  1. Empty / whitespace-only content is plain text.
 *  2. Binary-looking content yields `type: 'text'` plus `error` and
 *     `detectedFormat`.
 *  3. Content starting with `<!doctype` or `<html` is HTML.
 *  4. Otherwise HTML tag counts (fenced code blocks excluded) are weighed
 *     against a Markdown syntax score.
 *  5. If the content gives no decision, the file extension breaks the tie.
 *
 * @param content - File content decoded as a string.
 * @param extension - File extension; matched case-insensitively, with or
 *   without a leading dot (e.g. `'md'`, `'.MD'`).
 * @returns The detection result; `error` is set only for binary content.
 */
export function detectContentType(content: string, extension: string): DetectionResult {
  if (!content || !content.trim()) {
    return { type: 'text' };
  }

  const binaryFormat = detectBinaryFormat(content);
  if (binaryFormat) {
    return {
      type: 'text',
      error: `This appears to be a ${binaryFormat}. TypoGenie accepts Markdown, HTML, and plain text files.`,
      detectedFormat: binaryFormat,
    };
  }

  // Full HTML document detection. Only a short prefix is inspected, so
  // lowercase just that prefix rather than the whole content.
  const head = content.trimStart().slice(0, 16).toLowerCase();
  if (head.startsWith('<!doctype') || head.startsWith('<html')) {
    return { type: 'html' };
  }

  const mdScore = countMarkdownSyntax(content);

  // Count HTML tags on content with code blocks stripped to avoid false positives
  const stripped = stripCodeBlocks(content);
  const structural = countStructuralTags(stripped);
  const inline = countInlineTags(stripped);

  // Both signals strong - likely markdown with HTML examples
  if (structural >= 3 && mdScore >= 5) {
    return { type: 'markdown' };
  }

  // Strong HTML signal
  if (structural >= 3) {
    return { type: 'html' };
  }

  // Moderate HTML: few structural tags but heavy inline tags (Blogger/Google Docs style)
  if (structural >= 1 && inline >= 10) {
    return { type: 'html' };
  }

  // Strong markdown signal
  if (mdScore >= 3) {
    return { type: 'markdown' };
  }

  // Weak HTML with no markdown at all
  if (structural >= 1 && mdScore === 0) {
    return { type: 'html' };
  }

  // Extension as tiebreaker. Normalise so that 'MD', '.md' or ' Html ' behave
  // like 'md' / 'html'; a missing extension from untyped callers is treated
  // as "no hint" rather than throwing.
  const normalizedExtension = (extension ?? '').trim().toLowerCase().replace(/^\./, '');
  if (normalizedExtension === 'html' || normalizedExtension === 'htm') {
    return { type: 'html' };
  }
  if (normalizedExtension === 'md' || normalizedExtension === 'markdown') {
    return { type: 'markdown' };
  }

  return { type: 'text' };
}