/** The content kinds the detector can report. */
export type ContentType = 'html' | 'markdown' | 'text';

/** Outcome of {@link detectContentType}. */
export interface DetectionResult {
  /** Detected kind; `'text'` is also used when the input is rejected as binary. */
  type: ContentType;
  /** User-facing message, set only when the input looks like a binary file. */
  error?: string;
  /** Human-readable name of the binary format, set together with `error`. */
  detectedFormat?: string;
}

/** Leading-signature / display-name pairs, checked in order against the start of the content. */
const BINARY_SIGNATURES: ReadonlyArray<readonly [string, string]> = [
  ['%PDF', 'PDF document'],
  ['PK', 'Word document or ZIP archive'],
  ['\x89PNG', 'PNG image'],
  ['\xFF\xD8', 'JPEG image'],
  ['GIF8', 'GIF image'],
  ['RIFF', 'media file'],
  ['Rar!', 'RAR archive'],
];

/** Number of leading characters sampled for the control-character heuristic. */
const BINARY_SAMPLE_LENGTH = 512;

/** Number of leading characters sampled for inline markdown (bold, links, images). */
const MARKDOWN_SAMPLE_LENGTH = 5000;

/**
 * Returns a display name for the binary format `content` appears to be, or
 * `null` when it looks like text.
 *
 * Signatures are only consulted when the content contains a NUL character, so
 * ordinary text that happens to start with e.g. "PK" is not misreported.
 */
function detectBinaryFormat(content: string): string | null {
  if (content.includes('\0')) {
    for (const [sig, name] of BINARY_SIGNATURES) {
      if (content.startsWith(sig)) return name;
    }
    return 'binary file';
  }

  const len = Math.min(content.length, BINARY_SAMPLE_LENGTH);
  if (len === 0) return null;

  let nonPrintable = 0;
  for (let i = 0; i < len; i++) {
    const code = content.charCodeAt(i);
    // Control characters other than tab (9), LF (10) and CR (13).
    if (code < 32 && code !== 9 && code !== 10 && code !== 13) {
      nonPrintable++;
    }
  }
  // More than 10% control characters in the sample is treated as binary.
  return nonPrintable / len > 0.1 ? 'binary file' : null;
}

/** Removes triple-backtick fenced spans (non-greedy, may span lines). */
function stripCodeBlocks(content: string): string {
  return content.replace(/```[\s\S]*?```/g, '');
}

/** Opening block-level HTML tags. Closing tags do not match because the name must follow `<` directly. */
const STRUCTURAL_TAG_RE =
  /<(div|p|table|tr|td|th|thead|tbody|tfoot|ul|ol|li|h[1-6]|section|article|header|footer|nav|main|aside|form|blockquote|pre|dl|dt|dd|figure|figcaption|hr)\b[^>]*>/gi;

/** Opening inline-level HTML tags. */
const INLINE_TAG_RE =
  /<(span|b|i|u|strong|em|a|img|br|code|sub|sup|small|mark|del|ins|s|abbr)\b[^>]*>/gi;

/** Counts opening structural (block-level) HTML tags in `content`. */
function countStructuralTags(content: string): number {
  return (content.match(STRUCTURAL_TAG_RE) ?? []).length;
}

/** Counts opening inline HTML tags in `content`. */
function countInlineTags(content: string): number {
  return (content.match(INLINE_TAG_RE) ?? []).length;
}

/**
 * Scores how strongly `content` looks like Markdown.
 *
 * Every line contributes for headings (3), bullet / ordered list items (2),
 * block quotes (2), horizontal rules (2) and code-fence lines (3). The first
 * 5000 characters additionally contribute 1 per bold span and 2 per link and
 * per image.
 */
function countMarkdownSyntax(content: string): number {
  let score = 0;

  for (const line of content.split('\n')) {
    const t = line.trim();
    if (/^#{1,6}\s/.test(t)) score += 3;
    if (/^[-*+]\s/.test(t)) score += 2;
    if (/^\d+\.\s/.test(t)) score += 2;
    if (/^>\s/.test(t)) score += 2;
    if (/^(---|\*\*\*|___)$/.test(t)) score += 2;
    if (/^```/.test(t)) score += 3;
  }

  const sample = content.substring(0, MARKDOWN_SAMPLE_LENGTH);
  score += (sample.match(/\*\*[^*]+\*\*/g) ?? []).length;
  // The parentheses around the URL must be escaped. The previous patterns had
  // `$(` ... `)$` there, which can never match, so links and images scored 0.
  score += (sample.match(/\[([^\]]+)\]\(([^)]+)\)/g) ?? []).length * 2;
  // An image with non-empty alt text is also matched by the link pattern above.
  score += (sample.match(/!\[([^\]]*)\]\(([^)]+)\)/g) ?? []).length * 2;
  return score;
}

/**
 * Classifies `content` as HTML, Markdown or plain text.
 *
 * @param content - Raw file contents, already decoded to a string.
 * @param extension - File extension used only as a tiebreaker; compared
 *   case-insensitively, with or without a leading dot.
 * @returns The detected type. Binary-looking input yields `type: 'text'`
 *   together with `error` and `detectedFormat`.
 */
export function detectContentType(content: string, extension: string): DetectionResult {
  if (!content || !content.trim()) {
    return { type: 'text' };
  }

  const binaryFormat = detectBinaryFormat(content);
  if (binaryFormat) {
    return {
      type: 'text',
      error: `This appears to be a ${binaryFormat}. TypoGenie accepts Markdown, HTML, and plain text files.`,
      detectedFormat: binaryFormat,
    };
  }

  // Full HTML document detection
  // NOTE(review): the source text between `trimmed.startsWith('` and
  // `>= 3 && mdScore >= 5` was lost (it looks like an HTML-tag stripper removed
  // it). The two prefixes checked here, and the way `structural`, `inline` and
  // `mdScore` are computed below, are a reconstruction from the surviving
  // branches and helpers - confirm against version control.
  const trimmed = content.trimStart().toLowerCase();
  if (trimmed.startsWith('<!doctype html') || trimmed.startsWith('<html')) {
    return { type: 'html' };
  }

  // Tags are counted outside fenced code so HTML samples quoted in Markdown do
  // not count as markup; fences themselves still add to the markdown score.
  const withoutCode = stripCodeBlocks(content);
  const structural = countStructuralTags(withoutCode);
  const inline = countInlineTags(withoutCode);
  const mdScore = countMarkdownSyntax(content);

  // Both signals strong: treated as Markdown with embedded HTML
  if (structural >= 3 && mdScore >= 5) {
    return { type: 'markdown' };
  }

  // Strong HTML signal
  if (structural >= 3) {
    return { type: 'html' };
  }

  // Moderate HTML: few structural tags but heavy inline tags (Blogger/Google Docs style)
  if (structural >= 1 && inline >= 10) {
    return { type: 'html' };
  }

  // Strong markdown signal
  if (mdScore >= 3) {
    return { type: 'markdown' };
  }

  // Weak HTML with no markdown at all
  if (structural >= 1 && mdScore === 0) {
    return { type: 'html' };
  }

  // Extension as tiebreaker
  const ext = extension.trim().toLowerCase().replace(/^\./, '');
  if (ext === 'html' || ext === 'htm') {
    return { type: 'html' };
  }
  if (ext === 'md' || ext === 'markdown') {
    return { type: 'markdown' };
  }

  return { type: 'text' };
}