added html import, image embedding, and font checks

2026-03-21 15:38:29 +02:00
parent 2a029588aa
commit 7e6b52586d
12 changed files with 492 additions and 42 deletions
--- a/src/utils/contentDetector.ts
+++ b/src/utils/contentDetector.ts
@@ -0,0 +1,136 @@
+export type ContentType = 'html' | 'markdown' | 'text'; // Formats the detector can classify content as.
+
+export interface DetectionResult { // Outcome of one detectContentType call.
+  type: ContentType; // Detected format; 'text' is also what detectContentType reports for empty or binary input.
+  error?: string; // User-facing message; detectContentType sets it only when the input looks binary.
+  detectedFormat?: string; // Name of the recognised binary format (e.g. 'PDF document'); set together with `error`.
+}
+
+/**
+ * Leading character sequences ("magic numbers") of common binary formats,
+ * each paired with the human-readable name shown to the user.
+ *
+ * Typed as readonly so the compiler rejects accidental mutation of this
+ * shared lookup table.
+ *
+ * NOTE(review): the '\x89PNG' and '\xFF\xD8' entries can only match when the
+ * file's bytes were mapped one-to-one onto characters (e.g. latin1 decoding);
+ * confirm how callers read the file before relying on them.
+ */
+const BINARY_SIGNATURES: ReadonlyArray<readonly [string, string]> = [
+  ['%PDF', 'PDF document'],
+  // ZIP container header; .docx files are ZIP archives, hence the combined label.
+  ['PK', 'Word document or ZIP archive'],
+  ['\x89PNG', 'PNG image'],
+  ['\xFF\xD8', 'JPEG image'],
+  ['GIF8', 'GIF image'],
+  // RIFF is a generic container (WAV, AVI, WebP), so the label stays generic.
+  ['RIFF', 'media file'],
+  ['Rar!', 'RAR archive'],
+];
+
+/**
+ * Decides whether a string looks like the contents of a binary file.
+ *
+ * A NUL character anywhere in the content is treated as proof of binary data;
+ * only then is the start of the content compared against BINARY_SIGNATURES to
+ * name the format. Without a NUL, the first 512 characters are sampled and
+ * the content is considered binary when more than 10% of them are control
+ * characters other than tab, line feed, and carriage return.
+ *
+ * @param content - File content decoded as a string.
+ * @returns A human-readable format name, the generic 'binary file', or null
+ *   when the content looks like text.
+ */
+function detectBinaryFormat(content: string): string | null {
+  if (content.indexOf('\0') !== -1) {
+    const known = BINARY_SIGNATURES.find(([magic]) => content.startsWith(magic));
+    return known ? known[1] : 'binary file';
+  }
+
+  const sampleLength = Math.min(content.length, 512);
+  let controlCount = 0;
+  for (let pos = 0; pos < sampleLength; pos += 1) {
+    const unit = content.charCodeAt(pos);
+    const isTextWhitespace = unit === 9 || unit === 10 || unit === 13;
+    if (unit < 32 && !isTextWhitespace) {
+      controlCount += 1;
+    }
+  }
+
+  // For empty content the ratio is NaN, which fails the comparison and yields null.
+  if (controlCount / sampleLength > 0.1) {
+    return 'binary file';
+  }
+  return null;
+}
+
+/**
+ * Removes triple-backtick fenced code blocks, fences included.
+ *
+ * Matching is non-greedy, so each opening fence pairs with the nearest
+ * closing fence; an opening fence with no closing partner is left untouched.
+ *
+ * @param content - Text that may contain fenced code blocks.
+ * @returns The text with every complete fenced block deleted.
+ */
+function stripCodeBlocks(content: string): string {
+  const fencedBlock = /```[\s\S]*?```/g;
+  return content.replace(fencedBlock, '');
+}
+
+// Block-level element names; entries are regex fragments (note 'h[1-6]').
+const STRUCTURAL_TAG_NAMES = [
+  'div', 'p', 'table', 'tr', 'td', 'th', 'thead', 'tbody', 'tfoot',
+  'ul', 'ol', 'li', 'h[1-6]', 'section', 'article', 'header', 'footer',
+  'nav', 'main', 'aside', 'form', 'blockquote', 'pre', 'dl', 'dt', 'dd',
+  'figure', 'figcaption', 'hr',
+];
+// Inline/phrasing element names.
+const INLINE_TAG_NAMES = [
+  'span', 'b', 'i', 'u', 'strong', 'em', 'a', 'img', 'br', 'code',
+  'sub', 'sup', 'small', 'mark', 'del', 'ins', 's', 'abbr',
+];
+
+// Match an opening (or self-closing) tag for any listed element, attributes
+// included, case-insensitively. Closing tags such as </div> do not match.
+const STRUCTURAL_TAG_RE = new RegExp(`<(${STRUCTURAL_TAG_NAMES.join('|')})\\b[^>]*>`, 'gi');
+const INLINE_TAG_RE = new RegExp(`<(${INLINE_TAG_NAMES.join('|')})\\b[^>]*>`, 'gi');
+
+/**
+ * Counts the tags in the text that match STRUCTURAL_TAG_RE.
+ *
+ * @param content - Text to scan.
+ * @returns Number of matches, or 0 when there are none.
+ */
+function countStructuralTags(content: string): number {
+  const hits = content.match(STRUCTURAL_TAG_RE);
+  return hits === null ? 0 : hits.length;
+}
+
+/**
+ * Counts the tags in the text that match INLINE_TAG_RE.
+ *
+ * @param content - Text to scan.
+ * @returns Number of matches, or 0 when there are none.
+ */
+function countInlineTags(content: string): number {
+  const hits = content.match(INLINE_TAG_RE);
+  return hits === null ? 0 : hits.length;
+}
+
+/**
+ * Scores how strongly a text resembles Markdown.
+ *
+ * Every line is trimmed and checked against line-level constructs (headings,
+ * list items, blockquotes, horizontal rules, code fences); a line may satisfy
+ * several rules. Inline constructs (bold, links, images) are counted only in
+ * the first 5000 characters. An image with non-empty alt text also matches
+ * the link pattern, so it contributes to both counts.
+ *
+ * @param content - Text to score.
+ * @returns A non-negative score; higher means more Markdown-like.
+ */
+function countMarkdownSyntax(content: string): number {
+  // [pattern tested against the trimmed line, points awarded on a match]
+  const lineRules: ReadonlyArray<readonly [RegExp, number]> = [
+    [/^#{1,6}\s/, 3], // ATX heading
+    [/^[-*+]\s/, 2], // bullet list item
+    [/^\d+\.\s/, 2], // ordered list item
+    [/^>\s/, 2], // blockquote
+    [/^(---|\*\*\*|___)$/, 2], // horizontal rule
+    [/^```/, 3], // code fence
+  ];
+  // [global pattern, points awarded per occurrence]
+  const inlineRules: ReadonlyArray<readonly [RegExp, number]> = [
+    [/\*\*[^*]+\*\*/g, 1], // bold
+    [/\[([^\]]+)\]\(([^)]+)\)/g, 2], // link
+    [/!\[([^\]]*)\]\(([^)]+)\)/g, 2], // image
+  ];
+
+  let total = 0;
+
+  for (const rawLine of content.split('\n')) {
+    const trimmedLine = rawLine.trim();
+    for (const [pattern, points] of lineRules) {
+      if (pattern.test(trimmedLine)) {
+        total += points;
+      }
+    }
+  }
+
+  const head = content.substring(0, 5000);
+  for (const [pattern, points] of inlineRules) {
+    const occurrences = head.match(pattern);
+    if (occurrences !== null) {
+      total += occurrences.length * points;
+    }
+  }
+
+  return total;
+}
+
+/**
+ * Classifies file content as HTML, Markdown, or plain text.
+ *
+ * Decision order (first match wins):
+ *  1. Empty or whitespace-only content -> 'text'.
+ *  2. Content that looks binary -> 'text' with `error` and `detectedFormat` set.
+ *  3. Content starting with `<!doctype` or `<html` (case-insensitive) -> 'html'.
+ *  4. Heuristic comparison of HTML tag counts (fenced code blocks excluded)
+ *     against a Markdown syntax score.
+ *  5. The file extension as a tiebreaker; otherwise 'text'.
+ *
+ * @param content - File content decoded as a string.
+ * @param extension - File extension; matched case-insensitively, with or
+ *   without a leading dot (e.g. 'md', '.MD', 'HTML').
+ * @returns The detected type, plus an error message when the input is binary.
+ */
+export function detectContentType(content: string, extension: string): DetectionResult {
+  if (!content || !content.trim()) {
+    return { type: 'text' };
+  }
+
+  const binaryFormat = detectBinaryFormat(content);
+  if (binaryFormat) {
+    return {
+      type: 'text',
+      error: `This appears to be a ${binaryFormat}. TypoGenie accepts Markdown, HTML, and plain text files.`,
+      detectedFormat: binaryFormat,
+    };
+  }
+
+  // Full HTML document detection. Only the first nine characters ('<!doctype')
+  // are needed, so avoid lower-casing the whole document.
+  const head = content.trimStart().slice(0, 9).toLowerCase();
+  if (head.startsWith('<!doctype') || head.startsWith('<html')) {
+    return { type: 'html' };
+  }
+
+  const mdScore = countMarkdownSyntax(content);
+
+  // Count HTML tags on content with code blocks stripped to avoid false positives
+  const stripped = stripCodeBlocks(content);
+  const structural = countStructuralTags(stripped);
+  const inline = countInlineTags(stripped);
+
+  // Both signals strong - likely markdown with HTML examples
+  if (structural >= 3 && mdScore >= 5) {
+    return { type: 'markdown' };
+  }
+
+  // Strong HTML signal
+  if (structural >= 3) {
+    return { type: 'html' };
+  }
+
+  // Moderate HTML: few structural tags but heavy inline tags (Blogger/Google Docs style)
+  if (structural >= 1 && inline >= 10) {
+    return { type: 'html' };
+  }
+
+  // Strong markdown signal
+  if (mdScore >= 3) {
+    return { type: 'markdown' };
+  }
+
+  // Weak HTML with no markdown at all
+  if (structural >= 1 && mdScore === 0) {
+    return { type: 'html' };
+  }
+
+  // Extension as tiebreaker. Normalise first so 'HTML', '.md', or ' Md ' are
+  // treated like their canonical lowercase, dot-less forms; a non-string
+  // value from an untyped caller simply matches nothing.
+  const ext =
+    typeof extension === 'string' ? extension.trim().toLowerCase().replace(/^\./, '') : '';
+  if (ext === 'html' || ext === 'htm') {
+    return { type: 'html' };
+  }
+  if (ext === 'md' || ext === 'markdown') {
+    return { type: 'markdown' };
+  }
+
+  return { type: 'text' };
+}