Files
typogenie/src/utils/contentDetector.ts

137 lines
3.9 KiB
TypeScript

/** The content categories the detector can report. */
export type ContentType = 'html' | 'markdown' | 'text';
/**
 * Outcome of content-type detection.
 */
export interface DetectionResult {
/** Detected category; `detectContentType` reports `'text'` when it rejects binary input. */
type: ContentType;
/** User-facing message, set by `detectContentType` when the content looks like a binary file. */
error?: string;
/** Human-readable label of the detected binary format (e.g. `'PDF document'`), set together with `error`. */
detectedFormat?: string;
}
/**
 * Known leading "magic" prefixes paired with the human-readable label that is
 * reported when content starts with that prefix.
 */
const BINARY_SIGNATURES: Array<[string, string]> = [
  ['%PDF', 'PDF document'],
  ['PK', 'Word document or ZIP archive'],
  ['\x89PNG', 'PNG image'],
  ['\xFF\xD8', 'JPEG image'],
  ['GIF8', 'GIF image'],
  ['RIFF', 'media file'],
  ['Rar!', 'RAR archive'],
];

/**
 * Heuristically decides whether `content` is binary data rather than text.
 *
 * @param content - File content as a string.
 * @returns A label for the detected binary format, the generic label
 *   `'binary file'`, or `null` when the content looks like text.
 */
function detectBinaryFormat(content: string): string | null {
  // A NUL character anywhere marks the content as binary; the signature table
  // is only consulted in that case, to pick a more specific label.
  if (content.indexOf('\0') !== -1) {
    const hit = BINARY_SIGNATURES.find(([magic]) => content.startsWith(magic));
    return hit ? hit[1] : 'binary file';
  }

  // Otherwise inspect at most the first 512 characters and measure the share
  // of control characters other than tab (9), line feed (10) and carriage
  // return (13).
  const windowSize = Math.min(content.length, 512);
  let controlChars = 0;
  for (let pos = 0; pos < windowSize; pos += 1) {
    const ch = content.charCodeAt(pos);
    const isAllowedWhitespace = ch === 9 || ch === 10 || ch === 13;
    if (ch < 32 && !isAllowedWhitespace) {
      controlChars += 1;
    }
  }

  // For empty input the ratio is NaN, which compares false, so null is returned.
  if (controlChars / windowSize > 0.1) {
    return 'binary file';
  }
  return null;
}
/**
 * Removes triple-backtick fenced sections from `content`.
 *
 * Each removed span runs from an opening "```" through the nearest following
 * "```" (inclusive). An opening fence with no closing fence is left in place.
 *
 * @param content - Text that may contain fenced code blocks.
 * @returns The text with every closed fenced section deleted.
 */
function stripCodeBlocks(content: string): string {
  const FENCE = '```';
  const kept: string[] = [];
  let cursor = 0;

  for (;;) {
    const open = content.indexOf(FENCE, cursor);
    if (open === -1) {
      break;
    }
    // The closing fence must start after the opening one has fully ended.
    const close = content.indexOf(FENCE, open + FENCE.length);
    if (close === -1) {
      break;
    }
    kept.push(content.slice(cursor, open));
    cursor = close + FENCE.length;
  }

  kept.push(content.slice(cursor));
  return kept.join('');
}
// Opening tags of block-level / structural HTML elements (case-insensitive).
// The "<" must be followed directly by the tag name, so closing tags do not match.
const STRUCTURAL_TAG_RE = /<(div|p|table|tr|td|th|thead|tbody|tfoot|ul|ol|li|h[1-6]|section|article|header|footer|nav|main|aside|form|blockquote|pre|dl|dt|dd|figure|figcaption|hr)\b[^>]*>/gi;

// Opening tags of inline / phrasing HTML elements (case-insensitive).
const INLINE_TAG_RE = /<(span|b|i|u|strong|em|a|img|br|code|sub|sup|small|mark|del|ins|s|abbr)\b[^>]*>/gi;

/**
 * Counts opening structural HTML tags in `content`.
 *
 * @param content - Text to scan.
 * @returns Number of matches of `STRUCTURAL_TAG_RE`; 0 when there are none.
 */
function countStructuralTags(content: string): number {
  const hits = content.match(STRUCTURAL_TAG_RE);
  if (hits === null) {
    return 0;
  }
  return hits.length;
}

/**
 * Counts opening inline HTML tags in `content`.
 *
 * @param content - Text to scan.
 * @returns Number of matches of `INLINE_TAG_RE`; 0 when there are none.
 */
function countInlineTags(content: string): number {
  const hits = content.match(INLINE_TAG_RE);
  if (hits === null) {
    return 0;
  }
  return hits.length;
}
/**
 * Produces a heuristic score of how much Markdown syntax `content` contains.
 *
 * Line-level constructs are scored over every line (after trimming), while
 * inline constructs are counted only within the first 5000 characters.
 *
 * @param content - Text to score.
 * @returns The accumulated score; 0 when no Markdown syntax is recognised.
 */
function countMarkdownSyntax(content: string): number {
  // [pattern tested against the trimmed line, points awarded on a match]
  const lineRules: Array<[RegExp, number]> = [
    [/^#{1,6}\s/, 3], // ATX heading
    [/^[-*+]\s/, 2], // bullet list item
    [/^\d+\.\s/, 2], // numbered list item
    [/^>\s/, 2], // blockquote
    [/^(---|\*\*\*|___)$/, 2], // horizontal rule
    [/^```/, 3], // code fence
  ];

  const lineScore = content.split('\n').reduce((total, rawLine) => {
    const text = rawLine.trim();
    let gained = 0;
    for (const [pattern, weight] of lineRules) {
      if (pattern.test(text)) {
        gained += weight;
      }
    }
    return total + gained;
  }, 0);

  const head = content.substring(0, 5000);
  const occurrences = (pattern: RegExp): number => {
    const found = head.match(pattern);
    return found === null ? 0 : found.length;
  };

  const boldCount = occurrences(/\*\*[^*]+\*\*/g);
  const linkCount = occurrences(/\[([^\]]+)\]\(([^)]+)\)/g);
  const imageCount = occurrences(/!\[([^\]]*)\]\(([^)]+)\)/g);

  // An image with non-empty alt text also matches the link pattern, so it
  // contributes to both linkCount and imageCount.
  return lineScore + boldCount + linkCount * 2 + imageCount * 2;
}
/**
 * Classifies file content as HTML, Markdown, or plain text.
 *
 * Decision order:
 *  1. Empty / whitespace-only content is `'text'`.
 *  2. Content that looks binary is reported as `'text'` with `error` and
 *     `detectedFormat` populated.
 *  3. Content starting with `<!doctype` or `<html` is `'html'`.
 *  4. Otherwise the Markdown score and HTML tag counts (with fenced code
 *     blocks removed before counting tags) are weighed against each other.
 *  5. When the content signals are inconclusive, the file extension decides.
 *
 * @param content - Raw file content.
 * @param extension - File extension, used only as a tiebreaker. It is matched
 *   case-insensitively and a single leading dot and surrounding whitespace
 *   are ignored, so `'md'`, `'.md'` and `'MD'` are equivalent.
 * @returns The detected type, plus error details when the content is binary.
 */
export function detectContentType(content: string, extension: string): DetectionResult {
  if (!content || !content.trim()) {
    return { type: 'text' };
  }

  const binaryFormat = detectBinaryFormat(content);
  if (binaryFormat) {
    return {
      type: 'text',
      error: `This appears to be a ${binaryFormat}. TypoGenie accepts Markdown, HTML, and plain text files.`,
      detectedFormat: binaryFormat,
    };
  }

  // Full HTML document detection
  const trimmed = content.trimStart().toLowerCase();
  if (trimmed.startsWith('<!doctype') || trimmed.startsWith('<html')) {
    return { type: 'html' };
  }

  const mdScore = countMarkdownSyntax(content);

  // Count HTML tags on content with code blocks stripped to avoid false positives
  const stripped = stripCodeBlocks(content);
  const structural = countStructuralTags(stripped);
  const inline = countInlineTags(stripped);

  // Both signals strong - likely markdown with HTML examples
  if (structural >= 3 && mdScore >= 5) {
    return { type: 'markdown' };
  }

  // Strong HTML signal
  if (structural >= 3) {
    return { type: 'html' };
  }

  // Moderate HTML: few structural tags but heavy inline tags (Blogger/Google Docs style)
  if (structural >= 1 && inline >= 10) {
    return { type: 'html' };
  }

  // Strong markdown signal
  if (mdScore >= 3) {
    return { type: 'markdown' };
  }

  // Weak HTML with no markdown at all
  if (structural >= 1 && mdScore === 0) {
    return { type: 'html' };
  }

  // Extension as tiebreaker. Normalize it first so that values such as "MD",
  // ".md" or " HTML " are recognised; a non-string value (possible from
  // untyped callers) is treated as "no extension" rather than throwing.
  const ext =
    typeof extension === 'string'
      ? extension.trim().toLowerCase().replace(/^\./, '')
      : '';
  if (ext === 'html' || ext === 'htm') {
    return { type: 'html' };
  }
  if (ext === 'md' || ext === 'markdown') {
    return { type: 'markdown' };
  }

  return { type: 'text' };
}