added html import, image embedding, and font checks
This commit is contained in:
136
src/utils/contentDetector.ts
Normal file
136
src/utils/contentDetector.ts
Normal file
@@ -0,0 +1,136 @@
|
||||
/** Content categories that detection can resolve to. */
export type ContentType = 'html' | 'markdown' | 'text';

/** Outcome of classifying a piece of input content. */
export interface DetectionResult {
  /** Detected category of the content. */
  type: ContentType;
  /** Human-readable message; `detectContentType` sets it when the input looks binary. */
  error?: string;
  /** Name of the binary format that was recognised; set together with `error`. */
  detectedFormat?: string;
}
|
||||
|
||||
/**
 * Leading character sequences of common binary formats, each paired with the
 * human-readable name reported for it.
 *
 * Entries are compared against the start of the content string.
 * NOTE(review): the PNG and JPEG entries contain code units above 0x7F, so they
 * presumably only match when file bytes were decoded one-to-one (e.g. latin1)
 * rather than as UTF-8 — confirm against how callers read files.
 */
const BINARY_SIGNATURES: ReadonlyArray<readonly [string, string]> = [
  ['%PDF', 'PDF document'],
  ['PK', 'Word document or ZIP archive'],
  ['\x89PNG', 'PNG image'],
  ['\xFF\xD8', 'JPEG image'],
  ['GIF8', 'GIF image'],
  ['RIFF', 'media file'],
  ['Rar!', 'RAR archive'],
];
|
||||
|
||||
/**
 * Heuristically decides whether `content` is binary data rather than text.
 *
 * @param content - File content as a string.
 * @returns A human-readable format name when the content looks binary,
 *   otherwise `null`.
 */
function detectBinaryFormat(content: string): string | null {
  // A NUL character anywhere is treated as proof of binary data. The signature
  // table is only consulted to give that verdict a more specific name.
  if (content.includes('\0')) {
    const known = BINARY_SIGNATURES.find(([sig]) => content.startsWith(sig));
    return known ? known[1] : 'binary file';
  }

  // Without a NUL, sample at most the first 512 characters.
  const len = Math.min(content.length, 512);
  // Empty input has nothing to sample; return early instead of dividing by zero.
  if (len === 0) return null;

  let nonPrintable = 0;
  for (let i = 0; i < len; i++) {
    const code = content.charCodeAt(i);
    // Control characters other than tab (9), line feed (10) and carriage return (13).
    if (code < 32 && code !== 9 && code !== 10 && code !== 13) {
      nonPrintable++;
    }
  }

  // More than 10% control characters in the sample counts as binary.
  return nonPrintable / len > 0.1 ? 'binary file' : null;
}
|
||||
|
||||
/**
 * Removes fenced code blocks from `content`.
 *
 * Everything from a triple-backtick fence up to and including the next
 * triple-backtick fence is deleted (non-greedy, across newlines). An opening
 * fence with no closing fence is left untouched.
 *
 * @param content - Text that may contain fenced code blocks.
 * @returns The text with every complete fenced block removed.
 */
function stripCodeBlocks(content: string): string {
  return content.replace(/```[\s\S]*?```/g, '');
}
|
||||
|
||||
/** Matches opening tags of block-level ("structural") HTML elements; global and case-insensitive. */
const STRUCTURAL_TAG_RE = /<(div|p|table|tr|td|th|thead|tbody|tfoot|ul|ol|li|h[1-6]|section|article|header|footer|nav|main|aside|form|blockquote|pre|dl|dt|dd|figure|figcaption|hr)\b[^>]*>/gi;
|
||||
/** Matches opening tags of inline HTML elements (formatting, links, images, line breaks); global and case-insensitive. */
const INLINE_TAG_RE = /<(span|b|i|u|strong|em|a|img|br|code|sub|sup|small|mark|del|ins|s|abbr)\b[^>]*>/gi;
|
||||
|
||||
/**
 * Counts structural HTML opening tags in `content`.
 *
 * @param content - Text to scan.
 * @returns Number of `STRUCTURAL_TAG_RE` matches, or 0 when there are none.
 */
function countStructuralTags(content: string): number {
  // `match` with a global regex yields every match, or null when nothing matches.
  const hits = content.match(STRUCTURAL_TAG_RE);
  return hits === null ? 0 : hits.length;
}
|
||||
|
||||
/**
 * Counts inline HTML opening tags in `content`.
 *
 * @param content - Text to scan.
 * @returns Number of `INLINE_TAG_RE` matches, or 0 when there are none.
 */
function countInlineTags(content: string): number {
  // `match` with a global regex yields every match, or null when nothing matches.
  const hits = content.match(INLINE_TAG_RE);
  return hits === null ? 0 : hits.length;
}
|
||||
|
||||
/**
 * Line-anchored Markdown patterns, each paired with the score one matching
 * (trimmed) line contributes. Kept at module level so the regexes are built once.
 */
const MARKDOWN_LINE_RULES: ReadonlyArray<readonly [RegExp, number]> = [
  [/^#{1,6}\s/, 3], // ATX heading
  [/^[-*+]\s/, 2], // bullet list item
  [/^\d+\.\s/, 2], // ordered list item
  [/^>\s/, 2], // blockquote
  [/^(---|\*\*\*|___)$/, 2], // horizontal rule
  [/^```/, 3], // code fence
];

/**
 * Scores how strongly `content` looks like Markdown.
 *
 * Every line is trimmed and tested against each line-level rule; a line adds
 * the weight of every rule it matches. Inline constructs are then counted in
 * the first 5000 characters only: bold spans add 1 each, links and images add
 * 2 each. An image with non-empty alt text also matches the link pattern, so
 * it adds 4 in total.
 *
 * @param content - Text to score.
 * @returns The accumulated score; 0 means no Markdown syntax was recognised.
 */
function countMarkdownSyntax(content: string): number {
  let score = 0;

  for (const line of content.split('\n')) {
    const t = line.trim();
    for (const [pattern, weight] of MARKDOWN_LINE_RULES) {
      if (pattern.test(t)) score += weight;
    }
  }

  // Inline patterns are counted on a bounded prefix of the content.
  const sample = content.substring(0, 5000);
  const occurrences = (re: RegExp): number => (sample.match(re) || []).length;

  score += occurrences(/\*\*[^*]+\*\*/g);
  score += occurrences(/\[([^\]]+)\]\(([^)]+)\)/g) * 2;
  score += occurrences(/!\[([^\]]*)\]\(([^)]+)\)/g) * 2;

  return score;
}
|
||||
|
||||
/**
 * Classifies `content` as HTML, Markdown or plain text.
 *
 * Checks run in this order and the first one that applies wins:
 * 1. Empty or whitespace-only content is `text`.
 * 2. Content that looks binary is `text` with `error` and `detectedFormat` set.
 * 3. Content starting with `<!doctype` or `<html` is `html`.
 * 4. Structural-tag, inline-tag and Markdown scores are weighed against each
 *    other; HTML tags are counted with fenced code blocks stripped.
 * 5. The file extension breaks remaining ties; otherwise the result is `text`.
 *
 * @param content - Raw file content.
 * @param extension - File extension used as a tiebreaker. Compared
 *   case-insensitively; a leading dot is ignored (`'md'`, `'.md'` and `'MD'`
 *   are equivalent).
 * @returns The detected type, plus error details for binary input.
 */
export function detectContentType(content: string, extension: string): DetectionResult {
  if (!content || !content.trim()) {
    return { type: 'text' };
  }

  const binaryFormat = detectBinaryFormat(content);
  if (binaryFormat) {
    return {
      type: 'text',
      error: `This appears to be a ${binaryFormat}. TypoGenie accepts Markdown, HTML, and plain text files.`,
      detectedFormat: binaryFormat,
    };
  }

  // Full HTML document detection
  const trimmed = content.trimStart().toLowerCase();
  if (trimmed.startsWith('<!doctype') || trimmed.startsWith('<html')) {
    return { type: 'html' };
  }

  const mdScore = countMarkdownSyntax(content);

  // Count HTML tags on content with code blocks stripped to avoid false positives
  const stripped = stripCodeBlocks(content);
  const structural = countStructuralTags(stripped);
  const inline = countInlineTags(stripped);

  // Both signals strong - likely markdown with HTML examples
  if (structural >= 3 && mdScore >= 5) {
    return { type: 'markdown' };
  }

  // Strong HTML signal
  if (structural >= 3) {
    return { type: 'html' };
  }

  // Moderate HTML: few structural tags but heavy inline tags (Blogger/Google Docs style)
  if (structural >= 1 && inline >= 10) {
    return { type: 'html' };
  }

  // Strong markdown signal
  if (mdScore >= 3) {
    return { type: 'markdown' };
  }

  // Weak HTML with no markdown at all
  if (structural >= 1 && mdScore === 0) {
    return { type: 'html' };
  }

  // Extension as tiebreaker. Normalise first so '.MD', 'Html' etc. are
  // recognised the same as the bare lowercase form.
  const ext = (extension || '').trim().toLowerCase().replace(/^\./, '');
  if (ext === 'html' || ext === 'htm') {
    return { type: 'html' };
  }
  if (ext === 'md' || ext === 'markdown') {
    return { type: 'markdown' };
  }

  return { type: 'text' };
}
|
||||
Reference in New Issue
Block a user