diff --git a/src/api/wikipedia.ts b/src/api/wikipedia.ts index 664cb69..58e39ef 100644 --- a/src/api/wikipedia.ts +++ b/src/api/wikipedia.ts @@ -116,18 +116,20 @@ export async function wikipediaSection( */ function stripWikiHtml(raw: string): string { if (!raw) return '' - return raw + let text = raw // 1. Drop entirely-non-prose blocks. Each gets matched as a unit // so their text content (heading words, edit-link text, etc.) // disappears with the wrapping tags. .replace(//gi, '') .replace(//gi, '') .replace(/]*class="[^"]*reference[^"]*"[^>]*>[\s\S]*?<\/sup>/gi, '') - .replace(/]*class="[^"]*mw-editsection[^"]*"[^>]*>[\s\S]*?<\/span>/gi, '') + // Edit links: match both double and single quoted class attrs, and + // also catch the bracket wrappers that sometimes leak through. + .replace(/]*class=["'][^"']*mw-editsection[^"']*["'][^>]*>[\s\S]*?<\/span>/gi, '') .replace(/]*>[\s\S]*?<\/h[1-6]>/gi, '\n\n') .replace(//gi, '') .replace(//gi, '') - .replace(/]*class="[^"]*(?:thumb|infobox|navbox|hatnote|reflist|mw-references|gallery)[^"]*"[\s\S]*?<\/div>/gi, '') + .replace(/]*class=["'][^"']*(?:thumb|infobox|navbox|hatnote|reflist|mw-references|gallery)[^"']*["'][\s\S]*?<\/div>/gi, '') // 2. Preserve paragraph + line breaks before flattening tags. .replace(/<\/p>/gi, '\n\n') .replace(//gi, '\n') @@ -139,15 +141,36 @@ function stripWikiHtml(raw: string): string { .replace(/\[\s*\d+\s*\]/g, '') .replace(/\[\s*[a-z]\s*\]/g, '') .replace(/\[\s*(citation needed|clarification needed|when\?|who\?|why\?)\s*\]/gi, '') - // 5. HTML entities. - .replace(/ /g, ' ') - .replace(/&/g, '&') - .replace(/"/g, '"') - .replace(/'/g, "'") - .replace(/</g, '<') - .replace(/>/g, '>') - // 6. Whitespace cleanup. Collapse runs of spaces, trim stray - // indents around newlines, normalise blank-line gaps to one. + // 5. Stray edit markers that leaked through (e.g. standalone + // brackets or "Edit" at the start of the text). + .replace(/^\s*\[?\s*edit\s*\]?\s*/i, '') + .replace(/\n\s*\[?\s*edit\s*\]?\s*/gi, '\n') + + // 6. Decode ALL HTML entities (numeric, named, hex) via DOM so we + // don't have to maintain an exhaustive regex list. + if (typeof document !== 'undefined') { + const tmp = document.createElement('textarea') + tmp.innerHTML = text + text = tmp.value + } else { + // Fallback for SSR / test environments: cover the common ones. + text = text + .replace(/ /g, ' ') + .replace(/&/g, '&') + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/'/g, "'") + .replace(/'/g, "'") + .replace(/—/g, '-') + .replace(/’/g, "'") + .replace(/…/g, '...') + .replace(/</g, '<') + .replace(/>/g, '>') + } + + // 7. Whitespace cleanup. Collapse runs of spaces, trim stray + // indents around newlines, normalise blank-line gaps to one. + return text .replace(/[ \t]+/g, ' ') .replace(/[ \t]*\n[ \t]*/g, '\n') .replace(/\n{3,}/g, '\n\n') diff --git a/src/pages/TmdbDetailPage.tsx b/src/pages/TmdbDetailPage.tsx index 4b3d94f..3042713 100644 --- a/src/pages/TmdbDetailPage.tsx +++ b/src/pages/TmdbDetailPage.tsx @@ -286,12 +286,6 @@ export default function TmdbDetailPage({ tmdbId, kind }: Props) { )} - {overview && ( -

- {overview} -

- )} -
{matchedLocal && (