Remove duplicate overview from TMDB detail hero; strip leaked Wikipedia "[edit]" link text and fix HTML entity decoding

This commit is contained in:
2026-04-18 09:15:57 +03:00
parent 48cef31467
commit a7e36478a0
2 changed files with 35 additions and 18 deletions
+35 -12
View File
@@ -116,18 +116,20 @@ export async function wikipediaSection(
*/
function stripWikiHtml(raw: string): string {
if (!raw) return ''
return raw
let text = raw
// 1. Drop entirely-non-prose blocks. Each gets matched as a unit
// so their text content (heading words, edit-link text, etc.)
// disappears with the wrapping tags.
.replace(/<style[\s\S]*?<\/style>/gi, '')
.replace(/<script[\s\S]*?<\/script>/gi, '')
.replace(/<sup[^>]*class="[^"]*reference[^"]*"[^>]*>[\s\S]*?<\/sup>/gi, '')
.replace(/<span[^>]*class="[^"]*mw-editsection[^"]*"[^>]*>[\s\S]*?<\/span>/gi, '')
// Edit links: match both double and single quoted class attrs, and
// also catch the bracket wrappers that sometimes leak through.
.replace(/<span[^>]*class=["'][^"']*mw-editsection[^"']*["'][^>]*>[\s\S]*?<\/span>/gi, '')
.replace(/<h[1-6][^>]*>[\s\S]*?<\/h[1-6]>/gi, '\n\n')
.replace(/<table[\s\S]*?<\/table>/gi, '')
.replace(/<figure[\s\S]*?<\/figure>/gi, '')
.replace(/<div[^>]*class="[^"]*(?:thumb|infobox|navbox|hatnote|reflist|mw-references|gallery)[^"]*"[\s\S]*?<\/div>/gi, '')
.replace(/<div[^>]*class=["'][^"']*(?:thumb|infobox|navbox|hatnote|reflist|mw-references|gallery)[^"']*["'][\s\S]*?<\/div>/gi, '')
// 2. Preserve paragraph + line breaks before flattening tags.
.replace(/<\/p>/gi, '\n\n')
.replace(/<br\s*\/?>/gi, '\n')
@@ -139,15 +141,36 @@ function stripWikiHtml(raw: string): string {
.replace(/\[\s*\d+\s*\]/g, '')
.replace(/\[\s*[a-z]\s*\]/g, '')
.replace(/\[\s*(citation needed|clarification needed|when\?|who\?|why\?)\s*\]/gi, '')
// 5. HTML entities.
.replace(/&nbsp;/g, ' ')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'")
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
// 6. Whitespace cleanup. Collapse runs of spaces, trim stray
// indents around newlines, normalise blank-line gaps to one.
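// 5. Stray edit markers that leaked through (e.g. "[edit]" or a
// bare "Edit" at the start of the text or of any later line).
// NOTE(review): the patterns below have no word boundary, so a
// line that begins with a word like "Editing" would lose "Edit".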
.replace(/^\s*\[?\s*edit\s*\]?\s*/i, '')
.replace(/\n\s*\[?\s*edit\s*\]?\s*/gi, '\n')
// 6. Decode ALL HTML entities (numeric, named, hex) via DOM so we
// don't have to maintain an exhaustive regex list.
if (typeof document !== 'undefined') {
const tmp = document.createElement('textarea')
tmp.innerHTML = text
text = tmp.value
} else {
// Fallback for SSR / test environments: cover the common ones.
text = text
.replace(/&nbsp;/g, ' ')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&apos;/g, "'")
.replace(/&#39;/g, "'")
.replace(/&#039;/g, "'")
.replace(/&#8212;/g, '-')
.replace(/&#8217;/g, "'")
.replace(/&#8230;/g, '...')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
}
// 7. Whitespace cleanup. Collapse runs of spaces, trim stray
// indents around newlines, normalise blank-line gaps to one.
return text
.replace(/[ \t]+/g, ' ')
.replace(/[ \t]*\n[ \t]*/g, '\n')
.replace(/\n{3,}/g, '\n\n')