remove duplicate overview from TMDB detail hero, fix stray Wikipedia edit-link text and HTML entity decoding
This commit is contained in:
+35
-12
@@ -116,18 +116,20 @@ export async function wikipediaSection(
|
||||
*/
|
||||
function stripWikiHtml(raw: string): string {
|
||||
if (!raw) return ''
|
||||
return raw
|
||||
let text = raw
|
||||
// 1. Drop entirely-non-prose blocks. Each gets matched as a unit
|
||||
// so their text content (heading words, edit-link text, etc.)
|
||||
// disappears with the wrapping tags.
|
||||
.replace(/<style[\s\S]*?<\/style>/gi, '')
|
||||
.replace(/<script[\s\S]*?<\/script>/gi, '')
|
||||
.replace(/<sup[^>]*class="[^"]*reference[^"]*"[^>]*>[\s\S]*?<\/sup>/gi, '')
|
||||
.replace(/<span[^>]*class="[^"]*mw-editsection[^"]*"[^>]*>[\s\S]*?<\/span>/gi, '')
|
||||
// Edit links: match both double and single quoted class attrs, and
|
||||
// also catch the bracket wrappers that sometimes leak through.
|
||||
.replace(/<span[^>]*class=["'][^"']*mw-editsection[^"']*["'][^>]*>[\s\S]*?<\/span>/gi, '')
|
||||
.replace(/<h[1-6][^>]*>[\s\S]*?<\/h[1-6]>/gi, '\n\n')
|
||||
.replace(/<table[\s\S]*?<\/table>/gi, '')
|
||||
.replace(/<figure[\s\S]*?<\/figure>/gi, '')
|
||||
.replace(/<div[^>]*class="[^"]*(?:thumb|infobox|navbox|hatnote|reflist|mw-references|gallery)[^"]*"[\s\S]*?<\/div>/gi, '')
|
||||
.replace(/<div[^>]*class=["'][^"']*(?:thumb|infobox|navbox|hatnote|reflist|mw-references|gallery)[^"']*["'][\s\S]*?<\/div>/gi, '')
|
||||
// 2. Preserve paragraph + line breaks before flattening tags.
|
||||
.replace(/<\/p>/gi, '\n\n')
|
||||
.replace(/<br\s*\/?>/gi, '\n')
|
||||
@@ -139,15 +141,36 @@ function stripWikiHtml(raw: string): string {
|
||||
.replace(/\[\s*\d+\s*\]/g, '')
|
||||
.replace(/\[\s*[a-z]\s*\]/g, '')
|
||||
.replace(/\[\s*(citation needed|clarification needed|when\?|who\?|why\?)\s*\]/gi, '')
|
||||
// 5. HTML entities.
|
||||
.replace(/ /g, ' ')
|
||||
.replace(/&/g, '&')
|
||||
.replace(/"/g, '"')
|
||||
.replace(/'/g, "'")
|
||||
.replace(/</g, '<')
|
||||
.replace(/>/g, '>')
|
||||
// 6. Whitespace cleanup. Collapse runs of spaces, trim stray
|
||||
// indents around newlines, normalise blank-line gaps to one.
|
||||
// 5. Stray edit markers that leaked through (e.g. standalone
|
||||
// brackets or "Edit" at the start of the text).
|
||||
.replace(/^\s*\[?\s*edit\s*\]?\s*/i, '')
|
||||
.replace(/\n\s*\[?\s*edit\s*\]?\s*/gi, '\n')
|
||||
|
||||
// 6. Decode ALL HTML entities (numeric, named, hex) via DOM so we
|
||||
// don't have to maintain an exhaustive regex list.
|
||||
if (typeof document !== 'undefined') {
|
||||
const tmp = document.createElement('textarea')
|
||||
tmp.innerHTML = text
|
||||
text = tmp.value
|
||||
} else {
|
||||
// Fallback for SSR / test environments: cover the common ones.
|
||||
text = text
|
||||
.replace(/ /g, ' ')
|
||||
.replace(/&/g, '&')
|
||||
.replace(/"/g, '"')
|
||||
.replace(/'/g, "'")
|
||||
.replace(/'/g, "'")
|
||||
.replace(/'/g, "'")
|
||||
.replace(/—/g, '-')
|
||||
.replace(/’/g, "'")
|
||||
.replace(/…/g, '...')
|
||||
.replace(/</g, '<')
|
||||
.replace(/>/g, '>')
|
||||
}
|
||||
|
||||
// 7. Whitespace cleanup. Collapse runs of spaces, trim stray
|
||||
// indents around newlines, normalise blank-line gaps to one.
|
||||
return text
|
||||
.replace(/[ \t]+/g, ' ')
|
||||
.replace(/[ \t]*\n[ \t]*/g, '\n')
|
||||
.replace(/\n{3,}/g, '\n\n')
|
||||
|
||||
Reference in New Issue
Block a user