From e14d543eddd5ac78f84dbc791ed3768024c33c9c Mon Sep 17 00:00:00 2001 From: multiple creatures Date: Mon, 22 Jul 2019 11:39:27 -0500 Subject: [PATCH] handle more edge cases in anchor tagger --- app/lib/sanitize_config.rb | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/app/lib/sanitize_config.rb b/app/lib/sanitize_config.rb index e7b70f078..bc37cd59a 100644 --- a/app/lib/sanitize_config.rb +++ b/app/lib/sanitize_config.rb @@ -36,18 +36,21 @@ class Sanitize href = node['href'] return if href == node.text.strip - # remove query string from link text - node.inner_html = node.inner_html.sub(/\?\S+=\S+/, '') - - # href matches link text without query string? - text = node.text.strip - return if href == text + # href matches link text with sanitized query string? + text = Sanitize::sanitize_query_string(node.text.strip) + if href == text + node.inner_html = "\u2728 #{node.inner_html}" + return + end # strip ellipse & replace keyword search obscuring text = text.sub(/ *(?:\u2026|\.\.\.)\Z/, '').gsub(/ dot /i, '.').gsub(/[\u200b-\u200d\ufeff\u200e\u200f]/, '') # href now matches text without obscuring? - return if href == text + if href == text + node.inner_html = "\u2728 #{node.inner_html}" + return + end # try to detect filenames href_filename = '/'.in?(href) ? href.rpartition('/')[2] : nil @@ -57,10 +60,10 @@ class Sanitize return end - # many fedi servers obfuscate media filenames - ext = text.rpartition('.')[-1] - if ext.downcase.in?(MEDIA_EXTENSIONS) && ext == href_filename.rpartition('.')[2] - node.inner_html = "\xf0\x9f\x93\x8e #{node.inner_html}" + # possibly linked media? + ext = href_filename.rpartition('.')[2] + if ext.downcase.in?(MEDIA_EXTENSIONS) + node.inner_html = "\xf0\x9f\x96\xbc\xef\xb8\x8f #{node.inner_html}" return end end