From dca70079b1f01624b5656b2811e046d114f913e0 Mon Sep 17 00:00:00 2001 From: multiple creatures Date: Tue, 7 May 2019 22:38:43 -0500 Subject: [PATCH] Remove automatic language detection. --- Gemfile | 1 - Gemfile.lock | 9 +- app/helpers/settings_helper.rb | 4 - app/lib/activitypub/activity/create.rb | 2 +- app/lib/language_detector.rb | 99 ------------------ app/services/post_status_service.rb | 2 +- spec/lib/language_detector_spec.rb | 134 ------------------------- 7 files changed, 3 insertions(+), 248 deletions(-) delete mode 100644 app/lib/language_detector.rb delete mode 100644 spec/lib/language_detector_spec.rb diff --git a/Gemfile b/Gemfile index f488116a7..fbf228ff2 100644 --- a/Gemfile +++ b/Gemfile @@ -30,7 +30,6 @@ gem 'browser' gem 'charlock_holmes', '~> 0.7.6' gem 'iso-639' gem 'chewy', '~> 5.0' -gem 'cld3', '~> 3.2.4' gem 'devise', '~> 4.6' gem 'devise-two-factor', '~> 3.0' diff --git a/Gemfile.lock b/Gemfile.lock index 4dacad5e4..ad0f64e3d 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -146,8 +146,6 @@ GEM elasticsearch (>= 2.0.0) elasticsearch-dsl chunky_png (1.3.10) - cld3 (3.2.4) - ffi (>= 1.1.0, < 1.11.0) climate_control (0.2.0) cocaine (0.5.8) climate_control (>= 0.0.3, < 1.0) @@ -382,10 +380,6 @@ GEM omniauth (~> 1.3, >= 1.3.2) ruby-saml (~> 1.7) orm_adapter (0.5.0) - ostatus2 (2.0.3) - addressable (~> 2.5) - http (~> 3.0) - nokogiri (~> 1.8) ox (2.10.0) paperclip (6.0.0) activemodel (>= 4.2.0) @@ -673,6 +667,7 @@ DEPENDENCIES brakeman (~> 4.5) browser bullet (~> 6.0) + bundler (~> 1.17) bundler-audit (~> 0.6) capistrano (~> 3.11) capistrano-rails (~> 1.4) @@ -681,7 +676,6 @@ DEPENDENCIES capybara (~> 3.20) charlock_holmes (~> 0.7.6) chewy (~> 5.0) - cld3 (~> 3.2.4) climate_control (~> 0.2) concurrent-ruby derailed_benchmarks @@ -728,7 +722,6 @@ DEPENDENCIES omniauth (~> 1.9) omniauth-cas (~> 1.1) omniauth-saml (~> 1.10) - ostatus2 (~> 2.0) ox (~> 2.10) paperclip (~> 6.0) paperclip-av-transcoder (~> 0.6) diff --git a/app/helpers/settings_helper.rb b/app/helpers/settings_helper.rb index 92bc222ea..84a13c434 100644 --- a/app/helpers/settings_helper.rb +++ b/app/helpers/settings_helper.rb @@ -67,10 +67,6 @@ module SettingsHelper HUMAN_LOCALES[locale] end - def filterable_languages - LanguageDetector.instance.language_names.select(&HUMAN_LOCALES.method(:key?)) - end - def hash_to_object(hash) HashObject.new(hash) end diff --git a/app/lib/activitypub/activity/create.rb b/app/lib/activitypub/activity/create.rb index df735a7bf..5514d9a6e 100644 --- a/app/lib/activitypub/activity/create.rb +++ b/app/lib/activitypub/activity/create.rb @@ -338,7 +338,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity elsif summary_language_map? @object['summaryMap'].keys.first elsif supported_object_type? - LanguageDetector.instance.detect(text_from_content, @account) + nil end end diff --git a/app/lib/language_detector.rb b/app/lib/language_detector.rb deleted file mode 100644 index 1e90af42d..000000000 --- a/app/lib/language_detector.rb +++ /dev/null @@ -1,99 +0,0 @@ -# frozen_string_literal: true - -class LanguageDetector - include Singleton - - WORDS_THRESHOLD = 4 - RELIABLE_CHARACTERS_RE = /[\p{Hebrew}\p{Arabic}\p{Syriac}\p{Thaana}\p{Nko}\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}]+/m - - def initialize - @identifier = CLD3::NNetLanguageIdentifier.new(1, 2048) - end - - def detect(text, account) - input_text = prepare_text(text) - - return if input_text.blank? - - detect_language_code(input_text) || default_locale(account) - end - - def language_names - @language_names = CLD3::TaskContextParams::LANGUAGE_NAMES.map { |name| iso6391(name.to_s).to_sym }.uniq - end - - private - - def prepare_text(text) - simplify_text(text).strip - end - - def unreliable_input?(text) - !reliable_input?(text) - end - - def reliable_input?(text) - sufficient_text_length?(text) || language_specific_character_set?(text) - end - - def sufficient_text_length?(text) - text.split(/\s+/).size >= WORDS_THRESHOLD - end - - def language_specific_character_set?(text) - words = text.scan(RELIABLE_CHARACTERS_RE) - - if words.present? - words.reduce(0) { |acc, elem| acc + elem.size }.to_f / text.size.to_f > 0.3 - else - false - end - end - - def detect_language_code(text) - return if unreliable_input?(text) - result = @identifier.find_language(text) - iso6391(result.language.to_s).to_sym if result.reliable? - end - - def iso6391(bcp47) - iso639 = bcp47.split('-').first - - # CLD3 returns grandfathered language code for Hebrew - return 'he' if iso639 == 'iw' - - ISO_639.find(iso639).alpha2 - end - - def simplify_text(text) - new_text = remove_html(text) - new_text.gsub!(FetchLinkCardService::URL_PATTERN, '') - new_text.gsub!(Account::MENTION_RE, '') - new_text.gsub!(Tag::HASHTAG_RE, '') - new_text.gsub!(/:#{CustomEmoji::SHORTCODE_RE_FRAGMENT}:/, '') - new_text.gsub!(/\s+/, ' ') - new_text - end - - def new_scrubber - scrubber = Rails::Html::PermitScrubber.new - scrubber.tags = %w(br p) - scrubber - end - - def scrubber - @scrubber ||= new_scrubber - end - - def remove_html(text) - text = Loofah.fragment(text).scrub!(scrubber).to_s - text.gsub!('
', "\n") - text.gsub!('

', "\n\n") - text.gsub!(/(^

|<\/p>$)/, '') - text - end - - def default_locale(account) - account.user_locale&.to_sym || I18n.default_locale if account.local? - end -end diff --git a/app/services/post_status_service.rb b/app/services/post_status_service.rb index 1635fffa7..93cfa5b0a 100644 --- a/app/services/post_status_service.rb +++ b/app/services/post_status_service.rb @@ -176,7 +176,7 @@ class PostStatusService < BaseService spoiler_text: @options[:spoiler_text] || '', visibility: @visibility, sharekey: @sharekey, - language: language_from_option(@options[:language]) || @account.user&.setting_default_language&.presence || LanguageDetector.instance.detect(@text, @account), + language: language_from_option(@options[:language]) || @account.user&.setting_default_language&.presence || nil, application: @options[:application], content_type: @options[:content_type] || @account.user&.setting_default_content_type, }.compact diff --git a/spec/lib/language_detector_spec.rb b/spec/lib/language_detector_spec.rb deleted file mode 100644 index 0cb70605a..000000000 --- a/spec/lib/language_detector_spec.rb +++ /dev/null @@ -1,134 +0,0 @@ -# frozen_string_literal: true - -require 'rails_helper' - -describe LanguageDetector do - describe 'prepare_text' do - it 'returns unmodified string without special cases' do - string = 'just a regular string' - result = described_class.instance.send(:prepare_text, string) - - expect(result).to eq string - end - - it 'collapses spacing in strings' do - string = 'The formatting in this is very odd' - - result = described_class.instance.send(:prepare_text, string) - expect(result).to eq 'The formatting in this is very odd' - end - - it 'strips usernames from strings before detection' do - string = '@username Yeah, very surreal...! also @friend' - - result = described_class.instance.send(:prepare_text, string) - expect(result).to eq 'Yeah, very surreal...! also' - end - - it 'strips URLs from strings before detection' do - string = 'Our website is https://example.com and also http://localhost.dev' - - result = described_class.instance.send(:prepare_text, string) - expect(result).to eq 'Our website is and also' - end - - it 'strips #hashtags from strings before detection' do - string = 'Hey look at all the #animals and #fish' - - result = described_class.instance.send(:prepare_text, string) - expect(result).to eq 'Hey look at all the and' - end - end - - describe 'detect' do - let(:account_without_user_locale) { Fabricate(:user, locale: nil).account } - let(:account_remote) { Fabricate(:account, domain: 'joinmastodon.org') } - - it 'detects english language for basic strings' do - strings = [ - "Hello and welcome to mastodon how are you today?", - "I'd rather not!", - "a lot of people just want to feel righteous all the time and that's all that matters", - ] - strings.each do |string| - result = described_class.instance.detect(string, account_without_user_locale) - - expect(result).to eq(:en), string - end - end - - it 'detects spanish language' do - string = 'Obtener un Hola y bienvenidos a Mastodon. Obtener un Hola y bienvenidos a Mastodon. Obtener un Hola y bienvenidos a Mastodon. Obtener un Hola y bienvenidos a Mastodon' - result = described_class.instance.detect(string, account_without_user_locale) - - expect(result).to eq :es - end - - describe 'when language can\'t be detected' do - it 'uses nil when sent an empty document' do - result = described_class.instance.detect('', account_without_user_locale) - expect(result).to eq nil - end - - describe 'because of a URL' do - it 'uses nil when sent just a URL' do - string = 'http://example.com/media/2kFTgOJLXhQf0g2nKB4' - cld_result = CLD3::NNetLanguageIdentifier.new(0, 2048).find_language(string) - expect(cld_result).not_to eq :en - - result = described_class.instance.detect(string, account_without_user_locale) - - expect(result).to eq nil - end - end - - describe 'with an account' do - it 'uses the account locale when present' do - account = double(user_locale: 'fr') - result = described_class.instance.detect('', account) - - expect(result).to eq nil - end - - it 'uses nil when account is present but has no locale' do - result = described_class.instance.detect('', account_without_user_locale) - - expect(result).to eq nil - end - end - - describe 'with an `en` default locale' do - it 'uses nil for undetectable string' do - result = described_class.instance.detect('', account_without_user_locale) - - expect(result).to eq nil - end - end - - describe 'remote user' do - it 'detects Korean language' do - string = '안녕하세요' - result = described_class.instance.detect(string, account_remote) - - expect(result).to eq :ko - end - end - - describe 'with a non-`en` default locale' do - around(:each) do |example| - before = I18n.default_locale - I18n.default_locale = :ja - example.run - I18n.default_locale = before - end - - it 'uses nil for undetectable string' do - string = '' - result = described_class.instance.detect(string, account_without_user_locale) - - expect(result).to eq nil - end - end - end - end -end