From 6613005ae63bdc31a818d7488c36a130b558a903 Mon Sep 17 00:00:00 2001
From: multiple creatures
Date: Sat, 3 Aug 2019 05:32:49 -0500
Subject: [PATCH] `monsterpit-janitor` is now built in as a sidekiq job (with better code)

---
 app/helpers/blocklist_helper.rb            |  83 ++++++++++
 app/workers/scheduler/janitor_scheduler.rb | 182 +++++++++++++++++++++
 config/sidekiq.yml                         |   3 +
 3 files changed, 268 insertions(+)
 create mode 100644 app/helpers/blocklist_helper.rb
 create mode 100644 app/workers/scheduler/janitor_scheduler.rb

diff --git a/app/helpers/blocklist_helper.rb b/app/helpers/blocklist_helper.rb
new file mode 100644
index 000000000..5730a2ef5
--- /dev/null
+++ b/app/helpers/blocklist_helper.rb
@@ -0,0 +1,83 @@
+# frozen_string_literal: true
+
+# Builds lists of domain-block entries from blocklists published by other
+# instances. Every entry is a Hash with :domain, :severity and :reason keys.
+module BlocklistHelper
+  # Combines every source into one list holding one entry per domain.
+  #
+  # @return [Array<Hash>]
+  def merged_blocklist
+    # ordered by preference
+    # prefer vulpine b/c they have easy-to-parse reason text
+    blocklist = vulpine_club_blocks | dialup_express_blocks | ten_forward_blocks
+    blocklist.uniq { |entry| entry[:domain] }
+  end
+
+  # Extracts domain-like tokens from the last two days of non-reply statuses
+  # by xenon@sleeping.town that match the "new dialup express block" text
+  # query, skipping domains that already have a domain block.
+  #
+  # @return [Array<Hash>] suspend entries; empty when the account is unknown
+  def dialup_express_blocks
+    admin_id = Account.find_remote('xenon', 'sleeping.town')&.id
+    return [] if admin_id.nil?
+
+    domains = ActiveRecord::Base.connection.select_values("SELECT unnest(regexp_matches(text, '\\m[\\w\\-]+\\.[\\w\-]+(?:\\.[\\w\\-]+)*', 'g')) FROM statuses WHERE account_id = #{admin_id.to_i} AND NOT reply AND created_at >= (NOW() - INTERVAL '2 days') AND tsv @@ to_tsquery('new <-> dialup <-> express <2> block') EXCEPT SELECT domain FROM domain_blocks")
+
+    domains.map! do |domain|
+      {domain: domain, severity: :suspend, reason: '(imported from dialup.express)'}
+    end
+  end
+
+  # Same extraction as dialup_express_blocks, but for statuses by
+  # guinan@tenforward.social that match the "ten forward moderation
+  # announcement" text query.
+  #
+  # @return [Array<Hash>] suspend entries; empty when the account is unknown
+  def ten_forward_blocks
+    admin_id = Account.find_remote('guinan', 'tenforward.social')&.id
+    return [] if admin_id.nil?
+
+    # Plain assignment: `domains` does not exist yet, so `+=` would raise.
+    domains = ActiveRecord::Base.connection.select_values("SELECT unnest(regexp_matches(text, '\\m[\\w\\-]+\\.[\\w\-]+(?:\\.[\\w\\-]+)*', 'g')) FROM statuses WHERE account_id = #{admin_id.to_i} AND NOT reply AND created_at >= (NOW() - INTERVAL '2 days') AND tsv @@ to_tsquery('ten <-> forward <-> moderation <-> announcement') EXCEPT SELECT domain FROM domain_blocks")
+
+    domains.map! do |domain|
+      {domain: domain, severity: :suspend, reason: '(imported from ten.forward)'}
+    end
+  end
+
+  # Downloads the vulpine.club blocks.yml and converts each entry. The
+  # severity text is split on "/": the first part becomes :severity and a
+  # "nomedia" part sets :reject_media. Entries without a domain are skipped.
+  #
+  # @return [Array<Hash>] empty when the download fails or is not a list
+  def vulpine_club_blocks
+    url = "https://raw.githubusercontent.com/vulpineclub/vulpineclub.github.io/master/_data/blocks.yml"
+
+    body = Request.new(:get, url).perform do |response|
+      response.code != 200 ? nil : response.body_with_limit(66.kilobytes)
+    end
+
+    return [] unless body.present?
+
+    # NOTE(review): YAML::load on remote data can build arbitrary objects;
+    # consider YAML.safe_load once the feed is confirmed to be plain data.
+    yaml = YAML::load(body)
+    return [] unless yaml.is_a?(Array)
+
+    entries = yaml.map do |entry|
+      domain = entry['domain']
+      next if domain.blank?
+      # to_s keeps an entry without a severity from raising; it becomes 'noop'.
+      severity = entry['severity'].to_s.split('/')
+      reject_media = 'nomedia'.in?(severity)
+      severity = severity[0] || 'noop'
+      reason = "(imported from vulpine.club) #{entry['reason']}#{entry['link'].present? ? " (#{entry['link']})" : ''}".rstrip
+      {domain: domain, severity: severity.to_sym, reject_media: reject_media, reason: reason}
+    end
+
+    # `next` leaves nil placeholders for skipped entries; drop them so callers
+    # can safely read entry[:domain].
+    entries.compact
+  end
+end
diff --git a/app/workers/scheduler/janitor_scheduler.rb b/app/workers/scheduler/janitor_scheduler.rb
new file mode 100644
index 000000000..70d1c4ea0
--- /dev/null
+++ b/app/workers/scheduler/janitor_scheduler.rb
@@ -0,0 +1,182 @@
+# frozen_string_literal: true
+
+# Scheduled housekeeping job. Each run destroys suspended local accounts,
+# suspends abandoned and spammer accounts, silences markov bots and imports
+# domain blocks from BlocklistHelper. It does nothing unless JANITOR_USER
+# holds the id of an existing account.
+class Scheduler::JanitorScheduler
+  include Sidekiq::Worker
+  include BlocklistHelper
+  include BangtagHelper
+
+  # Users with fewer statuses than this are candidates for abandonment.
+  MIN_POSTS = 6
+
+  sidekiq_options unique: :until_executed, retry: 0
+
+  # Sidekiq entry point; takes no arguments.
+  def perform
+    @account = admin_account
+    return if @account.nil?
+
+    # Order matters: excluded_domains reads @exclude_ids.
+    @exclude_ids = excluded_account_ids
+    @exclude_domains = excluded_domains
+    @exclude_markov = excluded_accounts_from_env('MARKOV')
+
+    prune_deleted_accounts!
+    suspend_abandoned_accounts!
+    suspend_spammers!
+    silence_markov!
+    import_blocklists!
+  end
+
+  private
+
+  # Destroys every local account that has a suspended_at timestamp.
+  def prune_deleted_accounts!
+    Account.local.where.not(suspended_at: nil).destroy_all
+  end
+
+  # Suspends each abandoned account through account_policy, a helper that is
+  # defined outside this file (presumably in BangtagHelper).
+  def suspend_abandoned_accounts!
+    reason = "Appears to be abandoned. Freeing up the username for someone else."
+    abandoned_accounts.find_each do |account|
+      account_policy(account.username, nil, :suspend, reason)
+    end
+  end
+
+  # Suspends each account returned by spammer_accounts.
+  def suspend_spammers!
+    reason = 'Appears to be a spammer account.'
+    spammer_accounts.find_each do |spammer|
+      account_policy(spammer.username, spammer.domain, :suspend, reason)
+    end
+  end
+
+  # Silences each account returned by markov_accounts.
+  def silence_markov!
+    reason = 'Appears to be a markov bot.'
+    markov_accounts.find_each do |markov|
+      account_policy(markov.username, markov.domain, :silence, reason)
+    end
+  end
+
+  # Creates a DomainBlock for every merged blocklist entry whose domain is not
+  # excluded, queues DomainBlockWorker for it and logs the action as @account.
+  def import_blocklists!
+    blocks = merged_blocklist.reject { |entry| entry[:domain].in?(@exclude_domains) }
+    blocks.each do |entry|
+      block = DomainBlock.create!(entry)
+      # Sidekiq arguments are serialized to JSON, so pass the id, not the record.
+      DomainBlockWorker.perform_async(block.id)
+      Admin::ActionLog.create(account: @account, action: :create, target: block)
+      user_friendly_action_log(@account, :create, block)
+    end
+  end
+
+  # @return [Account, nil] the account whose id is in JANITOR_USER, if any
+  def admin_account
+    account_id = ENV.fetch('JANITOR_USER', '').to_i
+    return if account_id == 0
+    Account.find_by(id: account_id)
+  end
+
+  # Unsuspended accounts from spammer_account_ids, minus the excluded ids.
+  def spammer_accounts
+    spammer_ids = spammer_account_ids
+    Account.reorder(nil).where(id: spammer_ids, suspended_at: nil)
+      .where.not(id: @exclude_ids)
+  end
+
+  # Unsilenced accounts whose username contains "ebooks" or whose note
+  # contains "markov", minus the JANITOR_EXCLUDE_MARKOV accounts.
+  def markov_accounts
+    Account.reorder(nil).where(silenced_at: nil).where.not(id: @exclude_markov)
+      .where('username LIKE ? OR note ILIKE ?', '%ebooks%', '%markov%')
+  end
+
+  # Accounts whose ids are returned by abandoned_account_ids.
+  def abandoned_accounts
+    Account.reorder(nil).where(id: abandoned_account_ids)
+  end
+
+  # Account ids of users who last signed in more than three months ago.
+  def abandoned_users
+    User.select(:account_id).where('last_sign_in_at < ?', 3.months.ago)
+  end
+
+  # Domains to skip on import: already blocked, used by an excluded account,
+  # or listed in JANITOR_EXCLUDE_DOMAINS.
+  def excluded_domains
+    existing_policy_domains | domains_from_account_ids | excluded_from_env('DOMAINS')
+  end
+
+  # Account ids of abandoned users with fewer than MIN_POSTS statuses.
+  def abandoned_account_ids
+    AccountStat.select(:account_id)
+      .where(account_id: abandoned_users)
+      .where('statuses_count < ?', MIN_POSTS)
+  end
+
+  # Ids that are never treated as spammers: local accounts, accounts followed
+  # by a local account, and the JANITOR_EXCLUDE_USERNAMES accounts.
+  def excluded_account_ids
+    local_account_ids | outgoing_follow_ids | excluded_accounts_from_env('USERNAMES')
+  end
+
+  # Union of the ids found by the status search and the preview card search.
+  def spammer_account_ids
+    post_spammer_ids | card_spammer_ids
+  end
+
+  # Domains that already have a DomainBlock record.
+  def existing_policy_domains
+    DomainBlock.all.pluck(:domain)
+  end
+
+  # Distinct domains of the excluded accounts. Reads @exclude_ids, which
+  # perform assigns before it calls excluded_domains.
+  def domains_from_account_ids
+    Account.reorder(nil).where(id: @exclude_ids).pluck(:domain).uniq
+  end
+
+  # Ids of all local accounts.
+  def local_account_ids
+    Account.local.reorder(nil).pluck(:id)
+  end
+
+  # Ids of every account followed by a local account.
+  def outgoing_follow_ids
+    Account.local.reorder(nil).flat_map { |account| account.following_ids }
+  end
+
+  # Account ids of with_public_visibility statuses matching the spam text query.
+  def post_spammer_ids
+    Status.with_public_visibility
+      .reorder(nil)
+      .where('tsv @@ to_tsquery(?)', 'womenarestupid.site & /blog/:*')
+      .pluck(:account_id)
+  end
+
+  # Account ids of statuses attached to preview cards whose url or title
+  # matches the spam patterns.
+  def card_spammer_ids
+    PreviewCard.where('url LIKE ? OR title ILIKE ?', '%womenarestupid%', '%womenaredumb%')
+      .reorder(nil)
+      .flat_map { |card| card.statuses.pluck(:account_id) }
+  end
+
+  # Ids of accounts whose username appears in JANITOR_EXCLUDE_<suffix>
+  # (whitespace-separated; matched on username only, whatever the domain).
+  def excluded_accounts_from_env(suffix)
+    excluded_usernames = ENV.fetch("JANITOR_EXCLUDE_#{suffix.upcase}", '').split
+    Account.reorder(nil).where(username: excluded_usernames).pluck(:id)
+  end
+
+  # Whitespace-separated values of JANITOR_EXCLUDE_<suffix>.
+  def excluded_from_env(suffix)
+    ENV.fetch("JANITOR_EXCLUDE_#{suffix.upcase}", '').split
+  end
+end
diff --git a/config/sidekiq.yml b/config/sidekiq.yml
index 1ab523efb..4390b5a0e 100644
--- a/config/sidekiq.yml
+++ b/config/sidekiq.yml
@@ -12,6 +12,9 @@
   destructing_statuses_scheduler:
     every: '1m'
     class: Scheduler::DestructingStatusesScheduler
+  janitor_scheduler:
+    every: '1h'
+    class: Scheduler::JanitorScheduler
   media_cleanup_scheduler:
     cron: '<%= Random.rand(0..59) %> <%= Random.rand(3..5) %> * * *'
     class: Scheduler::MediaCleanupScheduler