#!/usr/bin/env ruby
# = comment_spam_stats.rb
#
# Simple stats tool for my weblog comment spam workflow, helps me figure out
# which articles receive how much 'successful' spam.
#
# Uses Spotlight's mdfind command-line tool to search for a specific subset of
# emails, then extracts a string pattern from each mail, and builds a ranking
# of the captured strings.
#
# See http://dekstop.de/weblog/2006/07/spotlight_helps_fight_comment_spam/
#
# By Martin Dittus (martin_at_dekstop_dot_de), 2006-07-07
# -- last change: 2006-07-07

# =========
# = prefs =
# =========

@prefs = {
  # mdfind -onlyin parameter. Adjust to select a specific mail folder.
  :mail_dir => '~/Library/Mail/',
  # mdfind spotlight query. Adjust to filter against your specific criteria.
  :spotlight_query => "kMDItemTitle == '*New Comment*'",
  # string pattern we're extracting from each email. It is compiled as a
  # regular expression; every capture group of the first match is counted.
  :url_pattern => 'http://dekstop.de(/weblog/2.*?/)index.html'
}

# ===========
# = helpers =
# ===========

# Runs `mdfind -onlyin mail_dir query` and returns the reported paths, one
# per output line.
#
# The command is started from an argument vector (no shell), so neither the
# directory nor the query needs shell escaping; a leading `~` is therefore
# expanded here instead of by the shell.
#
# Returns an empty list (after a warning on stderr) when the mdfind
# executable cannot be found.
def find_mail_files(mail_dir, query)
  command = ['mdfind', '-onlyin', File.expand_path(mail_dir), query]
  IO.popen(command, &:read).split("\n")
rescue Errno::ENOENT
  warn 'mdfind not found; is Spotlight available on this machine?'
  []
end

# Reads one mail file and returns its text with invalid byte sequences
# replaced, so that regexp matching cannot fail on badly encoded mails.
# Returns nil (after a warning on stderr) when the file cannot be read.
def read_mail(filename)
  File.read(filename).scrub
rescue SystemCallError => e
  warn "Skipping #{filename}: #{e.message}"
  nil
end

# Returns the capture groups of the first match of +pattern+ (a Regexp) in
# +text+, or an empty list when the pattern does not occur. Capture groups
# that did not participate in the match are dropped.
def extract_captures(text, pattern)
  match = pattern.match(text)
  match ? match.captures.compact : []
end

# Builds a Hash mapping each captured string to the number of mails it was
# captured from.
#
# mail_files:: list of file paths to scan
# url_pattern:: String (or Regexp) with at least one capture group
#
# Unreadable files are skipped. If a block is given, it is called once for
# every file that was read (used for progress output).
def count_urls(mail_files, url_pattern)
  pattern = Regexp.new(url_pattern) # compile once, not once per mail
  counts = Hash.new(0)
  mail_files.each do |filename|
    text = read_mail(filename)
    next unless text
    yield filename if block_given?
    extract_captures(text, pattern).each { |url| counts[url] += 1 }
  end
  counts
end

# Returns [url, count] pairs ordered by descending count; equal counts are
# ordered by url so the ranking is deterministic.
def rank_urls(counts)
  counts.sort_by { |url, count| [-count, url] }
end

# ========
# = main =
# ========

# Searches for the mails, counts the captured strings and prints the ranking.
def main(prefs)
  $stdout.sync = true # show progress immediately, even when piped

  # get list of files
  print "Searching..."
  mail_files = find_mail_files(prefs[:mail_dir], prefs[:spotlight_query])
  puts " found #{mail_files.size} emails."

  # extract URLs
  puts "Parsing"
  urls = count_urls(mail_files, prefs[:url_pattern]) { print '.' }
  puts " done."

  # output
  puts "Ranking:"
  rank_urls(urls).each { |url, count| puts "#{count} times #{url}" }
end

main(@prefs) if __FILE__ == $PROGRAM_NAME