#!/usr/local/bin/ruby
# 22c3-opml.rb
#
# retrieves the list of blogs reporting on 22C3 from the congress wiki page:
# http://events.ccc.de/congress/2005/wiki/Special:Export/Weblogs
# and creates an OPML file with their feed URLs
#
# execute like this:
#
#   ruby 22c3-blogs-opml.rb > 22c3-blogs.opml
#
#
# NOTES:
# - only takes the first feed URL of each blog (e.g., if RSS and Atom are
#   present, the Atom feed will be ignored.)
# - there are some character conversion issues; might look into this later.
#
# by Martin Dittus (martin@dekstop.de), 2005-12-28
# last change: 2005-12-29

require 'net/http'
require 'rexml/document'
require 'time'

# =========
# = prefs =
# =========

# Location of the MediaWiki XML export of the weblogs page.
WIKI_HOST = 'events.ccc.de'
WIKI_EXPORT_PATH = '/congress/2005/wiki/Special:Export/Weblogs'

# Values written into the OPML <head> and used as title of the outline group.
OPML_PREFS = {
  :title => '22C3 Weblogs',
  :ownerName => 'http://events.ccc.de/congress/2005/'
}.freeze

# ==================
# = load wiki data =
# ==================

# Downloads +path+ from +host+ over plain HTTP (port 80) and returns the
# response body.
#
# Raises an HTTP error (via Net::HTTPResponse#value) when the server does not
# answer with a 2xx status, instead of handing an error page to the XML parser.
def fetch_wiki_export(host, path)
  response = Net::HTTP.start(host, 80) { |http| http.get(path) }
  response.value
  response.body
end

# Returns the wikicode stored in the <text> element of a MediaWiki XML export.
#
# Raises RuntimeError when the export contains no such element.
def extract_wikicode(xmldata)
  export = REXML::Document.new(xmldata)
  text_node = export.elements['mediawiki/page/revision/text']
  raise 'no page text found in the wiki export' unless text_node

  text_node.text.to_s
end

# Parses the wiki table and returns one [blog, feed] pair per usable row.
# Each of +blog+ and +feed+ is a [url, description] pair.
#
# Raises RuntimeError when the wikicode contains no table.
def extract_blogs(wikicode)
  # NOTE(review): `.*` is greedy, so if the page holds several tables this
  # spans from the first `{|` to the last `|}`.
  table = wikicode[/\{\|(.*)\|\}/m, 1]
  raise 'no wiki table found in the page text' unless table

  blogs = []
  # `|-` separates table rows; each_line keeps the separator at the end of
  # every chunk, which does not disturb the link scan below.
  table.each_line('|-') do |row|
    # extract all links and their descriptions ("[url description]");
    # the first link of a table row points to the blog page, the rest are
    # feed urls
    links = row.scan(/\[(http.+?) ([^\]]+)\]/i)

    # only regard rows where there are at minimum 2 links (blog page and feed)
    next if links.size < 2

    blogs << [links[0], links[1]] # blog page, first feed
  end
  blogs
end

# ========
# = opml =
# ========

# Builds the OPML 1.0 document: a <head> filled from +prefs+ and a <body>
# holding a single outline group with one child outline per blog.
#
# The document is assembled through the REXML API (rather than by
# interpolating into an XML string) so that titles and URLs are escaped.
def build_opml(prefs, blogs)
  doc = REXML::Document.new
  doc << REXML::XMLDecl.new('1.0', 'UTF-8')
  opml = doc.add_element('opml', { 'version' => '1.0' })

  timestamp = Time.now.rfc2822
  head = opml.add_element('head')
  head.add_element('title').text = prefs[:title]
  head.add_element('dateCreated').text = timestamp
  head.add_element('dateModified').text = timestamp
  head.add_element('ownerName').text = prefs[:ownerName]

  body = opml.add_element('body')
  group = body.add_element('outline', { 'title' => prefs[:title] })
  blogs.each do |blog, feed|
    group.add_element('outline', {
      'title' => blog[1],
      'htmlUrl' => blog[0],
      'xmlUrl' => feed[0]
    })
  end

  doc
end

# ========
# = main =
# ========

# either read from a local file
#xmldata = File.read('export.xml')
# or from the 22c3 webserver
xmldata = fetch_wiki_export(WIKI_HOST, WIKI_EXPORT_PATH)

blogs = extract_blogs(extract_wikicode(xmldata))
opmldoc = build_opml(OPML_PREFS, blogs)

output = ''.dup # mutable buffer for REXML to append to
opmldoc.write(output, 0) # auto-indent
puts output