# == Synopsis
# This program parses the DBLP (http://www.informatik.uni-trier.de/~ley/db/) XML records file and extracts all the titles. It then creates a new file, "titles.txt", that contains a list of the 1000 longest titles in the file. (Note: Any existing file with the name "titles.txt" will be overwritten.)
#
# == Usage
# Simply run the program. The uncompressed dblp.xml file (http://dblp.uni-trier.de/xml/) must be present in the current directory.
#
# == Author
# Trevor Harmon (trevor@vocaro.com)
#
# == Copyright
# Copyright (c) 2006 Trevor Harmon <trevor@vocaro.com>
# Licensed under the GNU General Public License (http://www.gnu.org/copyleft/gpl.html)

require 'rexml/document'
require 'rexml/streamlistener'

include REXML

# Listener class for XML stream processing.
#
# Collects the text content of every <title> element reported by the stream
# parser. The text of a single title can arrive in several chunks (for
# example around nested markup such as <i>...</i>), so the chunks are
# buffered and stored as one string when the closing </title> tag is seen.
class Listener
  include REXML::StreamListener

  # Array of complete title strings, in the order their </title> tags were seen.
  attr_accessor :titles

  def initialize
    # Store the titles as one big array
    @titles = []
    # Buffer for the title currently being read; nil while outside a <title>
    @current_title = nil
  end

  # Parser callback for an opening tag. Starts a fresh buffer on <title>;
  # any other tag (including markup nested inside a title) is ignored.
  def tag_start(name, attributes)
    @current_title = String.new if name == "title"
  end

  # Parser callback for character data. Appends the chunk to the current
  # title buffer; text outside a <title> element is discarded.
  def text(text)
    @current_title << text if @current_title
  end

  # Parser callback for a closing tag. Only </title> finishes a title, so
  # closing tags of nested elements do not cut the title short.
  def tag_end(name)
    return unless name == "title" && @current_title

    finished = @current_title
    @current_title = nil
    # A title element without any text contributes no entry
    return if finished.empty?

    # Okay, we found a title. Add it to the list
    @titles.push(finished)

    # Display some status information to the user
    if @titles.size % 1000 == 0
      print "."
      STDOUT.flush
    end
  end

end

puts "Parsing file... (one dot = 1000 entries)"

# Set up the XML stream processor and begin parsing. The block form of
# File.open closes the input file when parsing finishes or raises.
listener = Listener.new
File.open("dblp.xml") do |xml|
  REXML::Parsers::StreamParser.new(xml, listener).parse
end

puts "\nSorting..."

# Sort the list of titles by length, longest titles first
listener.titles.sort_by! { |title| -title.length }

# Write out the 1000 longest titles. Array#first(n) yields at most n entries,
# whereas an inclusive range 0..n would select n + 1 of them.
title_limit = 1000
File.open("titles.txt", "w") do |file|
  listener.titles.first(title_limit).each { |title| file.puts title }
end

puts "Done."