# frozen_string_literal: true

# == Synopsis
#   This program parses the DBLP (http://www.informatik.uni-trier.de/~ley/db/)
#   XML records file and extracts all the titles. It then creates a new file,
#   "titles.txt", that contains a list of the 1000 longest titles in the file.
#   (Note: Any existing file with the name "titles.txt" will be overwritten.)
#
# == Usage
#   Simply run the program. The uncompressed dblp.xml file
#   (http://dblp.uni-trier.de/xml/) must be present in the current directory.
#
# == Author
#   Trevor Harmon (trevor@vocaro.com)
#
# == Copyright
#   Copyright (c) 2006 Trevor Harmon
#   Licensed under the GNU General Public License
#   (http://www.gnu.org/copyleft/gpl.html)

require 'rexml/document'
require 'rexml/streamlistener'

# Kept from the original script so REXML constants remain reachable unqualified.
include REXML

# Name of the XML file read from the current directory.
INPUT_FILE = 'dblp.xml'

# Name of the file the longest titles are written to (overwritten if present).
OUTPUT_FILE = 'titles.txt'

# How many of the longest titles are written out.
TITLE_LIMIT = 1000

# A progress dot is printed each time this many titles have been collected.
PROGRESS_INTERVAL = 1000

# Listener class for XML stream processing.
#
# Collects the text content of every <title> element into +titles+, one
# string per element.
class Listener
  include REXML::StreamListener

  # All titles seen so far, in document order.
  attr_accessor :titles

  def initialize
    # Store the titles as one big array
    @titles = []
    # Text of the <title> currently being read; nil while outside a title.
    @buffer = nil
  end

  # Called by the parser for every opening tag. Starts a fresh buffer when a
  # <title> element opens; tags nested inside a title leave the buffer alone.
  def tag_start(name, _attributes)
    @buffer = +'' if name == 'title'
  end

  # Called by the parser for every run of character data. Text is only kept
  # while a title is open, and is appended so that a title interrupted by
  # inline markup (e.g. <i>...</i>) is still stored as a single string.
  def text(text)
    @buffer << text if @buffer
  end

  # Called by the parser for every closing tag. Only </title> finishes the
  # current title; closing tags of nested elements must not end it early.
  def tag_end(name)
    return unless name == 'title' && @buffer

    finished = @buffer
    @buffer = nil
    # A title element without any character data contributes nothing.
    return if finished.empty?

    # Okay, we found a title. Add it to the list
    @titles.push(finished)
    report_progress
  end

  private

  # Display some status information to the user.
  def report_progress
    return unless (@titles.size % PROGRESS_INTERVAL).zero?

    print '.'
    $stdout.flush
  end
end

# Parses the XML file at +input_path+ and returns the array of titles found.
# The file handle is closed when parsing finishes or raises.
def extract_titles(input_path)
  listener = Listener.new
  File.open(input_path) do |file|
    REXML::Parsers::StreamParser.new(file, listener).parse
  end
  listener.titles
end

# Returns at most +limit+ entries of +titles+, longest first.
def longest_titles(titles, limit = TITLE_LIMIT)
  titles.sort_by { |title| -title.length }.first(limit)
end

# Runs the whole pipeline: parse +input_path+, pick the longest titles and
# write them, one per line, to +output_path+.
def write_longest_titles(input_path = INPUT_FILE, output_path = OUTPUT_FILE)
  puts 'Parsing file... (one dot = 1000 entries)'
  titles = extract_titles(input_path)

  puts "\nSorting..."
  longest = longest_titles(titles)

  # Write out the 1000 longest titles
  File.open(output_path, 'w') do |file|
    longest.each { |title| file.puts title }
  end

  puts 'Done.'
end

write_longest_titles if __FILE__ == $PROGRAM_NAME