#!/usr/bin/env ruby
# frozen_string_literal: true

# Keyword-driven text-mode crawler.
#
# Usage: <script> URL
#
# The start page and every page it links to are downloaded, each body is
# saved under saved_files/, and the start page is paged through
# `elinks -dump`. After each page, type keywords: the linked page that
# scores best for them is shown next. An empty line quits.

require 'thread'
require "net/http"
require "base64"
require "uri"
require "openssl"
require "fileutils"

# Minimal fixed-size thread pool.
module Pooled
  # Calls every job on a pool of threads and collects the return values.
  #
  # @param workers [Array<#call>] jobs to run
  # @param max_concurrent [Integer] upper bound on simultaneous threads;
  #   values below 1 are treated as 1 so that work is never silently skipped
  # @return [Array] results, in the same order as +workers+
  # @raise [Exception] whatever a job raised, re-raised while joining
  def self.run(workers, max_concurrent = 3)
    return [] if workers.empty?

    results = Array.new(workers.length)
    next_index = 0
    lock = Mutex.new
    # Never spawn more threads than there are jobs.
    pool_size = max_concurrent.clamp(1, workers.length)

    pool = Array.new(pool_size) do
      Thread.new do
        loop do
          # Claim the next unprocessed index atomically.
          index = lock.synchronize do
            claimed = next_index
            next_index += 1
            claimed
          end
          break if index >= workers.length

          results[index] = workers[index].call
        end
      end
    end

    pool.each(&:join)
    results
  end
end

# A downloaded web page with its outgoing links and word frequencies.
#
# A page whose download failed is marked by +url+ being nil.
class Page
  # Threads used when downloading the pages linked from one page.
  CHILD_FETCH_THREADS = 25

  # User-Agent header sent with every request.
  USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

  # Failures that mark a page as unreachable instead of aborting the crawl.
  FETCH_ERRORS = [
    SocketError, URI::InvalidURIError, Timeout::Error, OpenSSL::SSL::SSLError,
    Errno::ECONNREFUSED, Errno::ECONNRESET, Errno::EHOSTUNREACH,
    Errno::ENETUNREACH, Errno::ETIMEDOUT, EOFError,
    Net::ProtocolError, Net::HTTPBadResponse
  ].freeze

  @@cached = {}
  @@num_created = 0
  # Pages are built from many threads at once (see #children); this guards
  # the two class-level variables above.
  @@state_lock = Mutex.new

  # @return [Hash{String => Page}] successfully loaded pages keyed by full_url
  def self.cached_pages
    @@cached
  end

  attr_reader :url, :links, :keywords, :full_url

  # Downloads +url+, saves the body to disk and indexes links and keywords.
  #
  # @param url [String] absolute URL, or a link relative to +scheme+/+host+
  # @param scheme [String, nil] fallback scheme when +url+ carries none
  # @param host [String, nil] fallback host when +url+ carries none
  # @param load_children [Boolean] also download every linked page right away
  def initialize(url, scheme = nil, host = nil, load_children: false)
    @url = url
    @children = nil
    parsed = URI(@url)
    @scheme = parsed.scheme || scheme
    @host = parsed.host || host
    @full_url = fix_url(@url)

    data = fetch_url(@url)
    unless data
      @url = nil
      return
    end

    # Save the pulled data to a file so that it can be rendered with links.
    @filename = save_snapshot(data)
    @links = extract_links(data)
    @keywords = count_keywords(data)

    children if load_children
    @@state_lock.synchronize { @@cached[@full_url] = self }
  rescue *FETCH_ERRORS
    # If the page is really odd, mark it as failed by nilling its url.
    @url = nil
  end

  # Pages linked from this one, downloaded on first use and memoized.
  # Failed downloads are dropped; pages sharing a url are collapsed
  # (the last one downloaded wins, at the position of the first).
  #
  # @return [Array<Page>] empty when this page itself failed to load
  def children
    return [] unless @links && @url

    @children ||= begin
      loaders = @links.map { |link| -> { Page.new(link, @scheme, @host) } }
      loaded = Pooled.run(loaders, CHILD_FETCH_THREADS).compact.select(&:url)
      loaded.each_with_object({}) { |page, by_url| by_url[page.url] = page }.values
    end
  end

  # Highest occurrence count of each keyword over all children.
  # Keywords are matched case-insensitively, like in #score_page.
  #
  # @param keywords [Array<String>]
  # @return [Array<Integer>] one maximum per keyword, 0 when no child has it
  def max_scores(keywords)
    pages = children
    keywords.map do |word|
      key = word.downcase
      pages.map { |page| page.keywords[key] || 0 }.max || 0
    end
  end

  # Scores +page+ for +keywords+: the sum over keywords of count squared,
  # integer-divided by that keyword's maximum (a maximum of 0 counts as 1).
  #
  # @param page [Page]
  # @param keywords [Array<String>]
  # @param max_scores [Array<Integer>] as returned by #max_scores
  # @return [Integer] 0 for a page that failed to load
  def score_page(page, keywords, max_scores)
    return 0 unless page.url

    keywords.each_with_index.sum do |word, index|
      count = page.keywords[word.downcase]
      next 0 unless count

      peak = max_scores[index]
      peak = 1 if peak.nil? || peak.zero?
      (count * count) / peak
    end
  end

  # @param keywords [Array<String>]
  # @return [Page, nil] best-scoring child (the first one on ties),
  #   nil when there are no children
  def maximum_score_child(keywords)
    maxes = max_scores(keywords)
    children.max_by { |page| score_page(page, keywords, maxes) }
  end

  # @param keywords [Array<String>]
  # @return [Array<Page>] children ordered by ascending score; equal scores
  #   keep their original relative order
  def sorted_children(keywords)
    maxes = max_scores(keywords)
    ranked = children.each_with_index.sort_by do |page, position|
      [score_page(page, keywords, maxes), position]
    end
    ranked.map(&:first)
  end

  # Turns a protocol-relative ("//host/x"), root-relative ("/x") or
  # query-only ("?x") link into an absolute URL. Anything else is
  # returned unchanged.
  #
  # @param url [String]
  # @param scheme [String, nil]
  # @param host [String, nil]
  # @return [String]
  def fix_url(url, scheme = @scheme, host = @host)
    if url.start_with?("//")
      "#{scheme}:#{url}"
    elsif url.start_with?("/", "?")
      "#{scheme}://#{host}#{url}"
    else
      url
    end
  end

  # Pages the saved copy of this page through `elinks -dump`, one screen at
  # a time. Enter shows the next screen; "q" or end of input stops.
  def render
    lines = @filename ? IO.popen(["elinks", "-dump", @filename], &:read).lines : []
    # Leave room for the prompt; never drop below one line per screen.
    height = [`tput lines`.to_i - 3, 1].max

    lines.each_slice(height) do |screen|
      screen.each { |line| puts line.chomp }
      answer = $stdin.gets
      break if answer.nil? || answer.chomp == "q"
    end
    puts "OK."
  end

  private

  # Writes +data+ to a uniquely numbered file under saved_files/.
  # The number is claimed under the class lock so that pages built
  # concurrently never share a file.
  #
  # @return [String] path of the written file
  def save_snapshot(data)
    id = @@state_lock.synchronize do
      claimed = @@num_created
      @@num_created += 1
      claimed
    end
    path = "saved_files/" + Base64.urlsafe_encode64("file-#{id}")
    File.binwrite(path, data)
    path
  end

  # All href targets except empty ones, in-page anchors and mailto: links.
  def extract_links(data)
    data.scan(/href=["'](.*?)["']/).flatten.reject do |link|
      link.empty? || link.start_with?("#", "mailto:")
    end
  end

  # Frequency of every downcased alphanumeric word in +data+.
  def count_keywords(data)
    data.scan(/[[:blank:]]*([[:alnum:]]+) ?/).each_with_object({}) do |(word), counts|
      key = word.downcase
      counts[key] = (counts[key] || 0) + 1
    end
  end

  # GETs +url+, following up to +limit+ redirects.
  #
  # @param url [String] absolute or scheme/host-relative URL
  # @param limit [Integer] remaining requests allowed
  # @param scheme [String, nil] scheme used to resolve a relative +url+
  # @param host [String, nil] host used to resolve a relative +url+
  # @return [String, nil] response body; nil when the redirect budget is
  #   used up, the URL is not an http(s) URL with a host, or a redirect
  #   carries no Location header
  def fetch_url(url, limit = 10, scheme = @scheme, host = @host)
    return nil if limit <= 0

    uri = URI(fix_url(url, scheme, host))
    # Only http/https URLs with a host can be requested.
    return nil unless uri.is_a?(URI::HTTP) && !uri.host.to_s.empty?

    response = Net::HTTP.start(uri.host, uri.port,
                               use_ssl: uri.scheme == "https",
                               read_timeout: 10,
                               open_timeout: 5) do |http|
      # request_uri keeps the query string and maps an empty path to "/".
      http.request_get(uri.request_uri, "User-Agent" => USER_AGENT)
    end

    # Match on the response object: response.code is a String and would
    # never match a response class.
    case response
    when Net::HTTPRedirection
      location = response["location"]
      location && fetch_url(location, limit - 1, uri.scheme, uri.host)
    else
      response.body
    end
  end
end

# Interactive loop: render a page, read keywords, move to the best child.
# Saved pages are left in saved_files/ afterwards (`rm -r saved_files`
# cleans them up).
#
# @param start_url [String] absolute URL of the first page
def browse(start_url)
  FileUtils.mkdir_p("saved_files")
  page = Page.new(start_url, load_children: true)
  abort "Failed to load #{start_url}" unless page.url

  loop do
    page.render
    line = $stdin.gets
    keywords = line ? line.chomp.split : []
    break if keywords.empty?

    best = page.maximum_score_child(keywords)
    unless best
      puts "No linked pages to move to"
      break
    end

    puts "Rendered -> #{page.fix_url(best.url)}"
    maxes = page.max_scores(keywords)
    page.sorted_children(keywords).last(6).reverse_each do |child|
      puts "#{child.full_url} #{page.score_page(child, keywords, maxes)}"
    end

    # Move to the top page.
    next_page = Page.cached_pages[best.full_url]
    unless next_page
      puts "Navigating to uncached page, something went very wrong"
      exit
    end
    page = next_page
    page.children
  end
end

if __FILE__ == $PROGRAM_NAME
  if ARGV.empty?
    warn "usage: #{File.basename($PROGRAM_NAME)} URL"
  else
    browse(ARGV[0])
  end
end