# patriot_web_parser.rb
require_relative 'patriot_web_networker'
require 'nokogiri'

class String
  # Checks whether the whole string consists only of alphabetic characters
  # (POSIX [[:alpha:]]). Digits, whitespace, punctuation and the empty string
  # all yield false.
  #
  # The pattern is anchored with \A and \z rather than ^ and $, because the
  # latter match at line boundaries and would accept a multi-line string whose
  # first line happens to be alphabetic.
  #
  # @return [Boolean] true when every character is alphabetic
  def alpha?
    !!match(/\A[[:alpha:]]+\z/)
  end
end

module PatriotWeb
  # Contains methods for parsing data retrieved from Patriot Web
  class Parser
    # Raised when a course title header cannot be split into the expected
    # "name - CRN - SUBJ CODE - section" parts.
    class ParseError < StandardError; end

    # @param networker [Object, nil] object used to fetch pages; it must
    #   respond to +fetch_page_containing_semester_data+, +fetch_subjects+ and
    #   +fetch_courses_in_subject+. When omitted a PatriotWeb::Networker is
    #   created, which is what the parameterless constructor always did.
    def initialize(networker = nil)
      @networker = networker || PatriotWeb::Networker.new
    end

    # Parses all semesters available on Patriot Web
    # @return [Array<String>] the semester option values found on the page
    def parse_semesters
      response = @networker.fetch_page_containing_semester_data
      document = Nokogiri::HTML(response) # parse the document from the HTTP response
      get_semesters_from_option_values(document)
    end

    # Parses subjects belonging to a given semester id
    # @param semester_id [Integer]
    # @return [Array<String>] the subject option values found on the page
    def parse_subjects(semester_id)
      response = @networker.fetch_subjects(semester_id)
      document = Nokogiri::HTML(response)
      get_subject_codes_from_option_values(document)
    end

    # Parses all courses belonging to a given subject
    # @param subject [String]
    # @return [Hash] course details keyed by CRN (see #feed_course_info)
    # @raise [ParseError] when a course title header has an unexpected layout
    def parse_courses_in_subject(subject)
      response = @networker.fetch_courses_in_subject(subject)
      document = Nokogiri::HTML(response)
      feed_course_info(document)
    end

    private

    # Parse the values of all different options on the Patriot Web
    # semester select page
    # @param document [Nokogiri::HTML::Document]
    # @return [Array<String>] option values that start with '20'; options
    #   without a value attribute are skipped
    def get_semesters_from_option_values(document)
      values = document.css('option').map { |opt| opt.attr('value') }
      values.select { |value| value && value.start_with?('20') } # keep semester values only
    end

    # Parse all subject codes from the select element on the Patriot Web
    # subject select page
    # @param document [Nokogiri::HTML::Document]
    # @return [Array<String>] option values under "subj_id" that are purely
    #   alphabetic once stripped; the values themselves are returned unstripped
    def get_subject_codes_from_option_values(document)
      values = document.xpath('//*[@id="subj_id"]/option').map { |opt| opt.attr('value') }
      values.select { |value| value && value.strip.alpha? }
    end

    # Walks the rows of the course listing table and collects one entry per
    # course title header that is encountered.
    # @param searcher [Nokogiri::HTML::Document] parsed course listing page
    # @return [Hash{String => Hash}] course entries keyed by CRN; empty when
    #   no listing table is found. Each entry holds the symbol keys :subj,
    #   :code, :sect, :crn, :name and :fields, plus one string key per field
    #   name listed in :fields.
    # @raise [ParseError] when a course title header has an unexpected layout
    def feed_course_info(searcher)
      # find the table containing the courses
      tables = searcher.css('html body div.pagebodydiv table.datadisplaytable')
      table = tables.css('table.datadisplaytable').first
      data = {}
      return data unless table

      currentobj = nil
      table.children.each do |row| # for each row in the table
        next unless row.name == 'tr' # only table rows are of interest

        row.children.each do |item|
          currentobj = sort_item(item, currentobj, data)
        end
      end
      data
    end

    # Processes one child node of a table row.
    #
    # A 'th' node whose markup contains a '-' is treated as a course title and
    # starts a new course entry. Any other element node is treated as detail
    # for the current course. Everything else is ignored.
    # @param item [Nokogiri::XML::Node] child node of a table row
    # @param currentobj [Hash, nil] the course entry currently being filled
    # @param data [Hash] accumulator of all course entries, keyed by CRN
    # @return [Hash, nil] the course entry subsequent items belong to
    # @raise [ParseError] when a course title has an unexpected layout
    def sort_item(item, currentobj, data)
      if item.name == 'th'
        return currentobj unless item.to_html.include?('-')

        start_course(item, data)
      elsif item.is_a?(Nokogiri::XML::Element)
        fill_fields(item, currentobj)
      else
        currentobj
      end
    end

    # Creates the course entry described by a title header and registers it.
    # @param item [Nokogiri::XML::Node] the title 'th' node
    # @param data [Hash] accumulator of all course entries, keyed by CRN
    # @return [Hash] the new course entry, with an empty :fields list
    # @raise [ParseError] when the title does not yield four ' - ' separated
    #   parts with a two-word third part
    def start_course(item, data)
      titledetails = split_title(item.text)
      titledata = titledetails[2].to_s.split(' ')
      if titledetails.size < 4 || titledata.size < 2
        raise ParseError, "unrecognised course title: #{item.text.inspect}"
      end

      _, course = get_details(data, titledetails, titledata)
      course[:fields] = []
      course
    end

    # Splits a course title on ' - ' into at most four parts.
    #
    # ' - Honors' is folded into the preceding part as ' (Honors)' before
    # splitting. When more than four parts remain, the first two are joined
    # with a space and the following three are kept.
    # @param text [String] text of the title header
    # @return [Array<String>]
    def split_title(text)
      parts = text.gsub(' - Honors', ' (Honors)').split(' - ')
      return parts unless parts.count > 4

      ["#{parts[0]} #{parts[1]}", parts[2], parts[3], parts[4]]
    end

    # Records the field names and cell values found inside a detail element.
    #
    # Every 'th' below the item adds a field name (lower-cased, spaces turned
    # into underscores) to the entry's :fields list. The 'td' cells below the
    # item are then assigned to those names by position. Nothing happens when
    # no course entry has been started yet.
    # @param item [Nokogiri::XML::Element]
    # @param currentobj [Hash, nil] the course entry currently being filled
    # @return [Hash, nil] currentobj
    def fill_fields(item, currentobj)
      return currentobj unless currentobj && currentobj[:fields]

      fields = currentobj[:fields]
      item.css('th').each do |field|
        fields.push(field.text.downcase.tr(' ', '_'))
      end

      cells = item.css('td')
      fields.each_with_index do |field, index|
        cell = cells[index]
        break unless cell # fewer cells than field names: leave the rest unset

        currentobj[field] = cell.text
      end
      currentobj
    end

    # Builds the entry for one course and stores it in data under its CRN,
    # replacing any entry already stored under that CRN.
    # @param data [Hash] accumulator of all course entries, keyed by CRN
    # @param titledetails [Array<String>] name, CRN, "SUBJ CODE" and section
    # @param titledata [Array<String>] subject and course code, i.e. the
    #   whitespace-split third element of titledetails
    # @return [Array(Hash, Hash)] data and the entry that was stored
    def get_details(data, titledetails, titledata)
      crn = titledetails[1].strip
      data[crn] = {
        subj: titledata[0].strip,
        code: titledata[1].strip,
        sect: titledetails[3].strip,
        crn: crn,
        name: titledetails[0].strip
      }
      [data, data[crn]]
    end
  end
end