# patriot_web_parser.rb
require_relative 'patriot_web_networker'
require 'nokogiri'

class String
  # Checks whether the string consists solely of alphabetic characters.
  #
  # The pattern is anchored with \A and \z so the whole string has to match;
  # ^ and $ match per line and would accept a multi-line string as soon as
  # any single line of it is alphabetic.
  #
  # @return [Boolean] true when the string is non-empty and every character
  #   is a letter, false otherwise
  def alpha?
    !match(/\A[[:alpha:]]+\z/).nil?
  end
end

module PatriotWeb
  # Contains methods for parsing data retrieved from Patriot Web
  class Parser
    # Minimum number of cells a details row must contain: the parser reads
    # cells 1 through 6 (times, days, location, dates, type, instructor).
    DETAIL_CELL_COUNT = 7

    # @param networker [Object] object used to fetch pages; it must respond to
    #   fetch_page_containing_semester_data, fetch_subjects and
    #   fetch_courses_in_subject. Defaults to a new PatriotWeb::Networker.
    def initialize(networker = PatriotWeb::Networker.new)
      @networker = networker
    end

    # Parses all semesters available on Patriot Web
    # @return [Array<String>] the semester option values
    def parse_semesters
      response = @networker.fetch_page_containing_semester_data
      document = Nokogiri::HTML(response) # parse the document from the HTTP response
      get_semesters_from_option_values(document)
    end

    # Parses subjects belonging to a given semester id
    # @param semester_id [Integer]
    # @return [Array<String>] the subject option values
    def parse_subjects(semester_id)
      response = @networker.fetch_subjects(semester_id)
      document = Nokogiri::HTML(response)
      get_subject_codes_from_option_values(document)
    end

    # Parses all courses belonging to a given subject
    # @param subject [String]
    # @return [Array<Hash>] one hash per parsed section (see #data_from)
    def parse_courses_in_subject(subject)
      response = @networker.fetch_courses_in_subject(subject)
      document = Nokogiri::HTML(response)
      get_courses(document, subject)
    end

    private

    # Parse the values of all different options on the Patriot Web
    # semester select page
    # @param document [Nokogiri::HTML::Document]
    # @return [Array<String>] option values that start with '20'; options
    #   without a value attribute are ignored
    def get_semesters_from_option_values(document)
      values = document.css('option').map { |opt| opt.attr('value') }
      # a value starting with '20' is taken to be a semester value
      values.select { |value| value && value.start_with?('20') }
    end

    # Parse all subject codes from the select element on the Patriot Web
    # subject select page
    # @param document [Nokogiri::HTML::Document]
    # @return [Array<String>] option values under "subj_id" that are purely
    #   alphabetic once stripped; the values themselves are returned
    #   unstripped, and options without a value attribute are ignored
    def get_subject_codes_from_option_values(document)
      values = document.xpath('//*[@id="subj_id"]/option').map { |opt| opt.attr('value') }
      values.select { |value| value && value.strip.alpha? }
    end

    # Parse all courses from the subject search page
    # @param document [Nokogiri::HTML::Document]
    # @param subject [String] currently unused; kept so callers need not change
    # @return [Array<Hash>] courses
    def get_courses(document, subject)
      table = document.css('html body div.pagebodydiv table.datadisplaytable')
      data_from(table.css('tr'))
    end

    # Walks the table rows and builds one hash per section. A section starts
    # at a title row; its details are read from the row two positions below.
    # @param rows [#length, #[]] table rows; each row responds to text and css
    # @return [Array<Hash>] hashes with the keys :title, :crn, :subj,
    #   :course_number, :section, :start_time, :end_time, :days, :location,
    #   :start_date, :end_date, :type and :instructor
    def data_from(rows)
      result = []
      i = 0

      while i < rows.length
        text = rows[i].text

        unless is_title(text)
          i += 1 # try the next row if this one was not a title
          next
        end

        data = title_data(text)
        details = details_cells(rows[i + 2])

        # Without a complete details row there is nothing to read; report the
        # section and keep scanning from the next row.
        if details.length < DETAIL_CELL_COUNT
          puts "#{data[:subj]} #{data[:course_number]} is fake news"
          i += 1
          next
        end

        result << data.merge(details_data(details))
        i += 5 # skip to what we think is the next title
      end

      result
    end

    # Splits a title row's text into its parts.
    # @param text [String] text for which is_title returned true
    # @return [Hash] :title, :crn, :subj, :course_number and :section
    def title_data(text)
      title, crn, full_name, section = text.split(' - ')
      subj, course_number = full_name.split(' ')

      {
        title: title.strip,
        crn: crn,
        subj: subj,
        course_number: course_number,
        section: section.strip
      }
    end

    # Fetches the detail cells of a row.
    # @param row [#css, nil] the row expected to hold the details table, or
    #   nil when the title was too close to the end of the table
    # @return [#length, #[]] the matching cells, empty when row is nil
    def details_cells(row)
      row ? row.css('td table tr td') : []
    end

    # Reads meeting information out of the detail cells.
    # @param details [#[]] at least DETAIL_CELL_COUNT cells responding to text
    # @return [Hash] :start_time, :end_time, :days, :location, :start_date,
    #   :end_date, :type and :instructor
    def details_data(details)
      times = details[1].text.split(' - ')
      # a single element means the cell held no "start - end" range
      start_time, end_time = times.length == 1 ? %w[TBA TBA] : times
      start_date, end_date = details[4].text.split(' - ')

      {
        start_time: start_time,
        end_time: end_time,
        days: details[2].text.strip,
        location: details[3].text.strip,
        start_date: start_date,
        end_date: end_date,
        type: details[5].text,
        instructor: details[6].text
      }
    end

    # a title looks like this: Survey of Accounting - 71117 - ACCT 203 - 001
    # @param text [String]
    # @return [Boolean] true when text has four ' - ' separated parts and the
    #   third part consists of exactly two words
    def is_title(text)
      elements = text.split(' - ')
      elements.length == 4 && elements[2].split(' ').length == 2
    end
  end
end