Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Khalid Ali
schedules
Commits
89b77445
Commit
89b77445
authored
Jun 16, 2018
by
Zac Wood
Browse files
Refactor of parser + seeds.rb
Parser could use some more docs, but this is fine for now
parent
ec853376
Changes
2
Show whitespace changes
Inline
Side-by-side
schedules_api/db/patriot_web_parser.rb
View file @
89b77445
...
@@ -36,7 +36,7 @@ module PatriotWeb
...
@@ -36,7 +36,7 @@ module PatriotWeb
def
parse_courses_in_subject
(
subject
)
def
parse_courses_in_subject
(
subject
)
response
=
@networker
.
fetch_courses_in_subject
(
subject
)
response
=
@networker
.
fetch_courses_in_subject
(
subject
)
document
=
Nokogiri
::
HTML
(
response
)
document
=
Nokogiri
::
HTML
(
response
)
feed
_course
_info
(
document
)
get
_course
s
(
document
)
end
end
private
private
...
@@ -63,73 +63,57 @@ module PatriotWeb
...
@@ -63,73 +63,57 @@ module PatriotWeb
end
end
end
end
# TODO write docs
# Parse all courses from the subject search page
def
feed_course_info
(
searcher
)
# @param document [Nokogiri::HTML::Document]
# find the table containing the courses
# @return [Array] courses
table
=
searcher
.
css
(
'html body div.pagebodydiv table.datadisplaytable'
)
def
get_courses
(
document
)
table
=
document
.
css
(
'html body div.pagebodydiv table.datadisplaytable'
).
first
rows
=
table
.
children
.
drop
2
# first two elements are junk
# each section is represented by 6 rows in the table
(
0
..
(
rows
.
length
/
6
-
1
)).
map
do
|
i
|
start
=
i
*
6
data
=
{}
data
=
{}
currentobj
=
nil
title
=
rows
[
start
].
text
table
.
css
(
'table.datadisplaytable'
).
first
.
children
.
each
do
|
row
|
# for each row in the table
# the title looks this: Survey of Accounting - 71117 - ACCT 203 - 001
next
unless
row
.
name
==
'tr'
# only search table rows, ignore headers
# so split it by ' - ' and extract
row
.
children
.
each
do
|
item
|
title_elements
=
title
.
split
(
' - '
)
currentobj
=
sort_item
(
item
,
currentobj
,
data
)
data
[
:title
]
=
title_elements
[
0
].
strip
end
data
[
:crn
]
=
title_elements
[
1
]
end
data
end
# TODO break this up and write docs
full_name
=
title_elements
[
2
].
split
(
' '
)
def
sort_item
(
item
,
currentobj
,
data
)
next
unless
full_name
.
length
==
2
if
item
.
name
==
'th'
data
[
:subj
]
=
title_elements
[
2
].
split
(
' '
)[
0
]
if
item
.
to_html
.
include?
'-'
data
[
:course_number
]
=
title_elements
[
2
].
split
(
' '
)[
1
]
titletxt
=
item
.
text
if
item
.
text
.
include?
' - Honors'
data
[
:section
]
=
title_elements
[
3
].
strip
titletxt
=
titletxt
.
gsub
(
' - Honors'
,
' (Honors)'
)
end
# rows 1 to 3 contain info about registration and drop dates.
titledetails
=
titletxt
.
split
(
' - '
)
# for now we're gonna ignore them and skip to row 4, which contains details
if
titledetails
.
count
>
4
detail_rows
=
rows
[
start
+
4
].
css
(
'tr'
)
titledetails
=
[
"
#{
titledetails
[
0
]
}
#{
titledetails
[
1
]
}
"
,
titledetails
[
2
],
titledetails
[
3
],
titledetails
[
4
]]
next
unless
detail_rows
.
length
>
0
# if there are no details, skip this item
end
details
=
detail_rows
.
last
.
text
.
split
(
"
\n
"
).
compact
.
reject
(
&
:empty?
)
# skip empty strings
titledata
=
titledetails
[
2
].
split
(
' '
)
begin
times
=
details
[
1
].
split
(
' - '
)
data
=
get_details
(
data
,
titledetails
,
titledata
)[
0
]
if
(
times
.
length
==
1
)
currentobj
=
get_details
(
data
,
titledetails
,
titledata
)[
1
]
data
[
:start_time
]
=
'TBA'
rescue
StandardError
=>
e
data
[
:end_time
]
=
'TBA'
puts
item
else
puts
e
data
[
:start_time
]
=
times
[
0
]
exit
(
1
)
data
[
:end_time
]
=
times
[
1
]
end
currentobj
[
:fields
]
=
[]
end
elsif
item
.
is_a?
Nokogiri
::
XML
::
Element
item
.
css
(
'th'
).
each
do
|
field
|
currentobj
[
:fields
].
push
(
field
.
text
.
downcase
.
tr
(
' '
,
'_'
))
end
iter
=
0
if
currentobj
if
currentobj
[
:fields
]
upper
=
currentobj
[
:fields
].
count
-
1
while
iter
<=
upper
assign
=
item
.
css
(
'td'
)[
iter
].
text
currentobj
[
currentobj
[
:fields
][
iter
]]
=
assign
iter
+=
1
end
end
end
end
currentobj
end
end
# TODO break this up and write docs
data
[
:days
]
=
details
[
2
].
strip
def
get_details
(
data
,
titledetails
,
titledata
)
data
[
:location
]
=
details
[
3
].
strip
crn
=
titledetails
[
1
].
strip
data
[
crn
]
=
{}
unless
data
[
titledetails
[
1
]]
dates
=
details
[
4
].
split
(
' - '
)
crsinfo
=
{
'name'
:
titledetails
[
0
].
strip
}
data
[
:start_date
]
=
dates
[
0
]
uniquedata
=
{
'sect'
:
titledetails
[
3
].
strip
,
'crn'
:
titledetails
[
1
].
strip
}
data
[
:end_date
]
=
dates
[
1
]
general
=
{
'subj'
:
titledata
[
0
].
strip
,
'code'
:
titledata
[
1
].
strip
}
data
[
crn
]
=
general
.
merge
(
uniquedata
.
merge
(
crsinfo
))
data
[
:type
]
=
details
[
5
]
data
[
crn
][
:code
]
=
titledetails
[
2
].
split
(
' '
)[
1
]
data
[
:instructor
]
=
details
[
6
]
[
data
,
data
[
crn
]]
data
end
end
end
end
end
end
end
schedules_api/db/seeds.rb
View file @
89b77445
...
@@ -8,22 +8,25 @@ require 'nokogiri'
...
@@ -8,22 +8,25 @@ require 'nokogiri'
require
'json'
require
'json'
threads
=
[]
threads
=
[]
total
=
[]
total
=
{}
parser
=
PatriotWeb
::
Parser
.
new
parser
=
PatriotWeb
::
Parser
.
new
# get the first semester only
-- no need to ddos patriot web
# get the first semester only
semester
=
parser
.
parse_semesters
.
first
semester
=
parser
.
parse_semesters
.
first
puts
"DDOSing Patriot Web, buckle up kids
"
# parse all subjects and their courses in the semester
# parse all subjects and their courses in the semester
parser
.
parse_subjects
(
semester
).
each
do
|
subject
|
parser
.
parse_subjects
(
semester
).
each
do
|
subject
|
puts
"Getting courses for
#{
subject
}
"
threads
<<
Thread
.
new
{
threads
<<
Thread
.
new
{
total
<<
parser
.
parse_courses_in_subject
(
subject
)
total
[
subject
]
=
parser
.
parse_courses_in_subject
(
subject
)
}
}
end
end
# For testing, only get first subject
# For testing, only get first subject
# subject = parser.parse_subjects(semester).first
# subject = parser.parse_subjects(semester).first
# total
<<
parser.parse_courses_in_subject(subject)
# total
[subject] =
parser.parse_courses_in_subject(subject)
# wait for all the threads to finish
# wait for all the threads to finish
ThreadsWait
.
all_waits
(
*
threads
)
ThreadsWait
.
all_waits
(
*
threads
)
...
@@ -38,46 +41,37 @@ Semester.delete_all
...
@@ -38,46 +41,37 @@ Semester.delete_all
semester
=
Semester
.
create!
season:
'Fall'
,
year:
2018
semester
=
Semester
.
create!
season:
'Fall'
,
year:
2018
semester
.
save!
semester
.
save!
total
.
each
do
|
subject
|
# for each course
total
.
each
do
|
subject
,
sections
|
subject
.
each_value
do
|
section
|
# for each value in the
subject
hash
puts
"Adding courses for
#{
subject
}
..."
# ensure all necessary fields are present
sections
.
each
do
|
section
|
next
unless
(
section
.
key?
"date_range"
)
&&
(
section
.
key?
"instructors"
)
&&
(
section
.
key?
"days"
)
next
if
section
.
nil?
||
!
section
.
key?
(
:subj
)
||
!
section
.
key?
(
:course_number
)
# create a course and set its semester
# Find or create a course and set its semester
# TODO: this breaks when you try to do more than one semester,
# since just the subject + course_number do not uniquely identify a course
# Check the semester as well
course
=
Course
.
find_or_create_by
(
subject:
section
[
:subj
],
course
=
Course
.
find_or_create_by
(
subject:
section
[
:subj
],
course_number:
section
[
:code
])
course_number:
section
[
:course_number
])
course
.
semester
=
semester
course
.
semester
=
semester
course
.
save!
course
.
save!
section_name
=
"
#{
section
[
:subj
]
}
#{
section
[
:code
]
}
#{
section
[
:sect
]
}
"
section_name
=
"
#{
section
[
:subj
]
}
#{
section
[
:course_number
]
}
#{
section
[
:section
]
}
"
puts
"Adding
#{
section_name
}
..."
# the start and end times are located in the "time" key and look like START_TIME - END_TIME
# so, split them by the dash and add them
start_time
=
if
section
.
key?
"time"
section
[
"time"
].
split
(
' - '
).
first
else
"N/A"
end
end_time
=
if
section
.
key?
"time"
puts
"Adding
#{
section_name
}
..."
section
[
"time"
].
split
(
' - '
).
last
else
"N/A"
end
Section
.
create!
(
name:
section_name
,
Section
.
create!
(
name:
section_name
,
crn:
section
[
:crn
],
crn:
section
[
:crn
],
title:
section
[
:name
],
section_type:
section
[
:type
],
location:
section
[
"where"
],
title:
section
[
:title
],
days:
section
[
"days"
],
instructor:
section
[
:instructor
],
start_date:
section
[
"date_range"
].
split
(
' - '
).
first
,
start_date:
section
[
:start_date
],
end_date:
section
[
"date_range"
].
split
(
' - '
).
last
,
end_date:
section
[
:end_date
],
start_time:
start_time
,
days:
section
[
:days
],
end_time:
end_time
,
start_time:
section
[
:start_time
],
instructor:
section
[
"instructors"
].
split
(
' '
).
map
{
|
word
|
word
unless
word
.
empty?
}.
join
(
' '
),
end_time:
section
[
:end_time
],
location:
section
[
:location
],
course:
course
)
course:
course
)
end
end
end
end
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment