Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Khalid Ali
schedules
Commits
89b77445
Commit
89b77445
authored
Jun 16, 2018
by
Zac Wood
Browse files
Refactor of parser + seeds.rb
Parser could use some more docs, but this is fine for now
parent
ec853376
Changes
2
Show whitespace changes
Inline
Side-by-side
schedules_api/db/patriot_web_parser.rb
View file @
89b77445
...
...
@@ -36,7 +36,7 @@ module PatriotWeb
def
parse_courses_in_subject
(
subject
)
response
=
@networker
.
fetch_courses_in_subject
(
subject
)
document
=
Nokogiri
::
HTML
(
response
)
feed
_course
_info
(
document
)
get
_course
s
(
document
)
end
private
...
...
@@ -63,73 +63,57 @@ module PatriotWeb
end
end
# TODO write docs
def
feed_course_info
(
searcher
)
# find the table containing the courses
table
=
searcher
.
css
(
'html body div.pagebodydiv table.datadisplaytable'
)
# Parse all courses from the subject search page
# @param document [Nokogiri::HTML::Document]
# @return [Array] courses
def
get_courses
(
document
)
table
=
document
.
css
(
'html body div.pagebodydiv table.datadisplaytable'
).
first
rows
=
table
.
children
.
drop
2
# first two elements are junk
# each section is represented by 6 rows in the table
(
0
..
(
rows
.
length
/
6
-
1
)).
map
do
|
i
|
start
=
i
*
6
data
=
{}
currentobj
=
nil
table
.
css
(
'table.datadisplaytable'
).
first
.
children
.
each
do
|
row
|
# for each row in the table
next
unless
row
.
name
==
'tr'
# only search table rows, ignore headers
row
.
children
.
each
do
|
item
|
currentobj
=
sort_item
(
item
,
currentobj
,
data
)
end
end
data
end
title
=
rows
[
start
].
text
# the title looks like this: Survey of Accounting - 71117 - ACCT 203 - 001
# so split it by ' - ' and extract
title_elements
=
title
.
split
(
' - '
)
data
[
:title
]
=
title_elements
[
0
].
strip
data
[
:crn
]
=
title_elements
[
1
]
# TODO break this up and write docs
def
sort_item
(
item
,
currentobj
,
data
)
if
item
.
name
==
'th'
if
item
.
to_html
.
include?
'-'
titletxt
=
item
.
text
if
item
.
text
.
include?
' - Honors'
titletxt
=
titletxt
.
gsub
(
' - Honors'
,
' (Honors)'
)
end
titledetails
=
titletxt
.
split
(
' - '
)
if
titledetails
.
count
>
4
titledetails
=
[
"
#{
titledetails
[
0
]
}
#{
titledetails
[
1
]
}
"
,
titledetails
[
2
],
titledetails
[
3
],
titledetails
[
4
]]
end
titledata
=
titledetails
[
2
].
split
(
' '
)
begin
data
=
get_details
(
data
,
titledetails
,
titledata
)[
0
]
currentobj
=
get_details
(
data
,
titledetails
,
titledata
)[
1
]
rescue
StandardError
=>
e
puts
item
puts
e
exit
(
1
)
end
currentobj
[
:fields
]
=
[]
end
elsif
item
.
is_a?
Nokogiri
::
XML
::
Element
item
.
css
(
'th'
).
each
do
|
field
|
currentobj
[
:fields
].
push
(
field
.
text
.
downcase
.
tr
(
' '
,
'_'
))
end
iter
=
0
if
currentobj
if
currentobj
[
:fields
]
upper
=
currentobj
[
:fields
].
count
-
1
while
iter
<=
upper
assign
=
item
.
css
(
'td'
)[
iter
].
text
currentobj
[
currentobj
[
:fields
][
iter
]]
=
assign
iter
+=
1
end
end
end
end
currentobj
full_name
=
title_elements
[
2
].
split
(
' '
)
next
unless
full_name
.
length
==
2
data
[
:subj
]
=
title_elements
[
2
].
split
(
' '
)[
0
]
data
[
:course_number
]
=
title_elements
[
2
].
split
(
' '
)[
1
]
data
[
:section
]
=
title_elements
[
3
].
strip
# rows 1 to 3 contain info about registration and drop dates.
# for now we're gonna ignore them and skip to row 4, which contains details
detail_rows
=
rows
[
start
+
4
].
css
(
'tr'
)
next
unless
detail_rows
.
length
>
0
# if there are no details, skip this item
details
=
detail_rows
.
last
.
text
.
split
(
"
\n
"
).
compact
.
reject
(
&
:empty?
)
# skip empty strings
times
=
details
[
1
].
split
(
' - '
)
if
(
times
.
length
==
1
)
data
[
:start_time
]
=
'TBA'
data
[
:end_time
]
=
'TBA'
else
data
[
:start_time
]
=
times
[
0
]
data
[
:end_time
]
=
times
[
1
]
end
# TODO break this up and write docs
def
get_details
(
data
,
titledetails
,
titledata
)
crn
=
titledetails
[
1
].
strip
data
[
crn
]
=
{}
unless
data
[
titledetails
[
1
]]
crsinfo
=
{
'name'
:
titledetails
[
0
].
strip
}
uniquedata
=
{
'sect'
:
titledetails
[
3
].
strip
,
'crn'
:
titledetails
[
1
].
strip
}
general
=
{
'subj'
:
titledata
[
0
].
strip
,
'code'
:
titledata
[
1
].
strip
}
data
[
crn
]
=
general
.
merge
(
uniquedata
.
merge
(
crsinfo
))
data
[
crn
][
:code
]
=
titledetails
[
2
].
split
(
' '
)[
1
]
[
data
,
data
[
crn
]]
data
[
:days
]
=
details
[
2
].
strip
data
[
:location
]
=
details
[
3
].
strip
dates
=
details
[
4
].
split
(
' - '
)
data
[
:start_date
]
=
dates
[
0
]
data
[
:end_date
]
=
dates
[
1
]
data
[
:type
]
=
details
[
5
]
data
[
:instructor
]
=
details
[
6
]
data
end
end
end
end
schedules_api/db/seeds.rb
View file @
89b77445
...
...
@@ -8,22 +8,25 @@ require 'nokogiri'
require
'json'
threads
=
[]
total
=
[]
total
=
{}
parser
=
PatriotWeb
::
Parser
.
new
# get the first semester only
-- no need to ddos patriot web
# get the first semester only
semester
=
parser
.
parse_semesters
.
first
puts
"DDOSing Patriot Web, buckle up kids
"
# parse all subjects and their courses in the semester
parser
.
parse_subjects
(
semester
).
each
do
|
subject
|
puts
"Getting courses for
#{
subject
}
"
threads
<<
Thread
.
new
{
total
<<
parser
.
parse_courses_in_subject
(
subject
)
total
[
subject
]
=
parser
.
parse_courses_in_subject
(
subject
)
}
end
# For testing, only get first subject
# subject = parser.parse_subjects(semester).first
# total
<<
parser.parse_courses_in_subject(subject)
# total
[subject] =
parser.parse_courses_in_subject(subject)
# wait for all the threads to finish
ThreadsWait
.
all_waits
(
*
threads
)
...
...
@@ -38,46 +41,37 @@ Semester.delete_all
semester
=
Semester
.
create!
season:
'Fall'
,
year:
2018
semester
.
save!
total
.
each
do
|
subject
|
# for each course
subject
.
each_value
do
|
section
|
# for each value in the
subject
hash
# ensure all necessary fields are present
next
unless
(
section
.
key?
"date_range"
)
&&
(
section
.
key?
"instructors"
)
&&
(
section
.
key?
"days"
)
total
.
each
do
|
subject
,
sections
|
puts
"Adding courses for
#{
subject
}
..."
sections
.
each
do
|
section
|
next
if
section
.
nil?
||
!
section
.
key?
(
:subj
)
||
!
section
.
key?
(
:course_number
)
# create a course and set its semester
# Find or create a course and set its semester
# TODO: this breaks when you try to do more than one semester,
# since just the subject + course_number do not uniquely identify a course
# Check the semester as well
course
=
Course
.
find_or_create_by
(
subject:
section
[
:subj
],
course_number:
section
[
:code
])
course_number:
section
[
:course_number
])
course
.
semester
=
semester
course
.
save!
section_name
=
"
#{
section
[
:subj
]
}
#{
section
[
:code
]
}
#{
section
[
:sect
]
}
"
puts
"Adding
#{
section_name
}
..."
# the start and end times are located in the "time" key and look like START_TIME - END_TIME
# so, split them by the dash and add them
start_time
=
if
section
.
key?
"time"
section
[
"time"
].
split
(
' - '
).
first
else
"N/A"
end
section_name
=
"
#{
section
[
:subj
]
}
#{
section
[
:course_number
]
}
#{
section
[
:section
]
}
"
end_time
=
if
section
.
key?
"time"
section
[
"time"
].
split
(
' - '
).
last
else
"N/A"
end
puts
"Adding
#{
section_name
}
..."
Section
.
create!
(
name:
section_name
,
crn:
section
[
:crn
],
title:
section
[
:name
],
location:
section
[
"where"
],
days:
section
[
"days"
],
start_date:
section
[
"date_range"
].
split
(
' - '
).
first
,
end_date:
section
[
"date_range"
].
split
(
' - '
).
last
,
start_time:
start_time
,
end_time:
end_time
,
instructor:
section
[
"instructors"
].
split
(
' '
).
map
{
|
word
|
word
unless
word
.
empty?
}.
join
(
' '
),
section_type:
section
[
:type
],
title:
section
[
:title
],
instructor:
section
[
:instructor
],
start_date:
section
[
:start_date
],
end_date:
section
[
:end_date
],
days:
section
[
:days
],
start_time:
section
[
:start_time
],
end_time:
section
[
:end_time
],
location:
section
[
:location
],
course:
course
)
end
end
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment