Commit 2ed1ebd3 authored by Landon DeCoito's avatar Landon DeCoito
Browse files

Merge branch '12-getconnectedfixer' into 'master'

Resolve "Fix the Get Connected Parse because you shouldn't have pushed it"

Closes #12

See merge request !29
parents b40071ef e1cd9773
# flask imports
from flask import Flask
from flask import Response
from flask import render_template
# app imports
from parscript import load_data
from getconnectedscript import load_getconn_data
# python imports
import json
# Flask application object; the @app.route decorators in this module register their handlers on it.
app = Flask(__name__)
@app.route("/")
def display_default():
    """Serve the site root by rendering the ``welcomepage.html`` template.

    The hand-built plain-text ``Response`` that used to be created first was
    immediately overwritten by the rendered template, so it has been dropped.

    Returns:
        The value produced by ``render_template('welcomepage.html')``.
    """
    return render_template('welcomepage.html')
@app.route("/api/25live")
def display_data():
    """Return the output of ``load_data()`` as a UTF-8 encoded JSON response.

    The payload is built exactly once: the response used to be constructed
    twice in a row, which called ``load_data()`` (and serialised its result)
    two times per request only to discard the first response.

    Returns:
        A ``Response`` whose body is the JSON document and whose
        ``Content-Type`` header is ``application/json; charset=utf-8``.
    """
    # ensure_ascii=False keeps non-ASCII characters as-is; they are then
    # encoded explicitly as UTF-8 to match the charset declared below.
    payload = json.dumps(load_data(), ensure_ascii=False).encode('utf-8')
    resp = Response(payload)
    resp.headers['Content-Type'] = 'application/json; charset=utf-8'
    return resp
@app.route("/api/getconnected")
def display_GC_data():
    """Return the output of ``load_getconn_data()`` as a UTF-8 encoded JSON response.

    The payload is built exactly once: the response used to be constructed
    twice in a row, which called ``load_getconn_data()`` (and serialised its
    result) two times per request only to discard the first response.

    Returns:
        A ``Response`` whose body is the JSON document and whose
        ``Content-Type`` header is ``application/json; charset=utf-8``.
    """
    # ensure_ascii=False keeps non-ASCII characters as-is; they are then
    # encoded explicitly as UTF-8 to match the charset declared below.
    payload = json.dumps(load_getconn_data(), ensure_ascii=False).encode('utf-8')
    resp = Response(payload)
    resp.headers['Content-Type'] = 'application/json; charset=utf-8'
    return resp
This diff is collapsed.
import requests
from bs4 import BeautifulSoup
from parscript import cleanup, doTheTime
# app imports
from parscript import cleanup, convertTime
# third party imports
import feedparser
from bs4 import BeautifulSoup
import requests
# DEV REMINDER: CHANGE THE LINES IN INITIALISATION ERROR MESSAGE (LINE 138)
# TODO: ADD "getconnected" ATTRIBUTE TO LOAD_DATA DICTLIST
def splitAndConvertTime(strin):
    """Split a ``"start-end"`` time range and convert both ends with convertTime.

    Spaces are removed, the text is split on ``"-"`` and the first two pieces
    are used as start and end. When the start has no am/pm suffix of its own
    it borrows the one from the end ("am" if the end is an am time, otherwise
    "pm"). The suffix comparison is case-insensitive, so ``"9-11AM"`` now
    yields a morning start; previously only a lowercase ``"am"`` end was
    recognised and an uppercase one silently turned the start into "pm".

    Args:
        strin: time range text such as ``"9:00 am - 11:30 am"`` or ``"9-11am"``.

    Returns:
        A two-item list ``[start, end]`` holding the convertTime results.

    Raises:
        IndexError: if ``strin`` contains no ``"-"``.
        Exception: whatever convertTime raises for a piece it cannot parse.
    """
    pieces = strin.replace(" ", "").split("-")
    start_text = pieces[0]
    end_text = pieces[1]

    # The end time is converted first so that a malformed end is reported
    # before anything is derived from it.
    end_value = convertTime(end_text)

    if start_text[-2:].lower() not in ("am", "pm"):
        # No suffix on the start: inherit it from the end time.
        if end_text[-2:].lower() == "am":
            start_text += "am"
        else:
            start_text += "pm"

    return [convertTime(start_text), end_value]
# woah = cleanup(requests.get("https://getconnected.gmu.edu/events/events.rss").text)
# soup = BeautifulSoup(woah, "lxml")
# print soup.prettify
def load_getconn_data():
feedtext = requests.get("https://getconnected.gmu.edu/events/events.rss").text
feedtext = requests.get(
"https://getconnected.gmu.edu/events/events.rss").text
feedtext = cleanup(feedtext)
# this calls the RSS feed parser from !feedparser
feed = feedparser.parse(feedtext)
feed = feedparser.parse(feedtext) # this calls the RSS feed parser from !feedparser
# print feed, "\n\n\n"
# ctr = 0
dictlist = []
for entry in feed.entries:
# print"==================================="
error = []
# this collects the attributes which are always there
# also creates the Soup and sets up for the rest of the parsing
try:
uniqueid = entry.id[-7:]
# print uniqueid
title = entry.title
# print title
sumdetsoup = BeautifulSoup(entry.summary_detail["value"].encode("ascii", "replace"), "html.parser")
sumdetsoup = BeautifulSoup(entry.summary_detail["value"]
.encode("utf-8"), "html.parser")
location = [sumdetsoup.div.span.text]
# print location
......@@ -36,44 +55,106 @@ def load_getconn_data():
description = sumdetsoup.find_all("div")[1].text
# print description
datetime = sumdetsoup.b.text
# print datetime
if (datetime.count("(") == 1):
except Exception as e:
error.append(str(e))
# this handles events which start and end on the same day
# if we've found an error, there's no point in continuing
if (datetime.count("(") == 1) and (error == []):
# parses out date info
try:
datesplit = datetime.split(", ")
weekday = datesplit[0]
month = datesplit[1].split(" ")
monthday = month[1]
month = month[0]
temp = datesplit[1].split(" ")
monthday = temp[1]
month = temp[0]
year = datesplit[2][:5]
parsedtimelist = doTheTime(datesplit[2][6:-1])
except Exception as e:
error.append(str(e))
# uses helper function to get the start and end time
try:
parsedtimelist = splitAndConvertTime(datesplit[2][6:-1])
timestart = parsedtimelist[0]
timestop = parsedtimelist[1]
# print {"id":uniqueid, "title":title, "dayofweek":weekday, "dayofmonth":monthday, "month":month, "year":year, "timestart":timestart, "timestop":timestop, "location":location, "description":description}
dictlist.append({"id":uniqueid, "title":title, "dayofweek":weekday, "dayofmonth":monthday, "month":month, "year":year, "timestart":timestart, "timestop":timestop, "location":location, "description":description})
except Exception as e:
error.append(str(e))
# appends the dictlist if no errors were found
if error == []:
dictlist.append({
"multiday": False, "id": uniqueid,
"title": title, "dayofweek": weekday,
"dayofmonth": monthday, "month": month,
"year": year, "timestart": timestart,
"timestop": timestop, "location": location,
"description": description
})
else:
dictlist.append({"id": uniqueid, "error": str(e),
"errorlocation": ""})
# this handles events which start on one day and end on another
elif error == []:
# getting the information for the start day/time
try:
datesplit = datetime.split(" - ")
tempsplits = datesplit[0].split(", ")
weekday = tempsplits[0]
month = tempsplits[1].split(" ")[0]
monthday = tempsplits[1].split(" ")[1]
year = tempsplits[2].split(" ")[0]
timestart = datesplit[0].split("(")[1][:-1]
timestart = convertTime(timestart)
except Exception as e:
error.append(str(e))
# getting the information for the end day/time
try:
tempsplits = datesplit[1].split(", ")
endweekday = tempsplits[0]
endmonth = tempsplits[1].split(" ")[0]
endmonthday = tempsplits[1].split(" ")[1]
endyear = tempsplits[2].split(" ")[0]
timestop = datesplit[1].split("(")[1][:-1]
timestop = convertTime(timestop)
except Exception as e:
error.append(str(e))
# append the dictlist if no errors were found
if error == []:
dictlist.append({
"multiday": True, "id": uniqueid,
"title": title, "dayofweek": weekday,
"dayofmonth": monthday, "month": month,
"year": year, "timestart": timestart,
"timestop": timestop, "location": location,
"description": description,
"enddayofweek": endweekday,
"enddayofmonth": endmonthday,
"endmonth": endmonth, "endyear": endyear
})
else:
dictlist.append({"id": uniqueid, "error": str(e)})
else:
dictlist.append({"error": "issue in initialization of event.\
check lines 40-56 in getconnectedscript.py"})
return dictlist
#print "\n\n", sumdetsoup.prettify()
#print"==================================="
#dictlist.append({"id":uniqueid, "title":entry_title, "dayofweek":weekday, "dayofmonth":monthday, "month":month, "year":year, "timestart":timestart, "timestop":timestop, "location":location, "description":description})
#This was intended to figure out what objects are in each entry and what appears only sometimes
#The results are:
####Every event has:
#-------summary
#-------published_parsed
#-------links
#-------author
#-------summary
#-------guidislink
#-------title_detail
#-------link
#-------authors
#-------title
#-------author_detail
#-------id
#-------published
####Some events have:
#-------tags
# Every event has:
# -------summary
# -------published_parsed
# -------links
# -------author
# -------summary
# -------guidislink
# -------title_detail
# -------link
# -------authors
# -------title
# -------author_detail
# -------id
# -------published
# Some events have:
# -------tags
#print "and we begin"
# print "and we begin"
# third party imports
from bs4 import BeautifulSoup
from datetime import date, time
import requests
def cleanup(str):
    """Turn leftover HTML entities/boilerplate into usable characters.

    The substitutions below are applied one after another, in order; then the
    final character of the text is dropped, and only after that is
    ``&rsquo;`` replaced by an apostrophe.

    The parameter keeps its historical name ``str`` (it shadows the builtin
    inside this function only).

    Returns:
        The cleaned-up text.
    """
    substitutions = (
        ("&amp;", "&"),
        ("&nbsp;", " "),
        ("&ndash;", "-"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("<br/>", "\n"),
        ("Publish event on the Calendar?: TRUE \n", ""),
        ("Performing any medical procedures?: FALSE \n", ""),
        ("Parking Needed?: FALSE \n", ""),
    )
    text = str
    for needle, substitute in substitutions:
        text = text.replace(needle, substitute)
    # the last character is discarded before the apostrophe entity is handled
    text = text[:-1]
    return text.replace("&rsquo;", "'")
class eventException(Exception):
    """Project-specific exception carrying a plain message.

    It derives from ``Exception`` so that it can actually be raised and
    caught: a class that does not inherit from ``BaseException`` cannot be
    used with ``raise``/``except`` on Python 3.
    """

    def __init__(self, message):
        # Initialise the base class as well so ``args`` is populated.
        Exception.__init__(self, message)
        self.__message = message

    def __str__(self):
        return self.__message
def doTheTime(strin):
    """Split a ``"start-end"`` time range and convert both ends with convertTime.

    Spaces are removed, the text is split on ``"-"`` and the first two pieces
    are used as start and end. A start without its own am/pm suffix borrows
    the suffix of the end time. Suffixes are compared case-insensitively to
    agree with convertTime, which accepts both "am"/"pm" and "AM"/"PM";
    previously an uppercase suffix on the start was not recognised and a
    second suffix was appended to it.

    Args:
        strin: time range text such as ``"9:00 am - 11:30 am"``.

    Returns:
        A two-item list ``[start, end]`` holding the convertTime results.

    Raises:
        eventException: if converting the end time raises ``ValueError``;
            the message is the string form of the split pieces.
        IndexError: if ``strin`` contains no ``"-"``.
    """
    pieces = strin.replace(" ", "").split("-")
    try:
        end_value = convertTime(pieces[1])
    except ValueError:
        raise eventException(str(pieces))

    start_text = pieces[0]
    if start_text[-2:].lower() not in ("am", "pm"):
        # No suffix on the start: inherit it from the end time.
        if pieces[1][-2:].lower() == "am":
            start_text += "am"
        else:
            start_text += "pm"

    return [convertTime(start_text), end_value]
#convertTime accepts strings in the form of ""
def convertTime(stri): #this function is used for splicing the event times.
if (stri[-2:] == "pm" or stri[-2:] == "PM"): #checks to see if the time presented is pm
if not ((stri[0] == "1") and (stri[1] == "2")): #if the time is pm, then the 12:00 hour is noon and shouldn't get 12 added to it
try: #this try block works with the exception handler to add 12 to any pm times
# this function cleans up some of the useless html leftovers to characters we can actually use
def cleanup(dirtystring):
    """Apply a fixed list of text substitutions, then drop the last character.

    The substitutions run one after another in the order listed, so the
    output of an earlier one can be matched by a later one.

    Returns:
        The substituted text without its final character.
    """
    substitutions = (
        ("&amp;", "&"),
        ("&nbsp;", " "),
        ("&ndash;", "-"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("<br/>", "\n"),
        ("Publish event on the Calendar?: TRUE \n", ""),
        ("Performing any medical procedures?: FALSE \n", ""),
        ("Parking Needed?: FALSE \n", ""),
        ("\n\n\n", "\n"),
        ("\n\n", "\n"),
        ("&rsquo;", "'"),
    )
    cleaned = dirtystring
    for needle, substitute in substitutions:
        cleaned = cleaned.replace(needle, substitute)
    # the final character is always discarded
    cleaned = cleaned[:-1]
    return cleaned
# convertTime accepts strings such as "9am", "9:30am" or "12:05PM" (an hour, optional ":minutes", then an am/pm suffix)
def convertTime(stri):
    """Convert a 12-hour clock string into minutes after midnight.

    Accepts an hour, an optional ``":minutes"`` part and an am/pm suffix in
    any letter case, e.g. ``"9am"``, ``"9:30am"``, ``"12:05PM"``.

    Hour 12 is treated as the start of its half-day, so ``"12pm"`` (noon) is
    720 and ``"12am"`` (midnight) is 0; midnight used to come back as 720,
    indistinguishable from noon.

    Args:
        stri: the time text, without spaces.

    Returns:
        int: ``hour * 60 + minutes`` on a 24-hour clock.

    Raises:
        ValueError: if the hour or minute part is not an integer.
        Exception: if ``stri`` does not end in an am/pm suffix.
    """
    suffix = stri[-2:].lower()
    if suffix not in ("am", "pm"):
        raise Exception("Issue with time dilation. Input string: " + stri)

    # Everything before the suffix is "hour" or "hour:minutes".
    pieces = stri[:-2].split(":")
    hour = int(pieces[0])
    minutes = int(pieces[1]) if len(pieces) > 1 else 0

    if hour == 12:
        # 12 o'clock opens the half-day: midnight for am, noon for pm
        # (the pm offset below puts noon back at 12).
        hour = 0
    if suffix == "pm":
        hour += 12

    return hour * 60 + minutes
def load_data():
......@@ -85,128 +70,128 @@ def load_data():
"""
dictlist = []
DaysOfWeek = {
"Sunday" : 0,
"Monday" : 1,
"Tuesday" : 2,
"Wednesday" : 3,
"Thursday" : 4,
"Friday" : 5,
"Saturday" : 6,
"Sunday": 0,
"Monday": 1,
"Tuesday": 2,
"Wednesday": 3,
"Thursday": 4,
"Friday": 5,
"Saturday": 6,
}
notProvide = "Not Provided"
counter = 0
soup = BeautifulSoup(cleanup(requests.get("http://25livepub.collegenet.com/calendars/events_all.xml").text), "lxml") #creates soup of the xml
#creates a list of all the entry tags from the xml
soup = BeautifulSoup(cleanup(requests.get("http://25livepub.collegenet.com/calendars/events_all.xml").text), "lxml")
# creates a list of all the entry tags from the xml
entries = soup.findAll('entry')
#indexs an entry in the list of entries
# indexs an entry in the list of entries
for entry in entries:
error = []
#pulls up an entries in the list of entries, finds the title tag and .text deletes all xml tags and returns just the text as a string
try:
uniqueid = entry.id.text
uniqueid = uniqueid[-9:]
except Exception:
uniqueid = "Error with getting ID"
# pulls up an entries in the list of entries, finds the title tag and .text deletes all xml tags and returns just the text as a string
try:
entry_title = entry.title.text
entry_content = entry.content.text
uniqueid = entry.id.text
#makes it easy to find as things may be unevenly spaced
entry_content = entry_content.replace("\n\n\n" , "\n")
entry_content = entry_content.replace("\n\n" , "\n")
# makes it easy to find as things may be unevenly spaced
entry_content = entry_content.replace("\n\n\n", "\n")
entry_content = entry_content.replace("\n\n", "\n")
#check clearcontent function
entry_content = cleanup(entry_content) #we might just get rid of this one
# check clearcontent function
entry_content = cleanup(entry_content) # we might just get rid of this one
#each piece of content may is seperated by a newline, entry_detailes creates a list
# each piece of content may is seperated by a newline, entry_detailes creates a list
entry_detailes = entry_content.split("\n")
except Exception as e:
error.append(str(e))
dictlist.append({"id": uniqueid, "error": error})
continue
# in entry detailes list normally the conditions go as follow
# [0] is the location
# [1] is the date
# [2] is the description
#in entry detailes list normally the conditions go as follow
#[0] is the location
#[1] is the date
#[2] is the description
#either conditions follows
#[0] is date
#[0] is location
#[1] is date
# either conditions follows
# [0] is date
#[0] is date
#[1] is description
# [0] is location
# [1] is date
#sometimes the location or description is not given; however, the location always goes before date and
#the description always follows the date. The date is always present. See examples above
# [0] is date
# [1] is description
#(A) if the location is not given then the date must be index [0]
#(B) if the length of the list = 1 and date is index [0] --> location not given & description is not given
#(C) if the length of the list = 2 and date is index [0] --> location not given but description is given at [1]
# sometimes the location or description is not given; however, the location always goes before date and
# the description always follows the date. The date is always present. See examples above
#(D) if the location is given then the date must be index [1]
#(E) if the length of the list = 2 and date is index [1] --> location is given at [0] but description is not given
#(F) if the length of the list = 3 and date is index [1] --> location is given at [0] and description is given at [2]
# (A) if the location is not given then the date must be index [0]
# (B) if the length of the list = 1 and date is index [0] --> location not given & description is not given
# (C) if the length of the list = 2 and date is index [0] --> location not given but description is given at [1]
# (D) if the location is given then the date must be index [1]
# (E) if the length of the list = 2 and date is index [1] --> location is given at [0] but description is not given
# (F) if the length of the list = 3 and date is index [1] --> location is given at [0] and description is given at [2]
#the two if statements finds the date string. The date string always starts with
#Monday Tuesday Wednesday Thursday Friday Saturday Sunday or Ongoing and the date
#is always on either [0] or [1]
# the two if statements finds the date string. The date string always starts with
# Monday Tuesday Wednesday Thursday Friday Saturday Sunday or Ongoing and the date
# is always on either [0] or [1]
#see (A) above
# see (A) above
try:
if entry_detailes[0].split(",")[0] in DaysOfWeek:
#See (B)
# See (B)
if len(entry_detailes) == 1:
location = notProvide
date = entry_detailes[0]
description = notProvide
#see (C)
# see (C)
elif len(entry_detailes) == 2:
location = notProvide
date = entry_detailes[0]
description = entry_detailes[1]
#This extra case was made because one entry had the description split into two by a
#newline so it registered as two descriptions making the length = 3
# This extra case was made because one entry had the description split into two by a
# newline so it registered as two descriptions making the length = 3
elif len(entry_detailes) == 3:
location = notProvide
date = entry_detailes[0]
description = entry_detailes[1] + " " + entry_detailes[2]
#this will print if the code has failed to account for something in detailes, but it works as of December 26th 2017
# this will print if the code has failed to account for something in detailes, but it works as of December 26th 2017
else:
raise eventException("failed to account for detail in entry_detailes when date element is index 0 on entry_detailes list")
raise Exception("failed to account for detail in entry_detailes when date element is index 0 on entry_detailes list")
#see (D) above
# see (D) above
elif entry_detailes[1].split(",")[0] in DaysOfWeek:
#See (E)
# See (E)
if len(entry_detailes) == 2:
location = entry_detailes[0]
date = entry_detailes[1]
description = notProvide
#See (F)
# See (F)
elif len(entry_detailes) == 3:
location = entry_detailes[0]
date = entry_detailes[1]
description = entry_detailes[2]
#This extra case was made because one entry had the description split into two by a
#newline so it registered as two descriptions making the length = 3
# This extra case was made because one entry had the description split into two by a
# newline so it registered as two descriptions making the length = 3
elif len(entry_detailes) == 4:
location = entry_detailes[0]
date = entry_detailes[1]
description = entry_detailes[2] + " " + entry_detailes[3]
#this will print if the code has failed to account for something in detailes
# this will print if the code has failed to account for something in detailes
else:
raise eventException("failed to account for detail in entry_detailes when date element is index 1 on entry_detailes list")
#this will print if the above if statements failed to find the date block
raise Exception("failed to account for detail in entry_detailes when date element is index 1 on entry_detailes list")
# this will print if the above if statements failed to find the date block
else:
raise eventException("failed to find and account for date element in entry_detailes list")
except eventException as e:
raise Exception("failed to find and account for date element in entry_detailes list")
except Exception as e:
error.append(str(e))
except Exception:
error.append("Error intialising event")
try:
uniqueid = uniqueid[-9:]
except:
uniqueid = "Error with getting ID"
try:
if location != notProvide:
......@@ -223,7 +208,7 @@ def load_data():
else:
location = [location]
except Exception:
error.append("Error with location")
error.append("Location Error: " + str(e))
try:
date = date.split(",")
......@@ -234,18 +219,17 @@ def load_data():
month = date[0]
monthday = date[1][:(len(date[1]) - 1)]
year = date[2]
except Exception:
error.append("Error with time/date splicing")
except Exception as e:
error.append("Date Error: " + str(e))
try:
time = time.replace(" ", "")
time = time.split("-")
try:
timestop = convertTime(time[1])
except ValueError:
raise eventException(str(time))
if timestop == None:
raise eventException(str(time))
if timestop is None:
raise Exception(str(time))
if not (time[0][-2:] == "am") and not (time[0][-2:] == "pm"):
if (time[1][-2:] == "am"):
timestart = convertTime(time[0] + "am")
......@@ -253,10 +237,8 @@ def load_data():
timestart = convertTime(time[0] + "pm")
else:
timestart = convertTime(time[0])