Commit d09a0594 authored by Landon DeCoito
Browse files

made parscript a BIT nicer and revamped the cleanup() function

parent 7241df82
#print "and we begin" # print "and we begin"
# third party imports
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from datetime import date, time
import requests import requests
def cleanup(dirtystring):
    """Turn leftover HTML fragments into plain characters we can actually use.

    Applies a fixed, ordered list of substring replacements (HTML entities,
    ``<br/>`` tags, boilerplate form lines, runs of newlines) and then drops
    the final character of the result.

    Args:
        dirtystring: the raw text to clean.

    Returns:
        The cleaned text as a new string; the input is not modified.
    """
    # Order matters: "&amp;" is decoded first so that a double-escaped
    # entity such as "&amp;lt;" is fully decoded by the later passes.
    # NOTE(review): the first pair was displayed as ("&", "&") -- a no-op --
    # so "&amp;" is the presumed original text; confirm against the repo.
    replacements = [
        ("&amp;", "&"),
        ("&nbsp;", " "),
        ("&ndash;", "-"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("<br/>", "\n"),
        ("Publish event on the Calendar?: TRUE \n", ""),
        ("Performing any medical procedures?: FALSE \n", ""),
        ("Parking Needed?: FALSE \n", ""),
        ("\n\n\n", "\n"),
        ("\n\n", "\n"),
        ("&rsquo;", "'"),
    ]
    for old, new in replacements:
        # str.replace returns a new string, so the result must be rebound.
        dirtystring = dirtystring.replace(old, new)
    # The last character is dropped unconditionally (presumably a trailing
    # newline/space left by the feed -- TODO confirm).
    return dirtystring[:-1]
class eventException(Exception):
    """Exception raised when an event entry cannot be parsed.

    Derives from ``Exception`` so it can be raised on Python 3 and is caught
    by ``except Exception`` handlers.

    Args:
        message: human-readable description of what went wrong.
    """

    def __init__(self, message):
        # Explicit two-argument super() keeps this valid on Python 2 as well.
        super(eventException, self).__init__(message)
        self.__message = message

    def __str__(self):
        return self.__message
#convertTime accepts strings in the form of ""
def convertTime(stri): #this function is used for splicing the event times. # convertTime accepts strings in the form of ""
if (stri[-2:] == "pm" or stri[-2:] == "PM"): #checks to see if the time presented is pm def convertTime(stri): # this function is used for splicing the event times.
if not ((stri[0] == "1") and (stri[1] == "2")): #if the time is pm, then the 12:00 hour is noon and shouldn't get 12 added to it if (stri[-2:] == "pm" or stri[-2:] == "PM"): # checks to see if the time presented is pm
try: #this try block works with the exception handler to add 12 to any pm times if not ((stri[0] == "1") and (stri[1] == "2")): # if the time is pm, then the 12:00 hour is noon and shouldn't get 12 added to it
try: # this try block works with the exception handler to add 12 to any pm times
stri = stri.replace(stri[0:2], str(int(stri[0:2]) + 12), 1) stri = stri.replace(stri[0:2], str(int(stri[0:2]) + 12), 1)
#print "I did the first one " + stri # print "I did the first one " + stri
except: except:
stri = stri.replace(stri[0], str(int(stri[0]) + 12), 1) stri = stri.replace(stri[0], str(int(stri[0]) + 12), 1)
#print "I did the NOT first one " + stri # print "I did the NOT first one " + stri
if ":" in stri: #this if/else reliably converts the time to minutes. accepts either "hour:minute" or simply "hour" if ":" in stri: # this if/else reliably converts the time to minutes. accepts either "hour:minute" or simply "hour"
try: try:
return ((int(stri[0:2])) * 60) + int(stri[3:5]) return ((int(stri[0:2])) * 60) + int(stri[3:5])
except: except:
...@@ -46,7 +57,7 @@ def convertTime(stri): #this function is used for splicing the event times. ...@@ -46,7 +57,7 @@ def convertTime(stri): #this function is used for splicing the event times.
return (int(stri[0:2])) * 60 return (int(stri[0:2])) * 60
except: except:
return (int(stri[0])) * 60 return (int(stri[0])) * 60
elif (stri[-2:] == "am" or stri[-2:] == "AM"): #checks if the time presented is am, and executes identical code from the pm block, just without adding 12 elif (stri[-2:] == "am" or stri[-2:] == "AM"): # checks if the time presented is am, and executes identical code from the pm block, just without adding 12
if ":" in stri: if ":" in stri:
try: try:
return (int(stri[0:2]) * 60) + int(stri[3:5]) return (int(stri[0:2]) * 60) + int(stri[3:5])
...@@ -68,117 +79,115 @@ def load_data(): ...@@ -68,117 +79,115 @@ def load_data():
""" """
dictlist = [] dictlist = []
DaysOfWeek = { DaysOfWeek = {
"Sunday" : 0, "Sunday": 0,
"Monday" : 1, "Monday": 1,
"Tuesday" : 2, "Tuesday": 2,
"Wednesday" : 3, "Wednesday": 3,
"Thursday" : 4, "Thursday": 4,
"Friday" : 5, "Friday": 5,
"Saturday" : 6, "Saturday": 6,
} }
notProvide = "Not Provided" notProvide = "Not Provided"
counter = 0 counter = 0
soup = BeautifulSoup(cleanup(requests.get("http://25livepub.collegenet.com/calendars/events_all.xml").text), "lxml") #creates soup of the xml soup = BeautifulSoup(cleanup(requests.get("http://25livepub.collegenet.com/calendars/events_all.xml").text), "lxml")
#creates a list of all the entry tags from the xml # creates a list of all the entry tags from the xml
entries = soup.findAll('entry') entries = soup.findAll('entry')
#indexs an entry in the list of entries # indexs an entry in the list of entries
for entry in entries: for entry in entries:
error = [] error = []
#pulls up an entries in the list of entries, finds the title tag and .text deletes all xml tags and returns just the text as a string # pulls up an entries in the list of entries, finds the title tag and .text deletes all xml tags and returns just the text as a string
entry_title = entry.title.text entry_title = entry.title.text
entry_content = entry.content.text entry_content = entry.content.text
uniqueid = entry.id.text uniqueid = entry.id.text
#makes it easy to find as things may be unevenly spaced
entry_content = entry_content.replace("\n\n\n" , "\n")
entry_content = entry_content.replace("\n\n" , "\n")
#check clearcontent function # makes it easy to find as things may be unevenly spaced
entry_content = cleanup(entry_content) #we might just get rid of this one entry_content = entry_content.replace("\n\n\n", "\n")
entry_content = entry_content.replace("\n\n", "\n")
# check clearcontent function
entry_content = cleanup(entry_content) # we might just get rid of this one
#each piece of content may is seperated by a newline, entry_detailes creates a list # each piece of content may is seperated by a newline, entry_detailes creates a list
entry_detailes = entry_content.split("\n") entry_detailes = entry_content.split("\n")
# in entry detailes list normally the conditions go as follow
# [0] is the location
# [1] is the date
# [2] is the description
# either conditions follows
# [0] is date
# [0] is location
# [1] is date
# [0] is date
# [1] is description
#in entry detailes list normally the conditions go as follow # sometimes the location or description is not given; however, the location always goes before date and
#[0] is the location # the description always follows the date. The date is always present. See examples above
#[1] is the date
#[2] is the description # (A) if the location is not given then the date must be index [0]
# (B) if the length of the list = 1 and date is index [0] --> location not given & description is not given
#either conditions follows # (C) if the length of the list = 2 and date is index [0] --> location not given but description is given at [1]
#[0] is date
# (D) if the location is given then the date must be index [1]
#[0] is location # (E) if the length of the list = 2 and date is index [1] --> location is given at [0] but description is not given
#[1] is date # (F) if the length of the list = 3 and date is index [1] --> location is given at [0] and description is given at [2]
#[0] is date # the two if statements finds the date string. The date string always starts with
#[1] is description # Monday Tuesday Wednesday Thursday Friday Saturday Sunday or Ongoing and the date
# is always on either [0] or [1]
#sometimes the location or description is not given; however, the location always goes before date and
#the description always follows the date. The date is always present. See examples above # see (A) above
#(A) if the location is not given then the date must be index [0]
#(B) if the length of the list = 1 and date is index [0] --> location not given & description is not given
#(C) if the length of the list = 2 and date is index [0] --> location not given but description is given at [1]
#(D) if the location is given then the date must be index [1]
#(E) if the length of the list = 2 and date is index [1] --> location is given at [0] but description is not given
#(F) if the length of the list = 3 and date is index [1] --> location is given at [0] and description is given at [2]
#the two if statements finds the date string. The date string always starts with
#Monday Tuesday Wednesday Thursday Friday Saturday Sunday or Ongoing and the date
#is always on either [0] or [1]
#see (A) above
try: try:
if entry_detailes[0].split(",")[0] in DaysOfWeek: if entry_detailes[0].split(",")[0] in DaysOfWeek:
#See (B) # See (B)
if len(entry_detailes) == 1: if len(entry_detailes) == 1:
location = notProvide location = notProvide
date = entry_detailes[0] date = entry_detailes[0]
description = notProvide description = notProvide
#see (C) # see (C)
elif len(entry_detailes) == 2: elif len(entry_detailes) == 2:
location = notProvide location = notProvide
date = entry_detailes[0] date = entry_detailes[0]
description = entry_detailes[1] description = entry_detailes[1]
#This extra case was made because one entry had the description split into two by a # This extra case was made because one entry had the description split into two by a
#newline so it registered as two descriptions making the length = 3 # newline so it registered as two descriptions making the length = 3
elif len(entry_detailes) == 3: elif len(entry_detailes) == 3:
location = notProvide location = notProvide
date = entry_detailes[0] date = entry_detailes[0]
description = entry_detailes[1] + " " + entry_detailes[2] description = entry_detailes[1] + " " + entry_detailes[2]
#this will print if the code has failed to account for something in detailes, but it works as of December 26th 2017 # this will print if the code has failed to account for something in detailes, but it works as of December 26th 2017
else: else:
raise eventException("failed to account for detail in entry_detailes when date element is index 0 on entry_detailes list") raise eventException("failed to account for detail in entry_detailes when date element is index 0 on entry_detailes list")
# see (D) above
#see (D) above
elif entry_detailes[1].split(",")[0] in DaysOfWeek: elif entry_detailes[1].split(",")[0] in DaysOfWeek:
#See (E) # See (E)
if len(entry_detailes) == 2: if len(entry_detailes) == 2:
location = entry_detailes[0] location = entry_detailes[0]
date = entry_detailes[1] date = entry_detailes[1]
description = notProvide description = notProvide
#See (F) # See (F)
elif len(entry_detailes) == 3: elif len(entry_detailes) == 3:
location = entry_detailes[0] location = entry_detailes[0]
date = entry_detailes[1] date = entry_detailes[1]
description = entry_detailes[2] description = entry_detailes[2]
#This extra case was made because one entry had the description split into two by a # This extra case was made because one entry had the description split into two by a
#newline so it registered as two descriptions making the length = 3 # newline so it registered as two descriptions making the length = 3
elif len(entry_detailes) == 4: elif len(entry_detailes) == 4:
location = entry_detailes[0] location = entry_detailes[0]
date = entry_detailes[1] date = entry_detailes[1]
description = entry_detailes[2] + " " + entry_detailes[3] description = entry_detailes[2] + " " + entry_detailes[3]
#this will print if the code has failed to account for something in detailes # this will print if the code has failed to account for something in detailes
else: else:
raise eventException("failed to account for detail in entry_detailes when date element is index 1 on entry_detailes list") raise eventException("failed to account for detail in entry_detailes when date element is index 1 on entry_detailes list")
#this will print if the above if statements failed to find the date block # this will print if the above if statements failed to find the date block
else: else:
raise eventException("failed to find and account for date element in entry_detailes list") raise eventException("failed to find and account for date element in entry_detailes list")
except eventException as e: except eventException as e:
...@@ -227,7 +236,7 @@ def load_data(): ...@@ -227,7 +236,7 @@ def load_data():
timestop = convertTime(time[1]) timestop = convertTime(time[1])
except ValueError: except ValueError:
raise eventException(str(time)) raise eventException(str(time))
if timestop == None: if timestop is None:
raise eventException(str(time)) raise eventException(str(time))
if not (time[0][-2:] == "am") and not (time[0][-2:] == "pm"): if not (time[0][-2:] == "am") and not (time[0][-2:] == "pm"):
if (time[1][-2:] == "am"): if (time[1][-2:] == "am"):
...@@ -239,8 +248,6 @@ def load_data(): ...@@ -239,8 +248,6 @@ def load_data():
except Exception: except Exception:
error.append("Error with time reformatting") error.append("Error with time reformatting")
'''print "-----------------------------------------------------------------------------" '''print "-----------------------------------------------------------------------------"
print location print location
print day print day
...@@ -252,10 +259,11 @@ def load_data(): ...@@ -252,10 +259,11 @@ def load_data():
print description print description
print "----------------------------------------------------------------------------" print "----------------------------------------------------------------------------"
''' '''
if (error == []): if (error == []):
dictlist.append({"id":uniqueid, "title":entry_title, "dayofweek":day, "dayofmonth":monthday, "month":month, "year":year, "timestart":timestart, "timestop":timestop, "location":location, "description":description}) dictlist.append({"id": uniqueid, "title": entry_title, "dayofweek": day, "dayofmonth": monthday, "month": month, "year": year, "timestart": timestart, "timestop": timestop, "location": location, "description": description})
else: else:
dictlist.append({"id":uniqueid, "error":error}) dictlist.append({"id": uniqueid, "error": error})
return dictlist return dictlist
# everything in the house is fuzzy, stupid dogs were acting like pollinators, if that's how you even spell it # everything in the house is fuzzy, stupid dogs were acting like pollinators, if that's how you even spell it
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment