# parscript.py
#print "and we begin"
from bs4 import BeautifulSoup
from datetime import date, time
import requests
5

6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
def cleanup(str): #this function cleans up some of the useless html leftovers to characters we can actually use
	"""
	Replace leftover HTML entities/markup with plain characters and strip boilerplate.

	The parameter keeps its historical name ``str`` (which shadows the builtin
	inside this function only) so existing callers are unaffected.

	Steps, applied in order:
	  * ``&amp;`` -> ``&`` (done first, so doubly-escaped text such as
	    ``&amp;lt;`` is fully unescaped by the later steps)
	  * non-breaking space and en dash, as entities or literal characters,
	    -> a regular space / hyphen
	  * ``&lt;`` / ``&gt;`` -> ``<`` / ``>``
	  * ``<br/>`` -> newline
	  * three fixed boilerplate lines are removed
	  * the final character of the result is dropped

	Returns the cleaned string; an empty input yields an empty string.
	"""
	replacements = (
		("&amp;", "&"),
		# NOTE(review): the previous code replaced "&" with "&" and " " with " "
		# (both no-ops); the entity forms below are presumed to be what was meant.
		("&nbsp;", " "),
		(u"\u00a0", " "), #literal non-breaking space
		("&ndash;", "-"),
		(u"\u2013", "-"), #literal en dash
		("&lt;", "<"),
		("&gt;", ">"),
		("<br/>", "\n"),
		("Publish event on the Calendar?: TRUE \n", ""),
		("Performing any medical procedures?: FALSE \n", ""),
		("Parking Needed?: FALSE \n", ""),
	)
	for old, new in replacements:
		str = str.replace(old, new)
	#the last character is always discarded, exactly as before
	return str[:-1]

class eventException(Exception): #this class is just an exception for our use
	"""
	Error raised while parsing an event; ``str()`` of it is the message.

	It derives from ``Exception`` so that it can actually be raised and
	caught: Python 3 rejects ``raise``/``except`` on classes that do not
	inherit from ``BaseException``.
	"""

	def __init__(self,message):
		super(eventException, self).__init__(message)
		self.__message = message

	def __str__(self):
		return self.__message

def convertTime(stri): #this function is used for splicing the event times.
	"""
	Convert a 12-hour clock string into minutes since midnight.

	Accepts "hour" or "hour:minute" immediately followed by "am" or "pm",
	e.g. "9am", "12pm", "1:30pm", "10:05am".

	Returns an int: hour * 60 + minute on a 24-hour clock. The 12 o'clock
	hour is treated as hour 0 of its half-day, so "12am" is 0 and "12pm"
	is 720.

	Raises eventException when the string does not end in "am"/"pm", and
	ValueError when the hour or minute part is not a number.
	"""
	meridiem = stri[-2:]
	if meridiem not in ("am", "pm"):
		raise eventException("This is weird and please don't happen")

	hour_text, colon, minute_text = stri[:-2].partition(":")
	hour = int(hour_text)
	#only the first two characters after the colon are read as the minutes
	minute = int(minute_text[:2]) if colon else 0

	#12am is midnight and 12pm is noon: fold 12 to 0 before the pm offset is added
	if hour == 12:
		hour = 0
	if meridiem == "pm":
		hour += 12
	return hour * 60 + minute


62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
def load_data():
	"""
	Download the events XML feed and turn every <entry> into a dict.

	Returns a list with one dict per entry, in feed order. A successfully
	parsed entry has the keys "id", "title", "dayofweek", "dayofmonth",
	"month", "year", "timestart", "timestop", "location" and "description".
	An entry that could not be parsed is {"id": ..., "error": [message]}.

	Each entry is parsed independently, so a failure in one entry can never
	pick up values left over from the previous one. Network errors
	(including the request timeout) propagate to the caller.
	"""
	response = requests.get(_EVENTS_FEED_URL, timeout=_REQUEST_TIMEOUT_SECONDS)
	soup = BeautifulSoup(cleanup(response.text), "lxml") #creates soup of the xml
	return [_parse_entry(entry) for entry in soup.find_all('entry')]


_EVENTS_FEED_URL = "http://25livepub.collegenet.com/calendars/events_all.xml"
_REQUEST_TIMEOUT_SECONDS = 30
_NOT_PROVIDED = "Not Provided"
_DAYS_OF_WEEK = frozenset([
	"Sunday",
	"Monday",
	"Tuesday",
	"Wednesday",
	"Thursday",
	"Friday",
	"Saturday",
])
_CAMPUSES = ("Fairfax Campus", "Arlington Campus")


def _parse_entry(entry):
	"""Build the result dict (event or error) for a single <entry> tag."""
	entry_title = entry.title.text
	entry_content = entry.content.text
	#only the last nine characters of the id tag are kept as the id
	uniqueid = entry.id.text[-9:]

	#collapse runs of newlines so the pieces are easy to find, then split them
	entry_content = entry_content.replace("\n\n\n", "\n").replace("\n\n", "\n")
	entry_detailes = cleanup(entry_content).split("\n")

	def failed(message):
		return {"id": uniqueid, "error": [message]}

	#a stage only runs when the stage it depends on succeeded, so only the
	#first real problem of an entry is reported
	try:
		location, date, description = _split_details(entry_detailes)
	except ValueError as problem:
		return failed(str(problem))

	try:
		day, month, monthday, year, time = _split_date(date)
	except IndexError:
		return failed("Error with time/date splicing")

	try:
		timestart, timestop = _parse_time_range(time)
	except Exception:
		#convertTime reports bad input with more than one exception type;
		#any of them means the time could not be understood
		return failed("Error with time reformatting")

	return {
		"id": uniqueid,
		"title": entry_title,
		"dayofweek": day,
		"dayofmonth": monthday,
		"month": month,
		"year": year,
		"timestart": timestart,
		"timestop": timestop,
		"location": _split_location(location),
		"description": description,
	}


def _split_details(entry_detailes):
	"""
	Pick (location, date, description) out of the content lines.

	The date line starts with a weekday name and sits at index 0 (no location
	given) or index 1 (location at index 0). Up to two lines may follow it;
	they form the description (two lines are joined with a space, since a
	description is sometimes split by a newline). Missing location or
	description become "Not Provided".

	Raises ValueError carrying the message to report for the entry.
	"""
	if entry_detailes[0].split(",")[0] in _DAYS_OF_WEEK:
		date_index = 0
	elif len(entry_detailes) < 2:
		raise ValueError("Error intialising event")
	elif entry_detailes[1].split(",")[0] in _DAYS_OF_WEEK:
		date_index = 1
	else:
		raise ValueError("failed to find and account for date element in entry_detailes list")

	after_date = entry_detailes[date_index + 1:]
	if len(after_date) > 2:
		raise ValueError(
			"failed to account for detail in entry_detailes when date element is index %d on entry_detailes list"
			% date_index
		)

	location = entry_detailes[0] if date_index == 1 else _NOT_PROVIDED
	description = " ".join(after_date) if after_date else _NOT_PROVIDED
	return location, entry_detailes[date_index], description


def _split_location(location):
	"""
	Turn the location text into the list stored under "location".

	A given location loses its last character and gets ", " appended; for a
	Fairfax or Arlington location the text is then split on
	", <campus name>, " and the final piece is discarded.
	NOTE(review): for any other location the appended ", " stays in the
	result - kept as-is because consumers may rely on it; confirm.
	"""
	if location != _NOT_PROVIDED:
		location = location[:-1] + ", "
	for campus in _CAMPUSES:
		if campus in location:
			return location.split(", " + campus + ", ")[:-1]
	return [location]


def _split_date(date):
	"""
	Split "<weekday>, <month> <day>, <year>, <time range>" into its parts.

	Returns (day, month, monthday, year, time). Raises IndexError when the
	text has fewer comma- or space-separated pieces than expected.
	"""
	pieces = date.split(",")
	day = pieces[0]
	time = pieces[3][1:] #drop the space that follows the comma
	calendar = (pieces[1][1:] + "," + pieces[2]).split(" ")
	month = calendar[0]
	monthday = calendar[1][:-1] #drop the trailing comma
	year = calendar[2]
	return day, month, monthday, year, time


def _parse_time_range(time):
	"""
	Convert "<start> - <stop>" into (timestart, timestop) via convertTime.

	When the start has no am/pm suffix it borrows the one of the stop time
	("am" if the stop ends in "am", otherwise "pm").
	"""
	pieces = time.replace(" ", "").split("-")
	start, stop = pieces[0], pieces[1]
	timestop = convertTime(stop)
	if timestop is None:
		raise ValueError(str(pieces))
	if start[-2:] not in ("am", "pm"):
		start += "am" if stop[-2:] == "am" else "pm"
	return convertTime(start), timestop

259

260
#everything in the house is fuzzy, stupid dogs were acting like pollinators, if that's how you even spell it