parscript.py 12.1 KB
Newer Older
1 2 3
# print "and we begin"

# third party imports
4
from bs4 import BeautifulSoup
5
from datetime import datetime
6
import requests
7 8
import collections

9

10 11 12 13 14 15 16 17 18 19 20 21 22 23
_MONTH_DICT = {
    "January": 1,
    "Febuary": 2,
    "March": 3,
    "April": 4,
    "May": 5,
    "June": 6,
    "July": 7,
    "August": 8,
    "September": 9,
    "October": 10,
    "November": 11,
    "December": 12
}
Landon DeCoito's avatar
Landon DeCoito committed
24

25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
# this function turns leftover html entities/markup into plain characters we can actually use
def cleanup(dirtystring):
    """Replace HTML leftovers and boilerplate lines, then drop the last character.

    The substitutions are applied in order, so "&amp;" is decoded first and
    a double-escaped entity such as "&amp;nbsp;" is decoded by a later pair.

    Args:
        dirtystring: text to clean.

    Returns:
        The cleaned text with its final character removed. The final
        character is always dropped, whatever it is (presumably a trailing
        newline or space left by the feed -- confirm against real data).
    """
    replacements = (
        ("&amp;", "&"),
        ("&nbsp;", " "),
        ("\u00a0", " "),  # literal non-breaking space
        ("&ndash;", "-"),
        ("\u2013", "-"),  # literal en dash
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("<br/>", "\n"),
        ("Publish event on the Calendar?: TRUE \n", ""),
        ("Performing any medical procedures?: FALSE \n", ""),
        ("Parking Needed?: FALSE \n", ""),
        ("\n\n\n", "\n"),
        ("\n\n", "\n"),
        ("&rsquo;", "'"),
    )

    for old, new in replacements:
        dirtystring = dirtystring.replace(old, new)

    return dirtystring[:-1]
46

47 48 49 50 51
# Simple event quality test
def qualityTest(desc):
    """Rate an event description by its length.

    Args:
        desc: the description text, or "Not Provided" when the event has none.

    Returns:
        "none" for the "Not Provided" placeholder; otherwise "bad" (under 10
        characters), "okay" (under 40), "good" (under 80), "verygood"
        (under 100) or "excellent" (100 or more).
    """
    length = len(desc)

    if desc == "Not Provided":
        return "none"

    # (exclusive upper bound, rating), checked from shortest to longest
    thresholds = (
        (10, "bad"),
        (40, "okay"),
        (80, "good"),
        (100, "verygood"),
    )
    for limit, rating in thresholds:
        if length < limit:
            return rating
    return "excellent"
64

65 66 67 68 69
# convertTime accepts strings in the form of "9am", "9:30am", "12pm" or "10:15PM"
def convertTime(stri):
    """Convert a 12-hour clock string into minutes after midnight.

    The string must end in "am" or "pm" (any letter case). What precedes it
    is either an hour ("9") or an hour and minutes ("9:30"). 12am is
    midnight (0) and 12pm is noon (720).

    Args:
        stri: the time text, e.g. "1:30pm".

    Returns:
        The number of minutes after midnight as an int.

    Raises:
        ValueError: if the string has no am/pm suffix, the hour or minutes
            are not numbers, or they are outside 0-12 / 0-59.
    """
    failure = "Issue with time dilation. Input string: " + stri

    meridiem = stri[-2:].lower()
    if meridiem not in ("am", "pm"):
        raise ValueError(failure)

    hour_text, colon, minute_text = stri[:-2].partition(":")
    try:
        hour = int(hour_text)
        # only the first two characters after the colon are minutes
        minute = int(minute_text[:2]) if colon else 0
    except ValueError:
        raise ValueError(failure)

    if not (0 <= hour <= 12 and 0 <= minute <= 59):
        raise ValueError(failure)

    # 12am is hour 0 and 12pm is hour 12, so fold 12 to 0 before adding
    # the afternoon offset.
    hour %= 12
    if meridiem == "pm":
        hour += 12
    return hour * 60 + minute
98

99

100
def filter_data_into_days(dictlist):
    """Group parsed events by calendar day.

    Events that carry an "error" key are skipped. The rest are grouped
    under a "dayofmonth/month-number/year" key, in order of each day's
    first appearance in ``dictlist``.

    Args:
        dictlist: list of event dicts; each non-error event must have
            "dayofmonth", "month" (a key of ``_MONTH_DICT``) and "year".

    Returns:
        A list with one dict per day, holding "date", "datetime" (ISO
        format with a space separator), "year", "month", "dayofmonth" and
        "data" (the list of that day's events).

    Raises:
        KeyError: if an event's month name is not in ``_MONTH_DICT``.
        ValueError: if the year/day are not integers or do not form a
            valid date.
    """
    day_dict = collections.OrderedDict()
    for event in dictlist:
        if "error" in event:
            continue

        month_number = _MONTH_DICT[event["month"]]
        event_date = "{}/{}/{}".format(event["dayofmonth"],
                                       month_number,
                                       event["year"])
        if event_date in day_dict:
            day_dict[event_date]["data"].append(event)
            continue

        timestamp = datetime(
            int(event["year"]), month_number, int(event["dayofmonth"]))
        day_dict[event_date] = {
            "date": event_date,
            "datetime": timestamp.isoformat(' '),
            "year": event["year"],
            "month": event["month"],
            "dayofmonth": event["dayofmonth"],
            "data": [event],
        }

    return list(day_dict.values())
120

121
def _split_entry_details(details, days_of_week, not_provided):
    """Split an entry's content lines into (location, date, description).

    The date line is the one whose text before the first comma is a weekday
    name; it must be at index 0 or 1. A line before it is the location; one
    or two lines after it form the description (two lines are joined with a
    space, covering descriptions that were broken by a newline).

    Raises:
        ValueError: if no date line is found at index 0 or 1, or more than
            two lines follow the date line.
        IndexError: if there is a single line and it is not a date line.
    """
    if details[0].split(",")[0] in days_of_week:
        date_index = 0
    elif details[1].split(",")[0] in days_of_week:
        date_index = 1
    else:
        raise ValueError("failed to find and account for date element in entry_detailes list")

    trailing = details[date_index + 1:]
    if len(trailing) > 2:
        raise ValueError(
            "failed to account for detail in entry_detailes when date element is index {} on entry_detailes list".format(date_index))

    location = details[0] if date_index == 1 else not_provided
    description = " ".join(trailing) if trailing else not_provided
    return location, details[date_index], description


def _split_location(location, not_provided):
    """Turn the location line into a list of places with the campus suffix removed."""
    if location != not_provided:
        # Drop the final character and add ", " so every place, including
        # the last, ends with ", <campus>, " (presumably the dropped
        # character is trailing whitespace -- confirm against the feed).
        location = location[:-1] + ", "
    for campus in ("Fairfax Campus", "Arlington Campus"):
        if campus in location:
            # the piece after the final campus marker is not a place
            return location.split(", " + campus + ", ")[:-1]
    return [location]


def _parse_date(date_text):
    """Split "Weekday, Month D, YYYY, <times>" into its fields.

    Returns:
        (day, month, monthday, year, time_text), all strings.

    Raises:
        IndexError: if the text has fewer comma/space separated fields
            than expected.
    """
    fields = date_text.split(",")
    day = fields[0]
    time_text = fields[3][1:]
    month_day_year = (fields[1][1:] + "," + fields[2]).split(" ")
    month = month_day_year[0]
    monthday = month_day_year[1][:-1]  # strip the comma after the day number
    year = month_day_year[2]
    return day, month, monthday, year, time_text


def _parse_time_range(time_text):
    """Convert "start - stop" into (timestart, timestop) via convertTime.

    When the start has no am/pm suffix it takes the stop's suffix
    ("1 - 2pm" reads as 1pm to 2pm).
    """
    pieces = time_text.replace(" ", "").split("-")
    start_text, stop_text = pieces[0], pieces[1]

    timestop = convertTime(stop_text)
    if timestop is None:  # defensive: kept from the original implementation
        raise ValueError(str(pieces))

    if start_text[-2:].lower() not in ("am", "pm"):
        start_text += "am" if stop_text[-2:].lower() == "am" else "pm"
    return convertTime(start_text), timestop


def load_data(timeout=30):
    """Download the event feed and return its events grouped by day.

    Every <entry> in the feed becomes either an event dict (keys "id",
    "quality", "title", "dayofweek", "dayofmonth", "month", "year",
    "timestart", "timestop", "location", "description") or, when any part
    of it cannot be parsed, an ``{"id": ..., "error": [...]}`` dict. The
    list is then passed through ``filter_data_into_days``.

    Args:
        timeout: seconds to wait for the feed server before giving up.

    Returns:
        The list produced by ``filter_data_into_days``.

    Raises:
        requests.exceptions.RequestException: if the feed cannot be fetched.
    """
    dictlist = []
    DaysOfWeek = {
        "Sunday": 0,
        "Monday": 1,
        "Tuesday": 2,
        "Wednesday": 3,
        "Thursday": 4,
        "Friday": 5,
        "Saturday": 6,
    }

    notProvide = "Not Provided"

    response = requests.get("http://25livepub.collegenet.com/calendars/events_all.xml", timeout=timeout)
    soup = BeautifulSoup(cleanup(response.text), "lxml")
    # creates a list of all the entry tags from the xml
    entries = soup.find_all('entry')

    for entry in entries:
        error = []
        try:
            uniqueid = entry.id.text[-9:]
        except Exception:
            uniqueid = "Error with getting ID"

        # Title, content lines and the location/date/description split.
        # Nothing later can run without these, so a failure here records
        # the entry as an error and moves on instead of parsing values
        # left over from the previous entry.
        try:
            entry_title = entry.title.text
            entry_content = entry.content.text

            # collapse blank lines first, as things may be unevenly spaced
            entry_content = entry_content.replace("\n\n\n", "\n")
            entry_content = entry_content.replace("\n\n", "\n")
            entry_content = cleanup(entry_content)

            # each piece of content is separated by a newline
            entry_detailes = entry_content.split("\n")
            location, date_text, description = _split_entry_details(
                entry_detailes, DaysOfWeek, notProvide)
        except Exception as e:
            error.append(str(e))
            dictlist.append({"id": uniqueid, "error": error})
            continue

        try:
            location = _split_location(location, notProvide)
        except Exception as e:
            error.append("Location Error: " + str(e))

        try:
            day, month, monthday, year, time_text = _parse_date(date_text)
        except Exception as e:
            error.append("Date Error: " + str(e))
        else:
            # the time range comes out of the date line, so it is only
            # parsed when the date line itself was understood
            try:
                timestart, timestop = _parse_time_range(time_text)
            except Exception as e:
                error.append("Time Dilation Error: " + str(e))

        if error:
            dictlist.append({"id": uniqueid, "error": error})
            continue

        quality = qualityTest(description)
        dictlist.append({"id": uniqueid, "quality": quality, "title": entry_title, "dayofweek": day, "dayofmonth": monthday, "month": month,
                         "year": year, "timestart": timestart, "timestop": timestop, "location": location, "description": description})

    return filter_data_into_days(dictlist)
317

318
# everything in the house is fuzzy, stupid dogs were acting like pollinators, if that's how you even spell it
319