# parscript.py
# print "and we begin"

# third-party imports
from bs4 import BeautifulSoup
import requests
6

7 8 9 10 11 12 13 14 15 16 17 18 19 20
_MONTH_DICT = {
    "January": 1,
    "Febuary": 2,
    "March": 3,
    "April": 4,
    "May": 5,
    "June": 6,
    "July": 7,
    "August": 8,
    "September": 9,
    "October": 10,
    "November": 11,
    "December": 12
}
21

# this function cleans up some of the useless html leftovers to characters we can actually use
def cleanup(dirtystring):
    """Replace HTML leftovers and boilerplate lines in *dirtystring*.

    The replacements are applied one after another in the order listed, so
    the output of an earlier replacement can be matched by a later one
    (for example "&amp;lt;" becomes "&lt;" and then "<").

    Args:
        dirtystring: the text to clean.

    Returns:
        The cleaned text with its final character removed.  The last
        character is dropped unconditionally, whatever it is; an empty
        input yields an empty string.
    """
    replacements = (
        ("&amp;", "&"),
        # Non-breaking space, both as an entity and as the literal character.
        ("&nbsp;", " "),
        ("\u00a0", " "),
        # En dash, both as an entity and as the literal character.
        ("&ndash;", "-"),
        ("\u2013", "-"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("<br/>", "\n"),
        ("Publish event on the Calendar?: TRUE \n", ""),
        ("Performing any medical procedures?: FALSE \n", ""),
        ("Parking Needed?: FALSE \n", ""),
        # Collapse runs of blank lines (triple first, then double).
        ("\n\n\n", "\n"),
        ("\n\n", "\n"),
        ("&rsquo;", "'"),
    )

    for needle, substitute in replacements:
        dirtystring = dirtystring.replace(needle, substitute)

    return dirtystring[:-1]

# Simple event quality test
def qualityTest(desc):
    """Grade an event description by its length.

    Args:
        desc: the description text, or the placeholder "Not Provided".

    Returns:
        One of "none", "bad", "okay", "good", "verygood" or "excellent".
        "none" is returned only for the exact placeholder "Not Provided";
        otherwise the grade is the first bucket whose exclusive upper
        length bound is not reached (10, 40, 80, 100), and "excellent"
        for 100 characters or more.
    """
    if desc == "Not Provided":
        return "none"

    length = len(desc)
    # (exclusive upper bound on length, grade), checked in ascending order.
    grades = (
        (10, "bad"),
        (40, "okay"),
        (80, "good"),
        (100, "verygood"),
    )
    for upper_bound, grade in grades:
        if length < upper_bound:
            return grade
    return "excellent"

# convertTime accepts strings in the form of "<hour>[:<minute>]<am|pm>",
# e.g. "9am", "1:30pm" or "12:05AM"
def convertTime(stri):  # this function is used for splicing the event times.
    """Convert a 12-hour clock string into minutes after midnight.

    Args:
        stri: a time such as "9am", "10:15AM" or "1:30pm".  The am/pm
            suffix is required and matched case-insensitively; the minutes
            part is optional.

    Returns:
        The time as an integer number of minutes since midnight.
        "12am" is midnight (0) and "12pm" is noon (720).

    Raises:
        ValueError: if the suffix is missing, the hour or minute is not a
            number, or the hour/minute is out of range.  ValueError is a
            subclass of Exception, so callers catching Exception still
            catch it.
    """
    message = "Issue with time dilation. Input string: " + stri

    text = stri.strip()
    meridiem = text[-2:].lower()
    if meridiem not in ("am", "pm"):
        raise ValueError(message)

    hour_text, separator, minute_text = text[:-2].partition(":")
    try:
        hour = int(hour_text)
        # A colon with nothing usable after it is malformed, not "zero minutes".
        minute = int(minute_text) if separator else 0
    except ValueError:
        raise ValueError(message)

    if not (0 <= hour <= 12 and 0 <= minute <= 59):
        raise ValueError(message)

    # On a 12-hour clock the 12 o'clock hour is hour zero of its half-day:
    # 12am -> 00:xx, 12pm -> 12:xx.
    hour %= 12
    if meridiem == "pm":
        hour += 12
    return hour * 60 + minute

def filter_data_into_days(dictlist):
    """Group successfully parsed events by calendar day.

    Args:
        dictlist: an iterable of event dicts.  Dicts containing an "error"
            key are skipped; every other dict must have "dayofmonth",
            "month" (a key of _MONTH_DICT) and "year".

    Returns:
        A dict mapping "day/month/year" strings (month as its number from
        _MONTH_DICT) to the list of events on that day, in input order.

    Raises:
        KeyError: if a non-error event lacks one of the date fields or its
            month name is not in _MONTH_DICT.
    """
    events_by_day = {}
    for event in dictlist:
        if "error" in event:
            continue

        event_date = "{}/{}/{}".format(event["dayofmonth"],
                                       _MONTH_DICT[event["month"]],
                                       event["year"])
        events_by_day.setdefault(event_date, []).append(event)
    return events_by_day

# Placeholder used when an entry has no location or no description.
_NOT_PROVIDED = "Not Provided"

# The date line of an entry always starts with one of these day names.
_DAYS_OF_WEEK = frozenset((
    "Sunday",
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
))

_EVENTS_FEED_URL = "http://25livepub.collegenet.com/calendars/events_all.xml"


def _error_event(uniqueid, message):
    """Build the record stored for an entry that could not be parsed."""
    return {"id": uniqueid, "error": [message]}


def _split_details(details):
    """Pick location, date line and description out of an entry's lines.

    The location (if present) always precedes the date line and the
    description (if present) always follows it, so the date line is at
    index 0 or 1:

        [date]
        [date, description]
        [location, date]
        [location, date, description]

    A description that was split in two by a newline is also accepted and
    re-joined with a single space.

    Returns:
        A (location, date, description) tuple; a missing location or
        description is "Not Provided".

    Raises:
        ValueError: if no date line is found at index 0 or 1, or more than
            two lines follow the date line.
    """
    if details[0].split(",")[0] in _DAYS_OF_WEEK:
        date_index = 0
    elif len(details) > 1 and details[1].split(",")[0] in _DAYS_OF_WEEK:
        date_index = 1
    else:
        raise ValueError("failed to find and account for date element in entry_detailes list")

    after_date = details[date_index + 1:]
    if len(after_date) > 2:
        raise ValueError(
            "failed to account for detail in entry_detailes when date element "
            "is index %d on entry_detailes list" % date_index)

    location = details[0] if date_index == 1 else _NOT_PROVIDED
    description = " ".join(after_date) if after_date else _NOT_PROVIDED
    return location, details[date_index], description


def _split_location(location):
    """Turn a raw location line into a list of places, without the campus.

    A provided location has its final character dropped and ", " appended
    before it is split on ", Fairfax Campus, " or ", Arlington Campus, ".
    NOTE(review): the dropped character is presumably a trailing separator
    left in the feed text -- confirm against real feed data.

    Returns:
        A list of strings: the pieces before the campus marker, or a
        one-element list holding the location when no campus is named.
    """
    if location != _NOT_PROVIDED:
        location = location[:-1] + ", "

    for campus in ("Fairfax Campus", "Arlington Campus"):
        if campus in location:
            # The piece after the campus marker is discarded.
            return location.split(", " + campus + ", ")[:-1]
    return [location]


def _split_date(date):
    """Split a date line into its parts.

    The line is split on commas into day of week, "Month D", year and
    time range, e.g. "Monday, January 1, 2018, 1 - 2pm".

    Returns:
        A (day, month, monthday, year, time_range) tuple of strings.

    Raises:
        IndexError: if the line has fewer parts than expected.
    """
    parts = date.split(",")
    day = parts[0]
    time_range = parts[3][1:]
    # "Month D" + "," + " YYYY" -> ["Month", "D,", "YYYY"]
    calendar_parts = (parts[1][1:] + "," + parts[2]).split(" ")
    month = calendar_parts[0]
    monthday = calendar_parts[1][:-1]  # strip the trailing comma
    year = calendar_parts[2]
    return day, month, monthday, year, time_range


def _parse_time_range(time_range):
    """Convert a "start-stop" time range into minutes after midnight.

    When the start time has no am/pm suffix it inherits the suffix of the
    stop time ("am" if the stop ends in "am", otherwise "pm").

    Returns:
        A (timestart, timestop) tuple as returned by convertTime.

    Raises:
        IndexError: if the range has no "-" separated stop time.
        Exception: whatever convertTime raises for an unparsable time.
    """
    start, stop = time_range.replace(" ", "").split("-")[:2]
    timestop = convertTime(stop)
    if start[-2:] not in ("am", "pm"):
        start += "am" if stop[-2:] == "am" else "pm"
    return convertTime(start), timestop


def _parse_entry(entry):
    """Parse one feed <entry> into an event dict or an error record.

    Parsing is best effort: any failure is recorded as
    {"id": ..., "error": [message]} instead of being raised, and parsing
    of that entry stops at the first failing stage so no stage ever works
    on values left over from another entry.
    """
    try:
        uniqueid = entry.id.text[-9:]
    except Exception:
        uniqueid = "Error with getting ID"

    try:
        entry_title = entry.title.text
        content = entry.content.text
        # Collapse blank lines first: the pieces may be unevenly spaced.
        content = content.replace("\n\n\n", "\n").replace("\n\n", "\n")
        # Each piece of content is separated by a newline.
        details = cleanup(content).split("\n")
        location, date, description = _split_details(details)
    except Exception as e:
        return _error_event(uniqueid, str(e))

    try:
        location = _split_location(location)
    except Exception as e:
        return _error_event(uniqueid, "Location Error: " + str(e))

    try:
        day, month, monthday, year, time_range = _split_date(date)
    except Exception as e:
        return _error_event(uniqueid, "Date Error: " + str(e))

    try:
        timestart, timestop = _parse_time_range(time_range)
    except Exception as e:
        return _error_event(uniqueid, "Time Dilation Error: " + str(e))

    return {
        "id": uniqueid,
        "quality": qualityTest(description),
        "title": entry_title,
        "dayofweek": day,
        "dayofmonth": monthday,
        "month": month,
        "year": year,
        "timestart": timestart,
        "timestop": timestop,
        "location": location,
        "description": description,
    }


def load_data():
    """Download the events feed and return its events grouped by day.

    The feed text is passed through cleanup, parsed with BeautifulSoup
    ("lxml" parser), and every <entry> tag is turned into an event dict
    with the keys "id", "quality", "title", "dayofweek", "dayofmonth",
    "month", "year", "timestart", "timestop", "location" and
    "description".  Entries that cannot be parsed become
    {"id": ..., "error": [...]} records instead.

    Returns:
        The result of filter_data_into_days for the parsed entries.
    """
    feed_text = requests.get(_EVENTS_FEED_URL).text
    soup = BeautifulSoup(cleanup(feed_text), "lxml")
    dictlist = [_parse_entry(entry) for entry in soup.find_all("entry")]
    return filter_data_into_days(dictlist)

# everything in the house is fuzzy, stupid dogs were acting like pollinators, if that's how you even spell it