parscript.py 10.7 KB
Newer Older
1
2
3
# print "and we begin"

# third party imports
4
5
from bs4 import BeautifulSoup
import requests
6

7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# this function cleans up some of the useless html leftovers to characters we can actually use
def cleanup(dirtystring):
    """
    Replace leftover HTML entities/markup and boilerplate lines in a string.

    The replacements are applied one after another, in the order listed, to
    the result of the previous one. Afterwards the final character of the
    string is always dropped (an empty string stays empty).

    dirtystring: the text to clean.
    Returns the cleaned text as a new string.
    """
    # NOTE(review): the first two patterns were no-ops as written ("&" -> "&"
    # and " " -> " "), which looks like the entities were decoded somewhere in
    # transit. They are presumably "&amp;" and a non-breaking space; both the
    # entity and the literal character forms are handled here - please confirm.
    replacements = [
        ("&amp;", "&"),
        ("&nbsp;", " "),
        ("\u00a0", " "),  # literal non-breaking space
        ("&ndash;", "-"),
        ("\u2013", "-"),  # literal en dash
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("<br/>", "\n"),
        ("Publish event on the Calendar?: TRUE \n", ""),
        ("Performing any medical procedures?: FALSE \n", ""),
        ("Parking Needed?: FALSE \n", ""),
        ("\n\n\n", "\n"),
        ("\n\n", "\n"),
        ("&rsquo;", "'"),
    ]

    for old, new in replacements:
        # str.replace returns a new string, so the result has to be kept
        dirtystring = dirtystring.replace(old, new)

    # drop the last character of the cleaned text
    return dirtystring[:-1]
29
30


31
32
33
class eventException(Exception):  # this class is just an exception for our use
    """
    Error raised when an event entry cannot be parsed.

    It derives from Exception so that it can be raised and caught on Python 3
    (which refuses to raise or catch classes outside the BaseException
    hierarchy) and so that broad ``except Exception`` handlers see it too.
    """

    def __init__(self, message):
        super(eventException, self).__init__(message)
        self.__message = message

    def __str__(self):
        return self.__message
Landon DeCoito's avatar
Landon DeCoito committed
38

39
40
41
42
43
44

# convertTime accepts 12-hour clock strings such as "7pm", "7:30pm" or "11:15AM"
def convertTime(stri):  # this function is used for splicing the event times.
    """
    Convert a 12-hour clock string into minutes after midnight.

    stri: "hour" or "hour:minute" followed directly by an "am"/"AM" or
          "pm"/"PM" suffix, for example "7pm", "7:30pm" or "11:15AM".

    Returns the time as an int number of minutes, so "12am" gives 0,
    "9am" gives 540 and "12pm" gives 720.

    Raises eventException when the string does not end in am/AM/pm/PM and
    ValueError when the hour or the minutes are not numeric.
    """
    suffix = stri[-2:]
    if suffix not in ("am", "AM", "pm", "PM"):
        raise eventException("This is weird and please don't happen")

    # the hour is the leading one or two digits; int() raises ValueError when
    # the string does not start with a digit at all
    hour_text = stri[:2] if stri[:2].isdigit() else stri[:1]
    hour = int(hour_text)

    minutes = 0
    if ":" in stri:
        # one separator character follows the hour, then two characters of minutes
        minutes_at = len(hour_text) + 1
        minutes = int(stri[minutes_at:minutes_at + 2])

    if suffix in ("pm", "PM"):
        # the 12:00 hour is noon and must not get 12 added to it
        if hour != 12:
            hour += 12
    elif hour == 12:
        # the 12:00 am hour is midnight, i.e. hour zero of the day
        hour = 0

    return hour * 60 + minutes
73
74


75
def load_data():
    """
    Download the Mason 25Live events feed and parse every entry in it.

    Returns a list with one dict per <entry> tag of the feed. An entry that
    parsed cleanly has the keys "id", "title", "dayofweek", "dayofmonth",
    "month", "year", "timestart", "timestop", "location" and "description",
    where timestart/timestop are the values returned by convertTime and
    location is a list of strings. An entry that could not be parsed has only
    "id" and "error"; "error" is a list with one message per failed stage.

    Exceptions raised by requests while downloading the feed are not caught.
    """
    # only membership is tested: a date line always starts with one of these names
    DaysOfWeek = {
        "Sunday": 0,
        "Monday": 1,
        "Tuesday": 2,
        "Wednesday": 3,
        "Thursday": 4,
        "Friday": 5,
        "Saturday": 6,
    }

    notProvide = "Not Provided"
    dictlist = []

    # the timeout keeps an unresponsive server from blocking the caller forever
    response = requests.get("http://25livepub.collegenet.com/calendars/events_all.xml", timeout=30)
    soup = BeautifulSoup(cleanup(response.text), "lxml")
    # creates a list of all the entry tags from the xml
    entries = soup.find_all('entry')

    for entry in entries:
        error = []
        # .text drops all xml tags and returns just the text as a string
        entry_title = entry.title.text
        entry_content = entry.content.text
        # only the last nine characters of the id tag are used as the id
        uniqueid = entry.id.text[-9:]

        # makes it easy to find as things may be unevenly spaced
        entry_content = entry_content.replace("\n\n\n", "\n")
        entry_content = entry_content.replace("\n\n", "\n")
        entry_content = cleanup(entry_content)  # we might just get rid of this one

        # each piece of content is separated by a newline, entry_detailes is the list of pieces
        entry_detailes = entry_content.split("\n")

        # start every entry from a clean slate, so that a stage which fails can
        # never be papered over by values left behind by the previous entry
        location = date = description = None
        day = month = monthday = year = None
        time_text = timestart = timestop = None

        # Stage 1: work out which pieces are location, date and description.
        #
        # The date is always present and starts with a day of the week. The
        # location, when given, comes right before the date; the description,
        # when given, follows it. So the possible layouts are
        #   [date]
        #   [date, description]
        #   [location, date]
        #   [location, date, description]
        # plus one extra piece when a description was split in two by a
        # newline (this happened with at least one entry).
        try:
            if entry_detailes[0].split(",")[0] in DaysOfWeek:
                date_index = 0
            elif entry_detailes[1].split(",")[0] in DaysOfWeek:
                date_index = 1
            else:
                raise eventException("failed to find and account for date element in entry_detailes list")

            description_parts = entry_detailes[date_index + 1:]
            if len(description_parts) > 2:
                # more pieces after the date than the layouts above account for
                raise eventException("failed to account for detail in entry_detailes when date element is index %d on entry_detailes list" % date_index)

            location = entry_detailes[0] if date_index == 1 else notProvide
            date = entry_detailes[date_index]
            if description_parts:
                description = " ".join(description_parts)
            else:
                description = notProvide
        except eventException as e:
            error.append(str(e))
        except Exception:
            error.append("Error intialising event")

        # Stage 2: turn the location string into a list, cutting it at the campus name.
        try:
            if location != notProvide:
                # swap the last character for ", " so the campus separators below can match at the end
                location = location[:-1] + ", "
            if "Fairfax Campus" in location:
                location = location.split(", Fairfax Campus, ")[:-1]
            elif "Arlington Campus" in location:
                location = location.split(", Arlington Campus, ")[:-1]
            else:
                location = [location]
        except Exception:
            error.append("Error with location")

        # Stage 3: split the date line on commas into day of week, month,
        # day of month, year and the time range text.
        try:
            date_parts = date.split(",")
            day = date_parts[0]
            time_text = date_parts[3][1:]
            month_day_year = (date_parts[1][1:] + "," + date_parts[2]).split(" ")
            month = month_day_year[0]
            monthday = month_day_year[1][:-1]  # drops the last character of the day-of-month piece
            year = month_day_year[2]
        except Exception:
            error.append("Error with time/date splicing")

        # Stage 4: convert the "start-stop" time range with convertTime.
        try:
            time_range = time_text.replace(" ", "").split("-")
            start_text = time_range[0]
            stop_text = time_range[1]
            timestop = convertTime(stop_text)
            # a start without am/pm (in either case) takes the suffix of the stop time
            if start_text[-2:].lower() not in ("am", "pm"):
                start_text += stop_text[-2:]
            timestart = convertTime(start_text)
        except Exception:
            error.append("Error with time reformatting")

        if not error:
            dictlist.append({"id": uniqueid, "title": entry_title, "dayofweek": day, "dayofmonth": monthday, "month": month, "year": year, "timestart": timestart, "timestop": timestop, "location": location, "description": description})
        else:
            dictlist.append({"id": uniqueid, "error": error})
    return dictlist
268

269
# everything in the house is fuzzy, stupid dogs were acting like pollinators, if that's how you even spell it