from bs4 import BeautifulSoup
import requests
import re
import json


def getLocations():
    """Return the list of Speedqueen location slugs for GMU laundry rooms.

    Scrapes https://housing.gmu.edu/laundry and extracts every slug that
    appears in a ``quantum.speedqueen.com/wa/<slug>`` link.  Returns an
    empty list on a non-200 response or any network failure.
    """
    try:
        r = requests.get('https://housing.gmu.edu/laundry')
    except requests.exceptions.RequestException:
        return []
    if r.status_code != 200:
        # Don't just assume the site doesn't time out or shit itself
        return []
    # Isolate Speedqueen URLs; raw string so backslash escapes reach re intact
    return re.findall(r'(?<=quantum\.speedqueen\.com\/wa\/)(.*)(?=\" )', r.text)


def scrapeLaundry(location):
    """Return machine info for one laundry *location* slug.

    Fetches the Speedqueen page for the slug, follows its status iframe,
    and returns a list of ``[name, type, status, time]`` entries — one per
    machine row (rows carrying a ``class`` attribute; header rows do not).
    The time field is blanked unless the machine is "In use".  Returns an
    empty list on any HTTP or network failure.
    """
    machines = []
    try:
        r = requests.get("http://quantum.speedqueen.com/wa/{0}".format(location))
        if r.status_code != 200:
            return machines
        outerparse = BeautifulSoup(r.text, 'html.parser')
        # The live machine table lives inside an iframe on the outer page.
        innerframe = outerparse.iframe['src']
        innerreq = requests.get(innerframe)
        if innerreq.status_code != 200:
            return machines
        innerparse = BeautifulSoup(innerreq.text, 'html.parser')
        for row in innerparse.find_all('tr'):
            # Header/filler rows carry no class attribute; skip them.
            # (.get avoids the bare try/except the original used here.)
            if not row.get('class'):
                continue
            # Reset per row so a missing cell can't inherit the previous
            # row's value (the original never reset these).
            name = machine_type = status = time_left = ""
            for cell in row.find_all('td'):
                cls = cell.get('class')
                if cls == ['name']:
                    name = cell.text
                elif cls == ['type']:
                    machine_type = cell.text
                elif cls == ['status']:
                    status = cell.text
                elif cls == ['time']:
                    time_left = cell.text
            obj = [name, machine_type, status, time_left]
            # Remaining time is only meaningful while the machine runs.
            if obj[2] != "In use":
                obj[3] = ""
            machines.append(obj)
    except requests.exceptions.RequestException:
        return []
    return machines


if __name__ == '__main__':
    # One entry per location: the slug followed by its machine rows.
    entirecampus = []
    for loc in getLocations():
        entirecampus.append([loc] + scrapeLaundry(loc))
    print(entirecampus)