Commit 54810548 authored by Michael Bailey

add project data

parent 3c26c5ca
Pipeline #3050 failed with stage
in 14 seconds
# CI job: install the requirements, run the scraper, and publish its CSV
# output through the `build` artifact directory.
runner:
  image: library/python:2.7
  stage: build
  script:
    - pip install -r requirements.txt
    - python scrape.py > data.csv
    # The artifact path below is the `build` directory, so it has to exist
    # and the CSV has to be copied INTO it (source first, destination last).
    - mkdir -p build
    - cp data.csv build/
  artifacts:
    paths:
      - build
from lxml import html
import requests
import os
# Base URL of the discounts site; the index page name and the section stubs
# are appended to it when pages are fetched.
ROOT_URL = "http://hr.gmu.edu/worklife/discounts"
def writeRow(row, studious, link):
    """Build the record describing one table row.

    Args:
        row: Element for the row. It must provide ``xpath``; the nodes it
            yields must provide ``text_content`` (and ``get`` for anchors).
        studious: Value stored unchanged under the ``"studious"`` key.
        link: Fallback URL, used when the row holds no anchor with an
            ``href`` attribute.

    Returns:
        A one-element list holding a dict with the keys ``title``,
        ``footer``, ``description``, ``studious`` and ``link``.
    """
    # Prefer the row's own hyperlink. Look the attribute up by name: taking
    # the first attribute value would pick up e.g. ``target`` or ``style``
    # whenever ``href`` is not written first.
    anchors = row.xpath('.//a')
    href = anchors[0].get("href") if anchors else None
    if href:
        link = href

    # Keep only the paragraphs/spans that contain visible text.
    text = []
    for node in row.xpath('.//p|.//span'):
        content = node.text_content()
        if content.strip():
            text.append(content)

    # Sparse rows must not abort the whole scrape: missing parts become "".
    title = text[0].strip() if text else ""
    footer = text[1].strip() if len(text) > 1 else ""

    descrip = " ".join(text[2:])
    # Collapse runs of spaces; each pass shortens the string, so this ends.
    while "  " in descrip:
        descrip = descrip.replace("  ", " ")
    # Double quotes become single quotes so the text can sit inside a
    # double-quoted CSV field.
    descrip = descrip.replace("\r\n", "").replace(". .", ".").replace('"', "'")
    # Drop non-ASCII characters but hand back text (not bytes) so the value
    # formats the same way on Python 2 and Python 3.
    descrip = descrip.strip().encode('ascii', 'ignore').decode('ascii')

    return [{
        "title": title,
        "footer": footer,
        "description": descrip,
        "studious": studious,
        "link": link,
    }]
def processSection(stub, timeout=30):
    """Fetch one section page and turn its table rows into records.

    Args:
        stub: Page name appended to ``ROOT_URL`` to form the section URL.
        timeout: Seconds to wait for the HTTP response before giving up,
            so a stalled server cannot hang the run indefinitely.

    Returns:
        A list with one ``writeRow`` result per accepted ``<tr>``. Rows
        whose text contains 'student' or 'community' are recorded with
        ``studious=True``; any other row is recorded with
        ``studious=False`` only if it has more than two ``p``/``span``
        descendants.
    """
    url = "{0}/{1}".format(ROOT_URL, stub)
    page = requests.get(url, timeout=timeout)
    tree = html.fromstring(page.content)

    sect = []
    # Rows are searched across the whole document.
    for row in tree.xpath('.//tr'):
        content = row.text_content()
        if 'student' in content or 'community' in content:
            sect.append(writeRow(row, True, url))
        elif len(row.xpath('.//p|.//span')) > 2:
            sect.append(writeRow(row, False, url))
    return sect
def getSections(timeout=30):
    """Collect the distinct section page names linked from the index page.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of page names in first-seen order. Each name is the last
        path segment of a matching link's ``href`` with any ``#fragment``
        removed.
    """
    page = requests.get("{0}/index.php".format(ROOT_URL), timeout=timeout)
    tree = html.fromstring(page.content)
    # Section links are recognised purely by their inline style attribute.
    selector = tree.xpath('//a[@style="text-decoration: none; font-size: 14px; color: #006633; font-weight: normal;"]')

    stubs = []
    for link in selector:
        href = link.get("href")
        if href is None:
            # A styled anchor without a target carries no section name.
            continue
        root = href.split("/")[-1].split("#")[0]
        if root not in stubs:
            stubs.append(root)
    return stubs
def _ascii_text(value):
    """Return *value* as text with every non-ASCII character dropped."""
    if isinstance(value, bytes):
        return value.decode('ascii', 'ignore')
    return value.encode('ascii', 'ignore').decode('ascii')


def generate(rows):
    """Print the scraped records to standard output as CSV.

    Args:
        rows: A list of sections; each section is a list of one-element
            lists holding a record dict with the keys ``title``,
            ``footer``, ``studious`` and ``description``.

    A header line is written first. Every field is double-quoted, and only
    the first record seen for a given title is written.
    """
    import csv
    import sys

    # csv.writer doubles embedded quotes, which hand-built lines did not.
    # "\n" keeps the line endings that print() produced.
    writer = csv.writer(sys.stdout, quoting=csv.QUOTE_ALL, lineterminator="\n")
    writer.writerow(
        ["title", "subtitle", "student friendly (guess)", "long description"]
    )

    seen = set()
    for section in rows:
        for entry in section:
            record = entry[0]
            title = record['title']
            if title in seen:
                continue
            seen.add(title)
            writer.writerow([
                _ascii_text(title),
                _ascii_text(record['footer']),
                record['studious'],
                _ascii_text(record['description']),
            ])
# Scrape every section listed on the index page, then print the combined CSV.
rows = [processSection(stub) for stub in getSections()]
generate(rows)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment