"""Scrape people-search results from the PeopleFinder page at ``BASE_URL``."""
import re

import requests
from bs4 import BeautifulSoup

BASE_URL = "http://peoplefinder.gmu.edu/index.php"
MODE = {
    "standard": "mode=standard",
    "advanced": "mode=advanced",
}

# Compiled once at import time instead of on every paragraph. Both are used
# with ``.match`` and therefore only hit when the paragraph text *starts*
# with the label.
_MAJOR_RE = re.compile("Major: (.*)")
_EMAIL_RE = re.compile("Email: (.*)")


def _parse_person(container, include_raw=False):
    """Convert one ``<div class="person">`` element into a result dict.

    Only keys whose markup is present are set, so callers should use
    ``dict.get`` rather than indexing. ``phone`` and ``fax`` are the
    exception: when a paragraph contains a ``<span>`` (and no ``<acronym>``)
    both keys are set, with ``None`` for whichever span is missing.
    """
    result = {}
    if include_raw:
        # Debug aid: the serialized markup of every direct child.
        result["raw"] = [str(child) for child in container.children]

    for child in container.children:
        # Direct children include text nodes (whitespace between tags).
        # getattr keeps this safe on bs4 releases where strings do not
        # expose ``.name``.
        tag_name = getattr(child, "name", None)

        if tag_name == "h3":
            result["name"] = child.text
        elif tag_name == "p":
            content = child.text

            major_match = _MAJOR_RE.match(content)
            if major_match:
                result["major"] = major_match.group(1)

            email_match = _EMAIL_RE.match(content)
            if email_match:
                result["email"] = email_match.group(1)

            if child.find("acronym") is not None:
                # A paragraph holding an <acronym> is stored whole as the
                # address.
                result["address"] = child.text
            elif child.find("span") is not None:
                phone = child.find("span", {"class": "phone"})
                result["phone"] = phone.text if phone is not None else None

                fax = child.find("span", {"class": "fax"})
                # Drops the first five characters -- presumably a "Fax: "
                # label; TODO confirm against the live markup.
                result["fax"] = fax.text[5:] if fax is not None else None

    return result


def call_standard(search, group="all", page=1, timeout=10, include_raw=False):
    """Run a search against ``BASE_URL`` and parse one page of results.

    Args:
        search: Search text. Pass it raw (not pre-encoded); it is
            URL-encoded when the request is built.
        group: Value sent as the ``group`` query parameter.
        page: Value sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so a stalled connection cannot hang the caller forever.
        include_raw: When true, each result also carries a ``raw`` list with
            the serialized markup of the person element's children
            (replaces commenting out a line to debug).

    Returns:
        A dict with ``results`` (a list of per-person dicts, see
        ``_parse_person``) and ``hasNextPage`` (``True`` when the page
        contains an ``<li class="next">`` element).

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    # Let requests build the query string so characters such as "&", "#" or
    # "=" in the search text cannot corrupt or inject parameters. A list of
    # pairs keeps the original parameter order.
    params = [
        ("search", search),
        ("group", group),
        ("people", 100),
        ("page", page),
    ]
    response = requests.get(BASE_URL, params=params, timeout=timeout)

    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed, which makes results environment-dependent and emits a
    # warning.
    soup = BeautifulSoup(response.content, "html.parser")

    people = [
        _parse_person(container, include_raw)
        for container in soup.find_all("div", {"class": "person"})
    ]

    return {
        "results": people,
        "hasNextPage": soup.find("li", {"class": "next"}) is not None,
    }