# -*- coding: utf-8 -*-
"""
Created on Sun Dec 04 16:08:55 2016
Parse Wikipedia page on 108th Congress, download information on members
@author: nschiff
BS4 documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
import socket, urllib2
from bs4 import BeautifulSoup
import os
from bs4.element import Tag
from pprint import pprint
working_path="C:\\Users\\nschiff"  # hard-coded Windows home directory -- adjust per machine
os.chdir(working_path)  # so any relative output paths land in working_path
#Open up URL and parse with BeautifulSoup
wikiURL = "https://en.wikipedia.org/wiki/108th_United_States_Congress"
response = urllib2.urlopen(wikiURL)
wikiHTML = response.read()
response.close()  # free the socket once the page body is in memory
soup=BeautifulSoup(wikiHTML)
#Look for text just before list of representatives
startText="The Members of the House of Representatives are preceded by the district number."
temp=soup.find('p',text=startText)
congress_table=temp.findNext("table", class_="multicol") #find table directly after startText
stateList=congress_table.find_all("h4") #each state has header text
currState=stateList[0].findNext("ul") #under each state is an unordered list of reps
curr_state_reps=currState.find_all("li") #each list item is a rep--careful with reps who change mid-term
#Show example of finding Jo Bonner's alma mater
first_state = stateList[0]
first_rep = curr_state_reps[0]
name_anchor = first_rep.a.findNext("a")  # second link in the <li> is the rep's name
Jo_Bonner = [
    first_state.a.getText(),   # current state
    first_rep.a.getText(),     # district
    name_anchor.getText(),     # name
]
#Follow the rep's bio link and pull the alma mater from the infobox
JB_link = "https://en.wikipedia.org" + name_anchor['href']
response = urllib2.urlopen(JB_link)
JB_soup = BeautifulSoup(response.read())
JB_table = JB_soup.find("table", class_="infobox vcard")
JB_univ = JB_table.find("a", title="Alma mater").findNext("td").getText()
Jo_Bonner.append(JB_univ)
#Now iterate through entire House Of Reps for 108th.
#Builds congressman_list: one [state, district, name, alma_mater] list per rep.
congressman_list=[]
for curr_state in stateList:
    curr_rep_list = curr_state.findNext("ul").find_all("li")  # the <ul> after each state header
    for ctr, curr_rep in enumerate(curr_rep_list):
        temp_rep = []
        temp_rep.append(curr_state.a.getText())  #current state
        print(curr_state.a.getText() + " district " + str(ctr))
        #If only one link then congressman is replacement (no separate district link)
        if len(curr_rep.find_all("a")) != 1:
            temp_rep.append(curr_rep.a.getText())  #district
            temp_rep.append(curr_rep.a.findNext("a").getText())  #name
            temp_rep_link = "https://en.wikipedia.org" + curr_rep.a.findNext("a")['href']  #link to bio page
        else:
            temp_rep.append(str(ctr - 1))  #use previous district
            temp_rep.append(curr_rep.a.getText())  #name
            temp_rep_link = "https://en.wikipedia.org" + curr_rep.a['href']  #link to bio page
        response = urllib2.urlopen(temp_rep_link)
        temp_rep_soup = BeautifulSoup(response.read())
        response.close()  # avoid leaking one socket per representative
        temp_rep_table = temp_rep_soup.find("table", class_="infobox vcard")
        #Some bio pages lack an infobox entirely; guard so one odd page doesn't abort the run.
        if temp_rep_table is None:
            alma_mater_link = None
        else:
            alma_mater_link = temp_rep_table.find("a", title="Alma mater")  # hoisted: was looked up twice
        if alma_mater_link is None:
            temp_rep.append("alma mater missing")
        else:
            temp_rep.append(alma_mater_link.findNext("td").getText())
        congressman_list.append(temp_rep)  #append indiv rep to list
pprint(congressman_list)
#Can now export to CSV or whatever