# -*- coding: utf-8 -*-
"""
Created on Sun Dec 04 16:08:55 2016
Parse Wikipedia page on 108th Congress, download information on members
@author: nschiff

BS4 documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""

import os
import time
from pprint import pprint
from urllib.request import urlopen

from bs4 import BeautifulSoup


working_path = "C:\\Users\\nschiff"  # machine-specific; point this at your own working directory
os.chdir(working_path)

#Open up URL and parse with BeautifulSoup
wikiURL = "https://en.wikipedia.org/wiki/108th_United_States_Congress"
response = urlopen(wikiURL)
wikiHTML = response.read()
soup = BeautifulSoup(wikiHTML, "html.parser")  # name a parser explicitly to avoid bs4's "no parser specified" warning

#Look for text just before list of representatives
startText = "The Members of the House of Representatives are preceded by the district number."
temp = soup.find("p", string=startText)
congress_table = temp.find_next("table", class_="multicol")  # find table directly after startText
stateList = congress_table.find_all("h4")  # each state has <h4> header text
currState = stateList[0].find_next("ul")  # under each state is an unordered list, <ul>, of reps
curr_state_reps = currState.find_all("li")  # each list item is a rep--careful with reps who change mid-term
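
# Quick sanity check on the parsed structure (an assumption, not a guarantee:
# counts can shift as the Wikipedia page is revised)
print(str(len(stateList)) + " state headers found; first is " + stateList[0].a.getText())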

#Show example of finding Jo Bonner's alma mater
Jo_Bonner = []
Jo_Bonner.append(stateList[0].a.getText())  # current state
Jo_Bonner.append(curr_state_reps[0].a.getText())  # district
Jo_Bonner.append(curr_state_reps[0].a.find_next("a").getText())  # name

JB_link = "https://en.wikipedia.org" + curr_state_reps[0].a.find_next("a")["href"]
response = urlopen(JB_link)
JB_soup = BeautifulSoup(response.read(), "html.parser")

JB_table = JB_soup.find("table", class_="infobox vcard")
JB_univ = JB_table.find("a", title="Alma mater").find_next("td").getText()
Jo_Bonner.append(JB_univ)
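
# Inspect the assembled example record: [state, district, name, alma mater]
pprint(Jo_Bonner)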

#Now iterate through the entire House of Reps for the 108th
congressman_list = []
for curr_state in stateList:
    curr_rep_list = curr_state.find_next("ul").find_all("li")
    for ctr, curr_rep in enumerate(curr_rep_list):
        temp_rep = []
        temp_rep.append(curr_state.a.getText())  # current state
        print(curr_state.a.getText() + " district " + str(ctr))
        #A normal entry has two links (district number + name); a single link
        #means this congressman is a mid-term replacement
        if len(curr_rep.find_all("a")) != 1:
            temp_rep.append(curr_rep.a.getText())  # district
            temp_rep.append(curr_rep.a.find_next("a").getText())  # name
            temp_rep_link = "https://en.wikipedia.org" + curr_rep.a.find_next("a")["href"]  # link to bio page
        else:
            temp_rep.append(str(ctr - 1))  # stand-in: previous list index, since a replacement serves the same district
            temp_rep.append(curr_rep.a.getText())  # name
            temp_rep_link = "https://en.wikipedia.org" + curr_rep.a["href"]  # link to bio page
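
        # Be polite when fetching the ~435 member bio pages: pause between
        # requests (assumption: one second is enough; tune as needed)
        time.sleep(1)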
            
        response = urlopen(temp_rep_link)
        temp_rep_soup = BeautifulSoup(response.read(), "html.parser")
        temp_rep_table = temp_rep_soup.find("table", class_="infobox vcard")
        #Guard against pages with no infobox at all, not just a missing alma mater row
        if temp_rep_table is None or temp_rep_table.find("a", title="Alma mater") is None:
            temp_rep.append("alma mater missing")
        else:
            temp_rep.append(temp_rep_table.find("a", title="Alma mater").find_next("td").getText())
        congressman_list.append(temp_rep) #append indiv rep to list
        
pprint(congressman_list)
#Can now export to CSV or whatever
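
# A minimal CSV export sketch (assumption: the filename house108_members.csv
# is arbitrary; any writable path works)
import csv

with open("house108_members.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["state", "district", "name", "alma_mater"])  # header row
    writer.writerows(congressman_list)  # one row per representative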