-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtexas_house_data_collect.py
More file actions
119 lines (102 loc) · 3.61 KB
/
texas_house_data_collect.py
File metadata and controls
119 lines (102 loc) · 3.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
import re
import string
import json
import time
# This code will let me open the JSON from a previous scan of filed bills,
# compare it to what is already in the JSON, and then
# crawl only the ones that need to be added.
# I have not added appending to the JSON yet, but I need to run it tonight,
# and it's a good start.
#f = open('All_Texas_Senate_Bills.json')
#data = json.load(f)
#for key in data:
#print(key.keys())
#print(key['Bill Main Url'])
#all_keys.append(key['Bill Main Url'])
#difference = list(set(house_list_of_urls) - set(all_keys))
#print(len(difference))
# Link targets harvested from the report page; each one is crawled later.
house_list_of_urls = list()
# Accumulates one dict per scraped bill text; dumped to JSON by the script.
create_json = list()
# Report page (ID=housefiled, LegSess=89R) whose table rows link to the bills.
website = (
    'https://capitol.texas.gov/Reports/Report.aspx'
    '?LegSess=89R&ID=housefiled'
)
def make_soup1(url):
    """Download *url* and return its content parsed with BeautifulSoup.

    Args:
        url: Address to fetch; passed unchanged to ``urllib.request.urlopen``.

    Returns:
        A ``BeautifulSoup`` tree built with the ``lxml`` parser.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for a failed request
        (for example ``urllib.error.URLError``); nothing is caught here.
    """
    # The response is a context manager: parsing inside the ``with`` block
    # guarantees the connection is closed even if parsing fails, instead of
    # leaking one open response per fetched page.
    with urllib.request.urlopen(url) as thepage:
        return BeautifulSoup(thepage, "lxml")
# Fetch the report page once and collect the target of every link that sits
# inside a table row.
soup = make_soup1(website)
time.sleep(1)  # pause one second before the next request is issued
for record in soup.find_all('tr'):
    # href=True skips anchors that have no href attribute. Previously those
    # were stored as None and made the later fetch of that entry fail.
    for data in record.find_all('a', href=True):
        house_list_of_urls.append(data['href'])
print(len(house_list_of_urls))
# NOTE(review): the indentation of this section was lost in the copy it was
# reconstructed from. The nesting below (one record per HTML text document,
# JSON written once after the crawl) is the best reading -- please confirm.


def _last_text_by_id(page, tag, element_id):
    """Return the text of the last ``tag`` element whose id is ``element_id``.

    Every match is printed as progress output. An empty string is returned
    when the page has no such element, so a record can never inherit the value
    scraped from the previous bill.
    """
    value = ''
    for element in page.find_all(tag, id=element_id):
        print(element.text)
        value = element.text
    return value


def _committee_links(page):
    """Return the committee URLs linked from the ``cellComm1Committee`` cell."""
    links = []
    for cell in page.find_all('td', id='cellComm1Committee'):
        for anchor in cell.find_all('a', href=True):
            link = 'https://capitol.texas.gov/Committees/' + anchor['href']
            print(link)
            links.append(link)
    if not links:
        # The old per-anchor ``is None`` test could never be true, so this
        # message was unreachable; report the empty case explicitly instead.
        print('no committees yet')
    return links


def _bill_text_page_urls(page):
    """Return URLs of ``enabledButNotActive`` links whose href contains 'Text'."""
    return [
        'https://capitol.texas.gov/BillLookup/' + anchor['href']
        for row in page.find_all('tr')
        # href=True avoids a TypeError from ``'Text' in None``.
        for anchor in row.find_all('a', class_='enabledButNotActive', href=True)
        if 'Text' in anchor['href']
    ]


def _html_document_urls(page):
    """Return URLs of every table-row link whose href contains 'html'."""
    return [
        'https://capitol.texas.gov/' + anchor['href']
        for row in page.find_all('tr')
        for anchor in row.find_all('a', href=True)
        if 'html' in anchor['href']
    ]


def _document_text(page):
    """Return the text of all table cells on ``page`` joined by single spaces.

    The join happens once, after every cell has been collected, and the
    result is stripped of leading/trailing whitespace (the original computed
    a stripped copy but never used it).
    """
    cells = [
        cell.text
        for row in page.find_all('tr')
        for cell in row.find_all('td')
    ]
    return ' '.join(cells).strip()


def _bill_records(bill_url):
    """Yield one record dict per HTML text document linked from ``bill_url``.

    Args:
        bill_url: URL of a bill's main page, as harvested from the report.

    Yields:
        Dicts with the keys written to the output JSON. Records are yielded
        as soon as they are complete, so those produced before a later
        request fails are still kept by the caller.

    Raises:
        Whatever ``make_soup1`` raises when a page cannot be fetched.
    """
    main_page = make_soup1(bill_url)
    time.sleep(1)  # one-second pause after each request

    bill_name = _last_text_by_id(main_page, 'span', 'usrBillInfoTabs_lblBill')
    author = _last_text_by_id(main_page, 'td', 'cellAuthors')
    caption = _last_text_by_id(main_page, 'td', 'cellCaptionText')
    committees = _committee_links(main_page)

    for bill_text_url in _bill_text_page_urls(main_page):
        time.sleep(1)
        text_index_page = make_soup1(bill_text_url)
        for bill_text_html in _html_document_urls(text_index_page):
            time.sleep(1)
            # Each document gets its own text; nothing is carried over from
            # previously fetched documents of the same bill.
            bill = _document_text(make_soup1(bill_text_html))
            yield {
                'Bill Name': bill_name,
                'Bill Main Url': bill_url,
                'Author': author,
                'Caption': caption,
                # Key spelling kept as-is: it is part of the output format.
                'Commitee': committees,
                'Bill Text Url': bill_text_url,
                'Bill Text HTML': bill_text_html,
                'Bill Text': bill,
            }


for list_of_urls in house_list_of_urls:
    print(list_of_urls)
    try:
        for bill_record in _bill_records(list_of_urls):
            create_json.append(bill_record)
    except Exception as e:
        # Deliberate best-effort boundary: one failing bill (network error,
        # unexpected markup) must not abort the whole crawl.
        print(str(e))

# Serialise everything collected during the crawl in a single write.
json_data = json.dumps(create_json, indent=4)
with open('New_All_Texas_House_Bills.json', 'w') as f:
    f.write(json_data)