-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtmx_scraper.py
More file actions
executable file
·109 lines (98 loc) · 3.37 KB
/
tmx_scraper.py
File metadata and controls
executable file
·109 lines (98 loc) · 3.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python
# TMX Scraper for stock symbols and their rate of return.
# Inspired by the following web pages:
#https://stackoverflow.com/questions/8049520/web-scraping-javascript-page-with-python
#https://stackoverflow.com/questions/10309550/python-beautifulsoup-iterate-over-table
#https://stackoverflow.com/questions/37926684/entering-value-into-search-bar-and-downloading-output-from-webpage
###################################
###################################
# import dryscrape
# from bs4 import BeautifulSoup
# session = dryscrape.Session()
# session.visit(my_url)
# response = session.body()
# soup = BeautifulSoup(response)
# soup.find(id="intro-text")
## Result:
# <p id="intro-text">Yay! Supports javascript</p>
###################################
###################################
import dryscrape
from bs4 import BeautifulSoup
import os
import sys
import json
import requests
import string
import datetime
import glob
#import shutil
def get_post_data(html_soup, query):
    """Build the form payload for a keyword search.

    Args:
        html_soup: Accepted for interface compatibility; this function
            does not read it.
        query: Value submitted as the search keyword.

    Returns:
        A dict holding the search keyword and the search-button field.
    """
    return {
        'SearchKeyword': query,
        'btn-search.orangeButton': 'Search',
    }
################################################
################################################
def _build_directory_url(letter):
    """Return the TSX company-directory JSON search URL for *letter*."""
    base_url = "https://www.tsx.com/json/company-directory/search/tsx/"
    return base_url + str(letter) + "?"


def _fetch_listings(url):
    """Download the JSON document at *url* and return its 'results' list.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        KeyError: if the decoded document has no 'results' key.
    """
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    return json.loads(response.text)['results']


def _print_listings(listings):
    """Print name and symbol of every listing, then a short summary.

    After the per-listing output, the first raw entry (when there is one)
    and the total number of entries are printed.
    """
    for entry in listings:
        print("Name:" + str(entry['name']))
        print("Symbol:" + str(entry['symbol']))
        print("-----------------------")
    # An empty result set has no first entry to show.
    if listings:
        print(listings[0])
    print(str(len(listings)))


def main(argv):
    """Scrape the listings for one search letter and print them.

    Args:
        argv: Command-line vector: program name, the letter to search
            for, and the output file path.
            Example: ./tmx_scraper.py "A" "A_symbols"

    Returns:
        0 on success, 2 when required arguments are missing.
    """
    if len(argv) < 3:
        prog = argv[0] if argv else "tmx_scraper.py"
        sys.stderr.write("usage: " + str(prog) + " LETTER OUT_FILE\n")
        return 2

    letter = str(argv[1])    # Letter to scrape
    out_file = str(argv[2])  # Output file

    cur_url = _build_directory_url(letter)
    print("Visiting: " + cur_url)

    # NOTE(review): the output file is created/truncated but nothing is
    # written to it -- confirm whether the listings should be saved here.
    with open(out_file, 'w'):
        _print_listings(_fetch_listings(cur_url))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))