crawler_thread.py
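
"""Multithreaded image crawler: traverse_site() does a breadth-first walk of
same-domain pages, then two ImageDownloaderThread workers download every
<img> found on the collected pages into a local 'images' directory."""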
import httplib2
import os
import threading
import urllib
from urlparse import urlparse, urljoin
from BeautifulSoup import BeautifulSoup


class Singleton(object):
    """Return the same instance on every call, so the parser and the
    downloader threads share one set of queues."""
    def __new__(cls):
        if not hasattr(cls, 'instance'):
            cls.instance = super(Singleton, cls).__new__(cls)
        return cls.instance
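
# Every Singleton() call below returns the same object:
#   a = Singleton(); b = Singleton(); assert a is b
# which is how queue_to_parse, to_visit and downloaded end up shared
# between the main thread and the two downloader threads.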


class ImageDownloaderThread(threading.Thread):
    """A worker thread that downloads images from the collected pages."""
    def __init__(self, thread_id, name, counter):
        threading.Thread.__init__(self)
        self.name = name

    def run(self):
        print "Starting thread ", self.name
        download_images(self.name)
        print "Finished thread ", self.name


def traverse_site(max_links=10):
    """BFS over the site's links, collecting pages into a set for downloading."""
    link_parser_singleton = Singleton()
    # loop while there are still pages queued for parsing
    while link_parser_singleton.queue_to_parse:
        # stop once enough pages have been collected for the downloaders
        if len(link_parser_singleton.to_visit) == max_links:
            return
        url = link_parser_singleton.queue_to_parse.pop()
        http = httplib2.Http()
        try:
            status, response = http.request(url)
            print status.get('content-type')
        except Exception:
            continue
        # skip responses that are not HTML pages; default to '' so the
        # membership test cannot fail when the Content-Type header is missing
        if 'text/html' not in status.get('content-type', ''):
            continue
        link_parser_singleton.to_visit.add(url)
        print "added", url, "to queue"
        bs = BeautifulSoup(response)
        for link in bs.findAll('a'):
            link_url = link.get('href')
            if not link_url:
                continue
            parsed = urlparse(link_url)
            # skip links that point to another domain
            if parsed.netloc and parsed.netloc != parsed_root.netloc:
                continue
            # rebuild an absolute URL, falling back to the root's scheme and
            # host; the parentheses around (parsed.path or '') matter, since
            # `a + b or ''` binds as `(a + b) or ''`
            link_url = ((parsed.scheme or parsed_root.scheme) + '://' +
                        (parsed.netloc or parsed_root.netloc) +
                        (parsed.path or ''))
            if link_url in link_parser_singleton.to_visit:
                continue
            # prepend new links and pop() from the end: a FIFO queue, i.e. BFS
            link_parser_singleton.queue_to_parse = [link_url] + link_parser_singleton.queue_to_parse


def download_images(thread_name):
    """Pop pages off the shared set and download every image they reference."""
    singleton = Singleton()
    while singleton.to_visit:
        try:
            url = singleton.to_visit.pop()
        except KeyError:
            # another thread emptied the set between the check and the pop
            return
        http = httplib2.Http()
        print thread_name, 'starting downloading images from ', url
        try:
            status, response = http.request(url)
        except Exception:
            continue
        bs = BeautifulSoup(response)
        images = bs.findAll('img')
        for image in images:
            src = image.get('src')
            src = urljoin(url, src)
            basename = os.path.basename(src)  # base name used to name the file locally
            if src not in singleton.downloaded:
                singleton.downloaded.add(src)
                print 'Downloading ', src
                urllib.urlretrieve(src, os.path.join('images', basename))
        print thread_name, 'finished downloading images from ', url
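
# Note: to_visit.pop() and the downloaded check-and-add above are not guarded
# by a lock. CPython's GIL makes each individual set operation atomic, and the
# try/except KeyError covers the empty-set race, but a duplicate download is
# still possible if two threads evaluate `src not in singleton.downloaded`
# at the same moment.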


if __name__ == '__main__':
    root = 'http://www.thapanirajan.com'
    parsed_root = urlparse(root)
    singleton = Singleton()
    singleton.queue_to_parse = [root]
    singleton.to_visit = set()
    singleton.downloaded = set()
    traverse_site()
    if not os.path.exists('images'):
        os.mkdir('images')
    thread1 = ImageDownloaderThread(1, "Thread-1", 1)
    thread2 = ImageDownloaderThread(2, "Thread-2", 2)
    thread1.start()
    thread2.start()
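    # Optionally wait for both workers before the main thread returns:
    #   thread1.join()
    #   thread2.join()
    # (not in the original; the non-daemon threads keep the process alive anyway)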