-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchinaz.py
86 lines (59 loc) · 2.12 KB
/
chinaz.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
__author__ = 'bloodchilde'
import urllib
import urllib2
import re
import os
import time
class Spider:
    """Scraper for the emoticon listing pages under ``siteUrl``.

    A listing page is fetched over HTTP, (title, image URL) pairs are pulled
    out of its HTML with a regular expression, and the images can be saved
    to disk in one directory per title below ``saveRoot``.
    """

    # Matches one listing entry. Group 1 is the link's ``title`` attribute,
    # group 2 is the image's ``src2`` attribute. Compiled once for the class
    # instead of on every getContents() call.
    ITEM_PATTERN = re.compile(
        r'''<div.*?class='num_1'.*?>.*?<p>.*?<a.*?href='.*?'.*?target='_blank'.*?title='(.*?)'.*?><img.*?src2="(.*?)".*?>.*?</a>.*?</p>.*?</div>''',
        re.S)

    # Seconds to pause after each image download in downLoadAllPicture().
    DOWNLOAD_DELAY = 0.1

    def __init__(self, saveRoot=u"C:/l/python/code_python/images"):
        """Set up the site URL, the request headers and the save directory.

        saveRoot -- directory below which downImage() creates one
                    sub-directory per title.
        """
        self.siteUrl = "http://sc.chinaz.com/biaoqing/"
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
        self.headers = {'User-Agent': self.user_agent}
        self.saveRoot = saveRoot

    def _fetch(self, url):
        """Return the raw response body of url, sent with self.headers.

        Errors raised by urllib2 are not caught here and propagate to the
        caller.
        """
        request = urllib2.Request(url, headers=self.headers)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()

    def getPage(self, pageIndex):
        """Fetch listing page number pageIndex and return it decoded as UTF-8.

        The URL is siteUrl + "index_<pageIndex>.html"; it is printed before
        the request is made.
        """
        url = self.siteUrl + "index_" + str(pageIndex) + ".html"
        print(url)
        return self._fetch(url).decode("utf-8")

    def _parseContents(self, page):
        """Extract [title, imageUrl] pairs from the HTML text of a listing.

        Prints the raw match list and then one "title---url" line per match.
        Returns a list of two-element lists, empty when nothing matches.
        """
        items = self.ITEM_PATTERN.findall(page)
        print(items)
        contents = []
        for title, imageUrl in items:
            print("%s---%s" % (title, imageUrl))
            contents.append([title, imageUrl])
        return contents

    def getContents(self, pageIndex):
        """Fetch listing page pageIndex and return its [title, imageUrl] pairs."""
        return self._parseContents(self.getPage(pageIndex))

    def mk_dir(self, path):
        """Create directory path, including missing parents.

        Returns True when the directory was created and False when it was
        already there. Any other OSError from os.makedirs is re-raised.
        """
        try:
            # Create first and inspect the failure afterwards, so there is no
            # window between an existence check and the creation.
            os.makedirs(path)
        except OSError:
            if not os.path.isdir(path):
                raise
            return False
        return True

    def _imagePath(self, url, dirname):
        """Return the file path that downImage() writes for url and dirname.

        The file name is the text after the last "/" of url. It is left as
        is (no str() conversion) so that non-ASCII names do not raise
        UnicodeEncodeError on Python 2.
        """
        imageName = url.rsplit(u"/", 1)[-1]
        return os.path.join(self.saveRoot, dirname, imageName)

    def downImage(self, url, dirname):
        """Download url and save it as saveRoot/dirname/<last URL segment>.

        The file name is printed before the file is written. Network and
        file-system errors propagate to the caller.

        NOTE(review): downLoadAllPicture() passes the scraped title as
        dirname without sanitizing it, so a title containing path separators
        would change where the file lands.
        """
        imageContents = self._fetch(url)
        imagePath = self._imagePath(url, dirname)
        print(os.path.basename(imagePath))
        self.mk_dir(os.path.dirname(imagePath))
        # The with-block closes the file even when write() fails.
        with open(imagePath, 'wb') as imageFile:
            imageFile.write(imageContents)

    def downLoadAllPicture(self, PageIndex):
        """Download every image listed on page PageIndex.

        Each image is stored in a directory named after its title. The loop
        pauses DOWNLOAD_DELAY seconds after each download so consecutive
        requests are spaced out.
        """
        for title, imageUrl in self.getContents(PageIndex):
            self.downImage(imageUrl, title)
            time.sleep(self.DOWNLOAD_DELAY)
# Demo run: build one spider and process listing page 3 only
# (range(3, 4) yields just the index 3).
demo = Spider()
for page in range(3, 4):
    # Listing-only mode: fetch the page and print the parsed entries.
    # Call demo.downLoadAllPicture(page) here instead to also save the
    # images to disk.
    demo.getContents(page)