# -*- coding: utf-8 -*-
# __author__ = "Hong Nguyen Nam (Jeremy Nguyen)"
# __copyright__ = "Copyright 2022, The Browser Clone"
# __license__ = "GPL"
# __version__ = "2.1.0"
# __email__ = "a2FpdG9raWQxNDEyLmNvbmFuQGdtYWlsLmNvbQ=="
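# --- Configuration ---
# __black_list_type__: file extensions that are never written to disk.
# __status_code__: HTTP status codes whose responses are kept for download.
# __clone_all__: if True, also visit every internal .html link found on the start page.
# __zip__: if True, compress the cloned folder into a .zip archive and remove the folder.
# __headless__: if True, run Chrome without a visible window.
# __clone_url__: the page to clone, e.g. 'https://example.com/index.html'.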
__black_list_type__ = ['.php']
__status_code__ = [200, 404]
__clone_all__ = False
__zip__ = False
__headless__ = False
__clone_url__ = 'https://themesbrand.com/velzon/html/default/index.html'
import os
import os.path
import re
import shutil
import time
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
from seleniumwire import webdriver
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
from zipfile36 import ZipFile
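# Split a URL into the pieces the cloner needs: domain, local output folder,
# file name, scheme, and the URL with the file name stripped. With main=True,
# an empty file name falls back to 'index.html'.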
def extract_info_url(url, main=False):
    data_url = urlparse(url)
    domain = data_url.netloc
    path_file = domain.replace('.', '') + os.path.split(data_url.path)[0] + '/'
    file_name = os.path.split(data_url.path)[1]
    scheme = data_url.scheme
    url_ori = url.replace(file_name, '')
    black_list = ['', '/']
    if main and file_name in black_list:
        file_name = 'index.html'
    return {"domain": domain, "path": path_file, "file_name": file_name, "scheme": scheme, "url": url_ori}
def get_all_file_paths(directory):
    file_paths = []
    for root, directories, files in os.walk(directory):
        for filename in files:
            filepath = os.path.join(root, filename)
            file_paths.append(filepath)
    return file_paths
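# Zip the cloned folder into '<path_folder>.zip'.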
def compress(path_folder):
    print(f'Compressing files... {str(path_folder)}.zip')
    directory = path_folder
    file_paths = get_all_file_paths(directory)
    with ZipFile(f'{path_folder}.zip', 'w') as zip_file:
        for file in file_paths:
            zip_file.write(file)
    print('All files zipped successfully!')
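# Return the '<name>.html' part of a link, or None when the link does not
# point to an .html page (used to decide which hrefs are worth crawling).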
def check_invalid(file_name):
    regex = r"[a-z0-9-]+\.html"
    match = re.search(regex, file_name)
    if match:
        return match.group()
    return None
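# Handles URL bookkeeping and writing captured responses to disk.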
class File:
    info_url = ''

    def __init__(self, url):
        self.url = url
        self.info_url = extract_info_url(url, True)
        self.check_exists(url)
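    # Download `url` with the captured browser headers and save it under a
    # folder named after the domain, skipping blacklisted extensions and
    # files that already exist.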
    def download_file(self, url, headers):
        info_url = extract_info_url(url)
        if url == self.url:
            info_url = extract_info_url(url, True)
        if info_url['file_name'][-4:] not in __black_list_type__:
            file_name = info_url['file_name']
            black_list = ['', '/']
            if file_name in black_list:
                file_name = 'index.html'
            path_file = info_url['path'] + file_name
            if not os.path.exists(path_file):
                r = requests.get(url, headers=headers)
                os.makedirs(os.path.dirname(path_file), exist_ok=True)
                with open(path_file, 'wb') as f:
                    f.write(r.content)
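    # Return True when the URL belongs to the cloned domain and has not been
    # saved yet; False otherwise.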
    def check_exists(self, url):
        info_url = extract_info_url(url)
        if info_url['domain'] != self.info_url['domain']:
            return False
        path_file = info_url['path'] + info_url['file_name']
        return not os.path.exists(path_file)
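    # Parse the rendered page and return absolute URLs for every internal
    # .html link, resolving '../' and root-relative hrefs against the
    # original URL.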
    def get_all_urls_in_page(self, page_source):
        result = []
        source = BeautifulSoup(page_source, 'html.parser')
        try:
            data_a = source.find_all("a")
        except Exception:
            data_a = []
        a_tag = []
        for a in data_a:
            href = a.get('href')
            if href != '' and href != '#' and str(href) not in a_tag and check_invalid(str(href)) is not None:
                a_tag.append(href)
        for href in a_tag:
            domain = urlparse(href).netloc
            if domain == '':
                if len(href.split('../')) > 1:
                    cut = self.info_url['url'].split('/')[-(len(href.split('../'))):]
                    link = self.info_url['url']
                    for text in cut:
                        if text != '':
                            link = link.replace(f'{str(text)}/', '')
                    result.append(link + href.replace('../', ''))
                elif href[:1] == '/':
                    link = re.split(r'/+', self.info_url['url'])[:2]
                    link = f'{str(link[0])}//{str(link[1])}'
                    result.append(link + href)
                else:
                    result.append(self.info_url['url'] + href)
            if domain == self.info_url['domain']:
                result.append(href)
        return result
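# Drives a selenium-wire Chrome instance: crawls the start page, records every
# captured network request, and saves the files through the File base class.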
class BrowserClone(File):
    driver = ''
    page_source = ''
    all_url = []
    url_down = []
    headers = {}

    def __init__(self, url):
        super().__init__(url)
        self.url = url
        self.open_browser()
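    # Launch Chrome, load the start page, wait for it to finish loading,
    # collect links and captured requests, then save (and optionally zip)
    # everything.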
    def open_browser(self):
        print('============================== Begin ==============================')
        options = webdriver.ChromeOptions()
        if __headless__:
            options.add_argument('--headless')
        options.add_argument("--no-sandbox")
        options.add_experimental_option("useAutomationExtension", False)
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.page_load_strategy = 'none'
        self.driver = webdriver.Chrome(options=options, executable_path=ChromeDriverManager().install())
        self.driver.get(self.url)
        print('Waiting 30s to make sure the page has finished loading...')
        time.sleep(30)
        self.set_page_source()
        self.extract_file()
        print('Getting all the links to crawl...')
        all_urls_in_page = super().get_all_urls_in_page(self.page_source)
        for url_in_page in all_urls_in_page:
            self.all_url.append(url_in_page)
            self.extract_html(url_in_page)
        # clone options
        if __clone_all__:
            all_url = list(set(self.all_url))
            for url in all_url:
                self.driver.get(url)
                self.extract_file()
        print('Get all the links done!')
        self.extract_file(True)
        if __zip__:
            url_info = extract_info_url(self.url, True)
            folder = './' + url_info['domain'].replace('.', '')
            compress(folder)
            try:
                shutil.rmtree(folder, ignore_errors=True)
            except OSError as e:
                print(f"Error: {folder} : {e.strerror}")
        print('============================== End Game ==============================')
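    # Load one internal page, harvest its links, and remember the request
    # headers so later downloads can reuse them.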
    def extract_html(self, url):
        super().__init__(url)
        self.driver.get(url)
        self.set_page_source()
        all_urls_in_page = super().get_all_urls_in_page(self.page_source)
        for url_in_page in all_urls_in_page:
            self.all_url.append(url_in_page)
        self.headers = self.driver.requests[0].headers
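    # Record every captured request with an allowed status code; with
    # down=True, download all recorded URLs behind a progress bar.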
    def extract_file(self, down=False):
        for request in self.driver.requests:
            if (
                request.response
                and request.response.status_code in __status_code__
                and request.url not in self.url_down
            ):
                self.url_down.append(request.url)
        if down:
            print('Save files...')
            super().__init__(self.url)
            data = list(set(self.url_down))
            with tqdm(total=len(data)) as pbar:
                for file in data:
                    if super().check_exists(file):
                        super().download_file(file, self.headers)
                    pbar.update(1)
            print('Save files Done!')
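    # Dismiss any alert dialogs, then snapshot the rendered page source.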
    def set_page_source(self):
        for _ in range(5):
            try:
                self.driver.switch_to.alert.accept()
            except Exception:
                continue
        self.page_source = self.driver.page_source
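# Entry point: clone the configured URL when the script is run directly.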
if __name__ == '__main__':
    BrowserClone(__clone_url__)