-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathproject.py
259 lines (225 loc) · 9.33 KB
/
project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
import re
import os
import time
import requests
import cssbeautifier
from typing import List, Literal, Union
from slugify import slugify
from bs4 import BeautifulSoup, Tag
from bs4.element import ResultSet
from urllib.parse import urljoin, urlparse
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
# Browser-like User-Agent sent with every request, so servers that reject
# the default urllib/requests agent string still respond.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"
}
def main():
    """Prompt for a URL, scrape its CSS, and save everything locally.

    Loops until a URL whose document has a parseable <head> is entered,
    then extracts <style> tags and stylesheet <link> tags and writes them
    (plus any referenced resources) into a folder named after the page.
    """
    while True:
        url = input("Enter URL: ")
        start = time.perf_counter()
        soup = get_soup(url)
        try:
            # soup.head is None for documents without a <head>; the
            # attribute access then raises AttributeError, caught below.
            # (The original asserted head was not None first, which both
            # bypassed this handler and disappears under `python -O`.)
            css_styles = soup.head.find_all("style")
            css_links = soup.head.find_all(css_selector)
        except AttributeError:
            print("Invalid URL. Please enter a valid URL.")
        else:
            break
    if css_links or css_styles:
        folder_name = get_title(soup)
        # Pages may legally lack a <title>; fall back to the slug name
        # instead of crashing on None.get_text().
        title = soup.head.title.get_text() if soup.head.title else folder_name
        print("Page title: " + str(title))
        create_folder(folder_name, url, soup)
        finish = time.perf_counter()
        print(f"Finished in {round(finish - start, 2)} second(s).")
    else:
        print("No CSS found.")
def get_soup(url: str) -> BeautifulSoup:
    """Fetch *url* and return the parsed document as a BeautifulSoup.

    Raises:
        URLError: when the server cannot be reached. Note that HTTPError
            is a subclass of URLError, so HTTP failures (404, 500, ...)
            are also reported with this generic message.
        ValueError: when *url* is not a well-formed URL.
    """
    try:
        request = Request(url, headers=headers)
        time.sleep(3)  # crude politeness delay between fetches
        html = urlopen(request)
    except URLError as err:
        # Chain the original error so the real cause stays visible in
        # the traceback (the original re-raise discarded it).
        raise URLError("Server could not be found!") from err
    except ValueError as err:
        raise ValueError("Invalid URL. Please enter a valid URL.") from err
    else:
        soup = BeautifulSoup(html.read(), "html.parser")
        return soup
def get_title(soup: BeautifulSoup) -> str:
    """Return an OS-friendly folder name derived from the page's <title>.

    Falls back to "website" when the document has no head/title, or when
    the title slugifies to an empty string (e.g. a whitespace-only or
    all-punctuation title) — the original returned "" in that case, which
    breaks the later os.makedirs() call.
    """
    folder_name = ""
    # No assert here: a missing <head> is a data condition, not a bug,
    # and asserts vanish under `python -O`.
    if soup.head is not None and soup.head.title:
        folder_name = slugify(soup.head.title.get_text())
    return folder_name or "website"
def get_basename(basename: str) -> str:
    """Trim anything that trails a file's extension — query strings,
    fragments, cache busters — and return the cleaned basename.

    Names with no recognizable ``name.ext`` shape pass through unchanged.
    """
    pattern = re.compile(r"([\w-]+\.\w+).*")
    return pattern.sub(r"\1", basename)
def get_css_urls(css_url: str, links: List[str]) -> None:
    """Download every resource referenced by a css file into a
    "resources" folder, grouped into subfolders by file extension.

    Each entry of *links* is a (url, extension) pair as produced by
    re.findall; scheme-less urls are resolved against *css_url*. Works by
    chdir-ing into the target folders and returns with the original
    working directory restored (back to the css folder).
    """
    resources_path = "resources"
    if not os.path.exists(resources_path):
        os.makedirs(resources_path)
    os.chdir(resources_path)  # work inside "resources"
    for resource_url, extension in links:
        filename = get_basename(os.path.basename(resource_url))
        if not os.path.exists(extension):
            os.makedirs(extension)
        os.chdir(extension)  # work inside the per-extension folder
        # Resolve relative urls against the css file before fetching.
        if not urlparse(resource_url).scheme:
            resource_url = urljoin(css_url, resource_url)
        with open(filename, "wb") as file:
            response = requests.get(resource_url, headers=headers)
            file.write(response.content)
        os.chdir("../")  # back to "resources"
    os.chdir("../")  # back to the css folder
def replace_urls_regex(css_file: str) -> None:
    """Rewrite every url(...) reference inside *css_file* so it points at
    the locally downloaded copy under resources/<extension>/<filename>.

    data: URIs are skipped. The pattern is the same one used by
    get_resources(), so every url that was downloaded gets rewritten.
    """
    url_pattern = re.compile(
        r"([^\"\']?(?<=url\([\"\'\/\w])(?!data:)[^\)]+\.(\w+)[^\)\"\']*)"
    )

    def replace_url(match):
        url = match.group(1)
        extension = match.group(2)
        filename = os.path.basename(url)
        # NOTE(review): the downloader strips query strings/fragments via
        # get_basename(); plain basename is kept here as in the original —
        # confirm the names agree for urls like "font.woff2?v=1".
        # BUG FIX: the original returned the literal placeholder
        # "resources/{ext}/(unknown)" — `filename` was computed but never
        # interpolated, so every rewritten url was broken.
        return f"resources/{extension}/{filename}"

    with open(css_file) as file:
        css_contents = file.read()
    new_css_contents = re.sub(url_pattern, replace_url, css_contents)
    with open(css_file, "w") as file:
        file.write(new_css_contents)
def create_folder(path: str, url: str, soup: BeautifulSoup) -> None:
    """This function creates the folder that will contain the website's contents"""
    """This function tries to do too many things: create the directory of the webpage,
    find the css tags in the document, parse them, output them, use regex to replace
    url paths with relative paths, and create the index.html file. TOO MUCH STUFF GOING ON.
    I'll break this function down into individual components.
    """
    # Create <path>/ and <path>/css/ and chdir into css/; every helper
    # below writes files relative to the current working directory, so
    # the exact chdir order matters.
    if not os.path.exists(path):
        os.makedirs(path)
    os.chdir(path)  # change to website folder
    css_path = "css/"
    if not os.path.exists(css_path):
        os.makedirs(css_path)
    os.chdir(css_path)  # change to css folder
    # NOTE(review): soup.head may be None in principle; main() only calls
    # this after find_all() on head succeeded — confirm other callers
    # guarantee the same before reusing this function.
    css_links = soup.head.find_all(css_selector)
    css_styles = soup.head.find_all("style")
    if css_links:
        # External stylesheets: combine into css/links.css, download the
        # resources they reference, then rewrite urls to local paths.
        # links[0] (the first stylesheet's URL) is the base used for
        # resolving relative resource urls.
        # NOTE(review): if no <link> carried an href, links is empty and
        # links[0] raises IndexError — verify upstream guarantees.
        links = get_css_links(css_links, url)
        write_css_file(links, "link")
        get_resources("links.css", links[0])
        replace_urls_regex("links.css")
    if css_styles:
        # Inline <style> blocks: combine into css/styles.css; relative
        # resource urls resolve against the page url itself.
        write_css_file(css_styles, "style")
        get_resources("styles.css", url)
        replace_urls_regex("styles.css")
    os.chdir("../")  # go back to the website's folder
    create_index(soup)
def get_resources(css_file: str, css_url: str) -> None:
    """Scan *css_file* for url(...) references and download each one into
    a "resources" folder; does nothing when the file references nothing.

    *css_url* is the base used to resolve relative resource urls.
    """
    pattern = re.compile(
        r"([^\"\']?(?<=url\([\"\'\/\w])(?!data:)[^\)]+\.(\w+)[^\)\"\']*)"
    )
    with open(css_file) as handle:
        found = pattern.findall(handle.read())
    if found:
        get_css_urls(css_url, found)
        print(f'Created "resources" folder from {len(found)} links')
def create_index(soup: BeautifulSoup) -> None:
    """Write index.html with the document's original css tags swapped for
    <link> tags pointing at the locally generated stylesheet files."""
    assert soup.head is not None
    head = soup.head
    link_tags = head.find_all(css_selector)
    style_tags = head.find_all("style")
    if link_tags:
        for tag in link_tags:
            tag.decompose()
        head.append(
            soup.new_tag("link", href="css/links.css", rel="stylesheet", type="text/css")
        )
    if style_tags:
        for tag in style_tags:
            tag.decompose()
        head.append(
            soup.new_tag("link", href="css/styles.css", rel="stylesheet", type="text/css")
        )
    with open("index.html", "wb") as file:
        file.write(soup.prettify().encode())
def css_selector(tag: Tag) -> bool:
    """find_all() predicate that matches stylesheet <link> tags.

    Matches <link rel="stylesheet" ...> as well as <link type="text/css">.
    BUG FIX: bs4's html.parser treats ``rel`` as a multi-valued attribute,
    so tag.get("rel") returns a list such as ["stylesheet"]; comparing it
    to the string "stylesheet" was always False, which silently dropped
    every rel-based stylesheet link. Membership testing works for both
    the list and the plain-string case (and "or []" guards a missing rel).
    """
    if tag.name != "link":
        return False
    rel = tag.get("rel") or []
    return "stylesheet" in rel or tag.get("type") == "text/css"
def get_css_links(results: ResultSet, url: str) -> List[str]:
    """Return absolute stylesheet URLs taken from <link> tags.

    Relative hrefs are resolved against *url*; tags with a missing or
    empty href are skipped. BUG FIX: the original called urlparse() on
    the href *before* checking has_attr("href"), so a bare
    <link rel="stylesheet"> (href absent, get() returning None) crashed
    inside urlparse instead of being ignored.
    """
    links: List[str] = []
    for result in results:
        href = result.get("href")
        if not href:
            continue  # no target to fetch, e.g. <link rel="stylesheet">
        if urlparse(href).scheme:
            links.append(href)
        else:
            links.append(urljoin(url, href))
    return links
def get_css(url: str) -> str:
    """Download the css document at *url* and return it as text."""
    # `headers` is a module-level constant; it is only read here, so no
    # `global` declaration is needed.
    response = urlopen(Request(url, headers=headers))
    return response.read().decode()
def write_css_file(
    input: Union[ResultSet, List[str]], tag_name: Literal["style", "link"]
) -> None:
    """Beautify css gathered from <style> tags or stylesheet URLs and
    write it to a single output file.

    Args:
        input: either a ResultSet of <style> Tag objects (written to
            styles.css) or a list of css URLs to download (links.css).
        tag_name: the kind of tag being processed; used only in the
            summary message.

    On an HTTP failure the error is reported and the possibly partial
    output file is kept (same best-effort behaviour as before).

    BUG FIX: the original assigned ``output_name`` only inside its two
    isinstance branches, so any other input type left it unbound and the
    success-path print raised NameError; the output name and per-item
    css extractor are now chosen up front.
    """
    if isinstance(input, ResultSet):
        output_name = "styles.css"
        extract = lambda item: item.get_text()  # inline <style> contents
    else:
        output_name = "links.css"
        extract = get_css  # each item is a URL to download
    try:
        with open(output_name, "w") as file:
            for item in input:
                file.write(cssbeautifier.beautify(extract(item)))
    except HTTPError as error:
        print(f"Something went wrong. Reason: {error.status} {error.reason}")
    else:
        print(f"Created {output_name} from {len(input)} <{tag_name}> tag(s)")
# Run the interactive scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()