2 files changed: +3 -9 lines changed

File 1 of 2:
@@ -32,8 +32,6 @@ def get_all_website_links(url):
     """
     # all URLs of `url`
     urls = set()
-    # domain name of the URL without the protocol
-    domain_name = urlparse(url).netloc
     soup = BeautifulSoup(requests.get(url).content, "html.parser")
     for a_tag in soup.findAll("a"):
         href = a_tag.attrs.get("href")
@@ -89,16 +87,15 @@ def crawl(url, max_urls=30):
     args = parser.parse_args()
     url = args.url
     max_urls = args.max_urls
-
+    # domain name of the URL without the protocol
+    domain_name = urlparse(url).netloc
     crawl(url, max_urls=max_urls)

     print("[+] Total Internal links:", len(internal_urls))
     print("[+] Total External links:", len(external_urls))
     print("[+] Total URLs:", len(external_urls) + len(internal_urls))
     print("[+] Total crawled URLs:", max_urls)

-    domain_name = urlparse(url).netloc
-
     # save the internal links to a file
     with open(f"{domain_name}_internal_links.txt", "w") as f:
         for internal_link in internal_urls:
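For reference, urlparse(url).netloc returns the network-location (host) part of a URL, without the scheme or path; the scripts use it to build their output filenames, so it must be assigned before the files are written. A minimal standalone sketch (the example URL is hypothetical):

    from urllib.parse import urlparse

    # netloc is the host component, without the scheme ("https://") or the path
    url = "https://www.example.com/some/page.html"
    domain_name = urlparse(url).netloc
    print(domain_name)  # prints: www.example.com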
File 2 of 2:
@@ -32,8 +32,6 @@ def get_all_website_links(url):
     """
     # all URLs of `url`
     urls = set()
-    # domain name of the URL without the protocol
-    domain_name = urlparse(url).netloc
     # initialize an HTTP session
     session = HTMLSession()
     # make HTTP request & retrieve response
@@ -98,15 +96,14 @@ def crawl(url, max_urls=30):
     args = parser.parse_args()
     url = args.url
     max_urls = args.max_urls
-
+    domain_name = urlparse(url).netloc
     crawl(url, max_urls=max_urls)

     print("[+] Total Internal links:", len(internal_urls))
     print("[+] Total External links:", len(external_urls))
     print("[+] Total URLs:", len(external_urls) + len(internal_urls))
     print("[+] Total crawled URLs:", max_urls)

-    domain_name = urlparse(url).netloc

     # save the internal links to a file
     with open(f"{domain_name}_internal_links.txt", "w") as f:
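In both files the moved assignment feeds the f-string filenames used when saving the results. A minimal sketch of that pattern, with placeholder link data standing in for the sets the crawler fills:

    from urllib.parse import urlparse

    # placeholder data; in the scripts this set is filled by crawl()
    internal_urls = {"https://example.com/", "https://example.com/about"}

    domain_name = urlparse("https://example.com/").netloc
    # write one link per line to "<domain>_internal_links.txt"
    with open(f"{domain_name}_internal_links.txt", "w") as f:
        for internal_link in internal_urls:
            print(internal_link.strip(), file=f)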