Skip to content

Commit 7683043

Browse files
committed
Fix issue where internal URLs were wrongly identified as external URLs in the link extractor tutorial
1 parent 06c7cef commit 7683043

File tree

2 files changed

+3
-9
lines changed

2 files changed

+3
-9
lines changed

web-scraping/link-extractor/link_extractor.py

+2-5
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,6 @@ def get_all_website_links(url):
3232
"""
3333
# all URLs of `url`
3434
urls = set()
35-
# domain name of the URL without the protocol
36-
domain_name = urlparse(url).netloc
3735
soup = BeautifulSoup(requests.get(url).content, "html.parser")
3836
for a_tag in soup.findAll("a"):
3937
href = a_tag.attrs.get("href")
@@ -89,16 +87,15 @@ def crawl(url, max_urls=30):
8987
args = parser.parse_args()
9088
url = args.url
9189
max_urls = args.max_urls
92-
90+
# domain name of the URL without the protocol
91+
domain_name = urlparse(url).netloc
9392
crawl(url, max_urls=max_urls)
9493

9594
print("[+] Total Internal links:", len(internal_urls))
9695
print("[+] Total External links:", len(external_urls))
9796
print("[+] Total URLs:", len(external_urls) + len(internal_urls))
9897
print("[+] Total crawled URLs:", max_urls)
9998

100-
domain_name = urlparse(url).netloc
101-
10299
# save the internal links to a file
103100
with open(f"{domain_name}_internal_links.txt", "w") as f:
104101
for internal_link in internal_urls:

web-scraping/link-extractor/link_extractor_js.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,6 @@ def get_all_website_links(url):
3232
"""
3333
# all URLs of `url`
3434
urls = set()
35-
# domain name of the URL without the protocol
36-
domain_name = urlparse(url).netloc
3735
# initialize an HTTP session
3836
session = HTMLSession()
3937
# make HTTP request & retrieve response
@@ -98,15 +96,14 @@ def crawl(url, max_urls=30):
9896
args = parser.parse_args()
9997
url = args.url
10098
max_urls = args.max_urls
101-
99+
domain_name = urlparse(url).netloc
102100
crawl(url, max_urls=max_urls)
103101

104102
print("[+] Total Internal links:", len(internal_urls))
105103
print("[+] Total External links:", len(external_urls))
106104
print("[+] Total URLs:", len(external_urls) + len(internal_urls))
107105
print("[+] Total crawled URLs:", max_urls)
108106

109-
domain_name = urlparse(url).netloc
110107

111108
# save the internal links to a file
112109
with open(f"{domain_name}_internal_links.txt", "w") as f:

0 commit comments

Comments (0)