Skip to content

Commit 7683043

Browse files
committed
Fix issue where internal URLs were wrongly identified as external URLs in the link extractor tutorial
1 parent 06c7cef commit 7683043

File tree

2 files changed

+3
-9
lines changed

2 files changed

+3
-9
lines changed

web-scraping/link-extractor/link_extractor.py

+2-5
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,6 @@ def get_all_website_links(url):
3232
"""
3333
# all URLs of `url`
3434
urls = set()
35-
# domain name of the URL without the protocol
36-
domain_name = urlparse(url).netloc
3735
soup = BeautifulSoup(requests.get(url).content, "html.parser")
3836
for a_tag in soup.findAll("a"):
3937
href = a_tag.attrs.get("href")
@@ -89,16 +87,15 @@ def crawl(url, max_urls=30):
8987
args = parser.parse_args()
9088
url = args.url
9189
max_urls = args.max_urls
92-
90+
# domain name of the URL without the protocol
91+
domain_name = urlparse(url).netloc
9392
crawl(url, max_urls=max_urls)
9493

9594
print("[+] Total Internal links:", len(internal_urls))
9695
print("[+] Total External links:", len(external_urls))
9796
print("[+] Total URLs:", len(external_urls) + len(internal_urls))
9897
print("[+] Total crawled URLs:", max_urls)
9998

100-
domain_name = urlparse(url).netloc
101-
10299
# save the internal links to a file
103100
with open(f"{domain_name}_internal_links.txt", "w") as f:
104101
for internal_link in internal_urls:

web-scraping/link-extractor/link_extractor_js.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,6 @@ def get_all_website_links(url):
3232
"""
3333
# all URLs of `url`
3434
urls = set()
35-
# domain name of the URL without the protocol
36-
domain_name = urlparse(url).netloc
3735
# initialize an HTTP session
3836
session = HTMLSession()
3937
# make HTTP request & retrieve response
@@ -98,15 +96,14 @@ def crawl(url, max_urls=30):
9896
args = parser.parse_args()
9997
url = args.url
10098
max_urls = args.max_urls
101-
99+
domain_name = urlparse(url).netloc
102100
crawl(url, max_urls=max_urls)
103101

104102
print("[+] Total Internal links:", len(internal_urls))
105103
print("[+] Total External links:", len(external_urls))
106104
print("[+] Total URLs:", len(external_urls) + len(internal_urls))
107105
print("[+] Total crawled URLs:", max_urls)
108106

109-
domain_name = urlparse(url).netloc
110107

111108
# save the internal links to a file
112109
with open(f"{domain_name}_internal_links.txt", "w") as f:

0 commit comments

Comments (0)