forked from avinashkranjan/Amazing-Python-Scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgeeksforgeeks_article_downloader.py
56 lines (47 loc) · 2 KB
/
geeksforgeeks_article_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import sys
import os
from selenium import webdriver # Automated webdriver
from PIL import Image # For manipulating images
from fpdf import FPDF # For converting images to pdf
# This function takes the URL of the article that has to be downloaded.
# Using the selenium Chrome driver it takes a screenshot of the article tag and
# saves it as a png image. The path of the image is sent to convert_image_to_pdf()
def get_html(url):
    """Screenshot the ``<article>`` element of ``url`` and convert it to a PDF.

    A headless Chrome session loads the page, the window is resized to the
    full scroll height of the document, and the ``article`` element is saved
    as ``image.png`` in the current working directory. That image path is
    then handed to ``convert_image_to_pdf``.

    Args:
        url: Address of the page to capture.

    The browser is always shut down, even when loading the page or locating
    the ``article`` element fails; any such exception propagates to the
    caller.
    """
    path = "image.png"
    options = webdriver.ChromeOptions()
    # True is required for taking the screenshot with scroll.
    options.headless = True
    driver = webdriver.Chrome(
        r"chromedriver_win32\chromedriver.exe", options=options)
    try:
        driver.get(url)
        # Full scroll height of the document, so the window can be made tall
        # enough to render the whole article at once.
        required_height = driver.execute_script(
            "return document.body.parentNode.scrollHeight"
        )
        driver.set_window_size(1366, required_height)
        # Every article in GeeksForGeeks has an article tag.
        driver.find_element_by_tag_name("article").screenshot(path)
    finally:
        # Release the browser and chromedriver processes on every exit path;
        # without this each run leaves a headless Chrome instance behind.
        driver.quit()
    convert_image_to_pdf(path)
# This function uses fpdf library to convert the image passed from the last
# function to a pdf. For image manipulation it uses pillow.
def convert_image_to_pdf(path):
    """Embed the image at ``path`` in a single-page PDF and save it.

    The page is sized to the image's pixel dimensions plus a fixed margin on
    every side (using points as the PDF unit). The output file name is read
    from standard input and ``.pdf`` is appended to it.

    Args:
        path: Path of the image file to place on the page.
    """
    margin = 20
    # Only the dimensions are needed here. The context manager closes the
    # underlying file handle right away so the image file can be deleted
    # afterwards without the handle still being held open.
    with Image.open(path) as cover:
        width, height = cover.size
    # Setting up the dimensions
    pdf = FPDF(unit="pt", format=[width + 2 * margin, height + 2 * margin])
    pdf.add_page()  # Adding new page to the pdf
    pdf.image(path, margin, margin)
    pdf_filename = input("Enter the file name: ") + ".pdf"
    pdf.output(pdf_filename, "F")
    print("Success!!")
if __name__ == "__main__":
    # The URL comes from the command line when given, otherwise from a prompt.
    if len(sys.argv) > 1:
        url = " ".join(sys.argv[1:])
    else:
        url = input("Enter the URL: ")
    # Stray whitespace around a typed or pasted URL would otherwise end up in
    # the address passed to the browser.
    url = url.strip()
    if not url:
        sys.exit("No URL provided.")
    # Default to https when the scheme is omitted.
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    try:
        get_html(url)
    finally:
        # Remove the intermediate screenshot even when the download or the
        # conversion fails; it may not exist if the failure happened before
        # the screenshot was taken.
        try:
            os.remove("image.png")
        except FileNotFoundError:
            pass