Skip to content

Commit f251143

Browse files
committed
added extracting & submitting web forms tutorial
1 parent 01965af commit f251143

File tree

5 files changed

+134
-0
lines changed

5 files changed

+134
-0
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ This is a repository of all the tutorials of [The Python Code](https://www.thepy
7676
- [How to Convert HTML Tables into CSV Files in Python](https://www.thepythoncode.com/article/convert-html-tables-into-csv-files-in-python). ([code](web-scraping/html-table-extractor))
7777
- [How to Use Proxies to Anonymize your Browsing and Scraping using Python](https://www.thepythoncode.com/article/using-proxies-using-requests-in-python). ([code](web-scraping/using-proxies))
7878
- [How to Extract Script and CSS Files from Web Pages in Python](https://www.thepythoncode.com/article/extract-web-page-script-and-css-files-in-python). ([code](web-scraping/webpage-js-css-extractor))
79+
- [How to Extract and Submit Web Forms from a URL using Python](https://www.thepythoncode.com/article/extracting-and-submitting-web-page-forms-in-python). ([code](web-scraping/extract-and-fill-forms))
7980

8081
- ### [Python Standard Library](https://www.thepythoncode.com/topic/python-standard-library)
8182
- [How to Transfer Files in the Network using Sockets in Python](https://www.thepythoncode.com/article/send-receive-files-using-sockets-python). ([code](general/transfer-files/))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# [How to Extract and Submit Web Forms from a URL using Python](https://www.thepythoncode.com/article/extracting-and-submitting-web-page-forms-in-python)
2+
To run this:
3+
- `pip3 install -r requirements.txt`
4+
- To extract forms, use `form_extractor.py`:
5+
```
6+
python form_extractor.py https://wikipedia.org
7+
```
8+
- To extract and submit forms, use `form_submitter.py`:
9+
```
10+
python form_submitter.py https://wikipedia.org
11+
```
12+
This will extract the first form (you can change that in the code), prompt the user for each non-hidden input field, then submit the form and load the response HTML in your default web browser. Try it out!
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
from bs4 import BeautifulSoup
2+
from requests_html import HTMLSession
3+
from pprint import pprint
4+
5+
# initialize an HTTP session
6+
session = HTMLSession()
7+
8+
9+
def get_all_forms(url):
    """Download the page at `url` and return all of its <form> tags.

    The request goes through the module-level `session`; the returned
    HTML is parsed with BeautifulSoup's built-in "html.parser".
    """
    response = session.get(url)
    # for JavaScript-driven websites, render the page first:
    # response.html.render()
    page_source = response.html.html
    parsed_page = BeautifulSoup(page_source, "html.parser")
    return parsed_page.find_all("form")
17+
18+
19+
def get_form_details(form):
    """Return the HTML details of a form as a plain dictionary.

    Args:
        form: A parsed ``<form>`` tag exposing ``attrs`` (a dict of its
            HTML attributes) and ``find_all(name)``, such as the tags
            returned by ``get_all_forms``.

    Returns:
        A dict with three keys:
          * ``"action"``: the form's target URL exactly as written in the
            page, or ``""`` when the attribute is absent.
          * ``"method"``: the HTTP method in lowercase; ``"get"`` when the
            attribute is absent, which is the HTML default.
          * ``"inputs"``: one ``{"type", "name", "value"}`` dict per
            ``<input>`` tag. ``type`` defaults to ``"text"``, ``value`` to
            ``""``, and ``name`` is ``None`` when the tag has no name.
    """
    # A form without an action submits to the page it lives on; an empty
    # string keeps that meaning once it is joined with the page URL.
    # The value is deliberately NOT lowercased: URL paths and query
    # strings are case-sensitive, so changing case can break the target.
    action = form.attrs.get("action", "")
    # method names are case-insensitive in HTML, so normalise them
    method = form.attrs.get("method", "get").lower()
    inputs = [
        {
            "type": input_tag.attrs.get("type", "text"),
            "name": input_tag.attrs.get("name"),
            "value": input_tag.attrs.get("value", ""),
        }
        for input_tag in form.find_all("input")
    ]
    return {"action": action, "method": method, "inputs": inputs}
44+
45+
46+
if __name__ == "__main__":
    import sys

    # the page URL is the only (required) command-line argument; exit with
    # a usage hint instead of an IndexError traceback when it is missing
    if len(sys.argv) < 2:
        sys.exit(f"Usage: python {sys.argv[0]} <url>")
    url = sys.argv[1]
    # get all form tags
    forms = get_all_forms(url)
    # iterate over the forms, numbering them from 1 for display
    for i, form in enumerate(forms, start=1):
        form_details = get_form_details(form)
        print("="*50, f"form #{i}", "="*50)
        pprint(form_details)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
from bs4 import BeautifulSoup
2+
from requests_html import HTMLSession
3+
4+
from pprint import pprint
5+
from urllib.parse import urljoin
6+
import webbrowser
7+
import sys
8+
9+
from form_extractor import get_all_forms, get_form_details, session
10+
11+
# Script flow: read a URL from the command line, extract the first form on
# that page, ask the user for every visible field, submit the form, then
# save the response as page.html and open it in the default web browser.

# get the URL from the command line
if len(sys.argv) < 2:
    sys.exit(f"Usage: python {sys.argv[0]} <url>")
url = sys.argv[1]
# get the first form (edit this as you wish)
forms = get_all_forms(url)
if not forms:
    # nothing to submit; stop with a clear message instead of an IndexError
    sys.exit(f"No forms found on {url}")
first_form = forms[0]
# extract all form details
form_details = get_form_details(first_form)
pprint(form_details)
# the data body we want to submit
data = {}
for input_tag in form_details["inputs"]:
    name = input_tag["name"]
    if name is None:
        # an input without a name cannot be submitted as a key/value pair;
        # skipping it avoids sending a bogus "None" field
        continue
    if input_tag["type"] == "hidden":
        # if it's hidden, use the default value
        data[name] = input_tag["value"]
    elif input_tag["type"] != "submit":
        # all others except submit, prompt the user to set it
        value = input(f"Enter the value of the field '{name}' (type: {input_tag['type']}): ")
        data[name] = value

# join the url with the action (form request URL)
url = urljoin(url, form_details["action"])

if form_details["method"] == "post":
    res = session.post(url, data=data)
elif form_details["method"] == "get":
    res = session.get(url, params=data)
else:
    # any other method would leave `res` undefined below
    sys.exit(f"Unsupported form method: {form_details['method']!r}")

# the below code is only for replacing relative URLs to absolute ones,
# so the saved copy can still load its stylesheets, scripts, images and links
soup = BeautifulSoup(res.content, "html.parser")
for tag_name, attribute in (("link", "href"), ("script", "src"), ("img", "src"), ("a", "href")):
    for tag in soup.find_all(tag_name):
        try:
            tag.attrs[attribute] = urljoin(url, tag.attrs[attribute])
        except (KeyError, ValueError):
            # KeyError: the tag lacks the attribute (e.g. an inline <script>);
            # ValueError: the attribute holds a malformed URL.
            # Either way the tag is left exactly as it was.
            continue

# write the page content to a file; the context manager closes the file
# before the browser reads it, and an explicit encoding avoids failures on
# platforms whose default encoding cannot represent the page's characters
with open("page.html", "w", encoding="utf-8") as page_file:
    page_file.write(str(soup))
# open the page on the default browser
webbrowser.open("page.html")
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
requests_html
2+
bs4

0 commit comments

Comments
 (0)