
Commit 5807f5b

Initial commit: first example

File tree

4 files changed, +413 -0 lines changed


Diff for: .gitignore

+207
@@ -0,0 +1,207 @@
# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,python,venv
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,python,venv

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml

# ruff
.ruff_cache/

# LSP config files
pyrightconfig.json

### venv ###
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
pip-selfcheck.json

### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets

# Local History for Visual Studio Code
.history/

# Built Visual Studio Code Extensions
*.vsix

### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide

# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,python,venv

Diff for: README.md

+87
@@ -0,0 +1,87 @@
# WEB SCRAPING TRAINING

## Description
Web scraping is a method that helps extract data and information from any website. This project was created to learn web scraping using Python.
It is based on the following tutorial [[1](https://www.youtube.com/watch?v=HCV6nEACQo4)].

It will cover:
- First steps of web scraping using [requests](https://docs.python-requests.org/en/latest/index.html) and [Beautiful Soup](https://beautiful-soup-4.readthedocs.io/en/latest/)
- How to get around website protections against web scraping
- What is legal and what is not in web scraping
- How to create an advanced web scraping project
- How to use AI to generate scraping code

## Requirements
```bash
beautifulsoup4==4.12.3
requests==2.31.0
html5lib==1.1
```
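
These pinned versions can be installed with `pip`, for example:

```bash
pip install beautifulsoup4==4.12.3 requests==2.31.0 html5lib==1.1
```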

## Examples
### Scraping Cooking Recipe
This is the very first example; it demonstrates how to use the `requests` and `bs4` libraries to perform web scraping.

The script is in the `scraping_recipe.py` file.

#### STEP 1 - IMPORT `requests` & `BeautifulSoup` LIBRARIES
```python
import requests
from bs4 import BeautifulSoup
```

#### STEP 2 - GET THE HTML SOURCE FROM THE URL
```python
url = "..."
response = requests.get(url)
```

The encoding is important for special characters; the following line makes the response use the encoding detected from the page content itself:

```python
response.encoding = response.apparent_encoding
```

The status of the `response` can be checked using `response.status_code`.
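
For example, a minimal check (mirroring what `scraping_recipe.py` does) prints whether the request succeeded:

```python
# Proceed only if the server answered with HTTP 200 (OK)
if response.status_code == 200:
    print(f"OK - RESPONSE CODE = {response.status_code}")
else:
    print(f"ERROR - RESPONSE CODE = {response.status_code}")
```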

#### STEP 3 - PARSING & EXTRACTION OF DATA
```python
html = response.text  # the page source obtained in STEP 2
soup = BeautifulSoup(html, "html5lib")
```

The `find` method can be used to extract data by searching for a specific HTML tag; the following example shows how to get the first `<h1>`. We can also obtain only the text content by appending `.text` to the `find` call:

```python
soup.find("h1")
soup.find("h1").text
```

If we would like to extract content from a specific tag (*e.g.* one carrying a particular `class`), we can add an argument to the `find` call:

```python
soup.find("p", class_="...").text
```

**Note:** We have to add the underscore to `class_` here to differentiate it from the Python keyword `class`.

Now, if we would like to obtain all matching elements, we have to use the `find_all` method instead of `find`:

```python
soup.find_all("...")
```
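
Since `find_all` returns a list of matching elements, the results are typically iterated; a small sketch, using `<p>` as an example tag:

```python
# Print the stripped text of every <p> element on the page
for element in soup.find_all("p"):
    print(element.text.strip())
```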

We can also search inside a specific part of the page by creating a new element from `soup` and calling either `find` or `find_all` on it:

```python
soup_element = soup.find("...", class_="...")
soup_element.find_all("...")
```

**Note:** In this example, we used only the `find` and `find_all` methods. However, it is also possible to extract the same data with other methods, such as `select`; see the sketch below. For more details, read the documentation of the `BeautifulSoup` library.
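
For instance, `select` and `select_one` accept CSS selectors; a minimal sketch (the `div.ingredients` selector matches the class used in `scraping_recipe.py`):

```python
# CSS-selector equivalents of the find/find_all calls above
soup.select_one("h1").text        # first <h1>
soup.select("div.ingredients p")  # every <p> inside <div class="ingredients">
```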

## Links
- [Cooking recipe web site](https://codeavecjonathan.com/scraping/recette/)
- [ScrapeThisSite.com](https://www.scrapethissite.com/pages/)

## Authors
- [Malek B.](https://www.github.com/malek2610)

Diff for: scraping_recipe.py

+53
@@ -0,0 +1,53 @@
import requests
from bs4 import BeautifulSoup

url = "https://codeavecjonathan.com/scraping/recette/"

def get_text_if_not_none(e):
    # Return the stripped text of an html element if available, or None if not
    if e:
        return e.text.strip()

    return None

response = requests.get(url)
response.encoding = response.apparent_encoding

if response.status_code == 200:
    print(f"OK - RESPONSE CODE = {response.status_code}")

    html = response.text
    # print(html)

    # Save the source into an html file (an explicit encoding avoids
    # platform-dependent defaults when the page contains special characters)
    with open("source.html", "w", encoding="utf-8") as src_file:
        src_file.write(html)

    # Parsing and extracting the data
    soup = BeautifulSoup(html, "html5lib")

    title = soup.find("h1").text
    print(f"Recipe: {title}")

    description = get_text_if_not_none(soup.find("p", class_="description"))
    print(f"Description: \n{description}")

    div_ingredients = soup.find("div", class_="ingredients")
    ingredients = div_ingredients.find_all("p")

    print("Ingredients:")

    for ingredient in ingredients:
        print(f"- {get_text_if_not_none(ingredient)}")

    preparation_table = soup.find("table", class_="preparation")
    steps = preparation_table.find_all("td", class_="preparation_etape")

    print("Preparation:")

    for step in steps:
        print(f"- {get_text_if_not_none(step)}")

else:
    print(f"ERROR - RESPONSE CODE = {response.status_code}")
