import sys
import os
from pathlib import Path

# Add the repository root (five directories up from this file) to the import
# path so the shared scraper helpers in `common` can be imported.
p = Path(__file__).resolve().parents[5]
sys.path.insert(1, str(p))
from common import list_pdf_v3

"""
SETUP HOW-TO:
    Step 1: Set "webpage" to the page you want to scrape.
    Step 2: Click the links that lead to the files and copy their paths.
        For example, http://www.beverlyhills.org/cbhfiles/storage/files/long_num/file.pdf would become /cbhfiles/storage/files/long_num/
        **NOTE:** Ensure that all the file links share this path; if they don't, remove a level until they do.
        Also make sure the domain stays the same (I've seen sites use an AWS bucket for one file and on-site storage for another).
        Verify on the page that the href to the file contains the domain; if it doesn't, add the domain to "domain".
    Step 3: If the domain is not in the href, set "domain_included" to False; otherwise set it to True.
    Step 4: If you set "domain_included" to False, add the domain, from the http(s) scheme through the top-level domain (TLD: .com, .edu, etc.);
        otherwise, you can leave it blank.
    Step 5: Set "sleep_time" to the desired integer. Best practice is to match the Crawl-delay in the website's `robots.txt`.
        Most departments do not seem to specify a crawl-delay, so leave it at 5 if there isn't one.
    Step 6: (Only applies to list_pdf_v3.) If there are any documents on the page that you *don't* want to scrape,
        add words that are **unique** to them to "non_important".
    Step 7: "debug" makes the scraper more verbose, but the extra output is generally unhelpful to the average user. Leave it False unless you're having issues.
        "csv_dir" is explained in more detail in the readme.
    Step 8: If you don't like where the scraper saves the data, you can change the "save_dir" path below (either replace it entirely or add subfolders; both are supported).

EXAMPLE CONFIG:
    configs = {
        "webpage": "http://www.beverlyhills.org/departments/policedepartment/crimeinformation/crimestatistics/web.jsp",
        "web_path": "/cbhfiles/storage/files/",
        "domain_included": False,
        "domain": "http://www.beverlyhills.org",
        "sleep_time": 5,
        "non_important": ["emergency", "training", "guidelines"],
        "debug": False,
        "csv_dir": "/csv/",
    }
"""
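
# A hedged sketch (not part of the original setup) of how "web_path" and
# "domain" could be derived from one of the file links copied in Step 2.
# urllib.parse is only used for this illustration; the scraper itself does
# not require it, and split_file_url is a hypothetical helper, not part of common.
from urllib.parse import urlparse


def split_file_url(file_url):
    """Return (domain, web_path) for a direct file link such as a PDF href."""
    parsed = urlparse(file_url)
    # Keep the directory portion of the path (drop the file name itself).
    web_path = parsed.path.rsplit("/", 1)[0] + "/"
    # An empty netloc means the href was relative, so "domain" must be filled in by hand.
    domain = f"{parsed.scheme}://{parsed.netloc}" if parsed.netloc else ""
    return domain, web_path

# Example, using the docstring's Beverly Hills link:
# split_file_url("http://www.beverlyhills.org/cbhfiles/storage/files/long_num/file.pdf")
# -> ("http://www.beverlyhills.org", "/cbhfiles/storage/files/long_num/")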

configs = {
    "webpage": "",
    "web_path": "",
    "domain_included": False,
    "domain": "",
    "sleep_time": 5,
    "non_important": [],
    "debug": False,
    "csv_dir": "/csv/",
}
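
# A hedged sketch of how the Crawl-delay mentioned in Step 5 could be looked
# up with the standard library before choosing "sleep_time". The robots.txt
# URL in the example is an assumption, and lookup_crawl_delay is a
# hypothetical helper; list_pdf_v3 does not perform this lookup for you.
from urllib.robotparser import RobotFileParser


def lookup_crawl_delay(robots_url, user_agent="*", default=5):
    """Return the site's Crawl-delay for user_agent, or `default` if unset."""
    parser = RobotFileParser(robots_url)
    parser.read()  # fetches and parses robots.txt
    delay = parser.crawl_delay(user_agent)
    return int(delay) if delay is not None else default

# Example (left commented out so the template never touches the network):
# configs["sleep_time"] = lookup_crawl_delay("http://www.beverlyhills.org/robots.txt")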

save_dir = "./data/"

list_pdf_v3(configs, save_dir)
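
# Purely illustrative sketch of the intent behind "non_important" in Step 6:
# links whose href (or link text) contains any of the listed keywords are the
# ones the scraper is meant to skip. This is NOT the actual filtering logic
# inside list_pdf_v3; it only shows the kind of substring match the keywords
# are expected to drive.
def is_non_important(href, keywords):
    """Return True if a link looks like one of the documents to skip."""
    href_lower = href.lower()
    return any(keyword.lower() in href_lower for keyword in keywords)

# Example:
# is_non_important("/files/training_manual.pdf", ["emergency", "training"])  # -> True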