-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwikipedia_sign_templates.py
141 lines (118 loc) · 6.1 KB
/
wikipedia_sign_templates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""A module that uses web-scraping to download sign templates from Wikipedia. By default, it downloads all
sign templates. Since a page often lists hundreds of signs, the optional --categories_file argument takes a
line-separated list of sign categories; for each category, only the sign whose description is most similar to it
(TF-IDF cosine similarity) is downloaded.
"""
# wget for windows needs to be downloaded and added to the PATH variable
# https://eternallybored.org/misc/wget/
import os
import sys
import argparse
import requests
import shutil
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
import time
import re
# Work from the folder that contains this script, so relative paths such as
# the default --out_dir are resolved against it.
current_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(current_dir)

# Command-line interface. The page to scrape is given with the required --url
# option (an earlier, now disabled, --country option was replaced by it).
parser = argparse.ArgumentParser(
    description='Downloads sign templates from any Wikipedia or Wikimedia url',
)
parser.add_argument(
    '--categories_file',
    default=None,
    help='a text file containing a line seperated list of sign categories to download',
)
parser.add_argument(
    '--out_dir',
    default='./Wiki_Templates',
    help='the directory to save the templates to',
)
parser.add_argument(
    '--resolution',
    default='240px',
    help='The resolution for the signs to be downloaded at in px',
)
parser.add_argument(
    '--url',
    required=True,
    help='Specifies a wiki link',
)
class SignTemplate:
    """One sign image found on a wiki page.

    Attributes:
        url: URL the sign image is downloaded from.
        description: Caption text of the sign; used when matching categories.
        path: Directory the downloaded image is written to.
    """

    def __init__(self, desc, url, path):
        self.url = url
        self.description = desc
        self.path = path

    def __repr__(self):
        # Mirrors the constructor call so a printed template is easy to read
        # while debugging a scrape.
        return (
            f"{type(self).__name__}(desc={self.description!r}, "
            f"url={self.url!r}, path={self.path!r})"
        )
def initialise_sign_templates(html_text, resolution, dir):
    """Collect every sign template on a wiki page, grouped by section.

    The direct children of the page's ``mw-parser-output`` container are
    visited in document order. ``h2``/``h3``/``h4`` headings set the folder
    names for section, subsection and sub-subsection; each ``ul`` child is
    passed to ``get_templates`` together with the folder path in effect at
    that point.

    Args:
        html_text: HTML source of the wiki page.
        resolution: Text substituted for the ``<digits>px`` part of each image
            URL, e.g. ``'240px'``.
        dir: Root output directory; section folders are appended to it.

    Returns:
        A list of ``SignTemplate`` objects in page order. The list is empty
        when the page has no ``mw-parser-output`` container.
    """
    soup = BeautifulSoup(html_text, 'html.parser')
    sign_templates = []
    # The container that holds only the article's own images and text.
    top_level = soup.find("div", class_="mw-parser-output")
    if top_level is None:
        return sign_templates

    heading_depth = {'h2': 0, 'h3': 1, 'h4': 2}
    folders = ["", "", ""]  # section, subsection, sub-subsection
    for elem in top_level.contents:
        heading = _heading_tag(elem)
        if heading is not None:
            depth = heading_depth[heading.name]
            folders[depth] = "/" + _folder_name(heading)
            # A new heading invalidates every deeper level; otherwise a folder
            # from the previous section would leak into this one's path.
            for deeper in range(depth + 1, len(folders)):
                folders[deeper] = ""
        elif getattr(elem, 'name', None) == 'ul':
            sign_path = dir + "".join(folders)
            get_templates(sign_templates, elem, resolution, sign_path)
    return sign_templates


def _heading_tag(elem):
    """Return the h2/h3/h4 tag that ``elem`` is, or directly wraps, else None."""
    name = getattr(elem, 'name', None)
    if name in ('h2', 'h3', 'h4'):
        return elem
    # Newer MediaWiki output wraps each heading in <div class="mw-heading ...">.
    if name == 'div' and 'mw-heading' in (elem.get('class') or []):
        return elem.find(['h2', 'h3', 'h4'])
    return None


def _folder_name(heading):
    """Turn a heading tag's title into a folder name ('/' and ' ' become '_')."""
    headline = heading.find("span", class_="mw-headline")
    if headline is not None:
        text = headline.get_text()
    else:
        # No headline span: fall back to the heading's own text.
        text = heading.get_text().strip()
    return str(text).replace("/", "_").replace(" ", "_")
def get_templates(sign_templates, soup, resolution, sign_path):
    """Append a ``SignTemplate`` for every usable gallery box under ``soup``.

    A gallery box (``li.gallerybox``) is used only if it has a caption
    paragraph inside ``div.gallerytext``, an image inside ``div.thumb`` and an
    ``img`` carrying a ``srcset`` attribute; other boxes are skipped.

    Args:
        sign_templates: List that new templates are appended to (mutated in
            place).
        soup: Parsed HTML element to search for gallery boxes.
        resolution: Text substituted for every ``<digits>px`` run in the image
            URL, e.g. ``'240px'``.
        sign_path: Directory recorded on each template as its save location.

    Returns:
        The same ``sign_templates`` list that was passed in.
    """
    for gbox in soup.find_all('li', attrs={'class': 'gallerybox'}):
        text_div = gbox.find('div', attrs={'class': 'gallerytext'})
        thumb_div = gbox.find('div', attrs={'class': 'thumb'})
        # Check the wrappers before dereferencing them; a box that lacks
        # either div is skipped instead of raising AttributeError.
        gallerytext = text_div.p if text_div is not None else None
        thumb = thumb_div.img if thumb_div is not None else None
        srcsets = gbox.find('img', attrs={'srcset': True})
        if gallerytext is None or thumb is None or srcsets is None:
            continue

        # Remove the alternative-language description of the sign.
        if gallerytext.b:
            gallerytext.b.decompose()
        desc = gallerytext.get_text().strip()

        # srcset is "url descriptor, url descriptor, ..."; the second-to-last
        # token is the URL of the last (largest) candidate. A srcset holding
        # only a bare URL has a single token.
        tokens = srcsets['srcset'].split()
        if not tokens:
            continue
        url = tokens[-2] if len(tokens) > 1 else tokens[0]

        if not url.startswith(('https:', 'http:')):
            # Protocol-relative URL ("//host/..."): give it a scheme.
            url = 'https:' + url
        # Request the image at the specified resolution.
        url = re.sub(r"\d+px", resolution, url)
        sign_templates.append(SignTemplate(desc, url, sign_path))
    return sign_templates
def filter_templates(categories, sign_templates):
    """Pick, for each category, the sign template that matches it best.

    For every category a TF-IDF model is fitted on the category text plus all
    template descriptions, and the template whose description has the highest
    cosine similarity to the category is selected. The same template may be
    selected for more than one category.

    Args:
        categories: Category strings to match.
        sign_templates: Candidate templates; each must expose ``description``.

    Returns:
        A list with one template per category, in category order. The list is
        empty when either input is empty, because there is nothing to match.
    """
    if not categories or not sign_templates:
        return []

    # The descriptions do not change between categories; build them once.
    descriptions = [s.description for s in sign_templates]
    vectorizer = TfidfVectorizer()
    final_templates = []
    for cat in categories:
        # Row 0 is the category; the remaining rows are the descriptions.
        embeddings = vectorizer.fit_transform([cat, *descriptions])
        scores = cosine_similarity(embeddings[0], embeddings[1:]).flatten()
        final_templates.append(sign_templates[int(np.argmax(scores))])
    return final_templates
if __name__ == "__main__":
    # Local import: only the script entry point launches wget.
    import subprocess

    args = parser.parse_args()

    # Fail early, before the output directory is wiped, if wget is unavailable.
    if shutil.which('wget') is None:
        print('Error: wget was not found on the PATH')
        sys.exit(1)

    # Start every run from an empty output directory.
    if os.path.exists(args.out_dir):
        shutil.rmtree(args.out_dir)
    os.makedirs(args.out_dir)

    # Get the list of sign categories to download.
    categories = []
    if args.categories_file is not None:
        with open(args.categories_file, 'r') as f:
            categories = f.read().splitlines()

    # Submit a GET request to the wiki page.
    headers = {'User-Agent': 'SignScraper/0.0 ([email protected])'}  # Prevents being denied access
    r = requests.get(args.url, headers=headers, timeout=30)
    if r.status_code != 200:
        print(f'Error: {r.status_code}')
        sys.exit(1)

    # Create a list of sign templates by extracting descriptions and urls of
    # sign images from the HTML.
    sign_templates = initialise_sign_templates(r.text, args.resolution, args.out_dir)

    # Filter the list of sign templates by the supplied categories using the
    # TF-IDF algorithm.
    if categories:
        sign_templates = filter_templates(categories, sign_templates)

    # Record which numbered image belongs to which category. The file is
    # written in one go inside a `with` block so the handle is always closed.
    if args.categories_file is not None:
        with open(os.path.join(args.out_dir, 'categories.txt'), 'w') as cats_file:
            cats_file.writelines(
                f'{i}:{cat}\n'
                for i, (cat, _) in enumerate(zip(categories, sign_templates), start=1)
            )

    # Download the sign templates.
    for i, sign_template in enumerate(sign_templates, start=1):
        extension = '.' + sign_template.url.split('.')[-1]
        os.makedirs(sign_template.path, exist_ok=True)
        target = os.path.join(sign_template.path, f'{i}{extension}')
        # Argument list instead of a shell string: section-derived folder
        # names and URLs may contain characters the shell would interpret.
        subprocess.run(['wget', '-O', target, sign_template.url], check=False)
        time.sleep(0.45)  # Prevents cascading errors which corrupt the images