
Commit 0dac313: first commit (0 parents)

5 files changed: +82404 -0 lines changed

README.md

+1
@@ -0,0 +1 @@
# ImageNet-datasets-downloader

downloader.py

+171
@@ -0,0 +1,171 @@
import os
import json
import argparse

import numpy as np
import requests

from requests.exceptions import ConnectionError, ReadTimeout, TooManyRedirects

parser = argparse.ArgumentParser(description='ImageNet image scraper')
parser.add_argument('-scrape_only_flickr', default=True, type=lambda x: (str(x).lower() == 'true'))
parser.add_argument('-number_of_classes', default=10, type=int)
parser.add_argument('-images_per_class', default=10, type=int)
parser.add_argument('-data_root', default='', type=str)
parser.add_argument('-use_class_list', default=False, type=lambda x: (str(x).lower() == 'true'))
parser.add_argument('-class_list', default=[], nargs='*')

args, args_other = parser.parse_known_args()

if len(args.data_root) == 0:
    print("-data_root is required to run downloader!")
    exit()

if not os.path.isdir(args.data_root):
    print(f'folder {args.data_root} does not exist! please provide an existing folder in the -data_root arg!')
    exit()

# ImageNet API endpoint that returns the list of image URLs for a given WordNet ID.
IMAGENET_API_WNID_TO_URLS = lambda wnid: f'http://www.image-net.org/api/text/imagenet.synset.geturls?wnid={wnid}'

# Per-class URL statistics produced by prepare_stats.py.
class_info_json_filename = 'imagenet_class_info.json'
class_info_json_filepath = os.path.join(args.data_root, class_info_json_filename)

class_info_dict = dict()

with open(class_info_json_filepath) as class_info_json_f:
    class_info_dict = json.load(class_info_json_f)

classes_to_scrape = []

if args.use_class_list:
    # Scrape exactly the classes the user asked for, skipping unknown
    # WNIDs instead of failing later on a missing dictionary key.
    for item in args.class_list:
        if item not in class_info_dict:
            print(f'Class {item} not found in ImageNet')
            continue
        classes_to_scrape.append(item)

else:
    # Build a pool of classes that have enough URLs to plausibly yield
    # images_per_class downloads (assuming roughly 80% of URLs are still
    # alive), then sample the requested number of classes from it.
    potential_class_pool = []

    for key, val in class_info_dict.items():

        if args.scrape_only_flickr:
            if int(val['flickr_img_url_count']) * 0.8 > args.images_per_class:
                potential_class_pool.append(key)
        else:
            if int(val['img_url_count']) * 0.8 > args.images_per_class:
                potential_class_pool.append(key)

    # Sample without replacement so the same class is not picked twice.
    picked_classes_idxes = np.random.choice(len(potential_class_pool), args.number_of_classes, replace=False)

    for idx in picked_classes_idxes:
        classes_to_scrape.append(potential_class_pool[idx])

print("Picked the following classes:")
for class_wnid in classes_to_scrape:
    print(class_wnid)
    print(class_info_dict[class_wnid])
    print(class_info_dict[class_wnid]['class_name'])

imagenet_images_folder = os.path.join(args.data_root, 'imagenet_images')
if not os.path.isdir(imagenet_images_folder):
    os.mkdir(imagenet_images_folder)

# Download statistics, split by URL origin.
img_url_counts = dict(
    all=dict(tried=0, success=0),
    is_flickr=dict(tried=0, success=0),
    not_flickr=dict(tried=0, success=0),
)

for class_wnid in classes_to_scrape:

    class_images = 0

    class_name = class_info_dict[class_wnid]["class_name"]
    print(f'Scraping images for class "{class_name}"')
    url_urls = IMAGENET_API_WNID_TO_URLS(class_wnid)

    resp = requests.get(url_urls)

    class_folder = os.path.join(imagenet_images_folder, class_name)
    if not os.path.exists(class_folder):
        os.mkdir(class_folder)

    for img_url in resp.content.splitlines():

        if 'flickr' in img_url.decode('utf-8'):
            cls = 'is_flickr'
        else:
            cls = 'not_flickr'
            if args.scrape_only_flickr:
                # Skip non-flickr URLs when only flickr scraping is requested.
                continue

        print(img_url)
        img_url_counts[cls]['tried'] += 1
        img_url_counts['all']['tried'] += 1

        try:
            img_resp = requests.get(img_url.decode('utf-8'), timeout=1)
        except ConnectionError:
            print("Connection Error")
            continue
        except ReadTimeout:
            print("Read Timeout")
            continue
        except TooManyRedirects:
            print("Too many redirects")
            continue

        # Dead links often answer with an HTML error page instead of an image.
        if 'content-type' not in img_resp.headers:
            continue

        if 'image' not in img_resp.headers['content-type']:
            print("Not an image:")
            continue

        # Tiny responses are usually placeholders, not real photos.
        if len(img_resp.content) < 1000:
            print("Img too small")
            continue

        print(img_resp.headers['content-type'])
        print(len(img_resp.content))

        img_name = img_url.decode('utf-8').split('/')[-1]
        img_file_path = os.path.join(class_folder, img_name)

        print(f'Saving image in {img_file_path}')

        with open(img_file_path, 'wb') as img_f:
            img_f.write(img_resp.content)

        class_images += 1
        img_url_counts[cls]['success'] += 1
        img_url_counts['all']['success'] += 1

        # Progress report: success rates per URL origin so far.
        print(f'Tried counts {img_url_counts}')
        if img_url_counts["is_flickr"]["tried"] > 0:
            print(f'{100.0 * img_url_counts["is_flickr"]["success"]/img_url_counts["is_flickr"]["tried"]}% success rate for flickr urls')
        if img_url_counts["not_flickr"]["tried"] > 0:
            print(f'{100.0 * img_url_counts["not_flickr"]["success"]/img_url_counts["not_flickr"]["tried"]}% success rate for other urls')
        if img_url_counts["all"]["tried"] > 0:
            print(f'{100.0 * img_url_counts["all"]["success"]/img_url_counts["all"]["tried"]}% success rate for all urls')

        if class_images == args.images_per_class:
            break
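
For reference, a minimal invocation might look like the line below; the data root is illustrative, and the script assumes imagenet_class_info.json (produced by prepare_stats.py further down) is already present in that folder:

python downloader.py -data_root ./data -number_of_classes 5 -images_per_class 20

Because -scrape_only_flickr defaults to true, this run pulls only flickr-hosted URLs; pass -scrape_only_flickr false to try the other hosts as well.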

imagenet_class_info.json

+1
Large diffs are not rendered by default.
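
The file itself is too large for the diff view, but its shape follows from prepare_stats.py below: one entry per WordNet ID, holding the URL counts and a short class name. An illustrative entry (the WNID and numbers here are hypothetical, not taken from the real file):

{"n02084071": {"img_url_count": 1450, "flickr_img_url_count": 980, "class_name": "dog"}}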

prepare_stats.py

+116
@@ -0,0 +1,116 @@
import os
import csv
import codecs
import json

import requests
import matplotlib.pyplot as plt

DATA_ROOT = '/Users/martinsf/ai/deep_learning_projects/data'
URL_WORDNET = 'http://image-net.org/archive/words.txt'
IMAGENET_API_WNID_TO_URLS = lambda wnid: f'http://www.image-net.org/api/text/imagenet.synset.geturls?wnid={wnid}'

# Download the WordNet ID -> class name mapping if it is not cached locally.
wordnet_filename = URL_WORDNET.split('/')[-1]
wordnet_file_path = os.path.join(DATA_ROOT, wordnet_filename)
print(wordnet_file_path)
if not os.path.exists(wordnet_file_path):

    print(f'Downloading {URL_WORDNET}')
    resp = requests.get(URL_WORDNET)

    with open(wordnet_file_path, "wb") as file:
        file.write(resp.content)

# Downloaded from http://image-net.org/imagenet_data/urls/imagenet_fall11_urls.tgz
url_list_filepath = '/Users/martinsf/ai/datasets/imagenet/fall11_urls.txt'
img_url_dict = dict()

total_urls = 0
flickr_urls = 0

# Go through the URL list and count urls and flickr_urls per class.
with codecs.open(url_list_filepath, 'r', encoding='utf-8', errors='ignore') as f:
    it = 0
    for line in f:
        it += 1
        if it % 10000 == 0:
            print(it)
        row = line.split('\t')

        if len(row) != 2:
            continue
        wnid = row[0].split('_')[0]
        url = row[1]

        if wnid not in img_url_dict:
            img_url_dict[wnid] = dict(urls=0, flickr_urls=0)

        img_url_dict[wnid]['urls'] += 1
        total_urls += 1
        if 'flickr' in url:
            flickr_urls += 1
            img_url_dict[wnid]['flickr_urls'] += 1

# Map every WordNet ID to its comma-separated keywords.
wnid_to_class_dict = dict()
with open(wordnet_file_path, "r") as word_list_file:

    csv_reader_word_list = csv.reader(word_list_file, delimiter='\t')

    for row in csv_reader_word_list:
        wnid = row[0]
        keywords = row[1]
        wnid_to_class_dict[wnid] = keywords

class_info_json_filename = 'imagenet_class_info.json'
class_info_json_filepath = os.path.join(DATA_ROOT, class_info_json_filename)

total_url_counts = []
flickr_url_counts = []

class_info_dict = dict()

# Assemble the per-class record: URL counts plus the first (canonical)
# keyword as the class name.
for key, val in img_url_dict.items():
    class_info_dict[key] = dict(
        img_url_count=val['urls'],
        flickr_img_url_count=val['flickr_urls'],
        class_name=wnid_to_class_dict[key].split(',')[0],
    )
    print(f'{wnid_to_class_dict[key]} {val["urls"]}')
    total_url_counts.append(val['urls'])
    flickr_url_counts.append(val['flickr_urls'])

with open(class_info_json_filepath, "w") as class_info_json_f:
    json.dump(class_info_dict, class_info_json_f)

print(f'In total there are {total_urls} img urls and {flickr_urls} flickr urls')

# Histograms of URL counts per class: all urls, flickr urls, and a reverse
# cumulative view of the flickr counts.
plt.style.use('seaborn')
fig, axs = plt.subplots(3, 1)

plt.subplots_adjust(hspace=0.5)

axs[0].hist(total_url_counts, range=(500, 2000), bins=50, rwidth=0.8)
axs[0].set_title('All ImageNet urls')
axs[0].set_xticks([x for x in range(500, 2000, 150)])
axs[0].set_xlabel("Images per class")
axs[0].set_ylabel("Number of classes")

axs[1].set_title('Flickr ImageNet urls')
axs[1].hist(flickr_url_counts, range=(500, 2000), bins=50, rwidth=0.8)
axs[1].set_xticks([x for x in range(500, 2000, 150)])
axs[1].set_xlabel("Images per class")
axs[1].set_ylabel("Number of classes")

axs[2].set_title('Flickr ImageNet urls (reverse cumulative)')
axs[2].hist(flickr_url_counts, range=(500, 2000), bins=50, rwidth=0.8, cumulative=-1)
axs[2].set_xticks([x for x in range(500, 2000, 150)])
axs[2].set_xlabel("Images per class")
axs[2].set_ylabel("Number of classes")

plt.show()
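
prepare_stats.py is meant to run once, before downloader.py: it counts URLs per class from the fall11_urls.txt dump and writes the imagenet_class_info.json that the downloader's class sampling relies on. A quick sanity check after a run is to load the file back and inspect a few entries; this sketch assumes the same hardcoded DATA_ROOT as above:

import os
import json

DATA_ROOT = '/Users/martinsf/ai/deep_learning_projects/data'  # same path as in prepare_stats.py

with open(os.path.join(DATA_ROOT, 'imagenet_class_info.json')) as f:
    class_info = json.load(f)

# Number of classes that had at least one URL in the dump.
print(len(class_info))

# Peek at a few entries to confirm the expected keys are present.
for wnid, info in list(class_info.items())[:3]:
    print(wnid, info['class_name'], info['img_url_count'], info['flickr_img_url_count'])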
