Skip to content

Commit b526dc2

Browse files
committed
Init commit
0 parents  commit b526dc2

5 files changed

+178
-0
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
__pycache__/
2+
apks/
3+
tmp/

androguard_test.py

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from androguard.misc import AnalyzeAPK
2+

apkcrawler.py

+91
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
##
2+
## Crawls apks from https://www.androiddrawer.com/
3+
##
4+
5+
import scrapy
6+
import os
7+
import logging
8+
from datetime import datetime
9+
10+
11+
class ApkSpider(scrapy.Spider):
    """Scrapy spider that crawls https://www.androiddrawer.com/ and saves
    every .apk of at most 50 MB under ./apks/<category>/<app name>.apk.
    """

    name = 'apk_spider'
    start_urls = ['https://www.androiddrawer.com/']

    # Setup logger.  NOTE: this runs once, at class-definition time, as in
    # the original — the log file name is stamped with the import moment.
    logger = logging.getLogger('crawl-logger')
    formatter = logging.Formatter('%(levelname)-5s [%(asctime)s] %(message)s')
    fileHandler = logging.FileHandler('apkcrawler_' + datetime.now().strftime("%Y_%m_%d__%H_%M_%S") + '.log', mode='w')
    fileHandler.setFormatter(formatter)
    streamHandler = logging.StreamHandler()
    streamHandler.setFormatter(formatter)

    logger.setLevel(logging.DEBUG)
    logger.addHandler(fileHandler)
    logger.addHandler(streamHandler)

    logger.info('Starting to crawl...')

    # All downloads land under ./apks; chdir so save_apk() can use
    # category-relative paths.
    if not os.path.exists('apks'):
        os.mkdir('apks')
    os.chdir('apks')

    # Parses the whole category list
    def parse(self, response):
        """Yield one request per category link on the front page; the
        category title travels along in request.meta['foldername']."""
        for category in response.css('#categoriesContainer li'):
            category_href = category.css('a ::attr(href)').extract_first()
            if category_href:
                request = scrapy.Request(
                    url=response.urljoin(category_href),
                    callback=self.parse_category
                )
                foldername = category.css('a ::text').extract_first()
                request.meta['foldername'] = foldername
                yield request

    # Parses one category
    def parse_category(self, response):
        """Yield one request per app tile, forwarding the category meta."""
        for app in response.css('a.box-click-target.animate'):
            app_href = app.css('a ::attr(href)').extract_first()
            if app_href:
                yield scrapy.Request(
                    url=response.urljoin(app_href),
                    callback=self.parse_app,
                    meta=response.meta
                )

    # Parses app page
    def parse_app(self, response):
        """Request the .apk download when the advertised size is <= 50 MB.

        The app title is stored in request.meta['filename'] for save_apk().
        """
        download_btn = response.css('a.download-btn.animate')
        size_text = download_btn.css('.download-size ::text').extract_first()
        if not size_text:
            # No size badge on the page: skip instead of crashing on None
            # (the original did size[:-3] unconditionally).
            return
        # Size text looks like '12.3 MB' — drop the trailing ' MB' unit.
        size = int(float(size_text[:-3]))
        if size <= 50:
            href = download_btn.css('a ::attr(href)').extract_first()
            request = scrapy.Request(
                url=response.urljoin(href),
                callback=self.save_apk,
                meta=response.meta
            )
            filename = response.css('h1.entry-title.single-title ::text').extract_first()
            request.meta['filename'] = filename
            yield request

    # Saves .apk to the category folder
    def save_apk(self, response):
        """Write the downloaded body to <category>/<title>.apk.

        Bodies under 1 MiB are treated as failed/placeholder downloads and
        only logged (same threshold as the original's // (1 << 20) check).
        """
        logger = logging.getLogger('crawl-logger')

        foldername = response.meta['foldername']
        filename = response.meta['filename'].replace(':', ' ') + '.apk'
        path = os.path.join(foldername, filename)

        # len // (1 << 20) == 0  <=>  len < 1 MiB; the original's extra
        # "or len == 0" clause was redundant and has been dropped.
        if len(response.body) < (1 << 20):
            logger.debug("Couldn't download {} correctly :( Length: {}".format(path, len(response.body)))
            return

        # exist_ok avoids the check-then-create race between concurrent
        # downloads in the same category.
        os.makedirs(foldername, exist_ok=True)

        with open(path, 'wb') as f:
            f.write(response.body)

        logger.info(path + ' Length: ' + str(len(response.body)))

common.py

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# from __future__ import division
2+
3+
import sys
4+
import os
5+
import argparse
6+
7+
8+
# Prints text progress bar
def update_progress(current, total):
    """Redraw a one-line text progress bar on stdout.

    current -- zero-based index of the item just finished
    total   -- total number of items; non-positive totals are ignored
               (the original raised ZeroDivisionError for total == 0)
    """
    if total <= 0:
        return
    amtDone = (current + 1) / total
    sys.stdout.write("\rProgress: [{0:50s}] {1:.1f}%".format('#' * int(amtDone * 50), amtDone * 100))
    # The line is \r-terminated (no newline), so flush or it may never show.
    sys.stdout.flush()
12+
13+
14+
# Gets all the files from a given path
def get_files_paths(path):
    """Recursively walk *path* and return the absolute path of every file."""
    collected = []
    for root, _subdirs, names in os.walk(path):
        abs_root = os.path.abspath(root)
        collected.extend(os.path.join(abs_root, name) for name in names)
    return collected
21+
22+
23+
# Checks if a path is an actual file
def is_file(filename):
    """argparse type-checker: absolutize *filename* and return it, raising
    argparse.ArgumentTypeError when it is not an existing regular file."""
    resolved = os.path.abspath(filename)
    if os.path.isfile(resolved):
        return resolved
    raise argparse.ArgumentTypeError("{0} is not a file".format(resolved))
31+
32+
33+
# Checks if a path is an actual directory
def is_dir(dirname):
    """argparse type-checker: absolutize *dirname* and return it, raising
    argparse.ArgumentTypeError when it is not an existing directory."""
    resolved = os.path.abspath(dirname)
    if not os.path.isdir(resolved):
        raise argparse.ArgumentTypeError("{0} is not a directory".format(resolved))
    return resolved

samples_picker.py

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import os
2+
import random
3+
import argparse
4+
from shutil import copyfile
5+
from common import *
6+
7+
8+
def main():
    """Copy a random selection of files from one directory tree to another.

    CLI: <src path> <dst path> <nsamples>.  Source/destination are validated
    by is_dir() from common; progress is shown via update_progress().
    Prints a message and returns early when fewer than <nsamples> files exist.
    """
    parser = argparse.ArgumentParser(description='Let\'s pick some samples for you.')
    parser.add_argument('<src path>', type=is_dir, help='Directory to pick samples from')
    parser.add_argument('<dst path>', type=is_dir, help='Directory to put samples to')
    parser.add_argument('<nsamples>', type=int, help='Number of samples to pick')
    args = vars(parser.parse_args())

    src_path = args['<src path>']
    dst_path = args['<dst path>']
    samples_num = args['<nsamples>']

    print(src_path, dst_path)

    files = get_files_paths(src_path)
    if len(files) < samples_num:
        print("Too many samples you want to pick! In total there are {} samples.".format(len(files)))
        return
    elif len(files) == samples_num:
        samples = files
    else:
        samples = random.sample(files, samples_num)

    print('Picking samples...')

    for idx, sample in enumerate(samples):
        # os.path.basename is portable; the original split('\\')[-1] only
        # stripped Windows separators, producing the full path on POSIX.
        dst_sample_path = os.path.join(dst_path, os.path.basename(sample))
        copyfile(sample, dst_sample_path)
        # enumerate replaces samples.index(sample), which was O(n) per
        # iteration and wrong in the presence of duplicate paths.
        update_progress(idx, len(samples))

    print('\nDone!')


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)