##
## Crawls apks from https://www.androiddrawer.com/
##

import logging
import os
from datetime import datetime

import scrapy
| 10 | + |
class ApkSpider(scrapy.Spider):
    """Crawl .apk files from androiddrawer.com into per-category folders.

    Flow: start page -> category pages -> app pages -> apk download.
    The category name travels through ``request.meta['foldername']`` and the
    app title through ``request.meta['filename']`` so ``save_apk`` knows
    where to write each file.
    """

    name = 'apk_spider'
    start_urls = ['https://www.androiddrawer.com/']

    # Set up a dedicated logger writing to both a timestamped log file and stderr.
    logger = logging.getLogger('crawl-logger')
    formatter = logging.Formatter('%(levelname)-5s [%(asctime)s] %(message)s')
    fileHandler = logging.FileHandler('apkcrawler_' + datetime.now().strftime("%Y_%m_%d__%H_%M_%S") + '.log', mode='w')
    fileHandler.setFormatter(formatter)
    streamHandler = logging.StreamHandler()
    streamHandler.setFormatter(formatter)

    logger.setLevel(logging.DEBUG)
    logger.addHandler(fileHandler)
    logger.addHandler(streamHandler)

    logger.info('Starting to crawl...')

    # NOTE(review): this runs at class-definition (import) time and changes the
    # process-wide working directory — every later relative path (including the
    # log file opened above, which is created first) is affected. Kept as-is to
    # preserve the existing on-disk layout.
    if not os.path.exists('apks'):
        os.mkdir('apks')
    os.chdir('apks')

    def parse(self, response):
        """Parse the start page: follow every category link.

        Tags each request with the category's display text as
        ``meta['foldername']`` so downloads land in a matching folder.
        """
        for category in response.css('#categoriesContainer li'):
            category_href = category.css('a ::attr(href)').extract_first()
            if category_href:
                request = scrapy.Request(
                    url=response.urljoin(category_href),
                    callback=self.parse_category
                )
                request.meta['foldername'] = category.css('a ::text').extract_first()
                yield request

    def parse_category(self, response):
        """Parse one category page; follow each app's detail page.

        ``response.meta`` (carrying ``foldername``) is forwarded unchanged.
        """
        for app in response.css('a.box-click-target.animate'):
            app_href = app.css('a ::attr(href)').extract_first()
            if app_href:
                yield scrapy.Request(
                    url=response.urljoin(app_href),
                    callback=self.parse_app,
                    meta=response.meta
                )

    def parse_app(self, response):
        """Parse an app page; request the .apk download if it is <= 50 (MB).

        Skips pages with no size label instead of crashing (the original
        would raise ``TypeError`` on ``None[:-3]``).
        """
        download_btn = response.css('a.download-btn.animate')
        size_text = download_btn.css('.download-size ::text').extract_first()
        if not size_text:
            # Robustness: some pages lack a size label; skip rather than crash.
            return
        # Assumes the label ends in a 3-char unit suffix like ' MB'
        # (e.g. '12.3 MB') — TODO confirm behavior for GB-sized apps.
        size = int(float(size_text[:-3]))
        if size <= 50:
            href = download_btn.css('a ::attr(href)').extract_first()
            request = scrapy.Request(
                url=response.urljoin(href),
                callback=self.save_apk,
                meta=response.meta
            )
            request.meta['filename'] = response.css('h1.entry-title.single-title ::text').extract_first()
            yield request

    def save_apk(self, response):
        """Write the downloaded .apk to ``<foldername>/<filename>.apk``.

        Responses smaller than 1 MiB are treated as failed downloads and
        logged instead of saved. ``':'`` is stripped from titles so the
        filename is safe on more filesystems.
        """
        logger = logging.getLogger('crawl-logger')

        foldername = response.meta['foldername']
        filename = response.meta['filename'].replace(':', ' ') + '.apk'
        path = os.path.join(foldername, filename)

        # Original check (`len // 1 MiB == 0 or len == 0`) reduces to: body < 1 MiB
        # (the `len == 0` disjunct was redundant — 0 // anything is already 0).
        if len(response.body) < (1 << 20):
            logger.debug("Couldn't download {} correctly :( Length: {}".format(path, len(response.body)))
            return

        # exist_ok avoids a race when two downloads in the same category
        # finish at the same time (mkdir would raise FileExistsError).
        os.makedirs(foldername, exist_ok=True)

        with open(path, 'wb') as f:
            f.write(response.body)

        logger.info(path + ' Length: ' + str(len(response.body)))