|
4 | 4 | from bs4 import BeautifulSoup |
5 | 5 |
|
6 | 6 | from http_request_randomizer.requests.parsers.UrlParser import UrlParser |
| 7 | +from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel |
7 | 8 |
|
8 | 9 | logger = logging.getLogger(__name__) |
9 | 10 | __author__ = 'pgaref' |
10 | 11 |
|
11 | 12 |
|
class RebroWeeblyParser(UrlParser):
    """Proxy-list parser for the rebro.weebly provider.

    Scrapes the provider's HTML "top" proxy page — and optionally its large
    (~15k node) plain-text dump — and converts each advertised ``ip:port``
    entry into a ProxyObject.
    """

    def __init__(self, id, web_url, timeout=None):
        # NOTE(review): `id` shadows the builtin, but mirrors the UrlParser
        # constructor signature, so it is kept for interface consistency.
        self.top_proxy_path = "proxy-list.html"   # HTML page with parallel columns
        self.txt_proxy_path = "txt-lists.html"    # index page linking the text dumps
        UrlParser.__init__(self, id, web_url, timeout)

    def parse_proxyList(self, use_top15k=False):
        """Fetch and parse the provider pages into a list of ProxyObject.

        :param use_top15k: also fetch the ~15k-node text list (usually stale).
        :return: list of ProxyObject; empty list when the provider is unreachable.
        """
        curr_proxy_list = []
        response = requests.get(self.get_url() + "/" + self.top_proxy_path,
                                timeout=self.timeout)

        if not response.ok:
            # logging.Logger.warn is deprecated; use warning().
            logger.warning("Proxy Provider url failed: {}".format(self.get_url()))
            return []

        content = response.content
        soup = BeautifulSoup(content, "html.parser")
        all_divs = soup.findAll("div", attrs={"class": "paragraph",
                                              'style': "text-align:left;"})
        # The page lays out addresses, countries and anonymity levels as
        # parallel columns in separate divs; collect each column, then zip
        # them back together row by row.
        address_list = []
        country_list = []
        anonymity_list = []
        for div in all_divs:
            address_div = div.find('font', attrs={'color': '#33a27f'})
            if address_div is not None:
                for row in [x for x in address_div.contents
                            if getattr(x, 'name', None) != 'br']:
                    address_list.append(str(row))
            curr_div = div.findAll('font', attrs={'size': '2'})
            if curr_div[0] is not None:
                row_data = []
                # Column title is nested: font -> strong -> font
                title = curr_div[0].contents[0].contents[0].contents[0]
                for row in [x for x in curr_div[-1].contents
                            if getattr(x, 'name', None) != 'br']:
                    row_data.append(str(row))
                if 'Country' in str(title):
                    country_list.extend(row_data)
                if 'Status' in str(title):
                    anonymity_list.extend(row_data)
        for address, country, anonymity in zip(address_list, country_list,
                                               anonymity_list):
            # Make sure it is a Valid Proxy Address
            proxy_obj = self.create_proxy_object(address, country, anonymity)
            if proxy_obj is not None and UrlParser.valid_ip_port(proxy_obj.get_address()):
                curr_proxy_list.append(proxy_obj)
            else:
                # BUGFIX: previously logged `row`, a stale variable from the
                # scraping loops above (NameError when those loops were empty).
                logger.debug("Proxy Invalid: {}".format(address))
        # Usually these proxies are stale
        if use_top15k:
            # Parse 15k Nodes Text file (named *-all-*.txt)
            content = requests.get(self.get_url() + "/" + self.txt_proxy_path).content
            soup = BeautifulSoup(content, "html.parser")
            table = soup.find("div", attrs={"class": "wsite-multicol-table-wrap"})
            for link in table.findAll('a'):
                current_link = link.get('href')
                if current_link is not None and "all" in current_link:
                    self.txt_proxy_path = current_link
            more_content = requests.get(self.get_url() + self.txt_proxy_path).text
            for proxy_address in more_content.split():
                if UrlParser.valid_ip_port(proxy_address):
                    # BUGFIX: was `self.create_proxy_object(row)` -- `row` is
                    # undefined here and the call was missing two required
                    # arguments, so this branch always raised at runtime.
                    # The text dump carries no country/anonymity metadata,
                    # so rely on the method's defaults.
                    proxy_obj = self.create_proxy_object(proxy_address)
                    if proxy_obj is not None:
                        curr_proxy_list.append(proxy_obj)
        return curr_proxy_list

    def create_proxy_object(self, address, country="Unknown", anonymity="unknown"):
        """Build a ProxyObject from an ``ip:port`` string.

        :param address: proxy address, expected as ``ip:port``.
        :param country: country label scraped from the page; the default is
            used for the metadata-free text dump.
        :param anonymity: anonymity label, mapped via AnonymityLevel.get —
            presumably unrecognised labels map to an "unknown" level, TODO
            confirm against AnonymityLevel.
        :return: ProxyObject, or None when the address is malformed.
        """
        parts = address.strip().split(":")
        # BUGFIX: an address without a port previously raised IndexError on
        # the `[1]` access below; reject it explicitly instead.
        if len(parts) != 2:
            logger.debug("Address with Invalid format: {}".format(address))
            return None
        ip, port = parts
        # Make sure it is a Valid IP
        if not UrlParser.valid_ip(ip):
            logger.debug("IP with Invalid format: {}".format(ip))
            return None
        country = country.strip()
        anonymity = AnonymityLevel.get(anonymity.strip())

        return ProxyObject(source=self.id, ip=ip, port=port,
                           anonymity_level=anonymity, country=country)

    def __str__(self):
        return "RebroWeebly Parser of '{0}' with required bandwidth: '{1}' KBs" \
            .format(self.url, self.minimum_bandwidth_in_KBs)
0 commit comments