|
| 1 | +""" |
| 2 | +SPDX-License-Identifier: MIT |
| 3 | +derived from favicon.py by Scott Werner |
| 4 | +https://github.com/scottwernervt/favicon/tree/123e431f53b2c4903b540246a85db0b1633d4786 |
| 5 | +""" |
| 6 | + |
| 7 | +import re |
| 8 | +from collections import defaultdict, namedtuple |
| 9 | +from html.parser import HTMLParser |
| 10 | +from pathlib import Path |
| 11 | +from typing import Any |
| 12 | + |
| 13 | +import httpx |
| 14 | + |
# Values of a <link> tag's ``rel`` attribute that are treated as favicons.
LINK_RELS = ["icon", "shortcut icon"]

# Finds a "<width>x<height>" pair (2-4 digits each), e.g. in "icon-32x32.png".
SIZE_RE = re.compile(r"(?P<width>\d{2,4})x(?P<height>\d{2,4})", re.IGNORECASE)

# One discovered favicon: its URL, pixel dimensions (0 when unknown), the file
# extension used as format, and a label saying where the icon was found.
Icon = namedtuple("Icon", ("url", "width", "height", "format", "src"))
| 23 | + |
| 24 | + |
def get(client: httpx.Client) -> list[Icon]:
    """Collect the favicons advertised by the page at the client's base URL.

    The page is requested with an empty relative URL, an HTTP error status is
    raised, and the client's ``base_url`` is then replaced by the response's
    URL so that the ``favicon.ico`` probe is made relative to it.

    :param client: httpx client pointing at the site to inspect. Its
        ``base_url`` is overwritten as a side effect.
    :return: Icons declared in ``<link>`` tags plus the ``favicon.ico``
        fallback when its URL is not already listed; one entry per URL.
    :raises httpx.HTTPStatusError: If the page request answers with an
        error status.
    """
    response = client.get("")
    response.raise_for_status()
    client.base_url = response.url

    # Keyed by URL so the same icon is only reported once.
    icons = {icon.url: icon for icon in tags(response.text)}

    fallback_icon = fallback(client)
    # De-duplicate on the URL (the dict key). Comparing ``src`` here would
    # never match, letting the fallback overwrite a tag-declared icon.
    if fallback_icon is not None and fallback_icon.url not in icons:
        icons[fallback_icon.url] = fallback_icon

    # TODO: sort by width + height (largest first) once sizes are populated.
    return list(icons.values())
| 39 | + |
| 40 | + |
def fallback(client: httpx.Client) -> Icon | None:
    """Probe for ``favicon.ico`` relative to the client's base URL.

    :param client: httpx client used to send the ``HEAD`` request.
    :return: An ``Icon`` with zero dimensions, format ``".ico"`` and
        ``src="default"`` when the request answers 200 with a content type
        starting with ``image``; otherwise ``None``.
    """
    response = client.head("favicon.ico")
    if response.status_code != 200:
        return None

    # A response may omit Content-Type altogether; treat that as "not an
    # image" instead of raising KeyError.
    content_type = response.headers.get("Content-Type", "")
    if not content_type.startswith("image"):
        return None

    return Icon(response.url, 0, 0, ".ico", "default")
| 48 | + |
| 49 | + |
class LinkRelParser(HTMLParser):
    """HTML parser that records the targets of favicon ``<link>`` tags.

    After markup has been fed, ``icons`` maps each matched ``rel`` value
    (normalised to lower case with single spaces) to the set of targets seen.
    """

    def __init__(self) -> None:
        super().__init__()
        # rel value -> raw href/content strings collected for that rel
        self.icons: dict[str, set[str]] = defaultdict(set)

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Record the target of a ``<link>`` whose ``rel`` is in ``LINK_RELS``.

        :param tag: Tag name as delivered by the parser.
        :param attrs: ``(name, value)`` pairs of the tag's attributes.
        """
        if tag != "link":
            return

        data = dict(attrs)
        # Link types are case-insensitive and space-separated in HTML, and the
        # parser leaves attribute values as written, so normalise the value
        # before comparing ("Shortcut  Icon" -> "shortcut icon").
        rel = " ".join((data.get("rel") or "").lower().split())
        if rel not in LINK_RELS:
            return

        href = data.get("href") or data.get("content")
        if href:
            # TODO replace with data
            self.icons[rel].add(href)
| 62 | + |
| 63 | + |
def tags(html: str) -> set[Icon]:
    """Extract favicon candidates from the ``<head>`` of an HTML document.

    :param html: Markup of the page.
    :return: One ``Icon`` per distinct href. ``url`` is the href parsed as an
        ``httpx.URL`` exactly as written (it is not resolved against any base
        URL), width and height are 0, ``format`` is the lower-cased suffix of
        the URL path and ``src`` is the placeholder ``"TODO"``.
    """
    parser = LinkRelParser()

    # Only the head is parsed. When there is no closing tag, feed the whole
    # document: slicing with ``str.find``'s -1 would drop the last character
    # and could cut the final tag in half.
    head_end = re.search(r"</head\s*>", html, flags=re.IGNORECASE)
    parser.feed(html[: head_end.start()] if head_end else html)

    hrefs = {link.strip() for links in parser.icons.values() for link in links}

    icons = set()
    for href in hrefs:
        # Inline data URIs carry the image itself, not a fetchable location.
        if not href or href.startswith("data:image/"):
            continue

        try:
            href_parsed = httpx.URL(href)
        except httpx.InvalidURL:
            # One malformed href should not abort discovery of the others.
            continue

        # Sizes are not known here: the parser keeps hrefs only, so
        # dimensions() cannot be applied yet.
        width, height = 0, 0
        ext = Path(href_parsed.path).suffix

        icons.add(Icon(href_parsed, width, height, ext.lower(), "TODO"))

    return icons
| 91 | + |
| 92 | + |
def dimensions(tag: Any) -> tuple[int, int]:
    """Get icon dimensions from the ``sizes`` attribute or the icon filename.

    :param tag: Mapping-like object whose ``get`` returns the attributes
        ``sizes``, ``href`` and ``content`` (a plain ``dict`` works).
    :return: Width and height of the largest size listed in ``sizes``; if
        none is usable, the size embedded in the filename; else ``(0, 0)``.
    """
    candidates: list[tuple[int, int]] = []

    sizes = (tag.get("sizes") or "").strip()
    if sizes and sizes.lower() != "any":
        for token in sizes.split():  # '16x16 32x32 64x64'
            parts = re.split(r"[x\xd7]", token, flags=re.IGNORECASE)
            if len(parts) != 2:
                continue
            # repair bad html attribute values: sizes='192x192+'
            digits = [
                "".join(c for c in part if c.isascii() and c.isdigit())
                for part in parts
            ]
            if all(digits):
                candidates.append((int(digits[0]), int(digits[1])))

    if candidates:
        # Compare numerically: sorting the raw strings would rank '32x32'
        # above '128x128'.
        return max(candidates, key=lambda wh: wh[0] * wh[1])

    # No usable ``sizes`` value: look for '<w>x<h>' in the file name instead.
    filename = tag.get("href") or tag.get("content") or ""
    match = SIZE_RE.search(filename)
    if match is None:
        return 0, 0
    return int(match.group("width")), int(match.group("height"))
0 commit comments