-
Notifications
You must be signed in to change notification settings - Fork 90
Port SitemapExtractor from CC-MRJob to CC-PySpark
#54
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 19 commits
70bdd93
92f30c6
fed0b2c
9bb3f41
458cb2e
ada1471
1f82d33
35de5ca
0bd0b4a
d17683f
41a3d0b
d50a39a
c163959
1ba160f
e2e9646
94a98d7
6a23d23
8339e6c
be9a046
0210cd8
cb9c4ca
2dd22dc
126b31b
e18e82f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,36 @@ | ||
# CI workflow: installs the project requirements and runs the pytest suite
# once per Python version listed in the matrix.
name: Python Unit Tests

# Triggers: pushes and pull requests targeting `main`, plus manual runs
# (workflow_dispatch).
on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  workflow_dispatch:

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.10', '3.11', '3.12', '3.13']
      # keep the remaining matrix jobs running when one Python version fails
      fail-fast: false

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      # base requirements first, then the PySpark pin kept in its own file
      - name: Install dependencies
        run: |
          pip install -r requirements.txt
          pip install -r requirements-pyspark.txt

      - name: Run tests
        run: |
          python -m pytest . -v
        env:
          # pyspark needs this to find the test files
          PYTHONPATH: test
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| pyspark==3.5.7 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,117 @@ | ||
| import json | ||
| import re | ||
| from typing import Optional | ||
| from urllib.parse import urlparse, urljoin | ||
|
|
||
| import validators | ||
|
||
| from py4j.protocol import Py4JError | ||
| from pyspark.sql.types import StructType, StructField, StringType, ArrayType | ||
| from warcio.recordloader import ArcWarcRecord | ||
|
|
||
| from sparkcc import CCSparkJob | ||
|
|
||
class SitemapExtractorJob(CCSparkJob):
    """Extract sitemap URLs (http://www.sitemaps.org/) from robots.txt WARC files."""

    name = "SitemapExtractor"

    output_schema = StructType([
        StructField('sitemap_url', StringType(), True),
        StructField('hosts', ArrayType(elementType=StringType()), True)
    ])

    # rb: match on raw bytes so we can defer utf-8 decoding to the `sitemap:` line
    sitemap_pattern = re.compile(rb'^sitemap:\s*(\S+)', re.I)

    # Job-specific counters. They stay None until init_accumulators() replaces
    # them with Spark accumulators.
    robots_txt_processed = None
    sitemap_urls_found = None
    sitemap_url_invalid_encoding = None
    robots_txt_announcing_sitemap = None
    robots_txt_with_more_than_50_sitemaps = None

    def init_accumulators(self, session):
        """Create the job-specific accumulators in addition to the inherited ones."""
        super(SitemapExtractorJob, self).init_accumulators(session)

        sc = session.sparkContext
        self.robots_txt_processed = sc.accumulator(0)
        self.sitemap_urls_found = sc.accumulator(0)
        self.sitemap_url_invalid_encoding = sc.accumulator(0)
        self.robots_txt_announcing_sitemap = sc.accumulator(0)
        self.robots_txt_with_more_than_50_sitemaps = sc.accumulator(0)

    def log_accumulators(self, session):
        """Log the job-specific accumulators after the inherited ones.

        Without this override the counters created in init_accumulators()
        are never reported once the job has finished.
        """
        super(SitemapExtractorJob, self).log_accumulators(session)

        # NOTE(review): relies on the base class providing
        # log_accumulator(session, accumulator, format_string), as used by
        # the sibling jobs (e.g. wat_extract_links.py) - confirm.
        self.log_accumulator(session, self.robots_txt_processed,
                             'robots.txt processed = {}')
        self.log_accumulator(session, self.sitemap_urls_found,
                             'sitemap URLs found = {}')
        self.log_accumulator(session, self.sitemap_url_invalid_encoding,
                             'sitemap URLs with invalid encoding = {}')
        self.log_accumulator(session, self.robots_txt_announcing_sitemap,
                             'robots.txt announcing at least one sitemap = {}')
        self.log_accumulator(session, self.robots_txt_with_more_than_50_sitemaps,
                             'robots.txt with more than 50 sitemaps = {}')

    def process_record(self, record: ArcWarcRecord):
        """Yield one ``(sitemap_url, [host])`` pair per sitemap announced in a robots.txt.

        emit: sitemap_url => [host]

        Only HTTP response records are processed. ``host`` is the lower-cased
        network location of the robots.txt URL taken from the record's
        ``WARC-Target-URI`` header. Sitemap URLs which do not start with
        ``http:`` or ``https:`` are resolved against the robots.txt URL.
        If the robots.txt URL itself cannot be parsed, the rest of the
        record is skipped.
        """
        if not self.is_response_record(record):
            # we're only interested in the HTTP responses
            return

        self.robots_txt_processed.add(1)
        # robots_txt url/host are lazily computed when we encounter the first valid sitemap URL
        robots_txt_url = None
        robots_txt_host = None
        n_sitemaps = 0

        data = self.get_payload_stream(record).read()
        for raw_line in data.splitlines():
            match = SitemapExtractorJob.sitemap_pattern.match(raw_line.strip())
            if not match:
                continue

            # the captured group is `\S+`, so it carries no surrounding whitespace
            sitemap_url = match.group(1)
            self.sitemap_urls_found.add(1)
            try:
                sitemap_url = sitemap_url.decode("utf-8", "strict")
            except UnicodeDecodeError as e:
                self.get_logger().warn(f'Invalid encoding of sitemap URL {sitemap_url}: {repr(e)}')
                self.sitemap_url_invalid_encoding.add(1)
                continue

            if robots_txt_url is None:
                # first sitemap found: set base URL and get host from URL
                # NOTE(review): `rec_headers` is the accessor of the annotated
                # ArcWarcRecord type; confirm it is also available on the
                # records handed in by the FastWARC subclass.
                robots_txt_url = record.rec_headers['WARC-Target-URI']
                robots_txt_host = self._try_parse_host(robots_txt_url, 'robots.txt')
                if robots_txt_host is None:
                    # skip this entire robots.txt record
                    return

            if not sitemap_url.startswith(('http:', 'https:')):
                # sitemap_url is relative; pass straight to urljoin which knows how to handle it correctly
                try:
                    sitemap_url = urljoin(robots_txt_url, sitemap_url)
                except Exception as e:
                    try:
                        self.get_logger().warn(f'Error joining sitemap URL {sitemap_url} with base {robots_txt_url}: {repr(e)}')
                    except Exception as log_e:
                        # formatting the URLs into the message failed; log without them
                        self.get_logger().warn(f'Error joining sitemap URL with base - {repr(e)} (cannot display: {repr(log_e)})')
                    continue

            yield sitemap_url, [robots_txt_host]
            n_sitemaps += 1

        if n_sitemaps > 0:
            self.robots_txt_announcing_sitemap.add(1)
            if n_sitemaps > 50:
                self.robots_txt_with_more_than_50_sitemaps.add(1)

    def _try_parse_host(self, url: str, label_for_log: str) -> Optional[str]:
        """Return the lower-cased host of ``url`` with leading dots removed.

        :param url: URL to take the network location from
        :param label_for_log: short description of the URL, used in the warning
        :return: the host, or None (after logging a warning) if any
                 exception is raised while parsing
        """
        try:
            return urlparse(url).netloc.lower().lstrip('.')
        except Exception as e:
            try:
                self.get_logger().warn(f'Invalid {label_for_log} URL: {url} - {repr(e)}')
            except Exception as log_e:
                # formatting the URL into the message failed; log without it
                self.get_logger().warn(f'Invalid {label_for_log} URL - {repr(e)} (cannot display: {repr(log_e)})')
            return None
|
|
||
|
|
||
if __name__ == '__main__':
    # Script entry point: instantiate the job and hand control to its run().
    SitemapExtractorJob().run()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,16 @@ | ||
| from fastwarc.warc import WarcRecordType | ||
|
|
||
| from sparkcc_fastwarc import CCFastWarcSparkJob | ||
| from sitemaps_from_robotstxt import SitemapExtractorJob | ||
|
|
||
|
|
||
class SitemapExtractorFastWarcJob(SitemapExtractorJob, CCFastWarcSparkJob):
    """Extract sitemap URLs (http://www.sitemaps.org/) from robots.txt WARC files
    using FastWARC to parse WARC files."""

    name = "SitemapExtractorFastWarc"

    # process only WARC response records
    fastwarc_record_filter = WarcRecordType.response

    # process_record is implemented by SitemapExtractorJob


# The "main" block is required in order to run the job as a script.
if __name__ == '__main__':
    job = SitemapExtractorFastWarcJob()
    job.run()
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The "main" block is required in order to run the job.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please also run the job to verify that there are no errors.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. fixed |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not required anymore.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed