Port SitemapExtractor from CC-MRJob to CC-PySpark #54
Changes from all commits
New file: GitHub Actions workflow "Python Unit Tests" (@@ -0,0 +1,36 @@)

```yaml
name: Python Unit Tests

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  workflow_dispatch:

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.10', '3.11', '3.12', '3.13']
      fail-fast: false

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          pip install -r requirements.txt
          pip install pyspark==3.5.7

      - name: Run tests
        run: |
          python -m pytest . -v
        env:
          # PySpark needs this to find the test files
          PYTHONPATH: test
```
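The workflow runs `python -m pytest . -v` with `PYTHONPATH=test`, so any test module in the repository is collected. A minimal sketch of such a test, targeting the sitemap_pattern regex from sitemaps_from_robotstxt.py added later in this diff (the test names here are illustrative, not part of the PR):

```python
# Illustrative unit test; the module name sitemaps_from_robotstxt is taken
# from the imports used elsewhere in this PR, the test names are made up.
from sitemaps_from_robotstxt import SitemapExtractorJob


def test_sitemap_pattern_matches_sitemap_line():
    # the pattern is byte-based and case-insensitive
    line = b"Sitemap: https://example.com/sitemap.xml"
    match = SitemapExtractorJob.sitemap_pattern.match(line)
    assert match is not None
    assert match.group(1) == b"https://example.com/sitemap.xml"


def test_sitemap_pattern_ignores_other_directives():
    assert SitemapExtractorJob.sitemap_pattern.match(b"User-agent: *") is None
```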
requirements.txt (@@ -27,3 +27,7 @@)

```
#Resiliparse
# (tested with)
#Resiliparse==0.15.2

# testing
pytest
pytest-mock
```
New file: sitemaps_from_robotstxt.py (@@ -0,0 +1,112 @@)

```python
import re
from urllib.parse import urlparse, urljoin

from pyspark.sql.types import StructType, StructField, StringType, ArrayType
from warcio.recordloader import ArcWarcRecord

from sparkcc import CCSparkJob


class SitemapExtractorJob(CCSparkJob):
    """Extract sitemap URLs (http://www.sitemaps.org/) from robots.txt WARC files."""

    name = "SitemapExtractor"

    output_schema = StructType([
        StructField('sitemap_url', StringType(), True),
        StructField('hosts', ArrayType(elementType=StringType()), True)
    ])

    # rb: match on raw bytes so we can defer utf-8 decoding to the `sitemap:` line
    sitemap_pattern = re.compile(rb'^sitemap:\s*(\S+)', re.I)

    robots_txt_processed = None
    sitemap_urls_found = None
    sitemap_url_invalid_encoding = None
    robots_txt_announcing_sitemap = None
    robots_txt_with_more_than_50_sitemaps = None

    def log_accumulators(self, session):
        super(SitemapExtractorJob, self).log_accumulators(session)

        self.log_accumulator(session, self.robots_txt_processed,
                             'robots.txt successfully parsed = {}')
        self.log_accumulator(session, self.sitemap_urls_found,
                             'sitemap urls found = {}')
        self.log_accumulator(session, self.sitemap_url_invalid_encoding,
                             'sitemap urls with invalid utf-8 encoding = {}')
        self.log_accumulator(session, self.robots_txt_announcing_sitemap,
                             'robots.txt announcing at least 1 sitemap = {}')
        self.log_accumulator(session, self.robots_txt_with_more_than_50_sitemaps,
                             'robots.txt with more than 50 sitemaps = {}')

    def init_accumulators(self, session):
        super(SitemapExtractorJob, self).init_accumulators(session)

        sc = session.sparkContext
        self.robots_txt_processed = sc.accumulator(0)
        self.sitemap_urls_found = sc.accumulator(0)
        self.sitemap_url_invalid_encoding = sc.accumulator(0)
        self.robots_txt_announcing_sitemap = sc.accumulator(0)
        self.robots_txt_with_more_than_50_sitemaps = sc.accumulator(0)

    def process_record(self, record: ArcWarcRecord):
        """ emit: sitemap_url => [host] """
        if not self.is_response_record(record):
            # we're only interested in the HTTP responses
            return

        self.robots_txt_processed.add(1)
        # robots_txt url/host are lazily computed when we encounter the first valid sitemap URL
        robots_txt_url = None
        robots_txt_host = None
        n_sitemaps = 0

        data = self.get_payload_stream(record).read()
        for raw_line in data.splitlines():
            raw_line = raw_line.strip()

            match = SitemapExtractorJob.sitemap_pattern.match(raw_line)
            if match:
                sitemap_url = match.group(1).strip()
                self.sitemap_urls_found.add(1)
                try:
                    sitemap_url = sitemap_url.decode("utf-8", "strict")
                except UnicodeDecodeError as e:
                    self.get_logger().warn(f'Invalid encoding of sitemap URL {sitemap_url}: {repr(e)}')
                    self.sitemap_url_invalid_encoding.add(1)
                    continue

                if robots_txt_url is None:
                    # first sitemap found: set base URL and get host from URL
                    robots_txt_url = self.get_warc_header(record, 'WARC-Target-URI')
                    try:
                        robots_txt_host = urlparse(robots_txt_url).netloc.lower().lstrip('.')
                    except Exception as e:
                        self.get_logger().warn(f'Invalid robots.txt URL: {robots_txt_url}: {repr(e)}')
                        # skip this entire robots.txt record
                        return

                if not (sitemap_url.startswith('http:') or sitemap_url.startswith('https:')):
                    # sitemap_url is relative; pass straight to urljoin which knows how to handle it correctly
                    try:
                        sitemap_url = urljoin(robots_txt_url, sitemap_url)
                    except Exception as e:
                        self.get_logger().warn(f'Error joining sitemap URL {sitemap_url} with base {robots_txt_url}: {repr(e)}')
                        continue

                yield sitemap_url, [robots_txt_host]
                n_sitemaps += 1

        if n_sitemaps > 0:
            self.robots_txt_announcing_sitemap.add(1)
            if n_sitemaps > 50:
                self.robots_txt_with_more_than_50_sitemaps.add(1)


if __name__ == '__main__':
    job = SitemapExtractorJob()
    job.run()
```

Review comment (on init_accumulators): Also, log_accumulators needs to be overridden, otherwise the class-specific accumulators are never shown or preserved once the job has finished. See cc_index_word_count.py or wat_extract_links.py.
Reply: done
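Because the diff also adds pytest-mock, process_record can be exercised without a running Spark session by patching the helper methods the job calls on itself and substituting plain counters for the accumulators. A rough sketch under those assumptions (the stand-in counter class, the patched return values, and the test name are illustrative, not part of this PR):

```python
import io

from sitemaps_from_robotstxt import SitemapExtractorJob


class _Counter:
    """Stand-in for a Spark accumulator (illustrative only)."""
    def __init__(self):
        self.value = 0

    def add(self, n):
        self.value += n


def test_process_record_emits_sitemap_and_host(mocker):
    job = SitemapExtractorJob()
    for attr in ('robots_txt_processed', 'sitemap_urls_found',
                 'sitemap_url_invalid_encoding', 'robots_txt_announcing_sitemap',
                 'robots_txt_with_more_than_50_sitemaps'):
        setattr(job, attr, _Counter())

    robots_txt = b"User-agent: *\nSitemap: https://example.com/sitemap.xml\n"
    mocker.patch.object(job, 'is_response_record', return_value=True)
    mocker.patch.object(job, 'get_payload_stream', return_value=io.BytesIO(robots_txt))
    mocker.patch.object(job, 'get_warc_header',
                        return_value='https://example.com/robots.txt')

    results = list(job.process_record(mocker.Mock()))

    assert results == [('https://example.com/sitemap.xml', ['example.com'])]
    assert job.robots_txt_announcing_sitemap.value == 1
```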
New file: FastWARC variant of the sitemap extractor (@@ -0,0 +1,20 @@)

```python
from fastwarc.warc import WarcRecordType

from sparkcc_fastwarc import CCFastWarcSparkJob
from sitemaps_from_robotstxt import SitemapExtractorJob


class SitemapExtractorFastWarcJob(SitemapExtractorJob, CCFastWarcSparkJob):
    """Extract sitemap URLs (http://www.sitemaps.org/) from robots.txt WARC files
    using FastWARC to parse WARC files."""

    name = "SitemapExtractorFastWarc"

    # process only WARC response records
    fastwarc_record_filter = WarcRecordType.response

    # process_record is implemented by SitemapExtractorJob


if __name__ == '__main__':
    job = SitemapExtractorFastWarcJob()
    job.run()
```

Review comments (on this file): The "main" block is required in order to run the job. Please also run the job to verify that there are no errors.
Reply: fixed
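Because the FastWARC job inherits from both SitemapExtractorJob and CCFastWarcSparkJob, Python's method resolution order decides which base supplies each helper. A quick way to inspect that, assuming the new file is importable under the module name used below (the diff does not show the file name):

```python
# The module name in this import is an assumption; the class is the one
# defined in the FastWARC file above.
from sitemaps_from_robotstxt_fastwarc import SitemapExtractorFastWarcJob

# Expected order: SitemapExtractorFastWarcJob, SitemapExtractorJob,
# CCFastWarcSparkJob, CCSparkJob, ... -- so process_record and the
# accumulators come from SitemapExtractorJob, while WARC parsing helpers
# overridden by CCFastWarcSparkJob resolve to the FastWARC variants.
print([cls.__name__ for cls in SitemapExtractorFastWarcJob.__mro__])
```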