diff --git a/.htmltest.yml b/.htmltest.yml index 01a04eea12b9..0ac6671b9e47 100644 --- a/.htmltest.yml +++ b/.htmltest.yml @@ -55,11 +55,14 @@ IgnoreURLs: # list of regexs of paths or URLs to be ignored # Ignore Docsy-generated GitHub links for now, until # https://github.com/google/docsy/issues/1432 is fixed - ^https?://github\.com/.*?/.*?/(new|edit|issues/new\?title)/ # view-page, edit-source etc - # Here's an approximate regex to avoid the "View page source" links. TODO: fix this in Docsy - - ^https?://github\.com/open-telemetry/opentelemetry.io/tree/ - # FIXME: A patch until we can get Docsy to mark "View page source" links as excluded from link checking, - # Actually, it would be better to pin the version of the OTel spec. - - ^https://github.com/open-telemetry/opentelemetry-specification/tree/main/specification/logs/event-(api|sdk)\.md + # Ignore "View page source" links, except for spec pages, i.e., links starting with + # https://github.com/open-telemetry/opentelemetry.io/tree/main/content/en/docs/specs + - ^https://github\.com/open-telemetry/opentelemetry.io/tree/main/content/[^e] + - ^https://github\.com/open-telemetry/opentelemetry.io/tree/main/content/es + - ^https://github\.com/open-telemetry/opentelemetry.io/tree/main/content/en/.*?/_index.md$ + - ^https://github\.com/open-telemetry/opentelemetry.io/tree/main/content/en/[^d] + - ^https://github\.com/open-telemetry/opentelemetry.io/tree/main/content/en/docs/[^s] + - ^https://github\.com/open-telemetry/opentelemetry.io/tree/main/content/en/docs/security # FIXME: same issue as for the OTel spec mentioned above: - ^https://github.com/open-telemetry/semantic-conventions/tree/main diff --git a/scripts/double-check-refcache-400s.mjs b/scripts/double-check-refcache-400s.mjs index a16a9bae460a..71514c87e6c3 100755 --- a/scripts/double-check-refcache-400s.mjs +++ b/scripts/double-check-refcache-400s.mjs @@ -34,7 +34,7 @@ async function retry400sAndUpdateCache() { } process.stdout.write(`Checking: ${url} (was ${StatusCode})... `); - const status = await getUrlStatus(url); + const status = await getUrlStatus(url, true); console.log(`${status}.`); if (!isHttp2XX(status)) continue; diff --git a/scripts/get-url-status.mjs b/scripts/get-url-status.mjs index ef55b600437d..6981a5177e6c 100755 --- a/scripts/get-url-status.mjs +++ b/scripts/get-url-status.mjs @@ -2,6 +2,7 @@ import puppeteer from 'puppeteer'; +const cratesIoURL = 'https://crates.io/crates/'; let verbose = false; function log(...args) { @@ -24,8 +25,20 @@ async function getUrlHeadless(url) { if (!response) throw new Error('No response from server.'); - const status = response.status(); - log(` Headless fetch returned HTTP status code: ${status}`); + let status = response.status(); + const title = await page.title(); + + // Handles special case of crates.io. For details, see: + // https://github.com/rust-lang/crates.io/issues/788 + if (url.startsWith(cratesIoURL)) { + const crateName = url.split('/').pop(); + // Crate found iff title is `${crateName} - crates.io: Rust Package Registry` + if (!title.startsWith(crateName)) status = 404; + } + + log( + `Headless fetch returned HTTP status code: ${status}; page title: '${title}'`, + ); return status; } catch (error) { @@ -66,9 +79,11 @@ export function isHttp2XX(status) { return status && status >= 200 && status < 300; } -export async function getUrlStatus(url) { +export async function getUrlStatus(url, _verbose = false) { + verbose = _verbose; let status = await getUrlHeadless(url); - if (!isHttp2XX(status)) { + // If headless fetch fails, try in browser for non-404 statuses + if (!isHttp2XX(status) && status !== 404) { status = await getUrlInBrowser(url); } return status; @@ -83,7 +98,7 @@ async function mainCLI() { process.exit(1); } - const status = await getUrlStatus(url); + const status = await getUrlStatus(url, verbose); process.exit(isHttp2XX(status) ? 0 : 1); } diff --git a/static/refcache.json b/static/refcache.json index b979de911cbd..d1b4862e3df0 100644 --- a/static/refcache.json +++ b/static/refcache.json @@ -10007,6 +10007,50 @@ "StatusCode": 206, "LastSeen": "2025-02-01T07:12:04.503997-05:00" }, + "https://github.com/open-telemetry/opentelemetry.io/tree/main/archetypes/": { + "StatusCode": 206, + "LastSeen": "2025-02-02T12:11:00.765175-05:00" + }, + "https://github.com/open-telemetry/opentelemetry.io/tree/main/archetypes/blog.md": { + "StatusCode": 206, + "LastSeen": "2025-02-02T12:10:58.811797-05:00" + }, + "https://github.com/open-telemetry/opentelemetry.io/tree/main/content-modules": { + "StatusCode": 206, + "LastSeen": "2025-02-02T12:11:01.039226-05:00" + }, + "https://github.com/open-telemetry/opentelemetry.io/tree/main/content/en/docs/specs/status.md": { + "StatusCode": 206, + "LastSeen": "2025-02-02T12:11:42.468045-05:00" + }, + "https://github.com/open-telemetry/opentelemetry.io/tree/main/data/ecosystem": { + "StatusCode": 206, + "LastSeen": "2025-02-02T12:10:57.073038-05:00" + }, + "https://github.com/open-telemetry/opentelemetry.io/tree/main/data/ecosystem/adopters.yaml": { + "StatusCode": 206, + "LastSeen": "2025-02-02T12:10:58.803368-05:00" + }, + "https://github.com/open-telemetry/opentelemetry.io/tree/main/data/ecosystem/distributions.yaml": { + "StatusCode": 206, + "LastSeen": "2025-02-02T12:11:03.290407-05:00" + }, + "https://github.com/open-telemetry/opentelemetry.io/tree/main/data/ecosystem/vendors.yaml": { + "StatusCode": 206, + "LastSeen": "2025-02-02T12:11:04.640959-05:00" + }, + "https://github.com/open-telemetry/opentelemetry.io/tree/main/data/registry": { + "StatusCode": 206, + "LastSeen": "2025-02-02T12:10:58.178169-05:00" + }, + "https://github.com/open-telemetry/opentelemetry.io/tree/main/layouts/shortcodes/docs": { + "StatusCode": 206, + "LastSeen": "2025-02-02T12:10:59.64214-05:00" + }, + "https://github.com/open-telemetry/opentelemetry.io/tree/main/templates/registry-entry.yml": { + "StatusCode": 206, + "LastSeen": "2025-02-02T12:11:00.926611-05:00" + }, "https://github.com/open-telemetry/otel-arrow": { "StatusCode": 206, "LastSeen": "2025-01-30T17:00:10.089894-05:00"