diff --git a/.htmltest.yml b/.htmltest.yml
index 01a04eea12b9..0ac6671b9e47 100644
--- a/.htmltest.yml
+++ b/.htmltest.yml
@@ -55,11 +55,14 @@ IgnoreURLs: # list of regexs of paths or URLs to be ignored
# Ignore Docsy-generated GitHub links for now, until
# https://github.com/google/docsy/issues/1432 is fixed
- ^https?://github\.com/.*?/.*?/(new|edit|issues/new\?title)/ # view-page, edit-source etc
- # Here's an approximate regex to avoid the "View page source" links. TODO: fix this in Docsy
- - ^https?://github\.com/open-telemetry/opentelemetry.io/tree/
- # FIXME: A patch until we can get Docsy to mark "View page source" links as excluded from link checking,
- # Actually, it would be better to pin the version of the OTel spec.
- - ^https://github.com/open-telemetry/opentelemetry-specification/tree/main/specification/logs/event-(api|sdk)\.md
+ # Ignore "View page source" links, except for spec pages other than _index.md, i.e., links
+ # starting with https://github.com/open-telemetry/opentelemetry.io/tree/main/content/en/docs/specs
+ - ^https://github\.com/open-telemetry/opentelemetry.io/tree/main/content/[^e]
+ - ^https://github\.com/open-telemetry/opentelemetry.io/tree/main/content/es
+ - ^https://github\.com/open-telemetry/opentelemetry.io/tree/main/content/en/.*?/_index.md$
+ - ^https://github\.com/open-telemetry/opentelemetry.io/tree/main/content/en/[^d]
+ - ^https://github\.com/open-telemetry/opentelemetry.io/tree/main/content/en/docs/[^s]
+ - ^https://github\.com/open-telemetry/opentelemetry.io/tree/main/content/en/docs/security
# FIXME: same issue as for the OTel spec mentioned above:
- ^https://github.com/open-telemetry/semantic-conventions/tree/main
diff --git a/scripts/double-check-refcache-400s.mjs b/scripts/double-check-refcache-400s.mjs
index a16a9bae460a..71514c87e6c3 100755
--- a/scripts/double-check-refcache-400s.mjs
+++ b/scripts/double-check-refcache-400s.mjs
@@ -34,7 +34,7 @@ async function retry400sAndUpdateCache() {
}
process.stdout.write(`Checking: ${url} (was ${StatusCode})... `);
- const status = await getUrlStatus(url);
+ const status = await getUrlStatus(url, true);
console.log(`${status}.`);
if (!isHttp2XX(status)) continue;
diff --git a/scripts/get-url-status.mjs b/scripts/get-url-status.mjs
index ef55b600437d..6981a5177e6c 100755
--- a/scripts/get-url-status.mjs
+++ b/scripts/get-url-status.mjs
@@ -2,6 +2,7 @@
import puppeteer from 'puppeteer';
+const cratesIoURL = 'https://crates.io/crates/';
let verbose = false;
function log(...args) {
@@ -24,8 +25,20 @@ async function getUrlHeadless(url) {
if (!response) throw new Error('No response from server.');
- const status = response.status();
- log(` Headless fetch returned HTTP status code: ${status}`);
+ let status = response.status();
+ const title = await page.title();
+
+ // Handles special case of crates.io. For details, see:
+ // https://github.com/rust-lang/crates.io/issues/788
+ if (url.startsWith(cratesIoURL)) {
+ const crateName = url.split('/').pop();
+ // Crate found iff title is `${crateName} - crates.io: Rust Package Registry` (only the name prefix is checked)
+ if (!title.startsWith(crateName)) status = 404;
+ }
+
+ log(
+ `Headless fetch returned HTTP status code: ${status}; page title: '${title}'`,
+ );
return status;
} catch (error) {
@@ -66,9 +79,11 @@ export function isHttp2XX(status) {
return status && status >= 200 && status < 300;
}
-export async function getUrlStatus(url) {
+export async function getUrlStatus(url, _verbose = false) {
+ verbose = _verbose;
let status = await getUrlHeadless(url);
- if (!isHttp2XX(status)) {
+ // If headless fetch fails, try in browser for non-404 statuses
+ if (!isHttp2XX(status) && status !== 404) {
status = await getUrlInBrowser(url);
}
return status;
@@ -83,7 +98,7 @@ async function mainCLI() {
process.exit(1);
}
- const status = await getUrlStatus(url);
+ const status = await getUrlStatus(url, verbose);
process.exit(isHttp2XX(status) ? 0 : 1);
}
diff --git a/static/refcache.json b/static/refcache.json
index b979de911cbd..d1b4862e3df0 100644
--- a/static/refcache.json
+++ b/static/refcache.json
@@ -10007,6 +10007,50 @@
"StatusCode": 206,
"LastSeen": "2025-02-01T07:12:04.503997-05:00"
},
+ "https://github.com/open-telemetry/opentelemetry.io/tree/main/archetypes/": {
+ "StatusCode": 206,
+ "LastSeen": "2025-02-02T12:11:00.765175-05:00"
+ },
+ "https://github.com/open-telemetry/opentelemetry.io/tree/main/archetypes/blog.md": {
+ "StatusCode": 206,
+ "LastSeen": "2025-02-02T12:10:58.811797-05:00"
+ },
+ "https://github.com/open-telemetry/opentelemetry.io/tree/main/content-modules": {
+ "StatusCode": 206,
+ "LastSeen": "2025-02-02T12:11:01.039226-05:00"
+ },
+ "https://github.com/open-telemetry/opentelemetry.io/tree/main/content/en/docs/specs/status.md": {
+ "StatusCode": 206,
+ "LastSeen": "2025-02-02T12:11:42.468045-05:00"
+ },
+ "https://github.com/open-telemetry/opentelemetry.io/tree/main/data/ecosystem": {
+ "StatusCode": 206,
+ "LastSeen": "2025-02-02T12:10:57.073038-05:00"
+ },
+ "https://github.com/open-telemetry/opentelemetry.io/tree/main/data/ecosystem/adopters.yaml": {
+ "StatusCode": 206,
+ "LastSeen": "2025-02-02T12:10:58.803368-05:00"
+ },
+ "https://github.com/open-telemetry/opentelemetry.io/tree/main/data/ecosystem/distributions.yaml": {
+ "StatusCode": 206,
+ "LastSeen": "2025-02-02T12:11:03.290407-05:00"
+ },
+ "https://github.com/open-telemetry/opentelemetry.io/tree/main/data/ecosystem/vendors.yaml": {
+ "StatusCode": 206,
+ "LastSeen": "2025-02-02T12:11:04.640959-05:00"
+ },
+ "https://github.com/open-telemetry/opentelemetry.io/tree/main/data/registry": {
+ "StatusCode": 206,
+ "LastSeen": "2025-02-02T12:10:58.178169-05:00"
+ },
+ "https://github.com/open-telemetry/opentelemetry.io/tree/main/layouts/shortcodes/docs": {
+ "StatusCode": 206,
+ "LastSeen": "2025-02-02T12:10:59.64214-05:00"
+ },
+ "https://github.com/open-telemetry/opentelemetry.io/tree/main/templates/registry-entry.yml": {
+ "StatusCode": 206,
+ "LastSeen": "2025-02-02T12:11:00.926611-05:00"
+ },
"https://github.com/open-telemetry/otel-arrow": {
"StatusCode": 206,
"LastSeen": "2025-01-30T17:00:10.089894-05:00"