1
1
#!/usr/bin/env node
2
2
3
- import puppeteer from 'puppeteer' ;
3
+ import puppeteer from 'puppeteer' ; // Consider using puppeteer-core
4
4
5
5
const cratesIoURL = 'https://crates.io/crates/' ;
6
6
let verbose = false ;
7
7
8
8
function log ( ...args ) {
9
- if ( verbose ) console . log ( ...args ) ;
9
+ if ( ! verbose ) return ;
10
+ const lastArg = args [ args . length - 1 ] ;
11
+ if ( typeof lastArg === 'string' && lastArg . endsWith ( ' ' ) ) {
12
+ process . stdout . write ( args . join ( ' ' ) ) ;
13
+ } else {
14
+ console . log ( ...args ) ;
15
+ }
10
16
}
11
17
12
18
async function getUrlHeadless ( url ) {
13
- let browser ;
19
+ // Get the URL, headless, while trying our best to avoid triggering
20
+ // bot-rejection from some servers. Returns the HTTP status code.
14
21
15
- log ( `Trying headless fetch of ${ url } ` ) ;
22
+ log ( `Headless fetch of ${ url } ... ` ) ;
16
23
24
+ let browser ;
17
25
try {
18
- browser = await puppeteer . launch ( ) ;
26
+ // cSpell:ignore KHTML
27
+ const userAgent =
28
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' +
29
+ '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' ;
30
+
31
+ browser = await puppeteer . launch ( {
32
+ headless : true ,
33
+ args : [
34
+ '--no-sandbox' ,
35
+ '--disable-setuid-sandbox' ,
36
+ `--user-agent=${ userAgent } ` ,
37
+ ] ,
38
+ } ) ;
19
39
const page = await browser . newPage ( ) ;
40
+ await page . setUserAgent ( userAgent ) ;
41
+ await page . setExtraHTTPHeaders ( {
42
+ 'Accept-Language' : 'en-US,en;q=0.9' ,
43
+ } ) ;
20
44
21
45
const response = await page . goto ( url , {
22
46
waitUntil : 'networkidle2' ,
23
- timeout : 9000 ,
47
+ timeout : 10_000 ,
24
48
} ) ;
25
49
26
50
if ( ! response ) throw new Error ( 'No response from server.' ) ;
@@ -32,13 +56,13 @@ async function getUrlHeadless(url) {
32
56
// https://github.com/rust-lang/crates.io/issues/788
33
57
if ( url . startsWith ( cratesIoURL ) ) {
34
58
const crateName = url . split ( '/' ) . pop ( ) ;
35
- // Crate found iff title is `${crateName} - crates.io: Rust Package Registry`
36
- if ( ! title . startsWith ( crateName ) ) status = 404 ;
59
+ // E.g. 'https://crates.io/crates/opentelemetry-sdk' -> 'opentelemetry-sdk'
60
+ const crateNameRegex = new RegExp ( crateName . replace ( / - / g, '[-_]' ) ) ;
61
+ // Crate found if title contains crateName (in kebab or snake case)
62
+ if ( ! crateNameRegex . test ( title ) ) status = 404 ;
37
63
}
38
64
39
- log (
40
- `Headless fetch returned HTTP status code: ${ status } ; page title: '${ title } '` ,
41
- ) ;
65
+ log ( `${ status } ; page title: '${ title } '` ) ;
42
66
43
67
return status ;
44
68
} catch ( error ) {
@@ -91,14 +115,16 @@ export async function getUrlStatus(url, _verbose = false) {
91
115
92
116
async function mainCLI ( ) {
93
117
const url = process . argv [ 2 ] ;
94
- verbose = true ; // process.argv.includes('--verbose ');
118
+ verbose = ! process . argv . includes ( '--quiet ' ) ;
95
119
96
120
if ( ! url ) {
97
121
console . error ( `Usage: ${ process . argv [ 1 ] } URL` ) ;
98
122
process . exit ( 1 ) ;
99
123
}
100
124
101
125
const status = await getUrlStatus ( url , verbose ) ;
126
+ if ( ! verbose ) console . log ( status ) ;
127
+
102
128
process . exit ( isHttp2XX ( status ) ? 0 : 1 ) ;
103
129
}
104
130
0 commit comments