2
2
3
3
import fs from 'fs/promises' ;
4
4
import { getUrlStatus , isHttp2XX } from './get-url-status.mjs' ;
5
+ import { exit } from 'process' ;
5
6
6
7
// Path of the link-checker reference cache that this script reads and rewrites.
const CACHE_FILE = 'static/refcache.json';
// Cache entries whose URL starts with this prefix are skipped by the retry loop.
const GOOGLE_DOCS_URL = 'https://docs.google.com/';
// Set from the command line (--check-for-fragments / -f) before the main run.
let checkForFragments = false;
// Upper bound on cache entries rewritten per run; 0 disables the limit.
// Can be overridden with --max-num-to-update.
let maxNumEntriesToUpdate = 3;
// 404 entries under this prefix are still retried instead of being skipped.
const cratesIoURL = 'https://crates.io/crates/';

// Magic numbers that we use to determine if a URL with a fragment has been
// checked with this script. Since we can't add new fields to the cache, we
// encode "magic" values in the LastSeen field: the seconds and milliseconds
// of the timestamp are overwritten with one of the pairs below.
// Pair meaning "fragment was checked and is valid":
const fragSecondsOk = 12;
const fragMillisecondsOk = 345;
// Pair meaning "fragment was checked and is invalid":
const fragSecondsInvalid = 59;
const fragMillisecondsInvalid = 999;
20
+
21
/**
 * Tells whether a cache entry for a URL with a fragment is both a 2XX entry
 * and carries the "fragment checked and valid" marker in its timestamp.
 *
 * @param {number} StatusCode - Status code stored in the cache entry.
 * @param {Date} lastSeenDate - The entry's LastSeen value parsed as a Date.
 * @returns {boolean} True only when the status is 2XX and the seconds and
 *   milliseconds of lastSeenDate equal the "ok" magic pair.
 */
function isHttp2XXForFragments(StatusCode, lastSeenDate) {
  return (
    isHttp2XX(StatusCode) &&
    lastSeenDate.getSeconds() === fragSecondsOk &&
    lastSeenDate.getMilliseconds() === fragMillisecondsOk
  );
}
28
+
29
/**
 * Tells whether a cache entry carries the "fragment checked and invalid"
 * marker in its timestamp.
 *
 * @param {number} StatusCode - Status code stored in the cache entry. It is
 *   accepted but not consulted: only the timestamp marker decides the result.
 * @param {Date} lastSeenDate - The entry's LastSeen value parsed as a Date.
 * @returns {boolean} True when the seconds and milliseconds of lastSeenDate
 *   equal the "invalid" magic pair.
 */
function is4XXForFragments(StatusCode, lastSeenDate) {
  return (
    lastSeenDate.getSeconds() === fragSecondsInvalid &&
    lastSeenDate.getMilliseconds() === fragMillisecondsInvalid
  );
}
35
+
9
36
async function readRefcache ( ) {
10
37
try {
11
38
const data = await fs . readFile ( CACHE_FILE , 'utf8' ) ;
@@ -18,42 +45,154 @@ async function readRefcache() {
18
45
19
46
/**
 * Serializes the cache as 2-space-indented JSON with a trailing newline,
 * writes it to CACHE_FILE, and logs that the file was written.
 *
 * @param {object} cache - The cache object to persist.
 * @returns {Promise<void>} Resolves once the file has been written.
 */
async function writeRefcache(cache) {
  await fs.writeFile(CACHE_FILE, JSON.stringify(cache, null, 2) + '\n', 'utf8');
  console.log(`Wrote updated ${CACHE_FILE}.`);
}
23
50
24
51
/**
 * Re-checks cache entries that are not recorded as OK and rewrites the cache
 * file when at least one entry was updated.
 *
 * Per entry, in order:
 *  1. Entries already OK are skipped (2XX; or, when checkForFragments is set
 *     and the URL has a fragment, 2XX plus the "fragment ok" marker).
 *  2. 404 entries (except crates.io URLs) and fragment URLs already marked
 *     invalid are logged and skipped.
 *  3. Google Docs URLs are skipped.
 *  4. The loop stops once maxNumEntriesToUpdate entries have been updated
 *     (a value of 0 disables the limit).
 *  5. Otherwise the URL is fetched again and the entry is rewritten.
 *
 * A summary of processed entries and per-status counts is logged at the end.
 *
 * @returns {Promise<void>}
 */
async function retry400sAndUpdateCache() {
  console.log(`Checking ${CACHE_FILE} for 4XX status URLs ...`);
  const cache = await readRefcache();
  let updatedCount = 0;
  let entriesCount = 0;
  let urlWithFragmentCount = 0;
  let urlWithInvalidFragCount = 0;
  const statusCounts = {};

  for (const [url, details] of Object.entries(cache)) {
    entriesCount++;
    const parsedUrl = new URL(url);
    if (parsedUrl.hash) urlWithFragmentCount++;
    const { StatusCode, LastSeen } = details;
    const lastSeenDate = new Date(LastSeen);

    countStatuses(StatusCode, parsedUrl, lastSeenDate, statusCounts);

    // With fragment checking on, a fragment URL only counts as OK if a
    // previous run of this script stamped it with the "fragment ok" marker.
    if (
      checkForFragments && parsedUrl.hash
        ? isHttp2XXForFragments(StatusCode, lastSeenDate)
        : isHttp2XX(StatusCode)
    ) {
      // process.stdout.write('.');
      continue;
    }

    if (
      (StatusCode === 404 &&
        // Handles special case of crates.io. For details, see:
        // https://github.com/rust-lang/crates.io/issues/788
        !url.startsWith(cratesIoURL)) ||
      (parsedUrl.hash && is4XXForFragments(StatusCode, lastSeenDate))
    ) {
      console.log(
        `Skipping ${StatusCode}: ${url} (last seen ${lastSeenDate.toLocaleDateString()})${
          is4XXForFragments(StatusCode, lastSeenDate) ? ' INVALID FRAGMENT' : ''
        }`,
      );
      // Note: any skipped URL that has a fragment is counted here, including
      // plain 404s that do not carry the "invalid fragment" marker.
      if (parsedUrl.hash) urlWithInvalidFragCount++;
      continue;
    }

    if (url.startsWith(GOOGLE_DOCS_URL)) {
      /*
        Skipped for now. URLs are of the form:
          https://docs.google.com/document/d/15vR7D1x2tKd7u3zaTF0yH1WaHkUr2T4hhr7OyiZgmBg/edit?tab=t.0#heading=h.4xuru5ljcups
        We can simply check for the presence of the heading query parameter value in the page.
          "ps_hdid":"h.4xuru5ljcups" # cSpell:disable-line
      */
      // console.log(`Skipping Google Docs URL (for now): ${url}.`);
      // process.stdout.write('.');
      continue;
    }

    if (maxNumEntriesToUpdate && updatedCount >= maxNumEntriesToUpdate) {
      console.log(`Updated max of ${maxNumEntriesToUpdate} entries, exiting.`);
      break;
    }

    process.stdout.write(
      `Checking${
        parsedUrl.hash ? ` for fragment in` : `:`
      } ${url} (was ${StatusCode}) ... `,
    );

    let status = await getUrlStatus(url);
    console.log(`${status}.`);

    const now = new Date();
    if (parsedUrl.hash) {
      if (isHttp2XX(status)) {
        // Encode that the fragment was checked and is valid.
        now.setSeconds(fragSecondsOk);
        now.setMilliseconds(fragMillisecondsOk);
      } else {
        status = StatusCode; // Keep the original status, rather than our custom 4XX status.
        // Encode that the fragment was checked and is invalid.
        now.setSeconds(fragSecondsInvalid);
        now.setMilliseconds(fragMillisecondsInvalid);
        urlWithInvalidFragCount++;
      }
    } else if (!isHttp2XX(status)) {
      // Still failing and no fragment to annotate: leave the entry untouched.
      continue;
    }

    cache[url] = {
      StatusCode: status,
      LastSeen: now.toISOString(),
    };
    updatedCount++;
  }

  if (updatedCount) {
    await writeRefcache(cache);
  } else {
    console.log(`No updates needed.`);
  }

  console.log(
    `Processed ${entriesCount} URLs${
      checkForFragments
        ? ` (${urlWithFragmentCount} with fragments, ${urlWithInvalidFragCount} are invalid)`
        : ''
    }`,
  );
  for (const [status, count] of Object.entries(statusCounts)) {
    console.log(`Status ${status}: ${count}`);
  }
}
58
161
162
/**
 * Increments the tally for one cache entry in statusCounts.
 *
 * The bucket key is the status code. When checkForFragments is set, a suffix
 * is appended: ' no frag' for URLs without a fragment, otherwise ' frag ok'
 * or ' frag er' depending on isHttp2XXForFragments.
 *
 * @param {number} StatusCode - Status code stored in the cache entry.
 * @param {URL} parsedUrl - The entry's URL, parsed.
 * @param {Date} lastSeenDate - The entry's LastSeen value parsed as a Date.
 * @param {object} statusCounts - Tally object; mutated in place.
 */
function countStatuses(StatusCode, parsedUrl, lastSeenDate, statusCounts) {
  let suffix = '';
  if (checkForFragments) {
    if (parsedUrl.hash) {
      const fragState = isHttp2XXForFragments(StatusCode, lastSeenDate)
        ? 'ok'
        : 'er';
      suffix = ` frag ${fragState}`;
    } else {
      suffix = ' no frag';
    }
  }
  const bucket = `${StatusCode}${suffix}`;
  statusCounts[bucket] = (statusCounts[bucket] || 0) + 1;
}
172
+
173
/**
 * Reads a non-negative integer command-line flag from process.argv.
 *
 * Both `--flag=VALUE` and `--flag VALUE` are accepted. Only an argument that
 * is exactly the flag name, or the flag name followed by `=`, is treated as
 * the flag; arguments that merely share the prefix are ignored.
 *
 * @param {string} flagName - Flag name including leading dashes.
 * @returns {number|undefined} The parsed value, or undefined when the flag is
 *   not present. If the flag is present but its value is missing, not a
 *   number, or negative, an error is printed and the process exits with
 *   code 1.
 */
function getNumericFlagValue(flagName) {
  const { argv } = process;
  const assignPrefix = `${flagName}=`;
  const flagIndex = argv.findIndex(
    (arg) => arg === flagName || arg.startsWith(assignPrefix),
  );
  if (flagIndex === -1) return undefined;

  const flagArg = argv[flagIndex];
  // `--flag VALUE` takes the next argument; `--flag=VALUE` takes the text
  // after the first '='.
  const valueArg =
    flagArg === flagName
      ? argv[flagIndex + 1]
      : flagArg.slice(assignPrefix.length);
  const value = Number.parseInt(valueArg, 10);

  // NaN covers both a missing value and a non-numeric one; without this check
  // it would be returned and the flag silently ignored by the caller.
  if (Number.isNaN(value) || value < 0) {
    console.error(
      `ERROR: invalid value for ${flagName}: ${valueArg}. ` +
        `Must be a number >= 0.`,
    );
    exit(1);
  }
  return value;
}
191
+
192
// Apply command-line overrides to the module-level settings, then run.
const _maxNumEntriesToUpdateFlag = getNumericFlagValue('--max-num-to-update');
// `>= 0` is false for undefined (flag absent) and for NaN, so the default
// limit is kept in both cases.
if (_maxNumEntriesToUpdateFlag >= 0) {
  maxNumEntriesToUpdate = _maxNumEntriesToUpdateFlag;
}
checkForFragments =
  process.argv.includes('--check-for-fragments') || process.argv.includes('-f');

await retry400sAndUpdateCache();
0 commit comments