@@ -14,6 +14,54 @@ import { getGitHubEnterpriseApiUrl } from './scale-up';
1414
// Module-level logger shared by every function in this file.
// NOTE(review): the 'scale-down' argument presumably tags log entries with this module's name — confirm against createChildLogger.
1515const logger = createChildLogger ( 'scale-down' ) ;
1616
/**
 * Retry policy applied by `withRetry`.
 *
 * - `maxRetries`: upper bound on the TOTAL number of attempts, first call included
 *   (so 3 means one initial call plus at most two retries).
 * - `initialDelayMs`: wait before the first retry; doubled for each later retry.
 * - `maxDelayMs`: cap applied to the computed wait.
 *
 * Frozen so the shared policy cannot be changed at runtime by any code path.
 */
const RETRY_CONFIG = Object.freeze({
  maxRetries: 3,
  initialDelayMs: 1000,
  maxDelayMs: 10000,
});
22+
/**
 * Pauses the calling async flow.
 *
 * @param ms - Timer duration in milliseconds, handed straight to `setTimeout`.
 * @returns A promise that settles with no value once the timer fires.
 */
async function sleep(ms: number): Promise<void> {
  await new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
26+
/**
 * Decides whether a failed call is worth repeating.
 *
 * Only `RequestError` instances are considered; any other thrown value is treated as
 * non-retryable. A `RequestError` qualifies when its `status` is 429 (rate limiting) or
 * in the server-error range (500 and above).
 *
 * @param error - Whatever was caught from the failed operation.
 * @returns `true` when the failure looks transient, `false` otherwise.
 */
function isRetryableError(error: unknown): boolean {
  if (!(error instanceof RequestError)) {
    return false;
  }

  // `instanceof` has already narrowed `error`, so `status` can be read directly.
  const { status } = error;
  return status === 429 || status >= 500;
}
35+
/**
 * Runs `operation`, repeating it with exponential backoff while it fails with a retryable error.
 *
 * `RETRY_CONFIG.maxRetries` is the total attempt budget (first call included). The wait before
 * the next attempt is `initialDelayMs * 2^(attempt - 1)`, capped at `maxDelayMs`. A failure that
 * `isRetryableError` rejects, or the failure of the last allowed attempt, is rethrown unchanged.
 *
 * @param operation - Async action to execute; invoked once per attempt.
 * @param operationName - Human-readable action name, used only in the retry warning.
 * @param context - Describes the target of the action, used only in the retry warning.
 * @returns The value produced by the first successful attempt.
 * @throws The error raised by the last attempt that was made.
 */
async function withRetry<T>(
  operation: () => Promise<T>,
  operationName: string,
  context: string,
): Promise<T> {
  // Guarantee at least one attempt. With a budget below 1 (or NaN) the operation would
  // otherwise never run and the function would end by throwing `undefined`.
  const maxAttempts = RETRY_CONFIG.maxRetries >= 1 ? RETRY_CONFIG.maxRetries : 1;

  for (let attempt = 1; ; attempt++) {
    try {
      return await operation();
    } catch (error) {
      // Give up immediately on permanent failures and once the attempt budget is spent.
      if (!isRetryableError(error) || attempt >= maxAttempts) {
        throw error;
      }

      // Exponential backoff: initialDelayMs, 2x, 4x, ... never exceeding maxDelayMs.
      const delay = Math.min(RETRY_CONFIG.initialDelayMs * 2 ** (attempt - 1), RETRY_CONFIG.maxDelayMs);
      logger.warn(
        `${operationName} failed for ${context} (attempt ${attempt} /${maxAttempts} ), ` +
          `retrying in ${delay} ms. Error: ${error} `,
      );
      await sleep(delay);
    }
  }
}
64+
// Runner-list payload types, looked up from the `Endpoints` map by REST route:
// the `runners` array of the org-level and the repo-level "list runners" responses.
// NOTE(review): `Endpoints` is presumably the Octokit endpoint type map — confirm against the file's imports.
1765type OrgRunnerList = Endpoints [ 'GET /orgs/{org}/actions/runners' ] [ 'response' ] [ 'data' ] [ 'runners' ] ;
1866type RepoRunnerList = Endpoints [ 'GET /repos/{owner}/{repo}/actions/runners' ] [ 'response' ] [ 'data' ] [ 'runners' ] ;
// A single runner entry from either list.
1967type RunnerState = OrgRunnerList [ number ] | RepoRunnerList [ number ] ;
@@ -127,6 +175,33 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean {
127175 return launchTimePlusMinimum < now ;
128176}
129177
/**
 * De-registers one self-hosted runner from GitHub, routing the call through `withRetry`.
 *
 * For an `'Org'` runner, `ec2runner.owner` is passed as the organisation. For any other type
 * it is split on `/` and the first two segments are passed as repository owner and name.
 *
 * @param githubAppClient - Client used to issue the delete request.
 * @param ec2runner - Runner record supplying `type`, `owner` and `instanceId`.
 * @param ghRunnerId - GitHub-side id of the runner to delete.
 * @returns The `status` of the delete response.
 * @throws Whatever `withRetry` rethrows when the delete request keeps failing.
 */
async function deleteGitHubRunner(
  githubAppClient: Octokit,
  ec2runner: RunnerInfo,
  ghRunnerId: number,
): Promise<number> {
  const requestDeletion = async () => {
    if (ec2runner.type === 'Org') {
      const orgResponse = await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
        runner_id: ghRunnerId,
        org: ec2runner.owner,
      });
      return orgResponse.status;
    }

    // Repo-level runner: `owner` holds "<owner>/<repo>".
    const [owner, repo] = ec2runner.owner.split('/');
    const repoResponse = await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
      runner_id: ghRunnerId,
      owner,
      repo,
    });
    return repoResponse.status;
  };

  const target = `runner ${ec2runner.instanceId} (GitHub ID: ${ghRunnerId} )`;
  const status = await withRetry(requestDeletion, 'Delete GitHub runner', target);
  return status;
}
204+
130205async function removeRunner ( ec2runner : RunnerInfo , ghRunnerIds : number [ ] ) : Promise < void > {
131206 const githubAppClient = await getOrCreateOctokit ( ec2runner ) ;
132207 try {
@@ -146,28 +221,35 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi
146221 ) ;
147222
148223 if ( states . every ( ( busy ) => busy === false ) ) {
149- const statuses = await Promise . all (
224+ const results = await Promise . all (
150225 ghRunnerIds . map ( async ( ghRunnerId ) => {
151- return (
152- ec2runner . type === 'Org'
153- ? await githubAppClient . actions . deleteSelfHostedRunnerFromOrg ( {
154- runner_id : ghRunnerId ,
155- org : ec2runner . owner ,
156- } )
157- : await githubAppClient . actions . deleteSelfHostedRunnerFromRepo ( {
158- runner_id : ghRunnerId ,
159- owner : ec2runner . owner . split ( '/' ) [ 0 ] ,
160- repo : ec2runner . owner . split ( '/' ) [ 1 ] ,
161- } )
162- ) . status ;
226+ try {
227+ const status = await deleteGitHubRunner ( githubAppClient , ec2runner , ghRunnerId ) ;
228+ return { ghRunnerId, status, success : status === 204 } ;
229+ } catch ( error ) {
230+ logger . error (
231+ `Failed to de-register GitHub runner ${ ghRunnerId } for instance '${ ec2runner . instanceId } ' after retries. Error: ${ error } ` ,
232+ { error : error as Error } ,
233+ ) ;
234+ return { ghRunnerId, status : 0 , success : false } ;
235+ }
163236 } ) ,
164237 ) ;
165238
166- if ( statuses . every ( ( status ) => status == 204 ) ) {
239+ const allSucceeded = results . every ( ( r ) => r . success ) ;
240+ const failedRunners = results . filter ( ( r ) => ! r . success ) ;
241+
242+ if ( allSucceeded ) {
167243 await terminateRunner ( ec2runner . instanceId ) ;
168244 logger . info ( `AWS runner instance '${ ec2runner . instanceId } ' is terminated and GitHub runner is de-registered.` ) ;
169245 } else {
170- logger . error ( `Failed to de-register GitHub runner: ${ statuses } ` ) ;
246+ // Only terminate EC2 if we successfully de-registered from GitHub
247+ // Otherwise, leave the instance running so the next scale-down cycle can retry
248+ logger . error (
249+ `Failed to de-register ${ failedRunners . length } GitHub runner(s) for instance '${ ec2runner . instanceId } '. ` +
250+ `Instance will NOT be terminated to allow retry on next scale-down cycle. ` +
251+ `Failed runner IDs: ${ failedRunners . map ( ( r ) => r . ghRunnerId ) . join ( ', ' ) } ` ,
252+ ) ;
171253 }
172254 } else {
173255 logger . info ( `Runner '${ ec2runner . instanceId } ' cannot be de-registered, because it is still busy.` ) ;
0 commit comments