// inscode.S
// AArch64 assembly source, GNU assembler syntax, run through the C preprocessor.
//----------------------------------------------------------------------------
// Copyright (c) 2023, Thierry Lelegard
// BSD 2-Clause License, see LICENSE file.
//----------------------------------------------------------------------------
// Open a global function named by the argument: export the symbol, switch
// to the code section, align on a 4-byte boundary and emit the entry label.
.macro func symbol
    .globl  \symbol
    .text
    .p2align 2
\symbol:
.endm
// Open a new structure description (typically a stack frame) by rewinding
// the running field offset to zero.
.macro struct_begin
    .set    .struct_offset, 0
.endm
// Add one field to the structure being described.
// - name: symbol which receives the byte offset of the field.
// - size: size of the field in bytes, added to the running offset.
.macro struct_field name, size
    .set    \name, .struct_offset
    .set    .struct_offset, .struct_offset + \size
.endm
// Realign inside the current structure: round the running offset up to the
// next multiple of the alignment (unchanged when already aligned).
// - align: alignment in bytes, must be non-zero. The argument is
//   parenthesized at each use so that an expression such as 2*4 is
//   evaluated as a whole instead of being split by operator precedence.
.macro struct_align align
.struct_offset = .struct_offset + ((\align) - (.struct_offset % (\align))) % (\align)
.endm
// Close the structure being described.
// - size_name: symbol which receives the total size of the structure.
// - align: alignment in bytes applied to the total size (default: 16).
.macro struct_end size_name, align=16
    struct_align \align
    .set    \size_name, .struct_offset
.endm
// Available registers in tested instruction loop:
// - x0: iteration count, don't modify
// - x1-x2: generic operands, initial values 0x0123456789ABCDEF and 0xFEDCBA9876543210
// - x3-x15: free to use without breaking the ABI, initially zero
// - x16-x17: temporary registers, initially zero
// - x18: don't use, forbidden by Arm ABI (random crash on Apple M3 if used)
// - x19-x20: saved registers, initially zero
// - x21-x28: saved registers, initially contain the same fixed values as x1, x2 (by pairs)
// Numbers of the 16 scratch registers (x3-x17 and x19, skipping x18), all
// initially zero. Meant to be iterated with ".irp reg,ALL_REGS".
#define ALL_REGS 3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,19
// Common prolog of every benchmark function in this module.
// C profile: double func(int64_t iterations, int64_t bool_empty_loop);
// - iterations: number of passes through the measured instruction sequence.
// - bool_empty_loop: if non-zero, the time of an empty loop is subtracted.
// - Return: average time of one instruction of the sequence, in nanoseconds.
// The measured sequence is written between xxx_begin and xxx_end. Numeric
// labels 1 and 3 are defined here and referenced from xxx_end; label 5,
// the loop exit, is defined in xxx_end.
//
.macro xxx_begin symbol
    func \symbol

    // Description of the stack frame, offsets from sp after the prolog.
    struct_begin
    struct_field xxx.call_frame, 16     // x29, x30
    struct_field xxx.saved_x19_x20, 16  // callee-saved x19, x20
    struct_field xxx.saved_x21_x22, 16  // callee-saved x21, x22
    struct_field xxx.saved_x23_x24, 16  // callee-saved x23, x24
    struct_field xxx.saved_x25_x26, 16  // callee-saved x25, x26
    struct_field xxx.saved_x27_x28, 16  // callee-saved x27, x28
    struct_field xxx.iter, 8            // copy of the iteration count
    struct_field xxx.bel, 8             // copy of bool_empty_loop
    struct_field xxx.time1, 8           // start, then duration, of the main loop
    struct_field xxx.time2, 8           // start of the empty loop
    struct_end   xxx.stack_size

    // Allocate the frame, link it, preserve the callee-saved registers.
    stp     x29, x30, [sp, -xxx.stack_size]!
    mov     x29, sp
    stp     x27, x28, [sp, xxx.saved_x27_x28]
    stp     x25, x26, [sp, xxx.saved_x25_x26]
    stp     x23, x24, [sp, xxx.saved_x23_x24]
    stp     x21, x22, [sp, xxx.saved_x21_x22]
    stp     x19, x20, [sp, xxx.saved_x19_x20]

    // Keep both parameters in the frame, x1 is reused as operand below.
    str     x1, [sp, xxx.bel]
    str     x0, [sp, xxx.iter]

    // First operand: x1 = 0x0123456789ABCDEF, duplicated in x21, x23, x25, x27.
    mov     x1, #0xCDEF
    movk    x1, #0x89AB, lsl 16
    movk    x1, #0x4567, lsl 32
    movk    x1, #0x0123, lsl 48
    .irp reg,21,23,25,27
    mov     x\reg, x1
    .endr

    // Second operand: x2 = 0xFEDCBA9876543210, duplicated in x22, x24, x26, x28.
    mov     x2, #0x3210
    movk    x2, #0x7654, lsl 16
    movk    x2, #0xBA98, lsl 32
    movk    x2, #0xFEDC, lsl 48
    .irp reg,22,24,26,28
    mov     x\reg, x2
    .endr

    // Zero the 16 scratch registers.
    .irp reg,ALL_REGS
    mov     x\reg, #0
    .endr

    // Record the start time: instruction barrier, then virtual timer counter.
    isb
    mrs     x20, cntvct_el0
    str     x20, [sp, xxx.time1]
    mov     x20, #0                     // x20 becomes a zeroed scratch register too

    // Head of the main instruction loop: leave when the count reaches zero.
1:  cbz     x0, 5f
    sub     x0, x0, #1
3:
.endm
// Common epilog of every benchmark function in this module.
// Closes the measured loop opened by xxx_begin (labels 1 and 3 are defined
// there), optionally times an empty loop with the same iteration count and
// subtracts it, converts the elapsed virtual timer ticks into nanoseconds
// and returns in d0 the average duration of one instruction of the loop body.
// The number of instructions in the body is computed from labels 3 and 4.
// The net tick count is converted as a signed value: when the empty loop
// happens to measure longer than the instruction loop, the result is a small
// negative number instead of a huge positive one.
// Note: with a zero iteration count, the final division is by zero and the
// returned value is not a finite number.
.macro xxx_end
4:  b       1b                          // loop back to the iteration test
5:  // End of main instruction loop.
    isb
    mrs     x0, cntvct_el0
    str     x0, [sp, xxx.time2]         // time2 = start of second loop
    ldr     x1, [sp, xxx.time1]
    sub     x0, x0, x1                  // x0 = duration of the instruction loop, in virtual timer units
    ldr     x1, [sp, xxx.bel]           // x1 = bool_empty_loop
    cbz     x1, 8f                      // skip the empty loop when its time is not subtracted
    str     x0, [sp, xxx.time1]         // time1 = duration of the instruction loop
    ldr     x0, [sp, xxx.iter]
    // Start of empty instruction loop, same control instructions as the main loop.
6:  cbz     x0, 7f
    sub     x0, x0, #1
    b       6b
7:  // End of empty instruction loop.
    isb
    mrs     x0, cntvct_el0
    ldr     x1, [sp, xxx.time2]
    sub     x2, x0, x1                  // x2 = duration of empty loop
    ldr     x1, [sp, xxx.time1]         // x1 = duration of the instruction loop
    sub     x0, x1, x2                  // x0 = instructions duration, in virtual timer units (may be negative)
8:  scvtf   d0, x0                      // signed conversion, a negative difference stays negative
    mov     x1, #0xCA00                 // x1 = 0x3B9ACA00 = 1 billion = nano-sec / sec
    movk    x1, #0x3B9A, lsl 16
    ucvtf   d1, x1
    fmul    d2, d0, d1                  // d2 = instructions duration * 1 billion
    mrs     x1, cntfrq_el0              // x1 = frequency in Hz of virtual timer
    ucvtf   d1, x1
    fdiv    d3, d2, d1                  // d3 = instructions duration in nanoseconds
    mov     x1, #(4b - 3b) / 4          // x1 = number of instructions in main loop body
    ldr     x0, [sp, xxx.iter]
    mul     x2, x1, x0                  // x2 = total number of executed instructions
    ucvtf   d2, x2
    fdiv    d0, d3, d2                  // d0 (result) = average duration of one instruction in nanoseconds
    // Function epilog: restore callee-saved registers and release the frame.
    ldp     x19, x20, [sp, xxx.saved_x19_x20]
    ldp     x21, x22, [sp, xxx.saved_x21_x22]
    ldp     x23, x24, [sp, xxx.saved_x23_x24]
    ldp     x25, x26, [sp, xxx.saved_x25_x26]
    ldp     x27, x28, [sp, xxx.saved_x27_x28]
    ldp     x29, x30, [sp], xxx.stack_size
    ret
.endm
//----------------------------------------------------------------------------
// To keep a fair comparison between all sequences, try to keep all
// loops to the same number of instructions. Otherwise, distinct usages of
// cache may introduce some bias. Currently, use 32 instructions per loop
// (33 for the sequences made of three-instruction groups, i.e. 11 groups).
// Loop body: 32 nop instructions, emitted as 8 groups of 4.
xxx_begin xxx_nop
    .rept 8
    nop
    nop
    nop
    nop
    .endr
xxx_end
// Loop body: 32 mul of x1 by x2, two passes over the 16 scratch registers.
xxx_begin xxx_mul
    .irp dst,ALL_REGS,ALL_REGS
    mul     x\dst, x1, x2
    .endr
xxx_end
// Loop body: 32 umulh of x1 by x2, two passes over the 16 scratch registers.
xxx_begin xxx_umulh
    .irp dst,ALL_REGS,ALL_REGS
    umulh   x\dst, x1, x2
    .endr
xxx_end
// Loop body: 32 udiv of x2 by x1, two passes over the 16 scratch registers.
xxx_begin xxx_div
    .irp dst,ALL_REGS,ALL_REGS
    udiv    x\dst, x2, x1
    .endr
xxx_end
// Loop body: 32 add of x1 and x2, two passes over the 16 scratch registers.
xxx_begin xxx_add
    .irp dst,ALL_REGS,ALL_REGS
    add     x\dst, x1, x2
    .endr
xxx_end
// Loop body: 32 adcs of x1 and x2, two passes over the 16 scratch registers.
xxx_begin xxx_adcs
    .irp dst,ALL_REGS,ALL_REGS
    adcs    x\dst, x1, x2
    .endr
xxx_end
// Loop body: 32 adds of x1 and x2, two passes over the 16 scratch registers.
xxx_begin xxx_adds
    .irp dst,ALL_REGS,ALL_REGS
    adds    x\dst, x1, x2
    .endr
xxx_end
// Loop body: 32 adc of x1 and x2, two passes over the 16 scratch registers.
xxx_begin xxx_adc
    .irp dst,ALL_REGS,ALL_REGS
    adc     x\dst, x1, x2
    .endr
xxx_end
// One mul followed by one umulh, each writing its own scratch register.
.macro xxx_mul_umulh_step da, db
    mul     x\da, x1, x2
    umulh   x\db, x1, x2
.endm

// Loop body: alternate mul and umulh, 32 instructions.
xxx_begin xxx_mul_umulh
    .rept 2
    xxx_mul_umulh_step 3, 4
    xxx_mul_umulh_step 5, 6
    xxx_mul_umulh_step 7, 8
    xxx_mul_umulh_step 9, 10
    xxx_mul_umulh_step 11, 12
    xxx_mul_umulh_step 13, 14
    xxx_mul_umulh_step 15, 16
    xxx_mul_umulh_step 17, 19           // don't use x18
    .endr
xxx_end
// Group of four: mul, add, umulh, add, each writing its own scratch register.
.macro xxx_mul_add_umulh_add_step da, db, dc, dd
    mul     x\da, x1, x2
    add     x\db, x1, x2
    umulh   x\dc, x1, x2
    add     x\dd, x1, x2
.endm

// Loop body: mul / add / umulh / add pattern, 32 instructions.
xxx_begin xxx_mul_add_umulh_add
    .rept 2
    xxx_mul_add_umulh_add_step 3, 4, 5, 6
    xxx_mul_add_umulh_add_step 7, 8, 9, 10
    xxx_mul_add_umulh_add_step 11, 12, 13, 14
    xxx_mul_add_umulh_add_step 15, 16, 17, 19   // don't use x18
    .endr
xxx_end
// One mul followed by one add, each writing its own scratch register.
.macro xxx_mul_add_step da, db
    mul     x\da, x1, x2
    add     x\db, x1, x2
.endm

// Loop body: alternate mul and add, 32 instructions.
xxx_begin xxx_mul_add
    .rept 2
    xxx_mul_add_step 3, 4
    xxx_mul_add_step 5, 6
    xxx_mul_add_step 7, 8
    xxx_mul_add_step 9, 10
    xxx_mul_add_step 11, 12
    xxx_mul_add_step 13, 14
    xxx_mul_add_step 15, 16
    xxx_mul_add_step 17, 19             // don't use x18
    .endr
xxx_end
// One mul followed by one adc, each writing its own scratch register.
.macro xxx_mul_adc_step da, db
    mul     x\da, x1, x2
    adc     x\db, x1, x2
.endm

// Loop body: alternate mul and adc, 32 instructions.
xxx_begin xxx_mul_adc
    .rept 2
    xxx_mul_adc_step 3, 4
    xxx_mul_adc_step 5, 6
    xxx_mul_adc_step 7, 8
    xxx_mul_adc_step 9, 10
    xxx_mul_adc_step 11, 12
    xxx_mul_adc_step 13, 14
    xxx_mul_adc_step 15, 16
    xxx_mul_adc_step 17, 19             // don't use x18
    .endr
xxx_end
// One mul followed by one adds, each writing its own scratch register.
.macro xxx_mul_adds_step da, db
    mul     x\da, x1, x2
    adds    x\db, x1, x2
.endm

// Loop body: alternate mul and adds, 32 instructions.
xxx_begin xxx_mul_adds
    .rept 2
    xxx_mul_adds_step 3, 4
    xxx_mul_adds_step 5, 6
    xxx_mul_adds_step 7, 8
    xxx_mul_adds_step 9, 10
    xxx_mul_adds_step 11, 12
    xxx_mul_adds_step 13, 14
    xxx_mul_adds_step 15, 16
    xxx_mul_adds_step 17, 19            // don't use x18
    .endr
xxx_end
// One mul followed by one adcs, each writing its own scratch register.
.macro xxx_mul_adcs_step da, db
    mul     x\da, x1, x2
    adcs    x\db, x1, x2
.endm

// Loop body: alternate mul and adcs, 32 instructions.
xxx_begin xxx_mul_adcs
    .rept 2
    xxx_mul_adcs_step 3, 4
    xxx_mul_adcs_step 5, 6
    xxx_mul_adcs_step 7, 8
    xxx_mul_adcs_step 9, 10
    xxx_mul_adcs_step 11, 12
    xxx_mul_adcs_step 13, 14
    xxx_mul_adcs_step 15, 16
    xxx_mul_adcs_step 17, 19            // don't use x18
    .endr
xxx_end
// Group of four: mul, adcs, mul, adcs, reading in turn the input pairs
// (x1,x2), (x21,x22), (x23,x24), (x25,x26).
.macro xxx_mul_adcs_alt_step da, db, dc, dd
    mul     x\da, x1, x2
    adcs    x\db, x21, x22
    mul     x\dc, x23, x24
    adcs    x\dd, x25, x26
.endm

// Same as xxx_mul_adcs but alternate 4 pairs of input registers.
xxx_begin xxx_mul_adcs_alt
    .rept 2
    xxx_mul_adcs_alt_step 3, 4, 5, 6
    xxx_mul_adcs_alt_step 7, 8, 9, 10
    xxx_mul_adcs_alt_step 11, 12, 13, 14
    xxx_mul_adcs_alt_step 15, 16, 17, 19    // don't use x18
    .endr
xxx_end
// One umulh followed by one add, each writing its own scratch register.
.macro xxx_umulh_add_step da, db
    umulh   x\da, x1, x2
    add     x\db, x1, x2
.endm

// Loop body: alternate umulh and add, 32 instructions.
xxx_begin xxx_umulh_add
    .rept 2
    xxx_umulh_add_step 3, 4
    xxx_umulh_add_step 5, 6
    xxx_umulh_add_step 7, 8
    xxx_umulh_add_step 9, 10
    xxx_umulh_add_step 11, 12
    xxx_umulh_add_step 13, 14
    xxx_umulh_add_step 15, 16
    xxx_umulh_add_step 17, 19           // don't use x18
    .endr
xxx_end
// One umulh followed by one adc, each writing its own scratch register.
.macro xxx_umulh_adc_step da, db
    umulh   x\da, x1, x2
    adc     x\db, x1, x2
.endm

// Loop body: alternate umulh and adc, 32 instructions.
xxx_begin xxx_umulh_adc
    .rept 2
    xxx_umulh_adc_step 3, 4
    xxx_umulh_adc_step 5, 6
    xxx_umulh_adc_step 7, 8
    xxx_umulh_adc_step 9, 10
    xxx_umulh_adc_step 11, 12
    xxx_umulh_adc_step 13, 14
    xxx_umulh_adc_step 15, 16
    xxx_umulh_adc_step 17, 19           // don't use x18
    .endr
xxx_end
// One umulh followed by one adds, each writing its own scratch register.
.macro xxx_umulh_adds_step da, db
    umulh   x\da, x1, x2
    adds    x\db, x1, x2
.endm

// Loop body: alternate umulh and adds, 32 instructions.
xxx_begin xxx_umulh_adds
    .rept 2
    xxx_umulh_adds_step 3, 4
    xxx_umulh_adds_step 5, 6
    xxx_umulh_adds_step 7, 8
    xxx_umulh_adds_step 9, 10
    xxx_umulh_adds_step 11, 12
    xxx_umulh_adds_step 13, 14
    xxx_umulh_adds_step 15, 16
    xxx_umulh_adds_step 17, 19          // don't use x18
    .endr
xxx_end
// One umulh followed by one adcs, each writing its own scratch register.
.macro xxx_umulh_adcs_step da, db
    umulh   x\da, x1, x2
    adcs    x\db, x1, x2
.endm

// Loop body: alternate umulh and adcs, 32 instructions.
xxx_begin xxx_umulh_adcs
    .rept 2
    xxx_umulh_adcs_step 3, 4
    xxx_umulh_adcs_step 5, 6
    xxx_umulh_adcs_step 7, 8
    xxx_umulh_adcs_step 9, 10
    xxx_umulh_adcs_step 11, 12
    xxx_umulh_adcs_step 13, 14
    xxx_umulh_adcs_step 15, 16
    xxx_umulh_adcs_step 17, 19          // don't use x18
    .endr
xxx_end
// Group of four: umulh, adcs, umulh, adcs, reading in turn the input pairs
// (x1,x2), (x21,x22), (x23,x24), (x25,x26).
.macro xxx_umulh_adcs_alt_step da, db, dc, dd
    umulh   x\da, x1, x2
    adcs    x\db, x21, x22
    umulh   x\dc, x23, x24
    adcs    x\dd, x25, x26
.endm

// Same as xxx_umulh_adcs but alternate 4 pairs of input registers.
xxx_begin xxx_umulh_adcs_alt
    .rept 2
    xxx_umulh_adcs_alt_step 3, 4, 5, 6
    xxx_umulh_adcs_alt_step 7, 8, 9, 10
    xxx_umulh_adcs_alt_step 11, 12, 13, 14
    xxx_umulh_adcs_alt_step 15, 16, 17, 19  // don't use x18
    .endr
xxx_end
// Group of three: umulh, nop, adcs. Only the first and last write a register.
.macro xxx_umulh_nop_adcs_step da, db
    umulh   x\da, x1, x2
    nop
    adcs    x\db, x1, x2
.endm

// Loop body: 11 umulh / nop / adcs groups, 33 instructions.
xxx_begin xxx_umulh_nop_adcs
    xxx_umulh_nop_adcs_step 3, 5
    xxx_umulh_nop_adcs_step 6, 8
    xxx_umulh_nop_adcs_step 9, 11
    xxx_umulh_nop_adcs_step 12, 14
    xxx_umulh_nop_adcs_step 15, 17
    xxx_umulh_nop_adcs_step 19, 4       // don't use x18
    xxx_umulh_nop_adcs_step 5, 7
    xxx_umulh_nop_adcs_step 8, 10
    xxx_umulh_nop_adcs_step 11, 13
    xxx_umulh_nop_adcs_step 14, 16
    xxx_umulh_nop_adcs_step 17, 19      // don't use x18
xxx_end
// Group of three: umulh, adds, adcs, each writing its own register.
.macro xxx_umulh_adds_adcs_step da, db, dc
    umulh   x\da, x1, x2
    adds    x\db, x1, x2
    adcs    x\dc, x1, x2
.endm

// Loop body: 11 umulh / adds / adcs groups, 33 instructions.
xxx_begin xxx_umulh_adds_adcs
    xxx_umulh_adds_adcs_step 3, 4, 5
    xxx_umulh_adds_adcs_step 6, 7, 8
    xxx_umulh_adds_adcs_step 9, 10, 11
    xxx_umulh_adds_adcs_step 12, 13, 14
    xxx_umulh_adds_adcs_step 15, 16, 17
    xxx_umulh_adds_adcs_step 19, 3, 4   // don't use x18
    xxx_umulh_adds_adcs_step 5, 6, 7
    xxx_umulh_adds_adcs_step 8, 9, 10
    xxx_umulh_adds_adcs_step 11, 12, 13
    xxx_umulh_adds_adcs_step 14, 15, 16
    xxx_umulh_adds_adcs_step 17, 19, 20 // don't use x18
xxx_end
// Group of three: umulh, add, adcs, each writing its own register.
.macro xxx_umulh_add_adcs_step da, db, dc
    umulh   x\da, x1, x2
    add     x\db, x1, x2
    adcs    x\dc, x1, x2
.endm

// Loop body: 11 umulh / add / adcs groups, 33 instructions.
xxx_begin xxx_umulh_add_adcs
    xxx_umulh_add_adcs_step 3, 4, 5
    xxx_umulh_add_adcs_step 6, 7, 8
    xxx_umulh_add_adcs_step 9, 10, 11
    xxx_umulh_add_adcs_step 12, 13, 14
    xxx_umulh_add_adcs_step 15, 16, 17
    xxx_umulh_add_adcs_step 19, 3, 4    // don't use x18
    xxx_umulh_add_adcs_step 5, 6, 7
    xxx_umulh_add_adcs_step 8, 9, 10
    xxx_umulh_add_adcs_step 11, 12, 13
    xxx_umulh_add_adcs_step 14, 15, 16
    xxx_umulh_add_adcs_step 17, 19, 20  // don't use x18
xxx_end
// Group of three: umulh, adcs, adcs, each writing its own register.
.macro xxx_umulh_adcs_adcs_step da, db, dc
    umulh   x\da, x1, x2
    adcs    x\db, x1, x2
    adcs    x\dc, x1, x2
.endm

// Loop body: 11 umulh / adcs / adcs groups, 33 instructions.
xxx_begin xxx_umulh_adcs_adcs
    xxx_umulh_adcs_adcs_step 3, 4, 5
    xxx_umulh_adcs_adcs_step 6, 7, 8
    xxx_umulh_adcs_adcs_step 9, 10, 11
    xxx_umulh_adcs_adcs_step 12, 13, 14
    xxx_umulh_adcs_adcs_step 15, 16, 17
    xxx_umulh_adcs_adcs_step 19, 3, 4   // don't use x18
    xxx_umulh_adcs_adcs_step 5, 6, 7
    xxx_umulh_adcs_adcs_step 8, 9, 10
    xxx_umulh_adcs_adcs_step 11, 12, 13
    xxx_umulh_adcs_adcs_step 14, 15, 16
    xxx_umulh_adcs_adcs_step 17, 19, 20 // don't use x18
xxx_end
// Group of three: umulh, adc, adds, each writing its own register.
.macro xxx_umulh_adc_adds_step da, db, dc
    umulh   x\da, x1, x2
    adc     x\db, x1, x2
    adds    x\dc, x1, x2
.endm

// Loop body: 11 umulh / adc / adds groups, 33 instructions.
xxx_begin xxx_umulh_adc_adds
    xxx_umulh_adc_adds_step 3, 4, 5
    xxx_umulh_adc_adds_step 6, 7, 8
    xxx_umulh_adc_adds_step 9, 10, 11
    xxx_umulh_adc_adds_step 12, 13, 14
    xxx_umulh_adc_adds_step 15, 16, 17
    xxx_umulh_adc_adds_step 19, 3, 4    // don't use x18
    xxx_umulh_adc_adds_step 5, 6, 7
    xxx_umulh_adc_adds_step 8, 9, 10
    xxx_umulh_adc_adds_step 11, 12, 13
    xxx_umulh_adc_adds_step 14, 15, 16
    xxx_umulh_adc_adds_step 17, 19, 20  // don't use x18
xxx_end
// Group of three: umulh, adc, adds, where the adds reads the register
// which was just written by the adc (register dependency).
.macro xxx_umulh_adc_adds_depreg_step da, db, dc
    umulh   x\da, x1, x2
    adc     x\db, x1, x2
    adds    x\dc, x\db, x2
.endm

// Loop body: 11 umulh / adc / dependent adds groups, 33 instructions.
xxx_begin xxx_umulh_adc_adds_depreg
    xxx_umulh_adc_adds_depreg_step 3, 4, 5
    xxx_umulh_adc_adds_depreg_step 6, 7, 8
    xxx_umulh_adc_adds_depreg_step 9, 10, 11
    xxx_umulh_adc_adds_depreg_step 12, 13, 14
    xxx_umulh_adc_adds_depreg_step 15, 16, 17
    xxx_umulh_adc_adds_depreg_step 19, 3, 4     // don't use x18
    xxx_umulh_adc_adds_depreg_step 5, 6, 7
    xxx_umulh_adc_adds_depreg_step 8, 9, 10
    xxx_umulh_adc_adds_depreg_step 11, 12, 13
    xxx_umulh_adc_adds_depreg_step 14, 15, 16
    xxx_umulh_adc_adds_depreg_step 17, 19, 20   // don't use x18
xxx_end
// Group of four: mul, adcs, umulh, adcs, each writing its own scratch register.
.macro xxx_mul_adcs_umulh_adcs_step da, db, dc, dd
    mul     x\da, x1, x2
    adcs    x\db, x1, x2
    umulh   x\dc, x1, x2
    adcs    x\dd, x1, x2
.endm

// Loop body: mul / adcs / umulh / adcs pattern, 32 instructions.
xxx_begin xxx_mul_adcs_umulh_adcs
    .rept 2
    xxx_mul_adcs_umulh_adcs_step 3, 4, 5, 6
    xxx_mul_adcs_umulh_adcs_step 7, 8, 9, 10
    xxx_mul_adcs_umulh_adcs_step 11, 12, 13, 14
    xxx_mul_adcs_umulh_adcs_step 15, 16, 17, 19 // don't use x18
    .endr
xxx_end
// Loop body: 32 pacia with sp as modifier, two passes over the 16 scratch registers.
xxx_begin xxx_pacia
    .irp ptr,ALL_REGS,ALL_REGS
    pacia   x\ptr, sp
    .endr
xxx_end
// Loop body: for each of the 16 scratch registers, pacia immediately
// followed by autia on the same register, with sp as modifier.
xxx_begin xxx_pacia_autia
    .irp ptr,ALL_REGS
    pacia   x\ptr, sp
    autia   x\ptr, sp
    .endr
xxx_end
// Loop body: pacia on all 16 scratch registers, then autia on all of them,
// with sp as modifier.
xxx_begin xxx_pacia_autia_2
    // First half: the 16 pacia.
    .irp ptr,ALL_REGS
    pacia   x\ptr, sp
    .endr
    // Second half: the 16 autia, in the same register order.
    .irp ptr,ALL_REGS
    autia   x\ptr, sp
    .endr
xxx_end