@@ -70,61 +70,61 @@ Buffer<uint16_t, 2> blur_fast(Buffer<uint16_t, 2> in) {
70
70
}
71
71
}
72
72
#elif __ARM_NEON
73
- uint16x4_t one_third = vdup_n_u16 (21846 );
73
+ uint16x4_t one_third = vdup_n_u16 (21846 );
74
74
#pragma omp parallel for
75
- for (int yTile = 0 ; yTile < out.height (); yTile += 32 ) {
76
- uint16x8_t tmp[(128 / 8 ) * (32 + 2 )];
77
- for (int xTile = 0 ; xTile < out.width (); xTile += 128 ) {
78
- uint16_t *tmpPtr = (uint16_t *)tmp;
79
- for (int y = 0 ; y < 32 + 2 ; y++) {
80
- const uint16_t *inPtr = &(in (xTile, yTile + y));
81
- for (int x = 0 ; x < 128 ; x += 8 ) {
82
- uint16x8_t a = vld1q_u16 (inPtr);
83
- uint16x8_t b = vld1q_u16 (inPtr + 1 );
84
- uint16x8_t c = vld1q_u16 (inPtr + 2 );
85
- uint16x8_t sum = vaddq_u16 (vaddq_u16 (a, b), c);
86
- uint16x4_t sumlo = vget_low_u16 (sum);
87
- uint16x4_t sumhi = vget_high_u16 (sum);
88
- uint16x4_t avglo = vshrn_n_u32 (vmull_u16 (sumlo, one_third), 16 );
89
- uint16x4_t avghi = vshrn_n_u32 (vmull_u16 (sumhi, one_third), 16 );
90
- uint16x8_t avg = vcombine_u16 (avglo, avghi);
91
- vst1q_u16 (tmpPtr, avg);
92
- tmpPtr += 8 ;
93
- inPtr += 8 ;
94
- }
75
+ for (int yTile = 0 ; yTile < out.height (); yTile += 32 ) {
76
+ uint16x8_t tmp[(128 / 8 ) * (32 + 2 )];
77
+ for (int xTile = 0 ; xTile < out.width (); xTile += 128 ) {
78
+ uint16_t *tmpPtr = (uint16_t *)tmp;
79
+ for (int y = 0 ; y < 32 + 2 ; y++) {
80
+ const uint16_t *inPtr = &(in (xTile, yTile + y));
81
+ for (int x = 0 ; x < 128 ; x += 8 ) {
82
+ uint16x8_t a = vld1q_u16 (inPtr);
83
+ uint16x8_t b = vld1q_u16 (inPtr + 1 );
84
+ uint16x8_t c = vld1q_u16 (inPtr + 2 );
85
+ uint16x8_t sum = vaddq_u16 (vaddq_u16 (a, b), c);
86
+ uint16x4_t sumlo = vget_low_u16 (sum);
87
+ uint16x4_t sumhi = vget_high_u16 (sum);
88
+ uint16x4_t avglo = vshrn_n_u32 (vmull_u16 (sumlo, one_third), 16 );
89
+ uint16x4_t avghi = vshrn_n_u32 (vmull_u16 (sumhi, one_third), 16 );
90
+ uint16x8_t avg = vcombine_u16 (avglo, avghi);
91
+ vst1q_u16 (tmpPtr, avg);
92
+ tmpPtr += 8 ;
93
+ inPtr += 8 ;
95
94
}
96
- tmpPtr = ( uint16_t *)tmp;
97
- for ( int y = 0 ; y < 32 ; y++) {
98
- uint16_t *outPtr = &( out (xTile, yTile + y));
99
- for ( int x = 0 ; x < 128 ; x += 8 ) {
100
- uint16x8_t a = vld1q_u16 (tmpPtr + ( 2 * 128 ));
101
- uint16x8_t b = vld1q_u16 (tmpPtr + 128 );
102
- uint16x8_t c = vld1q_u16 (tmpPtr);
103
- uint16x8_t sum = vaddq_u16 ( vaddq_u16 (a, b), c );
104
- uint16x4_t sumlo = vget_low_u16 (sum );
105
- uint16x4_t sumhi = vget_high_u16 (sum);
106
- uint16x4_t avglo = vshrn_n_u32 ( vmull_u16 (sumlo, one_third), 16 );
107
- uint16x4_t avghi = vshrn_n_u32 (vmull_u16 (sumhi , one_third), 16 );
108
- uint16x8_t avg = vcombine_u16 (avglo, avghi );
109
- vst1q_u16 (outPtr, avg );
110
- tmpPtr += 8 ;
111
- outPtr += 8 ;
112
- }
95
+ }
96
+ tmpPtr = ( uint16_t *)tmp;
97
+ for ( int y = 0 ; y < 32 ; y++) {
98
+ uint16_t *outPtr = &( out (xTile, yTile + y));
99
+ for ( int x = 0 ; x < 128 ; x += 8 ) {
100
+ uint16x8_t a = vld1q_u16 (tmpPtr + ( 2 * 128 ) );
101
+ uint16x8_t b = vld1q_u16 (tmpPtr + 128 );
102
+ uint16x8_t c = vld1q_u16 (tmpPtr );
103
+ uint16x8_t sum = vaddq_u16 ( vaddq_u16 (a, b), c );
104
+ uint16x4_t sumlo = vget_low_u16 (sum);
105
+ uint16x4_t sumhi = vget_high_u16 (sum );
106
+ uint16x4_t avglo = vshrn_n_u32 (vmull_u16 (sumlo , one_third), 16 );
107
+ uint16x4_t avghi = vshrn_n_u32 ( vmull_u16 (sumhi, one_third), 16 );
108
+ uint16x8_t avg = vcombine_u16 (avglo, avghi );
109
+ vst1q_u16 (outPtr, avg) ;
110
+ tmpPtr += 8 ;
111
+ outPtr += 8 ;
113
112
}
114
113
}
115
114
}
115
+ }
116
116
#else
117
- // No intrinsics enabled, do a naive thing.
118
- for (int y = 0 ; y < out.height (); y++) {
119
- for (int x = 0 ; x < out.width (); x++) {
120
- int tmp[3 ] = {
121
- (in (x, y) + in (x + 1 , y) + in (x + 2 , y)) / 3 ,
122
- (in (x, y + 1 ) + in (x + 1 , y + 1 ) + in (x + 2 , y + 1 )) / 3 ,
123
- (in (x, y + 2 ) + in (x + 1 , y + 2 ) + in (x + 2 , y + 2 )) / 3 ,
124
- };
125
- out (x, y) = (tmp[0 ] + tmp[1 ] + tmp[2 ]) / 3 ;
126
- }
117
+ // No intrinsics enabled, do a naive thing.
118
+ for (int y = 0 ; y < out.height (); y++) {
119
+ for (int x = 0 ; x < out.width (); x++) {
120
+ int tmp[3 ] = {
121
+ (in (x, y) + in (x + 1 , y) + in (x + 2 , y)) / 3 ,
122
+ (in (x, y + 1 ) + in (x + 1 , y + 1 ) + in (x + 2 , y + 1 )) / 3 ,
123
+ (in (x, y + 2 ) + in (x + 1 , y + 2 ) + in (x + 2 , y + 2 )) / 3 ,
124
+ };
125
+ out (x, y) = (tmp[0 ] + tmp[1 ] + tmp[2 ]) / 3 ;
127
126
}
127
+ }
128
128
#endif
129
129
});
130
130
0 commit comments