@@ -136,10 +136,10 @@ for (int dm_offset = 0; dm_offset < jk.n_dm; dm_offset += 8) {
136136 for (int batch_kl = 0; batch_kl < 21; ++batch_kl) {
137137 int task_kl0 = blockIdx.y * 336 + batch_kl * 16;
138138 if (task_kl0 >= npairs_kl) {
139- continue ;
139+ break ;
140140 }
141141 if (pair_ij_mapping == pair_kl_mapping && task_ij0+16 <= task_kl0) {
142- continue ;
142+ break ;
143143 }
144144 int pair_ij0 = pair_ij_mapping[task_ij0];
145145 int pair_kl0 = pair_kl_mapping[task_kl0];
@@ -159,7 +159,7 @@ for (int dm_offset = 0; dm_offset < jk.n_dm; dm_offset += 8) {
159159 int kl_loc0 = pair_kl_loc[task_kl];
160160 if (pair_ij_mapping == pair_kl_mapping) {
161161 if (task_ij == task_kl) fac *= .5;
162- if (task_ij < task_kl) fac = 0.;
162+ else if (task_ij < task_kl) fac = 0.;
163163 }
164164 __syncthreads();
165165 double xij = Rp_cache[tx+0];
@@ -443,7 +443,7 @@ for (int dm_offset = 0; dm_offset < jk.n_dm; dm_offset += 8) {
443443 for (int batch_kl = 0; batch_kl < 21; ++batch_kl) {
444444 int task_kl0 = blockIdx.y * 336 + batch_kl * 16;
445445 if (task_kl0 >= npairs_kl) {
446- continue ;
446+ break ;
447447 }
448448 int pair_ij0 = pair_ij_mapping[task_ij0];
449449 int pair_kl0 = pair_kl_mapping[task_kl0];
@@ -821,10 +821,10 @@ for (int dm_offset = 0; dm_offset < jk.n_dm; dm_offset += 8) {
821821 for (int batch_kl = 0; batch_kl < 6; ++batch_kl) {
822822 int task_kl0 = blockIdx.y * 96 + batch_kl * 16;
823823 if (task_kl0 >= npairs_kl) {
824- continue ;
824+ break ;
825825 }
826826 if (pair_ij_mapping == pair_kl_mapping && task_ij0+16 <= task_kl0) {
827- continue ;
827+ break ;
828828 }
829829 int pair_ij0 = pair_ij_mapping[task_ij0];
830830 int pair_kl0 = pair_kl_mapping[task_kl0];
@@ -844,7 +844,7 @@ for (int dm_offset = 0; dm_offset < jk.n_dm; dm_offset += 8) {
844844 int kl_loc0 = pair_kl_loc[task_kl];
845845 if (pair_ij_mapping == pair_kl_mapping) {
846846 if (task_ij == task_kl) fac *= .5;
847- if (task_ij < task_kl) fac = 0.;
847+ else if (task_ij < task_kl) fac = 0.;
848848 }
849849 __syncthreads();
850850 double xij = Rp_cache[tx+0];
@@ -1593,7 +1593,7 @@ for (int dm_offset = 0; dm_offset < jk.n_dm; dm_offset += 8) {
15931593 for (int batch_kl = 0; batch_kl < 16; ++batch_kl) {
15941594 int task_kl0 = blockIdx.y * 256 + batch_kl * 16;
15951595 if (task_kl0 >= npairs_kl) {
1596- continue ;
1596+ break ;
15971597 }
15981598 int pair_ij0 = pair_ij_mapping[task_ij0];
15991599 int pair_kl0 = pair_kl_mapping[task_kl0];
@@ -2127,7 +2127,7 @@ for (int dm_offset = 0; dm_offset < jk.n_dm; dm_offset += 4) {
21272127 for (int batch_kl = 0; batch_kl < 10; ++batch_kl) {
21282128 int task_kl0 = blockIdx.y * 160 + batch_kl * 16;
21292129 if (task_kl0 >= npairs_kl) {
2130- continue ;
2130+ break ;
21312131 }
21322132 int pair_ij0 = pair_ij_mapping[task_ij0];
21332133 int pair_kl0 = pair_kl_mapping[task_kl0];
@@ -3160,10 +3160,10 @@ for (int dm_offset = 0; dm_offset < jk.n_dm; dm_offset += 4) {
31603160 for (int batch_kl = 0; batch_kl < 4; ++batch_kl) {
31613161 int task_kl0 = blockIdx.y * 64 + batch_kl * 16;
31623162 if (task_kl0 >= npairs_kl) {
3163- continue ;
3163+ break ;
31643164 }
31653165 if (pair_ij_mapping == pair_kl_mapping && task_ij0+16 <= task_kl0) {
3166- continue ;
3166+ break ;
31673167 }
31683168 int pair_ij0 = pair_ij_mapping[task_ij0];
31693169 int pair_kl0 = pair_kl_mapping[task_kl0];
@@ -3183,7 +3183,7 @@ for (int dm_offset = 0; dm_offset < jk.n_dm; dm_offset += 4) {
31833183 int kl_loc0 = pair_kl_loc[task_kl];
31843184 if (pair_ij_mapping == pair_kl_mapping) {
31853185 if (task_ij == task_kl) fac *= .5;
3186- if (task_ij < task_kl) fac = 0.;
3186+ else if (task_ij < task_kl) fac = 0.;
31873187 }
31883188 __syncthreads();
31893189 double xij = Rp_cache[tx+0];
@@ -5337,7 +5337,7 @@ for (int dm_offset = 0; dm_offset < jk.n_dm; dm_offset += 4) {
53375337 for (int batch_kl = 0; batch_kl < 21; ++batch_kl) {
53385338 int task_kl0 = blockIdx.y * 336 + batch_kl * 16;
53395339 if (task_kl0 >= npairs_kl) {
5340- continue ;
5340+ break ;
53415341 }
53425342 int pair_ij0 = pair_ij_mapping[task_ij0];
53435343 int pair_kl0 = pair_kl_mapping[task_kl0];
@@ -5988,7 +5988,7 @@ for (int dm_offset = 0; dm_offset < jk.n_dm; dm_offset += 4) {
59885988 for (int batch_kl = 0; batch_kl < 6; ++batch_kl) {
59895989 int task_kl0 = blockIdx.y * 96 + batch_kl * 16;
59905990 if (task_kl0 >= npairs_kl) {
5991- continue ;
5991+ break ;
59925992 }
59935993 int pair_ij0 = pair_ij_mapping[task_ij0];
59945994 int pair_kl0 = pair_kl_mapping[task_kl0];
@@ -7884,7 +7884,7 @@ for (int dm_offset = 0; dm_offset < jk.n_dm; dm_offset += 2) {
78847884 for (int batch_kl = 0; batch_kl < 24; ++batch_kl) {
78857885 int task_kl0 = blockIdx.y * 384 + batch_kl * 16;
78867886 if (task_kl0 >= npairs_kl) {
7887- continue ;
7887+ break ;
78887888 }
78897889 int pair_ij0 = pair_ij_mapping[task_ij0];
78907890 int pair_kl0 = pair_kl_mapping[task_kl0];
@@ -8470,7 +8470,7 @@ for (int dm_offset = 0; dm_offset < jk.n_dm; dm_offset += 2) {
84708470 for (int batch_kl = 0; batch_kl < 9; ++batch_kl) {
84718471 int task_kl0 = blockIdx.y * 144 + batch_kl * 16;
84728472 if (task_kl0 >= npairs_kl) {
8473- continue ;
8473+ break ;
84748474 }
84758475 int pair_ij0 = pair_ij_mapping[task_ij0];
84768476 int pair_kl0 = pair_kl_mapping[task_kl0];
@@ -10111,7 +10111,7 @@ for (int dm_offset = 0; dm_offset < jk.n_dm; dm_offset += 2) {
1011110111 for (int batch_kl = 0; batch_kl < 12; ++batch_kl) {
1011210112 int task_kl0 = blockIdx.y * 192 + batch_kl * 16;
1011310113 if (task_kl0 >= npairs_kl) {
10114- continue ;
10114+ break ;
1011510115 }
1011610116 int pair_ij0 = pair_ij_mapping[task_ij0];
1011710117 int pair_kl0 = pair_kl_mapping[task_kl0];
0 commit comments