@@ -989,6 +989,162 @@ __kernel void cn1_v2_half(__global uint4 *Scratchpad, __global ulong *states, ui
989
989
# endif
990
990
}
991
991
992
// CryptoNight variant-2 "rwz" main memory-hard loop (phase 1 of 3).
// Differences from the plain v2 kernel visible here: the iteration count is
// 0x60000 (3/4 of the usual 0x80000) and the 64-byte shuffle writes chunks in
// reversed order (chunk3+bx1 -> CHUNK(1), chunk1+bx0 -> CHUNK(2)).
// One work-item processes one hash; `variant` and `input` are unused in this
// kernel — presumably kept so all cn1 kernels share a signature (TODO confirm).
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1_v2_rwz(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
{
#   if (ALGO == CRYPTONIGHT)
    ulong a[2], b[4];
    // Per-workgroup AES T-tables built in local memory from the base table.
    __local uint AES0[256], AES1[256], AES2[256], AES3[256];

    const ulong gIdx = getIdx();

    // Cooperatively fill the four tables; AES1..AES3 are byte-rotations of AES0.
    for (int i = get_local_id(0); i < 256; i += WORKSIZE)
    {
        const uint tmp = AES0_C[i];
        AES0[i] = tmp;
        AES1[i] = rotate(tmp, 8U);
        AES2[i] = rotate(tmp, 16U);
        AES3[i] = rotate(tmp, 24U);
    }

    barrier(CLK_LOCAL_MEM_FENCE);

#   if (COMP_MODE == 1)
    // do not use early return here
    if (gIdx < Threads)
#   endif
    {
        // Each hash owns a 25-ulong Keccak state.
        states += 25 * gIdx;

        // Point Scratchpad at this work-item's region; layout depends on the
        // configured indexing mode (contiguous, interleaved, or chunked).
#       if defined(__NV_CL_C_VERSION)
        Scratchpad += gIdx * (0x40000 >> 2);
#       else
#       if (STRIDED_INDEX == 0)
        Scratchpad += gIdx * (MEMORY >> 4);
#       elif (STRIDED_INDEX == 1)
        Scratchpad += gIdx;
#       elif (STRIDED_INDEX == 2)
        Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
#       endif
#       endif

        // Derive the AES key `a` and the two 128-bit shuffle operands b0/b1
        // from the Keccak state, per the variant-2 spec.
        a[0] = states[0] ^ states[4];
        a[1] = states[1] ^ states[5];

        b[0] = states[2] ^ states[6];
        b[1] = states[3] ^ states[7];
        b[2] = states[8] ^ states[10];
        b[3] = states[9] ^ states[11];
    }

    ulong2 bx0 = ((ulong2 *)b)[0];
    ulong2 bx1 = ((ulong2 *)b)[1];

    mem_fence(CLK_LOCAL_MEM_FENCE);

#   ifdef __NV_CL_C_VERSION
    // NVIDIA path: stage the whole 64-byte scratchpad line in local memory so
    // the four 16-byte chunk accesses hit local instead of global memory.
    __local uint16 scratchpad_line_buf[WORKSIZE];
    __local uint16 *scratchpad_line = scratchpad_line_buf + get_local_id(0);
#   define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4))))
#   else
    // SCRATCHPAD_CHUNK(N) addresses the N-th 16-byte chunk of the current
    // 64-byte line (N in 0..3), with the address math matching STRIDED_INDEX.
#   if (STRIDED_INDEX == 0)
#   define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (idx ^ (N << 4))))
#   elif (STRIDED_INDEX == 1)
#   define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + mul24(as_uint(idx ^ (N << 4)), Threads)))
#   elif (STRIDED_INDEX == 2)
#   define SCRATCHPAD_CHUNK(N) (*(__global uint4*)((__global uchar*)(Scratchpad) + (((idx ^ (N << 4)) % (MEM_CHUNK << 4)) + ((idx ^ (N << 4)) / (MEM_CHUNK << 4)) * WORKSIZE * (MEM_CHUNK << 4))))
#   endif
#   endif

#   if (COMP_MODE == 1)
    // do not use early return here
    if (gIdx < Threads)
#   endif
    {
        // Variant-2 integer-math state, seeded from the Keccak state.
        uint2 division_result = as_uint2(states[12]);
        uint sqrt_result = as_uint2(states[13]).s0;

        // RWZ: 0x60000 iterations (3/4 of standard CNv2).
        #pragma unroll UNROLL_FACTOR
        for (int i = 0; i < 0x60000; ++i)
        {
#           ifdef __NV_CL_C_VERSION
            // idx selects the 64-byte line, idx1 the chunk offset within it.
            uint idx = a[0] & 0x1FFFC0;
            uint idx1 = a[0] & 0x30;

            *scratchpad_line = *(__global uint16 *)((__global uchar *)(Scratchpad) + idx);
#           else
            uint idx = a[0] & MASK;
#           endif

            // First half-round: AES on chunk 0, keyed by `a`.
            uint4 c = SCRATCHPAD_CHUNK(0);
            c = AES_Round(AES0, AES1, AES2, AES3, c, ((uint4 *)a)[0]);

            // First shuffle — note the reversed (rwz) chunk rotation:
            // CHUNK(1) <- chunk3+bx1, CHUNK(2) <- chunk1+bx0, CHUNK(3) <- chunk2+a.
            {
                const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(3));
                const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
                const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(1));

                SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1);
                SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0);
                SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
            }

            SCRATCHPAD_CHUNK(0) = as_uint4(bx0) ^ c;

#           ifdef __NV_CL_C_VERSION
            // Flush the staged line back, then stage the line addressed by c.
            *(__global uint16 *)((__global uchar *)(Scratchpad) + idx) = *scratchpad_line;

            idx = as_ulong2(c).s0 & 0x1FFFC0;
            idx1 = as_ulong2(c).s0 & 0x30;

            *scratchpad_line = *(__global uint16 *)((__global uchar *)(Scratchpad) + idx);
#           else
            idx = as_ulong2(c).s0 & MASK;
#           endif

            uint4 tmp = SCRATCHPAD_CHUNK(0);

            // Variant-2 division/sqrt feedback: previous results perturb tmp,
            // then new results are derived from c for the next iteration.
            {
                tmp.s0 ^= division_result.s0;
                tmp.s1 ^= division_result.s1 ^ sqrt_result;

                division_result = fast_div_v2(as_ulong2(c).s1, (c.s0 + (sqrt_result << 1)) | 0x80000001UL);
                sqrt_result = fast_sqrt_v2(as_ulong2(c).s0 + as_ulong(division_result));
            }

            // 64x64 -> 128-bit multiply: t = {hi, lo} of c.lo * tmp.lo.
            ulong2 t;
            t.s0 = mul_hi(as_ulong2(c).s0, as_ulong2(tmp).s0);
            t.s1 = as_ulong2(c).s0 * as_ulong2(tmp).s0;
            // Second shuffle, interleaved with the multiply result
            // (chunk1 XORed with t before use; t XORed with chunk2 after).
            {
                const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ t;
                const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
                t ^= chunk2;
                const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));

                SCRATCHPAD_CHUNK(1) = as_uint4(chunk1 + bx1);
                SCRATCHPAD_CHUNK(2) = as_uint4(chunk3 + bx0);
                SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
            }

            // Accumulate the multiply into `a`, store it, then mix in tmp.
            a[1] += t.s1;
            a[0] += t.s0;

            SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0];

#           ifdef __NV_CL_C_VERSION
            *(__global uint16 *)((__global uchar *)(Scratchpad) + idx) = *scratchpad_line;
#           endif

            ((uint4 *)a)[0] ^= tmp;
            // Rotate the shuffle operands for the next iteration.
            bx1 = bx0;
            bx0 = as_ulong2(c);
        }

#       undef SCRATCHPAD_CHUNK
    }
    mem_fence(CLK_GLOBAL_MEM_FENCE);
#   endif
}
992
1148
993
1149
)== = "
994
1150
R "===(
0 commit comments