
Commit 78c37d1

Alexander Boyko authored and Herbert Xu (herbertx) committed
crypto: crc32 - add crc32 pclmulqdq implementation and wrappers for table implementation
This patch adds crc32 algorithms to the shash crypto API. One is a wrapper around the generic crc32_le function; the second is a crc32 pclmulqdq implementation, which uses the hardware-provided PCLMULQDQ instruction to accelerate the CRC32 computation. The instruction is available from Intel Westmere and AMD Bulldozer CPUs onward. On an Intel Core i5 I got 450 MB/s with the table implementation and 2100 MB/s with the pclmulqdq implementation.

Signed-off-by: Alexander Boyko <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>
1 parent 5c22ba6 commit 78c37d1
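
The table-driven "wrapper" half of the patch lives in glue code whose diff is not shown on this page. As a rough illustration only, a shash wrapper around the kernel's generic crc32_le() typically looks like the sketch below; the context struct and function names here are hypothetical, not taken from the patch.

/* Illustrative sketch of a crc32 shash wrapper over the generic
 * table-driven crc32_le(); names are hypothetical. */
#include <linux/crc32.h>
#include <crypto/internal/hash.h>
#include <asm/unaligned.h>

struct crc32_desc_ctx {
	u32 crc;			/* running CRC32 state (LE convention) */
};

static int crc32_wrap_update(struct shash_desc *desc, const u8 *data,
			     unsigned int len)
{
	struct crc32_desc_ctx *ctx = shash_desc_ctx(desc);

	/* defer to the generic table implementation (~450 MB/s above) */
	ctx->crc = crc32_le(ctx->crc, data, len);
	return 0;
}

static int crc32_wrap_final(struct shash_desc *desc, u8 *out)
{
	struct crc32_desc_ctx *ctx = shash_desc_ctx(desc);

	put_unaligned_le32(ctx->crc, out);	/* emit the 4-byte digest */
	return 0;
}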

File tree

6 files changed: +630 / -0 lines changed


arch/x86/crypto/Makefile

+2
@@ -27,6 +27,7 @@ obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 
 obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
 obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
+obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
 twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
@@ -52,3 +53,4 @@ ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
 crc32c-intel-y := crc32c-intel_glue.o
 crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
+crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
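
The crc32-pclmul_glue.o referenced here is the C side that registers the shash and dispatches into the assembly routine; its diff is not included on this page. A module built this way normally refuses to load unless the CPU actually advertises PCLMULQDQ. A minimal sketch of that guard, assuming a hypothetical crc32_pclmul_alg definition, could be:

/* Hedged sketch of the module-init CPU check; 'crc32_pclmul_alg'
 * is a stand-in for the shash_alg that the real glue defines. */
#include <linux/module.h>
#include <asm/cpufeature.h>
#include <crypto/internal/hash.h>

static struct shash_alg crc32_pclmul_alg;	/* definition omitted */

static int __init crc32_pclmul_mod_init(void)
{
	/* without PCLMULQDQ, leave the generic table crc32 in place */
	if (!boot_cpu_has(X86_FEATURE_PCLMULQDQ))
		return -ENODEV;
	return crypto_register_shash(&crc32_pclmul_alg);
}

static void __exit crc32_pclmul_mod_exit(void)
{
	crypto_unregister_shash(&crc32_pclmul_alg);
}

module_init(crc32_pclmul_mod_init);
module_exit(crc32_pclmul_mod_exit);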

arch/x86/crypto/crc32-pclmul_asm.S

+247
@@ -0,0 +1,247 @@
/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
 * at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors: Gregory Prestas <[email protected]>
 *          Alexander Boyko <[email protected]>
 */

#include <linux/linkage.h>
#include <asm/inst.h>


.align 16
/*
 * [(x^(4*128+32) mod P(x) << 32)]' << 1 = 0x154442bd4
 * #define CONSTANT_R1 0x154442bd4LL
 *
 * [(x^(4*128-32) mod P(x) << 32)]' << 1 = 0x1c6e41596
 * #define CONSTANT_R2 0x1c6e41596LL
 */
.Lconstant_R2R1:
	.octa 0x00000001c6e415960000000154442bd4
/*
 * [(x^(128+32) mod P(x) << 32)]' << 1 = 0x1751997d0
 * #define CONSTANT_R3 0x1751997d0LL
 *
 * [(x^(128-32) mod P(x) << 32)]' << 1 = 0x0ccaa009e
 * #define CONSTANT_R4 0x0ccaa009eLL
 */
.Lconstant_R4R3:
	.octa 0x00000000ccaa009e00000001751997d0
/*
 * [(x^64 mod P(x) << 32)]' << 1 = 0x163cd6124
 * #define CONSTANT_R5 0x163cd6124LL
 */
.Lconstant_R5:
	.octa 0x00000000000000000000000163cd6124
.Lconstant_mask32:
	.octa 0x000000000000000000000000FFFFFFFF
/*
 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
 *
 * Barrett reduction constant (u64)' = u' = (x^64 / P(x))' = 0x1F7011641LL
 * #define CONSTANT_RU 0x1F7011641LL
 */
.Lconstant_RUpoly:
	.octa 0x00000001F701164100000001DB710641

#define CONSTANT %xmm0

#ifdef __x86_64__
#define BUF	%rdi
#define LEN	%rsi
#define CRC	%edx
#else
#warning Using 32bit code support
#define BUF	%eax
#define LEN	%edx
#define CRC	%ecx
#endif



.text
/**
 *	Calculate crc32
 *	BUF - buffer (16 bytes aligned)
 *	LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63
 *	CRC - initial crc32
 *	return %eax crc32
 *	uint crc32_pclmul_le_16(unsigned char const *buffer,
 *				size_t len, uint crc32)
 */
.globl crc32_pclmul_le_16
.align 4, 0x90
crc32_pclmul_le_16:/* buffer and buffer size are 16 bytes aligned */
	movdqa  (BUF), %xmm1
	movdqa  0x10(BUF), %xmm2
	movdqa  0x20(BUF), %xmm3
	movdqa  0x30(BUF), %xmm4
	movd    CRC, CONSTANT
	pxor    CONSTANT, %xmm1
	sub     $0x40, LEN
	add     $0x40, BUF
#ifndef __x86_64__
	/* This is for position independent code(-fPIC) support for 32bit */
	call    delta
delta:
	pop     %ecx
#endif
	cmp     $0x40, LEN
	jb      less_64

#ifdef __x86_64__
	movdqa .Lconstant_R2R1(%rip), CONSTANT
#else
	movdqa .Lconstant_R2R1 - delta(%ecx), CONSTANT
#endif

loop_64:/* 64 bytes Full cache line folding */
	prefetchnta    0x40(BUF)
	movdqa  %xmm1, %xmm5
	movdqa  %xmm2, %xmm6
	movdqa  %xmm3, %xmm7
#ifdef __x86_64__
	movdqa  %xmm4, %xmm8
#endif
	PCLMULQDQ 00, CONSTANT, %xmm1
	PCLMULQDQ 00, CONSTANT, %xmm2
	PCLMULQDQ 00, CONSTANT, %xmm3
#ifdef __x86_64__
	PCLMULQDQ 00, CONSTANT, %xmm4
#endif
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	PCLMULQDQ 0x11, CONSTANT, %xmm6
	PCLMULQDQ 0x11, CONSTANT, %xmm7
#ifdef __x86_64__
	PCLMULQDQ 0x11, CONSTANT, %xmm8
#endif
	pxor    %xmm5, %xmm1
	pxor    %xmm6, %xmm2
	pxor    %xmm7, %xmm3
#ifdef __x86_64__
	pxor    %xmm8, %xmm4
#else
	/* xmm8 unsupported for x32 */
	movdqa  %xmm4, %xmm5
	PCLMULQDQ 00, CONSTANT, %xmm4
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm4
#endif

	pxor    (BUF), %xmm1
	pxor    0x10(BUF), %xmm2
	pxor    0x20(BUF), %xmm3
	pxor    0x30(BUF), %xmm4

	sub     $0x40, LEN
	add     $0x40, BUF
	cmp     $0x40, LEN
	jge     loop_64
less_64:/* Folding cache line into 128bit */
#ifdef __x86_64__
	movdqa  .Lconstant_R4R3(%rip), CONSTANT
#else
	movdqa  .Lconstant_R4R3 - delta(%ecx), CONSTANT
#endif
	prefetchnta     (BUF)

	movdqa  %xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    %xmm2, %xmm1

	movdqa  %xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    %xmm3, %xmm1

	movdqa  %xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    %xmm4, %xmm1

	cmp     $0x10, LEN
	jb      fold_64
loop_16:/* Folding rest buffer into 128bit */
	movdqa  %xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    (BUF), %xmm1
	sub     $0x10, LEN
	add     $0x10, BUF
	cmp     $0x10, LEN
	jge     loop_16

fold_64:
	/* perform the last 64 bit fold, also adds 32 zeroes
	 * to the input stream */
	PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
	psrldq  $0x08, %xmm1
	pxor    CONSTANT, %xmm1

	/* final 32-bit fold */
	movdqa  %xmm1, %xmm2
#ifdef __x86_64__
	movdqa  .Lconstant_R5(%rip), CONSTANT
	movdqa  .Lconstant_mask32(%rip), %xmm3
#else
	movdqa  .Lconstant_R5 - delta(%ecx), CONSTANT
	movdqa  .Lconstant_mask32 - delta(%ecx), %xmm3
#endif
	psrldq  $0x04, %xmm2
	pand    %xmm3, %xmm1
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	pxor    %xmm2, %xmm1

	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
#ifdef __x86_64__
	movdqa  .Lconstant_RUpoly(%rip), CONSTANT
#else
	movdqa  .Lconstant_RUpoly - delta(%ecx), CONSTANT
#endif
	movdqa  %xmm1, %xmm2
	pand    %xmm3, %xmm1
	PCLMULQDQ 0x10, CONSTANT, %xmm1
	pand    %xmm3, %xmm1
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	pxor    %xmm2, %xmm1
	pextrd  $0x01, %xmm1, %eax

	ret
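
The folding constants in the table above follow the recipe spelled out in their comments: compute x^e mod P(x), bit-reflect the 32-bit result, and shift left by one. Under that reading, the values can be re-derived with a small standalone program; this is an independent check written for this page, not code from the patch.

/* Re-derive the .Lconstant_* folding constants from x^e mod P(x),
 * with P(x) = 0x104C11DB7 (the full 33-bit CRC32 polynomial). */
#include <stdio.h>
#include <stdint.h>

/* x^n mod P(x) over GF(2), returned as a 32-bit coefficient vector */
static uint32_t xpow_mod(unsigned int n)
{
	uint64_t r = 1;				/* x^0 */

	while (n--) {
		r <<= 1;			/* multiply by x */
		if (r & (1ULL << 32))		/* degree hit 32: reduce */
			r ^= 0x104C11DB7ULL;
	}
	return (uint32_t)r;
}

/* bit-reverse a 32-bit value (the ' operator in the comments above) */
static uint32_t reflect32(uint32_t v)
{
	uint32_t r = 0;
	int i;

	for (i = 0; i < 32; i++)
		r |= ((v >> i) & 1u) << (31 - i);
	return r;
}

int main(void)
{
	/* exponents for R1, R2, R3, R4, R5, as given in the comments */
	const unsigned int e[] = { 4*128+32, 4*128-32, 128+32, 128-32, 64 };
	int i;

	for (i = 0; i < 5; i++)
		/* if the reading above is right, this prints 0x154442bd4,
		 * 0x1c6e41596, 0x1751997d0, 0x0ccaa009e and 0x163cd6124 */
		printf("R%d = 0x%09llx\n", i + 1,
		       (unsigned long long)reflect32(xpow_mod(e[i])) << 1);
	return 0;
}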
