Commit 30eda671 authored by Andreas Auernhammer's avatar Andreas Auernhammer Committed by Brad Fitzpatrick

crypto/rc4: remove assembler implementations

This CL removes the RC4 assembler implementations.
RC4 is broken and should not be used for encryption
anymore. Therefore it's not worth maintaining
platform-specific assembler implementations.

The native Go implementation may be slower
or faster depending on the CPU:

name       old time/op   new time/op   delta
RC4_128-4    256ns ± 0%    196ns ± 0%  -23.78%  (p=0.029 n=4+4)
RC4_1K-4    2.38µs ± 0%   1.54µs ± 0%  -35.22%  (p=0.029 n=4+4)
RC4_8K-4    19.4µs ± 1%   12.0µs ± 0%  -38.35%  (p=0.029 n=4+4)

name       old speed     new speed     delta
RC4_128-4  498MB/s ± 0%  654MB/s ± 0%  +31.12%  (p=0.029 n=4+4)
RC4_1K-4   431MB/s ± 0%  665MB/s ± 0%  +54.34%  (p=0.029 n=4+4)
RC4_8K-4   418MB/s ± 1%  677MB/s ± 0%  +62.18%  (p=0.029 n=4+4)

vendor_id	: GenuineIntel
cpu family	: 6
model		: 142
model name	: Intel(R) Core(TM) i5-7Y54 CPU @ 1.20GHz
stepping	: 9
microcode	: 0x84
cpu MHz		: 800.036
cache size	: 4096 KB

name       old time/op   new time/op   delta
RC4_128-4    235ns ± 1%    431ns ± 0%  +83.00%  (p=0.000 n=10+10)
RC4_1K-4    1.74µs ± 0%   3.41µs ± 0%  +96.74%  (p=0.000 n=10+10)
RC4_8K-4    13.6µs ± 1%   26.8µs ± 0%  +97.58%   (p=0.000 n=10+9)

name       old speed     new speed     delta
RC4_128-4  543MB/s ± 0%  297MB/s ± 1%  -45.29%  (p=0.000 n=10+10)
RC4_1K-4   590MB/s ± 0%  300MB/s ± 0%  -49.16%  (p=0.000 n=10+10)
RC4_8K-4   596MB/s ± 1%  302MB/s ± 0%  -49.39%   (p=0.000 n=10+9)

vendor_id       : GenuineIntel
cpu family      : 6
model           : 63
model name      : Intel(R) Xeon(R) CPU @ 2.30GHz
stepping        : 0
microcode       : 0x1
cpu MHz         : 2300.000
cache size      : 46080 KB

Fixes #25417

Change-Id: I4124037154aaaa8e48d300c23974f125b6055a1c
Reviewed-on: https://go-review.googlesource.com/130397
Run-TryBot: Filippo Valsorda <filippo@golang.org>
Reviewed-by: 's avatarBrad Fitzpatrick <bradfitz@golang.org>
Reviewed-by: 's avatarFilippo Valsorda <filippo@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 6d4787af
......@@ -54,12 +54,9 @@ func (c *Cipher) Reset() {
c.i, c.j = 0, 0
}
// xorKeyStreamGeneric sets dst to the result of XORing src with the
// key stream. Dst and src must overlap entirely or not at all.
//
// This is the pure Go version. rc4_{amd64,386,arm}* contain assembly
// implementations. This is here for tests and to prevent bitrot.
func (c *Cipher) xorKeyStreamGeneric(dst, src []byte) {
// XORKeyStream sets dst to the result of XORing src with the key stream.
// Dst and src must overlap entirely or not at all.
func (c *Cipher) XORKeyStream(dst, src []byte) {
if len(src) == 0 {
return
}
......
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8)
TEXT ·xorKeyStream(SB),NOSPLIT,$0
MOVL dst+0(FP), DI
MOVL src+4(FP), SI
MOVL state+12(FP), BP
MOVL i+16(FP), AX
MOVBLZX (AX), AX
MOVL j+20(FP), BX
MOVBLZX (BX), BX
CMPL n+8(FP), $0
JEQ done
loop:
// i += 1
INCB AX
// j += c.s[i]
MOVBLZX (BP)(AX*4), DX
ADDB DX, BX
MOVBLZX BX, BX
// c.s[i], c.s[j] = c.s[j], c.s[i]
MOVBLZX (BP)(BX*4), CX
MOVB CX, (BP)(AX*4)
MOVB DX, (BP)(BX*4)
// *dst = *src ^ c.s[c.s[i]+c.s[j]]
ADDB DX, CX
MOVBLZX CX, CX
MOVB (BP)(CX*4), CX
XORB (SI), CX
MOVBLZX CX, CX
MOVB CX, (DI)
INCL SI
INCL DI
DECL n+8(FP)
JNE loop
done:
MOVL i+16(FP), CX
MOVB AX, (CX)
MOVL j+20(FP), CX
MOVB BX, (CX)
RET
// Original source:
// http://www.zorinaq.com/papers/rc4-amd64.html
// http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
#include "textflag.h"
// Local modifications:
//
// Transliterated from GNU to 6a assembly syntax by the Go authors.
// The comments and spacing are from the original.
//
// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
//
// The original code accumulated 64 bits of key stream in an integer
// register and then XOR'ed the key stream into the data 8 bytes at a time.
// Modified to accumulate 128 bits of key stream into an XMM register
// and then XOR the key stream into the data 16 bytes at a time.
// Approximately doubles throughput.
// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
// but makes the code run 2.0x slower on Xeon.
#define EXTEND(r) MOVBLZX r, r
/*
** RC4 implementation optimized for AMD64.
**
** Author: Marc Bevand <bevand_m (at) epita.fr>
** Licence: I hereby disclaim the copyright on this code and place it
** in the public domain.
**
** The code has been designed to be easily integrated into openssl:
** the exported RC4() function can replace the actual implementations
** openssl already contains. Please note that when linking with openssl,
** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
** with -DRC4_INT='unsigned long'.
**
** The throughput achieved by this code is about 320 MBytes/sec, on
** a 1.8 GHz AMD Opteron (rev C0) processor.
*/
TEXT ·xorKeyStream(SB),NOSPLIT,$0
MOVQ n+16(FP), BX // rbx = ARG(len)
MOVQ src+8(FP), SI // in = ARG(in)
MOVQ dst+0(FP), DI // out = ARG(out)
MOVQ state+24(FP), BP // d = ARG(data)
MOVQ i+32(FP), AX
MOVBQZX 0(AX), CX // x = *xp
MOVQ j+40(FP), AX
MOVBQZX 0(AX), DX // y = *yp
LEAQ (SI)(BX*1), R9 // limit = in+len
l1: CMPQ SI, R9 // cmp in with in+len
JGE finished // jump if (in >= in+len)
INCB CX
EXTEND(CX)
TESTL $15, CX
JZ wordloop
MOVBLZX (BP)(CX*4), AX
ADDB AX, DX // y += tx
EXTEND(DX)
MOVBLZX (BP)(DX*4), BX // ty = d[y]
MOVB BX, (BP)(CX*4) // d[x] = ty
ADDB AX, BX // val = ty+tx
EXTEND(BX)
MOVB AX, (BP)(DX*4) // d[y] = tx
MOVBLZX (BP)(BX*4), R8 // val = d[val]
XORB (SI), R8 // xor 1 byte
MOVB R8, (DI)
INCQ SI // in++
INCQ DI // out++
JMP l1
wordloop:
SUBQ $16, R9
CMPQ SI, R9
JGT end
start:
ADDQ $16, SI // increment in
ADDQ $16, DI // increment out
// Each KEYROUND generates one byte of key and
// inserts it into an XMM register at the given 16-bit index.
// The key state array is uint32 words only using the bottom
// byte of each word, so the 16-bit OR only copies 8 useful bits.
// We accumulate alternating bytes into X0 and X1, and then at
// the end we OR X1<<8 into X0 to produce the actual key.
//
// At the beginning of the loop, CX%16 == 0, so the 16 loads
// at state[CX], state[CX+1], ..., state[CX+15] can precompute
// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
// without fear of the byte computation CX+15 wrapping around.
//
// The first round needs R12[0], the second needs R12[1], and so on.
// We can avoid memory stalls by starting the load for round n+1
// before the end of round n, using the LOAD macro.
LEAQ (BP)(CX*4), R12
#define KEYROUND(xmm, load, off, r1, r2, index) \
MOVBLZX (BP)(DX*4), R8; \
MOVB r1, (BP)(DX*4); \
load((off+1), r2); \
MOVB R8, (off*4)(R12); \
ADDB r1, R8; \
EXTEND(R8); \
PINSRW $index, (BP)(R8*4), xmm
#define LOAD(off, reg) \
MOVBLZX (off*4)(R12), reg; \
ADDB reg, DX; \
EXTEND(DX)
#define SKIP(off, reg)
LOAD(0, AX)
KEYROUND(X0, LOAD, 0, AX, BX, 0)
KEYROUND(X1, LOAD, 1, BX, AX, 0)
KEYROUND(X0, LOAD, 2, AX, BX, 1)
KEYROUND(X1, LOAD, 3, BX, AX, 1)
KEYROUND(X0, LOAD, 4, AX, BX, 2)
KEYROUND(X1, LOAD, 5, BX, AX, 2)
KEYROUND(X0, LOAD, 6, AX, BX, 3)
KEYROUND(X1, LOAD, 7, BX, AX, 3)
KEYROUND(X0, LOAD, 8, AX, BX, 4)
KEYROUND(X1, LOAD, 9, BX, AX, 4)
KEYROUND(X0, LOAD, 10, AX, BX, 5)
KEYROUND(X1, LOAD, 11, BX, AX, 5)
KEYROUND(X0, LOAD, 12, AX, BX, 6)
KEYROUND(X1, LOAD, 13, BX, AX, 6)
KEYROUND(X0, LOAD, 14, AX, BX, 7)
KEYROUND(X1, SKIP, 15, BX, AX, 7)
ADDB $16, CX
PSLLQ $8, X1
PXOR X1, X0
MOVOU -16(SI), X2
PXOR X0, X2
MOVOU X2, -16(DI)
CMPQ SI, R9 // cmp in with in+len-16
JLE start // jump if (in <= in+len-16)
end:
DECB CX
ADDQ $16, R9 // tmp = in+len
// handle the last bytes, one by one
l2: CMPQ SI, R9 // cmp in with in+len
JGE finished // jump if (in >= in+len)
INCB CX
EXTEND(CX)
MOVBLZX (BP)(CX*4), AX
ADDB AX, DX // y += tx
EXTEND(DX)
MOVBLZX (BP)(DX*4), BX // ty = d[y]
MOVB BX, (BP)(CX*4) // d[x] = ty
ADDB AX, BX // val = ty+tx
EXTEND(BX)
MOVB AX, (BP)(DX*4) // d[y] = tx
MOVBLZX (BP)(BX*4), R8 // val = d[val]
XORB (SI), R8 // xor 1 byte
MOVB R8, (DI)
INCQ SI // in++
INCQ DI // out++
JMP l2
finished:
MOVQ j+40(FP), BX
MOVB DX, 0(BX)
MOVQ i+32(FP), AX
MOVB CX, 0(AX)
RET
// Original source:
// http://www.zorinaq.com/papers/rc4-amd64.html
// http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
#include "textflag.h"
// Local modifications:
//
// Transliterated from GNU to 6a assembly syntax by the Go authors.
// The comments and spacing are from the original.
//
// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
//
// The original code accumulated 64 bits of key stream in an integer
// register and then XOR'ed the key stream into the data 8 bytes at a time.
// Modified to accumulate 128 bits of key stream into an XMM register
// and then XOR the key stream into the data 16 bytes at a time.
// Approximately doubles throughput.
//
// Converted to amd64p32.
//
// To make safe for Native Client, avoid use of BP, R15,
// and two-register addressing modes.
// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
// but makes the code run 2.0x slower on Xeon.
#define EXTEND(r) MOVBLZX r, r
/*
** RC4 implementation optimized for AMD64.
**
** Author: Marc Bevand <bevand_m (at) epita.fr>
** Licence: I hereby disclaim the copyright on this code and place it
** in the public domain.
**
** The code has been designed to be easily integrated into openssl:
** the exported RC4() function can replace the actual implementations
** openssl already contains. Please note that when linking with openssl,
** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
** with -DRC4_INT='unsigned long'.
**
** The throughput achieved by this code is about 320 MBytes/sec, on
** a 1.8 GHz AMD Opteron (rev C0) processor.
*/
TEXT ·xorKeyStream(SB),NOSPLIT,$0
MOVL n+8(FP), BX // rbx = ARG(len)
MOVL src+4(FP), SI // in = ARG(in)
MOVL dst+0(FP), DI // out = ARG(out)
MOVL state+12(FP), R10 // d = ARG(data)
MOVL i+16(FP), AX
MOVBQZX 0(AX), CX // x = *xp
MOVL j+20(FP), AX
MOVBQZX 0(AX), DX // y = *yp
LEAQ (SI)(BX*1), R9 // limit = in+len
l1: CMPQ SI, R9 // cmp in with in+len
JGE finished // jump if (in >= in+len)
INCB CX
EXTEND(CX)
TESTL $15, CX
JZ wordloop
LEAL (R10)(CX*4), R12
MOVBLZX (R12), AX
ADDB AX, DX // y += tx
EXTEND(DX)
LEAL (R10)(DX*4), R11
MOVBLZX (R11), BX // ty = d[y]
MOVB BX, (R12) // d[x] = ty
ADDB AX, BX // val = ty+tx
EXTEND(BX)
LEAL (R10)(BX*4), R13
MOVB AX, (R11) // d[y] = tx
MOVBLZX (R13), R8 // val = d[val]
XORB (SI), R8 // xor 1 byte
MOVB R8, (DI)
INCQ SI // in++
INCQ DI // out++
JMP l1
wordloop:
SUBQ $16, R9
CMPQ SI, R9
JGT end
start:
ADDQ $16, SI // increment in
ADDQ $16, DI // increment out
// Each KEYROUND generates one byte of key and
// inserts it into an XMM register at the given 16-bit index.
// The key state array is uint32 words only using the bottom
// byte of each word, so the 16-bit OR only copies 8 useful bits.
// We accumulate alternating bytes into X0 and X1, and then at
// the end we OR X1<<8 into X0 to produce the actual key.
//
// At the beginning of the loop, CX%16 == 0, so the 16 loads
// at state[CX], state[CX+1], ..., state[CX+15] can precompute
// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
// without fear of the byte computation CX+15 wrapping around.
//
// The first round needs R12[0], the second needs R12[1], and so on.
// We can avoid memory stalls by starting the load for round n+1
// before the end of round n, using the LOAD macro.
LEAQ (R10)(CX*4), R12
#define KEYROUND(xmm, load, off, r1, r2, index) \
LEAL (R10)(DX*4), R11; \
MOVBLZX (R11), R8; \
MOVB r1, (R11); \
load((off+1), r2); \
MOVB R8, (off*4)(R12); \
ADDB r1, R8; \
EXTEND(R8); \
LEAL (R10)(R8*4), R14; \
PINSRW $index, (R14), xmm
#define LOAD(off, reg) \
MOVBLZX (off*4)(R12), reg; \
ADDB reg, DX; \
EXTEND(DX)
#define SKIP(off, reg)
LOAD(0, AX)
KEYROUND(X0, LOAD, 0, AX, BX, 0)
KEYROUND(X1, LOAD, 1, BX, AX, 0)
KEYROUND(X0, LOAD, 2, AX, BX, 1)
KEYROUND(X1, LOAD, 3, BX, AX, 1)
KEYROUND(X0, LOAD, 4, AX, BX, 2)
KEYROUND(X1, LOAD, 5, BX, AX, 2)
KEYROUND(X0, LOAD, 6, AX, BX, 3)
KEYROUND(X1, LOAD, 7, BX, AX, 3)
KEYROUND(X0, LOAD, 8, AX, BX, 4)
KEYROUND(X1, LOAD, 9, BX, AX, 4)
KEYROUND(X0, LOAD, 10, AX, BX, 5)
KEYROUND(X1, LOAD, 11, BX, AX, 5)
KEYROUND(X0, LOAD, 12, AX, BX, 6)
KEYROUND(X1, LOAD, 13, BX, AX, 6)
KEYROUND(X0, LOAD, 14, AX, BX, 7)
KEYROUND(X1, SKIP, 15, BX, AX, 7)
ADDB $16, CX
PSLLQ $8, X1
PXOR X1, X0
MOVOU -16(SI), X2
PXOR X0, X2
MOVOU X2, -16(DI)
CMPQ SI, R9 // cmp in with in+len-16
JLE start // jump if (in <= in+len-16)
end:
DECB CX
ADDQ $16, R9 // tmp = in+len
// handle the last bytes, one by one
l2: CMPQ SI, R9 // cmp in with in+len
JGE finished // jump if (in >= in+len)
INCB CX
EXTEND(CX)
LEAL (R10)(CX*4), R12
MOVBLZX (R12), AX
ADDB AX, DX // y += tx
EXTEND(DX)
LEAL (R10)(DX*4), R11
MOVBLZX (R11), BX // ty = d[y]
MOVB BX, (R12) // d[x] = ty
ADDB AX, BX // val = ty+tx
EXTEND(BX)
LEAL (R10)(BX*4), R13
MOVB AX, (R11) // d[y] = tx
MOVBLZX (R13), R8 // val = d[val]
XORB (SI), R8 // xor 1 byte
MOVB R8, (DI)
INCQ SI // in++
INCQ DI // out++
JMP l2
finished:
MOVL j+20(FP), BX
MOVB DX, 0(BX)
MOVL i+16(FP), AX
MOVB CX, 0(AX)
RET
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !nacl
#include "textflag.h"
// Registers
#define Rdst R0
#define Rsrc R1
#define Rn R2
#define Rstate R3
#define Rpi R4
#define Rpj R5
#define Ri R6
#define Rj R7
#define Rk R8
#define Rt R11
#define Rt2 R12
// func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8)
TEXT ·xorKeyStream(SB),NOSPLIT,$0
MOVW dst+0(FP), Rdst
MOVW src+4(FP), Rsrc
MOVW n+8(FP), Rn
MOVW state+12(FP), Rstate
MOVW i+16(FP), Rpi
MOVW j+20(FP), Rpj
MOVBU (Rpi), Ri
MOVBU (Rpj), Rj
MOVW $0, Rk
loop:
// i += 1; j += state[i]
ADD $1, Ri
AND $0xff, Ri
MOVBU Ri<<2(Rstate), Rt
ADD Rt, Rj
AND $0xff, Rj
// swap state[i] <-> state[j]
MOVBU Rj<<2(Rstate), Rt2
MOVB Rt2, Ri<<2(Rstate)
MOVB Rt, Rj<<2(Rstate)
// dst[k] = src[k] ^ state[state[i] + state[j]]
ADD Rt2, Rt
AND $0xff, Rt
MOVBU Rt<<2(Rstate), Rt
MOVBU Rk<<0(Rsrc), Rt2
EOR Rt, Rt2
MOVB Rt2, Rk<<0(Rdst)
ADD $1, Rk
CMP Rk, Rn
BNE loop
done:
MOVB Ri, (Rpi)
MOVB Rj, (Rpj)
RET
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build amd64 amd64p32 arm,!nacl 386
package rc4
import "crypto/internal/subtle"
func xorKeyStream(dst, src *byte, n int, state *[256]uint32, i, j *uint8)
// XORKeyStream sets dst to the result of XORing src with the key stream.
// Dst and src must overlap entirely or not at all.
func (c *Cipher) XORKeyStream(dst, src []byte) {
if len(src) == 0 {
return
}
if len(dst) < len(src) {
panic("crypto/cipher: output smaller than input")
}
if subtle.InexactOverlap(dst[:len(src)], src) {
panic("crypto/cipher: invalid buffer overlap")
}
xorKeyStream(&dst[0], &src[0], len(src), &c.s, &c.i, &c.j)
}
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64,!amd64p32,!arm,!386 arm,nacl
package rc4
// XORKeyStream sets dst to the result of XORing src with the key stream.
// Dst and src must overlap entirely or not at all.
func (c *Cipher) XORKeyStream(dst, src []byte) {
c.xorKeyStreamGeneric(dst, src)
}
......@@ -117,30 +117,19 @@ func TestGolden(t *testing.T) {
}
func TestBlock(t *testing.T) {
testBlock(t, (*Cipher).XORKeyStream)
}
// Test the pure Go version.
// Because we have assembly for amd64, 386, and arm, this prevents
// bitrot of the reference implementations.
func TestBlockGeneric(t *testing.T) {
testBlock(t, (*Cipher).xorKeyStreamGeneric)
}
func testBlock(t *testing.T, xor func(c *Cipher, dst, src []byte)) {
c1a, _ := NewCipher(golden[0].key)
c1b, _ := NewCipher(golden[1].key)
data1 := make([]byte, 1<<20)
for i := range data1 {
xor(c1a, data1[i:i+1], data1[i:i+1])
xor(c1b, data1[i:i+1], data1[i:i+1])
c1a.XORKeyStream(data1[i:i+1], data1[i:i+1])
c1b.XORKeyStream(data1[i:i+1], data1[i:i+1])
}
c2a, _ := NewCipher(golden[0].key)
c2b, _ := NewCipher(golden[1].key)
data2 := make([]byte, 1<<20)
xor(c2a, data2, data2)
xor(c2b, data2, data2)
c2a.XORKeyStream(data2, data2)
c2b.XORKeyStream(data2, data2)
if !bytes.Equal(data1, data2) {
t.Fatalf("bad block")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment