crypto/rc4: remove assembler implementations

This CL removes the RC4 assembler implementations. RC4 is broken and should not be used for encryption anymore. Therefore it's not worth maintaining platform-specific assembler implementations. The native Go implementation may be slower or faster depending on the CPU: name old time/op new time/op delta RC4_128-4 256ns ± 0% 196ns ± 0% -23.78% (p=0.029 n=4+4) RC4_1K-4 2.38µs ± 0% 1.54µs ± 0% -35.22% (p=0.029 n=4+4) RC4_8K-4 19.4µs ± 1% 12.0µs ± 0% -38.35% (p=0.029 n=4+4) name old speed new speed delta RC4_128-4 498MB/s ± 0% 654MB/s ± 0% +31.12% (p=0.029 n=4+4) RC4_1K-4 431MB/s ± 0% 665MB/s ± 0% +54.34% (p=0.029 n=4+4) RC4_8K-4 418MB/s ± 1% 677MB/s ± 0% +62.18% (p=0.029 n=4+4) vendor_id : GenuineIntel cpu family : 6 model : 142 model name : Intel(R) Core(TM) i5-7Y54 CPU @ 1.20GHz stepping : 9 microcode : 0x84 cpu MHz : 800.036 cache size : 4096 KB name old time/op new time/op delta RC4_128-4 235ns ± 1% 431ns ± 0% +83.00% (p=0.000 n=10+10) RC4_1K-4 1.74µs ± 0% 3.41µs ± 0% +96.74% (p=0.000 n=10+10) RC4_8K-4 13.6µs ± 1% 26.8µs ± 0% +97.58% (p=0.000 n=10+9) name old speed new speed delta RC4_128-4 543MB/s ± 0% 297MB/s ± 1% -45.29% (p=0.000 n=10+10) RC4_1K-4 590MB/s ± 0% 300MB/s ± 0% -49.16% (p=0.000 n=10+10) RC4_8K-4 596MB/s ± 1% 302MB/s ± 0% -49.39% (p=0.000 n=10+9) vendor_id : GenuineIntel cpu family : 6 model : 63 model name : Intel(R) Xeon(R) CPU @ 2.30GHz stepping : 0 microcode : 0x1 cpu MHz : 2300.000 cache size : 46080 KB Fixes #25417 Change-Id: I4124037154aaaa8e48d300c23974f125b6055a1c Reviewed-on: https://go-review.googlesource.com/130397 Run-TryBot: Filippo Valsorda <filippo@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> Reviewed-by: Filippo Valsorda <filippo@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>

crypto/rc4: remove assembler implementations
This CL removes the RC4 assembler implementations. RC4 is broken and should not be used for encryption anymore. Therefore it's not worth maintaining platform-specific assembler implementations. The native Go implementation may be slower or faster depending on the CPU: name old time/op new time/op delta RC4_128-4 256ns ± 0% 196ns ± 0% -23.78% (p=0.029 n=4+4) RC4_1K-4 2.38µs ± 0% 1.54µs ± 0% -35.22% (p=0.029 n=4+4) RC4_8K-4 19.4µs ± 1% 12.0µs ± 0% -38.35% (p=0.029 n=4+4) name old speed new speed delta RC4_128-4 498MB/s ± 0% 654MB/s ± 0% +31.12% (p=0.029 n=4+4) RC4_1K-4 431MB/s ± 0% 665MB/s ± 0% +54.34% (p=0.029 n=4+4) RC4_8K-4 418MB/s ± 1% 677MB/s ± 0% +62.18% (p=0.029 n=4+4) vendor_id : GenuineIntel cpu family : 6 model : 142 model name : Intel(R) Core(TM) i5-7Y54 CPU @ 1.20GHz stepping : 9 microcode : 0x84 cpu MHz : 800.036 cache size : 4096 KB name old time/op new time/op delta RC4_128-4 235ns ± 1% 431ns ± 0% +83.00% (p=0.000 n=10+10) RC4_1K-4 1.74µs ± 0% 3.41µs ± 0% +96.74% (p=0.000 n=10+10) RC4_8K-4 13.6µs ± 1% 26.8µs ± 0% +97.58% (p=0.000 n=10+9) name old speed new speed delta RC4_128-4 543MB/s ± 0% 297MB/s ± 1% -45.29% (p=0.000 n=10+10) RC4_1K-4 590MB/s ± 0% 300MB/s ± 0% -49.16% (p=0.000 n=10+10) RC4_8K-4 596MB/s ± 1% 302MB/s ± 0% -49.39% (p=0.000 n=10+9) vendor_id : GenuineIntel cpu family : 6 model : 63 model name : Intel(R) Xeon(R) CPU @ 2.30GHz stepping : 0 microcode : 0x1 cpu MHz : 2300.000 cache size : 46080 KB Fixes #25417 Change-Id: I4124037154aaaa8e48d300c23974f125b6055a1c Reviewed-on: https://go-review.googlesource.com/130397 Run-TryBot: Filippo Valsorda <filippo@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> Reviewed-by: Filippo Valsorda <filippo@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>
30eda671 · Andreas Auernhammer · Brad Fitzpatrick · 6d4787af · 30eda671 · 6d4787af
Commit 30eda671 authored Aug 21, 2018 by Andreas Auernhammer Committed by Brad Fitzpatrick Aug 21, 2018
8 changed files
--- a/src/crypto/rc4/rc4.go
+++ b/src/crypto/rc4/rc4.go
@@ -54,12 +54,9 @@ func (c *Cipher) Reset() {
 	c.i, c.j = 0, 0
 }

-// xorKeyStreamGeneric sets dst to the result of XORing src with the
-// key stream. Dst and src must overlap entirely or not at all.
-//
-// This is the pure Go version. rc4_{amd64,386,arm}* contain assembly
-// implementations. This is here for tests and to prevent bitrot.
-func (c *Cipher) xorKeyStreamGeneric(dst, src []byte) {
+// XORKeyStream sets dst to the result of XORing src with the key stream.
+// Dst and src must overlap entirely or not at all.
+func (c *Cipher) XORKeyStream(dst, src []byte) {
 	if len(src) == 0 {
 		return
 	}

--- a/src/crypto/rc4/rc4_386.s
+++ b/src/crypto/rc4/rc4_386.s
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-// func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8)
-TEXT ·xorKeyStream(SB),NOSPLIT,$0
-	MOVL dst+0(FP), DI
-	MOVL src+4(FP), SI
-	MOVL state+12(FP), BP
-
-	MOVL i+16(FP), AX
-	MOVBLZX (AX), AX
-	MOVL j+20(FP), BX
-	MOVBLZX (BX), BX
-	CMPL n+8(FP), $0
-	JEQ done
-
-loop:
-	// i += 1
-	INCB AX
-
-	// j += c.s[i]
-	MOVBLZX (BP)(AX*4), DX
-	ADDB DX, BX
-	MOVBLZX BX, BX
-
-	// c.s[i], c.s[j] = c.s[j], c.s[i]
-	MOVBLZX (BP)(BX*4), CX
-	MOVB CX, (BP)(AX*4)
-	MOVB DX, (BP)(BX*4)
-
-	// *dst = *src ^ c.s[c.s[i]+c.s[j]]
-	ADDB DX, CX
-	MOVBLZX CX, CX
-	MOVB (BP)(CX*4), CX
-	XORB (SI), CX
-	MOVBLZX CX, CX
-	MOVB CX, (DI)
-
-	INCL SI
-	INCL DI
-	DECL n+8(FP)
-	JNE loop
-
-done:
-	MOVL i+16(FP), CX
-	MOVB AX, (CX)
-	MOVL j+20(FP), CX
-	MOVB BX, (CX)
-
-	RET
--- a/src/crypto/rc4/rc4_amd64.s
+++ b/src/crypto/rc4/rc4_amd64.s
-// Original source:
-//	http://www.zorinaq.com/papers/rc4-amd64.html
-//	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
-
-#include "textflag.h"
-
-// Local modifications:
-//
-// Transliterated from GNU to 6a assembly syntax by the Go authors.
-// The comments and spacing are from the original.
-//
-// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
-//
-// The original code accumulated 64 bits of key stream in an integer
-// register and then XOR'ed the key stream into the data 8 bytes at a time.
-// Modified to accumulate 128 bits of key stream into an XMM register
-// and then XOR the key stream into the data 16 bytes at a time.
-// Approximately doubles throughput.
-
-// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
-// but makes the code run 2.0x slower on Xeon.
-#define EXTEND(r) MOVBLZX r, r
-
-/*
-** RC4 implementation optimized for AMD64.
-**
-** Author: Marc Bevand <bevand_m (at) epita.fr>
-** Licence: I hereby disclaim the copyright on this code and place it
-** in the public domain.
-**
-** The code has been designed to be easily integrated into openssl:
-** the exported RC4() function can replace the actual implementations
-** openssl already contains. Please note that when linking with openssl,
-** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
-** with -DRC4_INT='unsigned long'.
-**
-** The throughput achieved by this code is about 320 MBytes/sec, on
-** a 1.8 GHz AMD Opteron (rev C0) processor.
-*/
-
-TEXT ·xorKeyStream(SB),NOSPLIT,$0
-	MOVQ	n+16(FP),	BX		// rbx = ARG(len)
-	MOVQ	src+8(FP),	SI		// in = ARG(in)
-	MOVQ	dst+0(FP),	DI		// out = ARG(out)
-	MOVQ	state+24(FP),	BP		// d = ARG(data)
-	MOVQ	i+32(FP),	AX
-	MOVBQZX	0(AX),		CX		// x = *xp
-	MOVQ	j+40(FP),	AX
-	MOVBQZX	0(AX),		DX		// y = *yp
-
-	LEAQ	(SI)(BX*1),	R9		// limit = in+len
-
-l1:	CMPQ	SI,		R9		// cmp in with in+len
-	JGE	finished			// jump if (in >= in+len)
-
-	INCB	CX
-	EXTEND(CX)
-	TESTL	$15,		CX
-	JZ	wordloop
-
-	MOVBLZX	(BP)(CX*4),	AX
-
-	ADDB	AX,		DX		// y += tx
-	EXTEND(DX)
-	MOVBLZX	(BP)(DX*4),	BX		// ty = d[y]
-	MOVB	BX,		(BP)(CX*4)	// d[x] = ty
-	ADDB	AX,		BX		// val = ty+tx
-	EXTEND(BX)
-	MOVB	AX,		(BP)(DX*4)	// d[y] = tx
-	MOVBLZX	(BP)(BX*4),	R8		// val = d[val]
-	XORB	(SI),		R8		// xor 1 byte
-	MOVB	R8,		(DI)
-	INCQ	SI				// in++
-	INCQ	DI				// out++
-	JMP l1
-
-wordloop:
-	SUBQ	$16,		R9
-	CMPQ	SI,		R9
-	JGT	end
-
-start:
-	ADDQ	$16,		SI		// increment in
-	ADDQ	$16,		DI		// increment out
-
-	// Each KEYROUND generates one byte of key and
-	// inserts it into an XMM register at the given 16-bit index.
-	// The key state array is uint32 words only using the bottom
-	// byte of each word, so the 16-bit OR only copies 8 useful bits.
-	// We accumulate alternating bytes into X0 and X1, and then at
-	// the end we OR X1<<8 into X0 to produce the actual key.
-	//
-	// At the beginning of the loop, CX%16 == 0, so the 16 loads
-	// at state[CX], state[CX+1], ..., state[CX+15] can precompute
-	// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
-	// without fear of the byte computation CX+15 wrapping around.
-	//
-	// The first round needs R12[0], the second needs R12[1], and so on.
-	// We can avoid memory stalls by starting the load for round n+1
-	// before the end of round n, using the LOAD macro.
-	LEAQ	(BP)(CX*4),	R12
-
-#define KEYROUND(xmm, load, off, r1, r2, index) \
-	MOVBLZX	(BP)(DX*4),	R8; \
-	MOVB	r1,		(BP)(DX*4); \
-	load((off+1), r2); \
-	MOVB	R8,		(off*4)(R12); \
-	ADDB	r1,		R8; \
-	EXTEND(R8); \
-	PINSRW	$index, (BP)(R8*4), xmm
-
-#define LOAD(off, reg) \
-	MOVBLZX	(off*4)(R12),	reg; \
-	ADDB	reg,		DX; \
-	EXTEND(DX)
-
-#define SKIP(off, reg)
-
-	LOAD(0, AX)
-	KEYROUND(X0, LOAD, 0, AX, BX, 0)
-	KEYROUND(X1, LOAD, 1, BX, AX, 0)
-	KEYROUND(X0, LOAD, 2, AX, BX, 1)
-	KEYROUND(X1, LOAD, 3, BX, AX, 1)
-	KEYROUND(X0, LOAD, 4, AX, BX, 2)
-	KEYROUND(X1, LOAD, 5, BX, AX, 2)
-	KEYROUND(X0, LOAD, 6, AX, BX, 3)
-	KEYROUND(X1, LOAD, 7, BX, AX, 3)
-	KEYROUND(X0, LOAD, 8, AX, BX, 4)
-	KEYROUND(X1, LOAD, 9, BX, AX, 4)
-	KEYROUND(X0, LOAD, 10, AX, BX, 5)
-	KEYROUND(X1, LOAD, 11, BX, AX, 5)
-	KEYROUND(X0, LOAD, 12, AX, BX, 6)
-	KEYROUND(X1, LOAD, 13, BX, AX, 6)
-	KEYROUND(X0, LOAD, 14, AX, BX, 7)
-	KEYROUND(X1, SKIP, 15, BX, AX, 7)
-	
-	ADDB	$16,		CX
-
-	PSLLQ	$8,		X1
-	PXOR	X1,		X0
-	MOVOU	-16(SI),	X2
-	PXOR	X0,		X2
-	MOVOU	X2,		-16(DI)
-
-	CMPQ	SI,		R9		// cmp in with in+len-16
-	JLE	start				// jump if (in <= in+len-16)
-
-end:
-	DECB	CX
-	ADDQ	$16,		R9		// tmp = in+len
-
-	// handle the last bytes, one by one
-l2:	CMPQ	SI,		R9		// cmp in with in+len
-	JGE	finished			// jump if (in >= in+len)
-
-	INCB	CX
-	EXTEND(CX)
-	MOVBLZX	(BP)(CX*4),	AX
-
-	ADDB	AX,		DX		// y += tx
-	EXTEND(DX)
-	MOVBLZX	(BP)(DX*4),	BX		// ty = d[y]
-	MOVB	BX,		(BP)(CX*4)	// d[x] = ty
-	ADDB	AX,		BX		// val = ty+tx
-	EXTEND(BX)
-	MOVB	AX,		(BP)(DX*4)	// d[y] = tx
-	MOVBLZX	(BP)(BX*4),	R8		// val = d[val]
-	XORB	(SI),		R8		// xor 1 byte
-	MOVB	R8,		(DI)
-	INCQ	SI				// in++
-	INCQ	DI				// out++
-	JMP l2
-
-finished:
-	MOVQ	j+40(FP),	BX
-	MOVB	DX, 0(BX)
-	MOVQ	i+32(FP),	AX
-	MOVB	CX, 0(AX)
-	RET
--- a/src/crypto/rc4/rc4_amd64p32.s
+++ b/src/crypto/rc4/rc4_amd64p32.s
-// Original source:
-//	http://www.zorinaq.com/papers/rc4-amd64.html
-//	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
-
-#include "textflag.h"
-
-// Local modifications:
-//
-// Transliterated from GNU to 6a assembly syntax by the Go authors.
-// The comments and spacing are from the original.
-//
-// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
-//
-// The original code accumulated 64 bits of key stream in an integer
-// register and then XOR'ed the key stream into the data 8 bytes at a time.
-// Modified to accumulate 128 bits of key stream into an XMM register
-// and then XOR the key stream into the data 16 bytes at a time.
-// Approximately doubles throughput.
-//
-// Converted to amd64p32.
-//
-// To make safe for Native Client, avoid use of BP, R15,
-// and two-register addressing modes.
-
-// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
-// but makes the code run 2.0x slower on Xeon.
-#define EXTEND(r) MOVBLZX r, r
-
-/*
-** RC4 implementation optimized for AMD64.
-**
-** Author: Marc Bevand <bevand_m (at) epita.fr>
-** Licence: I hereby disclaim the copyright on this code and place it
-** in the public domain.
-**
-** The code has been designed to be easily integrated into openssl:
-** the exported RC4() function can replace the actual implementations
-** openssl already contains. Please note that when linking with openssl,
-** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
-** with -DRC4_INT='unsigned long'.
-**
-** The throughput achieved by this code is about 320 MBytes/sec, on
-** a 1.8 GHz AMD Opteron (rev C0) processor.
-*/
-
-TEXT ·xorKeyStream(SB),NOSPLIT,$0
-	MOVL	n+8(FP),	BX		// rbx = ARG(len)
-	MOVL	src+4(FP),	SI		// in = ARG(in)
-	MOVL	dst+0(FP),	DI		// out = ARG(out)
-	MOVL	state+12(FP),	R10		// d = ARG(data)
-	MOVL	i+16(FP),	AX
-	MOVBQZX	0(AX),		CX		// x = *xp
-	MOVL	j+20(FP),	AX
-	MOVBQZX	0(AX),		DX		// y = *yp
-
-	LEAQ	(SI)(BX*1),	R9		// limit = in+len
-
-l1:	CMPQ	SI,		R9		// cmp in with in+len
-	JGE	finished			// jump if (in >= in+len)
-
-	INCB	CX
-	EXTEND(CX)
-	TESTL	$15,		CX
-	JZ	wordloop
-	LEAL	(R10)(CX*4), R12
-
-	MOVBLZX	(R12),	AX
-
-	ADDB	AX,		DX		// y += tx
-	EXTEND(DX)
-	LEAL (R10)(DX*4), R11
-	MOVBLZX	(R11),	BX		// ty = d[y]
-	MOVB	BX,		(R12)	// d[x] = ty
-	ADDB	AX,		BX		// val = ty+tx
-	EXTEND(BX)
-	LEAL (R10)(BX*4), R13
-	MOVB	AX,		(R11)	// d[y] = tx
-	MOVBLZX	(R13),	R8		// val = d[val]
-	XORB	(SI),		R8		// xor 1 byte
-	MOVB	R8,		(DI)
-	INCQ	SI				// in++
-	INCQ	DI				// out++
-	JMP l1
-
-wordloop:
-	SUBQ	$16,		R9
-	CMPQ	SI,		R9
-	JGT	end
-
-start:
-	ADDQ	$16,		SI		// increment in
-	ADDQ	$16,		DI		// increment out
-
-	// Each KEYROUND generates one byte of key and
-	// inserts it into an XMM register at the given 16-bit index.
-	// The key state array is uint32 words only using the bottom
-	// byte of each word, so the 16-bit OR only copies 8 useful bits.
-	// We accumulate alternating bytes into X0 and X1, and then at
-	// the end we OR X1<<8 into X0 to produce the actual key.
-	//
-	// At the beginning of the loop, CX%16 == 0, so the 16 loads
-	// at state[CX], state[CX+1], ..., state[CX+15] can precompute
-	// (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
-	// without fear of the byte computation CX+15 wrapping around.
-	//
-	// The first round needs R12[0], the second needs R12[1], and so on.
-	// We can avoid memory stalls by starting the load for round n+1
-	// before the end of round n, using the LOAD macro.
-	LEAQ	(R10)(CX*4),	R12
-
-#define KEYROUND(xmm, load, off, r1, r2, index) \
-	LEAL (R10)(DX*4), R11; \
-	MOVBLZX	(R11),	R8; \
-	MOVB	r1,		(R11); \
-	load((off+1), r2); \
-	MOVB	R8,		(off*4)(R12); \
-	ADDB	r1,		R8; \
-	EXTEND(R8); \
-	LEAL (R10)(R8*4), R14; \
-	PINSRW	$index, (R14), xmm
-
-#define LOAD(off, reg) \
-	MOVBLZX	(off*4)(R12),	reg; \
-	ADDB	reg,		DX; \
-	EXTEND(DX)
-
-#define SKIP(off, reg)
-
-	LOAD(0, AX)
-	KEYROUND(X0, LOAD, 0, AX, BX, 0)
-	KEYROUND(X1, LOAD, 1, BX, AX, 0)
-	KEYROUND(X0, LOAD, 2, AX, BX, 1)
-	KEYROUND(X1, LOAD, 3, BX, AX, 1)
-	KEYROUND(X0, LOAD, 4, AX, BX, 2)
-	KEYROUND(X1, LOAD, 5, BX, AX, 2)
-	KEYROUND(X0, LOAD, 6, AX, BX, 3)
-	KEYROUND(X1, LOAD, 7, BX, AX, 3)
-	KEYROUND(X0, LOAD, 8, AX, BX, 4)
-	KEYROUND(X1, LOAD, 9, BX, AX, 4)
-	KEYROUND(X0, LOAD, 10, AX, BX, 5)
-	KEYROUND(X1, LOAD, 11, BX, AX, 5)
-	KEYROUND(X0, LOAD, 12, AX, BX, 6)
-	KEYROUND(X1, LOAD, 13, BX, AX, 6)
-	KEYROUND(X0, LOAD, 14, AX, BX, 7)
-	KEYROUND(X1, SKIP, 15, BX, AX, 7)
-	
-	ADDB	$16,		CX
-
-	PSLLQ	$8,		X1
-	PXOR	X1,		X0
-	MOVOU	-16(SI),	X2
-	PXOR	X0,		X2
-	MOVOU	X2,		-16(DI)
-
-	CMPQ	SI,		R9		// cmp in with in+len-16
-	JLE	start				// jump if (in <= in+len-16)
-
-end:
-	DECB	CX
-	ADDQ	$16,		R9		// tmp = in+len
-
-	// handle the last bytes, one by one
-l2:	CMPQ	SI,		R9		// cmp in with in+len
-	JGE	finished			// jump if (in >= in+len)
-
-	INCB	CX
-	EXTEND(CX)
-	LEAL (R10)(CX*4), R12
-	MOVBLZX	(R12),	AX
-
-	ADDB	AX,		DX		// y += tx
-	EXTEND(DX)
-	LEAL (R10)(DX*4), R11
-	MOVBLZX	(R11),	BX		// ty = d[y]
-	MOVB	BX,		(R12)	// d[x] = ty
-	ADDB	AX,		BX		// val = ty+tx
-	EXTEND(BX)
-	LEAL (R10)(BX*4), R13
-	MOVB	AX,		(R11)	// d[y] = tx
-	MOVBLZX	(R13),	R8		// val = d[val]
-	XORB	(SI),		R8		// xor 1 byte
-	MOVB	R8,		(DI)
-	INCQ	SI				// in++
-	INCQ	DI				// out++
-	JMP l2
-
-finished:
-	MOVL	j+20(FP),	BX
-	MOVB	DX, 0(BX)
-	MOVL	i+16(FP),	AX
-	MOVB	CX, 0(AX)
-	RET
--- a/src/crypto/rc4/rc4_arm.s
+++ b/src/crypto/rc4/rc4_arm.s
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !nacl
-
-#include "textflag.h"
-
-// Registers
-#define Rdst	R0
-#define Rsrc	R1
-#define Rn	R2
-#define Rstate	R3
-#define Rpi	R4
-#define Rpj	R5
-#define Ri	R6
-#define Rj	R7
-#define Rk	R8
-#define Rt	R11
-#define Rt2	R12
-
-// func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8)
-TEXT ·xorKeyStream(SB),NOSPLIT,$0
-	MOVW dst+0(FP), Rdst
-	MOVW src+4(FP), Rsrc
-	MOVW n+8(FP), Rn
-	MOVW state+12(FP), Rstate
-	MOVW i+16(FP), Rpi
-	MOVW j+20(FP), Rpj
-	MOVBU (Rpi), Ri
-	MOVBU (Rpj), Rj
-	MOVW $0, Rk
-
-loop:
-	// i += 1; j += state[i]
-	ADD $1, Ri
-	AND $0xff, Ri
-	MOVBU Ri<<2(Rstate), Rt
-	ADD Rt, Rj
-	AND $0xff, Rj
-
-	// swap state[i] <-> state[j]
-	MOVBU Rj<<2(Rstate), Rt2
-	MOVB Rt2, Ri<<2(Rstate)
-	MOVB Rt, Rj<<2(Rstate)
-
-	// dst[k] = src[k] ^ state[state[i] + state[j]]
-	ADD Rt2, Rt
-	AND $0xff, Rt
-	MOVBU Rt<<2(Rstate), Rt
-	MOVBU Rk<<0(Rsrc), Rt2
-	EOR Rt, Rt2
-	MOVB Rt2, Rk<<0(Rdst)
-
-	ADD $1, Rk
-	CMP Rk, Rn
-	BNE loop
-
-done:
-	MOVB Ri, (Rpi)
-	MOVB Rj, (Rpj)
-	RET
--- a/src/crypto/rc4/rc4_asm.go
+++ b/src/crypto/rc4/rc4_asm.go
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build amd64 amd64p32 arm,!nacl 386
-
-package rc4
-
-import "crypto/internal/subtle"
-
-func xorKeyStream(dst, src *byte, n int, state *[256]uint32, i, j *uint8)
-
-// XORKeyStream sets dst to the result of XORing src with the key stream.
-// Dst and src must overlap entirely or not at all.
-func (c *Cipher) XORKeyStream(dst, src []byte) {
-	if len(src) == 0 {
-		return
-	}
-	if len(dst) < len(src) {
-		panic("crypto/cipher: output smaller than input")
-	}
-	if subtle.InexactOverlap(dst[:len(src)], src) {
-		panic("crypto/cipher: invalid buffer overlap")
-	}
-	xorKeyStream(&dst[0], &src[0], len(src), &c.s, &c.i, &c.j)
-}
--- a/src/crypto/rc4/rc4_ref.go
+++ b/src/crypto/rc4/rc4_ref.go
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !amd64,!amd64p32,!arm,!386 arm,nacl
-
-package rc4
-
-// XORKeyStream sets dst to the result of XORing src with the key stream.
-// Dst and src must overlap entirely or not at all.
-func (c *Cipher) XORKeyStream(dst, src []byte) {
-	c.xorKeyStreamGeneric(dst, src)
-}
--- a/src/crypto/rc4/rc4_test.go
+++ b/src/crypto/rc4/rc4_test.go
@@ -117,30 +117,19 @@ func TestGolden(t *testing.T) {
 }

 func TestBlock(t *testing.T) {
-	testBlock(t, (*Cipher).XORKeyStream)
-}
-
-// Test the pure Go version.
-// Because we have assembly for amd64, 386, and arm, this prevents
-// bitrot of the reference implementations.
-func TestBlockGeneric(t *testing.T) {
-	testBlock(t, (*Cipher).xorKeyStreamGeneric)
-}
-
-func testBlock(t *testing.T, xor func(c *Cipher, dst, src []byte)) {
 	c1a, _ := NewCipher(golden[0].key)
 	c1b, _ := NewCipher(golden[1].key)
 	data1 := make([]byte, 1<<20)
 	for i := range data1 {
-		xor(c1a, data1[i:i+1], data1[i:i+1])
-		xor(c1b, data1[i:i+1], data1[i:i+1])
+		c1a.XORKeyStream(data1[i:i+1], data1[i:i+1])
+		c1b.XORKeyStream(data1[i:i+1], data1[i:i+1])
 	}

 	c2a, _ := NewCipher(golden[0].key)
 	c2b, _ := NewCipher(golden[1].key)
 	data2 := make([]byte, 1<<20)
-	xor(c2a, data2, data2)
-	xor(c2b, data2, data2)
+	c2a.XORKeyStream(data2, data2)
+	c2b.XORKeyStream(data2, data2)

 	if !bytes.Equal(data1, data2) {
 		t.Fatalf("bad block")