Commit 69ddb7a4 authored by Rob Pike

[dev.cc] all: edit assembly source for ARM to be more regular

Several .s files for ARM had properties the new assembler will not support.
These include (a condensed before/after sketch follows the list):

- mentioning SP or PC as a hardware register
	These are always pseudo-registers except that in some contexts
	they're not, and it's confusing because the context should not affect
	which register you mean. Change the references to the hardware
	registers to be explicit: R13 for SP, R15 for PC.
- constant creation using assignment
	The files say a=b when they could instead say #define a b.
	There is no reason to have both mechanisms.
- R(0) to refer to R0.
	Some macros use this to a great extent. Again, it's easy just to
	use a #define to rename a register.
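
Sketch of the three patterns, assembled from fragments of the files
below (not one contiguous listing):

	// Before: constants defined by assignment, R() indexing to
	// name a register, and SP/PC used as hardware registers.
	dst = 0
	MOVW	0(FP), R(dst)
	MOVW	SP, gobuf_sp(R0)
	MOVW	$0xffff0fc0, PC

	// After: #define covers both jobs, and R13/R15 are written
	// explicitly where the hardware SP/PC is meant.
	#define Rdst R0
	MOVW	0(FP), Rdst
	MOVW	R13, gobuf_sp(R0)
	MOVW	$0xffff0fc0, R15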

Change-Id: I002335ace8e876c5b63c71c2560533eb835346d2
Reviewed-on: https://go-review.googlesource.com/4822
Reviewed-by: Dave Cheney <dave@cheney.net>
parent 2ecefd41
@@ -7,56 +7,56 @@
#include "textflag.h"
// Registers
dst = 0
src = 1
n = 2
state = 3
pi = 4
pj = 5
i = 6
j = 7
k = 8
t = 11
t2 = 12
#define Rdst R0
#define Rsrc R1
#define Rn R2
#define Rstate R3
#define Rpi R4
#define Rpj R5
#define Ri R6
#define Rj R7
#define Rk R8
#define Rt R11
#define Rt2 R12
// func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8)
TEXT ·xorKeyStream(SB),NOSPLIT,$0
MOVW 0(FP), R(dst)
MOVW 4(FP), R(src)
MOVW 8(FP), R(n)
MOVW 12(FP), R(state)
MOVW 16(FP), R(pi)
MOVW 20(FP), R(pj)
MOVBU (R(pi)), R(i)
MOVBU (R(pj)), R(j)
MOVW $0, R(k)
MOVW 0(FP), Rdst
MOVW 4(FP), Rsrc
MOVW 8(FP), Rn
MOVW 12(FP), Rstate
MOVW 16(FP), Rpi
MOVW 20(FP), Rpj
MOVBU (Rpi), Ri
MOVBU (Rpj), Rj
MOVW $0, Rk
loop:
// i += 1; j += state[i]
ADD $1, R(i)
AND $0xff, R(i)
MOVBU R(i)<<2(R(state)), R(t)
ADD R(t), R(j)
AND $0xff, R(j)
ADD $1, Ri
AND $0xff, Ri
MOVBU Ri<<2(Rstate), Rt
ADD Rt, Rj
AND $0xff, Rj
// swap state[i] <-> state[j]
MOVBU R(j)<<2(R(state)), R(t2)
MOVB R(t2), R(i)<<2(R(state))
MOVB R(t), R(j)<<2(R(state))
MOVBU Rj<<2(Rstate), Rt2
MOVB Rt2, Ri<<2(Rstate)
MOVB Rt, Rj<<2(Rstate)
// dst[k] = src[k] ^ state[state[i] + state[j]]
ADD R(t2), R(t)
AND $0xff, R(t)
MOVBU R(t)<<2(R(state)), R(t)
MOVBU R(k)<<0(R(src)), R(t2)
EOR R(t), R(t2)
MOVB R(t2), R(k)<<0(R(dst))
ADD $1, R(k)
CMP R(k), R(n)
ADD Rt2, Rt
AND $0xff, Rt
MOVBU Rt<<2(Rstate), Rt
MOVBU Rk<<0(Rsrc), Rt2
EOR Rt, Rt2
MOVB Rt2, Rk<<0(Rdst)
ADD $1, Rk
CMP Rk, Rn
BNE loop
done:
MOVB R(i), (R(pi))
MOVB R(j), (R(pj))
MOVB Ri, (Rpi)
MOVB Rj, (Rpj)
RET
@@ -23,20 +23,20 @@
// the round macros instead of by explicit move instructions.
// Register definitions
data = 0 // Pointer to incoming data
const = 1 // Current constant for SHA round
a = 2 // SHA1 accumulator
b = 3 // SHA1 accumulator
c = 4 // SHA1 accumulator
d = 5 // SHA1 accumulator
e = 6 // SHA1 accumulator
t0 = 7 // Temporary
t1 = 8 // Temporary
#define Rdata R0 // Pointer to incoming data
#define Rconst R1 // Current constant for SHA round
#define Ra R2 // SHA1 accumulator
#define Rb R3 // SHA1 accumulator
#define Rc R4 // SHA1 accumulator
#define Rd R5 // SHA1 accumulator
#define Re R6 // SHA1 accumulator
#define Rt0 R7 // Temporary
#define Rt1 R8 // Temporary
// r9, r10 are forbidden
// r11 is OK provided you check the assembler that no synthetic instructions use it
t2 = 11 // Temporary
ctr = 12 // loop counter
w = 14 // point to w buffer
#define Rt2 R11 // Temporary
#define Rctr R12 // loop counter
#define Rw R14 // point to w buffer
// func block(dig *digest, p []byte)
// 0(FP) is *digest
@@ -45,173 +45,173 @@ w = 14 // point to w buffer
//12(FP) is p.cap
//
// Stack frame
p_end = -4 // -4(SP) pointer to the end of data
p_data = p_end - 4 // -8(SP) current data pointer
w_buf = p_data - 4*80 // -328(SP) 80 words temporary buffer w uint32[80]
saved = w_buf - 4*5 // -348(SP) saved sha1 registers a,b,c,d,e - these must be last
#define p_end -4 // -4(SP) pointer to the end of data
#define p_data (p_end - 4) // -8(SP) current data pointer
#define w_buf (p_data - 4*80) // -328(SP) 80 words temporary buffer w uint32[80]
#define saved (w_buf - 4*5) // -348(SP) saved sha1 registers a,b,c,d,e - these must be last
// Total size +4 for saved LR is 352
// w[i] = p[j]<<24 | p[j+1]<<16 | p[j+2]<<8 | p[j+3]
// e += w[i]
#define LOAD(e) \
MOVBU 2(R(data)), R(t0) ; \
MOVBU 3(R(data)), R(t1) ; \
MOVBU 1(R(data)), R(t2) ; \
ORR R(t0)<<8, R(t1), R(t0) ; \
MOVBU.P 4(R(data)), R(t1) ; \
ORR R(t2)<<16, R(t0), R(t0) ; \
ORR R(t1)<<24, R(t0), R(t0) ; \
MOVW.P R(t0), 4(R(w)) ; \
ADD R(t0), R(e), R(e)
#define LOAD(Re) \
MOVBU 2(Rdata), Rt0 ; \
MOVBU 3(Rdata), Rt1 ; \
MOVBU 1(Rdata), Rt2 ; \
ORR Rt0<<8, Rt1, Rt0 ; \
MOVBU.P 4(Rdata), Rt1 ; \
ORR Rt2<<16, Rt0, Rt0 ; \
ORR Rt1<<24, Rt0, Rt0 ; \
MOVW.P Rt0, 4(Rw) ; \
ADD Rt0, Re, Re
// tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf]
// w[i&0xf] = tmp<<1 | tmp>>(32-1)
// e += w[i&0xf]
#define SHUFFLE(e) \
MOVW (-16*4)(R(w)), R(t0) ; \
MOVW (-14*4)(R(w)), R(t1) ; \
MOVW (-8*4)(R(w)), R(t2) ; \
EOR R(t0), R(t1), R(t0) ; \
MOVW (-3*4)(R(w)), R(t1) ; \
EOR R(t2), R(t0), R(t0) ; \
EOR R(t0), R(t1), R(t0) ; \
MOVW R(t0)@>(32-1), R(t0) ; \
MOVW.P R(t0), 4(R(w)) ; \
ADD R(t0), R(e), R(e)
#define SHUFFLE(Re) \
MOVW (-16*4)(Rw), Rt0 ; \
MOVW (-14*4)(Rw), Rt1 ; \
MOVW (-8*4)(Rw), Rt2 ; \
EOR Rt0, Rt1, Rt0 ; \
MOVW (-3*4)(Rw), Rt1 ; \
EOR Rt2, Rt0, Rt0 ; \
EOR Rt0, Rt1, Rt0 ; \
MOVW Rt0@>(32-1), Rt0 ; \
MOVW.P Rt0, 4(Rw) ; \
ADD Rt0, Re, Re
// t1 = (b & c) | ((~b) & d)
#define FUNC1(a, b, c, d, e) \
MVN R(b), R(t1) ; \
AND R(b), R(c), R(t0) ; \
AND R(d), R(t1), R(t1) ; \
ORR R(t0), R(t1), R(t1)
#define FUNC1(Ra, Rb, Rc, Rd, Re) \
MVN Rb, Rt1 ; \
AND Rb, Rc, Rt0 ; \
AND Rd, Rt1, Rt1 ; \
ORR Rt0, Rt1, Rt1
// t1 = b ^ c ^ d
#define FUNC2(a, b, c, d, e) \
EOR R(b), R(c), R(t1) ; \
EOR R(d), R(t1), R(t1)
#define FUNC2(Ra, Rb, Rc, Rd, Re) \
EOR Rb, Rc, Rt1 ; \
EOR Rd, Rt1, Rt1
// t1 = (b & c) | (b & d) | (c & d) =
// t1 = (b & c) | ((b | c) & d)
#define FUNC3(a, b, c, d, e) \
ORR R(b), R(c), R(t0) ; \
AND R(b), R(c), R(t1) ; \
AND R(d), R(t0), R(t0) ; \
ORR R(t0), R(t1), R(t1)
#define FUNC3(Ra, Rb, Rc, Rd, Re) \
ORR Rb, Rc, Rt0 ; \
AND Rb, Rc, Rt1 ; \
AND Rd, Rt0, Rt0 ; \
ORR Rt0, Rt1, Rt1
#define FUNC4 FUNC2
// a5 := a<<5 | a>>(32-5)
// b = b<<30 | b>>(32-30)
// e = a5 + t1 + e + const
#define MIX(a, b, c, d, e) \
ADD R(t1), R(e), R(e) ; \
MOVW R(b)@>(32-30), R(b) ; \
ADD R(a)@>(32-5), R(e), R(e) ; \
ADD R(const), R(e), R(e)
#define ROUND1(a, b, c, d, e) \
LOAD(e) ; \
FUNC1(a, b, c, d, e) ; \
MIX(a, b, c, d, e)
#define ROUND1x(a, b, c, d, e) \
SHUFFLE(e) ; \
FUNC1(a, b, c, d, e) ; \
MIX(a, b, c, d, e)
#define ROUND2(a, b, c, d, e) \
SHUFFLE(e) ; \
FUNC2(a, b, c, d, e) ; \
MIX(a, b, c, d, e)
#define ROUND3(a, b, c, d, e) \
SHUFFLE(e) ; \
FUNC3(a, b, c, d, e) ; \
MIX(a, b, c, d, e)
#define ROUND4(a, b, c, d, e) \
SHUFFLE(e) ; \
FUNC4(a, b, c, d, e) ; \
MIX(a, b, c, d, e)
#define MIX(Ra, Rb, Rc, Rd, Re) \
ADD Rt1, Re, Re ; \
MOVW Rb@>(32-30), Rb ; \
ADD Ra@>(32-5), Re, Re ; \
ADD Rconst, Re, Re
#define ROUND1(Ra, Rb, Rc, Rd, Re) \
LOAD(Re) ; \
FUNC1(Ra, Rb, Rc, Rd, Re) ; \
MIX(Ra, Rb, Rc, Rd, Re)
#define ROUND1x(Ra, Rb, Rc, Rd, Re) \
SHUFFLE(Re) ; \
FUNC1(Ra, Rb, Rc, Rd, Re) ; \
MIX(Ra, Rb, Rc, Rd, Re)
#define ROUND2(Ra, Rb, Rc, Rd, Re) \
SHUFFLE(Re) ; \
FUNC2(Ra, Rb, Rc, Rd, Re) ; \
MIX(Ra, Rb, Rc, Rd, Re)
#define ROUND3(Ra, Rb, Rc, Rd, Re) \
SHUFFLE(Re) ; \
FUNC3(Ra, Rb, Rc, Rd, Re) ; \
MIX(Ra, Rb, Rc, Rd, Re)
#define ROUND4(Ra, Rb, Rc, Rd, Re) \
SHUFFLE(Re) ; \
FUNC4(Ra, Rb, Rc, Rd, Re) ; \
MIX(Ra, Rb, Rc, Rd, Re)
// func block(dig *digest, p []byte)
TEXT ·block(SB), 0, $352-16
MOVW p+4(FP), R(data) // pointer to the data
MOVW p_len+8(FP), R(t0) // number of bytes
ADD R(data), R(t0)
MOVW R(t0), p_end(SP) // pointer to end of data
MOVW p+4(FP), Rdata // pointer to the data
MOVW p_len+8(FP), Rt0 // number of bytes
ADD Rdata, Rt0
MOVW Rt0, p_end(SP) // pointer to end of data
// Load up initial SHA1 accumulator
MOVW dig+0(FP), R(t0)
MOVM.IA (R(t0)), [R(a),R(b),R(c),R(d),R(e)]
MOVW dig+0(FP), Rt0
MOVM.IA (Rt0), [Ra,Rb,Rc,Rd,Re]
loop:
// Save registers at SP+4 onwards
MOVM.IB [R(a),R(b),R(c),R(d),R(e)], (R13)
MOVW $w_buf(SP), R(w)
MOVW $0x5A827999, R(const)
MOVW $3, R(ctr)
loop1: ROUND1(a, b, c, d, e)
ROUND1(e, a, b, c, d)
ROUND1(d, e, a, b, c)
ROUND1(c, d, e, a, b)
ROUND1(b, c, d, e, a)
SUB.S $1, R(ctr)
MOVM.IB [Ra,Rb,Rc,Rd,Re], (R13)
MOVW $w_buf(SP), Rw
MOVW $0x5A827999, Rconst
MOVW $3, Rctr
loop1: ROUND1(Ra, Rb, Rc, Rd, Re)
ROUND1(Re, Ra, Rb, Rc, Rd)
ROUND1(Rd, Re, Ra, Rb, Rc)
ROUND1(Rc, Rd, Re, Ra, Rb)
ROUND1(Rb, Rc, Rd, Re, Ra)
SUB.S $1, Rctr
BNE loop1
ROUND1(a, b, c, d, e)
ROUND1x(e, a, b, c, d)
ROUND1x(d, e, a, b, c)
ROUND1x(c, d, e, a, b)
ROUND1x(b, c, d, e, a)
ROUND1(Ra, Rb, Rc, Rd, Re)
ROUND1x(Re, Ra, Rb, Rc, Rd)
ROUND1x(Rd, Re, Ra, Rb, Rc)
ROUND1x(Rc, Rd, Re, Ra, Rb)
ROUND1x(Rb, Rc, Rd, Re, Ra)
MOVW $0x6ED9EBA1, R(const)
MOVW $4, R(ctr)
loop2: ROUND2(a, b, c, d, e)
ROUND2(e, a, b, c, d)
ROUND2(d, e, a, b, c)
ROUND2(c, d, e, a, b)
ROUND2(b, c, d, e, a)
SUB.S $1, R(ctr)
MOVW $0x6ED9EBA1, Rconst
MOVW $4, Rctr
loop2: ROUND2(Ra, Rb, Rc, Rd, Re)
ROUND2(Re, Ra, Rb, Rc, Rd)
ROUND2(Rd, Re, Ra, Rb, Rc)
ROUND2(Rc, Rd, Re, Ra, Rb)
ROUND2(Rb, Rc, Rd, Re, Ra)
SUB.S $1, Rctr
BNE loop2
MOVW $0x8F1BBCDC, R(const)
MOVW $4, R(ctr)
loop3: ROUND3(a, b, c, d, e)
ROUND3(e, a, b, c, d)
ROUND3(d, e, a, b, c)
ROUND3(c, d, e, a, b)
ROUND3(b, c, d, e, a)
SUB.S $1, R(ctr)
MOVW $0x8F1BBCDC, Rconst
MOVW $4, Rctr
loop3: ROUND3(Ra, Rb, Rc, Rd, Re)
ROUND3(Re, Ra, Rb, Rc, Rd)
ROUND3(Rd, Re, Ra, Rb, Rc)
ROUND3(Rc, Rd, Re, Ra, Rb)
ROUND3(Rb, Rc, Rd, Re, Ra)
SUB.S $1, Rctr
BNE loop3
MOVW $0xCA62C1D6, R(const)
MOVW $4, R(ctr)
loop4: ROUND4(a, b, c, d, e)
ROUND4(e, a, b, c, d)
ROUND4(d, e, a, b, c)
ROUND4(c, d, e, a, b)
ROUND4(b, c, d, e, a)
SUB.S $1, R(ctr)
MOVW $0xCA62C1D6, Rconst
MOVW $4, Rctr
loop4: ROUND4(Ra, Rb, Rc, Rd, Re)
ROUND4(Re, Ra, Rb, Rc, Rd)
ROUND4(Rd, Re, Ra, Rb, Rc)
ROUND4(Rc, Rd, Re, Ra, Rb)
ROUND4(Rb, Rc, Rd, Re, Ra)
SUB.S $1, Rctr
BNE loop4
// Accumulate - restoring registers from SP+4
MOVM.IB (R13), [R(t0),R(t1),R(t2),R(ctr),R(w)]
ADD R(t0), R(a)
ADD R(t1), R(b)
ADD R(t2), R(c)
ADD R(ctr), R(d)
ADD R(w), R(e)
MOVW p_end(SP), R(t0)
CMP R(t0), R(data)
MOVM.IB (R13), [Rt0,Rt1,Rt2,Rctr,Rw]
ADD Rt0, Ra
ADD Rt1, Rb
ADD Rt2, Rc
ADD Rctr, Rd
ADD Rw, Re
MOVW p_end(SP), Rt0
CMP Rt0, Rdata
BLO loop
// Save final SHA1 accumulator
MOVW dig+0(FP), R(t0)
MOVM.IA [R(a),R(b),R(c),R(d),R(e)], (R(t0))
MOVW dig+0(FP), Rt0
MOVM.IA [Ra,Rb,Rc,Rd,Re], (Rt0)
RET
@@ -107,7 +107,7 @@ TEXT runtime·asminit(SB),NOSPLIT,$0-0
// save state in Gobuf; setjmp
TEXT runtime·gosave(SB),NOSPLIT,$-4-4
MOVW 0(FP), R0 // gobuf
MOVW SP, gobuf_sp(R0)
MOVW R13, gobuf_sp(R0)
MOVW LR, gobuf_pc(R0)
MOVW g, gobuf_g(R0)
MOVW $0, R11
@@ -133,7 +133,7 @@ TEXT runtime·gogo(SB),NOSPLIT,$-4-4
// after this point: it must be straight-line code until the
// final B instruction.
// See large comment in sigprof for more details.
MOVW gobuf_sp(R1), SP // restore SP
MOVW gobuf_sp(R1), R13 // restore SP==R13
MOVW gobuf_lr(R1), LR
MOVW gobuf_ret(R1), R0
MOVW gobuf_ctxt(R1), R7
@@ -152,7 +152,7 @@ TEXT runtime·gogo(SB),NOSPLIT,$-4-4
// to keep running g.
TEXT runtime·mcall(SB),NOSPLIT,$-4-4
// Save caller state in g->sched.
MOVW SP, (g_sched+gobuf_sp)(g)
MOVW R13, (g_sched+gobuf_sp)(g)
MOVW LR, (g_sched+gobuf_pc)(g)
MOVW $0, R11
MOVW R11, (g_sched+gobuf_lr)(g)
@@ -170,8 +170,8 @@ TEXT runtime·mcall(SB),NOSPLIT,$-4-4
CMP $0, R11
BL.NE runtime·save_g(SB)
MOVW fn+0(FP), R0
MOVW (g_sched+gobuf_sp)(g), SP
SUB $8, SP
MOVW (g_sched+gobuf_sp)(g), R13
SUB $8, R13
MOVW R1, 4(SP)
MOVW R0, R7
MOVW 0(R0), R0
@@ -217,7 +217,7 @@ switch:
MOVW $runtime·systemstack_switch(SB), R3
ADD $4, R3, R3 // get past push {lr}
MOVW R3, (g_sched+gobuf_pc)(g)
MOVW SP, (g_sched+gobuf_sp)(g)
MOVW R13, (g_sched+gobuf_sp)(g)
MOVW LR, (g_sched+gobuf_lr)(g)
MOVW g, (g_sched+gobuf_g)(g)
@@ -231,7 +231,7 @@ switch:
SUB $4, R3, R3
MOVW $runtime·mstart(SB), R4
MOVW R4, 0(R3)
MOVW R3, SP
MOVW R3, R13
// call target function
MOVW R0, R7
@@ -242,7 +242,7 @@ switch:
MOVW g_m(g), R1
MOVW m_curg(R1), R0
BL setg<>(SB)
MOVW (g_sched+gobuf_sp)(g), SP
MOVW (g_sched+gobuf_sp)(g), R13
MOVW $0, R3
MOVW R3, (g_sched+gobuf_sp)(g)
RET
@@ -284,21 +284,21 @@ TEXT runtime·morestack(SB),NOSPLIT,$-4-0
// Called from f.
// Set g->sched to context in f.
MOVW R7, (g_sched+gobuf_ctxt)(g)
MOVW SP, (g_sched+gobuf_sp)(g)
MOVW R13, (g_sched+gobuf_sp)(g)
MOVW LR, (g_sched+gobuf_pc)(g)
MOVW R3, (g_sched+gobuf_lr)(g)
// Called from f.
// Set m->morebuf to f's caller.
MOVW R3, (m_morebuf+gobuf_pc)(R8) // f's caller's PC
MOVW SP, (m_morebuf+gobuf_sp)(R8) // f's caller's SP
MOVW R13, (m_morebuf+gobuf_sp)(R8) // f's caller's SP
MOVW $4(SP), R3 // f's argument pointer
MOVW g, (m_morebuf+gobuf_g)(R8)
// Call newstack on m->g0's stack.
MOVW m_g0(R8), R0
BL setg<>(SB)
MOVW (g_sched+gobuf_sp)(g), SP
MOVW (g_sched+gobuf_sp)(g), R13
BL runtime·newstack(SB)
// Not reached, but make sure the return PC from the call to newstack
@@ -362,7 +362,7 @@ TEXT NAME(SB), WRAPPER, $MAXSIZE-20; \
/* copy arguments to stack */ \
MOVW argptr+8(FP), R0; \
MOVW argsize+12(FP), R2; \
ADD $4, SP, R1; \
ADD $4, R13, R1; \
CMP $0, R2; \
B.EQ 5(PC); \
MOVBU.P 1(R0), R5; \
@@ -378,7 +378,7 @@ TEXT NAME(SB), WRAPPER, $MAXSIZE-20; \
MOVW argptr+8(FP), R0; \
MOVW argsize+12(FP), R2; \
MOVW retoffset+16(FP), R3; \
ADD $4, SP, R1; \
ADD $4, R13, R1; \
ADD R3, R1; \
ADD R3, R0; \
SUB R3, R2; \
@@ -443,8 +443,8 @@ TEXT runtime·jmpdefer(SB),NOSPLIT,$0-8
MOVW 0(SP), LR
MOVW $-4(LR), LR // BL deferreturn
MOVW fv+0(FP), R7
MOVW argp+4(FP), SP
MOVW $-4(SP), SP // SP is 4 below argp, due to saved LR
MOVW argp+4(FP), R13
MOVW $-4(SP), R13 // SP is 4 below argp, due to saved LR
MOVW 0(R7), R1
B (R1)
@@ -25,31 +25,31 @@
#include "textflag.h"
TO = 8
TOE = 11
N = 12
TMP = 12 /* N and TMP don't overlap */
#define TO R8
#define TOE R11
#define N R12
#define TMP R12 /* N and TMP don't overlap */
TEXT runtime·memclr(SB),NOSPLIT,$0-8
MOVW ptr+0(FP), R(TO)
MOVW n+4(FP), R(N)
MOVW $0, R(0)
MOVW ptr+0(FP), TO
MOVW n+4(FP), N
MOVW $0, R0
ADD R(N), R(TO), R(TOE) /* to end pointer */
ADD N, TO, TOE /* to end pointer */
CMP $4, R(N) /* need at least 4 bytes to copy */
CMP $4, N /* need at least 4 bytes to copy */
BLT _1tail
_4align: /* align on 4 */
AND.S $3, R(TO), R(TMP)
AND.S $3, TO, TMP
BEQ _4aligned
MOVBU.P R(0), 1(R(TO)) /* implicit write back */
MOVBU.P R0, 1(TO) /* implicit write back */
B _4align
_4aligned:
SUB $31, R(TOE), R(TMP) /* do 32-byte chunks if possible */
CMP R(TMP), R(TO)
SUB $31, TOE, TMP /* do 32-byte chunks if possible */
CMP TMP, TO
BHS _4tail
MOVW R0, R1 /* replicate */
@@ -61,26 +61,26 @@ _4aligned:
MOVW R0, R7
_f32loop:
CMP R(TMP), R(TO)
CMP TMP, TO
BHS _4tail
MOVM.IA.W [R0-R7], (R(TO))
MOVM.IA.W [R0-R7], (TO)
B _f32loop
_4tail:
SUB $3, R(TOE), R(TMP) /* do remaining words if possible */
SUB $3, TOE, TMP /* do remaining words if possible */
_4loop:
CMP R(TMP), R(TO)
CMP TMP, TO
BHS _1tail
MOVW.P R(0), 4(R(TO)) /* implicit write back */
MOVW.P R0, 4(TO) /* implicit write back */
B _4loop
_1tail:
CMP R(TO), R(TOE)
CMP TO, TOE
BEQ _return
MOVBU.P R(0), 1(R(TO)) /* implicit write back */
MOVBU.P R0, 1(TO) /* implicit write back */
B _1tail
_return:
@@ -26,138 +26,138 @@
#include "textflag.h"
// TE or TS are spilled to the stack during bulk register moves.
TS = 0
TE = 8
#define TS R0
#define TE R8
// Warning: the linker will use R11 to synthesize certain instructions. Please
// take care and double check with objdump.
FROM = 11
N = 12
TMP = 12 /* N and TMP don't overlap */
TMP1 = 5
RSHIFT = 5
LSHIFT = 6
OFFSET = 7
BR0 = 0 /* shared with TS */
BW0 = 1
BR1 = 1
BW1 = 2
BR2 = 2
BW2 = 3
BR3 = 3
BW3 = 4
FW0 = 1
FR0 = 2
FW1 = 2
FR1 = 3
FW2 = 3
FR2 = 4
FW3 = 4
FR3 = 8 /* shared with TE */
#define FROM R11
#define N R12
#define TMP R12 /* N and TMP don't overlap */
#define TMP1 R5
#define RSHIFT R5
#define LSHIFT R6
#define OFFSET R7
#define BR0 R0 /* shared with TS */
#define BW0 R1
#define BR1 R1
#define BW1 R2
#define BR2 R2
#define BW2 R3
#define BR3 R3
#define BW3 R4
#define FW0 R1
#define FR0 R2
#define FW1 R2
#define FR1 R3
#define FW2 R3
#define FR2 R4
#define FW3 R4
#define FR3 R8 /* shared with TE */
TEXT runtime·memmove(SB), NOSPLIT, $4-12
_memmove:
MOVW to+0(FP), R(TS)
MOVW from+4(FP), R(FROM)
MOVW n+8(FP), R(N)
MOVW to+0(FP), TS
MOVW from+4(FP), FROM
MOVW n+8(FP), N
ADD R(N), R(TS), R(TE) /* to end pointer */
ADD N, TS, TE /* to end pointer */
CMP R(FROM), R(TS)
CMP FROM, TS
BLS _forward
_back:
ADD R(N), R(FROM) /* from end pointer */
CMP $4, R(N) /* need at least 4 bytes to copy */
ADD N, FROM /* from end pointer */
CMP $4, N /* need at least 4 bytes to copy */
BLT _b1tail
_b4align: /* align destination on 4 */
AND.S $3, R(TE), R(TMP)
AND.S $3, TE, TMP
BEQ _b4aligned
MOVBU.W -1(R(FROM)), R(TMP) /* pre-indexed */
MOVBU.W R(TMP), -1(R(TE)) /* pre-indexed */
MOVBU.W -1(FROM), TMP /* pre-indexed */
MOVBU.W TMP, -1(TE) /* pre-indexed */
B _b4align
_b4aligned: /* is source now aligned? */
AND.S $3, R(FROM), R(TMP)
AND.S $3, FROM, TMP
BNE _bunaligned
ADD $31, R(TS), R(TMP) /* do 32-byte chunks if possible */
MOVW R(TS), savedts-4(SP)
ADD $31, TS, TMP /* do 32-byte chunks if possible */
MOVW TS, savedts-4(SP)
_b32loop:
CMP R(TMP), R(TE)
CMP TMP, TE
BLS _b4tail
MOVM.DB.W (R(FROM)), [R0-R7]
MOVM.DB.W [R0-R7], (R(TE))
MOVM.DB.W (FROM), [R0-R7]
MOVM.DB.W [R0-R7], (TE)
B _b32loop
_b4tail: /* do remaining words if possible */
MOVW savedts-4(SP), R(TS)
ADD $3, R(TS), R(TMP)
MOVW savedts-4(SP), TS
ADD $3, TS, TMP
_b4loop:
CMP R(TMP), R(TE)
CMP TMP, TE
BLS _b1tail
MOVW.W -4(R(FROM)), R(TMP1) /* pre-indexed */
MOVW.W R(TMP1), -4(R(TE)) /* pre-indexed */
MOVW.W -4(FROM), TMP1 /* pre-indexed */
MOVW.W TMP1, -4(TE) /* pre-indexed */
B _b4loop
_b1tail: /* remaining bytes */
CMP R(TE), R(TS)
CMP TE, TS
BEQ _return
MOVBU.W -1(R(FROM)), R(TMP) /* pre-indexed */
MOVBU.W R(TMP), -1(R(TE)) /* pre-indexed */
MOVBU.W -1(FROM), TMP /* pre-indexed */
MOVBU.W TMP, -1(TE) /* pre-indexed */
B _b1tail
_forward:
CMP $4, R(N) /* need at least 4 bytes to copy */
CMP $4, N /* need at least 4 bytes to copy */
BLT _f1tail
_f4align: /* align destination on 4 */
AND.S $3, R(TS), R(TMP)
AND.S $3, TS, TMP
BEQ _f4aligned
MOVBU.P 1(R(FROM)), R(TMP) /* implicit write back */
MOVBU.P R(TMP), 1(R(TS)) /* implicit write back */
MOVBU.P 1(FROM), TMP /* implicit write back */
MOVBU.P TMP, 1(TS) /* implicit write back */
B _f4align
_f4aligned: /* is source now aligned? */
AND.S $3, R(FROM), R(TMP)
AND.S $3, FROM, TMP
BNE _funaligned
SUB $31, R(TE), R(TMP) /* do 32-byte chunks if possible */
MOVW R(TE), savedte-4(SP)
SUB $31, TE, TMP /* do 32-byte chunks if possible */
MOVW TE, savedte-4(SP)
_f32loop:
CMP R(TMP), R(TS)
CMP TMP, TS
BHS _f4tail
MOVM.IA.W (R(FROM)), [R1-R8]
MOVM.IA.W [R1-R8], (R(TS))
MOVM.IA.W (FROM), [R1-R8]
MOVM.IA.W [R1-R8], (TS)
B _f32loop
_f4tail:
MOVW savedte-4(SP), R(TE)
SUB $3, R(TE), R(TMP) /* do remaining words if possible */
MOVW savedte-4(SP), TE
SUB $3, TE, TMP /* do remaining words if possible */
_f4loop:
CMP R(TMP), R(TS)
CMP TMP, TS
BHS _f1tail
MOVW.P 4(R(FROM)), R(TMP1) /* implicit write back */
MOVW.P R(TMP1), 4(R(TS)) /* implicit write back */
MOVW.P 4(FROM), TMP1 /* implicit write back */
MOVW.P TMP1, 4(TS) /* implicit write back */
B _f4loop
_f1tail:
CMP R(TS), R(TE)
CMP TS, TE
BEQ _return
MOVBU.P 1(R(FROM)), R(TMP) /* implicit write back */
MOVBU.P R(TMP), 1(R(TS)) /* implicit write back */
MOVBU.P 1(FROM), TMP /* implicit write back */
MOVBU.P TMP, 1(TS) /* implicit write back */
B _f1tail
_return:
@@ -165,97 +165,97 @@ _return:
RET
_bunaligned:
CMP $2, R(TMP) /* is R(TMP) < 2 ? */
CMP $2, TMP /* is TMP < 2 ? */
MOVW.LT $8, R(RSHIFT) /* (R(n)<<24)|(R(n-1)>>8) */
MOVW.LT $24, R(LSHIFT)
MOVW.LT $1, R(OFFSET)
MOVW.LT $8, RSHIFT /* (R(n)<<24)|(R(n-1)>>8) */
MOVW.LT $24, LSHIFT
MOVW.LT $1, OFFSET
MOVW.EQ $16, R(RSHIFT) /* (R(n)<<16)|(R(n-1)>>16) */
MOVW.EQ $16, R(LSHIFT)
MOVW.EQ $2, R(OFFSET)
MOVW.EQ $16, RSHIFT /* (R(n)<<16)|(R(n-1)>>16) */
MOVW.EQ $16, LSHIFT
MOVW.EQ $2, OFFSET
MOVW.GT $24, R(RSHIFT) /* (R(n)<<8)|(R(n-1)>>24) */
MOVW.GT $8, R(LSHIFT)
MOVW.GT $3, R(OFFSET)
MOVW.GT $24, RSHIFT /* (R(n)<<8)|(R(n-1)>>24) */
MOVW.GT $8, LSHIFT
MOVW.GT $3, OFFSET
ADD $16, R(TS), R(TMP) /* do 16-byte chunks if possible */
CMP R(TMP), R(TE)
ADD $16, TS, TMP /* do 16-byte chunks if possible */
CMP TMP, TE
BLS _b1tail
BIC $3, R(FROM) /* align source */
MOVW R(TS), savedts-4(SP)
MOVW (R(FROM)), R(BR0) /* prime first block register */
BIC $3, FROM /* align source */
MOVW TS, savedts-4(SP)
MOVW (FROM), BR0 /* prime first block register */
_bu16loop:
CMP R(TMP), R(TE)
CMP TMP, TE
BLS _bu1tail
MOVW R(BR0)<<R(LSHIFT), R(BW3)
MOVM.DB.W (R(FROM)), [R(BR0)-R(BR3)]
ORR R(BR3)>>R(RSHIFT), R(BW3)
MOVW BR0<<LSHIFT, BW3
MOVM.DB.W (FROM), [BR0-BR3]
ORR BR3>>RSHIFT, BW3
MOVW R(BR3)<<R(LSHIFT), R(BW2)
ORR R(BR2)>>R(RSHIFT), R(BW2)
MOVW BR3<<LSHIFT, BW2
ORR BR2>>RSHIFT, BW2
MOVW R(BR2)<<R(LSHIFT), R(BW1)
ORR R(BR1)>>R(RSHIFT), R(BW1)
MOVW BR2<<LSHIFT, BW1
ORR BR1>>RSHIFT, BW1
MOVW R(BR1)<<R(LSHIFT), R(BW0)
ORR R(BR0)>>R(RSHIFT), R(BW0)
MOVW BR1<<LSHIFT, BW0
ORR BR0>>RSHIFT, BW0
MOVM.DB.W [R(BW0)-R(BW3)], (R(TE))
MOVM.DB.W [BW0-BW3], (TE)
B _bu16loop
_bu1tail:
MOVW savedts-4(SP), R(TS)
ADD R(OFFSET), R(FROM)
MOVW savedts-4(SP), TS
ADD OFFSET, FROM
B _b1tail
_funaligned:
CMP $2, R(TMP)
CMP $2, TMP
MOVW.LT $8, R(RSHIFT) /* (R(n+1)<<24)|(R(n)>>8) */
MOVW.LT $24, R(LSHIFT)
MOVW.LT $3, R(OFFSET)
MOVW.LT $8, RSHIFT /* (R(n+1)<<24)|(R(n)>>8) */
MOVW.LT $24, LSHIFT
MOVW.LT $3, OFFSET
MOVW.EQ $16, R(RSHIFT) /* (R(n+1)<<16)|(R(n)>>16) */
MOVW.EQ $16, R(LSHIFT)
MOVW.EQ $2, R(OFFSET)
MOVW.EQ $16, RSHIFT /* (R(n+1)<<16)|(R(n)>>16) */
MOVW.EQ $16, LSHIFT
MOVW.EQ $2, OFFSET
MOVW.GT $24, R(RSHIFT) /* (R(n+1)<<8)|(R(n)>>24) */
MOVW.GT $8, R(LSHIFT)
MOVW.GT $1, R(OFFSET)
MOVW.GT $24, RSHIFT /* (R(n+1)<<8)|(R(n)>>24) */
MOVW.GT $8, LSHIFT
MOVW.GT $1, OFFSET
SUB $16, R(TE), R(TMP) /* do 16-byte chunks if possible */
CMP R(TMP), R(TS)
SUB $16, TE, TMP /* do 16-byte chunks if possible */
CMP TMP, TS
BHS _f1tail
BIC $3, R(FROM) /* align source */
MOVW R(TE), savedte-4(SP)
MOVW.P 4(R(FROM)), R(FR3) /* prime last block register, implicit write back */
BIC $3, FROM /* align source */
MOVW TE, savedte-4(SP)
MOVW.P 4(FROM), FR3 /* prime last block register, implicit write back */
_fu16loop:
CMP R(TMP), R(TS)
CMP TMP, TS
BHS _fu1tail
MOVW R(FR3)>>R(RSHIFT), R(FW0)
MOVM.IA.W (R(FROM)), [R(FR0),R(FR1),R(FR2),R(FR3)]
ORR R(FR0)<<R(LSHIFT), R(FW0)
MOVW FR3>>RSHIFT, FW0
MOVM.IA.W (FROM), [FR0,FR1,FR2,FR3]
ORR FR0<<LSHIFT, FW0
MOVW R(FR0)>>R(RSHIFT), R(FW1)
ORR R(FR1)<<R(LSHIFT), R(FW1)
MOVW FR0>>RSHIFT, FW1
ORR FR1<<LSHIFT, FW1
MOVW R(FR1)>>R(RSHIFT), R(FW2)
ORR R(FR2)<<R(LSHIFT), R(FW2)
MOVW FR1>>RSHIFT, FW2
ORR FR2<<LSHIFT, FW2
MOVW R(FR2)>>R(RSHIFT), R(FW3)
ORR R(FR3)<<R(LSHIFT), R(FW3)
MOVW FR2>>RSHIFT, FW3
ORR FR3<<LSHIFT, FW3
MOVM.IA.W [R(FW0),R(FW1),R(FW2),R(FW3)], (R(TS))
MOVM.IA.W [FW0,FW1,FW2,FW3], (TS)
B _fu16loop
_fu1tail:
MOVW savedte-4(SP), R(TE)
SUB R(OFFSET), R(FROM)
MOVW savedte-4(SP), TE
SUB OFFSET, FROM
B _f1tail
@@ -77,7 +77,7 @@ DATA bad_abi_msg+0x2c(SB)/1, $0xa
GLOBL bad_abi_msg(SB), RODATA, $45
TEXT oabi_syscall<>(SB),NOSPLIT,$-4
ADD $1, PC, R4
ADD $1, R15, R4 // R15 is hardware PC
WORD $0xe12fff14 //BX (R4) // enter thumb mode
// TODO(minux): only supports little-endian CPUs
WORD $0x4770df01 // swi $1; bx lr
......
......@@ -383,7 +383,7 @@ TEXT runtime·usleep(SB),NOSPLIT,$12
// Use kernel version instead of native armcas in asm_arm.s.
// See ../sync/atomic/asm_linux_arm.s for details.
TEXT cas<>(SB),NOSPLIT,$0
MOVW $0xffff0fc0, PC
MOVW $0xffff0fc0, R15 // R15 is hardware PC.
TEXT runtime·cas(SB),NOSPLIT,$0
MOVW ptr+0(FP), R2
@@ -27,8 +27,6 @@
#include "go_tls.h"
#include "textflag.h"
arg=0
/* replaced use of R10 by R11 because the former can be the data segment base register */
TEXT _mulv(SB), NOSPLIT, $0
@@ -111,70 +109,71 @@ TEXT runtime·_sfloatpanic(SB),NOSPLIT,$-4
// Reference:
// Sloss, Andrew, et al.; ARM System Developer's Guide: Designing and Optimizing System Software
// Morgan Kaufmann; 1 edition (April 8, 2004), ISBN 978-1558608740
q = 0 // input d, output q
r = 1 // input n, output r
s = 2 // three temporary variables
M = 3
a = 11
// Be careful: R(a) == R11 will be used by the linker for synthesized instructions.
#define Rq R0 // input d, output q
#define Rr R1 // input n, output r
#define Rs R2 // three temporary variables
#define RM R3
#define Ra R11
// Be careful: Ra == R11 will be used by the linker for synthesized instructions.
TEXT udiv<>(SB),NOSPLIT,$-4
CLZ R(q), R(s) // find normalizing shift
MOVW.S R(q)<<R(s), R(a)
MOVW $fast_udiv_tab<>-64(SB), R(M)
ADD.NE R(a)>>25, R(M), R(a) // index by most significant 7 bits of divisor
MOVBU.NE (R(a)), R(a)
CLZ Rq, Rs // find normalizing shift
MOVW.S Rq<<Rs, Ra
MOVW $fast_udiv_tab<>-64(SB), RM
ADD.NE Ra>>25, RM, Ra // index by most significant 7 bits of divisor
MOVBU.NE (Ra), Ra
SUB.S $7, R(s)
RSB $0, R(q), R(M) // M = -q
MOVW.PL R(a)<<R(s), R(q)
SUB.S $7, Rs
RSB $0, Rq, RM // M = -q
MOVW.PL Ra<<Rs, Rq
// 1st Newton iteration
MUL.PL R(M), R(q), R(a) // a = -q*d
MUL.PL RM, Rq, Ra // a = -q*d
BMI udiv_by_large_d
MULAWT R(a), R(q), R(q), R(q) // q approx q-(q*q*d>>32)
TEQ R(M)->1, R(M) // check for d=0 or d=1
MULAWT Ra, Rq, Rq, Rq // q approx q-(q*q*d>>32)
TEQ RM->1, RM // check for d=0 or d=1
// 2nd Newton iteration
MUL.NE R(M), R(q), R(a)
MOVW.NE $0, R(s)
MULAL.NE R(q), R(a), (R(q),R(s))
MUL.NE RM, Rq, Ra
MOVW.NE $0, Rs
MULAL.NE Rq, Ra, (Rq,Rs)
BEQ udiv_by_0_or_1
// q now accurate enough for a remainder r, 0<=r<3*d
MULLU R(q), R(r), (R(q),R(s)) // q = (r * q) >> 32
ADD R(M), R(r), R(r) // r = n - d
MULA R(M), R(q), R(r), R(r) // r = n - (q+1)*d
MULLU Rq, Rr, (Rq,Rs) // q = (r * q) >> 32
ADD RM, Rr, Rr // r = n - d
MULA RM, Rq, Rr, Rr // r = n - (q+1)*d
// since 0 <= n-q*d < 3*d; thus -d <= r < 2*d
CMN R(M), R(r) // t = r-d
SUB.CS R(M), R(r), R(r) // if (t<-d || t>=0) r=r+d
ADD.CC $1, R(q)
ADD.PL R(M)<<1, R(r)
ADD.PL $2, R(q)
CMN RM, Rr // t = r-d
SUB.CS RM, Rr, Rr // if (t<-d || t>=0) r=r+d
ADD.CC $1, Rq
ADD.PL RM<<1, Rr
ADD.PL $2, Rq
RET
udiv_by_large_d:
// at this point we know d>=2^(31-6)=2^25
SUB $4, R(a), R(a)
RSB $0, R(s), R(s)
MOVW R(a)>>R(s), R(q)
MULLU R(q), R(r), (R(q),R(s))
MULA R(M), R(q), R(r), R(r)
SUB $4, Ra, Ra
RSB $0, Rs, Rs
MOVW Ra>>Rs, Rq
MULLU Rq, Rr, (Rq,Rs)
MULA RM, Rq, Rr, Rr
// q now accurate enough for a remainder r, 0<=r<4*d
CMN R(r)>>1, R(M) // if(r/2 >= d)
ADD.CS R(M)<<1, R(r)
ADD.CS $2, R(q)
CMN R(r), R(M)
ADD.CS R(M), R(r)
ADD.CS $1, R(q)
CMN Rr>>1, RM // if(r/2 >= d)
ADD.CS RM<<1, Rr
ADD.CS $2, Rq
CMN Rr, RM
ADD.CS RM, Rr
ADD.CS $1, Rq
RET
udiv_by_0_or_1:
// carry set if d==1, carry clear if d==0
BCC udiv_by_0
MOVW R(r), R(q)
MOVW $0, R(r)
MOVW Rr, Rq
MOVW $0, Rr
RET
udiv_by_0:
@@ -216,96 +215,96 @@ DATA fast_udiv_tab<>+0x38(SB)/4, $0x85868788
DATA fast_udiv_tab<>+0x3c(SB)/4, $0x81828384
GLOBL fast_udiv_tab<>(SB), RODATA, $64
// The linker will pass numerator in R(TMP), and it also
// expects the result in R(TMP)
TMP = 11
// The linker will pass numerator in RTMP, and it also
// expects the result in RTMP
#define RTMP R11
TEXT _divu(SB), NOSPLIT, $16
MOVW R(q), 4(R13)
MOVW R(r), 8(R13)
MOVW R(s), 12(R13)
MOVW R(M), 16(R13)
MOVW Rq, 4(R13)
MOVW Rr, 8(R13)
MOVW Rs, 12(R13)
MOVW RM, 16(R13)
MOVW R(TMP), R(r) /* numerator */
MOVW 0(FP), R(q) /* denominator */
MOVW RTMP, Rr /* numerator */
MOVW 0(FP), Rq /* denominator */
BL udiv<>(SB)
MOVW R(q), R(TMP)
MOVW 4(R13), R(q)
MOVW 8(R13), R(r)
MOVW 12(R13), R(s)
MOVW 16(R13), R(M)
MOVW Rq, RTMP
MOVW 4(R13), Rq
MOVW 8(R13), Rr
MOVW 12(R13), Rs
MOVW 16(R13), RM
RET
TEXT _modu(SB), NOSPLIT, $16
MOVW R(q), 4(R13)
MOVW R(r), 8(R13)
MOVW R(s), 12(R13)
MOVW R(M), 16(R13)
MOVW Rq, 4(R13)
MOVW Rr, 8(R13)
MOVW Rs, 12(R13)
MOVW RM, 16(R13)
MOVW R(TMP), R(r) /* numerator */
MOVW 0(FP), R(q) /* denominator */
MOVW RTMP, Rr /* numerator */
MOVW 0(FP), Rq /* denominator */
BL udiv<>(SB)
MOVW R(r), R(TMP)
MOVW 4(R13), R(q)
MOVW 8(R13), R(r)
MOVW 12(R13), R(s)
MOVW 16(R13), R(M)
MOVW Rr, RTMP
MOVW 4(R13), Rq
MOVW 8(R13), Rr
MOVW 12(R13), Rs
MOVW 16(R13), RM
RET
TEXT _div(SB),NOSPLIT,$16
MOVW R(q), 4(R13)
MOVW R(r), 8(R13)
MOVW R(s), 12(R13)
MOVW R(M), 16(R13)
MOVW R(TMP), R(r) /* numerator */
MOVW 0(FP), R(q) /* denominator */
CMP $0, R(r)
MOVW Rq, 4(R13)
MOVW Rr, 8(R13)
MOVW Rs, 12(R13)
MOVW RM, 16(R13)
MOVW RTMP, Rr /* numerator */
MOVW 0(FP), Rq /* denominator */
CMP $0, Rr
BGE d1
RSB $0, R(r), R(r)
CMP $0, R(q)
RSB $0, Rr, Rr
CMP $0, Rq
BGE d2
RSB $0, R(q), R(q)
RSB $0, Rq, Rq
d0:
BL udiv<>(SB) /* none/both neg */
MOVW R(q), R(TMP)
MOVW Rq, RTMP
B out1
d1:
CMP $0, R(q)
CMP $0, Rq
BGE d0
RSB $0, R(q), R(q)
RSB $0, Rq, Rq
d2:
BL udiv<>(SB) /* one neg */
RSB $0, R(q), R(TMP)
RSB $0, Rq, RTMP
out1:
MOVW 4(R13), R(q)
MOVW 8(R13), R(r)
MOVW 12(R13), R(s)
MOVW 16(R13), R(M)
MOVW 4(R13), Rq
MOVW 8(R13), Rr
MOVW 12(R13), Rs
MOVW 16(R13), RM
RET
TEXT _mod(SB),NOSPLIT,$16
MOVW R(q), 4(R13)
MOVW R(r), 8(R13)
MOVW R(s), 12(R13)
MOVW R(M), 16(R13)
MOVW R(TMP), R(r) /* numerator */
MOVW 0(FP), R(q) /* denominator */
CMP $0, R(q)
RSB.LT $0, R(q), R(q)
CMP $0, R(r)
MOVW Rq, 4(R13)
MOVW Rr, 8(R13)
MOVW Rs, 12(R13)
MOVW RM, 16(R13)
MOVW RTMP, Rr /* numerator */
MOVW 0(FP), Rq /* denominator */
CMP $0, Rq
RSB.LT $0, Rq, Rq
CMP $0, Rr
BGE m1
RSB $0, R(r), R(r)
RSB $0, Rr, Rr
BL udiv<>(SB) /* neg numerator */
RSB $0, R(r), R(TMP)
RSB $0, Rr, RTMP
B out
m1:
BL udiv<>(SB) /* pos numerator */
MOVW R(r), R(TMP)
MOVW Rr, RTMP
out:
MOVW 4(R13), R(q)
MOVW 8(R13), R(r)
MOVW 12(R13), R(s)
MOVW 16(R13), R(M)
MOVW 4(R13), Rq
MOVW 8(R13), Rr
MOVW 12(R13), Rs
MOVW 16(R13), RM
RET
// _mul64by32 and _div64by32 not implemented on arm
@@ -24,7 +24,7 @@
// http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=b49c0f24cf6744a3f4fd09289fe7cade349dead5
//
TEXT cas<>(SB),NOSPLIT,$0
MOVW $0xffff0fc0, PC
MOVW $0xffff0fc0, R15
TEXT ·CompareAndSwapInt32(SB),NOSPLIT,$0
B ·CompareAndSwapUint32(SB)
@@ -95,7 +95,7 @@ TEXT ·SwapUintptr(SB),NOSPLIT,$0
B ·SwapUint32(SB)
TEXT cas64<>(SB),NOSPLIT,$0
MOVW $0xffff0f60, PC // __kuser_cmpxchg64: Linux-3.1 and above
MOVW $0xffff0f60, R15 // R15 = hardware PC. __kuser_cmpxchg64: Linux-3.1 and above
TEXT kernelCAS64<>(SB),NOSPLIT,$0-21
// int (*__kuser_cmpxchg64_t)(const int64_t *oldval, const int64_t *newval, volatile int64_t *ptr);
@@ -127,17 +127,17 @@ TEXT setupAndCallCAS64<>(SB),NOSPLIT,$-4-21
CMP $5, R0
MOVW.CS $kernelCAS64<>(SB), R1
MOVW.CS R1, armCAS64(SB)
MOVW.CS R1, PC
MOVW.CS R1, R15 // R15 = hardware PC
MOVB runtime·armArch(SB), R0
// LDREXD, STREXD only present on ARMv6K or higher
CMP $6, R0 // TODO(minux): how to differentiate ARMv6 with ARMv6K?
MOVW.CS $·armCompareAndSwapUint64(SB), R1
MOVW.CS R1, armCAS64(SB)
MOVW.CS R1, PC
MOVW.CS R1, R15
// we are out of luck, can only use runtime's emulated 64-bit cas
MOVW $·generalCAS64(SB), R1
MOVW R1, armCAS64(SB)
MOVW R1, PC
MOVW R1, R15
TEXT ·CompareAndSwapInt64(SB),NOSPLIT,$0
B ·CompareAndSwapUint64(SB)
@@ -145,7 +145,7 @@ TEXT ·CompareAndSwapInt64(SB),NOSPLIT,$0
TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$-4-21
MOVW armCAS64(SB), R0
CMP $0, R0
MOVW.NE R0, PC
MOVW.NE R0, R15 // R15 = hardware PC
B setupAndCallCAS64<>(SB)
TEXT ·AddInt64(SB),NOSPLIT,$0