Commit 403ab0f2, authored and committed by Keith Randall

internal/bytealg: move IndexByte assembly to the new bytealg package

Move the IndexByte function from the runtime to a new bytealg package.
The new package will eventually hold all the optimized assembly for
groveling through byte slices and strings. It seems a better home for
this code than randomly keeping it in runtime.

Once this is in, the next step is to move the other functions
(Compare, Equal, ...).

Update #19792

This change seems complicated enough that we might just declare
"not worth it" and abandon.  Opinions welcome.

The core assembly is all unchanged, except minor modifications where
the code reads cpu feature bits.

The wrapper functions have been cleaned up as they are now actually
checked by vet.

Change-Id: I9fa75bee5d85db3a65b3fd3b7997e60367523796
Reviewed-on: https://go-review.googlesource.com/98016
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
parent dcedcaa5
......@@ -6,8 +6,8 @@ package bytes
//go:noescape
// IndexByte returns the index of the first instance of c in s, or -1 if c is not present in s.
func IndexByte(s []byte, c byte) int // ../runtime/asm_$GOARCH.s
// IndexByte returns the index of the first instance of c in b, or -1 if c is not present in b.
func IndexByte(b []byte, c byte) int // in internal/bytealg
//go:noescape
......
......@@ -791,6 +791,11 @@ func runInstall(dir string, ch chan struct{}) {
if dir == "runtime" {
compile = append(compile, "-+", "-asmhdr", pathf("%s/go_asm.h", workdir))
}
if dir == "internal/bytealg" {
// TODO: why don't we generate go_asm.h for all packages
// that have any assembly?
compile = append(compile, "-asmhdr", pathf("%s/go_asm.h", workdir))
}
compile = append(compile, gofiles...)
run(path, CheckExit|ShowOutput, compile...)
......
......@@ -49,7 +49,9 @@ import (
func isRuntimeDepPkg(pkg string) bool {
switch pkg {
case "runtime",
"sync/atomic": // runtime may call to sync/atomic, due to go:linkname
"sync/atomic", // runtime may call to sync/atomic, due to go:linkname
"internal/bytealg", // for IndexByte
"internal/cpu": // for cpu features
return true
}
return strings.HasPrefix(pkg, "runtime/internal/") && !strings.HasSuffix(pkg, "_test")
......@@ -1874,7 +1876,6 @@ func assignAddress(ctxt *Link, sect *sym.Section, n int, s *sym.Symbol, va uint6
// Only break at outermost syms.
if ctxt.Arch.InFamily(sys.PPC64) && s.Outer == nil && ctxt.IsELF && ctxt.LinkMode == LinkExternal && va-sect.Vaddr+funcsize+maxSizeTrampolinesPPC64(s, isTramp) > 0x1c00000 {
// Set the length for the previous text section
sect.Length = va - sect.Vaddr
......
......@@ -12,8 +12,8 @@ go/types/scope.go: method WriteTo(w io.Writer, n int, recurse bool) should have
// Nothing much to do about cross-package assembly. Unfortunate.
runtime/asm_ARCHSUFF.s: [GOARCH] cannot check cross-package assembly function: call is in package reflect
runtime/asm_ARCHSUFF.s: [GOARCH] cannot check cross-package assembly function: Equal is in package bytes
runtime/asm_ARCHSUFF.s: [GOARCH] cannot check cross-package assembly function: IndexByte is in package bytes
runtime/asm_ARCHSUFF.s: [GOARCH] cannot check cross-package assembly function: IndexByte is in package strings
internal/bytealg/indexbyte_ARCHSUFF.s: [GOARCH] cannot check cross-package assembly function: IndexByte is in package bytes
internal/bytealg/indexbyte_ARCHSUFF.s: [GOARCH] cannot check cross-package assembly function: IndexByte is in package strings
// The write barrier is called directly by the compiler, so no Go def
runtime/asm_ARCHSUFF.s: [GOARCH] gcWriteBarrier: function gcWriteBarrier missing Go declaration
......
......@@ -24,7 +24,6 @@ runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: count
runtime/asm_amd64.s: [amd64] aeshashbody: function aeshashbody missing Go declaration
runtime/asm_amd64.s: [amd64] memeqbody: function memeqbody missing Go declaration
runtime/asm_amd64.s: [amd64] cmpbody: function cmpbody missing Go declaration
runtime/asm_amd64.s: [amd64] indexbytebody: function indexbytebody missing Go declaration
runtime/asm_amd64.s: [amd64] addmoduledata: function addmoduledata missing Go declaration
runtime/duff_amd64.s: [amd64] duffzero: function duffzero missing Go declaration
runtime/duff_amd64.s: [amd64] duffcopy: function duffcopy missing Go declaration
......
......@@ -23,7 +23,6 @@ runtime/asm_amd64p32.s: [amd64p32] rt0_go: unknown variable argv
runtime/asm_amd64p32.s: [amd64p32] memeqbody: function memeqbody missing Go declaration
runtime/asm_amd64p32.s: [amd64p32] cannot check cross-package assembly function: Compare is in package bytes
runtime/asm_amd64p32.s: [amd64p32] cmpbody: function cmpbody missing Go declaration
runtime/asm_amd64p32.s: [amd64p32] indexbytebody: function indexbytebody missing Go declaration
runtime/asm_amd64p32.s: [amd64p32] asmcgocall: RET without writing to 4-byte ret+8(FP)
runtime/asm_amd64p32.s: [amd64p32] stackcheck: function stackcheck missing Go declaration
runtime/asm_s390x.s: [s390x] abort: function abort missing Go declaration
runtime/asm_s390x.s: [s390x] memeqbody: function memeqbody missing Go declaration
runtime/asm_s390x.s: [s390x] memeqbodyclc: function memeqbodyclc missing Go declaration
runtime/asm_s390x.s: [s390x] indexbytebody: function indexbytebody missing Go declaration
runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: Compare is in package bytes
runtime/asm_s390x.s: [s390x] cmpbody: function cmpbody missing Go declaration
runtime/asm_s390x.s: [s390x] cmpbodyclc: function cmpbodyclc missing Go declaration
......
......@@ -36,14 +36,15 @@ var pkgDeps = map[string][]string{
// L0 is the lowest level, core, nearly unavoidable packages.
"errors": {},
"io": {"errors", "sync", "sync/atomic"},
"runtime": {"unsafe", "runtime/internal/atomic", "runtime/internal/sys"},
"runtime": {"unsafe", "runtime/internal/atomic", "runtime/internal/sys", "internal/cpu", "internal/bytealg"},
"runtime/internal/sys": {},
"runtime/internal/atomic": {"unsafe", "runtime/internal/sys"},
"internal/race": {"runtime", "unsafe"},
"sync": {"internal/race", "runtime", "sync/atomic", "unsafe"},
"sync/atomic": {"unsafe"},
"unsafe": {},
"internal/cpu": {"runtime"},
"internal/cpu": {},
"internal/bytealg": {"unsafe", "internal/cpu"},
"L0": {
"errors",
......@@ -54,6 +55,7 @@ var pkgDeps = map[string][]string{
"sync/atomic",
"unsafe",
"internal/cpu",
"internal/bytealg",
},
// L1 adds simple functions and strings processing,
......
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// func IndexByte(b []byte, c byte) int
// Scans b with REPN SCASB and stores in ret the index of the first byte
// equal to c, or -1 if the scan does not find one.
TEXT ·IndexByte(SB),NOSPLIT,$0-20
MOVL b_base+0(FP), SI
// CX bounds the number of bytes SCASB may examine.
MOVL b_len+4(FP), CX
// AL is the byte SCASB compares against.
MOVB c+12(FP), AL
// DI is the scan pointer; SI keeps the base for the index computation.
MOVL SI, DI
CLD; REPN; SCASB
// ZF set: skip the not-found return and continue at the SUBL below.
JZ 3(PC)
MOVL $-1, ret+16(FP)
RET
// SCASB steps DI past the byte it compared, so the index is DI-SI-1.
// (If nothing was scanned, DI == SI and this also yields -1.)
SUBL SI, DI
SUBL $1, DI
MOVL DI, ret+16(FP)
RET
// func IndexByteString(s string, c byte) int
// String variant of IndexByte: same REPN SCASB scan, but c and ret sit at
// offsets 8 and 12 because a string header has no capacity word.
TEXT ·IndexByteString(SB),NOSPLIT,$0-16
MOVL s_base+0(FP), SI
// CX bounds the number of bytes SCASB may examine.
MOVL s_len+4(FP), CX
// AL is the byte SCASB compares against.
MOVB c+8(FP), AL
// DI is the scan pointer; SI keeps the base for the index computation.
MOVL SI, DI
CLD; REPN; SCASB
// ZF set: skip the not-found return and continue at the SUBL below.
JZ 3(PC)
MOVL $-1, ret+12(FP)
RET
// SCASB steps DI past the byte it compared, so the index is DI-SI-1.
// (If nothing was scanned, DI == SI and this also yields -1.)
SUBL SI, DI
SUBL $1, DI
MOVL DI, ret+12(FP)
RET
// bytes.IndexByte, defined here under the bytes package's symbol name.
// It jumps straight to IndexByte in this package, which declares the same
// $0-20 frame, so the arguments and result slot are used in place.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
JMP ·IndexByte(SB)
// strings.IndexByte, defined here under the strings package's symbol name.
// It jumps straight to IndexByteString in this package, which declares the
// same $0-16 frame, so the arguments and result slot are used in place.
TEXT strings·IndexByte(SB),NOSPLIT,$0-16
JMP ·IndexByteString(SB)
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// func IndexByte(b []byte, c byte) int
// Loads the arguments into the registers indexbytebody expects
// (SI = data, BX = length, AL = byte sought, R8 = &ret) and jumps to it;
// the body stores the result through R8 and returns to our caller.
TEXT ·IndexByte(SB), NOSPLIT, $0-40
MOVQ b_base+0(FP), SI
MOVQ b_len+8(FP), BX
MOVB c+24(FP), AL
LEAQ ret+32(FP), R8
JMP indexbytebody<>(SB)
// func IndexByteString(s string, c byte) int
// String variant of IndexByte: c and ret sit at offsets 16 and 24 because a
// string header has no capacity word. Sets up SI = data, BX = length,
// AL = byte sought, R8 = &ret and jumps to indexbytebody.
TEXT ·IndexByteString(SB), NOSPLIT, $0-32
MOVQ s_base+0(FP), SI
MOVQ s_len+8(FP), BX
MOVB c+16(FP), AL
LEAQ ret+24(FP), R8
JMP indexbytebody<>(SB)
// Provide direct access to these functions from other packages.
// This is the equivalent of doing:
// package bytes
// func IndexByte(b []byte, c byte) int {
// return bytealg.IndexByte(b, c)
// }
// but involves no call overhead.
// TODO: remove this hack when midstack inlining is enabled?
//
// bytes.IndexByte: same register setup as IndexByte above
// (SI = data, BX = length, AL = byte sought, R8 = &ret), then a jump to
// indexbytebody, which stores the result through R8.
TEXT bytes·IndexByte(SB), NOSPLIT, $0-40
MOVQ b_base+0(FP), SI
MOVQ b_len+8(FP), BX
MOVB c+24(FP), AL
LEAQ ret+32(FP), R8
JMP indexbytebody<>(SB)
// strings.IndexByte, defined here under the strings package's symbol name.
// Same register setup as IndexByteString
// (SI = data, BX = length, AL = byte sought, R8 = &ret), then a jump to
// indexbytebody, which stores the result through R8.
TEXT strings·IndexByte(SB), NOSPLIT, $0-32
MOVQ s_base+0(FP), SI
MOVQ s_len+8(FP), BX
MOVB c+16(FP), AL
LEAQ ret+24(FP), R8
JMP indexbytebody<>(SB)
// input:
// SI: data
// BX: data len
// AL: byte sought
// R8: address to put result
// output:
// (R8): index of the first byte equal to AL, or -1 if there is none
TEXT indexbytebody<>(SB), NOSPLIT, $0
// Shuffle X0 around so that each byte contains
// the character we're looking for.
MOVD AX, X0
PUNPCKLBW X0, X0
PUNPCKLBW X0, X0
PSHUFL $0, X0, X0
// Fewer than 16 bytes cannot fill a 16-byte chunk; handled at small.
CMPQ BX, $16
JLT small
// DI walks the data; SI stays at the start for the final index computation.
MOVQ SI, DI
// More than 32 bytes: try the AVX2 path (it jumps back to sse if the
// CPU feature check fails).
CMPQ BX, $32
JA avx2
sse:
LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
JMP sseloopentry
sseloop:
// Move the next 16-byte chunk of the data into X1.
MOVOU (DI), X1
// Compare bytes in X0 to X1.
PCMPEQB X0, X1
// Take the top bit of each byte in X1 and put the result in DX.
PMOVMSKB X1, DX
// Find first set bit, if any.
BSFL DX, DX
JNZ ssesuccess
// Advance to next block.
ADDQ $16, DI
sseloopentry:
CMPQ DI, AX
JB sseloop
// Search the last 16-byte chunk. This chunk may overlap with the
// chunks we've already searched, but that's ok.
MOVQ AX, DI
MOVOU (AX), X1
PCMPEQB X0, X1
PMOVMSKB X1, DX
BSFL DX, DX
JNZ ssesuccess
failure:
MOVQ $-1, (R8)
RET
// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
SUBQ SI, DI // Compute offset of chunk within data.
ADDQ DX, DI // Add offset of byte within chunk.
MOVQ DI, (R8)
RET
// handle for lengths < 16
small:
// Empty input: nothing to search.
TESTQ BX, BX
JEQ failure
// Check if we'll load across a page boundary.
LEAQ 16(SI), AX
TESTW $0xff0, AX
JEQ endofpage
MOVOU (SI), X1 // Load data
PCMPEQB X0, X1 // Compare target byte with each byte in data.
PMOVMSKB X1, DX // Move result bits to integer register.
BSFL DX, DX // Find first set bit.
JZ failure // No set bit, failure.
CMPL DX, BX
JAE failure // Match is past end of data.
MOVQ DX, (R8)
RET
endofpage:
MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1.
PCMPEQB X0, X1 // Compare target byte with each byte in data.
PMOVMSKB X1, DX // Move result bits to integer register.
MOVL BX, CX
SHLL CX, DX
SHRL $16, DX // Shift desired bits down to bottom of register.
BSFL DX, DX // Find first set bit.
JZ failure // No set bit, failure.
MOVQ DX, (R8)
RET
avx2:
// Use the 32-byte loop only when internal/cpu's X86.HasAVX2 flag is 1;
// otherwise go back to the SSE loop.
CMPB internal∕cpu·X86+const_x86_HasAVX2(SB), $1
JNE sse
MOVD AX, X0
// R11 = address of the last 32 bytes.
LEAQ -32(SI)(BX*1), R11
// Y1 = byte sought replicated into every byte lane.
VPBROADCASTB X0, Y1
avx2_loop:
VMOVDQU (DI), Y2
VPCMPEQB Y1, Y2, Y3
VPTEST Y3, Y3
JNZ avx2success
ADDQ $32, DI
// NOTE(review): JLT is a signed comparison of addresses, whereas the SSE
// loop uses the unsigned JB — confirm this is intentional.
CMPQ DI, R11
JLT avx2_loop
// Search the last 32-byte chunk, which may overlap chunks already searched.
MOVQ R11, DI
VMOVDQU (DI), Y2
VPCMPEQB Y1, Y2, Y3
VPTEST Y3, Y3
JNZ avx2success
VZEROUPPER
MOVQ $-1, (R8)
RET
avx2success:
// Same index computation as ssesuccess: bit position within the chunk
// plus the chunk's offset from the start of the data.
VPMOVMSKB Y3, DX
BSFL DX, DX
SUBQ SI, DI
ADDQ DI, DX
MOVQ DX, (R8)
VZEROUPPER
RET
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// func IndexByte(b []byte, c byte) int
// Calls indexbytebody with SI = data, BX = length, AL = byte sought and
// stores the value it returns in AX into ret.
TEXT ·IndexByte(SB),NOSPLIT,$0-20
MOVL b_base+0(FP), SI
MOVL b_len+4(FP), BX
MOVB c+12(FP), AL
CALL indexbytebody<>(SB)
MOVL AX, ret+16(FP)
RET
// func IndexByteString(s string, c byte) int
// String variant of IndexByte: c is at offset 8 (no capacity word) while
// ret is still at offset 16. Calls indexbytebody with SI = data,
// BX = length, AL = byte sought and stores the returned AX into ret.
TEXT ·IndexByteString(SB),NOSPLIT,$0-20
MOVL s_base+0(FP), SI
MOVL s_len+4(FP), BX
MOVB c+8(FP), AL
CALL indexbytebody<>(SB)
MOVL AX, ret+16(FP)
RET
// bytes.IndexByte, defined here under the bytes package's symbol name.
// Identical to IndexByte above: calls indexbytebody and stores AX into ret.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
MOVL b_base+0(FP), SI
MOVL b_len+4(FP), BX
MOVB c+12(FP), AL
CALL indexbytebody<>(SB)
MOVL AX, ret+16(FP)
RET
// strings.IndexByte, defined here under the strings package's symbol name.
// Identical to IndexByteString above: calls indexbytebody and stores AX
// into ret.
TEXT strings·IndexByte(SB),NOSPLIT,$0-20
MOVL s_base+0(FP), SI
MOVL s_len+4(FP), BX
MOVB c+8(FP), AL
CALL indexbytebody<>(SB)
MOVL AX, ret+16(FP)
RET
// input:
// SI: data
// BX: data len
// AL: byte sought
// output:
// AX
//
// AX receives the index of the first byte equal to AL, or -1 if none is
// found. Inputs of 16 bytes or more are searched in three phases: a SCASB
// scan up to the first 16-byte boundary, an SSE loop over aligned 16-byte
// chunks, and a SCASB scan of the remaining tail.
TEXT indexbytebody<>(SB),NOSPLIT,$0
// DI is the scan cursor; SI keeps the base for the index computation.
MOVL SI, DI
CMPL BX, $16
JLT small
// round up to first 16-byte boundary
TESTL $15, SI
JZ aligned
// CX = number of bytes from SI to the next 16-byte boundary.
MOVL SI, CX
ANDL $~15, CX
ADDL $16, CX
// search the beginning
SUBL SI, CX
REPN; SCASB
JZ success
// DI is 16-byte aligned; get ready to search using SSE instructions
aligned:
// round down to last 16-byte boundary
MOVL BX, R11
ADDL SI, R11
ANDL $~15, R11
// shuffle X0 around so that each byte contains c
MOVD AX, X0
PUNPCKLBW X0, X0
PUNPCKLBW X0, X0
PSHUFL $0, X0, X0
JMP condition
sse:
// move the next 16-byte chunk of the buffer into X1
MOVO (DI), X1
// compare bytes in X0 to X1
PCMPEQB X0, X1
// take the top bit of each byte in X1 and put the result in DX
PMOVMSKB X1, DX
TESTL DX, DX
JNZ ssesuccess
ADDL $16, DI
condition:
// Loop until DI reaches the last 16-byte boundary (R11).
CMPL DI, R11
JNE sse
// search the end
// CX = (SI + BX) - R11, the number of bytes left after the last boundary.
MOVL SI, CX
ADDL BX, CX
SUBL R11, CX
// if CX == 0, the zero flag will be set and we'll end up
// returning a false success
JZ failure
REPN; SCASB
JZ success
failure:
MOVL $-1, AX
RET
// handle for lengths < 16
small:
MOVL BX, CX
REPN; SCASB
JZ success
MOVL $-1, AX
RET
// we've found the chunk containing the byte
// now just figure out which specific byte it is
ssesuccess:
// get the index of the least significant set bit
BSFW DX, DX
// Add the chunk's offset from the start of the data.
SUBL SI, DI
ADDL DI, DX
MOVL DX, AX
RET
success:
// SCASB steps DI past the byte it compared, so the index is DI-SI-1.
SUBL SI, DI
SUBL $1, DI
MOVL DI, AX
RET
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// func IndexByte(b []byte, c byte) int
// Byte-at-a-time scan: stores in ret the index of the first byte equal to
// c, or -1 if the end of b is reached first.
TEXT ·IndexByte(SB),NOSPLIT,$0-20
MOVW b_base+0(FP), R0
MOVW b_len+4(FP), R1
MOVBU c+12(FP), R2 // byte to find
MOVW R0, R4 // store base for later
ADD R0, R1 // end
_loop:
// Stop when the cursor R0 reaches the end address R1.
CMP R0, R1
B.EQ _notfound
// Load the byte at R0 into R3 and advance R0 by one.
MOVBU.P 1(R0), R3
CMP R2, R3
B.NE _loop
SUB $1, R0 // R0 will be one beyond the position we want
SUB R4, R0 // remove base
MOVW R0, ret+16(FP)
RET
_notfound:
MOVW $-1, R0
MOVW R0, ret+16(FP)
RET
// func IndexByteString(s string, c byte) int
// String variant of IndexByte: the same byte-at-a-time scan, with c and ret
// at offsets 8 and 12 because a string header has no capacity word.
TEXT ·IndexByteString(SB),NOSPLIT,$0-16
MOVW s_base+0(FP), R0
MOVW s_len+4(FP), R1
MOVBU c+8(FP), R2 // byte to find
MOVW R0, R4 // store base for later
ADD R0, R1 // end
_sib_loop:
// Stop when the cursor R0 reaches the end address R1.
CMP R0, R1
B.EQ _sib_notfound
// Load the byte at R0 into R3 and advance R0 by one.
MOVBU.P 1(R0), R3
CMP R2, R3
B.NE _sib_loop
SUB $1, R0 // R0 will be one beyond the position we want
SUB R4, R0 // remove base
MOVW R0, ret+12(FP)
RET
_sib_notfound:
MOVW $-1, R0
MOVW R0, ret+12(FP)
RET
// bytes.IndexByte, defined here under the bytes package's symbol name.
// It jumps straight to IndexByte in this package, which declares the same
// $0-20 frame, so the arguments and result slot are used in place.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
JMP ·IndexByte(SB)
// strings.IndexByte, defined here under the strings package's symbol name.
// It jumps straight to IndexByteString in this package, which declares the
// same $0-16 frame, so the arguments and result slot are used in place.
TEXT strings·IndexByte(SB),NOSPLIT,$0-16
JMP ·IndexByteString(SB)
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// func IndexByte(b []byte, c byte) int
// Loads the arguments into the registers indexbytebody expects
// (R0 = data, R2 = length, R1 = byte sought, R8 = &ret) and branches to it;
// the body stores the result through R8.
TEXT ·IndexByte(SB),NOSPLIT,$0-40
MOVD b_base+0(FP), R0
MOVD b_len+8(FP), R2
MOVBU c+24(FP), R1
MOVD $ret+32(FP), R8
B indexbytebody<>(SB)
// func IndexByteString(s string, c byte) int
// String variant of IndexByte: c and ret sit at offsets 16 and 24 because a
// string header has no capacity word. Sets up R0 = data, R2 = length,
// R1 = byte sought, R8 = &ret and branches to indexbytebody.
TEXT ·IndexByteString(SB),NOSPLIT,$0-32
MOVD s_base+0(FP), R0
MOVD s_len+8(FP), R2
MOVBU c+16(FP), R1
MOVD $ret+24(FP), R8
B indexbytebody<>(SB)
// bytes.IndexByte, defined here under the bytes package's symbol name.
// Same register setup as IndexByte in this file
// (R0 = data, R2 = length, R1 = byte sought, R8 = &ret), then a branch to
// indexbytebody.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
MOVD b_base+0(FP), R0
MOVD b_len+8(FP), R2
MOVBU c+24(FP), R1
MOVD $ret+32(FP), R8
B indexbytebody<>(SB)
// strings.IndexByte, defined here under the strings package's symbol name.
// Same register setup as IndexByteString in this file
// (R0 = data, R2 = length, R1 = byte sought, R8 = &ret), then a branch to
// indexbytebody.
TEXT strings·IndexByte(SB),NOSPLIT,$0-32
MOVD s_base+0(FP), R0
MOVD s_len+8(FP), R2
MOVBU c+16(FP), R1
MOVD $ret+24(FP), R8
B indexbytebody<>(SB)
// input:
// R0: data
// R1: byte to search
// R2: data len
// R8: address to put result
// output:
// (R8): index of the first matching byte, or -1 if there is none
TEXT indexbytebody<>(SB),NOSPLIT,$0
// Core algorithm:
// For each 32-byte chunk we calculate a 64-bit syndrome value,
// with two bits per byte. For each tuple, bit 0 is set if the
// relevant byte matched the requested character and bit 1 is
// not used (faster than using a 32bit syndrome). Since the bits
// in the syndrome reflect exactly the order in which things occur
// in the original string, counting trailing zeros allows to
// identify exactly which byte has matched.
// Empty input: nothing to search.
CBZ R2, fail
// Keep the original base in R11 for the final offset computation.
MOVD R0, R11
// Magic constant 0x40100401 allows us to identify
// which lane matches the requested byte.
// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
// Different bytes have different bit masks (i.e: 1, 4, 16, 64)
MOVD $0x40100401, R5
VMOV R1, V0.B16
// Work with aligned 32-byte chunks
BIC $0x1f, R0, R3
VMOV R5, V5.S4
// R9 = misalignment of the data within its 32-byte block,
// R10 = length modulo 32; both are used when masking partial blocks.
ANDS $0x1f, R0, R9
AND $0x1f, R2, R10
BEQ loop
// Input string is not 32-byte aligned. We calculate the
// syndrome value for the aligned 32 bytes block containing
// the first bytes and mask off the irrelevant part.
VLD1.P (R3), [V1.B16, V2.B16]
SUB $0x20, R9, R4
ADDS R4, R2, R2
VCMEQ V0.B16, V1.B16, V3.B16
VCMEQ V0.B16, V2.B16, V4.B16
VAND V5.B16, V3.B16, V3.B16
VAND V5.B16, V4.B16, V4.B16
VADDP V4.B16, V3.B16, V6.B16 // 256->128
VADDP V6.B16, V6.B16, V6.B16 // 128->64
VMOV V6.D[0], R6
// Clear the irrelevant lower bits
LSL $1, R9, R4
LSR R4, R6, R6
LSL R4, R6, R6
// The first block can also be the last
BLS masklast
// Have we found something already?
CBNZ R6, tail
loop:
VLD1.P (R3), [V1.B16, V2.B16]
SUBS $0x20, R2, R2
VCMEQ V0.B16, V1.B16, V3.B16
VCMEQ V0.B16, V2.B16, V4.B16
// If we're out of data we finish regardless of the result
BLS end
// Use a fast check for the termination condition
VORR V4.B16, V3.B16, V6.B16
VADDP V6.D2, V6.D2, V6.D2
VMOV V6.D[0], R6
// We're not out of data, loop if we haven't found the character
CBZ R6, loop
end:
// Termination condition found, let's calculate the syndrome value
VAND V5.B16, V3.B16, V3.B16
VAND V5.B16, V4.B16, V4.B16
VADDP V4.B16, V3.B16, V6.B16
VADDP V6.B16, V6.B16, V6.B16
VMOV V6.D[0], R6
// Only do the clear for the last possible block with less than 32 bytes
// Condition flags come from SUBS in the loop
BHS tail
masklast:
// Clear the irrelevant upper bits
ADD R9, R10, R4
AND $0x1f, R4, R4
SUB $0x20, R4, R4
NEG R4<<1, R4
LSL R4, R6, R6
LSR R4, R6, R6
tail:
// Check that we have found a character
CBZ R6, fail
// Count the trailing zeros using bit reversing
RBIT R6, R6
// Compensate the last post-increment
SUB $0x20, R3, R3
// And count the leading zeros
CLZ R6, R6
// R6 is twice the offset into the fragment
ADD R6>>1, R3, R0
// Compute the offset result
SUB R11, R0, R0
MOVD R0, (R8)
RET
fail:
MOVD $-1, R0
MOVD R0, (R8)
RET
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !386,!amd64,!amd64p32,!s390x,!arm,!arm64,!ppc64,!ppc64le,!mips,!mipsle,!mips64,!mips64le
package bytealg
import _ "unsafe" // for go:linkname
// IndexByte returns the index of the first instance of c in b,
// or -1 if c is not present in b.
func IndexByte(b []byte, c byte) int {
	for i := 0; i < len(b); i++ {
		if b[i] != c {
			continue
		}
		return i
	}
	return -1
}
// IndexByteString returns the index of the first byte of s equal to c,
// or -1 if no byte of s equals c. The scan is over bytes, not runes.
func IndexByteString(s string, c byte) int {
	for i, n := 0, len(s); i != n; i++ {
		if s[i] == c {
			return i
		}
	}
	return -1
}
//go:linkname bytes_IndexByte bytes.IndexByte
func bytes_IndexByte(b []byte, c byte) int {
	// Linknamed to bytes.IndexByte by the directive above this function;
	// the search loop is repeated here rather than calling IndexByte.
	for i := range b {
		if b[i] == c {
			return i
		}
	}
	return -1
}
//go:linkname strings_IndexByte strings.IndexByte
func strings_IndexByte(s string, c byte) int {
	// Linknamed to strings.IndexByte by the directive above this function;
	// the search loop is repeated here rather than calling IndexByteString.
	i := 0
	for i < len(s) {
		if s[i] == c {
			return i
		}
		i++
	}
	return -1
}
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build mips64 mips64le
#include "go_asm.h"
#include "textflag.h"
// func IndexByte(b []byte, c byte) int
// Byte-at-a-time scan: stores in ret the index of the first byte equal to
// c, or -1 if the end of b is reached first.
TEXT ·IndexByte(SB),NOSPLIT,$0-40
MOVV b_base+0(FP), R1
MOVV b_len+8(FP), R2
MOVBU c+24(FP), R3 // byte to find
MOVV R1, R4 // store base for later
ADDV R1, R2 // end
// Back the cursor up by one because the loop increments it before each test.
ADDV $-1, R1
loop:
ADDV $1, R1
BEQ R1, R2, notfound
MOVBU (R1), R5
BNE R3, R5, loop
SUBV R4, R1 // remove base
MOVV R1, ret+32(FP)
RET
notfound:
MOVV $-1, R1
MOVV R1, ret+32(FP)
RET
// func IndexByteString(s string, c byte) int
// String variant of IndexByte: the same byte-at-a-time scan, with c and ret
// at offsets 16 and 24 because a string header has no capacity word.
TEXT ·IndexByteString(SB),NOSPLIT,$0-32
MOVV s_base+0(FP), R1
MOVV s_len+8(FP), R2
MOVBU c+16(FP), R3 // byte to find
MOVV R1, R4 // store base for later
ADDV R1, R2 // end
// Back the cursor up by one because the loop increments it before each test.
ADDV $-1, R1
loop:
ADDV $1, R1
BEQ R1, R2, notfound
MOVBU (R1), R5
BNE R3, R5, loop
SUBV R4, R1 // remove base
MOVV R1, ret+24(FP)
RET
notfound:
MOVV $-1, R1
MOVV R1, ret+24(FP)
RET
// bytes.IndexByte, defined here under the bytes package's symbol name.
// It jumps straight to IndexByte in this package, which declares the same
// $0-40 frame, so the arguments and result slot are used in place.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
JMP ·IndexByte(SB)
// strings.IndexByte, defined here under the strings package's symbol name.
// It jumps straight to IndexByteString in this package, which declares the
// same $0-32 frame, so the arguments and result slot are used in place.
TEXT strings·IndexByte(SB),NOSPLIT,$0-32
JMP ·IndexByteString(SB)
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build mips mipsle
#include "go_asm.h"
#include "textflag.h"
// func IndexByte(b []byte, c byte) int
// Byte-at-a-time scan: stores in ret the index of the first byte equal to
// c, or -1 if the end of b is reached first.
TEXT ·IndexByte(SB),NOSPLIT,$0-20
MOVW b_base+0(FP), R1
MOVW b_len+4(FP), R2
MOVBU c+12(FP), R3 // byte to find
ADDU $1, R1, R4 // store base+1 for later
ADDU R1, R2 // end
loop:
// Stop when the cursor R1 reaches the end address R2.
BEQ R1, R2, notfound
// Load the current byte, then advance the cursor before comparing.
MOVBU (R1), R5
ADDU $1, R1
BNE R3, R5, loop
SUBU R4, R1 // R1 will be one beyond the position we want so remove (base+1)
MOVW R1, ret+16(FP)
RET
notfound:
MOVW $-1, R1
MOVW R1, ret+16(FP)
RET
// func IndexByteString(s string, c byte) int
// String variant of IndexByte: the same byte-at-a-time scan, with c and ret
// at offsets 8 and 12 because a string header has no capacity word.
TEXT ·IndexByteString(SB),NOSPLIT,$0-16
MOVW s_base+0(FP), R1
MOVW s_len+4(FP), R2
MOVBU c+8(FP), R3 // byte to find
ADDU $1, R1, R4 // store base+1 for later
ADDU R1, R2 // end
loop:
// Stop when the cursor R1 reaches the end address R2.
BEQ R1, R2, notfound
// Load the current byte, then advance the cursor before comparing.
MOVBU (R1), R5
ADDU $1, R1
BNE R3, R5, loop
SUBU R4, R1 // remove (base+1)
MOVW R1, ret+12(FP)
RET
notfound:
MOVW $-1, R1
MOVW R1, ret+12(FP)
RET
// bytes.IndexByte, defined here under the bytes package's symbol name.
// It jumps straight to IndexByte in this package, which declares the same
// $0-20 frame, so the arguments and result slot are used in place.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
JMP ·IndexByte(SB)
// strings.IndexByte, defined here under the strings package's symbol name.
// It jumps straight to IndexByteString in this package, which declares the
// same $0-16 frame, so the arguments and result slot are used in place.
TEXT strings·IndexByte(SB),NOSPLIT,$0-16
JMP ·IndexByteString(SB)
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build 386 amd64 amd64p32 s390x arm arm64 ppc64 ppc64le mips mipsle mips64 mips64le
package bytealg
import (
"internal/cpu"
"unsafe"
)
// Offsets into internal/cpu records for use in assembly
// TODO: find a better way to do this?

// x86_HasAVX2 is the byte offset of the HasAVX2 field within cpu.X86.
// The amd64 assembly refers to it as const_x86_HasAVX2.
const x86_HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)

// s390x_HasVX is the byte offset of the HasVX field within cpu.S390X.
// The s390x assembly refers to it as const_s390x_HasVX.
const s390x_HasVX = unsafe.Offsetof(cpu.S390X.HasVX)
// IndexByte returns the index of the first instance of c in b,
// or -1 if c is not present in b.
// It has no Go body here; the implementation is in assembly.
//go:noescape
func IndexByte(b []byte, c byte) int
// IndexByteString returns the index of the first byte of s equal to c,
// or -1 if no byte of s equals c.
// It has no Go body here; the implementation is in assembly.
//go:noescape
func IndexByteString(s string, c byte) int
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ppc64 ppc64le
#include "go_asm.h"
#include "textflag.h"
// func IndexByte(b []byte, c byte) int
// Loads the arguments into the registers indexbytebody expects and branches
// to it; the body stores the result through R14.
TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
MOVD b_base+0(FP), R3 // R3 = byte array pointer
MOVD b_len+8(FP), R4 // R4 = length
MOVBZ c+24(FP), R5 // R5 = byte
MOVD $ret+32(FP), R14 // R14 = &ret
BR indexbytebody<>(SB)
// func IndexByteString(s string, c byte) int
// String variant of IndexByte: c and ret sit at offsets 16 and 24 because a
// string header has no capacity word. Branches to indexbytebody, which
// stores the result through R14.
TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
MOVD s_base+0(FP), R3 // R3 = string
MOVD s_len+8(FP), R4 // R4 = length
MOVBZ c+16(FP), R5 // R5 = byte
MOVD $ret+24(FP), R14 // R14 = &ret
BR indexbytebody<>(SB)
// bytes.IndexByte, defined here under the bytes package's symbol name.
// Same register setup as IndexByte in this file, then a branch to
// indexbytebody, which stores the result through R14.
TEXT bytes·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
MOVD b_base+0(FP), R3 // R3 = byte array pointer
MOVD b_len+8(FP), R4 // R4 = length
MOVBZ c+24(FP), R5 // R5 = byte
MOVD $ret+32(FP), R14 // R14 = &ret
BR indexbytebody<>(SB)
// strings.IndexByte, defined here under the strings package's symbol name.
// Same register setup as IndexByteString in this file, then a branch to
// indexbytebody, which stores the result through R14.
TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
MOVD s_base+0(FP), R3 // R3 = string
MOVD s_len+8(FP), R4 // R4 = length
MOVBZ c+16(FP), R5 // R5 = byte
MOVD $ret+24(FP), R14 // R14 = &ret
BR indexbytebody<>(SB)
// input (as set up by the four entry points in this file):
// R3: data pointer
// R4: data length
// R5: byte sought
// R14: address to put result
// output:
// (R14): index of the first matching byte, or -1 if there is none
//
// Inputs of up to 32 bytes are handled a doubleword at a time at
// small_string; longer inputs are searched by doublewords until aligned,
// then by 16-byte vectors, with a 64-byte-per-iteration main loop.
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
DCBT (R3) // Prepare cache line.
MOVD R3,R17 // Save base address for calculating the index later.
RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8.
RLDIMI $8,R5,$48,R5 // Replicating the byte across the register.
ADD R4,R3,R7 // Last acceptable address in R7.
RLDIMI $16,R5,$32,R5
CMPU R4,$32 // Check if it's a small string (<32 bytes). Those will be processed differently.
MOVD $-1,R9
WORD $0x54661EB8 // Calculate padding in R6 (rlwinm r6,r3,3,26,28).
RLDIMI $32,R5,$0,R5
MOVD R7,R10 // Save last acceptable address in R10 for later.
ADD $-1,R7,R7
#ifdef GOARCH_ppc64le
SLD R6,R9,R9 // Prepare mask for Little Endian
#else
SRD R6,R9,R9 // Same for Big Endian
#endif
BLE small_string // Jump to the small string case if it's <32 bytes.
// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
// in V0, V1 and V10, then branch to the preloop.
ANDCC $63,R3,R11
BEQ CR0,qw_align
RLDICL $0,R3,$61,R11
MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
CMPB R12,R5,R3 // Check for a match.
AND R9,R3,R3 // Mask bytes below s_base
RLDICL $0,R7,$61,R6 // length-1
RLDICR $0,R7,$60,R7 // Last doubleword in R7
CMPU R3,$0,CR7 // If we have a match, jump to the final computation
BNE CR7,done
ADD $8,R8,R8
ADD $-8,R4,R4
ADD R4,R11,R4
// Check for quadword alignment
ANDCC $15,R8,R11
BEQ CR0,qw_align
// Not aligned, so handle the next doubleword
MOVD 0(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR7
BNE CR7,done
ADD $8,R8,R8
ADD $-8,R4,R4
// Either quadword aligned or 64-byte at this point. We can use LVX.
qw_align:
// Set up auxiliary data for the vectorized algorithm.
VSPLTISB $0,V0 // Replicate 0 across V0
VSPLTISB $3,V10 // Use V10 as control for VBPERMQ
MTVRD R5,V1
LVSL (R0+R0),V11
VSLB V11,V10,V10
VSPLTB $7,V1,V1 // Replicate byte across V1
CMPU R4, $64 // If len <= 64, don't use the vectorized loop
BLE tail
// We will load 4 quadwords per iteration in the loop, so check for
// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
ANDCC $63,R8,R11
BEQ CR0,preloop
// Not 64-byte aligned. Load one quadword at a time until aligned.
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6 // Check for byte in V4
BNE CR6,found_qw_align
ADD $16,R8,R8
ADD $-16,R4,R4
ANDCC $63,R8,R11
BEQ CR0,preloop
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6 // Check for byte in V4
BNE CR6,found_qw_align
ADD $16,R8,R8
ADD $-16,R4,R4
ANDCC $63,R8,R11
BEQ CR0,preloop
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6 // Check for byte in V4
BNE CR6,found_qw_align
ADD $-16,R4,R4
ADD $16,R8,R8
// 64-byte aligned. Prepare for the main loop.
preloop:
CMPU R4,$64
BLE tail // If len <= 64, don't use the vectorized loop
// We are now aligned to a 64-byte boundary. We will load 4 quadwords
// per loop iteration. The last doubleword is in R10, so our loop counter
// starts at (R10-R8)/64.
SUB R8,R10,R6
SRD $6,R6,R9 // Loop counter in R9
MOVD R9,CTR
MOVD $16,R11 // Load offsets for the vector loads
MOVD $32,R9
MOVD $48,R7
// Main loop we will load 64 bytes per iteration
loop:
LVX (R8+R0),V2 // Load 4 16-byte vectors
LVX (R11+R8),V3
LVX (R9+R8),V4
LVX (R7+R8),V5
VCMPEQUB V1,V2,V6 // Look for byte in each vector
VCMPEQUB V1,V3,V7
VCMPEQUB V1,V4,V8
VCMPEQUB V1,V5,V9
VOR V6,V7,V11 // Compress the result in a single vector
VOR V8,V9,V12
VOR V11,V12,V11
VCMPEQUBCC V0,V11,V11 // Check for byte
BGE CR6,found
ADD $64,R8,R8
BC 16,0,loop // bdnz loop
// Handle the trailing bytes or R4 <= 64
RLDICL $0,R6,$58,R4
tail:
CMPU R4,$0
BEQ notfound
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6
BNE CR6,found_qw_align
ADD $16,R8,R8
CMPU R4,$16,CR6
BLE CR6,notfound
ADD $-16,R4,R4
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6
BNE CR6,found_qw_align
ADD $16,R8,R8
CMPU R4,$16,CR6
BLE CR6,notfound
ADD $-16,R4,R4
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6
BNE CR6,found_qw_align
ADD $16,R8,R8
CMPU R4,$16,CR6
BLE CR6,notfound
ADD $-16,R4,R4
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6
BNE CR6,found_qw_align
notfound:
MOVD $-1,R3
MOVD R3,(R14)
RET
found:
// We will now compress the results into a single doubleword,
// so it can be moved to a GPR for the final index calculation.
// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
// first bit of each byte into bits 48-63.
VBPERMQ V6,V10,V6
VBPERMQ V7,V10,V7
VBPERMQ V8,V10,V8
VBPERMQ V9,V10,V9
// Shift each 16-bit component into its correct position for
// merging into a single doubleword.
#ifdef GOARCH_ppc64le
VSLDOI $2,V7,V7,V7
VSLDOI $4,V8,V8,V8
VSLDOI $6,V9,V9,V9
#else
VSLDOI $6,V6,V6,V6
VSLDOI $4,V7,V7,V7
VSLDOI $2,V8,V8,V8
#endif
// Merge V6-V9 into a single doubleword and move to a GPR.
VOR V6,V7,V11
VOR V8,V9,V4
VOR V4,V11,V4
MFVRD V4,R3
#ifdef GOARCH_ppc64le
ADD $-1,R3,R11
ANDN R3,R11,R11
POPCNTD R11,R11 // Count trailing zeros (Little Endian).
#else
CNTLZD R3,R11 // Count leading zeros (Big Endian).
#endif
ADD R8,R11,R3 // Calculate byte address
return:
// R3 holds the address of the match; subtracting the base saved in R17
// turns it into the index that is stored through R14.
SUB R17,R3
MOVD R3,(R14)
RET
found_qw_align:
// Use the same algorithm as above. Compress the result into
// a single doubleword and move it to a GPR for the final
// calculation.
VBPERMQ V6,V10,V6
#ifdef GOARCH_ppc64le
MFVRD V6,R3
ADD $-1,R3,R11
ANDN R3,R11,R11
POPCNTD R11,R11
#else
VSLDOI $6,V6,V6,V6
MFVRD V6,R3
CNTLZD R3,R11
#endif
ADD R8,R11,R3
// Accept the match only if its offset within the chunk is below the
// remaining length in R4.
CMPU R11,R4
BLT return
BR notfound
done:
// At this point, R3 has 0xFF in the same position as the byte we are
// looking for in the doubleword. Use that to calculate the exact index
// of the byte.
#ifdef GOARCH_ppc64le
ADD $-1,R3,R11
ANDN R3,R11,R11
POPCNTD R11,R11 // Count trailing zeros (Little Endian).
#else
CNTLZD R3,R11 // Count leading zeros (Big Endian).
#endif
CMPU R8,R7 // Check if we are at the last doubleword.
SRD $3,R11 // Convert trailing zeros to bytes.
ADD R11,R8,R3
CMPU R11,R6,CR7 // If at the last doubleword, check the byte offset.
BNE return
BLE CR7,return
BR notfound
small_string:
// We unroll this loop for better performance.
CMPU R4,$0 // Check for length=0
BEQ notfound
MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
CMPB R12,R5,R3 // Check for a match.
AND R9,R3,R3 // Mask bytes below s_base.
CMPU R3,$0,CR7 // If we have a match, jump to the final computation.
RLDICL $0,R7,$61,R6 // length-1
RLDICR $0,R7,$60,R7 // Last doubleword in R7.
CMPU R8,R7
BNE CR7,done
BEQ notfound // Hit length.
MOVDU 8(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR6
CMPU R8,R7
BNE CR6,done
BEQ notfound
MOVDU 8(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR6
CMPU R8,R7
BNE CR6,done
BEQ notfound
MOVDU 8(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR6
CMPU R8,R7
BNE CR6,done
BEQ notfound
MOVDU 8(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR6
BNE CR6,done
BR notfound
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// func IndexByte(b []byte, c byte) int
// Loads the arguments into the registers indexbytebody expects and branches
// to it; the body stores the result through R2.
TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
MOVD b_base+0(FP), R3// b_base => R3
MOVD b_len+8(FP), R4 // b_len => R4
MOVBZ c+24(FP), R5 // c => R5
MOVD $ret+32(FP), R2 // &ret => R2
BR indexbytebody<>(SB)
// func IndexByteString(s string, c byte) int
// String variant of IndexByte: c and ret sit at offsets 16 and 24 because a
// string header has no capacity word. Branches to indexbytebody, which
// stores the result through R2.
TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
MOVD s_base+0(FP), R3// s_base => R3
MOVD s_len+8(FP), R4 // s_len => R4
MOVBZ c+16(FP), R5 // c => R5
MOVD $ret+24(FP), R2 // &ret => R2
BR indexbytebody<>(SB)
// bytes.IndexByte, defined here under the bytes package's symbol name.
// Same register setup as IndexByte in this file, then a branch to
// indexbytebody, which stores the result through R2.
TEXT bytes·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
MOVD b_base+0(FP), R3// b_base => R3
MOVD b_len+8(FP), R4 // b_len => R4
MOVBZ c+24(FP), R5 // c => R5
MOVD $ret+32(FP), R2 // &ret => R2
BR indexbytebody<>(SB)
// strings.IndexByte, defined here under the strings package's symbol name.
// Same register setup as IndexByteString in this file, then a branch to
// indexbytebody, which stores the result through R2.
TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
MOVD s_base+0(FP), R3// s_base => R3
MOVD s_len+8(FP), R4 // s_len => R4
MOVBZ c+16(FP), R5 // c => R5
MOVD $ret+24(FP), R2 // &ret => R2
BR indexbytebody<>(SB)
// input:
// R3: s
// R4: s_len
// R5: c -- byte sought
// R2: &ret -- address to put index into
// output:
// 0(R2): index of the first byte equal to c, or -1 if there is none
//
// Inputs shorter than 16 bytes use a byte loop. Longer inputs use the
// vector implementation when internal/cpu's S390X.HasVX flag is non-zero
// and the SRST instruction otherwise.
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0
CMPBEQ R4, $0, notfound
MOVD R3, R6 // store base for later
ADD R3, R4, R8 // the address after the end of the string
//if the length is small, use loop; otherwise, use vector or srst search
CMPBGE R4, $16, large
residual:
// Byte loop over [R3, R8); also used for the tail left by the vector loop.
CMPBEQ R3, R8, notfound
MOVBZ 0(R3), R7
LA 1(R3), R3
CMPBNE R7, R5, residual
found:
// R3 is one past the matching byte: index = R3 - base - 1.
SUB R6, R3
SUB $1, R3
MOVD R3, 0(R2)
RET
notfound:
MOVD $-1, 0(R2)
RET
large:
MOVBZ internal∕cpu·S390X+const_s390x_HasVX(SB), R1
CMPBNE R1, $0, vectorimpl
srstimpl: // no vector facility
MOVBZ R5, R0 // c needs to be in R0, leave until last minute as currently R0 is expected to be 0
srstloop:
WORD $0xB25E0083 // srst %r8, %r3 (search the range [R3, R8))
BVS srstloop // interrupted - continue
BGT notfoundr0
foundr0:
XOR R0, R0 // reset R0
SUB R6, R8 // remove base
MOVD R8, 0(R2)
RET
notfoundr0:
XOR R0, R0 // reset R0
MOVD $-1, 0(R2)
RET
vectorimpl:
//if the address is not 16byte aligned, use loop for the header
MOVD R3, R8
AND $15, R8
CMPBGT R8, $0, notaligned
aligned:
// R8 = end address (base + length); R7 = end rounded down to 16 bytes,
// which is where the vector loop stops.
ADD R6, R4, R8
MOVD R8, R7
AND $-16, R7
// replicate c across V17
VLVGB $0, R5, V19
VREPB $0, V19, V17
vectorloop:
CMPBGE R3, R7, residual
VL 0(R3), V16 // load string to be searched into V16
ADD $16, R3
VFEEBS V16, V17, V18 // search V17 in V16 and set conditional code accordingly
BVS vectorloop
// when vector search found c in the string
VLGVB $7, V18, R7 // load 7th element of V18 containing index into R7
// Undo the post-load advance, then index = (R3 - base) + offset in chunk.
SUB $16, R3
SUB R6, R3
ADD R3, R7
MOVD R7, 0(R2)
RET
notaligned:
// R8 = next 16-byte boundary above R3; scan byte-wise up to it.
MOVD R3, R8
AND $-16, R8
ADD $16, R8
notalignedloop:
CMPBEQ R3, R8, aligned
MOVBZ 0(R3), R7
LA 1(R3), R3
CMPBNE R7, R5, notalignedloop
BR found
......@@ -75,3 +75,11 @@ type arm64 struct {
HasATOMICS bool
_ [CacheLineSize]byte
}
// S390X holds the s390x CPU feature flags.
var S390X s390x
// s390x describes the s390x CPU features tracked by this package.
// The flag is surrounded by CacheLineSize bytes of blank padding on each
// side — presumably to keep it off cache lines shared with neighbouring
// data; confirm against the other feature structs in this file.
type s390x struct {
_ [CacheLineSize]byte
HasVX bool // vector facility. Note: the runtime sets this when it processes auxv records.
_ [CacheLineSize]byte
}
......@@ -1495,34 +1495,6 @@ TEXT bytes·Compare(SB),NOSPLIT,$0-28
LEAL ret+24(FP), AX
JMP runtime·cmpbody(SB)
// bytes·IndexByte (386) scans s_len bytes starting at s for the byte c with
// REPN SCASB and stores the index of the match, or -1, in ret.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
MOVL s+0(FP), SI
MOVL s_len+4(FP), CX
MOVB c+12(FP), AL
MOVL SI, DI // SI keeps the base; DI is advanced by the scan
CLD; REPN; SCASB
JZ 3(PC) // zero flag set: skip over the not-found return below
MOVL $-1, ret+16(FP)
RET
// index = DI - base - 1
SUBL SI, DI
SUBL $1, DI
MOVL DI, ret+16(FP)
RET
// strings·IndexByte (386) scans s_len bytes starting at s for the byte c with
// REPN SCASB and stores the index of the match, or -1, in ret.
TEXT strings·IndexByte(SB),NOSPLIT,$0-16
MOVL s+0(FP), SI
MOVL s_len+4(FP), CX
MOVB c+8(FP), AL
MOVL SI, DI // SI keeps the base; DI is advanced by the scan
CLD; REPN; SCASB
JZ 3(PC) // zero flag set: skip over the not-found return below
MOVL $-1, ret+12(FP)
RET
// index = DI - base - 1
SUBL SI, DI
SUBL $1, DI
MOVL DI, ret+12(FP)
RET
// input:
// SI = a
// DI = b
......
......@@ -1995,148 +1995,6 @@ success:
MOVQ DI, (R11)
RET
// bytes·IndexByte (amd64) loads its arguments into the registers expected by
// runtime·indexbytebody and tail-jumps to it; the body writes the result
// through R8.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
MOVQ s+0(FP), SI // data
MOVQ s_len+8(FP), BX // data len
MOVB c+24(FP), AL // byte sought
LEAQ ret+32(FP), R8 // address to put result
JMP runtime·indexbytebody(SB)
// strings·IndexByte (amd64) loads its arguments into the registers expected by
// runtime·indexbytebody and tail-jumps to it; the body writes the result
// through R8.
TEXT strings·IndexByte(SB),NOSPLIT,$0-32
MOVQ s+0(FP), SI // data
MOVQ s_len+8(FP), BX // data len
MOVB c+16(FP), AL // byte sought
LEAQ ret+24(FP), R8 // address to put result
JMP runtime·indexbytebody(SB)
// indexbytebody stores at (R8) the index of the first byte equal to AL in the
// BX bytes starting at SI, or -1 if there is no such byte.
// input:
// SI: data
// BX: data len
// AL: byte sought
// R8: address to put result
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
// Shuffle X0 around so that each byte contains
// the character we're looking for.
MOVD AX, X0
PUNPCKLBW X0, X0
PUNPCKLBW X0, X0
PSHUFL $0, X0, X0
// Fewer than 16 bytes are handled separately.
CMPQ BX, $16
JLT small
MOVQ SI, DI
// More than 32 bytes: try the AVX2 loop.
CMPQ BX, $32
JA avx2
sse:
LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
JMP sseloopentry
sseloop:
// Move the next 16-byte chunk of the data into X1.
MOVOU (DI), X1
// Compare bytes in X0 to X1.
PCMPEQB X0, X1
// Take the top bit of each byte in X1 and put the result in DX.
PMOVMSKB X1, DX
// Find first set bit, if any.
BSFL DX, DX
JNZ ssesuccess
// Advance to next block.
ADDQ $16, DI
sseloopentry:
CMPQ DI, AX
JB sseloop
// Search the last 16-byte chunk. This chunk may overlap with the
// chunks we've already searched, but that's ok.
MOVQ AX, DI
MOVOU (AX), X1
PCMPEQB X0, X1
PMOVMSKB X1, DX
BSFL DX, DX
JNZ ssesuccess
failure:
MOVQ $-1, (R8)
RET
// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
SUBQ SI, DI // Compute offset of chunk within data.
ADDQ DX, DI // Add offset of byte within chunk.
MOVQ DI, (R8)
RET
// handle for lengths < 16
small:
TESTQ BX, BX
JEQ failure
// Check if we'll load across a page boundary.
LEAQ 16(SI), AX
TESTW $0xff0, AX
JEQ endofpage
MOVOU (SI), X1 // Load data
PCMPEQB X0, X1 // Compare target byte with each byte in data.
PMOVMSKB X1, DX // Move result bits to integer register.
BSFL DX, DX // Find first set bit.
JZ failure // No set bit, failure.
CMPL DX, BX
JAE failure // Match is past end of data.
MOVQ DX, (R8)
RET
endofpage:
MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1.
PCMPEQB X0, X1 // Compare target byte with each byte in data.
PMOVMSKB X1, DX // Move result bits to integer register.
MOVL BX, CX
SHLL CX, DX
SHRL $16, DX // Shift desired bits down to bottom of register.
BSFL DX, DX // Find first set bit.
JZ failure // No set bit, failure.
MOVQ DX, (R8)
RET
avx2:
// Fall back to the SSE loop unless runtime·support_avx2 is set.
CMPB runtime·support_avx2(SB), $1
JNE sse
MOVD AX, X0
LEAQ -32(SI)(BX*1), R11 // R11 = address of last 32 bytes
VPBROADCASTB X0, Y1
avx2_loop:
// Load the next 32-byte chunk, compare it against Y1 and
// leave the loop if the comparison result is non-zero.
VMOVDQU (DI), Y2
VPCMPEQB Y1, Y2, Y3
VPTEST Y3, Y3
JNZ avx2success
ADDQ $32, DI
CMPQ DI, R11
JLT avx2_loop
// Search the last 32-byte chunk, loaded from R11.
MOVQ R11, DI
VMOVDQU (DI), Y2
VPCMPEQB Y1, Y2, Y3
VPTEST Y3, Y3
JNZ avx2success
VZEROUPPER
MOVQ $-1, (R8)
RET
// The chunk loaded from DI contains the byte:
// result = (DI - SI) + index of the first set bit of the compare mask.
avx2success:
VPMOVMSKB Y3, DX
BSFL DX, DX
SUBQ SI, DI
ADDQ DI, DX
MOVQ DX, (R8)
VZEROUPPER
RET
TEXT bytes·Equal(SB),NOSPLIT,$0-49
MOVQ a_len+8(FP), BX
MOVQ b_len+32(FP), CX
......
......@@ -837,113 +837,6 @@ allsame:
LEAQ -1(CX)(AX*2), AX // 1,0,-1 result
RET
// bytes·IndexByte (amd64p32) loads its arguments into SI, BX and AL, calls
// runtime·indexbytebody and stores the AX result in ret.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
MOVL s+0(FP), SI // data
MOVL s_len+4(FP), BX // data len
MOVB c+12(FP), AL // byte sought
CALL runtime·indexbytebody(SB)
MOVL AX, ret+16(FP)
RET
// strings·IndexByte (amd64p32) loads its arguments into SI, BX and AL, calls
// runtime·indexbytebody and stores the AX result in ret.
TEXT strings·IndexByte(SB),NOSPLIT,$0-20
MOVL s+0(FP), SI // data
MOVL s_len+4(FP), BX // data len
MOVB c+8(FP), AL // byte sought
CALL runtime·indexbytebody(SB)
MOVL AX, ret+16(FP)
RET
// indexbytebody returns in AX the index of the first byte equal to AL in the
// BX bytes starting at SI, or -1 if there is no such byte.
// input:
// SI: data
// BX: data len
// AL: byte sought
// output:
// AX
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
MOVL SI, DI // SI keeps the base; DI is the scan position
CMPL BX, $16
JLT small
// round up to first 16-byte boundary
TESTL $15, SI
JZ aligned
MOVL SI, CX
ANDL $~15, CX
ADDL $16, CX
// search the beginning
SUBL SI, CX // CX = number of bytes before the boundary
REPN; SCASB
JZ success
// DI is 16-byte aligned; get ready to search using SSE instructions
aligned:
// round down to last 16-byte boundary
MOVL BX, R11
ADDL SI, R11
ANDL $~15, R11
// shuffle X0 around so that each byte contains c
MOVD AX, X0
PUNPCKLBW X0, X0
PUNPCKLBW X0, X0
PSHUFL $0, X0, X0
JMP condition
sse:
// move the next 16-byte chunk of the buffer into X1
MOVO (DI), X1
// compare bytes in X0 to X1
PCMPEQB X0, X1
// take the top bit of each byte in X1 and put the result in DX
PMOVMSKB X1, DX
TESTL DX, DX
JNZ ssesuccess
ADDL $16, DI
condition:
CMPL DI, R11
JNE sse
// search the end
MOVL SI, CX
ADDL BX, CX
SUBL R11, CX // CX = number of bytes after the last 16-byte boundary
// if CX == 0, the zero flag will be set and we'll end up
// returning a false success
JZ failure
REPN; SCASB
JZ success
failure:
MOVL $-1, AX
RET
// handle for lengths < 16
small:
MOVL BX, CX
REPN; SCASB
JZ success
MOVL $-1, AX
RET
// we've found the chunk containing the byte
// now just figure out which specific byte it is
ssesuccess:
// get the index of the least significant set bit
BSFW DX, DX
SUBL SI, DI // offset of the chunk within the data
ADDL DI, DX // add the index within the chunk
MOVL DX, AX
RET
// reached after a byte scan found a match: index = DI - base - 1
success:
SUBL SI, DI
SUBL $1, DI
MOVL DI, AX
RET
TEXT bytes·Equal(SB),NOSPLIT,$0-25
MOVL a_len+4(FP), BX
MOVL b_len+16(FP), CX
......
......@@ -925,54 +925,6 @@ equal:
MOVBU R0, ret+24(FP)
RET
// bytes·IndexByte (arm) walks the s_len bytes starting at s one at a time and
// stores in ret the index of the first byte equal to c, or -1 if none matches.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
MOVW s+0(FP), R0
MOVW s_len+4(FP), R1
MOVBU c+12(FP), R2 // byte to find
MOVW R0, R4 // store base for later
ADD R0, R1 // end
_loop:
CMP R0, R1
B.EQ _notfound // reached the end without a match
MOVBU.P 1(R0), R3 // load the current byte, post-incrementing R0
CMP R2, R3
B.NE _loop
SUB $1, R0 // R0 will be one beyond the position we want
SUB R4, R0 // remove base
MOVW R0, ret+16(FP)
RET
_notfound:
MOVW $-1, R0
MOVW R0, ret+16(FP)
RET
// strings·IndexByte (arm) walks the s_len bytes starting at s one at a time and
// stores in ret the index of the first byte equal to c, or -1 if none matches.
TEXT strings·IndexByte(SB),NOSPLIT,$0-16
MOVW s+0(FP), R0
MOVW s_len+4(FP), R1
MOVBU c+8(FP), R2 // byte to find
MOVW R0, R4 // store base for later
ADD R0, R1 // end
_sib_loop:
CMP R0, R1
B.EQ _sib_notfound // reached the end without a match
MOVBU.P 1(R0), R3 // load the current byte, post-incrementing R0
CMP R2, R3
B.NE _sib_loop
SUB $1, R0 // R0 will be one beyond the position we want
SUB R4, R0 // remove base
MOVW R0, ret+12(FP)
RET
_sib_notfound:
MOVW $-1, R0
MOVW R0, ret+12(FP)
RET
// return0 sets R0 to 0 and returns.
TEXT runtime·return0(SB),NOSPLIT,$0
MOVW $0, R0
RET
......
......@@ -800,126 +800,6 @@ samebytes:
//
// functions for other packages
//
// bytes·IndexByte (arm64) loads its arguments into the registers expected by
// runtime·indexbytebody<> and branches to it; the body writes the result
// through R8.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
MOVD b+0(FP), R0 // data
MOVD b_len+8(FP), R2 // data len
MOVBU c+24(FP), R1 // byte to search
MOVD $ret+32(FP), R8 // address to put result
B runtime·indexbytebody<>(SB)
// strings·IndexByte (arm64) loads its arguments into the registers expected by
// runtime·indexbytebody<> and branches to it; the body writes the result
// through R8.
TEXT strings·IndexByte(SB),NOSPLIT,$0-32
MOVD s+0(FP), R0 // data
MOVD s_len+8(FP), R2 // data len
MOVBU c+16(FP), R1 // byte to search
MOVD $ret+24(FP), R8 // address to put result
B runtime·indexbytebody<>(SB)
// indexbytebody stores at (R8) the offset from R0 of the first byte equal to
// R1 in the R2 bytes starting at R0, or -1 if no byte matches or R2 is zero.
// input:
// R0: data
// R1: byte to search
// R2: data len
// R8: address to put result
TEXT runtime·indexbytebody<>(SB),NOSPLIT,$0
// Core algorithm:
// For each 32-byte chunk we calculate a 64-bit syndrome value,
// with two bits per byte. For each tuple, bit 0 is set if the
// relevant byte matched the requested character and bit 1 is
// not used (faster than using a 32bit syndrome). Since the bits
// in the syndrome reflect exactly the order in which things occur
// in the original string, counting trailing zeros allows to
// identify exactly which byte has matched.
CBZ R2, fail
// Keep the original data pointer for the final offset computation
MOVD R0, R11
// Magic constant 0x40100401 allows us to identify
// which lane matches the requested byte.
// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
// Different bytes have different bit masks (i.e: 1, 4, 16, 64)
MOVD $0x40100401, R5
VMOV R1, V0.B16
// Work with aligned 32-byte chunks
BIC $0x1f, R0, R3
VMOV R5, V5.S4
ANDS $0x1f, R0, R9
AND $0x1f, R2, R10
BEQ loop
// Input string is not 32-byte aligned. We calculate the
// syndrome value for the aligned 32 bytes block containing
// the first bytes and mask off the irrelevant part.
VLD1.P (R3), [V1.B16, V2.B16]
SUB $0x20, R9, R4
ADDS R4, R2, R2
VCMEQ V0.B16, V1.B16, V3.B16
VCMEQ V0.B16, V2.B16, V4.B16
VAND V5.B16, V3.B16, V3.B16
VAND V5.B16, V4.B16, V4.B16
VADDP V4.B16, V3.B16, V6.B16 // 256->128
VADDP V6.B16, V6.B16, V6.B16 // 128->64
VMOV V6.D[0], R6
// Clear the irrelevant lower bits
LSL $1, R9, R4
LSR R4, R6, R6
LSL R4, R6, R6
// The first block can also be the last
BLS masklast
// Have we found something already?
CBNZ R6, tail
loop:
VLD1.P (R3), [V1.B16, V2.B16]
SUBS $0x20, R2, R2
VCMEQ V0.B16, V1.B16, V3.B16
VCMEQ V0.B16, V2.B16, V4.B16
// If we're out of data we finish regardless of the result
BLS end
// Use a fast check for the termination condition
VORR V4.B16, V3.B16, V6.B16
VADDP V6.D2, V6.D2, V6.D2
VMOV V6.D[0], R6
// We're not out of data, loop if we haven't found the character
CBZ R6, loop
end:
// Termination condition found, let's calculate the syndrome value
VAND V5.B16, V3.B16, V3.B16
VAND V5.B16, V4.B16, V4.B16
VADDP V4.B16, V3.B16, V6.B16
VADDP V6.B16, V6.B16, V6.B16
VMOV V6.D[0], R6
// Only do the clear for the last possible block with less than 32 bytes
// Condition flags come from SUBS in the loop
BHS tail
masklast:
// Clear the irrelevant upper bits
ADD R9, R10, R4
AND $0x1f, R4, R4
SUB $0x20, R4, R4
NEG R4<<1, R4
LSL R4, R6, R6
LSR R4, R6, R6
tail:
// Check that we have found a character
CBZ R6, fail
// Count the trailing zeros using bit reversing
RBIT R6, R6
// Compensate the last post-increment
SUB $0x20, R3, R3
// And count the leading zeros
CLZ R6, R6
// R6 is twice the offset into the fragment
ADD R6>>1, R3, R0
// Compute the offset result
SUB R11, R0, R0
MOVD R0, (R8)
RET
// No match, or empty input: store -1
fail:
MOVD $-1, R0
MOVD R0, (R8)
RET
// Equal(a, b []byte) bool
TEXT bytes·Equal(SB),NOSPLIT,$0-49
......
......@@ -697,52 +697,6 @@ equal:
MOVB R1, ret+48(FP)
RET
// bytes·IndexByte (mips64) walks the s_len bytes starting at s one at a time
// and stores in ret the index of the first byte equal to c, or -1 if none
// matches.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
MOVV s+0(FP), R1
MOVV s_len+8(FP), R2
MOVBU c+24(FP), R3 // byte to find
MOVV R1, R4 // store base for later
ADDV R1, R2 // end
ADDV $-1, R1 // start one before the data; the loop increments before each test
loop:
ADDV $1, R1
BEQ R1, R2, notfound // reached the end without a match
MOVBU (R1), R5
BNE R3, R5, loop
SUBV R4, R1 // remove base
MOVV R1, ret+32(FP)
RET
notfound:
MOVV $-1, R1
MOVV R1, ret+32(FP)
RET
// strings·IndexByte (mips64) walks the s_len bytes starting at s_base one at a
// time and stores in ret the index of the first byte equal to c, or -1 if
// none matches.
// The argument references use the parameter names of the Go declaration
// (func IndexByte(s string, c byte) int) so that vet's asmdecl check
// accepts them; the names do not affect the generated code.
TEXT strings·IndexByte(SB),NOSPLIT,$0-32
MOVV s_base+0(FP), R1
MOVV s_len+8(FP), R2
MOVBU c+16(FP), R3 // byte to find
MOVV R1, R4 // store base for later
ADDV R1, R2 // end
ADDV $-1, R1 // start one before the data; the loop increments before each test
loop:
ADDV $1, R1
BEQ R1, R2, notfound // reached the end without a match
MOVBU (R1), R5
BNE R3, R5, loop
SUBV R4, R1 // remove base
MOVV R1, ret+24(FP)
RET
notfound:
MOVV $-1, R1
MOVV R1, ret+24(FP)
RET
// return0 sets R1 to 0 and returns.
TEXT runtime·return0(SB), NOSPLIT, $0
MOVW $0, R1
RET
......
......@@ -712,50 +712,6 @@ equal:
MOVB R1, ret+24(FP)
RET
// bytes·IndexByte (mips) walks the s_len bytes starting at s one at a time and
// stores in ret the index of the first byte equal to c, or -1 if none matches.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
MOVW s+0(FP), R1
MOVW s_len+4(FP), R2
MOVBU c+12(FP), R3 // byte to find
ADDU $1, R1, R4 // store base+1 for later
ADDU R1, R2 // end
loop:
BEQ R1, R2, notfound // reached the end without a match
MOVBU (R1), R5
ADDU $1, R1
BNE R3, R5, loop
SUBU R4, R1 // R1 will be one beyond the position we want so remove (base+1)
MOVW R1, ret+16(FP)
RET
notfound:
MOVW $-1, R1
MOVW R1, ret+16(FP)
RET
// strings·IndexByte (mips) walks the s_len bytes starting at s_base one at a
// time and stores in ret the index of the first byte equal to c, or -1 if
// none matches.
TEXT strings·IndexByte(SB),NOSPLIT,$0-16
MOVW s_base+0(FP), R1
MOVW s_len+4(FP), R2
MOVBU c+8(FP), R3 // byte to find
ADDU $1, R1, R4 // store base+1 for later
ADDU R1, R2 // end
loop:
BEQ R1, R2, notfound // reached the end without a match
MOVBU (R1), R5
ADDU $1, R1
BNE R3, R5, loop
SUBU R4, R1 // R1 is one beyond the position we want, so remove (base+1)
MOVW R1, ret+12(FP)
RET
notfound:
MOVW $-1, R1
MOVW R1, ret+12(FP)
RET
TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
MOVW s1_base+0(FP), R3
MOVW s1_len+4(FP), R1
......
......@@ -1068,308 +1068,6 @@ equal:
MOVBZ R3,ret+48(FP)
RET
// bytes·IndexByte (ppc64) loads its arguments into R3, R4, R5 and R14 and
// branches to runtime·indexbytebody<>, which writes the result through R14.
TEXT bytes·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
MOVD s+0(FP), R3 // R3 = byte array pointer
MOVD s_len+8(FP), R4 // R4 = length
MOVBZ c+24(FP), R5 // R5 = byte
MOVD $ret+32(FP), R14 // R14 = &ret
BR runtime·indexbytebody<>(SB)
// strings·IndexByte (ppc64) loads its arguments into R3, R4, R5 and R14 and
// branches to runtime·indexbytebody<>, which writes the result through R14.
TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
MOVD s+0(FP), R3 // R3 = string
MOVD s_len+8(FP), R4 // R4 = length
MOVBZ c+16(FP), R5 // R5 = byte
MOVD $ret+24(FP), R14 // R14 = &ret
BR runtime·indexbytebody<>(SB)
// indexbytebody searches for a byte and stores the resulting index, or -1 if
// the byte is not found, at (R14).
// input (as set up by the IndexByte wrappers):
// R3: data pointer
// R4: length
// R5: byte sought
// R14: address to put result
TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
DCBT (R3) // Prepare cache line.
MOVD R3,R17 // Save base address for calculating the index later.
RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8.
RLDIMI $8,R5,$48,R5 // Replicating the byte across the register.
ADD R4,R3,R7 // Last acceptable address in R7.
RLDIMI $16,R5,$32,R5
CMPU R4,$32 // Check if it's a small string (<=32 bytes). Those will be processed differently.
MOVD $-1,R9
WORD $0x54661EB8 // Calculate padding in R6 (rlwinm r6,r3,3,26,28).
RLDIMI $32,R5,$0,R5
MOVD R7,R10 // Save last acceptable address in R10 for later.
ADD $-1,R7,R7
#ifdef GOARCH_ppc64le
SLD R6,R9,R9 // Prepare mask for Little Endian
#else
SRD R6,R9,R9 // Same for Big Endian
#endif
BLE small_string // Jump to the small string case if it's <=32 bytes.
// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
// in V0, V1 and V10, then branch to the preloop.
ANDCC $63,R3,R11
BEQ CR0,qw_align
RLDICL $0,R3,$61,R11
MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
CMPB R12,R5,R3 // Check for a match.
AND R9,R3,R3 // Mask bytes below s_base
RLDICL $0,R7,$61,R6 // length-1
RLDICR $0,R7,$60,R7 // Last doubleword in R7
CMPU R3,$0,CR7 // If we have a match, jump to the final computation
BNE CR7,done
ADD $8,R8,R8
ADD $-8,R4,R4
ADD R4,R11,R4
// Check for quadword alignment
ANDCC $15,R8,R11
BEQ CR0,qw_align
// Not aligned, so handle the next doubleword
MOVD 0(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR7
BNE CR7,done
ADD $8,R8,R8
ADD $-8,R4,R4
// Either quadword aligned or 64-byte at this point. We can use LVX.
qw_align:
// Set up auxiliary data for the vectorized algorithm.
VSPLTISB $0,V0 // Replicate 0 across V0
VSPLTISB $3,V10 // Use V10 as control for VBPERMQ
MTVRD R5,V1
LVSL (R0+R0),V11
VSLB V11,V10,V10
VSPLTB $7,V1,V1 // Replicate byte across V1
CMPU R4, $64 // If len <= 64, don't use the vectorized loop
BLE tail
// We will load 4 quadwords per iteration in the loop, so check for
// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
ANDCC $63,R8,R11
BEQ CR0,preloop
// Not 64-byte aligned. Load one quadword at a time until aligned.
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6 // Check for byte in V4
BNE CR6,found_qw_align
ADD $16,R8,R8
ADD $-16,R4,R4
ANDCC $63,R8,R11
BEQ CR0,preloop
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6 // Check for byte in V4
BNE CR6,found_qw_align
ADD $16,R8,R8
ADD $-16,R4,R4
ANDCC $63,R8,R11
BEQ CR0,preloop
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6 // Check for byte in V4
BNE CR6,found_qw_align
ADD $-16,R4,R4
ADD $16,R8,R8
// 64-byte aligned. Prepare for the main loop.
preloop:
CMPU R4,$64
BLE tail // If len <= 64, don't use the vectorized loop
// We are now aligned to a 64-byte boundary. We will load 4 quadwords
// per loop iteration. The last doubleword is in R10, so our loop counter
// starts at (R10-R8)/64.
SUB R8,R10,R6
SRD $6,R6,R9 // Loop counter in R9
MOVD R9,CTR
MOVD $16,R11 // Load offsets for the vector loads
MOVD $32,R9
MOVD $48,R7
// Main loop we will load 64 bytes per iteration
loop:
LVX (R8+R0),V2 // Load 4 16-byte vectors
LVX (R11+R8),V3
LVX (R9+R8),V4
LVX (R7+R8),V5
VCMPEQUB V1,V2,V6 // Look for byte in each vector
VCMPEQUB V1,V3,V7
VCMPEQUB V1,V4,V8
VCMPEQUB V1,V5,V9
VOR V6,V7,V11 // Compress the result in a single vector
VOR V8,V9,V12
VOR V11,V12,V11
VCMPEQUBCC V0,V11,V11 // Check for byte
BGE CR6,found
ADD $64,R8,R8
BC 16,0,loop // bdnz loop
// Handle the trailing bytes or R4 <= 64
RLDICL $0,R6,$58,R4
tail:
CMPU R4,$0
BEQ notfound
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6
BNE CR6,found_qw_align
ADD $16,R8,R8
CMPU R4,$16,CR6
BLE CR6,notfound
ADD $-16,R4,R4
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6
BNE CR6,found_qw_align
ADD $16,R8,R8
CMPU R4,$16,CR6
BLE CR6,notfound
ADD $-16,R4,R4
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6
BNE CR6,found_qw_align
ADD $16,R8,R8
CMPU R4,$16,CR6
BLE CR6,notfound
ADD $-16,R4,R4
LVX (R8+R0),V4
VCMPEQUBCC V1,V4,V6
BNE CR6,found_qw_align
notfound:
MOVD $-1,R3
MOVD R3,(R14)
RET
found:
// We will now compress the results into a single doubleword,
// so it can be moved to a GPR for the final index calculation.
// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
// first bit of each byte into bits 48-63.
VBPERMQ V6,V10,V6
VBPERMQ V7,V10,V7
VBPERMQ V8,V10,V8
VBPERMQ V9,V10,V9
// Shift each 16-bit component into its correct position for
// merging into a single doubleword.
#ifdef GOARCH_ppc64le
VSLDOI $2,V7,V7,V7
VSLDOI $4,V8,V8,V8
VSLDOI $6,V9,V9,V9
#else
VSLDOI $6,V6,V6,V6
VSLDOI $4,V7,V7,V7
VSLDOI $2,V8,V8,V8
#endif
// Merge V6-V9 into a single doubleword and move to a GPR.
VOR V6,V7,V11
VOR V8,V9,V4
VOR V4,V11,V4
MFVRD V4,R3
#ifdef GOARCH_ppc64le
ADD $-1,R3,R11
ANDN R3,R11,R11
POPCNTD R11,R11 // Count trailing zeros (Little Endian).
#else
CNTLZD R3,R11 // Count leading zeros (Big Endian).
#endif
ADD R8,R11,R3 // Calculate byte address
// R3 holds the address of the match: subtract the saved base and store the index.
return:
SUB R17,R3
MOVD R3,(R14)
RET
found_qw_align:
// Use the same algorithm as above. Compress the result into
// a single doubleword and move it to a GPR for the final
// calculation.
VBPERMQ V6,V10,V6
#ifdef GOARCH_ppc64le
MFVRD V6,R3
ADD $-1,R3,R11
ANDN R3,R11,R11
POPCNTD R11,R11
#else
VSLDOI $6,V6,V6,V6
MFVRD V6,R3
CNTLZD R3,R11
#endif
ADD R8,R11,R3
// Only accept the match if its offset within the quadword is below the remaining length in R4.
CMPU R11,R4
BLT return
BR notfound
done:
// At this point, R3 has 0xFF in the same position as the byte we are
// looking for in the doubleword. Use that to calculate the exact index
// of the byte.
#ifdef GOARCH_ppc64le
ADD $-1,R3,R11
ANDN R3,R11,R11
POPCNTD R11,R11 // Count trailing zeros (Little Endian).
#else
CNTLZD R3,R11 // Count leading zeros (Big Endian).
#endif
CMPU R8,R7 // Check if we are at the last doubleword.
SRD $3,R11 // Convert trailing zeros to bytes.
ADD R11,R8,R3
CMPU R11,R6,CR7 // If at the last doubleword, check the byte offset.
BNE return
BLE CR7,return
BR notfound
small_string:
// We unroll this loop for better performance.
CMPU R4,$0 // Check for length=0
BEQ notfound
MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
CMPB R12,R5,R3 // Check for a match.
AND R9,R3,R3 // Mask bytes below s_base.
CMPU R3,$0,CR7 // If we have a match, jump to the final computation.
RLDICL $0,R7,$61,R6 // length-1
RLDICR $0,R7,$60,R7 // Last doubleword in R7.
CMPU R8,R7
BNE CR7,done
BEQ notfound // Hit length.
MOVDU 8(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR6
CMPU R8,R7
BNE CR6,done
BEQ notfound
MOVDU 8(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR6
CMPU R8,R7
BNE CR6,done
BEQ notfound
MOVDU 8(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR6
CMPU R8,R7
BNE CR6,done
BEQ notfound
MOVDU 8(R8),R12
CMPB R12,R5,R3
CMPU R3,$0,CR6
BNE CR6,done
BR notfound
TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
MOVD s1_base+0(FP), R5
MOVD s2_base+16(FP), R6
......
......@@ -854,108 +854,6 @@ TEXT runtime·memeqbodyclc(SB),NOSPLIT|NOFRAME,$0-0
CLC $1, 0(R3), 0(R5)
RET
// bytes·IndexByte (s390x) loads its arguments into R3, R4, R5 and R2 and
// branches to runtime·indexbytebody, which writes the result through R2.
TEXT bytes·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
MOVD s+0(FP), R3 // s => R3
MOVD s_len+8(FP), R4 // s_len => R4
MOVBZ c+24(FP), R5 // c => R5
MOVD $ret+32(FP), R2 // &ret => R2
BR runtime·indexbytebody(SB)
// strings·IndexByte (s390x) loads its arguments into R3, R4, R5 and R2 and
// branches to runtime·indexbytebody, which writes the result through R2.
TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
MOVD s+0(FP), R3 // s => R3
MOVD s_len+8(FP), R4 // s_len => R4
MOVBZ c+16(FP), R5 // c => R5
MOVD $ret+24(FP), R2 // &ret => R2
BR runtime·indexbytebody(SB)
// indexbytebody searches the R4 bytes starting at R3 for the byte in R5 and
// stores the index of the first match, or -1 if there is none, at 0(R2).
// input:
// R3: s
// R4: s_len
// R5: c -- byte sought
// R2: &ret -- address to put index into
TEXT runtime·indexbytebody(SB),NOSPLIT|NOFRAME,$0
CMPBEQ R4, $0, notfound
MOVD R3, R6 // store base for later
ADD R3, R4, R8 // the address after the end of the string
// if the length is less than 16, use the byte loop; otherwise, use vector or srst search
CMPBGE R4, $16, large
// byte-at-a-time scan of [R3, R8)
residual:
CMPBEQ R3, R8, notfound
MOVBZ 0(R3), R7
LA 1(R3), R3
CMPBNE R7, R5, residual
// R3 is one past the matching byte here: index = R3 - base - 1
found:
SUB R6, R3
SUB $1, R3
MOVD R3, 0(R2)
RET
notfound:
MOVD $-1, 0(R2)
RET
large:
// pick the vector implementation when the runtime's cpu.hasVX flag is set
MOVBZ ·cpu+facilities_hasVX(SB), R1
CMPBNE R1, $0, vectorimpl
srstimpl: // no vector facility
MOVBZ R5, R0 // c needs to be in R0, leave until last minute as currently R0 is expected to be 0
srstloop:
WORD $0xB25E0083 // srst %r8, %r3 (search the range [R3, R8))
BVS srstloop // interrupted - continue
BGT notfoundr0
// R8 is used as the address of the match on this path
foundr0:
XOR R0, R0 // reset R0
SUB R6, R8 // remove base
MOVD R8, 0(R2)
RET
notfoundr0:
XOR R0, R0 // reset R0
MOVD $-1, 0(R2)
RET
vectorimpl:
// if the address is not 16byte aligned, use loop for the header
MOVD R3, R8
AND $15, R8
CMPBGT R8, $0, notaligned
aligned:
// R8 was clobbered above: recompute the end address (base + len),
// and keep the end rounded down to a multiple of 16 in R7
ADD R6, R4, R8
MOVD R8, R7
AND $-16, R7
// replicate c across V17
VLVGB $0, R5, V19
VREPB $0, V19, V17
vectorloop:
// once R3 reaches the rounded-down end, finish the remainder with the byte loop
CMPBGE R3, R7, residual
VL 0(R3), V16 // load string to be searched into V16
ADD $16, R3
VFEEBS V16, V17, V18 // search V17 in V16 and set conditional code accordingly
BVS vectorloop
// when vector search found c in the string
VLGVB $7, V18, R7 // load 7th element of V18 containing index into R7
SUB $16, R3 // undo the increment to get the start of the chunk just searched
SUB R6, R3 // offset of that chunk from the base
ADD R3, R7 // add the index within the chunk
MOVD R7, 0(R2)
RET
notaligned:
// R8 = next 16-byte boundary above R3
MOVD R3, R8
AND $-16, R8
ADD $16, R8
// byte-at-a-time scan up to that boundary, then continue at aligned
notalignedloop:
CMPBEQ R3, R8, aligned
MOVBZ 0(R3), R7
LA 1(R3), R3
CMPBNE R7, R5, notalignedloop
BR found
// return0 sets R3 to 0 and returns.
TEXT runtime·return0(SB), NOSPLIT, $0
MOVW $0, R3
RET
......
......@@ -4,7 +4,7 @@
package runtime
import _ "unsafe" // for go:linkname
import "internal/bytealg"
// The Error interface identifies a run time error.
type Error interface {
......@@ -118,11 +118,6 @@ func printany(i interface{}) {
}
}
// strings.IndexByte is implemented in runtime/asm_$goarch.s
// but amusingly we need go:linkname to get access to it here in the runtime.
// stringsIndexByte is the runtime-local name bound to that symbol. It takes
// the string to search and the byte to look for; callers in this file treat a
// negative result as "byte not present".
//go:linkname stringsIndexByte strings.IndexByte
func stringsIndexByte(s string, c byte) int
// panicwrap generates a panic for a call to a wrapped value method
// with a nil pointer receiver.
//
......@@ -133,7 +128,7 @@ func panicwrap() {
// name is something like "main.(*T).F".
// We want to extract pkg ("main"), typ ("T"), and meth ("F").
// Do it by finding the parens.
i := stringsIndexByte(name, '(')
i := bytealg.IndexByteString(name, '(')
if i < 0 {
throw("panicwrap: no ( in " + name)
}
......@@ -142,7 +137,7 @@ func panicwrap() {
throw("panicwrap: unexpected string after package name: " + name)
}
name = name[i+2:]
i = stringsIndexByte(name, ')')
i = bytealg.IndexByteString(name, ')')
if i < 0 {
throw("panicwrap: no ) in " + name)
}
......
......@@ -5,6 +5,7 @@
package runtime
import (
internalcpu "internal/cpu"
"runtime/internal/sys"
)
......@@ -22,11 +23,13 @@ type facilities struct {
// cpu indicates the availability of s390x facilities that can be used in
// Go assembly but are optional on models supported by Go.
// Its hasVX field is filled in by archauxv from the _AT_HWCAP auxv entry.
// TODO: remove this once we're only using internal/cpu.
var cpu facilities
// archauxv inspects a single auxv entry (tag, val). Only the _AT_HWCAP entry
// is of interest: its vector-facility bit is recorded both in internal/cpu
// and in the runtime's own cpu variable. All other tags are ignored.
func archauxv(tag, val uintptr) {
	if tag != _AT_HWCAP {
		return
	}
	// CPU capability bit flags
	hasVX := val&_HWCAP_S390_VX != 0
	internalcpu.S390X.HasVX = hasVX
	cpu.hasVX = hasVX
}
......@@ -5,4 +5,4 @@
package strings
// IndexByte returns the index of the first instance of c in s, or -1 if c is not present in s.
func IndexByte(s string, c byte) int // ../runtime/asm_$GOARCH.s
func IndexByte(s string, c byte) int // in internal/bytealg
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment