Commit a0fad982 authored by Lynn Boger's avatar Lynn Boger

internal/bytealg: implement bytes.Count in asm for ppc64x

This adds an asm implementation for the Count function in ppc64x.
The Go code that manipulates a byte at a time is especially
inefficient on ppc64x, so an asm implementation is a significant
improvement.

bytes:
name               old time/op   new time/op    delta
CountSingle/10-8    23.1ns ± 0%    18.6ns ± 0%    -19.48%  (p=1.000 n=1+1)
CountSingle/32-8    60.4ns ± 0%    19.0ns ± 0%    -68.54%  (p=1.000 n=1+1)
CountSingle/4K-8    7.29µs ± 0%    0.45µs ± 0%    -93.80%  (p=1.000 n=1+1)
CountSingle/4M-8    7.49ms ± 0%    0.45ms ± 0%    -93.97%  (p=1.000 n=1+1)
CountSingle/64M-8    127ms ± 0%       9ms ± 0%    -92.53%  (p=1.000 n=1+1)

html:
name              old time/op  new time/op  delta
Escape-8          57.5µs ± 0%  36.1µs ± 0%  -37.13%  (p=1.000 n=1+1)
EscapeNone-8      20.0µs ± 0%   2.0µs ± 0%  -90.14%  (p=1.000 n=1+1)

Change-Id: Iadbf422c0e9a37b47d2d95fb8c778420f3aabb58
Reviewed-on: https://go-review.googlesource.com/131695
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: 's avatarMichael Munday <mike.munday@ibm.com>
parent e7f5f3ec
......@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64,!arm64
// +build !amd64,!arm64,!ppc64le,!ppc64
package bytealg
......
......@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build amd64 arm64
// +build amd64 arm64 ppc64le ppc64
package bytealg
......
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ppc64le ppc64
#include "go_asm.h"
#include "textflag.h"
TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
MOVD b_base+0(FP), R3 // R3 = byte array pointer
MOVD b_len+8(FP), R4 // R4 = length
MOVBZ c+24(FP), R5 // R5 = byte
MOVD $ret+32(FP), R14 // R14 = &ret
BR countbytebody<>(SB)
TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
MOVD s_base+0(FP), R3 // R3 = string
MOVD s_len+8(FP), R4 // R4 = length
MOVBZ c+16(FP), R5 // R5 = byte
MOVD $ret+24(FP), R14 // R14 = &ret
BR countbytebody<>(SB)
// R3: addr of string
// R4: len of string
// R5: byte to count
// R14: addr for return value
// endianness shouldn't matter since we are just counting and order
// is irrelevant
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
DCBT (R3) // Prepare cache line.
MOVD R0, R18 // byte count
MOVD R3, R19 // Save base address for calculating the index later.
MOVD R4, R16
MOVD R5, R6
RLDIMI $8, R6, $48, R6
RLDIMI $16, R6, $32, R6
RLDIMI $32, R6, $0, R6 // fill reg with the byte to count
VSPLTISW $3, V4 // used for shift
MTVRD R6, V1 // move compare byte
VSPLTB $7, V1, V1 // replicate byte across V1
CMPU R4, $32 // Check if it's a small string (<32 bytes)
BLT tail // Jump to the small string case
XXLXOR VS37, VS37, VS37 // clear V5 (aka VS37) to use as accumulator
cmploop:
LXVW4X (R3), VS32 // load bytes from string
// when the bytes match, the corresonding byte contains all 1s
VCMPEQUB V1, V0, V2 // compare bytes
VPOPCNTD V2, V3 // each double word contains its count
VADDUDM V3, V5, V5 // accumulate bit count in each double word
ADD $16, R3, R3 // increment pointer
SUB $16, R16, R16 // remaining bytes
CMP R16, $16 // at least 16 remaining?
BGE cmploop
VSRD V5, V4, V5 // shift by 3 to convert bits to bytes
VSLDOI $8, V5, V5, V6 // get the double word values from vector
MFVSRD V5, R9
MFVSRD V6, R10
ADD R9, R10, R9
ADD R9, R18, R18
tail:
CMP R16, $8 // 8 bytes left?
BLT small
MOVD (R3), R12 // load 8 bytes
CMPB R12, R6, R17 // compare bytes
POPCNTD R17, R15 // bit count
SRD $3, R15, R15 // byte count
ADD R15, R18, R18 // add to byte count
next1:
ADD $8, R3, R3
SUB $8, R16, R16 // remaining bytes
BR tail
small:
CMP $0, R16 // any remaining
BEQ done
MOVBZ (R3), R12 // check each remaining byte
CMP R12, R5
BNE next2
ADD $1, R18
next2:
SUB $1, R16
ADD $1, R3 // inc address
BR small
done:
MOVD R18, (R14) // return count
RET
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment