Commit 37cfb2e0 authored by Ilya Tocar, committed by Brad Fitzpatrick

math: optimize ceil/floor functions on amd64

Use the SSE 4.1 rounding instruction (ROUNDSD) to perform rounding.
Results (haswell):

name      old time/op  new time/op  delta
Floor-48  2.71ns ± 0%  1.87ns ± 1%  -31.17%  (p=0.000 n=16+19)
Ceil-48   3.09ns ± 3%  2.16ns ± 0%  -30.16%  (p=0.000 n=19+12)

Change-Id: If63715879eed6530b1eb4fc96132d827f8f43909
Reviewed-on: https://go-review.googlesource.com/14561
Reviewed-by: Klaus Post <klauspost@gmail.com>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
parent acc90c53
......@@ -734,6 +734,11 @@ const (
AAESIMC
AAESKEYGENASSIST
AROUNDPS
AROUNDSS
AROUNDPD
AROUNDSD
APSHUFD
APCLMULQDQ
......
......@@ -677,6 +677,10 @@ var Anames = []string{
"AESDECLAST",
"AESIMC",
"AESKEYGENASSIST",
"ROUNDPS",
"ROUNDSS",
"ROUNDPD",
"ROUNDSD",
"PSHUFD",
"PCLMULQDQ",
"JCXZW",
......
......@@ -1474,6 +1474,10 @@ var optab =
{AAESDECLAST, yaes, Pq, [23]uint8{0x38, 0xdf, 0}},
{AAESIMC, yaes, Pq, [23]uint8{0x38, 0xdb, 0}},
{AAESKEYGENASSIST, yaes2, Pq, [23]uint8{0x3a, 0xdf, 0}},
{AROUNDPD, yaes2, Pq, [23]uint8{0x3a, 0x09, 0}},
{AROUNDPS, yaes2, Pq, [23]uint8{0x3a, 0x08, 0}},
{AROUNDSD, yaes2, Pq, [23]uint8{0x3a, 0x0b, 0}},
{AROUNDSS, yaes2, Pq, [23]uint8{0x3a, 0x0a, 0}},
{APSHUFD, yxshuf, Pq, [23]uint8{0x70, 0}},
{APCLMULQDQ, yxshuf, Pq, [23]uint8{0x3a, 0x44, 0}},
{obj.AUSEFIELD, ynop, Px, [23]uint8{0, 0}},
......
......@@ -6,8 +6,25 @@
#define Big 0x4330000000000000 // 2**52
// func hasSSE4() bool
// hasSSE4 reports whether SSE4.1 is supported.
// It executes CPUID with AX = 1 and returns bit 19 of CX as the bool result.
TEXT ·hasSSE4(SB),NOSPLIT,$0
XORQ AX, AX // AX = 0
INCL AX // AX = 1, the CPUID function queried below
CPUID
SHRQ $19, CX // move bit 19 of CX down to bit 0
ANDQ $1, CX // keep only that bit, leaving 0 or 1
MOVB CX, ret+0(FP) // store the low byte as the result
RET
// func Floor(x float64) float64
TEXT ·Floor(SB),NOSPLIT,$0
CMPB math·useSSE4(SB), $1
JNE nosse4
ROUNDSD $1, x+0(FP), X0
MOVQ X0, ret+8(FP)
RET
nosse4:
MOVQ x+0(FP), AX
MOVQ $~(1<<63), DX // sign bit mask
ANDQ AX,DX // DX = |x|
......@@ -30,6 +47,12 @@ isBig_floor:
// func Ceil(x float64) float64
TEXT ·Ceil(SB),NOSPLIT,$0
CMPB math·useSSE4(SB), $1
JNE nosse4
ROUNDSD $2, x+0(FP), X0
MOVQ X0, ret+8(FP)
RET
nosse4:
MOVQ x+0(FP), AX
MOVQ $~(1<<63), DX // sign bit mask
MOVQ AX, BX // BX = copy of x
......
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build amd64 amd64p32

package math

// hasSSE4 reports whether SSE4.1 is supported.
// It is implemented in assembly in floor_amd64.s.
func hasSSE4() bool

// useSSE4 holds the result of hasSSE4, evaluated once when the package's
// variables are initialized. The assembly Floor and Ceil compare it
// against 1 to decide whether to take the ROUNDSD path.
var useSSE4 = hasSSE4()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment