// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64le || ppc64

#include "go_asm.h"
#include "textflag.h"
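
// Count and CountString return the number of occurrences of a byte in a
// byte slice or string. 32B chunks are compared with vector instructions,
// and the remaining tail is counted with scalar compares (pre-P10) or
// vector instructions (P10).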

TEXT ·Count<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-40
	// R3 = byte array pointer
	// R4 = length
	// R6 = byte to count
	MTVRD	R6, V1		// move compare byte
	MOVD	R6, R5		// countbytebody expects the byte to count in R5
	VSPLTB	$7, V1, V1	// replicate byte across V1
	BR	countbytebody<>(SB)

TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
	// R3 = byte array pointer
	// R4 = length
	// R5 = byte to count
	MTVRD	R5, V1		// move compare byte
	VSPLTB	$7, V1, V1	// replicate byte across V1
	BR	countbytebody<>(SB)

// R3: addr of string
// R4: len of string
// R5: byte to count
// V1: byte to count, splatted.
// On exit:
// R3: return value
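//
// For reference, the routine computes the same result as this minimal Go
// sketch (not assembled; the helper name is illustrative only):
//
//	func countByte(b []byte, c byte) int {
//		n := 0
//		for _, x := range b {
//			if x == c {
//				n++
//			}
//		}
//		return n
//	}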
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
	MOVD	$0, R18 // byte count

#ifndef GOPPC64_power10
	// Replicate the byte to count across R5 (e.g. 0x41 becomes
	// 0x4141414141414141) so the scalar tail can use CMPB. The P10
	// tail is fully vectorized and does not need this.
	RLDIMI	$8, R5, $48, R5
	RLDIMI	$16, R5, $32, R5
	RLDIMI	$32, R5, $0, R5	// fill reg with the byte to count
#endif

	CMPU	R4, $32		// Check if it's a small string (<32 bytes)
	BLT	tail		// Jump to the small string case
	SRD	$5, R4, R20	// Number of 32B chunks to process.
	MOVD	R20, CTR	// Use CTR for the chunk loop.
	MOVD	$16, R21	// Index of the second 16B load.
	XXLXOR	V4, V4, V4	// Zero the two accumulators.
	XXLXOR	V5, V5, V5

	PCALIGN	$16
cmploop:
	LXVD2X	(R0)(R3), V0	// Count 32B per loop with two vector accumulators.
	LXVD2X	(R21)(R3), V2
	VCMPEQUB V2, V1, V2
	VCMPEQUB V0, V1, V0
	VPOPCNTD V2, V2		// A match is 0xFF or 0. Count the bits into doubleword buckets.
	VPOPCNTD V0, V0
	VADDUDM	V0, V4, V4	// Accumulate the popcounts. They are 8x the count.
	VADDUDM	V2, V5, V5	// The count will be fixed up afterwards.
	ADD	$32, R3
	BDNZ	cmploop
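	// Example of the 8x bookkeeping: three matching bytes in a chunk become
	// three 0xFF bytes after VCMPEQUB, VPOPCNTD counts them as 24 set bits,
	// and the final SRD $3 converts that back to a count of 3.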

	VADDUDM	V4, V5, V5	// Merge the two accumulators.
	MFVSRD	V5, R18		// Extract doubleword 0 of the sum.
	VSLDOI	$8, V5, V5, V5	// Rotate doubleword 1 into position.
	MFVSRD	V5, R21
	ADD	R21, R18, R18	// R18 = 8x the matches found by the vector loop.
	ANDCC	$31, R4, R4	// Length of the remaining 0-31 byte tail; sets CR0 for the branch below.
	// Skip the tail processing if no bytes remaining.
	BEQ	tail_0

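// The tail is handled in one of two ways: on P10, with LXVLL/VCNTMBB vector
// instructions; on earlier targets, with scalar CMPB/POPCNTD compares.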
#ifdef GOPPC64_power10
	SRD	$3, R18, R18	// Fix the vector loop count before counting the tail on P10.

tail:	// Count the last 0 - 31 bytes.
	CMP	R4, $16
	BLE	small_tail_p10
	LXV	0(R3), V0
	VCMPEQUB V0, V1, V0
	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
	ADD	R14, R18, R18
	ADD	$16, R3, R3
	ANDCC	$15, R4, R4

small_tail_p10:
	SLD	$56, R4, R6	// LXVLL takes the length from the high-order byte of R6.
	LXVLL	R3, R6, V0	// Load only the remaining R4 bytes.
	VCMPEQUB V0, V1, V0
	VCLRRB	V0, R4, V0	// If <16B being compared, clear matches of the 16-R4 bytes.
	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
	ADD	R14, R18, R3	// Sum into the return value; the P10 tail counts whole bytes, so no 8x fixup.
	RET

#else
tail:	// Count the last 0 - 31 bytes.
	CMP	R4, $16
	BLT	tail_8
	MOVD	(R3), R12
	MOVD	8(R3), R14
	CMPB	R12, R5, R12
	CMPB	R14, R5, R14
	POPCNTD	R12, R12
	POPCNTD	R14, R14
	ADD	R12, R18, R18
	ADD	R14, R18, R18
	ADD	$16, R3, R3
	ADD	$-16, R4, R4

tail_8:	// Count the remaining 0 - 15 bytes.
	CMP	R4, $8
	BLT	tail_4
	MOVD	(R3), R12
	CMPB	R12, R5, R12
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$8, R3, R3
	ADD	$-8, R4, R4

tail_4:	// Count the remaining 0 - 7 bytes.
	CMP	R4, $4
	BLT	tail_2
	MOVWZ	(R3), R12
	CMPB	R12, R5, R12
	SLD	$32, R12, R12	// Remove non-participating matches.
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$4, R3, R3
	ADD	$-4, R4, R4

tail_2:	// Count the remaining 0 - 3 bytes.
	CMP	R4, $2
	BLT	tail_1
	MOVHZ	(R3), R12
	CMPB	R12, R5, R12
	SLD	$48, R12, R12	// Remove non-participating matches.
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$2, R3, R3
	ADD	$-2, R4, R4

tail_1:	// Count the remaining 0 - 1 bytes.
	CMP	R4, $1
	BLT	tail_0
	MOVBZ	(R3), R12
	CMPB	R12, R5, R12	// The zero-extended upper bytes of R12 can falsely match a 0x00 count byte.
	ANDCC	$0x8, R12, R12	// Keep one bit of the low byte only: 8 (one match in 8x units) or 0.
	ADD	R12, R18, R18
#endif

tail_0:	// No remaining tail to count.
	SRD	$3, R18, R3	// Fixup count, it is off by 8x.
	RET