// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// func Count(b []byte, c byte) int
//
// Count returns the number of bytes in b that are equal to c. It only
// moves c into the register where CountString expects it and then
// tail-branches there, so the counting logic lives in one place.
//
// The symbol carries the <ABIInternal> selector because the body takes
// its arguments from registers and returns the result in R0. Without the
// selector the assembler defines an ABI0 function, whose arguments and
// result live on the stack, and R0-R3 would not hold the inputs.
// NOTE(review): the assembler accepts ABI selectors only in packages
// built as part of the runtime - confirm this file belongs to one.
//
// input:
//   R0: b ptr
//   R1: b len
//   R2: b cap
//   R3: c byte to search
// return:
//   R0: result
TEXT ·Count<ABIInternal>(SB),NOSPLIT,$0-40
	// b cap is not used; overwrite it with c so that R0/R1/R2 match
	// the inputs CountString expects.
	MOVD	R3, R2
	B	·CountString<ABIInternal>(SB)

// func CountString(s string, c byte) int
//
// CountString returns the number of bytes in s that are equal to c.
//
// Inputs shorter than 32 bytes are scanned one byte at a time. Longer
// inputs go through three phases:
//   head:  byte loop until the data pointer is 32-byte aligned
//   chunk: vector loop consuming 32 bytes per iteration
//   tail:  byte loop over the remaining (< 32) bytes
//
// Scratch registers written by this function: R3, R5, R6, R9, R11, V0-V8.
//
// input:
//   R0: s ptr
//   R1: s len
//   R2: c byte to search (due to ABIInternal upper bits can contain junk)
// return:
//   R0: result
TEXT ·CountString<ABIInternal>(SB),NOSPLIT,$0-32
	// R11 = count of byte to search
	MOVD	$0, R11
	// short path to handle 0-byte case
	CBZ	R1, done
	CMP	$0x20, R1
	// jump directly to head if length >= 32
	BHS	head
tail:
	// Work with tail shorter than 32 bytes.
	// Every path that reaches this label has R1 >= 1, so the
	// decrement below cannot wrap.
	MOVBU.P	1(R0), R5		// R5 = *R0; R0++
	SUB	$1, R1, R1
	// Compare against the low byte of R2 only; its upper bits may be junk.
	CMP	R2.UXTB, R5
	CINC	EQ, R11, R11		// R11++ on match
	CBNZ	R1, tail
done:
	MOVD	R11, R0
	RET
	PCALIGN	$16
head:
	// R9 = misalignment of the data pointer within a 32-byte block.
	ANDS	$0x1f, R0, R9
	BEQ	chunk
	// Work with not 32-byte aligned head
	// R3 = next 32-byte boundary above R0.
	BIC	$0x1f, R0, R3
	ADD	$0x20, R3
	PCALIGN	$16
head_loop:
	// Consumes at most 31 bytes; the length is at least 32 here, so R1
	// is still positive when the loop exits.
	MOVBU.P	1(R0), R5		// R5 = *R0; R0++
	CMP	R2.UXTB, R5
	CINC	EQ, R11, R11		// R11++ on match
	SUB	$1, R1, R1
	CMP	R0, R3
	BNE	head_loop
chunk:
	// R9 = remaining length rounded down to a multiple of 32.
	BIC	$0x1f, R1, R9
	// The first chunk can also be the last: with fewer than 32 bytes
	// left there is nothing for the vector loop to do.
	CBZ	R9, tail
	// R3 = end of 32-byte chunks
	ADD	R0, R9, R3
	// V5 = 0x01 in each of its 16 byte lanes, used as a mask below.
	MOVD	$1, R5
	VMOV	R5, V5.B16
	// R1 = length of tail
	SUB	R9, R1, R1
	// Duplicate R2 (byte to search) to 16 1-byte elements of V0
	VMOV	R2, V0.B16
	// Clear the low 64-bit element of V7 and V8
	VEOR	V7.B8, V7.B8, V7.B8
	VEOR	V8.B8, V8.B8, V8.B8
	PCALIGN	$16
	// Count the target byte in 32-byte chunk
chunk_loop:
	VLD1.P	(R0), [V1.B16, V2.B16]	// load 32 bytes; R0 += 32
	// Loop condition is evaluated early. None of the vector
	// instructions below alter the condition flags, so the BNE at the
	// bottom still tests this comparison.
	CMP	R0, R3
	// Matching lanes become 0xff, all others 0x00.
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	// Clear the higher 7 bits, leaving 1 per matching lane.
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	// Count lanes match the requested byte
	VADDP	V4.B16, V3.B16, V6.B16	// 32B->16B
	VUADDLV	V6.B16, V7		// V7 = sum of the 16 lanes of V6
	// Accumulate the count in low 64-bit element of V8 when inside the loop
	VADD	V7, V8
	BNE	chunk_loop
	// Fold the vector total into the scalar counter.
	VMOV	V8.D[0], R6
	ADD	R6, R11, R11
	CBZ	R1, done
	B	tail