1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "go_asm.h"
6 #include "textflag.h"
7
8 // func Count(b []byte, c byte) int -- thin wrapper: moves c into place and branches to CountString.
9 // input:
10 // R0: b ptr
11 // R1: b len
12 // R2: b cap (never read here; overwritten with c below)
13 // R3: c byte to search
14 // return:
15 // R0: result (left in R0 by CountString, which this routine branches to)
16 TEXT ·Count<ABIInternal>(SB),NOSPLIT,$0-40
17 MOVD R3, R2 // CountString reads the search byte from R2; R0/R1 (ptr/len) are already where it expects them
18 B ·CountString<ABIInternal>(SB) // plain branch, not a call: CountString's RET goes straight back to Count's caller
19
20 // func CountString(s string, c byte) int -- returns the number of bytes in s that are equal to c.
21 // input:
22 // R0: s ptr
23 // R1: s len
24 // R2: c byte to search (under ABIInternal the upper bits can contain junk, so only the low byte is compared)
25 // return:
26 // R0: result
27 TEXT ·CountString<ABIInternal>(SB),NOSPLIT,$0-32
28 // R11 = running count of bytes equal to c (copied to R0 at done)
29 MOVD $0, R11
30 // short path to handle 0-byte case: returns 0
31 CBZ R1, done
32 CMP $0x20, R1
33 // jump directly to head if length >= 32 (unsigned compare); otherwise fall through into tail
34 BHS head
35 tail: // byte-at-a-time loop; R1 is tested only after the decrement, so it must be non-zero on entry
36 // Work with tail shorter than 32 bytes
37 MOVBU.P 1(R0), R5 // R5 = next byte, zero-extended; R0 is post-incremented by 1
38 SUB $1, R1, R1 // one fewer byte remaining
39 CMP R2.UXTB, R5 // compare against the zero-extended low byte of R2, ignoring its upper bits
40 CINC EQ, R11, R11 // R11++ only when the byte matched
41 CBNZ R1, tail
42 done:
43 MOVD R11, R0 // return the accumulated count
44 RET
45 PCALIGN $16
46 head: // reached only when length >= 32
47 ANDS $0x1f, R0, R9 // R9 = R0 & 31; a zero result means the pointer is already 32-byte aligned
48 BEQ chunk
49 // Work with not 32-byte aligned head
50 BIC $0x1f, R0, R3 // R3 = R0 rounded down to a multiple of 32 ...
51 ADD $0x20, R3 // ... plus 32: the first 32-byte boundary above R0
52 PCALIGN $16
53 head_loop: // byte-at-a-time until R0 reaches the boundary in R3
54 MOVBU.P 1(R0), R5 // R5 = next byte, zero-extended; R0 is post-incremented by 1
55 CMP R2.UXTB, R5 // compare against the zero-extended low byte of R2
56 CINC EQ, R11, R11 // R11++ only when the byte matched
57 SUB $1, R1, R1 // one fewer byte remaining
58 CMP R0, R3 // stop once R0 == R3
59 BNE head_loop
60 chunk:
61 BIC $0x1f, R1, R9 // R9 = remaining length rounded down to a multiple of 32
62 // The first chunk can also be the last: with fewer than 32 bytes left, finish in tail
63 CBZ R9, tail
64 // R3 = end of 32-byte chunks
65 ADD R0, R9, R3
66 MOVD $1, R5
67 VMOV R5, V5.B16 // V5 = 0x01 in each of its 16 byte lanes (mask used by the VANDs in the loop)
68 // R1 = length of tail
69 SUB R9, R1, R1
70 // Duplicate R2 (byte to search) to 16 1-byte elements of V0
71 VMOV R2, V0.B16
72 // Clear the low 64-bit element of V7 and V8
73 VEOR V7.B8, V7.B8, V7.B8
74 VEOR V8.B8, V8.B8, V8.B8
75 PCALIGN $16
76 // Count the target byte in 32-byte chunk
77 chunk_loop:
78 VLD1.P (R0), [V1.B16, V2.B16] // load the next 32 bytes into V1 and V2; .P post-increments R0
79 CMP R0, R3 // sets the flags consumed by BNE chunk_loop at the bottom of the loop
80 VCMEQ V0.B16, V1.B16, V3.B16 // per-lane equality of V1 against the search byte, result in V3
81 VCMEQ V0.B16, V2.B16, V4.B16 // same for V2, result in V4
82 // Clear the higher 7 bits, keeping only bit 0 of each compare-result lane
83 VAND V5.B16, V3.B16, V3.B16
84 VAND V5.B16, V4.B16, V4.B16
85 // Count lanes match the requested byte
86 VADDP V4.B16, V3.B16, V6.B16 // 32B->16B
87 VUADDLV V6.B16, V7 // add the 16 byte lanes of V6 together: V7 = matches in this chunk
88 // Accumulate the count in low 64-bit element of V8 when inside the loop
89 VADD V7, V8
90 BNE chunk_loop // flags still come from CMP R0, R3 above; loop until R0 == R3
91 VMOV V8.D[0], R6 // move the vector-side total into a general-purpose register
92 ADD R6, R11, R11 // fold it into the running count
93 CBZ R1, done // no leftover bytes: return
94 B tail // otherwise count the remaining bytes one at a time
95
View as plain text