Text file src/internal/bytealg/count_arm64.s

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "textflag.h"
     7  
     8  // func Count(b []byte, c byte) int -- number of bytes in b equal to c
     9  // input (register arguments, ABIInternal):
    10  //   R0: b ptr
    11  //   R1: b len
    12  //   R2: b cap (not used for counting; overwritten with c below)
    13  //   R3: c byte to search
    14  // return:
    15  //   R0: result (produced by CountString, which this function branches to)
    16  TEXT ·Count<ABIInternal>(SB),NOSPLIT,$0-40	// $0-40: frame size 0, 40 bytes of arguments and results
    17  	MOVD	R3, R2	// move c into R2, the register CountString reads it from; R0/R1 already match
    18  	B	·CountString<ABIInternal>(SB)	// plain branch, not a call: CountString's RET returns to Count's caller
    19  
    20  // func CountString(s string, c byte) int -- number of bytes in s equal to c
    21  // input (register arguments, ABIInternal):
    22  //   R0: s ptr
    23  //   R1: s len
    24  //   R2: c byte to search (due to ABIInternal upper bits can contain junk)
    25  // return:
    26  //   R0: result
    27  TEXT ·CountString<ABIInternal>(SB),NOSPLIT,$0-32	// $0-32: frame size 0, 32 bytes of arguments and results
    28  	// R11 = running count of bytes equal to c (copied to R0 at done)
    29  	MOVD	$0, R11
    30  	// short path to handle 0-byte case: return 0 without touching memory
    31  	CBZ	R1, done
    32  	CMP	$0x20, R1
    33  	// jump directly to head if length >= 32 (unsigned); shorter inputs fall through to tail
    34  	BHS	head
    35  tail:	// byte-at-a-time loop; the body runs before the test, so R1 must be nonzero on entry
    36  	// Work with a tail shorter than 32 bytes
    37  	MOVBU.P	1(R0), R5	// R5 = zero-extended byte at R0, then R0 += 1
    38  	SUB	$1, R1, R1	// one fewer byte remaining
    39  	CMP	R2.UXTB, R5	// compare against only the low byte of R2, ignoring junk upper bits
    40  	CINC	EQ, R11, R11	// count++ when the byte matched
    41  	CBNZ	R1, tail	// tests R1 directly, so the CMP flags above are not involved
    42  done:
    43  	MOVD	R11, R0	// result register = accumulated count
    44  	RET
    45  	PCALIGN	$16	// align the following code to a 16-byte boundary
    46  head:	// reached only with R1 >= 32
    47  	ANDS	$0x1f, R0, R9	// R9 = R0 & 31; Z flag set when the pointer is already 32-byte aligned
    48  	BEQ	chunk
    49  	// Work with a head that is not 32-byte aligned
    50  	BIC	$0x1f, R0, R3	// R3 = R0 rounded down to a multiple of 32
    51  	ADD	$0x20, R3	// R3 = next 32-byte boundary above R0
    52  	PCALIGN $16
    53  head_loop:	// count byte by byte until R0 reaches the boundary in R3 (at most 31 bytes)
    54  	MOVBU.P	1(R0), R5	// R5 = zero-extended byte at R0, then R0 += 1
    55  	CMP	R2.UXTB, R5	// match against the low byte of R2
    56  	CINC	EQ, R11, R11	// count++ on match
    57  	SUB	$1, R1, R1	// one fewer byte remaining
    58  	CMP	R0, R3	// has the pointer reached the aligned boundary?
    59  	BNE	head_loop
    60  chunk:	// R0 is 32-byte aligned here; R1 = bytes still to scan
    61  	BIC	$0x1f, R1, R9	// R9 = R1 rounded down to a multiple of 32 = bytes covered by whole chunks
    62  	// The first chunk can also be the last: with fewer than 32 bytes left, finish in tail
    63  	CBZ	R9, tail
    64  	// R3 = end of 32-byte chunks
    65  	ADD	R0, R9, R3
    66  	MOVD	$1, R5
    67  	VMOV	R5, V5.B16	// V5 = 0x01 in each of the 16 byte lanes (mask used after the compares)
    68  	// R1 = length of tail
    69  	SUB	R9, R1, R1
    70  	// Duplicate R2 (byte to search) to 16 1-byte elements of V0
    71  	VMOV	R2, V0.B16
    72  	// Clear the low 64-bit element of V7 and V8
    73  	VEOR	V7.B8, V7.B8, V7.B8	// V7: per-iteration sum
    74  	VEOR	V8.B8, V8.B8, V8.B8	// V8: total across all chunk iterations
    75  	PCALIGN $16
    76  	// Count the target byte in each 32-byte chunk
    77  chunk_loop:
    78  	VLD1.P	(R0), [V1.B16, V2.B16]	// load 32 bytes into V1 and V2, post-incrementing R0
    79  	CMP	R0, R3	// flags set here are consumed by the BNE at the bottom of the loop
    80  	VCMEQ	V0.B16, V1.B16, V3.B16	// per-lane compare of V1 against the search byte
    81  	VCMEQ	V0.B16, V2.B16, V4.B16	// same for the second 16 bytes
    82  	// Clear the higher 7 bits so each lane holds 1 for a match and 0 otherwise
    83  	VAND	V5.B16, V3.B16, V3.B16
    84  	VAND	V5.B16, V4.B16, V4.B16
    85  	// Count the lanes that match the requested byte
    86  	VADDP	V4.B16, V3.B16, V6.B16 // 32B->16B: pairwise add of adjacent lanes
    87  	VUADDLV	V6.B16, V7	// sum the 16 lanes of V6 into V7
    88  	// Accumulate the count in low 64-bit element of V8 when inside the loop
    89  	VADD	V7, V8
    90  	BNE	chunk_loop	// repeat until R0 == R3 (tested by the CMP above)
    91  	VMOV	V8.D[0], R6	// move the vector total to a general register
    92  	ADD	R6, R11, R11	// add it to the count from the head bytes
    93  	CBZ	R1, done	// no leftover bytes: return
    94  	B	tail	// otherwise count the leftover bytes one at a time
    95  

View as plain text