memclr_loong64.s

     1  // Copyright 2022 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "textflag.h"
     7  
     8  // Register map
     9  //
    10  // R4: ptr
    11  // R5: n
    12  // R6: ptrend
    13  // R7: tmp
    14  
    15  // Algorithm:
    16  //
    17  // 1. if lasx is enabled:
    18  //        THRESHOLD = 256, ALIGNMENTS = 32, LOOPBLOCKS = 256,
    19  //    else if lsx is enabled:
    20  //        THRESHOLD = 128, ALIGNMENTS = 16, LOOPBLOCKS = 128,
    21  //    else
    22  //        THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64,
    23  //
    24  // 2. when 'count <= THRESHOLD' bytes, memory alignment check is omitted.
    25  // The handling is divided into distinct cases based on the size of count:
    26  //   a. clr_0, clr_1, clr_2, clr_3, clr_4, clr_5through7, clr_8,
    27  //      clr_9through16, clr_17through32, clr_33through64,
    28  //   b. lsx_clr_17through32, lsx_clr_33through64, lsx_clr_65through128,
    29  //   c. lasx_clr_17through32, lasx_clr_33through64,
    30  //      lasx_clr_65through128, lasx_clr_129through256
    31  //
    32  // 3. when 'count > THRESHOLD' bytes, memory alignment check is performed. Unaligned
    33  // bytes are processed first (that is, ALIGNMENTS - (ptr & (ALIGNMENTS-1))), and then
    34  // a LOOPBLOCKS-byte loop is executed to zero out memory.
    35  // Once fewer than LOOPBLOCKS bytes remain uncleared (n < LOOPBLOCKS), tail
    36  // processing is performed, invoking the case that corresponds to the size of n.
    37  //
    38  // example:
    39  //    THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64
    40  //
    41  //    ptr           newptr                           ptrend
    42  //     |               |<----count after correction---->|
    43  //     |<-------------count before correction---------->|
    44  //     |<--8-(ptr&7)-->|               |<---64 bytes--->|
    45  //     +------------------------------------------------+
    46  //     |   Head        |      Body     |      Tail      |
    47  //     +---------------+---------------+----------------+
    48  //    newptr = ptr - (ptr & 7) + 8
    49  //    count = count - 8 + (ptr & 7)
    50  
    51  // func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
    52  TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB),NOSPLIT,$0-16
    53  	BEQ	R5, clr_0	// n == 0: nothing to clear
    54  	ADDV	R4, R5, R6	// R6 = ptrend = ptr + n
    55  tail:
    56  	// Dispatch on n (R5, nonzero here). Reached at entry and again for the 1..16-byte remainder left by the large loops; sizes up to THRESHOLD are cleared directly, without an alignment check.
    57  	SGTU	$2, R5, R7	// R7 = 1 if n < 2; the same compare-then-branch pattern repeats below
    58  	BNE	R7, clr_1
    59  	SGTU	$3, R5, R7
    60  	BNE	R7, clr_2
    61  	SGTU	$4, R5, R7
    62  	BNE	R7, clr_3
    63  	SGTU	$5, R5, R7
    64  	BNE	R7, clr_4
    65  	SGTU	$8, R5, R7
    66  	BNE	R7, clr_5through7
    67  	SGTU	$9, R5, R7
    68  	BNE	R7, clr_8
    69  	SGTU	$17, R5, R7
    70  	BNE	R7, clr_9through16
    71  
    72  	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R7	// n > 16: take the LASX path when this flag byte is nonzero
    73  	BNE	R7, lasx_tail
    74  	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7	// otherwise take the LSX path when this flag byte is nonzero
    75  	BNE	R7, lsx_tail
    76  
    77  	SGTU	$33, R5, R7	// neither flag set: continue with 8-byte scalar stores
    78  	BNE	R7, clr_17through32
    79  	SGTU	$65, R5, R7
    80  	BNE	R7, clr_33through64
    81  	JMP	clr_large	// n > 64
    82  
    83  lasx_tail:
    84  	// X0 = 0
    85  	XVXORV	X0, X0, X0
    86  
    87  	SGTU	$33, R5, R7
    88  	BNE	R7, lasx_clr_17through32
    89  	SGTU	$65, R5, R7
    90  	BNE	R7, lasx_clr_33through64
    91  	SGTU	$129, R5, R7
    92  	BNE	R7, lasx_clr_65through128
    93  	SGTU	$257, R5, R7
    94  	BNE	R7, lasx_clr_129through256
    95  	JMP	lasx_clr_large	// n > 256
    96  
    97  lsx_tail:
    98  	// V0 = 0
    99  	VXORV	V0, V0, V0
   100  
   101  	SGTU	$33, R5, R7
   102  	BNE	R7, lsx_clr_17through32
   103  	SGTU	$65, R5, R7
   104  	BNE	R7, lsx_clr_33through64
   105  	SGTU	$129, R5, R7
   106  	BNE	R7, lsx_clr_65through128
   107  	JMP	lsx_clr_large	// n > 128
   108  
   109  	// use simd 256 instructions to implement memclr
   110  	// n > 256 bytes, check 32-byte alignment
   111  lasx_clr_large:
   112  	AND	$31, R4, R7	// R7 = ptr & 31
   113  	BEQ	R7, lasx_clr_256loop	// already 32-byte aligned: no head to handle
   114  	XVMOVQ	X0, (R4)	// clear 32 bytes at the unaligned ptr; this covers the whole head
   115  	SUBV	R7, R4	// ptr -= ptr & 31 (round down to a 32-byte boundary)
   116  	ADDV	R7, R5	// n += ptr & 31
   117  	SUBV	$32, R5 // newn = n - (32 - (ptr & 31))
   118  	ADDV	$32, R4 // newptr = ptr + (32 - (ptr & 31))
   119  	SGTU	$257, R5, R7	// newn <= 256 (it is at least 257-31 here): finish with a single fixed-size case
   120  	BNE	R7, lasx_clr_129through256
   121  lasx_clr_256loop:
   122  	SUBV	$256, R5	// account for the 256 bytes stored by this iteration
   123  	SGTU	$256, R5, R7	// R7 = 1 if fewer than 256 bytes remain after this iteration
   124  	XVMOVQ	X0, 0(R4)
   125  	XVMOVQ	X0, 32(R4)
   126  	XVMOVQ	X0, 64(R4)
   127  	XVMOVQ	X0, 96(R4)
   128  	XVMOVQ	X0, 128(R4)
   129  	XVMOVQ	X0, 160(R4)
   130  	XVMOVQ	X0, 192(R4)
   131  	XVMOVQ	X0, 224(R4)
   132  	ADDV	$256, R4
   133  	BEQ	R7, lasx_clr_256loop	// loop while at least 256 bytes remain
   134  
   135  	// remaining_length is 0
   136  	BEQ	R5, clr_0
   137  
   138  	// 128 < remaining_length < 256
   139  	SGTU	$129, R5, R7
   140  	BEQ	R7, lasx_clr_129through256
   141  
   142  	// 64 < remaining_length <= 128
   143  	SGTU	$65, R5, R7
   144  	BEQ	R7, lasx_clr_65through128
   145  
   146  	// 32 < remaining_length <= 64
   147  	SGTU	$33, R5, R7
   148  	BEQ	R7, lasx_clr_33through64
   149  
   150  	// 16 < remaining_length <= 32
   151  	SGTU	$17, R5, R7
   152  	BEQ	R7, lasx_clr_17through32
   153  
   154  	// 0 < remaining_length <= 16
   155  	JMP	tail	// R4 and R5 were advanced together, so R6 is still the end of the remainder
   156  
   157  	// use simd 128 instructions to implement memclr
   158  	// n > 128 bytes, check 16-byte alignment
   159  lsx_clr_large:
   160  	// check 16-byte alignment
   161  	AND	$15, R4, R7	// R7 = ptr & 15
   162  	BEQ	R7, lsx_clr_128loop	// already 16-byte aligned: no head to handle
   163  	VMOVQ	V0, (R4)	// clear 16 bytes at the unaligned ptr; this covers the whole head
   164  	SUBV	R7, R4	// ptr -= ptr & 15 (round down to a 16-byte boundary)
   165  	ADDV	R7, R5	// n += ptr & 15
   166  	SUBV	$16, R5 // newn = n - (16 - (ptr & 15))
   167  	ADDV	$16, R4 // newptr = ptr + (16 - (ptr & 15))
   168  	SGTU	$129, R5, R7	// newn <= 128 (it is at least 129-15 here): finish with a single fixed-size case
   169  	BNE	R7, lsx_clr_65through128
   170  lsx_clr_128loop:
   171  	SUBV	$128, R5	// account for the 128 bytes stored by this iteration
   172  	SGTU	$128, R5, R7	// R7 = 1 if fewer than 128 bytes remain after this iteration
   173  	VMOVQ	V0, 0(R4)
   174  	VMOVQ	V0, 16(R4)
   175  	VMOVQ	V0, 32(R4)
   176  	VMOVQ	V0, 48(R4)
   177  	VMOVQ	V0, 64(R4)
   178  	VMOVQ	V0, 80(R4)
   179  	VMOVQ	V0, 96(R4)
   180  	VMOVQ	V0, 112(R4)
   181  	ADDV	$128, R4
   182  	BEQ	R7, lsx_clr_128loop	// loop while at least 128 bytes remain
   183  
   184  	// remaining_length is 0
   185  	BEQ	R5, clr_0
   186  
   187  	// 64 < remaining_length <= 128
   188  	SGTU	$65, R5, R7
   189  	BEQ	R7, lsx_clr_65through128
   190  
   191  	// 32 < remaining_length <= 64
   192  	SGTU	$33, R5, R7
   193  	BEQ	R7, lsx_clr_33through64
   194  
   195  	// 16 < remaining_length <= 32
   196  	SGTU	$17, R5, R7
   197  	BEQ	R7, lsx_clr_17through32
   198  
   199  	// 0 < remaining_length <= 16
   200  	JMP	tail	// R4 and R5 were advanced together, so R6 is still the end of the remainder
   201  
   202  	// use general instructions to implement memclr
   203  	// n > 64 bytes, check 8-byte alignment
   204  clr_large:
   205  	AND	$7, R4, R7	// R7 = ptr & 7
   206  	BEQ	R7, clr_64loop	// already 8-byte aligned: no head to handle
   207  	MOVV	R0, (R4)	// clear 8 bytes at the unaligned ptr; this covers the whole head
   208  	SUBV	R7, R4	// ptr -= ptr & 7 (round down to an 8-byte boundary)
   209  	ADDV	R7, R5	// n += ptr & 7
   210  	ADDV	$8, R4	// newptr = ptr + (8 - (ptr & 7))
   211  	SUBV	$8, R5	// newn = n - (8 - (ptr & 7))
   212  	MOVV	$64, R7
   213  	BLT	R5, R7, clr_33through64	// newn < 64 (it is at least 65-7 here): finish with a single fixed-size case
   214  clr_64loop:
   215  	SUBV	$64, R5	// account for the 64 bytes stored by this iteration
   216  	SGTU    $64, R5, R7	// R7 = 1 if fewer than 64 bytes remain after this iteration
   217  	MOVV	R0, (R4)
   218  	MOVV	R0, 8(R4)
   219  	MOVV	R0, 16(R4)
   220  	MOVV	R0, 24(R4)
   221  	MOVV	R0, 32(R4)
   222  	MOVV	R0, 40(R4)
   223  	MOVV	R0, 48(R4)
   224  	MOVV	R0, 56(R4)
   225  	ADDV	$64, R4
   226  	BEQ     R7, clr_64loop	// loop while at least 64 bytes remain
   227  
   228  	// remaining_length is 0
   229  	BEQ	R5, clr_0
   230  
   231  	// 32 < remaining_length < 64
   232  	SGTU	$33, R5, R7
   233  	BEQ	R7, clr_33through64
   234  
   235  	// 16 < remaining_length <= 32
   236  	SGTU	$17, R5, R7
   237  	BEQ	R7, clr_17through32
   238  
   239  	// 0 < remaining_length <= 16
   240  	JMP	tail	// R4 and R5 were advanced together, so R6 is still the end of the remainder
   241  
   242  clr_0:
   243  	RET
   244  clr_1:
   245  	MOVB	R0, (R4)
   246  	RET
   247  clr_2:
   248  	MOVH	R0, (R4)
   249  	RET
   250  clr_3:
   251  	MOVH	R0, (R4)	// bytes 0-1
   252  	MOVB	R0, 2(R4)	// byte 2
   253  	RET
   254  clr_4:
   255  	MOVW	R0, (R4)
   256  	RET
   257  clr_5through7:
   258  	MOVW	R0, (R4)	// first 4 bytes
   259  	MOVW	R0, -4(R6)	// last 4 bytes; overlaps the first store, which is harmless when writing zeros
   260  	RET
   261  clr_8:
   262  	MOVV	R0, (R4)
   263  	RET
   264  clr_9through16:
   265  	MOVV	R0, (R4)	// first 8 bytes
   266  	MOVV	R0, -8(R6)	// last 8 bytes; may overlap the first store
   267  	RET
   268  clr_17through32:
   269  	MOVV	R0, (R4)	// first 16 bytes from the start ...
   270  	MOVV	R0, 8(R4)
   271  	MOVV	R0, -16(R6)	// ... and last 16 bytes back from ptrend; the two halves may overlap
   272  	MOVV	R0, -8(R6)
   273  	RET
   274  clr_33through64:
   275  	MOVV	R0, (R4)	// first 32 bytes from the start ...
   276  	MOVV	R0, 8(R4)
   277  	MOVV	R0, 16(R4)
   278  	MOVV	R0, 24(R4)
   279  	MOVV	R0, -32(R6)	// ... and last 32 bytes back from ptrend; the two halves may overlap
   280  	MOVV	R0, -24(R6)
   281  	MOVV	R0, -16(R6)
   282  	MOVV	R0, -8(R6)
   283  	RET
   284  
   285  lasx_clr_17through32:
   286  	VMOVQ	V0, 0(R4)	// 16-byte stores via V0 although only X0 was zeroed in lasx_tail; relies on V0 being the low 128 bits of X0
   287  	VMOVQ	V0, -16(R6)
   288  	RET
   289  lasx_clr_33through64:
   290  	XVMOVQ	X0, 0(R4)	// first 32 bytes
   291  	XVMOVQ	X0, -32(R6)	// last 32 bytes; may overlap the first store
   292  	RET
   293  lasx_clr_65through128:
   294  	XVMOVQ	X0, 0(R4)	// first 64 bytes from the start ...
   295  	XVMOVQ	X0, 32(R4)
   296  	XVMOVQ	X0, -64(R6)	// ... and last 64 bytes back from ptrend; the two halves may overlap
   297  	XVMOVQ	X0, -32(R6)
   298  	RET
   299  lasx_clr_129through256:
   300  	XVMOVQ	X0, 0(R4)	// first 128 bytes from the start ...
   301  	XVMOVQ	X0, 32(R4)
   302  	XVMOVQ	X0, 64(R4)
   303  	XVMOVQ	X0, 96(R4)
   304  	XVMOVQ	X0, -128(R6)	// ... and last 128 bytes back from ptrend; the two halves may overlap
   305  	XVMOVQ	X0, -96(R6)
   306  	XVMOVQ	X0, -64(R6)
   307  	XVMOVQ	X0, -32(R6)
   308  	RET
   309  
   310  lsx_clr_17through32:
   311  	VMOVQ	V0, 0(R4)	// first 16 bytes
   312  	VMOVQ	V0, -16(R6)	// last 16 bytes; may overlap the first store
   313  	RET
   314  lsx_clr_33through64:
   315  	VMOVQ	V0, 0(R4)	// first 32 bytes from the start ...
   316  	VMOVQ	V0, 16(R4)
   317  	VMOVQ	V0, -32(R6)	// ... and last 32 bytes back from ptrend; the two halves may overlap
   318  	VMOVQ	V0, -16(R6)
   319  	RET
   320  lsx_clr_65through128:
   321  	VMOVQ	V0, 0(R4)	// first 64 bytes from the start ...
   322  	VMOVQ	V0, 16(R4)
   323  	VMOVQ	V0, 32(R4)
   324  	VMOVQ	V0, 48(R4)
   325  	VMOVQ	V0, -64(R6)	// ... and last 64 bytes back from ptrend; the two halves may overlap
   326  	VMOVQ	V0, -48(R6)
   327  	VMOVQ	V0, -32(R6)
   328  	VMOVQ	V0, -16(R6)
   329  	RET
   330
View as plain text