Text file
src/runtime/memclr_loong64.s
1 // Copyright 2022 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "go_asm.h"
6 #include "textflag.h"
7
8 // Register map
9 //
10 // R4: ptr
11 // R5: n
12 // R6: ptrend
13 // R7: tmp
14
15 // Algorithm:
16 //
17 // 1. if lasx is enabled:
18 // THRESHOLD = 256, ALIGNMENTS = 32, LOOPBLOCKS = 256,
19 // else if lsx is enabled:
20 // THRESHOLD = 128, ALIGNMENTS = 16, LOOPBLOCKS = 128,
21 // else
22 // THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64,
23 //
24 // 2. when 'count <= THRESHOLD' bytes, memory alignment check is omitted.
25 // The handling is divided into distinct cases based on the size of count:
26 // a. clr_0, clr_1, clr_2, clr_3, clr_4, clr_5through7, clr_8,
27 // clr_9through16, clr_17through32, clr_33through64,
28 // b. lsx_clr_17through32, lsx_clr_33through64, lsx_clr_65through128,
29 // c. lasx_clr_17through32, lasx_clr_33through64,
30 // lasx_clr_65through128, lasx_clr_129through256
31 //
32 // 3. when 'count > THRESHOLD' bytes, memory alignment check is performed. Unaligned
33 // bytes are processed first (that is, ALIGNMENTS - (ptr & (ALIGNMENTS-1))), and then
34 // a LOOPBLOCKS-byte loop is executed to zero out memory.
35 // When the number of remaining bytes not cleared is n < LOOPBLOCKS bytes, a tail
36 // processing is performed, invoking the corresponding case based on the size of n.
37 //
38 // example:
39 // THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64
40 //
41 //   ptr           newptr                           ptrend
42 //    |               |<----count after correction---->|
43 //    |<-------------count before correction---------->|
44 //    |<--8-(ptr&7)-->|               |<---64 bytes--->|
45 //    +------------------------------------------------+
46 //    |     Head      |     Body      |      Tail      |
47 //    +---------------+---------------+----------------+
48 // newptr = ptr - (ptr & 7) + 8
49 // count = count - 8 + (ptr & 7)
50
51 // func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
52 TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB),NOSPLIT,$0-16 // zero n (R5) bytes starting at ptr (R4)
53 BEQ R5, clr_0 // n == 0: nothing to clear
54 ADDV R4, R5, R6 // R6 = ptrend = ptr + n; R6 is not written again in this function
55 tail:
56 // Dispatch on the remaining length in R5 (R5 >= 1 on every path into this label); alignment is not checked.
57 SGTU $2, R5, R7 // R7 = (n < 2), i.e. n == 1
58 BNE R7, clr_1
59 SGTU $3, R5, R7 // n == 2
60 BNE R7, clr_2
61 SGTU $4, R5, R7 // n == 3
62 BNE R7, clr_3
63 SGTU $5, R5, R7 // n == 4
64 BNE R7, clr_4
65 SGTU $8, R5, R7 // 5 <= n <= 7
66 BNE R7, clr_5through7
67 SGTU $9, R5, R7 // n == 8
68 BNE R7, clr_8
69 SGTU $17, R5, R7 // 9 <= n <= 16
70 BNE R7, clr_9through16
71 // n > 16 from here on: prefer the LASX path, then the LSX path, else fall through to the scalar path.
72 MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R7 // R7 = Loong64.HasLASX byte
73 BNE R7, lasx_tail
74 MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7 // R7 = Loong64.HasLSX byte
75 BNE R7, lsx_tail
76 // Scalar path (neither LASX nor LSX reported).
77 SGTU $33, R5, R7 // 17 <= n <= 32
78 BNE R7, clr_17through32
79 SGTU $65, R5, R7 // 33 <= n <= 64
80 BNE R7, clr_33through64
81 JMP clr_large // n > 64
82
83 lasx_tail:
84 // X0 = 0; every lasx_* store below writes this zeroed register
85 XVXORV X0, X0, X0
86
87 SGTU $33, R5, R7 // 17 <= n <= 32
88 BNE R7, lasx_clr_17through32
89 SGTU $65, R5, R7 // 33 <= n <= 64
90 BNE R7, lasx_clr_33through64
91 SGTU $129, R5, R7 // 65 <= n <= 128
92 BNE R7, lasx_clr_65through128
93 SGTU $257, R5, R7 // 129 <= n <= 256
94 BNE R7, lasx_clr_129through256
95 JMP lasx_clr_large // n > 256
96
97 lsx_tail:
98 // V0 = 0; every lsx_* store below writes this zeroed register
99 VXORV V0, V0, V0
100
101 SGTU $33, R5, R7 // 17 <= n <= 32
102 BNE R7, lsx_clr_17through32
103 SGTU $65, R5, R7 // 33 <= n <= 64
104 BNE R7, lsx_clr_33through64
105 SGTU $129, R5, R7 // 65 <= n <= 128
106 BNE R7, lsx_clr_65through128
107 JMP lsx_clr_large // n > 128
108
109 // use 256-bit SIMD (X register) stores to implement memclr
110 // n > 256 bytes, check 32-byte alignment
111 lasx_clr_large:
112 AND $31, R4, R7 // R7 = ptr & 31
113 BEQ R7, lasx_clr_256loop // ptr already 32-byte aligned
114 XVMOVQ X0, (R4) // zero 32 bytes at the unaligned ptr; this covers the whole head
115 SUBV R7, R4 // round ptr down to a 32-byte boundary
116 ADDV R7, R5 // n += ptr & 31, paired with the SUBV $32 below
117 SUBV $32, R5 // newn = n - (32 - (ptr & 31))
118 ADDV $32, R4 // newptr = ptr + (32 - (ptr & 31))
119 SGTU $257, R5, R7 // R7 = (newn <= 256)
120 BNE R7, lasx_clr_129through256 // too short for the loop; newn >= 257-31 = 226 here, so this case applies
121 lasx_clr_256loop:
122 SUBV $256, R5 // account for the 256-byte block stored below
123 SGTU $256, R5, R7 // R7 = (fewer than 256 bytes remain after this block)
124 XVMOVQ X0, 0(R4)
125 XVMOVQ X0, 32(R4)
126 XVMOVQ X0, 64(R4)
127 XVMOVQ X0, 96(R4)
128 XVMOVQ X0, 128(R4)
129 XVMOVQ X0, 160(R4)
130 XVMOVQ X0, 192(R4)
131 XVMOVQ X0, 224(R4)
132 ADDV $256, R4
133 BEQ R7, lasx_clr_256loop // loop while at least 256 bytes remain
134 // Here 0 <= R5 < 256. The checks below use BEQ (not BNE): branch when R5 is NOT below the bound.
135 // remaining_length is 0
136 BEQ R5, clr_0
137
138 // 128 < remaining_length < 256
139 SGTU $129, R5, R7
140 BEQ R7, lasx_clr_129through256
141
142 // 64 < remaining_length <= 128
143 SGTU $65, R5, R7
144 BEQ R7, lasx_clr_65through128
145
146 // 32 < remaining_length <= 64
147 SGTU $33, R5, R7
148 BEQ R7, lasx_clr_33through64
149
150 // 16 < remaining_length <= 32
151 SGTU $17, R5, R7
152 BEQ R7, lasx_clr_17through32
153
154 // 0 < remaining_length <= 16
155 JMP tail // tail re-dispatches on R5; R4 has been advanced and R6 still holds ptrend
156
157 // use 128-bit SIMD (V register) stores to implement memclr
158 // n > 128 bytes, check 16-byte alignment
159 lsx_clr_large:
160 // R7 = ptr & 15 (offset of ptr within a 16-byte block)
161 AND $15, R4, R7
162 BEQ R7, lsx_clr_128loop // ptr already 16-byte aligned
163 VMOVQ V0, (R4) // zero 16 bytes at the unaligned ptr; this covers the whole head
164 SUBV R7, R4 // round ptr down to a 16-byte boundary
165 ADDV R7, R5 // n += ptr & 15, paired with the SUBV $16 below
166 SUBV $16, R5 // newn = n - (16 - (ptr & 15))
167 ADDV $16, R4 // newptr = ptr + (16 - (ptr & 15))
168 SGTU $129, R5, R7 // R7 = (newn <= 128)
169 BNE R7, lsx_clr_65through128 // too short for the loop; newn >= 129-15 = 114 here, so this case applies
170 lsx_clr_128loop:
171 SUBV $128, R5 // account for the 128-byte block stored below
172 SGTU $128, R5, R7 // R7 = (fewer than 128 bytes remain after this block)
173 VMOVQ V0, 0(R4)
174 VMOVQ V0, 16(R4)
175 VMOVQ V0, 32(R4)
176 VMOVQ V0, 48(R4)
177 VMOVQ V0, 64(R4)
178 VMOVQ V0, 80(R4)
179 VMOVQ V0, 96(R4)
180 VMOVQ V0, 112(R4)
181 ADDV $128, R4
182 BEQ R7, lsx_clr_128loop // loop while at least 128 bytes remain
183 // Here 0 <= R5 < 128. The checks below use BEQ (not BNE): branch when R5 is NOT below the bound.
184 // remaining_length is 0
185 BEQ R5, clr_0
186
187 // 64 < remaining_length < 128
188 SGTU $65, R5, R7
189 BEQ R7, lsx_clr_65through128
190
191 // 32 < remaining_length <= 64
192 SGTU $33, R5, R7
193 BEQ R7, lsx_clr_33through64
194
195 // 16 < remaining_length <= 32
196 SGTU $17, R5, R7
197 BEQ R7, lsx_clr_17through32
198
199 // 0 < remaining_length <= 16
200 JMP tail // tail re-dispatches on R5; R4 has been advanced and R6 still holds ptrend
201
202 // use general (64-bit scalar) instructions to implement memclr
203 // n > 64 bytes, check 8-byte alignment
204 clr_large:
205 AND $7, R4, R7 // R7 = ptr & 7
206 BEQ R7, clr_64loop // ptr already 8-byte aligned
207 MOVV R0, (R4) // zero 8 bytes at the unaligned ptr; this covers the whole head
208 SUBV R7, R4 // round ptr down to an 8-byte boundary
209 ADDV R7, R5 // n += ptr & 7, paired with the SUBV $8 below
210 ADDV $8, R4 // newptr = ptr + (8 - (ptr & 7))
211 SUBV $8, R5 // newn = n - (8 - (ptr & 7))
212 MOVV $64, R7
213 BLT R5, R7, clr_33through64 // newn < 64 (signed compare); newn >= 65-7 = 58 here, so this case applies
214 clr_64loop:
215 SUBV $64, R5 // account for the 64-byte block stored below
216 SGTU $64, R5, R7 // R7 = (fewer than 64 bytes remain after this block)
217 MOVV R0, (R4)
218 MOVV R0, 8(R4)
219 MOVV R0, 16(R4)
220 MOVV R0, 24(R4)
221 MOVV R0, 32(R4)
222 MOVV R0, 40(R4)
223 MOVV R0, 48(R4)
224 MOVV R0, 56(R4)
225 ADDV $64, R4
226 BEQ R7, clr_64loop // loop while at least 64 bytes remain
227 // Here 0 <= R5 < 64. The checks below use BEQ (not BNE): branch when R5 is NOT below the bound.
228 // remaining_length is 0
229 BEQ R5, clr_0
230
231 // 32 < remaining_length < 64
232 SGTU $33, R5, R7
233 BEQ R7, clr_33through64
234
235 // 16 < remaining_length <= 32
236 SGTU $17, R5, R7
237 BEQ R7, clr_17through32
238
239 // 0 < remaining_length <= 16
240 JMP tail // tail re-dispatches on R5; R4 has been advanced and R6 still holds ptrend
241 // Fixed-size cases. Ranged cases store from the start (R4) and from the end (R6); the two groups may overlap.
242 clr_0:
243 RET
244 clr_1:
245 MOVB R0, (R4)
246 RET
247 clr_2:
248 MOVH R0, (R4)
249 RET
250 clr_3:
251 MOVH R0, (R4) // bytes 0-1
252 MOVB R0, 2(R4) // byte 2
253 RET
254 clr_4:
255 MOVW R0, (R4)
256 RET
257 clr_5through7:
258 MOVW R0, (R4) // first 4 bytes
259 MOVW R0, -4(R6) // last 4 bytes; overlaps the first store since n < 8
260 RET
261 clr_8:
262 MOVV R0, (R4)
263 RET
264 clr_9through16:
265 MOVV R0, (R4) // first 8 bytes
266 MOVV R0, -8(R6) // last 8 bytes; overlaps the first store when n < 16
267 RET
268 clr_17through32:
269 MOVV R0, (R4) // first 16 bytes
270 MOVV R0, 8(R4)
271 MOVV R0, -16(R6) // last 16 bytes
272 MOVV R0, -8(R6)
273 RET
274 clr_33through64:
275 MOVV R0, (R4) // first 32 bytes
276 MOVV R0, 8(R4)
277 MOVV R0, 16(R4)
278 MOVV R0, 24(R4)
279 MOVV R0, -32(R6) // last 32 bytes
280 MOVV R0, -24(R6)
281 MOVV R0, -16(R6)
282 MOVV R0, -8(R6)
283 RET
284 // LASX cases. X0 was zeroed in lasx_tail.
285 lasx_clr_17through32:
286 VMOVQ V0, 0(R4) // first 16 bytes; NOTE(review): stores V0 although only X0 was zeroed on this path, presumably V0 is the low half of X0 - confirm
287 VMOVQ V0, -16(R6) // last 16 bytes
288 RET
289 lasx_clr_33through64:
290 XVMOVQ X0, 0(R4) // first 32 bytes
291 XVMOVQ X0, -32(R6) // last 32 bytes
292 RET
293 lasx_clr_65through128:
294 XVMOVQ X0, 0(R4) // first 64 bytes
295 XVMOVQ X0, 32(R4)
296 XVMOVQ X0, -64(R6) // last 64 bytes
297 XVMOVQ X0, -32(R6)
298 RET
299 lasx_clr_129through256:
300 XVMOVQ X0, 0(R4) // first 128 bytes
301 XVMOVQ X0, 32(R4)
302 XVMOVQ X0, 64(R4)
303 XVMOVQ X0, 96(R4)
304 XVMOVQ X0, -128(R6) // last 128 bytes
305 XVMOVQ X0, -96(R6)
306 XVMOVQ X0, -64(R6)
307 XVMOVQ X0, -32(R6)
308 RET
309 // LSX cases. V0 was zeroed in lsx_tail.
310 lsx_clr_17through32:
311 VMOVQ V0, 0(R4) // first 16 bytes
312 VMOVQ V0, -16(R6) // last 16 bytes
313 RET
314 lsx_clr_33through64:
315 VMOVQ V0, 0(R4) // first 32 bytes
316 VMOVQ V0, 16(R4)
317 VMOVQ V0, -32(R6) // last 32 bytes
318 VMOVQ V0, -16(R6)
319 RET
320 lsx_clr_65through128:
321 VMOVQ V0, 0(R4) // first 64 bytes
322 VMOVQ V0, 16(R4)
323 VMOVQ V0, 32(R4)
324 VMOVQ V0, 48(R4)
325 VMOVQ V0, -64(R6) // last 64 bytes
326 VMOVQ V0, -48(R6)
327 VMOVQ V0, -32(R6)
328 VMOVQ V0, -16(R6)
329 RET
330
View as plain text