// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego && (ppc64 || ppc64le)

#include "textflag.h"

// func addMulVVW1024(z, x *uint, y uint) (c uint)
//
// Entry point for 1024-bit operands: 4 loop iterations of four 64-bit
// words each (16 words in total). It only loads the iteration count into
// R6 and jumps (rather than calls) to addMulVVWx<>, which accesses
// z, x, y and c at the FP offsets of the signature above.
TEXT ·addMulVVW1024(SB), $0-32
	MOVD	$4, R6 // R6 = z_len/4 = number of loop iterations
	JMP		addMulVVWx<>(SB)

// func addMulVVW1536(z, x *uint, y uint) (c uint)
//
// Entry point for 1536-bit operands: 6 loop iterations of four 64-bit
// words each (24 words in total). It only loads the iteration count into
// R6 and jumps (rather than calls) to addMulVVWx<>, which accesses
// z, x, y and c at the FP offsets of the signature above.
TEXT ·addMulVVW1536(SB), $0-32
	MOVD	$6, R6 // R6 = z_len/4 = number of loop iterations
	JMP		addMulVVWx<>(SB)

// func addMulVVW2048(z, x *uint, y uint) (c uint)
//
// Entry point for 2048-bit operands: 8 loop iterations of four 64-bit
// words each (32 words in total). It only loads the iteration count into
// R6 and jumps (rather than calls) to addMulVVWx<>, which accesses
// z, x, y and c at the FP offsets of the signature above.
TEXT ·addMulVVW2048(SB), $0-32
	MOVD	$8, R6 // R6 = z_len/4 = number of loop iterations
	JMP		addMulVVWx<>(SB)

// addMulVVWx is the shared body of the addMulVVW* entry points above.
// For every 64-bit word it computes z[i] += x[i]*y + c, where c is the
// carry word produced by the previous word (initially 0), and it stores
// the final carry word in the result slot c+24(FP).
//
// This local function expects to be called only by
// callers above. R6 contains the z length/4
// since 4 values are processed for each
// loop iteration, and is guaranteed to be > 0.
// (The loop body runs before BDNZ tests the counter, so a zero
// count is not handled.)
// If other callers are added this function might
// need to change.
//
// Register usage:
//   R3       pointer to the current z words (advanced by 32 per iteration)
//   R4       pointer to the current x words (advanced by 32 per iteration)
//   R5       multiplier y
//   R6       iteration count, supplied by the caller; copied to CTR
//   R9       running carry word c
//   R10,R11  low/high product words for x[i]*y
//   R14-R21  loaded x and z words; R14-R19 are reused for the
//            low/high product words of x[i+1..i+3]*y once their
//            loaded values have been consumed
//
// Each word uses the same four-instruction carry sequence:
//   ADDC  z, lo     lo += z[i], carry out in CA
//   ADDZE hi        hi += CA
//   ADDC  R9, lo    lo += previous carry word, carry out in CA
//   ADDZE hi, R9    R9 = hi + CA, the carry word for the next position
// The high word cannot overflow, because x*y + z + c is at most
// (2^64-1)^2 + 2*(2^64-1) = 2^128 - 1.
TEXT addMulVVWx<>(SB), NOSPLIT, $0
	MOVD	z+0(FP), R3	// R3 = z
	MOVD	x+8(FP), R4	// R4 = x
	MOVD	y+16(FP), R5	// R5 = y

	MOVD	$0, R9		// R9 = c = 0
	MOVD	R6, CTR		// Initialize loop counter
	PCALIGN	$16		// align the loop entry to a 16-byte boundary

loop:
	// Load four words of x and four words of z up front.
	MOVD	0(R4), R14	// x[i]
	MOVD	8(R4), R16	// x[i+1]
	MOVD	16(R4), R18	// x[i+2]
	MOVD	24(R4), R20	// x[i+3]
	MOVD	0(R3), R15	// z[i]
	MOVD	8(R3), R17	// z[i+1]
	MOVD	16(R3), R19	// z[i+2]
	MOVD	24(R3), R21	// z[i+3]

	// Word i: R10 = new z[i], R9 = carry out.
	MULLD	R5, R14, R10	// low x[i]*y
	MULHDU	R5, R14, R11	// high x[i]*y
	ADDC	R15, R10	// low += z[i], sets CA
	ADDZE	R11		// high += CA
	ADDC	R9, R10		// low += carry in, sets CA
	ADDZE	R11, R9		// carry out = high + CA

	// Word i+1: R14 = new z[i+1] (x[i] and z[i] are no longer needed).
	MULLD	R5, R16, R14	// low x[i+1]*y
	MULHDU	R5, R16, R15	// high x[i+1]*y
	ADDC	R17, R14	// low += z[i+1], sets CA
	ADDZE	R15		// high += CA
	ADDC	R9, R14		// low += carry in, sets CA
	ADDZE	R15, R9		// carry out = high + CA

	// Word i+2: R16 = new z[i+2] (x[i+1] and z[i+1] are no longer needed).
	MULLD	R5, R18, R16	// low x[i+2]*y
	MULHDU	R5, R18, R17	// high x[i+2]*y
	ADDC	R19, R16	// low += z[i+2], sets CA
	ADDZE	R17		// high += CA
	ADDC	R9, R16		// low += carry in, sets CA
	ADDZE	R17, R9		// carry out = high + CA

	// Word i+3: R18 = new z[i+3] (x[i+2] and z[i+2] are no longer needed).
	MULLD	R5, R20, R18	// low x[i+3]*y
	MULHDU	R5, R20, R19	// high x[i+3]*y
	ADDC	R21, R18	// low += z[i+3], sets CA
	ADDZE	R19		// high += CA
	ADDC	R9, R18		// low += carry in, sets CA
	ADDZE	R19, R9		// carry out = high + CA

	// Write the four updated words back and advance both pointers.
	MOVD	R10, 0(R3)	// z[i]
	MOVD	R14, 8(R3)	// z[i+1]
	MOVD	R16, 16(R3)	// z[i+2]
	MOVD	R18, 24(R3)	// z[i+3]
	ADD	$32, R3		// z += 4 words
	ADD	$32, R4		// x += 4 words
	BDNZ	loop		// decrement CTR; repeat while CTR != 0

done:
	// Reached by fall-through only; no branch in this function targets done.
	MOVD	R9, c+24(FP)	// return the final carry word
	RET