// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Original source:
//	http://www.zorinaq.com/papers/md5-amd64.html
//	http://www.zorinaq.com/papers/md5-amd64.tar.bz2
//
// Translated from Perl generating GNU assembly into
// #defines generating 6a assembly by the Go Authors.

package main

import (
	. "github.com/mmcloughlin/avo/build"
	. "github.com/mmcloughlin/avo/operand"
	. "github.com/mmcloughlin/avo/reg"
)

//go:generate go run . -out ../md5block_amd64.s -pkg md5

// main drives the generator: it names the target package, attaches the
// build constraint, emits the block function, and finally calls Generate
// to produce the output (see the go:generate directive for the flags used).
func main() {
	const (
		pkgPath    = "crypto/md5"
		constraint = "!purego"
	)

	Package(pkgPath)
	ConstraintExpr(constraint)

	block()

	Generate()
}

// MD5 optimized for AMD64.
//
// Author: Marc Bevand <bevand_m (at) epita.fr>
// Licence: I hereby disclaim the copyright on this code and place it
// in the public domain.
//
// block emits the prologue of the assembly "block" function, followed by the
// per-block loop and the epilogue. The prologue loads the "dig" and "p"
// parameters, computes the end pointer for the whole 64-byte blocks of p,
// reads the four 32-bit state words from dig into EAX, EBX, ECX and EDX,
// and skips straight to "end" when p holds no complete block.
func block() {
	Implement("block")
	Attributes(NOSPLIT)
	AllocLocal(8)

	// BP <- dig, SI <- start of p, DX <- len(p).
	Load(Param("dig"), RBP)
	Load(Param("p").Base(), RSI)
	Load(Param("p").Len(), RDX)

	// Clear the low six bits of the length so that only whole 64-byte
	// blocks are counted.
	const blockShift = 6
	SHRQ(Imm(blockShift), RDX)
	SHLQ(Imm(blockShift), RDX)

	// DI <- SI + DX, the address just past the last whole block.
	LEAQ(Mem{Base: SI, Index: DX, Scale: 1}, RDI)

	// Pull the four state words out of the digest.
	digest := Mem{Base: BP}
	for i, r := range []GPPhysical{EAX, EBX, ECX, EDX} {
		MOVL(digest.Offset(i*4), r)
	}

	// R11L keeps an all-ones mask that ROUND2 and the round-4 setup use to
	// form bitwise complements.
	MOVL(Imm(0xffffffff), R11L)

	// No whole block to process: go write the state back unchanged.
	CMPQ(RSI, RDI)
	JEQ(LabelRef("end"))

	loop()
	end()
}

// loop emits the body labelled "loop", which processes one 64-byte block at
// SI and repeats until SI reaches the end pointer in DI.
//
// Register usage, as established by the code below and the ROUND helpers:
//   - EAX, EBX, ECX, EDX hold the working state a, b, c, d.
//   - R12L..R15L hold a copy of the state on entry to the block; it is added
//     back after the 64 steps.
//   - R8L holds the message word for the upcoming step. Each ROUND helper
//     consumes R8L and then reloads it from SI+index*4, so the index passed
//     to a helper is the word needed by the step that FOLLOWS it.
//   - R9L (and R10L in round 2) are scratch registers that each helper
//     expects to be pre-seeded; the MOVL/XORL lines between the round groups
//     do that seeding.
//   - R11L holds 0xffffffff (loaded in block()).
func loop() {
	Label("loop")
	// Save the incoming state so it can be added back after the 64 steps.
	MOVL(EAX, R12L)
	MOVL(EBX, R13L)
	MOVL(ECX, R14L)
	MOVL(EDX, R15L)

	// Preload message word 0 and seed R9L with d for the first ROUND1.
	MOVL(Mem{Base: SI}.Offset(0*4), R8L)
	MOVL(EDX, R9L)

	// Round 1: shifts cycle through 7, 12, 17, 22. The last step prefetches
	// word 1, the first word used by round 2.
	ROUND1(EAX, EBX, ECX, EDX, 1, 0xd76aa478, 7)
	ROUND1(EDX, EAX, EBX, ECX, 2, 0xe8c7b756, 12)
	ROUND1(ECX, EDX, EAX, EBX, 3, 0x242070db, 17)
	ROUND1(EBX, ECX, EDX, EAX, 4, 0xc1bdceee, 22)
	ROUND1(EAX, EBX, ECX, EDX, 5, 0xf57c0faf, 7)
	ROUND1(EDX, EAX, EBX, ECX, 6, 0x4787c62a, 12)
	ROUND1(ECX, EDX, EAX, EBX, 7, 0xa8304613, 17)
	ROUND1(EBX, ECX, EDX, EAX, 8, 0xfd469501, 22)
	ROUND1(EAX, EBX, ECX, EDX, 9, 0x698098d8, 7)
	ROUND1(EDX, EAX, EBX, ECX, 10, 0x8b44f7af, 12)
	ROUND1(ECX, EDX, EAX, EBX, 11, 0xffff5bb1, 17)
	ROUND1(EBX, ECX, EDX, EAX, 12, 0x895cd7be, 22)
	ROUND1(EAX, EBX, ECX, EDX, 13, 0x6b901122, 7)
	ROUND1(EDX, EAX, EBX, ECX, 14, 0xfd987193, 12)
	ROUND1(ECX, EDX, EAX, EBX, 15, 0xa679438e, 17)
	ROUND1(EBX, ECX, EDX, EAX, 1, 0x49b40821, 22)

	// ROUND2 expects both R9L and R10L to hold d on entry.
	MOVL(EDX, R9L)
	MOVL(EDX, R10L)

	// Round 2: shifts cycle through 5, 9, 14, 20. The last step prefetches
	// word 5, the first word used by round 3.
	ROUND2(EAX, EBX, ECX, EDX, 6, 0xf61e2562, 5)
	ROUND2(EDX, EAX, EBX, ECX, 11, 0xc040b340, 9)
	ROUND2(ECX, EDX, EAX, EBX, 0, 0x265e5a51, 14)
	ROUND2(EBX, ECX, EDX, EAX, 5, 0xe9b6c7aa, 20)
	ROUND2(EAX, EBX, ECX, EDX, 10, 0xd62f105d, 5)
	ROUND2(EDX, EAX, EBX, ECX, 15, 0x2441453, 9)
	ROUND2(ECX, EDX, EAX, EBX, 4, 0xd8a1e681, 14)
	ROUND2(EBX, ECX, EDX, EAX, 9, 0xe7d3fbc8, 20)
	ROUND2(EAX, EBX, ECX, EDX, 14, 0x21e1cde6, 5)
	ROUND2(EDX, EAX, EBX, ECX, 3, 0xc33707d6, 9)
	ROUND2(ECX, EDX, EAX, EBX, 8, 0xf4d50d87, 14)
	ROUND2(EBX, ECX, EDX, EAX, 13, 0x455a14ed, 20)
	ROUND2(EAX, EBX, ECX, EDX, 2, 0xa9e3e905, 5)
	ROUND2(EDX, EAX, EBX, ECX, 7, 0xfcefa3f8, 9)
	ROUND2(ECX, EDX, EAX, EBX, 12, 0x676f02d9, 14)
	ROUND2(EBX, ECX, EDX, EAX, 5, 0x8d2a4c8a, 20)

	// NOTE(review): ROUND3FIRST starts by reloading R9L from d, so the value
	// stored here is overwritten before it is read; the instruction is kept
	// so the generated assembly stays unchanged.
	MOVL(ECX, R9L)

	// Round 3: shifts cycle through 4, 11, 16, 23. Only the first step
	// builds b^c^d from scratch; later steps derive it from the value the
	// previous step left in R9L. The last step prefetches word 0, the first
	// word used by round 4.
	ROUND3FIRST(EAX, EBX, ECX, EDX, 8, 0xfffa3942, 4)
	ROUND3(EDX, EAX, EBX, ECX, 11, 0x8771f681, 11)
	ROUND3(ECX, EDX, EAX, EBX, 14, 0x6d9d6122, 16)
	ROUND3(EBX, ECX, EDX, EAX, 1, 0xfde5380c, 23)
	ROUND3(EAX, EBX, ECX, EDX, 4, 0xa4beea44, 4)
	ROUND3(EDX, EAX, EBX, ECX, 7, 0x4bdecfa9, 11)
	ROUND3(ECX, EDX, EAX, EBX, 10, 0xf6bb4b60, 16)
	ROUND3(EBX, ECX, EDX, EAX, 13, 0xbebfbc70, 23)
	ROUND3(EAX, EBX, ECX, EDX, 0, 0x289b7ec6, 4)
	ROUND3(EDX, EAX, EBX, ECX, 3, 0xeaa127fa, 11)
	ROUND3(ECX, EDX, EAX, EBX, 6, 0xd4ef3085, 16)
	ROUND3(EBX, ECX, EDX, EAX, 9, 0x4881d05, 23)
	ROUND3(EAX, EBX, ECX, EDX, 12, 0xd9d4d039, 4)
	ROUND3(EDX, EAX, EBX, ECX, 15, 0xe6db99e5, 11)
	ROUND3(ECX, EDX, EAX, EBX, 2, 0x1fa27cf8, 16)
	ROUND3(EBX, ECX, EDX, EAX, 0, 0xc4ac5665, 23)

	// ROUND4 expects R9L to hold the complement of d: all-ones XOR d.
	MOVL(R11L, R9L)
	XORL(EDX, R9L)

	// Round 4: shifts cycle through 6, 10, 15, 21. The final step's
	// prefetch of word 0 is not consumed within this iteration; the top of
	// the loop reloads R8L after SI has advanced.
	ROUND4(EAX, EBX, ECX, EDX, 7, 0xf4292244, 6)
	ROUND4(EDX, EAX, EBX, ECX, 14, 0x432aff97, 10)
	ROUND4(ECX, EDX, EAX, EBX, 5, 0xab9423a7, 15)
	ROUND4(EBX, ECX, EDX, EAX, 12, 0xfc93a039, 21)
	ROUND4(EAX, EBX, ECX, EDX, 3, 0x655b59c3, 6)
	ROUND4(EDX, EAX, EBX, ECX, 10, 0x8f0ccc92, 10)
	ROUND4(ECX, EDX, EAX, EBX, 1, 0xffeff47d, 15)
	ROUND4(EBX, ECX, EDX, EAX, 8, 0x85845dd1, 21)
	ROUND4(EAX, EBX, ECX, EDX, 15, 0x6fa87e4f, 6)
	ROUND4(EDX, EAX, EBX, ECX, 6, 0xfe2ce6e0, 10)
	ROUND4(ECX, EDX, EAX, EBX, 13, 0xa3014314, 15)
	ROUND4(EBX, ECX, EDX, EAX, 4, 0x4e0811a1, 21)
	ROUND4(EAX, EBX, ECX, EDX, 11, 0xf7537e82, 6)
	ROUND4(EDX, EAX, EBX, ECX, 2, 0xbd3af235, 10)
	ROUND4(ECX, EDX, EAX, EBX, 9, 0x2ad7d2bb, 15)
	ROUND4(EBX, ECX, EDX, EAX, 0, 0xeb86d391, 21)

	// Add the state saved at the top of the block back into the result.
	ADDL(R12L, EAX)
	ADDL(R13L, EBX)
	ADDL(R14L, ECX)
	ADDL(R15L, EDX)

	// Advance to the next 64-byte block and repeat while SI is still below
	// the end pointer in DI.
	ADDQ(Imm(64), RSI)
	CMPQ(RSI, RDI)
	JB(LabelRef("loop"))
}

// end emits the epilogue labelled "end": it stores the four state words
// EAX, EBX, ECX and EDX back to consecutive 32-bit slots at BP and returns.
func end() {
	Label("end")

	digest := Mem{Base: BP}
	for i, r := range []GPPhysical{EAX, EBX, ECX, EDX} {
		MOVL(r, digest.Offset(i*4))
	}

	RET()
}

// ROUND1 emits one step of the first round:
//
//	a = b + rol(a + konst + X + (d ^ (b & (c ^ d))), shift)
//
// X is the message word already sitting in R8L. On entry R9L must hold d.
// On exit R8L holds the message word at SI+index*4 (for the following step)
// and R9L holds c, which is the d argument of the following step.
func ROUND1(a, b, c, d GPPhysical, index int, konst, shift uint64) {
	var (
		word = R8L
		mix  = R9L
		next = Mem{Base: SI}.Offset(index * 4)
	)

	// mix: d -> c^d
	XORL(c, mix)
	ADDL(Imm(konst), a)
	ADDL(word, a)
	// mix: b&(c^d) -> d^(b&(c^d))
	ANDL(b, mix)
	XORL(d, mix)
	// Prefetch the next message word while the mix value is being consumed.
	MOVL(next, word)
	ADDL(mix, a)
	ROLL(Imm(shift), a)
	// Re-seed the scratch register for the following step.
	MOVL(c, mix)
	ADDL(b, a)
}

// ROUND2 emits one step of the second round:
//
//	a = b + rol(a + konst + X + (c &^ d) + (b & d), shift)
//
// X is the message word already sitting in R8L. The two masked terms never
// share a set bit, so they are added to a one after the other instead of
// being OR-ed together first.
// Uses https://github.com/animetosho/md5-optimisation#dependency-shortcut-in-g-function
//
// On entry R9L and R10L must both hold d and R11L must hold 0xffffffff.
// On exit R8L holds the message word at SI+index*4 and R9L and R10L both
// hold c, which is the d argument of the following step.
func ROUND2(a, b, c, d GPPhysical, index int, konst, shift uint64) {
	var (
		word    = R8L
		notD    = R9L
		withD   = R10L
		allOnes = R11L
		next    = Mem{Base: SI}.Offset(index * 4)
	)

	// notD: d -> ^d
	XORL(allOnes, notD)
	ADDL(Imm(konst), a)
	ADDL(word, a)
	// withD: d -> b&d
	ANDL(b, withD)
	// notD: ^d -> c&^d
	ANDL(c, notD)
	MOVL(next, word)
	ADDL(notD, a)
	ADDL(withD, a)
	// Re-seed both scratch registers for the following step.
	MOVL(c, notD)
	MOVL(c, withD)
	ROLL(Imm(shift), a)
	ADDL(b, a)
}

// ROUND3FIRST emits the opening step of the third round:
//
//	a = b + rol(a + konst + X + (b ^ c ^ d), shift)
//
// X is the message word already sitting in R8L. Unlike ROUND3 it builds the
// three-way XOR from scratch and does not depend on the incoming value of
// R9L. On exit R8L holds the message word at SI+index*4 and R9L still holds
// b^c^d (with b, c, d as passed to this step) for the following ROUND3.
// Uses https://github.com/animetosho/md5-optimisation#h-function-re-use
func ROUND3FIRST(a, b, c, d GPPhysical, index int, konst, shift uint64) {
	var (
		word = R8L
		xor3 = R9L
		next = Mem{Base: SI}.Offset(index * 4)
	)

	// xor3 = d ^ c ^ b
	MOVL(d, xor3)
	XORL(c, xor3)
	XORL(b, xor3)
	ADDL(Imm(konst), a)
	ADDL(word, a)
	MOVL(next, word)
	ADDL(xor3, a)
	ROLL(Imm(shift), a)
	ADDL(b, a)
}

// ROUND3 emits a non-initial step of the third round:
//
//	a = b + rol(a + konst + X + (b ^ c ^ d), shift)
//
// X is the message word already sitting in R8L. The XOR is derived from the
// value the previous step left in R9L: XOR-ing in a (before a is modified)
// and then b turns it into this step's b^c^d, provided R9L entered holding
// a^c^d in this step's naming. That is what the previous round-3 step
// leaves behind when the arguments rotate as they do in loop(). The c and d
// parameters are therefore never referenced directly.
//
// On exit R8L holds the message word at SI+index*4 and R9L holds b^c^d.
func ROUND3(a, b, c, d GPPhysical, index int, konst, shift uint64) {
	var (
		word = R8L
		xor3 = R9L
		next = Mem{Base: SI}.Offset(index * 4)
	)

	// Must read a before the additions below change it.
	XORL(a, xor3)
	XORL(b, xor3)
	ADDL(Imm(konst), a)
	ADDL(word, a)
	MOVL(next, word)
	ADDL(xor3, a)
	ROLL(Imm(shift), a)
	ADDL(b, a)
}

// ROUND4 emits one step of the fourth round:
//
//	a = b + rol(a + konst + X + (c ^ (b | ^d)), shift)
//
// X is the message word already sitting in R8L. On entry R9L must hold ^d;
// the d parameter itself is never referenced. On exit R8L holds the message
// word at SI+index*4 and R9L holds ^c, which is the complement of the d
// argument of the following step.
func ROUND4(a, b, c, d GPPhysical, index int, konst, shift uint64) {
	var (
		word = R8L
		mix  = R9L
		next = Mem{Base: SI}.Offset(index * 4)
	)

	ADDL(Imm(konst), a)
	ADDL(word, a)
	// mix: ^d -> b|^d -> c^(b|^d)
	ORL(b, mix)
	XORL(c, mix)
	ADDL(mix, a)
	MOVL(next, word)
	// Rebuild the scratch register as ^c for the following step.
	MOVL(Imm(0xffffffff), mix)
	ROLL(Imm(shift), a)
	XORL(c, mix)
	ADDL(b, a)
}