// Copyright 2016 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package ssa import ( "cmd/compile/internal/ir" "cmd/compile/internal/types" "cmd/internal/obj" "slices" ) // The pair pass finds memory operations that can be paired up // into single 2-register memory instructions. func pair(f *Func) { // Only arm64 for now. This pass is fairly arch-specific. switch f.Config.arch { case "arm64": default: return } pairLoads(f) pairStores(f) } type pairableLoadInfo struct { width int64 // width of one element in the pair, in bytes pair Op } // All pairableLoad ops must take 2 arguments, a pointer and a memory. // They must also take an offset in Aux/AuxInt. var pairableLoads = map[Op]pairableLoadInfo{ OpARM64MOVDload: {8, OpARM64LDP}, OpARM64MOVWUload: {4, OpARM64LDPW}, OpARM64MOVWload: {4, OpARM64LDPSW}, // TODO: conceivably we could pair a signed and unsigned load // if we knew the upper bits of one of them weren't being used. OpARM64FMOVDload: {8, OpARM64FLDPD}, OpARM64FMOVSload: {4, OpARM64FLDPS}, } type pairableStoreInfo struct { width int64 // width of one element in the pair, in bytes pair Op } // All pairableStore keys must take 3 arguments, a pointer, a value, and a memory. // All pairableStore values must take 4 arguments, a pointer, 2 values, and a memory. // They must also take an offset in Aux/AuxInt. var pairableStores = map[Op]pairableStoreInfo{ OpARM64MOVDstore: {8, OpARM64STP}, OpARM64MOVWstore: {4, OpARM64STPW}, OpARM64FMOVDstore: {8, OpARM64FSTPD}, OpARM64FMOVSstore: {4, OpARM64FSTPS}, } // offsetOk returns true if a pair instruction should be used // for the offset Aux+off, when the data width (of the // unpaired instructions) is width. // This function is best-effort. The compiled function must // still work if offsetOk always returns true. // TODO: this is currently arm64-specific. func offsetOk(aux Aux, off, width int64) bool { if true { // Seems to generate slightly smaller code if we just // always allow this rewrite. // // Without pairing, we have 2 load instructions, like: // LDR 88(R0), R1 // LDR 96(R0), R2 // with pairing we have, best case: // LDP 88(R0), R1, R2 // but maybe we need an adjuster if out of range or unaligned: // ADD R0, $88, R27 // LDP (R27), R1, R2 // Even with the adjuster, it is at least no worse. // // A similar situation occurs when accessing globals. // Two loads from globals requires 4 instructions, // two ADRP and two LDR. With pairing, we need // ADRP+ADD+LDP, three instructions. // // With pairing, it looks like the critical path might // be a little bit longer. But it should never be more // instructions. // TODO: see if that longer critical path causes any // regressions. return true } if aux != nil { if _, ok := aux.(*ir.Name); !ok { // Offset is probably too big (globals). return false } // We let *ir.Names pass here, as // they are probably small offsets from SP. // There's no guarantee that we're in range // in that case though (we don't know the // stack frame size yet), so the assembler // might need to issue fixup instructions. // Assume some small frame size. if off >= 0 { off += 120 } // TODO: figure out how often this helps vs. hurts. } switch width { case 4: if off >= -256 && off <= 252 && off%4 == 0 { return true } case 8: if off >= -512 && off <= 504 && off%8 == 0 { return true } } return false } func pairLoads(f *Func) { var loads []*Value // Registry of aux values for sorting. auxIDs := map[Aux]int{} auxID := func(aux Aux) int { id, ok := auxIDs[aux] if !ok { id = len(auxIDs) auxIDs[aux] = id } return id } for _, b := range f.Blocks { // Find loads. loads = loads[:0] clear(auxIDs) for _, v := range b.Values { info := pairableLoads[v.Op] if info.width == 0 { continue // not pairable } if !offsetOk(v.Aux, v.AuxInt, info.width) { continue // not advisable } loads = append(loads, v) } if len(loads) < 2 { continue } // Sort to put pairable loads together. slices.SortFunc(loads, func(x, y *Value) int { // First sort by op, ptr, and memory arg. if x.Op != y.Op { return int(x.Op - y.Op) } if x.Args[0].ID != y.Args[0].ID { return int(x.Args[0].ID - y.Args[0].ID) } if x.Args[1].ID != y.Args[1].ID { return int(x.Args[1].ID - y.Args[1].ID) } // Then sort by aux. (nil first, then by aux ID) if x.Aux != nil { if y.Aux == nil { return 1 } a, b := auxID(x.Aux), auxID(y.Aux) if a != b { return a - b } } else if y.Aux != nil { return -1 } // Then sort by offset, low to high. return int(x.AuxInt - y.AuxInt) }) // Look for pairable loads. for i := 0; i < len(loads)-1; i++ { x := loads[i] y := loads[i+1] if x.Op != y.Op || x.Args[0] != y.Args[0] || x.Args[1] != y.Args[1] { continue } if x.Aux != y.Aux { continue } if x.AuxInt+pairableLoads[x.Op].width != y.AuxInt { continue } // Commit point. // Make the 2-register load. load := b.NewValue2IA(x.Pos, pairableLoads[x.Op].pair, types.NewTuple(x.Type, y.Type), x.AuxInt, x.Aux, x.Args[0], x.Args[1]) // Modify x to be (Select0 load). Similar for y. x.reset(OpSelect0) x.SetArgs1(load) y.reset(OpSelect1) y.SetArgs1(load) i++ // Skip y next time around the loop. } } // Try to pair a load with a load from a subsequent block. // Note that this is always safe to do if the memory arguments match. // (But see the memory barrier case below.) type nextBlockKey struct { op Op ptr ID mem ID auxInt int64 aux any } nextBlock := map[nextBlockKey]*Value{} for _, b := range f.Blocks { if memoryBarrierTest(b) { // TODO: Do we really need to skip write barrier test blocks? // type T struct { // a *byte // b int // } // func f(t *T) int { // r := t.b // t.a = nil // return r // } // This would issue a single LDP for both the t.a and t.b fields, // *before* we check the write barrier flag. (We load the t.a field // to put it in the write barrier buffer.) Not sure if that is ok. continue } // Find loads in the next block(s) that we can move to this one. // TODO: could maybe look further than just one successor hop. clear(nextBlock) for _, e := range b.Succs { if len(e.b.Preds) > 1 { continue } for _, v := range e.b.Values { info := pairableLoads[v.Op] if info.width == 0 { continue } if !offsetOk(v.Aux, v.AuxInt, info.width) { continue // not advisable } nextBlock[nextBlockKey{op: v.Op, ptr: v.Args[0].ID, mem: v.Args[1].ID, auxInt: v.AuxInt, aux: v.Aux}] = v } } if len(nextBlock) == 0 { continue } // don't move too many loads. Each requires a register across a basic block boundary. const maxMoved = 4 nMoved := 0 for i := len(b.Values) - 1; i >= 0 && nMoved < maxMoved; i-- { x := b.Values[i] info := pairableLoads[x.Op] if info.width == 0 { continue } if !offsetOk(x.Aux, x.AuxInt, info.width) { continue // not advisable } key := nextBlockKey{op: x.Op, ptr: x.Args[0].ID, mem: x.Args[1].ID, auxInt: x.AuxInt + info.width, aux: x.Aux} if y := nextBlock[key]; y != nil { delete(nextBlock, key) // Make the 2-register load. load := b.NewValue2IA(x.Pos, info.pair, types.NewTuple(x.Type, y.Type), x.AuxInt, x.Aux, x.Args[0], x.Args[1]) // Modify x to be (Select0 load). x.reset(OpSelect0) x.SetArgs1(load) // Modify y to be (Copy (Select1 load)). // Note: the Select* needs to live in the load's block, not y's block. y.reset(OpCopy) y.SetArgs1(b.NewValue1(y.Pos, OpSelect1, y.Type, load)) nMoved++ continue } key.auxInt = x.AuxInt - info.width if y := nextBlock[key]; y != nil { delete(nextBlock, key) // Make the 2-register load. load := b.NewValue2IA(x.Pos, info.pair, types.NewTuple(y.Type, x.Type), y.AuxInt, x.Aux, x.Args[0], x.Args[1]) // Modify x to be (Select1 load). x.reset(OpSelect1) x.SetArgs1(load) // Modify y to be (Copy (Select0 load)). y.reset(OpCopy) y.SetArgs1(b.NewValue1(y.Pos, OpSelect0, y.Type, load)) nMoved++ continue } } } } func memoryBarrierTest(b *Block) bool { if b.Kind != BlockARM64NZW { return false } c := b.Controls[0] if c.Op != OpARM64MOVWUload { return false } if globl, ok := c.Aux.(*obj.LSym); ok { return globl.Name == "runtime.writeBarrier" } return false } // pairStores merges store instructions. // It collects stores into a buffer where they can be freely reordered. // When encountering an instruction that cannot be added to the buffer, // it pairs the accumulated stores, flushes the buffer, and continues processing. func pairStores(f *Func) { last := f.Cache.allocBoolSlice(f.NumValues()) defer f.Cache.freeBoolSlice(last) // memChain contains a list of stores with the same ptr/aux pair and // nonoverlapping write ranges [AuxInt:AuxInt+writeSize]. All of the // elements of memChain can be reordered with each other. memChain := []*Value{} // Limit of length of memChain array. // This keeps us in O(n) territory. limit := 100 // flushMemChain sorts the stores in memChain and merges them when possible. // Then it flushes memChain. flushMemChain := func() { if len(memChain) < 2 { memChain = memChain[:0] return } // Sort in increasing AuxInt to put pairable stores together. slices.SortFunc(memChain, func(x, y *Value) int { return int(x.AuxInt - y.AuxInt) }) lastIdx := len(memChain) - 1 for i := 0; i < lastIdx; i++ { v := memChain[i] w := memChain[i+1] info := pairableStores[v.Op] off := v.AuxInt mem := v.MemoryArg() aux := v.Aux pos := v.Pos wmem := w.MemoryArg() if w.Op == v.Op && w.AuxInt == off+info.width { // Arguments for the merged store: ptr, val1, val2, mem. args := []*Value{v.Args[0], v.Args[1], w.Args[1], mem} v.reset(info.pair) v.AddArgs(args...) v.Aux = aux v.AuxInt = off v.Pos = pos // Make w just a memory copy. w.reset(OpCopy) w.SetArgs1(wmem) // Skip merged store (w) i++ } } memChain = memChain[:0] } // prevStore returns the previous store in the // same block, or nil if there are none. prevStore := func(v *Value) *Value { if v.Op == OpInitMem || v.Op == OpPhi { return nil } m := v.MemoryArg() if m.Block != v.Block { return nil } return m } // storeWidth returns the width of store, // or 0 if it is not a store this pass understands. storeWidth := func(op Op) int64 { var width int64 switch op { case OpARM64MOVDstore, OpARM64FMOVDstore: width = 8 case OpARM64MOVWstore, OpARM64FMOVSstore: width = 4 case OpARM64MOVHstore: width = 2 case OpARM64MOVBstore: width = 1 default: width = 0 } return width } for _, b := range f.Blocks { memChain = memChain[:0] // Find last store in block, so we can // walk the stores last to first. // Last to first helps ensure that the rewrites we // perform do not get in the way of subsequent rewrites. for _, v := range b.Values { if v.Type.IsMemory() { last[v.ID] = true } } for _, v := range b.Values { if v.Type.IsMemory() { if m := prevStore(v); m != nil { last[m.ID] = false } } } var lastMem *Value for _, v := range b.Values { if last[v.ID] { lastMem = v break } } // Iterate over memory stores, accumulating them in memChain for potential merging. // Flush the chain when reordering is unsafe or a conflict is detected. for v := lastMem; v != nil; v = prevStore(v) { writeSize := storeWidth(v.Op) if writeSize == 0 { // We can't reorder stores with calls or other instructions // with writeSize == 0. flushMemChain() continue } if v.Uses != 1 && len(memChain) > 0 || len(memChain) > 0 && (v.Args[0] != memChain[0].Args[0] || v.Aux != memChain[0].Aux) || len(memChain) == limit { // If v has multiple uses and it is not the latest store in the chain, // we cannot merge it with other store instructions. // If v has a different base pointer or Aux value from the current chain, // we need to flush memChain and start a new one with v. // If memChain length limit is exceeded, we also need to flush the chain // and start a new one with v. // Only look back so far. // This keeps us in O(n) territory, and it // also prevents us from keeping values // in registers for too long (and thus // needing to spill them). flushMemChain() } for _, w := range memChain { wWriteSize := storeWidth(w.Op) if overlap(w.AuxInt, wWriteSize, v.AuxInt, writeSize) { // Aliases with w's location. // Flush the chain and start a new one with v. flushMemChain() break } } memChain = append(memChain, v) } flushMemChain() } }