//===---- LoongArchMergeBaseOffset.cpp - Optimise address calculations ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Merge the offset of address calculation into the offset field
// of instructions in a global address lowering sequence.
//
//===----------------------------------------------------------------------===//

#include "LoongArch.h"
#include "LoongArchTargetMachine.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetOptions.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "loongarch-merge-base-offset"
#define LoongArch_MERGE_BASE_OFFSET_NAME "LoongArch Merge Base Offset"

namespace {

class LoongArchMergeBaseOffsetOpt : public MachineFunctionPass {
  const LoongArchSubtarget *ST = nullptr;
  MachineRegisterInfo *MRI;

public:
  static char ID;
  bool runOnMachineFunction(MachineFunction &Fn) override;
  bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Lo12,
                      MachineInstr *&Lo20, MachineInstr *&Hi12,
                      MachineInstr *&Last);
  bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Add,
                      MachineInstr *&Lo12);

  bool detectAndFoldOffset(MachineInstr &Hi20, MachineInstr &Lo12,
                           MachineInstr *&Lo20, MachineInstr *&Hi12,
                           MachineInstr *&Last);
  void foldOffset(MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
                  MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail,
                  int64_t Offset);
  bool foldLargeOffset(MachineInstr &Hi20, MachineInstr &Lo12,
                       MachineInstr *&Lo20, MachineInstr *&Hi12,
                       MachineInstr *&Last, MachineInstr &TailAdd,
                       Register GAReg);

  bool foldIntoMemoryOps(MachineInstr &Hi20, MachineInstr &Lo12,
                         MachineInstr *&Lo20, MachineInstr *&Hi12,
                         MachineInstr *&Last);

  LoongArchMergeBaseOffsetOpt() : MachineFunctionPass(ID) {}

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().setIsSSA();
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return LoongArch_MERGE_BASE_OFFSET_NAME;
  }
};
} // end anonymous namespace

char LoongArchMergeBaseOffsetOpt::ID = 0;
INITIALIZE_PASS(LoongArchMergeBaseOffsetOpt, DEBUG_TYPE,
                LoongArch_MERGE_BASE_OFFSET_NAME, false, false)

// Detect either of the patterns:
//
// 1. (small/medium):
//   pcalau12i vreg1, %pc_hi20(s)
//   addi.d vreg2, vreg1, %pc_lo12(s)
//
// 2. (large):
//   pcalau12i vreg1, %pc_hi20(s)
//   addi.d vreg2, $zero, %pc_lo12(s)
//   lu32i.d vreg3, vreg2, %pc64_lo20(s)
//   lu52i.d vreg4, vreg3, %pc64_hi12(s)
//   add.d vreg5, vreg4, vreg1
// The pattern is only accepted if:
//   1) For the small and medium patterns, the first instruction has only one
//      use, which is the ADDI.
//   2) For the large pattern, the first four instructions each have only one
//      use, and the user of the fourth instruction is the ADD.
//   3) The address operands have the appropriate type, reflecting the
//      lowering of a global address or constant pool using the pattern.
//   4) The offset value in the Global Address or Constant Pool is 0.
bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,
                                                 MachineInstr *&Lo12,
                                                 MachineInstr *&Lo20,
                                                 MachineInstr *&Hi12,
                                                 MachineInstr *&Last) {
  if (Hi20.getOpcode() != LoongArch::PCALAU12I)
    return false;

  const MachineOperand &Hi20Op1 = Hi20.getOperand(1);
  if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_PCREL_HI)
    return false;

  auto isGlobalOrCPIOrBlockAddress = [](const MachineOperand &Op) {
    return Op.isGlobal() || Op.isCPI() || Op.isBlockAddress();
  };

  if (!isGlobalOrCPIOrBlockAddress(Hi20Op1) || Hi20Op1.getOffset() != 0)
    return false;

  Register HiDestReg = Hi20.getOperand(0).getReg();
  if (!MRI->hasOneUse(HiDestReg))
    return false;

  MachineInstr *UseInst = &*MRI->use_instr_begin(HiDestReg);
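  // The single user tells the patterns apart: an ADD_D user means the large
  // code-model sequence; any other user must be the small/medium ADDI.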
  if (UseInst->getOpcode() != LoongArch::ADD_D) {
    Lo12 = UseInst;
    if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) ||
        (!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W))
      return false;
  } else {
    assert(ST->is64Bit());
    Last = UseInst;

    Register LastOp1Reg = Last->getOperand(1).getReg();
    if (!LastOp1Reg.isVirtual())
      return false;
    Hi12 = MRI->getVRegDef(LastOp1Reg);
    const MachineOperand &Hi12Op2 = Hi12->getOperand(2);
    if (Hi12Op2.getTargetFlags() != LoongArchII::MO_PCREL64_HI)
      return false;
    if (!isGlobalOrCPIOrBlockAddress(Hi12Op2) || Hi12Op2.getOffset() != 0)
      return false;
    if (!MRI->hasOneUse(Hi12->getOperand(0).getReg()))
      return false;

    Lo20 = MRI->getVRegDef(Hi12->getOperand(1).getReg());
    const MachineOperand &Lo20Op2 = Lo20->getOperand(2);
    if (Lo20Op2.getTargetFlags() != LoongArchII::MO_PCREL64_LO)
      return false;
    if (!isGlobalOrCPIOrBlockAddress(Lo20Op2) || Lo20Op2.getOffset() != 0)
      return false;
    if (!MRI->hasOneUse(Lo20->getOperand(0).getReg()))
      return false;

    Lo12 = MRI->getVRegDef(Lo20->getOperand(1).getReg());
    if (!MRI->hasOneUse(Lo12->getOperand(0).getReg()))
      return false;
  }

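  // Both paths converge here with Lo12 set; check that it carries the
  // matching %pc_lo12 operand with a zero offset.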
  const MachineOperand &Lo12Op2 = Lo12->getOperand(2);
  assert(Hi20.getOpcode() == LoongArch::PCALAU12I);
  if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_PCREL_LO ||
      !(isGlobalOrCPIOrBlockAddress(Lo12Op2) || Lo12Op2.isMCSymbol()) ||
      Lo12Op2.getOffset() != 0)
    return false;

  if (Hi20Op1.isGlobal()) {
    LLVM_DEBUG(dbgs() << "  Found lowered global address: "
                      << *Hi20Op1.getGlobal() << "\n");
  } else if (Hi20Op1.isBlockAddress()) {
    LLVM_DEBUG(dbgs() << "  Found lowered basic address: "
                      << *Hi20Op1.getBlockAddress() << "\n");
  } else if (Hi20Op1.isCPI()) {
    LLVM_DEBUG(dbgs() << "  Found lowered constant pool: " << Hi20Op1.getIndex()
                      << "\n");
  }

  return true;
}

// Detect the pattern:
//
// (small/medium):
//   lu12i.w vreg1, %le_hi20_r(s)
//   add.w/d vreg2, vreg1, r2, %le_add_r(s)
//   addi.w/d vreg3, vreg2, %le_lo12_r(s)

// The pattern is only accepted if:
//   1) The first instruction has only one use, which is the PseudoAddTPRel.
//      The second instruction has only one use, which is the ADDI. The
//      second instruction's last operand is the tp register.
//   2) The address operands have the appropriate type, reflecting the
//      lowering of a thread_local global address using the pattern.
//   3) The offset value in the ThreadLocal Global Address is 0.
bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,
                                                 MachineInstr *&Add,
                                                 MachineInstr *&Lo12) {
  if (Hi20.getOpcode() != LoongArch::LU12I_W)
    return false;

  auto isGlobalOrCPI = [](const MachineOperand &Op) {
    return Op.isGlobal() || Op.isCPI();
  };

  const MachineOperand &Hi20Op1 = Hi20.getOperand(1);
  if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_LE_HI_R ||
      !isGlobalOrCPI(Hi20Op1) || Hi20Op1.getOffset() != 0)
    return false;

  Register HiDestReg = Hi20.getOperand(0).getReg();
  if (!MRI->hasOneUse(HiDestReg))
    return false;

  Add = &*MRI->use_instr_begin(HiDestReg);
  if ((ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_D) ||
      (!ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_W))
    return false;

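  // R2 is the thread-pointer register ($tp) on LoongArch; tls-le addresses
  // are computed relative to it.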
  if (Add->getOperand(2).getReg() != LoongArch::R2)
    return false;

  const MachineOperand &AddOp3 = Add->getOperand(3);
  if (LoongArchII::getDirectFlags(AddOp3) != LoongArchII::MO_LE_ADD_R ||
      !(isGlobalOrCPI(AddOp3) || AddOp3.isMCSymbol()) ||
      AddOp3.getOffset() != 0)
    return false;

  Register AddDestReg = Add->getOperand(0).getReg();
  if (!MRI->hasOneUse(AddDestReg))
    return false;

  Lo12 = &*MRI->use_instr_begin(AddDestReg);
  if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) ||
      (!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W))
    return false;

  const MachineOperand &Lo12Op2 = Lo12->getOperand(2);
  if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_LE_LO_R ||
      !(isGlobalOrCPI(Lo12Op2) || Lo12Op2.isMCSymbol()) ||
      Lo12Op2.getOffset() != 0)
    return false;

  if (Hi20Op1.isGlobal()) {
    LLVM_DEBUG(dbgs() << "  Found lowered global address: "
                      << *Hi20Op1.getGlobal() << "\n");
  } else if (Hi20Op1.isCPI()) {
    LLVM_DEBUG(dbgs() << "  Found lowered constant pool: " << Hi20Op1.getIndex()
                      << "\n");
  }

  return true;
}

// Update the offset in Hi20, (Add), Lo12, (Lo20 and Hi12) instructions.
// Delete the tail instruction and update all the uses to use the
// output from Last.
void LoongArchMergeBaseOffsetOpt::foldOffset(
    MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
    MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail,
    int64_t Offset) {
  // Put the offset back into Hi20 and Lo12.
  Hi20.getOperand(1).setOffset(Offset);
  Lo12.getOperand(2).setOffset(Offset);
  if (Lo20 && Hi12) {
    Lo20->getOperand(2).setOffset(Offset);
    Hi12->getOperand(2).setOffset(Offset);
  }

  // For tls-le, offset of the second PseudoAddTPRel instr should also be
  // updated.
  MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
  if (Hi20.getOpcode() == LoongArch::LU12I_W)
    Add->getOperand(3).setOffset(Offset);

  // Delete the tail instruction.
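  // Def is the instruction that now produces the final address: the ADD for
  // the large pattern, otherwise the ADDI.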
  MachineInstr *Def = Last ? Last : &Lo12;
  MRI->constrainRegClass(Def->getOperand(0).getReg(),
                         MRI->getRegClass(Tail.getOperand(0).getReg()));
  MRI->replaceRegWith(Tail.getOperand(0).getReg(), Def->getOperand(0).getReg());
  Tail.eraseFromParent();

  LLVM_DEBUG(dbgs() << "  Merged offset " << Offset << " into base.\n"
                    << "     " << Hi20;);
  if (Hi20.getOpcode() == LoongArch::LU12I_W) {
    LLVM_DEBUG(dbgs() << "     " << *Add;);
  }
  LLVM_DEBUG(dbgs() << "     " << Lo12;);
  if (Lo20 && Hi12) {
    LLVM_DEBUG(dbgs() << "     " << *Lo20 << "     " << *Hi12;);
  }
}

// Detect patterns for large offsets that are passed into an ADD instruction.
// If the pattern is found, updates the offset in Hi20, (Add), Lo12,
// (Lo20 and Hi12) instructions and deletes TailAdd and the instructions that
// produced the offset.
//
// (The instructions marked with "!" are not necessarily present.)
//
// Base address lowering is of the form:
//   1) pcala:
//        Hi20: pcalau12i vreg1, %pc_hi20(s)
//  +---- Lo12: addi.d vreg2, vreg1, %pc_lo12(s)
//  |     Lo20: lu32i.d vreg2, %pc64_lo20(s)            !
//  +---- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s)     !
//  |
//  |  2) tls-le:
//  |     Hi20: lu12i.w vreg1, %le_hi20_r(s)
//  |     Add:  add.w/d vreg1, vreg1, r2, %le_add_r(s)
//  +---- Lo12: addi.w/d vreg2, vreg1, %le_lo12_r(s)
//  |
//  | The large offset can be one of the forms:
//  |
//  +-> 1) Offset that has non-zero bits in Hi20 and Lo12 bits:
//  |      OffsetHi20: lu12i.w vreg3, 4
//  |      OffsetLo12: ori voff, vreg3, 188      ------------------+
//  |                                                              |
//  +-> 2) Offset that has non-zero bits in Hi20 bits only:        |
//  |      OffsetHi20: lu12i.w voff, 128         ------------------+
//  |                                                              |
//  +-> 3) Offset that has non-zero bits in Lo20 bits:             |
//  |      OffsetHi20: lu12i.w vreg3, 121        !                 |
//  |      OffsetLo12: ori voff, vreg3, 122      !                 |
//  |      OffsetLo20: lu32i.d voff, 123         ------------------+
//  +-> 4) Offset that has non-zero bits in Hi12 bits:             |
//         OffsetHi20: lu12i.w vreg3, 121        !                 |
//         OffsetLo12: ori voff, vreg3, 122      !                 |
//         OffsetLo20: lu32i.d vreg3, 123        !                 |
//         OffsetHi12: lu52i.d voff, vreg3, 124  ------------------+
//                                                                 |
//   TailAdd: add.d vreg4, vreg2, voff          <------------------+
//
bool LoongArchMergeBaseOffsetOpt::foldLargeOffset(
    MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
    MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &TailAdd,
    Register GAReg) {
  assert((TailAdd.getOpcode() == LoongArch::ADD_W ||
          TailAdd.getOpcode() == LoongArch::ADD_D) &&
         "Expected ADD instruction!");
  Register Rs = TailAdd.getOperand(1).getReg();
  Register Rt = TailAdd.getOperand(2).getReg();
  Register Reg = Rs == GAReg ? Rt : Rs;
  SmallVector<MachineInstr *, 4> Instrs;
  int64_t Offset = 0;
  int64_t Mask = -1;

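  // Walk the use-def chain of the offset register, outermost instruction
  // first. Mask records which high bit ranges LU52I.D/LU32I.D have already
  // contributed, so the sign-extension spill of an inner instruction's
  // immediate is not double-counted into Offset.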
  // This can point to one of [ORI, LU12I.W, LU32I.D, LU52I.D]:
  for (int i = 0; i < 4; i++) {
    // Stop once the chain reaches R0 (the zero register).
    if (Reg == LoongArch::R0)
      break;

    // Can't fold if the register has more than one use.
    if (!Reg.isVirtual() || !MRI->hasOneUse(Reg))
      return false;

    MachineInstr *Curr = MRI->getVRegDef(Reg);
    if (!Curr)
      break;

    switch (Curr->getOpcode()) {
    default:
      // Can't fold if the instruction opcode is unexpected.
      return false;
    case LoongArch::ORI: {
      MachineOperand ImmOp = Curr->getOperand(2);
      if (ImmOp.getTargetFlags() != LoongArchII::MO_None)
        return false;
      Offset += ImmOp.getImm();
      Reg = Curr->getOperand(1).getReg();
      Instrs.push_back(Curr);
      break;
    }
    case LoongArch::LU12I_W: {
      MachineOperand ImmOp = Curr->getOperand(1);
      if (ImmOp.getTargetFlags() != LoongArchII::MO_None)
        return false;
      Offset += SignExtend64<32>(ImmOp.getImm() << 12) & Mask;
      Reg = LoongArch::R0;
      Instrs.push_back(Curr);
      break;
    }
    case LoongArch::LU32I_D: {
      MachineOperand ImmOp = Curr->getOperand(2);
      if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Lo20)
        return false;
      Offset += SignExtend64<52>(ImmOp.getImm() << 32) & Mask;
      Mask ^= 0x000FFFFF00000000ULL;
      Reg = Curr->getOperand(1).getReg();
      Instrs.push_back(Curr);
      break;
    }
    case LoongArch::LU52I_D: {
      MachineOperand ImmOp = Curr->getOperand(2);
      if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Hi12)
        return false;
      Offset += ImmOp.getImm() << 52;
      Mask ^= 0xFFF0000000000000ULL;
      Reg = Curr->getOperand(1).getReg();
      Instrs.push_back(Curr);
      break;
    }
    }
  }

  // Can't fold if the offset is not extracted.
  if (!Offset)
    return false;

  foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailAdd, Offset);
  LLVM_DEBUG(dbgs() << "  Offset Instrs:\n");
  for (auto I : Instrs) {
    LLVM_DEBUG(dbgs() << "    " << *I);
    I->eraseFromParent();
  }

  return true;
}

bool LoongArchMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &Hi20,
                                                      MachineInstr &Lo12,
                                                      MachineInstr *&Lo20,
                                                      MachineInstr *&Hi12,
                                                      MachineInstr *&Last) {
  Register DestReg =
      Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg();

  // Look for arithmetic instructions we can get an offset from.
  // We might be able to remove the arithmetic instructions by folding the
  // offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I) or
  // LU12I_W+PseudoAddTPRel+ADDI.
  if (!MRI->hasOneUse(DestReg))
    return false;

  // DestReg has only one use.
  MachineInstr &Tail = *MRI->use_instr_begin(DestReg);
  switch (Tail.getOpcode()) {
  default:
    LLVM_DEBUG(dbgs() << "Don't know how to get offset from this instr:"
                      << Tail);
    break;
  case LoongArch::ADDI_W:
    if (ST->is64Bit())
      return false;
    [[fallthrough]];
  case LoongArch::ADDI_D:
  case LoongArch::ADDU16I_D: {
    // Offset is simply an immediate operand.
    int64_t Offset = Tail.getOperand(2).getImm();
    if (Tail.getOpcode() == LoongArch::ADDU16I_D)
      Offset = SignExtend64<32>(Offset << 16);

    // We might have two ADDIs in a row.
    Register TailDestReg = Tail.getOperand(0).getReg();
    if (MRI->hasOneUse(TailDestReg)) {
      MachineInstr &TailTail = *MRI->use_instr_begin(TailDestReg);
      if (ST->is64Bit() && TailTail.getOpcode() == LoongArch::ADDI_W)
        return false;
      if (TailTail.getOpcode() == LoongArch::ADDI_W ||
          TailTail.getOpcode() == LoongArch::ADDI_D) {
        Offset += TailTail.getOperand(2).getImm();
        LLVM_DEBUG(dbgs() << "  Offset Instrs: " << Tail << TailTail);
        foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailTail, Offset);
        Tail.eraseFromParent();
        return true;
      }
    }

    LLVM_DEBUG(dbgs() << "  Offset Instr: " << Tail);
    foldOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, Offset);
    return true;
  }
  case LoongArch::ADD_W:
    if (ST->is64Bit())
      return false;
    [[fallthrough]];
  case LoongArch::ADD_D:
    // The offset is too large to fit in the immediate field of ADDI.
    return foldLargeOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, DestReg);
  }

  return false;
}

// Memory access opcode mapping for transforms.
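// For the small/medium and tls-le patterns the folded address still feeds a
// reg+imm memory op, so opcodes are kept. For the large pattern the final ADD
// is deleted and the address arrives in two registers, so each memory op is
// rewritten to its indexed (reg+reg) form, e.g. LD_W -> LDX_W.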
static unsigned getNewOpc(unsigned Op, bool isLarge) {
  switch (Op) {
  case LoongArch::LD_B:
    return isLarge ? LoongArch::LDX_B : LoongArch::LD_B;
  case LoongArch::LD_H:
    return isLarge ? LoongArch::LDX_H : LoongArch::LD_H;
  case LoongArch::LD_W:
  case LoongArch::LDPTR_W:
    return isLarge ? LoongArch::LDX_W : LoongArch::LD_W;
  case LoongArch::LD_D:
  case LoongArch::LDPTR_D:
    return isLarge ? LoongArch::LDX_D : LoongArch::LD_D;
  case LoongArch::LD_BU:
    return isLarge ? LoongArch::LDX_BU : LoongArch::LD_BU;
  case LoongArch::LD_HU:
    return isLarge ? LoongArch::LDX_HU : LoongArch::LD_HU;
  case LoongArch::LD_WU:
    return isLarge ? LoongArch::LDX_WU : LoongArch::LD_WU;
  case LoongArch::FLD_S:
    return isLarge ? LoongArch::FLDX_S : LoongArch::FLD_S;
  case LoongArch::FLD_D:
    return isLarge ? LoongArch::FLDX_D : LoongArch::FLD_D;
  case LoongArch::VLD:
    return isLarge ? LoongArch::VLDX : LoongArch::VLD;
  case LoongArch::XVLD:
    return isLarge ? LoongArch::XVLDX : LoongArch::XVLD;
  case LoongArch::VLDREPL_B:
    return LoongArch::VLDREPL_B;
  case LoongArch::XVLDREPL_B:
    return LoongArch::XVLDREPL_B;
  case LoongArch::ST_B:
    return isLarge ? LoongArch::STX_B : LoongArch::ST_B;
  case LoongArch::ST_H:
    return isLarge ? LoongArch::STX_H : LoongArch::ST_H;
  case LoongArch::ST_W:
  case LoongArch::STPTR_W:
    return isLarge ? LoongArch::STX_W : LoongArch::ST_W;
  case LoongArch::ST_D:
  case LoongArch::STPTR_D:
    return isLarge ? LoongArch::STX_D : LoongArch::ST_D;
  case LoongArch::FST_S:
    return isLarge ? LoongArch::FSTX_S : LoongArch::FST_S;
  case LoongArch::FST_D:
    return isLarge ? LoongArch::FSTX_D : LoongArch::FST_D;
  case LoongArch::VST:
    return isLarge ? LoongArch::VSTX : LoongArch::VST;
  case LoongArch::XVST:
    return isLarge ? LoongArch::XVSTX : LoongArch::XVST;
  default:
    llvm_unreachable("Unexpected opcode for replacement");
  }
}

bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
                                                    MachineInstr &Lo12,
                                                    MachineInstr *&Lo20,
                                                    MachineInstr *&Hi12,
                                                    MachineInstr *&Last) {
  Register DestReg =
      Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg();

  // If all the uses are memory ops with the same offset, we can transform:
  //
  // 1. (small/medium):
  //  1.1. pcala
  //   pcalau12i vreg1, %pc_hi20(s)
  //   addi.d vreg2, vreg1, %pc_lo12(s)
  //   ld.w vreg3, 8(vreg2)
  //
  //   =>
  //
  //   pcalau12i vreg1, %pc_hi20(s+8)
  //   ld.w vreg3, %pc_lo12(s+8)(vreg1)
  //
  //  1.2. tls-le
  //   lu12i.w vreg1, %le_hi20_r(s)
  //   add.w/d vreg2, vreg1, r2, %le_add_r(s)
  //   addi.w/d vreg3, vreg2, %le_lo12_r(s)
  //   ld.w vreg4, 8(vreg3)
  //
  //   =>
  //
  //   lu12i.w vreg1, %le_hi20_r(s+8)
  //   add.w/d vreg2, vreg1, r2, %le_add_r(s+8)
  //   ld.w vreg4, %le_lo12_r(s+8)(vreg2)
  //
  // 2. (large):
  //   pcalau12i vreg1, %pc_hi20(s)
  //   addi.d vreg2, $zero, %pc_lo12(s)
  //   lu32i.d vreg3, vreg2, %pc64_lo20(s)
  //   lu52i.d vreg4, vreg3, %pc64_hi12(s)
  //   add.d vreg5, vreg4, vreg1
  //   ld.w vreg6, 8(vreg5)
  //
  //   =>
  //
  //   pcalau12i vreg1, %pc_hi20(s+8)
  //   addi.d vreg2, $zero, %pc_lo12(s+8)
  //   lu32i.d vreg3, vreg2, %pc64_lo20(s+8)
  //   lu52i.d vreg4, vreg3, %pc64_hi12(s+8)
  //   ldx.w vreg6, vreg4, vreg1

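  // Track, for each inline-asm user, which operand indexes hold the memory
  // constraint's base register so those operands can be rewritten once a
  // common offset has been confirmed.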
  std::optional<int64_t> CommonOffset;
  DenseMap<const MachineInstr *, SmallVector<unsigned>>
      InlineAsmMemoryOpIndexesMap;
  for (const MachineInstr &UseMI : MRI->use_instructions(DestReg)) {
    switch (UseMI.getOpcode()) {
    default:
      LLVM_DEBUG(dbgs() << "Not a load or store instruction: " << UseMI);
      return false;
    case LoongArch::VLDREPL_B:
    case LoongArch::XVLDREPL_B:
      // We can't do this for the large pattern.
      if (Last)
        return false;
      [[fallthrough]];
    case LoongArch::LD_B:
    case LoongArch::LD_H:
    case LoongArch::LD_W:
    case LoongArch::LD_D:
    case LoongArch::LD_BU:
    case LoongArch::LD_HU:
    case LoongArch::LD_WU:
    case LoongArch::LDPTR_W:
    case LoongArch::LDPTR_D:
    case LoongArch::FLD_S:
    case LoongArch::FLD_D:
    case LoongArch::VLD:
    case LoongArch::XVLD:
    case LoongArch::ST_B:
    case LoongArch::ST_H:
    case LoongArch::ST_W:
    case LoongArch::ST_D:
    case LoongArch::STPTR_W:
    case LoongArch::STPTR_D:
    case LoongArch::FST_S:
    case LoongArch::FST_D:
    case LoongArch::VST:
    case LoongArch::XVST: {
      if (UseMI.getOperand(1).isFI())
        return false;
      // The register defined by Lo should not be the value register.
      if (DestReg == UseMI.getOperand(0).getReg())
        return false;
      assert(DestReg == UseMI.getOperand(1).getReg() &&
             "Expected base address use");
      // All load/store instructions must use the same offset.
      int64_t Offset = UseMI.getOperand(2).getImm();
      if (CommonOffset && Offset != CommonOffset)
        return false;
      CommonOffset = Offset;
      break;
    }
    case LoongArch::INLINEASM:
    case LoongArch::INLINEASM_BR: {
      // We can't do this for the large pattern.
      if (Last)
        return false;
      SmallVector<unsigned> InlineAsmMemoryOpIndexes;
      unsigned NumOps = 0;
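      // Walk the inline-asm operand list: each group is a flag immediate
      // followed by NumOps operands, so advance by 1 + NumOps each step.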
      for (unsigned I = InlineAsm::MIOp_FirstOperand;
           I < UseMI.getNumOperands(); I += 1 + NumOps) {
        const MachineOperand &FlagsMO = UseMI.getOperand(I);
        // Should be an imm.
        if (!FlagsMO.isImm())
          continue;

        const InlineAsm::Flag Flags(FlagsMO.getImm());
        NumOps = Flags.getNumOperandRegisters();

        // Memory constraints have two operands.
        if (NumOps != 2 || !Flags.isMemKind()) {
          // If the register is used by something other than a memory
          // constraint, we should not fold.
          for (unsigned J = 0; J < NumOps; ++J) {
            const MachineOperand &MO = UseMI.getOperand(I + 1 + J);
            if (MO.isReg() && MO.getReg() == DestReg)
              return false;
          }
          continue;
        }

        // We can only do this for constraint m.
        if (Flags.getMemoryConstraintID() != InlineAsm::ConstraintCode::m)
          return false;

        const MachineOperand &AddrMO = UseMI.getOperand(I + 1);
        if (!AddrMO.isReg() || AddrMO.getReg() != DestReg)
          continue;

        const MachineOperand &OffsetMO = UseMI.getOperand(I + 2);
        if (!OffsetMO.isImm())
          continue;

        // All inline asm memory operands must use the same offset.
        int64_t Offset = OffsetMO.getImm();
        if (CommonOffset && Offset != CommonOffset)
          return false;
        CommonOffset = Offset;
        InlineAsmMemoryOpIndexes.push_back(I + 1);
      }
      InlineAsmMemoryOpIndexesMap.insert(
          std::make_pair(&UseMI, InlineAsmMemoryOpIndexes));
      break;
    }
    }
  }

  // We found a common offset.
  // Update the offsets in global address lowering.
  // We may have already folded some arithmetic so we need to add to any
  // existing offset.
  int64_t NewOffset = Hi20.getOperand(1).getOffset() + *CommonOffset;
  // LA32 ignores the upper 32 bits.
  if (!ST->is64Bit())
    NewOffset = SignExtend64<32>(NewOffset);
  // We can only fold simm32 offsets.
  if (!isInt<32>(NewOffset))
    return false;

  // If this pass optimizes the code sequence successfully, the MO_RELAX
  // bitmask target-flag must be removed from the pcala code sequence. A
  // tls-le code sequence can still be relaxed after being optimized.
  //
  // For example:
  //   pcalau12i $a0, %pc_hi20(symbol)
  //   addi.d $a0, $a0, %pc_lo12(symbol)
  //   ld.w $a0, $a0, 0
  //
  //   =>
  //
  //   pcalau12i $a0, %pc_hi20(symbol)
  //   ld.w $a0, $a0, %pc_lo12(symbol)
  //
  // The code sequence before the optimization can be relaxed by the linker,
  // but the optimized sequence cannot, so the instructions must not carry
  // the MO_RELAX flag.
  Hi20.getOperand(1).setOffset(NewOffset);
  MachineOperand &ImmOp = Lo12.getOperand(2);
  ImmOp.setOffset(NewOffset);
  if (Lo20 && Hi12) {
    Lo20->getOperand(2).setOffset(NewOffset);
    Hi12->getOperand(2).setOffset(NewOffset);
  }
  if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
    Hi20.getOperand(1).setTargetFlags(
        LoongArchII::getDirectFlags(Hi20.getOperand(1)));
    ImmOp.setTargetFlags(LoongArchII::getDirectFlags(ImmOp));
  } else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
    MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
    Add->getOperand(3).setOffset(NewOffset);
  }

  // Update the immediate in the load/store instructions to add the offset.
  const LoongArchInstrInfo &TII = *ST->getInstrInfo();
  for (MachineInstr &UseMI :
       llvm::make_early_inc_range(MRI->use_instructions(DestReg))) {
    if (UseMI.getOpcode() == LoongArch::INLINEASM ||
        UseMI.getOpcode() == LoongArch::INLINEASM_BR) {
      auto &InlineAsmMemoryOpIndexes = InlineAsmMemoryOpIndexesMap[&UseMI];
      for (unsigned I : InlineAsmMemoryOpIndexes) {
        MachineOperand &MO = UseMI.getOperand(I + 1);
        switch (ImmOp.getType()) {
        case MachineOperand::MO_GlobalAddress:
          MO.ChangeToGA(ImmOp.getGlobal(), ImmOp.getOffset(),
                        LoongArchII::getDirectFlags(ImmOp));
          break;
        case MachineOperand::MO_MCSymbol:
          MO.ChangeToMCSymbol(ImmOp.getMCSymbol(),
                              LoongArchII::getDirectFlags(ImmOp));
          MO.setOffset(ImmOp.getOffset());
          break;
        case MachineOperand::MO_BlockAddress:
          MO.ChangeToBA(ImmOp.getBlockAddress(), ImmOp.getOffset(),
                        LoongArchII::getDirectFlags(ImmOp));
          break;
        default:
          report_fatal_error("unsupported machine operand type");
          break;
        }
      }
    } else {
      UseMI.setDesc(TII.get(getNewOpc(UseMI.getOpcode(), Last)));
      if (Last) {
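        // Large pattern: drop the reg+imm addressing operands and reuse the
        // two register operands of the final ADD as the reg+reg address.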
        UseMI.removeOperand(2);
        UseMI.removeOperand(1);
        UseMI.addOperand(Last->getOperand(1));
        UseMI.addOperand(Last->getOperand(2));
        UseMI.getOperand(1).setIsKill(false);
        UseMI.getOperand(2).setIsKill(false);
      } else {
        UseMI.removeOperand(2);
        UseMI.addOperand(ImmOp);
      }
    }
  }

  if (Last) {
    Last->eraseFromParent();
    return true;
  }

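  // Small/medium and tls-le patterns: the ADDI is now dead. Rewire its uses
  // to the remaining base (the PCALAU12I result, or the PseudoAddTPRel
  // result for tls-le) and erase it.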
  if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
    MRI->replaceRegWith(Lo12.getOperand(0).getReg(),
                        Hi20.getOperand(0).getReg());
  } else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
    MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
    MRI->replaceRegWith(Lo12.getOperand(0).getReg(),
                        Add->getOperand(0).getReg());
  }
  Lo12.eraseFromParent();
  return true;
}

bool LoongArchMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) {
  if (skipFunction(Fn.getFunction()))
    return false;

  ST = &Fn.getSubtarget<LoongArchSubtarget>();

  bool MadeChange = false;
  MRI = &Fn.getRegInfo();
  for (MachineBasicBlock &MBB : Fn) {
    LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n");
    for (MachineInstr &Hi20 : MBB) {
      MachineInstr *Lo12 = nullptr;
      MachineInstr *Lo20 = nullptr;
      MachineInstr *Hi12 = nullptr;
      MachineInstr *Last = nullptr;
      if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
        // Detect foldable pcala code sequence in small/medium/large code
        // model.
        if (!detectFoldable(Hi20, Lo12, Lo20, Hi12, Last))
          continue;
      } else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
        MachineInstr *Add = nullptr;
        // Detect foldable tls-le code sequence in small/medium code model.
        if (!detectFoldable(Hi20, Add, Lo12))
          continue;
      } else {
        continue;
      }
      // For tls-le, we do not pass the second PseudoAddTPRel instr in order to
      // reuse the existing hooks; the last three parameters should always be
      // nullptr.
      MadeChange |= detectAndFoldOffset(Hi20, *Lo12, Lo20, Hi12, Last);
      MadeChange |= foldIntoMemoryOps(Hi20, *Lo12, Lo20, Hi12, Last);
    }
  }

  return MadeChange;
}

/// Returns an instance of the Merge Base Offset Optimization pass.
FunctionPass *llvm::createLoongArchMergeBaseOffsetOptPass() {
  return new LoongArchMergeBaseOffsetOpt();
}