//===---- LoongArchMergeBaseOffset.cpp - Optimise address calculations ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Merge the offset of address calculation into the offset field
// of instructions in a global address lowering sequence.
//
//===----------------------------------------------------------------------===//

#include "LoongArch.h"
#include "LoongArchTargetMachine.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetOptions.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "loongarch-merge-base-offset"
#define LoongArch_MERGE_BASE_OFFSET_NAME "LoongArch Merge Base Offset"

namespace {

class LoongArchMergeBaseOffsetOpt : public MachineFunctionPass {
  const LoongArchSubtarget *ST = nullptr;
  MachineRegisterInfo *MRI;

public:
  static char ID;
  bool runOnMachineFunction(MachineFunction &Fn) override;
  bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Lo12,
                      MachineInstr *&Lo20, MachineInstr *&Hi12,
                      MachineInstr *&Last);
  bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Add,
                      MachineInstr *&Lo12);

  bool detectAndFoldOffset(MachineInstr &Hi20, MachineInstr &Lo12,
                           MachineInstr *&Lo20, MachineInstr *&Hi12,
                           MachineInstr *&Last);
  void foldOffset(MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
                  MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail,
                  int64_t Offset);
  bool foldLargeOffset(MachineInstr &Hi20, MachineInstr &Lo12,
                       MachineInstr *&Lo20, MachineInstr *&Hi12,
                       MachineInstr *&Last, MachineInstr &TailAdd,
                       Register GAReg);

  bool foldIntoMemoryOps(MachineInstr &Hi20, MachineInstr &Lo12,
                         MachineInstr *&Lo20, MachineInstr *&Hi12,
                         MachineInstr *&Last);

  LoongArchMergeBaseOffsetOpt() : MachineFunctionPass(ID) {}

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().setIsSSA();
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return LoongArch_MERGE_BASE_OFFSET_NAME;
  }
};
} // end anonymous namespace

char LoongArchMergeBaseOffsetOpt::ID = 0;
INITIALIZE_PASS(LoongArchMergeBaseOffsetOpt, DEBUG_TYPE,
                LoongArch_MERGE_BASE_OFFSET_NAME, false, false)

// Detect either of the patterns:
//
// 1. (small/medium):
//   pcalau12i vreg1, %pc_hi20(s)
//   addi.d    vreg2, vreg1, %pc_lo12(s)
//
// 2. (large):
//   pcalau12i vreg1, %pc_hi20(s)
//   addi.d    vreg2, $zero, %pc_lo12(s)
//   lu32i.d   vreg3, vreg2, %pc64_lo20(s)
//   lu52i.d   vreg4, vreg3, %pc64_hi12(s)
//   add.d     vreg5, vreg4, vreg1

// The pattern is only accepted if:
//   1) For the small and medium patterns, the first instruction has only one
//      use, which is the ADDI.
//   2) For the large pattern, the first four instructions each have only one
//      use, and the user of the fourth instruction is the ADD.
//   3) The address operands have the appropriate type, reflecting the
//      lowering of a global address or constant pool using the pattern.
//   4) The offset value in the Global Address or Constant Pool is 0.
bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,
                                                 MachineInstr *&Lo12,
                                                 MachineInstr *&Lo20,
                                                 MachineInstr *&Hi12,
                                                 MachineInstr *&Last) {
  if (Hi20.getOpcode() != LoongArch::PCALAU12I)
    return false;

  const MachineOperand &Hi20Op1 = Hi20.getOperand(1);
  if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_PCREL_HI)
    return false;

  auto isGlobalOrCPIOrBlockAddress = [](const MachineOperand &Op) {
    return Op.isGlobal() || Op.isCPI() || Op.isBlockAddress();
  };

  if (!isGlobalOrCPIOrBlockAddress(Hi20Op1) || Hi20Op1.getOffset() != 0)
    return false;

  Register HiDestReg = Hi20.getOperand(0).getReg();
  if (!MRI->hasOneUse(HiDestReg))
    return false;

  MachineInstr *UseInst = &*MRI->use_instr_begin(HiDestReg);
  if (UseInst->getOpcode() != LoongArch::ADD_D) {
    Lo12 = UseInst;
    if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) ||
        (!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W))
      return false;
  } else {
    assert(ST->is64Bit());
    Last = UseInst;

    Register LastOp1Reg = Last->getOperand(1).getReg();
    if (!LastOp1Reg.isVirtual())
      return false;
    Hi12 = MRI->getVRegDef(LastOp1Reg);
    const MachineOperand &Hi12Op2 = Hi12->getOperand(2);
    if (Hi12Op2.getTargetFlags() != LoongArchII::MO_PCREL64_HI)
      return false;
    if (!isGlobalOrCPIOrBlockAddress(Hi12Op2) || Hi12Op2.getOffset() != 0)
      return false;
    if (!MRI->hasOneUse(Hi12->getOperand(0).getReg()))
      return false;

    Lo20 = MRI->getVRegDef(Hi12->getOperand(1).getReg());
    const MachineOperand &Lo20Op2 = Lo20->getOperand(2);
    if (Lo20Op2.getTargetFlags() != LoongArchII::MO_PCREL64_LO)
      return false;
    if (!isGlobalOrCPIOrBlockAddress(Lo20Op2) || Lo20Op2.getOffset() != 0)
      return false;
    if (!MRI->hasOneUse(Lo20->getOperand(0).getReg()))
      return false;

    Lo12 = MRI->getVRegDef(Lo20->getOperand(1).getReg());
    if (!MRI->hasOneUse(Lo12->getOperand(0).getReg()))
      return false;
  }

  const MachineOperand &Lo12Op2 = Lo12->getOperand(2);
  assert(Hi20.getOpcode() == LoongArch::PCALAU12I);
  if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_PCREL_LO ||
      !(isGlobalOrCPIOrBlockAddress(Lo12Op2) || Lo12Op2.isMCSymbol()) ||
      Lo12Op2.getOffset() != 0)
    return false;

  if (Hi20Op1.isGlobal()) {
    LLVM_DEBUG(dbgs() << " Found lowered global address: "
                      << *Hi20Op1.getGlobal() << "\n");
  } else if (Hi20Op1.isBlockAddress()) {
    LLVM_DEBUG(dbgs() << " Found lowered block address: "
                      << *Hi20Op1.getBlockAddress() << "\n");
  } else if (Hi20Op1.isCPI()) {
    LLVM_DEBUG(dbgs() << " Found lowered constant pool: " << Hi20Op1.getIndex()
                      << "\n");
  }

  return true;
}

// Detect the pattern:
//
// (small/medium):
//   lu12i.w  vreg1, %le_hi20_r(s)
//   add.w/d  vreg2, vreg1, r2, %le_add_r(s)
//   addi.w/d vreg3, vreg2, %le_lo12_r(s)
//
// The pattern is only accepted if:
//   1) The first instruction has only one use, which is the PseudoAddTPRel.
//      The second instruction has only one use, which is the ADDI. The
//      second instruction's last operand is the tp register.
//   2) The address operands have the appropriate type, reflecting the
//      lowering of a thread_local global address using the pattern.
//   3) The offset value in the ThreadLocal Global Address is 0.
bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,
                                                 MachineInstr *&Add,
                                                 MachineInstr *&Lo12) {
  if (Hi20.getOpcode() != LoongArch::LU12I_W)
    return false;

  auto isGlobalOrCPI = [](const MachineOperand &Op) {
    return Op.isGlobal() || Op.isCPI();
  };

  const MachineOperand &Hi20Op1 = Hi20.getOperand(1);
  if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_LE_HI_R ||
      !isGlobalOrCPI(Hi20Op1) || Hi20Op1.getOffset() != 0)
    return false;

  Register HiDestReg = Hi20.getOperand(0).getReg();
  if (!MRI->hasOneUse(HiDestReg))
    return false;

  Add = &*MRI->use_instr_begin(HiDestReg);
  if ((ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_D) ||
      (!ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_W))
    return false;

  if (Add->getOperand(2).getReg() != LoongArch::R2)
    return false;

  const MachineOperand &AddOp3 = Add->getOperand(3);
  if (LoongArchII::getDirectFlags(AddOp3) != LoongArchII::MO_LE_ADD_R ||
      !(isGlobalOrCPI(AddOp3) || AddOp3.isMCSymbol()) ||
      AddOp3.getOffset() != 0)
    return false;

  Register AddDestReg = Add->getOperand(0).getReg();
  if (!MRI->hasOneUse(AddDestReg))
    return false;

  Lo12 = &*MRI->use_instr_begin(AddDestReg);
  if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) ||
      (!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W))
    return false;

  const MachineOperand &Lo12Op2 = Lo12->getOperand(2);
  if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_LE_LO_R ||
      !(isGlobalOrCPI(Lo12Op2) || Lo12Op2.isMCSymbol()) ||
      Lo12Op2.getOffset() != 0)
    return false;

  if (Hi20Op1.isGlobal()) {
    LLVM_DEBUG(dbgs() << " Found lowered global address: "
                      << *Hi20Op1.getGlobal() << "\n");
  } else if (Hi20Op1.isCPI()) {
    LLVM_DEBUG(dbgs() << " Found lowered constant pool: " << Hi20Op1.getIndex()
                      << "\n");
  }

  return true;
}

// Update the offset in the Hi20, (Add), Lo12, (Lo20 and Hi12) instructions.
// Delete the tail instruction and rewrite all of its uses to use the output
// of the lowering sequence (Last if present, otherwise Lo12).
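//
// For example, for the small/medium pcala pattern with a tail
// "addi.d vreg3, vreg2, 8" (register names here are illustrative):
//
//   pcalau12i vreg1, %pc_hi20(s)
//   addi.d    vreg2, vreg1, %pc_lo12(s)
//   addi.d    vreg3, vreg2, 8
//
// becomes
//
//   pcalau12i vreg1, %pc_hi20(s+8)
//   addi.d    vreg2, vreg1, %pc_lo12(s+8)
//
// with every use of vreg3 rewritten to vreg2.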
void LoongArchMergeBaseOffsetOpt::foldOffset(
    MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
    MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail,
    int64_t Offset) {
  // Put the offset back into the Hi20 and Lo12 (and Lo20/Hi12) operands.
  Hi20.getOperand(1).setOffset(Offset);
  Lo12.getOperand(2).setOffset(Offset);
  if (Lo20 && Hi12) {
    Lo20->getOperand(2).setOffset(Offset);
    Hi12->getOperand(2).setOffset(Offset);
  }

  // For tls-le, the offset of the second (PseudoAddTPRel) instruction must
  // also be updated.
  MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
  if (Hi20.getOpcode() == LoongArch::LU12I_W)
    Add->getOperand(3).setOffset(Offset);

  // Delete the tail instruction.
  MachineInstr *Def = Last ? Last : &Lo12;
  MRI->constrainRegClass(Def->getOperand(0).getReg(),
                         MRI->getRegClass(Tail.getOperand(0).getReg()));
  MRI->replaceRegWith(Tail.getOperand(0).getReg(),
                      Def->getOperand(0).getReg());
  Tail.eraseFromParent();

  LLVM_DEBUG(dbgs() << " Merged offset " << Offset << " into base.\n"
                    << " " << Hi20;);
  if (Hi20.getOpcode() == LoongArch::LU12I_W) {
    LLVM_DEBUG(dbgs() << " " << *Add;);
  }
  LLVM_DEBUG(dbgs() << " " << Lo12;);
  if (Lo20 && Hi12) {
    LLVM_DEBUG(dbgs() << " " << *Lo20 << " " << *Hi12;);
  }
}

// Detect patterns for large offsets that are passed into an ADD instruction.
// If the pattern is found, updates the offset in Hi20, (Add), Lo12,
// (Lo20 and Hi12) instructions and deletes TailAdd and the instructions that
// produced the offset.
//
// (The instructions marked with "!" are not necessarily present)
//
// Base address lowering is of the form:
//  1) pcala:
//        Hi20: pcalau12i vreg1, %pc_hi20(s)
//   +--- Lo12: addi.d vreg2, vreg1, %pc_lo12(s)
//   |    Lo20: lu32i.d vreg2, %pc64_lo20(s)              !
//   +--- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s)       !
//   |
//   | 2) tls-le:
//   |      Hi20: lu12i.w vreg1, %le_hi20_r(s)
//   |      Add:  add.w/d vreg1, vreg1, r2, %le_add_r(s)
//   +--- Lo12: addi.w/d vreg2, vreg1, %le_lo12_r(s)
//   |
//   | The large offset can be one of the forms:
//   |
//   +-> 1) Offset that has non zero bits in Hi20 and Lo12 bits:
//   |      OffsetHi20: lu12i.w vreg3, 4
//   |      OffsetLo12: ori voff, vreg3, 188 --------------------+
//   |                                                           |
//   +-> 2) Offset that has non zero bits in Hi20 bits only:     |
//   |      OffsetHi20: lu12i.w voff, 128 -----------------------+
//   |                                                           |
//   +-> 3) Offset that has non zero bits in Lo20 bits:          |
//   |      OffsetHi20: lu12i.w vreg3, 121                !      |
//   |      OffsetLo12: ori voff, vreg3, 122              !      |
//   |      OffsetLo20: lu32i.d voff, 123 -----------------------+
//   +-> 4) Offset that has non zero bits in Hi12 bits:          |
//          OffsetHi20: lu12i.w vreg3, 121                !      |
//          OffsetLo12: ori vreg3, vreg3, 122             !      |
//          OffsetLo20: lu32i.d vreg3, 123                !      |
//          OffsetHi12: lu52i.d voff, vreg3, 124 ---------------+
//                                                              |
//    TailAdd: add.d vreg4, vreg2, voff <------------------------+
//
bool LoongArchMergeBaseOffsetOpt::foldLargeOffset(
    MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
    MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &TailAdd,
    Register GAReg) {
  assert((TailAdd.getOpcode() == LoongArch::ADD_W ||
          TailAdd.getOpcode() == LoongArch::ADD_D) &&
         "Expected ADD instruction!");
  Register Rs = TailAdd.getOperand(1).getReg();
  Register Rt = TailAdd.getOperand(2).getReg();
  Register Reg = Rs == GAReg ? Rt : Rs;
  SmallVector<MachineInstr *, 4> Instrs;
  int64_t Offset = 0;
  int64_t Mask = -1;
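  // Mask tracks which bits of the accumulated offset were already supplied
  // by an outer lu32i.d/lu52i.d (visited first, since the chain is walked
  // from the last def backwards), so that the sign-extension bits of an
  // inner immediate cannot overwrite them.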

  // This can point to one of [ORI, LU12I.W, LU32I.D, LU52I.D]:
  for (int i = 0; i < 4; i++) {
    // Stop once the chain has been terminated by R0.
    if (Reg == LoongArch::R0)
      break;

    // Can't fold if the register has more than one use.
    if (!Reg.isVirtual() || !MRI->hasOneUse(Reg))
      return false;

    MachineInstr *Curr = MRI->getVRegDef(Reg);
    if (!Curr)
      break;

    switch (Curr->getOpcode()) {
    default:
      // Can't fold if the instruction opcode is unexpected.
      return false;
    case LoongArch::ORI: {
      MachineOperand ImmOp = Curr->getOperand(2);
      if (ImmOp.getTargetFlags() != LoongArchII::MO_None)
        return false;
      Offset += ImmOp.getImm();
      Reg = Curr->getOperand(1).getReg();
      Instrs.push_back(Curr);
      break;
    }
    case LoongArch::LU12I_W: {
      MachineOperand ImmOp = Curr->getOperand(1);
      if (ImmOp.getTargetFlags() != LoongArchII::MO_None)
        return false;
      Offset += SignExtend64<32>(ImmOp.getImm() << 12) & Mask;
      Reg = LoongArch::R0;
      Instrs.push_back(Curr);
      break;
    }
    case LoongArch::LU32I_D: {
      MachineOperand ImmOp = Curr->getOperand(2);
      if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Lo20)
        return false;
      Offset += SignExtend64<52>(ImmOp.getImm() << 32) & Mask;
      Mask ^= 0x000FFFFF00000000ULL;
      Reg = Curr->getOperand(1).getReg();
      Instrs.push_back(Curr);
      break;
    }
    case LoongArch::LU52I_D: {
      MachineOperand ImmOp = Curr->getOperand(2);
      if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Hi12)
        return false;
      Offset += ImmOp.getImm() << 52;
      Mask ^= 0xFFF0000000000000ULL;
      Reg = Curr->getOperand(1).getReg();
      Instrs.push_back(Curr);
      break;
    }
    }
  }

  // Can't fold if no offset was extracted.
  if (!Offset)
    return false;

  foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailAdd, Offset);
  LLVM_DEBUG(dbgs() << " Offset Instrs:\n");
  for (auto I : Instrs) {
    LLVM_DEBUG(dbgs() << " " << *I);
    I->eraseFromParent();
  }

  return true;
}

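// Look through the single use of the result of the lowering sequence
// (Last for the large code model, otherwise Lo12) and try to fold a
// trailing constant offset into it: either an immediate from
// ADDI/ADDU16I_D (possibly two ADDIs in a row), or a register offset
// materialized for an ADD_W/ADD_D (handled by foldLargeOffset).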
bool LoongArchMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &Hi20,
                                                      MachineInstr &Lo12,
                                                      MachineInstr *&Lo20,
                                                      MachineInstr *&Hi12,
                                                      MachineInstr *&Last) {
  Register DestReg =
      Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg();

  // Look for arithmetic instructions we can get an offset from.
  // We might be able to remove the arithmetic instructions by folding the
  // offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I) or
  // LU12I_W+PseudoAddTPRel+ADDI.
  if (!MRI->hasOneUse(DestReg))
    return false;

  // DestReg has only one use.
  MachineInstr &Tail = *MRI->use_instr_begin(DestReg);
  switch (Tail.getOpcode()) {
  default:
    LLVM_DEBUG(dbgs() << "Don't know how to get offset from this instr:"
                      << Tail);
    break;
  case LoongArch::ADDI_W:
    if (ST->is64Bit())
      return false;
    [[fallthrough]];
  case LoongArch::ADDI_D:
  case LoongArch::ADDU16I_D: {
    // Offset is simply an immediate operand.
    int64_t Offset = Tail.getOperand(2).getImm();
    if (Tail.getOpcode() == LoongArch::ADDU16I_D)
      Offset = SignExtend64<32>(Offset << 16);
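    // ADDU16I_D adds (simm16 << 16): e.g. an immediate of 1 contributes an
    // offset of 0x10000 and an immediate of -1 contributes -0x10000.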

    // We might have two ADDIs in a row.
    Register TailDestReg = Tail.getOperand(0).getReg();
    if (MRI->hasOneUse(TailDestReg)) {
      MachineInstr &TailTail = *MRI->use_instr_begin(TailDestReg);
      if (ST->is64Bit() && TailTail.getOpcode() == LoongArch::ADDI_W)
        return false;
      if (TailTail.getOpcode() == LoongArch::ADDI_W ||
          TailTail.getOpcode() == LoongArch::ADDI_D) {
        Offset += TailTail.getOperand(2).getImm();
        LLVM_DEBUG(dbgs() << " Offset Instrs: " << Tail << TailTail);
        foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailTail, Offset);
        Tail.eraseFromParent();
        return true;
      }
    }

    LLVM_DEBUG(dbgs() << " Offset Instr: " << Tail);
    foldOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, Offset);
    return true;
  }
  case LoongArch::ADD_W:
    if (ST->is64Bit())
      return false;
    [[fallthrough]];
  case LoongArch::ADD_D:
    // The offset is too large to fit in the immediate field of ADDI.
    return foldLargeOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, DestReg);
  }

  return false;
}

// Memory access opcode mapping for transforms.
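// For the large code model the folded base address is only available as the
// sum of two registers, so the register-indexed forms are selected (e.g.
// LD_W becomes LDX_W); otherwise the original opcode is kept.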
static unsigned getNewOpc(unsigned Op, bool isLarge) {
  switch (Op) {
  case LoongArch::LD_B:
    return isLarge ? LoongArch::LDX_B : LoongArch::LD_B;
  case LoongArch::LD_H:
    return isLarge ? LoongArch::LDX_H : LoongArch::LD_H;
  case LoongArch::LD_W:
  case LoongArch::LDPTR_W:
    return isLarge ? LoongArch::LDX_W : LoongArch::LD_W;
  case LoongArch::LD_D:
  case LoongArch::LDPTR_D:
    return isLarge ? LoongArch::LDX_D : LoongArch::LD_D;
  case LoongArch::LD_BU:
    return isLarge ? LoongArch::LDX_BU : LoongArch::LD_BU;
  case LoongArch::LD_HU:
    return isLarge ? LoongArch::LDX_HU : LoongArch::LD_HU;
  case LoongArch::LD_WU:
    return isLarge ? LoongArch::LDX_WU : LoongArch::LD_WU;
  case LoongArch::FLD_S:
    return isLarge ? LoongArch::FLDX_S : LoongArch::FLD_S;
  case LoongArch::FLD_D:
    return isLarge ? LoongArch::FLDX_D : LoongArch::FLD_D;
  case LoongArch::VLD:
    return isLarge ? LoongArch::VLDX : LoongArch::VLD;
  case LoongArch::XVLD:
    return isLarge ? LoongArch::XVLDX : LoongArch::XVLD;
  case LoongArch::VLDREPL_B:
    return LoongArch::VLDREPL_B;
  case LoongArch::XVLDREPL_B:
    return LoongArch::XVLDREPL_B;
  case LoongArch::ST_B:
    return isLarge ? LoongArch::STX_B : LoongArch::ST_B;
  case LoongArch::ST_H:
    return isLarge ? LoongArch::STX_H : LoongArch::ST_H;
  case LoongArch::ST_W:
  case LoongArch::STPTR_W:
    return isLarge ? LoongArch::STX_W : LoongArch::ST_W;
  case LoongArch::ST_D:
  case LoongArch::STPTR_D:
    return isLarge ? LoongArch::STX_D : LoongArch::ST_D;
  case LoongArch::FST_S:
    return isLarge ? LoongArch::FSTX_S : LoongArch::FST_S;
  case LoongArch::FST_D:
    return isLarge ? LoongArch::FSTX_D : LoongArch::FST_D;
  case LoongArch::VST:
    return isLarge ? LoongArch::VSTX : LoongArch::VST;
  case LoongArch::XVST:
    return isLarge ? LoongArch::XVSTX : LoongArch::XVST;
  default:
    llvm_unreachable("Unexpected opcode for replacement");
  }
}

bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
                                                    MachineInstr &Lo12,
                                                    MachineInstr *&Lo20,
                                                    MachineInstr *&Hi12,
                                                    MachineInstr *&Last) {
  Register DestReg =
      Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg();

  // If all the uses are memory ops with the same offset, we can transform:
  //
  // 1. (small/medium):
  //   1.1. pcala
  //     pcalau12i vreg1, %pc_hi20(s)
  //     addi.d    vreg2, vreg1, %pc_lo12(s)
  //     ld.w      vreg3, vreg2, 8
  //
  //   =>
  //
  //     pcalau12i vreg1, %pc_hi20(s+8)
  //     ld.w      vreg3, vreg1, %pc_lo12(s+8)
  //
  //   1.2. tls-le
  //     lu12i.w  vreg1, %le_hi20_r(s)
  //     add.w/d  vreg2, vreg1, r2, %le_add_r(s)
  //     addi.w/d vreg3, vreg2, %le_lo12_r(s)
  //     ld.w     vreg4, vreg3, 8
  //
  //   =>
  //
  //     lu12i.w vreg1, %le_hi20_r(s+8)
  //     add.w/d vreg2, vreg1, r2, %le_add_r(s+8)
  //     ld.w    vreg4, vreg2, %le_lo12_r(s+8)
  //
  // 2. (large):
  //     pcalau12i vreg1, %pc_hi20(s)
  //     addi.d    vreg2, $zero, %pc_lo12(s)
  //     lu32i.d   vreg3, vreg2, %pc64_lo20(s)
  //     lu52i.d   vreg4, vreg3, %pc64_hi12(s)
  //     add.d     vreg5, vreg4, vreg1
  //     ld.w      vreg6, vreg5, 8
  //
  //   =>
  //
  //     pcalau12i vreg1, %pc_hi20(s+8)
  //     addi.d    vreg2, $zero, %pc_lo12(s+8)
  //     lu32i.d   vreg3, vreg2, %pc64_lo20(s+8)
  //     lu52i.d   vreg4, vreg3, %pc64_hi12(s+8)
  //     ldx.w     vreg6, vreg4, vreg1

  std::optional<int64_t> CommonOffset;
  DenseMap<const MachineInstr *, SmallVector<unsigned>>
      InlineAsmMemoryOpIndexesMap;
  for (const MachineInstr &UseMI : MRI->use_instructions(DestReg)) {
    switch (UseMI.getOpcode()) {
    default:
      LLVM_DEBUG(dbgs() << "Not a load or store instruction: " << UseMI);
      return false;
    case LoongArch::VLDREPL_B:
    case LoongArch::XVLDREPL_B:
      // We can't do this for the large pattern.
      if (Last)
        return false;
      [[fallthrough]];
    case LoongArch::LD_B:
    case LoongArch::LD_H:
    case LoongArch::LD_W:
    case LoongArch::LD_D:
    case LoongArch::LD_BU:
    case LoongArch::LD_HU:
    case LoongArch::LD_WU:
    case LoongArch::LDPTR_W:
    case LoongArch::LDPTR_D:
    case LoongArch::FLD_S:
    case LoongArch::FLD_D:
    case LoongArch::VLD:
    case LoongArch::XVLD:
    case LoongArch::ST_B:
    case LoongArch::ST_H:
    case LoongArch::ST_W:
    case LoongArch::ST_D:
    case LoongArch::STPTR_W:
    case LoongArch::STPTR_D:
    case LoongArch::FST_S:
    case LoongArch::FST_D:
    case LoongArch::VST:
    case LoongArch::XVST: {
      if (UseMI.getOperand(1).isFI())
        return false;
      // The register defined by Lo must not be the value register.
      if (DestReg == UseMI.getOperand(0).getReg())
        return false;
      assert(DestReg == UseMI.getOperand(1).getReg() &&
             "Expected base address use");
      // All load/store instructions must use the same offset.
      int64_t Offset = UseMI.getOperand(2).getImm();
      if (CommonOffset && Offset != CommonOffset)
        return false;
      CommonOffset = Offset;
      break;
    }
    case LoongArch::INLINEASM:
    case LoongArch::INLINEASM_BR: {
      // We can't do this for the large pattern.
      if (Last)
        return false;
      SmallVector<unsigned> InlineAsmMemoryOpIndexes;
      unsigned NumOps = 0;
      for (unsigned I = InlineAsm::MIOp_FirstOperand;
           I < UseMI.getNumOperands(); I += 1 + NumOps) {
        const MachineOperand &FlagsMO = UseMI.getOperand(I);
        // Should be an imm.
        if (!FlagsMO.isImm())
          continue;

        const InlineAsm::Flag Flags(FlagsMO.getImm());
        NumOps = Flags.getNumOperandRegisters();

        // Memory constraints have two operands.
        if (NumOps != 2 || !Flags.isMemKind()) {
          // If the register is used by something other than a memory
          // constraint, we should not fold.
          for (unsigned J = 0; J < NumOps; ++J) {
            const MachineOperand &MO = UseMI.getOperand(I + 1 + J);
            if (MO.isReg() && MO.getReg() == DestReg)
              return false;
          }
          continue;
        }

        // We can only do this for constraint m.
        if (Flags.getMemoryConstraintID() != InlineAsm::ConstraintCode::m)
          return false;

        const MachineOperand &AddrMO = UseMI.getOperand(I + 1);
        if (!AddrMO.isReg() || AddrMO.getReg() != DestReg)
          continue;

        const MachineOperand &OffsetMO = UseMI.getOperand(I + 2);
        if (!OffsetMO.isImm())
          continue;

        // All inline asm memory operands must use the same offset.
        int64_t Offset = OffsetMO.getImm();
        if (CommonOffset && Offset != CommonOffset)
          return false;
        CommonOffset = Offset;
        InlineAsmMemoryOpIndexes.push_back(I + 1);
      }
      InlineAsmMemoryOpIndexesMap.insert(
          std::make_pair(&UseMI, InlineAsmMemoryOpIndexes));
      break;
    }
    }
  }

  // We found a common offset.
  // Update the offsets in global address lowering.
  // We may have already folded some arithmetic, so we need to add to any
  // existing offset.
  int64_t NewOffset = Hi20.getOperand(1).getOffset() + *CommonOffset;
  // LA32 ignores the upper 32 bits.
  if (!ST->is64Bit())
    NewOffset = SignExtend64<32>(NewOffset);
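  // e.g. an accumulated offset of 0x100000000 becomes 0 here on LA32.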
  // We can only fold simm32 offsets.
  if (!isInt<32>(NewOffset))
    return false;

  // If this pass optimizes the code sequence successfully, the MO_RELAX
  // target-flag must be removed from the pcala operands. A tls-le code
  // sequence can still be relaxed after being optimized.
  //
  // For example:
  //   pcalau12i $a0, %pc_hi20(symbol)
  //   addi.d    $a0, $a0, %pc_lo12(symbol)
  //   ld.w      $a0, $a0, 0
  //
  // =>
  //
  //   pcalau12i $a0, %pc_hi20(symbol)
  //   ld.w      $a0, $a0, %pc_lo12(symbol)
  //
  // The original code sequence could be relaxed by the linker, but the
  // optimized sequence cannot be relaxed any more, so it must not carry the
  // MO_RELAX flag.
  Hi20.getOperand(1).setOffset(NewOffset);
  MachineOperand &ImmOp = Lo12.getOperand(2);
  ImmOp.setOffset(NewOffset);
  if (Lo20 && Hi12) {
    Lo20->getOperand(2).setOffset(NewOffset);
    Hi12->getOperand(2).setOffset(NewOffset);
  }
  if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
    Hi20.getOperand(1).setTargetFlags(
        LoongArchII::getDirectFlags(Hi20.getOperand(1)));
    ImmOp.setTargetFlags(LoongArchII::getDirectFlags(ImmOp));
  } else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
    MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
    Add->getOperand(3).setOffset(NewOffset);
  }

  // Update the immediate in the load/store instructions to add the offset.
  const LoongArchInstrInfo &TII = *ST->getInstrInfo();
  for (MachineInstr &UseMI :
       llvm::make_early_inc_range(MRI->use_instructions(DestReg))) {
    if (UseMI.getOpcode() == LoongArch::INLINEASM ||
        UseMI.getOpcode() == LoongArch::INLINEASM_BR) {
      auto &InlineAsmMemoryOpIndexes = InlineAsmMemoryOpIndexesMap[&UseMI];
      for (unsigned I : InlineAsmMemoryOpIndexes) {
        MachineOperand &MO = UseMI.getOperand(I + 1);
        switch (ImmOp.getType()) {
        case MachineOperand::MO_GlobalAddress:
          MO.ChangeToGA(ImmOp.getGlobal(), ImmOp.getOffset(),
                        LoongArchII::getDirectFlags(ImmOp));
          break;
        case MachineOperand::MO_MCSymbol:
          MO.ChangeToMCSymbol(ImmOp.getMCSymbol(),
                              LoongArchII::getDirectFlags(ImmOp));
          MO.setOffset(ImmOp.getOffset());
          break;
        case MachineOperand::MO_BlockAddress:
          MO.ChangeToBA(ImmOp.getBlockAddress(), ImmOp.getOffset(),
                        LoongArchII::getDirectFlags(ImmOp));
          break;
        default:
          report_fatal_error("unsupported machine operand type");
          break;
        }
      }
    } else {
      UseMI.setDesc(TII.get(getNewOpc(UseMI.getOpcode(), Last)));
      if (Last) {
        UseMI.removeOperand(2);
        UseMI.removeOperand(1);
        UseMI.addOperand(Last->getOperand(1));
        UseMI.addOperand(Last->getOperand(2));
        UseMI.getOperand(1).setIsKill(false);
        UseMI.getOperand(2).setIsKill(false);
      } else {
        UseMI.removeOperand(2);
        UseMI.addOperand(ImmOp);
      }
    }
  }

  if (Last) {
    Last->eraseFromParent();
    return true;
  }

  if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
    MRI->replaceRegWith(Lo12.getOperand(0).getReg(),
                        Hi20.getOperand(0).getReg());
  } else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
    MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
    MRI->replaceRegWith(Lo12.getOperand(0).getReg(),
                        Add->getOperand(0).getReg());
  }
  Lo12.eraseFromParent();
  return true;
}

bool LoongArchMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) {
  if (skipFunction(Fn.getFunction()))
    return false;

  ST = &Fn.getSubtarget<LoongArchSubtarget>();

  bool MadeChange = false;
  MRI = &Fn.getRegInfo();
  for (MachineBasicBlock &MBB : Fn) {
    LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n");
    for (MachineInstr &Hi20 : MBB) {
      MachineInstr *Lo12 = nullptr;
      MachineInstr *Lo20 = nullptr;
      MachineInstr *Hi12 = nullptr;
      MachineInstr *Last = nullptr;
      if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
        // Detect a foldable pcala code sequence in the small/medium/large
        // code model.
        if (!detectFoldable(Hi20, Lo12, Lo20, Hi12, Last))
          continue;
      } else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
        MachineInstr *Add = nullptr;
        // Detect a foldable tls-le code sequence in the small/medium code
        // model.
        if (!detectFoldable(Hi20, Add, Lo12))
          continue;
      } else {
        continue;
      }
      // For tls-le, we do not pass the second (PseudoAddTPRel) instruction,
      // so that the existing hooks can be reused; the last three parameters
      // are always nullptr in that case.
      MadeChange |= detectAndFoldOffset(Hi20, *Lo12, Lo20, Hi12, Last);
      MadeChange |= foldIntoMemoryOps(Hi20, *Lo12, Lo20, Hi12, Last);
    }
  }

  return MadeChange;
}

/// Returns an instance of the Merge Base Offset Optimization pass.
FunctionPass *llvm::createLoongArchMergeBaseOffsetOptPass() {
  return new LoongArchMergeBaseOffsetOpt();
}