1 | //===- LoongArchOptWInstrs.cpp - MI W instruction optimizations ----------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===---------------------------------------------------------------------===// |
8 | // |
9 | // This pass does some optimizations for *W instructions at the MI level. |
10 | // |
11 | // First it removes unneeded sext(addi.w rd, rs, 0) instructions. Either |
12 | // because the sign extended bits aren't consumed or because the input was |
13 | // already sign extended by an earlier instruction. |
14 | // |
15 | // Then: |
16 | // 1. Unless explicitly disabled or the target prefers instructions with W suffix, |
17 | // it removes the -w suffix from opw instructions whenever all users are |
18 | // dependent only on the lower word of the result of the instruction. |
19 | // The cases handled are: |
20 | // * addi.w because it helps reduce test differences between LA32 and LA64 |
21 | // w/o being a pessimization. |
22 | // |
23 | // 2. Or if explicitly enabled or the target prefers instructions with W suffix, |
24 | // it adds the W suffix to the instruction whenever all users are dependent |
25 | // only on the lower word of the result of the instruction. |
26 | // The cases handled are: |
27 | // * add.d/addi.d/sub.d/mul.d. |
28 | // * slli.d with imm < 32. |
29 | // * ld.d/ld.wu. |
30 | //===---------------------------------------------------------------------===// |
31 | |
32 | #include "LoongArch.h" |
33 | #include "LoongArchMachineFunctionInfo.h" |
34 | #include "LoongArchSubtarget.h" |
35 | #include "llvm/ADT/SmallSet.h" |
36 | #include "llvm/ADT/Statistic.h" |
37 | #include "llvm/CodeGen/MachineFunctionPass.h" |
38 | #include "llvm/CodeGen/TargetInstrInfo.h" |
39 | |
40 | using namespace llvm; |
41 | |
42 | #define DEBUG_TYPE "loongarch-opt-w-instrs" |
43 | #define LOONGARCH_OPT_W_INSTRS_NAME "LoongArch Optimize W Instructions" |
44 | |
45 | STATISTIC(NumRemovedSExtW, "Number of removed sign-extensions" ); |
46 | STATISTIC(NumTransformedToWInstrs, |
47 | "Number of instructions transformed to W-ops" ); |
48 | |
49 | static cl::opt<bool> |
50 | DisableSExtWRemoval("loongarch-disable-sextw-removal" , |
51 | cl::desc("Disable removal of sign-extend insn" ), |
52 | cl::init(Val: false), cl::Hidden); |
53 | static cl::opt<bool> |
54 | DisableCvtToDSuffix("loongarch-disable-cvt-to-d-suffix" , |
55 | cl::desc("Disable convert to D suffix" ), |
56 | cl::init(Val: false), cl::Hidden); |
57 | |
58 | namespace { |
59 | |
60 | class LoongArchOptWInstrs : public MachineFunctionPass { |
61 | public: |
62 | static char ID; |
63 | |
64 | LoongArchOptWInstrs() : MachineFunctionPass(ID) {} |
65 | |
66 | bool runOnMachineFunction(MachineFunction &MF) override; |
67 | bool removeSExtWInstrs(MachineFunction &MF, const LoongArchInstrInfo &TII, |
68 | const LoongArchSubtarget &ST, |
69 | MachineRegisterInfo &MRI); |
70 | bool convertToDSuffixes(MachineFunction &MF, const LoongArchInstrInfo &TII, |
71 | const LoongArchSubtarget &ST, |
72 | MachineRegisterInfo &MRI); |
73 | bool convertToWSuffixes(MachineFunction &MF, const LoongArchInstrInfo &TII, |
74 | const LoongArchSubtarget &ST, |
75 | MachineRegisterInfo &MRI); |
76 | |
77 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
78 | AU.setPreservesCFG(); |
79 | MachineFunctionPass::getAnalysisUsage(AU); |
80 | } |
81 | |
82 | StringRef getPassName() const override { return LOONGARCH_OPT_W_INSTRS_NAME; } |
83 | }; |
84 | |
85 | } // end anonymous namespace |
86 | |
87 | char LoongArchOptWInstrs::ID = 0; |
88 | INITIALIZE_PASS(LoongArchOptWInstrs, DEBUG_TYPE, LOONGARCH_OPT_W_INSTRS_NAME, |
89 | false, false) |
90 | |
91 | FunctionPass *llvm::createLoongArchOptWInstrsPass() { |
92 | return new LoongArchOptWInstrs(); |
93 | } |
94 | |
95 | // Checks if all users only demand the lower \p OrigBits of the original |
96 | // instruction's result. |
97 | // TODO: handle multiple interdependent transformations |
98 | static bool hasAllNBitUsers(const MachineInstr &OrigMI, |
99 | const LoongArchSubtarget &ST, |
100 | const MachineRegisterInfo &MRI, unsigned OrigBits) { |
101 | |
102 | SmallSet<std::pair<const MachineInstr *, unsigned>, 4> Visited; |
103 | SmallVector<std::pair<const MachineInstr *, unsigned>, 4> Worklist; |
104 | |
105 | Worklist.push_back(Elt: std::make_pair(x: &OrigMI, y&: OrigBits)); |
106 | |
107 | while (!Worklist.empty()) { |
108 | auto P = Worklist.pop_back_val(); |
109 | const MachineInstr *MI = P.first; |
110 | unsigned Bits = P.second; |
111 | |
112 | if (!Visited.insert(V: P).second) |
113 | continue; |
114 | |
115 | // Only handle instructions with one def. |
116 | if (MI->getNumExplicitDefs() != 1) |
117 | return false; |
118 | |
119 | Register DestReg = MI->getOperand(i: 0).getReg(); |
120 | if (!DestReg.isVirtual()) |
121 | return false; |
122 | |
123 | for (auto &UserOp : MRI.use_nodbg_operands(Reg: DestReg)) { |
124 | const MachineInstr *UserMI = UserOp.getParent(); |
125 | unsigned OpIdx = UserOp.getOperandNo(); |
126 | |
127 | switch (UserMI->getOpcode()) { |
128 | default: |
129 | // TODO: Add vector |
130 | return false; |
131 | |
132 | case LoongArch::ADD_W: |
133 | case LoongArch::ADDI_W: |
134 | case LoongArch::SUB_W: |
135 | case LoongArch::ALSL_W: |
136 | case LoongArch::ALSL_WU: |
137 | case LoongArch::MUL_W: |
138 | case LoongArch::MULH_W: |
139 | case LoongArch::MULH_WU: |
140 | case LoongArch::MULW_D_W: |
141 | case LoongArch::MULW_D_WU: |
142 | // TODO: {DIV,MOD}.{W,WU} consumes the upper 32 bits before LA664+. |
143 | // case LoongArch::DIV_W: |
144 | // case LoongArch::DIV_WU: |
145 | // case LoongArch::MOD_W: |
146 | // case LoongArch::MOD_WU: |
147 | case LoongArch::SLL_W: |
148 | case LoongArch::SLLI_W: |
149 | case LoongArch::SRL_W: |
150 | case LoongArch::SRLI_W: |
151 | case LoongArch::SRA_W: |
152 | case LoongArch::SRAI_W: |
153 | case LoongArch::ROTR_W: |
154 | case LoongArch::ROTRI_W: |
155 | case LoongArch::CLO_W: |
156 | case LoongArch::CLZ_W: |
157 | case LoongArch::CTO_W: |
158 | case LoongArch::CTZ_W: |
159 | case LoongArch::BYTEPICK_W: |
160 | case LoongArch::REVB_2H: |
161 | case LoongArch::BITREV_4B: |
162 | case LoongArch::BITREV_W: |
163 | case LoongArch::BSTRINS_W: |
164 | case LoongArch::BSTRPICK_W: |
165 | case LoongArch::CRC_W_W_W: |
166 | case LoongArch::CRCC_W_W_W: |
167 | case LoongArch::MOVGR2FCSR: |
168 | case LoongArch::MOVGR2FRH_W: |
169 | case LoongArch::MOVGR2FR_W_64: |
170 | if (Bits >= 32) |
171 | break; |
172 | return false; |
173 | case LoongArch::MOVGR2CF: |
174 | if (Bits >= 1) |
175 | break; |
176 | return false; |
177 | case LoongArch::EXT_W_B: |
178 | if (Bits >= 8) |
179 | break; |
180 | return false; |
181 | case LoongArch::EXT_W_H: |
182 | if (Bits >= 16) |
183 | break; |
184 | return false; |
185 | |
186 | case LoongArch::SRLI_D: { |
187 | // If we are shifting right by less than Bits, and users don't demand |
188 | // any bits that were shifted into [Bits-1:0], then we can consider this |
189 | // as an N-Bit user. |
190 | unsigned ShAmt = UserMI->getOperand(i: 2).getImm(); |
191 | if (Bits > ShAmt) { |
192 | Worklist.push_back(Elt: std::make_pair(x&: UserMI, y: Bits - ShAmt)); |
193 | break; |
194 | } |
195 | return false; |
196 | } |
197 | |
198 | // these overwrite higher input bits, otherwise the lower word of output |
199 | // depends only on the lower word of input. So check their uses read W. |
200 | case LoongArch::SLLI_D: |
201 | if (Bits >= (ST.getGRLen() - UserMI->getOperand(i: 2).getImm())) |
202 | break; |
203 | Worklist.push_back(Elt: std::make_pair(x&: UserMI, y&: Bits)); |
204 | break; |
205 | case LoongArch::ANDI: { |
206 | uint64_t Imm = UserMI->getOperand(i: 2).getImm(); |
207 | if (Bits >= (unsigned)llvm::bit_width(Value: Imm)) |
208 | break; |
209 | Worklist.push_back(Elt: std::make_pair(x&: UserMI, y&: Bits)); |
210 | break; |
211 | } |
212 | case LoongArch::ORI: { |
213 | uint64_t Imm = UserMI->getOperand(i: 2).getImm(); |
214 | if (Bits >= (unsigned)llvm::bit_width<uint64_t>(Value: ~Imm)) |
215 | break; |
216 | Worklist.push_back(Elt: std::make_pair(x&: UserMI, y&: Bits)); |
217 | break; |
218 | } |
219 | |
220 | case LoongArch::SLL_D: |
221 | // Operand 2 is the shift amount which uses log2(grlen) bits. |
222 | if (OpIdx == 2) { |
223 | if (Bits >= Log2_32(Value: ST.getGRLen())) |
224 | break; |
225 | return false; |
226 | } |
227 | Worklist.push_back(Elt: std::make_pair(x&: UserMI, y&: Bits)); |
228 | break; |
229 | |
230 | case LoongArch::SRA_D: |
231 | case LoongArch::SRL_D: |
232 | case LoongArch::ROTR_D: |
233 | // Operand 2 is the shift amount which uses 6 bits. |
234 | if (OpIdx == 2 && Bits >= Log2_32(Value: ST.getGRLen())) |
235 | break; |
236 | return false; |
237 | |
238 | case LoongArch::ST_B: |
239 | case LoongArch::STX_B: |
240 | case LoongArch::STGT_B: |
241 | case LoongArch::STLE_B: |
242 | case LoongArch::IOCSRWR_B: |
243 | // The first argument is the value to store. |
244 | if (OpIdx == 0 && Bits >= 8) |
245 | break; |
246 | return false; |
247 | case LoongArch::ST_H: |
248 | case LoongArch::STX_H: |
249 | case LoongArch::STGT_H: |
250 | case LoongArch::STLE_H: |
251 | case LoongArch::IOCSRWR_H: |
252 | // The first argument is the value to store. |
253 | if (OpIdx == 0 && Bits >= 16) |
254 | break; |
255 | return false; |
256 | case LoongArch::ST_W: |
257 | case LoongArch::STX_W: |
258 | case LoongArch::SCREL_W: |
259 | case LoongArch::STPTR_W: |
260 | case LoongArch::STGT_W: |
261 | case LoongArch::STLE_W: |
262 | case LoongArch::IOCSRWR_W: |
263 | // The first argument is the value to store. |
264 | if (OpIdx == 0 && Bits >= 32) |
265 | break; |
266 | return false; |
267 | |
268 | case LoongArch::CRC_W_B_W: |
269 | case LoongArch::CRCC_W_B_W: |
270 | if ((OpIdx == 1 && Bits >= 8) || (OpIdx == 2 && Bits >= 32)) |
271 | break; |
272 | return false; |
273 | case LoongArch::CRC_W_H_W: |
274 | case LoongArch::CRCC_W_H_W: |
275 | if ((OpIdx == 1 && Bits >= 16) || (OpIdx == 2 && Bits >= 32)) |
276 | break; |
277 | return false; |
278 | case LoongArch::CRC_W_D_W: |
279 | case LoongArch::CRCC_W_D_W: |
280 | if (OpIdx == 2 && Bits >= 32) |
281 | break; |
282 | return false; |
283 | |
284 | // For these, lower word of output in these operations, depends only on |
285 | // the lower word of input. So, we check all uses only read lower word. |
286 | case LoongArch::COPY: |
287 | case LoongArch::PHI: |
288 | case LoongArch::ADD_D: |
289 | case LoongArch::ADDI_D: |
290 | case LoongArch::SUB_D: |
291 | case LoongArch::MUL_D: |
292 | case LoongArch::AND: |
293 | case LoongArch::OR: |
294 | case LoongArch::NOR: |
295 | case LoongArch::XOR: |
296 | case LoongArch::XORI: |
297 | case LoongArch::ANDN: |
298 | case LoongArch::ORN: |
299 | Worklist.push_back(Elt: std::make_pair(x&: UserMI, y&: Bits)); |
300 | break; |
301 | |
302 | case LoongArch::MASKNEZ: |
303 | case LoongArch::MASKEQZ: |
304 | if (OpIdx != 1) |
305 | return false; |
306 | Worklist.push_back(Elt: std::make_pair(x&: UserMI, y&: Bits)); |
307 | break; |
308 | } |
309 | } |
310 | } |
311 | |
312 | return true; |
313 | } |
314 | |
315 | static bool hasAllWUsers(const MachineInstr &OrigMI, |
316 | const LoongArchSubtarget &ST, |
317 | const MachineRegisterInfo &MRI) { |
318 | return hasAllNBitUsers(OrigMI, ST, MRI, OrigBits: 32); |
319 | } |
320 | |
321 | // This function returns true if the machine instruction always outputs a value |
322 | // where bits 63:32 match bit 31. |
323 | static bool isSignExtendingOpW(const MachineInstr &MI, |
324 | const MachineRegisterInfo &MRI, unsigned OpNo) { |
325 | switch (MI.getOpcode()) { |
326 | // Normal cases |
327 | case LoongArch::ADD_W: |
328 | case LoongArch::SUB_W: |
329 | case LoongArch::ADDI_W: |
330 | case LoongArch::ALSL_W: |
331 | case LoongArch::LU12I_W: |
332 | case LoongArch::SLT: |
333 | case LoongArch::SLTU: |
334 | case LoongArch::SLTI: |
335 | case LoongArch::SLTUI: |
336 | case LoongArch::ANDI: |
337 | case LoongArch::MUL_W: |
338 | case LoongArch::MULH_W: |
339 | case LoongArch::MULH_WU: |
340 | case LoongArch::DIV_W: |
341 | case LoongArch::MOD_W: |
342 | case LoongArch::DIV_WU: |
343 | case LoongArch::MOD_WU: |
344 | case LoongArch::SLL_W: |
345 | case LoongArch::SRL_W: |
346 | case LoongArch::SRA_W: |
347 | case LoongArch::ROTR_W: |
348 | case LoongArch::SLLI_W: |
349 | case LoongArch::SRLI_W: |
350 | case LoongArch::SRAI_W: |
351 | case LoongArch::ROTRI_W: |
352 | case LoongArch::EXT_W_B: |
353 | case LoongArch::EXT_W_H: |
354 | case LoongArch::CLO_W: |
355 | case LoongArch::CLZ_W: |
356 | case LoongArch::CTO_W: |
357 | case LoongArch::CTZ_W: |
358 | case LoongArch::BYTEPICK_W: |
359 | case LoongArch::REVB_2H: |
360 | case LoongArch::BITREV_4B: |
361 | case LoongArch::BITREV_W: |
362 | case LoongArch::BSTRINS_W: |
363 | case LoongArch::BSTRPICK_W: |
364 | case LoongArch::LD_B: |
365 | case LoongArch::LD_H: |
366 | case LoongArch::LD_W: |
367 | case LoongArch::LD_BU: |
368 | case LoongArch::LD_HU: |
369 | case LoongArch::LL_W: |
370 | case LoongArch::LLACQ_W: |
371 | case LoongArch::RDTIMEL_W: |
372 | case LoongArch::RDTIMEH_W: |
373 | case LoongArch::CPUCFG: |
374 | case LoongArch::LDX_B: |
375 | case LoongArch::LDX_H: |
376 | case LoongArch::LDX_W: |
377 | case LoongArch::LDX_BU: |
378 | case LoongArch::LDX_HU: |
379 | case LoongArch::LDPTR_W: |
380 | case LoongArch::LDGT_B: |
381 | case LoongArch::LDGT_H: |
382 | case LoongArch::LDGT_W: |
383 | case LoongArch::LDLE_B: |
384 | case LoongArch::LDLE_H: |
385 | case LoongArch::LDLE_W: |
386 | case LoongArch::AMSWAP_B: |
387 | case LoongArch::AMSWAP_H: |
388 | case LoongArch::AMSWAP_W: |
389 | case LoongArch::AMADD_B: |
390 | case LoongArch::AMADD_H: |
391 | case LoongArch::AMADD_W: |
392 | case LoongArch::AMAND_W: |
393 | case LoongArch::AMOR_W: |
394 | case LoongArch::AMXOR_W: |
395 | case LoongArch::AMMAX_W: |
396 | case LoongArch::AMMIN_W: |
397 | case LoongArch::AMMAX_WU: |
398 | case LoongArch::AMMIN_WU: |
399 | case LoongArch::AMSWAP__DB_B: |
400 | case LoongArch::AMSWAP__DB_H: |
401 | case LoongArch::AMSWAP__DB_W: |
402 | case LoongArch::AMADD__DB_B: |
403 | case LoongArch::AMADD__DB_H: |
404 | case LoongArch::AMADD__DB_W: |
405 | case LoongArch::AMAND__DB_W: |
406 | case LoongArch::AMOR__DB_W: |
407 | case LoongArch::AMXOR__DB_W: |
408 | case LoongArch::AMMAX__DB_W: |
409 | case LoongArch::AMMIN__DB_W: |
410 | case LoongArch::AMMAX__DB_WU: |
411 | case LoongArch::AMMIN__DB_WU: |
412 | case LoongArch::AMCAS_B: |
413 | case LoongArch::AMCAS_H: |
414 | case LoongArch::AMCAS_W: |
415 | case LoongArch::AMCAS__DB_B: |
416 | case LoongArch::AMCAS__DB_H: |
417 | case LoongArch::AMCAS__DB_W: |
418 | case LoongArch::CRC_W_B_W: |
419 | case LoongArch::CRC_W_H_W: |
420 | case LoongArch::CRC_W_W_W: |
421 | case LoongArch::CRC_W_D_W: |
422 | case LoongArch::CRCC_W_B_W: |
423 | case LoongArch::CRCC_W_H_W: |
424 | case LoongArch::CRCC_W_W_W: |
425 | case LoongArch::CRCC_W_D_W: |
426 | case LoongArch::IOCSRRD_B: |
427 | case LoongArch::IOCSRRD_H: |
428 | case LoongArch::IOCSRRD_W: |
429 | case LoongArch::MOVFR2GR_S: |
430 | case LoongArch::MOVFCSR2GR: |
431 | case LoongArch::MOVCF2GR: |
432 | case LoongArch::MOVFRH2GR_S: |
433 | case LoongArch::MOVFR2GR_S_64: |
434 | // TODO: Add vector |
435 | return true; |
436 | // Special cases that require checking operands. |
437 | // shifting right sufficiently makes the value 32-bit sign-extended |
438 | case LoongArch::SRAI_D: |
439 | return MI.getOperand(i: 2).getImm() >= 32; |
440 | case LoongArch::SRLI_D: |
441 | return MI.getOperand(i: 2).getImm() > 32; |
442 | // The LI pattern ADDI rd, R0, imm and ORI rd, R0, imm are sign extended. |
443 | case LoongArch::ADDI_D: |
444 | case LoongArch::ORI: |
445 | return MI.getOperand(i: 1).isReg() && |
446 | MI.getOperand(i: 1).getReg() == LoongArch::R0; |
447 | // A bits extract is sign extended if the msb is less than 31. |
448 | case LoongArch::BSTRPICK_D: |
449 | return MI.getOperand(i: 2).getImm() < 31; |
450 | // Copying from R0 produces zero. |
451 | case LoongArch::COPY: |
452 | return MI.getOperand(i: 1).getReg() == LoongArch::R0; |
453 | // Ignore the scratch register destination. |
454 | case LoongArch::PseudoMaskedAtomicSwap32: |
455 | case LoongArch::PseudoAtomicSwap32: |
456 | case LoongArch::PseudoMaskedAtomicLoadAdd32: |
457 | case LoongArch::PseudoMaskedAtomicLoadSub32: |
458 | case LoongArch::PseudoAtomicLoadNand32: |
459 | case LoongArch::PseudoMaskedAtomicLoadNand32: |
460 | case LoongArch::PseudoAtomicLoadAdd32: |
461 | case LoongArch::PseudoAtomicLoadSub32: |
462 | case LoongArch::PseudoAtomicLoadAnd32: |
463 | case LoongArch::PseudoAtomicLoadOr32: |
464 | case LoongArch::PseudoAtomicLoadXor32: |
465 | case LoongArch::PseudoMaskedAtomicLoadUMax32: |
466 | case LoongArch::PseudoMaskedAtomicLoadUMin32: |
467 | case LoongArch::PseudoCmpXchg32: |
468 | case LoongArch::PseudoMaskedCmpXchg32: |
469 | case LoongArch::PseudoMaskedAtomicLoadMax32: |
470 | case LoongArch::PseudoMaskedAtomicLoadMin32: |
471 | return OpNo == 0; |
472 | } |
473 | |
474 | return false; |
475 | } |
476 | |
477 | static bool isSignExtendedW(Register SrcReg, const LoongArchSubtarget &ST, |
478 | const MachineRegisterInfo &MRI, |
479 | SmallPtrSetImpl<MachineInstr *> &FixableDef) { |
480 | SmallSet<Register, 4> Visited; |
481 | SmallVector<Register, 4> Worklist; |
482 | |
483 | auto AddRegToWorkList = [&](Register SrcReg) { |
484 | if (!SrcReg.isVirtual()) |
485 | return false; |
486 | Worklist.push_back(Elt: SrcReg); |
487 | return true; |
488 | }; |
489 | |
490 | if (!AddRegToWorkList(SrcReg)) |
491 | return false; |
492 | |
493 | while (!Worklist.empty()) { |
494 | Register Reg = Worklist.pop_back_val(); |
495 | |
496 | // If we already visited this register, we don't need to check it again. |
497 | if (!Visited.insert(V: Reg).second) |
498 | continue; |
499 | |
500 | MachineInstr *MI = MRI.getVRegDef(Reg); |
501 | if (!MI) |
502 | continue; |
503 | |
504 | int OpNo = MI->findRegisterDefOperandIdx(Reg, /*TRI=*/nullptr); |
505 | assert(OpNo != -1 && "Couldn't find register" ); |
506 | |
507 | // If this is a sign extending operation we don't need to look any further. |
508 | if (isSignExtendingOpW(MI: *MI, MRI, OpNo)) |
509 | continue; |
510 | |
511 | // Is this an instruction that propagates sign extend? |
512 | switch (MI->getOpcode()) { |
513 | default: |
514 | // Unknown opcode, give up. |
515 | return false; |
516 | case LoongArch::COPY: { |
517 | const MachineFunction *MF = MI->getMF(); |
518 | const LoongArchMachineFunctionInfo *LAFI = |
519 | MF->getInfo<LoongArchMachineFunctionInfo>(); |
520 | |
521 | // If this is the entry block and the register is livein, see if we know |
522 | // it is sign extended. |
523 | if (MI->getParent() == &MF->front()) { |
524 | Register VReg = MI->getOperand(i: 0).getReg(); |
525 | if (MF->getRegInfo().isLiveIn(Reg: VReg) && LAFI->isSExt32Register(Reg: VReg)) |
526 | continue; |
527 | } |
528 | |
529 | Register CopySrcReg = MI->getOperand(i: 1).getReg(); |
530 | if (CopySrcReg == LoongArch::R4) { |
531 | // For a method return value, we check the ZExt/SExt flags in attribute. |
532 | // We assume the following code sequence for method call. |
533 | // PseudoCALL @bar, ... |
534 | // ADJCALLSTACKUP 0, 0, implicit-def dead $r3, implicit $r3 |
535 | // %0:gpr = COPY $r4 |
536 | // |
537 | // We use the PseudoCall to look up the IR function being called to find |
538 | // its return attributes. |
539 | const MachineBasicBlock *MBB = MI->getParent(); |
540 | auto II = MI->getIterator(); |
541 | if (II == MBB->instr_begin() || |
542 | (--II)->getOpcode() != LoongArch::ADJCALLSTACKUP) |
543 | return false; |
544 | |
545 | const MachineInstr &CallMI = *(--II); |
546 | if (!CallMI.isCall() || !CallMI.getOperand(i: 0).isGlobal()) |
547 | return false; |
548 | |
549 | auto *CalleeFn = |
550 | dyn_cast_if_present<Function>(Val: CallMI.getOperand(i: 0).getGlobal()); |
551 | if (!CalleeFn) |
552 | return false; |
553 | |
554 | auto *IntTy = dyn_cast<IntegerType>(Val: CalleeFn->getReturnType()); |
555 | if (!IntTy) |
556 | return false; |
557 | |
558 | const AttributeSet &Attrs = CalleeFn->getAttributes().getRetAttrs(); |
559 | unsigned BitWidth = IntTy->getBitWidth(); |
560 | if ((BitWidth <= 32 && Attrs.hasAttribute(Kind: Attribute::SExt)) || |
561 | (BitWidth < 32 && Attrs.hasAttribute(Kind: Attribute::ZExt))) |
562 | continue; |
563 | } |
564 | |
565 | if (!AddRegToWorkList(CopySrcReg)) |
566 | return false; |
567 | |
568 | break; |
569 | } |
570 | |
571 | // For these, we just need to check if the 1st operand is sign extended. |
572 | case LoongArch::MOD_D: |
573 | case LoongArch::ANDI: |
574 | case LoongArch::ORI: |
575 | case LoongArch::XORI: |
576 | // |Remainder| is always <= |Dividend|. If D is 32-bit, then so is R. |
577 | // DIV doesn't work because of the edge case 0xf..f 8000 0000 / (long)-1 |
578 | // Logical operations use a sign extended 12-bit immediate. |
579 | if (!AddRegToWorkList(MI->getOperand(i: 1).getReg())) |
580 | return false; |
581 | |
582 | break; |
583 | case LoongArch::MOD_DU: |
584 | case LoongArch::AND: |
585 | case LoongArch::OR: |
586 | case LoongArch::XOR: |
587 | case LoongArch::ANDN: |
588 | case LoongArch::ORN: |
589 | case LoongArch::PHI: { |
590 | // If all incoming values are sign-extended, the output of AND, OR, XOR, |
591 | // or PHI is also sign-extended. |
592 | |
593 | // The input registers for PHI are operand 1, 3, ... |
594 | // The input registers for others are operand 1 and 2. |
595 | unsigned B = 1, E = 3, D = 1; |
596 | switch (MI->getOpcode()) { |
597 | case LoongArch::PHI: |
598 | E = MI->getNumOperands(); |
599 | D = 2; |
600 | break; |
601 | } |
602 | |
603 | for (unsigned I = B; I != E; I += D) { |
604 | if (!MI->getOperand(i: I).isReg()) |
605 | return false; |
606 | |
607 | if (!AddRegToWorkList(MI->getOperand(i: I).getReg())) |
608 | return false; |
609 | } |
610 | |
611 | break; |
612 | } |
613 | |
614 | case LoongArch::MASKEQZ: |
615 | case LoongArch::MASKNEZ: |
616 | // Instructions return zero or operand 1. Result is sign extended if |
617 | // operand 1 is sign extended. |
618 | if (!AddRegToWorkList(MI->getOperand(i: 1).getReg())) |
619 | return false; |
620 | break; |
621 | |
622 | // With these opcode, we can "fix" them with the W-version |
623 | // if we know all users of the result only rely on bits 31:0 |
624 | case LoongArch::SLLI_D: |
625 | // SLLI_W reads the lowest 5 bits, while SLLI_D reads lowest 6 bits |
626 | if (MI->getOperand(i: 2).getImm() >= 32) |
627 | return false; |
628 | [[fallthrough]]; |
629 | case LoongArch::ADDI_D: |
630 | case LoongArch::ADD_D: |
631 | case LoongArch::LD_D: |
632 | case LoongArch::LD_WU: |
633 | case LoongArch::MUL_D: |
634 | case LoongArch::SUB_D: |
635 | if (hasAllWUsers(OrigMI: *MI, ST, MRI)) { |
636 | FixableDef.insert(Ptr: MI); |
637 | break; |
638 | } |
639 | return false; |
640 | // If all incoming values are sign-extended and all users only use |
641 | // the lower 32 bits, then convert them to W versions. |
642 | case LoongArch::DIV_D: { |
643 | if (!AddRegToWorkList(MI->getOperand(i: 1).getReg())) |
644 | return false; |
645 | if (!AddRegToWorkList(MI->getOperand(i: 2).getReg())) |
646 | return false; |
647 | if (hasAllWUsers(OrigMI: *MI, ST, MRI)) { |
648 | FixableDef.insert(Ptr: MI); |
649 | break; |
650 | } |
651 | return false; |
652 | } |
653 | } |
654 | } |
655 | |
656 | // If we get here, then every node we visited produces a sign extended value |
657 | // or propagated sign extended values. So the result must be sign extended. |
658 | return true; |
659 | } |
660 | |
661 | static unsigned getWOp(unsigned Opcode) { |
662 | switch (Opcode) { |
663 | case LoongArch::ADDI_D: |
664 | return LoongArch::ADDI_W; |
665 | case LoongArch::ADD_D: |
666 | return LoongArch::ADD_W; |
667 | case LoongArch::DIV_D: |
668 | return LoongArch::DIV_W; |
669 | case LoongArch::LD_D: |
670 | case LoongArch::LD_WU: |
671 | return LoongArch::LD_W; |
672 | case LoongArch::MUL_D: |
673 | return LoongArch::MUL_W; |
674 | case LoongArch::SLLI_D: |
675 | return LoongArch::SLLI_W; |
676 | case LoongArch::SUB_D: |
677 | return LoongArch::SUB_W; |
678 | default: |
679 | llvm_unreachable("Unexpected opcode for replacement with W variant" ); |
680 | } |
681 | } |
682 | |
683 | bool LoongArchOptWInstrs::removeSExtWInstrs(MachineFunction &MF, |
684 | const LoongArchInstrInfo &TII, |
685 | const LoongArchSubtarget &ST, |
686 | MachineRegisterInfo &MRI) { |
687 | if (DisableSExtWRemoval) |
688 | return false; |
689 | |
690 | bool MadeChange = false; |
691 | for (MachineBasicBlock &MBB : MF) { |
692 | for (MachineInstr &MI : llvm::make_early_inc_range(Range&: MBB)) { |
693 | // We're looking for the sext.w pattern ADDI.W rd, rs, 0. |
694 | if (!LoongArch::isSEXT_W(MI)) |
695 | continue; |
696 | |
697 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
698 | |
699 | SmallPtrSet<MachineInstr *, 4> FixableDefs; |
700 | |
701 | // If all users only use the lower bits, this sext.w is redundant. |
702 | // Or if all definitions reaching MI sign-extend their output, |
703 | // then sext.w is redundant. |
704 | if (!hasAllWUsers(OrigMI: MI, ST, MRI) && |
705 | !isSignExtendedW(SrcReg, ST, MRI, FixableDef&: FixableDefs)) |
706 | continue; |
707 | |
708 | Register DstReg = MI.getOperand(i: 0).getReg(); |
709 | if (!MRI.constrainRegClass(Reg: SrcReg, RC: MRI.getRegClass(Reg: DstReg))) |
710 | continue; |
711 | |
712 | // Convert Fixable instructions to their W versions. |
713 | for (MachineInstr *Fixable : FixableDefs) { |
714 | LLVM_DEBUG(dbgs() << "Replacing " << *Fixable); |
715 | Fixable->setDesc(TII.get(Opcode: getWOp(Opcode: Fixable->getOpcode()))); |
716 | Fixable->clearFlag(Flag: MachineInstr::MIFlag::NoSWrap); |
717 | Fixable->clearFlag(Flag: MachineInstr::MIFlag::NoUWrap); |
718 | Fixable->clearFlag(Flag: MachineInstr::MIFlag::IsExact); |
719 | LLVM_DEBUG(dbgs() << " with " << *Fixable); |
720 | ++NumTransformedToWInstrs; |
721 | } |
722 | |
723 | LLVM_DEBUG(dbgs() << "Removing redundant sign-extension\n" ); |
724 | MRI.replaceRegWith(FromReg: DstReg, ToReg: SrcReg); |
725 | MRI.clearKillFlags(Reg: SrcReg); |
726 | MI.eraseFromParent(); |
727 | ++NumRemovedSExtW; |
728 | MadeChange = true; |
729 | } |
730 | } |
731 | |
732 | return MadeChange; |
733 | } |
734 | |
735 | bool LoongArchOptWInstrs::convertToDSuffixes(MachineFunction &MF, |
736 | const LoongArchInstrInfo &TII, |
737 | const LoongArchSubtarget &ST, |
738 | MachineRegisterInfo &MRI) { |
739 | bool MadeChange = false; |
740 | for (MachineBasicBlock &MBB : MF) { |
741 | for (MachineInstr &MI : MBB) { |
742 | unsigned Opc; |
743 | switch (MI.getOpcode()) { |
744 | default: |
745 | continue; |
746 | case LoongArch::ADDI_W: |
747 | Opc = LoongArch::ADDI_D; |
748 | break; |
749 | } |
750 | |
751 | if (hasAllWUsers(OrigMI: MI, ST, MRI)) { |
752 | MI.setDesc(TII.get(Opcode: Opc)); |
753 | MadeChange = true; |
754 | } |
755 | } |
756 | } |
757 | |
758 | return MadeChange; |
759 | } |
760 | |
761 | bool LoongArchOptWInstrs::convertToWSuffixes(MachineFunction &MF, |
762 | const LoongArchInstrInfo &TII, |
763 | const LoongArchSubtarget &ST, |
764 | MachineRegisterInfo &MRI) { |
765 | bool MadeChange = false; |
766 | for (MachineBasicBlock &MBB : MF) { |
767 | for (MachineInstr &MI : MBB) { |
768 | unsigned WOpc; |
769 | // TODO: Add more? |
770 | switch (MI.getOpcode()) { |
771 | default: |
772 | continue; |
773 | case LoongArch::ADD_D: |
774 | WOpc = LoongArch::ADD_W; |
775 | break; |
776 | case LoongArch::ADDI_D: |
777 | WOpc = LoongArch::ADDI_W; |
778 | break; |
779 | case LoongArch::SUB_D: |
780 | WOpc = LoongArch::SUB_W; |
781 | break; |
782 | case LoongArch::MUL_D: |
783 | WOpc = LoongArch::MUL_W; |
784 | break; |
785 | case LoongArch::SLLI_D: |
786 | // SLLI.W reads the lowest 5 bits, while SLLI.D reads lowest 6 bits |
787 | if (MI.getOperand(i: 2).getImm() >= 32) |
788 | continue; |
789 | WOpc = LoongArch::SLLI_W; |
790 | break; |
791 | case LoongArch::LD_D: |
792 | case LoongArch::LD_WU: |
793 | WOpc = LoongArch::LD_W; |
794 | break; |
795 | } |
796 | |
797 | if (hasAllWUsers(OrigMI: MI, ST, MRI)) { |
798 | LLVM_DEBUG(dbgs() << "Replacing " << MI); |
799 | MI.setDesc(TII.get(Opcode: WOpc)); |
800 | MI.clearFlag(Flag: MachineInstr::MIFlag::NoSWrap); |
801 | MI.clearFlag(Flag: MachineInstr::MIFlag::NoUWrap); |
802 | MI.clearFlag(Flag: MachineInstr::MIFlag::IsExact); |
803 | LLVM_DEBUG(dbgs() << " with " << MI); |
804 | ++NumTransformedToWInstrs; |
805 | MadeChange = true; |
806 | } |
807 | } |
808 | } |
809 | |
810 | return MadeChange; |
811 | } |
812 | |
813 | bool LoongArchOptWInstrs::runOnMachineFunction(MachineFunction &MF) { |
814 | if (skipFunction(F: MF.getFunction())) |
815 | return false; |
816 | |
817 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
818 | const LoongArchSubtarget &ST = MF.getSubtarget<LoongArchSubtarget>(); |
819 | const LoongArchInstrInfo &TII = *ST.getInstrInfo(); |
820 | |
821 | if (!ST.is64Bit()) |
822 | return false; |
823 | |
824 | bool MadeChange = false; |
825 | MadeChange |= removeSExtWInstrs(MF, TII, ST, MRI); |
826 | |
827 | if (!(DisableCvtToDSuffix || ST.preferWInst())) |
828 | MadeChange |= convertToDSuffixes(MF, TII, ST, MRI); |
829 | |
830 | if (ST.preferWInst()) |
831 | MadeChange |= convertToWSuffixes(MF, TII, ST, MRI); |
832 | |
833 | return MadeChange; |
834 | } |
835 | |