//===- AArch64ExpandImm.cpp - AArch64 Immediate Expansion ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements AArch64 immediate expansion: synthesizing 32- and
// 64-bit immediates with short sequences of MOVZ/MOVN/MOVK and logical
// immediate (ORR/AND/EOR) instructions.
//
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64ExpandImm.h"
#include "MCTargetDesc/AArch64AddressingModes.h"

using namespace llvm;
using namespace llvm::AArch64_IMM;

/// Helper function which extracts the specified 16-bit chunk from a
/// 64-bit value.
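/// E.g. getChunk(0x0123456789ABCDEF, 1) returns 0x89AB (bits [31:16]).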
static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) {
  assert(ChunkIdx < 4 && "Out of range chunk index specified!");

  return (Imm >> (ChunkIdx * 16)) & 0xFFFF;
}

/// Check whether the given 16-bit chunk replicated to full 64-bit width
/// can be materialized with an ORR instruction.
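/// E.g. Chunk == 0x5555 replicates to 0x5555555555555555, a valid logical
/// immediate, while 0x1234 replicated is not encodable.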
static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
  Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk;

  return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding);
}

/// Check for identical 16-bit chunks within the constant and if so
/// materialize them with a single ORR instruction. The remaining one or two
/// 16-bit chunks will be materialized with MOVK instructions.
///
/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order
/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with
/// an ORR instruction.
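///
/// For example (illustrative value), 0x00FF123400FF00FF contains three 0x00FF
/// chunks, and 0x00FF00FF00FF00FF is a valid logical immediate, so it expands
/// to:
///   ORR  Xd, XZR, #0x00ff00ff00ff00ff
///   MOVK Xd, #0x1234, lsl #32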
static bool tryToReplicateChunks(uint64_t UImm,
                                 SmallVectorImpl<ImmInsnModel> &Insn) {
  using CountMap = DenseMap<uint64_t, unsigned>;

  CountMap Counts;

  // Scan the constant and count how often every chunk occurs.
  for (unsigned Idx = 0; Idx < 4; ++Idx)
    ++Counts[getChunk(UImm, Idx)];

  // Traverse the chunks to find one which occurs more than once.
  for (const auto &Chunk : Counts) {
    const uint64_t ChunkVal = Chunk.first;
    const unsigned Count = Chunk.second;

    uint64_t Encoding = 0;

    // We are looking for chunks which have two or three instances and can be
    // materialized with an ORR instruction.
    if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding))
      continue;

    const bool CountThree = Count == 3;

    Insn.push_back({ AArch64::ORRXri, 0, Encoding });

    unsigned ShiftAmt = 0;
    uint64_t Imm16 = 0;
    // Find the first chunk not materialized with the ORR instruction.
    for (; ShiftAmt < 64; ShiftAmt += 16) {
      Imm16 = (UImm >> ShiftAmt) & 0xFFFF;

      if (Imm16 != ChunkVal)
        break;
    }

    // Create the first MOVK instruction.
    Insn.push_back({ AArch64::MOVKXi, Imm16,
                     AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt) });

    // In case we have three instances the whole constant is now materialized
    // and we can exit.
    if (CountThree)
      return true;

    // Find the remaining chunk which needs to be materialized.
    for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) {
      Imm16 = (UImm >> ShiftAmt) & 0xFFFF;

      if (Imm16 != ChunkVal)
        break;
    }
    Insn.push_back({ AArch64::MOVKXi, Imm16,
                     AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt) });
    return true;
  }

  return false;
}

/// Check whether this chunk matches the pattern '1...0...'. This pattern
/// starts a contiguous sequence of ones if we look at the bits from the LSB
/// towards the MSB.
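/// E.g. the sign-extended chunk 0xFFFFFFFFFFFFFF00 (16-bit chunk 0xFF00) is
/// a start chunk.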
static bool isStartChunk(uint64_t Chunk) {
  if (Chunk == 0 || Chunk == std::numeric_limits<uint64_t>::max())
    return false;

  return isMask_64(~Chunk);
}

/// Check whether this chunk matches the pattern '0...1...'. This pattern
/// ends a contiguous sequence of ones if we look at the bits from the LSB
/// towards the MSB.
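/// E.g. the sign-extended chunk 0x00000000000000FF (16-bit chunk 0x00FF) is
/// an end chunk.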
static bool isEndChunk(uint64_t Chunk) {
  if (Chunk == 0 || Chunk == std::numeric_limits<uint64_t>::max())
    return false;

  return isMask_64(Chunk);
}

/// Clear or set all bits in the chunk at the given index.
static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) {
  const uint64_t Mask = 0xFFFF;

  if (Clear)
    // Clear chunk in the immediate.
    Imm &= ~(Mask << (Idx * 16));
  else
    // Set all bits in the immediate for the particular chunk.
    Imm |= Mask << (Idx * 16);

  return Imm;
}

/// Check whether the constant contains a sequence of contiguous ones,
/// which might be interrupted by one or two chunks. If so, materialize the
/// sequence of contiguous ones with an ORR instruction.
/// Materialize the chunks which are either interrupting the sequence or outside
/// of the sequence with a MOVK instruction.
///
/// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk
/// which ends the sequence (0...1...). Then we are looking for constants which
/// contain at least one S and E chunk.
/// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|.
///
/// We are also looking for constants like |S|A|B|E| where the contiguous
/// sequence of ones wraps around the MSB into the LSB.
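///
/// For example (illustrative value), 0x00FF12345678FF00 has a start chunk at
/// index 0 and an end chunk at index 3, so it expands to:
///   ORR  Xd, XZR, #0x00ffffffffffff00
///   MOVK Xd, #0x5678, lsl #16
///   MOVK Xd, #0x1234, lsl #32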
static bool trySequenceOfOnes(uint64_t UImm,
                              SmallVectorImpl<ImmInsnModel> &Insn) {
  const int NotSet = -1;
  const uint64_t Mask = 0xFFFF;

  int StartIdx = NotSet;
  int EndIdx = NotSet;
  // Try to find the chunks which start/end a contiguous sequence of ones.
  for (int Idx = 0; Idx < 4; ++Idx) {
    int64_t Chunk = getChunk(UImm, Idx);
    // Sign extend the 16-bit chunk to 64-bit.
    Chunk = (Chunk << 48) >> 48;

    if (isStartChunk(Chunk))
      StartIdx = Idx;
    else if (isEndChunk(Chunk))
      EndIdx = Idx;
  }

  // Early exit in case we can't find a start/end chunk.
  if (StartIdx == NotSet || EndIdx == NotSet)
    return false;

  // Outside of the contiguous sequence of ones everything needs to be zero.
  uint64_t Outside = 0;
  // Chunks between the start and end chunk need to have all their bits set.
  uint64_t Inside = Mask;

  // If our contiguous sequence of ones wraps around from the MSB into the LSB,
  // just swap indices and pretend we are materializing a contiguous sequence
  // of zeros surrounded by a contiguous sequence of ones.
  if (StartIdx > EndIdx) {
    std::swap(StartIdx, EndIdx);
    std::swap(Outside, Inside);
  }

  uint64_t OrrImm = UImm;
  int FirstMovkIdx = NotSet;
  int SecondMovkIdx = NotSet;

  // Find out which chunks we need to patch up to obtain a contiguous sequence
  // of ones.
  for (int Idx = 0; Idx < 4; ++Idx) {
    const uint64_t Chunk = getChunk(UImm, Idx);

    // Check whether we are looking at a chunk which is not part of the
    // contiguous sequence of ones.
    if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) {
      OrrImm = updateImm(OrrImm, Idx, Outside == 0);

      // Remember the index we need to patch.
      if (FirstMovkIdx == NotSet)
        FirstMovkIdx = Idx;
      else
        SecondMovkIdx = Idx;
      // Check whether we are looking at a chunk which is part of the
      // contiguous sequence of ones.
    } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) {
      OrrImm = updateImm(OrrImm, Idx, Inside != Mask);

      // Remember the index we need to patch.
      if (FirstMovkIdx == NotSet)
        FirstMovkIdx = Idx;
      else
        SecondMovkIdx = Idx;
    }
  }
  assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!");

  // Create the ORR-immediate instruction.
  uint64_t Encoding = 0;
  AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding);
  Insn.push_back({ AArch64::ORRXri, 0, Encoding });

  const bool SingleMovk = SecondMovkIdx == NotSet;
  Insn.push_back({ AArch64::MOVKXi, getChunk(UImm, FirstMovkIdx),
                   AArch64_AM::getShifterImm(AArch64_AM::LSL,
                                             FirstMovkIdx * 16) });

  // Early exit in case we only need to emit a single MOVK instruction.
  if (SingleMovk)
    return true;

  // Create the second MOVK instruction.
  Insn.push_back({ AArch64::MOVKXi, getChunk(UImm, SecondMovkIdx),
                   AArch64_AM::getShifterImm(AArch64_AM::LSL,
                                             SecondMovkIdx * 16) });

  return true;
}

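// Return the run of contiguous ones in V that begins at bit StartPosition.
// For illustration, GetRunOfOnesStartingAt(0b01101100, 2) returns 0b00001100
// (the run of two ones beginning at bit 2).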
static uint64_t GetRunOfOnesStartingAt(uint64_t V, uint64_t StartPosition) {
  uint64_t NumOnes = llvm::countr_one(V >> StartPosition);

  uint64_t UnshiftedOnes;
  if (NumOnes == 64) {
    UnshiftedOnes = ~0ULL;
  } else {
    UnshiftedOnes = (1ULL << NumOnes) - 1;
  }
  return UnshiftedOnes << StartPosition;
}

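// Grow Subset by OR-ing in rotated copies of itself, as long as the result
// stays within V. For illustration, with V == 0x000F000F000F000F and
// Subset == 0xF, the run replicates into every 16-bit field and the whole of
// V is returned.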
static uint64_t MaximallyReplicateSubImmediate(uint64_t V, uint64_t Subset) {
  uint64_t Result = Subset;

  // 64, 32, 16, 8, 4, 2
  for (uint64_t i = 0; i < 6; ++i) {
    uint64_t Rotation = 1ULL << (6 - i);
    uint64_t Closure = Result | llvm::rotl<uint64_t>(Result, Rotation);
    if (Closure != (Closure & V)) {
      break;
    }
    Result = Closure;
  }

  return Result;
}

// Find the logical immediate that covers the most bits in RemainingBits,
// allowing for additional bits to be set that were set in OriginalBits.
static uint64_t maximalLogicalImmWithin(uint64_t RemainingBits,
                                        uint64_t OriginalBits) {
  // Find the first set bit.
  uint32_t Position = llvm::countr_zero(RemainingBits);

  // Get the first run of set bits.
  uint64_t FirstRun = GetRunOfOnesStartingAt(OriginalBits, Position);

  // Replicate the run as many times as possible, as long as the bits are set
  // in RemainingBits.
  uint64_t MaximalImm = MaximallyReplicateSubImmediate(OriginalBits, FirstRun);

  return MaximalImm;
}

static std::optional<std::pair<uint64_t, uint64_t>>
decomposeIntoOrrOfLogicalImmediates(uint64_t UImm) {
  if (UImm == 0 || ~UImm == 0)
    return std::nullopt;

  // Make sure we don't have a run of ones split around the rotation boundary.
  uint32_t InitialTrailingOnes = llvm::countr_one(UImm);
  uint64_t RotatedBits = llvm::rotr<uint64_t>(UImm, InitialTrailingOnes);

  // Find the largest logical immediate that fits within the full immediate.
  uint64_t MaximalImm1 = maximalLogicalImmWithin(RotatedBits, RotatedBits);

  // Remove all bits that are set by this mask.
  uint64_t RemainingBits = RotatedBits & ~MaximalImm1;

  // Find the largest logical immediate covering the remaining bits, allowing
  // for additional bits to be set that were also set in the original
  // immediate.
  uint64_t MaximalImm2 = maximalLogicalImmWithin(RemainingBits, RotatedBits);

  // If any bits still haven't been covered, then give up.
  if (RemainingBits & ~MaximalImm2)
    return std::nullopt;

  // Make sure to un-rotate the immediates.
  return std::make_pair(rotl(MaximalImm1, InitialTrailingOnes),
                        rotl(MaximalImm2, InitialTrailingOnes));
}

// Attempt to expand an immediate as the ORR of a pair of logical immediates.
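//
// For example, 0x55FF55FF55FF55FF == 0x5555555555555555 | 0x01FE01FE01FE01FE,
// and both halves are valid logical immediates, so it can be materialized as:
//   ORR Xd, XZR, #0x5555555555555555
//   ORR Xd, Xd,  #0x01fe01fe01fe01fe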
static bool tryOrrOfLogicalImmediates(uint64_t UImm,
                                      SmallVectorImpl<ImmInsnModel> &Insn) {
  auto MaybeDecomposition = decomposeIntoOrrOfLogicalImmediates(UImm);
  if (MaybeDecomposition == std::nullopt)
    return false;
  uint64_t Imm1 = MaybeDecomposition->first;
  uint64_t Imm2 = MaybeDecomposition->second;

  uint64_t Encoding1, Encoding2;
  bool Imm1Success = AArch64_AM::processLogicalImmediate(Imm1, 64, Encoding1);
  bool Imm2Success = AArch64_AM::processLogicalImmediate(Imm2, 64, Encoding2);

  if (Imm1Success && Imm2Success) {
    // Create the ORR-immediate instructions.
    Insn.push_back({AArch64::ORRXri, 0, Encoding1});
    Insn.push_back({AArch64::ORRXri, 1, Encoding2});
    return true;
  }

  return false;
}

// Attempt to expand an immediate as the AND of a pair of logical immediates.
// This is done by applying DeMorgan's law, under which logical immediates
// are closed.
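//
// For example, 0xAA00AA00AA00AA00 == 0xAAAAAAAAAAAAAAAA & 0xFE01FE01FE01FE01,
// so it can be materialized as:
//   ORR Xd, XZR, #0xaaaaaaaaaaaaaaaa
//   AND Xd, Xd,  #0xfe01fe01fe01fe01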
static bool tryAndOfLogicalImmediates(uint64_t UImm,
                                      SmallVectorImpl<ImmInsnModel> &Insn) {
  // Apply DeMorgan's law to turn this into an ORR problem.
  auto MaybeDecomposition = decomposeIntoOrrOfLogicalImmediates(~UImm);
  if (MaybeDecomposition == std::nullopt)
    return false;
  uint64_t Imm1 = MaybeDecomposition->first;
  uint64_t Imm2 = MaybeDecomposition->second;

  uint64_t Encoding1, Encoding2;
  bool Imm1Success = AArch64_AM::processLogicalImmediate(~Imm1, 64, Encoding1);
  bool Imm2Success = AArch64_AM::processLogicalImmediate(~Imm2, 64, Encoding2);

  if (Imm1Success && Imm2Success) {
    // Materialize ~Imm1, the LHS of the AND.
    Insn.push_back({AArch64::ORRXri, 0, Encoding1});
    // AND it with ~Imm2.
    Insn.push_back({AArch64::ANDXri, 1, Encoding2});
    return true;
  }

  return false;
}

// Check whether the constant can be represented by exclusive-or of two 64-bit
// logical immediates. If so, materialize it with an ORR instruction followed
// by an EOR instruction.
//
// This encoding allows all remaining repeated byte patterns, and many repeated
// 16-bit values, to be encoded without needing four instructions. It can also
// represent some irregular bitmasks (although those would mostly only need
// three instructions otherwise).
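//
// For example, the repeated byte pattern
// 0x9696969696969696 == 0x6666666666666666 ^ 0xF0F0F0F0F0F0F0F0, so it can
// be materialized as:
//   ORR Xd, XZR, #0x6666666666666666
//   EOR Xd, Xd,  #0xf0f0f0f0f0f0f0f0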
static bool tryEorOfLogicalImmediates(uint64_t Imm,
                                      SmallVectorImpl<ImmInsnModel> &Insn) {
  // Determine the larger repetition size of the two possible logical
  // immediates, by finding the repetition size of Imm.
  unsigned BigSize = 64;

  do {
    BigSize /= 2;
    uint64_t Mask = (1ULL << BigSize) - 1;

    if ((Imm & Mask) != ((Imm >> BigSize) & Mask)) {
      BigSize *= 2;
      break;
    }
  } while (BigSize > 2);

  uint64_t BigMask = ((uint64_t)-1LL) >> (64 - BigSize);

  // Find the last bit of each run of ones, circularly. For runs which wrap
  // around from bit 0 to bit 63, this is the bit before the most-significant
  // zero, otherwise it is the least-significant bit in the run of ones.
  uint64_t RunStarts = Imm & ~rotl<uint64_t>(Imm, 1);

  // Find the smaller repetition size of the two possible logical immediates by
  // counting the number of runs of one-bits within the BigSize-bit value. Both
  // sizes may be the same. The EOR may add one or subtract one from the
  // power-of-two count that can be represented by a logical immediate, or it
  // may be left unchanged.
  int RunsPerBigChunk = popcount(RunStarts & BigMask);

  static const int8_t BigToSmallSizeTable[32] = {
      -1, -1, 0, 1, 2, 2, -1, 3, 3, 3, -1, -1, -1, -1, -1, 4,
      4, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5,
  };

  int BigToSmallShift = BigToSmallSizeTable[RunsPerBigChunk];

  // Early-exit if the big chunk couldn't be a power-of-two number of runs
  // EORed with another single run.
  if (BigToSmallShift == -1)
    return false;

  unsigned SmallSize = BigSize >> BigToSmallShift;

  // 64-bit values with a bit set every (1 << index) bits.
  static const uint64_t RepeatedOnesTable[] = {
      0xffffffffffffffff, 0x5555555555555555, 0x1111111111111111,
      0x0101010101010101, 0x0001000100010001, 0x0000000100000001,
      0x0000000000000001,
  };

  // This RepeatedOnesTable lookup is a faster implementation of the division
  // 0xffffffffffffffff / ((1 << SmallSize) - 1), and can be thought of as
  // dividing the 64-bit value into fields of width SmallSize, and placing a
  // one in the least significant bit of each field.
  uint64_t SmallOnes = RepeatedOnesTable[countr_zero(SmallSize)];

  // Now we try to find the number of ones in each of the smaller repetitions,
  // by looking at runs of ones in Imm. This can take three attempts, as the
  // EOR may have changed the length of the first two runs we find.

  // Rotate a run of ones so we can count the number of trailing set bits.
  int Rotation = countr_zero(RunStarts);
  uint64_t RotatedImm = rotr<uint64_t>(Imm, Rotation);
  for (int Attempt = 0; Attempt < 3; ++Attempt) {
    unsigned RunLength = countr_one(RotatedImm);

    // Construct candidate values BigImm and SmallImm, such that if these two
    // values are encodable, we have a solution. (SmallImm is constructed to be
    // encodable, but this isn't guaranteed when RunLength >= SmallSize.)
    uint64_t SmallImm =
        rotl<uint64_t>((SmallOnes << RunLength) - SmallOnes, Rotation);
    uint64_t BigImm = Imm ^ SmallImm;

    uint64_t BigEncoding = 0;
    uint64_t SmallEncoding = 0;
    if (AArch64_AM::processLogicalImmediate(BigImm, 64, BigEncoding) &&
        AArch64_AM::processLogicalImmediate(SmallImm, 64, SmallEncoding)) {
      Insn.push_back({AArch64::ORRXri, 0, SmallEncoding});
      Insn.push_back({AArch64::EORXri, 1, BigEncoding});
      return true;
    }

    // Rotate to the next run of ones.
    Rotation += countr_zero(rotr<uint64_t>(RunStarts, Rotation) & ~1);
    RotatedImm = rotr<uint64_t>(Imm, Rotation);
  }

  return false;
}

/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to a
/// MOVZ or MOVN of width BitSize followed by up to 3 MOVK instructions.
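///
/// For example, 0x0000123400005678 expands to:
///   MOVZ Xd, #0x5678
///   MOVK Xd, #0x1234, lsl #32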
static inline void expandMOVImmSimple(uint64_t Imm, unsigned BitSize,
                                      unsigned OneChunks, unsigned ZeroChunks,
                                      SmallVectorImpl<ImmInsnModel> &Insn) {
  const unsigned Mask = 0xFFFF;

  // Use a MOVZ or MOVN instruction to set the high bits, followed by one or
  // more MOVK instructions to insert additional 16-bit portions into the
  // lower bits.
  bool isNeg = false;

  // Use MOVN to materialize the high bits if we have more all one chunks
  // than all zero chunks.
  if (OneChunks > ZeroChunks) {
    isNeg = true;
    Imm = ~Imm;
  }

  unsigned FirstOpc;
  if (BitSize == 32) {
    Imm &= (1LL << 32) - 1;
    FirstOpc = (isNeg ? AArch64::MOVNWi : AArch64::MOVZWi);
  } else {
    FirstOpc = (isNeg ? AArch64::MOVNXi : AArch64::MOVZXi);
  }
  unsigned Shift = 0;     // LSL amount for high bits with MOVZ/MOVN
  unsigned LastShift = 0; // LSL amount for last MOVK
  if (Imm != 0) {
    unsigned LZ = llvm::countl_zero(Imm);
    unsigned TZ = llvm::countr_zero(Imm);
    Shift = (TZ / 16) * 16;
    LastShift = ((63 - LZ) / 16) * 16;
  }
  unsigned Imm16 = (Imm >> Shift) & Mask;

  Insn.push_back({ FirstOpc, Imm16,
                   AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift) });

  if (Shift == LastShift)
    return;

  // If a MOVN was used for the high bits of a negative value, flip the rest
  // of the bits back for use with MOVK.
  if (isNeg)
    Imm = ~Imm;

  unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
  while (Shift < LastShift) {
    Shift += 16;
    Imm16 = (Imm >> Shift) & Mask;
    if (Imm16 == (isNeg ? Mask : 0))
      continue; // This 16-bit portion is already set correctly.

    Insn.push_back({ Opc, Imm16,
                     AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift) });
  }

  // If the high and low 32-bit halves of the 64-bit immediate are identical,
  // the trailing MOVKs can be replaced by a single ORR with a
  // shifted-register operand that copies the low half into the high half.
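  // For example (illustrative), 0x1234567812345678 first expands to MOVZ
  // #0x5678; MOVK #0x1234, lsl #16; MOVK #0x5678, lsl #32; MOVK #0x1234,
  // lsl #48; the last two MOVKs are then dropped in favor of the ORRXrs
  // entry, which the pseudo expansion turns into an ORR of the register with
  // itself shifted left by 32.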
  if (Insn.size() > 2 && (Imm >> 32) == (Imm & 0xffffffffULL)) {
    for (int Size = Insn.size(); Size > 2; Size--)
      Insn.pop_back();
    Insn.push_back({AArch64::ORRXrs, 0, 32});
  }
}

/// Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
/// real move-immediate instructions to synthesize the immediate.
void AArch64_IMM::expandMOVImm(uint64_t Imm, unsigned BitSize,
                               SmallVectorImpl<ImmInsnModel> &Insn) {
  const unsigned Mask = 0xFFFF;

  // Scan the immediate and count the number of 16-bit chunks which are either
  // all ones or all zeros.
  unsigned OneChunks = 0;
  unsigned ZeroChunks = 0;
  for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
    const unsigned Chunk = (Imm >> Shift) & Mask;
    if (Chunk == Mask)
      OneChunks++;
    else if (Chunk == 0)
      ZeroChunks++;
  }

  // Prefer MOVZ/MOVN over ORR because of the rules for the "mov" alias.
  if ((BitSize / 16) - OneChunks <= 1 || (BitSize / 16) - ZeroChunks <= 1) {
    expandMOVImmSimple(Imm, BitSize, OneChunks, ZeroChunks, Insn);
    return;
  }

  // Try a single ORR.
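  // For example, 0x0000FFFF0000FFFF is itself a valid logical immediate
  // (a run of 16 ones in each 32-bit element), so one ORR from XZR suffices.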
  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
  uint64_t Encoding;
  if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
    unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri);
    Insn.push_back({ Opc, 0, Encoding });
    return;
  }

  // Two instruction sequences.
  //
  // Prefer MOVZ/MOVN followed by MOVK; it's more readable, and possibly the
  // fastest sequence with fast literal generation.
  if (OneChunks >= (BitSize / 16) - 2 || ZeroChunks >= (BitSize / 16) - 2) {
    expandMOVImmSimple(Imm, BitSize, OneChunks, ZeroChunks, Insn);
    return;
  }

  assert(BitSize == 64 && "All 32-bit immediates can be expanded with a "
                          "MOVZ/MOVK pair");

  // Try other two-instruction sequences.

  // 64-bit ORR followed by MOVK.
  // We try to construct the ORR immediate in three different ways: either we
  // zero out the chunk which will be replaced, we fill the chunk which will
  // be replaced with ones, or we take the bit pattern from the other half of
  // the 64-bit immediate. This is comprehensive because of the way ORR
  // immediates are constructed.
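  // For example (illustrative value), 0x5555555555551234 is caught by the
  // third case: replicating the other half's bit pattern into the low chunk
  // yields the valid logical immediate 0x5555555555555555, giving:
  //   ORR  Xd, XZR, #0x5555555555555555
  //   MOVK Xd, #0x1234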
  for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
    uint64_t ShiftedMask = (0xFFFFULL << Shift);
    uint64_t ZeroChunk = UImm & ~ShiftedMask;
    uint64_t OneChunk = UImm | ShiftedMask;
    uint64_t RotatedImm = (UImm << 32) | (UImm >> 32);
    uint64_t ReplicateChunk = ZeroChunk | (RotatedImm & ShiftedMask);
    if (AArch64_AM::processLogicalImmediate(ZeroChunk, BitSize, Encoding) ||
        AArch64_AM::processLogicalImmediate(OneChunk, BitSize, Encoding) ||
        AArch64_AM::processLogicalImmediate(ReplicateChunk, BitSize,
                                            Encoding)) {
      // Create the ORR-immediate instruction.
      Insn.push_back({ AArch64::ORRXri, 0, Encoding });

      // Create the MOVK instruction.
      const unsigned Imm16 = getChunk(UImm, Shift / 16);
      Insn.push_back({ AArch64::MOVKXi, Imm16,
                       AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift) });
      return;
    }
  }

  // Attempt to use a sequence of two ORR-immediate instructions.
  if (tryOrrOfLogicalImmediates(Imm, Insn))
    return;

  // Attempt to use a sequence of ORR-immediate followed by AND-immediate.
  if (tryAndOfLogicalImmediates(Imm, Insn))
    return;

  // Attempt to use a sequence of ORR-immediate followed by EOR-immediate.
  if (tryEorOfLogicalImmediates(UImm, Insn))
    return;

  // FIXME: Add more two-instruction sequences.

  // Three instruction sequences.
  //
  // Prefer MOVZ/MOVN followed by two MOVK; it's more readable, and possibly
  // the fastest sequence with fast literal generation. (If neither MOVK is
  // part of a fast literal generation pair, it could be slower than the
  // four-instruction sequence, but we won't worry about that for now.)
  if (OneChunks || ZeroChunks) {
    expandMOVImmSimple(Imm, BitSize, OneChunks, ZeroChunks, Insn);
    return;
  }

  // Check for identical 16-bit chunks within the constant and if so
  // materialize them with a single ORR instruction. The remaining one or two
  // 16-bit chunks will be materialized with MOVK instructions.
  if (BitSize == 64 && tryToReplicateChunks(UImm, Insn))
    return;

  // Check whether the constant contains a sequence of contiguous ones, which
  // might be interrupted by one or two chunks. If so, materialize the sequence
  // of contiguous ones with an ORR instruction. Materialize the chunks which
  // are either interrupting the sequence or outside of the sequence with a
  // MOVK instruction.
  if (BitSize == 64 && trySequenceOfOnes(UImm, Insn))
    return;

  // We found no possible two or three instruction sequence; use the general
  // four-instruction sequence.
  expandMOVImmSimple(Imm, BitSize, OneChunks, ZeroChunks, Insn);
}