| 1 | //===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===// | 
|---|
| 2 | // | 
|---|
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
|---|
| 4 | // See https://llvm.org/LICENSE.txt for license information. | 
|---|
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|---|
| 6 | // | 
|---|
| 7 | //===----------------------------------------------------------------------===// | 
|---|
| 8 | // | 
|---|
| 9 | // This file implements the ARMSelectionDAGInfo class. | 
|---|
| 10 | // | 
|---|
| 11 | //===----------------------------------------------------------------------===// | 
|---|
| 12 |  | 
|---|
| 13 | #include "ARMTargetTransformInfo.h" | 
|---|
| 14 | #include "llvm/CodeGen/SelectionDAG.h" | 
|---|
| 15 | #include "llvm/Support/CommandLine.h" | 
|---|
| 16 | using namespace llvm; | 
|---|
| 17 |  | 
|---|
| 18 | #define DEBUG_TYPE "arm-selectiondag-info" | 
|---|
| 19 |  | 
|---|
| 20 | static cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop( | 
|---|
| 21 | "arm-memtransfer-tploop", cl::Hidden, | 
|---|
| 22 | cl::desc( "Control conversion of memcpy to " | 
|---|
| 23 | "Tail predicated loops (WLSTP)"), | 
|---|
| 24 | cl::init(Val: TPLoop::ForceDisabled), | 
|---|
| 25 | cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled", | 
|---|
| 26 | "Don't convert memcpy to TP loop."), | 
|---|
| 27 | clEnumValN(TPLoop::ForceEnabled, "force-enabled", | 
|---|
| 28 | "Always convert memcpy to TP loop."), | 
|---|
| 29 | clEnumValN(TPLoop::Allow, "allow", | 
|---|
| 30 | "Allow (may be subject to certain conditions) " | 
|---|
| 31 | "conversion of memcpy to TP loop."))); | 
|---|
| 32 |  | 
|---|
| 33 | bool ARMSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { | 
|---|
| 34 | return Opcode >= ARMISD::FIRST_MEMORY_OPCODE && | 
|---|
| 35 | Opcode <= ARMISD::LAST_MEMORY_OPCODE; | 
|---|
| 36 | } | 
|---|
| 37 |  | 
|---|
| 38 | // Emit, if possible, a specialized version of the given Libcall. Typically this | 
|---|
| 39 | // means selecting the appropriately aligned version, but we also convert memset | 
|---|
| 40 | // of 0 into memclr. | 
|---|
| 41 | SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( | 
|---|
| 42 | SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, | 
|---|
| 43 | SDValue Size, unsigned Align, RTLIB::Libcall LC) const { | 
|---|
| 44 | const ARMSubtarget &Subtarget = | 
|---|
| 45 | DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); | 
|---|
| 46 | const ARMTargetLowering *TLI = Subtarget.getTargetLowering(); | 
|---|
| 47 |  | 
|---|
| 48 | // Only use a specialized AEABI function if the default version of this | 
|---|
| 49 | // Libcall is an AEABI function. | 
|---|
| 50 | if (std::strncmp(s1: TLI->getLibcallName(Call: LC), s2: "__aeabi", n: 7) != 0) | 
|---|
| 51 | return SDValue(); | 
|---|
| 52 |  | 
|---|
| 53 | // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be | 
|---|
| 54 | // able to translate memset to memclr and use the value to index the function | 
|---|
| 55 | // name array. | 
|---|
| 56 | enum { | 
|---|
| 57 | AEABI_MEMCPY = 0, | 
|---|
| 58 | AEABI_MEMMOVE, | 
|---|
| 59 | AEABI_MEMSET, | 
|---|
| 60 | AEABI_MEMCLR | 
|---|
| 61 | } AEABILibcall; | 
|---|
| 62 | switch (LC) { | 
|---|
| 63 | case RTLIB::MEMCPY: | 
|---|
| 64 | AEABILibcall = AEABI_MEMCPY; | 
|---|
| 65 | break; | 
|---|
| 66 | case RTLIB::MEMMOVE: | 
|---|
| 67 | AEABILibcall = AEABI_MEMMOVE; | 
|---|
| 68 | break; | 
|---|
| 69 | case RTLIB::MEMSET: | 
|---|
| 70 | AEABILibcall = AEABI_MEMSET; | 
|---|
| 71 | if (isNullConstant(V: Src)) | 
|---|
| 72 | AEABILibcall = AEABI_MEMCLR; | 
|---|
| 73 | break; | 
|---|
| 74 | default: | 
|---|
| 75 | return SDValue(); | 
|---|
| 76 | } | 
|---|
| 77 |  | 
|---|
| 78 | // Choose the most-aligned libcall variant that we can | 
|---|
| 79 | enum { | 
|---|
| 80 | ALIGN1 = 0, | 
|---|
| 81 | ALIGN4, | 
|---|
| 82 | ALIGN8 | 
|---|
| 83 | } AlignVariant; | 
|---|
| 84 | if ((Align & 7) == 0) | 
|---|
| 85 | AlignVariant = ALIGN8; | 
|---|
| 86 | else if ((Align & 3) == 0) | 
|---|
| 87 | AlignVariant = ALIGN4; | 
|---|
| 88 | else | 
|---|
| 89 | AlignVariant = ALIGN1; | 
|---|
| 90 |  | 
|---|
| 91 | TargetLowering::ArgListTy Args; | 
|---|
| 92 | TargetLowering::ArgListEntry Entry; | 
|---|
| 93 | Entry.Ty = DAG.getDataLayout().getIntPtrType(C&: *DAG.getContext()); | 
|---|
| 94 | Entry.Node = Dst; | 
|---|
| 95 | Args.push_back(x: Entry); | 
|---|
| 96 | if (AEABILibcall == AEABI_MEMCLR) { | 
|---|
| 97 | Entry.Node = Size; | 
|---|
| 98 | Args.push_back(x: Entry); | 
|---|
| 99 | } else if (AEABILibcall == AEABI_MEMSET) { | 
|---|
| 100 | // Adjust parameters for memset, EABI uses format (ptr, size, value), | 
|---|
| 101 | // GNU library uses (ptr, value, size) | 
|---|
| 102 | // See RTABI section 4.3.4 | 
|---|
| 103 | Entry.Node = Size; | 
|---|
| 104 | Args.push_back(x: Entry); | 
|---|
| 105 |  | 
|---|
| 106 | // Extend or truncate the argument to be an i32 value for the call. | 
|---|
| 107 | if (Src.getValueType().bitsGT(VT: MVT::i32)) | 
|---|
| 108 | Src = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i32, Operand: Src); | 
|---|
| 109 | else if (Src.getValueType().bitsLT(VT: MVT::i32)) | 
|---|
| 110 | Src = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MVT::i32, Operand: Src); | 
|---|
| 111 |  | 
|---|
| 112 | Entry.Node = Src; | 
|---|
| 113 | Entry.Ty = Type::getInt32Ty(C&: *DAG.getContext()); | 
|---|
| 114 | Entry.IsSExt = false; | 
|---|
| 115 | Args.push_back(x: Entry); | 
|---|
| 116 | } else { | 
|---|
| 117 | Entry.Node = Src; | 
|---|
| 118 | Args.push_back(x: Entry); | 
|---|
| 119 |  | 
|---|
| 120 | Entry.Node = Size; | 
|---|
| 121 | Args.push_back(x: Entry); | 
|---|
| 122 | } | 
|---|
| 123 |  | 
|---|
| 124 | static const RTLIB::Libcall FunctionImpls[4][3] = { | 
|---|
| 125 | {RTLIB::MEMCPY, RTLIB::AEABI_MEMCPY4, RTLIB::AEABI_MEMCPY8}, | 
|---|
| 126 | {RTLIB::MEMMOVE, RTLIB::AEABI_MEMMOVE4, RTLIB::AEABI_MEMMOVE8}, | 
|---|
| 127 | {RTLIB::MEMSET, RTLIB::AEABI_MEMSET4, RTLIB::AEABI_MEMSET8}, | 
|---|
| 128 | {RTLIB::AEABI_MEMCLR, RTLIB::AEABI_MEMCLR4, RTLIB::AEABI_MEMCLR8}}; | 
|---|
| 129 |  | 
|---|
| 130 | RTLIB::Libcall NewLC = FunctionImpls[AEABILibcall][AlignVariant]; | 
|---|
| 131 |  | 
|---|
| 132 | TargetLowering::CallLoweringInfo CLI(DAG); | 
|---|
| 133 | CLI.setDebugLoc(dl) | 
|---|
| 134 | .setChain(Chain) | 
|---|
| 135 | .setLibCallee( | 
|---|
| 136 | CC: TLI->getLibcallCallingConv(Call: NewLC), ResultType: Type::getVoidTy(C&: *DAG.getContext()), | 
|---|
| 137 | Target: DAG.getExternalSymbol(Sym: TLI->getLibcallName(Call: NewLC), | 
|---|
| 138 | VT: TLI->getPointerTy(DL: DAG.getDataLayout())), | 
|---|
| 139 | ArgsList: std::move(Args)) | 
|---|
| 140 | .setDiscardResult(); | 
|---|
| 141 | std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI); | 
|---|
| 142 |  | 
|---|
| 143 | return CallResult.second; | 
|---|
| 144 | } | 
|---|
| 145 |  | 
|---|
| 146 | static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget, | 
|---|
| 147 | const SelectionDAG &DAG, | 
|---|
| 148 | ConstantSDNode *ConstantSize, | 
|---|
| 149 | Align Alignment, bool IsMemcpy) { | 
|---|
| 150 | auto &F = DAG.getMachineFunction().getFunction(); | 
|---|
| 151 | if (!EnableMemtransferTPLoop) | 
|---|
| 152 | return false; | 
|---|
| 153 | if (EnableMemtransferTPLoop == TPLoop::ForceEnabled) | 
|---|
| 154 | return true; | 
|---|
| 155 | // Do not generate inline TP loop if optimizations is disabled, | 
|---|
| 156 | // or if optimization for size (-Os or -Oz) is on. | 
|---|
| 157 | if (F.hasOptNone() || F.hasOptSize()) | 
|---|
| 158 | return false; | 
|---|
| 159 | // If cli option is unset, for memset always generate inline TP. | 
|---|
| 160 | // For memcpy, check some conditions | 
|---|
| 161 | if (!IsMemcpy) | 
|---|
| 162 | return true; | 
|---|
| 163 | if (!ConstantSize && Alignment >= Align(4)) | 
|---|
| 164 | return true; | 
|---|
| 165 | if (ConstantSize && | 
|---|
| 166 | ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() && | 
|---|
| 167 | ConstantSize->getZExtValue() < | 
|---|
| 168 | Subtarget.getMaxMemcpyTPInlineSizeThreshold()) | 
|---|
| 169 | return true; | 
|---|
| 170 | return false; | 
|---|
| 171 | } | 
|---|
| 172 |  | 
|---|
| 173 | SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy( | 
|---|
| 174 | SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, | 
|---|
| 175 | SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, | 
|---|
| 176 | MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { | 
|---|
| 177 | const ARMSubtarget &Subtarget = | 
|---|
| 178 | DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); | 
|---|
| 179 | ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Val&: Size); | 
|---|
| 180 |  | 
|---|
| 181 | if (Subtarget.hasMVEIntegerOps() && | 
|---|
| 182 | shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, IsMemcpy: true)) | 
|---|
| 183 | return DAG.getNode(Opcode: ARMISD::MEMCPYLOOP, DL: dl, VT: MVT::Other, N1: Chain, N2: Dst, N3: Src, | 
|---|
| 184 | N4: DAG.getZExtOrTrunc(Op: Size, DL: dl, VT: MVT::i32)); | 
|---|
| 185 |  | 
|---|
| 186 | // Do repeated 4-byte loads and stores. To be improved. | 
|---|
| 187 | // This requires 4-byte alignment. | 
|---|
| 188 | if (Alignment < Align(4)) | 
|---|
| 189 | return SDValue(); | 
|---|
| 190 | // This requires the copy size to be a constant, preferably | 
|---|
| 191 | // within a subtarget-specific limit. | 
|---|
| 192 | if (!ConstantSize) | 
|---|
| 193 | return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, | 
|---|
| 194 | Align: Alignment.value(), LC: RTLIB::MEMCPY); | 
|---|
| 195 | uint64_t SizeVal = ConstantSize->getZExtValue(); | 
|---|
| 196 | if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold()) | 
|---|
| 197 | return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, | 
|---|
| 198 | Align: Alignment.value(), LC: RTLIB::MEMCPY); | 
|---|
| 199 |  | 
|---|
| 200 | unsigned BytesLeft = SizeVal & 3; | 
|---|
| 201 | unsigned NumMemOps = SizeVal >> 2; | 
|---|
| 202 | unsigned EmittedNumMemOps = 0; | 
|---|
| 203 | EVT VT = MVT::i32; | 
|---|
| 204 | unsigned VTSize = 4; | 
|---|
| 205 | unsigned i = 0; | 
|---|
| 206 | // Emit a maximum of 4 loads in Thumb1 since we have fewer registers | 
|---|
| 207 | const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6; | 
|---|
| 208 | SDValue TFOps[6]; | 
|---|
| 209 | SDValue Loads[6]; | 
|---|
| 210 | uint64_t SrcOff = 0, DstOff = 0; | 
|---|
| 211 |  | 
|---|
| 212 | // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to | 
|---|
| 213 | // VLDM/VSTM and make this code emit it when appropriate. This would reduce | 
|---|
| 214 | // pressure on the general purpose registers. However this seems harder to map | 
|---|
| 215 | // onto the register allocator's view of the world. | 
|---|
| 216 |  | 
|---|
| 217 | // The number of MEMCPY pseudo-instructions to emit. We use up to | 
|---|
| 218 | // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm | 
|---|
| 219 | // later on. This is a lower bound on the number of MEMCPY operations we must | 
|---|
| 220 | // emit. | 
|---|
| 221 | unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM; | 
|---|
| 222 |  | 
|---|
| 223 | // Code size optimisation: do not inline memcpy if expansion results in | 
|---|
| 224 | // more instructions than the libary call. | 
|---|
| 225 | if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) { | 
|---|
| 226 | return SDValue(); | 
|---|
| 227 | } | 
|---|
| 228 |  | 
|---|
| 229 | SDVTList VTs = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i32, VT3: MVT::Other, VT4: MVT::Glue); | 
|---|
| 230 |  | 
|---|
| 231 | for (unsigned I = 0; I != NumMEMCPYs; ++I) { | 
|---|
| 232 | // Evenly distribute registers among MEMCPY operations to reduce register | 
|---|
| 233 | // pressure. | 
|---|
| 234 | unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs; | 
|---|
| 235 | unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps; | 
|---|
| 236 |  | 
|---|
| 237 | Dst = DAG.getNode(Opcode: ARMISD::MEMCPY, DL: dl, VTList: VTs, N1: Chain, N2: Dst, N3: Src, | 
|---|
| 238 | N4: DAG.getConstant(Val: NumRegs, DL: dl, VT: MVT::i32)); | 
|---|
| 239 | Src = Dst.getValue(R: 1); | 
|---|
| 240 | Chain = Dst.getValue(R: 2); | 
|---|
| 241 |  | 
|---|
| 242 | DstPtrInfo = DstPtrInfo.getWithOffset(O: NumRegs * VTSize); | 
|---|
| 243 | SrcPtrInfo = SrcPtrInfo.getWithOffset(O: NumRegs * VTSize); | 
|---|
| 244 |  | 
|---|
| 245 | EmittedNumMemOps = NextEmittedNumMemOps; | 
|---|
| 246 | } | 
|---|
| 247 |  | 
|---|
| 248 | if (BytesLeft == 0) | 
|---|
| 249 | return Chain; | 
|---|
| 250 |  | 
|---|
| 251 | // Issue loads / stores for the trailing (1 - 3) bytes. | 
|---|
| 252 | auto getRemainingValueType = [](unsigned BytesLeft) { | 
|---|
| 253 | return (BytesLeft >= 2) ? MVT::i16 : MVT::i8; | 
|---|
| 254 | }; | 
|---|
| 255 | auto getRemainingSize = [](unsigned BytesLeft) { | 
|---|
| 256 | return (BytesLeft >= 2) ? 2 : 1; | 
|---|
| 257 | }; | 
|---|
| 258 |  | 
|---|
| 259 | unsigned BytesLeftSave = BytesLeft; | 
|---|
| 260 | i = 0; | 
|---|
| 261 | while (BytesLeft) { | 
|---|
| 262 | VT = getRemainingValueType(BytesLeft); | 
|---|
| 263 | VTSize = getRemainingSize(BytesLeft); | 
|---|
| 264 | Loads[i] = DAG.getLoad(VT, dl, Chain, | 
|---|
| 265 | Ptr: DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32, N1: Src, | 
|---|
| 266 | N2: DAG.getConstant(Val: SrcOff, DL: dl, VT: MVT::i32)), | 
|---|
| 267 | PtrInfo: SrcPtrInfo.getWithOffset(O: SrcOff)); | 
|---|
| 268 | TFOps[i] = Loads[i].getValue(R: 1); | 
|---|
| 269 | ++i; | 
|---|
| 270 | SrcOff += VTSize; | 
|---|
| 271 | BytesLeft -= VTSize; | 
|---|
| 272 | } | 
|---|
| 273 | Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: ArrayRef(TFOps, i)); | 
|---|
| 274 |  | 
|---|
| 275 | i = 0; | 
|---|
| 276 | BytesLeft = BytesLeftSave; | 
|---|
| 277 | while (BytesLeft) { | 
|---|
| 278 | VT = getRemainingValueType(BytesLeft); | 
|---|
| 279 | VTSize = getRemainingSize(BytesLeft); | 
|---|
| 280 | TFOps[i] = DAG.getStore(Chain, dl, Val: Loads[i], | 
|---|
| 281 | Ptr: DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32, N1: Dst, | 
|---|
| 282 | N2: DAG.getConstant(Val: DstOff, DL: dl, VT: MVT::i32)), | 
|---|
| 283 | PtrInfo: DstPtrInfo.getWithOffset(O: DstOff)); | 
|---|
| 284 | ++i; | 
|---|
| 285 | DstOff += VTSize; | 
|---|
| 286 | BytesLeft -= VTSize; | 
|---|
| 287 | } | 
|---|
| 288 | return DAG.getNode(Opcode: ISD::TokenFactor, DL: dl, VT: MVT::Other, Ops: ArrayRef(TFOps, i)); | 
|---|
| 289 | } | 
|---|
| 290 |  | 
|---|
| 291 | SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove( | 
|---|
| 292 | SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, | 
|---|
| 293 | SDValue Size, Align Alignment, bool isVolatile, | 
|---|
| 294 | MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { | 
|---|
| 295 | return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, | 
|---|
| 296 | Align: Alignment.value(), LC: RTLIB::MEMMOVE); | 
|---|
| 297 | } | 
|---|
| 298 |  | 
|---|
| 299 | SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset( | 
|---|
| 300 | SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, | 
|---|
| 301 | SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, | 
|---|
| 302 | MachinePointerInfo DstPtrInfo) const { | 
|---|
| 303 |  | 
|---|
| 304 | const ARMSubtarget &Subtarget = | 
|---|
| 305 | DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); | 
|---|
| 306 |  | 
|---|
| 307 | ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Val&: Size); | 
|---|
| 308 |  | 
|---|
| 309 | // Generate TP loop for llvm.memset | 
|---|
| 310 | if (Subtarget.hasMVEIntegerOps() && | 
|---|
| 311 | shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, | 
|---|
| 312 | IsMemcpy: false)) { | 
|---|
| 313 | Src = DAG.getSplatBuildVector(VT: MVT::v16i8, DL: dl, | 
|---|
| 314 | Op: DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i8, Operand: Src)); | 
|---|
| 315 | return DAG.getNode(Opcode: ARMISD::MEMSETLOOP, DL: dl, VT: MVT::Other, N1: Chain, N2: Dst, N3: Src, | 
|---|
| 316 | N4: DAG.getZExtOrTrunc(Op: Size, DL: dl, VT: MVT::i32)); | 
|---|
| 317 | } | 
|---|
| 318 |  | 
|---|
| 319 | if (!AlwaysInline) | 
|---|
| 320 | return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, | 
|---|
| 321 | Align: Alignment.value(), LC: RTLIB::MEMSET); | 
|---|
| 322 |  | 
|---|
| 323 | return SDValue(); | 
|---|
| 324 | } | 
|---|
| 325 |  | 
|---|