| 1 | //===------ CGGPUBuiltin.cpp - Codegen for GPU builtins -------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // Generates code for built-in GPU calls which are not runtime-specific. |
| 10 | // (Runtime-specific codegen lives in programming model specific files.) |
| 11 | // |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "CodeGenFunction.h" |
| 15 | #include "clang/Basic/Builtins.h" |
| 16 | #include "llvm/IR/DataLayout.h" |
| 17 | #include "llvm/IR/Instruction.h" |
| 18 | #include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h" |
| 19 | |
| 20 | using namespace clang; |
| 21 | using namespace CodeGen; |
| 22 | |
| 23 | namespace { |
| 24 | llvm::Function *GetVprintfDeclaration(llvm::Module &M) { |
| 25 | llvm::Type *ArgTypes[] = {llvm::PointerType::getUnqual(C&: M.getContext()), |
| 26 | llvm::PointerType::getUnqual(C&: M.getContext())}; |
| 27 | llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get( |
| 28 | Result: llvm::Type::getInt32Ty(C&: M.getContext()), Params: ArgTypes, isVarArg: false); |
| 29 | |
| 30 | if (auto *F = M.getFunction(Name: "vprintf" )) { |
| 31 | // Our CUDA system header declares vprintf with the right signature, so |
| 32 | // nobody else should have been able to declare vprintf with a bogus |
| 33 | // signature. |
| 34 | assert(F->getFunctionType() == VprintfFuncType); |
| 35 | return F; |
| 36 | } |
| 37 | |
| 38 | // vprintf doesn't already exist; create a declaration and insert it into the |
| 39 | // module. |
| 40 | return llvm::Function::Create( |
| 41 | Ty: VprintfFuncType, Linkage: llvm::GlobalVariable::ExternalLinkage, N: "vprintf" , M: &M); |
| 42 | } |
| 43 | |
| 44 | // Transforms a call to printf into a call to the NVPTX vprintf syscall (which |
| 45 | // isn't particularly special; it's invoked just like a regular function). |
| 46 | // vprintf takes two args: A format string, and a pointer to a buffer containing |
| 47 | // the varargs. |
| 48 | // |
| 49 | // For example, the call |
| 50 | // |
| 51 | // printf("format string", arg1, arg2, arg3); |
| 52 | // |
| 53 | // is converted into something resembling |
| 54 | // |
| 55 | // struct Tmp { |
| 56 | // Arg1 a1; |
| 57 | // Arg2 a2; |
| 58 | // Arg3 a3; |
| 59 | // }; |
| 60 | // char* buf = alloca(sizeof(Tmp)); |
| 61 | // *(Tmp*)buf = {a1, a2, a3}; |
| 62 | // vprintf("format string", buf); |
| 63 | // |
| 64 | // buf is aligned to the max of {alignof(Arg1), ...}. Furthermore, each of the |
| 65 | // args is itself aligned to its preferred alignment. |
| 66 | // |
| 67 | // Note that by the time this function runs, E's args have already undergone the |
| 68 | // standard C vararg promotion (short -> int, float -> double, etc.). |
| 69 | |
| 70 | std::pair<llvm::Value *, llvm::TypeSize> |
| 71 | packArgsIntoNVPTXFormatBuffer(CodeGenFunction *CGF, const CallArgList &Args) { |
| 72 | const llvm::DataLayout &DL = CGF->CGM.getDataLayout(); |
| 73 | llvm::LLVMContext &Ctx = CGF->CGM.getLLVMContext(); |
| 74 | CGBuilderTy &Builder = CGF->Builder; |
| 75 | |
| 76 | // Construct and fill the args buffer that we'll pass to vprintf. |
| 77 | if (Args.size() <= 1) { |
| 78 | // If there are no args, pass a null pointer and size 0 |
| 79 | llvm::Value *BufferPtr = |
| 80 | llvm::ConstantPointerNull::get(T: llvm::PointerType::getUnqual(C&: Ctx)); |
| 81 | return {BufferPtr, llvm::TypeSize::getFixed(ExactSize: 0)}; |
| 82 | } else { |
| 83 | llvm::SmallVector<llvm::Type *, 8> ArgTypes; |
| 84 | for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) |
| 85 | ArgTypes.push_back(Elt: Args[I].getRValue(CGF&: *CGF).getScalarVal()->getType()); |
| 86 | |
| 87 | // Using llvm::StructType is correct only because printf doesn't accept |
| 88 | // aggregates. If we had to handle aggregates here, we'd have to manually |
| 89 | // compute the offsets within the alloca -- we wouldn't be able to assume |
| 90 | // that the alignment of the llvm type was the same as the alignment of the |
| 91 | // clang type. |
| 92 | llvm::Type *AllocaTy = llvm::StructType::create(Elements: ArgTypes, Name: "printf_args" ); |
| 93 | llvm::Value *Alloca = CGF->CreateTempAlloca(Ty: AllocaTy); |
| 94 | |
| 95 | for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) { |
| 96 | llvm::Value *P = Builder.CreateStructGEP(Ty: AllocaTy, Ptr: Alloca, Idx: I - 1); |
| 97 | llvm::Value *Arg = Args[I].getRValue(CGF&: *CGF).getScalarVal(); |
| 98 | Builder.CreateAlignedStore(Val: Arg, Ptr: P, Align: DL.getPrefTypeAlign(Ty: Arg->getType())); |
| 99 | } |
| 100 | llvm::Value *BufferPtr = |
| 101 | Builder.CreatePointerCast(V: Alloca, DestTy: llvm::PointerType::getUnqual(C&: Ctx)); |
| 102 | return {BufferPtr, DL.getTypeAllocSize(Ty: AllocaTy)}; |
| 103 | } |
| 104 | } |
| 105 | |
| 106 | bool containsNonScalarVarargs(CodeGenFunction *CGF, const CallArgList &Args) { |
| 107 | return llvm::any_of(Range: llvm::drop_begin(RangeOrContainer: Args), P: [&](const CallArg &A) { |
| 108 | return !A.getRValue(CGF&: *CGF).isScalar(); |
| 109 | }); |
| 110 | } |
| 111 | |
| 112 | RValue EmitDevicePrintfCallExpr(const CallExpr *E, CodeGenFunction *CGF, |
| 113 | llvm::Function *Decl, bool WithSizeArg) { |
| 114 | CodeGenModule &CGM = CGF->CGM; |
| 115 | CGBuilderTy &Builder = CGF->Builder; |
| 116 | assert(E->getBuiltinCallee() == Builtin::BIprintf || |
| 117 | E->getBuiltinCallee() == Builtin::BI__builtin_printf); |
| 118 | assert(E->getNumArgs() >= 1); // printf always has at least one arg. |
| 119 | |
| 120 | // Uses the same format as nvptx for the argument packing, but also passes |
| 121 | // an i32 for the total size of the passed pointer |
| 122 | CallArgList Args; |
| 123 | CGF->EmitCallArgs(Args, |
| 124 | Prototype: E->getDirectCallee()->getType()->getAs<FunctionProtoType>(), |
| 125 | ArgRange: E->arguments(), AC: E->getDirectCallee(), |
| 126 | /* ParamsToSkip = */ 0); |
| 127 | |
| 128 | // We don't know how to emit non-scalar varargs. |
| 129 | if (containsNonScalarVarargs(CGF, Args)) { |
| 130 | CGM.ErrorUnsupported(S: E, Type: "non-scalar arg to printf" ); |
| 131 | return RValue::get(V: llvm::ConstantInt::get(Ty: CGF->IntTy, V: 0)); |
| 132 | } |
| 133 | |
| 134 | auto r = packArgsIntoNVPTXFormatBuffer(CGF, Args); |
| 135 | llvm::Value *BufferPtr = r.first; |
| 136 | |
| 137 | llvm::SmallVector<llvm::Value *, 3> Vec = { |
| 138 | Args[0].getRValue(CGF&: *CGF).getScalarVal(), BufferPtr}; |
| 139 | if (WithSizeArg) { |
| 140 | // Passing > 32bit of data as a local alloca doesn't work for nvptx or |
| 141 | // amdgpu |
| 142 | llvm::Constant *Size = |
| 143 | llvm::ConstantInt::get(Ty: llvm::Type::getInt32Ty(C&: CGM.getLLVMContext()), |
| 144 | V: static_cast<uint32_t>(r.second.getFixedValue())); |
| 145 | |
| 146 | Vec.push_back(Elt: Size); |
| 147 | } |
| 148 | return RValue::get(V: Builder.CreateCall(Callee: Decl, Args: Vec)); |
| 149 | } |
| 150 | } // namespace |
| 151 | |
| 152 | RValue CodeGenFunction::EmitNVPTXDevicePrintfCallExpr(const CallExpr *E) { |
| 153 | assert(getTarget().getTriple().isNVPTX()); |
| 154 | return EmitDevicePrintfCallExpr( |
| 155 | E, CGF: this, Decl: GetVprintfDeclaration(M&: CGM.getModule()), WithSizeArg: false); |
| 156 | } |
| 157 | |
| 158 | RValue CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E) { |
| 159 | assert(getTarget().getTriple().isAMDGCN() || |
| 160 | (getTarget().getTriple().isSPIRV() && |
| 161 | getTarget().getTriple().getVendor() == llvm::Triple::AMD)); |
| 162 | assert(E->getBuiltinCallee() == Builtin::BIprintf || |
| 163 | E->getBuiltinCallee() == Builtin::BI__builtin_printf); |
| 164 | assert(E->getNumArgs() >= 1); // printf always has at least one arg. |
| 165 | |
| 166 | CallArgList CallArgs; |
| 167 | EmitCallArgs(Args&: CallArgs, |
| 168 | Prototype: E->getDirectCallee()->getType()->getAs<FunctionProtoType>(), |
| 169 | ArgRange: E->arguments(), AC: E->getDirectCallee(), |
| 170 | /* ParamsToSkip = */ 0); |
| 171 | |
| 172 | SmallVector<llvm::Value *, 8> Args; |
| 173 | for (const auto &A : CallArgs) { |
| 174 | // We don't know how to emit non-scalar varargs. |
| 175 | if (!A.getRValue(CGF&: *this).isScalar()) { |
| 176 | CGM.ErrorUnsupported(S: E, Type: "non-scalar arg to printf" ); |
| 177 | return RValue::get(V: llvm::ConstantInt::get(Ty: IntTy, V: -1)); |
| 178 | } |
| 179 | |
| 180 | llvm::Value *Arg = A.getRValue(CGF&: *this).getScalarVal(); |
| 181 | Args.push_back(Elt: Arg); |
| 182 | } |
| 183 | |
| 184 | llvm::IRBuilder<> IRB(Builder.GetInsertBlock(), Builder.GetInsertPoint()); |
| 185 | IRB.SetCurrentDebugLocation(Builder.getCurrentDebugLocation()); |
| 186 | |
| 187 | bool isBuffered = (CGM.getTarget().getTargetOpts().AMDGPUPrintfKindVal == |
| 188 | clang::TargetOptions::AMDGPUPrintfKind::Buffered); |
| 189 | auto Printf = llvm::emitAMDGPUPrintfCall(Builder&: IRB, Args, isBuffered); |
| 190 | Builder.SetInsertPoint(TheBB: IRB.GetInsertBlock(), IP: IRB.GetInsertPoint()); |
| 191 | return RValue::get(V: Printf); |
| 192 | } |
| 193 | |