AMDGPUAtomicOptimizer.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp]

1	//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// This pass optimizes atomic operations by using a single lane of a wavefront
11	/// to perform the atomic operation, thus reducing contention on that memory
12	/// location.
13	/// Atomic optimizer uses following strategies to compute scan and reduced
14	/// values
15	/// 1. DPP -
16	/// This is the most efficient implementation for scan. DPP uses Whole Wave
17	/// Mode (WWM)
18	/// 2. Iterative -
19	// An alternative implementation iterates over all active lanes
20	/// of Wavefront using llvm.cttz and performs scan using readlane & writelane
21	/// intrinsics
22	//===----------------------------------------------------------------------===//
23
24	#include "AMDGPU.h"
25	#include "GCNSubtarget.h"
26	#include "llvm/Analysis/DomTreeUpdater.h"
27	#include "llvm/Analysis/UniformityAnalysis.h"
28	#include "llvm/CodeGen/TargetPassConfig.h"
29	#include "llvm/IR/IRBuilder.h"
30	#include "llvm/IR/InstVisitor.h"
31	#include "llvm/IR/IntrinsicsAMDGPU.h"
32	#include "llvm/InitializePasses.h"
33	#include "llvm/Target/TargetMachine.h"
34	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
35
36	#define DEBUG_TYPE "amdgpu-atomic-optimizer"
37
38	using namespace llvm;
39	using namespace llvm::AMDGPU;
40
41	namespace {
42
43	struct ReplacementInfo {
44	Instruction *I;
45	AtomicRMWInst::BinOp Op;
46	unsigned ValIdx;
47	bool ValDivergent;
48	};
49
50	class AMDGPUAtomicOptimizer : public FunctionPass {
51	public:
52	static char ID;
53	ScanOptions ScanImpl;
54	AMDGPUAtomicOptimizer(ScanOptions ScanImpl)
55	: FunctionPass (ID), ScanImpl(ScanImpl) {}
56
57	bool runOnFunction(Function &F) override;
58
59	void getAnalysisUsage(AnalysisUsage &AU) const override {
60	AU.addPreserved<DominatorTreeWrapperPass>();
61	AU.addRequired<UniformityInfoWrapperPass>();
62	AU.addRequired<TargetPassConfig>();
63	}
64	};
65
66	class AMDGPUAtomicOptimizerImpl
67	: public InstVisitor<AMDGPUAtomicOptimizerImpl> {
68	private:
69	SmallVector<ReplacementInfo, `8`> ToReplace;
70	const UniformityInfo *UA;
71	const DataLayout *DL;
72	DomTreeUpdater &DTU;
73	const GCNSubtarget *ST;
74	bool IsPixelShader;
75	ScanOptions ScanImpl;
76
77	Value buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value V,
78	Value *const Identity) const;
79	Value buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value V,
80	Value *const Identity) const;
81	Value buildShiftRight(IRBuilder<> &B, Value V, Value *const Identity) const;
82
83	std::pair<Value , Value >
84	buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
85	Value *const Identity, Value *V, Instruction &I,
86	BasicBlock ComputeLoop, BasicBlock ComputeEnd) const;
87
88	void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
89	bool ValDivergent) const;
90
91	public:
92	AMDGPUAtomicOptimizerImpl() = delete;
93
94	AMDGPUAtomicOptimizerImpl(const UniformityInfo UA, const* DataLayout *DL,
95	DomTreeUpdater &DTU, const GCNSubtarget *ST,
96	bool IsPixelShader, ScanOptions ScanImpl)
97	: UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
98	ScanImpl(ScanImpl) {}
99
100	bool run(Function &F);
101
102	void visitAtomicRMWInst(AtomicRMWInst &I);
103	void visitIntrinsicInst(IntrinsicInst &I);
104	};
105
106	} // namespace
107
108	char AMDGPUAtomicOptimizer::ID = `0`;
109
110	char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;
111
112	bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
113	if (skipFunction(F)) {
114	return false;
115	}
116
117	const UniformityInfo *UA =
118	&getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
119	const DataLayout *DL = &F.getDataLayout();
120
121	DominatorTreeWrapperPass *const DTW =
122	getAnalysisIfAvailable<DominatorTreeWrapperPass>();
123	DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
124	DomTreeUpdater::UpdateStrategy::Lazy);
125
126	const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
127	const TargetMachine &TM = TPC.getTM<TargetMachine>();
128	const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
129
130	bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
131
132	return AMDGPUAtomicOptimizerImpl (UA, DL, DTU, ST, IsPixelShader, ScanImpl)
133	.run(F);
134	}
135
136	PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
137	FunctionAnalysisManager &AM) {
138
139	const auto *UA = &AM.getResult<UniformityInfoAnalysis>(IR&: F);
140	const DataLayout *DL = &F.getDataLayout();
141
142	DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(IR&: F),
143	DomTreeUpdater::UpdateStrategy::Lazy);
144	const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
145
146	bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
147
148	bool IsChanged =
149	AMDGPUAtomicOptimizerImpl (UA, DL, DTU, ST, IsPixelShader, ScanImpl)
150	.run(F);
151
152	if (!IsChanged) {
153	return PreservedAnalyses::all();
154	}
155
156	PreservedAnalyses PA;
157	PA.preserve<DominatorTreeAnalysis>();
158	return PA;
159	}
160
161	bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
162
163	// Scan option None disables the Pass
164	if (ScanImpl == ScanOptions::None) {
165	return false;
166	}
167
168	visit(F);
169
170	const bool Changed = !ToReplace.empty();
171
172	for (ReplacementInfo &Info : ToReplace) {
173	optimizeAtomic(I&: *Info.I, Op: Info.Op, ValIdx: Info.ValIdx, ValDivergent: Info.ValDivergent);
174	}
175
176	ToReplace.clear();
177
178	return Changed;
179	}
180
181	static bool isLegalCrossLaneType(Type *Ty) {
182	switch (Ty->getTypeID()) {
183	case Type::FloatTyID:
184	case Type::DoubleTyID:
185	return true;
186	case Type::IntegerTyID: {
187	unsigned Size = Ty->getIntegerBitWidth();
188	return (Size == `32` \|\| Size == `64`);
189	}
190	default:
191	return false;
192	}
193	}
194
195	void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
196	// Early exit for unhandled address space atomic instructions.
197	switch (I.getPointerAddressSpace()) {
198	default:
199	return;
200	case AMDGPUAS::GLOBAL_ADDRESS:
201	case AMDGPUAS::LOCAL_ADDRESS:
202	break;
203	}
204
205	AtomicRMWInst::BinOp Op = I.getOperation();
206
207	switch (Op) {
208	default:
209	return;
210	case AtomicRMWInst::Add:
211	case AtomicRMWInst::Sub:
212	case AtomicRMWInst::And:
213	case AtomicRMWInst::Or:
214	case AtomicRMWInst::Xor:
215	case AtomicRMWInst::Max:
216	case AtomicRMWInst::Min:
217	case AtomicRMWInst::UMax:
218	case AtomicRMWInst::UMin:
219	case AtomicRMWInst::FAdd:
220	case AtomicRMWInst::FSub:
221	case AtomicRMWInst::FMax:
222	case AtomicRMWInst::FMin:
223	break;
224	}
225
226	// Only 32 and 64 bit floating point atomic ops are supported.
227	if (AtomicRMWInst::isFPOperation(Op) &&
228	!(I.getType()->isFloatTy() \|\| I.getType()->isDoubleTy())) {
229	return;
230	}
231
232	const unsigned PtrIdx = `0`;
233	const unsigned ValIdx = `1`;
234
235	// If the pointer operand is divergent, then each lane is doing an atomic
236	// operation on a different address, and we cannot optimize that.
237	if (UA->isDivergentUse(U: I.getOperandUse(i: PtrIdx))) {
238	return;
239	}
240
241	bool ValDivergent = UA->isDivergentUse(U: I.getOperandUse(i: ValIdx));
242
243	// If the value operand is divergent, each lane is contributing a different
244	// value to the atomic calculation. We can only optimize divergent values if
245	// we have DPP available on our subtarget (for DPP strategy), and the atomic
246	// operation is 32 or 64 bits.
247	if (ValDivergent) {
248	if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
249	return;
250
251	if (!isLegalCrossLaneType(Ty: I.getType()))
252	return;
253	}
254
255	// If we get here, we can optimize the atomic using a single wavefront-wide
256	// atomic operation to do the calculation for the entire wavefront, so
257	// remember the instruction so we can come back to it.
258	const ReplacementInfo Info = {.I: &I, .Op: Op, .ValIdx: ValIdx, .ValDivergent: ValDivergent};
259
260	ToReplace.push_back(Elt: Info);
261	}
262
263	void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
264	AtomicRMWInst::BinOp Op;
265
266	switch (I.getIntrinsicID()) {
267	default:
268	return;
269	case Intrinsic::amdgcn_struct_buffer_atomic_add:
270	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
271	case Intrinsic::amdgcn_raw_buffer_atomic_add:
272	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
273	Op = AtomicRMWInst::Add;
274	break;
275	case Intrinsic::amdgcn_struct_buffer_atomic_sub:
276	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
277	case Intrinsic::amdgcn_raw_buffer_atomic_sub:
278	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
279	Op = AtomicRMWInst::Sub;
280	break;
281	case Intrinsic::amdgcn_struct_buffer_atomic_and:
282	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
283	case Intrinsic::amdgcn_raw_buffer_atomic_and:
284	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
285	Op = AtomicRMWInst::And;
286	break;
287	case Intrinsic::amdgcn_struct_buffer_atomic_or:
288	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
289	case Intrinsic::amdgcn_raw_buffer_atomic_or:
290	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
291	Op = AtomicRMWInst::Or;
292	break;
293	case Intrinsic::amdgcn_struct_buffer_atomic_xor:
294	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
295	case Intrinsic::amdgcn_raw_buffer_atomic_xor:
296	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
297	Op = AtomicRMWInst::Xor;
298	break;
299	case Intrinsic::amdgcn_struct_buffer_atomic_smin:
300	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
301	case Intrinsic::amdgcn_raw_buffer_atomic_smin:
302	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
303	Op = AtomicRMWInst::Min;
304	break;
305	case Intrinsic::amdgcn_struct_buffer_atomic_umin:
306	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
307	case Intrinsic::amdgcn_raw_buffer_atomic_umin:
308	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
309	Op = AtomicRMWInst::UMin;
310	break;
311	case Intrinsic::amdgcn_struct_buffer_atomic_smax:
312	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
313	case Intrinsic::amdgcn_raw_buffer_atomic_smax:
314	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
315	Op = AtomicRMWInst::Max;
316	break;
317	case Intrinsic::amdgcn_struct_buffer_atomic_umax:
318	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
319	case Intrinsic::amdgcn_raw_buffer_atomic_umax:
320	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
321	Op = AtomicRMWInst::UMax;
322	break;
323	}
324
325	const unsigned ValIdx = `0`;
326
327	const bool ValDivergent = UA->isDivergentUse(U: I.getOperandUse(i: ValIdx));
328
329	// If the value operand is divergent, each lane is contributing a different
330	// value to the atomic calculation. We can only optimize divergent values if
331	// we have DPP available on our subtarget (for DPP strategy), and the atomic
332	// operation is 32 or 64 bits.
333	if (ValDivergent) {
334	if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
335	return;
336
337	if (!isLegalCrossLaneType(Ty: I.getType()))
338	return;
339	}
340
341	// If any of the other arguments to the intrinsic are divergent, we can't
342	// optimize the operation.
343	for (unsigned Idx = `1`; Idx < I.getNumOperands(); Idx++) {
344	if (UA->isDivergentUse(U: I.getOperandUse(i: Idx))) {
345	return;
346	}
347	}
348
349	// If we get here, we can optimize the atomic using a single wavefront-wide
350	// atomic operation to do the calculation for the entire wavefront, so
351	// remember the instruction so we can come back to it.
352	const ReplacementInfo Info = {.I: &I, .Op: Op, .ValIdx: ValIdx, .ValDivergent: ValDivergent};
353
354	ToReplace.push_back(Elt: Info);
355	}
356
357	// Use the builder to create the non-atomic counterpart of the specified
358	// atomicrmw binary op.
359	static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
360	Value LHS, Value RHS) {
361	CmpInst::Predicate Pred;
362
363	switch (Op) {
364	default:
365	llvm_unreachable("Unhandled atomic op");
366	case AtomicRMWInst::Add:
367	return B.CreateBinOp(Opc: Instruction::Add, LHS, RHS);
368	case AtomicRMWInst::FAdd:
369	return B.CreateFAdd(L: LHS, R: RHS);
370	case AtomicRMWInst::Sub:
371	return B.CreateBinOp(Opc: Instruction::Sub, LHS, RHS);
372	case AtomicRMWInst::FSub:
373	return B.CreateFSub(L: LHS, R: RHS);
374	case AtomicRMWInst::And:
375	return B.CreateBinOp(Opc: Instruction::And, LHS, RHS);
376	case AtomicRMWInst::Or:
377	return B.CreateBinOp(Opc: Instruction::Or, LHS, RHS);
378	case AtomicRMWInst::Xor:
379	return B.CreateBinOp(Opc: Instruction::Xor, LHS, RHS);
380
381	case AtomicRMWInst::Max:
382	Pred = CmpInst::ICMP_SGT;
383	break;
384	case AtomicRMWInst::Min:
385	Pred = CmpInst::ICMP_SLT;
386	break;
387	case AtomicRMWInst::UMax:
388	Pred = CmpInst::ICMP_UGT;
389	break;
390	case AtomicRMWInst::UMin:
391	Pred = CmpInst::ICMP_ULT;
392	break;
393	case AtomicRMWInst::FMax:
394	return B.CreateMaxNum(LHS, RHS);
395	case AtomicRMWInst::FMin:
396	return B.CreateMinNum(LHS, RHS);
397	}
398	Value *Cond = B.CreateICmp(P: Pred, LHS, RHS);
399	return B.CreateSelect(C: Cond, True: LHS, False: RHS);
400	}
401
402	// Use the builder to create a reduction of V across the wavefront, with all
403	// lanes active, returning the same result in all lanes.
404	Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
405	AtomicRMWInst::BinOp Op,
406	Value *V,
407	Value *const Identity) const {
408	Type *AtomicTy = V->getType();
409	Module *M = B.GetInsertBlock()->getModule();
410	Function *UpdateDPP =
411	Intrinsic::getDeclaration(M, id: Intrinsic::amdgcn_update_dpp, Tys: AtomicTy);
412
413	// Reduce within each row of 16 lanes.
414	for (unsigned Idx = `0`; Idx < `4`; Idx++) {
415	V = buildNonAtomicBinOp(
416	B, Op, LHS: V,
417	RHS: B.CreateCall(Callee: UpdateDPP,
418	Args: {Identity, V, B.getInt32(C: DPP::ROW_XMASK0 \| `1` << Idx),
419	B.getInt32(C: `0xf`), B.getInt32(C: `0xf`), B.getFalse()}));
420	}
421
422	// Reduce within each pair of rows (i.e. 32 lanes).
423	assert(ST->hasPermLaneX16());
424	Value *Permlanex16Call = B.CreateIntrinsic(
425	RetTy: V->getType(), ID: Intrinsic::amdgcn_permlanex16,
426	Args: {V, V, B.getInt32(C: -`1`), B.getInt32(C: -`1`), B.getFalse(), B.getFalse()});
427	V = buildNonAtomicBinOp(B, Op, LHS: V, RHS: Permlanex16Call);
428	if (ST->isWave32()) {
429	return V;
430	}
431
432	if (ST->hasPermLane64()) {
433	// Reduce across the upper and lower 32 lanes.
434	Value *Permlane64Call =
435	B.CreateIntrinsic(RetTy: V->getType(), ID: Intrinsic::amdgcn_permlane64, Args: V);
436	return buildNonAtomicBinOp(B, Op, LHS: V, RHS: Permlane64Call);
437	}
438
439	// Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
440	// combine them with a scalar operation.
441	Function *ReadLane =
442	Intrinsic::getDeclaration(M, id: Intrinsic::amdgcn_readlane, Tys: AtomicTy);
443	Value *Lane0 = B.CreateCall(Callee: ReadLane, Args: {V, B.getInt32(C: `0`)});
444	Value *Lane32 = B.CreateCall(Callee: ReadLane, Args: {V, B.getInt32(C: `32`)});
445	return buildNonAtomicBinOp(B, Op, LHS: Lane0, RHS: Lane32);
446	}
447
448	// Use the builder to create an inclusive scan of V across the wavefront, with
449	// all lanes active.
450	Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
451	AtomicRMWInst::BinOp Op, Value *V,
452	Value Identity) const* {
453	Type *AtomicTy = V->getType();
454	Module *M = B.GetInsertBlock()->getModule();
455	Function *UpdateDPP =
456	Intrinsic::getDeclaration(M, id: Intrinsic::amdgcn_update_dpp, Tys: AtomicTy);
457
458	for (unsigned Idx = `0`; Idx < `4`; Idx++) {
459	V = buildNonAtomicBinOp(
460	B, Op, LHS: V,
461	RHS: B.CreateCall(Callee: UpdateDPP,
462	Args: {Identity, V, B.getInt32(C: DPP::ROW_SHR0 \| `1` << Idx),
463	B.getInt32(C: `0xf`), B.getInt32(C: `0xf`), B.getFalse()}));
464	}
465	if (ST->hasDPPBroadcasts()) {
466	// GFX9 has DPP row broadcast operations.
467	V = buildNonAtomicBinOp(
468	B, Op, LHS: V,
469	RHS: B.CreateCall(Callee: UpdateDPP,
470	Args: {Identity, V, B.getInt32(C: DPP::BCAST15), B.getInt32(C: `0xa`),
471	B.getInt32(C: `0xf`), B.getFalse()}));
472	V = buildNonAtomicBinOp(
473	B, Op, LHS: V,
474	RHS: B.CreateCall(Callee: UpdateDPP,
475	Args: {Identity, V, B.getInt32(C: DPP::BCAST31), B.getInt32(C: `0xc`),
476	B.getInt32(C: `0xf`), B.getFalse()}));
477	} else {
478	// On GFX10 all DPP operations are confined to a single row. To get cross-
479	// row operations we have to use permlane or readlane.
480
481	// Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
482	// 48..63).
483	assert(ST->hasPermLaneX16());
484	Value *PermX = B.CreateIntrinsic(
485	RetTy: V->getType(), ID: Intrinsic::amdgcn_permlanex16,
486	Args: {V, V, B.getInt32(C: -`1`), B.getInt32(C: -`1`), B.getFalse(), B.getFalse()});
487
488	Value *UpdateDPPCall = B.CreateCall(
489	Callee: UpdateDPP, Args: {Identity, PermX, B.getInt32(C: DPP::QUAD_PERM_ID),
490	B.getInt32(C: `0xa`), B.getInt32(C: `0xf`), B.getFalse()});
491	V = buildNonAtomicBinOp(B, Op, LHS: V, RHS: UpdateDPPCall);
492
493	if (!ST->isWave32()) {
494	// Combine lane 31 into lanes 32..63.
495	Value *const Lane31 = B.CreateIntrinsic(
496	RetTy: V->getType(), ID: Intrinsic::amdgcn_readlane, Args: {V, B.getInt32(C: `31`)});
497
498	Value *UpdateDPPCall = B.CreateCall(
499	Callee: UpdateDPP, Args: {Identity, Lane31, B.getInt32(C: DPP::QUAD_PERM_ID),
500	B.getInt32(C: `0xc`), B.getInt32(C: `0xf`), B.getFalse()});
501
502	V = buildNonAtomicBinOp(B, Op, LHS: V, RHS: UpdateDPPCall);
503	}
504	}
505	return V;
506	}
507
508	// Use the builder to create a shift right of V across the wavefront, with all
509	// lanes active, to turn an inclusive scan into an exclusive scan.
510	Value AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value V,
511	Value Identity) const* {
512	Type *AtomicTy = V->getType();
513	Module *M = B.GetInsertBlock()->getModule();
514	Function *UpdateDPP =
515	Intrinsic::getDeclaration(M, id: Intrinsic::amdgcn_update_dpp, Tys: AtomicTy);
516	if (ST->hasDPPWavefrontShifts()) {
517	// GFX9 has DPP wavefront shift operations.
518	V = B.CreateCall(Callee: UpdateDPP,
519	Args: {Identity, V, B.getInt32(C: DPP::WAVE_SHR1), B.getInt32(C: `0xf`),
520	B.getInt32(C: `0xf`), B.getFalse()});
521	} else {
522	Function *ReadLane =
523	Intrinsic::getDeclaration(M, id: Intrinsic::amdgcn_readlane, Tys: AtomicTy);
524	Function *WriteLane =
525	Intrinsic::getDeclaration(M, id: Intrinsic::amdgcn_writelane, Tys: AtomicTy);
526
527	// On GFX10 all DPP operations are confined to a single row. To get cross-
528	// row operations we have to use permlane or readlane.
529	Value *Old = V;
530	V = B.CreateCall(Callee: UpdateDPP,
531	Args: {Identity, V, B.getInt32(C: DPP::ROW_SHR0 + `1`),
532	B.getInt32(C: `0xf`), B.getInt32(C: `0xf`), B.getFalse()});
533
534	// Copy the old lane 15 to the new lane 16.
535	V = B.CreateCall(Callee: WriteLane, Args: {B.CreateCall(Callee: ReadLane, Args: {Old, B.getInt32(C: `15`)}),
536	B.getInt32(C: `16`), V});
537
538	if (!ST->isWave32()) {
539	// Copy the old lane 31 to the new lane 32.
540	V = B.CreateCall(
541	Callee: WriteLane,
542	Args: {B.CreateCall(Callee: ReadLane, Args: {Old, B.getInt32(C: `31`)}), B.getInt32(C: `32`), V});
543
544	// Copy the old lane 47 to the new lane 48.
545	V = B.CreateCall(
546	Callee: WriteLane,
547	Args: {B.CreateCall(Callee: ReadLane, Args: {Old, B.getInt32(C: `47`)}), B.getInt32(C: `48`), V});
548	}
549	}
550
551	return V;
552	}
553
554	// Use the builder to create an exclusive scan and compute the final reduced
555	// value using an iterative approach. This provides an alternative
556	// implementation to DPP which uses WMM for scan computations. This API iterate
557	// over active lanes to read, compute and update the value using
558	// readlane and writelane intrinsics.
559	std::pair<Value , Value > AMDGPUAtomicOptimizerImpl::buildScanIteratively(
560	IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
561	Instruction &I, BasicBlock ComputeLoop, BasicBlock ComputeEnd) const {
562	auto *Ty = I.getType();
563	auto *WaveTy = B.getIntNTy(N: ST->getWavefrontSize());
564	auto *EntryBB = I.getParent();
565	auto NeedResult = !I.use_empty();
566
567	auto *Ballot =
568	B.CreateIntrinsic(ID: Intrinsic::amdgcn_ballot, Types: WaveTy, Args: B.getTrue());
569
570	// Start inserting instructions for ComputeLoop block
571	B.SetInsertPoint(ComputeLoop);
572	// Phi nodes for Accumulator, Scan results destination, and Active Lanes
573	auto *Accumulator = B.CreatePHI(Ty, NumReservedValues: `2`, Name: "Accumulator");
574	Accumulator->addIncoming(V: Identity, BB: EntryBB);
575	PHINode OldValuePhi = nullptr*;
576	if (NeedResult) {
577	OldValuePhi = B.CreatePHI(Ty, NumReservedValues: `2`, Name: "OldValuePhi");
578	OldValuePhi->addIncoming(V: PoisonValue::get(T: Ty), BB: EntryBB);
579	}
580	auto *ActiveBits = B.CreatePHI(Ty: WaveTy, NumReservedValues: `2`, Name: "ActiveBits");
581	ActiveBits->addIncoming(V: Ballot, BB: EntryBB);
582
583	// Use llvm.cttz instrinsic to find the lowest remaining active lane.
584	auto *FF1 =
585	B.CreateIntrinsic(ID: Intrinsic::cttz, Types: WaveTy, Args: {ActiveBits, B.getTrue()});
586
587	auto *LaneIdxInt = B.CreateTrunc(V: FF1, DestTy: B.getInt32Ty());
588
589	// Get the value required for atomic operation
590	Value *LaneValue = B.CreateIntrinsic(RetTy: V->getType(), ID: Intrinsic::amdgcn_readlane,
591	Args: {V, LaneIdxInt});
592
593	// Perform writelane if intermediate scan results are required later in the
594	// kernel computations
595	Value OldValue = nullptr*;
596	if (NeedResult) {
597	OldValue = B.CreateIntrinsic(RetTy: V->getType(), ID: Intrinsic::amdgcn_writelane,
598	Args: {Accumulator, LaneIdxInt, OldValuePhi});
599	OldValuePhi->addIncoming(V: OldValue, BB: ComputeLoop);
600	}
601
602	// Accumulate the results
603	auto *NewAccumulator = buildNonAtomicBinOp(B, Op, LHS: Accumulator, RHS: LaneValue);
604	Accumulator->addIncoming(V: NewAccumulator, BB: ComputeLoop);
605
606	// Set bit to zero of current active lane so that for next iteration llvm.cttz
607	// return the next active lane
608	auto *Mask = B.CreateShl(LHS: ConstantInt::get(Ty: WaveTy, V: `1`), RHS: FF1);
609
610	auto *InverseMask = B.CreateXor(LHS: Mask, RHS: ConstantInt::get(Ty: WaveTy, V: -`1`));
611	auto *NewActiveBits = B.CreateAnd(LHS: ActiveBits, RHS: InverseMask);
612	ActiveBits->addIncoming(V: NewActiveBits, BB: ComputeLoop);
613
614	// Branch out of the loop when all lanes are processed.
615	auto *IsEnd = B.CreateICmpEQ(LHS: NewActiveBits, RHS: ConstantInt::get(Ty: WaveTy, V: `0`));
616	B.CreateCondBr(Cond: IsEnd, True: ComputeEnd, False: ComputeLoop);
617
618	B.SetInsertPoint(ComputeEnd);
619
620	return {OldValue, NewAccumulator};
621	}
622
623	static Constant getIdentityValueForAtomicOp(Type const Ty,
624	AtomicRMWInst::BinOp Op) {
625	LLVMContext &C = Ty->getContext();
626	const unsigned BitWidth = Ty->getPrimitiveSizeInBits();
627	switch (Op) {
628	default:
629	llvm_unreachable("Unhandled atomic op");
630	case AtomicRMWInst::Add:
631	case AtomicRMWInst::Sub:
632	case AtomicRMWInst::Or:
633	case AtomicRMWInst::Xor:
634	case AtomicRMWInst::UMax:
635	return ConstantInt::get(Context&: C, V: APInt::getMinValue(numBits: BitWidth));
636	case AtomicRMWInst::And:
637	case AtomicRMWInst::UMin:
638	return ConstantInt::get(Context&: C, V: APInt::getMaxValue(numBits: BitWidth));
639	case AtomicRMWInst::Max:
640	return ConstantInt::get(Context&: C, V: APInt::getSignedMinValue(numBits: BitWidth));
641	case AtomicRMWInst::Min:
642	return ConstantInt::get(Context&: C, V: APInt::getSignedMaxValue(numBits: BitWidth));
643	case AtomicRMWInst::FAdd:
644	return ConstantFP::get(Context&: C, V: APFloat::getZero(Sem: Ty->getFltSemantics(), Negative: true));
645	case AtomicRMWInst::FSub:
646	return ConstantFP::get(Context&: C, V: APFloat::getZero(Sem: Ty->getFltSemantics(), Negative: false));
647	case AtomicRMWInst::FMin:
648	case AtomicRMWInst::FMax:
649	// FIXME: atomicrmw fmax/fmin behave like llvm.maxnum/minnum so NaN is the
650	// closest thing they have to an identity, but it still does not preserve
651	// the difference between quiet and signaling NaNs or NaNs with different
652	// payloads.
653	return ConstantFP::get(Context&: C, V: APFloat::getNaN(Sem: Ty->getFltSemantics()));
654	}
655	}
656
657	static Value buildMul(IRBuilder<> &B, Value LHS, Value *RHS) {
658	const ConstantInt *CI = dyn_cast<ConstantInt>(Val: LHS);
659	return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
660	}
661
662	void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
663	AtomicRMWInst::BinOp Op,
664	unsigned ValIdx,
665	bool ValDivergent) const {
666	// Start building just before the instruction.
667	IRBuilder<> B(&I);
668
669	if (AtomicRMWInst::isFPOperation(Op)) {
670	B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Kind: Attribute::StrictFP));
671	}
672
673	// If we are in a pixel shader, because of how we have to mask out helper
674	// lane invocations, we need to record the entry and exit BB's.
675	BasicBlock PixelEntryBB = nullptr*;
676	BasicBlock PixelExitBB = nullptr*;
677
678	// If we're optimizing an atomic within a pixel shader, we need to wrap the
679	// entire atomic operation in a helper-lane check. We do not want any helper
680	// lanes that are around only for the purposes of derivatives to take part
681	// in any cross-lane communication, and we use a branch on whether the lane is
682	// live to do this.
683	if (IsPixelShader) {
684	// Record I's original position as the entry block.
685	PixelEntryBB = I.getParent();
686
687	Value *const Cond = B.CreateIntrinsic(ID: Intrinsic::amdgcn_ps_live, Types: {}, Args: {});
688	Instruction *const NonHelperTerminator =
689	SplitBlockAndInsertIfThen(Cond, SplitBefore: &I, Unreachable: false, BranchWeights: nullptr, DTU: &DTU, LI: nullptr);
690
691	// Record I's new position as the exit block.
692	PixelExitBB = I.getParent();
693
694	I.moveBefore(MovePos: NonHelperTerminator);
695	B.SetInsertPoint(&I);
696	}
697
698	Type *const Ty = I.getType();
699	Type *Int32Ty = B.getInt32Ty();
700	bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
701	[[maybe_unused]] const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
702
703	// This is the value in the atomic operation we need to combine in order to
704	// reduce the number of atomic operations.
705	Value *V = I.getOperand(i: ValIdx);
706
707	// We need to know how many lanes are active within the wavefront, and we do
708	// this by doing a ballot of active lanes.
709	Type *const WaveTy = B.getIntNTy(N: ST->getWavefrontSize());
710	CallInst *const Ballot =
711	B.CreateIntrinsic(ID: Intrinsic::amdgcn_ballot, Types: WaveTy, Args: B.getTrue());
712
713	// We need to know how many lanes are active within the wavefront that are
714	// below us. If we counted each lane linearly starting from 0, a lane is
715	// below us only if its associated index was less than ours. We do this by
716	// using the mbcnt intrinsic.
717	Value *Mbcnt;
718	if (ST->isWave32()) {
719	Mbcnt = B.CreateIntrinsic(ID: Intrinsic::amdgcn_mbcnt_lo, Types: {},
720	Args: {Ballot, B.getInt32(C: `0`)});
721	} else {
722	Value *const ExtractLo = B.CreateTrunc(V: Ballot, DestTy: Int32Ty);
723	Value *const ExtractHi = B.CreateTrunc(V: B.CreateLShr(LHS: Ballot, RHS: `32`), DestTy: Int32Ty);
724	Mbcnt = B.CreateIntrinsic(ID: Intrinsic::amdgcn_mbcnt_lo, Types: {},
725	Args: {ExtractLo, B.getInt32(C: `0`)});
726	Mbcnt =
727	B.CreateIntrinsic(ID: Intrinsic::amdgcn_mbcnt_hi, Types: {}, Args: {ExtractHi, Mbcnt});
728	}
729
730	Function *F = I.getFunction();
731	LLVMContext &C = F->getContext();
732
733	// For atomic sub, perform scan with add operation and allow one lane to
734	// subtract the reduced value later.
735	AtomicRMWInst::BinOp ScanOp = Op;
736	if (Op == AtomicRMWInst::Sub) {
737	ScanOp = AtomicRMWInst::Add;
738	} else if (Op == AtomicRMWInst::FSub) {
739	ScanOp = AtomicRMWInst::FAdd;
740	}
741	Value *Identity = getIdentityValueForAtomicOp(Ty, Op: ScanOp);
742
743	Value ExclScan = nullptr*;
744	Value NewV = nullptr*;
745
746	const bool NeedResult = !I.use_empty();
747
748	BasicBlock ComputeLoop = nullptr*;
749	BasicBlock ComputeEnd = nullptr*;
750	// If we have a divergent value in each lane, we need to combine the value
751	// using DPP.
752	if (ValDivergent) {
753	if (ScanImpl == ScanOptions::DPP) {
754	// First we need to set all inactive invocations to the identity value, so
755	// that they can correctly contribute to the final result.
756	NewV =
757	B.CreateIntrinsic(ID: Intrinsic::amdgcn_set_inactive, Types: Ty, Args: {V, Identity});
758	if (!NeedResult && ST->hasPermLaneX16()) {
759	// On GFX10 the permlanex16 instruction helps us build a reduction
760	// without too many readlanes and writelanes, which are generally bad
761	// for performance.
762	NewV = buildReduction(B, Op: ScanOp, V: NewV, Identity);
763	} else {
764	NewV = buildScan(B, Op: ScanOp, V: NewV, Identity);
765	if (NeedResult)
766	ExclScan = buildShiftRight(B, V: NewV, Identity);
767	// Read the value from the last lane, which has accumulated the values
768	// of each active lane in the wavefront. This will be our new value
769	// which we will provide to the atomic operation.
770	Value *const LastLaneIdx = B.getInt32(C: ST->getWavefrontSize() - `1`);
771	NewV = B.CreateIntrinsic(RetTy: Ty, ID: Intrinsic::amdgcn_readlane,
772	Args: {NewV, LastLaneIdx});
773	}
774	// Finally mark the readlanes in the WWM section.
775	NewV = B.CreateIntrinsic(ID: Intrinsic::amdgcn_strict_wwm, Types: Ty, Args: NewV);
776	} else if (ScanImpl == ScanOptions::Iterative) {
777	// Alternative implementation for scan
778	ComputeLoop = BasicBlock::Create(Context&: C, Name: "ComputeLoop", Parent: F);
779	ComputeEnd = BasicBlock::Create(Context&: C, Name: "ComputeEnd", Parent: F);
780	std::tie(args&: ExclScan, args&: NewV) = buildScanIteratively(B, Op: ScanOp, Identity, V, I,
781	ComputeLoop, ComputeEnd);
782	} else {
783	llvm_unreachable("Atomic Optimzer is disabled for None strategy");
784	}
785	} else {
786	switch (Op) {
787	default:
788	llvm_unreachable("Unhandled atomic op");
789
790	case AtomicRMWInst::Add:
791	case AtomicRMWInst::Sub: {
792	// The new value we will be contributing to the atomic operation is the
793	// old value times the number of active lanes.
794	Value *const Ctpop = B.CreateIntCast(
795	V: B.CreateUnaryIntrinsic(ID: Intrinsic::ctpop, V: Ballot), DestTy: Ty, isSigned: false);
796	NewV = buildMul(B, LHS: V, RHS: Ctpop);
797	break;
798	}
799	case AtomicRMWInst::FAdd:
800	case AtomicRMWInst::FSub: {
801	Value *const Ctpop = B.CreateIntCast(
802	V: B.CreateUnaryIntrinsic(ID: Intrinsic::ctpop, V: Ballot), DestTy: Int32Ty, isSigned: false);
803	Value *const CtpopFP = B.CreateUIToFP(V: Ctpop, DestTy: Ty);
804	NewV = B.CreateFMul(L: V, R: CtpopFP);
805	break;
806	}
807	case AtomicRMWInst::And:
808	case AtomicRMWInst::Or:
809	case AtomicRMWInst::Max:
810	case AtomicRMWInst::Min:
811	case AtomicRMWInst::UMax:
812	case AtomicRMWInst::UMin:
813	case AtomicRMWInst::FMin:
814	case AtomicRMWInst::FMax:
815	// These operations with a uniform value are idempotent: doing the atomic
816	// operation multiple times has the same effect as doing it once.
817	NewV = V;
818	break;
819
820	case AtomicRMWInst::Xor:
821	// The new value we will be contributing to the atomic operation is the
822	// old value times the parity of the number of active lanes.
823	Value *const Ctpop = B.CreateIntCast(
824	V: B.CreateUnaryIntrinsic(ID: Intrinsic::ctpop, V: Ballot), DestTy: Ty, isSigned: false);
825	NewV = buildMul(B, LHS: V, RHS: B.CreateAnd(LHS: Ctpop, RHS: `1`));
826	break;
827	}
828	}
829
830	// We only want a single lane to enter our new control flow, and we do this
831	// by checking if there are any active lanes below us. Only one lane will
832	// have 0 active lanes below us, so that will be the only one to progress.
833	Value *const Cond = B.CreateICmpEQ(LHS: Mbcnt, RHS: B.getInt32(C: `0`));
834
835	// Store I's original basic block before we split the block.
836	BasicBlock *const OriginalBB = I.getParent();
837
838	// We need to introduce some new control flow to force a single lane to be
839	// active. We do this by splitting I's basic block at I, and introducing the
840	// new block such that:
841	// entry --> single_lane -\
842	// \------------------> exit
843	Instruction *const SingleLaneTerminator =
844	SplitBlockAndInsertIfThen(Cond, SplitBefore: &I, Unreachable: false, BranchWeights: nullptr, DTU: &DTU, LI: nullptr);
845
846	// At this point, we have split the I's block to allow one lane in wavefront
847	// to update the precomputed reduced value. Also, completed the codegen for
848	// new control flow i.e. iterative loop which perform reduction and scan using
849	// ComputeLoop and ComputeEnd.
850	// For the new control flow, we need to move branch instruction i.e.
851	// terminator created during SplitBlockAndInsertIfThen from I's block to
852	// ComputeEnd block. We also need to set up predecessor to next block when
853	// single lane done updating the final reduced value.
854	BasicBlock Predecessor = nullptr*;
855	if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
856	// Move terminator from I's block to ComputeEnd block.
857	//
858	// OriginalBB is known to have a branch as terminator because
859	// SplitBlockAndInsertIfThen will have inserted one.
860	BranchInst *Terminator = cast<BranchInst>(Val: OriginalBB->getTerminator());
861	B.SetInsertPoint(ComputeEnd);
862	Terminator->removeFromParent();
863	B.Insert(I: Terminator);
864
865	// Branch to ComputeLoop Block unconditionally from the I's block for
866	// iterative approach.
867	B.SetInsertPoint(OriginalBB);
868	B.CreateBr(Dest: ComputeLoop);
869
870	// Update the dominator tree for new control flow.
871	SmallVector<DominatorTree::UpdateType, `6`> DomTreeUpdates(
872	{{DominatorTree::Insert, OriginalBB, ComputeLoop},
873	{DominatorTree::Insert, ComputeLoop, ComputeEnd}});
874
875	// We're moving the terminator from EntryBB to ComputeEnd, make sure we move
876	// the DT edges as well.
877	for (auto *Succ : Terminator->successors()) {
878	DomTreeUpdates.push_back(Elt: {DominatorTree::Insert, ComputeEnd, Succ});
879	DomTreeUpdates.push_back(Elt: {DominatorTree::Delete, OriginalBB, Succ});
880	}
881
882	DTU.applyUpdates(Updates: DomTreeUpdates);
883
884	Predecessor = ComputeEnd;
885	} else {
886	Predecessor = OriginalBB;
887	}
888	// Move the IR builder into single_lane next.
889	B.SetInsertPoint(SingleLaneTerminator);
890
891	// Clone the original atomic operation into single lane, replacing the
892	// original value with our newly created one.
893	Instruction *const NewI = I.clone();
894	B.Insert(I: NewI);
895	NewI->setOperand(i: ValIdx, Val: NewV);
896
897	// Move the IR builder into exit next, and start inserting just before the
898	// original instruction.
899	B.SetInsertPoint(&I);
900
901	if (NeedResult) {
902	// Create a PHI node to get our new atomic result into the exit block.
903	PHINode *const PHI = B.CreatePHI(Ty, NumReservedValues: `2`);
904	PHI->addIncoming(V: PoisonValue::get(T: Ty), BB: Predecessor);
905	PHI->addIncoming(V: NewI, BB: SingleLaneTerminator->getParent());
906
907	// We need to broadcast the value who was the lowest active lane (the first
908	// lane) to all other lanes in the wavefront. We use an intrinsic for this,
909	// but have to handle 64-bit broadcasts with two calls to this intrinsic.
910	Value BroadcastI = nullptr*;
911	BroadcastI = B.CreateIntrinsic(RetTy: Ty, ID: Intrinsic::amdgcn_readfirstlane, Args: PHI);
912
913	// Now that we have the result of our single atomic operation, we need to
914	// get our individual lane's slice into the result. We use the lane offset
915	// we previously calculated combined with the atomic result value we got
916	// from the first lane, to get our lane's index into the atomic result.
917	Value LaneOffset = nullptr*;
918	if (ValDivergent) {
919	if (ScanImpl == ScanOptions::DPP) {
920	LaneOffset =
921	B.CreateIntrinsic(ID: Intrinsic::amdgcn_strict_wwm, Types: Ty, Args: ExclScan);
922	} else if (ScanImpl == ScanOptions::Iterative) {
923	LaneOffset = ExclScan;
924	} else {
925	llvm_unreachable("Atomic Optimzer is disabled for None strategy");
926	}
927	} else {
928	Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(V: Mbcnt, DestTy: Ty)
929	: B.CreateIntCast(V: Mbcnt, DestTy: Ty, isSigned: false);
930	switch (Op) {
931	default:
932	llvm_unreachable("Unhandled atomic op");
933	case AtomicRMWInst::Add:
934	case AtomicRMWInst::Sub:
935	LaneOffset = buildMul(B, LHS: V, RHS: Mbcnt);
936	break;
937	case AtomicRMWInst::And:
938	case AtomicRMWInst::Or:
939	case AtomicRMWInst::Max:
940	case AtomicRMWInst::Min:
941	case AtomicRMWInst::UMax:
942	case AtomicRMWInst::UMin:
943	case AtomicRMWInst::FMin:
944	case AtomicRMWInst::FMax:
945	LaneOffset = B.CreateSelect(C: Cond, True: Identity, False: V);
946	break;
947	case AtomicRMWInst::Xor:
948	LaneOffset = buildMul(B, LHS: V, RHS: B.CreateAnd(LHS: Mbcnt, RHS: `1`));
949	break;
950	case AtomicRMWInst::FAdd:
951	case AtomicRMWInst::FSub: {
952	LaneOffset = B.CreateFMul(L: V, R: Mbcnt);
953	break;
954	}
955	}
956	}
957	Value *Result = buildNonAtomicBinOp(B, Op, LHS: BroadcastI, RHS: LaneOffset);
958	if (isAtomicFloatingPointTy) {
959	// For fadd/fsub the first active lane of LaneOffset should be the
960	// identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated
961	// is V +0.0 which might have the wrong sign or might be nan (if V is*
962	// inf or nan).
963	//
964	// For all floating point ops if the in-memory value was a nan then the
965	// binop we just built might have quieted it or changed its payload.
966	//
967	// Correct all these problems by using BroadcastI as the result in the
968	// first active lane.
969	Result = B.CreateSelect(C: Cond, True: BroadcastI, False: Result);
970	}
971
972	if (IsPixelShader) {
973	// Need a final PHI to reconverge to above the helper lane branch mask.
974	B.SetInsertPoint(TheBB: PixelExitBB, IP: PixelExitBB->getFirstNonPHIIt());
975
976	PHINode *const PHI = B.CreatePHI(Ty, NumReservedValues: `2`);
977	PHI->addIncoming(V: PoisonValue::get(T: Ty), BB: PixelEntryBB);
978	PHI->addIncoming(V: Result, BB: I.getParent());
979	I.replaceAllUsesWith(V: PHI);
980	} else {
981	// Replace the original atomic instruction with the new one.
982	I.replaceAllUsesWith(V: Result);
983	}
984	}
985
986	// And delete the original.
987	I.eraseFromParent();
988	}
989
990	INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
991	"AMDGPU atomic optimizations", false, false)
992	INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
993	INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
994	INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
995	"AMDGPU atomic optimizations", false, false)
996
997	FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) {
998	return new AMDGPUAtomicOptimizer (ScanStrategy);
999	}
1000

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp