ArrayBoundChecker.cpp source code [llvm_projects/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp]

1	//== ArrayBoundChecker.cpp -------------------------------------------------==//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file defines security.ArrayBound, which is a path-sensitive checker
10	// that looks for out of bounds access of memory regions.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "clang/AST/CharUnits.h"
15	#include "clang/AST/ParentMapContext.h"
16	#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
17	#include "clang/StaticAnalyzer/Checkers/Taint.h"
18	#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
19	#include "clang/StaticAnalyzer/Core/Checker.h"
20	#include "clang/StaticAnalyzer/Core/CheckerManager.h"
21	#include "clang/StaticAnalyzer/Core/PathSensitive/APSIntType.h"
22	#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
23	#include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h"
24	#include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
25	#include "llvm/ADT/APSInt.h"
26	#include "llvm/Support/FormatVariadic.h"
27	#include "llvm/Support/raw_ostream.h"
28	#include <optional>
29
30	using namespace clang;
31	using namespace ento;
32	using namespace taint;
33	using llvm::formatv;
34
35	namespace {
36	/// If `E` is an array subscript expression with a base that is "clean" (= not
37	/// modified by pointer arithmetic = the beginning of a memory region), return
38	/// it as a pointer to ArraySubscriptExpr; otherwise return nullptr.
39	/// This helper function is used by two separate heuristics that are only valid
40	/// in these "clean" cases.
41	static const ArraySubscriptExpr *
42	getAsCleanArraySubscriptExpr(const Expr E, const* CheckerContext &C) {
43	const auto *ASE = dyn_cast<ArraySubscriptExpr>(Val: E);
44	if (!ASE)
45	return nullptr;
46
47	const MemRegion *SubscriptBaseReg = C.getSVal(S: ASE->getBase()).getAsRegion();
48	if (!SubscriptBaseReg)
49	return nullptr;
50
51	// The base of the subscript expression is affected by pointer arithmetics,
52	// so we want to report byte offsets instead of indices and we don't want to
53	// activate the "index is unsigned -> cannot be negative" shortcut.
54	if (isa<ElementRegion>(Val: SubscriptBaseReg->StripCasts()))
55	return nullptr;
56
57	return ASE;
58	}
59
60	/// If `E` is a "clean" array subscript expression, return the type of the
61	/// accessed element; otherwise return std::nullopt because that's the best (or
62	/// least bad) option for the diagnostic generation that relies on this.
63	static std::optional<QualType> determineElementType(const Expr *E,
64	const CheckerContext &C) {
65	const auto *ASE = getAsCleanArraySubscriptExpr(E, C);
66	if (!ASE)
67	return std::nullopt;
68
69	return ASE->getType();
70	}
71
72	static std::optional<int64_t>
73	determineElementSize(const std::optional<QualType> T, const CheckerContext &C) {
74	if (!T)
75	return std::nullopt;
76	return C.getASTContext().getTypeSizeInChars(T: *T).getQuantity();
77	}
78
79	class StateUpdateReporter {
80	const MemSpaceRegion *Space;
81	const SubRegion *Reg;
82	const NonLoc ByteOffsetVal;
83	const std::optional<QualType> ElementType;
84	const std::optional<int64_t> ElementSize;
85	bool AssumedNonNegative = false;
86	std::optional<NonLoc> AssumedUpperBound = std::nullopt;
87
88	public:
89	StateUpdateReporter(const SubRegion R, NonLoc ByteOffsVal, const* Expr *E,
90	CheckerContext &C)
91	: Space(R->getMemorySpace(State: C.getState())), Reg(R),
92	ByteOffsetVal (ByteOffsVal), ElementType(determineElementType(E, C)),
93	ElementSize(determineElementSize(T: ElementType, C)) {}
94
95	void recordNonNegativeAssumption() { AssumedNonNegative = true; }
96	void recordUpperBoundAssumption(NonLoc UpperBoundVal) {
97	AssumedUpperBound = UpperBoundVal;
98	}
99
100	bool assumedNonNegative() { return AssumedNonNegative; }
101
102	const NoteTag createNoteTag(CheckerContext &C) const*;
103
104	private:
105	std::string getMessage(PathSensitiveBugReport &BR) const;
106
107	/// Return true if information about the value of `Sym` can put constraints
108	/// on some symbol which is interesting within the bug report `BR`.
109	/// In particular, this returns true when `Sym` is interesting within `BR`;
110	/// but it also returns true if `Sym` is an expression that contains integer
111	/// constants and a single symbolic operand which is interesting (in `BR`).
112	/// We need to use this instead of plain `BR.isInteresting()` because if we
113	/// are analyzing code like
114	/// int array[10];
115	/// int f(int arg) {
116	/// return array[arg] && array[arg + 10];
117	/// }
118	/// then the byte offsets are `arg 4` and `(arg + 10) * 4`, which are not*
119	/// sub-expressions of each other (but `getSimplifiedOffsets` is smart enough
120	/// to detect this out of bounds access).
121	static bool providesInformationAboutInteresting(SymbolRef Sym,
122	PathSensitiveBugReport &BR);
123	static bool providesInformationAboutInteresting(SVal SV,
124	PathSensitiveBugReport &BR) {
125	return providesInformationAboutInteresting(Sym: SV.getAsSymbol(), BR);
126	}
127	};
128
129	struct Messages {
130	std::string Short, Full;
131	};
132
133	// NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt`
134	// instead of `PreStmt` because the current implementation passes the whole
135	// expression to `CheckerContext::getSVal()` which only works after the
136	// symbolic evaluation of the expression. (To turn them into `PreStmt`
137	// callbacks, we'd need to duplicate the logic that evaluates these
138	// expressions.) The `MemberExpr` callback would work as `PreStmt` but it's
139	// defined as `PostStmt` for the sake of consistency with the other callbacks.
140	class ArrayBoundChecker : public Checker<check::PostStmt<ArraySubscriptExpr>,
141	check::PostStmt<UnaryOperator>,
142	check::PostStmt<MemberExpr>> {
143	BugType BT{this, "Out-of-bound access"};
144	BugType TaintBT{this, "Out-of-bound access", categories::TaintedData};
145
146	void performCheck(const Expr E, CheckerContext &C) const*;
147
148	void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs,
149	NonLoc Offset, std::optional<NonLoc> Extent,
150	bool IsTaintBug = false) const;
151
152	static void markPartsInteresting(PathSensitiveBugReport &BR,
153	ProgramStateRef ErrorState, NonLoc Val,
154	bool MarkTaint);
155
156	static bool isFromCtypeMacro(const Expr *E, ASTContext &AC);
157
158	static bool isOffsetObviouslyNonnegative(const Expr *E, CheckerContext &C);
159
160	static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State,
161	NonLoc Offset, NonLoc Limit,
162	CheckerContext &C);
163	static bool isInAddressOf(const Stmt *S, ASTContext &AC);
164
165	public:
166	void checkPostStmt(const ArraySubscriptExpr E, CheckerContext &C) const* {
167	performCheck(E, C);
168	}
169	void checkPostStmt(const UnaryOperator E, CheckerContext &C) const* {
170	if (E->getOpcode() == UO_Deref)
171	performCheck(E, C);
172	}
173	void checkPostStmt(const MemberExpr E, CheckerContext &C) const* {
174	if (E->isArrow())
175	performCheck(E: E->getBase(), C);
176	}
177	};
178
179	} // anonymous namespace
180
181	/// For a given Location that can be represented as a symbolic expression
182	/// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block
183	/// Arr and the distance of Location from the beginning of Arr (expressed in a
184	/// NonLoc that specifies the number of CharUnits). Returns nullopt when these
185	/// cannot be determined.
186	static std::optional<std::pair<const SubRegion *, NonLoc>>
187	computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) {
188	QualType T = SVB.getArrayIndexType();
189	auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) {
190	// We will use this utility to add and multiply values.
191	return SVB.evalBinOpNN(state: State, op: Op, lhs: L, rhs: R, resultTy: T).getAs<NonLoc>();
192	};
193
194	const SubRegion OwnerRegion = nullptr*;
195	std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex();
196
197	const ElementRegion *CurRegion =
198	dyn_cast_or_null<ElementRegion>(Val: Location.getAsRegion());
199
200	while (CurRegion) {
201	const auto Index = CurRegion->getIndex().getAs<NonLoc>();
202	if (!Index)
203	return std::nullopt;
204
205	QualType ElemType = CurRegion->getElementType();
206
207	// FIXME: The following early return was presumably added to safeguard the
208	// getTypeSizeInChars() call (which doesn't accept an incomplete type), but
209	// it seems that `ElemType` cannot be incomplete at this point.
210	if (ElemType ->isIncompleteType())
211	return std::nullopt;
212
213	// Calculate Delta = Index sizeof(ElemType).*
214	NonLoc Size = SVB.makeArrayIndex(
215	idx: SVB.getContext().getTypeSizeInChars(T: ElemType).getQuantity());
216	auto Delta = EvalBinOp (BO_Mul, *Index, Size);
217	if (!Delta)
218	return std::nullopt;
219
220	// Perform Offset += Delta.
221	Offset = EvalBinOp (BO_Add, Offset, Delta);
222	if (!Offset)
223	return std::nullopt;
224
225	OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>();
226	// When this is just another ElementRegion layer, we need to continue the
227	// offset calculations:
228	CurRegion = dyn_cast_or_null<ElementRegion>(Val: OwnerRegion);
229	}
230
231	if (OwnerRegion)
232	return std::make_pair(x&: OwnerRegion, y&: *Offset);
233
234	return std::nullopt;
235	}
236
237	// NOTE: This function is the "heart" of this checker. It simplifies
238	// inequalities with transformations that are valid (and very elementary) in
239	// pure mathematics, but become invalid if we use them in C++ number model
240	// where the calculations may overflow.
241	// Due to the overflow issues I think it's impossible (or at least not
242	// practical) to integrate this kind of simplification into the resolution of
243	// arbitrary inequalities (i.e. the code of `evalBinOp`); but this function
244	// produces valid results when the calculations are handling memory offsets
245	// and every value is well below SIZE_MAX.
246	// TODO: This algorithm should be moved to a central location where it's
247	// available for other checkers that need to compare memory offsets.
248	// NOTE: the simplification preserves the order of the two operands in a
249	// mathematical sense, but it may change the result produced by a C++
250	// comparison operator (and the automatic type conversions).
251	// For example, consider a comparison "X+1 < 0", where the LHS is stored as a
252	// size_t and the RHS is stored in an int. (As size_t is unsigned, this
253	// comparison is false for all values of "X".) However, the simplification may
254	// turn it into "X < -1", which is still always false in a mathematical sense,
255	// but can produce a true result when evaluated by `evalBinOp` (which follows
256	// the rules of C++ and casts -1 to SIZE_MAX).
257	static std::pair<NonLoc, nonloc::ConcreteInt>
258	getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent,
259	SValBuilder &svalBuilder) {
260	const llvm::APSInt &extentVal = extent.getValue();
261	std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>();
262	if (SymVal && SymVal ->isExpression()) {
263	if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(Val: SymVal ->getSymbol())) {
264	llvm::APSInt constant = APSIntType (extentVal).convert(Value: SIE->getRHS());
265	switch (SIE->getOpcode()) {
266	case BO_Mul:
267	// The constant should never be 0 here, becasue multiplication by zero
268	// is simplified by the engine.
269	if ((extentVal % constant) != `0`)
270	return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
271	else
272	return getSimplifiedOffsets(
273	offset: nonloc::SymbolVal (SIE->getLHS()),
274	extent: svalBuilder.makeIntVal(integer: extentVal / constant), svalBuilder);
275	case BO_Add:
276	return getSimplifiedOffsets(
277	offset: nonloc::SymbolVal (SIE->getLHS()),
278	extent: svalBuilder.makeIntVal(integer: extentVal - constant), svalBuilder);
279	default:
280	break;
281	}
282	}
283	}
284
285	return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
286	}
287
288	static bool isNegative(SValBuilder &SVB, ProgramStateRef State, NonLoc Value) {
289	const llvm::APSInt *MaxV = SVB.getMaxValue(state: State, val: Value);
290	return MaxV && MaxV->isNegative();
291	}
292
293	static bool isUnsigned(SValBuilder &SVB, NonLoc Value) {
294	QualType T = Value.getType(SVB.getContext());
295	return T ->isUnsignedIntegerType();
296	}
297
298	// Evaluate the comparison Value < Threshold with the help of the custom
299	// simplification algorithm defined for this checker. Return a pair of states,
300	// where the first one corresponds to "value below threshold" and the second
301	// corresponds to "value at or above threshold". Returns {nullptr, nullptr} in
302	// the case when the evaluation fails.
303	// If the optional argument CheckEquality is true, then use BO_EQ instead of
304	// the default BO_LT after consistently applying the same simplification steps.
305	static std::pair<ProgramStateRef, ProgramStateRef>
306	compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold,
307	SValBuilder &SVB, bool CheckEquality = false) {
308	if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
309	std::tie(args&: Value, args&: Threshold) =
310	getSimplifiedOffsets(offset: Value, extent: *ConcreteThreshold, svalBuilder&: SVB);
311	}
312
313	// We want to perform a _mathematical_ comparison between the numbers `Value`
314	// and `Threshold`; but `evalBinOpNN` evaluates a C/C++ operator that may
315	// perform automatic conversions. For example the number -1 is less than the
316	// number 1000, but -1 < `1000ull` will evaluate to `false` because the `int`
317	// -1 is converted to ULONGLONG_MAX.
318	// To avoid automatic conversions, we evaluate the "obvious" cases without
319	// calling `evalBinOpNN`:
320	if (isNegative(SVB, State, Value) && isUnsigned(SVB, Value: Threshold)) {
321	if (CheckEquality) {
322	// negative_value == unsigned_threshold is always false
323	return {nullptr, State};
324	}
325	// negative_value < unsigned_threshold is always true
326	return {State, nullptr};
327	}
328	if (isUnsigned(SVB, Value) && isNegative(SVB, State, Value: Threshold)) {
329	// unsigned_value == negative_threshold and
330	// unsigned_value < negative_threshold are both always false
331	return {nullptr, State};
332	}
333	// FIXME: These special cases are sufficient for handling real-world
334	// comparisons, but in theory there could be contrived situations where
335	// automatic conversion of a symbolic value (which can be negative and can be
336	// positive) leads to incorrect results.
337	// NOTE: We NEED to use the `evalBinOpNN` call in the "common" case, because
338	// we want to ensure that assumptions coming from this precondition and
339	// assumptions coming from regular C/C++ operator calls are represented by
340	// constraints on the same symbolic expression. A solution that would
341	// evaluate these "mathematical" comparisons through a separate pathway would
342	// be a step backwards in this sense.
343
344	const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT;
345	auto BelowThreshold =
346	SVB.evalBinOpNN(state: State, op: OpKind, lhs: Value, rhs: Threshold, resultTy: SVB.getConditionType())
347	.getAs<NonLoc>();
348
349	if (BelowThreshold)
350	return State ->assume(Cond: *BelowThreshold);
351
352	return {nullptr, nullptr};
353	}
354
355	static std::string getRegionName(const MemSpaceRegion *Space,
356	const SubRegion *Region) {
357	if (std::string RegName = Region->getDescriptiveName(); !RegName.empty())
358	return RegName;
359
360	// Field regions only have descriptive names when their parent has a
361	// descriptive name; so we provide a fallback representation for them:
362	if (const auto *FR = Region->getAs<FieldRegion>()) {
363	if (StringRef Name = FR->getDecl()->getName(); !Name.empty())
364	return formatv(Fmt: "the field '{0}'", Vals&: Name);
365	return "the unnamed field";
366	}
367
368	if (isa<AllocaRegion>(Val: Region))
369	return "the memory returned by 'alloca'";
370
371	if (isa<SymbolicRegion>(Val: Region) && isa<HeapSpaceRegion>(Val: Space))
372	return "the heap area";
373
374	if (isa<StringRegion>(Val: Region))
375	return "the string literal";
376
377	return "the region";
378	}
379
380	static std::optional<int64_t> getConcreteValue(NonLoc SV) {
381	if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) {
382	return ConcreteVal ->getValue()->tryExtValue();
383	}
384	return std::nullopt;
385	}
386
387	static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) {
388	return SV ? getConcreteValue(SV: *SV) : std::nullopt;
389	}
390
391	static Messages getPrecedesMsgs(const MemSpaceRegion *Space,
392	const SubRegion *Region, NonLoc Offset) {
393	std::string RegName = getRegionName(Space, Region), OffsetStr = "";
394
395	if (auto ConcreteOffset = getConcreteValue(SV: Offset))
396	OffsetStr = formatv(Fmt: " {0}", Vals&: ConcreteOffset);
397
398	return {
399	.Short: formatv(Fmt: "Out of bound access to memory preceding {0}", Vals&: RegName),
400	.Full: formatv(Fmt: "Access of {0} at negative byte offset{1}", Vals&: RegName, Vals&: OffsetStr)};
401	}
402
403	/// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if
404	/// it can be performed (`Divisor` is nonzero and there is no remainder). The
405	/// values `Val1` and `Val2` may be nullopt and in that case the corresponding
406	/// division is considered to be successful.
407	static bool tryDividePair(std::optional<int64_t> &Val1,
408	std::optional<int64_t> &Val2, int64_t Divisor) {
409	if (!Divisor)
410	return false;
411	const bool Val1HasRemainder = Val1 && *Val1 % Divisor;
412	const bool Val2HasRemainder = Val2 && *Val2 % Divisor;
413	if (Val1HasRemainder \|\| Val2HasRemainder)
414	return false;
415	if (Val1)
416	*Val1 /= Divisor;
417	if (Val2)
418	*Val2 /= Divisor;
419	return true;
420	}
421
422	static Messages getExceedsMsgs(ASTContext &ACtx, const MemSpaceRegion *Space,
423	const SubRegion *Region, NonLoc Offset,
424	NonLoc Extent, SVal Location,
425	bool AlsoMentionUnderflow) {
426	std::string RegName = getRegionName(Space, Region);
427	const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
428	assert(EReg && "this checker only handles element access");
429	QualType ElemType = EReg->getElementType();
430
431	std::optional<int64_t> OffsetN = getConcreteValue(SV: Offset);
432	std::optional<int64_t> ExtentN = getConcreteValue(SV: Extent);
433
434	int64_t ElemSize = ACtx.getTypeSizeInChars(T: ElemType).getQuantity();
435
436	bool UseByteOffsets = !tryDividePair(Val1&: OffsetN, Val2&: ExtentN, Divisor: ElemSize);
437	const char *OffsetOrIndex = UseByteOffsets ? "byte offset" : "index";
438
439	SmallString<`256`> Buf;
440	llvm::raw_svector_ostream Out(Buf);
441	Out << "Access of ";
442	if (!ExtentN && !UseByteOffsets)
443	Out << "'" << ElemType.getAsString() << "' element in ";
444	Out << RegName << " at ";
445	if (AlsoMentionUnderflow) {
446	Out << "a negative or overflowing " << OffsetOrIndex;
447	} else if (OffsetN) {
448	Out << OffsetOrIndex << " " << *OffsetN;
449	} else {
450	Out << "an overflowing " << OffsetOrIndex;
451	}
452	if (ExtentN) {
453	Out << ", while it holds only ";
454	if (*ExtentN != `1`)
455	Out << *ExtentN;
456	else
457	Out << "a single";
458	if (UseByteOffsets)
459	Out << " byte";
460	else
461	Out << " '" << ElemType.getAsString() << "' element";
462
463	if (*ExtentN > `1`)
464	Out << "s";
465	}
466
467	return {.Short: formatv(Fmt: "Out of bound access to memory {0} {1}",
468	Vals: AlsoMentionUnderflow ? "around" : "after the end of",
469	Vals&: RegName),
470	.Full: std::string(Buf)};
471	}
472
473	static Messages getTaintMsgs(const MemSpaceRegion *Space,
474	const SubRegion Region, const* char *OffsetName,
475	bool AlsoMentionUnderflow) {
476	std::string RegName = getRegionName(Space, Region);
477	return {.Short: formatv(Fmt: "Potential out of bound access to {0} with tainted {1}",
478	Vals&: RegName, Vals&: OffsetName),
479	.Full: formatv(Fmt: "Access of {0} with a tainted {1} that may be {2}too large",
480	Vals&: RegName, Vals&: OffsetName,
481	Vals: AlsoMentionUnderflow ? "negative or " : "")};
482	}
483
484	const NoteTag StateUpdateReporter::createNoteTag(CheckerContext &C) const* {
485	// Don't create a note tag if we didn't assume anything:
486	if (!AssumedNonNegative && !AssumedUpperBound)
487	return nullptr;
488
489	return C.getNoteTag(Cb: [*this](PathSensitiveBugReport &BR) -> std::string {
490	return getMessage(BR);
491	});
492	}
493
494	std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const {
495	bool ShouldReportNonNegative = AssumedNonNegative;
496	if (!providesInformationAboutInteresting(SV: ByteOffsetVal, BR)) {
497	if (AssumedUpperBound &&
498	providesInformationAboutInteresting(SV: *AssumedUpperBound, BR)) {
499	// Even if the byte offset isn't interesting (e.g. it's a constant value),
500	// the assumption can still be interesting if it provides information
501	// about an interesting symbolic upper bound.
502	ShouldReportNonNegative = false;
503	} else {
504	// We don't have anything interesting, don't report the assumption.
505	return "";
506	}
507	}
508
509	std::optional<int64_t> OffsetN = getConcreteValue(SV: ByteOffsetVal);
510	std::optional<int64_t> ExtentN = getConcreteValue(SV: AssumedUpperBound);
511
512	const bool UseIndex =
513	ElementSize && tryDividePair(Val1&: OffsetN, Val2&: ExtentN, Divisor: *ElementSize);
514
515	SmallString<`256`> Buf;
516	llvm::raw_svector_ostream Out(Buf);
517	Out << "Assuming ";
518	if (UseIndex) {
519	Out << "index ";
520	if (OffsetN)
521	Out << "'" << OffsetN << "' ";
522	} else if (AssumedUpperBound) {
523	Out << "byte offset ";
524	if (OffsetN)
525	Out << "'" << OffsetN << "' ";
526	} else {
527	Out << "offset ";
528	}
529
530	Out << "is";
531	if (ShouldReportNonNegative) {
532	Out << " non-negative";
533	}
534	if (AssumedUpperBound) {
535	if (ShouldReportNonNegative)
536	Out << " and";
537	Out << " less than ";
538	if (ExtentN)
539	Out << *ExtentN << ", ";
540	if (UseIndex && ElementType)
541	Out << "the number of '" << ElementType ->getAsString()
542	<< "' elements in ";
543	else
544	Out << "the extent of ";
545	Out << getRegionName(Space, Region: Reg);
546	}
547	return std::string (Out.str());
548	}
549
550	bool StateUpdateReporter::providesInformationAboutInteresting(
551	SymbolRef Sym, PathSensitiveBugReport &BR) {
552	if (!Sym)
553	return false;
554	for (SymbolRef PartSym : Sym->symbols()) {
555	// The interestingess mark may appear on any layer as we're stripping off
556	// the SymIntExpr, UnarySymExpr etc. layers...
557	if (BR.isInteresting(sym: PartSym))
558	return true;
559	// ...but if both sides of the expression are symbolic, then there is no
560	// practical algorithm to produce separate constraints for the two
561	// operands (from the single combined result).
562	if (isa<SymSymExpr>(Val: PartSym))
563	return false;
564	}
565	return false;
566	}
567
568	void ArrayBoundChecker::performCheck(const Expr E, CheckerContext &C) const* {
569	const SVal Location = C.getSVal(S: E);
570
571	// The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
572	// #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX)
573	// and incomplete analysis of these leads to false positives. As even
574	// accurate reports would be confusing for the users, just disable reports
575	// from these macros:
576	if (isFromCtypeMacro(E, AC&: C.getASTContext()))
577	return;
578
579	ProgramStateRef State = C.getState();
580	SValBuilder &SVB = C.getSValBuilder();
581
582	const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset =
583	computeOffset(State, SVB, Location);
584
585	if (!RawOffset)
586	return;
587
588	auto [Reg, ByteOffset] = *RawOffset;
589
590	// The state updates will be reported as a single note tag, which will be
591	// composed by this helper class.
592	StateUpdateReporter SUR(Reg, ByteOffset, E, C);
593
594	// CHECK LOWER BOUND
595	const MemSpaceRegion *Space = Reg->getMemorySpace(State);
596	if (!(isa<SymbolicRegion>(Val: Reg) && isa<UnknownSpaceRegion>(Val: Space))) {
597	// A symbolic region in unknown space represents an unknown pointer that
598	// may point into the middle of an array, so we don't look for underflows.
599	// Both conditions are significant because we want to check underflows in
600	// symbolic regions on the heap (which may be introduced by checkers like
601	// MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and
602	// non-symbolic regions (e.g. a field subregion of a symbolic region) in
603	// unknown space.
604	auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold(
605	State, Value: ByteOffset, Threshold: SVB.makeZeroArrayIndex(), SVB);
606
607	if (PrecedesLowerBound) {
608	// The analyzer thinks that the offset may be invalid (negative)...
609
610	if (isOffsetObviouslyNonnegative(E, C)) {
611	// ...but the offset is obviously non-negative (clear array subscript
612	// with an unsigned index), so we're in a buggy situation.
613
614	// TODO: Currently the analyzer ignores many casts (e.g. signed ->
615	// unsigned casts), so it can easily reach states where it will load a
616	// signed (and negative) value from an unsigned variable. This sanity
617	// check is a duct tape "solution" that silences most of the ugly false
618	// positives that are caused by this buggy behavior. Note that this is
619	// not a complete solution: this cannot silence reports where pointer
620	// arithmetic complicates the picture and cannot ensure modeling of the
621	// "unsigned index is positive with highest bit set" cases which are
622	// "usurped" by the nonsense "unsigned index is negative" case.
623	// For more information about this topic, see the umbrella ticket
624	// https://github.com/llvm/llvm-project/issues/39492
625	// TODO: Remove this hack once 'SymbolCast's are modeled properly.
626
627	if (!WithinLowerBound) {
628	// The state is completely nonsense -- let's just sink it!
629	C.addSink();
630	return;
631	}
632	// Otherwise continue on the 'WithinLowerBound' branch where the
633	// unsigned index _is_ non-negative. Don't mention this assumption as a
634	// note tag, because it would just confuse the users!
635	} else {
636	if (!WithinLowerBound) {
637	// ...and it cannot be valid (>= 0), so report an error.
638	Messages Msgs = getPrecedesMsgs(Space, Region: Reg, Offset: ByteOffset);
639	reportOOB(C, ErrorState: PrecedesLowerBound, Msgs, Offset: ByteOffset, Extent: std::nullopt);
640	return;
641	}
642	// ...but it can be valid as well, so the checker will (optimistically)
643	// assume that it's valid and mention this in the note tag.
644	SUR.recordNonNegativeAssumption();
645	}
646	}
647
648	// Actually update the state. The "if" only fails in the extremely unlikely
649	// case when compareValueToThreshold returns {nullptr, nullptr} because
650	// evalBinOpNN fails to evaluate the less-than operator.
651	if (WithinLowerBound)
652	State = WithinLowerBound;
653	}
654
655	// CHECK UPPER BOUND
656	DefinedOrUnknownSVal Size = getDynamicExtent(State, MR: Reg, SVB);
657	if (auto KnownSize = Size.getAs<NonLoc>()) {
658	// In a situation where both underflow and overflow are possible (but the
659	// index is either tainted or known to be invalid), the logic of this
660	// checker will first assume that the offset is non-negative, and then
661	// (with this additional assumption) it will detect an overflow error.
662	// In this situation the warning message should mention both possibilities.
663	bool AlsoMentionUnderflow = SUR.assumedNonNegative();
664
665	auto [WithinUpperBound, ExceedsUpperBound] =
666	compareValueToThreshold(State, Value: ByteOffset, Threshold: *KnownSize, SVB);
667
668	if (ExceedsUpperBound) {
669	// The offset may be invalid (>= Size)...
670	if (!WithinUpperBound) {
671	// ...and it cannot be within bounds, so report an error, unless we can
672	// definitely determine that this is an idiomatic `&array[size]`
673	// expression that calculates the past-the-end pointer.
674	if (isIdiomaticPastTheEndPtr(E, State: ExceedsUpperBound, Offset: ByteOffset,
675	Limit: *KnownSize, C)) {
676	C.addTransition(State: ExceedsUpperBound, Tag: SUR.createNoteTag(C));
677	return;
678	}
679
680	Messages Msgs =
681	getExceedsMsgs(ACtx&: C.getASTContext(), Space, Region: Reg, Offset: ByteOffset,
682	Extent: *KnownSize, Location, AlsoMentionUnderflow);
683	reportOOB(C, ErrorState: ExceedsUpperBound, Msgs, Offset: ByteOffset, Extent: KnownSize);
684	return;
685	}
686	// ...and it can be valid as well...
687	if (isTainted(State, V: ByteOffset)) {
688	// ...but it's tainted, so report an error.
689
690	// Diagnostic detail: saying "tainted offset" is always correct, but
691	// the common case is that 'idx' is tainted in 'arr[idx]' and then it's
692	// nicer to say "tainted index".
693	const char *OffsetName = "offset";
694	if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(Val: E))
695	if (isTainted(State, S: ASE->getIdx(), LCtx: C.getLocationContext()))
696	OffsetName = "index";
697
698	Messages Msgs =
699	getTaintMsgs(Space, Region: Reg, OffsetName, AlsoMentionUnderflow);
700	reportOOB(C, ErrorState: ExceedsUpperBound, Msgs, Offset: ByteOffset, Extent: KnownSize,
701	/IsTaintBug=/true);
702	return;
703	}
704	// ...and it isn't tainted, so the checker will (optimistically) assume
705	// that the offset is in bounds and mention this in the note tag.
706	SUR.recordUpperBoundAssumption(UpperBoundVal: *KnownSize);
707	}
708
709	// Actually update the state. The "if" only fails in the extremely unlikely
710	// case when compareValueToThreshold returns {nullptr, nullptr} because
711	// evalBinOpNN fails to evaluate the less-than operator.
712	if (WithinUpperBound)
713	State = WithinUpperBound;
714	}
715
716	// Add a transition, reporting the state updates that we accumulated.
717	C.addTransition(State, Tag: SUR.createNoteTag(C));
718	}
719
720	void ArrayBoundChecker::markPartsInteresting(PathSensitiveBugReport &BR,
721	ProgramStateRef ErrorState,
722	NonLoc Val, bool MarkTaint) {
723	if (SymbolRef Sym = Val.getAsSymbol()) {
724	// If the offset is a symbolic value, iterate over its "parts" with
725	// `SymExpr::symbols()` and mark each of them as interesting.
726	// For example, if the offset is `x4 + y` then we put interestingness onto*
727	// the SymSymExpr `x4 + y`, the SymIntExpr `x4` and the two data symbols
728	// `x` and `y`.
729	for (SymbolRef PartSym : Sym->symbols())
730	BR.markInteresting(sym: PartSym);
731	}
732
733	if (MarkTaint) {
734	// If the issue that we're reporting depends on the taintedness of the
735	// offset, then put interestingness onto symbols that could be the origin
736	// of the taint. Note that this may find symbols that did not appear in
737	// `Sym->symbols()` (because they're only loosely connected to `Val`).
738	for (SymbolRef Sym : getTaintedSymbols(State: ErrorState, V: Val))
739	BR.markInteresting(sym: Sym);
740	}
741	}
742
743	void ArrayBoundChecker::reportOOB(CheckerContext &C, ProgramStateRef ErrorState,
744	Messages Msgs, NonLoc Offset,
745	std::optional<NonLoc> Extent,
746	bool IsTaintBug /=false/) const {
747
748	ExplodedNode *ErrorNode = C.generateErrorNode(State: ErrorState);
749	if (!ErrorNode)
750	return;
751
752	auto BR = std::make_unique<PathSensitiveBugReport>(
753	args: IsTaintBug ? TaintBT : BT, args&: Msgs.Short, args&: Msgs.Full, args&: ErrorNode);
754
755	// FIXME: ideally we would just call trackExpressionValue() and that would
756	// "do the right thing": mark the relevant symbols as interesting, track the
757	// control dependencies and statements storing the relevant values and add
758	// helpful diagnostic pieces. However, right now trackExpressionValue() is
759	// a heap of unreliable heuristics, so it would cause several issues:
760	// - Interestingness is not applied consistently, e.g. if `array[x+10]`
761	// causes an overflow, then `x` is not marked as interesting.
762	// - We get irrelevant diagnostic pieces, e.g. in the code
763	// `int p = (int)malloc(2sizeof(int)); p[3] = 0;`*
764	// it places a "Storing uninitialized value" note on the `malloc` call
765	// (which is technically true, but irrelevant).
766	// If trackExpressionValue() becomes reliable, it should be applied instead
767	// of this custom markPartsInteresting().
768	markPartsInteresting(BR&: *BR, ErrorState, Val: Offset, MarkTaint: IsTaintBug);
769	if (Extent)
770	markPartsInteresting(BR&: BR, ErrorState, Val: Extent, MarkTaint: IsTaintBug);
771
772	C.emitReport(R: std::move(BR));
773	}
774
775	bool ArrayBoundChecker::isFromCtypeMacro(const Expr *E, ASTContext &ACtx) {
776	SourceLocation Loc = E->getBeginLoc();
777	if (!Loc.isMacroID())
778	return false;
779
780	StringRef MacroName = Lexer::getImmediateMacroName(
781	Loc, SM: ACtx.getSourceManager(), LangOpts: ACtx.getLangOpts());
782
783	if (MacroName.size() < `7` \|\| MacroName [`0`] != `'i'` \|\| MacroName [`1`] != `'s'`)
784	return false;
785
786	return ((MacroName == "isalnum") \|\| (MacroName == "isalpha") \|\|
787	(MacroName == "isblank") \|\| (MacroName == "isdigit") \|\|
788	(MacroName == "isgraph") \|\| (MacroName == "islower") \|\|
789	(MacroName == "isnctrl") \|\| (MacroName == "isprint") \|\|
790	(MacroName == "ispunct") \|\| (MacroName == "isspace") \|\|
791	(MacroName == "isupper") \|\| (MacroName == "isxdigit"));
792	}
793
794	bool ArrayBoundChecker::isOffsetObviouslyNonnegative(const Expr *E,
795	CheckerContext &C) {
796	const ArraySubscriptExpr *ASE = getAsCleanArraySubscriptExpr(E, C);
797	if (!ASE)
798	return false;
799	return ASE->getIdx()->getType()->isUnsignedIntegerOrEnumerationType();
800	}
801
802	bool ArrayBoundChecker::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
803	ParentMapContext &ParentCtx = ACtx.getParentMapContext();
804	do {
805	const DynTypedNodeList Parents = ParentCtx.getParents(Node: *S);
806	if (Parents.empty())
807	return false;
808	S = Parents [`0`].get<Stmt>();
809	} while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(Val: S));
810	const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(Val: S);
811	return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf;
812	}
813
814	bool ArrayBoundChecker::isIdiomaticPastTheEndPtr(const Expr *E,
815	ProgramStateRef State,
816	NonLoc Offset, NonLoc Limit,
817	CheckerContext &C) {
818	if (isa<ArraySubscriptExpr>(Val: E) && isInAddressOf(S: E, ACtx&: C.getASTContext())) {
819	auto [EqualsToThreshold, NotEqualToThreshold] = compareValueToThreshold(
820	State, Value: Offset, Threshold: Limit, SVB&: C.getSValBuilder(), /CheckEquality=/true);
821	return EqualsToThreshold && !NotEqualToThreshold;
822	}
823	return false;
824	}
825
826	void ento::registerArrayBoundChecker(CheckerManager &mgr) {
827	mgr.registerChecker<ArrayBoundChecker>();
828	}
829
830	bool ento::shouldRegisterArrayBoundChecker(const CheckerManager &mgr) {
831	return true;
832	}
833

Browse the source code of llvm_projects/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp