1//== ArrayBoundChecker.cpp -------------------------------------------------==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines security.ArrayBound, which is a path-sensitive checker
10// that looks for out of bounds access of memory regions.
11//
12//===----------------------------------------------------------------------===//
13
14#include "clang/AST/CharUnits.h"
15#include "clang/AST/ParentMapContext.h"
16#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
17#include "clang/StaticAnalyzer/Checkers/Taint.h"
18#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
19#include "clang/StaticAnalyzer/Core/Checker.h"
20#include "clang/StaticAnalyzer/Core/CheckerManager.h"
21#include "clang/StaticAnalyzer/Core/PathSensitive/APSIntType.h"
22#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
23#include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h"
24#include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
25#include "llvm/ADT/APSInt.h"
26#include "llvm/Support/FormatVariadic.h"
27#include "llvm/Support/raw_ostream.h"
28#include <optional>
29
30using namespace clang;
31using namespace ento;
32using namespace taint;
33using llvm::formatv;
34
35namespace {
36/// If `E` is an array subscript expression with a base that is "clean" (= not
37/// modified by pointer arithmetic = the beginning of a memory region), return
38/// it as a pointer to ArraySubscriptExpr; otherwise return nullptr.
39/// This helper function is used by two separate heuristics that are only valid
40/// in these "clean" cases.
41static const ArraySubscriptExpr *
42getAsCleanArraySubscriptExpr(const Expr *E, const CheckerContext &C) {
43 const auto *ASE = dyn_cast<ArraySubscriptExpr>(Val: E);
44 if (!ASE)
45 return nullptr;
46
47 const MemRegion *SubscriptBaseReg = C.getSVal(S: ASE->getBase()).getAsRegion();
48 if (!SubscriptBaseReg)
49 return nullptr;
50
51 // The base of the subscript expression is affected by pointer arithmetics,
52 // so we want to report byte offsets instead of indices and we don't want to
53 // activate the "index is unsigned -> cannot be negative" shortcut.
54 if (isa<ElementRegion>(Val: SubscriptBaseReg->StripCasts()))
55 return nullptr;
56
57 return ASE;
58}
59
60/// If `E` is a "clean" array subscript expression, return the type of the
61/// accessed element; otherwise return std::nullopt because that's the best (or
62/// least bad) option for the diagnostic generation that relies on this.
63static std::optional<QualType> determineElementType(const Expr *E,
64 const CheckerContext &C) {
65 const auto *ASE = getAsCleanArraySubscriptExpr(E, C);
66 if (!ASE)
67 return std::nullopt;
68
69 return ASE->getType();
70}
71
72static std::optional<int64_t>
73determineElementSize(const std::optional<QualType> T, const CheckerContext &C) {
74 if (!T)
75 return std::nullopt;
76 return C.getASTContext().getTypeSizeInChars(T: *T).getQuantity();
77}
78
79class StateUpdateReporter {
80 const MemSpaceRegion *Space;
81 const SubRegion *Reg;
82 const NonLoc ByteOffsetVal;
83 const std::optional<QualType> ElementType;
84 const std::optional<int64_t> ElementSize;
85 bool AssumedNonNegative = false;
86 std::optional<NonLoc> AssumedUpperBound = std::nullopt;
87
88public:
89 StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E,
90 CheckerContext &C)
91 : Space(R->getMemorySpace(State: C.getState())), Reg(R),
92 ByteOffsetVal(ByteOffsVal), ElementType(determineElementType(E, C)),
93 ElementSize(determineElementSize(T: ElementType, C)) {}
94
95 void recordNonNegativeAssumption() { AssumedNonNegative = true; }
96 void recordUpperBoundAssumption(NonLoc UpperBoundVal) {
97 AssumedUpperBound = UpperBoundVal;
98 }
99
100 bool assumedNonNegative() { return AssumedNonNegative; }
101
102 const NoteTag *createNoteTag(CheckerContext &C) const;
103
104private:
105 std::string getMessage(PathSensitiveBugReport &BR) const;
106
107 /// Return true if information about the value of `Sym` can put constraints
108 /// on some symbol which is interesting within the bug report `BR`.
109 /// In particular, this returns true when `Sym` is interesting within `BR`;
110 /// but it also returns true if `Sym` is an expression that contains integer
111 /// constants and a single symbolic operand which is interesting (in `BR`).
112 /// We need to use this instead of plain `BR.isInteresting()` because if we
113 /// are analyzing code like
114 /// int array[10];
115 /// int f(int arg) {
116 /// return array[arg] && array[arg + 10];
117 /// }
118 /// then the byte offsets are `arg * 4` and `(arg + 10) * 4`, which are not
119 /// sub-expressions of each other (but `getSimplifiedOffsets` is smart enough
120 /// to detect this out of bounds access).
121 static bool providesInformationAboutInteresting(SymbolRef Sym,
122 PathSensitiveBugReport &BR);
123 static bool providesInformationAboutInteresting(SVal SV,
124 PathSensitiveBugReport &BR) {
125 return providesInformationAboutInteresting(Sym: SV.getAsSymbol(), BR);
126 }
127};
128
129struct Messages {
130 std::string Short, Full;
131};
132
133// NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt`
134// instead of `PreStmt` because the current implementation passes the whole
135// expression to `CheckerContext::getSVal()` which only works after the
136// symbolic evaluation of the expression. (To turn them into `PreStmt`
137// callbacks, we'd need to duplicate the logic that evaluates these
138// expressions.) The `MemberExpr` callback would work as `PreStmt` but it's
139// defined as `PostStmt` for the sake of consistency with the other callbacks.
140class ArrayBoundChecker : public Checker<check::PostStmt<ArraySubscriptExpr>,
141 check::PostStmt<UnaryOperator>,
142 check::PostStmt<MemberExpr>> {
143 BugType BT{this, "Out-of-bound access"};
144 BugType TaintBT{this, "Out-of-bound access", categories::TaintedData};
145
146 void performCheck(const Expr *E, CheckerContext &C) const;
147
148 void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs,
149 NonLoc Offset, std::optional<NonLoc> Extent,
150 bool IsTaintBug = false) const;
151
152 static void markPartsInteresting(PathSensitiveBugReport &BR,
153 ProgramStateRef ErrorState, NonLoc Val,
154 bool MarkTaint);
155
156 static bool isFromCtypeMacro(const Expr *E, ASTContext &AC);
157
158 static bool isOffsetObviouslyNonnegative(const Expr *E, CheckerContext &C);
159
160 static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State,
161 NonLoc Offset, NonLoc Limit,
162 CheckerContext &C);
163 static bool isInAddressOf(const Stmt *S, ASTContext &AC);
164
165public:
166 void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const {
167 performCheck(E, C);
168 }
169 void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const {
170 if (E->getOpcode() == UO_Deref)
171 performCheck(E, C);
172 }
173 void checkPostStmt(const MemberExpr *E, CheckerContext &C) const {
174 if (E->isArrow())
175 performCheck(E: E->getBase(), C);
176 }
177};
178
179} // anonymous namespace
180
181/// For a given Location that can be represented as a symbolic expression
182/// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block
183/// Arr and the distance of Location from the beginning of Arr (expressed in a
184/// NonLoc that specifies the number of CharUnits). Returns nullopt when these
185/// cannot be determined.
186static std::optional<std::pair<const SubRegion *, NonLoc>>
187computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) {
188 QualType T = SVB.getArrayIndexType();
189 auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) {
190 // We will use this utility to add and multiply values.
191 return SVB.evalBinOpNN(state: State, op: Op, lhs: L, rhs: R, resultTy: T).getAs<NonLoc>();
192 };
193
194 const SubRegion *OwnerRegion = nullptr;
195 std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex();
196
197 const ElementRegion *CurRegion =
198 dyn_cast_or_null<ElementRegion>(Val: Location.getAsRegion());
199
200 while (CurRegion) {
201 const auto Index = CurRegion->getIndex().getAs<NonLoc>();
202 if (!Index)
203 return std::nullopt;
204
205 QualType ElemType = CurRegion->getElementType();
206
207 // FIXME: The following early return was presumably added to safeguard the
208 // getTypeSizeInChars() call (which doesn't accept an incomplete type), but
209 // it seems that `ElemType` cannot be incomplete at this point.
210 if (ElemType->isIncompleteType())
211 return std::nullopt;
212
213 // Calculate Delta = Index * sizeof(ElemType).
214 NonLoc Size = SVB.makeArrayIndex(
215 idx: SVB.getContext().getTypeSizeInChars(T: ElemType).getQuantity());
216 auto Delta = EvalBinOp(BO_Mul, *Index, Size);
217 if (!Delta)
218 return std::nullopt;
219
220 // Perform Offset += Delta.
221 Offset = EvalBinOp(BO_Add, *Offset, *Delta);
222 if (!Offset)
223 return std::nullopt;
224
225 OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>();
226 // When this is just another ElementRegion layer, we need to continue the
227 // offset calculations:
228 CurRegion = dyn_cast_or_null<ElementRegion>(Val: OwnerRegion);
229 }
230
231 if (OwnerRegion)
232 return std::make_pair(x&: OwnerRegion, y&: *Offset);
233
234 return std::nullopt;
235}
236
237// NOTE: This function is the "heart" of this checker. It simplifies
238// inequalities with transformations that are valid (and very elementary) in
239// pure mathematics, but become invalid if we use them in C++ number model
240// where the calculations may overflow.
241// Due to the overflow issues I think it's impossible (or at least not
242// practical) to integrate this kind of simplification into the resolution of
243// arbitrary inequalities (i.e. the code of `evalBinOp`); but this function
244// produces valid results when the calculations are handling memory offsets
245// and every value is well below SIZE_MAX.
246// TODO: This algorithm should be moved to a central location where it's
247// available for other checkers that need to compare memory offsets.
248// NOTE: the simplification preserves the order of the two operands in a
249// mathematical sense, but it may change the result produced by a C++
250// comparison operator (and the automatic type conversions).
251// For example, consider a comparison "X+1 < 0", where the LHS is stored as a
252// size_t and the RHS is stored in an int. (As size_t is unsigned, this
253// comparison is false for all values of "X".) However, the simplification may
254// turn it into "X < -1", which is still always false in a mathematical sense,
255// but can produce a true result when evaluated by `evalBinOp` (which follows
256// the rules of C++ and casts -1 to SIZE_MAX).
257static std::pair<NonLoc, nonloc::ConcreteInt>
258getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent,
259 SValBuilder &svalBuilder) {
260 const llvm::APSInt &extentVal = extent.getValue();
261 std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>();
262 if (SymVal && SymVal->isExpression()) {
263 if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(Val: SymVal->getSymbol())) {
264 llvm::APSInt constant = APSIntType(extentVal).convert(Value: SIE->getRHS());
265 switch (SIE->getOpcode()) {
266 case BO_Mul:
267 // The constant should never be 0 here, becasue multiplication by zero
268 // is simplified by the engine.
269 if ((extentVal % constant) != 0)
270 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
271 else
272 return getSimplifiedOffsets(
273 offset: nonloc::SymbolVal(SIE->getLHS()),
274 extent: svalBuilder.makeIntVal(integer: extentVal / constant), svalBuilder);
275 case BO_Add:
276 return getSimplifiedOffsets(
277 offset: nonloc::SymbolVal(SIE->getLHS()),
278 extent: svalBuilder.makeIntVal(integer: extentVal - constant), svalBuilder);
279 default:
280 break;
281 }
282 }
283 }
284
285 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
286}
287
288static bool isNegative(SValBuilder &SVB, ProgramStateRef State, NonLoc Value) {
289 const llvm::APSInt *MaxV = SVB.getMaxValue(state: State, val: Value);
290 return MaxV && MaxV->isNegative();
291}
292
293static bool isUnsigned(SValBuilder &SVB, NonLoc Value) {
294 QualType T = Value.getType(SVB.getContext());
295 return T->isUnsignedIntegerType();
296}
297
298// Evaluate the comparison Value < Threshold with the help of the custom
299// simplification algorithm defined for this checker. Return a pair of states,
300// where the first one corresponds to "value below threshold" and the second
301// corresponds to "value at or above threshold". Returns {nullptr, nullptr} in
302// the case when the evaluation fails.
303// If the optional argument CheckEquality is true, then use BO_EQ instead of
304// the default BO_LT after consistently applying the same simplification steps.
305static std::pair<ProgramStateRef, ProgramStateRef>
306compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold,
307 SValBuilder &SVB, bool CheckEquality = false) {
308 if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
309 std::tie(args&: Value, args&: Threshold) =
310 getSimplifiedOffsets(offset: Value, extent: *ConcreteThreshold, svalBuilder&: SVB);
311 }
312
313 // We want to perform a _mathematical_ comparison between the numbers `Value`
314 // and `Threshold`; but `evalBinOpNN` evaluates a C/C++ operator that may
315 // perform automatic conversions. For example the number -1 is less than the
316 // number 1000, but -1 < `1000ull` will evaluate to `false` because the `int`
317 // -1 is converted to ULONGLONG_MAX.
318 // To avoid automatic conversions, we evaluate the "obvious" cases without
319 // calling `evalBinOpNN`:
320 if (isNegative(SVB, State, Value) && isUnsigned(SVB, Value: Threshold)) {
321 if (CheckEquality) {
322 // negative_value == unsigned_threshold is always false
323 return {nullptr, State};
324 }
325 // negative_value < unsigned_threshold is always true
326 return {State, nullptr};
327 }
328 if (isUnsigned(SVB, Value) && isNegative(SVB, State, Value: Threshold)) {
329 // unsigned_value == negative_threshold and
330 // unsigned_value < negative_threshold are both always false
331 return {nullptr, State};
332 }
333 // FIXME: These special cases are sufficient for handling real-world
334 // comparisons, but in theory there could be contrived situations where
335 // automatic conversion of a symbolic value (which can be negative and can be
336 // positive) leads to incorrect results.
337 // NOTE: We NEED to use the `evalBinOpNN` call in the "common" case, because
338 // we want to ensure that assumptions coming from this precondition and
339 // assumptions coming from regular C/C++ operator calls are represented by
340 // constraints on the same symbolic expression. A solution that would
341 // evaluate these "mathematical" comparisons through a separate pathway would
342 // be a step backwards in this sense.
343
344 const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT;
345 auto BelowThreshold =
346 SVB.evalBinOpNN(state: State, op: OpKind, lhs: Value, rhs: Threshold, resultTy: SVB.getConditionType())
347 .getAs<NonLoc>();
348
349 if (BelowThreshold)
350 return State->assume(Cond: *BelowThreshold);
351
352 return {nullptr, nullptr};
353}
354
355static std::string getRegionName(const MemSpaceRegion *Space,
356 const SubRegion *Region) {
357 if (std::string RegName = Region->getDescriptiveName(); !RegName.empty())
358 return RegName;
359
360 // Field regions only have descriptive names when their parent has a
361 // descriptive name; so we provide a fallback representation for them:
362 if (const auto *FR = Region->getAs<FieldRegion>()) {
363 if (StringRef Name = FR->getDecl()->getName(); !Name.empty())
364 return formatv(Fmt: "the field '{0}'", Vals&: Name);
365 return "the unnamed field";
366 }
367
368 if (isa<AllocaRegion>(Val: Region))
369 return "the memory returned by 'alloca'";
370
371 if (isa<SymbolicRegion>(Val: Region) && isa<HeapSpaceRegion>(Val: Space))
372 return "the heap area";
373
374 if (isa<StringRegion>(Val: Region))
375 return "the string literal";
376
377 return "the region";
378}
379
380static std::optional<int64_t> getConcreteValue(NonLoc SV) {
381 if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) {
382 return ConcreteVal->getValue()->tryExtValue();
383 }
384 return std::nullopt;
385}
386
387static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) {
388 return SV ? getConcreteValue(SV: *SV) : std::nullopt;
389}
390
391static Messages getPrecedesMsgs(const MemSpaceRegion *Space,
392 const SubRegion *Region, NonLoc Offset) {
393 std::string RegName = getRegionName(Space, Region), OffsetStr = "";
394
395 if (auto ConcreteOffset = getConcreteValue(SV: Offset))
396 OffsetStr = formatv(Fmt: " {0}", Vals&: ConcreteOffset);
397
398 return {
399 .Short: formatv(Fmt: "Out of bound access to memory preceding {0}", Vals&: RegName),
400 .Full: formatv(Fmt: "Access of {0} at negative byte offset{1}", Vals&: RegName, Vals&: OffsetStr)};
401}
402
403/// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if
404/// it can be performed (`Divisor` is nonzero and there is no remainder). The
405/// values `Val1` and `Val2` may be nullopt and in that case the corresponding
406/// division is considered to be successful.
407static bool tryDividePair(std::optional<int64_t> &Val1,
408 std::optional<int64_t> &Val2, int64_t Divisor) {
409 if (!Divisor)
410 return false;
411 const bool Val1HasRemainder = Val1 && *Val1 % Divisor;
412 const bool Val2HasRemainder = Val2 && *Val2 % Divisor;
413 if (Val1HasRemainder || Val2HasRemainder)
414 return false;
415 if (Val1)
416 *Val1 /= Divisor;
417 if (Val2)
418 *Val2 /= Divisor;
419 return true;
420}
421
422static Messages getExceedsMsgs(ASTContext &ACtx, const MemSpaceRegion *Space,
423 const SubRegion *Region, NonLoc Offset,
424 NonLoc Extent, SVal Location,
425 bool AlsoMentionUnderflow) {
426 std::string RegName = getRegionName(Space, Region);
427 const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
428 assert(EReg && "this checker only handles element access");
429 QualType ElemType = EReg->getElementType();
430
431 std::optional<int64_t> OffsetN = getConcreteValue(SV: Offset);
432 std::optional<int64_t> ExtentN = getConcreteValue(SV: Extent);
433
434 int64_t ElemSize = ACtx.getTypeSizeInChars(T: ElemType).getQuantity();
435
436 bool UseByteOffsets = !tryDividePair(Val1&: OffsetN, Val2&: ExtentN, Divisor: ElemSize);
437 const char *OffsetOrIndex = UseByteOffsets ? "byte offset" : "index";
438
439 SmallString<256> Buf;
440 llvm::raw_svector_ostream Out(Buf);
441 Out << "Access of ";
442 if (!ExtentN && !UseByteOffsets)
443 Out << "'" << ElemType.getAsString() << "' element in ";
444 Out << RegName << " at ";
445 if (AlsoMentionUnderflow) {
446 Out << "a negative or overflowing " << OffsetOrIndex;
447 } else if (OffsetN) {
448 Out << OffsetOrIndex << " " << *OffsetN;
449 } else {
450 Out << "an overflowing " << OffsetOrIndex;
451 }
452 if (ExtentN) {
453 Out << ", while it holds only ";
454 if (*ExtentN != 1)
455 Out << *ExtentN;
456 else
457 Out << "a single";
458 if (UseByteOffsets)
459 Out << " byte";
460 else
461 Out << " '" << ElemType.getAsString() << "' element";
462
463 if (*ExtentN > 1)
464 Out << "s";
465 }
466
467 return {.Short: formatv(Fmt: "Out of bound access to memory {0} {1}",
468 Vals: AlsoMentionUnderflow ? "around" : "after the end of",
469 Vals&: RegName),
470 .Full: std::string(Buf)};
471}
472
473static Messages getTaintMsgs(const MemSpaceRegion *Space,
474 const SubRegion *Region, const char *OffsetName,
475 bool AlsoMentionUnderflow) {
476 std::string RegName = getRegionName(Space, Region);
477 return {.Short: formatv(Fmt: "Potential out of bound access to {0} with tainted {1}",
478 Vals&: RegName, Vals&: OffsetName),
479 .Full: formatv(Fmt: "Access of {0} with a tainted {1} that may be {2}too large",
480 Vals&: RegName, Vals&: OffsetName,
481 Vals: AlsoMentionUnderflow ? "negative or " : "")};
482}
483
484const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const {
485 // Don't create a note tag if we didn't assume anything:
486 if (!AssumedNonNegative && !AssumedUpperBound)
487 return nullptr;
488
489 return C.getNoteTag(Cb: [*this](PathSensitiveBugReport &BR) -> std::string {
490 return getMessage(BR);
491 });
492}
493
494std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const {
495 bool ShouldReportNonNegative = AssumedNonNegative;
496 if (!providesInformationAboutInteresting(SV: ByteOffsetVal, BR)) {
497 if (AssumedUpperBound &&
498 providesInformationAboutInteresting(SV: *AssumedUpperBound, BR)) {
499 // Even if the byte offset isn't interesting (e.g. it's a constant value),
500 // the assumption can still be interesting if it provides information
501 // about an interesting symbolic upper bound.
502 ShouldReportNonNegative = false;
503 } else {
504 // We don't have anything interesting, don't report the assumption.
505 return "";
506 }
507 }
508
509 std::optional<int64_t> OffsetN = getConcreteValue(SV: ByteOffsetVal);
510 std::optional<int64_t> ExtentN = getConcreteValue(SV: AssumedUpperBound);
511
512 const bool UseIndex =
513 ElementSize && tryDividePair(Val1&: OffsetN, Val2&: ExtentN, Divisor: *ElementSize);
514
515 SmallString<256> Buf;
516 llvm::raw_svector_ostream Out(Buf);
517 Out << "Assuming ";
518 if (UseIndex) {
519 Out << "index ";
520 if (OffsetN)
521 Out << "'" << OffsetN << "' ";
522 } else if (AssumedUpperBound) {
523 Out << "byte offset ";
524 if (OffsetN)
525 Out << "'" << OffsetN << "' ";
526 } else {
527 Out << "offset ";
528 }
529
530 Out << "is";
531 if (ShouldReportNonNegative) {
532 Out << " non-negative";
533 }
534 if (AssumedUpperBound) {
535 if (ShouldReportNonNegative)
536 Out << " and";
537 Out << " less than ";
538 if (ExtentN)
539 Out << *ExtentN << ", ";
540 if (UseIndex && ElementType)
541 Out << "the number of '" << ElementType->getAsString()
542 << "' elements in ";
543 else
544 Out << "the extent of ";
545 Out << getRegionName(Space, Region: Reg);
546 }
547 return std::string(Out.str());
548}
549
550bool StateUpdateReporter::providesInformationAboutInteresting(
551 SymbolRef Sym, PathSensitiveBugReport &BR) {
552 if (!Sym)
553 return false;
554 for (SymbolRef PartSym : Sym->symbols()) {
555 // The interestingess mark may appear on any layer as we're stripping off
556 // the SymIntExpr, UnarySymExpr etc. layers...
557 if (BR.isInteresting(sym: PartSym))
558 return true;
559 // ...but if both sides of the expression are symbolic, then there is no
560 // practical algorithm to produce separate constraints for the two
561 // operands (from the single combined result).
562 if (isa<SymSymExpr>(Val: PartSym))
563 return false;
564 }
565 return false;
566}
567
568void ArrayBoundChecker::performCheck(const Expr *E, CheckerContext &C) const {
569 const SVal Location = C.getSVal(S: E);
570
571 // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
572 // #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX)
573 // and incomplete analysis of these leads to false positives. As even
574 // accurate reports would be confusing for the users, just disable reports
575 // from these macros:
576 if (isFromCtypeMacro(E, AC&: C.getASTContext()))
577 return;
578
579 ProgramStateRef State = C.getState();
580 SValBuilder &SVB = C.getSValBuilder();
581
582 const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset =
583 computeOffset(State, SVB, Location);
584
585 if (!RawOffset)
586 return;
587
588 auto [Reg, ByteOffset] = *RawOffset;
589
590 // The state updates will be reported as a single note tag, which will be
591 // composed by this helper class.
592 StateUpdateReporter SUR(Reg, ByteOffset, E, C);
593
594 // CHECK LOWER BOUND
595 const MemSpaceRegion *Space = Reg->getMemorySpace(State);
596 if (!(isa<SymbolicRegion>(Val: Reg) && isa<UnknownSpaceRegion>(Val: Space))) {
597 // A symbolic region in unknown space represents an unknown pointer that
598 // may point into the middle of an array, so we don't look for underflows.
599 // Both conditions are significant because we want to check underflows in
600 // symbolic regions on the heap (which may be introduced by checkers like
601 // MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and
602 // non-symbolic regions (e.g. a field subregion of a symbolic region) in
603 // unknown space.
604 auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold(
605 State, Value: ByteOffset, Threshold: SVB.makeZeroArrayIndex(), SVB);
606
607 if (PrecedesLowerBound) {
608 // The analyzer thinks that the offset may be invalid (negative)...
609
610 if (isOffsetObviouslyNonnegative(E, C)) {
611 // ...but the offset is obviously non-negative (clear array subscript
612 // with an unsigned index), so we're in a buggy situation.
613
614 // TODO: Currently the analyzer ignores many casts (e.g. signed ->
615 // unsigned casts), so it can easily reach states where it will load a
616 // signed (and negative) value from an unsigned variable. This sanity
617 // check is a duct tape "solution" that silences most of the ugly false
618 // positives that are caused by this buggy behavior. Note that this is
619 // not a complete solution: this cannot silence reports where pointer
620 // arithmetic complicates the picture and cannot ensure modeling of the
621 // "unsigned index is positive with highest bit set" cases which are
622 // "usurped" by the nonsense "unsigned index is negative" case.
623 // For more information about this topic, see the umbrella ticket
624 // https://github.com/llvm/llvm-project/issues/39492
625 // TODO: Remove this hack once 'SymbolCast's are modeled properly.
626
627 if (!WithinLowerBound) {
628 // The state is completely nonsense -- let's just sink it!
629 C.addSink();
630 return;
631 }
632 // Otherwise continue on the 'WithinLowerBound' branch where the
633 // unsigned index _is_ non-negative. Don't mention this assumption as a
634 // note tag, because it would just confuse the users!
635 } else {
636 if (!WithinLowerBound) {
637 // ...and it cannot be valid (>= 0), so report an error.
638 Messages Msgs = getPrecedesMsgs(Space, Region: Reg, Offset: ByteOffset);
639 reportOOB(C, ErrorState: PrecedesLowerBound, Msgs, Offset: ByteOffset, Extent: std::nullopt);
640 return;
641 }
642 // ...but it can be valid as well, so the checker will (optimistically)
643 // assume that it's valid and mention this in the note tag.
644 SUR.recordNonNegativeAssumption();
645 }
646 }
647
648 // Actually update the state. The "if" only fails in the extremely unlikely
649 // case when compareValueToThreshold returns {nullptr, nullptr} because
650 // evalBinOpNN fails to evaluate the less-than operator.
651 if (WithinLowerBound)
652 State = WithinLowerBound;
653 }
654
655 // CHECK UPPER BOUND
656 DefinedOrUnknownSVal Size = getDynamicExtent(State, MR: Reg, SVB);
657 if (auto KnownSize = Size.getAs<NonLoc>()) {
658 // In a situation where both underflow and overflow are possible (but the
659 // index is either tainted or known to be invalid), the logic of this
660 // checker will first assume that the offset is non-negative, and then
661 // (with this additional assumption) it will detect an overflow error.
662 // In this situation the warning message should mention both possibilities.
663 bool AlsoMentionUnderflow = SUR.assumedNonNegative();
664
665 auto [WithinUpperBound, ExceedsUpperBound] =
666 compareValueToThreshold(State, Value: ByteOffset, Threshold: *KnownSize, SVB);
667
668 if (ExceedsUpperBound) {
669 // The offset may be invalid (>= Size)...
670 if (!WithinUpperBound) {
671 // ...and it cannot be within bounds, so report an error, unless we can
672 // definitely determine that this is an idiomatic `&array[size]`
673 // expression that calculates the past-the-end pointer.
674 if (isIdiomaticPastTheEndPtr(E, State: ExceedsUpperBound, Offset: ByteOffset,
675 Limit: *KnownSize, C)) {
676 C.addTransition(State: ExceedsUpperBound, Tag: SUR.createNoteTag(C));
677 return;
678 }
679
680 Messages Msgs =
681 getExceedsMsgs(ACtx&: C.getASTContext(), Space, Region: Reg, Offset: ByteOffset,
682 Extent: *KnownSize, Location, AlsoMentionUnderflow);
683 reportOOB(C, ErrorState: ExceedsUpperBound, Msgs, Offset: ByteOffset, Extent: KnownSize);
684 return;
685 }
686 // ...and it can be valid as well...
687 if (isTainted(State, V: ByteOffset)) {
688 // ...but it's tainted, so report an error.
689
690 // Diagnostic detail: saying "tainted offset" is always correct, but
691 // the common case is that 'idx' is tainted in 'arr[idx]' and then it's
692 // nicer to say "tainted index".
693 const char *OffsetName = "offset";
694 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(Val: E))
695 if (isTainted(State, S: ASE->getIdx(), LCtx: C.getLocationContext()))
696 OffsetName = "index";
697
698 Messages Msgs =
699 getTaintMsgs(Space, Region: Reg, OffsetName, AlsoMentionUnderflow);
700 reportOOB(C, ErrorState: ExceedsUpperBound, Msgs, Offset: ByteOffset, Extent: KnownSize,
701 /*IsTaintBug=*/true);
702 return;
703 }
704 // ...and it isn't tainted, so the checker will (optimistically) assume
705 // that the offset is in bounds and mention this in the note tag.
706 SUR.recordUpperBoundAssumption(UpperBoundVal: *KnownSize);
707 }
708
709 // Actually update the state. The "if" only fails in the extremely unlikely
710 // case when compareValueToThreshold returns {nullptr, nullptr} because
711 // evalBinOpNN fails to evaluate the less-than operator.
712 if (WithinUpperBound)
713 State = WithinUpperBound;
714 }
715
716 // Add a transition, reporting the state updates that we accumulated.
717 C.addTransition(State, Tag: SUR.createNoteTag(C));
718}
719
720void ArrayBoundChecker::markPartsInteresting(PathSensitiveBugReport &BR,
721 ProgramStateRef ErrorState,
722 NonLoc Val, bool MarkTaint) {
723 if (SymbolRef Sym = Val.getAsSymbol()) {
724 // If the offset is a symbolic value, iterate over its "parts" with
725 // `SymExpr::symbols()` and mark each of them as interesting.
726 // For example, if the offset is `x*4 + y` then we put interestingness onto
727 // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols
728 // `x` and `y`.
729 for (SymbolRef PartSym : Sym->symbols())
730 BR.markInteresting(sym: PartSym);
731 }
732
733 if (MarkTaint) {
734 // If the issue that we're reporting depends on the taintedness of the
735 // offset, then put interestingness onto symbols that could be the origin
736 // of the taint. Note that this may find symbols that did not appear in
737 // `Sym->symbols()` (because they're only loosely connected to `Val`).
738 for (SymbolRef Sym : getTaintedSymbols(State: ErrorState, V: Val))
739 BR.markInteresting(sym: Sym);
740 }
741}
742
743void ArrayBoundChecker::reportOOB(CheckerContext &C, ProgramStateRef ErrorState,
744 Messages Msgs, NonLoc Offset,
745 std::optional<NonLoc> Extent,
746 bool IsTaintBug /*=false*/) const {
747
748 ExplodedNode *ErrorNode = C.generateErrorNode(State: ErrorState);
749 if (!ErrorNode)
750 return;
751
752 auto BR = std::make_unique<PathSensitiveBugReport>(
753 args: IsTaintBug ? TaintBT : BT, args&: Msgs.Short, args&: Msgs.Full, args&: ErrorNode);
754
755 // FIXME: ideally we would just call trackExpressionValue() and that would
756 // "do the right thing": mark the relevant symbols as interesting, track the
757 // control dependencies and statements storing the relevant values and add
758 // helpful diagnostic pieces. However, right now trackExpressionValue() is
759 // a heap of unreliable heuristics, so it would cause several issues:
760 // - Interestingness is not applied consistently, e.g. if `array[x+10]`
761 // causes an overflow, then `x` is not marked as interesting.
762 // - We get irrelevant diagnostic pieces, e.g. in the code
763 // `int *p = (int*)malloc(2*sizeof(int)); p[3] = 0;`
764 // it places a "Storing uninitialized value" note on the `malloc` call
765 // (which is technically true, but irrelevant).
766 // If trackExpressionValue() becomes reliable, it should be applied instead
767 // of this custom markPartsInteresting().
768 markPartsInteresting(BR&: *BR, ErrorState, Val: Offset, MarkTaint: IsTaintBug);
769 if (Extent)
770 markPartsInteresting(BR&: *BR, ErrorState, Val: *Extent, MarkTaint: IsTaintBug);
771
772 C.emitReport(R: std::move(BR));
773}
774
775bool ArrayBoundChecker::isFromCtypeMacro(const Expr *E, ASTContext &ACtx) {
776 SourceLocation Loc = E->getBeginLoc();
777 if (!Loc.isMacroID())
778 return false;
779
780 StringRef MacroName = Lexer::getImmediateMacroName(
781 Loc, SM: ACtx.getSourceManager(), LangOpts: ACtx.getLangOpts());
782
783 if (MacroName.size() < 7 || MacroName[0] != 'i' || MacroName[1] != 's')
784 return false;
785
786 return ((MacroName == "isalnum") || (MacroName == "isalpha") ||
787 (MacroName == "isblank") || (MacroName == "isdigit") ||
788 (MacroName == "isgraph") || (MacroName == "islower") ||
789 (MacroName == "isnctrl") || (MacroName == "isprint") ||
790 (MacroName == "ispunct") || (MacroName == "isspace") ||
791 (MacroName == "isupper") || (MacroName == "isxdigit"));
792}
793
794bool ArrayBoundChecker::isOffsetObviouslyNonnegative(const Expr *E,
795 CheckerContext &C) {
796 const ArraySubscriptExpr *ASE = getAsCleanArraySubscriptExpr(E, C);
797 if (!ASE)
798 return false;
799 return ASE->getIdx()->getType()->isUnsignedIntegerOrEnumerationType();
800}
801
802bool ArrayBoundChecker::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
803 ParentMapContext &ParentCtx = ACtx.getParentMapContext();
804 do {
805 const DynTypedNodeList Parents = ParentCtx.getParents(Node: *S);
806 if (Parents.empty())
807 return false;
808 S = Parents[0].get<Stmt>();
809 } while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(Val: S));
810 const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(Val: S);
811 return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf;
812}
813
814bool ArrayBoundChecker::isIdiomaticPastTheEndPtr(const Expr *E,
815 ProgramStateRef State,
816 NonLoc Offset, NonLoc Limit,
817 CheckerContext &C) {
818 if (isa<ArraySubscriptExpr>(Val: E) && isInAddressOf(S: E, ACtx&: C.getASTContext())) {
819 auto [EqualsToThreshold, NotEqualToThreshold] = compareValueToThreshold(
820 State, Value: Offset, Threshold: Limit, SVB&: C.getSValBuilder(), /*CheckEquality=*/true);
821 return EqualsToThreshold && !NotEqualToThreshold;
822 }
823 return false;
824}
825
826void ento::registerArrayBoundChecker(CheckerManager &mgr) {
827 mgr.registerChecker<ArrayBoundChecker>();
828}
829
830bool ento::shouldRegisterArrayBoundChecker(const CheckerManager &mgr) {
831 return true;
832}
833