1//== ArrayBoundCheckerV2.cpp ------------------------------------*- C++ -*--==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines ArrayBoundCheckerV2, which is a path-sensitive check
10// which looks for an out-of-bound array element access.
11//
12//===----------------------------------------------------------------------===//
13
14#include "clang/AST/CharUnits.h"
15#include "clang/AST/ParentMapContext.h"
16#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
17#include "clang/StaticAnalyzer/Checkers/Taint.h"
18#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
19#include "clang/StaticAnalyzer/Core/Checker.h"
20#include "clang/StaticAnalyzer/Core/CheckerManager.h"
21#include "clang/StaticAnalyzer/Core/PathSensitive/APSIntType.h"
22#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
23#include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h"
24#include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
25#include "llvm/ADT/SmallString.h"
26#include "llvm/Support/FormatVariadic.h"
27#include "llvm/Support/raw_ostream.h"
28#include <optional>
29
30using namespace clang;
31using namespace ento;
32using namespace taint;
33using llvm::formatv;
34
35namespace {
36/// If `E` is a "clean" array subscript expression, return the type of the
37/// accessed element. If the base of the subscript expression is modified by
38/// pointer arithmetic (and not the beginning of a "full" memory region), this
39/// always returns nullopt because that's the right (or the least bad) thing to
40/// do for the diagnostic output that's relying on this.
41static std::optional<QualType> determineElementType(const Expr *E,
42 const CheckerContext &C) {
43 const auto *ASE = dyn_cast<ArraySubscriptExpr>(Val: E);
44 if (!ASE)
45 return std::nullopt;
46
47 const MemRegion *SubscriptBaseReg = C.getSVal(S: ASE->getBase()).getAsRegion();
48 if (!SubscriptBaseReg)
49 return std::nullopt;
50
51 // The base of the subscript expression is affected by pointer arithmetics,
52 // so we want to report byte offsets instead of indices.
53 if (isa<ElementRegion>(Val: SubscriptBaseReg->StripCasts()))
54 return std::nullopt;
55
56 return ASE->getType();
57}
58
59static std::optional<int64_t>
60determineElementSize(const std::optional<QualType> T, const CheckerContext &C) {
61 if (!T)
62 return std::nullopt;
63 return C.getASTContext().getTypeSizeInChars(T: *T).getQuantity();
64}
65
66class StateUpdateReporter {
67 const SubRegion *Reg;
68 const NonLoc ByteOffsetVal;
69 const std::optional<QualType> ElementType;
70 const std::optional<int64_t> ElementSize;
71 bool AssumedNonNegative = false;
72 std::optional<NonLoc> AssumedUpperBound = std::nullopt;
73
74public:
75 StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E,
76 CheckerContext &C)
77 : Reg(R), ByteOffsetVal(ByteOffsVal),
78 ElementType(determineElementType(E, C)),
79 ElementSize(determineElementSize(T: ElementType, C)) {}
80
81 void recordNonNegativeAssumption() { AssumedNonNegative = true; }
82 void recordUpperBoundAssumption(NonLoc UpperBoundVal) {
83 AssumedUpperBound = UpperBoundVal;
84 }
85
86 bool assumedNonNegative() { return AssumedNonNegative; }
87
88 const NoteTag *createNoteTag(CheckerContext &C) const;
89
90private:
91 std::string getMessage(PathSensitiveBugReport &BR) const;
92
93 /// Return true if information about the value of `Sym` can put constraints
94 /// on some symbol which is interesting within the bug report `BR`.
95 /// In particular, this returns true when `Sym` is interesting within `BR`;
96 /// but it also returns true if `Sym` is an expression that contains integer
97 /// constants and a single symbolic operand which is interesting (in `BR`).
98 /// We need to use this instead of plain `BR.isInteresting()` because if we
99 /// are analyzing code like
100 /// int array[10];
101 /// int f(int arg) {
102 /// return array[arg] && array[arg + 10];
103 /// }
104 /// then the byte offsets are `arg * 4` and `(arg + 10) * 4`, which are not
105 /// sub-expressions of each other (but `getSimplifiedOffsets` is smart enough
106 /// to detect this out of bounds access).
107 static bool providesInformationAboutInteresting(SymbolRef Sym,
108 PathSensitiveBugReport &BR);
109 static bool providesInformationAboutInteresting(SVal SV,
110 PathSensitiveBugReport &BR) {
111 return providesInformationAboutInteresting(Sym: SV.getAsSymbol(), BR);
112 }
113};
114
115struct Messages {
116 std::string Short, Full;
117};
118
119// NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt`
120// instead of `PreStmt` because the current implementation passes the whole
121// expression to `CheckerContext::getSVal()` which only works after the
122// symbolic evaluation of the expression. (To turn them into `PreStmt`
123// callbacks, we'd need to duplicate the logic that evaluates these
124// expressions.) The `MemberExpr` callback would work as `PreStmt` but it's
125// defined as `PostStmt` for the sake of consistency with the other callbacks.
126class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>,
127 check::PostStmt<UnaryOperator>,
128 check::PostStmt<MemberExpr>> {
129 BugType BT{this, "Out-of-bound access"};
130 BugType TaintBT{this, "Out-of-bound access", categories::TaintedData};
131
132 void performCheck(const Expr *E, CheckerContext &C) const;
133
134 void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs,
135 NonLoc Offset, std::optional<NonLoc> Extent,
136 bool IsTaintBug = false) const;
137
138 static void markPartsInteresting(PathSensitiveBugReport &BR,
139 ProgramStateRef ErrorState, NonLoc Val,
140 bool MarkTaint);
141
142 static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
143
144 static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State,
145 NonLoc Offset, NonLoc Limit,
146 CheckerContext &C);
147 static bool isInAddressOf(const Stmt *S, ASTContext &AC);
148
149public:
150 void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const {
151 performCheck(E, C);
152 }
153 void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const {
154 if (E->getOpcode() == UO_Deref)
155 performCheck(E, C);
156 }
157 void checkPostStmt(const MemberExpr *E, CheckerContext &C) const {
158 if (E->isArrow())
159 performCheck(E: E->getBase(), C);
160 }
161};
162
163} // anonymous namespace
164
165/// For a given Location that can be represented as a symbolic expression
166/// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block
167/// Arr and the distance of Location from the beginning of Arr (expressed in a
168/// NonLoc that specifies the number of CharUnits). Returns nullopt when these
169/// cannot be determined.
170static std::optional<std::pair<const SubRegion *, NonLoc>>
171computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) {
172 QualType T = SVB.getArrayIndexType();
173 auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) {
174 // We will use this utility to add and multiply values.
175 return SVB.evalBinOpNN(state: State, op: Op, lhs: L, rhs: R, resultTy: T).getAs<NonLoc>();
176 };
177
178 const SubRegion *OwnerRegion = nullptr;
179 std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex();
180
181 const ElementRegion *CurRegion =
182 dyn_cast_or_null<ElementRegion>(Val: Location.getAsRegion());
183
184 while (CurRegion) {
185 const auto Index = CurRegion->getIndex().getAs<NonLoc>();
186 if (!Index)
187 return std::nullopt;
188
189 QualType ElemType = CurRegion->getElementType();
190
191 // FIXME: The following early return was presumably added to safeguard the
192 // getTypeSizeInChars() call (which doesn't accept an incomplete type), but
193 // it seems that `ElemType` cannot be incomplete at this point.
194 if (ElemType->isIncompleteType())
195 return std::nullopt;
196
197 // Calculate Delta = Index * sizeof(ElemType).
198 NonLoc Size = SVB.makeArrayIndex(
199 idx: SVB.getContext().getTypeSizeInChars(T: ElemType).getQuantity());
200 auto Delta = EvalBinOp(BO_Mul, *Index, Size);
201 if (!Delta)
202 return std::nullopt;
203
204 // Perform Offset += Delta.
205 Offset = EvalBinOp(BO_Add, *Offset, *Delta);
206 if (!Offset)
207 return std::nullopt;
208
209 OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>();
210 // When this is just another ElementRegion layer, we need to continue the
211 // offset calculations:
212 CurRegion = dyn_cast_or_null<ElementRegion>(Val: OwnerRegion);
213 }
214
215 if (OwnerRegion)
216 return std::make_pair(x&: OwnerRegion, y&: *Offset);
217
218 return std::nullopt;
219}
220
221// NOTE: This function is the "heart" of this checker. It simplifies
222// inequalities with transformations that are valid (and very elementary) in
223// pure mathematics, but become invalid if we use them in C++ number model
224// where the calculations may overflow.
225// Due to the overflow issues I think it's impossible (or at least not
226// practical) to integrate this kind of simplification into the resolution of
227// arbitrary inequalities (i.e. the code of `evalBinOp`); but this function
228// produces valid results when the calculations are handling memory offsets
229// and every value is well below SIZE_MAX.
230// TODO: This algorithm should be moved to a central location where it's
231// available for other checkers that need to compare memory offsets.
232// NOTE: the simplification preserves the order of the two operands in a
233// mathematical sense, but it may change the result produced by a C++
234// comparison operator (and the automatic type conversions).
235// For example, consider a comparison "X+1 < 0", where the LHS is stored as a
236// size_t and the RHS is stored in an int. (As size_t is unsigned, this
237// comparison is false for all values of "X".) However, the simplification may
238// turn it into "X < -1", which is still always false in a mathematical sense,
239// but can produce a true result when evaluated by `evalBinOp` (which follows
240// the rules of C++ and casts -1 to SIZE_MAX).
241static std::pair<NonLoc, nonloc::ConcreteInt>
242getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent,
243 SValBuilder &svalBuilder) {
244 std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>();
245 if (SymVal && SymVal->isExpression()) {
246 if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(Val: SymVal->getSymbol())) {
247 llvm::APSInt constant =
248 APSIntType(extent.getValue()).convert(Value: SIE->getRHS());
249 switch (SIE->getOpcode()) {
250 case BO_Mul:
251 // The constant should never be 0 here, becasue multiplication by zero
252 // is simplified by the engine.
253 if ((extent.getValue() % constant) != 0)
254 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
255 else
256 return getSimplifiedOffsets(
257 offset: nonloc::SymbolVal(SIE->getLHS()),
258 extent: svalBuilder.makeIntVal(integer: extent.getValue() / constant),
259 svalBuilder);
260 case BO_Add:
261 return getSimplifiedOffsets(
262 offset: nonloc::SymbolVal(SIE->getLHS()),
263 extent: svalBuilder.makeIntVal(integer: extent.getValue() - constant), svalBuilder);
264 default:
265 break;
266 }
267 }
268 }
269
270 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
271}
272
273static bool isNegative(SValBuilder &SVB, ProgramStateRef State, NonLoc Value) {
274 const llvm::APSInt *MaxV = SVB.getMaxValue(state: State, val: Value);
275 return MaxV && MaxV->isNegative();
276}
277
278static bool isUnsigned(SValBuilder &SVB, NonLoc Value) {
279 QualType T = Value.getType(SVB.getContext());
280 return T->isUnsignedIntegerType();
281}
282
283// Evaluate the comparison Value < Threshold with the help of the custom
284// simplification algorithm defined for this checker. Return a pair of states,
285// where the first one corresponds to "value below threshold" and the second
286// corresponds to "value at or above threshold". Returns {nullptr, nullptr} in
287// the case when the evaluation fails.
288// If the optional argument CheckEquality is true, then use BO_EQ instead of
289// the default BO_LT after consistently applying the same simplification steps.
290static std::pair<ProgramStateRef, ProgramStateRef>
291compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold,
292 SValBuilder &SVB, bool CheckEquality = false) {
293 if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
294 std::tie(args&: Value, args&: Threshold) = getSimplifiedOffsets(offset: Value, extent: *ConcreteThreshold, svalBuilder&: SVB);
295 }
296
297 // We want to perform a _mathematical_ comparison between the numbers `Value`
298 // and `Threshold`; but `evalBinOpNN` evaluates a C/C++ operator that may
299 // perform automatic conversions. For example the number -1 is less than the
300 // number 1000, but -1 < `1000ull` will evaluate to `false` because the `int`
301 // -1 is converted to ULONGLONG_MAX.
302 // To avoid automatic conversions, we evaluate the "obvious" cases without
303 // calling `evalBinOpNN`:
304 if (isNegative(SVB, State, Value) && isUnsigned(SVB, Value: Threshold)) {
305 if (CheckEquality) {
306 // negative_value == unsigned_threshold is always false
307 return {nullptr, State};
308 }
309 // negative_value < unsigned_threshold is always true
310 return {State, nullptr};
311 }
312 if (isUnsigned(SVB, Value) && isNegative(SVB, State, Value: Threshold)) {
313 // unsigned_value == negative_threshold and
314 // unsigned_value < negative_threshold are both always false
315 return {nullptr, State};
316 }
317 // FIXME: These special cases are sufficient for handling real-world
318 // comparisons, but in theory there could be contrived situations where
319 // automatic conversion of a symbolic value (which can be negative and can be
320 // positive) leads to incorrect results.
321 // NOTE: We NEED to use the `evalBinOpNN` call in the "common" case, because
322 // we want to ensure that assumptions coming from this precondition and
323 // assumptions coming from regular C/C++ operator calls are represented by
324 // constraints on the same symbolic expression. A solution that would
325 // evaluate these "mathematical" compariosns through a separate pathway would
326 // be a step backwards in this sense.
327
328 const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT;
329 auto BelowThreshold =
330 SVB.evalBinOpNN(state: State, op: OpKind, lhs: Value, rhs: Threshold, resultTy: SVB.getConditionType())
331 .getAs<NonLoc>();
332
333 if (BelowThreshold)
334 return State->assume(Cond: *BelowThreshold);
335
336 return {nullptr, nullptr};
337}
338
339static std::string getRegionName(const SubRegion *Region) {
340 if (std::string RegName = Region->getDescriptiveName(); !RegName.empty())
341 return RegName;
342
343 // Field regions only have descriptive names when their parent has a
344 // descriptive name; so we provide a fallback representation for them:
345 if (const auto *FR = Region->getAs<FieldRegion>()) {
346 if (StringRef Name = FR->getDecl()->getName(); !Name.empty())
347 return formatv(Fmt: "the field '{0}'", Vals&: Name);
348 return "the unnamed field";
349 }
350
351 if (isa<AllocaRegion>(Val: Region))
352 return "the memory returned by 'alloca'";
353
354 if (isa<SymbolicRegion>(Val: Region) &&
355 isa<HeapSpaceRegion>(Val: Region->getMemorySpace()))
356 return "the heap area";
357
358 if (isa<StringRegion>(Val: Region))
359 return "the string literal";
360
361 return "the region";
362}
363
364static std::optional<int64_t> getConcreteValue(NonLoc SV) {
365 if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) {
366 return ConcreteVal->getValue().tryExtValue();
367 }
368 return std::nullopt;
369}
370
371static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) {
372 return SV ? getConcreteValue(SV: *SV) : std::nullopt;
373}
374
375static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
376 std::string RegName = getRegionName(Region), OffsetStr = "";
377
378 if (auto ConcreteOffset = getConcreteValue(SV: Offset))
379 OffsetStr = formatv(Fmt: " {0}", Vals&: ConcreteOffset);
380
381 return {
382 .Short: formatv(Fmt: "Out of bound access to memory preceding {0}", Vals&: RegName),
383 .Full: formatv(Fmt: "Access of {0} at negative byte offset{1}", Vals&: RegName, Vals&: OffsetStr)};
384}
385
386/// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if
387/// it can be performed (`Divisor` is nonzero and there is no remainder). The
388/// values `Val1` and `Val2` may be nullopt and in that case the corresponding
389/// division is considered to be successful.
390static bool tryDividePair(std::optional<int64_t> &Val1,
391 std::optional<int64_t> &Val2, int64_t Divisor) {
392 if (!Divisor)
393 return false;
394 const bool Val1HasRemainder = Val1 && *Val1 % Divisor;
395 const bool Val2HasRemainder = Val2 && *Val2 % Divisor;
396 if (!Val1HasRemainder && !Val2HasRemainder) {
397 if (Val1)
398 *Val1 /= Divisor;
399 if (Val2)
400 *Val2 /= Divisor;
401 return true;
402 }
403 return false;
404}
405
406static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
407 NonLoc Offset, NonLoc Extent, SVal Location,
408 bool AlsoMentionUnderflow) {
409 std::string RegName = getRegionName(Region);
410 const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
411 assert(EReg && "this checker only handles element access");
412 QualType ElemType = EReg->getElementType();
413
414 std::optional<int64_t> OffsetN = getConcreteValue(SV: Offset);
415 std::optional<int64_t> ExtentN = getConcreteValue(SV: Extent);
416
417 int64_t ElemSize = ACtx.getTypeSizeInChars(T: ElemType).getQuantity();
418
419 bool UseByteOffsets = !tryDividePair(Val1&: OffsetN, Val2&: ExtentN, Divisor: ElemSize);
420 const char *OffsetOrIndex = UseByteOffsets ? "byte offset" : "index";
421
422 SmallString<256> Buf;
423 llvm::raw_svector_ostream Out(Buf);
424 Out << "Access of ";
425 if (!ExtentN && !UseByteOffsets)
426 Out << "'" << ElemType.getAsString() << "' element in ";
427 Out << RegName << " at ";
428 if (AlsoMentionUnderflow) {
429 Out << "a negative or overflowing " << OffsetOrIndex;
430 } else if (OffsetN) {
431 Out << OffsetOrIndex << " " << *OffsetN;
432 } else {
433 Out << "an overflowing " << OffsetOrIndex;
434 }
435 if (ExtentN) {
436 Out << ", while it holds only ";
437 if (*ExtentN != 1)
438 Out << *ExtentN;
439 else
440 Out << "a single";
441 if (UseByteOffsets)
442 Out << " byte";
443 else
444 Out << " '" << ElemType.getAsString() << "' element";
445
446 if (*ExtentN > 1)
447 Out << "s";
448 }
449
450 return {.Short: formatv(Fmt: "Out of bound access to memory {0} {1}",
451 Vals: AlsoMentionUnderflow ? "around" : "after the end of",
452 Vals&: RegName),
453 .Full: std::string(Buf)};
454}
455
456static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName,
457 bool AlsoMentionUnderflow) {
458 std::string RegName = getRegionName(Region);
459 return {.Short: formatv(Fmt: "Potential out of bound access to {0} with tainted {1}",
460 Vals&: RegName, Vals&: OffsetName),
461 .Full: formatv(Fmt: "Access of {0} with a tainted {1} that may be {2}too large",
462 Vals&: RegName, Vals&: OffsetName,
463 Vals: AlsoMentionUnderflow ? "negative or " : "")};
464}
465
466const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const {
467 // Don't create a note tag if we didn't assume anything:
468 if (!AssumedNonNegative && !AssumedUpperBound)
469 return nullptr;
470
471 return C.getNoteTag(Cb: [*this](PathSensitiveBugReport &BR) -> std::string {
472 return getMessage(BR);
473 });
474}
475
476std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const {
477 bool ShouldReportNonNegative = AssumedNonNegative;
478 if (!providesInformationAboutInteresting(SV: ByteOffsetVal, BR)) {
479 if (AssumedUpperBound &&
480 providesInformationAboutInteresting(SV: *AssumedUpperBound, BR)) {
481 // Even if the byte offset isn't interesting (e.g. it's a constant value),
482 // the assumption can still be interesting if it provides information
483 // about an interesting symbolic upper bound.
484 ShouldReportNonNegative = false;
485 } else {
486 // We don't have anything interesting, don't report the assumption.
487 return "";
488 }
489 }
490
491 std::optional<int64_t> OffsetN = getConcreteValue(SV: ByteOffsetVal);
492 std::optional<int64_t> ExtentN = getConcreteValue(SV: AssumedUpperBound);
493
494 const bool UseIndex =
495 ElementSize && tryDividePair(Val1&: OffsetN, Val2&: ExtentN, Divisor: *ElementSize);
496
497 SmallString<256> Buf;
498 llvm::raw_svector_ostream Out(Buf);
499 Out << "Assuming ";
500 if (UseIndex) {
501 Out << "index ";
502 if (OffsetN)
503 Out << "'" << OffsetN << "' ";
504 } else if (AssumedUpperBound) {
505 Out << "byte offset ";
506 if (OffsetN)
507 Out << "'" << OffsetN << "' ";
508 } else {
509 Out << "offset ";
510 }
511
512 Out << "is";
513 if (ShouldReportNonNegative) {
514 Out << " non-negative";
515 }
516 if (AssumedUpperBound) {
517 if (ShouldReportNonNegative)
518 Out << " and";
519 Out << " less than ";
520 if (ExtentN)
521 Out << *ExtentN << ", ";
522 if (UseIndex && ElementType)
523 Out << "the number of '" << ElementType->getAsString()
524 << "' elements in ";
525 else
526 Out << "the extent of ";
527 Out << getRegionName(Region: Reg);
528 }
529 return std::string(Out.str());
530}
531
532bool StateUpdateReporter::providesInformationAboutInteresting(
533 SymbolRef Sym, PathSensitiveBugReport &BR) {
534 if (!Sym)
535 return false;
536 for (SymbolRef PartSym : Sym->symbols()) {
537 // The interestingess mark may appear on any layer as we're stripping off
538 // the SymIntExpr, UnarySymExpr etc. layers...
539 if (BR.isInteresting(sym: PartSym))
540 return true;
541 // ...but if both sides of the expression are symbolic, then there is no
542 // practical algorithm to produce separate constraints for the two
543 // operands (from the single combined result).
544 if (isa<SymSymExpr>(Val: PartSym))
545 return false;
546 }
547 return false;
548}
549
550void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
551 const SVal Location = C.getSVal(S: E);
552
553 // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
554 // #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX)
555 // and incomplete analysis of these leads to false positives. As even
556 // accurate reports would be confusing for the users, just disable reports
557 // from these macros:
558 if (isFromCtypeMacro(S: E, AC&: C.getASTContext()))
559 return;
560
561 ProgramStateRef State = C.getState();
562 SValBuilder &SVB = C.getSValBuilder();
563
564 const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset =
565 computeOffset(State, SVB, Location);
566
567 if (!RawOffset)
568 return;
569
570 auto [Reg, ByteOffset] = *RawOffset;
571
572 // The state updates will be reported as a single note tag, which will be
573 // composed by this helper class.
574 StateUpdateReporter SUR(Reg, ByteOffset, E, C);
575
576 // CHECK LOWER BOUND
577 const MemSpaceRegion *Space = Reg->getMemorySpace();
578 if (!(isa<SymbolicRegion>(Val: Reg) && isa<UnknownSpaceRegion>(Val: Space))) {
579 // A symbolic region in unknown space represents an unknown pointer that
580 // may point into the middle of an array, so we don't look for underflows.
581 // Both conditions are significant because we want to check underflows in
582 // symbolic regions on the heap (which may be introduced by checkers like
583 // MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and
584 // non-symbolic regions (e.g. a field subregion of a symbolic region) in
585 // unknown space.
586 auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold(
587 State, Value: ByteOffset, Threshold: SVB.makeZeroArrayIndex(), SVB);
588
589 if (PrecedesLowerBound) {
590 // The offset may be invalid (negative)...
591 if (!WithinLowerBound) {
592 // ...and it cannot be valid (>= 0), so report an error.
593 Messages Msgs = getPrecedesMsgs(Region: Reg, Offset: ByteOffset);
594 reportOOB(C, ErrorState: PrecedesLowerBound, Msgs, Offset: ByteOffset, Extent: std::nullopt);
595 return;
596 }
597 // ...but it can be valid as well, so the checker will (optimistically)
598 // assume that it's valid and mention this in the note tag.
599 SUR.recordNonNegativeAssumption();
600 }
601
602 // Actually update the state. The "if" only fails in the extremely unlikely
603 // case when compareValueToThreshold returns {nullptr, nullptr} becasue
604 // evalBinOpNN fails to evaluate the less-than operator.
605 if (WithinLowerBound)
606 State = WithinLowerBound;
607 }
608
609 // CHECK UPPER BOUND
610 DefinedOrUnknownSVal Size = getDynamicExtent(State, MR: Reg, SVB);
611 if (auto KnownSize = Size.getAs<NonLoc>()) {
612 // In a situation where both underflow and overflow are possible (but the
613 // index is either tainted or known to be invalid), the logic of this
614 // checker will first assume that the offset is non-negative, and then
615 // (with this additional assumption) it will detect an overflow error.
616 // In this situation the warning message should mention both possibilities.
617 bool AlsoMentionUnderflow = SUR.assumedNonNegative();
618
619 auto [WithinUpperBound, ExceedsUpperBound] =
620 compareValueToThreshold(State, Value: ByteOffset, Threshold: *KnownSize, SVB);
621
622 if (ExceedsUpperBound) {
623 // The offset may be invalid (>= Size)...
624 if (!WithinUpperBound) {
625 // ...and it cannot be within bounds, so report an error, unless we can
626 // definitely determine that this is an idiomatic `&array[size]`
627 // expression that calculates the past-the-end pointer.
628 if (isIdiomaticPastTheEndPtr(E, State: ExceedsUpperBound, Offset: ByteOffset,
629 Limit: *KnownSize, C)) {
630 C.addTransition(State: ExceedsUpperBound, Tag: SUR.createNoteTag(C));
631 return;
632 }
633
634 Messages Msgs =
635 getExceedsMsgs(ACtx&: C.getASTContext(), Region: Reg, Offset: ByteOffset, Extent: *KnownSize,
636 Location, AlsoMentionUnderflow);
637 reportOOB(C, ErrorState: ExceedsUpperBound, Msgs, Offset: ByteOffset, Extent: KnownSize);
638 return;
639 }
640 // ...and it can be valid as well...
641 if (isTainted(State, V: ByteOffset)) {
642 // ...but it's tainted, so report an error.
643
644 // Diagnostic detail: saying "tainted offset" is always correct, but
645 // the common case is that 'idx' is tainted in 'arr[idx]' and then it's
646 // nicer to say "tainted index".
647 const char *OffsetName = "offset";
648 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(Val: E))
649 if (isTainted(State, S: ASE->getIdx(), LCtx: C.getLocationContext()))
650 OffsetName = "index";
651
652 Messages Msgs = getTaintMsgs(Region: Reg, OffsetName, AlsoMentionUnderflow);
653 reportOOB(C, ErrorState: ExceedsUpperBound, Msgs, Offset: ByteOffset, Extent: KnownSize,
654 /*IsTaintBug=*/true);
655 return;
656 }
657 // ...and it isn't tainted, so the checker will (optimistically) assume
658 // that the offset is in bounds and mention this in the note tag.
659 SUR.recordUpperBoundAssumption(UpperBoundVal: *KnownSize);
660 }
661
662 // Actually update the state. The "if" only fails in the extremely unlikely
663 // case when compareValueToThreshold returns {nullptr, nullptr} becasue
664 // evalBinOpNN fails to evaluate the less-than operator.
665 if (WithinUpperBound)
666 State = WithinUpperBound;
667 }
668
669 // Add a transition, reporting the state updates that we accumulated.
670 C.addTransition(State, Tag: SUR.createNoteTag(C));
671}
672
673void ArrayBoundCheckerV2::markPartsInteresting(PathSensitiveBugReport &BR,
674 ProgramStateRef ErrorState,
675 NonLoc Val, bool MarkTaint) {
676 if (SymbolRef Sym = Val.getAsSymbol()) {
677 // If the offset is a symbolic value, iterate over its "parts" with
678 // `SymExpr::symbols()` and mark each of them as interesting.
679 // For example, if the offset is `x*4 + y` then we put interestingness onto
680 // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols
681 // `x` and `y`.
682 for (SymbolRef PartSym : Sym->symbols())
683 BR.markInteresting(sym: PartSym);
684 }
685
686 if (MarkTaint) {
687 // If the issue that we're reporting depends on the taintedness of the
688 // offset, then put interestingness onto symbols that could be the origin
689 // of the taint. Note that this may find symbols that did not appear in
690 // `Sym->symbols()` (because they're only loosely connected to `Val`).
691 for (SymbolRef Sym : getTaintedSymbols(State: ErrorState, V: Val))
692 BR.markInteresting(sym: Sym);
693 }
694}
695
696void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
697 ProgramStateRef ErrorState, Messages Msgs,
698 NonLoc Offset, std::optional<NonLoc> Extent,
699 bool IsTaintBug /*=false*/) const {
700
701 ExplodedNode *ErrorNode = C.generateErrorNode(State: ErrorState);
702 if (!ErrorNode)
703 return;
704
705 auto BR = std::make_unique<PathSensitiveBugReport>(
706 args: IsTaintBug ? TaintBT : BT, args&: Msgs.Short, args&: Msgs.Full, args&: ErrorNode);
707
708 // FIXME: ideally we would just call trackExpressionValue() and that would
709 // "do the right thing": mark the relevant symbols as interesting, track the
710 // control dependencies and statements storing the relevant values and add
711 // helpful diagnostic pieces. However, right now trackExpressionValue() is
712 // a heap of unreliable heuristics, so it would cause several issues:
713 // - Interestingness is not applied consistently, e.g. if `array[x+10]`
714 // causes an overflow, then `x` is not marked as interesting.
715 // - We get irrelevant diagnostic pieces, e.g. in the code
716 // `int *p = (int*)malloc(2*sizeof(int)); p[3] = 0;`
717 // it places a "Storing uninitialized value" note on the `malloc` call
718 // (which is technically true, but irrelevant).
719 // If trackExpressionValue() becomes reliable, it should be applied instead
720 // of this custom markPartsInteresting().
721 markPartsInteresting(BR&: *BR, ErrorState, Val: Offset, MarkTaint: IsTaintBug);
722 if (Extent)
723 markPartsInteresting(BR&: *BR, ErrorState, Val: *Extent, MarkTaint: IsTaintBug);
724
725 C.emitReport(R: std::move(BR));
726}
727
728bool ArrayBoundCheckerV2::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) {
729 SourceLocation Loc = S->getBeginLoc();
730 if (!Loc.isMacroID())
731 return false;
732
733 StringRef MacroName = Lexer::getImmediateMacroName(
734 Loc, SM: ACtx.getSourceManager(), LangOpts: ACtx.getLangOpts());
735
736 if (MacroName.size() < 7 || MacroName[0] != 'i' || MacroName[1] != 's')
737 return false;
738
739 return ((MacroName == "isalnum") || (MacroName == "isalpha") ||
740 (MacroName == "isblank") || (MacroName == "isdigit") ||
741 (MacroName == "isgraph") || (MacroName == "islower") ||
742 (MacroName == "isnctrl") || (MacroName == "isprint") ||
743 (MacroName == "ispunct") || (MacroName == "isspace") ||
744 (MacroName == "isupper") || (MacroName == "isxdigit"));
745}
746
747bool ArrayBoundCheckerV2::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
748 ParentMapContext &ParentCtx = ACtx.getParentMapContext();
749 do {
750 const DynTypedNodeList Parents = ParentCtx.getParents(Node: *S);
751 if (Parents.empty())
752 return false;
753 S = Parents[0].get<Stmt>();
754 } while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(Val: S));
755 const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(Val: S);
756 return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf;
757}
758
759bool ArrayBoundCheckerV2::isIdiomaticPastTheEndPtr(const Expr *E,
760 ProgramStateRef State,
761 NonLoc Offset, NonLoc Limit,
762 CheckerContext &C) {
763 if (isa<ArraySubscriptExpr>(Val: E) && isInAddressOf(S: E, ACtx&: C.getASTContext())) {
764 auto [EqualsToThreshold, NotEqualToThreshold] = compareValueToThreshold(
765 State, Value: Offset, Threshold: Limit, SVB&: C.getSValBuilder(), /*CheckEquality=*/true);
766 return EqualsToThreshold && !NotEqualToThreshold;
767 }
768 return false;
769}
770
771void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) {
772 mgr.registerChecker<ArrayBoundCheckerV2>();
773}
774
775bool ento::shouldRegisterArrayBoundCheckerV2(const CheckerManager &mgr) {
776 return true;
777}
778