| 1 | //== ArrayBoundChecker.cpp -------------------------------------------------==// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file defines security.ArrayBound, which is a path-sensitive checker |
| 10 | // that looks for out of bounds access of memory regions. |
| 11 | // |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "clang/AST/CharUnits.h" |
| 15 | #include "clang/AST/ParentMapContext.h" |
| 16 | #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h" |
| 17 | #include "clang/StaticAnalyzer/Checkers/Taint.h" |
| 18 | #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h" |
| 19 | #include "clang/StaticAnalyzer/Core/Checker.h" |
| 20 | #include "clang/StaticAnalyzer/Core/CheckerManager.h" |
| 21 | #include "clang/StaticAnalyzer/Core/PathSensitive/APSIntType.h" |
| 22 | #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" |
| 23 | #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h" |
| 24 | #include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h" |
| 25 | #include "llvm/ADT/APSInt.h" |
| 26 | #include "llvm/Support/FormatVariadic.h" |
| 27 | #include "llvm/Support/raw_ostream.h" |
| 28 | #include <optional> |
| 29 | |
| 30 | using namespace clang; |
| 31 | using namespace ento; |
| 32 | using namespace taint; |
| 33 | using llvm::formatv; |
| 34 | |
| 35 | namespace { |
| 36 | /// If `E` is an array subscript expression with a base that is "clean" (= not |
| 37 | /// modified by pointer arithmetic = the beginning of a memory region), return |
| 38 | /// it as a pointer to ArraySubscriptExpr; otherwise return nullptr. |
| 39 | /// This helper function is used by two separate heuristics that are only valid |
| 40 | /// in these "clean" cases. |
| 41 | static const ArraySubscriptExpr * |
| 42 | getAsCleanArraySubscriptExpr(const Expr *E, const CheckerContext &C) { |
| 43 | const auto *ASE = dyn_cast<ArraySubscriptExpr>(Val: E); |
| 44 | if (!ASE) |
| 45 | return nullptr; |
| 46 | |
| 47 | const MemRegion *SubscriptBaseReg = C.getSVal(S: ASE->getBase()).getAsRegion(); |
| 48 | if (!SubscriptBaseReg) |
| 49 | return nullptr; |
| 50 | |
| 51 | // The base of the subscript expression is affected by pointer arithmetics, |
| 52 | // so we want to report byte offsets instead of indices and we don't want to |
| 53 | // activate the "index is unsigned -> cannot be negative" shortcut. |
| 54 | if (isa<ElementRegion>(Val: SubscriptBaseReg->StripCasts())) |
| 55 | return nullptr; |
| 56 | |
| 57 | return ASE; |
| 58 | } |
| 59 | |
| 60 | /// If `E` is a "clean" array subscript expression, return the type of the |
| 61 | /// accessed element; otherwise return std::nullopt because that's the best (or |
| 62 | /// least bad) option for the diagnostic generation that relies on this. |
| 63 | static std::optional<QualType> determineElementType(const Expr *E, |
| 64 | const CheckerContext &C) { |
| 65 | const auto *ASE = getAsCleanArraySubscriptExpr(E, C); |
| 66 | if (!ASE) |
| 67 | return std::nullopt; |
| 68 | |
| 69 | return ASE->getType(); |
| 70 | } |
| 71 | |
| 72 | static std::optional<int64_t> |
| 73 | determineElementSize(const std::optional<QualType> T, const CheckerContext &C) { |
| 74 | if (!T) |
| 75 | return std::nullopt; |
| 76 | return C.getASTContext().getTypeSizeInChars(T: *T).getQuantity(); |
| 77 | } |
| 78 | |
| 79 | class StateUpdateReporter { |
| 80 | const MemSpaceRegion *Space; |
| 81 | const SubRegion *Reg; |
| 82 | const NonLoc ByteOffsetVal; |
| 83 | const std::optional<QualType> ElementType; |
| 84 | const std::optional<int64_t> ElementSize; |
| 85 | bool AssumedNonNegative = false; |
| 86 | std::optional<NonLoc> AssumedUpperBound = std::nullopt; |
| 87 | |
| 88 | public: |
| 89 | StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E, |
| 90 | CheckerContext &C) |
| 91 | : Space(R->getMemorySpace(State: C.getState())), Reg(R), |
| 92 | ByteOffsetVal(ByteOffsVal), ElementType(determineElementType(E, C)), |
| 93 | ElementSize(determineElementSize(T: ElementType, C)) {} |
| 94 | |
| 95 | void recordNonNegativeAssumption() { AssumedNonNegative = true; } |
| 96 | void recordUpperBoundAssumption(NonLoc UpperBoundVal) { |
| 97 | AssumedUpperBound = UpperBoundVal; |
| 98 | } |
| 99 | |
| 100 | bool assumedNonNegative() { return AssumedNonNegative; } |
| 101 | |
| 102 | const NoteTag *createNoteTag(CheckerContext &C) const; |
| 103 | |
| 104 | private: |
| 105 | std::string getMessage(PathSensitiveBugReport &BR) const; |
| 106 | |
| 107 | /// Return true if information about the value of `Sym` can put constraints |
| 108 | /// on some symbol which is interesting within the bug report `BR`. |
| 109 | /// In particular, this returns true when `Sym` is interesting within `BR`; |
| 110 | /// but it also returns true if `Sym` is an expression that contains integer |
| 111 | /// constants and a single symbolic operand which is interesting (in `BR`). |
| 112 | /// We need to use this instead of plain `BR.isInteresting()` because if we |
| 113 | /// are analyzing code like |
| 114 | /// int array[10]; |
| 115 | /// int f(int arg) { |
| 116 | /// return array[arg] && array[arg + 10]; |
| 117 | /// } |
| 118 | /// then the byte offsets are `arg * 4` and `(arg + 10) * 4`, which are not |
| 119 | /// sub-expressions of each other (but `getSimplifiedOffsets` is smart enough |
| 120 | /// to detect this out of bounds access). |
| 121 | static bool providesInformationAboutInteresting(SymbolRef Sym, |
| 122 | PathSensitiveBugReport &BR); |
| 123 | static bool providesInformationAboutInteresting(SVal SV, |
| 124 | PathSensitiveBugReport &BR) { |
| 125 | return providesInformationAboutInteresting(Sym: SV.getAsSymbol(), BR); |
| 126 | } |
| 127 | }; |
| 128 | |
| 129 | struct Messages { |
| 130 | std::string Short, Full; |
| 131 | }; |
| 132 | |
| 133 | // NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt` |
| 134 | // instead of `PreStmt` because the current implementation passes the whole |
| 135 | // expression to `CheckerContext::getSVal()` which only works after the |
| 136 | // symbolic evaluation of the expression. (To turn them into `PreStmt` |
| 137 | // callbacks, we'd need to duplicate the logic that evaluates these |
| 138 | // expressions.) The `MemberExpr` callback would work as `PreStmt` but it's |
| 139 | // defined as `PostStmt` for the sake of consistency with the other callbacks. |
| 140 | class ArrayBoundChecker : public Checker<check::PostStmt<ArraySubscriptExpr>, |
| 141 | check::PostStmt<UnaryOperator>, |
| 142 | check::PostStmt<MemberExpr>> { |
| 143 | BugType BT{this, "Out-of-bound access" }; |
| 144 | BugType TaintBT{this, "Out-of-bound access" , categories::TaintedData}; |
| 145 | |
| 146 | void performCheck(const Expr *E, CheckerContext &C) const; |
| 147 | |
| 148 | void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs, |
| 149 | NonLoc Offset, std::optional<NonLoc> Extent, |
| 150 | bool IsTaintBug = false) const; |
| 151 | |
| 152 | static void markPartsInteresting(PathSensitiveBugReport &BR, |
| 153 | ProgramStateRef ErrorState, NonLoc Val, |
| 154 | bool MarkTaint); |
| 155 | |
| 156 | static bool isFromCtypeMacro(const Expr *E, ASTContext &AC); |
| 157 | |
| 158 | static bool isOffsetObviouslyNonnegative(const Expr *E, CheckerContext &C); |
| 159 | |
| 160 | static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State, |
| 161 | NonLoc Offset, NonLoc Limit, |
| 162 | CheckerContext &C); |
| 163 | static bool isInAddressOf(const Stmt *S, ASTContext &AC); |
| 164 | |
| 165 | public: |
| 166 | void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const { |
| 167 | performCheck(E, C); |
| 168 | } |
| 169 | void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const { |
| 170 | if (E->getOpcode() == UO_Deref) |
| 171 | performCheck(E, C); |
| 172 | } |
| 173 | void checkPostStmt(const MemberExpr *E, CheckerContext &C) const { |
| 174 | if (E->isArrow()) |
| 175 | performCheck(E: E->getBase(), C); |
| 176 | } |
| 177 | }; |
| 178 | |
| 179 | } // anonymous namespace |
| 180 | |
| 181 | /// For a given Location that can be represented as a symbolic expression |
| 182 | /// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block |
| 183 | /// Arr and the distance of Location from the beginning of Arr (expressed in a |
| 184 | /// NonLoc that specifies the number of CharUnits). Returns nullopt when these |
| 185 | /// cannot be determined. |
| 186 | static std::optional<std::pair<const SubRegion *, NonLoc>> |
| 187 | computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) { |
| 188 | QualType T = SVB.getArrayIndexType(); |
| 189 | auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) { |
| 190 | // We will use this utility to add and multiply values. |
| 191 | return SVB.evalBinOpNN(state: State, op: Op, lhs: L, rhs: R, resultTy: T).getAs<NonLoc>(); |
| 192 | }; |
| 193 | |
| 194 | const SubRegion *OwnerRegion = nullptr; |
| 195 | std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex(); |
| 196 | |
| 197 | const ElementRegion *CurRegion = |
| 198 | dyn_cast_or_null<ElementRegion>(Val: Location.getAsRegion()); |
| 199 | |
| 200 | while (CurRegion) { |
| 201 | const auto Index = CurRegion->getIndex().getAs<NonLoc>(); |
| 202 | if (!Index) |
| 203 | return std::nullopt; |
| 204 | |
| 205 | QualType ElemType = CurRegion->getElementType(); |
| 206 | |
| 207 | // FIXME: The following early return was presumably added to safeguard the |
| 208 | // getTypeSizeInChars() call (which doesn't accept an incomplete type), but |
| 209 | // it seems that `ElemType` cannot be incomplete at this point. |
| 210 | if (ElemType->isIncompleteType()) |
| 211 | return std::nullopt; |
| 212 | |
| 213 | // Calculate Delta = Index * sizeof(ElemType). |
| 214 | NonLoc Size = SVB.makeArrayIndex( |
| 215 | idx: SVB.getContext().getTypeSizeInChars(T: ElemType).getQuantity()); |
| 216 | auto Delta = EvalBinOp(BO_Mul, *Index, Size); |
| 217 | if (!Delta) |
| 218 | return std::nullopt; |
| 219 | |
| 220 | // Perform Offset += Delta. |
| 221 | Offset = EvalBinOp(BO_Add, *Offset, *Delta); |
| 222 | if (!Offset) |
| 223 | return std::nullopt; |
| 224 | |
| 225 | OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>(); |
| 226 | // When this is just another ElementRegion layer, we need to continue the |
| 227 | // offset calculations: |
| 228 | CurRegion = dyn_cast_or_null<ElementRegion>(Val: OwnerRegion); |
| 229 | } |
| 230 | |
| 231 | if (OwnerRegion) |
| 232 | return std::make_pair(x&: OwnerRegion, y&: *Offset); |
| 233 | |
| 234 | return std::nullopt; |
| 235 | } |
| 236 | |
| 237 | // NOTE: This function is the "heart" of this checker. It simplifies |
| 238 | // inequalities with transformations that are valid (and very elementary) in |
| 239 | // pure mathematics, but become invalid if we use them in C++ number model |
| 240 | // where the calculations may overflow. |
| 241 | // Due to the overflow issues I think it's impossible (or at least not |
| 242 | // practical) to integrate this kind of simplification into the resolution of |
| 243 | // arbitrary inequalities (i.e. the code of `evalBinOp`); but this function |
| 244 | // produces valid results when the calculations are handling memory offsets |
| 245 | // and every value is well below SIZE_MAX. |
| 246 | // TODO: This algorithm should be moved to a central location where it's |
| 247 | // available for other checkers that need to compare memory offsets. |
| 248 | // NOTE: the simplification preserves the order of the two operands in a |
| 249 | // mathematical sense, but it may change the result produced by a C++ |
| 250 | // comparison operator (and the automatic type conversions). |
| 251 | // For example, consider a comparison "X+1 < 0", where the LHS is stored as a |
| 252 | // size_t and the RHS is stored in an int. (As size_t is unsigned, this |
| 253 | // comparison is false for all values of "X".) However, the simplification may |
| 254 | // turn it into "X < -1", which is still always false in a mathematical sense, |
| 255 | // but can produce a true result when evaluated by `evalBinOp` (which follows |
| 256 | // the rules of C++ and casts -1 to SIZE_MAX). |
| 257 | static std::pair<NonLoc, nonloc::ConcreteInt> |
| 258 | getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent, |
| 259 | SValBuilder &svalBuilder) { |
| 260 | const llvm::APSInt &extentVal = extent.getValue(); |
| 261 | std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>(); |
| 262 | if (SymVal && SymVal->isExpression()) { |
| 263 | if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(Val: SymVal->getSymbol())) { |
| 264 | llvm::APSInt constant = APSIntType(extentVal).convert(Value: SIE->getRHS()); |
| 265 | switch (SIE->getOpcode()) { |
| 266 | case BO_Mul: |
| 267 | // The constant should never be 0 here, becasue multiplication by zero |
| 268 | // is simplified by the engine. |
| 269 | if ((extentVal % constant) != 0) |
| 270 | return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent); |
| 271 | else |
| 272 | return getSimplifiedOffsets( |
| 273 | offset: nonloc::SymbolVal(SIE->getLHS()), |
| 274 | extent: svalBuilder.makeIntVal(integer: extentVal / constant), svalBuilder); |
| 275 | case BO_Add: |
| 276 | return getSimplifiedOffsets( |
| 277 | offset: nonloc::SymbolVal(SIE->getLHS()), |
| 278 | extent: svalBuilder.makeIntVal(integer: extentVal - constant), svalBuilder); |
| 279 | default: |
| 280 | break; |
| 281 | } |
| 282 | } |
| 283 | } |
| 284 | |
| 285 | return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent); |
| 286 | } |
| 287 | |
| 288 | static bool isNegative(SValBuilder &SVB, ProgramStateRef State, NonLoc Value) { |
| 289 | const llvm::APSInt *MaxV = SVB.getMaxValue(state: State, val: Value); |
| 290 | return MaxV && MaxV->isNegative(); |
| 291 | } |
| 292 | |
| 293 | static bool isUnsigned(SValBuilder &SVB, NonLoc Value) { |
| 294 | QualType T = Value.getType(SVB.getContext()); |
| 295 | return T->isUnsignedIntegerType(); |
| 296 | } |
| 297 | |
| 298 | // Evaluate the comparison Value < Threshold with the help of the custom |
| 299 | // simplification algorithm defined for this checker. Return a pair of states, |
| 300 | // where the first one corresponds to "value below threshold" and the second |
| 301 | // corresponds to "value at or above threshold". Returns {nullptr, nullptr} in |
| 302 | // the case when the evaluation fails. |
| 303 | // If the optional argument CheckEquality is true, then use BO_EQ instead of |
| 304 | // the default BO_LT after consistently applying the same simplification steps. |
| 305 | static std::pair<ProgramStateRef, ProgramStateRef> |
| 306 | compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold, |
| 307 | SValBuilder &SVB, bool CheckEquality = false) { |
| 308 | if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) { |
| 309 | std::tie(args&: Value, args&: Threshold) = |
| 310 | getSimplifiedOffsets(offset: Value, extent: *ConcreteThreshold, svalBuilder&: SVB); |
| 311 | } |
| 312 | |
| 313 | // We want to perform a _mathematical_ comparison between the numbers `Value` |
| 314 | // and `Threshold`; but `evalBinOpNN` evaluates a C/C++ operator that may |
| 315 | // perform automatic conversions. For example the number -1 is less than the |
| 316 | // number 1000, but -1 < `1000ull` will evaluate to `false` because the `int` |
| 317 | // -1 is converted to ULONGLONG_MAX. |
| 318 | // To avoid automatic conversions, we evaluate the "obvious" cases without |
| 319 | // calling `evalBinOpNN`: |
| 320 | if (isNegative(SVB, State, Value) && isUnsigned(SVB, Value: Threshold)) { |
| 321 | if (CheckEquality) { |
| 322 | // negative_value == unsigned_threshold is always false |
| 323 | return {nullptr, State}; |
| 324 | } |
| 325 | // negative_value < unsigned_threshold is always true |
| 326 | return {State, nullptr}; |
| 327 | } |
| 328 | if (isUnsigned(SVB, Value) && isNegative(SVB, State, Value: Threshold)) { |
| 329 | // unsigned_value == negative_threshold and |
| 330 | // unsigned_value < negative_threshold are both always false |
| 331 | return {nullptr, State}; |
| 332 | } |
| 333 | // FIXME: These special cases are sufficient for handling real-world |
| 334 | // comparisons, but in theory there could be contrived situations where |
| 335 | // automatic conversion of a symbolic value (which can be negative and can be |
| 336 | // positive) leads to incorrect results. |
| 337 | // NOTE: We NEED to use the `evalBinOpNN` call in the "common" case, because |
| 338 | // we want to ensure that assumptions coming from this precondition and |
| 339 | // assumptions coming from regular C/C++ operator calls are represented by |
| 340 | // constraints on the same symbolic expression. A solution that would |
| 341 | // evaluate these "mathematical" comparisons through a separate pathway would |
| 342 | // be a step backwards in this sense. |
| 343 | |
| 344 | const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT; |
| 345 | auto BelowThreshold = |
| 346 | SVB.evalBinOpNN(state: State, op: OpKind, lhs: Value, rhs: Threshold, resultTy: SVB.getConditionType()) |
| 347 | .getAs<NonLoc>(); |
| 348 | |
| 349 | if (BelowThreshold) |
| 350 | return State->assume(Cond: *BelowThreshold); |
| 351 | |
| 352 | return {nullptr, nullptr}; |
| 353 | } |
| 354 | |
| 355 | static std::string getRegionName(const MemSpaceRegion *Space, |
| 356 | const SubRegion *Region) { |
| 357 | if (std::string RegName = Region->getDescriptiveName(); !RegName.empty()) |
| 358 | return RegName; |
| 359 | |
| 360 | // Field regions only have descriptive names when their parent has a |
| 361 | // descriptive name; so we provide a fallback representation for them: |
| 362 | if (const auto *FR = Region->getAs<FieldRegion>()) { |
| 363 | if (StringRef Name = FR->getDecl()->getName(); !Name.empty()) |
| 364 | return formatv(Fmt: "the field '{0}'" , Vals&: Name); |
| 365 | return "the unnamed field" ; |
| 366 | } |
| 367 | |
| 368 | if (isa<AllocaRegion>(Val: Region)) |
| 369 | return "the memory returned by 'alloca'" ; |
| 370 | |
| 371 | if (isa<SymbolicRegion>(Val: Region) && isa<HeapSpaceRegion>(Val: Space)) |
| 372 | return "the heap area" ; |
| 373 | |
| 374 | if (isa<StringRegion>(Val: Region)) |
| 375 | return "the string literal" ; |
| 376 | |
| 377 | return "the region" ; |
| 378 | } |
| 379 | |
| 380 | static std::optional<int64_t> getConcreteValue(NonLoc SV) { |
| 381 | if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) { |
| 382 | return ConcreteVal->getValue()->tryExtValue(); |
| 383 | } |
| 384 | return std::nullopt; |
| 385 | } |
| 386 | |
| 387 | static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) { |
| 388 | return SV ? getConcreteValue(SV: *SV) : std::nullopt; |
| 389 | } |
| 390 | |
| 391 | static Messages getPrecedesMsgs(const MemSpaceRegion *Space, |
| 392 | const SubRegion *Region, NonLoc Offset) { |
| 393 | std::string RegName = getRegionName(Space, Region), OffsetStr = "" ; |
| 394 | |
| 395 | if (auto ConcreteOffset = getConcreteValue(SV: Offset)) |
| 396 | OffsetStr = formatv(Fmt: " {0}" , Vals&: ConcreteOffset); |
| 397 | |
| 398 | return { |
| 399 | .Short: formatv(Fmt: "Out of bound access to memory preceding {0}" , Vals&: RegName), |
| 400 | .Full: formatv(Fmt: "Access of {0} at negative byte offset{1}" , Vals&: RegName, Vals&: OffsetStr)}; |
| 401 | } |
| 402 | |
| 403 | /// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if |
| 404 | /// it can be performed (`Divisor` is nonzero and there is no remainder). The |
| 405 | /// values `Val1` and `Val2` may be nullopt and in that case the corresponding |
| 406 | /// division is considered to be successful. |
| 407 | static bool tryDividePair(std::optional<int64_t> &Val1, |
| 408 | std::optional<int64_t> &Val2, int64_t Divisor) { |
| 409 | if (!Divisor) |
| 410 | return false; |
| 411 | const bool Val1HasRemainder = Val1 && *Val1 % Divisor; |
| 412 | const bool Val2HasRemainder = Val2 && *Val2 % Divisor; |
| 413 | if (Val1HasRemainder || Val2HasRemainder) |
| 414 | return false; |
| 415 | if (Val1) |
| 416 | *Val1 /= Divisor; |
| 417 | if (Val2) |
| 418 | *Val2 /= Divisor; |
| 419 | return true; |
| 420 | } |
| 421 | |
| 422 | static Messages getExceedsMsgs(ASTContext &ACtx, const MemSpaceRegion *Space, |
| 423 | const SubRegion *Region, NonLoc Offset, |
| 424 | NonLoc Extent, SVal Location, |
| 425 | bool AlsoMentionUnderflow) { |
| 426 | std::string RegName = getRegionName(Space, Region); |
| 427 | const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>(); |
| 428 | assert(EReg && "this checker only handles element access" ); |
| 429 | QualType ElemType = EReg->getElementType(); |
| 430 | |
| 431 | std::optional<int64_t> OffsetN = getConcreteValue(SV: Offset); |
| 432 | std::optional<int64_t> ExtentN = getConcreteValue(SV: Extent); |
| 433 | |
| 434 | int64_t ElemSize = ACtx.getTypeSizeInChars(T: ElemType).getQuantity(); |
| 435 | |
| 436 | bool UseByteOffsets = !tryDividePair(Val1&: OffsetN, Val2&: ExtentN, Divisor: ElemSize); |
| 437 | const char *OffsetOrIndex = UseByteOffsets ? "byte offset" : "index" ; |
| 438 | |
| 439 | SmallString<256> Buf; |
| 440 | llvm::raw_svector_ostream Out(Buf); |
| 441 | Out << "Access of " ; |
| 442 | if (!ExtentN && !UseByteOffsets) |
| 443 | Out << "'" << ElemType.getAsString() << "' element in " ; |
| 444 | Out << RegName << " at " ; |
| 445 | if (AlsoMentionUnderflow) { |
| 446 | Out << "a negative or overflowing " << OffsetOrIndex; |
| 447 | } else if (OffsetN) { |
| 448 | Out << OffsetOrIndex << " " << *OffsetN; |
| 449 | } else { |
| 450 | Out << "an overflowing " << OffsetOrIndex; |
| 451 | } |
| 452 | if (ExtentN) { |
| 453 | Out << ", while it holds only " ; |
| 454 | if (*ExtentN != 1) |
| 455 | Out << *ExtentN; |
| 456 | else |
| 457 | Out << "a single" ; |
| 458 | if (UseByteOffsets) |
| 459 | Out << " byte" ; |
| 460 | else |
| 461 | Out << " '" << ElemType.getAsString() << "' element" ; |
| 462 | |
| 463 | if (*ExtentN > 1) |
| 464 | Out << "s" ; |
| 465 | } |
| 466 | |
| 467 | return {.Short: formatv(Fmt: "Out of bound access to memory {0} {1}" , |
| 468 | Vals: AlsoMentionUnderflow ? "around" : "after the end of" , |
| 469 | Vals&: RegName), |
| 470 | .Full: std::string(Buf)}; |
| 471 | } |
| 472 | |
| 473 | static Messages getTaintMsgs(const MemSpaceRegion *Space, |
| 474 | const SubRegion *Region, const char *OffsetName, |
| 475 | bool AlsoMentionUnderflow) { |
| 476 | std::string RegName = getRegionName(Space, Region); |
| 477 | return {.Short: formatv(Fmt: "Potential out of bound access to {0} with tainted {1}" , |
| 478 | Vals&: RegName, Vals&: OffsetName), |
| 479 | .Full: formatv(Fmt: "Access of {0} with a tainted {1} that may be {2}too large" , |
| 480 | Vals&: RegName, Vals&: OffsetName, |
| 481 | Vals: AlsoMentionUnderflow ? "negative or " : "" )}; |
| 482 | } |
| 483 | |
| 484 | const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const { |
| 485 | // Don't create a note tag if we didn't assume anything: |
| 486 | if (!AssumedNonNegative && !AssumedUpperBound) |
| 487 | return nullptr; |
| 488 | |
| 489 | return C.getNoteTag(Cb: [*this](PathSensitiveBugReport &BR) -> std::string { |
| 490 | return getMessage(BR); |
| 491 | }); |
| 492 | } |
| 493 | |
| 494 | std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const { |
| 495 | bool ShouldReportNonNegative = AssumedNonNegative; |
| 496 | if (!providesInformationAboutInteresting(SV: ByteOffsetVal, BR)) { |
| 497 | if (AssumedUpperBound && |
| 498 | providesInformationAboutInteresting(SV: *AssumedUpperBound, BR)) { |
| 499 | // Even if the byte offset isn't interesting (e.g. it's a constant value), |
| 500 | // the assumption can still be interesting if it provides information |
| 501 | // about an interesting symbolic upper bound. |
| 502 | ShouldReportNonNegative = false; |
| 503 | } else { |
| 504 | // We don't have anything interesting, don't report the assumption. |
| 505 | return "" ; |
| 506 | } |
| 507 | } |
| 508 | |
| 509 | std::optional<int64_t> OffsetN = getConcreteValue(SV: ByteOffsetVal); |
| 510 | std::optional<int64_t> ExtentN = getConcreteValue(SV: AssumedUpperBound); |
| 511 | |
| 512 | const bool UseIndex = |
| 513 | ElementSize && tryDividePair(Val1&: OffsetN, Val2&: ExtentN, Divisor: *ElementSize); |
| 514 | |
| 515 | SmallString<256> Buf; |
| 516 | llvm::raw_svector_ostream Out(Buf); |
| 517 | Out << "Assuming " ; |
| 518 | if (UseIndex) { |
| 519 | Out << "index " ; |
| 520 | if (OffsetN) |
| 521 | Out << "'" << OffsetN << "' " ; |
| 522 | } else if (AssumedUpperBound) { |
| 523 | Out << "byte offset " ; |
| 524 | if (OffsetN) |
| 525 | Out << "'" << OffsetN << "' " ; |
| 526 | } else { |
| 527 | Out << "offset " ; |
| 528 | } |
| 529 | |
| 530 | Out << "is" ; |
| 531 | if (ShouldReportNonNegative) { |
| 532 | Out << " non-negative" ; |
| 533 | } |
| 534 | if (AssumedUpperBound) { |
| 535 | if (ShouldReportNonNegative) |
| 536 | Out << " and" ; |
| 537 | Out << " less than " ; |
| 538 | if (ExtentN) |
| 539 | Out << *ExtentN << ", " ; |
| 540 | if (UseIndex && ElementType) |
| 541 | Out << "the number of '" << ElementType->getAsString() |
| 542 | << "' elements in " ; |
| 543 | else |
| 544 | Out << "the extent of " ; |
| 545 | Out << getRegionName(Space, Region: Reg); |
| 546 | } |
| 547 | return std::string(Out.str()); |
| 548 | } |
| 549 | |
| 550 | bool StateUpdateReporter::providesInformationAboutInteresting( |
| 551 | SymbolRef Sym, PathSensitiveBugReport &BR) { |
| 552 | if (!Sym) |
| 553 | return false; |
| 554 | for (SymbolRef PartSym : Sym->symbols()) { |
| 555 | // The interestingess mark may appear on any layer as we're stripping off |
| 556 | // the SymIntExpr, UnarySymExpr etc. layers... |
| 557 | if (BR.isInteresting(sym: PartSym)) |
| 558 | return true; |
| 559 | // ...but if both sides of the expression are symbolic, then there is no |
| 560 | // practical algorithm to produce separate constraints for the two |
| 561 | // operands (from the single combined result). |
| 562 | if (isa<SymSymExpr>(Val: PartSym)) |
| 563 | return false; |
| 564 | } |
| 565 | return false; |
| 566 | } |
| 567 | |
| 568 | void ArrayBoundChecker::performCheck(const Expr *E, CheckerContext &C) const { |
| 569 | const SVal Location = C.getSVal(S: E); |
| 570 | |
| 571 | // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as |
| 572 | // #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX) |
| 573 | // and incomplete analysis of these leads to false positives. As even |
| 574 | // accurate reports would be confusing for the users, just disable reports |
| 575 | // from these macros: |
| 576 | if (isFromCtypeMacro(E, AC&: C.getASTContext())) |
| 577 | return; |
| 578 | |
| 579 | ProgramStateRef State = C.getState(); |
| 580 | SValBuilder &SVB = C.getSValBuilder(); |
| 581 | |
| 582 | const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset = |
| 583 | computeOffset(State, SVB, Location); |
| 584 | |
| 585 | if (!RawOffset) |
| 586 | return; |
| 587 | |
| 588 | auto [Reg, ByteOffset] = *RawOffset; |
| 589 | |
| 590 | // The state updates will be reported as a single note tag, which will be |
| 591 | // composed by this helper class. |
| 592 | StateUpdateReporter SUR(Reg, ByteOffset, E, C); |
| 593 | |
| 594 | // CHECK LOWER BOUND |
| 595 | const MemSpaceRegion *Space = Reg->getMemorySpace(State); |
| 596 | if (!(isa<SymbolicRegion>(Val: Reg) && isa<UnknownSpaceRegion>(Val: Space))) { |
| 597 | // A symbolic region in unknown space represents an unknown pointer that |
| 598 | // may point into the middle of an array, so we don't look for underflows. |
| 599 | // Both conditions are significant because we want to check underflows in |
| 600 | // symbolic regions on the heap (which may be introduced by checkers like |
| 601 | // MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and |
| 602 | // non-symbolic regions (e.g. a field subregion of a symbolic region) in |
| 603 | // unknown space. |
| 604 | auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold( |
| 605 | State, Value: ByteOffset, Threshold: SVB.makeZeroArrayIndex(), SVB); |
| 606 | |
| 607 | if (PrecedesLowerBound) { |
| 608 | // The analyzer thinks that the offset may be invalid (negative)... |
| 609 | |
| 610 | if (isOffsetObviouslyNonnegative(E, C)) { |
| 611 | // ...but the offset is obviously non-negative (clear array subscript |
| 612 | // with an unsigned index), so we're in a buggy situation. |
| 613 | |
| 614 | // TODO: Currently the analyzer ignores many casts (e.g. signed -> |
| 615 | // unsigned casts), so it can easily reach states where it will load a |
| 616 | // signed (and negative) value from an unsigned variable. This sanity |
| 617 | // check is a duct tape "solution" that silences most of the ugly false |
| 618 | // positives that are caused by this buggy behavior. Note that this is |
| 619 | // not a complete solution: this cannot silence reports where pointer |
| 620 | // arithmetic complicates the picture and cannot ensure modeling of the |
| 621 | // "unsigned index is positive with highest bit set" cases which are |
| 622 | // "usurped" by the nonsense "unsigned index is negative" case. |
| 623 | // For more information about this topic, see the umbrella ticket |
| 624 | // https://github.com/llvm/llvm-project/issues/39492 |
| 625 | // TODO: Remove this hack once 'SymbolCast's are modeled properly. |
| 626 | |
| 627 | if (!WithinLowerBound) { |
| 628 | // The state is completely nonsense -- let's just sink it! |
| 629 | C.addSink(); |
| 630 | return; |
| 631 | } |
| 632 | // Otherwise continue on the 'WithinLowerBound' branch where the |
| 633 | // unsigned index _is_ non-negative. Don't mention this assumption as a |
| 634 | // note tag, because it would just confuse the users! |
| 635 | } else { |
| 636 | if (!WithinLowerBound) { |
| 637 | // ...and it cannot be valid (>= 0), so report an error. |
| 638 | Messages Msgs = getPrecedesMsgs(Space, Region: Reg, Offset: ByteOffset); |
| 639 | reportOOB(C, ErrorState: PrecedesLowerBound, Msgs, Offset: ByteOffset, Extent: std::nullopt); |
| 640 | return; |
| 641 | } |
| 642 | // ...but it can be valid as well, so the checker will (optimistically) |
| 643 | // assume that it's valid and mention this in the note tag. |
| 644 | SUR.recordNonNegativeAssumption(); |
| 645 | } |
| 646 | } |
| 647 | |
| 648 | // Actually update the state. The "if" only fails in the extremely unlikely |
| 649 | // case when compareValueToThreshold returns {nullptr, nullptr} because |
| 650 | // evalBinOpNN fails to evaluate the less-than operator. |
| 651 | if (WithinLowerBound) |
| 652 | State = WithinLowerBound; |
| 653 | } |
| 654 | |
| 655 | // CHECK UPPER BOUND |
| 656 | DefinedOrUnknownSVal Size = getDynamicExtent(State, MR: Reg, SVB); |
| 657 | if (auto KnownSize = Size.getAs<NonLoc>()) { |
| 658 | // In a situation where both underflow and overflow are possible (but the |
| 659 | // index is either tainted or known to be invalid), the logic of this |
| 660 | // checker will first assume that the offset is non-negative, and then |
| 661 | // (with this additional assumption) it will detect an overflow error. |
| 662 | // In this situation the warning message should mention both possibilities. |
| 663 | bool AlsoMentionUnderflow = SUR.assumedNonNegative(); |
| 664 | |
| 665 | auto [WithinUpperBound, ExceedsUpperBound] = |
| 666 | compareValueToThreshold(State, Value: ByteOffset, Threshold: *KnownSize, SVB); |
| 667 | |
| 668 | if (ExceedsUpperBound) { |
| 669 | // The offset may be invalid (>= Size)... |
| 670 | if (!WithinUpperBound) { |
| 671 | // ...and it cannot be within bounds, so report an error, unless we can |
| 672 | // definitely determine that this is an idiomatic `&array[size]` |
| 673 | // expression that calculates the past-the-end pointer. |
| 674 | if (isIdiomaticPastTheEndPtr(E, State: ExceedsUpperBound, Offset: ByteOffset, |
| 675 | Limit: *KnownSize, C)) { |
| 676 | C.addTransition(State: ExceedsUpperBound, Tag: SUR.createNoteTag(C)); |
| 677 | return; |
| 678 | } |
| 679 | |
| 680 | Messages Msgs = |
| 681 | getExceedsMsgs(ACtx&: C.getASTContext(), Space, Region: Reg, Offset: ByteOffset, |
| 682 | Extent: *KnownSize, Location, AlsoMentionUnderflow); |
| 683 | reportOOB(C, ErrorState: ExceedsUpperBound, Msgs, Offset: ByteOffset, Extent: KnownSize); |
| 684 | return; |
| 685 | } |
| 686 | // ...and it can be valid as well... |
| 687 | if (isTainted(State, V: ByteOffset)) { |
| 688 | // ...but it's tainted, so report an error. |
| 689 | |
| 690 | // Diagnostic detail: saying "tainted offset" is always correct, but |
| 691 | // the common case is that 'idx' is tainted in 'arr[idx]' and then it's |
| 692 | // nicer to say "tainted index". |
| 693 | const char *OffsetName = "offset" ; |
| 694 | if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(Val: E)) |
| 695 | if (isTainted(State, S: ASE->getIdx(), LCtx: C.getLocationContext())) |
| 696 | OffsetName = "index" ; |
| 697 | |
| 698 | Messages Msgs = |
| 699 | getTaintMsgs(Space, Region: Reg, OffsetName, AlsoMentionUnderflow); |
| 700 | reportOOB(C, ErrorState: ExceedsUpperBound, Msgs, Offset: ByteOffset, Extent: KnownSize, |
| 701 | /*IsTaintBug=*/true); |
| 702 | return; |
| 703 | } |
| 704 | // ...and it isn't tainted, so the checker will (optimistically) assume |
| 705 | // that the offset is in bounds and mention this in the note tag. |
| 706 | SUR.recordUpperBoundAssumption(UpperBoundVal: *KnownSize); |
| 707 | } |
| 708 | |
| 709 | // Actually update the state. The "if" only fails in the extremely unlikely |
| 710 | // case when compareValueToThreshold returns {nullptr, nullptr} because |
| 711 | // evalBinOpNN fails to evaluate the less-than operator. |
| 712 | if (WithinUpperBound) |
| 713 | State = WithinUpperBound; |
| 714 | } |
| 715 | |
| 716 | // Add a transition, reporting the state updates that we accumulated. |
| 717 | C.addTransition(State, Tag: SUR.createNoteTag(C)); |
| 718 | } |
| 719 | |
| 720 | void ArrayBoundChecker::markPartsInteresting(PathSensitiveBugReport &BR, |
| 721 | ProgramStateRef ErrorState, |
| 722 | NonLoc Val, bool MarkTaint) { |
| 723 | if (SymbolRef Sym = Val.getAsSymbol()) { |
| 724 | // If the offset is a symbolic value, iterate over its "parts" with |
| 725 | // `SymExpr::symbols()` and mark each of them as interesting. |
| 726 | // For example, if the offset is `x*4 + y` then we put interestingness onto |
| 727 | // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols |
| 728 | // `x` and `y`. |
| 729 | for (SymbolRef PartSym : Sym->symbols()) |
| 730 | BR.markInteresting(sym: PartSym); |
| 731 | } |
| 732 | |
| 733 | if (MarkTaint) { |
| 734 | // If the issue that we're reporting depends on the taintedness of the |
| 735 | // offset, then put interestingness onto symbols that could be the origin |
| 736 | // of the taint. Note that this may find symbols that did not appear in |
| 737 | // `Sym->symbols()` (because they're only loosely connected to `Val`). |
| 738 | for (SymbolRef Sym : getTaintedSymbols(State: ErrorState, V: Val)) |
| 739 | BR.markInteresting(sym: Sym); |
| 740 | } |
| 741 | } |
| 742 | |
| 743 | void ArrayBoundChecker::reportOOB(CheckerContext &C, ProgramStateRef ErrorState, |
| 744 | Messages Msgs, NonLoc Offset, |
| 745 | std::optional<NonLoc> Extent, |
| 746 | bool IsTaintBug /*=false*/) const { |
| 747 | |
| 748 | ExplodedNode *ErrorNode = C.generateErrorNode(State: ErrorState); |
| 749 | if (!ErrorNode) |
| 750 | return; |
| 751 | |
| 752 | auto BR = std::make_unique<PathSensitiveBugReport>( |
| 753 | args: IsTaintBug ? TaintBT : BT, args&: Msgs.Short, args&: Msgs.Full, args&: ErrorNode); |
| 754 | |
| 755 | // FIXME: ideally we would just call trackExpressionValue() and that would |
| 756 | // "do the right thing": mark the relevant symbols as interesting, track the |
| 757 | // control dependencies and statements storing the relevant values and add |
| 758 | // helpful diagnostic pieces. However, right now trackExpressionValue() is |
| 759 | // a heap of unreliable heuristics, so it would cause several issues: |
| 760 | // - Interestingness is not applied consistently, e.g. if `array[x+10]` |
| 761 | // causes an overflow, then `x` is not marked as interesting. |
| 762 | // - We get irrelevant diagnostic pieces, e.g. in the code |
| 763 | // `int *p = (int*)malloc(2*sizeof(int)); p[3] = 0;` |
| 764 | // it places a "Storing uninitialized value" note on the `malloc` call |
| 765 | // (which is technically true, but irrelevant). |
| 766 | // If trackExpressionValue() becomes reliable, it should be applied instead |
| 767 | // of this custom markPartsInteresting(). |
| 768 | markPartsInteresting(BR&: *BR, ErrorState, Val: Offset, MarkTaint: IsTaintBug); |
| 769 | if (Extent) |
| 770 | markPartsInteresting(BR&: *BR, ErrorState, Val: *Extent, MarkTaint: IsTaintBug); |
| 771 | |
| 772 | C.emitReport(R: std::move(BR)); |
| 773 | } |
| 774 | |
| 775 | bool ArrayBoundChecker::isFromCtypeMacro(const Expr *E, ASTContext &ACtx) { |
| 776 | SourceLocation Loc = E->getBeginLoc(); |
| 777 | if (!Loc.isMacroID()) |
| 778 | return false; |
| 779 | |
| 780 | StringRef MacroName = Lexer::getImmediateMacroName( |
| 781 | Loc, SM: ACtx.getSourceManager(), LangOpts: ACtx.getLangOpts()); |
| 782 | |
| 783 | if (MacroName.size() < 7 || MacroName[0] != 'i' || MacroName[1] != 's') |
| 784 | return false; |
| 785 | |
| 786 | return ((MacroName == "isalnum" ) || (MacroName == "isalpha" ) || |
| 787 | (MacroName == "isblank" ) || (MacroName == "isdigit" ) || |
| 788 | (MacroName == "isgraph" ) || (MacroName == "islower" ) || |
| 789 | (MacroName == "isnctrl" ) || (MacroName == "isprint" ) || |
| 790 | (MacroName == "ispunct" ) || (MacroName == "isspace" ) || |
| 791 | (MacroName == "isupper" ) || (MacroName == "isxdigit" )); |
| 792 | } |
| 793 | |
| 794 | bool ArrayBoundChecker::isOffsetObviouslyNonnegative(const Expr *E, |
| 795 | CheckerContext &C) { |
| 796 | const ArraySubscriptExpr *ASE = getAsCleanArraySubscriptExpr(E, C); |
| 797 | if (!ASE) |
| 798 | return false; |
| 799 | return ASE->getIdx()->getType()->isUnsignedIntegerOrEnumerationType(); |
| 800 | } |
| 801 | |
| 802 | bool ArrayBoundChecker::isInAddressOf(const Stmt *S, ASTContext &ACtx) { |
| 803 | ParentMapContext &ParentCtx = ACtx.getParentMapContext(); |
| 804 | do { |
| 805 | const DynTypedNodeList Parents = ParentCtx.getParents(Node: *S); |
| 806 | if (Parents.empty()) |
| 807 | return false; |
| 808 | S = Parents[0].get<Stmt>(); |
| 809 | } while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(Val: S)); |
| 810 | const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(Val: S); |
| 811 | return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf; |
| 812 | } |
| 813 | |
| 814 | bool ArrayBoundChecker::isIdiomaticPastTheEndPtr(const Expr *E, |
| 815 | ProgramStateRef State, |
| 816 | NonLoc Offset, NonLoc Limit, |
| 817 | CheckerContext &C) { |
| 818 | if (isa<ArraySubscriptExpr>(Val: E) && isInAddressOf(S: E, ACtx&: C.getASTContext())) { |
| 819 | auto [EqualsToThreshold, NotEqualToThreshold] = compareValueToThreshold( |
| 820 | State, Value: Offset, Threshold: Limit, SVB&: C.getSValBuilder(), /*CheckEquality=*/true); |
| 821 | return EqualsToThreshold && !NotEqualToThreshold; |
| 822 | } |
| 823 | return false; |
| 824 | } |
| 825 | |
| 826 | void ento::registerArrayBoundChecker(CheckerManager &mgr) { |
| 827 | mgr.registerChecker<ArrayBoundChecker>(); |
| 828 | } |
| 829 | |
| 830 | bool ento::shouldRegisterArrayBoundChecker(const CheckerManager &mgr) { |
| 831 | return true; |
| 832 | } |
| 833 | |