| 1 | //===-- cas-fuzzer.cpp - Fuzzer for CAS ObjectStore::validate() -----------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // Fuzzer for on-disk CAS validation. Creates a valid CAS database, stores |
| 10 | // objects, corrupts the on-disk files using fuzzer-provided bytes, then calls |
| 11 | // validate(). The invariant: validate() must either succeed or return an error, |
| 12 | // never crash. |
| 13 | // |
| 14 | //===----------------------------------------------------------------------===// |
| 15 | |
| 16 | #include "llvm/ADT/ScopeExit.h" |
| 17 | #include "llvm/CAS/ActionCache.h" |
| 18 | #include "llvm/CAS/BuiltinUnifiedCASDatabases.h" |
| 19 | #include "llvm/CAS/ObjectStore.h" |
| 20 | #include "llvm/Support/Error.h" |
| 21 | #include "llvm/Support/FileSystem.h" |
| 22 | #include "llvm/Support/MemoryBuffer.h" |
| 23 | #include "llvm/Support/Path.h" |
| 24 | #include "llvm/Support/ScopedPrinter.h" |
| 25 | #include "llvm/Support/raw_ostream.h" |
| 26 | #include <cstdint> |
| 27 | #include <cstring> |
| 28 | |
| 29 | using namespace llvm; |
| 30 | using namespace llvm::cas; |
| 31 | |
| 32 | namespace { |
| 33 | |
| 34 | /// Read a little-endian uint32 from Data, or 0 if not enough bytes. |
| 35 | static uint32_t readU32(ArrayRef<uint8_t> Data, size_t Offset) { |
| 36 | if (Offset + sizeof(uint32_t) > Data.size()) |
| 37 | return 0; |
| 38 | return support::endian::read32le(P: Data.data() + Offset); |
| 39 | } |
| 40 | |
| 41 | /// Read a little-endian uint16 from Data, or 0 if not enough bytes. |
| 42 | static uint16_t readU16(ArrayRef<uint8_t> Data, size_t Offset) { |
| 43 | if (Offset + sizeof(uint16_t) > Data.size()) |
| 44 | return 0; |
| 45 | return support::endian::read16le(P: Data.data() + Offset); |
| 46 | } |
| 47 | |
| 48 | /// Find the versioned subdirectory (v1.N) inside the CAS root. |
| 49 | static std::string findVersionedSubdir(StringRef CASDir) { |
| 50 | std::error_code EC; |
| 51 | std::string Best; |
| 52 | uint64_t BestOrder = 0; |
| 53 | for (sys::fs::directory_iterator DirI(CASDir, EC), DirE; !EC && DirI != DirE; |
| 54 | DirI.increment(ec&: EC)) { |
| 55 | if (DirI->type() != sys::fs::file_type::directory_file) |
| 56 | continue; |
| 57 | StringRef Name = sys::path::filename(path: DirI->path()); |
| 58 | if (!Name.starts_with(Prefix: "v1." )) |
| 59 | continue; |
| 60 | uint64_t Order; |
| 61 | if (Name.substr(Start: 3).getAsInteger(Radix: 10, Result&: Order)) |
| 62 | continue; |
| 63 | if (Best.empty() || Order > BestOrder) { |
| 64 | Best = DirI->path(); |
| 65 | BestOrder = Order; |
| 66 | } |
| 67 | } |
| 68 | return Best; |
| 69 | } |
| 70 | |
| 71 | /// Collect paths of files matching a prefix in a directory. |
| 72 | static void collectFilesWithPrefix(StringRef Dir, StringRef Prefix, |
| 73 | SmallVectorImpl<std::string> &Results) { |
| 74 | std::error_code EC; |
| 75 | for (sys::fs::directory_iterator DirI(Dir, EC), DirE; !EC && DirI != DirE; |
| 76 | DirI.increment(ec&: EC)) { |
| 77 | StringRef Name = sys::path::filename(path: DirI->path()); |
| 78 | if (Name.starts_with(Prefix)) |
| 79 | Results.push_back(Elt: DirI->path()); |
| 80 | } |
| 81 | } |
| 82 | |
| 83 | /// Read an entire file into a buffer. |
| 84 | static bool readFileBytes(StringRef Path, SmallVectorImpl<char> &Buf) { |
| 85 | auto MBOrErr = MemoryBuffer::getFile(Filename: Path, /*IsText=*/false, |
| 86 | /*RequiresNullTerminator=*/false); |
| 87 | if (!MBOrErr) |
| 88 | return false; |
| 89 | Buf.assign(in_start: (*MBOrErr)->getBufferStart(), in_end: (*MBOrErr)->getBufferEnd()); |
| 90 | return true; |
| 91 | } |
| 92 | |
| 93 | /// Write buffer contents to a file, replacing it entirely. |
| 94 | static bool writeFileBytes(StringRef Path, ArrayRef<char> Buf) { |
| 95 | std::error_code EC; |
| 96 | raw_fd_ostream OS(Path, EC, sys::fs::OF_None); |
| 97 | if (EC) |
| 98 | return false; |
| 99 | OS.write(Ptr: Buf.data(), Size: Buf.size()); |
| 100 | return !OS.has_error(); |
| 101 | } |
| 102 | |
| 103 | /// Create a CAS database and store some baseline objects. |
| 104 | /// Returns true on success, populating CAS and AC via output parameters. |
| 105 | static bool createAndPopulateCAS(StringRef TmpDir, |
| 106 | std::unique_ptr<ObjectStore> &CAS, |
| 107 | std::unique_ptr<ActionCache> &AC) { |
| 108 | auto Result = createOnDiskUnifiedCASDatabases(Path: TmpDir); |
| 109 | if (!Result) { |
| 110 | consumeError(Err: Result.takeError()); |
| 111 | return false; |
| 112 | } |
| 113 | CAS = std::move(Result->first); |
| 114 | AC = std::move(Result->second); |
| 115 | |
| 116 | // Store a leaf node (no refs, small data). |
| 117 | const char LeafData[] = "hello-cas-fuzzer-leaf-data" ; |
| 118 | auto Leaf = CAS->store(Refs: {}, Data: arrayRefFromStringRef<char>(Input: LeafData)); |
| 119 | if (!Leaf) { |
| 120 | consumeError(Err: Leaf.takeError()); |
| 121 | return false; |
| 122 | } |
| 123 | |
| 124 | // Store a node with a ref to the leaf. |
| 125 | const char NodeData[] = "node-with-one-ref" ; |
| 126 | auto Node1 = CAS->store(Refs: {*Leaf}, Data: arrayRefFromStringRef<char>(Input: NodeData)); |
| 127 | if (!Node1) { |
| 128 | consumeError(Err: Node1.takeError()); |
| 129 | return false; |
| 130 | } |
| 131 | |
| 132 | // Store a node referencing both previous nodes. |
| 133 | const char Node2Data[] = "node-with-two-refs" ; |
| 134 | auto Node2 = |
| 135 | CAS->store(Refs: {*Leaf, *Node1}, Data: arrayRefFromStringRef<char>(Input: Node2Data)); |
| 136 | if (!Node2) { |
| 137 | consumeError(Err: Node2.takeError()); |
| 138 | return false; |
| 139 | } |
| 140 | |
| 141 | // Store a larger data node to potentially exercise different size encodings. |
| 142 | std::string LargeData(4096, 'X'); |
| 143 | auto LargeNode = |
| 144 | CAS->store(Refs: {}, Data: arrayRefFromStringRef<char>(Input: StringRef(LargeData))); |
| 145 | if (!LargeNode) { |
| 146 | consumeError(Err: LargeNode.takeError()); |
| 147 | return false; |
| 148 | } |
| 149 | |
| 150 | return true; |
| 151 | } |
| 152 | |
| 153 | /// Apply byte-level mutations to a file. |
| 154 | static void applyByteMutations(StringRef Path, ArrayRef<uint8_t> Data) { |
| 155 | SmallVector<char> Buf; |
| 156 | if (!readFileBytes(Path, Buf) || Buf.empty()) |
| 157 | return; |
| 158 | |
| 159 | // Parse as 7-byte chunks: [offset(4)][op(1)][value(1)][unused(1)] |
| 160 | for (size_t I = 0; I + 6 <= Data.size(); I += 7) { |
| 161 | uint32_t Offset = readU32(Data, Offset: I) % Buf.size(); |
| 162 | uint8_t Op = Data[I + 4] % 3; |
| 163 | uint8_t Value = Data[I + 5]; |
| 164 | switch (Op) { |
| 165 | case 0: // XOR |
| 166 | Buf[Offset] ^= Value; |
| 167 | break; |
| 168 | case 1: // SET |
| 169 | Buf[Offset] = Value; |
| 170 | break; |
| 171 | case 2: // Zero |
| 172 | Buf[Offset] = 0; |
| 173 | break; |
| 174 | } |
| 175 | } |
| 176 | writeFileBytes(Path, Buf); |
| 177 | } |
| 178 | |
| 179 | /// Truncate a file to a given fraction of its size. |
| 180 | static void truncateFile(StringRef Path, uint8_t Fraction) { |
| 181 | SmallVector<char> Buf; |
| 182 | if (!readFileBytes(Path, Buf) || Buf.empty()) |
| 183 | return; |
| 184 | // Fraction is 0-255, map to 0-100% of file size. |
| 185 | size_t NewSize = |
| 186 | static_cast<size_t>(static_cast<uint64_t>(Buf.size()) * Fraction / 255); |
| 187 | // Don't zero out the size. |
| 188 | if (NewSize == 0) |
| 189 | NewSize = 1; |
| 190 | Buf.resize(N: NewSize); |
| 191 | writeFileBytes(Path, Buf); |
| 192 | } |
| 193 | |
| 194 | /// Append garbage bytes to a file. |
| 195 | static void appendGarbage(StringRef Path, ArrayRef<uint8_t> Data) { |
| 196 | SmallVector<char> Buf; |
| 197 | if (!readFileBytes(Path, Buf)) |
| 198 | return; |
| 199 | Buf.append(in_start: Data.begin(), in_end: Data.end()); |
| 200 | writeFileBytes(Path, Buf); |
| 201 | } |
| 202 | |
| 203 | /// Zero out a range in a file. |
| 204 | static void zeroRange(StringRef Path, uint32_t Offset, uint16_t Length) { |
| 205 | SmallVector<char> Buf; |
| 206 | if (!readFileBytes(Path, Buf) || Buf.empty()) |
| 207 | return; |
| 208 | size_t Start = Offset % Buf.size(); |
| 209 | size_t End = std::min(a: Start + static_cast<size_t>(Length), b: Buf.size()); |
| 210 | std::memset(s: Buf.data() + Start, c: 0, n: End - Start); |
| 211 | writeFileBytes(Path, Buf); |
| 212 | } |
| 213 | |
| 214 | /// Corrupt standalone files (obj.*, leaf.*, leaf+0.*). |
| 215 | static void corruptStandaloneFiles(StringRef SubDir, ArrayRef<uint8_t> Data) { |
| 216 | SmallVector<std::string> StandaloneFiles; |
| 217 | collectFilesWithPrefix(Dir: SubDir, Prefix: "obj." , Results&: StandaloneFiles); |
| 218 | collectFilesWithPrefix(Dir: SubDir, Prefix: "leaf." , Results&: StandaloneFiles); |
| 219 | collectFilesWithPrefix(Dir: SubDir, Prefix: "leaf+0." , Results&: StandaloneFiles); |
| 220 | |
| 221 | if (StandaloneFiles.empty()) |
| 222 | return; |
| 223 | |
| 224 | for (size_t I = 0; I < Data.size() && !StandaloneFiles.empty(); I += 3) { |
| 225 | size_t FileIdx = Data[I] % StandaloneFiles.size(); |
| 226 | uint8_t Action = (I + 1 < Data.size()) ? Data[I + 1] % 4 : 0; |
| 227 | uint8_t Param = (I + 2 < Data.size()) ? Data[I + 2] : 128; |
| 228 | |
| 229 | StringRef FilePath = StandaloneFiles[FileIdx]; |
| 230 | switch (Action) { |
| 231 | case 0: // Delete the file |
| 232 | sys::fs::remove(path: FilePath); |
| 233 | break; |
| 234 | case 1: // Truncate |
| 235 | truncateFile(Path: FilePath, Fraction: Param); |
| 236 | break; |
| 237 | case 2: // Corrupt bytes |
| 238 | if (I + 3 < Data.size()) |
| 239 | applyByteMutations( |
| 240 | Path: FilePath, Data: Data.slice(N: I + 3, M: std::min(a: Data.size() - I - 3, |
| 241 | b: static_cast<size_t>(21)))); |
| 242 | break; |
| 243 | case 3: // Zero out beginning |
| 244 | zeroRange(Path: FilePath, Offset: 0, Length: Param); |
| 245 | break; |
| 246 | } |
| 247 | } |
| 248 | } |
| 249 | |
| 250 | /// Select which data file to target (index.v1 or data.v1). |
| 251 | static std::string selectTargetFile(StringRef SubDir, uint8_t Selector) { |
| 252 | SmallString<256> Path(SubDir); |
| 253 | if (Selector % 2 == 0) |
| 254 | sys::path::append(path&: Path, a: "index.v1" ); |
| 255 | else |
| 256 | sys::path::append(path&: Path, a: "data.v1" ); |
| 257 | return std::string(Path); |
| 258 | } |
| 259 | |
| 260 | /// Try to exercise the CAS after corruption: store and load. |
| 261 | static void exerciseCAS(ObjectStore &CAS) { |
| 262 | // Try storing a new object. |
| 263 | const char NewData[] = "post-corruption-data" ; |
| 264 | auto NewObj = CAS.store(Refs: {}, Data: arrayRefFromStringRef<char>(Input: NewData)); |
| 265 | if (!NewObj) |
| 266 | consumeError(Err: NewObj.takeError()); |
| 267 | |
| 268 | // Try validate again with CheckHash=false. |
| 269 | if (auto E = CAS.validate(/*CheckHash=*/false)) |
| 270 | consumeError(Err: std::move(E)); |
| 271 | } |
| 272 | |
| 273 | } // end anonymous namespace |
| 274 | |
| 275 | extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { |
| 276 | if (Size == 0) |
| 277 | return 0; |
| 278 | |
| 279 | // Create a unique temp directory for this fuzzer run. |
| 280 | SmallString<256> TmpDir; |
| 281 | if (sys::fs::createUniqueDirectory(Prefix: "cas-fuzzer" , ResultPath&: TmpDir)) |
| 282 | return 0; |
| 283 | |
| 284 | // Ensure cleanup on exit. |
| 285 | auto Cleanup = scope_exit([&]() { sys::fs::remove_directories(path: TmpDir); }); |
| 286 | |
| 287 | // Step 1: Create and populate a valid CAS. |
| 288 | std::unique_ptr<ObjectStore> CAS; |
| 289 | std::unique_ptr<ActionCache> AC; |
| 290 | if (!createAndPopulateCAS(TmpDir, CAS, AC)) |
| 291 | return 0; |
| 292 | |
| 293 | // Step 2: Validate baseline - should succeed. |
| 294 | if (auto E = CAS->validate(/*CheckHash=*/true)) { |
| 295 | // If baseline validation fails, something is wrong with the setup. |
| 296 | consumeError(Err: std::move(E)); |
| 297 | return 0; |
| 298 | } |
| 299 | |
| 300 | // Step 3: Close the CAS so files are unmapped. |
| 301 | CAS.reset(); |
| 302 | AC.reset(); |
| 303 | |
| 304 | // Step 4: Find the versioned subdirectory. |
| 305 | std::string SubDir = findVersionedSubdir(CASDir: TmpDir); |
| 306 | if (SubDir.empty()) |
| 307 | return 0; |
| 308 | |
| 309 | // Step 5: Apply corruption based on mode selector (first byte). |
| 310 | ArrayRef<uint8_t> Input(Data, Size); |
| 311 | uint8_t Mode = Input[0] % 6; |
| 312 | ArrayRef<uint8_t> Rest = Input.drop_front(N: 1); |
| 313 | |
| 314 | switch (Mode) { |
| 315 | case 0: { // Byte-level mutations |
| 316 | if (Rest.empty()) |
| 317 | break; |
| 318 | std::string Target = selectTargetFile(SubDir, Selector: Rest[0]); |
| 319 | if (Rest.size() > 1) |
| 320 | applyByteMutations(Path: Target, Data: Rest.drop_front(N: 1)); |
| 321 | break; |
| 322 | } |
| 323 | case 1: { // File truncation |
| 324 | if (Rest.size() < 2) |
| 325 | break; |
| 326 | std::string Target = selectTargetFile(SubDir, Selector: Rest[0]); |
| 327 | truncateFile(Path: Target, Fraction: Rest[1]); |
| 328 | break; |
| 329 | } |
| 330 | case 2: { // Append garbage |
| 331 | if (Rest.empty()) |
| 332 | break; |
| 333 | std::string Target = selectTargetFile(SubDir, Selector: Rest[0]); |
| 334 | if (Rest.size() > 1) |
| 335 | appendGarbage(Path: Target, Data: Rest.drop_front(N: 1)); |
| 336 | break; |
| 337 | } |
| 338 | case 3: { // Zero out a range |
| 339 | if (Rest.size() < 7) |
| 340 | break; |
| 341 | std::string Target = selectTargetFile(SubDir, Selector: Rest[0]); |
| 342 | uint32_t Offset = readU32(Data: Rest, Offset: 1); |
| 343 | uint16_t Length = readU16(Data: Rest, Offset: 5); |
| 344 | zeroRange(Path: Target, Offset, Length); |
| 345 | break; |
| 346 | } |
| 347 | case 4: { // Standalone file corruption |
| 348 | corruptStandaloneFiles(SubDir, Data: Rest); |
| 349 | break; |
| 350 | } |
| 351 | case 5: { // Combined: byte mutations + exercise CAS |
| 352 | if (Rest.empty()) |
| 353 | break; |
| 354 | std::string Target = selectTargetFile(SubDir, Selector: Rest[0]); |
| 355 | if (Rest.size() > 1) |
| 356 | applyByteMutations(Path: Target, Data: Rest.drop_front(N: 1)); |
| 357 | break; |
| 358 | } |
| 359 | } |
| 360 | |
| 361 | // Step 6: Reopen the CAS after corruption. |
| 362 | auto Reopened = createOnDiskUnifiedCASDatabases(Path: TmpDir); |
| 363 | if (!Reopened) { |
| 364 | // Reopen failing is acceptable — corruption may have broken the format. |
| 365 | consumeError(Err: Reopened.takeError()); |
| 366 | return 0; |
| 367 | } |
| 368 | CAS = std::move(Reopened->first); |
| 369 | AC = std::move(Reopened->second); |
| 370 | |
| 371 | // Step 7: Validate — must not crash. |
| 372 | bool ValidationFailed = false; |
| 373 | if (auto E = CAS->validate(/*CheckHash=*/true)) { |
| 374 | consumeError(Err: std::move(E)); |
| 375 | ValidationFailed = true; |
| 376 | } |
| 377 | if (auto E = CAS->validate(/*CheckHash=*/false)) { |
| 378 | consumeError(Err: std::move(E)); |
| 379 | ValidationFailed = true; |
| 380 | } |
| 381 | |
| 382 | // Step 8: For mode 5, exercise the CAS only if validation passed. |
| 383 | if (Mode == 5 && !ValidationFailed) |
| 384 | exerciseCAS(CAS&: *CAS); |
| 385 | |
| 386 | return 0; |
| 387 | } |
| 388 | |