1//===-- cas-fuzzer.cpp - Fuzzer for CAS ObjectStore::validate() -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Fuzzer for on-disk CAS validation. Creates a valid CAS database, stores
10// objects, corrupts the on-disk files using fuzzer-provided bytes, then calls
11// validate(). The invariant: validate() must either succeed or return an error,
12// never crash.
13//
14//===----------------------------------------------------------------------===//
15
16#include "llvm/ADT/ScopeExit.h"
17#include "llvm/CAS/ActionCache.h"
18#include "llvm/CAS/BuiltinUnifiedCASDatabases.h"
19#include "llvm/CAS/ObjectStore.h"
20#include "llvm/Support/Error.h"
21#include "llvm/Support/FileSystem.h"
22#include "llvm/Support/MemoryBuffer.h"
23#include "llvm/Support/Path.h"
24#include "llvm/Support/ScopedPrinter.h"
25#include "llvm/Support/raw_ostream.h"
26#include <cstdint>
27#include <cstring>
28
29using namespace llvm;
30using namespace llvm::cas;
31
32namespace {
33
34/// Read a little-endian uint32 from Data, or 0 if not enough bytes.
35static uint32_t readU32(ArrayRef<uint8_t> Data, size_t Offset) {
36 if (Offset + sizeof(uint32_t) > Data.size())
37 return 0;
38 return support::endian::read32le(P: Data.data() + Offset);
39}
40
41/// Read a little-endian uint16 from Data, or 0 if not enough bytes.
42static uint16_t readU16(ArrayRef<uint8_t> Data, size_t Offset) {
43 if (Offset + sizeof(uint16_t) > Data.size())
44 return 0;
45 return support::endian::read16le(P: Data.data() + Offset);
46}
47
48/// Find the versioned subdirectory (v1.N) inside the CAS root.
49static std::string findVersionedSubdir(StringRef CASDir) {
50 std::error_code EC;
51 std::string Best;
52 uint64_t BestOrder = 0;
53 for (sys::fs::directory_iterator DirI(CASDir, EC), DirE; !EC && DirI != DirE;
54 DirI.increment(ec&: EC)) {
55 if (DirI->type() != sys::fs::file_type::directory_file)
56 continue;
57 StringRef Name = sys::path::filename(path: DirI->path());
58 if (!Name.starts_with(Prefix: "v1."))
59 continue;
60 uint64_t Order;
61 if (Name.substr(Start: 3).getAsInteger(Radix: 10, Result&: Order))
62 continue;
63 if (Best.empty() || Order > BestOrder) {
64 Best = DirI->path();
65 BestOrder = Order;
66 }
67 }
68 return Best;
69}
70
71/// Collect paths of files matching a prefix in a directory.
72static void collectFilesWithPrefix(StringRef Dir, StringRef Prefix,
73 SmallVectorImpl<std::string> &Results) {
74 std::error_code EC;
75 for (sys::fs::directory_iterator DirI(Dir, EC), DirE; !EC && DirI != DirE;
76 DirI.increment(ec&: EC)) {
77 StringRef Name = sys::path::filename(path: DirI->path());
78 if (Name.starts_with(Prefix))
79 Results.push_back(Elt: DirI->path());
80 }
81}
82
83/// Read an entire file into a buffer.
84static bool readFileBytes(StringRef Path, SmallVectorImpl<char> &Buf) {
85 auto MBOrErr = MemoryBuffer::getFile(Filename: Path, /*IsText=*/false,
86 /*RequiresNullTerminator=*/false);
87 if (!MBOrErr)
88 return false;
89 Buf.assign(in_start: (*MBOrErr)->getBufferStart(), in_end: (*MBOrErr)->getBufferEnd());
90 return true;
91}
92
93/// Write buffer contents to a file, replacing it entirely.
94static bool writeFileBytes(StringRef Path, ArrayRef<char> Buf) {
95 std::error_code EC;
96 raw_fd_ostream OS(Path, EC, sys::fs::OF_None);
97 if (EC)
98 return false;
99 OS.write(Ptr: Buf.data(), Size: Buf.size());
100 return !OS.has_error();
101}
102
103/// Create a CAS database and store some baseline objects.
104/// Returns true on success, populating CAS and AC via output parameters.
105static bool createAndPopulateCAS(StringRef TmpDir,
106 std::unique_ptr<ObjectStore> &CAS,
107 std::unique_ptr<ActionCache> &AC) {
108 auto Result = createOnDiskUnifiedCASDatabases(Path: TmpDir);
109 if (!Result) {
110 consumeError(Err: Result.takeError());
111 return false;
112 }
113 CAS = std::move(Result->first);
114 AC = std::move(Result->second);
115
116 // Store a leaf node (no refs, small data).
117 const char LeafData[] = "hello-cas-fuzzer-leaf-data";
118 auto Leaf = CAS->store(Refs: {}, Data: arrayRefFromStringRef<char>(Input: LeafData));
119 if (!Leaf) {
120 consumeError(Err: Leaf.takeError());
121 return false;
122 }
123
124 // Store a node with a ref to the leaf.
125 const char NodeData[] = "node-with-one-ref";
126 auto Node1 = CAS->store(Refs: {*Leaf}, Data: arrayRefFromStringRef<char>(Input: NodeData));
127 if (!Node1) {
128 consumeError(Err: Node1.takeError());
129 return false;
130 }
131
132 // Store a node referencing both previous nodes.
133 const char Node2Data[] = "node-with-two-refs";
134 auto Node2 =
135 CAS->store(Refs: {*Leaf, *Node1}, Data: arrayRefFromStringRef<char>(Input: Node2Data));
136 if (!Node2) {
137 consumeError(Err: Node2.takeError());
138 return false;
139 }
140
141 // Store a larger data node to potentially exercise different size encodings.
142 std::string LargeData(4096, 'X');
143 auto LargeNode =
144 CAS->store(Refs: {}, Data: arrayRefFromStringRef<char>(Input: StringRef(LargeData)));
145 if (!LargeNode) {
146 consumeError(Err: LargeNode.takeError());
147 return false;
148 }
149
150 return true;
151}
152
153/// Apply byte-level mutations to a file.
154static void applyByteMutations(StringRef Path, ArrayRef<uint8_t> Data) {
155 SmallVector<char> Buf;
156 if (!readFileBytes(Path, Buf) || Buf.empty())
157 return;
158
159 // Parse as 7-byte chunks: [offset(4)][op(1)][value(1)][unused(1)]
160 for (size_t I = 0; I + 6 <= Data.size(); I += 7) {
161 uint32_t Offset = readU32(Data, Offset: I) % Buf.size();
162 uint8_t Op = Data[I + 4] % 3;
163 uint8_t Value = Data[I + 5];
164 switch (Op) {
165 case 0: // XOR
166 Buf[Offset] ^= Value;
167 break;
168 case 1: // SET
169 Buf[Offset] = Value;
170 break;
171 case 2: // Zero
172 Buf[Offset] = 0;
173 break;
174 }
175 }
176 writeFileBytes(Path, Buf);
177}
178
179/// Truncate a file to a given fraction of its size.
180static void truncateFile(StringRef Path, uint8_t Fraction) {
181 SmallVector<char> Buf;
182 if (!readFileBytes(Path, Buf) || Buf.empty())
183 return;
184 // Fraction is 0-255, map to 0-100% of file size.
185 size_t NewSize =
186 static_cast<size_t>(static_cast<uint64_t>(Buf.size()) * Fraction / 255);
187 // Don't zero out the size.
188 if (NewSize == 0)
189 NewSize = 1;
190 Buf.resize(N: NewSize);
191 writeFileBytes(Path, Buf);
192}
193
194/// Append garbage bytes to a file.
195static void appendGarbage(StringRef Path, ArrayRef<uint8_t> Data) {
196 SmallVector<char> Buf;
197 if (!readFileBytes(Path, Buf))
198 return;
199 Buf.append(in_start: Data.begin(), in_end: Data.end());
200 writeFileBytes(Path, Buf);
201}
202
203/// Zero out a range in a file.
204static void zeroRange(StringRef Path, uint32_t Offset, uint16_t Length) {
205 SmallVector<char> Buf;
206 if (!readFileBytes(Path, Buf) || Buf.empty())
207 return;
208 size_t Start = Offset % Buf.size();
209 size_t End = std::min(a: Start + static_cast<size_t>(Length), b: Buf.size());
210 std::memset(s: Buf.data() + Start, c: 0, n: End - Start);
211 writeFileBytes(Path, Buf);
212}
213
214/// Corrupt standalone files (obj.*, leaf.*, leaf+0.*).
215static void corruptStandaloneFiles(StringRef SubDir, ArrayRef<uint8_t> Data) {
216 SmallVector<std::string> StandaloneFiles;
217 collectFilesWithPrefix(Dir: SubDir, Prefix: "obj.", Results&: StandaloneFiles);
218 collectFilesWithPrefix(Dir: SubDir, Prefix: "leaf.", Results&: StandaloneFiles);
219 collectFilesWithPrefix(Dir: SubDir, Prefix: "leaf+0.", Results&: StandaloneFiles);
220
221 if (StandaloneFiles.empty())
222 return;
223
224 for (size_t I = 0; I < Data.size() && !StandaloneFiles.empty(); I += 3) {
225 size_t FileIdx = Data[I] % StandaloneFiles.size();
226 uint8_t Action = (I + 1 < Data.size()) ? Data[I + 1] % 4 : 0;
227 uint8_t Param = (I + 2 < Data.size()) ? Data[I + 2] : 128;
228
229 StringRef FilePath = StandaloneFiles[FileIdx];
230 switch (Action) {
231 case 0: // Delete the file
232 sys::fs::remove(path: FilePath);
233 break;
234 case 1: // Truncate
235 truncateFile(Path: FilePath, Fraction: Param);
236 break;
237 case 2: // Corrupt bytes
238 if (I + 3 < Data.size())
239 applyByteMutations(
240 Path: FilePath, Data: Data.slice(N: I + 3, M: std::min(a: Data.size() - I - 3,
241 b: static_cast<size_t>(21))));
242 break;
243 case 3: // Zero out beginning
244 zeroRange(Path: FilePath, Offset: 0, Length: Param);
245 break;
246 }
247 }
248}
249
250/// Select which data file to target (index.v1 or data.v1).
251static std::string selectTargetFile(StringRef SubDir, uint8_t Selector) {
252 SmallString<256> Path(SubDir);
253 if (Selector % 2 == 0)
254 sys::path::append(path&: Path, a: "index.v1");
255 else
256 sys::path::append(path&: Path, a: "data.v1");
257 return std::string(Path);
258}
259
260/// Try to exercise the CAS after corruption: store and load.
261static void exerciseCAS(ObjectStore &CAS) {
262 // Try storing a new object.
263 const char NewData[] = "post-corruption-data";
264 auto NewObj = CAS.store(Refs: {}, Data: arrayRefFromStringRef<char>(Input: NewData));
265 if (!NewObj)
266 consumeError(Err: NewObj.takeError());
267
268 // Try validate again with CheckHash=false.
269 if (auto E = CAS.validate(/*CheckHash=*/false))
270 consumeError(Err: std::move(E));
271}
272
273} // end anonymous namespace
274
275extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
276 if (Size == 0)
277 return 0;
278
279 // Create a unique temp directory for this fuzzer run.
280 SmallString<256> TmpDir;
281 if (sys::fs::createUniqueDirectory(Prefix: "cas-fuzzer", ResultPath&: TmpDir))
282 return 0;
283
284 // Ensure cleanup on exit.
285 auto Cleanup = scope_exit([&]() { sys::fs::remove_directories(path: TmpDir); });
286
287 // Step 1: Create and populate a valid CAS.
288 std::unique_ptr<ObjectStore> CAS;
289 std::unique_ptr<ActionCache> AC;
290 if (!createAndPopulateCAS(TmpDir, CAS, AC))
291 return 0;
292
293 // Step 2: Validate baseline - should succeed.
294 if (auto E = CAS->validate(/*CheckHash=*/true)) {
295 // If baseline validation fails, something is wrong with the setup.
296 consumeError(Err: std::move(E));
297 return 0;
298 }
299
300 // Step 3: Close the CAS so files are unmapped.
301 CAS.reset();
302 AC.reset();
303
304 // Step 4: Find the versioned subdirectory.
305 std::string SubDir = findVersionedSubdir(CASDir: TmpDir);
306 if (SubDir.empty())
307 return 0;
308
309 // Step 5: Apply corruption based on mode selector (first byte).
310 ArrayRef<uint8_t> Input(Data, Size);
311 uint8_t Mode = Input[0] % 6;
312 ArrayRef<uint8_t> Rest = Input.drop_front(N: 1);
313
314 switch (Mode) {
315 case 0: { // Byte-level mutations
316 if (Rest.empty())
317 break;
318 std::string Target = selectTargetFile(SubDir, Selector: Rest[0]);
319 if (Rest.size() > 1)
320 applyByteMutations(Path: Target, Data: Rest.drop_front(N: 1));
321 break;
322 }
323 case 1: { // File truncation
324 if (Rest.size() < 2)
325 break;
326 std::string Target = selectTargetFile(SubDir, Selector: Rest[0]);
327 truncateFile(Path: Target, Fraction: Rest[1]);
328 break;
329 }
330 case 2: { // Append garbage
331 if (Rest.empty())
332 break;
333 std::string Target = selectTargetFile(SubDir, Selector: Rest[0]);
334 if (Rest.size() > 1)
335 appendGarbage(Path: Target, Data: Rest.drop_front(N: 1));
336 break;
337 }
338 case 3: { // Zero out a range
339 if (Rest.size() < 7)
340 break;
341 std::string Target = selectTargetFile(SubDir, Selector: Rest[0]);
342 uint32_t Offset = readU32(Data: Rest, Offset: 1);
343 uint16_t Length = readU16(Data: Rest, Offset: 5);
344 zeroRange(Path: Target, Offset, Length);
345 break;
346 }
347 case 4: { // Standalone file corruption
348 corruptStandaloneFiles(SubDir, Data: Rest);
349 break;
350 }
351 case 5: { // Combined: byte mutations + exercise CAS
352 if (Rest.empty())
353 break;
354 std::string Target = selectTargetFile(SubDir, Selector: Rest[0]);
355 if (Rest.size() > 1)
356 applyByteMutations(Path: Target, Data: Rest.drop_front(N: 1));
357 break;
358 }
359 }
360
361 // Step 6: Reopen the CAS after corruption.
362 auto Reopened = createOnDiskUnifiedCASDatabases(Path: TmpDir);
363 if (!Reopened) {
364 // Reopen failing is acceptable — corruption may have broken the format.
365 consumeError(Err: Reopened.takeError());
366 return 0;
367 }
368 CAS = std::move(Reopened->first);
369 AC = std::move(Reopened->second);
370
371 // Step 7: Validate — must not crash.
372 bool ValidationFailed = false;
373 if (auto E = CAS->validate(/*CheckHash=*/true)) {
374 consumeError(Err: std::move(E));
375 ValidationFailed = true;
376 }
377 if (auto E = CAS->validate(/*CheckHash=*/false)) {
378 consumeError(Err: std::move(E));
379 ValidationFailed = true;
380 }
381
382 // Step 8: For mode 5, exercise the CAS only if validation passed.
383 if (Mode == 5 && !ValidationFailed)
384 exerciseCAS(CAS&: *CAS);
385
386 return 0;
387}
388