1//===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8// fuzzer::InputCorpus
9//===----------------------------------------------------------------------===//
10
11#ifndef LLVM_FUZZER_CORPUS
12#define LLVM_FUZZER_CORPUS
13
14#include "FuzzerDataFlowTrace.h"
15#include "FuzzerDefs.h"
16#include "FuzzerIO.h"
17#include "FuzzerRandom.h"
18#include "FuzzerSHA1.h"
19#include "FuzzerTracePC.h"
20#include <algorithm>
21#include <bitset>
22#include <chrono>
23#include <numeric>
24#include <random>
25#include <unordered_set>
26
27namespace fuzzer {
28
29struct InputInfo {
30 Unit U; // The actual input data.
31 std::chrono::microseconds TimeOfUnit;
32 uint8_t Sha1[kSHA1NumBytes]; // Checksum.
33 // Number of features that this input has and no smaller input has.
34 size_t NumFeatures = 0;
35 size_t Tmp = 0; // Used by ValidateFeatureSet.
36 // Stats.
37 size_t NumExecutedMutations = 0;
38 size_t NumSuccessfullMutations = 0;
39 bool NeverReduce = false;
40 bool MayDeleteFile = false;
41 bool Reduced = false;
42 bool HasFocusFunction = false;
43 std::vector<uint32_t> UniqFeatureSet;
44 std::vector<uint8_t> DataFlowTraceForFocusFunction;
45 // Power schedule.
46 bool NeedsEnergyUpdate = false;
47 double Energy = 0.0;
48 double SumIncidence = 0.0;
49 std::vector<std::pair<uint32_t, uint16_t>> FeatureFreqs;
50
51 // Delete feature Idx and its frequency from FeatureFreqs.
52 bool DeleteFeatureFreq(uint32_t Idx) {
53 if (FeatureFreqs.empty())
54 return false;
55
56 // Binary search over local feature frequencies sorted by index.
57 auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(),
58 std::pair<uint32_t, uint16_t>(Idx, 0));
59
60 if (Lower != FeatureFreqs.end() && Lower->first == Idx) {
61 FeatureFreqs.erase(Lower);
62 return true;
63 }
64 return false;
65 }
66
67 // Assign more energy to a high-entropy seed, i.e., that reveals more
68 // information about the globally rare features in the neighborhood of the
69 // seed. Since we do not know the entropy of a seed that has never been
70 // executed we assign fresh seeds maximum entropy and let II->Energy approach
71 // the true entropy from above. If ScalePerExecTime is true, the computed
72 // entropy is scaled based on how fast this input executes compared to the
73 // average execution time of inputs. The faster an input executes, the more
74 // energy gets assigned to the input.
75 void UpdateEnergy(size_t GlobalNumberOfFeatures, bool ScalePerExecTime,
76 std::chrono::microseconds AverageUnitExecutionTime) {
77 Energy = 0.0;
78 SumIncidence = 0.0;
79
80 // Apply add-one smoothing to locally discovered features.
81 for (const auto &F : FeatureFreqs) {
82 double LocalIncidence = F.second + 1;
83 Energy -= LocalIncidence * log(LocalIncidence);
84 SumIncidence += LocalIncidence;
85 }
86
87 // Apply add-one smoothing to locally undiscovered features.
88 // PreciseEnergy -= 0; // since log(1.0) == 0)
89 SumIncidence +=
90 static_cast<double>(GlobalNumberOfFeatures - FeatureFreqs.size());
91
92 // Add a single locally abundant feature apply add-one smoothing.
93 double AbdIncidence = static_cast<double>(NumExecutedMutations + 1);
94 Energy -= AbdIncidence * log(AbdIncidence);
95 SumIncidence += AbdIncidence;
96
97 // Normalize.
98 if (SumIncidence != 0)
99 Energy = Energy / SumIncidence + log(SumIncidence);
100
101 if (ScalePerExecTime) {
102 // Scaling to favor inputs with lower execution time.
103 uint32_t PerfScore = 100;
104 if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 10)
105 PerfScore = 10;
106 else if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 4)
107 PerfScore = 25;
108 else if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 2)
109 PerfScore = 50;
110 else if (TimeOfUnit.count() * 3 > AverageUnitExecutionTime.count() * 4)
111 PerfScore = 75;
112 else if (TimeOfUnit.count() * 4 < AverageUnitExecutionTime.count())
113 PerfScore = 300;
114 else if (TimeOfUnit.count() * 3 < AverageUnitExecutionTime.count())
115 PerfScore = 200;
116 else if (TimeOfUnit.count() * 2 < AverageUnitExecutionTime.count())
117 PerfScore = 150;
118
119 Energy *= PerfScore;
120 }
121 }
122
123 // Increment the frequency of the feature Idx.
124 void UpdateFeatureFrequency(uint32_t Idx) {
125 NeedsEnergyUpdate = true;
126
127 // The local feature frequencies is an ordered vector of pairs.
128 // If there are no local feature frequencies, push_back preserves order.
129 // Set the feature frequency for feature Idx32 to 1.
130 if (FeatureFreqs.empty()) {
131 FeatureFreqs.push_back(std::pair<uint32_t, uint16_t>(Idx, 1));
132 return;
133 }
134
135 // Binary search over local feature frequencies sorted by index.
136 auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(),
137 std::pair<uint32_t, uint16_t>(Idx, 0));
138
139 // If feature Idx32 already exists, increment its frequency.
140 // Otherwise, insert a new pair right after the next lower index.
141 if (Lower != FeatureFreqs.end() && Lower->first == Idx) {
142 Lower->second++;
143 } else {
144 FeatureFreqs.insert(Lower, std::pair<uint32_t, uint16_t>(Idx, 1));
145 }
146 }
147};
148
149struct EntropicOptions {
150 bool Enabled;
151 size_t NumberOfRarestFeatures;
152 size_t FeatureFrequencyThreshold;
153 bool ScalePerExecTime;
154};
155
156class InputCorpus {
157 static const uint32_t kFeatureSetSize = 1 << 21;
158 static const uint8_t kMaxMutationFactor = 20;
159 static const size_t kSparseEnergyUpdates = 100;
160
161 size_t NumExecutedMutations = 0;
162
163 EntropicOptions Entropic;
164
165public:
166 InputCorpus(const std::string &OutputCorpus, EntropicOptions Entropic)
167 : Entropic(Entropic), OutputCorpus(OutputCorpus) {
168 memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature));
169 memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature));
170 }
171 ~InputCorpus() {
172 for (auto II : Inputs)
173 delete II;
174 }
175 size_t size() const { return Inputs.size(); }
176 size_t SizeInBytes() const {
177 size_t Res = 0;
178 for (auto II : Inputs)
179 Res += II->U.size();
180 return Res;
181 }
182 size_t NumActiveUnits() const {
183 size_t Res = 0;
184 for (auto II : Inputs)
185 Res += !II->U.empty();
186 return Res;
187 }
188 size_t MaxInputSize() const {
189 size_t Res = 0;
190 for (auto II : Inputs)
191 Res = std::max(Res, II->U.size());
192 return Res;
193 }
194 void IncrementNumExecutedMutations() { NumExecutedMutations++; }
195
196 size_t NumInputsThatTouchFocusFunction() {
197 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) {
198 return II->HasFocusFunction;
199 });
200 }
201
202 size_t NumInputsWithDataFlowTrace() {
203 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) {
204 return !II->DataFlowTraceForFocusFunction.empty();
205 });
206 }
207
208 bool empty() const { return Inputs.empty(); }
209 const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; }
210 InputInfo *AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile,
211 bool HasFocusFunction, bool NeverReduce,
212 std::chrono::microseconds TimeOfUnit,
213 const std::vector<uint32_t> &FeatureSet,
214 const DataFlowTrace &DFT, const InputInfo *BaseII) {
215 assert(!U.empty());
216 if (FeatureDebug)
217 Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs.size(), NumFeatures);
218 // Inputs.size() is cast to uint32_t below.
219 assert(Inputs.size() < std::numeric_limits<uint32_t>::max());
220 Inputs.push_back(new InputInfo());
221 InputInfo &II = *Inputs.back();
222 II.U = U;
223 II.NumFeatures = NumFeatures;
224 II.NeverReduce = NeverReduce;
225 II.TimeOfUnit = TimeOfUnit;
226 II.MayDeleteFile = MayDeleteFile;
227 II.UniqFeatureSet = FeatureSet;
228 II.HasFocusFunction = HasFocusFunction;
229 // Assign maximal energy to the new seed.
230 II.Energy = RareFeatures.empty() ? 1.0 : log(RareFeatures.size());
231 II.SumIncidence = static_cast<double>(RareFeatures.size());
232 II.NeedsEnergyUpdate = false;
233 std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end());
234 ComputeSHA1(U.data(), U.size(), II.Sha1);
235 auto Sha1Str = Sha1ToString(II.Sha1);
236 Hashes.insert(Sha1Str);
237 if (HasFocusFunction)
238 if (auto V = DFT.Get(Sha1Str))
239 II.DataFlowTraceForFocusFunction = *V;
240 // This is a gross heuristic.
241 // Ideally, when we add an element to a corpus we need to know its DFT.
242 // But if we don't, we'll use the DFT of its base input.
243 if (II.DataFlowTraceForFocusFunction.empty() && BaseII)
244 II.DataFlowTraceForFocusFunction = BaseII->DataFlowTraceForFocusFunction;
245 DistributionNeedsUpdate = true;
246 PrintCorpus();
247 // ValidateFeatureSet();
248 return &II;
249 }
250
251 // Debug-only
252 void PrintUnit(const Unit &U) {
253 if (!FeatureDebug) return;
254 for (uint8_t C : U) {
255 if (C != 'F' && C != 'U' && C != 'Z')
256 C = '.';
257 Printf("%c", C);
258 }
259 }
260
261 // Debug-only
262 void PrintFeatureSet(const std::vector<uint32_t> &FeatureSet) {
263 if (!FeatureDebug) return;
264 Printf("{");
265 for (uint32_t Feature: FeatureSet)
266 Printf("%u,", Feature);
267 Printf("}");
268 }
269
270 // Debug-only
271 void PrintCorpus() {
272 if (!FeatureDebug) return;
273 Printf("======= CORPUS:\n");
274 int i = 0;
275 for (auto II : Inputs) {
276 if (std::find(II->U.begin(), II->U.end(), 'F') != II->U.end()) {
277 Printf("[%2d] ", i);
278 Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size());
279 PrintUnit(II->U);
280 Printf(" ");
281 PrintFeatureSet(II->UniqFeatureSet);
282 Printf("\n");
283 }
284 i++;
285 }
286 }
287
288 void Replace(InputInfo *II, const Unit &U,
289 std::chrono::microseconds TimeOfUnit) {
290 assert(II->U.size() > U.size());
291 Hashes.erase(Sha1ToString(II->Sha1));
292 DeleteFile(*II);
293 ComputeSHA1(U.data(), U.size(), II->Sha1);
294 Hashes.insert(Sha1ToString(II->Sha1));
295 II->U = U;
296 II->Reduced = true;
297 II->TimeOfUnit = TimeOfUnit;
298 DistributionNeedsUpdate = true;
299 }
300
301 bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); }
302 bool HasUnit(const std::string &H) { return Hashes.count(H); }
303 InputInfo &ChooseUnitToMutate(Random &Rand) {
304 InputInfo &II = *Inputs[ChooseUnitIdxToMutate(Rand)];
305 assert(!II.U.empty());
306 return II;
307 }
308
309 InputInfo &ChooseUnitToCrossOverWith(Random &Rand, bool UniformDist) {
310 if (!UniformDist) {
311 return ChooseUnitToMutate(Rand);
312 }
313 InputInfo &II = *Inputs[Rand(Inputs.size())];
314 assert(!II.U.empty());
315 return II;
316 }
317
318 // Returns an index of random unit from the corpus to mutate.
319 size_t ChooseUnitIdxToMutate(Random &Rand) {
320 UpdateCorpusDistribution(Rand);
321 size_t Idx = static_cast<size_t>(CorpusDistribution(Rand));
322 assert(Idx < Inputs.size());
323 return Idx;
324 }
325
326 void PrintStats() {
327 for (size_t i = 0; i < Inputs.size(); i++) {
328 const auto &II = *Inputs[i];
329 Printf(" [% 3zd %s] sz: % 5zd runs: % 5zd succ: % 5zd focus: %d\n", i,
330 Sha1ToString(II.Sha1).c_str(), II.U.size(),
331 II.NumExecutedMutations, II.NumSuccessfullMutations,
332 II.HasFocusFunction);
333 }
334 }
335
336 void PrintFeatureSet() {
337 for (size_t i = 0; i < kFeatureSetSize; i++) {
338 if(size_t Sz = GetFeature(i))
339 Printf("[%zd: id %zd sz%zd] ", i, SmallestElementPerFeature[i], Sz);
340 }
341 Printf("\n\t");
342 for (size_t i = 0; i < Inputs.size(); i++)
343 if (size_t N = Inputs[i]->NumFeatures)
344 Printf(" %zd=>%zd ", i, N);
345 Printf("\n");
346 }
347
348 void DeleteFile(const InputInfo &II) {
349 if (!OutputCorpus.empty() && II.MayDeleteFile)
350 RemoveFile(DirPlusFile(OutputCorpus, Sha1ToString(II.Sha1)));
351 }
352
353 void DeleteInput(size_t Idx) {
354 InputInfo &II = *Inputs[Idx];
355 DeleteFile(II);
356 Unit().swap(II.U);
357 II.Energy = 0.0;
358 II.NeedsEnergyUpdate = false;
359 DistributionNeedsUpdate = true;
360 if (FeatureDebug)
361 Printf("EVICTED %zd\n", Idx);
362 }
363
364 void AddRareFeature(uint32_t Idx) {
365 // Maintain *at least* TopXRarestFeatures many rare features
366 // and all features with a frequency below ConsideredRare.
367 // Remove all other features.
368 while (RareFeatures.size() > Entropic.NumberOfRarestFeatures &&
369 FreqOfMostAbundantRareFeature > Entropic.FeatureFrequencyThreshold) {
370
371 // Find most and second most abbundant feature.
372 uint32_t MostAbundantRareFeatureIndices[2] = {RareFeatures[0],
373 RareFeatures[0]};
374 size_t Delete = 0;
375 for (size_t i = 0; i < RareFeatures.size(); i++) {
376 uint32_t Idx2 = RareFeatures[i];
377 if (GlobalFeatureFreqs[Idx2] >=
378 GlobalFeatureFreqs[MostAbundantRareFeatureIndices[0]]) {
379 MostAbundantRareFeatureIndices[1] = MostAbundantRareFeatureIndices[0];
380 MostAbundantRareFeatureIndices[0] = Idx2;
381 Delete = i;
382 }
383 }
384
385 // Remove most abundant rare feature.
386 IsRareFeature[Delete] = false;
387 RareFeatures[Delete] = RareFeatures.back();
388 RareFeatures.pop_back();
389
390 for (auto II : Inputs) {
391 if (II->DeleteFeatureFreq(MostAbundantRareFeatureIndices[0]))
392 II->NeedsEnergyUpdate = true;
393 }
394
395 // Set 2nd most abundant as the new most abundant feature count.
396 FreqOfMostAbundantRareFeature =
397 GlobalFeatureFreqs[MostAbundantRareFeatureIndices[1]];
398 }
399
400 // Add rare feature, handle collisions, and update energy.
401 RareFeatures.push_back(Idx);
402 IsRareFeature[Idx] = true;
403 GlobalFeatureFreqs[Idx] = 0;
404 for (auto II : Inputs) {
405 II->DeleteFeatureFreq(Idx);
406
407 // Apply add-one smoothing to this locally undiscovered feature.
408 // Zero energy seeds will never be fuzzed and remain zero energy.
409 if (II->Energy > 0.0) {
410 II->SumIncidence += 1;
411 II->Energy += log(II->SumIncidence) / II->SumIncidence;
412 }
413 }
414
415 DistributionNeedsUpdate = true;
416 }
417
418 bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) {
419 assert(NewSize);
420 Idx = Idx % kFeatureSetSize;
421 uint32_t OldSize = GetFeature(Idx);
422 if (OldSize == 0 || (Shrink && OldSize > NewSize)) {
423 if (OldSize > 0) {
424 size_t OldIdx = SmallestElementPerFeature[Idx];
425 InputInfo &II = *Inputs[OldIdx];
426 assert(II.NumFeatures > 0);
427 II.NumFeatures--;
428 if (II.NumFeatures == 0)
429 DeleteInput(OldIdx);
430 } else {
431 NumAddedFeatures++;
432 if (Entropic.Enabled)
433 AddRareFeature((uint32_t)Idx);
434 }
435 NumUpdatedFeatures++;
436 if (FeatureDebug)
437 Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize);
438 // Inputs.size() is guaranteed to be less than UINT32_MAX by AddToCorpus.
439 SmallestElementPerFeature[Idx] = static_cast<uint32_t>(Inputs.size());
440 InputSizesPerFeature[Idx] = NewSize;
441 return true;
442 }
443 return false;
444 }
445
446 // Increment frequency of feature Idx globally and locally.
447 void UpdateFeatureFrequency(InputInfo *II, size_t Idx) {
448 uint32_t Idx32 = Idx % kFeatureSetSize;
449
450 // Saturated increment.
451 if (GlobalFeatureFreqs[Idx32] == 0xFFFF)
452 return;
453 uint16_t Freq = GlobalFeatureFreqs[Idx32]++;
454
455 // Skip if abundant.
456 if (Freq > FreqOfMostAbundantRareFeature || !IsRareFeature[Idx32])
457 return;
458
459 // Update global frequencies.
460 if (Freq == FreqOfMostAbundantRareFeature)
461 FreqOfMostAbundantRareFeature++;
462
463 // Update local frequencies.
464 if (II)
465 II->UpdateFeatureFrequency(Idx32);
466 }
467
468 size_t NumFeatures() const { return NumAddedFeatures; }
469 size_t NumFeatureUpdates() const { return NumUpdatedFeatures; }
470
471private:
472
473 static const bool FeatureDebug = false;
474
475 uint32_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; }
476
477 void ValidateFeatureSet() {
478 if (FeatureDebug)
479 PrintFeatureSet();
480 for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++)
481 if (GetFeature(Idx))
482 Inputs[SmallestElementPerFeature[Idx]]->Tmp++;
483 for (auto II: Inputs) {
484 if (II->Tmp != II->NumFeatures)
485 Printf("ZZZ %zd %zd\n", II->Tmp, II->NumFeatures);
486 assert(II->Tmp == II->NumFeatures);
487 II->Tmp = 0;
488 }
489 }
490
491 // Updates the probability distribution for the units in the corpus.
492 // Must be called whenever the corpus or unit weights are changed.
493 //
494 // Hypothesis: inputs that maximize information about globally rare features
495 // are interesting.
496 void UpdateCorpusDistribution(Random &Rand) {
497 // Skip update if no seeds or rare features were added/deleted.
498 // Sparse updates for local change of feature frequencies,
499 // i.e., randomly do not skip.
500 if (!DistributionNeedsUpdate &&
501 (!Entropic.Enabled || Rand(kSparseEnergyUpdates)))
502 return;
503
504 DistributionNeedsUpdate = false;
505
506 size_t N = Inputs.size();
507 assert(N);
508 Intervals.resize(N + 1);
509 Weights.resize(N);
510 std::iota(Intervals.begin(), Intervals.end(), 0);
511
512 std::chrono::microseconds AverageUnitExecutionTime(0);
513 for (auto II : Inputs) {
514 AverageUnitExecutionTime += II->TimeOfUnit;
515 }
516 AverageUnitExecutionTime /= N;
517
518 bool VanillaSchedule = true;
519 if (Entropic.Enabled) {
520 for (auto II : Inputs) {
521 if (II->NeedsEnergyUpdate && II->Energy != 0.0) {
522 II->NeedsEnergyUpdate = false;
523 II->UpdateEnergy(RareFeatures.size(), Entropic.ScalePerExecTime,
524 AverageUnitExecutionTime);
525 }
526 }
527
528 for (size_t i = 0; i < N; i++) {
529
530 if (Inputs[i]->NumFeatures == 0) {
531 // If the seed doesn't represent any features, assign zero energy.
532 Weights[i] = 0.;
533 } else if (Inputs[i]->NumExecutedMutations / kMaxMutationFactor >
534 NumExecutedMutations / Inputs.size()) {
535 // If the seed was fuzzed a lot more than average, assign zero energy.
536 Weights[i] = 0.;
537 } else {
538 // Otherwise, simply assign the computed energy.
539 Weights[i] = Inputs[i]->Energy;
540 }
541
542 // If energy for all seeds is zero, fall back to vanilla schedule.
543 if (Weights[i] > 0.0)
544 VanillaSchedule = false;
545 }
546 }
547
548 if (VanillaSchedule) {
549 for (size_t i = 0; i < N; i++)
550 Weights[i] =
551 Inputs[i]->NumFeatures
552 ? static_cast<double>((i + 1) *
553 (Inputs[i]->HasFocusFunction ? 1000 : 1))
554 : 0.;
555 }
556
557 if (FeatureDebug) {
558 for (size_t i = 0; i < N; i++)
559 Printf("%zd ", Inputs[i]->NumFeatures);
560 Printf("SCORE\n");
561 for (size_t i = 0; i < N; i++)
562 Printf("%f ", Weights[i]);
563 Printf("Weights\n");
564 }
565 CorpusDistribution = std::piecewise_constant_distribution<double>(
566 Intervals.begin(), Intervals.end(), Weights.begin());
567 }
568 std::piecewise_constant_distribution<double> CorpusDistribution;
569
570 std::vector<double> Intervals;
571 std::vector<double> Weights;
572
573 std::unordered_set<std::string> Hashes;
574 std::vector<InputInfo *> Inputs;
575
576 size_t NumAddedFeatures = 0;
577 size_t NumUpdatedFeatures = 0;
578 uint32_t InputSizesPerFeature[kFeatureSetSize];
579 uint32_t SmallestElementPerFeature[kFeatureSetSize];
580
581 bool DistributionNeedsUpdate = true;
582 uint16_t FreqOfMostAbundantRareFeature = 0;
583 uint16_t GlobalFeatureFreqs[kFeatureSetSize] = {};
584 std::vector<uint32_t> RareFeatures;
585 std::bitset<kFeatureSetSize> IsRareFeature;
586
587 std::string OutputCorpus;
588};
589
590} // namespace fuzzer
591
592#endif // LLVM_FUZZER_CORPUS
593