FuzzerMerge.cpp source code [llvm_runtimes/compiler-rt/lib/fuzzer/FuzzerMerge.cpp]

1	//===- FuzzerMerge.cpp - merging corpora ----------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	// Merging corpora.
9	//===----------------------------------------------------------------------===//
10
11	#include "FuzzerCommand.h"
12	#include "FuzzerMerge.h"
13	#include "FuzzerIO.h"
14	#include "FuzzerInternal.h"
15	#include "FuzzerTracePC.h"
16	#include "FuzzerUtil.h"
17
18	#include <fstream>
19	#include <iterator>
20	#include <set>
21	#include <sstream>
22	#include <unordered_set>
23
24	namespace fuzzer {
25
26	bool Merger::Parse(const std::string &Str, bool ParseCoverage) {
27	std::istringstream SS(Str);
28	return Parse(SS, ParseCoverage);
29	}
30
31	void Merger::ParseOrExit(std::istream &IS, bool ParseCoverage) {
32	if (!Parse(IS, ParseCoverage)) {
33	Printf("MERGE: failed to parse the control file (unexpected error)\n");
34	exit(`1`);
35	}
36	}
37
38	// The control file example:
39	//
40	// 3 # The number of inputs
41	// 1 # The number of inputs in the first corpus, <= the previous number
42	// file0
43	// file1
44	// file2 # One file name per line.
45	// STARTED 0 123 # FileID, file size
46	// FT 0 1 4 6 8 # FileID COV1 COV2 ...
47	// COV 0 7 8 9 # FileID COV1 COV1
48	// STARTED 1 456 # If FT is missing, the input crashed while processing.
49	// STARTED 2 567
50	// FT 2 8 9
51	// COV 2 11 12
52	bool Merger::Parse(std::istream &IS, bool ParseCoverage) {
53	LastFailure.clear();
54	std::string Line;
55
56	// Parse NumFiles.
57	if (!std::getline(IS, Line, `'\n'`)) return false;
58	std::istringstream L1(Line);
59	size_t NumFiles = `0`;
60	L1 >> NumFiles;
61	if (NumFiles == `0` \|\| NumFiles > `10000000`) return false;
62
63	// Parse NumFilesInFirstCorpus.
64	if (!std::getline(IS, Line, `'\n'`)) return false;
65	std::istringstream L2(Line);
66	NumFilesInFirstCorpus = NumFiles + `1`;
67	L2 >> NumFilesInFirstCorpus;
68	if (NumFilesInFirstCorpus > NumFiles) return false;
69
70	// Parse file names.
71	Files.resize(NumFiles);
72	for (size_t i = `0`; i < NumFiles; i++)
73	if (!std::getline(IS, Files[i].Name, `'\n'`))
74	return false;
75
76	// Parse STARTED, FT, and COV lines.
77	size_t ExpectedStartMarker = `0`;
78	const size_t kInvalidStartMarker = -`1`;
79	size_t LastSeenStartMarker = kInvalidStartMarker;
80	bool HaveFtMarker = true;
81	std::vector<uint32_t> TmpFeatures;
82	std::set<uint32_t> PCs;
83	while (std::getline(IS, Line, `'\n'`)) {
84	std::istringstream ISS1(Line);
85	std::string Marker;
86	uint32_t N;
87	if (!(ISS1 >> Marker) \|\| !(ISS1 >> N))
88	return false;
89	if (Marker == "STARTED") {
90	// STARTED FILE_ID FILE_SIZE
91	if (ExpectedStartMarker != N)
92	return false;
93	ISS1 >> Files[ExpectedStartMarker].Size;
94	LastSeenStartMarker = ExpectedStartMarker;
95	assert(ExpectedStartMarker < Files.size());
96	ExpectedStartMarker++;
97	HaveFtMarker = false;
98	} else if (Marker == "FT") {
99	// FT FILE_ID COV1 COV2 COV3 ...
100	size_t CurrentFileIdx = N;
101	if (CurrentFileIdx != LastSeenStartMarker)
102	return false;
103	HaveFtMarker = true;
104	if (ParseCoverage) {
105	TmpFeatures.clear(); // use a vector from outer scope to avoid resizes.
106	while (ISS1 >> N)
107	TmpFeatures.push_back(N);
108	std::sort(TmpFeatures.begin(), TmpFeatures.end());
109	Files[CurrentFileIdx].Features = TmpFeatures;
110	}
111	} else if (Marker == "COV") {
112	size_t CurrentFileIdx = N;
113	if (CurrentFileIdx != LastSeenStartMarker)
114	return false;
115	if (ParseCoverage)
116	while (ISS1 >> N)
117	if (PCs.insert(N).second)
118	Files[CurrentFileIdx].Cov.push_back(N);
119	} else {
120	return false;
121	}
122	}
123	if (!HaveFtMarker && LastSeenStartMarker != kInvalidStartMarker)
124	LastFailure = Files[LastSeenStartMarker].Name;
125
126	FirstNotProcessedFile = ExpectedStartMarker;
127	return true;
128	}
129
130	size_t Merger::ApproximateMemoryConsumption() const {
131	size_t Res = `0`;
132	for (const auto &F: Files)
133	Res += sizeof(F) + F.Features.size() * sizeof(F.Features[`0`]);
134	return Res;
135	}
136
137	// Decides which files need to be merged (add those to NewFiles).
138	// Returns the number of new features added.
139	size_t Merger::Merge(const std::set<uint32_t> &InitialFeatures,
140	std::set<uint32_t> *NewFeatures,
141	const std::set<uint32_t> &InitialCov,
142	std::set<uint32_t> *NewCov,
143	std::vector<std::string> *NewFiles) {
144	NewFiles->clear();
145	NewFeatures->clear();
146	NewCov->clear();
147	assert(NumFilesInFirstCorpus <= Files.size());
148	std::set<uint32_t> AllFeatures = InitialFeatures;
149
150	// What features are in the initial corpus?
151	for (size_t i = `0`; i < NumFilesInFirstCorpus; i++) {
152	auto &Cur = Files[i].Features;
153	AllFeatures.insert(Cur.begin(), Cur.end());
154	}
155	// Remove all features that we already know from all other inputs.
156	for (size_t i = NumFilesInFirstCorpus; i < Files.size(); i++) {
157	auto &Cur = Files[i].Features;
158	std::vector<uint32_t> Tmp;
159	std::set_difference(Cur.begin(), Cur.end(), AllFeatures.begin(),
160	AllFeatures.end(), std::inserter(Tmp, Tmp.begin()));
161	Cur.swap(Tmp);
162	}
163
164	// Sort. Give preference to
165	// smaller files*
166	// files with more features.*
167	std::sort(Files.begin() + NumFilesInFirstCorpus, Files.end(),
168	[&](const MergeFileInfo &a, const MergeFileInfo &b) -> bool {
169	if (a.Size != b.Size)
170	return a.Size < b.Size;
171	return a.Features.size() > b.Features.size();
172	});
173
174	// One greedy pass: add the file's features to AllFeatures.
175	// If new features were added, add this file to NewFiles.
176	for (size_t i = NumFilesInFirstCorpus; i < Files.size(); i++) {
177	auto &Cur = Files[i].Features;
178	// Printf("%s -> sz %zd ft %zd\n", Files[i].Name.c_str(),
179	// Files[i].Size, Cur.size());
180	bool FoundNewFeatures = false;
181	for (auto Fe: Cur) {
182	if (AllFeatures.insert(Fe).second) {
183	FoundNewFeatures = true;
184	NewFeatures->insert(Fe);
185	}
186	}
187	if (FoundNewFeatures)
188	NewFiles->push_back(Files[i].Name);
189	for (auto Cov : Files[i].Cov)
190	if (InitialCov.find(Cov) == InitialCov.end())
191	NewCov->insert(Cov);
192	}
193	return NewFeatures->size();
194	}
195
196	std::set<uint32_t> Merger::AllFeatures() const {
197	std::set<uint32_t> S;
198	for (auto &File : Files)
199	S.insert(File.Features.begin(), File.Features.end());
200	return S;
201	}
202
203	// Inner process. May crash if the target crashes.
204	void Fuzzer::CrashResistantMergeInternalStep(const std::string &CFPath,
205	bool IsSetCoverMerge) {
206	Printf("MERGE-INNER: using the control file '%s'\n", CFPath.c_str());
207	Merger M;
208	std::ifstream IF(CFPath);
209	M.ParseOrExit(IF, false);
210	IF.close();
211	if (!M.LastFailure.empty())
212	Printf("MERGE-INNER: '%s' caused a failure at the previous merge step\n",
213	M.LastFailure.c_str());
214
215	Printf("MERGE-INNER: %zd total files;"
216	" %zd processed earlier; will process %zd files now\n",
217	M.Files.size(), M.FirstNotProcessedFile,
218	M.Files.size() - M.FirstNotProcessedFile);
219
220	std::ofstream OF(CFPath, std::ofstream::out \| std::ofstream::app);
221	std::set<size_t> AllFeatures;
222	auto PrintStatsWrapper = [this, &AllFeatures](const char* Where) {
223	this->PrintStats(Where, "\n", `0`, AllFeatures.size());
224	};
225	std::set<const TracePC::PCTableEntry *> AllPCs;
226	for (size_t i = M.FirstNotProcessedFile; i < M.Files.size(); i++) {
227	Fuzzer::MaybeExitGracefully();
228	auto U = FileToVector(M.Files[i].Name);
229	if (U.size() > MaxInputLen) {
230	U.resize(MaxInputLen);
231	U.shrink_to_fit();
232	}
233
234	// Write the pre-run marker.
235	OF << "STARTED " << i << " " << U.size() << "\n";
236	OF.flush(); // Flush is important since Command::Execute may crash.
237	// Run.
238	TPC.ResetMaps();
239	ExecuteCallback(U.data(), U.size());
240	// Collect coverage. We are iterating over the files in this order:
241	// First, files in the initial corpus ordered by size, smallest first.*
242	// Then, all other files, smallest first.*
243	std::set<size_t> Features;
244	if (IsSetCoverMerge)
245	TPC.CollectFeatures([&](size_t Feature) { Features.insert(Feature); });
246	else
247	TPC.CollectFeatures([&](size_t Feature) {
248	if (AllFeatures.insert(Feature).second)
249	Features.insert(Feature);
250	});
251	TPC.UpdateObservedPCs();
252	// Show stats.
253	if (!(TotalNumberOfRuns & (TotalNumberOfRuns - `1`)))
254	PrintStatsWrapper("pulse ");
255	if (TotalNumberOfRuns == M.NumFilesInFirstCorpus)
256	PrintStatsWrapper("LOADED");
257	// Write the post-run marker and the coverage.
258	OF << "FT " << i;
259	for (size_t F : Features)
260	OF << " " << F;
261	OF << "\n";
262	OF << "COV " << i;
263	TPC.ForEachObservedPC([&](const TracePC::PCTableEntry *TE) {
264	if (AllPCs.insert(TE).second)
265	OF << " " << TPC.PCTableEntryIdx(TE);
266	});
267	OF << "\n";
268	OF.flush();
269	}
270	PrintStatsWrapper("DONE ");
271	}
272
273	// Merges all corpora into the first corpus. A file is added into
274	// the first corpus only if it adds new features. Unlike `Merger::Merge`,
275	// this implementation calculates an approximation of the minimum set
276	// of corpora files, that cover all known features (set cover problem).
277	// Generally, this means that files with more features are preferred for
278	// merge into the first corpus. When two files have the same number of
279	// features, the smaller one is preferred.
280	size_t Merger::SetCoverMerge(const std::set<uint32_t> &InitialFeatures,
281	std::set<uint32_t> *NewFeatures,
282	const std::set<uint32_t> &InitialCov,
283	std::set<uint32_t> *NewCov,
284	std::vector<std::string> *NewFiles) {
285	assert(NumFilesInFirstCorpus <= Files.size());
286	NewFiles->clear();
287	NewFeatures->clear();
288	NewCov->clear();
289	std::set<uint32_t> AllFeatures;
290	// 1 << 21 - 1 is the maximum feature index.
291	// See 'kFeatureSetSize' in 'FuzzerCorpus.h'.
292	const uint32_t kFeatureSetSize = `1` << `21`;
293	std::vector<bool> Covered(kFeatureSetSize, false);
294	size_t NumCovered = `0`;
295
296	std::set<uint32_t> ExistingFeatures = InitialFeatures;
297	for (size_t i = `0`; i < NumFilesInFirstCorpus; ++i)
298	ExistingFeatures.insert(Files[i].Features.begin(), Files[i].Features.end());
299
300	// Mark the existing features as covered.
301	for (const auto &F : ExistingFeatures) {
302	if (!Covered[F % kFeatureSetSize]) {
303	++NumCovered;
304	Covered[F % kFeatureSetSize] = true;
305	}
306	// Calculate an underestimation of the set of covered features
307	// since the `Covered` bitvector is smaller than the feature range.
308	AllFeatures.insert(F % kFeatureSetSize);
309	}
310
311	std::set<size_t> RemainingFiles;
312	for (size_t i = NumFilesInFirstCorpus; i < Files.size(); ++i) {
313	// Construct an incremental sequence which represent the
314	// indices to all files (excluding those in the initial corpus).
315	// RemainingFiles = range(NumFilesInFirstCorpus..Files.size()).
316	RemainingFiles.insert(i);
317	// Insert this file's unique features to all features.
318	for (const auto &F : Files[i].Features)
319	AllFeatures.insert(F % kFeatureSetSize);
320	}
321
322	// Integrate files into Covered until set is complete.
323	while (NumCovered != AllFeatures.size()) {
324	// Index to file with largest number of unique features.
325	size_t MaxFeaturesIndex = NumFilesInFirstCorpus;
326	// Indices to remove from RemainingFiles.
327	std::set<size_t> RemoveIndices;
328	// Running max unique feature count.
329	// Updated upon finding a file with more features.
330	size_t MaxNumFeatures = `0`;
331
332	// Iterate over all files not yet integrated into Covered,
333	// to find the file which has the largest number of
334	// features that are not already in Covered.
335	for (const auto &i : RemainingFiles) {
336	const auto &File = Files[i];
337	size_t CurrentUnique = `0`;
338	// Count number of features in this file
339	// which are not yet in Covered.
340	for (const auto &F : File.Features)
341	if (!Covered[F % kFeatureSetSize])
342	++CurrentUnique;
343
344	if (CurrentUnique == `0`) {
345	// All features in this file are already in Covered: skip next time.
346	RemoveIndices.insert(i);
347	} else if (CurrentUnique > MaxNumFeatures \|\|
348	(CurrentUnique == MaxNumFeatures &&
349	File.Size < Files[MaxFeaturesIndex].Size)) {
350	// Update the max features file based on unique features
351	// Break ties by selecting smaller files.
352	MaxNumFeatures = CurrentUnique;
353	MaxFeaturesIndex = i;
354	}
355	}
356	// Must be a valid index/
357	assert(MaxFeaturesIndex < Files.size());
358	// Remove any feature-less files found.
359	for (const auto &i : RemoveIndices)
360	RemainingFiles.erase(i);
361	if (MaxNumFeatures == `0`) {
362	// Did not find a file that adds unique features.
363	// This means that we should have no remaining files.
364	assert(RemainingFiles.size() == `0`);
365	assert(NumCovered == AllFeatures.size());
366	break;
367	}
368
369	// MaxFeaturesIndex must be an element of Remaining.
370	assert(RemainingFiles.find(MaxFeaturesIndex) != RemainingFiles.end());
371	// Remove the file with the most features from Remaining.
372	RemainingFiles.erase(MaxFeaturesIndex);
373	const auto &MaxFeatureFile = Files[MaxFeaturesIndex];
374	// Add the features of the max feature file to Covered.
375	for (const auto &F : MaxFeatureFile.Features) {
376	if (!Covered[F % kFeatureSetSize]) {
377	++NumCovered;
378	Covered[F % kFeatureSetSize] = true;
379	NewFeatures->insert(F);
380	}
381	}
382	// Add the index to this file to the result.
383	NewFiles->push_back(MaxFeatureFile.Name);
384	// Update NewCov with the additional coverage
385	// that MaxFeatureFile provides.
386	for (const auto &C : MaxFeatureFile.Cov)
387	if (InitialCov.find(C) == InitialCov.end())
388	NewCov->insert(C);
389	}
390
391	return NewFeatures->size();
392	}
393
394	static size_t
395	WriteNewControlFile(const std::string &CFPath,
396	const std::vector<SizedFile> &OldCorpus,
397	const std::vector<SizedFile> &NewCorpus,
398	const std::vector<MergeFileInfo> &KnownFiles) {
399	std::unordered_set<std::string> FilesToSkip;
400	for (auto &SF: KnownFiles)
401	FilesToSkip.insert(SF.Name);
402
403	std::vector<std::string> FilesToUse;
404	auto MaybeUseFile = [=, &FilesToUse](std::string Name) {
405	if (FilesToSkip.find(Name) == FilesToSkip.end())
406	FilesToUse.push_back(Name);
407	};
408	for (auto &SF: OldCorpus)
409	MaybeUseFile(SF.File);
410	auto FilesToUseFromOldCorpus = FilesToUse.size();
411	for (auto &SF: NewCorpus)
412	MaybeUseFile(SF.File);
413
414	RemoveFile(CFPath);
415	std::ofstream ControlFile(CFPath);
416	ControlFile << FilesToUse.size() << "\n";
417	ControlFile << FilesToUseFromOldCorpus << "\n";
418	for (auto &FN: FilesToUse)
419	ControlFile << FN << "\n";
420
421	if (!ControlFile) {
422	Printf("MERGE-OUTER: failed to write to the control file: %s\n",
423	CFPath.c_str());
424	exit(`1`);
425	}
426
427	return FilesToUse.size();
428	}
429
430	// Outer process. Does not call the target code and thus should not fail.
431	void CrashResistantMerge(const std::vector<std::string> &Args,
432	const std::vector<SizedFile> &OldCorpus,
433	const std::vector<SizedFile> &NewCorpus,
434	std::vector<std::string> *NewFiles,
435	const std::set<uint32_t> &InitialFeatures,
436	std::set<uint32_t> *NewFeatures,
437	const std::set<uint32_t> &InitialCov,
438	std::set<uint32_t> NewCov, const* std::string &CFPath,
439	bool V, /Verbose/
440	bool IsSetCoverMerge) {
441	if (NewCorpus.empty() && OldCorpus.empty()) return; // Nothing to merge.
442	size_t NumAttempts = `0`;
443	std::vector<MergeFileInfo> KnownFiles;
444	if (FileSize(CFPath)) {
445	VPrintf(V, "MERGE-OUTER: non-empty control file provided: '%s'\n",
446	CFPath.c_str());
447	Merger M;
448	std::ifstream IF(CFPath);
449	if (M.Parse(IF, /ParseCoverage=/true)) {
450	VPrintf(V, "MERGE-OUTER: control file ok, %zd files total,"
451	" first not processed file %zd\n",
452	M.Files.size(), M.FirstNotProcessedFile);
453	if (!M.LastFailure.empty())
454	VPrintf(V, "MERGE-OUTER: '%s' will be skipped as unlucky "
455	"(merge has stumbled on it the last time)\n",
456	M.LastFailure.c_str());
457	if (M.FirstNotProcessedFile >= M.Files.size()) {
458	// Merge has already been completed with the given merge control file.
459	if (M.Files.size() == OldCorpus.size() + NewCorpus.size()) {
460	VPrintf(
461	V,
462	"MERGE-OUTER: nothing to do, merge has been completed before\n");
463	exit(`0`);
464	}
465
466	// Number of input files likely changed, start merge from scratch, but
467	// reuse coverage information from the given merge control file.
468	VPrintf(
469	V,
470	"MERGE-OUTER: starting merge from scratch, but reusing coverage "
471	"information from the given control file\n");
472	KnownFiles = M.Files;
473	} else {
474	// There is a merge in progress, continue.
475	NumAttempts = M.Files.size() - M.FirstNotProcessedFile;
476	}
477	} else {
478	VPrintf(V, "MERGE-OUTER: bad control file, will overwrite it\n");
479	}
480	}
481
482	if (!NumAttempts) {
483	// The supplied control file is empty or bad, create a fresh one.
484	VPrintf(V, "MERGE-OUTER: "
485	"%zd files, %zd in the initial corpus, %zd processed earlier\n",
486	OldCorpus.size() + NewCorpus.size(), OldCorpus.size(),
487	KnownFiles.size());
488	NumAttempts = WriteNewControlFile(CFPath, OldCorpus, NewCorpus, KnownFiles);
489	}
490
491	// Execute the inner process until it passes.
492	// Every inner process should execute at least one input.
493	Command BaseCmd(Args);
494	BaseCmd.removeFlag("merge");
495	BaseCmd.removeFlag("set_cover_merge");
496	BaseCmd.removeFlag("fork");
497	BaseCmd.removeFlag("collect_data_flow");
498	for (size_t Attempt = `1`; Attempt <= NumAttempts; Attempt++) {
499	Fuzzer::MaybeExitGracefully();
500	VPrintf(V, "MERGE-OUTER: attempt %zd\n", Attempt);
501	Command Cmd(BaseCmd);
502	Cmd.addFlag("merge_control_file", CFPath);
503	// If we are going to use the set cover implementation for
504	// minimization add the merge_inner=2 internal flag.
505	Cmd.addFlag("merge_inner", IsSetCoverMerge ? "2" : "1");
506	if (!V) {
507	Cmd.setOutputFile(getDevNull());
508	Cmd.combineOutAndErr();
509	}
510	auto ExitCode = ExecuteCommand(Cmd);
511	if (!ExitCode) {
512	VPrintf(V, "MERGE-OUTER: successful in %zd attempt(s)\n", Attempt);
513	break;
514	}
515	}
516	// Read the control file and do the merge.
517	Merger M;
518	std::ifstream IF(CFPath);
519	IF.seekg(`0`, IF.end);
520	VPrintf(V, "MERGE-OUTER: the control file has %zd bytes\n",
521	(size_t)IF.tellg());
522	IF.seekg(`0`, IF.beg);
523	M.ParseOrExit(IF, true);
524	IF.close();
525	VPrintf(V,
526	"MERGE-OUTER: consumed %zdMb (%zdMb rss) to parse the control file\n",
527	M.ApproximateMemoryConsumption() >> `20`, GetPeakRSSMb());
528
529	M.Files.insert(M.Files.end(), KnownFiles.begin(), KnownFiles.end());
530	if (IsSetCoverMerge)
531	M.SetCoverMerge(InitialFeatures, NewFeatures, InitialCov, NewCov, NewFiles);
532	else
533	M.Merge(InitialFeatures, NewFeatures, InitialCov, NewCov, NewFiles);
534	VPrintf(V, "MERGE-OUTER: %zd new files with %zd new features added; "
535	"%zd new coverage edges\n",
536	NewFiles->size(), NewFeatures->size(), NewCov->size());
537	}
538
539	} // namespace fuzzer
540

Browse the source code of llvm_runtimes/compiler-rt/lib/fuzzer/FuzzerMerge.cpp