1 | //===- FuzzerDataFlowTrace.cpp - DataFlowTrace ---*- C++ -* ===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // fuzzer::DataFlowTrace |
9 | //===----------------------------------------------------------------------===// |
10 | |
11 | #include "FuzzerDataFlowTrace.h" |
12 | |
13 | #include "FuzzerCommand.h" |
14 | #include "FuzzerIO.h" |
15 | #include "FuzzerRandom.h" |
16 | #include "FuzzerSHA1.h" |
17 | #include "FuzzerUtil.h" |
18 | |
19 | #include <cstdlib> |
20 | #include <fstream> |
21 | #include <numeric> |
22 | #include <queue> |
23 | #include <sstream> |
24 | #include <string> |
25 | #include <unordered_map> |
26 | #include <unordered_set> |
27 | #include <vector> |
28 | |
29 | namespace fuzzer { |
30 | static const char *kFunctionsTxt = "functions.txt" ; |
31 | |
32 | bool BlockCoverage::AppendCoverage(const std::string &S) { |
33 | std::stringstream SS(S); |
34 | return AppendCoverage(IN&: SS); |
35 | } |
36 | |
37 | // Coverage lines have this form: |
38 | // CN X Y Z T |
39 | // where N is the number of the function, T is the total number of instrumented |
40 | // BBs, and X,Y,Z, if present, are the indices of covered BB. |
41 | // BB #0, which is the entry block, is not explicitly listed. |
42 | bool BlockCoverage::AppendCoverage(std::istream &IN) { |
43 | std::string L; |
44 | while (std::getline(is&: IN, str&: L, dlm: '\n')) { |
45 | if (L.empty()) |
46 | continue; |
47 | std::stringstream SS(L.c_str() + 1); |
48 | size_t FunctionId = 0; |
49 | SS >> FunctionId; |
50 | if (L[0] == 'F') { |
51 | FunctionsWithDFT.insert(x: FunctionId); |
52 | continue; |
53 | } |
54 | if (L[0] != 'C') continue; |
55 | std::vector<uint32_t> CoveredBlocks; |
56 | while (true) { |
57 | uint32_t BB = 0; |
58 | SS >> BB; |
59 | if (!SS) break; |
60 | CoveredBlocks.push_back(x: BB); |
61 | } |
62 | if (CoveredBlocks.empty()) return false; |
63 | // Ensures no CoverageVector is longer than UINT32_MAX. |
64 | uint32_t NumBlocks = CoveredBlocks.back(); |
65 | CoveredBlocks.pop_back(); |
66 | for (auto BB : CoveredBlocks) |
67 | if (BB >= NumBlocks) return false; |
68 | auto It = Functions.find(k: FunctionId); |
69 | auto &Counters = |
70 | It == Functions.end() |
71 | ? Functions.insert(x: {FunctionId, std::vector<uint32_t>(NumBlocks)}) |
72 | .first->second |
73 | : It->second; |
74 | |
75 | if (Counters.size() != NumBlocks) return false; // wrong number of blocks. |
76 | |
77 | Counters[0]++; |
78 | for (auto BB : CoveredBlocks) |
79 | Counters[BB]++; |
80 | } |
81 | return true; |
82 | } |
83 | |
84 | // Assign weights to each function. |
85 | // General principles: |
86 | // * any uncovered function gets weight 0. |
87 | // * a function with lots of uncovered blocks gets bigger weight. |
88 | // * a function with a less frequently executed code gets bigger weight. |
89 | std::vector<double> BlockCoverage::FunctionWeights(size_t NumFunctions) const { |
90 | std::vector<double> Res(NumFunctions); |
91 | for (const auto &It : Functions) { |
92 | auto FunctionID = It.first; |
93 | auto Counters = It.second; |
94 | assert(FunctionID < NumFunctions); |
95 | auto &Weight = Res[FunctionID]; |
96 | // Give higher weight if the function has a DFT. |
97 | Weight = FunctionsWithDFT.count(k: FunctionID) ? 1000. : 1; |
98 | // Give higher weight to functions with less frequently seen basic blocks. |
99 | Weight /= SmallestNonZeroCounter(Counters); |
100 | // Give higher weight to functions with the most uncovered basic blocks. |
101 | Weight *= NumberOfUncoveredBlocks(Counters) + 1; |
102 | } |
103 | return Res; |
104 | } |
105 | |
106 | void DataFlowTrace::ReadCoverage(const std::string &DirPath) { |
107 | std::vector<SizedFile> Files; |
108 | GetSizedFilesFromDir(Dir: DirPath, V: &Files); |
109 | for (auto &SF : Files) { |
110 | auto Name = Basename(Path: SF.File); |
111 | if (Name == kFunctionsTxt) continue; |
112 | if (!CorporaHashes.count(k: Name)) continue; |
113 | std::ifstream IF(SF.File); |
114 | Coverage.AppendCoverage(IN&: IF); |
115 | } |
116 | } |
117 | |
118 | static void DFTStringAppendToVector(std::vector<uint8_t> *DFT, |
119 | const std::string &DFTString) { |
120 | assert(DFT->size() == DFTString.size()); |
121 | for (size_t I = 0, Len = DFT->size(); I < Len; I++) |
122 | (*DFT)[I] = DFTString[I] == '1'; |
123 | } |
124 | |
125 | // converts a string of '0' and '1' into a std::vector<uint8_t> |
126 | static std::vector<uint8_t> DFTStringToVector(const std::string &DFTString) { |
127 | std::vector<uint8_t> DFT(DFTString.size()); |
128 | DFTStringAppendToVector(DFT: &DFT, DFTString); |
129 | return DFT; |
130 | } |
131 | |
132 | static bool ParseError(const char *Err, const std::string &Line) { |
133 | Printf(Fmt: "DataFlowTrace: parse error: %s: Line: %s\n" , Err, Line.c_str()); |
134 | return false; |
135 | } |
136 | |
137 | // TODO(metzman): replace std::string with std::string_view for |
138 | // better performance. Need to figure our how to use string_view on Windows. |
139 | static bool ParseDFTLine(const std::string &Line, size_t *FunctionNum, |
140 | std::string *DFTString) { |
141 | if (!Line.empty() && Line[0] != 'F') |
142 | return false; // Ignore coverage. |
143 | size_t SpacePos = Line.find(c: ' '); |
144 | if (SpacePos == std::string::npos) |
145 | return ParseError(Err: "no space in the trace line" , Line); |
146 | if (Line.empty() || Line[0] != 'F') |
147 | return ParseError(Err: "the trace line doesn't start with 'F'" , Line); |
148 | *FunctionNum = std::atol(nptr: Line.c_str() + 1); |
149 | const char *Beg = Line.c_str() + SpacePos + 1; |
150 | const char *End = Line.c_str() + Line.size(); |
151 | assert(Beg < End); |
152 | size_t Len = End - Beg; |
153 | for (size_t I = 0; I < Len; I++) { |
154 | if (Beg[I] != '0' && Beg[I] != '1') |
155 | return ParseError(Err: "the trace should contain only 0 or 1" , Line); |
156 | } |
157 | *DFTString = Beg; |
158 | return true; |
159 | } |
160 | |
161 | bool DataFlowTrace::Init(const std::string &DirPath, std::string *FocusFunction, |
162 | std::vector<SizedFile> &CorporaFiles, Random &Rand) { |
163 | if (DirPath.empty()) return false; |
164 | Printf(Fmt: "INFO: DataFlowTrace: reading from '%s'\n" , DirPath.c_str()); |
165 | std::vector<SizedFile> Files; |
166 | GetSizedFilesFromDir(Dir: DirPath, V: &Files); |
167 | std::string L; |
168 | size_t FocusFuncIdx = SIZE_MAX; |
169 | std::vector<std::string> FunctionNames; |
170 | |
171 | // Collect the hashes of the corpus files. |
172 | for (auto &SF : CorporaFiles) |
173 | CorporaHashes.insert(x: Hash(U: FileToVector(Path: SF.File))); |
174 | |
175 | // Read functions.txt |
176 | std::ifstream IF(DirPlusFile(DirPath, FileName: kFunctionsTxt)); |
177 | size_t NumFunctions = 0; |
178 | while (std::getline(is&: IF, str&: L, dlm: '\n')) { |
179 | FunctionNames.push_back(x: L); |
180 | NumFunctions++; |
181 | if (*FocusFunction == L) |
182 | FocusFuncIdx = NumFunctions - 1; |
183 | } |
184 | if (!NumFunctions) |
185 | return false; |
186 | |
187 | if (*FocusFunction == "auto" ) { |
188 | // AUTOFOCUS works like this: |
189 | // * reads the coverage data from the DFT files. |
190 | // * assigns weights to functions based on coverage. |
191 | // * chooses a random function according to the weights. |
192 | ReadCoverage(DirPath); |
193 | auto Weights = Coverage.FunctionWeights(NumFunctions); |
194 | std::vector<double> Intervals(NumFunctions + 1); |
195 | std::iota(first: Intervals.begin(), last: Intervals.end(), value: 0); |
196 | auto Distribution = std::piecewise_constant_distribution<double>( |
197 | Intervals.begin(), Intervals.end(), Weights.begin()); |
198 | FocusFuncIdx = static_cast<size_t>(Distribution(Rand)); |
199 | *FocusFunction = FunctionNames[FocusFuncIdx]; |
200 | assert(FocusFuncIdx < NumFunctions); |
201 | Printf(Fmt: "INFO: AUTOFOCUS: %zd %s\n" , FocusFuncIdx, |
202 | FunctionNames[FocusFuncIdx].c_str()); |
203 | for (size_t i = 0; i < NumFunctions; i++) { |
204 | if (Weights[i] == 0.0) |
205 | continue; |
206 | Printf(Fmt: " [%zd] W %g\tBB-tot %u\tBB-cov %u\tEntryFreq %u:\t%s\n" , i, |
207 | Weights[i], Coverage.GetNumberOfBlocks(FunctionId: i), |
208 | Coverage.GetNumberOfCoveredBlocks(FunctionId: i), Coverage.GetCounter(FunctionId: i, BasicBlockId: 0), |
209 | FunctionNames[i].c_str()); |
210 | } |
211 | } |
212 | |
213 | if (!NumFunctions || FocusFuncIdx == SIZE_MAX || Files.size() <= 1) |
214 | return false; |
215 | |
216 | // Read traces. |
217 | size_t NumTraceFiles = 0; |
218 | size_t NumTracesWithFocusFunction = 0; |
219 | for (auto &SF : Files) { |
220 | auto Name = Basename(Path: SF.File); |
221 | if (Name == kFunctionsTxt) continue; |
222 | if (!CorporaHashes.count(k: Name)) continue; // not in the corpus. |
223 | NumTraceFiles++; |
224 | // Printf("=== %s\n", Name.c_str()); |
225 | std::ifstream IF(SF.File); |
226 | while (std::getline(is&: IF, str&: L, dlm: '\n')) { |
227 | size_t FunctionNum = 0; |
228 | std::string DFTString; |
229 | if (ParseDFTLine(Line: L, FunctionNum: &FunctionNum, DFTString: &DFTString) && |
230 | FunctionNum == FocusFuncIdx) { |
231 | NumTracesWithFocusFunction++; |
232 | |
233 | if (FunctionNum >= NumFunctions) |
234 | return ParseError(Err: "N is greater than the number of functions" , Line: L); |
235 | Traces[Name] = DFTStringToVector(DFTString); |
236 | // Print just a few small traces. |
237 | if (NumTracesWithFocusFunction <= 3 && DFTString.size() <= 16) |
238 | Printf(Fmt: "%s => |%s|\n" , Name.c_str(), std::string(DFTString).c_str()); |
239 | break; // No need to parse the following lines. |
240 | } |
241 | } |
242 | } |
243 | Printf(Fmt: "INFO: DataFlowTrace: %zd trace files, %zd functions, " |
244 | "%zd traces with focus function\n" , |
245 | NumTraceFiles, NumFunctions, NumTracesWithFocusFunction); |
246 | return NumTraceFiles > 0; |
247 | } |
248 | |
249 | int CollectDataFlow(const std::string &DFTBinary, const std::string &DirPath, |
250 | const std::vector<SizedFile> &CorporaFiles) { |
251 | Printf(Fmt: "INFO: collecting data flow: bin: %s dir: %s files: %zd\n" , |
252 | DFTBinary.c_str(), DirPath.c_str(), CorporaFiles.size()); |
253 | if (CorporaFiles.empty()) { |
254 | Printf(Fmt: "ERROR: can't collect data flow without corpus provided." ); |
255 | return 1; |
256 | } |
257 | |
258 | static char DFSanEnv[] = "DFSAN_OPTIONS=warn_unimplemented=0" ; |
259 | putenv(string: DFSanEnv); |
260 | MkDir(Path: DirPath); |
261 | for (auto &F : CorporaFiles) { |
262 | // For every input F we need to collect the data flow and the coverage. |
263 | // Data flow collection may fail if we request too many DFSan tags at once. |
264 | // So, we start from requesting all tags in range [0,Size) and if that fails |
265 | // we then request tags in [0,Size/2) and [Size/2, Size), and so on. |
266 | // Function number => DFT. |
267 | auto OutPath = DirPlusFile(DirPath, FileName: Hash(U: FileToVector(Path: F.File))); |
268 | std::unordered_map<size_t, std::vector<uint8_t>> DFTMap; |
269 | std::unordered_set<std::string> Cov; |
270 | Command Cmd; |
271 | Cmd.addArgument(Arg: DFTBinary); |
272 | Cmd.addArgument(Arg: F.File); |
273 | Cmd.addArgument(Arg: OutPath); |
274 | Printf(Fmt: "CMD: %s\n" , Cmd.toString().c_str()); |
275 | ExecuteCommand(Cmd); |
276 | } |
277 | // Write functions.txt if it's currently empty or doesn't exist. |
278 | auto FunctionsTxtPath = DirPlusFile(DirPath, FileName: kFunctionsTxt); |
279 | if (FileToString(Path: FunctionsTxtPath).empty()) { |
280 | Command Cmd; |
281 | Cmd.addArgument(Arg: DFTBinary); |
282 | Cmd.setOutputFile(FunctionsTxtPath); |
283 | ExecuteCommand(Cmd); |
284 | } |
285 | return 0; |
286 | } |
287 | |
288 | } // namespace fuzzer |
289 | |