| 1 | //===- ExportTrie.cpp -----------------------------------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This is a partial implementation of the Mach-O export trie format. It's |
| 10 | // essentially a symbol table encoded as a compressed prefix trie, meaning that |
| 11 | // the common prefixes of each symbol name are shared for a more compact |
| 12 | // representation. The prefixes are stored on the edges of the trie, and one |
| 13 | // edge can represent multiple characters. For example, given two exported |
| 14 | // symbols _bar and _baz, we will have a trie like this (terminal nodes are |
| 15 | // marked with an asterisk): |
| 16 | // |
| 17 | // +-+-+ |
| 18 | // | | // root node |
| 19 | // +-+-+ |
| 20 | // | |
| 21 | // | _ba |
| 22 | // | |
| 23 | // +-+-+ |
| 24 | // | | |
| 25 | // +-+-+ |
| 26 | // r / \ z |
| 27 | // / \ |
| 28 | // +-+-+ +-+-+ |
| 29 | // | * | | * | |
| 30 | // +-+-+ +-+-+ |
| 31 | // |
| 32 | // More documentation of the format can be found in |
| 33 | // llvm/tools/obj2yaml/macho2yaml.cpp. |
| 34 | // |
| 35 | //===----------------------------------------------------------------------===// |
| 36 | |
| 37 | #include "ExportTrie.h" |
| 38 | #include "Symbols.h" |
| 39 | |
| 40 | #include "lld/Common/ErrorHandler.h" |
| 41 | #include "llvm/BinaryFormat/MachO.h" |
| 42 | #include "llvm/Support/LEB128.h" |
| 43 | #include <optional> |
| 44 | |
| 45 | using namespace llvm; |
| 46 | using namespace lld; |
| 47 | using namespace lld::macho; |
| 48 | |
| 49 | namespace { |
| 50 | |
| 51 | struct Edge { |
| 52 | Edge(StringRef s, TrieNode *node) : substring(s), child(node) {} |
| 53 | |
| 54 | StringRef substring; |
| 55 | struct TrieNode *child; |
| 56 | }; |
| 57 | |
| 58 | struct ExportInfo { |
| 59 | uint64_t address; |
| 60 | uint64_t ordinal = 0; |
| 61 | uint8_t flags = 0; |
| 62 | ExportInfo(const Symbol &sym, uint64_t imageBase) |
| 63 | : address(sym.getVA() - imageBase) { |
| 64 | using namespace llvm::MachO; |
| 65 | if (sym.isWeakDef()) |
| 66 | flags |= EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION; |
| 67 | if (sym.isTlv()) |
| 68 | flags |= EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL; |
| 69 | // TODO: Add proper support for stub-and-resolver flags. |
| 70 | |
| 71 | if (auto *defined = dyn_cast<Defined>(Val: &sym)) { |
| 72 | if (defined->isAbsolute()) |
| 73 | flags |= EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE; |
| 74 | } else if (auto *dysym = dyn_cast<DylibSymbol>(Val: &sym)) { |
| 75 | flags |= EXPORT_SYMBOL_FLAGS_REEXPORT; |
| 76 | if (!dysym->isDynamicLookup()) |
| 77 | ordinal = dysym->getFile()->ordinal; |
| 78 | } |
| 79 | } |
| 80 | }; |
| 81 | |
| 82 | } // namespace |
| 83 | |
| 84 | struct macho::TrieNode { |
| 85 | std::vector<Edge> edges; |
| 86 | std::optional<ExportInfo> info; |
| 87 | // Estimated offset from the start of the serialized trie to the current node. |
| 88 | // This will converge to the true offset when updateOffset() is run to a |
| 89 | // fixpoint. |
| 90 | size_t offset = 0; |
| 91 | |
| 92 | uint32_t getTerminalSize() const; |
| 93 | // Returns whether the new estimated offset differs from the old one. |
| 94 | bool updateOffset(size_t &nextOffset); |
| 95 | void writeTo(uint8_t *buf) const; |
| 96 | }; |
| 97 | |
| 98 | // For regular symbols, the node layout (excluding the children) is |
| 99 | // |
| 100 | // uleb128 terminalSize; |
| 101 | // uleb128 flags; |
| 102 | // uleb128 address; |
| 103 | // |
| 104 | // For re-exported symbols, the layout is |
| 105 | // |
| 106 | // uleb128 terminalSize; |
| 107 | // uleb128 flags; |
| 108 | // uleb128 ordinal; |
| 109 | // char[] originalName; |
| 110 | // |
| 111 | // If libfoo.dylib is linked against libbar.dylib, and libfoo exports an alias |
| 112 | // _foo to a symbol _bar in libbar, then originalName will be "_bar". If libfoo |
| 113 | // re-exports _bar directly (i.e. not via an alias), then originalName will be |
| 114 | // the empty string. |
| 115 | // |
| 116 | // TODO: Support aliased re-exports. (Since we don't yet support these, |
| 117 | // originalName will always be the empty string.) |
| 118 | // |
| 119 | // For stub-and-resolver nodes, the layout is |
| 120 | // |
| 121 | // uleb128 terminalSize; |
| 122 | // uleb128 flags; |
| 123 | // uleb128 stubAddress; |
| 124 | // uleb128 resolverAddress; |
| 125 | // |
| 126 | // TODO: Support stub-and-resolver nodes. |
| 127 | uint32_t TrieNode::getTerminalSize() const { |
| 128 | uint32_t size = getULEB128Size(Value: info->flags); |
| 129 | if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT) |
| 130 | size += getULEB128Size(Value: info->ordinal) + 1; // + 1 for the null-terminator |
| 131 | else |
| 132 | size += getULEB128Size(Value: info->address); |
| 133 | return size; |
| 134 | } |
| 135 | |
| 136 | bool TrieNode::updateOffset(size_t &nextOffset) { |
| 137 | // Size of the whole node (including the terminalSize and the outgoing edges.) |
| 138 | // In contrast, terminalSize only records the size of the other data in the |
| 139 | // node. |
| 140 | size_t nodeSize; |
| 141 | if (info) { |
| 142 | uint32_t terminalSize = getTerminalSize(); |
| 143 | // Overall node size so far is the uleb128 size of the length of the symbol |
| 144 | // info + the symbol info itself. |
| 145 | nodeSize = terminalSize + getULEB128Size(Value: terminalSize); |
| 146 | } else { |
| 147 | nodeSize = 1; // Size of terminalSize (which has a value of 0) |
| 148 | } |
| 149 | // Compute size of all child edges. |
| 150 | ++nodeSize; // Byte for number of children. |
| 151 | for (const Edge &edge : edges) { |
| 152 | nodeSize += edge.substring.size() + 1 // String length. |
| 153 | + getULEB128Size(Value: edge.child->offset); // Offset len. |
| 154 | } |
| 155 | // On input, 'nextOffset' is the new preferred location for this node. |
| 156 | bool result = (offset != nextOffset); |
| 157 | // Store new location in node object for use by parents. |
| 158 | offset = nextOffset; |
| 159 | nextOffset += nodeSize; |
| 160 | return result; |
| 161 | } |
| 162 | |
| 163 | void TrieNode::writeTo(uint8_t *buf) const { |
| 164 | buf += offset; |
| 165 | if (info) { |
| 166 | uint32_t terminalSize = getTerminalSize(); |
| 167 | buf += encodeULEB128(Value: terminalSize, p: buf); |
| 168 | buf += encodeULEB128(Value: info->flags, p: buf); |
| 169 | if (info->flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT) { |
| 170 | buf += encodeULEB128(Value: info->ordinal, p: buf); |
| 171 | *buf++ = 0; // empty originalName string |
| 172 | } else { |
| 173 | buf += encodeULEB128(Value: info->address, p: buf); |
| 174 | } |
| 175 | } else { |
| 176 | // TrieNode with no Symbol info. |
| 177 | *buf++ = 0; // terminalSize |
| 178 | } |
| 179 | // Add number of children. TODO: Handle case where we have more than 256. |
| 180 | assert(edges.size() < 256); |
| 181 | *buf++ = edges.size(); |
| 182 | // Append each child edge substring and node offset. |
| 183 | for (const Edge &edge : edges) { |
| 184 | memcpy(dest: buf, src: edge.substring.data(), n: edge.substring.size()); |
| 185 | buf += edge.substring.size(); |
| 186 | *buf++ = '\0'; |
| 187 | buf += encodeULEB128(Value: edge.child->offset, p: buf); |
| 188 | } |
| 189 | } |
| 190 | |
| 191 | TrieBuilder::~TrieBuilder() { |
| 192 | for (TrieNode *node : nodes) |
| 193 | delete node; |
| 194 | } |
| 195 | |
| 196 | TrieNode *TrieBuilder::makeNode() { |
| 197 | auto *node = new TrieNode(); |
| 198 | nodes.emplace_back(args&: node); |
| 199 | return node; |
| 200 | } |
| 201 | |
| 202 | static int charAt(const Symbol *sym, size_t pos) { |
| 203 | StringRef str = sym->getName(); |
| 204 | if (pos >= str.size()) |
| 205 | return -1; |
| 206 | return str[pos]; |
| 207 | } |
| 208 | |
| 209 | // Build the trie by performing a three-way radix quicksort: We start by sorting |
| 210 | // the strings by their first characters, then sort the strings with the same |
| 211 | // first characters by their second characters, and so on recursively. Each |
| 212 | // time the prefixes diverge, we add a node to the trie. |
| 213 | // |
| 214 | // node: The most recently created node along this path in the trie (i.e. |
| 215 | // the furthest from the root.) |
| 216 | // lastPos: The prefix length of the most recently created node, i.e. the number |
| 217 | // of characters along its path from the root. |
| 218 | // pos: The string index we are currently sorting on. Note that each symbol |
| 219 | // S contained in vec has the same prefix S[0...pos). |
| 220 | void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec, |
| 221 | TrieNode *node, size_t lastPos, size_t pos) { |
| 222 | tailcall: |
| 223 | if (vec.empty()) |
| 224 | return; |
| 225 | |
| 226 | // Partition items so that items in [0, i) are less than the pivot, |
| 227 | // [i, j) are the same as the pivot, and [j, vec.size()) are greater than |
| 228 | // the pivot. |
| 229 | const Symbol *pivotSymbol = vec[vec.size() / 2]; |
| 230 | int pivot = charAt(sym: pivotSymbol, pos); |
| 231 | size_t i = 0; |
| 232 | size_t j = vec.size(); |
| 233 | for (size_t k = 0; k < j;) { |
| 234 | int c = charAt(sym: vec[k], pos); |
| 235 | if (c < pivot) |
| 236 | std::swap(a&: vec[i++], b&: vec[k++]); |
| 237 | else if (c > pivot) |
| 238 | std::swap(a&: vec[--j], b&: vec[k]); |
| 239 | else |
| 240 | k++; |
| 241 | } |
| 242 | |
| 243 | bool isTerminal = pivot == -1; |
| 244 | bool prefixesDiverge = i != 0 || j != vec.size(); |
| 245 | if (lastPos != pos && (isTerminal || prefixesDiverge)) { |
| 246 | TrieNode *newNode = makeNode(); |
| 247 | node->edges.emplace_back(args: pivotSymbol->getName().slice(Start: lastPos, End: pos), |
| 248 | args&: newNode); |
| 249 | node = newNode; |
| 250 | lastPos = pos; |
| 251 | } |
| 252 | |
| 253 | sortAndBuild(vec: vec.slice(N: 0, M: i), node, lastPos, pos); |
| 254 | sortAndBuild(vec: vec.slice(N: j), node, lastPos, pos); |
| 255 | |
| 256 | if (isTerminal) { |
| 257 | assert(j - i == 1); // no duplicate symbols |
| 258 | node->info = ExportInfo(*pivotSymbol, imageBase); |
| 259 | } else { |
| 260 | // This is the tail-call-optimized version of the following: |
| 261 | // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1); |
| 262 | vec = vec.slice(N: i, M: j - i); |
| 263 | ++pos; |
| 264 | goto tailcall; |
| 265 | } |
| 266 | } |
| 267 | |
| 268 | size_t TrieBuilder::build() { |
| 269 | if (exported.empty()) |
| 270 | return 0; |
| 271 | |
| 272 | TrieNode *root = makeNode(); |
| 273 | sortAndBuild(vec: exported, node: root, lastPos: 0, pos: 0); |
| 274 | |
| 275 | // Assign each node in the vector an offset in the trie stream, iterating |
| 276 | // until all uleb128 sizes have stabilized. |
| 277 | size_t offset; |
| 278 | bool more; |
| 279 | do { |
| 280 | offset = 0; |
| 281 | more = false; |
| 282 | for (TrieNode *node : nodes) |
| 283 | more |= node->updateOffset(nextOffset&: offset); |
| 284 | } while (more); |
| 285 | |
| 286 | return offset; |
| 287 | } |
| 288 | |
| 289 | void TrieBuilder::writeTo(uint8_t *buf) const { |
| 290 | for (TrieNode *node : nodes) |
| 291 | node->writeTo(buf); |
| 292 | } |
| 293 | |
| 294 | namespace { |
| 295 | |
| 296 | // Parse a serialized trie and invoke a callback for each entry. |
| 297 | class TrieParser { |
| 298 | public: |
| 299 | TrieParser(const uint8_t *buf, size_t size, const TrieEntryCallback &callback) |
| 300 | : start(buf), end(start + size), callback(callback) {} |
| 301 | |
| 302 | void parse(const uint8_t *buf, const Twine &cumulativeString); |
| 303 | |
| 304 | void parse() { parse(buf: start, cumulativeString: "" ); } |
| 305 | |
| 306 | const uint8_t *start; |
| 307 | const uint8_t *end; |
| 308 | const TrieEntryCallback &callback; |
| 309 | }; |
| 310 | |
| 311 | } // namespace |
| 312 | |
| 313 | void TrieParser::parse(const uint8_t *buf, const Twine &cumulativeString) { |
| 314 | if (buf >= end) |
| 315 | fatal(msg: "Node offset points outside export section" ); |
| 316 | |
| 317 | unsigned ulebSize; |
| 318 | uint64_t terminalSize = decodeULEB128(p: buf, n: &ulebSize); |
| 319 | buf += ulebSize; |
| 320 | uint64_t flags = 0; |
| 321 | size_t offset; |
| 322 | if (terminalSize != 0) { |
| 323 | flags = decodeULEB128(p: buf, n: &ulebSize); |
| 324 | callback(cumulativeString, flags); |
| 325 | } |
| 326 | buf += terminalSize; |
| 327 | uint8_t numEdges = *buf++; |
| 328 | for (uint8_t i = 0; i < numEdges; ++i) { |
| 329 | const char *cbuf = reinterpret_cast<const char *>(buf); |
| 330 | StringRef substring = StringRef(cbuf, strnlen(string: cbuf, maxlen: end - buf)); |
| 331 | buf += substring.size() + 1; |
| 332 | offset = decodeULEB128(p: buf, n: &ulebSize); |
| 333 | buf += ulebSize; |
| 334 | parse(buf: start + offset, cumulativeString: cumulativeString + substring); |
| 335 | } |
| 336 | } |
| 337 | |
| 338 | void macho::parseTrie(const uint8_t *buf, size_t size, |
| 339 | const TrieEntryCallback &callback) { |
| 340 | if (size == 0) |
| 341 | return; |
| 342 | |
| 343 | TrieParser(buf, size, callback).parse(); |
| 344 | } |
| 345 | |