1 | //===- MachOObject.h - Mach-O object file model -----------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef LLVM_LIB_OBJCOPY_MACHO_MACHOOBJECT_H |
10 | #define LLVM_LIB_OBJCOPY_MACHO_MACHOOBJECT_H |
11 | |
12 | #include "llvm/ADT/StringRef.h" |
13 | #include "llvm/BinaryFormat/MachO.h" |
14 | #include "llvm/MC/StringTableBuilder.h" |
15 | #include "llvm/ObjectYAML/DWARFYAML.h" |
16 | #include "llvm/Support/StringSaver.h" |
17 | #include "llvm/Support/YAMLTraits.h" |
18 | #include <cstdint> |
19 | #include <string> |
20 | #include <vector> |
21 | |
22 | namespace llvm { |
23 | namespace objcopy { |
24 | namespace macho { |
25 | |
/// In-memory copy of the Mach-O file header.
///
/// The struct name and the first seven field names were missing from this
/// block; they are restored here to follow the layout of the Mach-O header
/// (magic, cputype, cpusubtype, filetype, ncmds, sizeofcmds, flags, reserved).
/// Object::is64Bit() reads Magic to tell 32-bit and 64-bit files apart.
struct MachHeader {
  uint32_t Magic;
  uint32_t CPUType;
  uint32_t CPUSubType;
  uint32_t FileType;
  uint32_t NCmds;
  uint32_t SizeOfCmds;
  uint32_t Flags;
  // Defaults to zero so that a header which never sets it (presumably the
  // 32-bit layout, which has no such field -- confirm against the reader)
  // still holds a defined value.
  uint32_t Reserved = 0;
};
36 | |
37 | struct RelocationInfo; |
38 | struct Section { |
39 | uint32_t Index; |
40 | std::string Segname; |
41 | std::string Sectname; |
42 | // CanonicalName is a string formatted as “<Segname>,<Sectname>". |
43 | std::string CanonicalName; |
44 | uint64_t Addr = 0; |
45 | uint64_t Size = 0; |
46 | // Offset in the input file. |
47 | std::optional<uint32_t> OriginalOffset; |
48 | uint32_t Offset = 0; |
49 | uint32_t Align = 0; |
50 | uint32_t RelOff = 0; |
51 | uint32_t NReloc = 0; |
52 | uint32_t Flags = 0; |
53 | uint32_t Reserved1 = 0; |
54 | uint32_t Reserved2 = 0; |
55 | uint32_t Reserved3 = 0; |
56 | StringRef Content; |
57 | std::vector<RelocationInfo> Relocations; |
58 | |
59 | Section(StringRef SegName, StringRef SectName); |
60 | |
61 | Section(StringRef SegName, StringRef SectName, StringRef Content); |
62 | |
63 | MachO::SectionType getType() const { |
64 | return static_cast<MachO::SectionType>(Flags & MachO::SECTION_TYPE); |
65 | } |
66 | |
67 | bool isVirtualSection() const { |
68 | return (getType() == MachO::S_ZEROFILL || |
69 | getType() == MachO::S_GB_ZEROFILL || |
70 | getType() == MachO::S_THREAD_LOCAL_ZEROFILL); |
71 | } |
72 | |
73 | bool hasValidOffset() const { |
74 | return !(isVirtualSection() || (OriginalOffset && *OriginalOffset == 0)); |
75 | } |
76 | }; |
77 | |
78 | struct LoadCommand { |
79 | // The type MachO::macho_load_command is defined in llvm/BinaryFormat/MachO.h |
80 | // and it is a union of all the structs corresponding to various load |
81 | // commands. |
82 | MachO::macho_load_command MachOLoadCommand; |
83 | |
84 | // The raw content of the payload of the load command (located right after the |
85 | // corresponding struct). In some cases it is either empty or can be |
86 | // copied-over without digging into its structure. |
87 | std::vector<uint8_t> Payload; |
88 | |
89 | // Some load commands can contain (inside the payload) an array of sections, |
90 | // though the contents of the sections are stored separately. The struct |
91 | // Section describes only sections' metadata and where to find the |
92 | // corresponding content inside the binary. |
93 | std::vector<std::unique_ptr<Section>> Sections; |
94 | |
95 | // Returns the segment name if the load command is a segment command. |
96 | std::optional<StringRef> getSegmentName() const; |
97 | |
98 | // Returns the segment vm address if the load command is a segment command. |
99 | std::optional<uint64_t> getSegmentVMAddr() const; |
100 | }; |
101 | |
102 | // A symbol information. Fields which starts with "n_" are same as them in the |
103 | // nlist. |
104 | struct SymbolEntry { |
105 | std::string Name; |
106 | bool Referenced = false; |
107 | uint32_t Index; |
108 | uint8_t n_type; |
109 | uint8_t n_sect; |
110 | uint16_t n_desc; |
111 | uint64_t n_value; |
112 | |
113 | bool isExternalSymbol() const { return n_type & MachO::N_EXT; } |
114 | |
115 | bool isLocalSymbol() const { return !isExternalSymbol(); } |
116 | |
117 | bool isUndefinedSymbol() const { |
118 | return (n_type & MachO::N_TYPE) == MachO::N_UNDF; |
119 | } |
120 | |
121 | bool isSwiftSymbol() const { |
122 | return StringRef(Name).starts_with(Prefix: "_$s" ) || |
123 | StringRef(Name).starts_with(Prefix: "_$S" ); |
124 | } |
125 | |
126 | std::optional<uint32_t> section() const { |
127 | return n_sect == MachO::NO_SECT ? std::nullopt |
128 | : std::optional<uint32_t>(n_sect); |
129 | } |
130 | }; |
131 | |
132 | /// The location of the symbol table inside the binary is described by LC_SYMTAB |
133 | /// load command. |
134 | struct SymbolTable { |
135 | std::vector<std::unique_ptr<SymbolEntry>> Symbols; |
136 | |
137 | using iterator = pointee_iterator< |
138 | std::vector<std::unique_ptr<SymbolEntry>>::const_iterator>; |
139 | |
140 | iterator begin() const { return iterator(Symbols.begin()); } |
141 | iterator end() const { return iterator(Symbols.end()); } |
142 | |
143 | const SymbolEntry *getSymbolByIndex(uint32_t Index) const; |
144 | SymbolEntry *getSymbolByIndex(uint32_t Index); |
145 | void removeSymbols( |
146 | function_ref<bool(const std::unique_ptr<SymbolEntry> &)> ToRemove); |
147 | }; |
148 | |
149 | struct IndirectSymbolEntry { |
150 | // The original value in an indirect symbol table. Higher bits encode extra |
151 | // information (INDIRECT_SYMBOL_LOCAL and INDIRECT_SYMBOL_ABS). |
152 | uint32_t OriginalIndex; |
153 | /// The Symbol referenced by this entry. It's std::nullopt if the index is |
154 | /// INDIRECT_SYMBOL_LOCAL or INDIRECT_SYMBOL_ABS. |
155 | std::optional<SymbolEntry *> Symbol; |
156 | |
157 | IndirectSymbolEntry(uint32_t OriginalIndex, |
158 | std::optional<SymbolEntry *> Symbol) |
159 | : OriginalIndex(OriginalIndex), Symbol(Symbol) {} |
160 | }; |
161 | |
162 | struct IndirectSymbolTable { |
163 | std::vector<IndirectSymbolEntry> Symbols; |
164 | }; |
165 | |
/// The strings of the object's string table. Where the string table is located
/// inside the binary is described by the LC_SYMTAB load command.
struct StringTable {
  std::vector<std::string> Strings;
};
171 | |
172 | struct RelocationInfo { |
173 | // The referenced symbol entry. Set if !Scattered && Extern. |
174 | std::optional<const SymbolEntry *> Symbol; |
175 | // The referenced section. Set if !Scattered && !Extern. |
176 | std::optional<const Section *> Sec; |
177 | // True if Info is a scattered_relocation_info. |
178 | bool Scattered; |
179 | // True if the type is an ADDEND. r_symbolnum holds the addend instead of a |
180 | // symbol index. |
181 | bool IsAddend; |
182 | // True if the r_symbolnum points to a section number (i.e. r_extern=0). |
183 | bool Extern; |
184 | MachO::any_relocation_info Info; |
185 | |
186 | unsigned getPlainRelocationSymbolNum(bool IsLittleEndian) { |
187 | if (IsLittleEndian) |
188 | return Info.r_word1 & 0xffffff; |
189 | return Info.r_word1 >> 8; |
190 | } |
191 | |
192 | void setPlainRelocationSymbolNum(unsigned SymbolNum, bool IsLittleEndian) { |
193 | assert(SymbolNum < (1 << 24) && "SymbolNum out of range" ); |
194 | if (IsLittleEndian) |
195 | Info.r_word1 = (Info.r_word1 & ~0x00ffffff) | SymbolNum; |
196 | else |
197 | Info.r_word1 = (Info.r_word1 & ~0xffffff00) | (SymbolNum << 8); |
198 | } |
199 | }; |
200 | |
201 | /// The location of the rebase info inside the binary is described by |
202 | /// LC_DYLD_INFO load command. Dyld rebases an image whenever dyld loads it at |
203 | /// an address different from its preferred address. The rebase information is |
204 | /// a stream of byte sized opcodes whose symbolic names start with |
205 | /// REBASE_OPCODE_. Conceptually the rebase information is a table of tuples: |
206 | /// <seg-index, seg-offset, type> |
207 | /// The opcodes are a compressed way to encode the table by only |
208 | /// encoding when a column changes. In addition simple patterns |
209 | /// like "every n'th offset for m times" can be encoded in a few |
210 | /// bytes. |
211 | struct RebaseInfo { |
212 | // At the moment we do not parse this info (and it is simply copied over), |
213 | // but the proper support will be added later. |
214 | ArrayRef<uint8_t> Opcodes; |
215 | }; |
216 | |
217 | /// The location of the bind info inside the binary is described by |
218 | /// LC_DYLD_INFO load command. Dyld binds an image during the loading process, |
219 | /// if the image requires any pointers to be initialized to symbols in other |
220 | /// images. The bind information is a stream of byte sized opcodes whose |
221 | /// symbolic names start with BIND_OPCODE_. Conceptually the bind information is |
222 | /// a table of tuples: <seg-index, seg-offset, type, symbol-library-ordinal, |
223 | /// symbol-name, addend> The opcodes are a compressed way to encode the table by |
224 | /// only encoding when a column changes. In addition simple patterns like for |
225 | /// runs of pointers initialized to the same value can be encoded in a few |
226 | /// bytes. |
227 | struct BindInfo { |
228 | // At the moment we do not parse this info (and it is simply copied over), |
229 | // but the proper support will be added later. |
230 | ArrayRef<uint8_t> Opcodes; |
231 | }; |
232 | |
233 | /// The location of the weak bind info inside the binary is described by |
234 | /// LC_DYLD_INFO load command. Some C++ programs require dyld to unique symbols |
235 | /// so that all images in the process use the same copy of some code/data. This |
236 | /// step is done after binding. The content of the weak_bind info is an opcode |
237 | /// stream like the bind_info. But it is sorted alphabetically by symbol name. |
238 | /// This enable dyld to walk all images with weak binding information in order |
239 | /// and look for collisions. If there are no collisions, dyld does no updating. |
240 | /// That means that some fixups are also encoded in the bind_info. For |
241 | /// instance, all calls to "operator new" are first bound to libstdc++.dylib |
242 | /// using the information in bind_info. Then if some image overrides operator |
243 | /// new that is detected when the weak_bind information is processed and the |
244 | /// call to operator new is then rebound. |
245 | struct WeakBindInfo { |
246 | // At the moment we do not parse this info (and it is simply copied over), |
247 | // but the proper support will be added later. |
248 | ArrayRef<uint8_t> Opcodes; |
249 | }; |
250 | |
251 | /// The location of the lazy bind info inside the binary is described by |
252 | /// LC_DYLD_INFO load command. Some uses of external symbols do not need to be |
253 | /// bound immediately. Instead they can be lazily bound on first use. The |
254 | /// lazy_bind contains a stream of BIND opcodes to bind all lazy symbols. Normal |
255 | /// use is that dyld ignores the lazy_bind section when loading an image. |
256 | /// Instead the static linker arranged for the lazy pointer to initially point |
257 | /// to a helper function which pushes the offset into the lazy_bind area for the |
258 | /// symbol needing to be bound, then jumps to dyld which simply adds the offset |
259 | /// to lazy_bind_off to get the information on what to bind. |
260 | struct LazyBindInfo { |
261 | ArrayRef<uint8_t> Opcodes; |
262 | }; |
263 | |
264 | /// The location of the export info inside the binary is described by |
265 | /// LC_DYLD_INFO load command. The symbols exported by a dylib are encoded in a |
266 | /// trie. This is a compact representation that factors out common prefixes. It |
267 | /// also reduces LINKEDIT pages in RAM because it encodes all information (name, |
268 | /// address, flags) in one small, contiguous range. The export area is a stream |
269 | /// of nodes. The first node sequentially is the start node for the trie. Nodes |
270 | /// for a symbol start with a uleb128 that is the length of the exported symbol |
271 | /// information for the string so far. If there is no exported symbol, the node |
272 | /// starts with a zero byte. If there is exported info, it follows the length. |
273 | /// First is a uleb128 containing flags. Normally, it is followed by |
274 | /// a uleb128 encoded offset which is location of the content named |
275 | /// by the symbol from the mach_header for the image. If the flags |
276 | /// is EXPORT_SYMBOL_FLAGS_REEXPORT, then following the flags is |
277 | /// a uleb128 encoded library ordinal, then a zero terminated |
278 | /// UTF8 string. If the string is zero length, then the symbol |
279 | /// is re-export from the specified dylib with the same name. |
280 | /// If the flags is EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER, then following |
281 | /// the flags is two uleb128s: the stub offset and the resolver offset. |
282 | /// The stub is used by non-lazy pointers. The resolver is used |
283 | /// by lazy pointers and must be called to get the actual address to use. |
284 | /// After the optional exported symbol information is a byte of |
285 | /// how many edges (0-255) that this node has leaving it, |
286 | /// followed by each edge. |
287 | /// Each edge is a zero terminated UTF8 of the addition chars |
288 | /// in the symbol, followed by a uleb128 offset for the node that |
289 | /// edge points to. |
290 | struct ExportInfo { |
291 | ArrayRef<uint8_t> Trie; |
292 | }; |
293 | |
294 | struct LinkData { |
295 | ArrayRef<uint8_t> Data; |
296 | }; |
297 | |
298 | struct Object { |
299 | MachHeader ; |
300 | std::vector<LoadCommand> LoadCommands; |
301 | |
302 | SymbolTable SymTable; |
303 | StringTable StrTable; |
304 | |
305 | RebaseInfo Rebases; |
306 | BindInfo Binds; |
307 | WeakBindInfo WeakBinds; |
308 | LazyBindInfo LazyBinds; |
309 | ExportInfo Exports; |
310 | IndirectSymbolTable IndirectSymTable; |
311 | LinkData DataInCode; |
312 | LinkData LinkerOptimizationHint; |
313 | LinkData FunctionStarts; |
314 | LinkData ExportsTrie; |
315 | LinkData ChainedFixups; |
316 | LinkData DylibCodeSignDRs; |
317 | |
318 | std::optional<uint32_t> SwiftVersion; |
319 | |
320 | /// The index of LC_CODE_SIGNATURE load command if present. |
321 | std::optional<size_t> CodeSignatureCommandIndex; |
322 | /// The index of LC_DYLIB_CODE_SIGN_DRS load command if present. |
323 | std::optional<size_t> DylibCodeSignDRsIndex; |
324 | /// The index of LC_SYMTAB load command if present. |
325 | std::optional<size_t> SymTabCommandIndex; |
326 | /// The index of LC_DYLD_INFO or LC_DYLD_INFO_ONLY load command if present. |
327 | std::optional<size_t> DyLdInfoCommandIndex; |
328 | /// The index LC_DYSYMTAB load command if present. |
329 | std::optional<size_t> DySymTabCommandIndex; |
330 | /// The index LC_DATA_IN_CODE load command if present. |
331 | std::optional<size_t> DataInCodeCommandIndex; |
332 | /// The index of LC_LINKER_OPTIMIZATIN_HINT load command if present. |
333 | std::optional<size_t> LinkerOptimizationHintCommandIndex; |
334 | /// The index LC_FUNCTION_STARTS load command if present. |
335 | std::optional<size_t> FunctionStartsCommandIndex; |
336 | /// The index LC_DYLD_CHAINED_FIXUPS load command if present. |
337 | std::optional<size_t> ChainedFixupsCommandIndex; |
338 | /// The index LC_DYLD_EXPORTS_TRIE load command if present. |
339 | std::optional<size_t> ExportsTrieCommandIndex; |
340 | /// The index of the LC_SEGMENT or LC_SEGMENT_64 load command |
341 | /// corresponding to the __TEXT segment. |
342 | std::optional<size_t> TextSegmentCommandIndex; |
343 | |
344 | BumpPtrAllocator Alloc; |
345 | StringSaver NewSectionsContents; |
346 | |
347 | Object() : NewSectionsContents(Alloc) {} |
348 | |
349 | Error |
350 | removeSections(function_ref<bool(const std::unique_ptr<Section> &)> ToRemove); |
351 | |
352 | Error removeLoadCommands(function_ref<bool(const LoadCommand &)> ToRemove); |
353 | |
354 | void updateLoadCommandIndexes(); |
355 | |
356 | /// Creates a new segment load command in the object and returns a reference |
357 | /// to the newly created load command. The caller should verify that SegName |
358 | /// is not too long (SegName.size() should be less than or equal to 16). |
359 | LoadCommand &addSegment(StringRef SegName, uint64_t SegVMSize); |
360 | |
361 | bool is64Bit() const { |
362 | return Header.Magic == MachO::MH_MAGIC_64 || |
363 | Header.Magic == MachO::MH_CIGAM_64; |
364 | } |
365 | |
366 | uint64_t nextAvailableSegmentAddress() const; |
367 | }; |
368 | |
369 | } // end namespace macho |
370 | } // end namespace objcopy |
371 | } // end namespace llvm |
372 | |
373 | #endif // LLVM_LIB_OBJCOPY_MACHO_MACHOOBJECT_H |
374 | |