| 1 | //===- llvm/CAS/ObjectStore.h -----------------------------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// |
| 9 | /// \file |
| 10 | /// This file contains the declaration of the ObjectStore class. |
| 11 | /// |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #ifndef LLVM_CAS_OBJECTSTORE_H |
| 15 | #define LLVM_CAS_OBJECTSTORE_H |
| 16 | |
| 17 | #include "llvm/ADT/StringRef.h" |
| 18 | #include "llvm/CAS/CASID.h" |
| 19 | #include "llvm/CAS/CASReference.h" |
| 20 | #include "llvm/Support/Error.h" |
| 21 | #include "llvm/Support/FileSystem.h" |
| 22 | #include <cstddef> |
| 23 | |
| 24 | namespace llvm { |
| 25 | |
| 26 | class MemoryBuffer; |
| 27 | template <typename T> class unique_function; |
| 28 | |
| 29 | namespace cas { |
| 30 | |
| 31 | class ObjectStore; |
| 32 | class ObjectProxy; |
| 33 | |
| 34 | /// Content-addressable storage for objects. |
| 35 | /// |
| 36 | /// Conceptually, objects are stored in a "unique set". |
| 37 | /// |
| 38 | /// - Objects are immutable ("value objects") that are defined by their |
| 39 | /// content. They are implicitly deduplicated by content. |
| 40 | /// - Each object has a unique identifier (UID) that's derived from its content, |
| 41 | /// called a \a CASID. |
| 42 | /// - This UID is a fixed-size (strong) hash of the transitive content of a |
| 43 | /// CAS object. |
| 44 | /// - It's comparable between any two CAS instances that have the same \a |
| 45 | /// CASIDContext::getHashSchemaIdentifier(). |
| 46 | /// - The UID can be printed (e.g., \a CASID::toString()) and it can be parsed |
| 47 | /// by the same or a different CAS instance with \a |
| 48 | /// ObjectStore::parseID(). |
| 49 | /// - An object can be looked up by content or by UID. |
| 50 | /// - \a store() is a "get-or-create" method, writing an object if it |
| 51 | /// doesn't exist yet, and returning a ref to it in any case. |
| 52 | /// - \a loadObject(const CASID&) looks up an object by its UID. |
| 53 | /// - Objects can reference other objects, forming an arbitrary DAG. |
| 54 | /// |
| 55 | /// The \a ObjectStore interface has a few ways of referencing objects: |
| 56 | /// |
| 57 | /// - \a ObjectRef encapsulates a reference to something in the CAS. It is an |
| 58 | /// opaque type that references an object inside a specific CAS. It is |
| 59 | /// implementation defined if the underlying object exists or not for an |
| 60 | /// ObjectRef, and it can be used to speed up CAS lookup as an implementation |
| 61 | /// detail. However, you don't know anything about the underlying objects. |
| 62 | /// "Loading" the object is a separate step that may not have happened |
| 63 | /// yet, and which can fail (e.g. due to filesystem corruption) or introduce |
| 64 | /// latency (if downloading from a remote store). |
| 65 | /// - \a ObjectHandle encapsulates a *loaded* object in the CAS. You need one of |
| 66 | /// these to inspect the content of an object: to look at its stored |
| 67 | /// data and references. This is internal to the CAS implementation and not |
| 68 | /// available from the CAS public APIs. |
| 69 | /// - \a CASID: the UID for an object in the CAS, obtained through \a |
| 70 | /// ObjectStore::getID() or \a ObjectStore::parseID(). This is a valid CAS |
| 71 | /// identifier, but may reference an object that is unknown to this CAS |
| 72 | /// instance. |
| 73 | /// - \a ObjectProxy pairs an ObjectHandle (subclass) with an ObjectStore, and |
| 74 | /// wraps access APIs to avoid having to pass extra parameters. It is the |
| 75 | /// object used for accessing underlying data and refs by CAS users. |
| 76 | /// |
| 77 | /// Both ObjectRef and ObjectHandle are lightweight, wrapping a `uint64_t` and |
| 78 | /// are only valid with the associated ObjectStore instance. |
| 79 | /// |
| 80 | /// There are a few options for accessing content of objects, with different |
| 81 | /// lifetime tradeoffs: |
| 82 | /// |
| 83 | /// - \a getData() accesses data without exposing lifetime at all. |
| 84 | /// - \a getMemoryBuffer() returns a \a MemoryBuffer whose lifetime |
| 85 | /// is independent of the CAS (it can live longer). |
| 86 | /// - \a getDataString() returns a StringRef whose lifetime is guaranteed to last |
| 87 | /// as long as the \a ObjectStore. |
| 88 | /// - \a readRef() and \a forEachRef() iterate through the references in an |
| 89 | /// object. There is no lifetime assumption. |
| 90 | class ObjectStore { |
| 91 | friend class ObjectProxy; |
| 92 | void anchor(); |
| 93 | |
| 94 | public: |
| 95 | /// Get a \p CASID from a \p ID, which should have been generated by \a |
| 96 | /// CASID::print(). This succeeds as long as \a validateID() would pass. The |
| 97 | /// object may be unknown to this CAS instance. |
| 98 | /// |
| 99 | /// TODO: Remove, and update callers to use \a validateID() or \a |
| 100 | /// extractHashFromID(). |
| 101 | virtual Expected<CASID> parseID(StringRef ID) = 0; |
| 102 | |
| 103 | /// Store object into ObjectStore. |
| 104 | virtual Expected<ObjectRef> store(ArrayRef<ObjectRef> Refs, |
| 105 | ArrayRef<char> Data) = 0; |
| 106 | /// Get an ID for \p Ref. |
| 107 | virtual CASID getID(ObjectRef Ref) const = 0; |
| 108 | |
| 109 | /// Stores the data of a file into ObjectStore. |
| 110 | /// |
| 111 | /// An underlying implementation could perform optimizations that reduce I/O |
| 112 | /// and disk space consumption. |
| 113 | /// |
| 114 | /// If there are any concurrent modifications to the file, the contents in the |
| 115 | /// CAS may be corrupt. |
| 116 | /// |
| 117 | /// \param FilePath the path of the file data. |
| 118 | virtual Expected<ObjectRef> storeFromFile(StringRef Path); |
| 119 | |
| 120 | /// Exports the data of an object to a file path. It does not include any |
| 121 | /// references of the object. |
| 122 | /// |
| 123 | /// An underlying implementation could perform optimizations that reduce I/O |
| 124 | /// and disk space consumption. |
| 125 | /// |
| 126 | /// \param Node the object to read data from. |
| 127 | /// \param FilePath the path of the file data. |
| 128 | virtual Error exportDataToFile(ObjectHandle Node, StringRef Path) const; |
| 129 | |
| 130 | /// Get an existing reference to the object called \p ID. |
| 131 | /// |
| 132 | /// Returns \c None if the object is not stored in this CAS. |
| 133 | virtual std::optional<ObjectRef> getReference(const CASID &ID) const = 0; |
| 134 | |
| 135 | /// \returns true if the object is directly available from the local CAS, for |
| 136 | /// implementations that have this kind of distinction. |
| 137 | virtual Expected<bool> isMaterialized(ObjectRef Ref) const = 0; |
| 138 | |
| 139 | /// Validate the underlying object referred by CASID. |
| 140 | virtual Error validateObject(const CASID &ID) = 0; |
| 141 | |
| 142 | /// Validate the entire ObjectStore. |
| 143 | virtual Error validate(bool CheckHash) const = 0; |
| 144 | |
| 145 | protected: |
| 146 | /// Load the object referenced by \p Ref. |
| 147 | /// |
| 148 | /// Errors if the object cannot be loaded. |
| 149 | /// \returns \c std::nullopt if the object is missing from the CAS. |
| 150 | virtual Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) = 0; |
| 151 | |
| 152 | /// Like \c loadIfExists but returns an error if the object is missing. |
| 153 | Expected<ObjectHandle> load(ObjectRef Ref); |
| 154 | |
| 155 | /// Get the size of some data. |
| 156 | virtual uint64_t getDataSize(ObjectHandle Node) const = 0; |
| 157 | |
| 158 | /// Methods for handling objects. CAS implementations need to override to |
| 159 | /// provide functions to access stored CAS objects and references. |
| 160 | virtual Error forEachRef(ObjectHandle Node, |
| 161 | function_ref<Error(ObjectRef)> Callback) const = 0; |
| 162 | virtual ObjectRef readRef(ObjectHandle Node, size_t I) const = 0; |
| 163 | virtual size_t getNumRefs(ObjectHandle Node) const = 0; |
| 164 | virtual ArrayRef<char> getData(ObjectHandle Node, |
| 165 | bool RequiresNullTerminator = false) const = 0; |
| 166 | |
| 167 | /// Get ObjectRef from open file. |
| 168 | virtual Expected<ObjectRef> |
| 169 | storeFromOpenFileImpl(sys::fs::file_t FD, |
| 170 | std::optional<sys::fs::file_status> Status); |
| 171 | |
| 172 | /// Get a lifetime-extended StringRef pointing at \p Data. |
| 173 | /// |
| 174 | /// Depending on the CAS implementation, this may involve in-memory storage |
| 175 | /// overhead. |
| 176 | StringRef getDataString(ObjectHandle Node) { |
| 177 | return toStringRef(Input: getData(Node)); |
| 178 | } |
| 179 | |
| 180 | /// Get a lifetime-extended MemoryBuffer pointing at \p Data. |
| 181 | /// |
| 182 | /// Depending on the CAS implementation, this may involve in-memory storage |
| 183 | /// overhead. |
| 184 | std::unique_ptr<MemoryBuffer> |
| 185 | getMemoryBuffer(ObjectHandle Node, StringRef Name = "" , |
| 186 | bool RequiresNullTerminator = true); |
| 187 | |
| 188 | /// Read all the refs from object in a SmallVector. |
| 189 | virtual void readRefs(ObjectHandle Node, |
| 190 | SmallVectorImpl<ObjectRef> &Refs) const; |
| 191 | |
| 192 | /// Allow ObjectStore implementations to create internal handles. |
| 193 | #define MAKE_CAS_HANDLE_CONSTRUCTOR(HandleKind) \ |
| 194 | HandleKind make##HandleKind(uint64_t InternalRef) const { \ |
| 195 | return HandleKind(*this, InternalRef); \ |
| 196 | } |
| 197 | MAKE_CAS_HANDLE_CONSTRUCTOR(ObjectHandle) |
| 198 | MAKE_CAS_HANDLE_CONSTRUCTOR(ObjectRef) |
| 199 | #undef MAKE_CAS_HANDLE_CONSTRUCTOR |
| 200 | |
| 201 | public: |
| 202 | /// Helper functions to store object and returns a ObjectProxy. |
| 203 | LLVM_ABI_FOR_TEST Expected<ObjectProxy> createProxy(ArrayRef<ObjectRef> Refs, |
| 204 | StringRef Data); |
| 205 | |
| 206 | /// Store object from StringRef. |
| 207 | Expected<ObjectRef> storeFromString(ArrayRef<ObjectRef> Refs, |
| 208 | StringRef String) { |
| 209 | return store(Refs, Data: arrayRefFromStringRef<char>(Input: String)); |
| 210 | } |
| 211 | |
| 212 | /// Default implementation reads \p FD and calls \a storeNode(). Does not |
| 213 | /// take ownership of \p FD; the caller is responsible for closing it. |
| 214 | /// |
| 215 | /// If \p Status is sent in it is to be treated as a hint. Implementations |
| 216 | /// must protect against the file size potentially growing after the status |
| 217 | /// was taken (i.e., they cannot assume that an mmap will be null-terminated |
| 218 | /// where \p Status implies). |
| 219 | /// |
| 220 | /// Returns the \a CASID and the size of the file. |
| 221 | Expected<ObjectRef> |
| 222 | storeFromOpenFile(sys::fs::file_t FD, |
| 223 | std::optional<sys::fs::file_status> Status = std::nullopt) { |
| 224 | return storeFromOpenFileImpl(FD, Status); |
| 225 | } |
| 226 | |
| 227 | static Error createUnknownObjectError(const CASID &ID); |
| 228 | |
| 229 | /// Create ObjectProxy from CASID. If the object doesn't exist, get an error. |
| 230 | LLVM_ABI Expected<ObjectProxy> getProxy(const CASID &ID); |
| 231 | /// Create ObjectProxy from ObjectRef. If the object can't be loaded, get an |
| 232 | /// error. |
| 233 | LLVM_ABI Expected<ObjectProxy> getProxy(ObjectRef Ref); |
| 234 | |
| 235 | /// \returns \c std::nullopt if the object is missing from the CAS. |
| 236 | Expected<std::optional<ObjectProxy>> getProxyIfExists(ObjectRef Ref); |
| 237 | |
| 238 | /// Read the data from \p Data into \p OS. |
| 239 | uint64_t readData(ObjectHandle Node, raw_ostream &OS, uint64_t Offset = 0, |
| 240 | uint64_t MaxBytes = -1ULL) const { |
| 241 | ArrayRef<char> Data = getData(Node); |
| 242 | assert(Offset < Data.size() && "Expected valid offset" ); |
| 243 | Data = Data.drop_front(N: Offset).take_front(N: MaxBytes); |
| 244 | OS << toStringRef(Input: Data); |
| 245 | return Data.size(); |
| 246 | } |
| 247 | |
| 248 | /// Set the size for limiting growth of on-disk storage. This has an effect |
| 249 | /// for when the instance is closed. |
| 250 | /// |
| 251 | /// Implementations may leave this unimplemented. |
| 252 | virtual Error setSizeLimit(std::optional<uint64_t> SizeLimit) { |
| 253 | return Error::success(); |
| 254 | } |
| 255 | |
| 256 | /// \returns the storage size of the on-disk CAS data. |
| 257 | /// |
| 258 | /// Implementations that don't have an implementation for this should return |
| 259 | /// \p std::nullopt. |
| 260 | virtual Expected<std::optional<uint64_t>> getStorageSize() const { |
| 261 | return std::nullopt; |
| 262 | } |
| 263 | |
| 264 | /// Prune local storage to reduce its size according to the desired size |
| 265 | /// limit. Pruning can happen concurrently with other operations. |
| 266 | /// |
| 267 | /// Implementations may leave this unimplemented. |
| 268 | virtual Error pruneStorageData() { return Error::success(); } |
| 269 | |
| 270 | /// Validate the whole node tree. |
| 271 | Error validateTree(ObjectRef Ref); |
| 272 | |
| 273 | /// Import object from another CAS. This will import the full tree from the |
| 274 | /// other CAS. |
| 275 | LLVM_ABI Expected<ObjectRef> importObject(ObjectStore &Upstream, |
| 276 | ObjectRef Other); |
| 277 | |
| 278 | /// Print the ObjectStore internals for debugging purpose. |
| 279 | virtual void print(raw_ostream &) const {} |
| 280 | void dump() const; |
| 281 | |
| 282 | /// Get CASContext |
| 283 | const CASContext &getContext() const { return Context; } |
| 284 | |
| 285 | virtual ~ObjectStore() = default; |
| 286 | |
| 287 | protected: |
| 288 | ObjectStore(const CASContext &Context) : Context(Context) {} |
| 289 | |
| 290 | private: |
| 291 | const CASContext &Context; |
| 292 | }; |
| 293 | |
| 294 | /// Reference to an abstract hierarchical node, with data and references. |
| 295 | /// Reference is passed by value and is expected to be valid as long as the \a |
| 296 | /// ObjectStore is. |
| 297 | class ObjectProxy { |
| 298 | public: |
| 299 | ObjectStore &getCAS() const { return *CAS; } |
| 300 | CASID getID() const { return CAS->getID(Ref); } |
| 301 | ObjectRef getRef() const { return Ref; } |
| 302 | size_t getNumReferences() const { return CAS->getNumRefs(Node: H); } |
| 303 | ObjectRef getReference(size_t I) const { return CAS->readRef(Node: H, I); } |
| 304 | |
| 305 | operator CASID() const { return getID(); } |
| 306 | CASID getReferenceID(size_t I) const { |
| 307 | std::optional<CASID> ID = getCAS().getID(Ref: getReference(I)); |
| 308 | assert(ID && "Expected reference to be first-class object" ); |
| 309 | return *ID; |
| 310 | } |
| 311 | |
| 312 | /// Visit each reference in order, returning an error from \p Callback to |
| 313 | /// stop early. |
| 314 | Error forEachReference(function_ref<Error(ObjectRef)> Callback) const { |
| 315 | return CAS->forEachRef(Node: H, Callback); |
| 316 | } |
| 317 | |
| 318 | std::unique_ptr<MemoryBuffer> |
| 319 | getMemoryBuffer(StringRef Name = "" , |
| 320 | bool RequiresNullTerminator = true) const; |
| 321 | |
| 322 | /// Get the content of the node. Valid as long as the CAS is valid. |
| 323 | StringRef getData() const { return CAS->getDataString(Node: H); } |
| 324 | |
| 325 | /// Exports the data of an object to a file path. |
| 326 | Error exportDataToFile(StringRef Path) const { |
| 327 | return CAS->exportDataToFile(Node: H, Path); |
| 328 | } |
| 329 | |
| 330 | friend bool operator==(const ObjectProxy &Proxy, ObjectRef Ref) { |
| 331 | return Proxy.getRef() == Ref; |
| 332 | } |
| 333 | friend bool operator==(ObjectRef Ref, const ObjectProxy &Proxy) { |
| 334 | return Proxy.getRef() == Ref; |
| 335 | } |
| 336 | friend bool operator!=(const ObjectProxy &Proxy, ObjectRef Ref) { |
| 337 | return !(Proxy.getRef() == Ref); |
| 338 | } |
| 339 | friend bool operator!=(ObjectRef Ref, const ObjectProxy &Proxy) { |
| 340 | return !(Proxy.getRef() == Ref); |
| 341 | } |
| 342 | |
| 343 | public: |
| 344 | ObjectProxy() = delete; |
| 345 | |
| 346 | static ObjectProxy load(ObjectStore &CAS, ObjectRef Ref, ObjectHandle Node) { |
| 347 | return ObjectProxy(CAS, Ref, Node); |
| 348 | } |
| 349 | |
| 350 | private: |
| 351 | ObjectProxy(ObjectStore &CAS, ObjectRef Ref, ObjectHandle H) |
| 352 | : CAS(&CAS), Ref(Ref), H(H) {} |
| 353 | |
| 354 | ObjectStore *CAS; |
| 355 | ObjectRef Ref; |
| 356 | ObjectHandle H; |
| 357 | }; |
| 358 | |
| 359 | /// Create an in memory CAS. |
| 360 | LLVM_ABI std::unique_ptr<ObjectStore> createInMemoryCAS(); |
| 361 | |
| 362 | /// \returns true if \c LLVM_ENABLE_ONDISK_CAS configuration was enabled. |
| 363 | bool isOnDiskCASEnabled(); |
| 364 | |
| 365 | /// Create a persistent on-disk CAS at \p Path. |
| 366 | LLVM_ABI Expected<std::unique_ptr<ObjectStore>> |
| 367 | createOnDiskCAS(const Twine &Path); |
| 368 | |
| 369 | } // namespace cas |
| 370 | } // namespace llvm |
| 371 | |
| 372 | #endif // LLVM_CAS_OBJECTSTORE_H |
| 373 | |