| 1 | //===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // Lempel–Ziv–Welch encoding/decoding |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
| 12 | |
| 13 | #ifndef SANITIZER_LZW_H |
| 14 | #define SANITIZER_LZW_H |
| 15 | |
| 16 | #include "sanitizer_dense_map.h" |
| 17 | |
| 18 | namespace __sanitizer { |
| 19 | |
| 20 | using LzwCodeType = u32; |
| 21 | |
| 22 | template <class T, class ItIn, class ItOut> |
| 23 | ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) { |
| 24 | using Substring = |
| 25 | detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>; |
| 26 | |
| 27 | // Sentinel value for substrings of len 1. |
| 28 | static constexpr LzwCodeType kNoPrefix = |
| 29 | Min(DenseMapInfo<Substring>::getEmptyKey().first, |
| 30 | DenseMapInfo<Substring>::getTombstoneKey().first) - |
| 31 | 1; |
| 32 | DenseMap<Substring, LzwCodeType> prefix_to_code; |
| 33 | { |
| 34 | // Add all substring of len 1 as initial dictionary. |
| 35 | InternalMmapVector<T> dict_len1; |
| 36 | for (auto it = begin; it != end; ++it) |
| 37 | if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second) |
| 38 | dict_len1.push_back(*it); |
| 39 | |
| 40 | // Slightly helps with later delta encoding. |
| 41 | Sort(dict_len1.data(), dict_len1.size()); |
| 42 | |
| 43 | // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can |
| 44 | // just generate them. |
| 45 | *out = dict_len1.size(); |
| 46 | ++out; |
| 47 | |
| 48 | for (uptr i = 0; i != dict_len1.size(); ++i) { |
| 49 | // Remap after the Sort. |
| 50 | prefix_to_code[{kNoPrefix, dict_len1[i]}] = i; |
| 51 | *out = dict_len1[i]; |
| 52 | ++out; |
| 53 | } |
| 54 | CHECK_EQ(prefix_to_code.size(), dict_len1.size()); |
| 55 | } |
| 56 | |
| 57 | if (begin == end) |
| 58 | return out; |
| 59 | |
| 60 | // Main LZW encoding loop. |
| 61 | LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second; |
| 62 | ++begin; |
| 63 | for (auto it = begin; it != end; ++it) { |
| 64 | // Extend match with the new item. |
| 65 | auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size()); |
| 66 | if (ins.second) { |
| 67 | // This is a new substring, but emit the code for the current match |
| 68 | // (before extend). This allows LZW decoder to recover the dictionary. |
| 69 | *out = match; |
| 70 | ++out; |
| 71 | // Reset the match to a single item, which must be already in the map. |
| 72 | match = prefix_to_code.find({kNoPrefix, *it})->second; |
| 73 | } else { |
| 74 | // Already known, use as the current match. |
| 75 | match = ins.first->second; |
| 76 | } |
| 77 | } |
| 78 | |
| 79 | *out = match; |
| 80 | ++out; |
| 81 | |
| 82 | return out; |
| 83 | } |
| 84 | |
| 85 | template <class T, class ItIn, class ItOut> |
| 86 | ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) { |
| 87 | if (begin == end) |
| 88 | return out; |
| 89 | |
| 90 | // Load dictionary of len 1 substrings. Theses correspont to lowest codes. |
| 91 | InternalMmapVector<T> dict_len1(*begin); |
| 92 | ++begin; |
| 93 | |
| 94 | if (begin == end) |
| 95 | return out; |
| 96 | |
| 97 | for (auto& v : dict_len1) { |
| 98 | v = *begin; |
| 99 | ++begin; |
| 100 | } |
| 101 | |
| 102 | // Substrings of len 2 and up. Indexes are shifted because [0, |
| 103 | // dict_len1.size()) stored in dict_len1. Substings get here after being |
| 104 | // emitted to the output, so we can use output position. |
| 105 | InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>> |
| 106 | code_to_substr; |
| 107 | |
| 108 | // Copies already emitted substrings into the output again. |
| 109 | auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) { |
| 110 | if (code < dict_len1.size()) { |
| 111 | *out = dict_len1[code]; |
| 112 | ++out; |
| 113 | return out; |
| 114 | } |
| 115 | const auto& s = code_to_substr[code - dict_len1.size()]; |
| 116 | |
| 117 | for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it; |
| 118 | return out; |
| 119 | }; |
| 120 | |
| 121 | // Returns lens of the substring with the given code. |
| 122 | auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr { |
| 123 | if (code < dict_len1.size()) |
| 124 | return 1; |
| 125 | const auto& s = code_to_substr[code - dict_len1.size()]; |
| 126 | return s.second - s.first; |
| 127 | }; |
| 128 | |
| 129 | // Main LZW decoding loop. |
| 130 | LzwCodeType prev_code = *begin; |
| 131 | ++begin; |
| 132 | out = copy(prev_code, out); |
| 133 | for (auto it = begin; it != end; ++it) { |
| 134 | LzwCodeType code = *it; |
| 135 | auto start = out; |
| 136 | if (code == dict_len1.size() + code_to_substr.size()) { |
| 137 | // Special LZW case. The code is not in the dictionary yet. This is |
| 138 | // possible only when the new substring is the same as previous one plus |
| 139 | // the first item of the previous substring. We can emit that in two |
| 140 | // steps. |
| 141 | out = copy(prev_code, out); |
| 142 | *out = *start; |
| 143 | ++out; |
| 144 | } else { |
| 145 | out = copy(code, out); |
| 146 | } |
| 147 | |
| 148 | // Every time encoded emits the code, it also creates substing of len + 1 |
| 149 | // including the first item of the just emmited substring. Do the same here. |
| 150 | uptr len = code_to_len(prev_code); |
| 151 | code_to_substr.push_back({start - len, start + 1}); |
| 152 | |
| 153 | prev_code = code; |
| 154 | } |
| 155 | return out; |
| 156 | } |
| 157 | |
| 158 | } // namespace __sanitizer |
| 159 | #endif |
| 160 | |