1//===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Lempel–Ziv–Welch encoding/decoding
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef SANITIZER_LZW_H
14#define SANITIZER_LZW_H
15
16#include "sanitizer_dense_map.h"
17
18namespace __sanitizer {
19
20using LzwCodeType = u32;
21
22template <class T, class ItIn, class ItOut>
23ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) {
24 using Substring =
25 detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>;
26
27 // Sentinel value for substrings of len 1.
28 static constexpr LzwCodeType kNoPrefix =
29 DenseMapInfo<Substring>::getEmptyKey().first - 1;
30 DenseMap<Substring, LzwCodeType> prefix_to_code;
31 {
32 // Add all substring of len 1 as initial dictionary.
33 InternalMmapVector<T> dict_len1;
34 for (auto it = begin; it != end; ++it)
35 if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second)
36 dict_len1.push_back(*it);
37
38 // Slightly helps with later delta encoding.
39 Sort(dict_len1.data(), dict_len1.size());
40
41 // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
42 // just generate them.
43 *out = dict_len1.size();
44 ++out;
45
46 for (uptr i = 0; i != dict_len1.size(); ++i) {
47 // Remap after the Sort.
48 prefix_to_code[{kNoPrefix, dict_len1[i]}] = i;
49 *out = dict_len1[i];
50 ++out;
51 }
52 CHECK_EQ(prefix_to_code.size(), dict_len1.size());
53 }
54
55 if (begin == end)
56 return out;
57
58 // Main LZW encoding loop.
59 LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second;
60 ++begin;
61 for (auto it = begin; it != end; ++it) {
62 // Extend match with the new item.
63 auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size());
64 if (ins.second) {
65 // This is a new substring, but emit the code for the current match
66 // (before extend). This allows LZW decoder to recover the dictionary.
67 *out = match;
68 ++out;
69 // Reset the match to a single item, which must be already in the map.
70 match = prefix_to_code.find({kNoPrefix, *it})->second;
71 } else {
72 // Already known, use as the current match.
73 match = ins.first->second;
74 }
75 }
76
77 *out = match;
78 ++out;
79
80 return out;
81}
82
83template <class T, class ItIn, class ItOut>
84ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) {
85 if (begin == end)
86 return out;
87
88 // Load dictionary of len 1 substrings. Theses correspont to lowest codes.
89 InternalMmapVector<T> dict_len1(*begin);
90 ++begin;
91
92 if (begin == end)
93 return out;
94
95 for (auto& v : dict_len1) {
96 v = *begin;
97 ++begin;
98 }
99
100 // Substrings of len 2 and up. Indexes are shifted because [0,
101 // dict_len1.size()) stored in dict_len1. Substings get here after being
102 // emitted to the output, so we can use output position.
103 InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>>
104 code_to_substr;
105
106 // Copies already emitted substrings into the output again.
107 auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) {
108 if (code < dict_len1.size()) {
109 *out = dict_len1[code];
110 ++out;
111 return out;
112 }
113 const auto& s = code_to_substr[code - dict_len1.size()];
114
115 for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it;
116 return out;
117 };
118
119 // Returns lens of the substring with the given code.
120 auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr {
121 if (code < dict_len1.size())
122 return 1;
123 const auto& s = code_to_substr[code - dict_len1.size()];
124 return s.second - s.first;
125 };
126
127 // Main LZW decoding loop.
128 LzwCodeType prev_code = *begin;
129 ++begin;
130 out = copy(prev_code, out);
131 for (auto it = begin; it != end; ++it) {
132 LzwCodeType code = *it;
133 auto start = out;
134 if (code == dict_len1.size() + code_to_substr.size()) {
135 // Special LZW case. The code is not in the dictionary yet. This is
136 // possible only when the new substring is the same as previous one plus
137 // the first item of the previous substring. We can emit that in two
138 // steps.
139 out = copy(prev_code, out);
140 *out = *start;
141 ++out;
142 } else {
143 out = copy(code, out);
144 }
145
146 // Every time encoded emits the code, it also creates substing of len + 1
147 // including the first item of the just emmited substring. Do the same here.
148 uptr len = code_to_len(prev_code);
149 code_to_substr.push_back({start - len, start + 1});
150
151 prev_code = code;
152 }
153 return out;
154}
155
156} // namespace __sanitizer
157#endif
158