sanitizer_lzw.h source code [llvm_runtimes/compiler-rt/lib/sanitizer_common/sanitizer_lzw.h]

1	//===-- sanitizer_lzw.h ------------------------------------------ C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// Lempel–Ziv–Welch encoding/decoding
10	//
11	//===----------------------------------------------------------------------===//
12
13	#ifndef SANITIZER_LZW_H
14	#define SANITIZER_LZW_H
15
16	#include "sanitizer_dense_map.h"
17
18	namespace __sanitizer {
19
20	using LzwCodeType = u32;
21
22	template <class T, class ItIn, class ItOut>
23	ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) {
24	using Substring =
25	detail::DenseMapPair<LzwCodeType / Prefix /, T / Next input />;
26
27	// Sentinel value for substrings of len 1.
28	static constexpr LzwCodeType kNoPrefix =
29	Min(DenseMapInfo<Substring>::getEmptyKey().first,
30	DenseMapInfo<Substring>::getTombstoneKey().first) -
31	`1`;
32	DenseMap<Substring, LzwCodeType> prefix_to_code;
33	{
34	// Add all substring of len 1 as initial dictionary.
35	InternalMmapVector<T> dict_len1;
36	for (auto it = begin; it != end; ++it)
37	if (prefix_to_code.try_emplace({kNoPrefix, *it}, `0`).second)
38	dict_len1.push_back(*it);
39
40	// Slightly helps with later delta encoding.
41	Sort(dict_len1.data(), dict_len1.size());
42
43	// For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
44	// just generate them.
45	*out = dict_len1.size();
46	++out;
47
48	for (uptr i = `0`; i != dict_len1.size(); ++i) {
49	// Remap after the Sort.
50	prefix_to_code[{kNoPrefix, dict_len1[i]}] = i;
51	*out = dict_len1[i];
52	++out;
53	}
54	CHECK_EQ(prefix_to_code.size(), dict_len1.size());
55	}
56
57	if (begin == end)
58	return out;
59
60	// Main LZW encoding loop.
61	LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second;
62	++begin;
63	for (auto it = begin; it != end; ++it) {
64	// Extend match with the new item.
65	auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size());
66	if (ins.second) {
67	// This is a new substring, but emit the code for the current match
68	// (before extend). This allows LZW decoder to recover the dictionary.
69	*out = match;
70	++out;
71	// Reset the match to a single item, which must be already in the map.
72	match = prefix_to_code.find({kNoPrefix, *it})->second;
73	} else {
74	// Already known, use as the current match.
75	match = ins.first->second;
76	}
77	}
78
79	*out = match;
80	++out;
81
82	return out;
83	}
84
85	template <class T, class ItIn, class ItOut>
86	ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) {
87	if (begin == end)
88	return out;
89
90	// Load dictionary of len 1 substrings. Theses correspont to lowest codes.
91	InternalMmapVector<T> dict_len1(*begin);
92	++begin;
93
94	if (begin == end)
95	return out;
96
97	for (auto& v : dict_len1) {
98	v = *begin;
99	++begin;
100	}
101
102	// Substrings of len 2 and up. Indexes are shifted because [0,
103	// dict_len1.size()) stored in dict_len1. Substings get here after being
104	// emitted to the output, so we can use output position.
105	InternalMmapVector<detail::DenseMapPair<ItOut / begin. /, ItOut / end />>
106	code_to_substr;
107
108	// Copies already emitted substrings into the output again.
109	auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) {
110	if (code < dict_len1.size()) {
111	*out = dict_len1[code];
112	++out;
113	return out;
114	}
115	const auto& s = code_to_substr[code - dict_len1.size()];
116
117	for (ItOut it = s.first; it != s.second; ++it, ++out) out = it;
118	return out;
119	};
120
121	// Returns lens of the substring with the given code.
122	auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr {
123	if (code < dict_len1.size())
124	return `1`;
125	const auto& s = code_to_substr[code - dict_len1.size()];
126	return s.second - s.first;
127	};
128
129	// Main LZW decoding loop.
130	LzwCodeType prev_code = *begin;
131	++begin;
132	out = copy(prev_code, out);
133	for (auto it = begin; it != end; ++it) {
134	LzwCodeType code = *it;
135	auto start = out;
136	if (code == dict_len1.size() + code_to_substr.size()) {
137	// Special LZW case. The code is not in the dictionary yet. This is
138	// possible only when the new substring is the same as previous one plus
139	// the first item of the previous substring. We can emit that in two
140	// steps.
141	out = copy(prev_code, out);
142	out = start;
143	++out;
144	} else {
145	out = copy(code, out);
146	}
147
148	// Every time encoded emits the code, it also creates substing of len + 1
149	// including the first item of the just emmited substring. Do the same here.
150	uptr len = code_to_len(prev_code);
151	code_to_substr.push_back({start - len, start + `1`});
152
153	prev_code = code;
154	}
155	return out;
156	}
157
158	} // namespace __sanitizer
159	#endif
160

Browse the source code of llvm_runtimes/compiler-rt/lib/sanitizer_common/sanitizer_lzw.h