sanitizer_lzw.h source code [llvm_runtimes/compiler-rt/lib/sanitizer_common/sanitizer_lzw.h]

1	//===-- sanitizer_lzw.h ------------------------------------------ C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// Lempel–Ziv–Welch encoding/decoding
10	//
11	//===----------------------------------------------------------------------===//
12
13	#ifndef SANITIZER_LZW_H
14	#define SANITIZER_LZW_H
15
16	#include "sanitizer_dense_map.h"
17
18	namespace __sanitizer {
19
20	using LzwCodeType = u32;
21
22	template <class T, class ItIn, class ItOut>
23	ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) {
24	using Substring =
25	detail::DenseMapPair<LzwCodeType / Prefix /, T / Next input />;
26
27	// Sentinel value for substrings of len 1.
28	static constexpr LzwCodeType kNoPrefix =
29	DenseMapInfo<Substring>::getEmptyKey().first - `1`;
30	DenseMap<Substring, LzwCodeType> prefix_to_code;
31	{
32	// Add all substring of len 1 as initial dictionary.
33	InternalMmapVector<T> dict_len1;
34	for (auto it = begin; it != end; ++it)
35	if (prefix_to_code.try_emplace({kNoPrefix, *it}, `0`).second)
36	dict_len1.push_back(*it);
37
38	// Slightly helps with later delta encoding.
39	Sort(dict_len1.data(), dict_len1.size());
40
41	// For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
42	// just generate them.
43	*out = dict_len1.size();
44	++out;
45
46	for (uptr i = `0`; i != dict_len1.size(); ++i) {
47	// Remap after the Sort.
48	prefix_to_code[{kNoPrefix, dict_len1[i]}] = i;
49	*out = dict_len1[i];
50	++out;
51	}
52	CHECK_EQ(prefix_to_code.size(), dict_len1.size());
53	}
54
55	if (begin == end)
56	return out;
57
58	// Main LZW encoding loop.
59	LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second;
60	++begin;
61	for (auto it = begin; it != end; ++it) {
62	// Extend match with the new item.
63	auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size());
64	if (ins.second) {
65	// This is a new substring, but emit the code for the current match
66	// (before extend). This allows LZW decoder to recover the dictionary.
67	*out = match;
68	++out;
69	// Reset the match to a single item, which must be already in the map.
70	match = prefix_to_code.find({kNoPrefix, *it})->second;
71	} else {
72	// Already known, use as the current match.
73	match = ins.first->second;
74	}
75	}
76
77	*out = match;
78	++out;
79
80	return out;
81	}
82
83	template <class T, class ItIn, class ItOut>
84	ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) {
85	if (begin == end)
86	return out;
87
88	// Load dictionary of len 1 substrings. Theses correspont to lowest codes.
89	InternalMmapVector<T> dict_len1(*begin);
90	++begin;
91
92	if (begin == end)
93	return out;
94
95	for (auto& v : dict_len1) {
96	v = *begin;
97	++begin;
98	}
99
100	// Substrings of len 2 and up. Indexes are shifted because [0,
101	// dict_len1.size()) stored in dict_len1. Substings get here after being
102	// emitted to the output, so we can use output position.
103	InternalMmapVector<detail::DenseMapPair<ItOut / begin. /, ItOut / end />>
104	code_to_substr;
105
106	// Copies already emitted substrings into the output again.
107	auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) {
108	if (code < dict_len1.size()) {
109	*out = dict_len1[code];
110	++out;
111	return out;
112	}
113	const auto& s = code_to_substr[code - dict_len1.size()];
114
115	for (ItOut it = s.first; it != s.second; ++it, ++out) out = it;
116	return out;
117	};
118
119	// Returns lens of the substring with the given code.
120	auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr {
121	if (code < dict_len1.size())
122	return `1`;
123	const auto& s = code_to_substr[code - dict_len1.size()];
124	return s.second - s.first;
125	};
126
127	// Main LZW decoding loop.
128	LzwCodeType prev_code = *begin;
129	++begin;
130	out = copy(prev_code, out);
131	for (auto it = begin; it != end; ++it) {
132	LzwCodeType code = *it;
133	auto start = out;
134	if (code == dict_len1.size() + code_to_substr.size()) {
135	// Special LZW case. The code is not in the dictionary yet. This is
136	// possible only when the new substring is the same as previous one plus
137	// the first item of the previous substring. We can emit that in two
138	// steps.
139	out = copy(prev_code, out);
140	out = start;
141	++out;
142	} else {
143	out = copy(code, out);
144	}
145
146	// Every time encoded emits the code, it also creates substing of len + 1
147	// including the first item of the just emmited substring. Do the same here.
148	uptr len = code_to_len(prev_code);
149	code_to_substr.push_back({start - len, start + `1`});
150
151	prev_code = code;
152	}
153	return out;
154	}
155
156	} // namespace __sanitizer
157	#endif
158

Browse the source code of llvm_runtimes/compiler-rt/lib/sanitizer_common/sanitizer_lzw.h