StringRef.cpp source code [llvm_projects/llvm/lib/Support/StringRef.cpp]

1	//===-- StringRef.cpp - Lightweight String References ---------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "llvm/ADT/StringRef.h"
10	#include "llvm/ADT/APFloat.h"
11	#include "llvm/ADT/APInt.h"
12	#include "llvm/ADT/Hashing.h"
13	#include "llvm/ADT/StringExtras.h"
14	#include "llvm/ADT/edit_distance.h"
15	#include "llvm/Support/Error.h"
16	#include <bitset>
17
18	using namespace llvm;
19
20	// MSVC emits references to this into the translation units which reference it.
21	#ifndef _MSC_VER
22	constexpr size_t StringRef::npos;
23	#endif
24
25	// strncasecmp() is not available on non-POSIX systems, so define an
26	// alternative function here.
27	static int ascii_strncasecmp(const char LHS, const* char *RHS, size_t Length) {
28	for (size_t I = `0`; I < Length; ++I) {
29	unsigned char LHC = toLower(x: LHS[I]);
30	unsigned char RHC = toLower(x: RHS[I]);
31	if (LHC != RHC)
32	return LHC < RHC ? -`1` : `1`;
33	}
34	return `0`;
35	}
36
37	int StringRef::compare_insensitive(StringRef RHS) const {
38	if (int Res =
39	ascii_strncasecmp(LHS: data(), RHS: RHS.data(), Length: std::min(a: size(), b: RHS.size())))
40	return Res;
41	if (size() == RHS.size())
42	return `0`;
43	return size() < RHS.size() ? -`1` : `1`;
44	}
45
46	bool StringRef::starts_with_insensitive(StringRef Prefix) const {
47	return size() >= Prefix.size() &&
48	ascii_strncasecmp(LHS: data(), RHS: Prefix.data(), Length: Prefix.size()) == `0`;
49	}
50
51	bool StringRef::ends_with_insensitive(StringRef Suffix) const {
52	return size() >= Suffix.size() &&
53	ascii_strncasecmp(LHS: end() - Suffix.size(), RHS: Suffix.data(),
54	Length: Suffix.size()) == `0`;
55	}
56
57	size_t StringRef::find_insensitive(char C, size_t From) const {
58	char L = toLower(x: C);
59	return find_if(F: [L](char D) { return toLower(x: D) == L; }, From);
60	}
61
62	/// compare_numeric - Compare strings, handle embedded numbers.
63	int StringRef::compare_numeric(StringRef RHS) const {
64	for (size_t I = `0`, E = std::min(a: size(), b: RHS.size()); I != E; ++I) {
65	// Check for sequences of digits.
66	if (isDigit(C: data()[I]) && isDigit(C: RHS.data()[I])) {
67	// The longer sequence of numbers is considered larger.
68	// This doesn't really handle prefixed zeros well.
69	size_t J;
70	for (J = I + `1`; J != E + `1`; ++J) {
71	bool ld = J < size() && isDigit(C: data()[J]);
72	bool rd = J < RHS.size() && isDigit(C: RHS.data()[J]);
73	if (ld != rd)
74	return rd ? -`1` : `1`;
75	if (!rd)
76	break;
77	}
78	// The two number sequences have the same length (J-I), just memcmp them.
79	if (int Res = compareMemory(Lhs: data() + I, Rhs: RHS.data() + I, Length: J - I))
80	return Res < `0` ? -`1` : `1`;
81	// Identical number sequences, continue search after the numbers.
82	I = J - `1`;
83	continue;
84	}
85	if (data()[I] != RHS.data()[I])
86	return (unsigned char)data()[I] < (unsigned char)RHS.data()[I] ? -`1` : `1`;
87	}
88	if (size() == RHS.size())
89	return `0`;
90	return size() < RHS.size() ? -`1` : `1`;
91	}
92
93	// Compute the edit distance between the two given strings.
94	unsigned StringRef::edit_distance(llvm::StringRef Other,
95	bool AllowReplacements,
96	unsigned MaxEditDistance) const {
97	return llvm::ComputeEditDistance(FromArray: ArrayRef(data(), size()),
98	ToArray: ArrayRef(Other.data(), Other.size()),
99	AllowReplacements, MaxEditDistance);
100	}
101
102	unsigned llvm::StringRef::edit_distance_insensitive(
103	StringRef Other, bool AllowReplacements, unsigned MaxEditDistance) const {
104	return llvm::ComputeMappedEditDistance(
105	FromArray: ArrayRef(data(), size()), ToArray: ArrayRef(Other.data(), Other.size()),
106	Map: llvm::toLower, AllowReplacements, MaxEditDistance);
107	}
108
109	//===----------------------------------------------------------------------===//
110	// String Operations
111	//===----------------------------------------------------------------------===//
112
113	std::string StringRef::lower() const {
114	return std::string (map_iterator(I: begin(), F: toLower),
115	map_iterator(I: end(), F: toLower));
116	}
117
118	std::string StringRef::upper() const {
119	return std::string (map_iterator(I: begin(), F: toUpper),
120	map_iterator(I: end(), F: toUpper));
121	}
122
123	//===----------------------------------------------------------------------===//
124	// String Searching
125	//===----------------------------------------------------------------------===//
126
127
128	/// find - Search for the first string \arg Str in the string.
129	///
130	/// \return - The index of the first occurrence of \arg Str, or npos if not
131	/// found.
132	size_t StringRef::find(StringRef Str, size_t From) const {
133	if (From > size())
134	return npos;
135
136	const char *Start = data() + From;
137	size_t Size = size() - From;
138
139	const char *Needle = Str.data();
140	size_t N = Str.size();
141	if (N == `0`)
142	return From;
143	if (Size < N)
144	return npos;
145	if (N == `1`) {
146	const char Ptr = (const* char *)::memchr(s: Start, c: Needle[`0`], n: Size);
147	return Ptr == nullptr ? npos : Ptr - data();
148	}
149
150	const char *Stop = Start + (Size - N + `1`);
151
152	if (N == `2`) {
153	// Provide a fast path for newline finding (CRLF case) in InclusionRewriter.
154	// Not the most optimized strategy, but getting memcmp inlined should be
155	// good enough.
156	do {
157	if (std::memcmp(s1: Start, s2: Needle, n: `2`) == `0`)
158	return Start - data();
159	++Start;
160	} while (Start < Stop);
161	return npos;
162	}
163
164	// For short haystacks or unsupported needles fall back to the naive algorithm
165	if (Size < `16` \|\| N > `255`) {
166	do {
167	if (std::memcmp(s1: Start, s2: Needle, n: N) == `0`)
168	return Start - data();
169	++Start;
170	} while (Start < Stop);
171	return npos;
172	}
173
174	// Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
175	uint8_t BadCharSkip[`256`];
176	std::memset(s: BadCharSkip, c: N, n: `256`);
177	for (unsigned i = `0`; i != N-`1`; ++i)
178	BadCharSkip[(uint8_t)Str [i]] = N-`1`-i;
179
180	do {
181	uint8_t Last = Start[N - `1`];
182	if (LLVM_UNLIKELY(Last == (uint8_t)Needle[N - `1`]))
183	if (std::memcmp(s1: Start, s2: Needle, n: N - `1`) == `0`)
184	return Start - data();
185
186	// Otherwise skip the appropriate number of bytes.
187	Start += BadCharSkip[Last];
188	} while (Start < Stop);
189
190	return npos;
191	}
192
193	size_t StringRef::find_insensitive(StringRef Str, size_t From) const {
194	StringRef This = substr(Start: From);
195	while (This.size() >= Str.size()) {
196	if (This.starts_with_insensitive(Prefix: Str))
197	return From;
198	This = This.drop_front();
199	++From;
200	}
201	return npos;
202	}
203
204	size_t StringRef::rfind_insensitive(char C, size_t From) const {
205	From = std::min(a: From, b: size());
206	size_t i = From;
207	while (i != `0`) {
208	--i;
209	if (toLower(x: data()[i]) == toLower(x: C))
210	return i;
211	}
212	return npos;
213	}
214
215	/// rfind - Search for the last string \arg Str in the string.
216	///
217	/// \return - The index of the last occurrence of \arg Str, or npos if not
218	/// found.
219	size_t StringRef::rfind(StringRef Str) const {
220	return std::string_view(*this).rfind(str: Str);
221	}
222
223	size_t StringRef::rfind_insensitive(StringRef Str) const {
224	size_t N = Str.size();
225	if (N > size())
226	return npos;
227	for (size_t i = size() - N + `1`, e = `0`; i != e;) {
228	--i;
229	if (substr(Start: i, N).equals_insensitive(RHS: Str))
230	return i;
231	}
232	return npos;
233	}
234
235	/// find_first_of - Find the first character in the string that is in \arg
236	/// Chars, or npos if not found.
237	///
238	/// Note: O(size() + Chars.size())
239	StringRef::size_type StringRef::find_first_of(StringRef Chars,
240	size_t From) const {
241	std::bitset<`1` << CHAR_BIT> CharBits;
242	for (char C : Chars)
243	CharBits.set(position: (unsigned char)C);
244
245	for (size_type i = std::min(a: From, b: size()), e = size(); i != e; ++i)
246	if (CharBits.test(position: (unsigned char)data()[i]))
247	return i;
248	return npos;
249	}
250
251	/// find_first_not_of - Find the first character in the string that is not
252	/// \arg C or npos if not found.
253	StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
254	return std::string_view(*this).find_first_not_of(c: C, pos: From);
255	}
256
257	/// find_first_not_of - Find the first character in the string that is not
258	/// in the string \arg Chars, or npos if not found.
259	///
260	/// Note: O(size() + Chars.size())
261	StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
262	size_t From) const {
263	std::bitset<`1` << CHAR_BIT> CharBits;
264	for (char C : Chars)
265	CharBits.set(position: (unsigned char)C);
266
267	for (size_type i = std::min(a: From, b: size()), e = size(); i != e; ++i)
268	if (!CharBits.test(position: (unsigned char)data()[i]))
269	return i;
270	return npos;
271	}
272
273	/// find_last_of - Find the last character in the string that is in \arg C,
274	/// or npos if not found.
275	///
276	/// Note: O(size() + Chars.size())
277	StringRef::size_type StringRef::find_last_of(StringRef Chars,
278	size_t From) const {
279	std::bitset<`1` << CHAR_BIT> CharBits;
280	for (char C : Chars)
281	CharBits.set(position: (unsigned char)C);
282
283	for (size_type i = std::min(a: From, b: size()) - `1`, e = -`1`; i != e; --i)
284	if (CharBits.test(position: (unsigned char)data()[i]))
285	return i;
286	return npos;
287	}
288
289	/// find_last_not_of - Find the last character in the string that is not
290	/// \arg C, or npos if not found.
291	StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const {
292	for (size_type i = std::min(a: From, b: size()) - `1`, e = -`1`; i != e; --i)
293	if (data()[i] != C)
294	return i;
295	return npos;
296	}
297
298	/// find_last_not_of - Find the last character in the string that is not in
299	/// \arg Chars, or npos if not found.
300	///
301	/// Note: O(size() + Chars.size())
302	StringRef::size_type StringRef::find_last_not_of(StringRef Chars,
303	size_t From) const {
304	std::bitset<`1` << CHAR_BIT> CharBits;
305	for (char C : Chars)
306	CharBits.set(position: (unsigned char)C);
307
308	for (size_type i = std::min(a: From, b: size()) - `1`, e = -`1`; i != e; --i)
309	if (!CharBits.test(position: (unsigned char)data()[i]))
310	return i;
311	return npos;
312	}
313
314	void StringRef::split(SmallVectorImpl<StringRef> &A,
315	StringRef Separator, int MaxSplit,
316	bool KeepEmpty) const {
317	StringRef S = *this;
318
319	// Count down from MaxSplit. When MaxSplit is -1, this will just split
320	// "forever". This doesn't support splitting more than 2^31 times
321	// intentionally; if we ever want that we can make MaxSplit a 64-bit integer
322	// but that seems unlikely to be useful.
323	while (MaxSplit-- != `0`) {
324	size_t Idx = S.find(Str: Separator);
325	if (Idx == npos)
326	break;
327
328	// Push this split.
329	if (KeepEmpty \|\| Idx > `0`)
330	A.push_back(Elt: S.slice(Start: `0`, End: Idx));
331
332	// Jump forward.
333	S = S.substr(Start: Idx + Separator.size());
334	}
335
336	// Push the tail.
337	if (KeepEmpty \|\| !S.empty())
338	A.push_back(Elt: S);
339	}
340
341	void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator,
342	int MaxSplit, bool KeepEmpty) const {
343	StringRef S = *this;
344
345	// Count down from MaxSplit. When MaxSplit is -1, this will just split
346	// "forever". This doesn't support splitting more than 2^31 times
347	// intentionally; if we ever want that we can make MaxSplit a 64-bit integer
348	// but that seems unlikely to be useful.
349	while (MaxSplit-- != `0`) {
350	size_t Idx = S.find(C: Separator);
351	if (Idx == npos)
352	break;
353
354	// Push this split.
355	if (KeepEmpty \|\| Idx > `0`)
356	A.push_back(Elt: S.slice(Start: `0`, End: Idx));
357
358	// Jump forward.
359	S = S.substr(Start: Idx + `1`);
360	}
361
362	// Push the tail.
363	if (KeepEmpty \|\| !S.empty())
364	A.push_back(Elt: S);
365	}
366
367	//===----------------------------------------------------------------------===//
368	// Helpful Algorithms
369	//===----------------------------------------------------------------------===//
370
371	/// count - Return the number of non-overlapped occurrences of \arg Str in
372	/// the string.
373	size_t StringRef::count(StringRef Str) const {
374	size_t Count = `0`;
375	size_t Pos = `0`;
376	size_t N = Str.size();
377	// TODO: For an empty `Str` we return 0 for legacy reasons. Consider changing
378	// this to `Length + 1` which is more in-line with the function
379	// description.
380	if (!N)
381	return `0`;
382	while ((Pos = find(Str, From: Pos)) != npos) {
383	++Count;
384	Pos += N;
385	}
386	return Count;
387	}
388
389	static unsigned GetAutoSenseRadix(StringRef &Str) {
390	if (Str.empty())
391	return `10`;
392
393	if (Str.consume_front_insensitive(Prefix: "0x"))
394	return `16`;
395
396	if (Str.consume_front_insensitive(Prefix: "0b"))
397	return `2`;
398
399	if (Str.consume_front(Prefix: "0o"))
400	return `8`;
401
402	if (Str [`0`] == `'0'` && Str.size() > `1` && isDigit(C: Str [`1`])) {
403	Str = Str.substr(Start: `1`);
404	return `8`;
405	}
406
407	return `10`;
408	}
409
410	bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix,
411	unsigned long long &Result) {
412	// Autosense radix if not specified.
413	if (Radix == `0`)
414	Radix = GetAutoSenseRadix(Str);
415
416	// Empty strings (after the radix autosense) are invalid.
417	if (Str.empty()) return true;
418
419	// Parse all the bytes of the string given this radix. Watch for overflow.
420	StringRef Str2 = Str;
421	Result = `0`;
422	while (!Str2.empty()) {
423	unsigned CharVal;
424	if (Str2 [`0`] >= `'0'` && Str2 [`0`] <= `'9'`)
425	CharVal = Str2 [`0`] - `'0'`;
426	else if (Str2 [`0`] >= `'a'` && Str2 [`0`] <= `'z'`)
427	CharVal = Str2 [`0`] - `'a'` + `10`;
428	else if (Str2 [`0`] >= `'A'` && Str2 [`0`] <= `'Z'`)
429	CharVal = Str2 [`0`] - `'A'` + `10`;
430	else
431	break;
432
433	// If the parsed value is larger than the integer radix, we cannot
434	// consume any more characters.
435	if (CharVal >= Radix)
436	break;
437
438	// Add in this character.
439	unsigned long long PrevResult = Result;
440	Result = Result * Radix + CharVal;
441
442	// Check for overflow by shifting back and seeing if bits were lost.
443	if (Result / Radix < PrevResult)
444	return true;
445
446	Str2 = Str2.substr(Start: `1`);
447	}
448
449	// We consider the operation a failure if no characters were consumed
450	// successfully.
451	if (Str.size() == Str2.size())
452	return true;
453
454	Str = Str2;
455	return false;
456	}
457
458	bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix,
459	long long &Result) {
460	unsigned long long ULLVal;
461
462	// Handle positive strings first.
463	if (!Str.starts_with(Prefix: "-")) {
464	if (consumeUnsignedInteger(Str, Radix, Result&: ULLVal) \|\|
465	// Check for value so large it overflows a signed value.
466	(long long)ULLVal < `0`)
467	return true;
468	Result = ULLVal;
469	return false;
470	}
471
472	// Get the positive part of the value.
473	StringRef Str2 = Str.drop_front(N: `1`);
474	if (consumeUnsignedInteger(Str&: Str2, Radix, Result&: ULLVal) \|\|
475	// Reject values so large they'd overflow as negative signed, but allow
476	// "-0". This negates the unsigned so that the negative isn't undefined
477	// on signed overflow.
478	(long long)-ULLVal > `0`)
479	return true;
480
481	Str = Str2;
482	Result = -ULLVal;
483	return false;
484	}
485
486	/// GetAsUnsignedInteger - Workhorse method that converts a integer character
487	/// sequence of radix up to 36 to an unsigned long long value.
488	bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix,
489	unsigned long long &Result) {
490	if (consumeUnsignedInteger(Str, Radix, Result))
491	return true;
492
493	// For getAsUnsignedInteger, we require the whole string to be consumed or
494	// else we consider it a failure.
495	return !Str.empty();
496	}
497
498	bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix,
499	long long &Result) {
500	if (consumeSignedInteger(Str, Radix, Result))
501	return true;
502
503	// For getAsSignedInteger, we require the whole string to be consumed or else
504	// we consider it a failure.
505	return !Str.empty();
506	}
507
508	bool StringRef::consumeInteger(unsigned Radix, APInt &Result) {
509	StringRef Str = *this;
510
511	// Autosense radix if not specified.
512	if (Radix == `0`)
513	Radix = GetAutoSenseRadix(Str);
514
515	assert(Radix > `1` && Radix <= `36`);
516
517	// Empty strings (after the radix autosense) are invalid.
518	if (Str.empty()) return true;
519
520	// Skip leading zeroes. This can be a significant improvement if
521	// it means we don't need > 64 bits.
522	Str = Str.ltrim(Char: `'0'`);
523
524	// If it was nothing but zeroes....
525	if (Str.empty()) {
526	Result = APInt (`64`, `0`);
527	*this = Str;
528	return false;
529	}
530
531	// (Over-)estimate the required number of bits.
532	unsigned Log2Radix = `0`;
533	while ((`1U` << Log2Radix) < Radix) Log2Radix++;
534	bool IsPowerOf2Radix = ((`1U` << Log2Radix) == Radix);
535
536	unsigned BitWidth = Log2Radix * Str.size();
537	if (BitWidth < Result.getBitWidth())
538	BitWidth = Result.getBitWidth(); // don't shrink the result
539	else if (BitWidth > Result.getBitWidth())
540	Result = Result.zext(width: BitWidth);
541
542	APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
543	if (!IsPowerOf2Radix) {
544	// These must have the same bit-width as Result.
545	RadixAP = APInt (BitWidth, Radix);
546	CharAP = APInt (BitWidth, `0`);
547	}
548
549	// Parse all the bytes of the string given this radix.
550	Result = `0`;
551	while (!Str.empty()) {
552	unsigned CharVal;
553	if (Str [`0`] >= `'0'` && Str [`0`] <= `'9'`)
554	CharVal = Str [`0`]-`'0'`;
555	else if (Str [`0`] >= `'a'` && Str [`0`] <= `'z'`)
556	CharVal = Str [`0`]-`'a'`+`10`;
557	else if (Str [`0`] >= `'A'` && Str [`0`] <= `'Z'`)
558	CharVal = Str [`0`]-`'A'`+`10`;
559	else
560	break;
561
562	// If the parsed value is larger than the integer radix, the string is
563	// invalid.
564	if (CharVal >= Radix)
565	break;
566
567	// Add in this character.
568	if (IsPowerOf2Radix) {
569	Result <<= Log2Radix;
570	Result \|= CharVal;
571	} else {
572	Result *= RadixAP;
573	CharAP = CharVal;
574	Result += CharAP;
575	}
576
577	Str = Str.substr(Start: `1`);
578	}
579
580	// We consider the operation a failure if no characters were consumed
581	// successfully.
582	if (size() == Str.size())
583	return true;
584
585	*this = Str;
586	return false;
587	}
588
589	bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
590	StringRef Str = *this;
591	if (Str.consumeInteger(Radix, Result))
592	return true;
593
594	// For getAsInteger, we require the whole string to be consumed or else we
595	// consider it a failure.
596	return !Str.empty();
597	}
598
599	bool StringRef::getAsDouble(double &Result, bool AllowInexact) const {
600	APFloat F(`0.0`);
601	auto StatusOrErr = F.convertFromString(*this, APFloat::rmNearestTiesToEven);
602	if (errorToBool(Err: StatusOrErr.takeError()))
603	return true;
604
605	APFloat::opStatus Status = *StatusOrErr;
606	if (Status != APFloat::opOK) {
607	if (!AllowInexact \|\| !(Status & APFloat::opInexact))
608	return true;
609	}
610
611	Result = F.convertToDouble();
612	return false;
613	}
614
615	// Implementation of StringRef hashing.
616	hash_code llvm::hash_value(StringRef S) { return hash_combine_range(R&: S); }
617
618	unsigned DenseMapInfo<StringRef, void>::getHashValue(StringRef Val) {
619	assert(Val.data() != getEmptyKey().data() &&
620	"Cannot hash the empty key!");
621	assert(Val.data() != getTombstoneKey().data() &&
622	"Cannot hash the tombstone key!");
623	return (unsigned)(hash_value(S: Val));
624	}
625

Browse the source code of llvm_projects/llvm/lib/Support/StringRef.cpp