1 | //===-- Support/DJB.cpp ---DJB Hash -----------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file contains support for the DJ Bernstein hash function. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "llvm/Support/DJB.h" |
14 | #include "llvm/ADT/ArrayRef.h" |
15 | #include "llvm/Support/Compiler.h" |
16 | #include "llvm/Support/ConvertUTF.h" |
17 | #include "llvm/Support/Unicode.h" |
18 | |
19 | using namespace llvm; |
20 | |
21 | static UTF32 chopOneUTF32(StringRef &Buffer) { |
22 | UTF32 C; |
23 | const UTF8 *const Begin8Const = |
24 | reinterpret_cast<const UTF8 *>(Buffer.begin()); |
25 | const UTF8 *Begin8 = Begin8Const; |
26 | UTF32 *Begin32 = &C; |
27 | |
28 | // In lenient mode we will always end up with a "reasonable" value in C for |
29 | // non-empty input. |
30 | assert(!Buffer.empty()); |
31 | ConvertUTF8toUTF32(sourceStart: &Begin8, sourceEnd: reinterpret_cast<const UTF8 *>(Buffer.end()), |
32 | targetStart: &Begin32, targetEnd: &C + 1, flags: lenientConversion); |
33 | Buffer = Buffer.drop_front(N: Begin8 - Begin8Const); |
34 | return C; |
35 | } |
36 | |
37 | static StringRef toUTF8(UTF32 C, MutableArrayRef<UTF8> Storage) { |
38 | const UTF32 *Begin32 = &C; |
39 | UTF8 *Begin8 = Storage.begin(); |
40 | |
41 | // The case-folded output should always be a valid unicode character, so use |
42 | // strict mode here. |
43 | ConversionResult CR = ConvertUTF32toUTF8(sourceStart: &Begin32, sourceEnd: &C + 1, targetStart: &Begin8, |
44 | targetEnd: Storage.end(), flags: strictConversion); |
45 | assert(CR == conversionOK && "Case folding produced invalid char?" ); |
46 | (void)CR; |
47 | return StringRef(reinterpret_cast<char *>(Storage.begin()), |
48 | Begin8 - Storage.begin()); |
49 | } |
50 | |
51 | static UTF32 foldCharDwarf(UTF32 C) { |
52 | // DWARF v5 addition to the unicode folding rules. |
53 | // Fold "Latin Small Letter Dotless I" and "Latin Capital Letter I With Dot |
54 | // Above" into "i". |
55 | if (C == 0x130 || C == 0x131) |
56 | return 'i'; |
57 | return sys::unicode::foldCharSimple(C); |
58 | } |
59 | |
60 | static std::optional<uint32_t> fastCaseFoldingDjbHash(StringRef Buffer, |
61 | uint32_t H) { |
62 | bool AllASCII = true; |
63 | for (unsigned char C : Buffer) { |
64 | H = H * 33 + ('A' <= C && C <= 'Z' ? C - 'A' + 'a' : C); |
65 | AllASCII &= C <= 0x7f; |
66 | } |
67 | if (AllASCII) |
68 | return H; |
69 | return std::nullopt; |
70 | } |
71 | |
72 | uint32_t llvm::caseFoldingDjbHash(StringRef Buffer, uint32_t H) { |
73 | if (std::optional<uint32_t> Result = fastCaseFoldingDjbHash(Buffer, H)) |
74 | return *Result; |
75 | |
76 | std::array<UTF8, UNI_MAX_UTF8_BYTES_PER_CODE_POINT> Storage; |
77 | while (!Buffer.empty()) { |
78 | UTF32 C = foldCharDwarf(C: chopOneUTF32(Buffer)); |
79 | StringRef Folded = toUTF8(C, Storage); |
80 | H = djbHash(Buffer: Folded, H); |
81 | } |
82 | return H; |
83 | } |
84 | |