CommentLexer.cpp source code [llvm_projects/clang/lib/AST/CommentLexer.cpp]

1	//===--- CommentLexer.cpp -------------------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "clang/AST/CommentLexer.h"
10	#include "clang/AST/CommentCommandTraits.h"
11	#include "clang/Basic/CharInfo.h"
12	#include "clang/Basic/DiagnosticComment.h"
13	#include "llvm/ADT/StringExtras.h"
14	#include "llvm/ADT/StringSwitch.h"
15	#include "llvm/Support/ConvertUTF.h"
16	#include "llvm/Support/ErrorHandling.h"
17
18	namespace clang {
19	namespace comments {
20
21	void Token::dump(const Lexer &L, const SourceManager &SM) const {
22	llvm::errs() << "comments::Token Kind=" << Kind << " ";
23	Loc.print(OS&: llvm::errs(), SM);
24	llvm::errs() << " " << Length << " \"" << L.getSpelling(Tok: *this, SourceMgr: SM) << "\"\n";
25	}
26
27	static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
28	return isLetter(c: C);
29	}
30
31	static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
32	return isDigit(c: C);
33	}
34
35	static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
36	return isHexDigit(c: C);
37	}
38
39	static inline StringRef convertCodePointToUTF8(
40	llvm::BumpPtrAllocator &Allocator,
41	unsigned CodePoint) {
42	char Resolved = Allocator.Allocate<char*>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
43	char *ResolvedPtr = Resolved;
44	if (llvm::ConvertCodePointToUTF8(Source: CodePoint, ResultPtr&: ResolvedPtr))
45	return StringRef(Resolved, ResolvedPtr - Resolved);
46	else
47	return StringRef();
48	}
49
50	namespace {
51
52	#include "clang/AST/CommentHTMLTags.inc"
53	#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
54
55	} // end anonymous namespace
56
57	StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
58	// Fast path, first check a few most widely used named character references.
59	return llvm::StringSwitch<StringRef>(Name)
60	.Case(S: "amp", Value: "&")
61	.Case(S: "lt", Value: "<")
62	.Case(S: "gt", Value: ">")
63	.Case(S: "quot", Value: "\"")
64	.Case(S: "apos", Value: "\'")
65	// Slow path.
66	.Default(Value: translateHTMLNamedCharacterReferenceToUTF8(Name));
67	}
68
69	StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
70	unsigned CodePoint = `0`;
71	for (unsigned i = `0`, e = Name.size(); i != e; ++i) {
72	assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
73	CodePoint *= `10`;
74	CodePoint += Name [i] - `'0'`;
75	}
76	return convertCodePointToUTF8(Allocator, CodePoint);
77	}
78
79	StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
80	unsigned CodePoint = `0`;
81	for (unsigned i = `0`, e = Name.size(); i != e; ++i) {
82	CodePoint *= `16`;
83	const char C = Name [i];
84	assert(isHTMLHexCharacterReferenceCharacter(C));
85	CodePoint += llvm::hexDigitValue(C);
86	}
87	return convertCodePointToUTF8(Allocator, CodePoint);
88	}
89
90	void Lexer::skipLineStartingDecorations() {
91	// This function should be called only for C comments
92	assert(CommentState == LCS_InsideCComment);
93
94	if (BufferPtr == CommentEnd)
95	return;
96
97	const char *NewBufferPtr = BufferPtr;
98	while (isHorizontalWhitespace(c: *NewBufferPtr))
99	if (++NewBufferPtr == CommentEnd)
100	return;
101	if (NewBufferPtr == `''`)
102	BufferPtr = NewBufferPtr + `1`;
103	}
104
105	namespace {
106	/// Returns pointer to the first newline character in the string.
107	const char findNewline(const* char BufferPtr, const* char *BufferEnd) {
108	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
109	if (isVerticalWhitespace(c: *BufferPtr))
110	return BufferPtr;
111	}
112	return BufferEnd;
113	}
114
/// Skips a single line terminator starting at \p BufferPtr.
///
/// Recognizes "\n", "\r" and "\r\n". If \p BufferPtr already equals
/// \p BufferEnd it is returned unchanged; otherwise the character at
/// \p BufferPtr must be '\n' or '\r' (asserted).
///
/// \returns pointer to the first character after the line terminator.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    BufferPtr++;
  else {
    assert(*BufferPtr == '\r');
    BufferPtr++;
    // Treat "\r\n" as one line terminator.
    if (BufferPtr != BufferEnd && *BufferPtr == '\n')
      BufferPtr++;
  }
  return BufferPtr;
}
129
130	const char skipNamedCharacterReference(const* char *BufferPtr,
131	const char *BufferEnd) {
132	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
133	if (!isHTMLNamedCharacterReferenceCharacter(C: *BufferPtr))
134	return BufferPtr;
135	}
136	return BufferEnd;
137	}
138
139	const char skipDecimalCharacterReference(const* char *BufferPtr,
140	const char *BufferEnd) {
141	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
142	if (!isHTMLDecimalCharacterReferenceCharacter(C: *BufferPtr))
143	return BufferPtr;
144	}
145	return BufferEnd;
146	}
147
148	const char skipHexCharacterReference(const* char *BufferPtr,
149	const char *BufferEnd) {
150	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
151	if (!isHTMLHexCharacterReferenceCharacter(C: *BufferPtr))
152	return BufferPtr;
153	}
154	return BufferEnd;
155	}
156
157	bool isHTMLIdentifierStartingCharacter(char C) {
158	return isLetter(c: C);
159	}
160
161	bool isHTMLIdentifierCharacter(char C) {
162	return isAlphanumeric(c: C);
163	}
164
165	const char skipHTMLIdentifier(const* char BufferPtr, const* char *BufferEnd) {
166	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
167	if (!isHTMLIdentifierCharacter(C: *BufferPtr))
168	return BufferPtr;
169	}
170	return BufferEnd;
171	}
172
/// Skip HTML string quoted in single or double quotes.  Escaping quotes inside
/// string allowed.
///
/// \p BufferPtr must point at the opening quote (asserted). A quote character
/// that is immediately preceded by a backslash does not terminate the string.
///
/// Returns pointer to closing quote, or \p BufferEnd if the string is not
/// terminated within the range.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  BufferPtr++;
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    const char C = *BufferPtr;
    // BufferPtr[-1] is always in range here: at worst it is the opening quote.
    if (C == Quote && BufferPtr[-1] != '\\')
      return BufferPtr;
  }
  return BufferEnd;
}
190
191	const char skipWhitespace(const* char BufferPtr, const* char *BufferEnd) {
192	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
193	if (!isWhitespace(c: *BufferPtr))
194	return BufferPtr;
195	}
196	return BufferEnd;
197	}
198
199	const char skipHorizontalWhitespace(const* char *BufferPtr,
200	const char *BufferEnd) {
201	for (; BufferPtr != BufferEnd; ++BufferPtr) {
202	if (!isHorizontalWhitespace(c: *BufferPtr))
203	return BufferPtr;
204	}
205	return BufferEnd;
206	}
207
208	bool isWhitespace(const char BufferPtr, const* char *BufferEnd) {
209	return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
210	}
211
212	bool isCommandNameStartCharacter(char C) {
213	return isLetter(c: C);
214	}
215
216	bool isCommandNameCharacter(char C) {
217	return isAlphanumeric(c: C);
218	}
219
220	const char skipCommandName(const* char BufferPtr, const* char *BufferEnd) {
221	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
222	if (!isCommandNameCharacter(C: *BufferPtr))
223	return BufferPtr;
224	}
225	return BufferEnd;
226	}
227
228	/// Return the one past end pointer for BCPL comments.
229	/// Handles newlines escaped with backslash or trigraph for backslahs.
230	const char findBCPLCommentEnd(const* char BufferPtr, const* char *BufferEnd) {
231	const char *CurPtr = BufferPtr;
232	while (CurPtr != BufferEnd) {
233	while (!isVerticalWhitespace(c: *CurPtr)) {
234	CurPtr++;
235	if (CurPtr == BufferEnd)
236	return BufferEnd;
237	}
238	// We found a newline, check if it is escaped.
239	const char *EscapePtr = CurPtr - `1`;
240	while(isHorizontalWhitespace(c: *EscapePtr))
241	EscapePtr--;
242
243	if (*EscapePtr == `'\\'` \|\|
244	(EscapePtr - `2` >= BufferPtr && EscapePtr[`0`] == `'/'` &&
245	EscapePtr[-`1`] == `'?'` && EscapePtr[-`2`] == `'?'`)) {
246	// We found an escaped newline.
247	CurPtr = skipNewline(BufferPtr: CurPtr, BufferEnd);
248	} else
249	return CurPtr; // Not an escaped newline.
250	}
251	return BufferEnd;
252	}
253
254	/// Return the one past end pointer for C comments.
255	/// Very dumb, does not handle escaped newlines or trigraphs.
256	const char findCCommentEnd(const* char BufferPtr, const* char *BufferEnd) {
257	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
258	if (BufferPtr == `''`) {
259	assert(BufferPtr + `1` != BufferEnd);
260	if (*(BufferPtr + `1`) == `'/'`)
261	return BufferPtr;
262	}
263	}
264	llvm_unreachable("buffer end hit before '*/' was seen");
265	}
266
267	} // end anonymous namespace
268
269	void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
270	tok::TokenKind Kind) {
271	const unsigned TokLen = TokEnd - BufferPtr;
272	Result.setLocation(getSourceLocation(Loc: BufferPtr));
273	Result.setKind(Kind);
274	Result.setLength(TokLen);
275	#ifndef NDEBUG
276	Result.TextPtr = "<UNSET>";
277	Result.IntVal = `7`;
278	#endif
279	BufferPtr = TokEnd;
280	}
281
282	const char *Lexer::skipTextToken() {
283	const char *TokenPtr = BufferPtr;
284	assert(TokenPtr < CommentEnd);
285	StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";
286
287	again:
288	size_t End =
289	StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(Chars: TokStartSymbols);
290	if (End == StringRef::npos)
291	return CommentEnd;
292
293	// Doxygen doesn't recognize any commands in a one-line double quotation.
294	// If we don't find an ending quotation mark, we pretend it never began.
295	if (*(TokenPtr + End) == `'\"'`) {
296	TokenPtr += End + `1`;
297	End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(Chars: "\n\r\"");
298	if (End != StringRef::npos && *(TokenPtr + End) == `'\"'`)
299	TokenPtr += End + `1`;
300	goto again;
301	}
302	return TokenPtr + End;
303	}
304
305	void Lexer::lexCommentText(Token &T) {
306	assert(CommentState == LCS_InsideBCPLComment \|\|
307	CommentState == LCS_InsideCComment);
308
309	// Handles lexing non-command text, i.e. text and newline.
310	auto HandleNonCommandToken = [&]() -> void {
311	assert(State == LS_Normal);
312
313	const char *TokenPtr = BufferPtr;
314	assert(TokenPtr < CommentEnd);
315	switch (*TokenPtr) {
316	case `'\n'`:
317	case `'\r'`:
318	TokenPtr = skipNewline(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
319	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::newline);
320
321	if (CommentState == LCS_InsideCComment)
322	skipLineStartingDecorations();
323	return;
324
325	default:
326	return formTextToken(Result&: T, TokEnd: skipTextToken());
327	}
328	};
329
330	if (!ParseCommands)
331	return HandleNonCommandToken ();
332
333	switch (State) {
334	case LS_Normal:
335	break;
336	case LS_VerbatimBlockFirstLine:
337	lexVerbatimBlockFirstLine(T);
338	return;
339	case LS_VerbatimBlockBody:
340	lexVerbatimBlockBody(T);
341	return;
342	case LS_VerbatimLineText:
343	lexVerbatimLineText(T);
344	return;
345	case LS_HTMLStartTag:
346	lexHTMLStartTag(T);
347	return;
348	case LS_HTMLEndTag:
349	lexHTMLEndTag(T);
350	return;
351	}
352
353	assert(State == LS_Normal);
354	const char *TokenPtr = BufferPtr;
355	assert(TokenPtr < CommentEnd);
356	switch(*TokenPtr) {
357	case `'\\'`:
358	case `'@'`: {
359	// Commands that start with a backslash and commands that start with
360	// 'at' have equivalent semantics. But we keep information about the
361	// exact syntax in AST for comments.
362	tok::TokenKind CommandKind =
363	(*TokenPtr == `'@'`) ? tok::at_command : tok::backslash_command;
364	TokenPtr++;
365	if (TokenPtr == CommentEnd) {
366	formTextToken(Result&: T, TokEnd: TokenPtr);
367	return;
368	}
369	char C = *TokenPtr;
370	switch (C) {
371	default:
372	break;
373
374	case `'\\'`: case `'@'`: case `'&'`: case `'$'`:
375	case `'#'`: case `'<'`: case `'>'`: case `'%'`:
376	case `'\"'`: case `'.'`: case `':'`:
377	// This is one of \\ \@ \& \$ etc escape sequences.
378	TokenPtr++;
379	if (C == `':'` && TokenPtr != CommentEnd && *TokenPtr == `':'`) {
380	// This is the \:: escape sequence.
381	TokenPtr++;
382	}
383	StringRef UnescapedText(BufferPtr + `1`, TokenPtr - (BufferPtr + `1`));
384	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::text);
385	T.setText(UnescapedText);
386	return;
387	}
388
389	// Don't make zero-length commands.
390	if (!isCommandNameStartCharacter(C: *TokenPtr)) {
391	formTextToken(Result&: T, TokEnd: TokenPtr);
392	return;
393	}
394
395	TokenPtr = skipCommandName(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
396	unsigned Length = TokenPtr - (BufferPtr + `1`);
397
398	// Hardcoded support for lexing LaTeX formula commands
399	// \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
400	if (Length == `1` && TokenPtr[-`1`] == `'f'` && TokenPtr != CommentEnd) {
401	C = *TokenPtr;
402	if (C == `'$'` \|\| C == `'('` \|\| C == `')'` \|\| C == `'['` \|\| C == `']'` \|\|
403	C == `'{'` \|\| C == `'}'`) {
404	TokenPtr++;
405	Length++;
406	}
407	}
408
409	StringRef CommandName(BufferPtr + `1`, Length);
410
411	const CommandInfo *Info = Traits.getCommandInfoOrNULL(Name: CommandName);
412	if (!Info) {
413	if ((Info = Traits.getTypoCorrectCommandInfo(Typo: CommandName))) {
414	StringRef CorrectedName = Info->Name;
415	SourceLocation Loc = getSourceLocation(Loc: BufferPtr);
416	SourceLocation EndLoc = getSourceLocation(Loc: TokenPtr);
417	SourceRange FullRange = SourceRange (Loc, EndLoc);
418	SourceRange CommandRange(Loc.getLocWithOffset(Offset: `1`), EndLoc);
419	Diag(Loc, DiagID: diag::warn_correct_comment_command_name)
420	<< FullRange << CommandName << CorrectedName
421	<< FixItHint::CreateReplacement(RemoveRange: CommandRange, Code: CorrectedName);
422	} else {
423	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::unknown_command);
424	T.setUnknownCommandName(CommandName);
425	Diag(Loc: T.getLocation(), DiagID: diag::warn_unknown_comment_command_name)
426	<< SourceRange (T.getLocation(), T.getEndLocation());
427	return;
428	}
429	}
430	if (Info->IsVerbatimBlockCommand) {
431	setupAndLexVerbatimBlock(T, TextBegin: TokenPtr, Marker: *BufferPtr, Info);
432	return;
433	}
434	if (Info->IsVerbatimLineCommand) {
435	setupAndLexVerbatimLine(T, TextBegin: TokenPtr, Info);
436	return;
437	}
438	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: CommandKind);
439	T.setCommandID(Info->getID());
440	return;
441	}
442
443	case `'&'`:
444	lexHTMLCharacterReference(T);
445	return;
446
447	case `'<'`: {
448	TokenPtr++;
449	if (TokenPtr == CommentEnd) {
450	formTextToken(Result&: T, TokEnd: TokenPtr);
451	return;
452	}
453	const char C = *TokenPtr;
454	if (isHTMLIdentifierStartingCharacter(C))
455	setupAndLexHTMLStartTag(T);
456	else if (C == `'/'`)
457	setupAndLexHTMLEndTag(T);
458	else
459	formTextToken(Result&: T, TokEnd: TokenPtr);
460	return;
461	}
462
463	default:
464	return HandleNonCommandToken ();
465	}
466	}
467
468	void Lexer::setupAndLexVerbatimBlock(Token &T,
469	const char *TextBegin,
470	char Marker, const CommandInfo *Info) {
471	assert(Info->IsVerbatimBlockCommand);
472
473	VerbatimBlockEndCommandName.clear();
474	VerbatimBlockEndCommandName.append(RHS: Marker == `'\\'` ? "\\" : "@");
475	VerbatimBlockEndCommandName.append(RHS: Info->EndCommandName);
476
477	formTokenWithChars(Result&: T, TokEnd: TextBegin, Kind: tok::verbatim_block_begin);
478	T.setVerbatimBlockID(Info->getID());
479
480	// If there is a newline following the verbatim opening command, skip the
481	// newline so that we don't create an tok::verbatim_block_line with empty
482	// text content.
483	if (BufferPtr != CommentEnd &&
484	isVerticalWhitespace(c: *BufferPtr)) {
485	BufferPtr = skipNewline(BufferPtr, BufferEnd: CommentEnd);
486	State = LS_VerbatimBlockBody;
487	return;
488	}
489
490	State = LS_VerbatimBlockFirstLine;
491	}
492
493	void Lexer::lexVerbatimBlockFirstLine(Token &T) {
494	again:
495	assert(BufferPtr < CommentEnd);
496
497	// FIXME: It would be better to scan the text once, finding either the block
498	// end command or newline.
499	//
500	// Extract current line.
501	const char *Newline = findNewline(BufferPtr, BufferEnd: CommentEnd);
502	StringRef Line(BufferPtr, Newline - BufferPtr);
503
504	// Look for end command in current line.
505	size_t Pos = Line.find(Str: VerbatimBlockEndCommandName);
506	const char *TextEnd;
507	const char *NextLine;
508	if (Pos == StringRef::npos) {
509	// Current line is completely verbatim.
510	TextEnd = Newline;
511	NextLine = skipNewline(BufferPtr: Newline, BufferEnd: CommentEnd);
512	} else if (Pos == `0`) {
513	// Current line contains just an end command.
514	const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
515	StringRef Name(BufferPtr + `1`, End - (BufferPtr + `1`));
516	formTokenWithChars(Result&: T, TokEnd: End, Kind: tok::verbatim_block_end);
517	T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
518	State = LS_Normal;
519	return;
520	} else {
521	// There is some text, followed by end command. Extract text first.
522	TextEnd = BufferPtr + Pos;
523	NextLine = TextEnd;
524	// If there is only whitespace before end command, skip whitespace.
525	if (isWhitespace(BufferPtr, BufferEnd: TextEnd)) {
526	BufferPtr = TextEnd;
527	goto again;
528	}
529	}
530
531	StringRef Text(BufferPtr, TextEnd - BufferPtr);
532	formTokenWithChars(Result&: T, TokEnd: NextLine, Kind: tok::verbatim_block_line);
533	T.setVerbatimBlockText(Text);
534
535	State = LS_VerbatimBlockBody;
536	}
537
538	void Lexer::lexVerbatimBlockBody(Token &T) {
539	assert(State == LS_VerbatimBlockBody);
540
541	if (CommentState == LCS_InsideCComment)
542	skipLineStartingDecorations();
543
544	if (BufferPtr == CommentEnd) {
545	formTokenWithChars(Result&: T, TokEnd: BufferPtr, Kind: tok::verbatim_block_line);
546	T.setVerbatimBlockText("");
547	return;
548	}
549
550	lexVerbatimBlockFirstLine(T);
551	}
552
553	void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
554	const CommandInfo *Info) {
555	assert(Info->IsVerbatimLineCommand);
556	formTokenWithChars(Result&: T, TokEnd: TextBegin, Kind: tok::verbatim_line_name);
557	T.setVerbatimLineID(Info->getID());
558
559	State = LS_VerbatimLineText;
560	}
561
562	void Lexer::lexVerbatimLineText(Token &T) {
563	assert(State == LS_VerbatimLineText);
564
565	// Extract current line.
566	const char *Newline = findNewline(BufferPtr, BufferEnd: CommentEnd);
567	StringRef Text(BufferPtr, Newline - BufferPtr);
568	formTokenWithChars(Result&: T, TokEnd: Newline, Kind: tok::verbatim_line_text);
569	T.setVerbatimLineText(Text);
570
571	State = LS_Normal;
572	}
573
574	void Lexer::lexHTMLCharacterReference(Token &T) {
575	const char *TokenPtr = BufferPtr;
576	assert(*TokenPtr == `'&'`);
577	TokenPtr++;
578	if (TokenPtr == CommentEnd) {
579	formTextToken(Result&: T, TokEnd: TokenPtr);
580	return;
581	}
582	const char *NamePtr;
583	bool isNamed = false;
584	bool isDecimal = false;
585	char C = *TokenPtr;
586	if (isHTMLNamedCharacterReferenceCharacter(C)) {
587	NamePtr = TokenPtr;
588	TokenPtr = skipNamedCharacterReference(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
589	isNamed = true;
590	} else if (C == `'#'`) {
591	TokenPtr++;
592	if (TokenPtr == CommentEnd) {
593	formTextToken(Result&: T, TokEnd: TokenPtr);
594	return;
595	}
596	C = *TokenPtr;
597	if (isHTMLDecimalCharacterReferenceCharacter(C)) {
598	NamePtr = TokenPtr;
599	TokenPtr = skipDecimalCharacterReference(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
600	isDecimal = true;
601	} else if (C == `'x'` \|\| C == `'X'`) {
602	TokenPtr++;
603	NamePtr = TokenPtr;
604	TokenPtr = skipHexCharacterReference(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
605	} else {
606	formTextToken(Result&: T, TokEnd: TokenPtr);
607	return;
608	}
609	} else {
610	formTextToken(Result&: T, TokEnd: TokenPtr);
611	return;
612	}
613	if (NamePtr == TokenPtr \|\| TokenPtr == CommentEnd \|\|
614	*TokenPtr != `';'`) {
615	formTextToken(Result&: T, TokEnd: TokenPtr);
616	return;
617	}
618	StringRef Name(NamePtr, TokenPtr - NamePtr);
619	TokenPtr++; // Skip semicolon.
620	StringRef Resolved;
621	if (isNamed)
622	Resolved = resolveHTMLNamedCharacterReference(Name);
623	else if (isDecimal)
624	Resolved = resolveHTMLDecimalCharacterReference(Name);
625	else
626	Resolved = resolveHTMLHexCharacterReference(Name);
627
628	if (Resolved.empty()) {
629	formTextToken(Result&: T, TokEnd: TokenPtr);
630	return;
631	}
632	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::text);
633	T.setText(Resolved);
634	}
635
636	void Lexer::setupAndLexHTMLStartTag(Token &T) {
637	assert(BufferPtr[`0`] == `'<'` &&
638	isHTMLIdentifierStartingCharacter(BufferPtr[`1`]));
639	const char *TagNameEnd = skipHTMLIdentifier(BufferPtr: BufferPtr + `2`, BufferEnd: CommentEnd);
640	StringRef Name(BufferPtr + `1`, TagNameEnd - (BufferPtr + `1`));
641	if (!isHTMLTagName(Name)) {
642	formTextToken(Result&: T, TokEnd: TagNameEnd);
643	return;
644	}
645
646	formTokenWithChars(Result&: T, TokEnd: TagNameEnd, Kind: tok::html_start_tag);
647	T.setHTMLTagStartName(Name);
648
649	BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd: CommentEnd);
650	if (BufferPtr == CommentEnd) { // in BCPL comments
651	State = LS_HTMLStartTag;
652	return;
653	}
654
655	const char C = *BufferPtr;
656	if (BufferPtr != CommentEnd &&
657	(C == `'>'` \|\| C == `'/'` \|\| isVerticalWhitespace(c: C) \|\|
658	isHTMLIdentifierStartingCharacter(C)))
659	State = LS_HTMLStartTag;
660	}
661
662	void Lexer::lexHTMLStartTag(Token &T) {
663	assert(State == LS_HTMLStartTag);
664
665	// Skip leading whitespace and comment decorations
666	while (isVerticalWhitespace(c: *BufferPtr)) {
667	BufferPtr = skipNewline(BufferPtr, BufferEnd: CommentEnd);
668
669	if (CommentState == LCS_InsideCComment)
670	skipLineStartingDecorations();
671
672	BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd: CommentEnd);
673	if (BufferPtr == CommentEnd) {
674	// HTML starting tags must be defined in a single comment block.
675	// It's likely a user-error where they forgot to terminate the comment.
676	State = LS_Normal;
677	// Since at least one newline was skipped and one token needs to be lexed,
678	// return a newline.
679	formTokenWithChars(Result&: T, TokEnd: BufferPtr, Kind: tok::newline);
680	return;
681	}
682	}
683
684	const char *TokenPtr = BufferPtr;
685	char C = *TokenPtr;
686	if (isHTMLIdentifierCharacter(C)) {
687	TokenPtr = skipHTMLIdentifier(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
688	StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
689	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_ident);
690	T.setHTMLIdent(Ident);
691	} else {
692	switch (C) {
693	case `'='`:
694	TokenPtr++;
695	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_equals);
696	break;
697	case `'\"'`:
698	case `'\''`: {
699	const char *OpenQuote = TokenPtr;
700	TokenPtr = skipHTMLQuotedString(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
701	const char *ClosingQuote = TokenPtr;
702	if (TokenPtr != CommentEnd) // Skip closing quote.
703	TokenPtr++;
704	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_quoted_string);
705	T.setHTMLQuotedString(StringRef(OpenQuote + `1`,
706	ClosingQuote - (OpenQuote + `1`)));
707	break;
708	}
709	case `'>'`:
710	TokenPtr++;
711	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_greater);
712	State = LS_Normal;
713	return;
714	case `'/'`:
715	TokenPtr++;
716	if (TokenPtr != CommentEnd && *TokenPtr == `'>'`) {
717	TokenPtr++;
718	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_slash_greater);
719	} else
720	formTextToken(Result&: T, TokEnd: TokenPtr);
721
722	State = LS_Normal;
723	return;
724	}
725	}
726
727	// Now look ahead and return to normal state if we don't see any HTML tokens
728	// ahead.
729	BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd: CommentEnd);
730	if (BufferPtr == CommentEnd) {
731	return;
732	}
733
734	C = *BufferPtr;
735	if (!isHTMLIdentifierStartingCharacter(C) && !isVerticalWhitespace(c: C) &&
736	C != `'='` && C != `'\"'` && C != `'\''` && C != `'>'` && C != `'/'`) {
737	State = LS_Normal;
738	return;
739	}
740	}
741
742	void Lexer::setupAndLexHTMLEndTag(Token &T) {
743	assert(BufferPtr[`0`] == `'<'` && BufferPtr[`1`] == `'/'`);
744
745	const char *TagNameBegin = skipWhitespace(BufferPtr: BufferPtr + `2`, BufferEnd: CommentEnd);
746	const char *TagNameEnd = skipHTMLIdentifier(BufferPtr: TagNameBegin, BufferEnd: CommentEnd);
747	StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
748	if (!isHTMLTagName(Name)) {
749	formTextToken(Result&: T, TokEnd: TagNameEnd);
750	return;
751	}
752
753	const char *End = skipWhitespace(BufferPtr: TagNameEnd, BufferEnd: CommentEnd);
754
755	formTokenWithChars(Result&: T, TokEnd: End, Kind: tok::html_end_tag);
756	T.setHTMLTagEndName(Name);
757
758	if (BufferPtr != CommentEnd && *BufferPtr == `'>'`)
759	State = LS_HTMLEndTag;
760	}
761
762	void Lexer::lexHTMLEndTag(Token &T) {
763	assert(BufferPtr != CommentEnd && *BufferPtr == `'>'`);
764
765	formTokenWithChars(Result&: T, TokEnd: BufferPtr + `1`, Kind: tok::html_greater);
766	State = LS_Normal;
767	}
768
769	Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
770	const CommandTraits &Traits, SourceLocation FileLoc,
771	const char BufferStart, const* char BufferEnd, bool* ParseCommands)
772	: Allocator(Allocator), Diags(Diags), Traits(Traits),
773	BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
774	FileLoc (FileLoc), ParseCommands(ParseCommands),
775	CommentState(LCS_BeforeComment), State(LS_Normal) {}
776
777	void Lexer::lex(Token &T) {
778	again:
779	switch (CommentState) {
780	case LCS_BeforeComment:
781	if (BufferPtr == BufferEnd) {
782	formTokenWithChars(Result&: T, TokEnd: BufferPtr, Kind: tok::eof);
783	return;
784	}
785
786	assert(*BufferPtr == `'/'`);
787	BufferPtr++; // Skip first slash.
788	switch(*BufferPtr) {
789	case `'/'`: { // BCPL comment.
790	BufferPtr++; // Skip second slash.
791
792	if (BufferPtr != BufferEnd) {
793	// Skip Doxygen magic marker, if it is present.
794	// It might be missing because of a typo //< or /<, or because we*
795	// merged this non-Doxygen comment into a bunch of Doxygen comments
796	// around it: /* ... / / ... / /* ... /
797	const char C = *BufferPtr;
798	if (C == `'/'` \|\| C == `'!'`)
799	BufferPtr++;
800	}
801
802	// Skip less-than symbol that marks trailing comments.
803	// Skip it even if the comment is not a Doxygen one, because //< and /<*
804	// are frequent typos.
805	if (BufferPtr != BufferEnd && *BufferPtr == `'<'`)
806	BufferPtr++;
807
808	CommentState = LCS_InsideBCPLComment;
809	switch (State) {
810	case LS_VerbatimBlockFirstLine:
811	case LS_VerbatimBlockBody:
812	break;
813	case LS_HTMLStartTag:
814	BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd);
815	break;
816	default:
817	State = LS_Normal;
818	break;
819	}
820	CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
821	goto again;
822	}
823	case `''`: { // C comment.*
824	BufferPtr++; // Skip star.
825
826	// Skip Doxygen magic marker.
827	const char C = *BufferPtr;
828	if ((C == `''` && (BufferPtr + `1`) != `'/'`) \|\| C == `'!'`)
829	BufferPtr++;
830
831	// Skip less-than symbol that marks trailing comments.
832	if (BufferPtr != BufferEnd && *BufferPtr == `'<'`)
833	BufferPtr++;
834
835	CommentState = LCS_InsideCComment;
836	State = LS_Normal;
837	CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
838	goto again;
839	}
840	default:
841	llvm_unreachable("second character of comment should be '/' or '*'");
842	}
843
844	case LCS_BetweenComments: {
845	// Consecutive comments are extracted only if there is only whitespace
846	// between them. So we can search for the start of the next comment.
847	const char *EndWhitespace = BufferPtr;
848	while(EndWhitespace != BufferEnd && *EndWhitespace != `'/'`)
849	EndWhitespace++;
850
851	// When lexing the start of an HTML tag (i.e. going through the attributes)
852	// there won't be any newlines generated.
853	if (State == LS_HTMLStartTag && EndWhitespace != BufferEnd) {
854	CommentState = LCS_BeforeComment;
855	BufferPtr = EndWhitespace;
856	goto again;
857	}
858
859	// Turn any whitespace between comments (and there is only whitespace
860	// between them -- guaranteed by comment extraction) into a newline. We
861	// have two newlines between C comments in total (first one was synthesized
862	// after a comment).
863	formTokenWithChars(Result&: T, TokEnd: EndWhitespace, Kind: tok::newline);
864
865	CommentState = LCS_BeforeComment;
866	break;
867	}
868
869	case LCS_InsideBCPLComment:
870	case LCS_InsideCComment:
871	if (BufferPtr != CommentEnd) {
872	lexCommentText(T);
873	break;
874	} else {
875	// Skip C comment closing sequence.
876	if (CommentState == LCS_InsideCComment) {
877	assert(BufferPtr[`0`] == `'*'` && BufferPtr[`1`] == `'/'`);
878	BufferPtr += `2`;
879	assert(BufferPtr <= BufferEnd);
880
881	// When lexing the start of an HTML tag (i.e. going through the
882	// attributes) there won't be any newlines generated - whitespace still
883	// needs to be skipped.
884	if (State == LS_HTMLStartTag && BufferPtr != BufferEnd) {
885	CommentState = LCS_BetweenComments;
886	goto again;
887	}
888
889	// Synthenize newline just after the C comment, regardless if there is
890	// actually a newline.
891	formTokenWithChars(Result&: T, TokEnd: BufferPtr, Kind: tok::newline);
892
893	CommentState = LCS_BetweenComments;
894	break;
895	} else {
896	// Don't synthesized a newline after BCPL comment.
897	CommentState = LCS_BetweenComments;
898	goto again;
899	}
900	}
901	}
902	}
903
904	StringRef Lexer::getSpelling(const Token &Tok,
905	const SourceManager &SourceMgr) const {
906	SourceLocation Loc = Tok.getLocation();
907	FileIDAndOffset LocInfo = SourceMgr.getDecomposedLoc(Loc);
908
909	bool InvalidTemp = false;
910	StringRef File = SourceMgr.getBufferData(FID: LocInfo.first, Invalid: &InvalidTemp);
911	if (InvalidTemp)
912	return StringRef();
913
914	const char *Begin = File.data() + LocInfo.second;
915	return StringRef(Begin, Tok.getLength());
916	}
917
918	} // end namespace comments
919	} // end namespace clang
920

Browse the source code of llvm_projects/clang/lib/AST/CommentLexer.cpp