CommentLexer.cpp source code [llvm_projects/clang/lib/AST/CommentLexer.cpp]

//===--- CommentLexer.cpp -------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
8
9	#include "clang/AST/CommentLexer.h"
10	#include "clang/AST/Comment.h"
11	#include "clang/AST/CommentCommandTraits.h"
12	#include "clang/Basic/CharInfo.h"
13	#include "clang/Basic/DiagnosticComment.h"
14	#include "llvm/ADT/StringExtras.h"
15	#include "llvm/ADT/StringSwitch.h"
16	#include "llvm/Support/ConvertUTF.h"
17	#include "llvm/Support/ErrorHandling.h"
18
19	namespace clang {
20	namespace comments {
21
22	void Token::dump(const Lexer &L, const SourceManager &SM) const {
23	llvm::errs() << "comments::Token Kind=" << Kind << " ";
24	Loc.print(OS&: llvm::errs(), SM);
25	llvm::errs() << " " << Length << " \"" << L.getSpelling(Tok: *this, SourceMgr: SM) << "\"\n";
26	}
27
28	static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
29	return isLetter(c: C);
30	}
31
32	static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
33	return isDigit(c: C);
34	}
35
36	static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
37	return isHexDigit(c: C);
38	}
39
40	static inline StringRef convertCodePointToUTF8(
41	llvm::BumpPtrAllocator &Allocator,
42	unsigned CodePoint) {
43	char Resolved = Allocator.Allocate<char*>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
44	char *ResolvedPtr = Resolved;
45	if (llvm::ConvertCodePointToUTF8(Source: CodePoint, ResultPtr&: ResolvedPtr))
46	return StringRef(Resolved, ResolvedPtr - Resolved);
47	else
48	return StringRef();
49	}
50
51	namespace {
52
53	#include "clang/AST/CommentHTMLTags.inc"
54	#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
55
56	} // end anonymous namespace
57
58	StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
59	// Fast path, first check a few most widely used named character references.
60	return llvm::StringSwitch<StringRef>(Name)
61	.Case(S: "amp", Value: "&")
62	.Case(S: "lt", Value: "<")
63	.Case(S: "gt", Value: ">")
64	.Case(S: "quot", Value: "\"")
65	.Case(S: "apos", Value: "\'")
66	// Slow path.
67	.Default(Value: translateHTMLNamedCharacterReferenceToUTF8(Name));
68	}
69
70	StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
71	unsigned CodePoint = `0`;
72	for (unsigned i = `0`, e = Name.size(); i != e; ++i) {
73	assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
74	CodePoint *= `10`;
75	CodePoint += Name [i] - `'0'`;
76	}
77	return convertCodePointToUTF8(Allocator, CodePoint);
78	}
79
80	StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
81	unsigned CodePoint = `0`;
82	for (unsigned i = `0`, e = Name.size(); i != e; ++i) {
83	CodePoint *= `16`;
84	const char C = Name [i];
85	assert(isHTMLHexCharacterReferenceCharacter(C));
86	CodePoint += llvm::hexDigitValue(C);
87	}
88	return convertCodePointToUTF8(Allocator, CodePoint);
89	}
90
91	void Lexer::skipLineStartingDecorations() {
92	// This function should be called only for C comments
93	assert(CommentState == LCS_InsideCComment);
94
95	if (BufferPtr == CommentEnd)
96	return;
97
98	const char *NewBufferPtr = BufferPtr;
99	while (isHorizontalWhitespace(c: *NewBufferPtr))
100	if (++NewBufferPtr == CommentEnd)
101	return;
102	if (NewBufferPtr == `''`)
103	BufferPtr = NewBufferPtr + `1`;
104	}
105
106	namespace {
107	/// Returns pointer to the first newline character in the string.
108	const char findNewline(const* char BufferPtr, const* char *BufferEnd) {
109	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
110	if (isVerticalWhitespace(c: *BufferPtr))
111	return BufferPtr;
112	}
113	return BufferEnd;
114	}
115
/// Skips one line terminator ("\n", "\r" or "\r\n") starting at \p BufferPtr.
///
/// \p BufferPtr must either equal \p BufferEnd (in which case it is returned
/// unchanged) or point at '\n' or '\r'.
/// \returns pointer to the character following the line terminator.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    BufferPtr++;
  else {
    assert(*BufferPtr == '\r');
    BufferPtr++;
    // Treat "\r\n" as a single line terminator.
    if (BufferPtr != BufferEnd && *BufferPtr == '\n')
      BufferPtr++;
  }
  return BufferPtr;
}
130
131	const char skipNamedCharacterReference(const* char *BufferPtr,
132	const char *BufferEnd) {
133	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
134	if (!isHTMLNamedCharacterReferenceCharacter(C: *BufferPtr))
135	return BufferPtr;
136	}
137	return BufferEnd;
138	}
139
140	const char skipDecimalCharacterReference(const* char *BufferPtr,
141	const char *BufferEnd) {
142	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
143	if (!isHTMLDecimalCharacterReferenceCharacter(C: *BufferPtr))
144	return BufferPtr;
145	}
146	return BufferEnd;
147	}
148
149	const char skipHexCharacterReference(const* char *BufferPtr,
150	const char *BufferEnd) {
151	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
152	if (!isHTMLHexCharacterReferenceCharacter(C: *BufferPtr))
153	return BufferPtr;
154	}
155	return BufferEnd;
156	}
157
158	bool isHTMLIdentifierStartingCharacter(char C) {
159	return isLetter(c: C);
160	}
161
162	bool isHTMLIdentifierCharacter(char C) {
163	return isAlphanumeric(c: C);
164	}
165
166	const char skipHTMLIdentifier(const* char BufferPtr, const* char *BufferEnd) {
167	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
168	if (!isHTMLIdentifierCharacter(C: *BufferPtr))
169	return BufferPtr;
170	}
171	return BufferEnd;
172	}
173
/// Skip HTML string quoted in single or double quotes.  Escaping quotes inside
/// string allowed.
///
/// \p BufferPtr must point at the opening quote character. A quote character
/// that is immediately preceded by a backslash does not terminate the string.
///
/// Returns pointer to closing quote, or \p BufferEnd if the string is not
/// terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  BufferPtr++;
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    const char C = *BufferPtr;
    if (C == Quote && BufferPtr[-1] != '\\')
      return BufferPtr;
  }
  return BufferEnd;
}
191
192	const char skipWhitespace(const* char BufferPtr, const* char *BufferEnd) {
193	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
194	if (!isWhitespace(c: *BufferPtr))
195	return BufferPtr;
196	}
197	return BufferEnd;
198	}
199
200	const char skipHorizontalWhitespace(const* char *BufferPtr,
201	const char *BufferEnd) {
202	for (; BufferPtr != BufferEnd; ++BufferPtr) {
203	if (!isHorizontalWhitespace(c: *BufferPtr))
204	return BufferPtr;
205	}
206	return BufferEnd;
207	}
208
209	bool isWhitespace(const char BufferPtr, const* char *BufferEnd) {
210	return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
211	}
212
213	bool isCommandNameStartCharacter(char C) {
214	return isLetter(c: C);
215	}
216
217	bool isCommandNameCharacter(char C) {
218	return isAsciiIdentifierContinue(c: C, AllowDollar: false);
219	}
220
221	const char skipCommandName(const* char BufferPtr, const* char *BufferEnd) {
222	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
223	if (!isCommandNameCharacter(C: *BufferPtr))
224	return BufferPtr;
225	}
226	return BufferEnd;
227	}
228
229	/// Return the one past end pointer for BCPL comments.
230	/// Handles newlines escaped with backslash or trigraph for backslahs.
231	const char findBCPLCommentEnd(const* char BufferPtr, const* char *BufferEnd) {
232	const char *CurPtr = BufferPtr;
233	while (CurPtr != BufferEnd) {
234	while (!isVerticalWhitespace(c: *CurPtr)) {
235	CurPtr++;
236	if (CurPtr == BufferEnd)
237	return BufferEnd;
238	}
239	// We found a newline, check if it is escaped.
240	const char *EscapePtr = CurPtr - `1`;
241	while(isHorizontalWhitespace(c: *EscapePtr))
242	EscapePtr--;
243
244	if (*EscapePtr == `'\\'` \|\|
245	(EscapePtr - `2` >= BufferPtr && EscapePtr[`0`] == `'/'` &&
246	EscapePtr[-`1`] == `'?'` && EscapePtr[-`2`] == `'?'`)) {
247	// We found an escaped newline.
248	CurPtr = skipNewline(BufferPtr: CurPtr, BufferEnd);
249	} else
250	return CurPtr; // Not an escaped newline.
251	}
252	return BufferEnd;
253	}
254
255	/// Return the one past end pointer for C comments.
256	/// Very dumb, does not handle escaped newlines or trigraphs.
257	const char findCCommentEnd(const* char BufferPtr, const* char *BufferEnd) {
258	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
259	if (BufferPtr == `''`) {
260	assert(BufferPtr + `1` != BufferEnd);
261	if (*(BufferPtr + `1`) == `'/'`)
262	return BufferPtr;
263	}
264	}
265	llvm_unreachable("buffer end hit before '*/' was seen");
266	}
267
268	} // end anonymous namespace
269
270	void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
271	tok::TokenKind Kind) {
272	const unsigned TokLen = TokEnd - BufferPtr;
273	Result.setLocation(getSourceLocation(Loc: BufferPtr));
274	Result.setKind(Kind);
275	Result.setLength(TokLen);
276	#ifndef NDEBUG
277	Result.TextPtr = "<UNSET>";
278	Result.IntVal = `7`;
279	#endif
280	BufferPtr = TokEnd;
281	}
282
283	const char *Lexer::skipTextToken() {
284	const char *TokenPtr = BufferPtr;
285	assert(TokenPtr < CommentEnd);
286	StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";
287
288	again:
289	size_t End =
290	StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(Chars: TokStartSymbols);
291	if (End == StringRef::npos)
292	return CommentEnd;
293
294	// Doxygen doesn't recognize any commands in a one-line double quotation.
295	// If we don't find an ending quotation mark, we pretend it never began.
296	if (*(TokenPtr + End) == `'\"'`) {
297	TokenPtr += End + `1`;
298	End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(Chars: "\n\r\"");
299	if (End != StringRef::npos && *(TokenPtr + End) == `'\"'`)
300	TokenPtr += End + `1`;
301	goto again;
302	}
303	return TokenPtr + End;
304	}
305
306	void Lexer::lexCommentText(Token &T) {
307	assert(CommentState == LCS_InsideBCPLComment \|\|
308	CommentState == LCS_InsideCComment);
309
310	// Handles lexing non-command text, i.e. text and newline.
311	auto HandleNonCommandToken = [&]() -> void {
312	assert(State == LS_Normal);
313
314	const char *TokenPtr = BufferPtr;
315	assert(TokenPtr < CommentEnd);
316	switch (*TokenPtr) {
317	case `'\n'`:
318	case `'\r'`:
319	TokenPtr = skipNewline(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
320	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::newline);
321
322	if (CommentState == LCS_InsideCComment)
323	skipLineStartingDecorations();
324	return;
325
326	default:
327	return formTextToken(Result&: T, TokEnd: skipTextToken());
328	}
329	};
330
331	if (!ParseCommands)
332	return HandleNonCommandToken ();
333
334	switch (State) {
335	case LS_Normal:
336	break;
337	case LS_VerbatimBlockFirstLine:
338	lexVerbatimBlockFirstLine(T);
339	return;
340	case LS_VerbatimBlockBody:
341	lexVerbatimBlockBody(T);
342	return;
343	case LS_VerbatimLineText:
344	lexVerbatimLineText(T);
345	return;
346	case LS_HTMLStartTag:
347	lexHTMLStartTag(T);
348	return;
349	case LS_HTMLEndTag:
350	lexHTMLEndTag(T);
351	return;
352	}
353
354	assert(State == LS_Normal);
355	const char *TokenPtr = BufferPtr;
356	assert(TokenPtr < CommentEnd);
357	switch(*TokenPtr) {
358	case `'\\'`:
359	case `'@'`: {
360	// Commands that start with a backslash and commands that start with
361	// 'at' have equivalent semantics. But we keep information about the
362	// exact syntax in AST for comments.
363	tok::TokenKind CommandKind =
364	(*TokenPtr == `'@'`) ? tok::at_command : tok::backslash_command;
365	TokenPtr++;
366	if (TokenPtr == CommentEnd) {
367	formTextToken(Result&: T, TokEnd: TokenPtr);
368	return;
369	}
370	char C = *TokenPtr;
371	switch (C) {
372	default:
373	break;
374
375	case `'\\'`: case `'@'`: case `'&'`: case `'$'`:
376	case `'#'`: case `'<'`: case `'>'`: case `'%'`:
377	case `'\"'`: case `'.'`: case `':'`:
378	// This is one of \\ \@ \& \$ etc escape sequences.
379	TokenPtr++;
380	if (C == `':'` && TokenPtr != CommentEnd && *TokenPtr == `':'`) {
381	// This is the \:: escape sequence.
382	TokenPtr++;
383	}
384	StringRef UnescapedText(BufferPtr + `1`, TokenPtr - (BufferPtr + `1`));
385	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::text);
386	T.setText(UnescapedText);
387	return;
388	}
389
390	// Don't make zero-length commands.
391	if (!isCommandNameStartCharacter(C: *TokenPtr)) {
392	formTextToken(Result&: T, TokEnd: TokenPtr);
393	return;
394	}
395
396	TokenPtr = skipCommandName(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
397	unsigned Length = TokenPtr - (BufferPtr + `1`);
398
399	// Hardcoded support for lexing LaTeX formula commands
400	// \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
401	if (Length == `1` && TokenPtr[-`1`] == `'f'` && TokenPtr != CommentEnd) {
402	C = *TokenPtr;
403	if (C == `'$'` \|\| C == `'('` \|\| C == `')'` \|\| C == `'['` \|\| C == `']'` \|\|
404	C == `'{'` \|\| C == `'}'`) {
405	TokenPtr++;
406	Length++;
407	}
408	}
409
410	StringRef CommandName(BufferPtr + `1`, Length);
411
412	const CommandInfo *Info = Traits.getCommandInfoOrNULL(Name: CommandName);
413	if (!Info) {
414	if ((Info = Traits.getTypoCorrectCommandInfo(Typo: CommandName))) {
415	StringRef CorrectedName = Info->Name;
416	SourceLocation Loc = getSourceLocation(Loc: BufferPtr);
417	SourceLocation EndLoc = getSourceLocation(Loc: TokenPtr);
418	SourceRange FullRange = SourceRange (Loc, EndLoc);
419	SourceRange CommandRange(Loc.getLocWithOffset(Offset: `1`), EndLoc);
420	Diag(Loc, DiagID: diag::warn_correct_comment_command_name)
421	<< FullRange << CommandName << CorrectedName
422	<< FixItHint::CreateReplacement(RemoveRange: CommandRange, Code: CorrectedName);
423	} else {
424	formTokenWithChars(Result&: T, TokEnd: TokenPtr,
425	Kind: CommandKind == tok::backslash_command
426	? tok::unknown_backslash_command
427	: tok::unknown_at_command);
428	T.setUnknownCommandName(CommandName);
429	Diag(Loc: T.getLocation(), DiagID: diag::warn_unknown_comment_command_name)
430	<< SourceRange (T.getLocation(), T.getEndLocation());
431	return;
432	}
433	}
434	if (Info->IsVerbatimBlockCommand) {
435	setupAndLexVerbatimBlock(T, TextBegin: TokenPtr, Marker: *BufferPtr, Info);
436	return;
437	}
438	if (Info->IsVerbatimLineCommand) {
439	setupAndLexVerbatimLine(T, TextBegin: TokenPtr, Info);
440	return;
441	}
442	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: CommandKind);
443	T.setCommandID(Info->getID());
444	return;
445	}
446
447	case `'&'`:
448	lexHTMLCharacterReference(T);
449	return;
450
451	case `'<'`: {
452	TokenPtr++;
453	if (TokenPtr == CommentEnd) {
454	formTextToken(Result&: T, TokEnd: TokenPtr);
455	return;
456	}
457	const char C = *TokenPtr;
458	if (isHTMLIdentifierStartingCharacter(C))
459	setupAndLexHTMLStartTag(T);
460	else if (C == `'/'`)
461	setupAndLexHTMLEndTag(T);
462	else
463	formTextToken(Result&: T, TokEnd: TokenPtr);
464	return;
465	}
466
467	default:
468	return HandleNonCommandToken ();
469	}
470	}
471
472	void Lexer::setupAndLexVerbatimBlock(Token &T,
473	const char *TextBegin,
474	char Marker, const CommandInfo *Info) {
475	assert(Info->IsVerbatimBlockCommand);
476
477	VerbatimBlockEndCommandName.clear();
478	VerbatimBlockEndCommandName.append(RHS: Marker == `'\\'` ? "\\" : "@");
479	VerbatimBlockEndCommandName.append(RHS: Info->EndCommandName);
480
481	formTokenWithChars(Result&: T, TokEnd: TextBegin, Kind: tok::verbatim_block_begin);
482	T.setVerbatimBlockID(Info->getID());
483
484	// If there is a newline following the verbatim opening command, skip the
485	// newline so that we don't create an tok::verbatim_block_line with empty
486	// text content.
487	if (BufferPtr != CommentEnd &&
488	isVerticalWhitespace(c: *BufferPtr)) {
489	BufferPtr = skipNewline(BufferPtr, BufferEnd: CommentEnd);
490	State = LS_VerbatimBlockBody;
491	return;
492	}
493
494	State = LS_VerbatimBlockFirstLine;
495	}
496
497	void Lexer::lexVerbatimBlockFirstLine(Token &T) {
498	again:
499	assert(BufferPtr < CommentEnd);
500
501	// FIXME: It would be better to scan the text once, finding either the block
502	// end command or newline.
503	//
504	// Extract current line.
505	const char *Newline = findNewline(BufferPtr, BufferEnd: CommentEnd);
506	StringRef Line(BufferPtr, Newline - BufferPtr);
507
508	// Look for end command in current line.
509	size_t Pos = Line.find(Str: VerbatimBlockEndCommandName);
510	const char *TextEnd;
511	const char *NextLine;
512	if (Pos == StringRef::npos) {
513	// Current line is completely verbatim.
514	TextEnd = Newline;
515	NextLine = skipNewline(BufferPtr: Newline, BufferEnd: CommentEnd);
516	} else if (Pos == `0`) {
517	// Current line contains just an end command.
518	const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
519	StringRef Name(BufferPtr + `1`, End - (BufferPtr + `1`));
520	formTokenWithChars(Result&: T, TokEnd: End, Kind: tok::verbatim_block_end);
521	T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
522	State = LS_Normal;
523	return;
524	} else {
525	// There is some text, followed by end command. Extract text first.
526	TextEnd = BufferPtr + Pos;
527	NextLine = TextEnd;
528	// If there is only whitespace before end command, skip whitespace.
529	if (isWhitespace(BufferPtr, BufferEnd: TextEnd)) {
530	BufferPtr = TextEnd;
531	goto again;
532	}
533	}
534
535	StringRef Text(BufferPtr, TextEnd - BufferPtr);
536	formTokenWithChars(Result&: T, TokEnd: NextLine, Kind: tok::verbatim_block_line);
537	T.setVerbatimBlockText(Text);
538
539	State = LS_VerbatimBlockBody;
540	}
541
542	void Lexer::lexVerbatimBlockBody(Token &T) {
543	assert(State == LS_VerbatimBlockBody);
544
545	if (CommentState == LCS_InsideCComment)
546	skipLineStartingDecorations();
547
548	if (BufferPtr == CommentEnd) {
549	formTokenWithChars(Result&: T, TokEnd: BufferPtr, Kind: tok::verbatim_block_line);
550	T.setVerbatimBlockText("");
551	return;
552	}
553
554	lexVerbatimBlockFirstLine(T);
555	}
556
557	void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
558	const CommandInfo *Info) {
559	assert(Info->IsVerbatimLineCommand);
560	formTokenWithChars(Result&: T, TokEnd: TextBegin, Kind: tok::verbatim_line_name);
561	T.setVerbatimLineID(Info->getID());
562
563	State = LS_VerbatimLineText;
564	}
565
566	void Lexer::lexVerbatimLineText(Token &T) {
567	assert(State == LS_VerbatimLineText);
568
569	// Extract current line.
570	const char *Newline = findNewline(BufferPtr, BufferEnd: CommentEnd);
571	StringRef Text(BufferPtr, Newline - BufferPtr);
572	formTokenWithChars(Result&: T, TokEnd: Newline, Kind: tok::verbatim_line_text);
573	T.setVerbatimLineText(Text);
574
575	State = LS_Normal;
576	}
577
578	void Lexer::lexHTMLCharacterReference(Token &T) {
579	const char *TokenPtr = BufferPtr;
580	assert(*TokenPtr == `'&'`);
581	TokenPtr++;
582	if (TokenPtr == CommentEnd) {
583	formTextToken(Result&: T, TokEnd: TokenPtr);
584	return;
585	}
586	const char *NamePtr;
587	bool isNamed = false;
588	bool isDecimal = false;
589	char C = *TokenPtr;
590	if (isHTMLNamedCharacterReferenceCharacter(C)) {
591	NamePtr = TokenPtr;
592	TokenPtr = skipNamedCharacterReference(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
593	isNamed = true;
594	} else if (C == `'#'`) {
595	TokenPtr++;
596	if (TokenPtr == CommentEnd) {
597	formTextToken(Result&: T, TokEnd: TokenPtr);
598	return;
599	}
600	C = *TokenPtr;
601	if (isHTMLDecimalCharacterReferenceCharacter(C)) {
602	NamePtr = TokenPtr;
603	TokenPtr = skipDecimalCharacterReference(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
604	isDecimal = true;
605	} else if (C == `'x'` \|\| C == `'X'`) {
606	TokenPtr++;
607	NamePtr = TokenPtr;
608	TokenPtr = skipHexCharacterReference(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
609	} else {
610	formTextToken(Result&: T, TokEnd: TokenPtr);
611	return;
612	}
613	} else {
614	formTextToken(Result&: T, TokEnd: TokenPtr);
615	return;
616	}
617	if (NamePtr == TokenPtr \|\| TokenPtr == CommentEnd \|\|
618	*TokenPtr != `';'`) {
619	formTextToken(Result&: T, TokEnd: TokenPtr);
620	return;
621	}
622	StringRef Name(NamePtr, TokenPtr - NamePtr);
623	TokenPtr++; // Skip semicolon.
624	StringRef Resolved;
625	if (isNamed)
626	Resolved = resolveHTMLNamedCharacterReference(Name);
627	else if (isDecimal)
628	Resolved = resolveHTMLDecimalCharacterReference(Name);
629	else
630	Resolved = resolveHTMLHexCharacterReference(Name);
631
632	if (Resolved.empty()) {
633	formTextToken(Result&: T, TokEnd: TokenPtr);
634	return;
635	}
636	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::text);
637	T.setText(Resolved);
638	}
639
640	void Lexer::setupAndLexHTMLStartTag(Token &T) {
641	assert(BufferPtr[`0`] == `'<'` &&
642	isHTMLIdentifierStartingCharacter(BufferPtr[`1`]));
643	const char *TagNameEnd = skipHTMLIdentifier(BufferPtr: BufferPtr + `2`, BufferEnd: CommentEnd);
644	StringRef Name(BufferPtr + `1`, TagNameEnd - (BufferPtr + `1`));
645	if (!isHTMLTagName(Name)) {
646	formTextToken(Result&: T, TokEnd: TagNameEnd);
647	return;
648	}
649
650	formTokenWithChars(Result&: T, TokEnd: TagNameEnd, Kind: tok::html_start_tag);
651	T.setHTMLTagStartName(Name);
652
653	BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd: CommentEnd);
654	if (BufferPtr == CommentEnd) { // in BCPL comments
655	State = LS_HTMLStartTag;
656	return;
657	}
658
659	const char C = *BufferPtr;
660	if (BufferPtr != CommentEnd &&
661	(C == `'>'` \|\| C == `'/'` \|\| isVerticalWhitespace(c: C) \|\|
662	isHTMLIdentifierStartingCharacter(C)))
663	State = LS_HTMLStartTag;
664	}
665
666	void Lexer::lexHTMLStartTag(Token &T) {
667	assert(State == LS_HTMLStartTag);
668
669	// Skip leading whitespace and comment decorations
670	while (isVerticalWhitespace(c: *BufferPtr)) {
671	BufferPtr = skipNewline(BufferPtr, BufferEnd: CommentEnd);
672
673	if (CommentState == LCS_InsideCComment)
674	skipLineStartingDecorations();
675
676	BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd: CommentEnd);
677	if (BufferPtr == CommentEnd) {
678	// HTML starting tags must be defined in a single comment block.
679	// It's likely a user-error where they forgot to terminate the comment.
680	State = LS_Normal;
681	// Since at least one newline was skipped and one token needs to be lexed,
682	// return a newline.
683	formTokenWithChars(Result&: T, TokEnd: BufferPtr, Kind: tok::newline);
684	return;
685	}
686	}
687
688	const char *TokenPtr = BufferPtr;
689	char C = *TokenPtr;
690	if (isHTMLIdentifierCharacter(C)) {
691	TokenPtr = skipHTMLIdentifier(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
692	StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
693	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_ident);
694	T.setHTMLIdent(Ident);
695	} else {
696	switch (C) {
697	case `'='`:
698	TokenPtr++;
699	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_equals);
700	break;
701	case `'\"'`:
702	case `'\''`: {
703	const char *OpenQuote = TokenPtr;
704	TokenPtr = skipHTMLQuotedString(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
705	const char *ClosingQuote = TokenPtr;
706	if (TokenPtr != CommentEnd) // Skip closing quote.
707	TokenPtr++;
708	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_quoted_string);
709	T.setHTMLQuotedString(StringRef(OpenQuote + `1`,
710	ClosingQuote - (OpenQuote + `1`)));
711	break;
712	}
713	case `'>'`:
714	TokenPtr++;
715	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_greater);
716	State = LS_Normal;
717	return;
718	case `'/'`:
719	TokenPtr++;
720	if (TokenPtr != CommentEnd && *TokenPtr == `'>'`) {
721	TokenPtr++;
722	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_slash_greater);
723	} else
724	formTextToken(Result&: T, TokEnd: TokenPtr);
725
726	State = LS_Normal;
727	return;
728	}
729	}
730
731	// Now look ahead and return to normal state if we don't see any HTML tokens
732	// ahead.
733	BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd: CommentEnd);
734	if (BufferPtr == CommentEnd) {
735	return;
736	}
737
738	C = *BufferPtr;
739	if (!isHTMLIdentifierStartingCharacter(C) && !isVerticalWhitespace(c: C) &&
740	C != `'='` && C != `'\"'` && C != `'\''` && C != `'>'` && C != `'/'`) {
741	State = LS_Normal;
742	return;
743	}
744	}
745
746	void Lexer::setupAndLexHTMLEndTag(Token &T) {
747	assert(BufferPtr[`0`] == `'<'` && BufferPtr[`1`] == `'/'`);
748
749	const char *TagNameBegin = skipWhitespace(BufferPtr: BufferPtr + `2`, BufferEnd: CommentEnd);
750	const char *TagNameEnd = skipHTMLIdentifier(BufferPtr: TagNameBegin, BufferEnd: CommentEnd);
751	StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
752	if (!isHTMLTagName(Name)) {
753	formTextToken(Result&: T, TokEnd: TagNameEnd);
754	return;
755	}
756
757	const char *End = skipWhitespace(BufferPtr: TagNameEnd, BufferEnd: CommentEnd);
758
759	formTokenWithChars(Result&: T, TokEnd: End, Kind: tok::html_end_tag);
760	T.setHTMLTagEndName(Name);
761
762	if (BufferPtr != CommentEnd && *BufferPtr == `'>'`)
763	State = LS_HTMLEndTag;
764	}
765
766	void Lexer::lexHTMLEndTag(Token &T) {
767	assert(BufferPtr != CommentEnd && *BufferPtr == `'>'`);
768
769	formTokenWithChars(Result&: T, TokEnd: BufferPtr + `1`, Kind: tok::html_greater);
770	State = LS_Normal;
771	}
772
773	Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
774	const CommandTraits &Traits, SourceLocation FileLoc,
775	const char BufferStart, const* char BufferEnd, bool* ParseCommands)
776	: Allocator(Allocator), Diags(Diags), Traits(Traits),
777	BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
778	FileLoc (FileLoc), ParseCommands(ParseCommands),
779	CommentState(LCS_BeforeComment), State(LS_Normal) {}
780
781	void Lexer::lex(Token &T) {
782	again:
783	switch (CommentState) {
784	case LCS_BeforeComment:
785	if (BufferPtr == BufferEnd) {
786	formTokenWithChars(Result&: T, TokEnd: BufferPtr, Kind: tok::eof);
787	return;
788	}
789
790	assert(*BufferPtr == `'/'`);
791	BufferPtr++; // Skip first slash.
792	switch(*BufferPtr) {
793	case `'/'`: { // BCPL comment.
794	BufferPtr++; // Skip second slash.
795
796	if (BufferPtr != BufferEnd) {
797	// Skip Doxygen magic marker, if it is present.
798	// It might be missing because of a typo //< or /<, or because we*
799	// merged this non-Doxygen comment into a bunch of Doxygen comments
800	// around it: /* ... / / ... / /* ... /
801	const char C = *BufferPtr;
802	if (C == `'/'` \|\| C == `'!'`)
803	BufferPtr++;
804	}
805
806	// Skip less-than symbol that marks trailing comments.
807	// Skip it even if the comment is not a Doxygen one, because //< and /<*
808	// are frequent typos.
809	if (BufferPtr != BufferEnd && *BufferPtr == `'<'`)
810	BufferPtr++;
811
812	CommentState = LCS_InsideBCPLComment;
813	switch (State) {
814	case LS_VerbatimBlockFirstLine:
815	case LS_VerbatimBlockBody:
816	break;
817	case LS_HTMLStartTag:
818	BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd);
819	break;
820	default:
821	State = LS_Normal;
822	break;
823	}
824	CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
825	goto again;
826	}
827	case `''`: { // C comment.*
828	BufferPtr++; // Skip star.
829
830	// Skip Doxygen magic marker.
831	const char C = *BufferPtr;
832	if ((C == `''` && (BufferPtr + `1`) != `'/'`) \|\| C == `'!'`)
833	BufferPtr++;
834
835	// Skip less-than symbol that marks trailing comments.
836	if (BufferPtr != BufferEnd && *BufferPtr == `'<'`)
837	BufferPtr++;
838
839	CommentState = LCS_InsideCComment;
840	State = LS_Normal;
841	CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
842	goto again;
843	}
844	default:
845	llvm_unreachable("second character of comment should be '/' or '*'");
846	}
847
848	case LCS_BetweenComments: {
849	// Consecutive comments are extracted only if there is only whitespace
850	// between them. So we can search for the start of the next comment.
851	const char *EndWhitespace = BufferPtr;
852	while(EndWhitespace != BufferEnd && *EndWhitespace != `'/'`)
853	EndWhitespace++;
854
855	// When lexing the start of an HTML tag (i.e. going through the attributes)
856	// there won't be any newlines generated.
857	if (State == LS_HTMLStartTag && EndWhitespace != BufferEnd) {
858	CommentState = LCS_BeforeComment;
859	BufferPtr = EndWhitespace;
860	goto again;
861	}
862
863	// Turn any whitespace between comments (and there is only whitespace
864	// between them -- guaranteed by comment extraction) into a newline. We
865	// have two newlines between C comments in total (first one was synthesized
866	// after a comment).
867	formTokenWithChars(Result&: T, TokEnd: EndWhitespace, Kind: tok::newline);
868
869	CommentState = LCS_BeforeComment;
870	break;
871	}
872
873	case LCS_InsideBCPLComment:
874	case LCS_InsideCComment:
875	if (BufferPtr != CommentEnd) {
876	lexCommentText(T);
877	break;
878	} else {
879	// Skip C comment closing sequence.
880	if (CommentState == LCS_InsideCComment) {
881	assert(BufferPtr[`0`] == `'*'` && BufferPtr[`1`] == `'/'`);
882	BufferPtr += `2`;
883	assert(BufferPtr <= BufferEnd);
884
885	// When lexing the start of an HTML tag (i.e. going through the
886	// attributes) there won't be any newlines generated - whitespace still
887	// needs to be skipped.
888	if (State == LS_HTMLStartTag && BufferPtr != BufferEnd) {
889	CommentState = LCS_BetweenComments;
890	goto again;
891	}
892
893	// Synthenize newline just after the C comment, regardless if there is
894	// actually a newline.
895	formTokenWithChars(Result&: T, TokEnd: BufferPtr, Kind: tok::newline);
896
897	CommentState = LCS_BetweenComments;
898	break;
899	} else {
900	// Don't synthesized a newline after BCPL comment.
901	CommentState = LCS_BetweenComments;
902	goto again;
903	}
904	}
905	}
906	}
907
908	StringRef Lexer::getSpelling(const Token &Tok,
909	const SourceManager &SourceMgr) const {
910	SourceLocation Loc = Tok.getLocation();
911	FileIDAndOffset LocInfo = SourceMgr.getDecomposedLoc(Loc);
912
913	bool InvalidTemp = false;
914	StringRef File = SourceMgr.getBufferData(FID: LocInfo.first, Invalid: &InvalidTemp);
915	if (InvalidTemp)
916	return StringRef();
917
918	const char *Begin = File.data() + LocInfo.second;
919	return StringRef(Begin, Tok.getLength());
920	}
921
922	} // end namespace comments
923	} // end namespace clang
924

Browse the source code of llvm_projects/clang/lib/AST/CommentLexer.cpp