 |
|
 |
|
| Files: |
1 |
|
Branches Taken: |
79.4% |
683 / 860 |
| Generated: |
2010-02-10 01:31 |
|
Branches Executed: |
95.3% |
820 / 860 |
| |
|
Line Coverage: |
90.4% |
786 / 869 |
| |
 |
|
 |
1 : //===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : // This file implements the Lexer and Token interfaces.
11 : //
12 : //===----------------------------------------------------------------------===//
13 : //
14 : // TODO: GCC Diagnostics emitted by the lexer:
15 : // PEDWARN: (form feed|vertical tab) in preprocessing directive
16 : //
17 : // Universal characters, unicode, char mapping:
18 : // WARNING: `%.*s' is not in NFKC
19 : // WARNING: `%.*s' is not in NFC
20 : //
21 : // Other:
22 : // TODO: Options to support:
23 : // -fexec-charset,-fwide-exec-charset
24 : //
25 : //===----------------------------------------------------------------------===//
26 :
27 : #include "clang/Lex/Lexer.h"
28 : #include "clang/Lex/Preprocessor.h"
29 : #include "clang/Lex/LexDiagnostic.h"
30 : #include "clang/Basic/SourceManager.h"
31 : #include "llvm/Support/Compiler.h"
32 : #include "llvm/Support/MemoryBuffer.h"
33 : #include <cctype>
34 : using namespace clang;
35 :
36 : static void InitCharacterInfo();
37 :
38 : //===----------------------------------------------------------------------===//
39 : // Token Class Implementation
40 : //===----------------------------------------------------------------------===//
41 :
42 : /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
43 6961: bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
6932: branch 1 taken
29: branch 2 taken
44 6961: if (IdentifierInfo *II = getIdentifierInfo())
45 6932: return II->getObjCKeywordID() == objcKey;
46 29: return false;
47 : }
48 :
49 : /// getObjCKeywordID - Return the ObjC keyword kind.
50 7480: tok::ObjCKeywordKind Token::getObjCKeywordID() const {
51 7480: IdentifierInfo *specId = getIdentifierInfo();
7477: branch 0 taken
3: branch 1 taken
52 7480: return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
53 : }
54 :
55 :
56 : //===----------------------------------------------------------------------===//
57 : // Lexer Class Implementation
58 : //===----------------------------------------------------------------------===//
59 :
60 : void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
61 144587: const char *BufEnd) {
62 144587: InitCharacterInfo();
63 :
64 144587: BufferStart = BufStart;
65 144587: BufferPtr = BufPtr;
66 144587: BufferEnd = BufEnd;
67 :
68 : assert(BufEnd[0] == 0 &&
69 : "We assume that the input buffer has a null character at the end"
0: branch 0 not taken
144587: branch 1 taken
70 144587: " to simplify lexing!");
71 :
72 144587: Is_PragmaLexer = false;
73 144587: IsInConflictMarker = false;
74 :
75 : // Start of the file is a start of line.
76 144587: IsAtStartOfLine = true;
77 :
78 : // We are not after parsing a #.
79 144587: ParsingPreprocessorDirective = false;
80 :
81 : // We are not after parsing #include.
82 144587: ParsingFilename = false;
83 :
84 : // We are not in raw mode. Raw mode disables diagnostics and interpretation
85 : // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
86 : // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
87 : // or otherwise skipping over tokens.
88 144587: LexingRawMode = false;
89 :
90 : // Default to not keeping comments.
91 144587: ExtendedTokenMode = 0;
92 144587: }
93 :
94 : /// Lexer constructor - Create a new lexer object for the specified buffer
95 : /// with the specified preprocessor managing the lexing process. This lexer
96 : /// assumes that the associated file buffer and Preprocessor objects will
97 : /// outlive it, so it doesn't take ownership of either of them.
98 5618: Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
99 : : PreprocessorLexer(&PP, FID),
100 : FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
101 5618: Features(PP.getLangOptions()) {
102 :
103 : InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
104 5618: InputFile->getBufferEnd());
105 :
106 : // Default to keeping comments if the preprocessor wants them.
107 5618: SetCommentRetentionState(PP.getCommentRetentionState());
108 5618: }
109 :
110 : /// Lexer constructor - Create a new raw lexer object. This object is only
111 : /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
112 : /// range will outlive it, so it doesn't take ownership of it.
113 : Lexer::Lexer(SourceLocation fileloc, const LangOptions &features,
114 137646: const char *BufStart, const char *BufPtr, const char *BufEnd)
115 137646: : FileLoc(fileloc), Features(features) {
116 :
117 137646: InitLexer(BufStart, BufPtr, BufEnd);
118 :
119 : // We *are* in raw mode.
120 137646: LexingRawMode = true;
121 137646: }
122 :
123 : /// Lexer constructor - Create a new raw lexer object. This object is only
124 : /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
125 : /// range will outlive it, so it doesn't take ownership of it.
126 : Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
127 1323: const SourceManager &SM, const LangOptions &features)
128 1323: : FileLoc(SM.getLocForStartOfFile(FID)), Features(features) {
129 :
130 : InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(),
131 1323: FromFile->getBufferEnd());
132 :
133 : // We *are* in raw mode.
134 1323: LexingRawMode = true;
135 1323: }
136 :
137 : /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
138 : /// _Pragma expansion. This has a variety of magic semantics that this method
139 : /// sets up. It returns a new'd Lexer that must be delete'd when done.
140 : ///
141 : /// On entrance to this routine, SpellingLoc indicates the bytes to be lexed
142 : /// for the token, and InstantiationLocStart/InstantiationLocEnd give the
143 : /// instantiation range that all lexed tokens should be
144 : /// "expanded from".
145 : ///
146 : /// FIXME: It would really be nice to make _Pragma just be a wrapper around a
147 : /// normal lexer that remaps tokens as they fly by. This would require making
148 : /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
149 : /// interface that could handle this stuff. This would pull GetMappedTokenLoc
150 : /// out of the critical path of the lexer!
151 : ///
152 : Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
153 : SourceLocation InstantiationLocStart,
154 : SourceLocation InstantiationLocEnd,
155 16: unsigned TokLen, Preprocessor &PP) {
156 16: SourceManager &SM = PP.getSourceManager();
157 :
158 : // Create the lexer as if we were going to lex the file normally.
159 16: FileID SpellingFID = SM.getFileID(SpellingLoc);
160 16: const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
161 16: Lexer *L = new Lexer(SpellingFID, InputFile, PP);
162 :
163 : // Now that the lexer is created, change the start/end locations so that we
164 : // just lex the subsection of the file that we want. This is lexing from a
165 : // scratch buffer.
166 16: const char *StrData = SM.getCharacterData(SpellingLoc);
167 :
168 16: L->BufferPtr = StrData;
169 16: L->BufferEnd = StrData+TokLen;
0: branch 0 not taken
16: branch 1 taken
170 16: assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
171 :
172 : // Set the SourceLocation with the remapping information. This ensures that
173 : // GetMappedTokenLoc will remap the tokens as they are lexed.
174 : L->FileLoc = SM.createInstantiationLoc(SM.getLocForStartOfFile(SpellingFID),
175 : InstantiationLocStart,
176 16: InstantiationLocEnd, TokLen);
177 :
178 : // Ensure that the lexer thinks it is inside a directive, so that end \n will
179 : // return an EOM token.
180 16: L->ParsingPreprocessorDirective = true;
181 :
182 : // This lexer really is for _Pragma.
183 16: L->Is_PragmaLexer = true;
184 16: return L;
185 : }
186 :
187 :
188 : /// Stringify - Convert the specified string into a C string by escaping '\'
189 : /// and quote characters (' if Charify, else "). No surrounding quotes are added.
190 111: std::string Lexer::Stringify(const std::string &Str, bool Charify) {
191 111: std::string Result = Str;
0: branch 0 not taken
111: branch 1 taken
192 111: char Quote = Charify ? '\'' : '"';
6056: branch 1 taken
111: branch 2 taken
193 6167: for (unsigned i = 0, e = Result.size(); i != e; ++i) {
6054: branch 1 taken
2: branch 2 taken
20: branch 4 taken
6034: branch 5 taken
22: branch 6 taken
6034: branch 7 taken
194 6056: if (Result[i] == '\\' || Result[i] == Quote) {
195 22: Result.insert(Result.begin()+i, '\\');
196 22: ++i; ++e;
197 : }
198 : }
199 : return Result;
200 : }
201 :
202 : /// Stringify - Convert the specified string into a C string by escaping '\'
203 : /// and " characters. This does not add surrounding ""'s to the string.
204 1372: void Lexer::Stringify(llvm::SmallVectorImpl<char> &Str) {
44314: branch 1 taken
1372: branch 2 taken
205 45686: for (unsigned i = 0, e = Str.size(); i != e; ++i) {
44314: branch 1 taken
0: branch 2 not taken
0: branch 4 not taken
44314: branch 5 taken
0: branch 6 not taken
44314: branch 7 taken
206 44314: if (Str[i] == '\\' || Str[i] == '"') {
207 0: Str.insert(Str.begin()+i, '\\');
208 0: ++i; ++e;
209 : }
210 : }
211 1372: }
212 :
213 : static bool isWhitespace(unsigned char c);
214 :
215 : /// MeasureTokenLength - Relex the token at the specified location and return
216 : /// its length in bytes in the input file. If the token needs cleaning (e.g.
217 : /// includes a trigraph or an escaped newline) then this count includes bytes
218 : /// that are part of that.
219 : unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
220 : const SourceManager &SM,
221 137653: const LangOptions &LangOpts) {
222 : // TODO: this could be special cased for common tokens like identifiers, ')',
223 : // etc to make this faster, if it mattered. Just look at StrData[0] to handle
224 : // all obviously single-char tokens. This could use
225 : // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
226 : // something.
227 :
228 : // If this comes from a macro expansion, we really do want the macro name, not
229 : // the token this macro expanded to.
230 137653: Loc = SM.getInstantiationLoc(Loc);
231 137653: std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
232 137653: std::pair<const char *,const char *> Buffer = SM.getBufferData(LocInfo.first);
233 137653: const char *StrData = Buffer.first+LocInfo.second;
234 :
1256: branch 1 taken
136397: branch 2 taken
235 137653: if (isWhitespace(StrData[0]))
236 1256: return 0;
237 :
238 : // Create a lexer starting at the beginning of this token.
239 136397: Lexer TheLexer(Loc, LangOpts, Buffer.first, StrData, Buffer.second);
240 136397: TheLexer.SetCommentRetentionState(true);
241 136397: Token TheTok;
242 136397: TheLexer.LexFromRawLexer(TheTok);
243 136397: return TheTok.getLength();
244 : }
245 :
246 : //===----------------------------------------------------------------------===//
247 : // Character information.
248 : //===----------------------------------------------------------------------===//
249 :
250 : enum {
251 : CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0'
252 : CHAR_VERT_WS = 0x02, // '\r', '\n'
253 : CHAR_LETTER = 0x04, // a-z,A-Z
254 : CHAR_NUMBER = 0x08, // 0-9
255 : CHAR_UNDER = 0x10, // _
256 : CHAR_PERIOD = 0x20 // .
257 : };
258 :
259 : // Statically initialize CharInfo table based on ASCII character set
260 : // Reference: FreeBSD 7.2 /usr/share/misc/ascii
261 : static const unsigned char CharInfo[256] =
262 : {
263 : // 0 NUL 1 SOH 2 STX 3 ETX
264 : // 4 EOT 5 ENQ 6 ACK 7 BEL
265 : 0 , 0 , 0 , 0 ,
266 : 0 , 0 , 0 , 0 ,
267 : // 8 BS 9 HT 10 NL 11 VT
268 : //12 NP 13 CR 14 SO 15 SI
269 : 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS,
270 : CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 ,
271 : //16 DLE 17 DC1 18 DC2 19 DC3
272 : //20 DC4 21 NAK 22 SYN 23 ETB
273 : 0 , 0 , 0 , 0 ,
274 : 0 , 0 , 0 , 0 ,
275 : //24 CAN 25 EM 26 SUB 27 ESC
276 : //28 FS 29 GS 30 RS 31 US
277 : 0 , 0 , 0 , 0 ,
278 : 0 , 0 , 0 , 0 ,
279 : //32 SP 33 ! 34 " 35 #
280 : //36 $ 37 % 38 & 39 '
281 : CHAR_HORZ_WS, 0 , 0 , 0 ,
282 : 0 , 0 , 0 , 0 ,
283 : //40 ( 41 ) 42 * 43 +
284 : //44 , 45 - 46 . 47 /
285 : 0 , 0 , 0 , 0 ,
286 : 0 , 0 , CHAR_PERIOD , 0 ,
287 : //48 0 49 1 50 2 51 3
288 : //52 4 53 5 54 6 55 7
289 : CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
290 : CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
291 : //56 8 57 9 58 : 59 ;
292 : //60 < 61 = 62 > 63 ?
293 : CHAR_NUMBER , CHAR_NUMBER , 0 , 0 ,
294 : 0 , 0 , 0 , 0 ,
295 : //64 @ 65 A 66 B 67 C
296 : //68 D 69 E 70 F 71 G
297 : 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
298 : CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
299 : //72 H 73 I 74 J 75 K
300 : //76 L 77 M 78 N 79 O
301 : CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
302 : CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
303 : //80 P 81 Q 82 R 83 S
304 : //84 T 85 U 86 V 87 W
305 : CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
306 : CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
307 : //88 X 89 Y 90 Z 91 [
308 : //92 \ 93 ] 94 ^ 95 _
309 : CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0 ,
310 : 0 , 0 , 0 , CHAR_UNDER ,
311 : //96 ` 97 a 98 b 99 c
312 : //100 d 101 e 102 f 103 g
313 : 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
314 : CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
315 : //104 h 105 i 106 j 107 k
316 : //108 l 109 m 110 n 111 o
317 : CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
318 : CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
319 : //112 p 113 q 114 r 115 s
320 : //116 t 117 u 118 v 119 w
321 : CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
322 : CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
323 : //120 x 121 y 122 z 123 {
324 : //124 | 125 } 126 ~ 127 DEL
325 : CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0 ,
326 : 0 , 0 , 0 , 0
327 : };
328 :
329 144587: static void InitCharacterInfo() {
330 : static bool isInited = false;
2538: branch 0 taken
142049: branch 1 taken
331 144587: if (isInited) return;
332 : // check the statically-initialized CharInfo table
0: branch 0 not taken
2538: branch 1 taken
333 2538: assert(CHAR_HORZ_WS == CharInfo[(int)' ']);
0: branch 0 not taken
2538: branch 1 taken
334 2538: assert(CHAR_HORZ_WS == CharInfo[(int)'\t']);
0: branch 0 not taken
2538: branch 1 taken
335 2538: assert(CHAR_HORZ_WS == CharInfo[(int)'\f']);
0: branch 0 not taken
2538: branch 1 taken
336 2538: assert(CHAR_HORZ_WS == CharInfo[(int)'\v']);
0: branch 0 not taken
2538: branch 1 taken
337 2538: assert(CHAR_VERT_WS == CharInfo[(int)'\n']);
0: branch 0 not taken
2538: branch 1 taken
338 2538: assert(CHAR_VERT_WS == CharInfo[(int)'\r']);
0: branch 0 not taken
2538: branch 1 taken
339 2538: assert(CHAR_UNDER == CharInfo[(int)'_']);
0: branch 0 not taken
2538: branch 1 taken
340 2538: assert(CHAR_PERIOD == CharInfo[(int)'.']);
65988: branch 0 taken
2538: branch 1 taken
341 68526: for (unsigned i = 'a'; i <= 'z'; ++i) {
0: branch 0 not taken
65988: branch 1 taken
342 65988: assert(CHAR_LETTER == CharInfo[i]);
0: branch 0 not taken
65988: branch 1 taken
343 65988: assert(CHAR_LETTER == CharInfo[i+'A'-'a']);
344 : }
25380: branch 0 taken
2538: branch 1 taken
345 27918: for (unsigned i = '0'; i <= '9'; ++i)
0: branch 0 not taken
25380: branch 1 taken
346 25380: assert(CHAR_NUMBER == CharInfo[i]);
347 :
348 2538: isInited = true;
349 : }
350 :
351 :
352 : /// isIdentifierBody - Return true if this is the body character of an
353 : /// identifier, which is [a-zA-Z0-9_].
354 9347222: static inline bool isIdentifierBody(unsigned char c) {
355 9347222: return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false;
356 : }
357 :
358 : /// isHorizontalWhitespace - Return true if this character is horizontal
359 : /// whitespace: ' ', '\t', '\f', '\v'. Note that this returns false for '\0'.
360 454532: static inline bool isHorizontalWhitespace(unsigned char c) {
361 454532: return (CharInfo[c] & CHAR_HORZ_WS) ? true : false;
362 : }
363 :
364 : /// isWhitespace - Return true if this character is horizontal or vertical
365 : /// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'. Note that this returns false
366 : /// for '\0'.
367 145162: static inline bool isWhitespace(unsigned char c) {
368 145162: return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false;
369 : }
370 :
371 : /// isNumberBody - Return true if this is the body character of a
372 : /// preprocessing number, which is [a-zA-Z0-9_.].
373 1265555: static inline bool isNumberBody(unsigned char c) {
374 : return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
375 1265555: true : false;
376 : }
377 :
378 :
379 : //===----------------------------------------------------------------------===//
380 : // Diagnostics forwarding code.
381 : //===----------------------------------------------------------------------===//
382 :
383 : /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
384 : /// lexer buffer was all instantiated at a single point, perform the mapping.
385 : /// This is currently only used for _Pragma implementation, so it is the slow
386 : /// path of the hot getSourceLocation method. Do not allow it to be inlined.
387 : static DISABLE_INLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP,
388 : SourceLocation FileLoc,
389 : unsigned CharNo,
390 : unsigned TokLen);
391 : static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
392 : SourceLocation FileLoc,
393 61: unsigned CharNo, unsigned TokLen) {
61: branch 1 taken
0: branch 2 not taken
394 61: assert(FileLoc.isMacroID() && "Must be an instantiation");
395 :
396 : // Otherwise, we're lexing "mapped tokens". This is used for things like
397 : // _Pragma handling. Combine the instantiation location of FileLoc with the
398 : // spelling location.
399 61: SourceManager &SM = PP.getSourceManager();
400 :
401 : // Create a new SLoc which is expanded from Instantiation(FileLoc) but whose
402 : // characters come from spelling(FileLoc)+Offset.
403 61: SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
404 61: SpellingLoc = SpellingLoc.getFileLocWithOffset(CharNo);
405 :
406 : // Figure out the expansion loc range, which is the range covered by the
407 : // original _Pragma(...) sequence.
408 : std::pair<SourceLocation,SourceLocation> II =
409 61: SM.getImmediateInstantiationRange(FileLoc);
410 :
411 61: return SM.createInstantiationLoc(SpellingLoc, II.first, II.second, TokLen);
412 : }
413 :
414 : /// getSourceLocation - Return a source location identifier for the specified
415 : /// offset in the current file.
416 : SourceLocation Lexer::getSourceLocation(const char *Loc,
417 2792135: unsigned TokLen) const {
418 : assert(Loc >= BufferStart && Loc <= BufferEnd &&
2792135: branch 0 taken
0: branch 1 not taken
0: branch 2 not taken
2792135: branch 3 taken
419 2792135: "Location out of range for this buffer!");
420 :
421 : // In the normal case, we're just lexing from a simple file buffer, return
422 : // the file id from FileLoc with the offset specified.
423 2792135: unsigned CharNo = Loc-BufferStart;
2792074: branch 1 taken
61: branch 2 taken
424 2792135: if (FileLoc.isFileID())
425 2792074: return FileLoc.getFileLocWithOffset(CharNo);
426 :
427 : // Otherwise, this is the _Pragma lexer case, which pretends that all of the
428 : // tokens are lexed from where the _Pragma was defined.
0: branch 0 not taken
61: branch 1 taken
429 61: assert(PP && "This doesn't work on raw lexers");
430 61: return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
431 : }
432 :
433 : /// Diag - Forwarding function for diagnostics. This translates a source
434 : /// position in the current buffer into a SourceLocation object for rendering.
435 215: DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
436 215: return PP->Diag(getSourceLocation(Loc), DiagID);
437 : }
438 :
439 : //===----------------------------------------------------------------------===//
440 : // Trigraph and Escaped Newline Handling Code.
441 : //===----------------------------------------------------------------------===//
442 :
443 : /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
444 : /// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
445 313: static char GetTrigraphCharForLetter(char Letter) {
218: branch 0 taken
4: branch 1 taken
24: branch 2 taken
22: branch 3 taken
11: branch 4 taken
9: branch 5 taken
2: branch 6 taken
19: branch 7 taken
2: branch 8 taken
2: branch 9 taken
446 313: switch (Letter) {
447 218: default: return 0;
448 4: case '=': return '#';
449 24: case ')': return ']';
450 22: case '(': return '[';
451 11: case '!': return '|';
452 9: case '\'': return '^';
453 2: case '>': return '}';
454 19: case '/': return '\\';
455 2: case '<': return '{';
456 2: case '-': return '~';
457 : }
458 : }
459 :
460 : /// DecodeTrigraphChar - If the specified character is a legal trigraph when
461 : /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
462 : /// return the result character. Finally, emit a warning about trigraph use
463 : /// whether trigraphs are enabled or not.
464 301: static char DecodeTrigraphChar(const char *CP, Lexer *L) {
465 301: char Res = GetTrigraphCharForLetter(*CP);
85: branch 0 taken
216: branch 1 taken
15: branch 2 taken
70: branch 3 taken
466 301: if (!Res || !L) return Res;
467 :
8: branch 1 taken
62: branch 2 taken
468 70: if (!L->getFeatures().Trigraphs) {
2: branch 1 taken
6: branch 2 taken
469 8: if (!L->isLexingRawMode())
470 2: L->Diag(CP-2, diag::trigraph_ignored);
471 8: return 0;
472 : }
473 :
24: branch 1 taken
38: branch 2 taken
474 62: if (!L->isLexingRawMode())
475 24: L->Diag(CP-2, diag::trigraph_converted) << std::string()+Res;
476 62: return Res;
477 : }
478 :
479 : /// getEscapedNewLineSize - Return the size of the specified escaped newline,
480 : /// or 0 if it is not an escaped newline. Ptr[-1] is known to be a "\" or a
481 : /// trigraph equivalent on entry to this function.
482 2957: unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
483 2957: unsigned Size = 0;
3049: branch 1 taken
8: branch 2 taken
484 6014: while (isWhitespace(Ptr[Size])) {
485 3049: ++Size;
486 :
100: branch 0 taken
2949: branch 1 taken
0: branch 2 not taken
100: branch 3 taken
487 3049: if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
488 100: continue;
489 :
490 : // If this is a \r\n or \n\r, skip the other half.
2949: branch 0 taken
0: branch 1 not taken
1: branch 2 taken
2948: branch 3 taken
0: branch 4 not taken
1: branch 5 taken
491 2949: if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
492 : Ptr[Size-1] != Ptr[Size])
493 0: ++Size;
494 :
495 2949: return Size;
496 : }
497 :
498 : // Not an escaped newline, must be a \t or something else.
499 8: return 0;
500 : }
501 :
502 : /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
503 : /// them), skip over them and return the first non-escaped-newline found,
504 : /// otherwise return P.
505 15: const char *Lexer::SkipEscapedNewLines(const char *P) {
506 3: while (1) {
507 : const char *AfterEscape;
11: branch 0 taken
4: branch 1 taken
508 15: if (*P == '\\') {
509 11: AfterEscape = P+1;
1: branch 0 taken
3: branch 1 taken
510 4: } else if (*P == '?') {
511 : // If not a trigraph for escape, bail out.
0: branch 0 not taken
1: branch 1 taken
1: branch 2 taken
1: branch 3 taken
512 1: if (P[1] != '?' || P[2] != '/')
513 1: return P;
514 0: AfterEscape = P+3;
515 : } else {
516 3: return P;
517 : }
518 :
519 11: unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
8: branch 0 taken
3: branch 1 taken
520 11: if (NewLineSize == 0) return P;
521 3: P = AfterEscape+NewLineSize;
522 : }
523 : }
524 :
525 :
526 : /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
527 : /// get its size, and return it. This is tricky in several cases:
528 : /// 1. If currently at the start of a trigraph, we warn about the trigraph,
529 : /// then either return the trigraph (skipping 3 chars) or the '?',
530 : /// depending on whether trigraphs are enabled or not.
531 : /// 2. If this is an escaped newline (potentially with whitespace between
532 : /// the backslash and newline), implicitly skip the newline and return
533 : /// the char after it.
534 : /// 3. If this is a UCN, return it. FIXME: C++ UCN's?
535 : ///
536 : /// This handles the slow/uncommon case of the getCharAndSize method. Here we
537 : /// know that we can accumulate into Size, and that we have already incremented
538 : /// Ptr by Size bytes.
539 : ///
540 : /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
541 : /// be updated to match.
542 : ///
543 : char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
544 9346: Token *Tok) {
545 : // If we have a slash, look for an escaped newline.
4114: branch 0 taken
5232: branch 1 taken
546 9346: if (Ptr[0] == '\\') {
547 4114: ++Size;
548 4114: ++Ptr;
549 4129: Slash:
550 : // Common case, backslash-char where the char is not whitespace.
1482: branch 1 taken
2647: branch 2 taken
551 4129: if (!isWhitespace(Ptr[0])) return '\\';
552 :
553 : // See if we have optional whitespace characters between the slash and
554 : // newline.
2647: branch 1 taken
0: branch 2 not taken
555 2647: if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
556 : // Remember that this token needs to be cleaned.
2636: branch 0 taken
11: branch 1 taken
557 2647: if (Tok) Tok->setFlag(Token::NeedsCleaning);
558 :
559 : // Warn if there was whitespace between the backslash and newline.
20: branch 0 taken
2627: branch 1 taken
20: branch 2 taken
0: branch 3 not taken
17: branch 4 taken
3: branch 5 taken
4: branch 7 taken
13: branch 8 taken
4: branch 9 taken
2643: branch 10 taken
560 2647: if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
561 4: Diag(Ptr, diag::backslash_newline_space);
562 :
563 : // Found backslash<whitespace><newline>. Parse the char after it.
564 2647: Size += EscapedNewLineSize;
565 2647: Ptr += EscapedNewLineSize;
566 : // Use slow version to accumulate a correct size field.
567 2647: return getCharAndSizeSlow(Ptr, Size, Tok);
568 : }
569 :
570 : // Otherwise, this is not an escaped newline, just return the slash.
571 0: return '\\';
572 : }
573 :
574 : // If this is a trigraph, process it.
2585: branch 0 taken
2647: branch 1 taken
301: branch 2 taken
2284: branch 3 taken
575 5232: if (Ptr[0] == '?' && Ptr[1] == '?') {
576 : // If this is actually a legal trigraph (not something like "??x"), emit
577 : // a trigraph warning. If so, and if trigraphs are enabled, return it.
286: branch 0 taken
15: branch 1 taken
77: branch 3 taken
224: branch 4 taken
578 301: if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) {
579 : // Remember that this token needs to be cleaned.
62: branch 0 taken
15: branch 1 taken
580 77: if (Tok) Tok->setFlag(Token::NeedsCleaning);
581 :
582 77: Ptr += 3;
583 77: Size += 3;
15: branch 0 taken
62: branch 1 taken
584 77: if (C == '\\') goto Slash;
585 62: return C;
586 : }
587 : }
588 :
589 : // If this is neither, return a single character.
590 5155: ++Size;
591 5155: return *Ptr;
592 : }
593 :
594 :
595 : /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
596 : /// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
597 : /// and that we have already incremented Ptr by Size bytes.
598 : ///
599 : /// NOTE: When this method is updated, getCharAndSizeSlow (above) should
600 : /// be updated to match.
601 : char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
602 632: const LangOptions &Features) {
603 : // If we have a slash, look for an escaped newline.
319: branch 0 taken
313: branch 1 taken
604 632: if (Ptr[0] == '\\') {
605 319: ++Size;
606 319: ++Ptr;
607 323: Slash:
608 : // Common case, backslash-char where the char is not whitespace.
24: branch 1 taken
299: branch 2 taken
609 323: if (!isWhitespace(Ptr[0])) return '\\';
610 :
611 : // See if we have optional whitespace characters followed by a newline.
299: branch 1 taken
0: branch 2 not taken
612 299: if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
613 : // Found backslash<whitespace><newline>. Parse the char after it.
614 299: Size += EscapedNewLineSize;
615 299: Ptr += EscapedNewLineSize;
616 :
617 : // Use slow version to accumulate a correct size field.
618 299: return getCharAndSizeSlowNoWarn(Ptr, Size, Features);
619 : }
620 :
621 : // Otherwise, this is not an escaped newline, just return the slash.
622 0: return '\\';
623 : }
624 :
625 : // If this is a trigraph, process it.
40: branch 0 taken
273: branch 1 taken
12: branch 2 taken
28: branch 3 taken
12: branch 4 taken
0: branch 5 not taken
626 313: if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
627 : // If this is actually a legal trigraph (not something like "??x"), return
628 : // it.
10: branch 1 taken
2: branch 2 taken
629 12: if (char C = GetTrigraphCharForLetter(Ptr[2])) {
630 10: Ptr += 3;
631 10: Size += 3;
4: branch 0 taken
6: branch 1 taken
632 10: if (C == '\\') goto Slash;
633 6: return C;
634 : }
635 : }
636 :
637 : // If this is neither, return a single character.
638 303: ++Size;
639 303: return *Ptr;
640 : }
641 :
642 : //===----------------------------------------------------------------------===//
643 : // Helper methods for lexing.
644 : //===----------------------------------------------------------------------===//
645 :
646 1170750: void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
647 : // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
648 : unsigned Size;
649 1170750: unsigned char C = *CurPtr++;
8176257: branch 1 taken
1170750: branch 2 taken
650 10517757: while (isIdentifierBody(C))
651 8176257: C = *CurPtr++;
652 :
653 1170750: --CurPtr; // Back up over the skipped character.
654 :
655 : // Fast path, no $,\,? in identifier found. '\' might be an escaped newline
656 : // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
657 : // FIXME: UCNs.
658 : //
659 : // TODO: Could merge these checks into a CharInfo flag to make the comparison
660 : // cheaper
1170738: branch 0 taken
12: branch 1 taken
1170577: branch 2 taken
161: branch 3 taken
4: branch 4 taken
1170573: branch 5 taken
2: branch 6 taken
2: branch 7 taken
661 1170750: if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) {
662 1170750: FinishIdentifier:
663 1170750: const char *IdStart = BufferPtr;
664 1170750: FormTokenWithChars(Result, CurPtr, tok::identifier);
665 :
666 : // If we are in raw mode, return this identifier raw. There is no need to
667 : // look up identifier information or attempt to macro expand it.
863259: branch 0 taken
307491: branch 1 taken
668 1170750: if (LexingRawMode) return;
669 :
670 : // Fill in Result.IdentifierInfo, looking up the identifier in the
671 : // identifier table.
672 863259: IdentifierInfo *II = PP->LookUpIdentifierInfo(Result, IdStart);
673 :
674 : // Change the kind of this identifier to the appropriate token kind, e.g.
675 : // turning "for" into a keyword.
676 863259: Result.setKind(II->getTokenID());
677 :
678 : // Finally, now that we know we have an identifier, pass this off to the
679 : // preprocessor, which may macro expand it or something.
13007: branch 1 taken
850252: branch 2 taken
680 863259: if (II->isHandleIdentifierCase())
681 13007: PP->HandleIdentifier(Result);
682 863259: return;
683 : }
684 :
685 : // Otherwise, $,\,? in identifier found. Enter slower path.
686 :
687 175: C = getCharAndSize(CurPtr, Size);
688 8: while (1) {
2: branch 0 taken
181: branch 1 taken
689 183: if (C == '$') {
690 : // If we hit a $ and they are not supported in identifiers, we are done.
0: branch 0 not taken
2: branch 1 taken
691 2: if (!Features.DollarIdents) goto FinishIdentifier;
692 :
693 : // Otherwise, emit a diagnostic and continue.
1: branch 1 taken
1: branch 2 taken
694 2: if (!isLexingRawMode())
695 1: Diag(CurPtr, diag::ext_dollar_in_identifier);
696 2: CurPtr = ConsumeChar(CurPtr, Size, Result);
697 2: C = getCharAndSize(CurPtr, Size);
698 2: continue;
175: branch 1 taken
6: branch 2 taken
699 181: } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
700 : // Found end of identifier.
701 175: goto FinishIdentifier;
702 : }
703 :
704 : // Otherwise, this character is good, consume it.
705 6: CurPtr = ConsumeChar(CurPtr, Size, Result);
706 :
707 6: C = getCharAndSize(CurPtr, Size);
28: branch 1 taken
6: branch 2 taken
708 40: while (isIdentifierBody(C)) { // FIXME: UCNs.
709 28: CurPtr = ConsumeChar(CurPtr, Size, Result);
710 28: C = getCharAndSize(CurPtr, Size);
711 : }
712 : }
713 : }
714 :
715 :
716 : /// LexNumericConstant - Lex the remainder of an integer or floating point
717 : /// constant. CurPtr[-1] is the first character lexed. The token is formed
718 : /// in Result; nothing is returned.
719 308194: void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
720 : unsigned Size;
721 308194: char C = getCharAndSize(CurPtr, Size);
722 308194: char PrevCh = 0;
957361: branch 1 taken
308194: branch 2 taken
723 1573749: while (isNumberBody(C)) { // FIXME: UCNs?
724 957361: CurPtr = ConsumeChar(CurPtr, Size, Result);
725 957361: PrevCh = C;
726 957361: C = getCharAndSize(CurPtr, Size);
727 : }
728 :
729 : // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
285618: branch 0 taken
22576: branch 1 taken
7761: branch 2 taken
277857: branch 3 taken
30337: branch 4 taken
0: branch 5 not taken
29818: branch 6 taken
519: branch 7 taken
730 308194: if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e'))
731 29818: return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
732 :
733 : // If we have a hex FP constant, continue.
278169: branch 0 taken
207: branch 1 taken
312: branch 2 taken
277857: branch 3 taken
519: branch 4 taken
0: branch 5 not taken
8: branch 6 taken
511: branch 7 taken
4: branch 8 taken
4: branch 9 taken
4: branch 11 taken
0: branch 12 not taken
8: branch 13 taken
278368: branch 14 taken
734 278376: if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p') &&
735 : (!PP || !PP->getLangOptions().CPlusPlus0x))
736 8: return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
737 :
738 : // Update the location of token as well as BufferPtr.
739 278368: const char *TokStart = BufferPtr;
740 278368: FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
741 278368: Result.setLiteralData(TokStart);
742 : }
743 :
744 : /// LexStringLiteral - Lex the remainder of a string literal, after having lexed
745 : /// either " or L".
746 13497: void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
747 13497: const char *NulCharacter = 0; // Does this string contain the \0 character?
748 :
749 13497: char C = getAndAdvanceChar(CurPtr, Result);
199279: branch 0 taken
13494: branch 1 taken
750 226270: while (C != '"') {
751 : // Skip escaped characters.
737: branch 0 taken
198542: branch 1 taken
752 199279: if (C == '\\') {
753 : // Skip the escaped character.
754 737: C = getAndAdvanceChar(CurPtr, Result);
198539: branch 0 taken
3: branch 1 taken
198539: branch 2 taken
0: branch 3 not taken
0: branch 4 not taken
198539: branch 5 taken
198539: branch 6 taken
198539: branch 7 taken
755 198542: } else if (C == '\n' || C == '\r' || // Newline.
756 : (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2: branch 1 taken
1: branch 2 taken
0: branch 3 not taken
2: branch 4 taken
0: branch 5 not taken
3: branch 6 taken
757 3: if (!isLexingRawMode() && !Features.AsmPreprocessor)
758 0: Diag(BufferPtr, diag::err_unterminated_string);
759 3: FormTokenWithChars(Result, CurPtr-1, tok::unknown);
760 3: return;
0: branch 0 not taken
198539: branch 1 taken
761 198539: } else if (C == 0) {
762 0: NulCharacter = CurPtr-1;
763 : }
764 199276: C = getAndAdvanceChar(CurPtr, Result);
765 : }
766 :
767 : // If a nul character existed in the string, warn about it.
0: branch 0 not taken
13494: branch 1 taken
0: branch 3 not taken
0: branch 4 not taken
0: branch 5 not taken
13494: branch 6 taken
768 13494: if (NulCharacter && !isLexingRawMode())
769 0: Diag(NulCharacter, diag::null_in_string);
770 :
771 : // Update the location of the token as well as the BufferPtr instance var.
772 13494: const char *TokStart = BufferPtr;
773 : FormTokenWithChars(Result, CurPtr,
61: branch 0 taken
13433: branch 1 taken
774 13494: Wide ? tok::wide_string_literal : tok::string_literal);
775 13494: Result.setLiteralData(TokStart);
776 : }
777 :
778 : /// LexAngledStringLiteral - Lex the remainder of an angled string literal,
779 : /// after having lexed the '<' character. This is used for #include filenames.
780 632: void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
781 632: const char *NulCharacter = 0; // Does this string contain the \0 character?
782 632: const char *AfterLessPos = CurPtr;
783 632: char C = getAndAdvanceChar(CurPtr, Result);
7111: branch 0 taken
631: branch 1 taken
784 8374: while (C != '>') {
785 : // Skip escaped characters.
0: branch 0 not taken
7111: branch 1 taken
786 7111: if (C == '\\') {
787 : // Skip the escaped character.
788 0: C = getAndAdvanceChar(CurPtr, Result);
7110: branch 0 taken
1: branch 1 taken
7110: branch 2 taken
0: branch 3 not taken
0: branch 4 not taken
7110: branch 5 taken
7110: branch 6 taken
7110: branch 7 taken
789 7111: } else if (C == '\n' || C == '\r' || // Newline.
790 : (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
791 : // If the filename is unterminated, then it must just be a lone <
792 : // character. Return this as such.
793 1: FormTokenWithChars(Result, AfterLessPos, tok::less);
794 1: return;
0: branch 0 not taken
7110: branch 1 taken
795 7110: } else if (C == 0) {
796 0: NulCharacter = CurPtr-1;
797 : }
798 7110: C = getAndAdvanceChar(CurPtr, Result);
799 : }
800 :
801 : // If a nul character existed in the string, warn about it.
0: branch 0 not taken
631: branch 1 taken
0: branch 3 not taken
0: branch 4 not taken
0: branch 5 not taken
631: branch 6 taken
802 631: if (NulCharacter && !isLexingRawMode())
803 0: Diag(NulCharacter, diag::null_in_string);
804 :
805 : // Update the location of token as well as BufferPtr.
806 631: const char *TokStart = BufferPtr;
807 631: FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
808 631: Result.setLiteralData(TokStart);
809 : }
810 :
811 :
812 : /// LexCharConstant - Lex the remainder of a character constant, after having
813 : /// lexed either ' or L'.
814 377: void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
815 377: const char *NulCharacter = 0; // Does this character contain the \0 character?
816 :
817 : // Handle the common case of 'x' and '\y' efficiently.
818 377: char C = getAndAdvanceChar(CurPtr, Result);
2: branch 0 taken
375: branch 1 taken
819 377: if (C == '\'') {
2: branch 1 taken
0: branch 2 not taken
0: branch 3 not taken
2: branch 4 taken
0: branch 5 not taken
2: branch 6 taken
820 2: if (!isLexingRawMode() && !Features.AsmPreprocessor)
821 0: Diag(BufferPtr, diag::err_empty_character);
822 2: FormTokenWithChars(Result, CurPtr, tok::unknown);
823 2: return;
87: branch 0 taken
288: branch 1 taken
824 375: } else if (C == '\\') {
825 : // Skip the escaped character.
826 : // FIXME: UCN's.
827 87: C = getAndAdvanceChar(CurPtr, Result);
828 : }
829 :
375: branch 0 taken
0: branch 1 not taken
372: branch 2 taken
3: branch 3 taken
372: branch 4 taken
0: branch 5 not taken
338: branch 6 taken
34: branch 7 taken
830 713: if (C && C != '\n' && C != '\r' && CurPtr[0] == '\'') {
831 338: ++CurPtr;
832 : } else {
833 : // Fall back on generic code for embedded nulls, newlines, wide chars.
70: branch 0 taken
33: branch 1 taken
834 103: do {
835 : // Skip escaped characters.
0: branch 0 not taken
107: branch 1 taken
836 107: if (C == '\\') {
837 : // Skip the escaped character.
838 0: C = getAndAdvanceChar(CurPtr, Result);
103: branch 0 taken
4: branch 1 taken
103: branch 2 taken
0: branch 3 not taken
0: branch 4 not taken
103: branch 5 taken
103: branch 6 taken
103: branch 7 taken
839 107: } else if (C == '\n' || C == '\r' || // Newline.
840 : (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2: branch 1 taken
2: branch 2 taken
0: branch 3 not taken
2: branch 4 taken
0: branch 5 not taken
4: branch 6 taken
841 4: if (!isLexingRawMode() && !Features.AsmPreprocessor)
842 0: Diag(BufferPtr, diag::err_unterminated_char);
843 4: FormTokenWithChars(Result, CurPtr-1, tok::unknown);
844 4: return;
0: branch 0 not taken
103: branch 1 taken
845 103: } else if (C == 0) {
846 0: NulCharacter = CurPtr-1;
847 : }
848 103: C = getAndAdvanceChar(CurPtr, Result);
849 : } while (C != '\'');
850 : }
851 :
0: branch 0 not taken
371: branch 1 taken
0: branch 3 not taken
0: branch 4 not taken
0: branch 5 not taken
371: branch 6 taken
852 371: if (NulCharacter && !isLexingRawMode())
853 0: Diag(NulCharacter, diag::null_in_char);
854 :
855 : // Update the location of token as well as BufferPtr.
856 371: const char *TokStart = BufferPtr;
857 371: FormTokenWithChars(Result, CurPtr, tok::char_constant);
858 371: Result.setLiteralData(TokStart);
859 : }
860 :
861 : /// SkipWhitespace - Efficiently skip over a series of whitespace characters.
862 : /// Update BufferPtr to point to the next non-whitespace character and return.
863 : ///
864 : /// This method forms a token and returns true if KeepWhitespaceMode is enabled.
865 : ///
866 178792: bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
867 : // Whitespace - Skip it, then return the token after the whitespace.
868 178792: unsigned char Char = *CurPtr; // Skip consecutive spaces efficiently.
869 36464: while (1) {
870 : // Skip horizontal whitespace very aggressively.
206019: branch 1 taken
215256: branch 2 taken
871 636531: while (isHorizontalWhitespace(Char))
872 206019: Char = *++CurPtr;
873 :
874 : // Otherwise if we have something other than whitespace, we're done.
178796: branch 0 taken
36460: branch 1 taken
4: branch 2 taken
178792: branch 3 taken
875 215256: if (Char != '\n' && Char != '\r')
876 178792: break;
877 :
0: branch 0 not taken
36464: branch 1 taken
878 36464: if (ParsingPreprocessorDirective) {
879 : // End of preprocessor directive line, let LexTokenInternal handle this.
880 0: BufferPtr = CurPtr;
881 0: return false;
882 : }
883 :
884 : // ok, but handle newline.
885 : // The returned token is at the start of the line.
886 36464: Result.setFlag(Token::StartOfLine);
887 : // No leading whitespace seen so far.
888 36464: Result.clearFlag(Token::LeadingSpace);
889 36464: Char = *++CurPtr;
890 : }
891 :
892 : // If this isn't immediately after a newline, there is leading space.
893 178792: char PrevChar = CurPtr[-1];
65709: branch 0 taken
113083: branch 1 taken
65709: branch 2 taken
0: branch 3 not taken
894 178792: if (PrevChar != '\n' && PrevChar != '\r')
895 65709: Result.setFlag(Token::LeadingSpace);
896 :
897 : // If the client wants us to return whitespace, return it now.
0: branch 1 not taken
178792: branch 2 taken
898 178792: if (isKeepWhitespaceMode()) {
899 0: FormTokenWithChars(Result, CurPtr, tok::unknown);
900 0: return true;
901 : }
902 :
903 178792: BufferPtr = CurPtr;
904 178792: return false;
905 : }
906 :
907 : /// SkipBCPLComment - We have just read the // characters from input. Skip
908 : /// until we find the newline character that terminates the comment. Then
909 : /// update BufferPtr and return.
910 : ///
911 : /// If we're in KeepCommentMode or any CommentHandler has inserted
912 : /// some tokens, this will store the first token and return true.
913 53903: bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
914 : // If BCPL comments aren't explicitly enabled for this language, emit an
915 : // extension warning.
26: branch 0 taken
53877: branch 1 taken
11: branch 3 taken
15: branch 4 taken
11: branch 5 taken
53892: branch 6 taken
916 53903: if (!Features.BCPLComment && !isLexingRawMode()) {
917 11: Diag(BufferPtr, diag::ext_bcpl_comment);
918 :
919 : // Mark them enabled so we only emit one warning for this translation
920 : // unit.
921 11: Features.BCPLComment = true;
922 : }
923 :
924 : // Scan over the body of the comment. The common case, when scanning, is that
925 : // the comment contains normal ascii characters with nothing interesting in
926 : // them. As such, optimize for this case with the inner loop.
927 : char C;
1571: branch 0 taken
0: branch 1 not taken
1571: branch 2 taken
0: branch 3 not taken
928 1571: do {
929 55474: C = *CurPtr;
930 : // FIXME: Speedup BCPL comment lexing. Just scan for a \n or \r character.
931 : // If we find a \n character, scan backwards, checking to see if it's an
932 : // escaped newline, like we do for block comments.
933 :
934 : // Skip over characters in the fast loop.
2637593: branch 0 taken
3: branch 1 taken
2636376: branch 2 taken
1217: branch 3 taken
2636022: branch 4 taken
354: branch 5 taken
2582124: branch 6 taken
53898: branch 7 taken
2582122: branch 8 taken
2: branch 9 taken
935 2693070: while (C != 0 && // Potentially EOF.
936 : C != '\\' && // Potentially escaped newline.
937 : C != '?' && // Potentially trigraph.
938 : C != '\n' && C != '\r') // Newline or DOS-style newline.
939 2582122: C = *++CurPtr;
940 :
941 : // If this is a newline, we're done.
1576: branch 0 taken
53898: branch 1 taken
2: branch 2 taken
1574: branch 3 taken
942 55474: if (C == '\n' || C == '\r')
943 53900: break; // Found the newline? Break out!
944 :
945 : // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
946 : // properly decode the character. Read it in raw mode to avoid emitting
947 : // diagnostics about things like trigraphs. If we see an escaped newline,
948 : // we'll handle it below.
949 1574: const char *OldPtr = CurPtr;
950 1574: bool OldRawMode = isLexingRawMode();
951 1574: LexingRawMode = true;
952 1574: C = getAndAdvanceChar(CurPtr, Result);
953 1574: LexingRawMode = OldRawMode;
954 :
955 : // If the char that we finally got was a \n, then we must have had something
956 : // like \<newline><newline>. We don't want to have consumed the second
957 : // newline; we want CurPtr to end up pointing to it down below.
1573: branch 0 taken
1: branch 1 taken
0: branch 2 not taken
1573: branch 3 taken
958 1574: if (C == '\n' || C == '\r') {
959 1: --CurPtr;
960 1: C = 'x'; // doesn't matter what this is.
961 : }
962 :
963 : // If we read multiple characters, and one of those characters was a \r or
964 : // \n, then we had an escaped newline within the comment. Emit diagnostic
965 : // unless the next line is also a // comment.
584: branch 0 taken
990: branch 1 taken
465: branch 2 taken
119: branch 3 taken
451: branch 4 taken
14: branch 5 taken
966 1574: if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
914: branch 0 taken
10: branch 1 taken
967 924: for (; OldPtr != CurPtr; ++OldPtr)
473: branch 0 taken
441: branch 1 taken
0: branch 2 not taken
473: branch 3 taken
968 914: if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
969 : // Okay, we found a // comment that ends in a newline, if the next
970 : // line is also a // comment, but has spaces, don't emit a diagnostic.
424: branch 1 taken
17: branch 2 taken
971 441: if (isspace(C)) {
972 424: const char *ForwardPtr = CurPtr;
6908: branch 1 taken
424: branch 2 taken
973 7756: while (isspace(*ForwardPtr)) // Skip whitespace.
974 6908: ++ForwardPtr;
332: branch 0 taken
92: branch 1 taken
332: branch 2 taken
0: branch 3 not taken
975 424: if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
976 332: break;
977 : }
978 :
55: branch 1 taken
54: branch 2 taken
979 109: if (!isLexingRawMode())
980 55: Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment);
981 109: break;
982 : }
983 : }
984 :
3: branch 0 taken
1571: branch 1 taken
985 1574: if (CurPtr == BufferEnd+1) { --CurPtr; break; }
986 : } while (C != '\n' && C != '\r');
987 :
988 : // Found but did not consume the newline. Notify comment handlers about the
989 : // comment unless we're in a #if 0 block.
38464: branch 0 taken
15439: branch 1 taken
38436: branch 3 taken
28: branch 4 taken
0: branch 9 not taken
38436: branch 10 taken
0: branch 11 not taken
53903: branch 12 taken
990 53903: if (PP && !isLexingRawMode() &&
991 : PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
992 : getSourceLocation(CurPtr)))) {
993 0: BufferPtr = CurPtr;
994 0: return true; // A token has to be returned.
995 : }
996 :
997 : // If we are returning comments as tokens, return this comment as a token.
15458: branch 1 taken
38445: branch 2 taken
998 53903: if (inKeepCommentMode())
999 15458: return SaveBCPLComment(Result, CurPtr);
1000 :
1001 : // If we are inside a preprocessor directive and we see the end of line,
1002 : // return immediately, so that the lexer can return this as an EOM token.
38261: branch 0 taken
184: branch 1 taken
3: branch 2 taken
38258: branch 3 taken
1003 38445: if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
1004 187: BufferPtr = CurPtr;
1005 187: return false;
1006 : }
1007 :
1008 : // Otherwise, eat the \n character. We don't care if this is a \n\r or
1009 : // \r\n sequence. This is an efficiency hack (because we know the \n can't
1010 : // contribute to another token), it isn't needed for correctness. Note that
1011 : // this is ok even in KeepWhitespaceMode, because we would have returned the
1012 : // comment above in that mode.
1013 38258: ++CurPtr;
1014 :
1015 : // The next returned token is at the start of the line.
1016 38258: Result.setFlag(Token::StartOfLine);
1017 : // No leading whitespace seen so far.
1018 38258: Result.clearFlag(Token::LeadingSpace);
1019 38258: BufferPtr = CurPtr;
1020 38258: return false;
1021 : }
1022 :
1023 : /// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in
1024 : /// an appropriate way and return it.
1025 15458: bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) {
1026 : // If we're not in a preprocessor directive, just return the // comment
1027 : // directly.
1028 15458: FormTokenWithChars(Result, CurPtr, tok::comment);
1029 :
15456: branch 0 taken
2: branch 1 taken
1030 15458: if (!ParsingPreprocessorDirective)
1031 15456: return true;
1032 :
1033 : // If this BCPL-style comment is in a macro definition, transmogrify it into
1034 : // a C-style block comment.
1035 2: std::string Spelling = PP->getSpelling(Result);
2: branch 1 taken
0: branch 2 not taken
2: branch 4 taken
0: branch 5 not taken
1036 2: assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?");
1037 2: Spelling[1] = '*'; // Change prefix to "/*".
1038 2: Spelling += "*/"; // add suffix.
1039 :
1040 2: Result.setKind(tok::comment);
1041 : PP->CreateString(&Spelling[0], Spelling.size(), Result,
1042 2: Result.getLocation());
1043 2: return true;
1044 : }
1045 :
1046 : /// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified
1047 : /// newline character (either \n or \r) is part of an escaped newline sequence.
1048 : /// Issue a diagnostic if so. We know the newline is inside a block comment.
1049 : static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
1050 24: Lexer *L) {
0: branch 0 not taken
24: branch 1 taken
24: branch 2 taken
24: branch 3 taken
1051 24: assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
1052 :
1053 : // Back up off the newline.
1054 24: --CurPtr;
1055 :
1056 : // If this is a two-character newline sequence, skip the other character.
18: branch 0 taken
6: branch 1 taken
0: branch 2 not taken
18: branch 3 taken
1057 24: if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
1058 : // \n\n or \r\r -> not escaped newline.
6: branch 0 taken
0: branch 1 not taken
1059 6: if (CurPtr[0] == CurPtr[1])
1060 6: return false;
1061 : // \n\r or \r\n -> skip the newline.
1062 0: --CurPtr;
1063 : }
1064 :
1065 : // If we have horizontal whitespace, skip over it. We allow whitespace
1066 : // between the slash and newline.
1067 18: bool HasSpace = false;
18: branch 1 taken
36: branch 2 taken
0: branch 3 not taken
18: branch 4 taken
36: branch 5 taken
18: branch 6 taken
1068 72: while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
1069 36: --CurPtr;
1070 36: HasSpace = true;
1071 : }
1072 :
1073 : // If we have a slash, we know this is an escaped newline.
6: branch 0 taken
12: branch 1 taken
1074 18: if (*CurPtr == '\\') {
0: branch 0 not taken
6: branch 1 taken
1075 6: if (CurPtr[-1] != '*') return false;
1076 : } else {
1077 : // It isn't a slash, is it the ?? / trigraph?
6: branch 0 taken
6: branch 1 taken
6: branch 2 taken
0: branch 3 not taken
6: branch 4 taken
0: branch 5 not taken
0: branch 6 not taken
6: branch 7 taken
1078 12: if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
1079 : CurPtr[-3] != '*')
1080 6: return false;
1081 :
1082 : // This is the trigraph ending the comment. Emit a stern warning!
1083 6: CurPtr -= 2;
1084 :
1085 : // If no trigraphs are enabled, warn that we ignored this trigraph and
1086 : // ignore this * character.
0: branch 1 not taken
6: branch 2 taken
1087 6: if (!L->getFeatures().Trigraphs) {
0: branch 1 not taken
0: branch 2 not taken
1088 0: if (!L->isLexingRawMode())
1089 0: L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
1090 0: return false;
1091 : }
5: branch 1 taken
1: branch 2 taken
1092 6: if (!L->isLexingRawMode())
1093 5: L->Diag(CurPtr, diag::trigraph_ends_block_comment);
1094 : }
1095 :
1096 : // Warn about having an escaped newline between the */ characters.
10: branch 1 taken
2: branch 2 taken
1097 12: if (!L->isLexingRawMode())
1098 10: L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
1099 :
1100 : // If there was space between the backslash and newline, warn about it.
12: branch 0 taken
0: branch 1 not taken
10: branch 3 taken
2: branch 4 taken
10: branch 5 taken
2: branch 6 taken
1101 12: if (HasSpace && !L->isLexingRawMode())
1102 10: L->Diag(CurPtr, diag::backslash_newline_space);
1103 :
1104 12: return true;
1105 : }
1106 :
1107 : #ifdef __SSE2__
1108 : #include <emmintrin.h>
1109 : #elif __ALTIVEC__
1110 : #include <altivec.h>
1111 : #undef bool
1112 : #endif
1113 :
1114 : /// SkipBlockComment - We have just read the /* characters from input. Read
1115 : /// until we find the */ characters that terminate the comment. Note that we
1116 : /// don't bother decoding trigraphs or escaped newlines in block comments,
1117 : /// because they cannot cause the comment to end. The only thing that can
1118 : /// happen is the comment could end with an escaped newline between the */ end
1119 : /// of comment.
1120 : ///
1121 : /// If we're in KeepCommentMode or any CommentHandler has inserted
1122 : /// some tokens, this will store the first token and return true.
1123 10873: bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
1124 : // Scan one character past where we should, looking for a '/' character. Once
1125 : // we find it, check to see if it was preceded by a *. This common
1126 : // optimization helps people who like to put a lot of * characters in their
1127 : // comments.
1128 :
1129 : // The first character we get with newlines and trigraphs skipped to handle
1130 : // the degenerate /*/ case below correctly if the * has an escaped newline
1131 : // after it.
1132 : unsigned CharSize;
1133 10873: unsigned char C = getCharAndSize(CurPtr, CharSize);
1134 10873: CurPtr += CharSize;
3: branch 0 taken
10870: branch 1 taken
3: branch 2 taken
0: branch 3 not taken
1135 10873: if (C == 0 && CurPtr == BufferEnd+1) {
0: branch 1 not taken
3: branch 2 taken
1136 3: if (!isLexingRawMode())
1137 0: Diag(BufferPtr, diag::err_unterminated_block_comment);
1138 3: --CurPtr;
1139 :
1140 : // KeepWhitespaceMode should return this broken comment as a token. Since
1141 : // it isn't a well formed comment, just return it as an 'unknown' token.
0: branch 1 not taken
3: branch 2 taken
1142 3: if (isKeepWhitespaceMode()) {
1143 0: FormTokenWithChars(Result, CurPtr, tok::unknown);
1144 0: return true;
1145 : }
1146 :
1147 3: BufferPtr = CurPtr;
1148 3: return false;
1149 : }
1150 :
1151 : // Check to see if the first character after the '/*' is another /. If so,
1152 : // then this slash does not end the block comment, it is part of it.
6: branch 0 taken
10864: branch 1 taken
1153 10870: if (C == '/')
1154 6: C = *CurPtr++;
1155 :
1156 1227: while (1) {
1157 : // Skip over all non-interesting characters until we find end of buffer or a
1158 : // (probably ending) '/' character.
11704: branch 0 taken
393: branch 1 taken
1159 12097: if (CurPtr + 24 < BufferEnd) {
1160 : // While not aligned to a 16-byte boundary.
97331: branch 0 taken
675: branch 1 taken
86302: branch 2 taken
11029: branch 3 taken
1161 109710: while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
1162 86302: C = *CurPtr++;
1163 :
11029: branch 0 taken
675: branch 1 taken
1164 11704: if (C == '/') goto FoundSlash;
1165 :
1166 : #ifdef __SSE2__
1167 : __m128i Slashes = _mm_set_epi8('/', '/', '/', '/', '/', '/', '/', '/',
1168 : '/', '/', '/', '/', '/', '/', '/', '/');
1169 : while (CurPtr+16 <= BufferEnd &&
1170 : _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)) == 0)
1171 : CurPtr += 16;
1172 : #elif __ALTIVEC__
1173 : __vector unsigned char Slashes = {
1174 : '/', '/', '/', '/', '/', '/', '/', '/',
1175 : '/', '/', '/', '/', '/', '/', '/', '/'
1176 : };
1177 : while (CurPtr+16 <= BufferEnd &&
1178 : !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes))
1179 : CurPtr += 16;
1180 : #else
1181 : // Scan for '/' quickly. Many block comments are very large.
274050: branch 0 taken
2801: branch 1 taken
271065: branch 2 taken
2985: branch 3 taken
268523: branch 4 taken
2542: branch 5 taken
265822: branch 6 taken
2701: branch 7 taken
265822: branch 8 taken
0: branch 9 not taken
1182 287880: while (CurPtr[0] != '/' &&
1183 : CurPtr[1] != '/' &&
1184 : CurPtr[2] != '/' &&
1185 : CurPtr[3] != '/' &&
1186 : CurPtr+4 < BufferEnd) {
1187 265822: CurPtr += 4;
1188 : }
1189 : #endif
1190 :
1191 : // It has to be one of the bytes scanned, increment to it and read one.
1192 11029: C = *CurPtr++;
1193 : }
1194 :
1195 : // Loop to scan the remainder.
20458: branch 0 taken
11422: branch 1 taken
20458: branch 2 taken
0: branch 3 not taken
1196 43302: while (C != '/' && C != '\0')
1197 20458: C = *CurPtr++;
1198 :
1199 12097: FoundSlash:
12097: branch 0 taken
0: branch 1 not taken
1200 12097: if (C == '/') {
1239: branch 0 taken
10858: branch 1 taken
1201 12097: if (CurPtr[-2] == '*') // We found the final */. We're done!
1202 10858: break;
1203 :
1215: branch 0 taken
24: branch 1 taken
0: branch 2 not taken
1215: branch 3 taken
1204 1239: if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
12: branch 1 taken
12: branch 2 taken
1205 24: if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
1206 : // We found the final */, though it had an escaped newline between the
1207 : // * and /. We're done!
1208 12: break;
1209 : }
1210 : }
2: branch 0 taken
1225: branch 1 taken
2: branch 2 taken
0: branch 3 not taken
1211 1227: if (CurPtr[0] == '*' && CurPtr[1] != '/') {
1212 : // If this is a /* inside of the comment, emit a warning. Don't do this
1213 : // if this is a /*/, which will end the comment. This misses cases with
1214 : // embedded escaped newlines, but oh well.
0: branch 1 not taken
2: branch 2 taken
1215 2: if (!isLexingRawMode())
1216 0: Diag(CurPtr-1, diag::warn_nested_block_comment);
1217 : }
0: branch 0 not taken
0: branch 1 not taken
0: branch 2 not taken
0: branch 3 not taken
1218 0: } else if (C == 0 && CurPtr == BufferEnd+1) {
0: branch 1 not taken
0: branch 2 not taken
1219 0: if (!isLexingRawMode())
1220 0: Diag(BufferPtr, diag::err_unterminated_block_comment);
1221 : // Note: the user probably forgot a */. We could continue immediately
1222 : // after the /*, but this would involve lexing a lot of what really is the
1223 : // comment, which surely would confuse the parser.
1224 0: --CurPtr;
1225 :
1226 : // KeepWhitespaceMode should return this broken comment as a token. Since
1227 : // it isn't a well formed comment, just return it as an 'unknown' token.
0: branch 1 not taken
0: branch 2 not taken
1228 0: if (isKeepWhitespaceMode()) {
1229 0: FormTokenWithChars(Result, CurPtr, tok::unknown);
1230 0: return true;
1231 : }
1232 :
1233 0: BufferPtr = CurPtr;
1234 0: return false;
1235 : }
1236 1227: C = *CurPtr++;
1237 : }
1238 :
1239 : // Notify comment handlers about the comment unless we're in a #if 0 block.
10525: branch 0 taken
345: branch 1 taken
7085: branch 3 taken
3440: branch 4 taken
0: branch 9 not taken
7085: branch 10 taken
0: branch 11 not taken
10870: branch 12 taken
1240 10870: if (PP && !isLexingRawMode() &&
1241 : PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
1242 : getSourceLocation(CurPtr)))) {
1243 0: BufferPtr = CurPtr;
1244 0: return true; // A token has to be returned.
1245 : }
1246 :
1247 : // If we are returning comments as tokens, return this comment as a token.
349: branch 1 taken
10521: branch 2 taken
1248 10870: if (inKeepCommentMode()) {
1249 349: FormTokenWithChars(Result, CurPtr, tok::comment);
1250 349: return true;
1251 : }
1252 :
1253 : // It is common for the tokens immediately after a /**/ comment to be
1254 : // whitespace. Instead of going through the big switch, handle it
1255 : // efficiently now. This is safe even in KeepWhitespaceMode because we would
1256 : // have already returned above with the comment as a token.
161: branch 1 taken
10360: branch 2 taken
1257 10521: if (isHorizontalWhitespace(*CurPtr)) {
1258 161: Result.setFlag(Token::LeadingSpace);
1259 161: SkipWhitespace(Result, CurPtr+1);
1260 161: return false;
1261 : }
1262 :
1263 : // Otherwise, just return so that the next character will be lexed as a token.
1264 10360: BufferPtr = CurPtr;
1265 10360: Result.setFlag(Token::LeadingSpace);
1266 10360: return false;
1267 : }
1268 :
1269 : //===----------------------------------------------------------------------===//
1270 : // Primary Lexing Entry Points
1271 : //===----------------------------------------------------------------------===//
1272 :
1273 : /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
1274 : /// uninterpreted string. This switches the lexer out of directive mode.
1275 24: std::string Lexer::ReadToEndOfLine() {
1276 : assert(ParsingPreprocessorDirective && ParsingFilename == false &&
24: branch 0 taken
0: branch 1 not taken
0: branch 2 not taken
24: branch 3 taken
1277 24: "Must be in a preprocessing directive!");
1278 24: std::string Result;
1279 24: Token Tmp;
1280 :
1281 : // CurPtr - Cache BufferPtr in an automatic variable.
1282 24: const char *CurPtr = BufferPtr;
1283 433: while (1) {
1284 457: char Char = getAndAdvanceChar(CurPtr, Tmp);
433: branch 0 taken
0: branch 1 not taken
24: branch 2 taken
1285 457: switch (Char) {
1286 : default:
1287 433: Result += Char;
1288 433: break;
1289 : case 0: // Null.
1290 : // Found end of file?
0: branch 0 not taken
0: branch 1 not taken
1291 0: if (CurPtr-1 != BufferEnd) {
1292 : // Nope, normal character, continue.
1293 0: Result += Char;
1294 0: break;
1295 : }
1296 : // FALL THROUGH.
1297 : case '\r':
1298 : case '\n':
1299 : // Okay, we found the end of the line. First, back up past the \0, \r, \n.
0: branch 0 not taken
24: branch 1 taken
1300 24: assert(CurPtr[-1] == Char && "Trigraphs for newline?");
1301 24: BufferPtr = CurPtr-1;
1302 :
1303 : // Next, lex the character, which should handle the EOM transition.
1304 24: Lex(Tmp);
24: branch 1 taken
0: branch 2 not taken
1305 24: assert(Tmp.is(tok::eom) && "Unexpected token!");
1306 :
1307 : // Finally, we're done, return the string we found.
1308 : return Result;
1309 : }
1310 : }
1311 : }
1312 :
1313 : /// LexEndOfFile - CurPtr points to the end of this file. Handle this
1314 : /// condition, reporting diagnostics and handling other edge cases as required.
1315 : /// This returns true if Result contains a token, false if PP.Lex should be
1316 : /// called again.
1317 7042: bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
1318 : // If we hit the end of the file while parsing a preprocessor directive,
1319 : // end the preprocessor directive first. The next token returned will
1320 : // then be the end of file.
0: branch 0 not taken
7042: branch 1 taken
1321 7042: if (ParsingPreprocessorDirective) {
1322 : // Done parsing the "line".
1323 0: ParsingPreprocessorDirective = false;
1324 : // Update the location of token as well as BufferPtr.
1325 0: FormTokenWithChars(Result, CurPtr, tok::eom);
1326 :
1327 : // Restore comment saving mode, in case it was disabled for directive.
1328 0: SetCommentRetentionState(PP->getCommentRetentionState());
1329 0: return true; // Have a token.
1330 : }
1331 :
1332 : // If we are in raw mode, return this event as an EOF token. Let the caller
1333 : // that put us in raw mode handle the event.
1341: branch 1 taken
5701: branch 2 taken
1334 7042: if (isLexingRawMode()) {
1335 1341: Result.startToken();
1336 1341: BufferPtr = BufferEnd;
1337 1341: FormTokenWithChars(Result, BufferEnd, tok::eof);
1338 1341: return true;
1339 : }
1340 :
1341 : // Otherwise, check if we are code-completing, then issue diagnostics for
1342 : // unterminated #if and missing newline.
1343 :
5701: branch 0 taken
0: branch 1 not taken
87: branch 3 taken
5614: branch 4 taken
87: branch 5 taken
5614: branch 6 taken
1344 5701: if (PP && PP->isCodeCompletionFile(FileLoc)) {
1345 : // We're at the end of the file, but we've been asked to consider the
1346 : // end of the file to be a code-completion token. Return the
1347 : // code-completion token.
1348 87: Result.startToken();
1349 87: FormTokenWithChars(Result, CurPtr, tok::code_completion);
1350 :
1351 : // Only do the eof -> code_completion translation once.
1352 87: PP->SetCodeCompletionPoint(0, 0, 0);
1353 87: return true;
1354 : }
1355 :
1356 : // If we are in a #if directive, emit an error.
0: branch 1 not taken
5614: branch 2 taken
1357 11228: while (!ConditionalStack.empty()) {
1358 : PP->Diag(ConditionalStack.back().IfLoc,
1359 0: diag::err_pp_unterminated_conditional);
1360 0: ConditionalStack.pop_back();
1361 : }
1362 :
1363 : // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
1364 : // a pedwarn.
5536: branch 0 taken
78: branch 1 taken
87: branch 2 taken
5449: branch 3 taken
87: branch 4 taken
0: branch 5 not taken
1365 5614: if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))
1366 : Diag(BufferEnd, diag::ext_no_newline_eof)
1367 : << CodeModificationHint::CreateInsertion(getSourceLocation(BufferEnd),
1368 87: "\n");
1369 :
1370 5614: BufferPtr = CurPtr;
1371 :
1372 : // Finally, let the preprocessor handle this.
1373 5614: return PP->HandleEndOfFile(Result);
1374 : }
1375 :
1376 : /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
1377 : /// the specified lexer will return a tok::l_paren token, 0 if it is something
1378 : /// else and 2 if there are no more tokens in the buffer controlled by the
1379 : /// lexer.
1380 2126: unsigned Lexer::isNextPPTokenLParen() {
0: branch 0 not taken
2126: branch 1 taken
1381 2126: assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
1382 :
1383 : // Switch to 'skipping' mode. This will ensure that we can lex a token
1384 : // without emitting diagnostics, disables macro expansion, and will cause EOF
1385 : // to return an EOF token instead of popping the include stack.
1386 2126: LexingRawMode = true;
1387 :
1388 : // Save state that can be changed while lexing so that we can restore it.
1389 2126: const char *TmpBufferPtr = BufferPtr;
1390 2126: bool inPPDirectiveMode = ParsingPreprocessorDirective;
1391 :
1392 2126: Token Tok;
1393 2126: Tok.startToken();
1394 2126: LexTokenInternal(Tok);
1395 :
1396 : // Restore state that may have changed.
1397 2126: BufferPtr = TmpBufferPtr;
1398 2126: ParsingPreprocessorDirective = inPPDirectiveMode;
1399 :
1400 : // Restore the lexer back to non-skipping mode.
1401 2126: LexingRawMode = false;
1402 :
1: branch 1 taken
2125: branch 2 taken
1403 2126: if (Tok.is(tok::eof))
1404 1: return 2;
1405 2125: return Tok.is(tok::l_paren);
1406 : }
1407 :
1408 : /// FindConflictEnd - Find the end of a version control conflict marker.
1409 4: static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd) {
1410 4: llvm::StringRef RestOfBuffer(CurPtr+7, BufferEnd-CurPtr-7);
1411 4: size_t Pos = RestOfBuffer.find(">>>>>>>");
4: branch 0 taken
0: branch 1 not taken
1412 8: while (Pos != llvm::StringRef::npos) {
1413 : // Must occur at start of line.
4: branch 1 taken
0: branch 2 not taken
0: branch 4 not taken
4: branch 5 taken
0: branch 6 not taken
4: branch 7 taken
1414 4: if (RestOfBuffer[Pos-1] != '\r' &&
1415 : RestOfBuffer[Pos-1] != '\n') {
1416 0: RestOfBuffer = RestOfBuffer.substr(Pos+7);
1417 0: continue;
1418 : }
1419 4: return RestOfBuffer.data()+Pos;
1420 : }
1421 0: return 0;
1422 : }
1423 :
1424 : /// IsStartOfConflictMarker - If the specified pointer is the start of a version
1425 : /// control conflict marker like '<<<<<<<', recognize it as such, emit an error
1426 : /// and recover nicely. This returns true if it is a conflict marker and false
1427 : /// if not.
1428 8: bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
1429 : // Only a conflict marker if it starts at the beginning of a line.
8: branch 0 taken
0: branch 1 not taken
4: branch 2 taken
4: branch 3 taken
4: branch 4 taken
0: branch 5 not taken
1430 8: if (CurPtr != BufferStart &&
1431 : CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
1432 4: return false;
1433 :
1434 : // Check to see if we have <<<<<<<.
4: branch 0 taken
0: branch 1 not taken
0: branch 5 not taken
4: branch 6 taken
0: branch 7 not taken
4: branch 8 taken
1435 4: if (BufferEnd-CurPtr < 8 ||
1436 : llvm::StringRef(CurPtr, 7) != "<<<<<<<")
1437 0: return false;
1438 :
1439 : // If we have a situation where we don't care about conflict markers, ignore
1440 : // it.
4: branch 0 taken
0: branch 1 not taken
2: branch 3 taken
2: branch 4 taken
2: branch 5 taken
2: branch 6 taken
1441 4: if (IsInConflictMarker || isLexingRawMode())
1442 2: return false;
1443 :
1444 : // Check to see if there is a >>>>>>> somewhere in the buffer at the start of
1445 : // a line to terminate this conflict marker.
2: branch 1 taken
0: branch 2 not taken
1446 2: if (FindConflictEnd(CurPtr+7, BufferEnd)) {
1447 : // We found a match. We are really in a conflict marker.
1448 : // Diagnose this, and ignore to the end of line.
1449 2: Diag(CurPtr, diag::err_conflict_marker);
1450 2: IsInConflictMarker = true;
1451 :
1452 : // Skip ahead to the end of line. We know this exists because the
1453 : // end-of-conflict marker starts with \r or \n.
176: branch 0 taken
0: branch 1 not taken
174: branch 2 taken
2: branch 3 taken
1454 178: while (*CurPtr != '\r' && *CurPtr != '\n') {
0: branch 0 not taken
174: branch 1 taken
1455 174: assert(CurPtr != BufferEnd && "Didn't find end of line");
1456 174: ++CurPtr;
1457 : }
1458 2: BufferPtr = CurPtr;
1459 2: return true;
1460 : }
1461 :
1462 : // No end of conflict marker found.
1463 0: return false;
1464 : }
1465 :
1466 :
1467 : /// HandleEndOfConflictMarker - If this is a '=======' or '|||||||' or '>>>>>>>'
1468 : /// marker, then it is the end of a conflict marker. Handle it by ignoring up
1469 : /// until the end of the line. This returns true if it is a conflict marker and
1470 : /// false if not.
1471 23: bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
1472 : // Only a conflict marker if it starts at the beginning of a line.
23: branch 0 taken
0: branch 1 not taken
16: branch 2 taken
7: branch 3 taken
16: branch 4 taken
0: branch 5 not taken
1473 23: if (CurPtr != BufferStart &&
1474 : CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
1475 16: return false;
1476 :
1477 : // If we have a situation where we don't care about conflict markers, ignore
1478 : // it.
2: branch 0 taken
5: branch 1 taken
0: branch 3 not taken
2: branch 4 taken
5: branch 5 taken
2: branch 6 taken
1479 7: if (!IsInConflictMarker || isLexingRawMode())
1480 5: return false;
1481 :
1482 : // Check to see if we have the marker (7 characters in a row).
12: branch 0 taken
2: branch 1 taken
1483 14: for (unsigned i = 1; i != 7; ++i)
0: branch 0 not taken
12: branch 1 taken
1484 12: if (CurPtr[i] != CurPtr[0])
1485 0: return false;
1486 :
1487 : // If we do have it, search for the end of the conflict marker. This could
1488 : // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
1489 : // be the end of conflict marker.
2: branch 1 taken
0: branch 2 not taken
1490 2: if (const char *End = FindConflictEnd(CurPtr, BufferEnd)) {
1491 2: CurPtr = End;
1492 :
1493 : // Skip ahead to the end of line.
32: branch 0 taken
0: branch 1 not taken
32: branch 2 taken
0: branch 3 not taken
30: branch 4 taken
2: branch 5 taken
1494 34: while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
1495 30: ++CurPtr;
1496 :
1497 2: BufferPtr = CurPtr;
1498 :
1499 : // No longer in the conflict marker.
1500 2: IsInConflictMarker = false;
1501 2: return true;
1502 : }
1503 :
1504 0: return false;
1505 : }
1506 :
1507 :
1508 : /// LexTokenInternal - This implements a simple C family lexer. It is an
1509 : /// extremely performance critical piece of code. This assumes that the buffer
1510 : /// has a null character at the end of the file. This returns a preprocessing
1511 : /// token, not a normal token, as such, it is an internal interface. It assumes
1512 : /// that the Flags of result have been cleared before calling this.
1513 2903142: void Lexer::LexTokenInternal(Token &Result) {
1514 2903142: LexNextToken:
1515 : // New token, can't need cleaning yet.
1516 2903142: Result.clearFlag(Token::NeedsCleaning);
1517 2903142: Result.setIdentifierInfo(0);
1518 :
1519 : // CurPtr - Cache BufferPtr in an automatic variable.
1520 2903142: const char *CurPtr = BufferPtr;
1521 :
1522 : // Small amounts of horizontal whitespace is very common between tokens.
1917746: branch 0 taken
985396: branch 1 taken
10748: branch 2 taken
1906998: branch 3 taken
1523 2903142: if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
1524 996144: ++CurPtr;
49282: branch 0 taken
1000639: branch 1 taken
4495: branch 2 taken
996144: branch 3 taken
1525 2046065: while ((*CurPtr == ' ') || (*CurPtr == '\t'))
1526 53777: ++CurPtr;
1527 :
1528 : // If we are keeping whitespace and other tokens, just return what we just
1529 : // skipped. The next lexer invocation will return the token after the
1530 : // whitespace.
0: branch 1 not taken
996144: branch 2 taken
1531 996144: if (isKeepWhitespaceMode()) {
1532 0: FormTokenWithChars(Result, CurPtr, tok::unknown);
1533 0: return;
1534 : }
1535 :
1536 996144: BufferPtr = CurPtr;
1537 996144: Result.setFlag(Token::LeadingSpace);
1538 : }
1539 :
1540 : unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
1541 :
1542 : // Read a character, advancing over it.
1543 2903142: char Char = getAndAdvanceChar(CurPtr, Result);
1544 : tok::TokenKind Kind;
1545 :
7040: branch 0 taken
2: branch 1 taken
466324: branch 2 taken
2016: branch 3 taken
278349: branch 4 taken
3266: branch 5 taken
1167573: branch 6 taken
6: branch 7 taken
346: branch 8 taken
13436: branch 9 taken
1570: branch 10 taken
8974: branch 11 taken
9500: branch 12 taken
114875: branch 13 taken
112925: branch 14 taken
30808: branch 15 taken
64523: branch 16 taken
4011: branch 17 taken
7574: branch 18 taken
28571: branch 19 taken
4910: branch 20 taken
23561: branch 21 taken
458: branch 22 taken
2172: branch 23 taken
42756: branch 24 taken
153: branch 25 taken
12950: branch 26 taken
11871: branch 27 taken
940: branch 28 taken
1863: branch 29 taken
15980: branch 30 taken
86303: branch 31 taken
21704: branch 32 taken
35587: branch 33 taken
306726: branch 34 taken
13508: branch 35 taken
11: branch 36 taken
1546 2903142: switch (Char) {
1547 : case 0: // Null.
1548 : // Found end of file?
7040: branch 0 taken
0: branch 1 not taken
1549 7040: if (CurPtr-1 == BufferEnd) {
1550 : // Read the PP instance variable into an automatic variable, because
1551 : // LexEndOfFile will often delete 'this'.
1552 7040: Preprocessor *PPCache = PP;
3940: branch 1 taken
3100: branch 2 taken
1553 7040: if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file.
1554 3940: return; // Got a token to return.
0: branch 0 not taken
3100: branch 1 taken
1555 3100: assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
1556 3100: return PPCache->Lex(Result);
1557 : }
1558 :
0: branch 1 not taken
0: branch 2 not taken
1559 0: if (!isLexingRawMode())
1560 0: Diag(CurPtr-1, diag::null_in_file);
1561 0: Result.setFlag(Token::LeadingSpace);
0: branch 1 not taken
0: branch 2 not taken
1562 0: if (SkipWhitespace(Result, CurPtr))
1563 0: return; // KeepWhitespaceMode
1564 :
1565 0: goto LexNextToken; // GCC isn't tail call eliminating.
1566 :
1567 : case 26: // DOS & CP/M EOF: "^Z".
1568 : // If we're in Microsoft extensions mode, treat this as end of file.
2: branch 0 taken
0: branch 1 not taken
1569 2: if (Features.Microsoft) {
1570 : // Read the PP instance variable into an automatic variable, because
1571 : // LexEndOfFile will often delete 'this'.
1572 2: Preprocessor *PPCache = PP;
2: branch 1 taken
0: branch 2 not taken
1573 2: if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file.
1574 2: return; // Got a token to return.
0: branch 0 not taken
0: branch 1 not taken
1575 0: assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
1576 0: return PPCache->Lex(Result);
1577 : }
1578 : // If Microsoft extensions are disabled, this is just random garbage.
1579 0: Kind = tok::unknown;
1580 0: break;
1581 :
1582 : case '\n':
1583 : case '\r':
1584 : // If we are inside a preprocessor directive and we see the end of line,
1585 : // we know we are done with the directive, so return an EOM token.
294714: branch 0 taken
171610: branch 1 taken
1586 466324: if (ParsingPreprocessorDirective) {
1587 : // Done parsing the "line".
1588 294714: ParsingPreprocessorDirective = false;
1589 :
1590 : // Restore comment saving mode, in case it was disabled for directive.
1591 294714: SetCommentRetentionState(PP->getCommentRetentionState());
1592 :
1593 : // Since we consumed a newline, we are back at the start of a line.
1594 294714: IsAtStartOfLine = true;
1595 :
1596 294714: Kind = tok::eom;
1597 294714: break;
1598 : }
1599 : // The returned token is at the start of the line.
1600 171610: Result.setFlag(Token::StartOfLine);
1601 : // No leading whitespace seen so far.
1602 171610: Result.clearFlag(Token::LeadingSpace);
1603 :
0: branch 1 not taken
171610: branch 2 taken
1604 171610: if (SkipWhitespace(Result, CurPtr))
1605 0: return; // KeepWhitespaceMode
1606 171610: goto LexNextToken; // GCC isn't tail call eliminating.
1607 : case ' ':
1608 : case '\t':
1609 : case '\f':
1610 : case '\v':
1611 7021: SkipHorizontalWhitespace:
1612 7021: Result.setFlag(Token::LeadingSpace);
0: branch 1 not taken
7021: branch 2 taken
1613 7021: if (SkipWhitespace(Result, CurPtr))
1614 0: return; // KeepWhitespaceMode
1615 :
1616 45478: SkipIgnoredUnits:
1617 45478: CurPtr = BufferPtr;
1618 :
1619 : // If the next token is obviously a // or /* */ comment, skip it efficiently
1620 : // too (without going through the big switch stmt).
22796: branch 0 taken
22682: branch 1 taken
22784: branch 2 taken
12: branch 3 taken
22784: branch 5 taken
0: branch 6 not taken
22784: branch 7 taken
0: branch 8 not taken
22784: branch 9 taken
22694: branch 10 taken
1621 45478: if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
1622 : Features.BCPLComment) {
0: branch 1 not taken
22784: branch 2 taken
1623 22784: if (SkipBCPLComment(Result, CurPtr+2))
1624 0: return; // There is a token to return.
1625 22784: goto SkipIgnoredUnits;
12: branch 0 taken
22682: branch 1 taken
12: branch 2 taken
0: branch 3 not taken
12: branch 5 taken
0: branch 6 not taken
12: branch 7 taken
22682: branch 8 taken
1626 22694: } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
0: branch 1 not taken
12: branch 2 taken
1627 12: if (SkipBlockComment(Result, CurPtr+2))
1628 0: return; // There is a token to return.
1629 12: goto SkipIgnoredUnits;
5005: branch 1 taken
17677: branch 2 taken
1630 22682: } else if (isHorizontalWhitespace(*CurPtr)) {
1631 5005: goto SkipHorizontalWhitespace;
1632 : }
1633 17677: goto LexNextToken; // GCC isn't tail call eliminating.
1634 :
1635 : // C99 6.4.4.1: Integer Constants.
1636 : // C99 6.4.4.2: Floating Constants.
1637 : case '0': case '1': case '2': case '3': case '4':
1638 : case '5': case '6': case '7': case '8': case '9':
1639 : // Notify MIOpt that we read a non-whitespace/non-comment token.
1640 278349: MIOpt.ReadToken();
1641 278349: return LexNumericConstant(Result, CurPtr);
1642 :
1643 : case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
1644 : // Notify MIOpt that we read a non-whitespace/non-comment token.
1645 3266: MIOpt.ReadToken();
1646 3266: Char = getCharAndSize(CurPtr, SizeTmp);
1647 :
1648 : // Wide string literal.
61: branch 0 taken
3205: branch 1 taken
1649 3266: if (Char == '"')
1650 : return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
1651 61: true);
1652 :
1653 : // Wide character constant.
31: branch 0 taken
3174: branch 1 taken
1654 3205: if (Char == '\'')
1655 31: return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
1656 : // FALL THROUGH, treating L like the start of an identifier.
1657 :
1658 : // C99 6.4.2: Identifiers.
1659 : case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
1660 : case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
1661 : case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
1662 : case 'V': case 'W': case 'X': case 'Y': case 'Z':
1663 : case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
1664 : case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
1665 : case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
1666 : case 'v': case 'w': case 'x': case 'y': case 'z':
1667 : case '_':
1668 : // Notify MIOpt that we read a non-whitespace/non-comment token.
1669 1170747: MIOpt.ReadToken();
1670 1170747: return LexIdentifier(Result, CurPtr);
1671 :
1672 : case '$': // $ in identifiers.
3: branch 0 taken
3: branch 1 taken
1673 6: if (Features.DollarIdents) {
3: branch 1 taken
0: branch 2 not taken
1674 3: if (!isLexingRawMode())
1675 3: Diag(CurPtr-1, diag::ext_dollar_in_identifier);
1676 : // Notify MIOpt that we read a non-whitespace/non-comment token.
1677 3: MIOpt.ReadToken();
1678 3: return LexIdentifier(Result, CurPtr);
1679 : }
1680 :
1681 3: Kind = tok::unknown;
1682 3: break;
1683 :
1684 : // C99 6.4.4: Character Constants.
1685 : case '\'':
1686 : // Notify MIOpt that we read a non-whitespace/non-comment token.
1687 346: MIOpt.ReadToken();
1688 346: return LexCharConstant(Result, CurPtr);
1689 :
1690 : // C99 6.4.5: String Literals.
1691 : case '"':
1692 : // Notify MIOpt that we read a non-whitespace/non-comment token.
1693 13436: MIOpt.ReadToken();
1694 13436: return LexStringLiteral(Result, CurPtr, false);
1695 :
1696 : // C99 6.4.6: Punctuators.
1697 : case '?':
1698 1570: Kind = tok::question;
1699 1570: break;
1700 : case '[':
1701 8974: Kind = tok::l_square;
1702 8974: break;
1703 : case ']':
1704 9500: Kind = tok::r_square;
1705 9500: break;
1706 : case '(':
1707 114875: Kind = tok::l_paren;
1708 114875: break;
1709 : case ')':
1710 112925: Kind = tok::r_paren;
1711 112925: break;
1712 : case '{':
1713 30808: Kind = tok::l_brace;
1714 30808: break;
1715 : case '}':
1716 64523: Kind = tok::r_brace;
1717 64523: break;
1718 : case '.':
1719 4011: Char = getCharAndSize(CurPtr, SizeTmp);
2986: branch 0 taken
1025: branch 1 taken
19: branch 2 taken
2967: branch 3 taken
1720 4011: if (Char >= '0' && Char <= '9') {
1721 : // Notify MIOpt that we read a non-whitespace/non-comment token.
1722 19: MIOpt.ReadToken();
1723 :
1724 19: return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
1291: branch 0 taken
2701: branch 1 taken
55: branch 2 taken
1236: branch 3 taken
1725 4047: } else if (Features.CPlusPlus && Char == '*') {
1726 55: Kind = tok::periodstar;
1727 55: CurPtr += SizeTmp;
935: branch 0 taken
3002: branch 1 taken
932: branch 3 taken
3: branch 4 taken
932: branch 5 taken
3005: branch 6 taken
1728 3937: } else if (Char == '.' &&
1729 : getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
1730 932: Kind = tok::ellipsis;
1731 : CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1732 932: SizeTmp2, Result);
1733 : } else {
1734 3005: Kind = tok::period;
1735 : }
1736 3992: break;
1737 : case '&':
1738 7574: Char = getCharAndSize(CurPtr, SizeTmp);
1744: branch 0 taken
5830: branch 1 taken
1739 7574: if (Char == '&') {
1740 1744: Kind = tok::ampamp;
1741 1744: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
37: branch 0 taken
5793: branch 1 taken
1742 5830: } else if (Char == '=') {
1743 37: Kind = tok::ampequal;
1744 37: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1745 : } else {
1746 5793: Kind = tok::amp;
1747 : }
1748 7574: break;
1749 : case '*':
66: branch 1 taken
28505: branch 2 taken
1750 28571: if (getCharAndSize(CurPtr, SizeTmp) == '=') {
1751 66: Kind = tok::starequal;
1752 66: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1753 : } else {
1754 28505: Kind = tok::star;
1755 : }
1756 28571: break;
1757 : case '+':
1758 4910: Char = getCharAndSize(CurPtr, SizeTmp);
1244: branch 0 taken
3666: branch 1 taken
1759 4910: if (Char == '+') {
1760 1244: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1761 1244: Kind = tok::plusplus;
240: branch 0 taken
3426: branch 1 taken
1762 3666: } else if (Char == '=') {
1763 240: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1764 240: Kind = tok::plusequal;
1765 : } else {
1766 3426: Kind = tok::plus;
1767 : }
1768 4910: break;
1769 : case '-':
1770 23561: Char = getCharAndSize(CurPtr, SizeTmp);
170: branch 0 taken
23391: branch 1 taken
1771 23561: if (Char == '-') { // --
1772 170: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1773 170: Kind = tok::minusminus;
1333: branch 0 taken
22058: branch 1 taken
594: branch 2 taken
739: branch 3 taken
80: branch 5 taken
514: branch 6 taken
80: branch 7 taken
23311: branch 8 taken
1774 23391: } else if (Char == '>' && Features.CPlusPlus &&
1775 : getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
1776 : CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1777 80: SizeTmp2, Result);
1778 80: Kind = tok::arrowstar;
1253: branch 0 taken
22058: branch 1 taken
1779 23311: } else if (Char == '>') { // ->
1780 1253: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1781 1253: Kind = tok::arrow;
33: branch 0 taken
22025: branch 1 taken
1782 22058: } else if (Char == '=') { // -=
1783 33: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1784 33: Kind = tok::minusequal;
1785 : } else {
1786 22025: Kind = tok::minus;
1787 : }
1788 23561: break;
1789 : case '~':
1790 458: Kind = tok::tilde;
1791 458: break;
1792 : case '!':
556: branch 1 taken
1616: branch 2 taken
1793 2172: if (getCharAndSize(CurPtr, SizeTmp) == '=') {
1794 556: Kind = tok::exclaimequal;
1795 556: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1796 : } else {
1797 1616: Kind = tok::exclaim;
1798 : }
1799 2172: break;
1800 : case '/':
1801 : // 6.4.9: Comments
1802 42756: Char = getCharAndSize(CurPtr, SizeTmp);
31121: branch 0 taken
11635: branch 1 taken
1803 42756: if (Char == '/') { // BCPL comment.
1804 : // Even if BCPL comments are disabled (e.g. in C89 mode), we generally
1805 : // want to lex this as a comment. There is one problem with this though,
1806 : // that in one particular corner case, this can change the behavior of the
1807 : // resultant program. For example, In "foo //**/ bar", C89 would lex
1808 : // this as "foo / bar" and langauges with BCPL comments would lex it as
1809 : // "foo". Check to see if the character after the second slash is a '*'.
1810 : // If so, we will lex that as a "/" instead of the start of a comment.
28: branch 0 taken
31093: branch 1 taken
26: branch 3 taken
2: branch 4 taken
31119: branch 5 taken
2: branch 6 taken
1811 31121: if (Features.BCPLComment ||
1812 : getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') {
15458: branch 2 taken
15661: branch 3 taken
1813 31119: if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
1814 15458: return; // There is a token to return.
1815 :
1816 : // It is common for the tokens immediately after a // comment to be
1817 : // whitespace (indentation for the next line). Instead of going through
1818 : // the big switch, handle it efficiently now.
1819 15661: goto SkipIgnoredUnits;
1820 : }
1821 : }
1822 :
10861: branch 0 taken
776: branch 1 taken
1823 11637: if (Char == '*') { // /**/ comment.
349: branch 2 taken
10512: branch 3 taken
1824 10861: if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
1825 349: return; // There is a token to return.
1826 10512: goto LexNextToken; // GCC isn't tail call eliminating.
1827 : }
1828 :
102: branch 0 taken
674: branch 1 taken
1829 776: if (Char == '=') {
1830 102: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1831 102: Kind = tok::slashequal;
1832 : } else {
1833 674: Kind = tok::slash;
1834 : }
1835 776: break;
1836 : case '%':
1837 153: Char = getCharAndSize(CurPtr, SizeTmp);
10: branch 0 taken
143: branch 1 taken
1838 153: if (Char == '=') {
1839 10: Kind = tok::percentequal;
1840 10: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
143: branch 0 taken
0: branch 1 not taken
4: branch 2 taken
139: branch 3 taken
1841 147: } else if (Features.Digraphs && Char == '>') {
1842 4: Kind = tok::r_brace; // '%>' -> '}'
1843 4: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
139: branch 0 taken
0: branch 1 not taken
8: branch 2 taken
131: branch 3 taken
1844 143: } else if (Features.Digraphs && Char == ':') {
1845 8: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1846 8: Char = getCharAndSize(CurPtr, SizeTmp);
0: branch 0 not taken
8: branch 1 taken
0: branch 3 not taken
0: branch 4 not taken
0: branch 5 not taken
8: branch 6 taken
1847 8: if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
1848 0: Kind = tok::hashhash; // '%:%:' -> '##'
1849 : CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1850 0: SizeTmp2, Result);
0: branch 0 not taken
8: branch 1 taken
8: branch 2 taken
8: branch 3 taken
1851 8: } else if (Char == '@' && Features.Microsoft) { // %:@ -> #@ -> Charize
1852 0: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
0: branch 1 not taken
0: branch 2 not taken
1853 0: if (!isLexingRawMode())
1854 0: Diag(BufferPtr, diag::charize_microsoft_ext);
1855 0: Kind = tok::hashat;
1856 : } else { // '%:' -> '#'
1857 : // We parsed a # character. If this occurs at the start of the line,
1858 : // it's actually the start of a preprocessing directive. Callback to
1859 : // the preprocessor to handle it.
1860 : // FIXME: -fpreprocessed mode??
8: branch 1 taken
0: branch 2 not taken
4: branch 3 taken
4: branch 4 taken
4: branch 5 taken
0: branch 6 not taken
4: branch 7 taken
4: branch 8 taken
1861 8: if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) {
1862 4: FormTokenWithChars(Result, CurPtr, tok::hash);
1863 4: PP->HandleDirective(Result);
1864 :
1865 : // As an optimization, if the preprocessor didn't switch lexers, tail
1866 : // recurse.
3: branch 1 taken
1: branch 2 taken
1867 4: if (PP->isCurrentLexer(this)) {
1868 : // Start a new token. If this is a #include or something, the PP may
1869 : // want us starting at the beginning of the line again. If so, set
1870 : // the StartOfLine flag.
3: branch 0 taken
0: branch 1 not taken
1871 3: if (IsAtStartOfLine) {
1872 3: Result.setFlag(Token::StartOfLine);
1873 3: IsAtStartOfLine = false;
1874 : }
1875 3: goto LexNextToken; // GCC isn't tail call eliminating.
1876 : }
1877 :
1878 1: return PP->Lex(Result);
1879 : }
1880 :
1881 4: Kind = tok::hash;
1882 : }
1883 : } else {
1884 131: Kind = tok::percent;
1885 : }
1886 149: break;
1887 : case '<':
1888 12950: Char = getCharAndSize(CurPtr, SizeTmp);
632: branch 0 taken
12318: branch 1 taken
1889 12950: if (ParsingFilename) {
1890 632: return LexAngledStringLiteral(Result, CurPtr);
814: branch 0 taken
11504: branch 1 taken
1891 12318: } else if (Char == '<') {
1892 814: char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
31: branch 0 taken
783: branch 1 taken
1893 814: if (After == '=') {
1894 31: Kind = tok::lesslessequal;
1895 : CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1896 31: SizeTmp2, Result);
8: branch 0 taken
775: branch 1 taken
2: branch 3 taken
6: branch 4 taken
2: branch 5 taken
781: branch 6 taken
1897 783: } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
1898 : // If this is actually a '<<<<<<<' version control conflict marker,
1899 : // recognize it as such and recover nicely.
1900 2: goto LexNextToken;
1901 : } else {
1902 781: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1903 781: Kind = tok::lessless;
1904 : }
56: branch 0 taken
11448: branch 1 taken
1905 11504: } else if (Char == '=') {
1906 56: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1907 56: Kind = tok::lessequal;
11442: branch 0 taken
6: branch 1 taken
8: branch 2 taken
11434: branch 3 taken
1908 11456: } else if (Features.Digraphs && Char == ':') { // '<:' -> '['
1909 8: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1910 8: Kind = tok::l_square;
11434: branch 0 taken
6: branch 1 taken
4: branch 2 taken
11430: branch 3 taken
1911 11444: } else if (Features.Digraphs && Char == '%') { // '<%' -> '{'
1912 4: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1913 4: Kind = tok::l_brace;
1914 : } else {
1915 11436: Kind = tok::less;
1916 : }
1917 12316: break;
1918 : case '>':
1919 11871: Char = getCharAndSize(CurPtr, SizeTmp);
576: branch 0 taken
11295: branch 1 taken
1920 11871: if (Char == '=') {
1921 576: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1922 576: Kind = tok::greaterequal;
220: branch 0 taken
11075: branch 1 taken
1923 11295: } else if (Char == '>') {
1924 220: char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
12: branch 0 taken
208: branch 1 taken
1925 220: if (After == '=') {
1926 : CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1927 12: SizeTmp2, Result);
1928 12: Kind = tok::greatergreaterequal;
12: branch 0 taken
196: branch 1 taken
0: branch 3 not taken
12: branch 4 taken
0: branch 5 not taken
208: branch 6 taken
1929 208: } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
1930 : // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
1931 0: goto LexNextToken;
1932 : } else {
1933 208: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1934 208: Kind = tok::greatergreater;
1935 : }
1936 :
1937 : } else {
1938 11075: Kind = tok::greater;
1939 : }
1940 11871: break;
1941 : case '^':
1942 940: Char = getCharAndSize(CurPtr, SizeTmp);
0: branch 0 not taken
940: branch 1 taken
1943 940: if (Char == '=') {
1944 0: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1945 0: Kind = tok::caretequal;
1946 : } else {
1947 940: Kind = tok::caret;
1948 : }
1949 940: break;
1950 : case '|':
1951 1863: Char = getCharAndSize(CurPtr, SizeTmp);
33: branch 0 taken
1830: branch 1 taken
1952 1863: if (Char == '=') {
1953 33: Kind = tok::pipeequal;
1954 33: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1316: branch 0 taken
514: branch 1 taken
1955 1830: } else if (Char == '|') {
1956 : // If this is '|||||||' and we're in a conflict marker, ignore it.
4: branch 0 taken
1312: branch 1 taken
1: branch 3 taken
3: branch 4 taken
1: branch 5 taken
1315: branch 6 taken
1957 1316: if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
1958 1: goto LexNextToken;
1959 1315: Kind = tok::pipepipe;
1960 1315: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1961 : } else {
1962 514: Kind = tok::pipe;
1963 : }
1964 1862: break;
1965 : case ':':
1966 15980: Char = getCharAndSize(CurPtr, SizeTmp);
15962: branch 0 taken
18: branch 1 taken
8: branch 2 taken
15954: branch 3 taken
1967 15988: if (Features.Digraphs && Char == '>') {
1968 8: Kind = tok::r_square; // ':>' -> ']'
1969 8: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
7207: branch 0 taken
8765: branch 1 taken
4533: branch 2 taken
2674: branch 3 taken
1970 20505: } else if (Features.CPlusPlus && Char == ':') {
1971 4533: Kind = tok::coloncolon;
1972 4533: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1973 : } else {
1974 11439: Kind = tok::colon;
1975 : }
1976 15980: break;
1977 : case ';':
1978 86303: Kind = tok::semi;
1979 86303: break;
1980 : case '=':
1981 21704: Char = getCharAndSize(CurPtr, SizeTmp);
2688: branch 0 taken
19016: branch 1 taken
1982 21704: if (Char == '=') {
1983 : // If this is '=======' and we're in a conflict marker, ignore it.
7: branch 0 taken
2681: branch 1 taken
1: branch 3 taken
6: branch 4 taken
1: branch 5 taken
2687: branch 6 taken
1984 2688: if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
1985 1: goto LexNextToken;
1986 :
1987 2687: Kind = tok::equalequal;
1988 2687: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1989 : } else {
1990 19016: Kind = tok::equal;
1991 : }
1992 21703: break;
1993 : case ',':
1994 35587: Kind = tok::comma;
1995 35587: break;
1996 : case '#':
1997 306726: Char = getCharAndSize(CurPtr, SizeTmp);
810: branch 0 taken
305916: branch 1 taken
1998 306726: if (Char == '#') {
1999 810: Kind = tok::hashhash;
2000 810: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2: branch 0 taken
305914: branch 1 taken
2: branch 2 taken
0: branch 3 not taken
2001 305918: } else if (Char == '@' && Features.Microsoft) { // #@ -> Charize
2002 2: Kind = tok::hashat;
1: branch 1 taken
1: branch 2 taken
2003 2: if (!isLexingRawMode())
2004 1: Diag(BufferPtr, diag::charize_microsoft_ext);
2005 2: CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2006 : } else {
2007 : // We parsed a # character. If this occurs at the start of the line,
2008 : // it's actually the start of a preprocessing directive. Callback to
2009 : // the preprocessor to handle it.
2010 : // FIXME: -fpreprocessed mode??
305613: branch 1 taken
301: branch 2 taken
288089: branch 3 taken
17524: branch 4 taken
288088: branch 5 taken
1: branch 6 taken
288088: branch 7 taken
17826: branch 8 taken
2011 305914: if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) {
2012 288088: FormTokenWithChars(Result, CurPtr, tok::hash);
2013 288088: PP->HandleDirective(Result);
2014 :
2015 : // As an optimization, if the preprocessor didn't switch lexers, tail
2016 : // recurse.
287506: branch 1 taken
582: branch 2 taken
2017 288088: if (PP->isCurrentLexer(this)) {
2018 : // Start a new token. If this is a #include or something, the PP may
2019 : // want us starting at the beginning of the line again. If so, set
2020 : // the StartOfLine flag.
287504: branch 0 taken
2: branch 1 taken
2021 287506: if (IsAtStartOfLine) {
2022 287504: Result.setFlag(Token::StartOfLine);
2023 287504: IsAtStartOfLine = false;
2024 : }
2025 287506: goto LexNextToken; // GCC isn't tail call eliminating.
2026 : }
2027 582: return PP->Lex(Result);
2028 : }
2029 :
2030 17826: Kind = tok::hash;
2031 : }
2032 18638: break;
2033 :
2034 : case '@':
2035 : // Objective C support.
13508: branch 0 taken
0: branch 1 not taken
13507: branch 2 taken
1: branch 3 taken
2036 27015: if (CurPtr[-1] == '@' && Features.ObjC1)
2037 13507: Kind = tok::at;
2038 : else
2039 1: Kind = tok::unknown;
2040 13508: break;
2041 :
2042 : case '\\':
2043 : // FIXME: UCN's.
2044 : // FALL THROUGH.
2045 : default:
2046 11: Kind = tok::unknown;
2047 : break;
2048 : }
2049 :
2050 : // Notify MIOpt that we read a non-whitespace/non-comment token.
2051 928774: MIOpt.ReadToken();
2052 :
2053 : // Update the location of token as well as BufferPtr.
2054 928774: FormTokenWithChars(Result, CurPtr, Kind);
2055 : }
Generated: 2010-02-10 01:31 by zcov