 |
|
 |
|
| Files: |
1 |
|
Branches Taken: |
35.7% |
5 / 14 |
| Generated: |
2010-02-10 01:31 |
|
Branches Executed: |
57.1% |
8 / 14 |
| |
|
Line Coverage: |
93.8% |
60 / 64 |
| |
 |
|
 |
1 : //===--- Lexer.h - C Language Family Lexer ----------------------*- C++ -*-===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : // This file defines the Lexer interface.
11 : //
12 : //===----------------------------------------------------------------------===//
13 :
14 : #ifndef LLVM_CLANG_LEXER_H
15 : #define LLVM_CLANG_LEXER_H
16 :
17 : #include "clang/Lex/PreprocessorLexer.h"
18 : #include "clang/Basic/LangOptions.h"
19 : #include "llvm/ADT/SmallVector.h"
20 : #include <string>
21 : #include <vector>
22 : #include <cassert>
23 :
24 : namespace clang {
25 : class Diagnostic;
26 : class SourceManager;
27 : class Preprocessor;
28 : class DiagnosticBuilder;
29 :
30 : /// Lexer - This provides a simple interface that turns a text buffer into a
31 : /// stream of tokens. This provides no support for file reading or buffering,
32 : /// or buffering/seeking of tokens, only forward lexing is supported. It relies
33 : /// on the specified Preprocessor object to handle preprocessor directives, etc.
0: branch 2 not taken
138969: branch 3 taken
5618: branch 7 taken
0: branch 8 not taken
34 144587: class Lexer : public PreprocessorLexer {
35 : //===--------------------------------------------------------------------===//
36 : // Constant configuration values for this lexer.
37 : const char *BufferStart; // Start of the buffer.
38 : const char *BufferEnd; // End of the buffer.
39 : SourceLocation FileLoc; // Location for start of file.
40 : LangOptions Features; // Features enabled by this language (cache).
41 : bool Is_PragmaLexer : 1; // True if lexer for _Pragma handling.
42 : bool IsInConflictMarker : 1; // True if in a VCS conflict marker '<<<<<<<'
43 :
44 : //===--------------------------------------------------------------------===//
45 : // Context-specific lexing flags set by the preprocessor.
46 : //
47 :
48 : /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace
49 : /// and return them as tokens. This is used for -C and -CC modes, and
50 : /// whitespace preservation can be useful for some clients that want to lex
51 : /// the file in raw mode and get every character from the file.
52 : ///
53 : /// When this is set to 2 it returns comments and whitespace. When set to 1
54 : /// it returns comments, when it is set to 0 it returns normal tokens only.
55 : unsigned char ExtendedTokenMode;
56 :
57 : //===--------------------------------------------------------------------===//
58 : // Context that changes as the file is lexed.
59 : // NOTE: any state that mutates when in raw mode must have save/restore code
60 : // in Lexer::isNextPPTokenLParen.
61 :
62 : // BufferPtr - Current pointer into the buffer. This is the next character
63 : // to be lexed.
64 : const char *BufferPtr;
65 :
66 : // IsAtStartOfLine - True if the next lexed token should get the "start of
67 : // line" flag set on it.
68 : bool IsAtStartOfLine;
69 :
70 : Lexer(const Lexer&); // DO NOT IMPLEMENT
71 : void operator=(const Lexer&); // DO NOT IMPLEMENT
72 : friend class Preprocessor;
73 :
74 : void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd);
75 : public:
76 :
77 : /// Lexer constructor - Create a new lexer object for the specified buffer
78 : /// with the specified preprocessor managing the lexing process. This lexer
79 : /// assumes that the associated file buffer and Preprocessor objects will
80 : /// outlive it, so it doesn't take ownership of either of them.
81 : Lexer(FileID FID, const llvm::MemoryBuffer *InputBuffer, Preprocessor &PP);
82 :
83 : /// Lexer constructor - Create a new raw lexer object. This object is only
84 : /// suitable for calls to 'LexRawToken'. This lexer assumes that the text
85 : /// range will outlive it, so it doesn't take ownership of it.
86 : Lexer(SourceLocation FileLoc, const LangOptions &Features,
87 : const char *BufStart, const char *BufPtr, const char *BufEnd);
88 :
89 : /// Lexer constructor - Create a new raw lexer object. This object is only
90 : /// suitable for calls to 'LexRawToken'. This lexer assumes that the text
91 : /// range will outlive it, so it doesn't take ownership of it.
92 : Lexer(FileID FID, const llvm::MemoryBuffer *InputBuffer,
93 : const SourceManager &SM, const LangOptions &Features);
94 :
95 : /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
96 : /// _Pragma expansion. This has a variety of magic semantics that this method
97 : /// sets up. It returns a new'd Lexer that must be delete'd when done.
98 : static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc,
99 : SourceLocation InstantiationLocStart,
100 : SourceLocation InstantiationLocEnd,
101 : unsigned TokLen, Preprocessor &PP);
102 :
103 :
104 : /// getFeatures - Return the language features currently enabled. NOTE: this
105 : /// lexer modifies features as a file is parsed!
106 76: const LangOptions &getFeatures() const { return Features; }
107 :
108 : /// getFileLoc - Return the File Location for the file we are lexing out of.
109 : /// The physical location encodes the location where the characters come from,
110 : /// the virtual location encodes where we should *claim* the characters came
111 : /// from. Currently this is only used by _Pragma handling.
112 944: SourceLocation getFileLoc() const { return FileLoc; }
113 :
114 : /// Lex - Lex the next token in the file into Result. If this is the end of
115 : /// file, Result is the tok::eof token. This function returns nothing; the
116 : /// token is delivered through Result. This implicitly involves
117 : /// the preprocessor.
118 2413704: void Lex(Token &Result) {
119 : // Start a new token.
120 2413704: Result.startToken();
121 :
122 : // NOTE, any changes here should also change code after calls to
123 : // Preprocessor::HandleDirective
151794: branch 0 taken
2261910: branch 1 taken
124 2413704: if (IsAtStartOfLine) {
125 151794: Result.setFlag(Token::StartOfLine);
126 151794: IsAtStartOfLine = false;
127 : }
128 :
129 : // Get a token. Note that this may delete the current lexer if the end of
130 : // file is reached.
131 2413704: LexTokenInternal(Result);
132 2413704: }
133 :
134 : /// isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
135 270688: bool isPragmaLexer() const { return Is_PragmaLexer; }
136 :
137 : /// IndirectLex - An indirect call to 'Lex' that can be invoked via
138 : /// the PreprocessorLexer interface.
139 729: void IndirectLex(Token &Result) { Lex(Result); }
140 :
141 : /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no
142 : /// associated preprocessor object). Return true if the 'next character to
143 : /// read' pointer points at the end of the lexer buffer, false otherwise.
144 137981: bool LexFromRawLexer(Token &Result) {
0: branch 0 not taken
3188: branch 1 taken
145 137981: assert(LexingRawMode && "Not already in raw mode!");
146 137981: Lex(Result);
147 : // Note that lexing to the end of the buffer doesn't implicitly delete the
148 : // lexer when in raw mode.
149 137981: return BufferPtr == BufferEnd;
150 : }
151 :
152 : /// isKeepWhitespaceMode - Return true if the lexer should return tokens for
153 : /// every character in the file, including whitespace and comments. This
154 : /// should only be used in raw mode, as the preprocessor is not prepared to
155 : /// deal with the excess tokens.
156 1913977: bool isKeepWhitespaceMode() const {
157 1913977: return ExtendedTokenMode > 1;
158 : }
159 :
160 : /// SetKeepWhitespaceMode - This method lets clients enable or disable
161 : /// whitespace retention mode.
162 0: void SetKeepWhitespaceMode(bool Val) {
163 : assert((!Val || LexingRawMode) &&
0: branch 0 not taken
0: branch 1 not taken
0: branch 2 not taken
0: branch 3 not taken
164 0: "Can only enable whitespace retention in raw mode");
0: branch 0 not taken
0: branch 1 not taken
165 0: ExtendedTokenMode = Val ? 2 : 0;
166 0: }
167 :
168 : /// inKeepCommentMode - Return true if the lexer should return comments as
169 : /// tokens.
170 87569: bool inKeepCommentMode() const {
171 87569: return ExtendedTokenMode > 0;
172 : }
173 :
174 : /// SetCommentRetentionState - Change the comment retention mode of the lexer
175 : /// to the specified mode. This is really only useful when lexing in raw
176 : /// mode, because otherwise the lexer needs to manage this.
177 739038: void SetCommentRetentionState(bool Mode) {
178 : assert(!isKeepWhitespaceMode() &&
179 739038: "Can't play with comment retention state when retaining whitespace");
180 877839: ExtendedTokenMode = Mode ? 1 : 0;
181 739038: }
182 :
183 4: const char *getBufferStart() const { return BufferStart; }
184 :
185 : /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
186 : /// uninterpreted string. This switches the lexer out of directive mode.
187 : std::string ReadToEndOfLine();
188 :
189 :
190 : /// Diag - Forwarding function for diagnostics. This translates a source
191 : /// position in the current buffer into a SourceLocation object for rendering.
192 : DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const;
193 :
194 : /// getSourceLocation - Return a source location identifier for the specified
195 : /// offset in the current file.
196 : SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const;
197 :
198 : /// getSourceLocation - Return a source location for the next character in
199 : /// the current file.
200 552: SourceLocation getSourceLocation() { return getSourceLocation(BufferPtr); }
201 :
202 : /// \brief Return the current location in the buffer.
203 51: const char *getBufferLocation() const { return BufferPtr; }
204 :
205 : /// Stringify - Convert the specified string into a C string by escaping '\'
206 : /// and " characters. This does not add surrounding ""'s to the string.
207 : /// If Charify is true, this escapes the ' character instead of ".
208 : static std::string Stringify(const std::string &Str, bool Charify = false);
209 :
210 : /// Stringify - Convert the specified string into a C string by escaping '\'
211 : /// and " characters. This does not add surrounding ""'s to the string.
212 : static void Stringify(llvm::SmallVectorImpl<char> &Str);
213 :
214 : /// MeasureTokenLength - Relex the token at the specified location and return
215 : /// its length in bytes in the input file. If the token needs cleaning (e.g.
216 : /// includes a trigraph or an escaped newline) then this count includes bytes
217 : /// that are part of that.
218 : static unsigned MeasureTokenLength(SourceLocation Loc,
219 : const SourceManager &SM,
220 : const LangOptions &LangOpts);
221 :
222 : //===--------------------------------------------------------------------===//
223 : // Internal implementation interfaces.
224 : private:
225 :
226 : /// LexTokenInternal - Internal interface to lex a preprocessing token. Called
227 : /// by Lex.
228 : ///
229 : void LexTokenInternal(Token &Result);
230 :
231 : /// FormTokenWithChars - When we lex a token, we have identified a span
232 : /// starting at BufferPtr, going to TokEnd that forms the token. This method
233 : /// takes that range and assigns it to the token as its location and size. In
234 : /// addition, since tokens cannot overlap, this also updates BufferPtr to be
235 : /// TokEnd.
236 : void FormTokenWithChars(Token &Result, const char *TokEnd,
237 2700239: tok::TokenKind Kind) {
238 2700239: unsigned TokLen = TokEnd-BufferPtr;
239 2700239: Result.setLength(TokLen);
240 2700239: Result.setLocation(getSourceLocation(BufferPtr, TokLen));
241 2700239: Result.setKind(Kind);
242 2700239: BufferPtr = TokEnd;
243 2700239: }
244 :
245 : /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a
246 : /// tok::l_paren token, 0 if it is something else and 2 if there are no more
247 : /// tokens in the buffer controlled by this lexer.
248 : unsigned isNextPPTokenLParen();
249 :
250 : //===--------------------------------------------------------------------===//
251 : // Lexer character reading interfaces.
252 : public:
253 :
254 : // This lexer is built on two interfaces for reading characters, both of which
255 : // automatically provide phase 1/2 translation. getAndAdvanceChar is used
256 : // when we know that we will be reading a character from the input buffer and
257 : // that this character will be part of the result token. This occurs in (e.g.)
258 : // string processing, because we know we need to read until we find the
259 : // closing '"' character.
260 : //
261 : // The second interface is the combination of getCharAndSize with
262 : // ConsumeChar. getCharAndSize reads a phase 1/2 translated character,
263 : // returning it and its size. If the lexer decides that this character is
264 : // part of the current token, it calls ConsumeChar on it. This two stage
265 : // approach allows us to emit diagnostics for characters (e.g. warnings about
266 : // trigraphs), knowing that they only are emitted if the character is
267 : // consumed.
268 :
269 : /// isObviouslySimpleCharacter - Return true if the specified character is
270 : /// obviously the same in translation phase 1 and translation phase 3. This
271 : /// can return false for characters that end up being the same, but it will
272 : /// never return true for something that needs to be mapped.
273 5559562: static bool isObviouslySimpleCharacter(char C) {
274 5559562: return C != '?' && C != '\\';
275 : }
276 :
277 : /// getAndAdvanceChar - Read a single 'character' from the specified buffer,
278 : /// advance over it, and return it. This is tricky in several cases. Here we
279 : /// just handle the trivial case and fall-back to the non-inlined
280 : /// getCharAndSizeSlow method to handle the hard case.
281 3126992: inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) {
282 : // If this is not a trigraph and not a UCN or escaped newline, return
283 : // quickly.
284 3126992: if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
285 :
286 6427: unsigned Size = 0;
287 6427: char C = getCharAndSizeSlow(Ptr, Size, &Tok);
288 6427: Ptr += Size;
289 6427: return C;
290 : }
291 :
292 : private:
293 : /// ConsumeChar - When a character (identified by getCharAndSize) is consumed
294 : /// and added to a given token, check to see if there are diagnostics that
295 : /// need to be emitted or flags that need to be set on the token. If so, do
296 : /// it.
297 1047912: const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) {
298 : // Normal case, the character is a single char wide. Just step past it.
299 1047912: if (Size == 1)
300 1047901: return Ptr+Size;
301 :
302 : // Otherwise, re-lex the character with a current token, allowing
303 : // diagnostics to be emitted and flags to be set.
304 11: Size = 0;
305 11: getCharAndSizeSlow(Ptr, Size, &Tok);
306 11: return Ptr+Size;
307 : }
308 :
309 : /// getCharAndSize - Peek a single 'character' from the specified buffer,
310 : /// get its size, and return it. This is tricky in several cases. Here we
311 : /// just handle the trivial case and fall-back to the non-inlined
312 : /// getCharAndSizeSlow method to handle the hard case.
313 1768246: inline char getCharAndSize(const char *Ptr, unsigned &Size) {
314 : // If this is not a trigraph and not a UCN or escaped newline, return
315 : // quickly.
316 1768246: if (isObviouslySimpleCharacter(Ptr[0])) {
317 1767985: Size = 1;
318 1767985: return *Ptr;
319 : }
320 :
321 261: Size = 0;
322 261: return getCharAndSizeSlow(Ptr, Size);
323 : }
324 :
325 : /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
326 : /// method.
327 : char getCharAndSizeSlow(const char *Ptr, unsigned &Size, Token *Tok = 0);
328 : public:
329 :
330 : /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
331 : /// emit a warning.
332 : static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
333 35378: const LangOptions &Features) {
334 : // If this is not a trigraph and not a UCN or escaped newline, return
335 : // quickly.
336 35378: if (isObviouslySimpleCharacter(Ptr[0])) {
337 35045: Size = 1;
338 35045: return *Ptr;
339 : }
340 :
341 333: Size = 0;
342 333: return getCharAndSizeSlowNoWarn(Ptr, Size, Features);
343 : }
344 :
345 : /// getEscapedNewLineSize - Return the size of the specified escaped newline,
346 : /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
347 : /// to this function.
348 : static unsigned getEscapedNewLineSize(const char *P);
349 :
350 : /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
351 : /// them), skip over them and return the first non-escaped-newline found,
352 : /// otherwise return P.
353 : static const char *SkipEscapedNewLines(const char *P);
354 : private:
355 :
356 : /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
357 : /// diagnostic.
358 : static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
359 : const LangOptions &Features);
360 :
361 : //===--------------------------------------------------------------------===//
362 : // Other lexer functions.
363 :
364 : // Helper functions to lex the remainder of a token of the specific type.
365 : void LexIdentifier (Token &Result, const char *CurPtr);
366 : void LexNumericConstant (Token &Result, const char *CurPtr);
367 : void LexStringLiteral (Token &Result, const char *CurPtr,bool Wide);
368 : void LexAngledStringLiteral(Token &Result, const char *CurPtr);
369 : void LexCharConstant (Token &Result, const char *CurPtr);
370 : bool LexEndOfFile (Token &Result, const char *CurPtr);
371 :
372 : bool SkipWhitespace (Token &Result, const char *CurPtr);
373 : bool SkipBCPLComment (Token &Result, const char *CurPtr);
374 : bool SkipBlockComment (Token &Result, const char *CurPtr);
375 : bool SaveBCPLComment (Token &Result, const char *CurPtr);
376 :
377 : bool IsStartOfConflictMarker(const char *CurPtr);
378 : bool HandleEndOfConflictMarker(const char *CurPtr);
379 : };
380 :
381 :
382 : } // end namespace clang
383 :
384 : #endif
Generated: 2010-02-10 01:31 by zcov