2 * Base interface for any ANTLR3 lexer.
\r
4 * An ANTLR3 lexer builds from two sets of components:
\r
6 * - The runtime components that provide common functionality such as
\r
7 * traversing character streams, building tokens for output and so on.
\r
8 * - The generated rules and structure of the actual lexer, which call upon the
\r
9 * runtime components.
\r
11 * A lexer class contains a character input stream, a base recognizer interface
\r
12 * (which it will normally implement) and a token source interface (which it also
\r
13 * implements). The token source interface is called by a token consumer (such as
\r
14 * a parser, but in theory it can be anything that wants a set of abstract
\r
15 * tokens in place of a raw character stream).
\r
17 * So then, we set up a lexer in a sequence akin to:
\r
19 * - Create a character stream (something which implements ANTLR3_INPUT_STREAM)
\r
20 * and initialize it.
\r
21 * - Create a lexer interface and tell it where its input stream is.
\r
22 * This will cause the creation of a base recognizer class, which it will
\r
23 * override with its own implementations of some methods. The lexer creator
\r
24 * can also then in turn override anything it likes.
\r
25 * - The lexer token source interface is then passed to some interface that
\r
26 * knows how to use it, by calling for a next token.
\r
27 * - When a next token is called, let ze lexing begin.
\r
30 #ifndef _ANTLR3_LEXER
\r
31 #define _ANTLR3_LEXER
\r
33 // [The "BSD licence"]
\r
34 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
\r
35 // http://www.temporal-wave.com
\r
36 // http://www.linkedin.com/in/jimidle
\r
38 // All rights reserved.
\r
40 // Redistribution and use in source and binary forms, with or without
\r
41 // modification, are permitted provided that the following conditions
\r
43 // 1. Redistributions of source code must retain the above copyright
\r
44 // notice, this list of conditions and the following disclaimer.
\r
45 // 2. Redistributions in binary form must reproduce the above copyright
\r
46 // notice, this list of conditions and the following disclaimer in the
\r
47 // documentation and/or other materials provided with the distribution.
\r
48 // 3. The name of the author may not be used to endorse or promote products
\r
49 // derived from this software without specific prior written permission.
\r
51 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
\r
52 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
\r
53 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
\r
54 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
\r
55 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
\r
56 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
\r
57 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
\r
58 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
\r
59 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
\r
60 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\r
64 #define ANTLR3_STRING_TERMINATOR 0xFFFFFFFF
\r
66 #include <antlr3defs.h>
\r
67 #include <antlr3input.h>
\r
68 #include <antlr3commontoken.h>
\r
69 #include <antlr3tokenstream.h>
\r
70 #include <antlr3baserecognizer.h>
\r
76 typedef struct ANTLR3_LEXER_struct
\r
78 /** If there is a super structure that is implementing the
\r
79 * lexer, then a pointer to it can be stored here in case
\r
80 * implementing functions are overridden by this super structure.
\r
84 /** A generated lexer has an mTokens() function, which needs
\r
85 * the context pointer of the generated lexer, not the base lexer interface
\r
86 * this is stored here and initialized by the generated code (or manually
\r
87 * if this is a manually built lexer).
\r
91 /** A pointer to the character stream whence this lexer is receiving
\r
93 * TODO: I may come back to this and implement charstream outside
\r
94 * the input stream as per the java implementation.
\r
96 pANTLR3_INPUT_STREAM input;
\r
98 /** Pointer to the implementation of a base recognizer, which the lexer
\r
99 * creates and then overrides with its own lexer oriented functions (the
\r
100 * default implementation is parser oriented). This also contains a
\r
101 * token source interface, which the lexer instance will provide to anything
\r
102 * that needs it, which is anything else that implements a base recognizer,
\r
103 * such as a parser.
\r
105 pANTLR3_BASE_RECOGNIZER rec;
\r
107 /** Pointer to a function that sets the charstream source for the lexer and
\r
108 * causes it to be reset.
\r
110 void (*setCharStream) (struct ANTLR3_LEXER_struct * lexer, pANTLR3_INPUT_STREAM input);
\r
112 /** Pointer to a function that switches the current character input stream to
\r
113 * a new one, saving the old one, which we will revert to at the end of this
\r
116 void (*pushCharStream) (struct ANTLR3_LEXER_struct * lexer, pANTLR3_INPUT_STREAM input);
\r
118 /** Pointer to a function that abandons the current input stream, whether it
\r
119 * is empty or not and reverts to the previous stacked input stream.
\r
121 void (*popCharStream) (struct ANTLR3_LEXER_struct * lexer);
\r
123 /** Pointer to a function that emits the supplied token as the next token in
\r
126 void (*emitNew) (struct ANTLR3_LEXER_struct * lexer, pANTLR3_COMMON_TOKEN token);
\r
128 /** Pointer to a function that constructs a new token from the lexer stored information
\r
130 pANTLR3_COMMON_TOKEN (*emit) (struct ANTLR3_LEXER_struct * lexer);
\r
132 /** Pointer to the user provided (either manually or through code generation)
\r
133 * function that causes the lexer rules to run the lexing rules and produce
\r
134 * the next token if there is one. This is called from nextToken() in the
\r
135 * pANTLR3_TOKEN_SOURCE. Note that the input parameter for this function is
\r
136 * the generated lexer context (stored in ctx in this interface); it is a generated
\r
137 * function and expects the context to be the generated lexer.
\r
139 void (*mTokens) (void * ctx);
\r
141 /** Pointer to a function that attempts to match and consume the specified string from the input
\r
142 * stream. Note that strings must be passed as terminated arrays of ANTLR3_UCHAR. Strings are terminated
\r
143 * with 0xFFFFFFFF, which is an invalid UTF32 character
\r
145 ANTLR3_BOOLEAN (*matchs) (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR * string);
\r
147 /** Pointer to a function that matches and consumes the specified character from the input stream.
\r
148 * As the input stream is required to provide characters via LA() as UTF32 characters it does not
\r
149 * need to provide an implementation if it is not sourced from 8 bit ASCII. The default lexer
\r
150 * implementation is source encoding agnostic, unless for some reason it takes two 32 bit characters
\r
151 * to specify a single character, in which case the input stream and the lexer rules would have to match
\r
152 * in encoding and then it would work 'by accident' anyway.
\r
154 ANTLR3_BOOLEAN (*matchc) (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR c);
\r
156 /** Pointer to a function that matches any character in the supplied range (I suppose it could be a token range too
\r
157 * but this would only be useful if the tokens were in some guaranteed order which is
\r
158 * only going to happen with a hand crafted token set).
\r
160 ANTLR3_BOOLEAN (*matchRange) (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR low, ANTLR3_UCHAR high);
\r
162 /** Pointer to a function that matches the next token/char in the input stream
\r
163 * regardless of what it actually is.
\r
165 void (*matchAny) (struct ANTLR3_LEXER_struct * lexer);
\r
167 /** Pointer to a function that recovers from an error found in the input stream.
\r
168 * Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also
\r
169 * be from a mismatched token that the (*match)() could not recover from.
\r
171 void (*recover) (struct ANTLR3_LEXER_struct * lexer);
\r
173 /** Pointer to function to return the current line number in the input stream
\r
175 ANTLR3_UINT32 (*getLine) (struct ANTLR3_LEXER_struct * lexer);
\r
176 ANTLR3_MARKER (*getCharIndex) (struct ANTLR3_LEXER_struct * lexer);
\r
177 ANTLR3_UINT32 (*getCharPositionInLine)(struct ANTLR3_LEXER_struct * lexer);
\r
179 /** Pointer to function to return the text so far for the current token being generated
\r
181 pANTLR3_STRING (*getText) (struct ANTLR3_LEXER_struct * lexer);
\r
184 /** Pointer to a function that knows how to free the resources of a lexer
\r
186 void (*free) (struct ANTLR3_LEXER_struct * lexer);
\r