2 * Defines the basic structures used to manipulate character
\r
3 * streams from any input source. The first implementation of
\r
4 * this stream was ASCII 8 bit, but any character size and encoding
\r
5 * can in theory be used, so long as they can return a 32 bit Integer
\r
6 * representation of their characters and efficiently mark and revert
\r
7 * to specific offsets into their input streams.
\r
9 #ifndef _ANTLR3_INPUT_H
\r
10 #define _ANTLR3_INPUT_H
\r
12 // [The "BSD licence"]
\r
13 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
\r
14 // http://www.temporal-wave.com
\r
15 // http://www.linkedin.com/in/jimidle
\r
17 // All rights reserved.
\r
19 // Redistribution and use in source and binary forms, with or without
\r
20 // modification, are permitted provided that the following conditions
\r
22 // 1. Redistributions of source code must retain the above copyright
\r
23 // notice, this list of conditions and the following disclaimer.
\r
24 // 2. Redistributions in binary form must reproduce the above copyright
\r
25 // notice, this list of conditions and the following disclaimer in the
\r
26 // documentation and/or other materials provided with the distribution.
\r
27 // 3. The name of the author may not be used to endorse or promote products
\r
28 // derived from this software without specific prior written permission.
\r
30 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
\r
31 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
\r
32 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
\r
33 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
\r
34 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
\r
35 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
\r
36 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
\r
37 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
\r
38 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
\r
39 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\r
41 #include <antlr3defs.h>
\r
42 #include <antlr3string.h>
\r
43 #include <antlr3commontoken.h>
\r
44 #include <antlr3intstream.h>
\r
50 /// Master context structure for an ANTLR3 C runtime based input stream.
\r
51 /// \ingroup apistructures
\r
53 typedef struct ANTLR3_INPUT_STREAM_struct
\r
55 /** Interfaces that provide streams must all provide
\r
56 * a generic ANTLR3_INT_STREAM interface and an ANTLR3_INPUT_STREAM
\r
59 pANTLR3_INT_STREAM istream;
\r
61 /** Whatever super structure is providing the INPUT stream needs a pointer to itself
\r
62 * so that this can be passed back to it whenever the api functions
\r
63 * are called back from this interface.
\r
67 /// Indicates the size, in 8 bit units, of a single character. Note that
\r
68 /// the C runtime does not deal with surrogates and UTF8 directly as this would be
\r
69 /// slow and complicated. Variable character width inputs are expected to be converted
\r
70 /// into fixed width formats, so that would be a UTF32 format for anything that cannot
\r
71 /// work with a UCS2 encoding, such as UTF-8. Generally you are best
\r
72 /// working internally with 32 bit characters.
\r
74 ANTLR3_UINT8 charByteSize;
\r
76 /** Pointer to the start of the input string; characters may be
\r
77 * taken as offsets from here and in original input format encoding.
\r
81 /** Indicates if the data pointer was allocated by us, and so should be freed
\r
82 * when the stream dies.
\r
86 /** String factory for this input stream
\r
88 pANTLR3_STRING_FACTORY strFactory;
\r
91 /** Pointer to the next character to be consumed from the input data
\r
92 * This is cast to point at the encoding of the original file that
\r
93 * was read by the functions installed as pointer in this input stream
\r
94 * context instance at file/string/whatever load time.
\r
98 /** Number of characters that can be consumed at this point in time.
\r
99 * Mostly this is just what is left in the pre-read buffer, but if the
\r
100 * input source is a stream such as a socket or something then we may
\r
101 * call special read code to wait for more input.
\r
103 ANTLR3_UINT32 sizeBuf;
\r
105 /** The line number we are traversing in the input file. This gets incremented
\r
106 * by a newline() call in the lexer grammar actions.
\r
108 ANTLR3_UINT32 line;
\r
110 /** Pointer into the input buffer where the current line
\r
113 void * currentLine;
\r
115 /** The offset within the current line of the current character
\r
117 ANTLR3_INT32 charPositionInLine;
\r
119 /** Tracks how deep mark() calls are nested
\r
121 ANTLR3_UINT32 markDepth;
\r
123 /** List of mark() points in the input stream
\r
125 pANTLR3_VECTOR markers;
\r
127 /** File name string, set to pointer to memory if
\r
128 * you set it manually as it will be free()d
\r
130 pANTLR3_STRING fileName;
\r
132 /** File number, needs to be set manually to some file index of your devising.
\r
134 ANTLR3_UINT32 fileNo;
\r
136 /** Character that automatically causes an internal line count
\r
139 ANTLR3_UCHAR newlineChar;
\r
144 /** Pointer to function that closes the input stream
\r
146 void (*close) (struct ANTLR3_INPUT_STREAM_struct * input);
\r
147 void (*free) (struct ANTLR3_INPUT_STREAM_struct * input);
\r
149 /** Pointer to function that resets the input stream
\r
151 void (*reset) (struct ANTLR3_INPUT_STREAM_struct * input);
\r
154 * Pointer to function that installs a version of LA that always
\r
155 * returns upper case. Only valid for character streams and creates a case
\r
156 * insensitive lexer if the lexer tokens are described in upper case. The
\r
157 * tokens will preserve case in the token text.
\r
159 void (*setUcaseLA) (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag);
\r
161 /** Pointer to function to return input stream element at 1 based
\r
162 * offset from nextChar. Same as _LA for char stream, but token
\r
163 * streams etc. have one of these that does other stuff of course.
\r
165 void * (*_LT) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_INT32 lt);
\r
167 /** Pointer to function to return the total size of the input buffer. For streams
\r
168 * this may be just the total we have available so far. This means of course that
\r
169 * the input stream must be careful to accumulate enough input so that any backtracking
\r
170 * can be satisfied.
\r
172 ANTLR3_UINT32 (*size) (struct ANTLR3_INPUT_STREAM_struct * input);
\r
174 /** Pointer to function to return a substring of the input stream. String is returned in allocated
\r
175 * memory and is in same encoding as the input stream itself, NOT internal ANTLR3_UCHAR form.
\r
177 pANTLR3_STRING (*substr) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
\r
179 /** Pointer to function to return the current line number in the input stream
\r
181 ANTLR3_UINT32 (*getLine) (struct ANTLR3_INPUT_STREAM_struct * input);
\r
183 /** Pointer to function to return the current line buffer in the input stream
\r
184 * The pointer returned is directly into the input stream so you must copy
\r
185 * it if you wish to manipulate it without damaging the input stream. Encoding
\r
186 * is obviously in the same form as the input stream.
\r
188 * - Note that this function will be inaccurate if setLine is called as there
\r
189 * is no way at the moment to position the input stream at a particular line
\r
192 void * (*getLineBuf) (struct ANTLR3_INPUT_STREAM_struct * input);
\r
194 /** Pointer to function to return the current offset in the current input stream line
\r
196 ANTLR3_UINT32 (*getCharPositionInLine) (struct ANTLR3_INPUT_STREAM_struct * input);
\r
198 /** Pointer to function to set the current line number in the input stream
\r
200 void (*setLine) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 line);
\r
202 /** Pointer to function to set the current position in the current line.
\r
204 void (*setCharPositionInLine) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 position);
\r
206 /** Pointer to function to override the default newline character that the input stream
\r
207 * looks for to trigger the line and offset and line buffer recording information.
\r
209 * - By default the character '\n' will be installed as the newline trigger character. When this
\r
210 * character is seen by the consume() function then the current line number is incremented and the
\r
211 * current line offset is reset to 0. The Pointer for the line of input we are consuming
\r
212 * is updated to point to the next character after this one in the input stream (which means it
\r
213 * may become invalid if the last newline character in the file is seen, so watch out).
\r
214 * - If for some reason you do not want the counters and pointers to be reset, you can set the
\r
215 * character to some impossible character such as '\0' or whatever.
\r
216 * - This is a single character only, so choose the last character in a sequence of two or more.
\r
217 * - This is only a simple aid to error reporting - if you have a complicated binary input structure
\r
218 * it may not be adequate, but you can always override every function in the input stream with your
\r
219 * own of course, and can even write your own complete input stream set if you like.
\r
220 * - It is your responsibility to set a valid character for the input stream type. There is no point
\r
221 * setting this to 0xFFFFFFFF if the input stream is 8 bit ASCII as this will just be truncated and never
\r
222 * trigger as the comparison will be (INT32)0xFF == (INT32)0xFFFFFFFF
\r
224 void (*SetNewLineChar) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 newlineChar);
\r
228 ANTLR3_INPUT_STREAM;
\r
231 /** \brief Structure for tracking lexer input states as part of mark()
\r
232 * and rewind() of lexer.
\r
234 typedef struct ANTLR3_LEX_STATE_struct
\r
236 /** Pointer to the next character to be consumed from the input data
\r
237 * This is cast to point at the encoding of the original file that
\r
238 * was read by the functions installed as pointer in this input stream
\r
239 * context instance at file/string/whatever load time.
\r
243 /** The line number we are traversing in the input file. This gets incremented
\r
244 * by a newline() call in the lexer grammar actions.
\r
246 ANTLR3_UINT32 line;
\r
248 /** Pointer into the input buffer where the current line
\r
251 void * currentLine;
\r
253 /** The offset within the current line of the current character
\r
255 ANTLR3_INT32 charPositionInLine;
\r
262 void antlr3AsciiSetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 type);
\r
263 void antlr3UCS2SetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 type);
\r
264 void antlr3GenericSetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 type);
\r
270 #endif /* _ANTLR3_INPUT_H */
\r