2 /// Base functions to initialize and manipulate any input stream
\r
5 // [The "BSD licence"]
\r
6 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
\r
7 // http://www.temporal-wave.com
\r
8 // http://www.linkedin.com/in/jimidle
\r
10 // All rights reserved.
\r
12 // Redistribution and use in source and binary forms, with or without
\r
13 // modification, are permitted provided that the following conditions
\r
15 // 1. Redistributions of source code must retain the above copyright
\r
16 // notice, this list of conditions and the following disclaimer.
\r
17 // 2. Redistributions in binary form must reproduce the above copyright
\r
18 // notice, this list of conditions and the following disclaimer in the
\r
19 // documentation and/or other materials provided with the distribution.
\r
20 // 3. The name of the author may not be used to endorse or promote products
\r
21 // derived from this software without specific prior written permission.
\r
23 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
\r
24 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
\r
25 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
\r
26 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
\r
27 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
\r
28 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
\r
29 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
\r
30 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
\r
31 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
\r
32 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\r
34 #include <antlr3input.h>
\r
39 static void antlr3AsciiConsume (pANTLR3_INT_STREAM is);
\r
40 static ANTLR3_UCHAR antlr3AsciiLA (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
\r
41 static ANTLR3_UCHAR antlr3AsciiLA_ucase (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
\r
42 static ANTLR3_MARKER antlr3AsciiIndex (pANTLR3_INT_STREAM is);
\r
43 static ANTLR3_MARKER antlr3AsciiMark (pANTLR3_INT_STREAM is);
\r
44 static void antlr3AsciiRewind (pANTLR3_INT_STREAM is, ANTLR3_MARKER mark);
\r
45 static void antlr3AsciiRewindLast (pANTLR3_INT_STREAM is);
\r
46 static void antlr3AsciiRelease (pANTLR3_INT_STREAM is, ANTLR3_MARKER mark);
\r
47 static void antlr3AsciiSeek (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
\r
48 static pANTLR3_STRING antlr3AsciiGetSourceName (pANTLR3_INT_STREAM is);
\r
50 // ASCII Charstream API functions
\r
52 static void antlr3InputClose (pANTLR3_INPUT_STREAM input);
\r
53 static void antlr3InputReset (pANTLR3_INPUT_STREAM input);
\r
54 static void * antlr3AsciiLT (pANTLR3_INPUT_STREAM input, ANTLR3_INT32 lt);
\r
55 static ANTLR3_UINT32 antlr3AsciiSize (pANTLR3_INPUT_STREAM input);
\r
56 static pANTLR3_STRING antlr3AsciiSubstr (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
\r
57 static ANTLR3_UINT32 antlr3AsciiGetLine (pANTLR3_INPUT_STREAM input);
\r
58 static void * antlr3AsciiGetLineBuf (pANTLR3_INPUT_STREAM input);
\r
59 static ANTLR3_UINT32 antlr3AsciiGetCharPosition (pANTLR3_INPUT_STREAM input);
\r
60 static void antlr3AsciiSetLine (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 line);
\r
61 static void antlr3AsciiSetCharPosition (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 position);
\r
62 static void antlr3AsciiSetNewLineChar (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 newlineChar);
\r
63 static void antlr3AsciiSetUcaseLA (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag);
\r
65 /// \brief Common function to setup function interface for an 8 bit ASCII input stream.
\r
67 /// \param input Input stream context pointer
\r
70 /// - Many of the 8 bit ASCII oriented file stream handling functions will be usable
\r
71 /// by any or at least some other input streams. Therefore it is perfectly acceptable
\r
72 /// to call this function to install the ASCII handler then override just those functions
\r
73 /// that would not work for the particular input encoding, such as consume for instance.
\r
76 antlr3AsciiSetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 type)
\r
78 // Build a string factory for this stream
\r
80 input->strFactory = antlr3StringFactoryNew();
\r
82 // Default stream set up is for ASCII, therefore there is nothing else
\r
83 // to do but set it up as such
\r
85 antlr3GenericSetupStream(input, type);
\r
90 antlr3GenericSetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 type)
\r
93 /* Install function pointers for an 8 bit ASCII input
\r
96 /* Allocate stream interface
\r
98 input->istream = antlr3IntStreamNew();
\r
99 input->istream->type = ANTLR3_CHARSTREAM;
\r
100 input->istream->super = input;
\r
102 input->istream->type = type;
\r
106 input->istream->consume = antlr3AsciiConsume; /* Consume the next 8 bit character in the buffer */
\r
107 input->istream->_LA = antlr3AsciiLA; /* Return the UTF32 character at offset n (1 based) */
\r
108 input->istream->index = antlr3AsciiIndex; /* Current index (offset from first character */
\r
109 input->istream->mark = antlr3AsciiMark; /* Record the current lex state for later restore */
\r
110 input->istream->rewind = antlr3AsciiRewind; /* How to rewind the input */
\r
111 input->istream->rewindLast = antlr3AsciiRewindLast; /* How to rewind the input */
\r
112 input->istream->seek = antlr3AsciiSeek; /* How to seek to a specific point in the stream */
\r
113 input->istream->release = antlr3AsciiRelease; /* Reset marks after mark n */
\r
114 input->istream->getSourceName = antlr3AsciiGetSourceName; // Return a string that names the input source
\r
118 input->close = antlr3InputClose; /* Close down the stream completely */
\r
119 input->free = antlr3InputClose; /* Synonym for free */
\r
120 input->reset = antlr3InputReset; /* Reset input to start */
\r
121 input->_LT = antlr3AsciiLT; /* Same as _LA for 8 bit Ascii file */
\r
122 input->size = antlr3AsciiSize; /* Return the size of the input buffer */
\r
123 input->substr = antlr3AsciiSubstr; /* Return a string from the input stream */
\r
124 input->getLine = antlr3AsciiGetLine; /* Return the current line number in the input stream */
\r
125 input->getLineBuf = antlr3AsciiGetLineBuf; /* Return a pointer to the start of the current line being consumed */
\r
126 input->getCharPositionInLine = antlr3AsciiGetCharPosition; /* Return the offset into the current line of input */
\r
127 input->setLine = antlr3AsciiSetLine; /* Set the input stream line number (does not set buffer pointers) */
\r
128 input->setCharPositionInLine = antlr3AsciiSetCharPosition; /* Set the offset in to the current line (does not set any pointers ) */
\r
129 input->SetNewLineChar = antlr3AsciiSetNewLineChar; /* Set the value of the newline trigger character */
\r
130 input->setUcaseLA = antlr3AsciiSetUcaseLA;
\r
132 input->charByteSize = 1; // Size in bytes of characters in this stream.
\r
134 /* Initialize entries for tables etc
\r
136 input->markers = NULL;
\r
138 /* Set up the input stream brand new
\r
140 input->reset(input);
\r
142 /* Install default line separator character (it can be replaced
\r
143 * by the grammar programmer later)
\r
145 input->SetNewLineChar(input, (ANTLR3_UCHAR)'\n');
\r
148 static pANTLR3_STRING
\r
149 antlr3AsciiGetSourceName(pANTLR3_INT_STREAM is)
\r
151 return is->streamName;
\r
154 /** \brief Close down an input stream and free any memory allocated by it.
\r
156 * \param input Input stream context pointer
\r
159 antlr3InputClose(pANTLR3_INPUT_STREAM input)
\r
161 // Close any markers in the input stream
\r
163 if (input->markers != NULL)
\r
165 input->markers->free(input->markers);
\r
166 input->markers = NULL;
\r
169 // Close the string factory
\r
171 if (input->strFactory != NULL)
\r
173 input->strFactory->close(input->strFactory);
\r
176 // Free the input stream buffer if we allocated it
\r
178 if (input->isAllocated && input->data != NULL)
\r
180 ANTLR3_FREE(input->data);
\r
181 input->data = NULL;
\r
184 input->istream->free(input->istream);
\r
186 // Finally, free the space for the structure itself
\r
188 ANTLR3_FREE(input);
\r
195 antlr3AsciiSetUcaseLA (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag)
\r
199 // Return the upper case version of the characters
\r
201 input->istream->_LA = antlr3AsciiLA_ucase;
\r
205 // Return the raw characters as they are in the buffer
\r
207 input->istream->_LA = antlr3AsciiLA;
\r
212 /** \brief Reset a re-startable input stream to the start
\r
214 * \param input Input stream context pointer
\r
217 antlr3InputReset(pANTLR3_INPUT_STREAM input)
\r
220 input->nextChar = input->data; /* Input at first character */
\r
221 input->line = 1; /* starts at line 1 */
\r
222 input->charPositionInLine = -1;
\r
223 input->currentLine = input->data;
\r
224 input->markDepth = 0; /* Reset markers */
\r
226 /* Free up the markers table if it is there
\r
228 if (input->markers != NULL)
\r
230 input->markers->free(input->markers);
\r
233 /* Install a new markers table
\r
235 input->markers = antlr3VectorNew(0);
\r
238 /** \brief Consume the next character in an 8 bit ASCII input stream
\r
240 * \param input Input stream context pointer
\r
243 antlr3AsciiConsume(pANTLR3_INT_STREAM is)
\r
245 pANTLR3_INPUT_STREAM input;
\r
247 input = ((pANTLR3_INPUT_STREAM) (is->super));
\r
249 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
\r
251 /* Indicate one more character in this line
\r
253 input->charPositionInLine++;
\r
255 if ((ANTLR3_UCHAR)(*((pANTLR3_UINT8)input->nextChar)) == input->newlineChar)
\r
257 /* Reset for start of a new line of input
\r
260 input->charPositionInLine = 0;
\r
261 input->currentLine = (void *)(((pANTLR3_UINT8)input->nextChar) + 1);
\r
264 /* Increment to next character position
\r
266 input->nextChar = (void *)(((pANTLR3_UINT8)input->nextChar) + 1);
\r
270 /** \brief Return the input element assuming an 8 bit ascii input
\r
272 * \param[in] input Input stream context pointer
\r
273 * \param[in] la 1 based offset of next input stream element
\r
275 * \return Next input character in internal ANTLR3 encoding (UTF32)
\r
277 static ANTLR3_UCHAR
\r
278 antlr3AsciiLA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
\r
280 pANTLR3_INPUT_STREAM input;
\r
282 input = ((pANTLR3_INPUT_STREAM) (is->super));
\r
284 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
\r
286 return ANTLR3_CHARSTREAM_EOF;
\r
290 return (ANTLR3_UCHAR)(*((pANTLR3_UINT8)input->nextChar + la - 1));
\r
294 /** \brief Return the input element assuming an 8 bit ASCII input and
\r
295 * always return the UPPER CASE character.
\r
296 * Note that this is 8 bit and so we assume that the toupper
\r
297 * function will use the correct locale for 8 bits.
\r
299 * \param[in] input Input stream context pointer
\r
300 * \param[in] la 1 based offset of next input stream element
\r
302 * \return Next input character in internal ANTLR3 encoding (UTF32)
\r
304 static ANTLR3_UCHAR
\r
305 antlr3AsciiLA_ucase (pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
\r
307 pANTLR3_INPUT_STREAM input;
\r
309 input = ((pANTLR3_INPUT_STREAM) (is->super));
\r
311 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
\r
313 return ANTLR3_CHARSTREAM_EOF;
\r
317 return (ANTLR3_UCHAR)toupper((*((pANTLR3_UINT8)input->nextChar + la - 1)));
\r
322 /** \brief Return the input element assuming an 8 bit ascii input
\r
324 * \param[in] input Input stream context pointer
\r
325 * \param[in] lt 1 based offset of next input stream element
\r
327 * \return Next input character in internal ANTLR3 encoding (UTF32)
\r
330 antlr3AsciiLT(pANTLR3_INPUT_STREAM input, ANTLR3_INT32 lt)
\r
332 /* Casting is horrible but it means no warnings and LT should never be called
\r
333 * on a character stream anyway I think. If it is then, the void * will need to be
\r
334 * cast back in a similar manner. Yuck! But this means that LT for Token streams and
\r
335 * tree streams is correct.
\r
337 return (ANTLR3_FUNC_PTR(input->istream->_LA(input->istream, lt)));
\r
340 /** \brief Calculate the current index in the output stream.
\r
341 * \param[in] input Input stream context pointer
\r
343 static ANTLR3_MARKER
\r
344 antlr3AsciiIndex(pANTLR3_INT_STREAM is)
\r
346 pANTLR3_INPUT_STREAM input;
\r
348 input = ((pANTLR3_INPUT_STREAM) (is->super));
\r
350 return (ANTLR3_MARKER)(((pANTLR3_UINT8)input->nextChar));
\r
353 /** \brief Return the size of the current input stream, as an Ascii file
\r
354 * which in this case is the total input. Other implementations may provide
\r
355 * more sophisticated implementations to deal with non-recoverable streams
\r
358 * \param[in] input Input stream context pointer
\r
360 static ANTLR3_UINT32
\r
361 antlr3AsciiSize(pANTLR3_INPUT_STREAM input)
\r
363 return input->sizeBuf;
\r
366 /** \brief Mark the current input point in an Ascii 8 bit stream
\r
367 * such as a file stream, where all the input is available in the
\r
370 * \param[in] is Input stream context pointer
\r
372 static ANTLR3_MARKER
\r
373 antlr3AsciiMark (pANTLR3_INT_STREAM is)
\r
375 pANTLR3_LEX_STATE state;
\r
376 pANTLR3_INPUT_STREAM input;
\r
378 input = ((pANTLR3_INPUT_STREAM) (is->super));
\r
382 input->markDepth++;
\r
384 /* See if we are revisiting a mark as we can just reuse the vector
\r
385 * entry if we are, otherwise, we need a new one
\r
387 if (input->markDepth > input->markers->count)
\r
389 state = ANTLR3_MALLOC(sizeof(ANTLR3_LEX_STATE));
\r
391 /* Add it to the table
\r
393 input->markers->add(input->markers, state, ANTLR3_FREE_FUNC); /* No special structure, just free() on delete */
\r
397 state = (pANTLR3_LEX_STATE)input->markers->get(input->markers, input->markDepth - 1);
\r
399 /* Assume no errors for speed, it will just blow up if the table failed
\r
400 * for some reasons, hence lots of unit tests on the tables ;-)
\r
404 /* We have created or retrieved the state, so update it with the current
\r
405 * elements of the lexer state.
\r
407 state->charPositionInLine = input->charPositionInLine;
\r
408 state->currentLine = input->currentLine;
\r
409 state->line = input->line;
\r
410 state->nextChar = input->nextChar;
\r
412 is->lastMarker = input->markDepth;
\r
416 return input->markDepth;
\r
418 /** \brief Rewind the lexer input to the state specified by the last produced mark.
\r
420 * \param[in] input Input stream context pointer
\r
423 * Assumes ASCII (or at least, 8 Bit) input stream.
\r
426 antlr3AsciiRewindLast (pANTLR3_INT_STREAM is)
\r
428 is->rewind(is, is->lastMarker);
\r
431 /** \brief Rewind the lexer input to the state specified by the supplied mark.
\r
433 * \param[in] input Input stream context pointer
\r
436 * Assumes ASCII (or at least, 8 Bit) input stream.
\r
439 antlr3AsciiRewind (pANTLR3_INT_STREAM is, ANTLR3_MARKER mark)
\r
441 pANTLR3_LEX_STATE state;
\r
442 pANTLR3_INPUT_STREAM input;
\r
444 input = ((pANTLR3_INPUT_STREAM) is->super);
\r
446 /* Perform any clean up of the marks
\r
448 input->istream->release(input->istream, mark);
\r
450 /* Find the supplied mark state
\r
452 state = (pANTLR3_LEX_STATE)input->markers->get(input->markers, (ANTLR3_UINT32)(mark - 1));
\r
454 /* Seek input pointer to the requested point (note we supply the void *pointer
\r
455 * to whatever is implementing the int stream to seek).
\r
457 antlr3AsciiSeek(is, (ANTLR3_MARKER)(state->nextChar));
\r
459 /* Reset to the reset of the information in the mark
\r
461 input->charPositionInLine = state->charPositionInLine;
\r
462 input->currentLine = state->currentLine;
\r
463 input->line = state->line;
\r
464 input->nextChar = state->nextChar;
\r
470 /** \brief Rewind the lexer input to the state specified by the supplied mark.
\r
472 * \param[in] input Input stream context pointer
\r
475 * Assumes ASCII (or at least, 8 Bit) input stream.
\r
478 antlr3AsciiRelease (pANTLR3_INT_STREAM is, ANTLR3_MARKER mark)
\r
480 pANTLR3_INPUT_STREAM input;
\r
482 input = ((pANTLR3_INPUT_STREAM) (is->super));
\r
484 /* We don't do much here in fact as we never free any higher marks in
\r
485 * the hashtable as we just resuse any memory allocated for them.
\r
487 input->markDepth = (ANTLR3_UINT32)(mark - 1);
\r
490 /** \brief Rewind the lexer input to the state specified by the supplied mark.
\r
492 * \param[in] input Input stream context pointer
\r
495 * Assumes ASCII (or at least, 8 Bit) input stream.
\r
498 antlr3AsciiSeek (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
\r
500 ANTLR3_INT32 count;
\r
501 pANTLR3_INPUT_STREAM input;
\r
503 input = ANTLR3_FUNC_PTR(((pANTLR3_INPUT_STREAM) is->super));
\r
505 /* If the requested seek point is less than the current
\r
506 * input point, then we assume that we are resetting from a mark
\r
507 * and do not need to scan, but can just set to there.
\r
509 if (seekPoint <= (ANTLR3_MARKER)(input->nextChar))
\r
511 input->nextChar = ((pANTLR3_UINT8) seekPoint);
\r
515 count = (ANTLR3_UINT32)(seekPoint - (ANTLR3_MARKER)(input->nextChar));
\r
523 /** Return a substring of the ASCII (8 bit) input stream in
\r
524 * newly allocated memory.
\r
526 * \param input Input stream context pointer
\r
527 * \param start Offset in input stream where the string starts
\r
528 * \param stop Offset in the input stream where the string ends.
\r
530 static pANTLR3_STRING
\r
531 antlr3AsciiSubstr (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
\r
533 return input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, (ANTLR3_UINT32)(stop - start + 1));
\r
536 /** \brief Return the line number as understood by the 8 bit/ASCII input stream.
\r
538 * \param input Input stream context pointer
\r
539 * \return Line number in input stream that we believe we are working on.
\r
541 static ANTLR3_UINT32
\r
542 antlr3AsciiGetLine (pANTLR3_INPUT_STREAM input)
\r
544 return input->line;
\r
547 /** Return a pointer into the input stream that points at the start
\r
548 * of the current input line as triggered by the end of line character installed
\r
549 * for the stream ('\n' unless told differently).
\r
551 * \param[in] input
\r
554 antlr3AsciiGetLineBuf (pANTLR3_INPUT_STREAM input)
\r
556 return input->currentLine;
\r
559 /** Return the current offset in to the current line in the input stream.
\r
561 * \param input Input stream context pointer
\r
562 * \return Current line offset
\r
564 static ANTLR3_UINT32
\r
565 antlr3AsciiGetCharPosition (pANTLR3_INPUT_STREAM input)
\r
567 return input->charPositionInLine;
\r
570 /** Set the current line number as understood by the input stream.
\r
572 * \param input Input stream context pointer
\r
573 * \param line Line number to tell the input stream we are on
\r
576 * This function does not change any pointers, it just allows the programmer to set the
\r
577 * line number according to some external criterion, such as finding a lexed directive
\r
578 * like: #nnn "file.c" for instance, such that error reporting and so on in is in sync
\r
579 * with some original source format.
\r
582 antlr3AsciiSetLine (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 line)
\r
584 input->line = line;
\r
587 /** Set the current offset in the current line to be a particular setting.
\r
589 * \param[in] input Input stream context pointer
\r
590 * \param[in] position New setting for current offset.
\r
593 * This does not set the actual pointers in the input stream, it is purely for reporting
\r
594 * purposes and so on as per antlr3AsciiSetLine();
\r
597 antlr3AsciiSetCharPosition (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 position)
\r
599 input->charPositionInLine = position;
\r
602 /** Set the newline trigger character in the input stream to the supplied parameter.
\r
604 * \param[in] input Input stream context pointer
\r
605 * \param[in] newlineChar Character to set to be the newline trigger.
\r
608 * - The supplied newLineChar is in UTF32 encoding (which means ASCII and latin1 etc
\r
609 * are the same encodings), but the input stream catered to by this function is 8 bit
\r
610 * only, so it is up to the programmer to ensure that the character supplied is valid.
\r
613 antlr3AsciiSetNewLineChar (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 newlineChar)
\r
615 input->newlineChar = newlineChar;
\r