2 /// Base functions to initialize and manipulate a UCS2 input stream
\r
4 #include <antlr3input.h>
\r
6 // [The "BSD licence"]
\r
7 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
\r
8 // http://www.temporal-wave.com
\r
9 // http://www.linkedin.com/in/jimidle
\r
11 // All rights reserved.
\r
13 // Redistribution and use in source and binary forms, with or without
\r
14 // modification, are permitted provided that the following conditions
\r
16 // 1. Redistributions of source code must retain the above copyright
\r
17 // notice, this list of conditions and the following disclaimer.
\r
18 // 2. Redistributions in binary form must reproduce the above copyright
\r
19 // notice, this list of conditions and the following disclaimer in the
\r
20 // documentation and/or other materials provided with the distribution.
\r
21 // 3. The name of the author may not be used to endorse or promote products
\r
22 // derived from this software without specific prior written permission.
\r
24 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
\r
25 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
\r
26 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
\r
27 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
\r
28 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
\r
29 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
\r
30 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
\r
31 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
\r
32 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
\r
33 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\r
// UCS2 int stream API functions; antlr3UCS2SetupStream() installs these on input->istream
37 static void antlr3UCS2Consume (pANTLR3_INT_STREAM is);
\r
38 static ANTLR3_UCHAR antlr3UCS2LA (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
\r
39 static ANTLR3_MARKER antlr3UCS2Index (pANTLR3_INT_STREAM is);
\r
40 static void antlr3UCS2Seek (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
\r
42 // UCS2 char stream API function; antlr3UCS2SetupStream() installs it as input->substr
\r
44 static pANTLR3_STRING antlr3UCS2Substr (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
\r
46 /// \brief Common function to setup function interface for a 16 bit "UCS2" input stream.
\r
/// Installs a UCS2 string factory on the stream, runs the generic stream setup, then
/// overrides consume, _LA, index, seek and substr with the UCS2 versions in this file
/// and sets charByteSize to 2.
48 /// \param input Input stream context pointer
/// \param type Stream type value; it is forwarded unchanged to antlr3GenericSetupStream()
\r
51 /// - Strictly speaking, there is no such thing as a UCS2 input stream as the term
\r
52 /// tends to confuse the notions of character encoding, Unicode and so on. However
\r
53 /// because there will possibly be a need for a UTF-16 stream, I needed to identify 16 bit
\r
54 /// streams that do not support surrogate encodings and UCS2 is how it is mostly referred to.
\r
55 /// For instance Java, Oracle and others use a 16 bit encoding of characters and so this type
\r
56 /// of stream is very common.
\r
57 /// Take it to mean, therefore, a straight 16 bit uncomplicated encoding of Unicode code points.
\r
60 antlr3UCS2SetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 type)
\r
62 // Build a string factory for this stream. This is a 16 bit string "UCS2" factory which is a standard
\r
63 // part of the ANTLR3 string. The string factory is then passed through the whole chain of lexer->parser->tree->treeparser
\r
66 input->strFactory = antlr3UCS2StringFactoryNew();
\r
68 // Install function pointers for an 8 bit ASCII input, which are good for almost
\r
69 // all input stream functions. We will then override those that won't work for 16 bit characters.
// The overrides below must come after this call so that they are not replaced by the generic ones.
\r
71 antlr3GenericSetupStream (input, type);
\r
73 // Intstream API overrides for UCS2
\r
75 input->istream->consume = antlr3UCS2Consume; // Consume the next 16 bit character in the buffer
\r
76 input->istream->_LA = antlr3UCS2LA; // Return the UTF32 character at offset n (1 based)
\r
77 input->istream->index = antlr3UCS2Index; // Calculate current index in input stream, 16 bit based
\r
78 input->istream->seek = antlr3UCS2Seek; // How to seek to a specific point in the stream
\r
80 // Charstream API overrides for UCS2
\r
82 input->substr = antlr3UCS2Substr; // Return a string from the input stream
\r
84 input->charByteSize = 2; // Size in bytes of characters in this stream.
\r
88 /// \brief Consume the next character in a 16 bit UCS2 input stream
\r
/// If nextChar is already at or beyond the end bound, nothing is consumed. Otherwise
/// charPositionInLine is incremented; if the character being consumed equals
/// input->newlineChar, charPositionInLine is reset to 0 and currentLine is pointed at the
/// character following the newline. Finally nextChar is advanced by one 16 bit unit.
/// NOTE(review): the braces of this function are not visible in this listing, so the exact
/// extent of each guard is read from the comments and statement order - confirm.
90 /// \param is Int stream context pointer; is->super is cast to the owning input stream
\r
93 antlr3UCS2Consume(pANTLR3_INT_STREAM is)
\r
95 pANTLR3_INPUT_STREAM input;
\r
97 input = ((pANTLR3_INPUT_STREAM) (is->super));
\r
// End bound is data + sizeBuf with the addition done in 16 bit units.
// NOTE(review): this is only the true end of the buffer if sizeBuf counts 16 bit characters
// rather than bytes - confirm against the code that fills in sizeBuf.
99 if ((pANTLR3_UINT16)(input->nextChar) < (((pANTLR3_UINT16)input->data) + input->sizeBuf))
\r
101 // Indicate one more character in this line
\r
103 input->charPositionInLine++;
\r
// Compare the 16 bit character about to be consumed, widened to ANTLR3_UCHAR, with the newline character
105 if ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar)
\r
107 // Reset for start of a new line of input
// NOTE(review): no line counter update is visible in this listing - confirm whether one is expected here.
\r
110 input->charPositionInLine = 0;
\r
111 input->currentLine = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
\r
114 // Increment to next character position (one 16 bit unit, i.e. 2 bytes)
\r
116 input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
\r
120 /// \brief Return the input element at a lookahead offset, reading the input as 16 bit UCS2 characters
\r
122 /// \param[in] is Int stream context pointer; is->super is cast to the owning input stream
\r
123 /// \param[in] la 1 based offset of next input stream element
\r
125 /// \return The 16 bit character at nextChar + la - 1, widened to the internal ANTLR3 encoding (UTF32),
/// or ANTLR3_CHARSTREAM_EOF if that position is at or beyond the end bound (data + sizeBuf)
///
/// NOTE(review): only the upper bound is checked. For la <= 0 the read position lies before
/// nextChar and is not compared against input->data.
\r
127 static ANTLR3_UCHAR
\r
128 antlr3UCS2LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
\r
130 pANTLR3_INPUT_STREAM input;
\r
132 input = ((pANTLR3_INPUT_STREAM) (is->super));
\r
// Pointer arithmetic is in 16 bit units on both sides of the comparison.
// NOTE(review): the bound is only correct if sizeBuf counts 16 bit characters rather than bytes - confirm.
134 if (( ((pANTLR3_UINT16)input->nextChar) + la - 1) >= (((pANTLR3_UINT16)input->data) + input->sizeBuf))
\r
136 return ANTLR3_CHARSTREAM_EOF;
\r
140 return (ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar + la - 1));
\r
145 /// \brief Calculate the current index in the input stream.
\r
146 /// \param[in] is Int stream context pointer; is->super is cast to the owning input stream
/// \return input->nextChar cast to ANTLR3_MARKER, i.e. the marker is the pointer value of the
/// next character to be consumed, not a character count from the start of the buffer
\r
148 static ANTLR3_MARKER
\r
149 antlr3UCS2Index(pANTLR3_INT_STREAM is)
\r
151 pANTLR3_INPUT_STREAM input;
\r
153 input = ((pANTLR3_INPUT_STREAM) (is->super));
\r
155 return (ANTLR3_MARKER)(input->nextChar);
\r
158 /// \brief Move the lexer input to the position specified by the supplied seek point.
\r
160 /// \param[in] is Int stream context pointer; is->super is cast to the owning input stream
/// \param[in] seekPoint Target position; it is compared with and assigned to nextChar as a pointer value
\r
163 /// Assumes a 16 bit (UCS2) input stream: a forward distance in bytes is divided by 2 to get a character count.
\r
166 antlr3UCS2Seek (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
\r
168 ANTLR3_INT32 count;
\r
169 pANTLR3_INPUT_STREAM input;
\r
171 input = ((pANTLR3_INPUT_STREAM) is->super);
\r
173 // If the requested seek point is less than or equal to the current
\r
174 // input point, then we assume that we are resetting from a mark
\r
175 // and do not need to scan, but can just set to there.
\r
177 if (seekPoint <= (ANTLR3_MARKER)(input->nextChar))
\r
179 input->nextChar = (void *)seekPoint;
\r
// Forward seek: number of 16 bit characters between the current position and seekPoint.
// NOTE(review): no statement that uses count is visible in this listing; presumably the
// forward case consumes count characters one at a time - confirm against the complete file.
183 count = (ANTLR3_UINT32)((seekPoint - (ANTLR3_MARKER)(input->nextChar)) / 2); // 16 bits per character in UCS2
\r
191 /// \brief Return a substring of the ucs2 (16 bit) input stream in
\r
192 /// newly allocated memory.
\r
194 /// \param input Input stream context pointer; its strFactory is used to build the string
\r
195 /// \param start Marker where the string starts; it is cast to a pointer and passed as the source address
\r
196 /// \param stop Marker of the last character of the string (inclusive).
/// \return The string returned by input->strFactory->newPtr() for ((stop - start) / 2) + 1 characters
///
/// The difference between the markers is halved because each character occupies 2 bytes.
/// NOTE(review): this presumes start and stop are pointer-valued markers such as those
/// produced by antlr3UCS2Index() and that stop >= start - confirm with callers.
\r
198 static pANTLR3_STRING
\r
199 antlr3UCS2Substr (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
\r
201 return input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, ((ANTLR3_UINT32_CAST(stop - start))/2) + 1);
\r