/** \file * Contains the base functions that all recognizers require. * Any function can be overridden by a lexer/parser/tree parser or by the * ANTLR3 programmer. * * \addtogroup pANTLR3_BASE_RECOGNIZER * @{ */ #include // [The "BSD licence"] // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC // http://www.temporal-wave.com // http://www.linkedin.com/in/jimidle // // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // 1. Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // 3. The name of the author may not be used to endorse or promote products // derived from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifdef ANTLR3_WINDOWS #pragma warning( disable : 4100 ) #endif /* Interface functions -standard implementations cover parser and treeparser * almost completely but are overridden by the parser or tree parser as needed. Lexer overrides * most of these functions. */ static void beginResync (pANTLR3_BASE_RECOGNIZER recognizer); static pANTLR3_BITSET computeErrorRecoverySet (pANTLR3_BASE_RECOGNIZER recognizer); static void endResync (pANTLR3_BASE_RECOGNIZER recognizer); static void beginBacktrack (pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_UINT32 level); static void endBacktrack (pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_UINT32 level, ANTLR3_BOOLEAN successful); static void * match (pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_UINT32 ttype, pANTLR3_BITSET_LIST follow); static void matchAny (pANTLR3_BASE_RECOGNIZER recognizer); static void mismatch (pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_UINT32 ttype, pANTLR3_BITSET_LIST follow); static ANTLR3_BOOLEAN mismatchIsUnwantedToken (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_INT_STREAM is, ANTLR3_UINT32 ttype); static ANTLR3_BOOLEAN mismatchIsMissingToken (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_INT_STREAM is, pANTLR3_BITSET_LIST follow); static void reportError (pANTLR3_BASE_RECOGNIZER recognizer); static pANTLR3_BITSET computeCSRuleFollow (pANTLR3_BASE_RECOGNIZER recognizer); static pANTLR3_BITSET combineFollows (pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_BOOLEAN exact); static void displayRecognitionError (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_UINT8 * tokenNames); static void recover (pANTLR3_BASE_RECOGNIZER recognizer); static void * recoverFromMismatchedToken (pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_UINT32 ttype, pANTLR3_BITSET_LIST follow); static void * recoverFromMismatchedSet (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_BITSET_LIST follow); static ANTLR3_BOOLEAN recoverFromMismatchedElement(pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_BITSET_LIST follow); static void consumeUntil 
(pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_UINT32 tokenType); static void consumeUntilSet (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_BITSET set); static pANTLR3_STACK getRuleInvocationStack (pANTLR3_BASE_RECOGNIZER recognizer); static pANTLR3_STACK getRuleInvocationStackNamed (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_UINT8 name); static pANTLR3_HASH_TABLE toStrings (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_HASH_TABLE); static ANTLR3_MARKER getRuleMemoization (pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_INTKEY ruleIndex, ANTLR3_MARKER ruleParseStart); static ANTLR3_BOOLEAN alreadyParsedRule (pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_MARKER ruleIndex); static void memoize (pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_MARKER ruleIndex, ANTLR3_MARKER ruleParseStart); static ANTLR3_BOOLEAN synpred (pANTLR3_BASE_RECOGNIZER recognizer, void * ctx, void (*predicate)(void * ctx)); static void reset (pANTLR3_BASE_RECOGNIZER recognizer); static void freeBR (pANTLR3_BASE_RECOGNIZER recognizer); static void * getCurrentInputSymbol (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_INT_STREAM istream); static void * getMissingSymbol (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_INT_STREAM istream, pANTLR3_EXCEPTION e, ANTLR3_UINT32 expectedTokenType, pANTLR3_BITSET_LIST follow); static ANTLR3_UINT32 getNumberOfSyntaxErrors (pANTLR3_BASE_RECOGNIZER recognizer); ANTLR3_API pANTLR3_BASE_RECOGNIZER antlr3BaseRecognizerNew(ANTLR3_UINT32 type, ANTLR3_UINT32 sizeHint, pANTLR3_RECOGNIZER_SHARED_STATE state) { pANTLR3_BASE_RECOGNIZER recognizer; // Allocate memory for the structure // recognizer = (pANTLR3_BASE_RECOGNIZER) ANTLR3_MALLOC((size_t)sizeof(ANTLR3_BASE_RECOGNIZER)); if (recognizer == NULL) { // Allocation failed // return NULL; } // If we have been supplied with a pre-existing recognizer state // then we just install it, otherwise we must create one from scratch // if (state == NULL) { recognizer->state = (pANTLR3_RECOGNIZER_SHARED_STATE) ANTLR3_CALLOC(1, 
(size_t)sizeof(ANTLR3_RECOGNIZER_SHARED_STATE)); if (recognizer->state == NULL) { ANTLR3_FREE(recognizer); return NULL; } // Initialize any new recognizer state // recognizer->state->errorRecovery = ANTLR3_FALSE; recognizer->state->lastErrorIndex = -1; recognizer->state->failed = ANTLR3_FALSE; recognizer->state->errorCount = 0; recognizer->state->backtracking = 0; recognizer->state->following = NULL; recognizer->state->ruleMemo = NULL; recognizer->state->tokenNames = NULL; recognizer->state->sizeHint = sizeHint; recognizer->state->tokSource = NULL; recognizer->state->tokFactory = NULL; // Rather than check to see if we must initialize // the stack every time we are asked for an new rewrite stream // we just always create an empty stack and then just // free it when the base recognizer is freed. // recognizer->state->rStreams = antlr3VectorNew(0); // We don't know the size. if (recognizer->state->rStreams == NULL) { // Out of memory // ANTLR3_FREE(recognizer->state); ANTLR3_FREE(recognizer); return NULL; } } else { // Install the one we were given, and do not reset it here // as it will either already have been initialized or will // be in a state that needs to be preserved. 
// recognizer->state = state; } // Install the BR API // recognizer->alreadyParsedRule = alreadyParsedRule; recognizer->beginResync = beginResync; recognizer->combineFollows = combineFollows; recognizer->beginBacktrack = beginBacktrack; recognizer->endBacktrack = endBacktrack; recognizer->computeCSRuleFollow = computeCSRuleFollow; recognizer->computeErrorRecoverySet = computeErrorRecoverySet; recognizer->consumeUntil = consumeUntil; recognizer->consumeUntilSet = consumeUntilSet; recognizer->displayRecognitionError = displayRecognitionError; recognizer->endResync = endResync; recognizer->exConstruct = antlr3MTExceptionNew; recognizer->getRuleInvocationStack = getRuleInvocationStack; recognizer->getRuleInvocationStackNamed = getRuleInvocationStackNamed; recognizer->getRuleMemoization = getRuleMemoization; recognizer->match = match; recognizer->matchAny = matchAny; recognizer->memoize = memoize; recognizer->mismatch = mismatch; recognizer->mismatchIsUnwantedToken = mismatchIsUnwantedToken; recognizer->mismatchIsMissingToken = mismatchIsMissingToken; recognizer->recover = recover; recognizer->recoverFromMismatchedElement= recoverFromMismatchedElement; recognizer->recoverFromMismatchedSet = recoverFromMismatchedSet; recognizer->recoverFromMismatchedToken = recoverFromMismatchedToken; recognizer->getNumberOfSyntaxErrors = getNumberOfSyntaxErrors; recognizer->reportError = reportError; recognizer->reset = reset; recognizer->synpred = synpred; recognizer->toStrings = toStrings; recognizer->getCurrentInputSymbol = getCurrentInputSymbol; recognizer->getMissingSymbol = getMissingSymbol; recognizer->debugger = NULL; recognizer->free = freeBR; /* Initialize variables */ recognizer->type = type; return recognizer; } static void freeBR (pANTLR3_BASE_RECOGNIZER recognizer) { pANTLR3_EXCEPTION thisE; // Did we have a state allocated? 
// if (recognizer->state != NULL) { // Free any rule memoization we set up // if (recognizer->state->ruleMemo != NULL) { recognizer->state->ruleMemo->free(recognizer->state->ruleMemo); recognizer->state->ruleMemo = NULL; } // Free any exception space we have left around // thisE = recognizer->state->exception; if (thisE != NULL) { thisE->freeEx(thisE); } // Free any rewrite streams we have allocated // if (recognizer->state->rStreams != NULL) { recognizer->state->rStreams->free(recognizer->state->rStreams); } // Free up any token factory we created (error recovery for instance) // if (recognizer->state->tokFactory != NULL) { recognizer->state->tokFactory->close(recognizer->state->tokFactory); } // Free the shared state memory // ANTLR3_FREE(recognizer->state); } // Free the actual recognizer space // ANTLR3_FREE(recognizer); } /** * Creates a new Mismatched Token Exception and inserts in the recognizer * exception stack. * * \param recognizer * Context pointer for this recognizer * */ ANTLR3_API void antlr3MTExceptionNew(pANTLR3_BASE_RECOGNIZER recognizer) { /* Create a basic recognition exception structure */ antlr3RecognitionExceptionNew(recognizer); /* Now update it to indicate this is a Mismatched token exception */ recognizer->state->exception->name = ANTLR3_MISMATCHED_EX_NAME; recognizer->state->exception->type = ANTLR3_MISMATCHED_TOKEN_EXCEPTION; return; } ANTLR3_API void antlr3RecognitionExceptionNew(pANTLR3_BASE_RECOGNIZER recognizer) { pANTLR3_EXCEPTION ex; pANTLR3_LEXER lexer; pANTLR3_PARSER parser; pANTLR3_TREE_PARSER tparser; pANTLR3_INPUT_STREAM ins; pANTLR3_INT_STREAM is; pANTLR3_COMMON_TOKEN_STREAM cts; pANTLR3_TREE_NODE_STREAM tns; ins = NULL; cts = NULL; tns = NULL; is = NULL; lexer = NULL; parser = NULL; tparser = NULL; switch (recognizer->type) { case ANTLR3_TYPE_LEXER: lexer = (pANTLR3_LEXER) (recognizer->super); ins = lexer->input; is = ins->istream; break; case ANTLR3_TYPE_PARSER: parser = (pANTLR3_PARSER) (recognizer->super); cts = 
(pANTLR3_COMMON_TOKEN_STREAM)(parser->tstream->super); is = parser->tstream->istream; break; case ANTLR3_TYPE_TREE_PARSER: tparser = (pANTLR3_TREE_PARSER) (recognizer->super); tns = tparser->ctnstream->tnstream; is = tns->istream; break; default: ANTLR3_FPRINTF(stderr, "Base recognizer function antlr3RecognitionExceptionNew called by unknown parser type - provide override for this function\n"); return; break; } /* Create a basic exception structure */ ex = antlr3ExceptionNew(ANTLR3_RECOGNITION_EXCEPTION, (void *)ANTLR3_RECOGNITION_EX_NAME, NULL, ANTLR3_FALSE); /* Rest of information depends on the base type of the * input stream. */ switch (is->type & ANTLR3_INPUT_MASK) { case ANTLR3_CHARSTREAM: ex->c = is->_LA (is, 1); /* Current input character */ ex->line = ins->getLine (ins); /* Line number comes from stream */ ex->charPositionInLine = ins->getCharPositionInLine (ins); /* Line offset also comes from the stream */ ex->index = is->index (is); ex->streamName = ins->fileName; ex->message = "Unexpected character"; break; case ANTLR3_TOKENSTREAM: ex->token = cts->tstream->_LT (cts->tstream, 1); /* Current input token */ ex->line = ((pANTLR3_COMMON_TOKEN)(ex->token))->getLine (ex->token); ex->charPositionInLine = ((pANTLR3_COMMON_TOKEN)(ex->token))->getCharPositionInLine (ex->token); ex->index = cts->tstream->istream->index (cts->tstream->istream); if (((pANTLR3_COMMON_TOKEN)(ex->token))->type == ANTLR3_TOKEN_EOF) { ex->streamName = NULL; } else { ex->streamName = ((pANTLR3_COMMON_TOKEN)(ex->token))->input->fileName; } ex->message = "Unexpected token"; break; case ANTLR3_COMMONTREENODE: ex->token = tns->_LT (tns, 1); /* Current input tree node */ ex->line = ((pANTLR3_BASE_TREE)(ex->token))->getLine (ex->token); ex->charPositionInLine = ((pANTLR3_BASE_TREE)(ex->token))->getCharPositionInLine (ex->token); ex->index = tns->istream->index (tns->istream); // Are you ready for this? Deep breath now... 
// { pANTLR3_COMMON_TREE tnode; tnode = ((pANTLR3_COMMON_TREE)(((pANTLR3_BASE_TREE)(ex->token))->super)); if (tnode->token == NULL) { ex->streamName = ((pANTLR3_BASE_TREE)(ex->token))->strFactory->newStr(((pANTLR3_BASE_TREE)(ex->token))->strFactory, (pANTLR3_UINT8)"-unknown source-"); } else { if (tnode->token->input == NULL) { ex->streamName = NULL; } else { ex->streamName = tnode->token->input->fileName; } } ex->message = "Unexpected node"; } break; } ex->input = is; ex->nextException = recognizer->state->exception; /* So we don't leak the memory */ recognizer->state->exception = ex; recognizer->state->error = ANTLR3_TRUE; /* Exception is outstanding */ return; } /// Match current input symbol against ttype. Upon error, do one token /// insertion or deletion if possible. /// To turn off single token insertion or deletion error /// recovery, override mismatchRecover() and have it call /// plain mismatch(), which does not recover. Then any error /// in a rule will cause an exception and immediate exit from /// rule. Rule would recover by resynchronizing to the set of /// symbols that can follow rule ref. 
/// static void * match( pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_UINT32 ttype, pANTLR3_BITSET_LIST follow) { pANTLR3_PARSER parser; pANTLR3_TREE_PARSER tparser; pANTLR3_INT_STREAM is; void * matchedSymbol; switch (recognizer->type) { case ANTLR3_TYPE_PARSER: parser = (pANTLR3_PARSER) (recognizer->super); tparser = NULL; is = parser->tstream->istream; break; case ANTLR3_TYPE_TREE_PARSER: tparser = (pANTLR3_TREE_PARSER) (recognizer->super); parser = NULL; is = tparser->ctnstream->tnstream->istream; break; default: ANTLR3_FPRINTF(stderr, "Base recognizer function 'match' called by unknown parser type - provide override for this function\n"); return ANTLR3_FALSE; break; } // Pick up the current input token/node for assignment to labels // matchedSymbol = recognizer->getCurrentInputSymbol(recognizer, is); if (is->_LA(is, 1) == ttype) { // The token was the one we were told to expect // is->consume(is); // Consume that token from the stream recognizer->state->errorRecovery = ANTLR3_FALSE; // Not in error recovery now (if we were) recognizer->state->failed = ANTLR3_FALSE; // The match was a success return matchedSymbol; // We are done } // We did not find the expected token type, if we are backtracking then // we just set the failed flag and return. // if (recognizer->state->backtracking > 0) { // Backtracking is going on // recognizer->state->failed = ANTLR3_TRUE; return matchedSymbol; } // We did not find the expected token and there is no backtracking // going on, so we mismatch, which creates an exception in the recognizer exception // stack. // matchedSymbol = recognizer->recoverFromMismatchedToken(recognizer, ttype, follow); return matchedSymbol; } /// Consumes the next token, whatever it is, and resets the recognizer state /// so that it is not in error. 
/// /// \param recognizer /// Recognizer context pointer /// static void matchAny(pANTLR3_BASE_RECOGNIZER recognizer) { pANTLR3_PARSER parser; pANTLR3_TREE_PARSER tparser; pANTLR3_INT_STREAM is; switch (recognizer->type) { case ANTLR3_TYPE_PARSER: parser = (pANTLR3_PARSER) (recognizer->super); tparser = NULL; is = parser->tstream->istream; break; case ANTLR3_TYPE_TREE_PARSER: tparser = (pANTLR3_TREE_PARSER) (recognizer->super); parser = NULL; is = tparser->ctnstream->tnstream->istream; break; default: ANTLR3_FPRINTF(stderr, "Base recognizer function 'matchAny' called by unknown parser type - provide override for this function\n"); return; break; } recognizer->state->errorRecovery = ANTLR3_FALSE; recognizer->state->failed = ANTLR3_FALSE; is->consume(is); return; } /// /// static ANTLR3_BOOLEAN mismatchIsUnwantedToken(pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_INT_STREAM is, ANTLR3_UINT32 ttype) { ANTLR3_UINT32 nextt; nextt = is->_LA(is, 2); if (nextt == ttype) { if (recognizer->state->exception != NULL) { recognizer->state->exception->expecting = nextt; } return ANTLR3_TRUE; // This token is unknown, but the next one is the one we wanted } else { return ANTLR3_FALSE; // Neither this token, nor the one following is the one we wanted } } /// /// static ANTLR3_BOOLEAN mismatchIsMissingToken(pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_INT_STREAM is, pANTLR3_BITSET_LIST follow) { ANTLR3_BOOLEAN retcode; pANTLR3_BITSET followClone; pANTLR3_BITSET viableTokensFollowingThisRule; if (follow == NULL) { // There is no information about the tokens that can follow the last one // hence we must say that the current one we found is not a member of the // follow set and does not indicate a missing token. We will just consume this // single token and see if the parser works it out from there. // return ANTLR3_FALSE; } followClone = NULL; viableTokensFollowingThisRule = NULL; // The C bitset maps are laid down at compile time by the // C code generation. 
Hence we cannot remove things from them // and so on. So, in order to remove EOR (if we need to) then // we clone the static bitset. // followClone = antlr3BitsetLoad(follow); if (followClone == NULL) { return ANTLR3_FALSE; } // Compute what can follow this grammar reference // if (followClone->isMember(followClone, ANTLR3_EOR_TOKEN_TYPE)) { // EOR can follow, but if we are not the start symbol, we // need to remove it. // if (recognizer->state->following->vector->count >= 0) { followClone->remove(followClone, ANTLR3_EOR_TOKEN_TYPE); } // Now compute the visiable tokens that can follow this rule, according to context // and make them part of the follow set. // viableTokensFollowingThisRule = recognizer->computeCSRuleFollow(recognizer); followClone->borInPlace(followClone, viableTokensFollowingThisRule); } /// if current token is consistent with what could come after set /// then we know we're missing a token; error recovery is free to /// "insert" the missing token /// /// BitSet cannot handle negative numbers like -1 (EOF) so I leave EOR /// in follow set to indicate that the fall of the start symbol is /// in the set (EOF can follow). /// if ( followClone->isMember(followClone, is->_LA(is, 1)) || followClone->isMember(followClone, ANTLR3_EOR_TOKEN_TYPE) ) { retcode = ANTLR3_TRUE; } else { retcode = ANTLR3_FALSE; } if (viableTokensFollowingThisRule != NULL) { viableTokensFollowingThisRule->free(viableTokensFollowingThisRule); } if (followClone != NULL) { followClone->free(followClone); } return retcode; } /// Factor out what to do upon token mismatch so tree parsers can behave /// differently. Override and call mismatchRecover(input, ttype, follow) /// to get single token insertion and deletion. Use this to turn off /// single token insertion and deletion. Override mismatchRecover /// to call this instead. /// /// \remark mismatch only works for parsers and must be overridden for anything else. 
/// static void mismatch(pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_UINT32 ttype, pANTLR3_BITSET_LIST follow) { pANTLR3_PARSER parser; pANTLR3_TREE_PARSER tparser; pANTLR3_INT_STREAM is; // Install a mismatched token exception in the exception stack // antlr3MTExceptionNew(recognizer); recognizer->state->exception->expecting = ttype; switch (recognizer->type) { case ANTLR3_TYPE_PARSER: parser = (pANTLR3_PARSER) (recognizer->super); tparser = NULL; is = parser->tstream->istream; break; default: ANTLR3_FPRINTF(stderr, "Base recognizer function 'mismatch' called by unknown parser type - provide override for this function\n"); return; break; } if (mismatchIsUnwantedToken(recognizer, is, ttype)) { // Create a basic recognition exception structure // antlr3RecognitionExceptionNew(recognizer); // Now update it to indicate this is an unwanted token exception // recognizer->state->exception->name = ANTLR3_UNWANTED_TOKEN_EXCEPTION_NAME; recognizer->state->exception->type = ANTLR3_UNWANTED_TOKEN_EXCEPTION; return; } if (mismatchIsMissingToken(recognizer, is, follow)) { // Create a basic recognition exception structure // antlr3RecognitionExceptionNew(recognizer); // Now update it to indicate this is an unwanted token exception // recognizer->state->exception->name = ANTLR3_MISSING_TOKEN_EXCEPTION_NAME; recognizer->state->exception->type = ANTLR3_MISSING_TOKEN_EXCEPTION; return; } // Just a mismatched token is all we can dtermine // antlr3MTExceptionNew(recognizer); return; } /// Report a recognition problem. /// /// This method sets errorRecovery to indicate the parser is recovering /// not parsing. Once in recovery mode, no errors are generated. /// To get out of recovery mode, the parser must successfully match /// a token (after a resync). So it will go: /// /// 1. error occurs /// 2. enter recovery mode, report error /// 3. consume until token found in resynch set /// 4. try to resume parsing /// 5. 
next match() will reset errorRecovery mode /// /// If you override, make sure to update errorCount if you care about that. /// static void reportError (pANTLR3_BASE_RECOGNIZER recognizer) { if (recognizer->state->errorRecovery == ANTLR3_TRUE) { // Already in error recovery so don't display another error while doing so // return; } // Signal we are in error recovery now // recognizer->state->errorRecovery = ANTLR3_TRUE; // Indicate this recognizer had an error while processing. // recognizer->state->errorCount++; // Call the error display routine // recognizer->displayRecognitionError(recognizer, recognizer->state->tokenNames); } static void beginBacktrack (pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_UINT32 level) { if (recognizer->debugger != NULL) { recognizer->debugger->beginBacktrack(recognizer->debugger, level); } } static void endBacktrack (pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_UINT32 level, ANTLR3_BOOLEAN successful) { if (recognizer->debugger != NULL) { recognizer->debugger->endBacktrack(recognizer->debugger, level, successful); } } static void beginResync (pANTLR3_BASE_RECOGNIZER recognizer) { if (recognizer->debugger != NULL) { recognizer->debugger->beginResync(recognizer->debugger); } } static void endResync (pANTLR3_BASE_RECOGNIZER recognizer) { if (recognizer->debugger != NULL) { recognizer->debugger->endResync(recognizer->debugger); } } /// Compute the error recovery set for the current rule. /// Documentation below is from the Java implementation. /// /// During rule invocation, the parser pushes the set of tokens that can /// follow that rule reference on the stack; this amounts to /// computing FIRST of what follows the rule reference in the /// enclosing rule. This local follow set only includes tokens /// from within the rule; i.e., the FIRST computation done by /// ANTLR stops at the end of a rule. // /// EXAMPLE // /// When you find a "no viable alt exception", the input is not /// consistent with any of the alternatives for rule r. 
The best /// thing to do is to consume tokens until you see something that /// can legally follow a call to r *or* any rule that called r. /// You don't want the exact set of viable next tokens because the /// input might just be missing a token--you might consume the /// rest of the input looking for one of the missing tokens. /// /// Consider grammar: /// /// a : '[' b ']' /// | '(' b ')' /// ; /// b : c '^' INT ; /// c : ID /// | INT /// ; /// /// At each rule invocation, the set of tokens that could follow /// that rule is pushed on a stack. Here are the various "local" /// follow sets: /// /// FOLLOW(b1_in_a) = FIRST(']') = ']' /// FOLLOW(b2_in_a) = FIRST(')') = ')' /// FOLLOW(c_in_b) = FIRST('^') = '^' /// /// Upon erroneous input "[]", the call chain is /// /// a -> b -> c /// /// and, hence, the follow context stack is: /// /// depth local follow set after call to rule /// 0 a (from main()) /// 1 ']' b /// 3 '^' c /// /// Notice that ')' is not included, because b would have to have /// been called from a different context in rule a for ')' to be /// included. /// /// For error recovery, we cannot consider FOLLOW(c) /// (context-sensitive or otherwise). We need the combined set of /// all context-sensitive FOLLOW sets--the set of all tokens that /// could follow any reference in the call chain. We need to /// resync to one of those tokens. Note that FOLLOW(c)='^' and if /// we resync'd to that token, we'd consume until EOF. We need to /// sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}. /// In this case, for input "[]", LA(1) is in this set so we would /// not consume anything and after printing an error rule c would /// return normally. It would not find the required '^' though. /// At this point, it gets a mismatched token error and throws an /// exception (since LA(1) is not in the viable following token /// set). The rule exception handler tries to recover, but finds /// the same recovery set and doesn't consume anything. 
Rule b /// exits normally returning to rule a. Now it finds the ']' (and /// with the successful match exits errorRecovery mode). /// /// So, you can see that the parser walks up call chain looking /// for the token that was a member of the recovery set. /// /// Errors are not generated in errorRecovery mode. /// /// ANTLR's error recovery mechanism is based upon original ideas: /// /// "Algorithms + Data Structures = Programs" by Niklaus Wirth /// /// and /// /// "A note on error recovery in recursive descent parsers": /// http://portal.acm.org/citation.cfm?id=947902.947905 /// /// Later, Josef Grosch had some good ideas: /// /// "Efficient and Comfortable Error Recovery in Recursive Descent /// Parsers": /// ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip /// /// Like Grosch I implemented local FOLLOW sets that are combined /// at run-time upon error to avoid overhead during parsing. /// static pANTLR3_BITSET computeErrorRecoverySet (pANTLR3_BASE_RECOGNIZER recognizer) { return recognizer->combineFollows(recognizer, ANTLR3_FALSE); } /// Compute the context-sensitive FOLLOW set for current rule. /// Documentation below is from the Java runtime. /// /// This is the set of token types that can follow a specific rule /// reference given a specific call chain. You get the set of /// viable tokens that can possibly come next (look ahead depth 1) /// given the current call chain. Contrast this with the /// definition of plain FOLLOW for rule r: /// /// FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)} /// /// where x in T* and alpha, beta in V*; T is set of terminals and /// V is the set of terminals and non terminals. In other words, /// FOLLOW(r) is the set of all tokens that can possibly follow /// references to r in///any* sentential form (context). At /// runtime, however, we know precisely which context applies as /// we have the call chain. We may compute the exact (rather /// than covering superset) set of following tokens. 
/// /// For example, consider grammar: /// /// stat : ID '=' expr ';' // FOLLOW(stat)=={EOF} /// | "return" expr '.' /// ; /// expr : atom ('+' atom)* ; // FOLLOW(expr)=={';','.',')'} /// atom : INT // FOLLOW(atom)=={'+',')',';','.'} /// | '(' expr ')' /// ; /// /// The FOLLOW sets are all inclusive whereas context-sensitive /// FOLLOW sets are precisely what could follow a rule reference. /// For input input "i=(3);", here is the derivation: /// /// stat => ID '=' expr ';' /// => ID '=' atom ('+' atom)* ';' /// => ID '=' '(' expr ')' ('+' atom)* ';' /// => ID '=' '(' atom ')' ('+' atom)* ';' /// => ID '=' '(' INT ')' ('+' atom)* ';' /// => ID '=' '(' INT ')' ';' /// /// At the "3" token, you'd have a call chain of /// /// stat -> expr -> atom -> expr -> atom /// /// What can follow that specific nested ref to atom? Exactly ')' /// as you can see by looking at the derivation of this specific /// input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}. /// /// You want the exact viable token set when recovering from a /// token mismatch. Upon token mismatch, if LA(1) is member of /// the viable next token set, then you know there is most likely /// a missing token in the input stream. "Insert" one by just not /// throwing an exception. /// static pANTLR3_BITSET computeCSRuleFollow (pANTLR3_BASE_RECOGNIZER recognizer) { return recognizer->combineFollows(recognizer, ANTLR3_FALSE); } /// Compute the current followset for the input stream. 
/// static pANTLR3_BITSET combineFollows (pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_BOOLEAN exact) { pANTLR3_BITSET followSet; pANTLR3_BITSET localFollowSet; ANTLR3_UINT32 top; ANTLR3_UINT32 i; top = recognizer->state->following->size(recognizer->state->following); followSet = antlr3BitsetNew(0); localFollowSet = NULL; for (i = top; i>0; i--) { localFollowSet = antlr3BitsetLoad((pANTLR3_BITSET_LIST) recognizer->state->following->get(recognizer->state->following, i-1)); if (localFollowSet != NULL) { followSet->borInPlace(followSet, localFollowSet); if (exact == ANTLR3_TRUE) { if (localFollowSet->isMember(localFollowSet, ANTLR3_EOR_TOKEN_TYPE) == ANTLR3_FALSE) { // Only leave EOR in the set if at top (start rule); this lets us know // if we have to include the follow(start rule); I.E., EOF // if (i>1) { followSet->remove(followSet, ANTLR3_EOR_TOKEN_TYPE); } } else { break; // Cannot see End Of Rule from here, just drop out } } localFollowSet->free(localFollowSet); localFollowSet = NULL; } } if (localFollowSet != NULL) { localFollowSet->free(localFollowSet); } return followSet; } /// Standard/Example error display method. /// No generic error message display funciton coudl possibly do everything correctly /// for all possible parsers. Hence you are provided with this example routine, which /// you should override in your parser/tree parser to do as you will. /// /// Here we depart somewhat from the Java runtime as that has now split up a lot /// of the error display routines into spearate units. However, ther is little advantage /// to this in the C version as you will probably implement all such routines as a /// separate translation unit, rather than install them all as pointers to functions /// in the base recognizer. 
/// static void displayRecognitionError (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_UINT8 * tokenNames) { pANTLR3_PARSER parser; pANTLR3_TREE_PARSER tparser; pANTLR3_INT_STREAM is; pANTLR3_STRING ttext; pANTLR3_STRING ftext; pANTLR3_EXCEPTION ex; pANTLR3_COMMON_TOKEN theToken; pANTLR3_BASE_TREE theBaseTree; pANTLR3_COMMON_TREE theCommonTree; // Retrieve some info for easy reading. // ex = recognizer->state->exception; ttext = NULL; // See if there is a 'filename' we can use // if (ex->streamName == NULL) { if (((pANTLR3_COMMON_TOKEN)(ex->token))->type == ANTLR3_TOKEN_EOF) { ANTLR3_FPRINTF(stderr, "-end of input-("); } else { ANTLR3_FPRINTF(stderr, "-unknown source-("); } } else { ftext = ex->streamName->to8(ex->streamName); ANTLR3_FPRINTF(stderr, "%s(", ftext->chars); } // Next comes the line number // ANTLR3_FPRINTF(stderr, "%d) ", recognizer->state->exception->line); ANTLR3_FPRINTF(stderr, " : error %d : %s", recognizer->state->exception->type, (pANTLR3_UINT8) (recognizer->state->exception->message)); // How we determine the next piece is dependent on which thing raised the // error. // switch (recognizer->type) { case ANTLR3_TYPE_PARSER: // Prepare the knowledge we know we have // parser = (pANTLR3_PARSER) (recognizer->super); tparser = NULL; is = parser->tstream->istream; theToken = (pANTLR3_COMMON_TOKEN)(recognizer->state->exception->token); ttext = theToken->toString(theToken); ANTLR3_FPRINTF(stderr, ", at offset %d", recognizer->state->exception->charPositionInLine); if (theToken != NULL) { if (theToken->type == ANTLR3_TOKEN_EOF) { ANTLR3_FPRINTF(stderr, ", at "); } else { // Guard against null text in a token // ANTLR3_FPRINTF(stderr, "\n near %s\n ", ttext == NULL ? 
(pANTLR3_UINT8)"" : ttext->chars); } } break; case ANTLR3_TYPE_TREE_PARSER: tparser = (pANTLR3_TREE_PARSER) (recognizer->super); parser = NULL; is = tparser->ctnstream->tnstream->istream; theBaseTree = (pANTLR3_BASE_TREE)(recognizer->state->exception->token); ttext = theBaseTree->toStringTree(theBaseTree); if (theBaseTree != NULL) { theCommonTree = (pANTLR3_COMMON_TREE) theBaseTree->super; if (theCommonTree != NULL) { theToken = (pANTLR3_COMMON_TOKEN) theBaseTree->getToken(theBaseTree); } ANTLR3_FPRINTF(stderr, ", at offset %d", theBaseTree->getCharPositionInLine(theBaseTree)); ANTLR3_FPRINTF(stderr, ", near %s", ttext->chars); } break; default: ANTLR3_FPRINTF(stderr, "Base recognizer function displayRecognitionError called by unknown parser type - provide override for this function\n"); return; break; } // Although this function should generally be provided by the implementation, this one // should be as helpful as possible for grammar developers and serve as an example // of what you can do with each exception type. In general, when you make up your // 'real' handler, you should debug the routine with all possible errors you expect // which will then let you be as specific as possible about all circumstances. // // Note that in the general case, errors thrown by tree parsers indicate a problem // with the output of the parser or with the tree grammar itself. The job of the parser // is to produce a perfect (in traversal terms) syntactically correct tree, so errors // at that stage should really be semantic errors that your own code determines and handles // in whatever way is appropriate. // switch (ex->type) { case ANTLR3_UNWANTED_TOKEN_EXCEPTION: // Indicates that the recognizer was fed a token which seesm to be // spurious input. We can detect this when the token that follows // this unwanted token would normally be part of the syntactically // correct stream. 
Then we can see that the token we are looking at // is just something that should not be there and throw this exception. // if (tokenNames == NULL) { ANTLR3_FPRINTF(stderr, " : Extraneous input..."); } else { if (ex->expecting == ANTLR3_TOKEN_EOF) { ANTLR3_FPRINTF(stderr, " : Extraneous input - expected \n"); } else { ANTLR3_FPRINTF(stderr, " : Extraneous input - expected %s ...\n", tokenNames[ex->expecting]); } } break; case ANTLR3_MISSING_TOKEN_EXCEPTION: // Indicates that the recognizer detected that the token we just // hit would be valid syntactically if preceeded by a particular // token. Perhaps a missing ';' at line end or a missing ',' in an // expression list, and such like. // if (tokenNames == NULL) { ANTLR3_FPRINTF(stderr, " : Missing token (%d)...\n", ex->expecting); } else { if (ex->expecting == ANTLR3_TOKEN_EOF) { ANTLR3_FPRINTF(stderr, " : Missing \n"); } else { ANTLR3_FPRINTF(stderr, " : Missing %s \n", tokenNames[ex->expecting]); } } break; case ANTLR3_RECOGNITION_EXCEPTION: // Indicates that the recognizer received a token // in the input that was not predicted. This is the basic exception type // from which all others are derived. So we assume it was a syntax error. // You may get this if there are not more tokens and more are needed // to complete a parse for instance. // ANTLR3_FPRINTF(stderr, " : syntax error...\n"); break; case ANTLR3_MISMATCHED_TOKEN_EXCEPTION: // We were expecting to see one thing and got another. This is the // most common error if we coudl not detect a missing or unwanted token. // Here you can spend your efforts to // derive more useful error messages based on the expected // token set and the last token and so on. The error following // bitmaps do a good job of reducing the set that we were looking // for down to something small. Knowing what you are parsing may be // able to allow you to be even more specific about an error. 
// if (tokenNames == NULL) { ANTLR3_FPRINTF(stderr, " : syntax error...\n"); } else { if (ex->expecting == ANTLR3_TOKEN_EOF) { ANTLR3_FPRINTF(stderr, " : expected \n"); } else { ANTLR3_FPRINTF(stderr, " : expected %s ...\n", tokenNames[ex->expecting]); } } break; case ANTLR3_NO_VIABLE_ALT_EXCEPTION: // We could not pick any alt decision from the input given // so god knows what happened - however when you examine your grammar, // you should. It means that at the point where the current token occurred // that the DFA indicates nowhere to go from here. // ANTLR3_FPRINTF(stderr, " : cannot match to any predicted input...\n"); break; case ANTLR3_MISMATCHED_SET_EXCEPTION: { ANTLR3_UINT32 count; ANTLR3_UINT32 bit; ANTLR3_UINT32 size; ANTLR3_UINT32 numbits; pANTLR3_BITSET errBits; // This means we were able to deal with one of a set of // possible tokens at this point, but we did not see any // member of that set. // ANTLR3_FPRINTF(stderr, " : unexpected input...\n expected one of : "); // What tokens could we have accepted at this point in the // parse? // count = 0; errBits = antlr3BitsetLoad (ex->expectingSet); numbits = errBits->numBits (errBits); size = errBits->size (errBits); if (size > 0) { // However many tokens we could have dealt with here, it is usually // not useful to print ALL of the set here. I arbitrarily chose 8 // here, but you should do whatever makes sense for you of course. // No token number 0, so look for bit 1 and on. // for (bit = 1; bit < numbits && count < 8 && count < size; bit++) { // TODO: This doesn;t look right - should be asking if the bit is set!! // if (tokenNames[bit]) { ANTLR3_FPRINTF(stderr, "%s%s", count > 0 ? 
", " : "", tokenNames[bit]); count++; } } ANTLR3_FPRINTF(stderr, "\n"); } else { ANTLR3_FPRINTF(stderr, "Actually dude, we didn't seem to be expecting anything here, or at least\n"); ANTLR3_FPRINTF(stderr, "I could not work out what I was expecting, like so many of us these days!\n"); } } break; case ANTLR3_EARLY_EXIT_EXCEPTION: // We entered a loop requiring a number of token sequences // but found a token that ended that sequence earlier than // we should have done. // ANTLR3_FPRINTF(stderr, " : missing elements...\n"); break; default: // We don't handle any other exceptions here, but you can // if you wish. If we get an exception that hits this point // then we are just going to report what we know about the // token. // ANTLR3_FPRINTF(stderr, " : syntax not recognized...\n"); break; } // Here you have the token that was in error which if this is // the standard implementation will tell you the line and offset // and also record the address of the start of the line in the // input stream. You could therefore print the source line and so on. // Generally though, I would expect that your lexer/parser will keep // its own map of lines and source pointers or whatever as there // are a lot of specific things you need to know about the input // to do something like that. // Here is where you do it though :-). // } /// Return how many syntax errors were detected by this recognizer /// static ANTLR3_UINT32 getNumberOfSyntaxErrors(pANTLR3_BASE_RECOGNIZER recognizer) { return recognizer->state->errorCount; } /// Recover from an error found on the input stream. Mostly this is /// NoViableAlt exceptions, but could be a mismatched token that /// the match() routine could not recover from. 
/// static void recover (pANTLR3_BASE_RECOGNIZER recognizer) { // Used to compute the follow set of tokens // pANTLR3_BITSET followSet; pANTLR3_PARSER parser; pANTLR3_TREE_PARSER tparser; pANTLR3_INT_STREAM is; switch (recognizer->type) { case ANTLR3_TYPE_PARSER: parser = (pANTLR3_PARSER) (recognizer->super); tparser = NULL; is = parser->tstream->istream; break; case ANTLR3_TYPE_TREE_PARSER: tparser = (pANTLR3_TREE_PARSER) (recognizer->super); parser = NULL; is = tparser->ctnstream->tnstream->istream; break; default: ANTLR3_FPRINTF(stderr, "Base recognizer function recover called by unknown parser type - provide override for this function\n"); return; break; } // Are we about to repeat the same error? // if (recognizer->state->lastErrorIndex == is->index(is)) { // The last error was at the same token index point. This must be a case // where LT(1) is in the recovery token set so nothing is // consumed. Consume a single token so at least to prevent // an infinite loop; this is a failsafe. // is->consume(is); } // Record error index position // recognizer->state->lastErrorIndex = is->index(is); // Work out the follows set for error recovery // followSet = recognizer->computeErrorRecoverySet(recognizer); // Call resync hook (for debuggers and so on) // recognizer->beginResync(recognizer); // Consume tokens until we have resynced to something in the follows set // recognizer->consumeUntilSet(recognizer, followSet); // End resync hook // recognizer->endResync(recognizer); // Destroy the temporary bitset we produced. // followSet->free(followSet); // Reset the inError flag so we don't re-report the exception // recognizer->state->error = ANTLR3_FALSE; recognizer->state->failed = ANTLR3_FALSE; } /// Attempt to recover from a single missing or extra token. /// /// EXTRA TOKEN /// /// LA(1) is not what we are looking for. If LA(2) has the right token, /// however, then assume LA(1) is some extra spurious token. 
Delete it /// and LA(2) as if we were doing a normal match(), which advances the /// input. /// /// MISSING TOKEN /// /// If current token is consistent with what could come after /// ttype then it is ok to "insert" the missing token, else throw /// exception For example, Input "i=(3;" is clearly missing the /// ')'. When the parser returns from the nested call to expr, it /// will have call chain: /// /// stat -> expr -> atom /// /// and it will be trying to match the ')' at this point in the /// derivation: /// /// => ID '=' '(' INT ')' ('+' atom)* ';' /// ^ /// match() will see that ';' doesn't match ')' and report a /// mismatched token error. To recover, it sees that LA(1)==';' /// is in the set of tokens that can follow the ')' token /// reference in rule atom. It can assume that you forgot the ')'. /// /// The exception that was passed in, in the java implementation is /// sorted in the recognizer exception stack in the C version. To 'throw' it we set the /// error flag and rules cascade back when this is set. 
/// static void * recoverFromMismatchedToken (pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_UINT32 ttype, pANTLR3_BITSET_LIST follow) { pANTLR3_PARSER parser; pANTLR3_TREE_PARSER tparser; pANTLR3_INT_STREAM is; void * matchedSymbol; // Invoke the debugger event if there is a debugger listening to us // if (recognizer->debugger != NULL) { recognizer->debugger->recognitionException(recognizer->debugger, recognizer->state->exception); } switch (recognizer->type) { case ANTLR3_TYPE_PARSER: parser = (pANTLR3_PARSER) (recognizer->super); tparser = NULL; is = parser->tstream->istream; break; case ANTLR3_TYPE_TREE_PARSER: tparser = (pANTLR3_TREE_PARSER) (recognizer->super); parser = NULL; is = tparser->ctnstream->tnstream->istream; break; default: ANTLR3_FPRINTF(stderr, "Base recognizer function recoverFromMismatchedToken called by unknown parser type - provide override for this function\n"); return NULL; break; } // Create an exception if we need one // if (recognizer->state->exception == NULL) { antlr3RecognitionExceptionNew(recognizer); } // If the next token after the one we are looking at in the input stream // is what we are looking for then we remove the one we have discovered // from the stream by consuming it, then consume this next one along too as // if nothing had happened. 
// if ( recognizer->mismatchIsUnwantedToken(recognizer, is, ttype) == ANTLR3_TRUE) { recognizer->state->exception->type = ANTLR3_UNWANTED_TOKEN_EXCEPTION; recognizer->state->exception->message = ANTLR3_UNWANTED_TOKEN_EXCEPTION_NAME; // Call resync hook (for debuggers and so on) // if (recognizer->debugger != NULL) { recognizer->debugger->beginResync(recognizer->debugger); } recognizer->beginResync(recognizer); // "delete" the extra token // recognizer->beginResync(recognizer); is->consume(is); recognizer->endResync(recognizer); // End resync hook // if (recognizer->debugger != NULL) { recognizer->debugger->endResync(recognizer->debugger); } // Print out the error after we consume so that ANTLRWorks sees the // token in the exception. // recognizer->reportError(recognizer); // Return the token we are actually matching // matchedSymbol = recognizer->getCurrentInputSymbol(recognizer, is); // Consume the token that the rule actually expected to get as if everything // was hunky dory. // is->consume(is); recognizer->state->error = ANTLR3_FALSE; // Exception is not outstanding any more return matchedSymbol; } // Single token deletion (Unwanted above) did not work // so we see if we can insert a token instead by calculating which // token would be missing // if (mismatchIsMissingToken(recognizer, is, follow)) { // We can fake the missing token and proceed // matchedSymbol = recognizer->getMissingSymbol(recognizer, is, recognizer->state->exception, ttype, follow); recognizer->state->exception->type = ANTLR3_MISSING_TOKEN_EXCEPTION; recognizer->state->exception->message = ANTLR3_MISSING_TOKEN_EXCEPTION_NAME; recognizer->state->exception->token = matchedSymbol; recognizer->state->exception->expecting = ttype; // Print out the error after we insert so that ANTLRWorks sees the // token in the exception. 
// recognizer->reportError(recognizer); recognizer->state->error = ANTLR3_FALSE; // Exception is not outstanding any more return matchedSymbol; } // Neither deleting nor inserting tokens allows recovery // must just report the exception. // recognizer->state->error = ANTLR3_TRUE; return NULL; } static void * recoverFromMismatchedSet (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_BITSET_LIST follow) { pANTLR3_PARSER parser; pANTLR3_TREE_PARSER tparser; pANTLR3_INT_STREAM is; pANTLR3_COMMON_TOKEN matchedSymbol; switch (recognizer->type) { case ANTLR3_TYPE_PARSER: parser = (pANTLR3_PARSER) (recognizer->super); tparser = NULL; is = parser->tstream->istream; break; case ANTLR3_TYPE_TREE_PARSER: tparser = (pANTLR3_TREE_PARSER) (recognizer->super); parser = NULL; is = tparser->ctnstream->tnstream->istream; break; default: ANTLR3_FPRINTF(stderr, "Base recognizer function recoverFromMismatchedSet called by unknown parser type - provide override for this function\n"); return NULL; break; } if (recognizer->mismatchIsMissingToken(recognizer, is, follow) == ANTLR3_TRUE) { // We can fake the missing token and proceed // matchedSymbol = recognizer->getMissingSymbol(recognizer, is, recognizer->state->exception, ANTLR3_TOKEN_INVALID, follow); recognizer->state->exception->type = ANTLR3_MISSING_TOKEN_EXCEPTION; recognizer->state->exception->token = matchedSymbol; // Print out the error after we insert so that ANTLRWorks sees the // token in the exception. // recognizer->reportError(recognizer); recognizer->state->error = ANTLR3_FALSE; // Exception is not outstanding any more return matchedSymbol; } // TODO - Single token deletion like in recoverFromMismatchedToken() // recognizer->state->error = ANTLR3_TRUE; recognizer->state->failed = ANTLR3_TRUE; return NULL; } /// This code is factored out from mismatched token and mismatched set /// recovery. It handles "single token insertion" error recovery for /// both. No tokens are consumed to recover from insertions. 
Return /// true if recovery was possible else return false. /// static ANTLR3_BOOLEAN recoverFromMismatchedElement (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_BITSET_LIST followBits) { pANTLR3_BITSET viableToksFollowingRule; pANTLR3_BITSET follow; pANTLR3_PARSER parser; pANTLR3_TREE_PARSER tparser; pANTLR3_INT_STREAM is; switch (recognizer->type) { case ANTLR3_TYPE_PARSER: parser = (pANTLR3_PARSER) (recognizer->super); tparser = NULL; is = parser->tstream->istream; break; case ANTLR3_TYPE_TREE_PARSER: tparser = (pANTLR3_TREE_PARSER) (recognizer->super); parser = NULL; is = tparser->ctnstream->tnstream->istream; break; default: ANTLR3_FPRINTF(stderr, "Base recognizer function recover called by unknown parser type - provide override for this function\n"); return ANTLR3_FALSE; break; } follow = antlr3BitsetLoad(followBits); if (follow == NULL) { /* The follow set is NULL, which means we don't know what can come * next, so we "hit and hope" by just signifying that we cannot * recover, which will just cause the next token to be consumed, * which might dig us out. */ return ANTLR3_FALSE; } /* We have a bitmap for the follow set, hence we can compute * what can follow this grammar element reference. */ if (follow->isMember(follow, ANTLR3_EOR_TOKEN_TYPE) == ANTLR3_TRUE) { /* First we need to know which of the available tokens are viable * to follow this reference. */ viableToksFollowingRule = recognizer->computeCSRuleFollow(recognizer); /* Remove the EOR token, which we do not wish to compute with */ follow->remove(follow, ANTLR3_EOR_TOKEN_TYPE); viableToksFollowingRule->free(viableToksFollowingRule); /* We now have the computed set of what can follow the current token */ } /* We can now see if the current token works with the set of tokens * that could follow the current grammar reference. If it looks like it * is consistent, then we can "insert" that token by not throwing * an exception and assuming that we saw it. 
*/ if ( follow->isMember(follow, is->_LA(is, 1)) == ANTLR3_TRUE) { /* report the error, but don't cause any rules to abort and stuff */ recognizer->reportError(recognizer); if (follow != NULL) { follow->free(follow); } recognizer->state->error = ANTLR3_FALSE; recognizer->state->failed = ANTLR3_FALSE; return ANTLR3_TRUE; /* Success in recovery */ } if (follow != NULL) { follow->free(follow); } /* We could not find anything viable to do, so this is going to * cause an exception. */ return ANTLR3_FALSE; } /// Eat tokens from the input stream until we get one of JUST the right type /// static void consumeUntil (pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_UINT32 tokenType) { ANTLR3_UINT32 ttype; pANTLR3_PARSER parser; pANTLR3_TREE_PARSER tparser; pANTLR3_INT_STREAM is; switch (recognizer->type) { case ANTLR3_TYPE_PARSER: parser = (pANTLR3_PARSER) (recognizer->super); tparser = NULL; is = parser->tstream->istream; break; case ANTLR3_TYPE_TREE_PARSER: tparser = (pANTLR3_TREE_PARSER) (recognizer->super); parser = NULL; is = tparser->ctnstream->tnstream->istream; break; default: ANTLR3_FPRINTF(stderr, "Base recognizer function 'consumeUntil' called by unknown parser type - provide override for this function\n"); return; break; } // What do have at the moment? // ttype = is->_LA(is, 1); // Start eating tokens until we get to the one we want. // while (ttype != ANTLR3_TOKEN_EOF && ttype != tokenType) { is->consume(is); ttype = is->_LA(is, 1); } } /// Eat tokens from the input stream until we find one that /// belongs to the supplied set. 
/// static void consumeUntilSet (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_BITSET set) { ANTLR3_UINT32 ttype; pANTLR3_PARSER parser; pANTLR3_TREE_PARSER tparser; pANTLR3_INT_STREAM is; switch (recognizer->type) { case ANTLR3_TYPE_PARSER: parser = (pANTLR3_PARSER) (recognizer->super); tparser = NULL; is = parser->tstream->istream; break; case ANTLR3_TYPE_TREE_PARSER: tparser = (pANTLR3_TREE_PARSER) (recognizer->super); parser = NULL; is = tparser->ctnstream->tnstream->istream; break; default: ANTLR3_FPRINTF(stderr, "Base recognizer function 'consumeUntilSet' called by unknown parser type - provide override for this function\n"); return; break; } // What do have at the moment? // ttype = is->_LA(is, 1); // Start eating tokens until we get to one we want. // while (ttype != ANTLR3_TOKEN_EOF && set->isMember(set, ttype) == ANTLR3_FALSE) { is->consume(is); ttype = is->_LA(is, 1); } } /** Return the rule invocation stack (how we got here in the parse. * In the java version Ter just asks the JVM for all the information * but in C we don't get this information, so I am going to do nothing * right now. */ static pANTLR3_STACK getRuleInvocationStack (pANTLR3_BASE_RECOGNIZER recognizer) { return NULL; } static pANTLR3_STACK getRuleInvocationStackNamed (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_UINT8 name) { return NULL; } /** Convenience method for template rewrites - NYI. */ static pANTLR3_HASH_TABLE toStrings (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_HASH_TABLE tokens) { return NULL; } static void ANTLR3_CDECL freeIntTrie (void * trie) { ((pANTLR3_INT_TRIE)trie)->free((pANTLR3_INT_TRIE)trie); } /** Pointer to a function to return whether the rule has parsed input starting at the supplied * start index before. If the rule has not parsed input starting from the supplied start index, * then it will return ANTLR3_MEMO_RULE_UNKNOWN. If it has parsed from the suppled start point * then it will return the point where it last stopped parsing after that start point. 
* * \remark * The rule memos are an ANTLR3_LIST of ANTLR3_LISTS, however if this becomes any kind of performance * issue (it probably won't, the hash tables are pretty quick) then we could make a special int only * version of the table. */ static ANTLR3_MARKER getRuleMemoization (pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_INTKEY ruleIndex, ANTLR3_MARKER ruleParseStart) { /* The rule memos are an ANTLR3_LIST of ANTLR3_LIST. */ pANTLR3_INT_TRIE ruleList; ANTLR3_MARKER stopIndex; pANTLR3_TRIE_ENTRY entry; /* See if we have a list in the ruleMemos for this rule, and if not, then create one * as we will need it eventually if we are being asked for the memo here. */ entry = recognizer->state->ruleMemo->get(recognizer->state->ruleMemo, (ANTLR3_INTKEY)ruleIndex); if (entry == NULL) { /* Did not find it, so create a new one for it, with a bit depth based on the * size of the input stream. We need the bit depth to incorporate the number if * bits required to represent the largest possible stop index in the input, which is the * last character. An int stream is free to return the largest 64 bit offset if it has * no idea of the size, but you should remember that this will cause the leftmost * bit match algorithm to run to 63 bits, which will be the whole time spent in the trie ;-) */ ruleList = antlr3IntTrieNew(63); /* Depth is theoretically 64 bits, but probably not ;-) */ if (ruleList != NULL) { recognizer->state->ruleMemo->add(recognizer->state->ruleMemo, (ANTLR3_INTKEY)ruleIndex, ANTLR3_HASH_TYPE_STR, 0, ANTLR3_FUNC_PTR(ruleList), freeIntTrie); } /* We cannot have a stopIndex in a trie we have just created of course */ return MEMO_RULE_UNKNOWN; } ruleList = (pANTLR3_INT_TRIE) (entry->data.ptr); /* See if there is a stop index associated with the supplied start index. 
*/ stopIndex = 0; entry = ruleList->get(ruleList, ruleParseStart); if (entry != NULL) { stopIndex = (ANTLR3_MARKER)(entry->data.intVal); } if (stopIndex == 0) { return MEMO_RULE_UNKNOWN; } return stopIndex; } /** Has this rule already parsed input at the current index in the * input stream? Return ANTLR3_TRUE if we have and ANTLR3_FALSE * if we have not. * * This method has a side-effect: if we have seen this input for * this rule and successfully parsed before, then seek ahead to * 1 past the stop token matched for this rule last time. */ static ANTLR3_BOOLEAN alreadyParsedRule (pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_MARKER ruleIndex) { ANTLR3_MARKER stopIndex; pANTLR3_LEXER lexer; pANTLR3_PARSER parser; pANTLR3_TREE_PARSER tparser; pANTLR3_INT_STREAM is; switch (recognizer->type) { case ANTLR3_TYPE_PARSER: parser = (pANTLR3_PARSER) (recognizer->super); tparser = NULL; lexer = NULL; is = parser->tstream->istream; break; case ANTLR3_TYPE_TREE_PARSER: tparser = (pANTLR3_TREE_PARSER) (recognizer->super); parser = NULL; lexer = NULL; is = tparser->ctnstream->tnstream->istream; break; case ANTLR3_TYPE_LEXER: lexer = (pANTLR3_LEXER) (recognizer->super); parser = NULL; tparser = NULL; is = lexer->input->istream; break; default: ANTLR3_FPRINTF(stderr, "Base recognizer function 'alreadyParsedRule' called by unknown parser type - provide override for this function\n"); return ANTLR3_FALSE; break; } /* See if we have a memo marker for this. */ stopIndex = recognizer->getRuleMemoization(recognizer, ruleIndex, is->index(is)); if (stopIndex == MEMO_RULE_UNKNOWN) { return ANTLR3_FALSE; } if (stopIndex == MEMO_RULE_FAILED) { recognizer->state->failed = ANTLR3_TRUE; } else { is->seek(is, stopIndex+1); } /* If here then the rule was executed for this input already */ return ANTLR3_TRUE; } /** Record whether or not this rule parsed the input at this position * successfully. 
*/
static void
memoize(pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_MARKER ruleIndex, ANTLR3_MARKER ruleParseStart)
{
    /* The memo for a rule is an integer trie keyed by parse start index; it
     * is looked up in the recognizer's ruleMemo trie by rule index.
     */
    pANTLR3_INT_TRIE    ruleList;
    pANTLR3_TRIE_ENTRY  entry;
    ANTLR3_MARKER       stopIndex;
    pANTLR3_INT_STREAM  is;

    switch (recognizer->type)
    {
        case ANTLR3_TYPE_PARSER:

            is = ((pANTLR3_PARSER) (recognizer->super))->tstream->istream;
            break;

        case ANTLR3_TYPE_TREE_PARSER:

            is = ((pANTLR3_TREE_PARSER) (recognizer->super))->ctnstream->tnstream->istream;
            break;

        case ANTLR3_TYPE_LEXER:

            is = ((pANTLR3_LEXER) (recognizer->super))->input->istream;
            break;

        default:

            /* The diagnostic names this function so that the message
             * identifies the routine that actually needs an override.
             */
            ANTLR3_FPRINTF(stderr, "Base recognizer function 'memoize' called by unknown parser type - provide override for this function\n");
            return;
    }

    /* A failed rule is recorded with the failure marker, a successful one
     * with the index of the last element it consumed.
     */
    stopIndex = recognizer->state->failed == ANTLR3_TRUE ? MEMO_RULE_FAILED : is->index(is) - 1;

    entry = recognizer->state->ruleMemo->get(recognizer->state->ruleMemo, (ANTLR3_INTKEY)ruleIndex);

    if (entry != NULL)
    {
        ruleList = (pANTLR3_INT_TRIE)(entry->data.ptr);

        /* If we don't already have this entry, append it. The memoize trie does not
         * accept duplicates so it won't add it if already there and we just ignore the
         * return code as we don't care if it is there already.
         */
        ruleList->add(ruleList, ruleParseStart, ANTLR3_HASH_TYPE_INT, stopIndex, NULL, NULL);
    }

    /* When no trie exists for this rule, nothing is recorded.
     */
}

/** A syntactic predicate. Returns true/false depending on whether
 *  the specified grammar fragment matches the current input stream.
 *  This resets the failed instance var afterwards.
*/ static ANTLR3_BOOLEAN synpred (pANTLR3_BASE_RECOGNIZER recognizer, void * ctx, void (*predicate)(void * ctx)) { ANTLR3_MARKER start; pANTLR3_PARSER parser; pANTLR3_TREE_PARSER tparser; pANTLR3_INT_STREAM is; switch (recognizer->type) { case ANTLR3_TYPE_PARSER: parser = (pANTLR3_PARSER) (recognizer->super); tparser = NULL; is = parser->tstream->istream; break; case ANTLR3_TYPE_TREE_PARSER: tparser = (pANTLR3_TREE_PARSER) (recognizer->super); parser = NULL; is = tparser->ctnstream->tnstream->istream; break; default: ANTLR3_FPRINTF(stderr, "Base recognizer function 'synPred' called by unknown parser type - provide override for this function\n"); return ANTLR3_FALSE; break; } /* Begin backtracking so we can get back to where we started after trying out * the syntactic predicate. */ start = is->mark(is); recognizer->state->backtracking++; /* Try the syntactical predicate */ predicate(ctx); /* Reset */ is->rewind(is, start); recognizer->state->backtracking--; if (recognizer->state->failed == ANTLR3_TRUE) { /* Predicate failed */ recognizer->state->failed = ANTLR3_FALSE; return ANTLR3_FALSE; } else { /* Predicate was successful */ recognizer->state->failed = ANTLR3_FALSE; return ANTLR3_TRUE; } } static void reset(pANTLR3_BASE_RECOGNIZER recognizer) { if (recognizer->state->following != NULL) { recognizer->state->following->free(recognizer->state->following); } // Reset the state flags // recognizer->state->errorRecovery = ANTLR3_FALSE; recognizer->state->lastErrorIndex = -1; recognizer->state->failed = ANTLR3_FALSE; recognizer->state->errorCount = 0; recognizer->state->backtracking = 0; recognizer->state->following = NULL; if (recognizer->state != NULL) { if (recognizer->state->ruleMemo != NULL) { recognizer->state->ruleMemo->free(recognizer->state->ruleMemo); recognizer->state->ruleMemo = antlr3IntTrieNew(15); /* 16 bit depth is enough for 32768 rules! 
*/ } } // Install a new following set // recognizer->state->following = antlr3StackNew(8); } // Default implementation is for parser and assumes a token stream as supplied by the runtime. // You MAY need override this function if the standard TOKEN_STREAM is not what you are using. // static void * getCurrentInputSymbol (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_INT_STREAM istream) { return ((pANTLR3_TOKEN_STREAM)istream->super)->_LT((pANTLR3_TOKEN_STREAM)istream->super, 1); } // Default implementation is for parser and assumes a token stream as supplied by the runtime. // You MAY need override this function if the standard COMMON_TOKEN_STREAM is not what you are using. // static void * getMissingSymbol (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_INT_STREAM istream, pANTLR3_EXCEPTION e, ANTLR3_UINT32 expectedTokenType, pANTLR3_BITSET_LIST follow) { pANTLR3_TOKEN_STREAM ts; pANTLR3_COMMON_TOKEN_STREAM cts; pANTLR3_COMMON_TOKEN token; pANTLR3_COMMON_TOKEN current; pANTLR3_STRING text; // Dereference the standard pointers // ts = (pANTLR3_TOKEN_STREAM)istream->super; cts = (pANTLR3_COMMON_TOKEN_STREAM)ts->super; // Work out what to use as the current symbol to make a line and offset etc // If we are at EOF, we use the token before EOF // current = ts->_LT(ts, 1); if (current->getType(current) == ANTLR3_TOKEN_EOF) { current = ts->_LT(ts, -1); } // Create a new empty token // if (recognizer->state->tokFactory == NULL) { // We don't yet have a token factory for making tokens // we just need a fake one using the input stream of the current // token. 
// recognizer->state->tokFactory = antlr3TokenFactoryNew(current->input); } token = recognizer->state->tokFactory->newToken(recognizer->state->tokFactory); // Set some of the token properties based on the current token // token->setLine (token, current->getLine(current)); token->setCharPositionInLine (token, current->getCharPositionInLine(current)); token->setChannel (token, ANTLR3_TOKEN_DEFAULT_CHANNEL); token->setType (token, expectedTokenType); token->user1 = current->user1; token->user2 = current->user2; token->user3 = current->user3; token->custom = current->custom; token->lineStart = current->lineStart; // Create the token text that shows it has been inserted // token->setText8(token, (pANTLR3_UINT8)"getText(token); if (text != NULL) { text->append8(text, (const char *)recognizer->state->tokenNames[expectedTokenType]); text->append8(text, (const char *)">"); } // Finally return the pointer to our new token // return token; } #ifdef ANTLR3_WINDOWS #pragma warning( default : 4100 ) #endif /// @} ///