package org.simantics.scl.compiler.markdown.internal;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.simantics.scl.compiler.markdown.inlines.Subject;
import org.simantics.scl.compiler.markdown.nodes.BlockQuoteNode;
import org.simantics.scl.compiler.markdown.nodes.CodeBlockNode;
import org.simantics.scl.compiler.markdown.nodes.DocumentNode;
import org.simantics.scl.compiler.markdown.nodes.ExtensionBlockNode;
import org.simantics.scl.compiler.markdown.nodes.HeaderNode;
import org.simantics.scl.compiler.markdown.nodes.HorizontalRuleNode;
import org.simantics.scl.compiler.markdown.nodes.HtmlNode;
import org.simantics.scl.compiler.markdown.nodes.ItemNode;
import org.simantics.scl.compiler.markdown.nodes.ListNode;
import org.simantics.scl.compiler.markdown.nodes.Node;
import org.simantics.scl.compiler.markdown.nodes.ParagraphNode;
import org.simantics.scl.compiler.markdown.nodes.Reference;

import gnu.trove.map.hash.THashMap;

/**
 * Line-oriented block parser for Markdown text.
 * <p>
 * Input is consumed one line at a time by {@link #processLine(StringBuilder)},
 * which maintains a tree of block nodes under {@link #root}. When the input is
 * exhausted all open blocks are finalized and inline content of paragraphs and
 * headers is parsed by {@link Subject#parseInlines}.
 * <p>
 * The instance carries mutable parse state ({@code root}, {@code current},
 * {@code lineNumber}, {@code referenceMap}); after {@code parseDocument}
 * returns, {@code current} is {@code null}.
 * NOTE(review): this suggests one parser instance per document - confirm with callers.
 */
public class MarkdownParser {
    public static final boolean DEBUG = false;

    /** Number of leading spaces that turns a line into indented code. */
    public static final int CODE_INDENT = 4;

    // NOTE(review): tab stop used when expanding tabs; the original detab body was
    // lost in extraction, 4 is assumed (matches CODE_INDENT) - confirm.
    private static final int TAB_STOP = 4;

    private DocumentNode root = new DocumentNode();
    /** Deepest block that received the previous line. */
    private Node current = root;
    /** Scratch buffer reused by {@link #detab(StringBuilder)}. */
    private StringBuilder detabBuffer = new StringBuilder();
    private Scanner scanner = new Scanner();
    private int lineNumber = 0;
    // Raw type kept as found in the source; keys are reference labels, values Reference objects.
    private THashMap referenceMap = new THashMap();

    /**
     * Parses the whole content of {@code reader} and returns the document tree.
     * Lines may be terminated by {@code \n}, {@code \r}, {@code \r\n} or {@code \n\r};
     * the second character of a two-character terminator is swallowed.
     * The reader is not closed by this method.
     *
     * @param reader source of the Markdown text
     * @return the root node of the parsed document
     * @throws IOException if reading from {@code reader} fails
     */
    public DocumentNode parseDocument(Reader reader) throws IOException {
        StringBuilder lineBuffer = new StringBuilder();
        // The character that would complete a two-character line terminator
        // started by the previous newline character (0 when none is pending).
        char secondNL = 0;
        while(true) {
            int c = reader.read();
            if(c == -1) {
                // The last (possibly empty) line is processed as well.
                processLine(lineBuffer);
                break;
            }
            else if(c == '\n' || c == '\r') {
                if(lineBuffer.length() == 0 && c == secondNL)
                    secondNL = 0;
                else {
                    processLine(lineBuffer);
                    lineBuffer.delete(0, lineBuffer.length());
                    secondNL = c == '\n' ? '\r' : '\n';
                }
            }
            else
                lineBuffer.append((char)c);
        }
        // Close every block that is still open, from the innermost outwards.
        while(current != null)
            current = finalize(current);
        processInlines(root);
        return root;
    }

    /**
     * Convenience overload that parses Markdown given as a string.
     *
     * @param text the Markdown text
     * @return the root node of the parsed document
     */
    public DocumentNode parseDocument(String text) {
        try {
            return parseDocument(new StringReader(text));
        } catch (IOException e) {
            // Should not be possible
            throw new RuntimeException(e);
        }
    }

    /**
     * Recursively parses inline content: children first, then the node itself
     * if it is a paragraph or a header.
     */
    private void processInlines(Node node) {
        for(Node child = node.firstChild; child != null; child = child.next)
            processInlines(child);
        if(node instanceof ParagraphNode || node instanceof HeaderNode)
            Subject.parseInlines(referenceMap, node);
    }

    /**
     * Incorporates one input line (without its terminator) into the block tree.
     * <p>
     * The work is done in three phases: (1) descend through the currently open
     * blocks and check that the line continues each of them, (2) open any new
     * blocks that start on this line, (3) attach the remaining text to the
     * resulting container or, for a lazy continuation, to the current paragraph.
     *
     * @param line the line content; it is detabbed and a {@code '\n'} is appended
     *             to it so that scanning loops always terminate
     */
    private void processLine(StringBuilder line) {
        ++lineNumber;
        line = detab(line);
        if(DEBUG)
            System.out.println("processLine(" + line + ")");
        line.append('\n'); // Easier to detect eol

        // Phase 1: match the line against the chain of open blocks.
        Node container = root;
        int offset = 0;
        boolean blank = false;
        boolean allMatched = true;
        while(container.lastChild != null && container.lastChild.open) {
            container = container.lastChild;

            int firstNonspace = offset;
            char c;
            while((c=line.charAt(firstNonspace)) == ' ')
                ++firstNonspace;
            int indent = firstNonspace - offset;
            blank = c == '\n';

            if(container instanceof BlockQuoteNode) {
                if(indent <= 3 && c == '>') {
                    offset = firstNonspace + 1;
                    if(line.charAt(offset) == ' ')
                        ++offset;
                }
                else
                    allMatched = false;
            }
            else if(container instanceof ItemNode) {
                ItemNode item = (ItemNode)container;
                if(indent >= item.indentation) {
                    offset += item.indentation;
                }
                else if(blank)
                    offset = firstNonspace;
                else
                    allMatched = false;
            }
            else if(container instanceof CodeBlockNode) {
                CodeBlockNode codeBlock = (CodeBlockNode)container;
                if(!codeBlock.fenced) {
                    if(indent >= CODE_INDENT)
                        offset += CODE_INDENT;
                    else if(blank)
                        offset = firstNonspace;
                    else
                        allMatched = false;
                }
                else {
                    if(indent <= 3 && Scanner.isCloseCodeFence(line, firstNonspace, codeBlock.fenceChar, codeBlock.fenceLength)) {
                        // Closing fence: the line itself is not content.
                        current = finalize(container);
                        return;
                    }
                    else {
                        // Skip at most as many spaces as the opening fence was indented.
                        int i = codeBlock.fenceOffset;
                        while(i > 0 && line.charAt(offset) == ' ') {
                            ++offset;
                            --i;
                        }
                    }
                }
            }
            else if(container instanceof HeaderNode) {
                // A header never continues onto the next line.
                allMatched = false;
            }
            else if(container instanceof HtmlNode) {
                if(blank)
                    allMatched = false;
            }
            else if(container instanceof ParagraphNode) {
                if(blank)
                    allMatched = false;
            }

            if(!allMatched) {
                container = container.parent;
                break;
            }
        }

        Node lastMatchedContainer = container;
        if(DEBUG)
            System.out.println(" lastMatchedContainer = " + lastMatchedContainer.getClass().getSimpleName() + "@" + lastMatchedContainer.hashCode());

        // Two consecutive blank lines break out of the list found on the last-child chain.
        if(blank && container.lastLineBlank) {
            //System.out.println(" DOUBLE BREAK " + container.getClass().getSimpleName() + "@" + container.hashCode());
            Node b = root;
            while(b != null && !(b instanceof ListNode))
                b = b.lastChild;
            if(b != null) {
                while(container != null && container != b)
                    container = finalize(container);
                finalize(b);
                container = b.parent;
            }
        }

        // Phase 2: open new blocks that start on this line.
        boolean maybeLazy = current instanceof ParagraphNode;
        while(!(container instanceof CodeBlockNode) && !(container instanceof HtmlNode)) {
            int firstNonspace = offset;
            char c;
            while((c=line.charAt(firstNonspace)) == ' ')
                ++firstNonspace;
            int indent = firstNonspace - offset;
            blank = c == '\n';

            if(indent >= CODE_INDENT) {
                if(!maybeLazy && !blank) {
                    offset += 4;
                    container = addChild(container, new CodeBlockNode());
                }
                else
                    break;
            }
            else if(c == '>') {
                offset = firstNonspace + 1;
                if(line.charAt(offset) == ' ')
                    ++offset;
                container = addChild(container, new BlockQuoteNode());
            }
            else if(c == '#' && scanner.isAtxHeaderStart(line, firstNonspace)) {
                offset = firstNonspace + scanner.matched;
                container = addChild(container, new HeaderNode(scanner.level, false));
            }
            else if((c == '`' || c == '~') && scanner.isOpenCodeFence(line, firstNonspace, c)) {
                container = addChild(container, new CodeBlockNode(c, scanner.level, firstNonspace - offset));
                offset = firstNonspace + scanner.matched;
            }
            else if(Scanner.isHtmlBlockTag(line, firstNonspace)) {
                container = addChild(container, new HtmlNode());
            }
            else if((c == '=' || c == '-')
                    && container instanceof ParagraphNode
                    && Scanner.isSetextHeaderLine(line, firstNonspace, c)
                    /*&& container.stringContent.indexOf("\n") == -1*/ ) {
                // Setext underline: replace the paragraph by a header holding its text.
                HeaderNode header = new HeaderNode(c == '=' ? 1 : 2, true);
                header.lineNumber = container.lineNumber;
                if(DEBUG)
                    System.out.println(" Replace ParagraphNode@" + System.identityHashCode(container) + " with HeaderNode@" + System.identityHashCode(header));
                header.stringContent = container.stringContent;
                header.parent = container.parent;
                header.prev = container.prev;
                if(header.prev != null)
                    header.prev.next = header;
                if(header.parent.lastChild != null)
                    header.parent.lastChild = header;
                if(header.parent.firstChild == container)
                    header.parent.firstChild = header;
                container = header;
                // NOTE(review): container already equals header here, so this test
                // compares current with the new header. It is deliberately left as
                // is: current is re-pointed to the container at the end of this
                // method, and that code compares current with lastMatchedContainer
                // (the replaced paragraph).
                if(current == container)
                    current = header;
                offset = line.length()-1;
            }
            else if(!(container instanceof ParagraphNode && !allMatched)
                    && (c == '*' || c == '_' || c == '-')
                    && Scanner.isHRule(line, firstNonspace, c)) {
                container = addChild(container, new HorizontalRuleNode());
                container = finalize(container);
                offset = line.length()-1;
            }
            else if((c == '*' || c == '+' || c == '-')
                    && (line.charAt(firstNonspace+1) == ' ' || line.charAt(firstNonspace+1) == '\n')) {
                // Bullet list item.
                int originalOffset = offset;
                offset = firstNonspace + 1;
                int i = 0;
                char c2 = 0;
                while(i <= 5 && (c2 = line.charAt(offset+i)) == ' ')
                    ++i;
                if(i >= 5|| i < 1 || c2 == '\n') {
                    if(i > 0)
                        ++offset;
                }
                else {
                    offset += i;
                }
                if(!(container instanceof ListNode) || !((ListNode)container).isCompatible(c)) {
                    container = addChild(container, new ListNode(c));
                }
                if(DEBUG) {
                    System.out.println(" indentation = " + (offset - originalOffset + (i == 0 ? 1 : 0)));
                }
                container = addChild(container, new ItemNode(offset - originalOffset + (i == 0 ? 1 : 0)));
            }
            else if(Character.isDigit(c) && scanner.isListMarker(line, firstNonspace)) {
                // Ordered list item.
                int originalOffset = offset;
                offset = firstNonspace + scanner.matched;
                int i = 0;
                char c2 = 0;
                while(i <= 5 && (c2 = line.charAt(offset+i)) == ' ')
                    ++i;
                if(i >= 5|| i < 1 || c2 == '\n') {
                    if(i > 0)
                        ++offset;
                }
                else {
                    offset += i;
                }
                if(!(container instanceof ListNode) || !((ListNode)container).isCompatible(scanner.bulletChar)) {
                    container = addChild(container, new ListNode(scanner.bulletChar, scanner.level));
                }
                if(DEBUG) {
                    System.out.println(" indentation = " + (offset - originalOffset + (i == 0 ? 1 : 0)));
                }
                container = addChild(container, new ItemNode(offset - originalOffset + (i == 0 ? 1 : 0)));
            }
            else if(c == ':' && line.charAt(firstNonspace+1) == ':') {
                // Extension block of the form  ::name[content]
                int p=firstNonspace+2;
                while(Character.isAlphabetic(c=line.charAt(p)) || Character.isDigit(c) || c == ' ' || c=='_')
                    ++p;
                if(c != '[')
                    break;
                int bracketBegin = p;
                ++p;
                while(true) {
                    c = line.charAt(p++);
                    if(c == ']') {
                        break;
                    }
                    else if(c == '\\') {
                        // p already points at the character following the backslash.
                        // It always exists because the line ends with '\n'.
                        char escaped = line.charAt(p);
                        if(escaped == '\\' || escaped == ']')
                            ++p;
                    }
                    else if(c == '\n')
                        break;
                }
                if(c == ']') {
                    offset = p;
                    container = addChild(container, new ExtensionBlockNode(
                            line.substring(firstNonspace+2, bracketBegin).trim(),
                            line.substring(bracketBegin+1, p-1).trim()));
                }
                else
                    break;
            }
            else
                break;

            if(container.acceptLines())
                break;
            maybeLazy = false;
        }

        // Phase 3: attach the rest of the line.
        int firstNonspace = offset;
        char c;
        while((c=line.charAt(firstNonspace)) == ' ')
            ++firstNonspace;
        blank = c == '\n';

        if(blank) {
            if(container.lastChild != null)
                container.lastChild.setLastLineBlank(true);
            container.setLastLineBlank(
                    !(container instanceof BlockQuoteNode) &&
                    !(container instanceof HeaderNode) &&
                    !(container instanceof CodeBlockNode && ((CodeBlockNode)container).fenced) &&
                    !(container instanceof ItemNode
                            && container.firstChild == null
                            && container.lineNumber == lineNumber));
        }
        else
            container.setLastLineBlank(false);

        // Clear the flag on every ancestor of the container.
        for(Node cont = container;cont.parent != null; cont = cont.parent, cont.setLastLineBlank(false));

        if(DEBUG) {
            System.out.println(" current = " + current.getClass().getSimpleName() + "@" + current.hashCode());
            System.out.println(" container = " + container.getClass().getSimpleName() + "@" + container.hashCode());
        }

        if(current != lastMatchedContainer
                && container == lastMatchedContainer
                && !blank
                && current instanceof ParagraphNode
                && current.stringContent != null) {
            // Lazy continuation: the line is appended to the open paragraph.
            addLine(current, line, offset);
        }
        else {
            // Close the blocks that this line did not continue.
            while(current != lastMatchedContainer)
                current = finalize(current);

            if(container instanceof CodeBlockNode || container instanceof HtmlNode)
                addLine(container, line, offset);
            else if(blank)
                ; // do nothing
            else if(container.acceptLines()) {
                if(container instanceof HeaderNode && !((HeaderNode)container).setext)
                    chopTrailingHashtags(line, firstNonspace);
                addLine(container, line, firstNonspace);
            }
            else {
                container = addChild(container, new ParagraphNode());
                addLine(container, line, firstNonspace);
            }
            current = container;
        }
    }

    /**
     * Removes trailing spaces/newlines from {@code line} and, if the line then
     * ends with a run of {@code '#'} preceded by a space, removes that run and
     * the spaces before it. Nothing before {@code firstNonspace} is removed by
     * the second step.
     */
    private void chopTrailingHashtags(StringBuilder line, int firstNonspace) {
        //System.out.println("chopTrailingHashtags("+line.substring(firstNonspace)+")");
        int pos = line.length()-1;
        char c=0;
        while(pos >= 0 && ((c=line.charAt(pos)) == ' ' || c == '\n'))
            --pos;
        line.delete(pos+1, line.length());
        if(c == '#') {
            --pos;
            while(pos >= 0 && (c=line.charAt(pos)) == '#')
                --pos;
            // The closing sequence only counts when a space precedes it.
            if(c != ' ')
                return;
            --pos;
            while(pos >= 0 && line.charAt(pos) == ' ')
                --pos;
            ++pos;
            if(pos < firstNonspace)
                pos = firstNonspace;
            line.delete(pos, line.length());
        }
    }

    /**
     * Appends {@code line} from {@code offset} (excluding one trailing
     * {@code '\n'}, if present) to the string content of {@code container}.
     * Successive lines are separated by {@code '\n'}.
     */
    private void addLine(Node container, StringBuilder line, int offset) {
        if(container.stringContent == null)
            container.stringContent = new StringBuilder();
        else
            container.stringContent.append('\n');
        int length = line.length();
        if(length > 0 && line.charAt(length-1) == '\n')
            --length;
        if(DEBUG)
            System.out.println(" addLine(" + container.getClass().getSimpleName() + "@" + container.hashCode() + ", \"" + line.substring(offset, length) + "\")");
        container.stringContent.append(line, offset, length);
    }

    /**
     * Expands tab characters to spaces up to the next tab stop.
     * <p>
     * If {@code str} contains no tab it is returned unchanged; otherwise the
     * expanded text is written to the shared {@code detabBuffer}, which is
     * returned (and overwritten by the next call that finds a tab).
     * NOTE(review): the body of this method was garbled in the source and has
     * been reconstructed from the remaining structure - confirm against history.
     */
    private StringBuilder detab(StringBuilder str) {
        int length = str.length();
        for(int i=0;i<length;++i) {
            if(str.charAt(i) == '\t') {
                detabBuffer.delete(0, detabBuffer.length());
                // Everything before the first tab is copied verbatim.
                detabBuffer.append(str, 0, i);
                for(;i<length;++i) {
                    char c = str.charAt(i);
                    if(c == '\t') {
                        // The buffer length is the current output column.
                        int spaces = TAB_STOP - detabBuffer.length() % TAB_STOP;
                        while(spaces-- > 0)
                            detabBuffer.append(' ');
                    }
                    else
                        detabBuffer.append(c);
                }
                return detabBuffer;
            }
        }
        return str;
    }

    /**
     * Adds {@code child} under {@code parent}, first finalizing {@code parent}
     * (and moving to its ancestors) until a node that can contain the child is
     * reached.
     *
     * @return {@code child}
     */
    private Node addChild(Node parent, Node child) {
        child.lineNumber = lineNumber;
        if(DEBUG)
            System.out.println(" addChild(" + parent.getClass().getSimpleName() + "@" + parent.hashCode() + ", " + child.getClass().getSimpleName() + "@" + child.hashCode() + ")");
        while(!parent.canContain(child))
            parent = finalize(parent);
        parent.addChild(child);
        return child;
    }

    /**
     * Closes {@code node} and performs the per-type post-processing:
     * reference definitions are extracted from paragraphs, the info string is
     * split off fenced code blocks, trailing blank lines are removed from
     * indented code blocks and list tightness is computed.
     *
     * @return the parent the node had when this method was called
     */
    private Node finalize(Node node) {
        // Captured first: parseReferenceInline may call node.remove(), and what
        // that does to node.parent is not visible here.
        Node parent = node.parent;
        node.open = false;

        if(node instanceof ParagraphNode) {
            parseReferenceInline(node);
        }
        else if(node instanceof HeaderNode) {
            if(node.stringContent == null)
                node.stringContent = new StringBuilder(0);
        }
        else if(node instanceof CodeBlockNode) {
            CodeBlockNode codeBlock = (CodeBlockNode)node;
            if(codeBlock.fenced) {
                // The first stored line is the rest of the opening fence line.
                int firstLineLength = codeBlock.stringContent.indexOf("\n");
                String infoString;
                if(firstLineLength == -1) {
                    infoString = codeBlock.stringContent.toString().trim();
                    codeBlock.stringContent = new StringBuilder(0);
                }
                else {
                    infoString = codeBlock.stringContent.substring(0, firstLineLength).trim();
                    codeBlock.stringContent.delete(0, firstLineLength+1);
                }
                codeBlock.infoString = Reference.cleanUrl(infoString);
            }
            else {
                removeTrailingBlankLines(codeBlock.stringContent);
            }
        }
        else if(node instanceof ListNode) {
            ListNode list = (ListNode)node;
            list.tight = true;
            itemLoop: for(Node item=list.firstChild;item != null;item = item.next) {
                // A blank line between two items makes the list loose.
                if(item.lastLineBlank && item.next != null) {
                    list.tight = false;
                    break;
                }
                // So does a blank line between blocks inside the list.
                for(Node child=item.firstChild;child != null;child = child.next)
                    if(endsWithBlankLine(child) && (child.next != null || item.next != null)) {
                        list.tight = false;
                        break itemLoop;
                    }
            }
        }
        return parent;
    }

    /**
     * Returns whether {@code node}, or the last list/item reached by following
     * last children from it, has its last-line-blank flag set.
     */
    private static boolean endsWithBlankLine(Node node) {
        while(true) {
            if(node.lastLineBlank)
                return true;
            node = node.lastChild;
            // Also covers node == null: instanceof is false for null.
            if(!(node instanceof ListNode) && !(node instanceof ItemNode))
                return false;
        }
    }

    /**
     * Truncates {@code str} at the first newline of its trailing run of
     * lines that contain only spaces.
     */
    private static void removeTrailingBlankLines(StringBuilder str) {
        int endPos = str.length();
        int pos = endPos-1;
        while(pos >= 0) {
            char c = str.charAt(pos);
            if(c == '\n')
                endPos = pos;
            else if(c != ' ')
                break;
            --pos;
        }
        if(endPos < str.length())
            str.delete(endPos, str.length());
    }

    /**
     * Strips link reference definitions ({@code [label]: url "title"}) from the
     * beginning of the node's string content and records them in
     * {@code referenceMap}; the first definition of a label wins. If the whole
     * content is consumed the node is removed.
     */
    private void parseReferenceInline(Node node) {
        StringBuilder input = node.stringContent;
        while(true) {
            int offset = 0;
            if(offset == input.length() || input.charAt(offset) != '[')
                return;

            // Label
            offset = Scanner.scanLinkLabel(input, offset);
            if(offset == -1 || offset == input.length() || input.charAt(offset) != ':')
                return;
            String label = input.substring(1, offset-1);
            ++offset;

            // Url
            offset = spnl(input, offset);
            int linkStart = offset;
            offset = Scanner.scanLinkUrl(input, offset);
            if(offset == -1 || offset == linkStart)
                return;
            String url;
            if(linkStart < input.length() && input.charAt(linkStart) == '<')
                url = input.substring(linkStart+1, offset-1);
            else
                url = input.substring(linkStart, offset);
            url = Reference.cleanUrl(url);

            // Title
            int linkUrlEnd = offset;
            offset = spnl(input, offset);
            int titleStart = offset;
            offset = Scanner.scanLinkTitle(input, offset);
            String title;
            if(offset == -1) {
                // No title: only spaces may follow the url on its line.
                offset = linkUrlEnd;
                char c = 0;
                while(offset < input.length() && (c = input.charAt(offset)) == ' ')
                    ++offset;
                if(c == '\n')
                    ++offset;
                else if(offset != input.length())
                    return;
                title = "";
            }
            else {
                title = input.substring(titleStart+1, offset-1);
                title = Reference.cleanTitle(title);
                // Only spaces may follow the title on its line.
                char c = 0;
                while(offset < input.length() && (c = input.charAt(offset)) == ' ')
                    ++offset;
                if(c == '\n')
                    ++offset;
                else if(offset != input.length())
                    return;
            }

            /*System.out.println("Reference:");
            System.out.println(" label = '" + label + "'");
            System.out.println(" url = '" + url + "'");
            System.out.println(" title = '" + title + "'");*/
            Reference reference = new Reference(Reference.normalizeLabel(label), url, title);
            if(!referenceMap.contains(reference.label))
                referenceMap.put(reference.label, reference);

            if(offset == input.length()) {
                node.remove();
                return;
            }
            else
                input.delete(0, offset);
        }
    }

    /**
     * Skips spaces and at most one newline starting at {@code offset}.
     *
     * @return the index of the first character not skipped
     */
    private static int spnl(StringBuilder input, int offset) {
        boolean seenNewline = false;
        while(offset < input.length()) {
            char c = input.charAt(offset);
            if(c == ' ')
                ++offset;
            else if(c == '\n') {
                if(seenNewline)
                    return offset;
                else {
                    seenNewline = true;
                    ++offset;
                }
            }
            else
                return offset;
        }
        return offset;
    }
}