]> gerrit.simantics Code Review - simantics/platform.git/blob - bundles/org.simantics.scl.compiler/src/org/simantics/scl/compiler/markdown/internal/MarkdownParser.java
migrated to svn revision 33108
[simantics/platform.git] / bundles / org.simantics.scl.compiler / src / org / simantics / scl / compiler / markdown / internal / MarkdownParser.java
1 package org.simantics.scl.compiler.markdown.internal;
2
3 import java.io.IOException;
4 import java.io.Reader;
5 import java.io.StringReader;
6
7 import org.simantics.scl.compiler.markdown.inlines.Subject;
8 import org.simantics.scl.compiler.markdown.nodes.BlockQuoteNode;
9 import org.simantics.scl.compiler.markdown.nodes.CodeBlockNode;
10 import org.simantics.scl.compiler.markdown.nodes.DocumentNode;
11 import org.simantics.scl.compiler.markdown.nodes.ExtensionBlockNode;
12 import org.simantics.scl.compiler.markdown.nodes.HeaderNode;
13 import org.simantics.scl.compiler.markdown.nodes.HorizontalRuleNode;
14 import org.simantics.scl.compiler.markdown.nodes.HtmlNode;
15 import org.simantics.scl.compiler.markdown.nodes.ItemNode;
16 import org.simantics.scl.compiler.markdown.nodes.ListNode;
17 import org.simantics.scl.compiler.markdown.nodes.Node;
18 import org.simantics.scl.compiler.markdown.nodes.ParagraphNode;
19 import org.simantics.scl.compiler.markdown.nodes.Reference;
20
21 import gnu.trove.map.hash.THashMap;
22
23 public class MarkdownParser {
24     public static final boolean DEBUG = false;
25     public static final int CODE_INDENT = 4;
26     
27     private DocumentNode root = new DocumentNode();
28     private Node current = root;
29     
30     private StringBuilder detabBuffer = new StringBuilder();
31     private Scanner scanner = new Scanner();
32     private int lineNumber = 0;
33     
34     private THashMap<String, Reference> referenceMap = new THashMap<String, Reference>();
35     
36     public DocumentNode parseDocument(Reader reader) throws IOException {
37         StringBuilder lineBuffer = new StringBuilder();
38         char secondNL = 0;
39         while(true) {
40             int c = reader.read();
41             if(c == -1) {
42                 processLine(lineBuffer);
43                 break;
44             }
45             else if(c == '\n' || c == '\r') {
46                 if(lineBuffer.length() == 0 && c == secondNL)
47                     secondNL = 0;
48                 else {
49                     processLine(lineBuffer);
50                     lineBuffer.delete(0, lineBuffer.length());
51                     secondNL = c == '\n' ? '\r' : '\n';
52                 }
53             }
54             else
55                 lineBuffer.append((char)c);
56         }
57         while(current != null)
58             current = finalize(current);
59         processInlines(root);
60         return root;
61     }
62     
63     public DocumentNode parseDocument(String text) {
64         try {
65             return parseDocument(new StringReader(text));
66         } catch (IOException e) {
67             // Should not be possible
68             throw new RuntimeException(e);
69         }
70     }
71
72     private void processInlines(Node node) {
73         for(Node child = node.firstChild; child != null; child = child.next)
74             processInlines(child);
75         if(node instanceof ParagraphNode || node instanceof HeaderNode)
76             Subject.parseInlines(referenceMap, node);
77     }
78
79     private void processLine(StringBuilder line) {
80         ++lineNumber;
81         line = detab(line);
82         if(DEBUG)
83             System.out.println("processLine(" + line + ")");
84         line.append('\n'); // Easier to detect eol
85         
86         Node container = root;
87         
88         int offset = 0; 
89         boolean blank = false;        
90         boolean allMatched = true;
91         while(container.lastChild != null && container.lastChild.open) {
92             container = container.lastChild;
93             
94             int firstNonspace = offset;
95             char c;
96             while((c=line.charAt(firstNonspace)) == ' ')
97                 ++firstNonspace;
98             
99             int indent = firstNonspace - offset;
100             blank = c == '\n';
101             
102             if(container instanceof BlockQuoteNode) {
103                 if(indent <= 3 && c == '>') {
104                     offset = firstNonspace + 1;
105                     if(line.charAt(offset) == ' ')
106                         ++offset;
107                 }
108                 else
109                     allMatched = false;
110             }
111             else if(container instanceof ItemNode) {
112                 ItemNode item = (ItemNode)container;
113                 if(indent >= item.indentation) {
114                     offset += item.indentation;
115                 }
116                 else if(blank)
117                     offset = firstNonspace;
118                 else
119                     allMatched = false;
120             }
121             else if(container instanceof CodeBlockNode) {
122                 CodeBlockNode codeBlock = (CodeBlockNode)container;
123                 if(!codeBlock.fenced) {
124                     if(indent >= CODE_INDENT)
125                         offset += CODE_INDENT;
126                     else if(blank)
127                         offset = firstNonspace;
128                     else
129                         allMatched = false;
130                 }
131                 else {
132                     if(indent <= 3 &&
133                             Scanner.isCloseCodeFence(line, firstNonspace,
134                                 codeBlock.fenceChar, codeBlock.fenceLength)) {
135                         current = finalize(container);
136                         return;
137                     }
138                     else {
139                         int i = codeBlock.fenceOffset;
140                         while(i > 0 && line.charAt(offset) == ' ') {
141                             ++offset;
142                             --i;
143                         }
144                     }
145                 }
146             }
147             else if(container instanceof HeaderNode) {
148                 allMatched = false;
149             }
150             else if(container instanceof HtmlNode) {
151                 if(blank)
152                     allMatched = false;
153             }
154             else if(container instanceof ParagraphNode) {
155                 if(blank)
156                     allMatched = false;
157             }
158             
159             if(!allMatched) {
160                 container = container.parent;
161                 break;
162             }
163         }
164         
165         Node lastMatchedContainer = container;
166         if(DEBUG)
167             System.out.println("    lastMatchedContainer = " + lastMatchedContainer.getClass().getSimpleName() + "@" + lastMatchedContainer.hashCode());
168         
169         if(blank && container.lastLineBlank) {
170             //System.out.println("    DOUBLE BREAK " + container.getClass().getSimpleName() + "@" + container.hashCode());
171             Node b = root;
172             while(b != null && !(b instanceof ListNode))
173                 b = b.lastChild;
174             
175             if(b != null) {
176                 while(container != null && container != b)
177                     container = finalize(container);
178                 finalize(b);
179                 container = b.parent;
180             }
181         }
182         
183         boolean maybeLazy = current instanceof ParagraphNode;
184         while(!(container instanceof CodeBlockNode) && !(container instanceof HtmlNode)) {
185             int firstNonspace = offset;
186             char c;
187             while((c=line.charAt(firstNonspace)) == ' ')
188                 ++firstNonspace;
189             
190             int indent = firstNonspace - offset;
191             blank = c == '\n';
192             
193             if(indent >= CODE_INDENT) {
194                 if(!maybeLazy && !blank) {
195                     offset += 4;
196                     container = addChild(container, new CodeBlockNode());
197                 }
198                 else
199                     break;
200             }
201             else if(c == '>') {
202                 offset = firstNonspace + 1;
203                 if(line.charAt(offset) == ' ')
204                     ++offset;
205                 container = addChild(container, new BlockQuoteNode());
206             }
207             else if(c == '#' && scanner.isAtxHeaderStart(line, firstNonspace)) {
208                 offset = firstNonspace + scanner.matched;
209                 container = addChild(container, new HeaderNode(scanner.level, false));
210             }
211             else if((c == '`' || c == '~') && scanner.isOpenCodeFence(line, firstNonspace, c)) {
212                 container = addChild(container, new CodeBlockNode(c, scanner.level, firstNonspace - offset));
213                 offset = firstNonspace + scanner.matched;
214             }
215             else if(Scanner.isHtmlBlockTag(line, firstNonspace)) {
216                 container = addChild(container, new HtmlNode());
217             }
218             else if((c == '=' || c == '-') 
219                     && container instanceof ParagraphNode
220                     && Scanner.isSetextHeaderLine(line, firstNonspace, c)
221                     && container.stringContent.indexOf("\n") == -1
222                     ) {
223                 HeaderNode header = new HeaderNode(c == '=' ? 1 : 2, true);
224                 header.lineNumber = container.lineNumber;
225                 if(DEBUG)
226                     System.out.println("    Replace ParagraphNode@" + System.identityHashCode(container) + " with HeaderNode@" + System.identityHashCode(header));
227                 header.stringContent = container.stringContent;
228                 header.parent = container.parent;
229                 header.prev = container.prev;
230                 if(header.prev != null)
231                     header.prev.next = header;
232                 if(header.parent.lastChild != null)
233                     header.parent.lastChild = header;
234                 if(header.parent.firstChild == container)
235                     header.parent.firstChild = header;
236                 container = header;
237                 if(current == container)
238                     current = header;
239                 offset = line.length()-1;
240             }
241             else if(!(container instanceof ParagraphNode && !allMatched)
242                     && (c == '*' || c == '_' || c == '-')
243                     && Scanner.isHRule(line, firstNonspace, c)) {
244                 container = addChild(container, new HorizontalRuleNode());
245                 container = finalize(container);
246                 offset = line.length()-1;
247             }
248             else if((c == '*' || c == '+' || c == '-') && 
249                     (line.charAt(firstNonspace+1) == ' ' || line.charAt(firstNonspace+1) == '\n')) {
250                 int originalOffset = offset;
251                 offset = firstNonspace + 1;
252                 int i = 0;
253                 char c2 = 0;
254                 while(i <= 5 && (c2 = line.charAt(offset+i)) == ' ')
255                     ++i;
256                 if(i >= 5|| i < 1 || c2 == '\n') {
257                     if(i > 0)
258                         ++offset;
259                 }
260                 else {
261                     offset += i;
262                 }
263                 
264                 if(!(container instanceof ListNode) || 
265                         !((ListNode)container).isCompatible(c)) {
266                     container = addChild(container, new ListNode(c));
267                 }
268                 
269                 if(DEBUG) {
270                     System.out.println("    indentation = " + (offset - originalOffset + (i == 0 ? 1 : 0)));
271                 }
272                 container = addChild(container, new ItemNode(offset - originalOffset + (i == 0 ? 1 : 0)));
273             }
274             else if(Character.isDigit(c) && scanner.isListMarker(line, firstNonspace)) {
275                 int originalOffset = offset;
276                 offset = firstNonspace + scanner.matched;
277                 int i = 0;
278                 char c2 = 0;
279                 while(i <= 5 && (c2 = line.charAt(offset+i)) == ' ')
280                     ++i;
281                 if(i >= 5|| i < 1 || c2 == '\n') {
282                     if(i > 0)
283                         ++offset;
284                 }
285                 else {
286                     offset += i;
287                 }
288                 
289                 if(!(container instanceof ListNode) || 
290                         !((ListNode)container).isCompatible(scanner.bulletChar)) {
291                     container = addChild(container, new ListNode(scanner.bulletChar, scanner.level));
292                 }
293                 
294                 if(DEBUG) {
295                     System.out.println("    indentation = " + (offset - originalOffset + (i == 0 ? 1 : 0)));
296                 }
297                 container = addChild(container, new ItemNode(offset - originalOffset + (i == 0 ? 1 : 0)));
298             }
299             else if(c == ':' && line.charAt(firstNonspace+1) == ':') {
300                 int p=firstNonspace+2;
301                 while(Character.isAlphabetic(c=line.charAt(p)) || Character.isDigit(c) || c == ' ' || c=='_')
302                     ++p;
303                 if(c != '[')
304                     break;
305                 int bracketBegin = p;
306                 ++p;
307                 while(true) {
308                     c = line.charAt(p++);
309                     if(c == ']') {
310                         break;
311                     }
312                     else if(c == '\\' && ((c=line.charAt(p+1)) == '\\' || c == ']')) {
313                         ++p;
314                     }
315                     else if(c == '\n')
316                         break;
317                 }
318                 if(c == ']') {
319                     offset = p;
320                     container = addChild(container, new ExtensionBlockNode(
321                             line.substring(firstNonspace+2, bracketBegin).trim(),
322                             line.substring(bracketBegin+1, p-1).trim()));
323                 }
324                 else
325                     break;
326             }
327             else
328                 break;
329             
330             if(container.acceptLines())
331                 break;
332             maybeLazy = false;
333         }
334         
335         int firstNonspace = offset;
336         char c;
337         while((c=line.charAt(firstNonspace)) == ' ')
338             ++firstNonspace;
339         
340         blank = c == '\n';
341         
342         if(blank) {
343             if(container.lastChild != null)
344                 container.lastChild.setLastLineBlank(true);
345             container.setLastLineBlank(
346                     !(container instanceof BlockQuoteNode) &&
347                     !(container instanceof HeaderNode) &&
348                     !(container instanceof CodeBlockNode && ((CodeBlockNode)container).fenced) &&
349                     !(container instanceof ItemNode &&
350                             container.firstChild == null &&
351                             container.lineNumber == lineNumber));
352         }
353         else
354             container.setLastLineBlank(false);
355         for(Node cont = container;cont.parent != null;
356                 cont = cont.parent, cont.setLastLineBlank(false));
357         
358         if(DEBUG) {
359             System.out.println("    current = " + current.getClass().getSimpleName() + "@" + current.hashCode());
360             System.out.println("    container = " + container.getClass().getSimpleName() + "@" + container.hashCode());
361         }
362         if(current != lastMatchedContainer &&
363                 container == lastMatchedContainer &&
364                 !blank &&
365                 current instanceof ParagraphNode &&
366                 current.stringContent != null) {
367             addLine(current, line, offset);
368         }
369         else {
370             while(current != lastMatchedContainer)
371                 current = finalize(current);
372             
373             if(container instanceof CodeBlockNode ||
374                     container instanceof HtmlNode)
375                 addLine(container, line, offset);
376             else if(blank)
377                 ; // do nothing
378             else if(container.acceptLines()) {
379                 if(container instanceof HeaderNode &&
380                         !((HeaderNode)container).setext)
381                     chopTrailingHashtags(line, firstNonspace);
382                 addLine(container, line, firstNonspace);
383             }
384             else {
385                 container = addChild(container, new ParagraphNode());
386                 addLine(container, line, firstNonspace);
387             }
388             
389             current = container;
390         }
391     }
392     
393     private void chopTrailingHashtags(StringBuilder line, int firstNonspace) {
394         //System.out.println("chopTrailingHashtags("+line.substring(firstNonspace)+")");
395         int pos = line.length()-1;
396         char c=0;
397         while(pos >= 0 && ((c=line.charAt(pos)) == ' ' || c == '\n'))
398             --pos;
399         line.delete(pos+1, line.length());
400         if(c == '#') {
401             --pos;
402             while(pos >= 0 && (c=line.charAt(pos)) == '#')
403                 --pos;
404             if(c != ' ')
405                 return;
406             --pos;
407             while(pos >= 0 && line.charAt(pos) == ' ')
408                 --pos;
409             ++pos;
410             if(pos < firstNonspace)
411                 pos = firstNonspace;
412             line.delete(pos, line.length());
413         }
414     }
415
416     private void addLine(Node container, StringBuilder line, int offset) {
417         if(container.stringContent == null)
418             container.stringContent = new StringBuilder();
419         else
420             container.stringContent.append('\n');
421         int length = line.length();
422         if(length > 0 && line.charAt(length-1) == '\n')
423             --length;
424         if(DEBUG)
425             System.out.println("    addLine(" + container.getClass().getSimpleName() + "@" + container.hashCode() + ", \"" + line.substring(offset, length) + "\")");
426         container.stringContent.append(line, offset, length);
427     }
428
429     private StringBuilder detab(StringBuilder str) {
430         int length = str.length();
431         for(int i=0;i<length;++i) {
432             if(str.charAt(i) == '\t') {
433                 detabBuffer.delete(0, detabBuffer.length());
434                 detabBuffer.append(str, 0, i);
435                 for(;i<length;++i) {
436                     char c = str.charAt(i);
437                     if(c == '\t') {
438                         int spaces = 4 - detabBuffer.length()%4;
439                         while(spaces-- > 0)
440                             detabBuffer.append(' ');
441                     }
442                     else
443                         detabBuffer.append(c);
444                 }
445                 return detabBuffer;
446             }
447         }
448         return str;
449     }
450     
451     private Node addChild(Node parent, Node child) {
452         child.lineNumber = lineNumber;
453         if(DEBUG)
454             System.out.println("    addChild(" + parent.getClass().getSimpleName() + "@" + parent.hashCode() + ", " + 
455                     child.getClass().getSimpleName() + "@" + child.hashCode() + ")");
456         while(!parent.canContain(child))
457             parent = finalize(parent);
458         parent.addChild(child);
459         return child;
460     }
461
462     private Node finalize(Node node) {
463         node.open = false;
464         if(node instanceof ParagraphNode) {
465             parseReferenceInline(node);
466         }
467         else if(node instanceof HeaderNode) {
468             if(node.stringContent == null)
469                 node.stringContent = new StringBuilder(0);
470         }
471         else if(node instanceof CodeBlockNode) {
472             CodeBlockNode codeBlock = (CodeBlockNode)node;
473             if(codeBlock.fenced) {
474                 int firstLineLength = codeBlock.stringContent.indexOf("\n");
475                 String infoString;
476                 if(firstLineLength == -1) {
477                     infoString = codeBlock.stringContent.toString().trim();
478                     codeBlock.stringContent = new StringBuilder(0);
479                 }
480                 else {
481                     infoString = codeBlock.stringContent.substring(0, firstLineLength).trim();
482                     codeBlock.stringContent.delete(0, firstLineLength+1);
483                 }
484                 codeBlock.infoString = Reference.cleanUrl(infoString);
485             }
486             else {
487                 removeTrailingBlankLines(codeBlock.stringContent);
488             }
489         }
490         else if(node instanceof ListNode) {
491             ListNode list = (ListNode)node;
492             list.tight = true;
493             itemLoop: for(Node item=list.firstChild;item != null;item = item.next) {
494                 if(item.lastLineBlank && item.next != null) {
495                     list.tight = false;
496                     break;
497                 }
498                 for(Node child=item.firstChild;child != null;child = child.next)
499                     if(endsWithBlankLine(child) && (child.next != null || item.next != null)) {
500                         list.tight = false;
501                         break itemLoop;
502                     }
503             }
504         }
505         return node.parent;
506     }
507
508     private static boolean endsWithBlankLine(Node node) {
509         while(true) {
510             if(node.lastLineBlank)
511                 return true;
512             node = node.lastChild;
513             if(!(node instanceof ListNode) && !(node instanceof ItemNode))
514                 return false;
515         }
516     }
517
518     private static void removeTrailingBlankLines(StringBuilder str) {
519         int endPos = str.length();
520         int pos = endPos-1;
521         while(pos >= 0) {
522             char c = str.charAt(pos);
523             if(c == '\n')
524                 endPos = pos;
525             else if(c != ' ')
526                 break;
527             --pos;
528         }
529         if(endPos < str.length())
530             str.delete(endPos, str.length());
531     }
532     
533     private void parseReferenceInline(Node node) {
534         StringBuilder input = node.stringContent;
535         
536         while(true) {
537             int offset = 0;
538             if(offset == input.length() || input.charAt(offset) != '[')
539                 return;
540             
541             // Label
542             offset = Scanner.scanLinkLabel(input, offset);
543             if(offset == -1 || offset == input.length() 
544                     || input.charAt(offset) != ':')
545                 return;
546             String label = input.substring(1, offset-1);
547             ++offset;
548             
549             // Url
550             offset = spnl(input, offset);
551             int linkStart = offset;
552             offset = Scanner.scanLinkUrl(input, offset);
553             if(offset == -1 || offset == linkStart)
554                 return;
555             String url;
556             if(linkStart < input.length() && input.charAt(linkStart) == '<')
557                 url = input.substring(linkStart+1, offset-1);
558             else
559                 url = input.substring(linkStart, offset);
560             url = Reference.cleanUrl(url);
561             
562             // Title
563             int linkUrlEnd = offset;
564             offset = spnl(input, offset);
565             int titleStart = offset;
566             offset = Scanner.scanLinkTitle(input, offset);
567             String title;
568             if(offset == -1) {
569                 offset = linkUrlEnd;
570                 char c = 0;
571                 while(offset < input.length() && (c = input.charAt(offset)) == ' ')
572                     ++offset;
573                 if(c == '\n')
574                     ++offset;
575                 else if(offset != input.length())
576                     return;
577                 title = "";
578             }
579             else {
580                 title = input.substring(titleStart+1, offset-1);
581                 title = Reference.cleanTitle(title);
582                 char c = 0;
583                 while(offset < input.length() && (c = input.charAt(offset)) == ' ')
584                     ++offset;
585                 if(c == '\n')
586                     ++offset;
587                 else if(offset != input.length())
588                     return;
589             }
590             /*System.out.println("Reference:");
591             System.out.println("    label = '" + label + "'");
592             System.out.println("    url = '" + url + "'");
593             System.out.println("    title = '" + title + "'");*/
594             Reference reference = new Reference(Reference.normalizeLabel(label), url, title);
595             if(!referenceMap.contains(reference.label))
596                 referenceMap.put(reference.label, reference);
597             
598             if(offset == input.length()) {
599                 node.remove();
600                 return;
601             }
602             else
603                 input.delete(0, offset);
604         }        
605     }
606     
607     private static int spnl(StringBuilder input, int offset) {
608         boolean seenWhitespace = false;
609         while(offset < input.length()) {
610             char c = input.charAt(offset);
611             if(c == ' ')
612                 ++offset;
613             else if(c == '\n') {
614                 if(seenWhitespace)
615                     return offset;
616                 else {
617                     seenWhitespace = true;
618                     ++offset;
619                 }
620             }
621             else
622                 return offset;
623         }
624         return offset;
625     }
626
627 }