// Source: gerrit.simantics Code Review, repository simantics/platform.git (migrated to svn revision 33108)
// File: bundles/org.simantics.scl.compiler/src/org/simantics/scl/compiler/markdown/internal/Scanner.java
1 package org.simantics.scl.compiler.markdown.internal;
2
3 import org.simantics.scl.compiler.markdown.inlines.Subject;
4
5 import gnu.trove.set.hash.THashSet;
6
7 public class Scanner {
8
9     public int level;
10     public int matched;
11     public char bulletChar;
12     
13     public static boolean isCloseCodeFence(StringBuilder line, int offset, char fenceChar, int fenceLength) {
14         int matched = 0;
15         while(line.charAt(offset) == fenceChar) {
16             ++offset;
17             ++matched;
18         }
19         if(matched < fenceLength)
20             return false;
21         while(true) {
22             char c = line.charAt(offset++);
23             if(c == '\n')
24                 return true;
25             else if(c != ' ')
26                 return false;
27         }
28     }
29     
30     public static boolean isSetextHeaderLine(StringBuilder line, int offset, char headerLineChar) {
31         char c;
32         while((c = line.charAt(offset)) == headerLineChar)
33             ++offset;
34         while(c == ' ') {
35             ++offset;
36             c = line.charAt(offset);
37         }
38         return c == '\n';
39     }
40
41     public static boolean isHRule(StringBuilder line, int offset, char hrChar) {
42         char c;
43         int count = 0;
44         while((c = line.charAt(offset)) != '\n') {
45             if(c == hrChar)
46                 ++count;
47             else if(c != ' ')
48                 return false;
49             ++offset;
50         }
51         return count >= 3;
52     }
53
54     public static boolean isHtmlBlockTag(StringBuilder line, int offset) {
55         if(line.charAt(offset) != '<')
56             return false;
57         ++offset;
58         char c = line.charAt(offset);
59         
60         // HTML comment, processing instruction, CDATA or entity definition
61         if(c == '!' || c == '?')
62             return true;
63
64         // Ending tag
65         if(c == '/') {
66             ++offset;
67             offset = scanTag(line, offset);
68             if(offset == -1)
69                 return false;
70             c = line.charAt(offset);
71             return c == ' ' || c == '>';
72         }
73         
74         // Beginning tag
75         offset = scanTag(line, offset);
76         if(offset == -1)
77             return false;
78         c = line.charAt(offset);
79         return c == ' ' || c == '/' || c == '>';
80     }
81     
82     public static int scanTag(StringBuilder line, int offset) {
83         StringBuilder b = new StringBuilder();
84         while(offset < line.length()) {
85             char c = line.charAt(offset);
86             if( (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') )
87                 b.append(c);
88             else if( c >= 'A' && c <= 'Z' )
89                 b.append(Character.toLowerCase(c));
90             else
91                 break;
92             ++offset;
93         }
94         if(HTML_BLOCK_TAG_SET.contains(b.toString()))
95             return offset;
96         else
97             return -1;
98     }
99     
100     private static final String[] HTML_BLOCK_TAGS = new String[] {
101         "article", "header", "aside", "hgroup", "blockquote", "hr", "iframe", "body", "li", "map", "button", "object", "canvas", "ol",
102         "caption", "output", "col", "p", "colgroup", "pre", "dd", "progress", "div", "section", "dl", "table", "td", "dt", "tbody",
103         "embed", "textarea", "fieldset", "tfoot", "figcaption", "th", "figure", "thead", "footer", "tr", "form", "ul", "h1", "h2", "h3",
104         "h4", "h5", "h6", "video", "script", "style"
105     };
106     private static final THashSet<String> HTML_BLOCK_TAG_SET = new THashSet<String>();
107     static {
108         for(String tag : HTML_BLOCK_TAGS)
109             HTML_BLOCK_TAG_SET.add(tag);
110     }
111     
112     public boolean isAtxHeaderStart(StringBuilder line, int offset) {
113         int matched = 0;
114         char c;
115         while((c = line.charAt(offset)) == '#') {
116             ++offset;
117             ++matched;
118         }
119         if(matched == 0 || matched > 6)
120             return false;
121         this.level = matched;
122         if(c != '\n') {
123             if(c != ' ') 
124                 return false;
125             while(c == ' ') {
126                 ++offset;
127                 ++matched;
128                 c = line.charAt(offset);
129             }
130         }
131         this.matched = matched;
132         return true;
133     }
134
135     public boolean isOpenCodeFence(StringBuilder line, int offset, char fenceChar) {
136         int matched = 0;
137         char c;
138         while((c = line.charAt(offset)) == fenceChar) {
139             ++offset;
140             ++matched;
141         }
142         if(matched < 3)
143             return false;
144         this.level = matched;
145         while(line.charAt(offset) == ' ') {
146             ++offset;
147             ++matched;
148         }
149         this.matched = matched;
150         while((c = line.charAt(offset)) != '\n') {
151             if(c == fenceChar)
152                 return false;
153             ++offset;
154         }
155         return true;
156     }
157
158     public boolean isListMarker(StringBuilder line, int offset) {
159         int pos = offset;
160         char c;
161         while(Character.isDigit(c = line.charAt(pos)))
162             ++pos;
163         if(c != '.' && c != ')')
164             return false;
165         ++pos;
166         char c2;
167         if((c2=line.charAt(pos)) != ' ' && c2 != '\n')
168             return false;
169         this.matched = pos-offset;
170         this.level = Integer.parseInt(line.substring(offset, pos-1));
171         this.bulletChar = c;
172         return true;
173     }
174     
175     private static final String CDATA = "CDATA[";
176     
177     public static int scanHtmlTag(StringBuilder input, int offset) {
178         char c;
179         c = input.charAt(offset++);        
180         
181         // Comment, declaration or cdata 
182         if(c == '!') {
183             if(offset == input.length())
184                 return -1;
185             c = input.charAt(offset++);
186             
187             // Comment
188             if(c == '-') {
189                 if(offset+4 > input.length())
190                     return -1;
191                 if(input.charAt(offset++) != '-')
192                     return -1;
193                 c = input.charAt(offset++);
194                 if(c == '-') {
195                     c = input.charAt(offset++);
196                     if(c == '-')
197                         return -1;
198                 }
199                 if(c == '>')
200                     return -1;
201                 
202                 while(offset+3 <= input.length()) {
203                     c = input.charAt(offset++);
204                     if(c == '-') {
205                         c = input.charAt(offset++);
206                         if(c == '-') {
207                             c = input.charAt(offset++);
208                             if(c == '>')
209                                 return offset;
210                             else
211                                 return -1;
212                         }
213                     }
214                 }
215                 return -1;
216             }
217             
218             // Cdata
219             else if(c == '[') {
220                 for(int i=0;i<CDATA.length();++i) {
221                     c = input.charAt(offset++);
222                     if(CDATA.charAt(i) != c)
223                         return -1;
224                 }
225                 while(offset+3 <= input.length()) {
226                     c = input.charAt(offset++);
227                     if(c == ']') {
228                         c = input.charAt(offset++);
229                         while(c == ']') {
230                             c = input.charAt(offset++);
231                             if(c == '>')
232                                 return offset;
233                         }
234                     }
235                 }
236                 return -1;
237             }
238             
239             // Declaration
240             else if(c >= 'A' && c <= 'Z') {
241                 while( offset < input.length() && (c=input.charAt(offset++)) >= 'A' && c <= 'Z' );
242                 if(c != ' ' && c != '\n')
243                     return -1;
244                 while( offset < input.length() && (c=input.charAt(offset++)) != '>' );
245                 if(c != '>')
246                     return -1;
247                 else
248                     return offset;
249             }
250             else
251                 return -1;
252         }
253         
254         // Processing instruction
255         else if(c == '?') {
256             while(offset < input.length()) {
257                 c = input.charAt(offset++);
258                 if(c == '?') {
259                     c = input.charAt(offset++);
260                     if(c == '>')
261                         return offset;
262                 }
263             }
264             return -1;
265         }
266         
267         // Close tag
268         else if(c == '/') {
269             offset = scanTagName(input, offset);
270             if(offset == -1)
271                 return -1;
272             offset = scanWhitespace(input, offset);
273             if(offset == -1)
274                 return -1;
275             if(input.charAt(offset) == '>')
276                 return offset+1;
277             else
278                 return -1;
279         }
280         
281         // Open tag
282         else {
283             --offset;
284             offset = scanTagName(input, offset);
285             if(offset == -1)
286                 return -1;
287             while(true) {
288                 if((c=input.charAt(offset)) != ' ' && c != '\n') {
289                     if(c == '>')
290                         return offset+1;
291                     if(c == '/' && input.charAt(offset+1)=='>')
292                         return offset+2;
293                     return -1;
294                 }
295                 offset = scanWhitespace(input, offset);
296                 if(offset == -1)
297                     return -1;
298                 c = input.charAt(offset);
299                 if(c == '>')
300                     return offset+1;
301                 if(c == '/' && input.charAt(offset+1)=='>')
302                     return offset+2;
303                 offset = scanAttributeName(input, offset);
304                 if(offset == -1)
305                     return -1;
306                 offset = scanWhitespace(input, offset);
307                 if(offset == -1)
308                     return -1;
309                 if((c=input.charAt(offset)) == '=') {
310                     ++offset;
311                     offset = scanWhitespace(input, offset);
312                     if(offset == -1)
313                         return -1;
314                     
315                     c = input.charAt(offset);
316                     if(c == '"') {
317                         ++offset;
318                         while(true) {
319                             if(offset == input.length())
320                                 return -1;
321                             c=input.charAt(offset++);
322                             if(c == '"')
323                                 break;
324                         }
325                     }
326                     else if(c == '\'') {
327                         ++offset;
328                         while(true) {
329                             if(offset == input.length())
330                                 return -1;
331                             c=input.charAt(offset++);
332                             if(c == '\'')
333                                 break;
334                         }
335                     }
336                     else {
337                         while(true) {
338                             if(offset == input.length())
339                                 return -1;
340                             c=input.charAt(offset++);
341                             if(c==' ' || c=='\n' || c=='"' || c=='\'' || c=='=' || c=='<' || c=='>' || c=='`') {
342                                 --offset;
343                                 break;
344                             }
345                         }
346                     }
347                 }
348                 else {
349                     if(c == '>')
350                         return offset+1;
351                     --offset;
352                     c = input.charAt(offset);
353                     if(c != ' ' && c != '\n' && c != '>')
354                         return -1;
355                 }
356             }
357         }
358     }
359     
360     private static int scanTagName(StringBuilder input, int offset) {
361         if(offset >= input.length())
362             return -1;
363         char c = input.charAt(offset++);
364         if( !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) )
365             return -1;
366         while(offset < input.length()) {
367             c = input.charAt(offset++);
368             if( !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) )
369                 return offset-1;
370         }
371         return -1;
372     }
373     
374     private static int scanAttributeName(StringBuilder input, int offset) {
375         if(offset >= input.length())
376             return -1;
377         char c = input.charAt(offset++);
378         if( !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == ':') )
379             return -1;
380         while(offset < input.length()) {
381             c = input.charAt(offset++);
382             if( !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
383                     || c == '_' || c == ':' || c == '.' || c == '-') )
384                 return offset-1;
385         }
386         return -1;
387     }
388     
389     public static int scanWhitespace(StringBuilder input, int offset) {
390         while(offset < input.length()) {
391             char c = input.charAt(offset);
392             if(c != ' ' && c != '\n')
393                 return offset;
394             ++offset;
395         }
396         return -1;
397     }
398
399     public static int scanUri(StringBuilder input, int pos) {
400         int startPos = pos;
401         char c;
402         while(true) {
403             if(pos == input.length())
404                 return -1;
405             c = input.charAt(pos++);
406             if(c < 0 || c >= 128 || !IS_SCHEME_CHAR[(int)c])
407                 break;
408         }
409         if(c != ':' || !SCHEME_SET.contains(input.substring(startPos, pos-1).toLowerCase()))
410             return -1;
411         while(pos < input.length() && (c = input.charAt(pos)) != '>' && c != '<' && !(c <= 0x20 && c >= 0))
412             ++pos;
413         if(c == '>')
414             return pos+1;
415         else
416             return -1;
417     }
418     
419     private static final String[] SCHEMES = new String[] {
420         "coap", "doi", "javascript", "aaa", "aaas", "about", "acap", "cap", "cid", "crid", "data", "dav", "dict", "dns", "file", "ftp", "geo", "go",
421         "gopher", "h323", "http", "https", "iax", "icap", "im", "imap", "info", "ipp", "iris", "iris.beep", "iris.xpc", "iris.xpcs", "iris.lwz",
422         "ldap", "mailto", "mid", "msrp", "msrps", "mtqp", "mupdate", "news", "nfs", "ni", "nih", "nntp", "opaquelocktoken", "pop", "pres", "rtsp",
423         "service", "session", "shttp", "sieve", "sip", "sips", "sms", "snmp", "soap.beep", "soap.beeps", "tag", "tel", "telnet", "tftp", "thismessage",
424         "tn3270", "tip", "tv", "urn", "vemmi", "ws", "wss", "xcon", "xcon-userid", "xmlrpc.beep", "xmlrpc.beeps", "xmpp", "z39.50r", "z39.50s", "adiumxtra",
425         "afp", "afs", "aim", "apt", "attachment", "aw", "beshare", "bitcoin", "bolo", "callto", "chrome", "chrome-extension", "com-eventbrite-attendee",
426         "content", "cvs", "dlna-playsingle", "dlna-playcontainer", "dtn", "dvb", "ed2k", "facetime", "feed", "finger", "fish", "gg", "git", "gizmoproject",
427         "gtalk", "hcp", "icon", "ipn", "irc", "irc6", "ircs", "itms", "jar", "jms", "keyparc", "lastfm", "ldaps", "magnet", "maps", "market", "message", "mms",
428         "ms-help", "msnim", "mumble", "mvn", "notes", "oid", "palm", "paparazzi", "platform", "proxy", "psyc", "query", "res", "resource", "rmi", "rsync",
429         "rtmp", "secondlife", "sftp", "sgn", "skype", "smb", "soldat", "spotify", "ssh", "steam", "svn", "teamspeak", "things", "udp", "unreal", "ut2004",
430         "ventrilo", "view-source", "webcal", "wtai", "wyciwyg", "xfire", "xri", "ymsgr"  
431     };
432     private static final THashSet<String> SCHEME_SET = new THashSet<String>();
433     private static final boolean[] IS_SCHEME_CHAR = new boolean[128];
434     static {
435         for(String scheme : SCHEMES) {
436             SCHEME_SET.add(scheme);
437             for(int i=0;i<scheme.length();++i) {
438                 char c = scheme.charAt(i);
439                 IS_SCHEME_CHAR[(int)c] = true;
440                 IS_SCHEME_CHAR[(int)Character.toUpperCase(c)] = true;
441             }
442         }
443     }
444     
445     public static int scanLinkLabel(StringBuilder input, int offset) {
446         if(offset == input.length() || input.charAt(offset++) != '[')
447             return -1;
448         int maxPos = Math.min(input.length(), offset+1000);
449         while(offset < maxPos) {
450             char c = input.charAt(offset++);
451             if(c == ']')
452                 return offset;
453             if(c == '[')
454                 return -1;
455             if(c == '\\' && offset < maxPos) {
456                 c = input.charAt(offset);
457                 if(Subject.getCharType(c) == 2)
458                     ++offset;
459             }
460         }
461         return -1;
462     }
463     
464     public static int scanLinkUrl(StringBuilder input, int offset) {        
465         if(offset == input.length())
466             return offset;
467         if(input.charAt(offset) == '<') {
468             ++offset;
469             while(offset < input.length()) {
470                 char c = input.charAt(offset++);
471                 if(c == '>') {
472                     return offset;
473                 }
474                 else if(c == '\\') {
475                     if(Subject.getCharType(c) == 2)
476                         ++offset;
477                 }
478                 else if(c == '<' || c == '\n')
479                     return -1;
480             }
481             return -1;
482         }
483         else {
484             while(offset < input.length()) {
485                 char c = input.charAt(offset++);
486                 if(c == '\\') {
487                     if(Subject.getCharType(input.charAt(offset)) == 2)
488                         ++offset;
489                     else
490                         return offset - 1;
491                 }
492                 else if( c == '(' ) {
493                     int orgPos = offset - 1;
494                     while(true) {
495                         if(offset >= input.length())
496                             return orgPos;
497                         c = input.charAt(offset++);
498                         if(c == '\\') {
499                             if(Subject.getCharType(input.charAt(offset)) == 2)
500                                 ++offset;
501                             else
502                                 return orgPos;
503                         }
504                         else if(c == ')')
505                             break;
506                         else if( (c <= 0x20 && c >= 0) || c == '(' )
507                             return orgPos;
508                     }
509                 }
510                 else if( (c <= 0x20 && c >= 0) || c == ')' )
511                     return offset-1;
512             }
513             return offset;
514         }
515     }
516
517     public static int scanLinkTitle(StringBuilder input, int offset) {
518         if(offset == input.length())
519             return -1;
520         char c = input.charAt(offset++);
521         if(c == '(') {
522             while(offset < input.length()) {
523                 c = input.charAt(offset++);
524                 if(c == ')')
525                     return offset;      
526                 if(c == '\\') {
527                     c = input.charAt(offset);
528                     if(c == ')' || c == '\\')
529                         ++offset;
530                 }
531             }
532             return -1;
533         }
534         else if(c == '"') {
535             while(offset < input.length()) {
536                 c = input.charAt(offset++);
537                 if(c == '"')
538                     return offset;      
539                 if(c == '\\') {
540                     c = input.charAt(offset);
541                     if(c == '"' || c == '\\')
542                         ++offset;
543                 }
544             }
545             return -1;
546         }
547         if(c == '\'') {
548             while(offset < input.length()) {
549                 c = input.charAt(offset++);
550                 if(c == '\'')
551                     return offset;      
552                 if(c == '\\') {
553                     c = input.charAt(offset);
554                     if(c == '\'' || c == '\\')
555                         ++offset;
556                 }
557             }
558             return -1;
559         }
560         else
561             return -1;
562     }
563     
564     private static final CharacterSet EMAIL_START = new CharacterSet("a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-");
565     private static final CharacterSet EMAIL_END_A = new CharacterSet("a-zA-Z0-9");
566     private static final CharacterSet EMAIL_END_B = new CharacterSet("a-zA-Z0-9-");
567
568     public static int scanEmail(StringBuilder input, int offset) {
569         int initialPos = offset;
570         char c = 0;
571         while(offset < input.length() && EMAIL_START.contains(c=input.charAt(offset++)) );
572         if( c != '@' || offset == initialPos )
573             return -1;
574         ++offset;
575         while(true) {
576             if(offset == input.length() || !EMAIL_END_A.contains(c=input.charAt(offset++)))
577                 return -1;
578             int count = 1;
579             int oldC = c;
580             while(offset < input.length() && EMAIL_END_B.contains(c=input.charAt(offset++))) {
581                 ++count;
582                 if(count > 62)
583                     return -1;
584                 oldC = c;
585             }
586             if(oldC=='-')
587                 return -1;
588             if(c == '>')
589                 return offset;
590             if(c != '.')
591                 return -1;
592         }
593     }
594 }