--- /dev/null
+package org.simantics.scl.compiler.markdown.internal;
+
+import gnu.trove.set.hash.THashSet;
+
+import org.simantics.scl.compiler.markdown.inlines.Subject;
+
+public class Scanner {
+
+ public int level;
+ public int matched;
+ public char bulletChar;
+
+ public static boolean isCloseCodeFence(StringBuilder line, int offset, char fenceChar, int fenceLength) {
+ int matched = 0;
+ while(line.charAt(offset) == fenceChar) {
+ ++offset;
+ ++matched;
+ }
+ if(matched < fenceLength)
+ return false;
+ while(true) {
+ char c = line.charAt(offset++);
+ if(c == '\n')
+ return true;
+ else if(c != ' ')
+ return false;
+ }
+ }
+
+ public static boolean isSetextHeaderLine(StringBuilder line, int offset, char headerLineChar) {
+ char c;
+ while((c = line.charAt(offset)) == headerLineChar)
+ ++offset;
+ while(c == ' ') {
+ ++offset;
+ c = line.charAt(offset);
+ }
+ return c == '\n';
+ }
+
+ public static boolean isHRule(StringBuilder line, int offset, char hrChar) {
+ char c;
+ int count = 0;
+ while((c = line.charAt(offset)) != '\n') {
+ if(c == hrChar)
+ ++count;
+ else if(c != ' ')
+ return false;
+ ++offset;
+ }
+ return count >= 3;
+ }
+
+ public static boolean isHtmlBlockTag(StringBuilder line, int offset) {
+ if(line.charAt(offset) != '<')
+ return false;
+ ++offset;
+ char c = line.charAt(offset);
+
+ // HTML comment, processing instruction, CDATA or entity definition
+ if(c == '!' || c == '?')
+ return true;
+
+ // Ending tag
+ if(c == '/') {
+ ++offset;
+ offset = scanTag(line, offset);
+ if(offset == -1)
+ return false;
+ c = line.charAt(offset);
+ return c == ' ' || c == '>';
+ }
+
+ // Beginning tag
+ offset = scanTag(line, offset);
+ if(offset == -1)
+ return false;
+ c = line.charAt(offset);
+ return c == ' ' || c == '/' || c == '>';
+ }
+
+ public static int scanTag(StringBuilder line, int offset) {
+ StringBuilder b = new StringBuilder();
+ while(offset < line.length()) {
+ char c = line.charAt(offset);
+ if( (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') )
+ b.append(c);
+ else if( c >= 'A' && c <= 'Z' )
+ b.append(Character.toLowerCase(c));
+ else
+ break;
+ ++offset;
+ }
+ if(HTML_BLOCK_TAG_SET.contains(b.toString()))
+ return offset;
+ else
+ return -1;
+ }
+
+ private static final String[] HTML_BLOCK_TAGS = new String[] {
+ "article", "header", "aside", "hgroup", "blockquote", "hr", "iframe", "body", "li", "map", "button", "object", "canvas", "ol",
+ "caption", "output", "col", "p", "colgroup", "pre", "dd", "progress", "div", "section", "dl", "table", "td", "dt", "tbody",
+ "embed", "textarea", "fieldset", "tfoot", "figcaption", "th", "figure", "thead", "footer", "tr", "form", "ul", "h1", "h2", "h3",
+ "h4", "h5", "h6", "video", "script", "style"
+ };
+ private static final THashSet<String> HTML_BLOCK_TAG_SET = new THashSet<String>();
+ static {
+ for(String tag : HTML_BLOCK_TAGS)
+ HTML_BLOCK_TAG_SET.add(tag);
+ }
+
+ public boolean isAtxHeaderStart(StringBuilder line, int offset) {
+ int matched = 0;
+ char c;
+ while((c = line.charAt(offset)) == '#') {
+ ++offset;
+ ++matched;
+ }
+ if(matched == 0 || matched > 6)
+ return false;
+ this.level = matched;
+ if(c != '\n') {
+ if(c != ' ')
+ return false;
+ while(c == ' ') {
+ ++offset;
+ ++matched;
+ c = line.charAt(offset);
+ }
+ }
+ this.matched = matched;
+ return true;
+ }
+
+ public boolean isOpenCodeFence(StringBuilder line, int offset, char fenceChar) {
+ int matched = 0;
+ char c;
+ while((c = line.charAt(offset)) == fenceChar) {
+ ++offset;
+ ++matched;
+ }
+ if(matched < 3)
+ return false;
+ this.level = matched;
+ while(line.charAt(offset) == ' ') {
+ ++offset;
+ ++matched;
+ }
+ this.matched = matched;
+ while((c = line.charAt(offset)) != '\n') {
+ if(c == fenceChar)
+ return false;
+ ++offset;
+ }
+ return true;
+ }
+
+ public boolean isListMarker(StringBuilder line, int offset) {
+ int pos = offset;
+ char c;
+ while(Character.isDigit(c = line.charAt(pos)))
+ ++pos;
+ if(c != '.' && c != ')')
+ return false;
+ ++pos;
+ char c2;
+ if((c2=line.charAt(pos)) != ' ' && c2 != '\n')
+ return false;
+ this.matched = pos-offset;
+ this.level = Integer.parseInt(line.substring(offset, pos-1));
+ this.bulletChar = c;
+ return true;
+ }
+
+ private static final String CDATA = "CDATA[";
+
+ public static int scanHtmlTag(StringBuilder input, int offset) {
+ char c;
+ c = input.charAt(offset++);
+
+ // Comment, declaration or cdata
+ if(c == '!') {
+ if(offset == input.length())
+ return -1;
+ c = input.charAt(offset++);
+
+ // Comment
+ if(c == '-') {
+ if(offset+4 > input.length())
+ return -1;
+ if(input.charAt(offset++) != '-')
+ return -1;
+ c = input.charAt(offset++);
+ if(c == '-') {
+ c = input.charAt(offset++);
+ if(c == '-')
+ return -1;
+ }
+ if(c == '>')
+ return -1;
+
+ while(offset+3 <= input.length()) {
+ c = input.charAt(offset++);
+ if(c == '-') {
+ c = input.charAt(offset++);
+ if(c == '-') {
+ c = input.charAt(offset++);
+ if(c == '>')
+ return offset;
+ else
+ return -1;
+ }
+ }
+ }
+ return -1;
+ }
+
+ // Cdata
+ else if(c == '[') {
+ for(int i=0;i<CDATA.length();++i) {
+ c = input.charAt(offset++);
+ if(CDATA.charAt(i) != c)
+ return -1;
+ }
+ while(offset+3 <= input.length()) {
+ c = input.charAt(offset++);
+ if(c == ']') {
+ c = input.charAt(offset++);
+ while(c == ']') {
+ c = input.charAt(offset++);
+ if(c == '>')
+ return offset;
+ }
+ }
+ }
+ return -1;
+ }
+
+ // Declaration
+ else if(c >= 'A' && c <= 'Z') {
+ while( offset < input.length() && (c=input.charAt(offset++)) >= 'A' && c <= 'Z' );
+ if(c != ' ' && c != '\n')
+ return -1;
+ while( offset < input.length() && (c=input.charAt(offset++)) != '>' );
+ if(c != '>')
+ return -1;
+ else
+ return offset;
+ }
+ else
+ return -1;
+ }
+
+ // Processing instruction
+ else if(c == '?') {
+ while(offset < input.length()) {
+ c = input.charAt(offset++);
+ if(c == '?') {
+ c = input.charAt(offset++);
+ if(c == '>')
+ return offset;
+ }
+ }
+ return -1;
+ }
+
+ // Close tag
+ else if(c == '/') {
+ offset = scanTagName(input, offset);
+ if(offset == -1)
+ return -1;
+ offset = scanWhitespace(input, offset);
+ if(offset == -1)
+ return -1;
+ if(input.charAt(offset) == '>')
+ return offset+1;
+ else
+ return -1;
+ }
+
+ // Open tag
+ else {
+ --offset;
+ offset = scanTagName(input, offset);
+ if(offset == -1)
+ return -1;
+ while(true) {
+ if((c=input.charAt(offset)) != ' ' && c != '\n') {
+ if(c == '>')
+ return offset+1;
+ if(c == '/' && input.charAt(offset+1)=='>')
+ return offset+2;
+ return -1;
+ }
+ offset = scanWhitespace(input, offset);
+ if(offset == -1)
+ return -1;
+ c = input.charAt(offset);
+ if(c == '>')
+ return offset+1;
+ if(c == '/' && input.charAt(offset+1)=='>')
+ return offset+2;
+ offset = scanAttributeName(input, offset);
+ if(offset == -1)
+ return -1;
+ offset = scanWhitespace(input, offset);
+ if(offset == -1)
+ return -1;
+ if((c=input.charAt(offset)) == '=') {
+ ++offset;
+ offset = scanWhitespace(input, offset);
+ if(offset == -1)
+ return -1;
+
+ c = input.charAt(offset);
+ if(c == '"') {
+ ++offset;
+ while(true) {
+ if(offset == input.length())
+ return -1;
+ c=input.charAt(offset++);
+ if(c == '"')
+ break;
+ }
+ }
+ else if(c == '\'') {
+ ++offset;
+ while(true) {
+ if(offset == input.length())
+ return -1;
+ c=input.charAt(offset++);
+ if(c == '\'')
+ break;
+ }
+ }
+ else {
+ while(true) {
+ if(offset == input.length())
+ return -1;
+ c=input.charAt(offset++);
+ if(c==' ' || c=='\n' || c=='"' || c=='\'' || c=='=' || c=='<' || c=='>' || c=='`') {
+ --offset;
+ break;
+ }
+ }
+ }
+ }
+ else {
+ if(c == '>')
+ return offset+1;
+ --offset;
+ c = input.charAt(offset);
+ if(c != ' ' && c != '\n' && c != '>')
+ return -1;
+ }
+ }
+ }
+ }
+
+ private static int scanTagName(StringBuilder input, int offset) {
+ if(offset >= input.length())
+ return -1;
+ char c = input.charAt(offset++);
+ if( !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) )
+ return -1;
+ while(offset < input.length()) {
+ c = input.charAt(offset++);
+ if( !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) )
+ return offset-1;
+ }
+ return -1;
+ }
+
+ private static int scanAttributeName(StringBuilder input, int offset) {
+ if(offset >= input.length())
+ return -1;
+ char c = input.charAt(offset++);
+ if( !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == ':') )
+ return -1;
+ while(offset < input.length()) {
+ c = input.charAt(offset++);
+ if( !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
+ || c == '_' || c == ':' || c == '.' || c == '-') )
+ return offset-1;
+ }
+ return -1;
+ }
+
+ public static int scanWhitespace(StringBuilder input, int offset) {
+ while(offset < input.length()) {
+ char c = input.charAt(offset);
+ if(c != ' ' && c != '\n')
+ return offset;
+ ++offset;
+ }
+ return -1;
+ }
+
+ public static int scanUri(StringBuilder input, int pos) {
+ int startPos = pos;
+ char c;
+ while(true) {
+ if(pos == input.length())
+ return -1;
+ c = input.charAt(pos++);
+ if(c < 0 || c >= 128 || !IS_SCHEME_CHAR[(int)c])
+ break;
+ }
+ if(c != ':' || !SCHEME_SET.contains(input.substring(startPos, pos-1).toLowerCase()))
+ return -1;
+ while(pos < input.length() && (c = input.charAt(pos)) != '>' && c != '<' && !(c <= 0x20 && c >= 0))
+ ++pos;
+ if(c == '>')
+ return pos+1;
+ else
+ return -1;
+ }
+
+ private static final String[] SCHEMES = new String[] {
+ "coap", "doi", "javascript", "aaa", "aaas", "about", "acap", "cap", "cid", "crid", "data", "dav", "dict", "dns", "file", "ftp", "geo", "go",
+ "gopher", "h323", "http", "https", "iax", "icap", "im", "imap", "info", "ipp", "iris", "iris.beep", "iris.xpc", "iris.xpcs", "iris.lwz",
+ "ldap", "mailto", "mid", "msrp", "msrps", "mtqp", "mupdate", "news", "nfs", "ni", "nih", "nntp", "opaquelocktoken", "pop", "pres", "rtsp",
+ "service", "session", "shttp", "sieve", "sip", "sips", "sms", "snmp", "soap.beep", "soap.beeps", "tag", "tel", "telnet", "tftp", "thismessage",
+ "tn3270", "tip", "tv", "urn", "vemmi", "ws", "wss", "xcon", "xcon-userid", "xmlrpc.beep", "xmlrpc.beeps", "xmpp", "z39.50r", "z39.50s", "adiumxtra",
+ "afp", "afs", "aim", "apt", "attachment", "aw", "beshare", "bitcoin", "bolo", "callto", "chrome", "chrome-extension", "com-eventbrite-attendee",
+ "content", "cvs", "dlna-playsingle", "dlna-playcontainer", "dtn", "dvb", "ed2k", "facetime", "feed", "finger", "fish", "gg", "git", "gizmoproject",
+ "gtalk", "hcp", "icon", "ipn", "irc", "irc6", "ircs", "itms", "jar", "jms", "keyparc", "lastfm", "ldaps", "magnet", "maps", "market", "message", "mms",
+ "ms-help", "msnim", "mumble", "mvn", "notes", "oid", "palm", "paparazzi", "platform", "proxy", "psyc", "query", "res", "resource", "rmi", "rsync",
+ "rtmp", "secondlife", "sftp", "sgn", "skype", "smb", "soldat", "spotify", "ssh", "steam", "svn", "teamspeak", "things", "udp", "unreal", "ut2004",
+ "ventrilo", "view-source", "webcal", "wtai", "wyciwyg", "xfire", "xri", "ymsgr"
+ };
+ private static final THashSet<String> SCHEME_SET = new THashSet<String>();
+ private static final boolean[] IS_SCHEME_CHAR = new boolean[128];
+ static {
+ for(String scheme : SCHEMES) {
+ SCHEME_SET.add(scheme);
+ for(int i=0;i<scheme.length();++i) {
+ char c = scheme.charAt(i);
+ IS_SCHEME_CHAR[(int)c] = true;
+ IS_SCHEME_CHAR[(int)Character.toUpperCase(c)] = true;
+ }
+ }
+ }
+
+ public static int scanLinkLabel(StringBuilder input, int offset) {
+ if(offset == input.length() || input.charAt(offset++) != '[')
+ return -1;
+ int maxPos = Math.min(input.length(), offset+1000);
+ while(offset < maxPos) {
+ char c = input.charAt(offset++);
+ if(c == ']')
+ return offset;
+ if(c == '[')
+ return -1;
+ if(c == '\\' && offset < maxPos) {
+ c = input.charAt(offset);
+ if(Subject.getCharType(c) == 2)
+ ++offset;
+ }
+ }
+ return -1;
+ }
+
+ public static int scanLinkUrl(StringBuilder input, int offset) {
+ if(offset == input.length())
+ return offset;
+ if(input.charAt(offset) == '<') {
+ ++offset;
+ while(offset < input.length()) {
+ char c = input.charAt(offset++);
+ if(c == '>') {
+ return offset;
+ }
+ else if(c == '\\') {
+ if(Subject.getCharType(c) == 2)
+ ++offset;
+ }
+ else if(c == '<' || c == '\n')
+ return -1;
+ }
+ return -1;
+ }
+ else {
+ while(offset < input.length()) {
+ char c = input.charAt(offset++);
+ if(c == '\\') {
+ if(Subject.getCharType(input.charAt(offset)) == 2)
+ ++offset;
+ else
+ return offset - 1;
+ }
+ else if( c == '(' ) {
+ int orgPos = offset - 1;
+ while(true) {
+ if(offset >= input.length())
+ return orgPos;
+ c = input.charAt(offset++);
+ if(c == '\\') {
+ if(Subject.getCharType(input.charAt(offset)) == 2)
+ ++offset;
+ else
+ return orgPos;
+ }
+ else if(c == ')')
+ break;
+ else if( (c <= 0x20 && c >= 0) || c == '(' )
+ return orgPos;
+ }
+ }
+ else if( (c <= 0x20 && c >= 0) || c == ')' )
+ return offset-1;
+ }
+ return offset;
+ }
+ }
+
+ public static int scanLinkTitle(StringBuilder input, int offset) {
+ if(offset == input.length())
+ return -1;
+ char c = input.charAt(offset++);
+ if(c == '(') {
+ while(offset < input.length()) {
+ c = input.charAt(offset++);
+ if(c == ')')
+ return offset;
+ if(c == '\\') {
+ c = input.charAt(offset);
+ if(c == ')' || c == '\\')
+ ++offset;
+ }
+ }
+ return -1;
+ }
+ else if(c == '"') {
+ while(offset < input.length()) {
+ c = input.charAt(offset++);
+ if(c == '"')
+ return offset;
+ if(c == '\\') {
+ c = input.charAt(offset);
+ if(c == '"' || c == '\\')
+ ++offset;
+ }
+ }
+ return -1;
+ }
+ if(c == '\'') {
+ while(offset < input.length()) {
+ c = input.charAt(offset++);
+ if(c == '\'')
+ return offset;
+ if(c == '\\') {
+ c = input.charAt(offset);
+ if(c == '\'' || c == '\\')
+ ++offset;
+ }
+ }
+ return -1;
+ }
+ else
+ return -1;
+ }
+
+ private static final CharacterSet EMAIL_START = new CharacterSet("a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-");
+ private static final CharacterSet EMAIL_END_A = new CharacterSet("a-zA-Z0-9");
+ private static final CharacterSet EMAIL_END_B = new CharacterSet("a-zA-Z0-9-");
+
+ public static int scanEmail(StringBuilder input, int offset) {
+ int initialPos = offset;
+ char c = 0;
+ while(offset < input.length() && EMAIL_START.contains(c=input.charAt(offset++)) );
+ if( c != '@' || offset == initialPos )
+ return -1;
+ ++offset;
+ while(true) {
+ if(offset == input.length() || !EMAIL_END_A.contains(c=input.charAt(offset++)))
+ return -1;
+ int count = 1;
+ int oldC = c;
+ while(offset < input.length() && EMAIL_END_B.contains(c=input.charAt(offset++))) {
+ ++count;
+ if(count > 62)
+ return -1;
+ oldC = c;
+ }
+ if(oldC=='-')
+ return -1;
+ if(c == '>')
+ return offset;
+ if(c != '.')
+ return -1;
+ }
+ }
+}