--- /dev/null
+package org.simantics.scl.compiler.parser.regexp;
+
+import gnu.trove.set.hash.THashSet;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicReference;
+
+import org.simantics.scl.compiler.parser.regexp.automata.NFA;
+
+public abstract class Regexp implements Comparable<Regexp> {
+ static final int ATOM = 0;
+ static final int OP = 1;
+ static final int OR = 2;
+ static final int SEQ = 3;
+
+ protected abstract void buildAutomaton(NFA aut, int inState, int outState);
+
+ public static final Regexp ONE = new RSeq(new Regexp[0]);
+ public static final Regexp ZERO = new ROr(new Regexp[0]);
+
+ Regexp() {
+ }
+
+ public NFA toAutomaton() {
+ NFA aut = new NFA();
+ int inState = aut.newState();
+ int outState = aut.newState();
+ buildAutomaton(aut, inState, outState);
+ aut.setInitialState(inState);
+ aut.setAccepts(outState, true);
+ return aut;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder b = new StringBuilder();
+ toString(b, 0);
+ return b.toString();
+ }
+
+ protected abstract void toString(StringBuilder b, int prec);
+
+ protected abstract int getTypeId();
+
+ public abstract boolean isNullable();
+
+ public Regexp simplify() {
+ return this;
+ }
+
+ private static int parseRegexp(int pos, String regexp, AtomicReference<Regexp> result) {
+ ArrayList<Regexp> dc = new ArrayList<Regexp>();
+ ArrayList<Regexp> cc = new ArrayList<Regexp>();
+ loop: while(true) {
+ if(pos == regexp.length())
+ break loop;
+ char c = regexp.charAt(pos++);
+ switch(c) {
+ case '(': {
+ AtomicReference<Regexp> child = new AtomicReference<Regexp>();
+ pos = parseRegexp(pos, regexp, child);
+ cc.add(child.get());
+ } break;
+ case ')':
+ break loop;
+ case '|': {
+ dc.add(seq(cc));
+ cc.clear();
+ } break;
+ case '*': {
+ if(cc.isEmpty())
+ throw new IllegalArgumentException("Encountered * that is not front of anything.");
+ int p = cc.size()-1;
+ cc.set(p, Regexp.star(cc.get(p)));
+ } break;
+ case '?': {
+ if(cc.isEmpty())
+ throw new IllegalArgumentException("Encountered ? that is not front of anything.");
+ int p = cc.size()-1;
+ cc.set(p, Regexp.optional(cc.get(p)));
+ } break;
+ case '+': {
+ if(cc.isEmpty())
+ throw new IllegalArgumentException("Encountered + that is not front of anything.");
+ int p = cc.size()-1;
+ cc.set(p, Regexp.plus(cc.get(p)));
+ } break;
+ default:
+ cc.add(new RAtom(c));
+ }
+ }
+ dc.add(seq(cc));
+ result.set(or(dc));
+ return pos;
+ }
+
+ public static Regexp of(String regexp) {
+ AtomicReference<Regexp> result = new AtomicReference<Regexp>();
+ int finalPos = parseRegexp(0, regexp, result);
+ if(finalPos < regexp.length())
+ throw new IllegalArgumentException("Extra closing parenteses");
+ return result.get();
+ }
+
+ static void seq(ArrayList<Regexp> l, Regexp exp) {
+ if(l.size() > 0 && l.get(0) == ZERO)
+ return;
+ if(exp instanceof RSeq)
+ for(Regexp e : ((RSeq)exp).exps)
+ seq(l, e);
+ else if(exp == ZERO) {
+ l.clear();
+ l.add(exp);
+ }
+ else
+ l.add(exp);
+ }
+
+ static Regexp seq_(List<Regexp> es) {
+ if(es.size() == 0)
+ return ONE;
+ if(es.size() == 1)
+ return es.get(0);
+ Regexp[] eArray = es.toArray(new Regexp[es.size()]);
+ return new RSeq(eArray);
+ }
+
+ public static Regexp seq(Regexp ... exps) {
+ if(exps.length == 0)
+ return ONE;
+ if(exps.length == 1)
+ return exps[0];
+ ArrayList<Regexp> es = new ArrayList<Regexp>();
+ for(Regexp exp : exps)
+ seq(es, exp);
+ return seq_(es);
+ }
+
+ public static Regexp seq(Collection<Regexp> exps) {
+ return seq(exps.toArray(new Regexp[exps.size()]));
+ }
+
+ static void or(THashSet<Regexp> s, Regexp exp) {
+ if(exp instanceof ROr)
+ for(Regexp e : ((ROr)exp).exps)
+ or(s, e);
+ else
+ s.add(exp);
+ }
+
+ static Regexp or_(THashSet<Regexp> es) {
+ if(es.size() == 0)
+ return ZERO;
+ if(es.size() == 1)
+ return es.iterator().next();
+ Regexp[] eArray = es.toArray(new Regexp[es.size()]);
+ Arrays.sort(eArray);
+ return new ROr(eArray);
+ }
+
+ public static Regexp or(Regexp ... exps) {
+ if(exps.length == 0)
+ return ZERO;
+ if(exps.length == 1)
+ return exps[0];
+ THashSet<Regexp> es = new THashSet<Regexp>();
+ for(Regexp exp : exps)
+ or(es, exp);
+ return or_(es);
+ }
+
+ public static Regexp star(Regexp exp) {
+ if(exp == ONE || exp == ZERO)
+ return ONE;
+ return new ROp(exp, '*');
+ }
+
+ public static Regexp plus(Regexp exp) {
+ return new ROp(exp, '+');
+ }
+
+ public static Regexp optional(Regexp exp) {
+ return new ROp(exp, '?');
+ }
+
+ public static Regexp or(Collection<Regexp> exps) {
+ return or(exps.toArray(new Regexp[exps.size()]));
+ }
+
+ @Override
+ public int compareTo(Regexp o) {
+ int tA = getTypeId();
+ int tB = o.getTypeId();
+ if(tA < tB)
+ return -1;
+ if(tA > tB)
+ return 1;
+ switch(tA) {
+ case ATOM: {
+ int sA = ((RAtom)this).symbolId;
+ int sB = ((RAtom)o).symbolId;
+ if(sA < sB)
+ return -1;
+ if(sA > sB)
+ return 1;
+ return 0;
+ }
+ case OP: {
+ ROp a = (ROp)this;
+ ROp b = (ROp)o;
+ if(a.op < b.op)
+ return -1;
+ if(a.op > b.op)
+ return 1;
+ return a.exp.compareTo(b.exp);
+ }
+ case OR:
+ return compare(((ROr)this).exps, ((ROr)o).exps);
+ case SEQ: {
+ return compare(((RSeq)this).exps, ((RSeq)o).exps);
+ }
+ default:
+ throw new IllegalArgumentException();
+ }
+ }
+
+ private static int compare(Regexp[] a, Regexp[] b) {
+ if(a.length < b.length)
+ return -1;
+ if(a.length > b.length)
+ return 1;
+ for(int i=0;i<a.length;++i) {
+ int temp = a[i].compareTo(b[i]);
+ if(temp != 0)
+ return temp;
+ }
+ return 0;
+ }
+
+ public abstract void toString(StringBuilder b, Namer grammar, int prec);
+}