]> gerrit.simantics Code Review - simantics/platform.git/blob - bundles/org.simantics.db.layer0/src/org/simantics/db/layer0/genericrelation/IndexQueries.java
Improvements to Lucene indexing
[simantics/platform.git] / bundles / org.simantics.db.layer0 / src / org / simantics / db / layer0 / genericrelation / IndexQueries.java
1 package org.simantics.db.layer0.genericrelation;
2
3
4 /**
5  * This class contains utilities related to queries made into Lucene indexes,
6  * such as escaping search terms.
7  * 
8  * @author Tuukka Lehtonen
9  */
public class IndexQueries {

        /**
         * Reserved words of the query syntax, stored as consecutive
         * (word, escaped replacement) pairs: even indexes hold the word to look
         * for, the following odd index holds the text emitted in its place.
         */
        private static final String[] RESERVED_WORDS = {
                "AND", "\\AND",
                "OR", "\\OR",
                "NOT", "\\NOT",
        };

        /**
         * Escapes the specified string without escaping the wildcard characters
         * '*' and '?'. Same as calling {@link #escape(String, boolean)} with
         * escapeWildcards set to <code>false</code>, which means that reserved
         * keywords are escaped.
         * 
         * @param s
         *            lucene query to escape, must not be <code>null</code>
         * @return escaped string
         */
        public static String escape(String s) {
                return escape(s, false);
        }

        /**
         * Same as calling {@link #escape(String, boolean, boolean)} with
         * escapeKeywords set to <code>true</code>.
         * 
         * @param s
         *            lucene query to escape, must not be <code>null</code>
         * @param escapeWildcards
         *            <code>true</code> to escape also wildcard characters
         * @return escaped string
         */
        public static String escape(String s, boolean escapeWildcards) {
                return escape(s, escapeWildcards, true);
        }

        /**
         * Returns a String where those characters that the query parser expects
         * to be escaped are escaped by a preceding <code>\</code>.
         * 
         * Adapted from Lucene's <code>QueryParser.escape(String)</code>, but the
         * escaping of the wildcard characters '*' and '?' is optional. Clients
         * that pass <code>false</code> for escapeWildcards must escape wildcards
         * themselves wherever they are not meant to act as wildcards.
         * 
         * A reserved word is only looked for at the beginning of the input or
         * directly after a whitespace character that is not a plain space. A
         * plain space is itself escaped and therefore does not start a new word.
         * Examples with both flags set to <code>true</code>:
         * <ul>
         * <li><code>AND01</code> becomes <code>\AND01</code></li>
         * <li><code>AND 01</code> becomes <code>\AND\ 01</code></li>
         * <li><code> AND 01</code> (leading space) becomes <code>\ AND\ 01</code></li>
         * </ul>
         * 
         * @param s
         *            lucene query to escape, must not be <code>null</code>
         * @param escapeWildcards
         *            <code>true</code> to escape also wildcard characters
         * @param escapeKeywords
         *            <code>true</code> to escape keywords like AND, OR, etc.
         * @return escaped string, or <code>s</code> itself if nothing in it
         *         needs escaping
         */
        public static String escape(String s, boolean escapeWildcards, boolean escapeKeywords) {
                // Fast path: avoid allocating a builder when nothing changes.
                if (!needsEscaping(s, escapeWildcards, escapeKeywords)) {
                        return s;
                }

                final int len = s.length();
                StringBuilder sb = new StringBuilder(len + 8);
                // The beginning of the input is treated the same as the previous
                // character being whitespace.
                boolean lastWhitespace = true;
                int i = 0;
                while (i < len) {
                        char c = s.charAt(i);
                        if (isSpecialCharacter(c, escapeWildcards)) {
                                sb.append('\\').append(c);
                                lastWhitespace = false;
                        } else if (Character.isWhitespace(c)) {
                                sb.append(c);
                                lastWhitespace = true;
                        } else {
                                if (escapeKeywords && lastWhitespace) {
                                        int reslen = processReservedWords(s, i, sb);
                                        if (reslen > 0) {
                                                // The escaped keyword was already appended,
                                                // skip over it in the input.
                                                i += reslen;
                                                lastWhitespace = false;
                                                continue;
                                        }
                                }
                                sb.append(c);
                                lastWhitespace = false;
                        }
                        ++i;
                }
                return sb.toString();
        }

        /**
         * Appends an optionally field-qualified, escaped search term to the
         * specified builder. The field name is appended as is, without escaping.
         * 
         * @param field
         *            name of the field to search from or <code>null</code> to
         *            append the term only
         * @param term
         *            the term to escape with {@link #escape(String, boolean)}
         * @param escapeWildcards
         *            <code>true</code> to escape also wildcard characters
         * @param result
         *            the builder to append to
         * @return the <code>result</code> argument
         */
        public static StringBuilder escapeTerm(String field, String term, boolean escapeWildcards, StringBuilder result) {
                if (field != null) {
                        result.append(field).append(':');
                }
                return result.append(escape(term, escapeWildcards));
        }

        /**
         * Same as {@link #escapeTerm(String, String, boolean, StringBuilder)} but
         * returns the result as a new string.
         * 
         * @param field
         *            name of the field to search from or <code>null</code>
         * @param term
         *            the term to escape
         * @param escapeWildcards
         *            <code>true</code> to escape also wildcard characters
         * @return <code>field:escapedTerm</code>, or only the escaped term if
         *         field is <code>null</code>
         */
        public static String escapeTerm(String field, String term, boolean escapeWildcards) {
                return escapeTerm(field, term, escapeWildcards, new StringBuilder()).toString();
        }

        /**
         * Surrounds the specified term with double quotes. Every double quote
         * and backslash inside the term is preceded with a backslash, no other
         * character is modified.
         * 
         * @param term
         *            the term to quote, must not be <code>null</code>
         * @return the quoted term
         */
        public static String quoteTerm(String term) {
                final int len = term.length();
                StringBuilder sb = new StringBuilder(len + 2);
                sb.append('"');
                for (int i = 0; i < len; ++i) {
                        char c = term.charAt(i);
                        if (c == '"' || c == '\\') {
                                sb.append('\\');
                        }
                        sb.append(c);
                }
                sb.append('"');
                return sb.toString();
        }

        /**
         * Same logic as in {@link #escape(String, boolean, boolean)} but this one
         * simply checks whether the input string needs escaping at all or not.
         * 
         * @param s
         *            the string to check
         * @param escapeWildcards
         *            <code>true</code> to consider wildcard characters as
         *            characters that need escaping
         * @param escapeKeywords
         *            <code>true</code> to consider reserved words as something
         *            that needs escaping
         * @return <code>true</code> if escaping would modify the input
         */
        private static boolean needsEscaping(String s, boolean escapeWildcards, boolean escapeKeywords) {
                // The beginning of the input is treated the same as the previous
                // character being whitespace.
                boolean lastWhitespace = true;
                for (int i = 0, len = s.length(); i < len; ++i) {
                        char c = s.charAt(i);
                        if (isSpecialCharacter(c, escapeWildcards)) {
                                return true;
                        }
                        if (Character.isWhitespace(c)) {
                                lastWhitespace = true;
                        } else {
                                if (escapeKeywords && lastWhitespace && processReservedWords(s, i, null) > 0) {
                                        return true;
                                }
                                lastWhitespace = false;
                        }
                }
                return false;
        }

        /**
         * Tells whether the specified character is part of the query syntax and
         * must therefore be escaped. This is the single definition shared by
         * {@link #escape(String, boolean, boolean)} and
         * {@link #needsEscaping(String, boolean, boolean)}.
         * 
         * @param c
         *            the character to check
         * @param escapeWildcards
         *            <code>true</code> to treat '*' and '?' as special too
         * @return <code>true</code> if the character must be escaped
         */
        private static boolean isSpecialCharacter(char c, boolean escapeWildcards) {
                switch (c) {
                case '\\': case '+': case '-': case '!': case '(': case ')': case ':':
                case '^': case '[': case ']': case '"': case '{': case '}': case '~':
                case '|': case '&': case '/': case ' ':
                        return true;
                case '*': case '?':
                        return escapeWildcards;
                default:
                        return false;
                }
        }

        /**
         * Looks for a reserved word starting at the specified index of the
         * input. The comparison is case-sensitive because Lucene reserved words
         * are case-sensitive for its query parser.
         * 
         * NOTE: only the beginning of the text at <code>fromIndex</code> is
         * compared. There is no check that the reserved word ends there, so e.g.
         * <code>AND01</code> is reported as a hit for <code>AND</code>.
         * 
         * @param s
         *            the input string
         * @param fromIndex
         *            index in the input to start matching from
         * @param sb
         *            builder that receives the escaped form of the matched
         *            word, or <code>null</code> to only detect a match
         * @return length of the reserved word in the input or 0 if no reserved
         *         word in the input
         */
        private static int processReservedWords(String s, int fromIndex, StringBuilder sb) {
                for (int w = 0; w < RESERVED_WORDS.length; w += 2) {
                        String word = RESERVED_WORDS[w];
                        int len = word.length();
                        if (s.regionMatches(false, fromIndex, word, 0, len)) {
                                if (sb != null) {
                                        sb.append(RESERVED_WORDS[w + 1]);
                                }
                                return len;
                        }
                }
                return 0;
        }

}