]> gerrit.simantics Code Review - simantics/platform.git/blob - bundles/winterwell.markdown/src/winterwell/markdown/StringMethods.java
Fixed all line endings of the repository
[simantics/platform.git] / bundles / winterwell.markdown / src / winterwell / markdown / StringMethods.java
1 /**
2  * Basic String manipulation utilities.
3  * (c) Winterwell 2010 and ThinkTank Mathematics 2007
4  */
5 package winterwell.markdown;
6
7 import java.math.BigInteger;
8 import java.security.MessageDigest;
9 import java.security.NoSuchAlgorithmException;
10 import java.util.ArrayList;
11 import java.util.List;
12 import java.util.regex.Pattern;
13
14 import winterwell.utils.Mutable;
15 import winterwell.utils.containers.Pair;
16
17 /**
18  * A collection of general-purpose String handling methods.
19  * 
20  * @author daniel.winterstein
21  */
22 public final class StringMethods {
23
24         /**
25          * Removes xml tags, comment blocks and script blocks.
26          * 
27          * @param page
28          * @return the page with all xml tags removed.
29          */
30         public static String stripTags(String page) {
31                 // This code is rather ugly, but it does the job
32                 StringBuilder stripped = new StringBuilder(page.length());
33                 boolean inTag = false;
34                 // Comment blocks and script blocks are given special treatment
35                 boolean inComment = false;
36                 boolean inScript = false;
37                 // Go through the text
38                 for (int i = 0; i < page.length(); i++) {
39                         char c = page.charAt(i);
40                         // First check whether we are ignoring text
41                         if (inTag) {
42                                 if (c == '>')
43                                         inTag = false;
44                         } else if (inComment) {
45                                 if (c == '>' && page.charAt(i - 1) == '-'
46                                                 && page.charAt(i - 1) == '-') {
47                                         inComment = false;
48                                 }
49                         } else if (inScript) {
50                                 if (c == '>' && page.substring(i - 7, i).equals("/script")) {
51                                         inScript = false;
52                                 }
53                         } else {
54                                 // Check for the start of a tag - looks for '<' followed by any
55                                 // non-whitespace character
56                                 if (c == '<' && !Character.isWhitespace(page.charAt(i + 1))) {
57                                         // Comment, script-block or tag?
58                                         if (page.charAt(i + 1) == '!' && page.charAt(i + 2) == '-'
59                                                         && page.charAt(i + 3) == '-') {
60                                                 inComment = true;
61                                         } else if (i + 8 < page.length()
62                                                         && page.substring(i + 1, i + 7).equals("script")) {
63                                                 inScript = true;
64                                                 i += 7;
65                                         } else
66                                                 inTag = true; // Normal tag by default
67                                 } else {
68                                         // Append all non-tag chars
69                                         stripped.append(c);
70                                 }
71                         } // end if...
72                 }
73                 return stripped.toString();
74         }
75         
        /**
         * The local line-end string, read from the "line.separator" system
         * property when this class is initialised. Typically \n on unix-like
         * systems and \r\n on windows (\r was used by classic Mac OS).
         */
        public static final String LINEEND = System.getProperty("line.separator");
80
81         /**
82          * @param s
83          * @return A version of s where the first letter is uppercase and all others
84          *         are lowercase
85          */
86         public static final String capitalise(final String s) {
87                 return s.substring(0, 1).toUpperCase() + s.substring(1).toLowerCase();
88         }
89
90         /**
91          * Convert all line breaks into the system line break.
92          */
93         public static final String convertLineBreaks(String text) {
94                 return convertLineBreaks(text, LINEEND);
95         }
96
97         /**
98          * Convert all line breaks into the specified line break.
99          */
100         public static final String convertLineBreaks(String text, String br) {
101                 text = text.replaceAll("\r\n", br);
102                 text = text.replaceAll("\r", br);
103                 text = text.replaceAll("\n", br);
104                 return text;
105         }
106
107         /**
108          * @param string
109          * @param character
110          * @return the number of times character appears in the string
111          * @author Sam Halliday
112          */
113         static public int countCharsInString(String string, char character) {
114                 int count = 0;
115                 for (char c : string.toCharArray()) {
116                         if (c == character) {
117                                 count++;
118                         }
119                 }
120                 return count;
121         }
122
123         /**
124          * 
125          * E.g.
126          * <code>findEnclosingRegion("text with a [region] inside", 15, '[', ']')</code>
127          * is (??,??)
128          * 
129          * @param text
130          * @param offset
131          * @param start
132          * @param end
133          * @return the smallest enclosed region (including start and end chars, the
134          *         1st number is inclusive, the 2nd exclusive), or null if none. So
135          *         text.subString(start,end) is the specified region
136          */
137         public static Pair<Integer> findEnclosingRegion(String text, int offset,
138                         char startMarker, char endMarker) {
139                 // Forward
140                 int end = findEnclosingRegion2(text, offset, endMarker, 1);
141                 if (end == -1)
142                         return null;
143                 end++; // end is exclusive
144                 // Backward
145                 int start = findEnclosingRegion2(text, offset, startMarker, -1);
146                 if (start == -1)
147                         return null;
148                 // Sanity
149                 assert text.substring(start, end).charAt(0) == startMarker;
150                 assert text.substring(start, end).endsWith("" + endMarker);
151                 // Done
152                 return new Pair<Integer>(start, end);
153         }
154
155         private static int findEnclosingRegion2(String text, int offset,
156                         char endMarker, int direction) {
157                 while (offset > -1 && offset < text.length()) {
158                         char c = text.charAt(offset);
159                         if (c == endMarker)
160                                 return offset;
161                         offset += direction;
162                 }
163                 return -1;
164         }
165
166         /**
167          * A convenience wrapper for
168          * {@link #findEnclosingRegion(String, int, char, char)} E.g. <code>
169          findEnclosingRegion("text with a [region] inside", 15, '[', ']') .equals("[region]");
170          </code>
171          * 
172          * @param text
173          * @param offset
174          * @param start
175          * @param end
176          * @return the smallest enclosed region (including start and end chars), or
177          *         null if none.
178          */
179         public static String findEnclosingText(String text, int offset,
180                         char startMarker, char endMarker) {
181                 Pair<Integer> region = findEnclosingRegion(text, offset, startMarker,
182                                 endMarker);
183                 if (region == null)
184                         return null;
185                 String s = text.substring(region.first, region.second);
186                 return s;
187         }
188
189         /**
190          * Format a block of text to use the given line-width. I.e. adjust the line
191          * breaks. Also known as <i>hard</i> line-wrapping. Paragraphs are
192          * recognised by a line of blank space between them (e.g. two returns).
193          * <p>
194          * Note: a side-effect of this method is that it converts all line-breaks
195          * into the local system's line-breaks. E.g. on Windows, \n will become \r\n
196          * 
197          * @param text
198          *            The text to format
199          * @param lineWidth
200          *            The number of columns in a line. Typically 78 or 80.
201          * @param respectLeadingCharacters
202          *            Can be null. If set, the specified leading characters will be
203          *            copied if the line is split. Use with " \t" to keep indented
204          *            paragraphs properly indented. Use with "> \t" to also handle
205          *            email-style quoting. Note that respected leading characters
206          *            receive no special treatment when they are used inside a
207          *            paragraph.
208          * @return A copy of text, formatted to the given line-width.
209          *         <p>
210          *         TODO: recognise paragraphs by changes in the respected leading
211          *         characters
212          */
213         public static String format(String text, int lineWidth, int tabWidth,
214                         String respectLeadingCharacters) {
215                 // Switch to Linux line breaks for easier internal workings
216                 text = convertLineBreaks(text, "\n");
217                 // Find paragraphs
218                 List<String> paras = format2_splitParagraphs(text,
219                                 respectLeadingCharacters);
220                 // Rebuild text
221                 StringBuilder sb = new StringBuilder(text.length() + 10);
222                 for (String p : paras) {
223                         String fp = format3_oneParagraph(p, lineWidth, tabWidth,
224                                         respectLeadingCharacters);
225                         sb.append(fp);
226                         // Paragraphs end with a double line break
227                         sb.append("\n\n");
228                 }
229                 // Pop the last line breaks
230                 sb.delete(sb.length() - 2, sb.length());
231                 // Convert line breaks to system ones
232                 text = convertLineBreaks(sb.toString());
233                 // Done
234                 return text;
235         }
236
237         private static List<String> format2_splitParagraphs(String text,
238                         String respectLeadingCharacters) {
239                 List<String> paras = new ArrayList<String>();
240                 Mutable.Int index = new Mutable.Int(0);
241                 // TODO The characters prefacing this paragraph
242                 String leadingChars = "";
243                 while (index.value < text.length()) {
244                         // One paragraph
245                         boolean inSpace = false;
246                         int start = index.value;
247                         while (index.value < text.length()) {
248                                 char c = text.charAt(index.value);
249                                 index.value++;
250                                 if (!Character.isWhitespace(c)) {
251                                         inSpace = false;
252                                         continue;
253                                 }
254                                 // Line end?
255                                 if (c == '\r' || c == '\n') {
256                                         // // Handle MS Windows 2 character \r\n line breaks
257                                         // if (index.value < text.length()) {
258                                         // char c2 = text.charAt(index.value);
259                                         // if (c=='\r' && c2=='\n') index.value++; // Push on past
260                                         // the 2nd line break char
261                                         // }
262                                         // Double line end - indicating a paragraph break
263                                         if (inSpace)
264                                                 break;
265                                         inSpace = true;
266                                 }
267                                 // TODO Other paragraph markers, spotted by a change in
268                                 // leadingChars
269                         }
270                         String p = text.substring(start, index.value);
271                         paras.add(p);
272                 }
273                 // Done
274                 return paras;
275         }
276
277         /**
278          * Format a block of text to fit the given line width
279          * 
280          * @param p
281          * @param lineWidth
282          * @param tabWidth
283          * @param respectLeadingCharacters
284          * @return
285          */
286         private static String format3_oneParagraph(String p, int lineWidth,
287                         int tabWidth, String respectLeadingCharacters) {
288                 // Collect the reformatted paragraph
289                 StringBuilder sb = new StringBuilder(p.length() + 10); // Allow for
290                                                                                                                                 // some extra
291                                                                                                                                 // line-breaks
292                 // Get respected leading chars
293                 String leadingChars = format4_getLeadingChars(p,
294                                 respectLeadingCharacters);
295                 // First Line
296                 sb.append(leadingChars);
297                 int lineLength = leadingChars.length();
298                 int index = leadingChars.length();
299                 // Loop
300                 while (index < p.length()) {
301                         // Get the next word
302                         StringBuilder word = new StringBuilder();
303                         char c = p.charAt(index);
304                         index++;
305                         while (!Character.isWhitespace(c)) {
306                                 word.append(c);
307                                 if (index == p.length())
308                                         break;
309                                 c = p.charAt(index);
310                                 index++;
311                         }
312                         // Break the line if the word will not fit
313                         if (lineLength + word.length() > lineWidth && lineLength != 0) {
314                                 trimEnd(sb);
315                                 sb.append('\n'); // lineEnd(sb);
316                                 // New line
317                                 sb.append(leadingChars);
318                                 lineLength = leadingChars.length();
319                         }
320                         // Add word
321                         sb.append(word);
322                         lineLength += word.length();
323                         // Add the whitespace
324                         if (index != p.length() && lineLength < lineWidth) {
325                                 if (c == '\n') {
326                                         c = ' ';
327                                 }
328                                 sb.append(c);
329                                 lineLength += (c == '\t') ? tabWidth : 1;
330                         }
331                 }
332                 // A final trim
333                 trimEnd(sb);
334                 // Done
335                 return sb.toString();
336         }
337
338         /**
339          * 
340          * @param text
341          * @param respectLeadingCharacters
342          *            Can be null
343          * @return The characters at the beginning of text which are respected. E.g.
344          *         ("> Hello", " \t>") --> "> "
345          */
346         private static String format4_getLeadingChars(String text,
347                         String respectLeadingCharacters) {
348                 if (respectLeadingCharacters == null)
349                         return "";
350                 // Line-breaks cannot be respected
351                 assert respectLeadingCharacters.indexOf('\n') == -1;
352                 // Look for the first non-respected char
353                 for (int i = 0; i < text.length(); i++) {
354                         char c = text.charAt(i);
355                         if (respectLeadingCharacters.indexOf(c) == -1) {
356                                 // Return the previous chars
357                                 return text.substring(0, i);
358                         }
359                 }
360                 // All chars are respected
361                 return text;
362         }
363
364         /**
365          * Ensure that line ends with the right line-end character(s)
366          */
367         public static final String lineEnd(String line) {
368                 // strip possibly inappropriate line-endings
369                 if (line.endsWith("\n")) {
370                         line = line.substring(0, line.length() - 1);
371                 }
372                 if (line.endsWith("\r\n")) {
373                         line = line.substring(0, line.length() - 2);
374                 }
375                 if (line.endsWith("\r")) {
376                         line = line.substring(0, line.length() - 1);
377                 }
378                 // add in proper line end
379                 if (!line.endsWith(LINEEND)) {
380                         line += LINEEND;
381                 }
382                 return line;
383         }
384
385         /**
386          * Ensure that line ends with the right line-end character(s). This is more
387          * efficient than the version for Strings.
388          * 
389          * @param line
390          */
391         public static final void lineEnd(final StringBuilder line) {
392                 if (line.length() == 0) {
393                         line.append(LINEEND);
394                         return;
395                 }
396                 // strip possibly inappropriate line-endings
397                 final char last = line.charAt(line.length() - 1);
398                 if (last == '\n') {
399                         if ((line.length() > 1) && (line.charAt(line.length() - 2) == '\r')) {
400                                 // \r\n
401                                 line.replace(line.length() - 2, line.length(), LINEEND);
402                                 return;
403                         }
404                         line.replace(line.length() - 1, line.length(), LINEEND);
405                         return;
406                 }
407                 if (last == '\r') {
408                         line.replace(line.length() - 1, line.length(), LINEEND);
409                         return;
410                 }
411                 line.append(LINEEND);
412                 return;
413         }
414
415
416         
417         /**
418          * @param string
419          * @return the MD5 sum of the string using the default charset. Null if
420          *         there was an error in calculating the hash.
421          * @author Sam Halliday
422          */
423         public static String md5Hash(String string) {
424                 MessageDigest md5 = null;
425                 try {
426                         md5 = MessageDigest.getInstance("MD5");
427                 } catch (NoSuchAlgorithmException e) {
428                         // ignore this exception, we know MD5 exists
429                 }
430                 md5.update(string.getBytes());
431                 BigInteger hash = new BigInteger(1, md5.digest());
432                 return hash.toString(16);
433         }
434
435         /**
436          * Removes HTML-style tags from a string.
437          * 
438          * @param s
439          *            a String from which to remove tags
440          * @return a string with all instances of <.*> removed.
441          */
442         public static String removeTags(String s) {
443                 StringBuffer sb = new StringBuffer();
444                 boolean inTag = false;
445                 for (int i = 0; i < s.length(); i++) {
446                         char c = s.charAt(i);
447                         if (c == '<')
448                                 inTag = true;
449                         if (!inTag)
450                                 sb.append(c);
451                         if (c == '>')
452                                 inTag = false;
453                 }
454                 return sb.toString();
455         }
456
457         /**
458          * Repeat a character.
459          * 
460          * @param c
461          * @param i
462          * @return A String consisting of i x c.
463          * @example assert repeat('-', 5).equals("-----");
464          */
465         public static String repeat(Character c, int i) {
466                 StringBuilder dashes = new StringBuilder(i);
467                 for (int j = 0; j < i; j++)
468                         dashes.append(c);
469                 return dashes.toString();
470         }
471
472         /**
473          * Split a piece of text into separate lines. The line breaks are left at
474          * the end of each line.
475          * 
476          * @param text
477          * @return The individual lines in the text.
478          */
479         public static List<String> splitLines(String text) {
480                 List<String> lines = new ArrayList<String>();
481                 // Search for lines
482                 int start = 0;
483                 for (int i = 0; i < text.length(); i++) {
484                         char c = text.charAt(i);
485                         if (c == '\r' || c == '\n') {
486                                 // Handle MS Windows 2 character \r\n line breaks
487                                 if (i + 1 < text.length()) {
488                                         char c2 = text.charAt(i + 1);
489                                         if (c == '\r' && c2 == '\n')
490                                                 i++;
491                                 }
492                                 // Get the line, with the line break
493                                 String line = text.substring(start, i + 1);
494                                 lines.add(line);
495                                 start = i + 1;
496                         }
497                 }
498                 // Last one
499                 if (start != text.length()) {
500                         String line = text.substring(start);
501                         lines.add(line);
502                 }
503                 return lines;
504         }
505
506         /**
507          * Remove <i>trailing</i> whitespace. c.f. String#trim() which removes
508          * leading and trailing whitespace.
509          * 
510          * @param sb
511          */
512         private static void trimEnd(StringBuilder sb) {
513                 while (true) {
514                         // Get the last character
515                         int i = sb.length() - 1;
516                         if (i == -1)
517                                 return; // Quit if sb is empty
518                         char c = sb.charAt(i);
519                         if (!Character.isWhitespace(c))
520                                 return; // Finish?
521                         sb.deleteCharAt(i); // Remove and continue
522                 }
523         }
524
525         /**
526          * Returns true if the string is just whitespace, or empty, or null.
527          * 
528          * @param s
529          */
530         public static final boolean whitespace(final String s) {
531                 if (s == null) {
532                         return true;
533                 }
534                 for (int i = 0; i < s.length(); i++) {
535                         final char c = s.charAt(i);
536                         if (!Character.isWhitespace(c)) {
537                                 return false;
538                         }
539                 }
540                 return true;
541         }
542
543         /**
544          * @param text
545          * @return the number of words in text. Uses a crude whitespace
546          * measure.
547          */
548         public static int wordCount(String text) {
549                 String[] bits = text.split("\\W+");
550                 int wc = 0;
551                 for (String string : bits) {
552                         if (!whitespace(string)) wc++;
553                 }
554                 return wc;
555         }
556
557 }