2 * Basic String manipulation utilities.
\r
3 * (c) Winterwell 2010 and ThinkTank Mathematics 2007
\r
5 package winterwell.markdown;
\r
7 import java.math.BigInteger;
\r
8 import java.security.MessageDigest;
\r
9 import java.security.NoSuchAlgorithmException;
\r
10 import java.util.ArrayList;
\r
11 import java.util.List;
\r
12 import java.util.regex.Pattern;
\r
14 import winterwell.utils.Mutable;
\r
15 import winterwell.utils.containers.Pair;
\r
18 * A collection of general-purpose String handling methods.
\r
20 * @author daniel.winterstein
\r
22 public final class StringMethods {
\r
25 * Removes xml tags, comment blocks and script blocks.
\r
28 * @return the page with all xml tags removed.
\r
30 public static String stripTags(String page) {
\r
31 // This code is rather ugly, but it does the job
\r
32 StringBuilder stripped = new StringBuilder(page.length());
\r
33 boolean inTag = false;
\r
34 // Comment blocks and script blocks are given special treatment
\r
35 boolean inComment = false;
\r
36 boolean inScript = false;
\r
37 // Go through the text
\r
38 for (int i = 0; i < page.length(); i++) {
\r
39 char c = page.charAt(i);
\r
40 // First check whether we are ignoring text
\r
44 } else if (inComment) {
\r
45 if (c == '>' && page.charAt(i - 1) == '-'
\r
46 && page.charAt(i - 1) == '-') {
\r
49 } else if (inScript) {
\r
50 if (c == '>' && page.substring(i - 7, i).equals("/script")) {
\r
54 // Check for the start of a tag - looks for '<' followed by any
\r
55 // non-whitespace character
\r
56 if (c == '<' && !Character.isWhitespace(page.charAt(i + 1))) {
\r
57 // Comment, script-block or tag?
\r
58 if (page.charAt(i + 1) == '!' && page.charAt(i + 2) == '-'
\r
59 && page.charAt(i + 3) == '-') {
\r
61 } else if (i + 8 < page.length()
\r
62 && page.substring(i + 1, i + 7).equals("script")) {
\r
66 inTag = true; // Normal tag by default
\r
68 // Append all non-tag chars
\r
73 return stripped.toString();
\r
77 * The local line-end string. \n on unix, \r\n on windows, \r on mac.
\r
79 public static final String LINEEND = System.getProperty("line.separator");
\r
83 * @return A version of s where the first letter is uppercase and all others
\r
86 public static final String capitalise(final String s) {
\r
87 return s.substring(0, 1).toUpperCase() + s.substring(1).toLowerCase();
\r
91 * Convert all line breaks into the system line break.
\r
93 public static final String convertLineBreaks(String text) {
\r
94 return convertLineBreaks(text, LINEEND);
\r
98 * Convert all line breaks into the specified line break.
\r
100 public static final String convertLineBreaks(String text, String br) {
\r
101 text = text.replaceAll("\r\n", br);
\r
102 text = text.replaceAll("\r", br);
\r
103 text = text.replaceAll("\n", br);
\r
110 * @return the number of times character appears in the string
\r
111 * @author Sam Halliday
\r
113 static public int countCharsInString(String string, char character) {
\r
115 for (char c : string.toCharArray()) {
\r
116 if (c == character) {
\r
126 * <code>findEnclosingRegion("text with a [region] inside", 15, '[', ']')</code>
\r
133 * @return the smallest enclosed region (including start and end chars, the
\r
134 * 1st number is inclusive, the 2nd exclusive), or null if none. So
\r
135 * text.subString(start,end) is the specified region
\r
137 public static Pair<Integer> findEnclosingRegion(String text, int offset,
\r
138 char startMarker, char endMarker) {
\r
140 int end = findEnclosingRegion2(text, offset, endMarker, 1);
\r
143 end++; // end is exclusive
\r
145 int start = findEnclosingRegion2(text, offset, startMarker, -1);
\r
149 assert text.substring(start, end).charAt(0) == startMarker;
\r
150 assert text.substring(start, end).endsWith("" + endMarker);
\r
152 return new Pair<Integer>(start, end);
\r
155 private static int findEnclosingRegion2(String text, int offset,
\r
156 char endMarker, int direction) {
\r
157 while (offset > -1 && offset < text.length()) {
\r
158 char c = text.charAt(offset);
\r
159 if (c == endMarker)
\r
161 offset += direction;
\r
167 * A convenience wrapper for
\r
168 * {@link #findEnclosingRegion(String, int, char, char)} E.g. <code>
\r
169 findEnclosingRegion("text with a [region] inside", 15, '[', ']') .equals("[region]");
\r
176 * @return the smallest enclosed region (including start and end chars), or
\r
179 public static String findEnclosingText(String text, int offset,
\r
180 char startMarker, char endMarker) {
\r
181 Pair<Integer> region = findEnclosingRegion(text, offset, startMarker,
\r
183 if (region == null)
\r
185 String s = text.substring(region.first, region.second);
\r
190 * Format a block of text to use the given line-width. I.e. adjust the line
\r
191 * breaks. Also known as <i>hard</i> line-wrapping. Paragraphs are
\r
192 * recognised by a line of blank space between them (e.g. two returns).
\r
194 * Note: a side-effect of this method is that it converts all line-breaks
\r
195 * into the local system's line-breaks. E.g. on Windows, \n will become \r\n
\r
198 * The text to format
\r
200 * The number of columns in a line. Typically 78 or 80.
\r
201 * @param respectLeadingCharacters
\r
202 * Can be null. If set, the specified leading characters will be
\r
203 * copied if the line is split. Use with " \t" to keep indented
\r
204 * paragraphs properly indented. Use with "> \t" to also handle
\r
205 * email-style quoting. Note that respected leading characters
\r
206 * receive no special treatment when they are used inside a
\r
208 * @return A copy of text, formatted to the given line-width.
\r
210 * TODO: recognise paragraphs by changes in the respected leading
\r
213 public static String format(String text, int lineWidth, int tabWidth,
\r
214 String respectLeadingCharacters) {
\r
215 // Switch to Linux line breaks for easier internal workings
\r
216 text = convertLineBreaks(text, "\n");
\r
218 List<String> paras = format2_splitParagraphs(text,
\r
219 respectLeadingCharacters);
\r
221 StringBuilder sb = new StringBuilder(text.length() + 10);
\r
222 for (String p : paras) {
\r
223 String fp = format3_oneParagraph(p, lineWidth, tabWidth,
\r
224 respectLeadingCharacters);
\r
226 // Paragraphs end with a double line break
\r
229 // Pop the last line breaks
\r
230 sb.delete(sb.length() - 2, sb.length());
\r
231 // Convert line breaks to system ones
\r
232 text = convertLineBreaks(sb.toString());
\r
237 private static List<String> format2_splitParagraphs(String text,
\r
238 String respectLeadingCharacters) {
\r
239 List<String> paras = new ArrayList<String>();
\r
240 Mutable.Int index = new Mutable.Int(0);
\r
241 // TODO The characters prefacing this paragraph
\r
242 String leadingChars = "";
\r
243 while (index.value < text.length()) {
\r
245 boolean inSpace = false;
\r
246 int start = index.value;
\r
247 while (index.value < text.length()) {
\r
248 char c = text.charAt(index.value);
\r
250 if (!Character.isWhitespace(c)) {
\r
255 if (c == '\r' || c == '\n') {
\r
256 // // Handle MS Windows 2 character \r\n line breaks
\r
257 // if (index.value < text.length()) {
\r
258 // char c2 = text.charAt(index.value);
\r
259 // if (c=='\r' && c2=='\n') index.value++; // Push on past
\r
260 // the 2nd line break char
\r
262 // Double line end - indicating a paragraph break
\r
267 // TODO Other paragraph markers, spotted by a change in
\r
270 String p = text.substring(start, index.value);
\r
278 * Format a block of text to fit the given line width
\r
283 * @param respectLeadingCharacters
\r
286 private static String format3_oneParagraph(String p, int lineWidth,
\r
287 int tabWidth, String respectLeadingCharacters) {
\r
288 // Collect the reformatted paragraph
\r
289 StringBuilder sb = new StringBuilder(p.length() + 10); // Allow for
\r
292 // Get respected leading chars
\r
293 String leadingChars = format4_getLeadingChars(p,
\r
294 respectLeadingCharacters);
\r
296 sb.append(leadingChars);
\r
297 int lineLength = leadingChars.length();
\r
298 int index = leadingChars.length();
\r
300 while (index < p.length()) {
\r
301 // Get the next word
\r
302 StringBuilder word = new StringBuilder();
\r
303 char c = p.charAt(index);
\r
305 while (!Character.isWhitespace(c)) {
\r
307 if (index == p.length())
\r
309 c = p.charAt(index);
\r
312 // Break the line if the word will not fit
\r
313 if (lineLength + word.length() > lineWidth && lineLength != 0) {
\r
315 sb.append('\n'); // lineEnd(sb);
\r
317 sb.append(leadingChars);
\r
318 lineLength = leadingChars.length();
\r
322 lineLength += word.length();
\r
323 // Add the whitespace
\r
324 if (index != p.length() && lineLength < lineWidth) {
\r
329 lineLength += (c == '\t') ? tabWidth : 1;
\r
335 return sb.toString();
\r
341 * @param respectLeadingCharacters
\r
343 * @return The characters at the beginning of text which are respected. E.g.
\r
344 * ("> Hello", " \t>") --> "> "
\r
346 private static String format4_getLeadingChars(String text,
\r
347 String respectLeadingCharacters) {
\r
348 if (respectLeadingCharacters == null)
\r
350 // Line-breaks cannot be respected
\r
351 assert respectLeadingCharacters.indexOf('\n') == -1;
\r
352 // Look for the first non-respected char
\r
353 for (int i = 0; i < text.length(); i++) {
\r
354 char c = text.charAt(i);
\r
355 if (respectLeadingCharacters.indexOf(c) == -1) {
\r
356 // Return the previous chars
\r
357 return text.substring(0, i);
\r
360 // All chars are respected
\r
365 * Ensure that line ends with the right line-end character(s)
\r
367 public static final String lineEnd(String line) {
\r
368 // strip possibly inappropriate line-endings
\r
369 if (line.endsWith("\n")) {
\r
370 line = line.substring(0, line.length() - 1);
\r
372 if (line.endsWith("\r\n")) {
\r
373 line = line.substring(0, line.length() - 2);
\r
375 if (line.endsWith("\r")) {
\r
376 line = line.substring(0, line.length() - 1);
\r
378 // add in proper line end
\r
379 if (!line.endsWith(LINEEND)) {
\r
386 * Ensure that line ends with the right line-end character(s). This is more
\r
387 * efficient than the version for Strings.
\r
391 public static final void lineEnd(final StringBuilder line) {
\r
392 if (line.length() == 0) {
\r
393 line.append(LINEEND);
\r
396 // strip possibly inappropriate line-endings
\r
397 final char last = line.charAt(line.length() - 1);
\r
398 if (last == '\n') {
\r
399 if ((line.length() > 1) && (line.charAt(line.length() - 2) == '\r')) {
\r
401 line.replace(line.length() - 2, line.length(), LINEEND);
\r
404 line.replace(line.length() - 1, line.length(), LINEEND);
\r
407 if (last == '\r') {
\r
408 line.replace(line.length() - 1, line.length(), LINEEND);
\r
411 line.append(LINEEND);
\r
419 * @return the MD5 sum of the string using the default charset. Null if
\r
420 * there was an error in calculating the hash.
\r
421 * @author Sam Halliday
\r
423 public static String md5Hash(String string) {
\r
424 MessageDigest md5 = null;
\r
426 md5 = MessageDigest.getInstance("MD5");
\r
427 } catch (NoSuchAlgorithmException e) {
\r
428 // ignore this exception, we know MD5 exists
\r
430 md5.update(string.getBytes());
\r
431 BigInteger hash = new BigInteger(1, md5.digest());
\r
432 return hash.toString(16);
\r
436 * Removes HTML-style tags from a string.
\r
439 * a String from which to remove tags
\r
440 * @return a string with all instances of <.*> removed.
\r
442 public static String removeTags(String s) {
\r
443 StringBuffer sb = new StringBuffer();
\r
444 boolean inTag = false;
\r
445 for (int i = 0; i < s.length(); i++) {
\r
446 char c = s.charAt(i);
\r
454 return sb.toString();
\r
458 * Repeat a character.
\r
462 * @return A String consisting of i x c.
\r
463 * @example assert repeat('-', 5).equals("-----");
\r
465 public static String repeat(Character c, int i) {
\r
466 StringBuilder dashes = new StringBuilder(i);
\r
467 for (int j = 0; j < i; j++)
\r
469 return dashes.toString();
\r
473 * Split a piece of text into separate lines. The line breaks are left at
\r
474 * the end of each line.
\r
477 * @return The individual lines in the text.
\r
479 public static List<String> splitLines(String text) {
\r
480 List<String> lines = new ArrayList<String>();
\r
481 // Search for lines
\r
483 for (int i = 0; i < text.length(); i++) {
\r
484 char c = text.charAt(i);
\r
485 if (c == '\r' || c == '\n') {
\r
486 // Handle MS Windows 2 character \r\n line breaks
\r
487 if (i + 1 < text.length()) {
\r
488 char c2 = text.charAt(i + 1);
\r
489 if (c == '\r' && c2 == '\n')
\r
492 // Get the line, with the line break
\r
493 String line = text.substring(start, i + 1);
\r
499 if (start != text.length()) {
\r
500 String line = text.substring(start);
\r
507 * Remove <i>trailing</i> whitespace. c.f. String#trim() which removes
\r
508 * leading and trailing whitespace.
\r
512 private static void trimEnd(StringBuilder sb) {
\r
514 // Get the last character
\r
515 int i = sb.length() - 1;
\r
517 return; // Quit if sb is empty
\r
518 char c = sb.charAt(i);
\r
519 if (!Character.isWhitespace(c))
\r
521 sb.deleteCharAt(i); // Remove and continue
\r
526 * Returns true if the string is just whitespace, or empty, or null.
\r
530 public static final boolean whitespace(final String s) {
\r
534 for (int i = 0; i < s.length(); i++) {
\r
535 final char c = s.charAt(i);
\r
536 if (!Character.isWhitespace(c)) {
\r
545 * @return the number of words in text. Uses a crude whitespace
\r
548 public static int wordCount(String text) {
\r
549 String[] bits = text.split("\\W+");
\r
551 for (String string : bits) {
\r
552 if (!whitespace(string)) wc++;
\r