bundles/winterwell.markdown/src/winterwell/markdown/StringMethods.java

   1 /**\r
   2  * Basic String manipulation utilities.\r
   3  * (c) Winterwell 2010 and ThinkTank Mathematics 2007\r
   4  */\r
   5 package winterwell.markdown;\r
   6 \r
   7 import java.math.BigInteger;\r
   8 import java.security.MessageDigest;\r
   9 import java.security.NoSuchAlgorithmException;\r
  10 import java.util.ArrayList;\r
  11 import java.util.List;\r
  12 import java.util.regex.Pattern;\r
  13 \r
  14 import winterwell.utils.Mutable;\r
  15 import winterwell.utils.containers.Pair;\r
  16 \r
  17 /**\r
  18  * A collection of general-purpose String handling methods.\r
  19  * \r
  20  * @author daniel.winterstein\r
  21  */\r
  22 public final class StringMethods {\r
  23 \r
  24         /**\r
  25          * Removes xml tags, comment blocks and script blocks.\r
  26          * \r
  27          * @param page\r
  28          * @return the page with all xml tags removed.\r
  29          */\r
  30         public static String stripTags(String page) {\r
  31                 // This code is rather ugly, but it does the job\r
  32                 StringBuilder stripped = new StringBuilder(page.length());\r
  33                 boolean inTag = false;\r
  34                 // Comment blocks and script blocks are given special treatment\r
  35                 boolean inComment = false;\r
  36                 boolean inScript = false;\r
  37                 // Go through the text\r
  38                 for (int i = 0; i < page.length(); i++) {\r
  39                         char c = page.charAt(i);\r
  40                         // First check whether we are ignoring text\r
  41                         if (inTag) {\r
  42                                 if (c == '>')\r
  43                                         inTag = false;\r
  44                         } else if (inComment) {\r
  45                                 if (c == '>' && page.charAt(i - 1) == '-'\r
  46                                                 && page.charAt(i - 1) == '-') {\r
  47                                         inComment = false;\r
  48                                 }\r
  49                         } else if (inScript) {\r
  50                                 if (c == '>' && page.substring(i - 7, i).equals("/script")) {\r
  51                                         inScript = false;\r
  52                                 }\r
  53                         } else {\r
  54                                 // Check for the start of a tag - looks for '<' followed by any\r
  55                                 // non-whitespace character\r
  56                                 if (c == '<' && !Character.isWhitespace(page.charAt(i + 1))) {\r
  57                                         // Comment, script-block or tag?\r
  58                                         if (page.charAt(i + 1) == '!' && page.charAt(i + 2) == '-'\r
  59                                                         && page.charAt(i + 3) == '-') {\r
  60                                                 inComment = true;\r
  61                                         } else if (i + 8 < page.length()\r
  62                                                         && page.substring(i + 1, i + 7).equals("script")) {\r
  63                                                 inScript = true;\r
  64                                                 i += 7;\r
  65                                         } else\r
  66                                                 inTag = true; // Normal tag by default\r
  67                                 } else {\r
  68                                         // Append all non-tag chars\r
  69                                         stripped.append(c);\r
  70                                 }\r
  71                         } // end if...\r
  72                 }\r
  73                 return stripped.toString();\r
  74         }\r
  75         \r
        /**
         * The local line-end string, read once from the "line.separator" system
         * property when this class is loaded. Typically \n on Unix-like systems
         * (including Mac OS X) and \r\n on Windows; a bare \r was only used by
         * the classic (pre-OS X) Mac OS.
         */
        public static final String LINEEND = System.getProperty("line.separator");
  80 \r
  81         /**\r
  82          * @param s\r
  83          * @return A version of s where the first letter is uppercase and all others\r
  84          *         are lowercase\r
  85          */\r
  86         public static final String capitalise(final String s) {\r
  87                 return s.substring(0, 1).toUpperCase() + s.substring(1).toLowerCase();\r
  88         }\r
  89 \r
  90         /**\r
  91          * Convert all line breaks into the system line break.\r
  92          */\r
  93         public static final String convertLineBreaks(String text) {\r
  94                 return convertLineBreaks(text, LINEEND);\r
  95         }\r
  96 \r
  97         /**\r
  98          * Convert all line breaks into the specified line break.\r
  99          */\r
 100         public static final String convertLineBreaks(String text, String br) {\r
 101                 text = text.replaceAll("\r\n", br);\r
 102                 text = text.replaceAll("\r", br);\r
 103                 text = text.replaceAll("\n", br);\r
 104                 return text;\r
 105         }\r
 106 \r
 107         /**\r
 108          * @param string\r
 109          * @param character\r
 110          * @return the number of times character appears in the string\r
 111          * @author Sam Halliday\r
 112          */\r
 113         static public int countCharsInString(String string, char character) {\r
 114                 int count = 0;\r
 115                 for (char c : string.toCharArray()) {\r
 116                         if (c == character) {\r
 117                                 count++;\r
 118                         }\r
 119                 }\r
 120                 return count;\r
 121         }\r
 122 \r
 123         /**\r
 124          * \r
 125          * E.g.\r
 126          * <code>findEnclosingRegion("text with a [region] inside", 15, '[', ']')</code>\r
 127          * is (??,??)\r
 128          * \r
 129          * @param text\r
 130          * @param offset\r
 131          * @param start\r
 132          * @param end\r
 133          * @return the smallest enclosed region (including start and end chars, the\r
 134          *         1st number is inclusive, the 2nd exclusive), or null if none. So\r
 135          *         text.subString(start,end) is the specified region\r
 136          */\r
 137         public static Pair<Integer> findEnclosingRegion(String text, int offset,\r
 138                         char startMarker, char endMarker) {\r
 139                 // Forward\r
 140                 int end = findEnclosingRegion2(text, offset, endMarker, 1);\r
 141                 if (end == -1)\r
 142                         return null;\r
 143                 end++; // end is exclusive\r
 144                 // Backward\r
 145                 int start = findEnclosingRegion2(text, offset, startMarker, -1);\r
 146                 if (start == -1)\r
 147                         return null;\r
 148                 // Sanity\r
 149                 assert text.substring(start, end).charAt(0) == startMarker;\r
 150                 assert text.substring(start, end).endsWith("" + endMarker);\r
 151                 // Done\r
 152                 return new Pair<Integer>(start, end);\r
 153         }\r
 154 \r
 155         private static int findEnclosingRegion2(String text, int offset,\r
 156                         char endMarker, int direction) {\r
 157                 while (offset > -1 && offset < text.length()) {\r
 158                         char c = text.charAt(offset);\r
 159                         if (c == endMarker)\r
 160                                 return offset;\r
 161                         offset += direction;\r
 162                 }\r
 163                 return -1;\r
 164         }\r
 165 \r
 166         /**\r
 167          * A convenience wrapper for\r
 168          * {@link #findEnclosingRegion(String, int, char, char)} E.g. <code>\r
 169          findEnclosingRegion("text with a [region] inside", 15, '[', ']') .equals("[region]");\r
 170          </code>\r
 171          * \r
 172          * @param text\r
 173          * @param offset\r
 174          * @param start\r
 175          * @param end\r
 176          * @return the smallest enclosed region (including start and end chars), or\r
 177          *         null if none.\r
 178          */\r
 179         public static String findEnclosingText(String text, int offset,\r
 180                         char startMarker, char endMarker) {\r
 181                 Pair<Integer> region = findEnclosingRegion(text, offset, startMarker,\r
 182                                 endMarker);\r
 183                 if (region == null)\r
 184                         return null;\r
 185                 String s = text.substring(region.first, region.second);\r
 186                 return s;\r
 187         }\r
 188 \r
 189         /**\r
 190          * Format a block of text to use the given line-width. I.e. adjust the line\r
 191          * breaks. Also known as <i>hard</i> line-wrapping. Paragraphs are\r
 192          * recognised by a line of blank space between them (e.g. two returns).\r
 193          * <p>\r
 194          * Note: a side-effect of this method is that it converts all line-breaks\r
 195          * into the local system's line-breaks. E.g. on Windows, \n will become \r\n\r
 196          * \r
 197          * @param text\r
 198          *            The text to format\r
 199          * @param lineWidth\r
 200          *            The number of columns in a line. Typically 78 or 80.\r
 201          * @param respectLeadingCharacters\r
 202          *            Can be null. If set, the specified leading characters will be\r
 203          *            copied if the line is split. Use with " \t" to keep indented\r
 204          *            paragraphs properly indented. Use with "> \t" to also handle\r
 205          *            email-style quoting. Note that respected leading characters\r
 206          *            receive no special treatment when they are used inside a\r
 207          *            paragraph.\r
 208          * @return A copy of text, formatted to the given line-width.\r
 209          *         <p>\r
 210          *         TODO: recognise paragraphs by changes in the respected leading\r
 211          *         characters\r
 212          */\r
 213         public static String format(String text, int lineWidth, int tabWidth,\r
 214                         String respectLeadingCharacters) {\r
 215                 // Switch to Linux line breaks for easier internal workings\r
 216                 text = convertLineBreaks(text, "\n");\r
 217                 // Find paragraphs\r
 218                 List<String> paras = format2_splitParagraphs(text,\r
 219                                 respectLeadingCharacters);\r
 220                 // Rebuild text\r
 221                 StringBuilder sb = new StringBuilder(text.length() + 10);\r
 222                 for (String p : paras) {\r
 223                         String fp = format3_oneParagraph(p, lineWidth, tabWidth,\r
 224                                         respectLeadingCharacters);\r
 225                         sb.append(fp);\r
 226                         // Paragraphs end with a double line break\r
 227                         sb.append("\n\n");\r
 228                 }\r
 229                 // Pop the last line breaks\r
 230                 sb.delete(sb.length() - 2, sb.length());\r
 231                 // Convert line breaks to system ones\r
 232                 text = convertLineBreaks(sb.toString());\r
 233                 // Done\r
 234                 return text;\r
 235         }\r
 236 \r
 237         private static List<String> format2_splitParagraphs(String text,\r
 238                         String respectLeadingCharacters) {\r
 239                 List<String> paras = new ArrayList<String>();\r
 240                 Mutable.Int index = new Mutable.Int(0);\r
 241                 // TODO The characters prefacing this paragraph\r
 242                 String leadingChars = "";\r
 243                 while (index.value < text.length()) {\r
 244                         // One paragraph\r
 245                         boolean inSpace = false;\r
 246                         int start = index.value;\r
 247                         while (index.value < text.length()) {\r
 248                                 char c = text.charAt(index.value);\r
 249                                 index.value++;\r
 250                                 if (!Character.isWhitespace(c)) {\r
 251                                         inSpace = false;\r
 252                                         continue;\r
 253                                 }\r
 254                                 // Line end?\r
 255                                 if (c == '\r' || c == '\n') {\r
 256                                         // // Handle MS Windows 2 character \r\n line breaks\r
 257                                         // if (index.value < text.length()) {\r
 258                                         // char c2 = text.charAt(index.value);\r
 259                                         // if (c=='\r' && c2=='\n') index.value++; // Push on past\r
 260                                         // the 2nd line break char\r
 261                                         // }\r
 262                                         // Double line end - indicating a paragraph break\r
 263                                         if (inSpace)\r
 264                                                 break;\r
 265                                         inSpace = true;\r
 266                                 }\r
 267                                 // TODO Other paragraph markers, spotted by a change in\r
 268                                 // leadingChars\r
 269                         }\r
 270                         String p = text.substring(start, index.value);\r
 271                         paras.add(p);\r
 272                 }\r
 273                 // Done\r
 274                 return paras;\r
 275         }\r
 276 \r
        /**
         * Format a single paragraph to fit the given line width.
         * <p>
         * The paragraph is consumed word by word, where a word is a run of
         * non-whitespace characters. A word that would take the current line past
         * lineWidth is moved onto a new line, which starts with the paragraph's
         * respected leading characters.
         * 
         * @param p
         *            the paragraph to format
         * @param lineWidth
         *            the number of columns in a line
         * @param tabWidth
         *            the number of columns a tab between words is counted as. Note
         *            that the leading characters are counted as one column each.
         * @param respectLeadingCharacters
         *            can be null. The characters which, at the start of p, form the
         *            prefix that is repeated on every new line.
         * @return the re-wrapped paragraph. Lines are separated by '\n' and
         *         trailing whitespace is removed.
         */
        private static String format3_oneParagraph(String p, int lineWidth,
                        int tabWidth, String respectLeadingCharacters) {
                // Collect the reformatted paragraph, allowing for some extra
                // line-breaks
                StringBuilder sb = new StringBuilder(p.length() + 10);
                // Get respected leading chars
                String leadingChars = format4_getLeadingChars(p,
                                respectLeadingCharacters);
                // First Line
                sb.append(leadingChars);
                // Columns used so far on the current output line
                int lineLength = leadingChars.length();
                // Read position in p - starts just after the leading chars
                int index = leadingChars.length();
                // Loop: each pass handles one word plus the whitespace char after it
                while (index < p.length()) {
                        // Get the next word. It is empty if c is already whitespace,
                        // which is how runs of whitespace are handled one char at a time.
                        StringBuilder word = new StringBuilder();
                        char c = p.charAt(index);
                        index++;
                        while (!Character.isWhitespace(c)) {
                                word.append(c);
                                if (index == p.length())
                                        break;
                                c = p.charAt(index);
                                index++;
                        }
                        // Break the line if the word will not fit - unless the line is
                        // empty, in which case breaking would not help
                        if (lineLength + word.length() > lineWidth && lineLength != 0) {
                                trimEnd(sb);
                                // Always '\n' internally
                                sb.append('\n');
                                // New line
                                sb.append(leadingChars);
                                lineLength = leadingChars.length();
                        }
                        // Add word
                        sb.append(word);
                        lineLength += word.length();
                        // Add the whitespace that ended the word, but not at the very end
                        // of the paragraph and not once the line is full
                        if (index != p.length() && lineLength < lineWidth) {
                                // An old line break inside the paragraph becomes a space
                                if (c == '\n') {
                                        c = ' ';
                                }
                                sb.append(c);
                                lineLength += (c == '\t') ? tabWidth : 1;
                        }
                }
                // A final trim
                trimEnd(sb);
                // Done
                return sb.toString();
        }
 337 \r
 338         /**\r
 339          * \r
 340          * @param text\r
 341          * @param respectLeadingCharacters\r
 342          *            Can be null\r
 343          * @return The characters at the beginning of text which are respected. E.g.\r
 344          *         ("> Hello", " \t>") --> "> "\r
 345          */\r
 346         private static String format4_getLeadingChars(String text,\r
 347                         String respectLeadingCharacters) {\r
 348                 if (respectLeadingCharacters == null)\r
 349                         return "";\r
 350                 // Line-breaks cannot be respected\r
 351                 assert respectLeadingCharacters.indexOf('\n') == -1;\r
 352                 // Look for the first non-respected char\r
 353                 for (int i = 0; i < text.length(); i++) {\r
 354                         char c = text.charAt(i);\r
 355                         if (respectLeadingCharacters.indexOf(c) == -1) {\r
 356                                 // Return the previous chars\r
 357                                 return text.substring(0, i);\r
 358                         }\r
 359                 }\r
 360                 // All chars are respected\r
 361                 return text;\r
 362         }\r
 363 \r
 364         /**\r
 365          * Ensure that line ends with the right line-end character(s)\r
 366          */\r
 367         public static final String lineEnd(String line) {\r
 368                 // strip possibly inappropriate line-endings\r
 369                 if (line.endsWith("\n")) {\r
 370                         line = line.substring(0, line.length() - 1);\r
 371                 }\r
 372                 if (line.endsWith("\r\n")) {\r
 373                         line = line.substring(0, line.length() - 2);\r
 374                 }\r
 375                 if (line.endsWith("\r")) {\r
 376                         line = line.substring(0, line.length() - 1);\r
 377                 }\r
 378                 // add in proper line end\r
 379                 if (!line.endsWith(LINEEND)) {\r
 380                         line += LINEEND;\r
 381                 }\r
 382                 return line;\r
 383         }\r
 384 \r
 385         /**\r
 386          * Ensure that line ends with the right line-end character(s). This is more\r
 387          * efficient than the version for Strings.\r
 388          * \r
 389          * @param line\r
 390          */\r
 391         public static final void lineEnd(final StringBuilder line) {\r
 392                 if (line.length() == 0) {\r
 393                         line.append(LINEEND);\r
 394                         return;\r
 395                 }\r
 396                 // strip possibly inappropriate line-endings\r
 397                 final char last = line.charAt(line.length() - 1);\r
 398                 if (last == '\n') {\r
 399                         if ((line.length() > 1) && (line.charAt(line.length() - 2) == '\r')) {\r
 400                                 // \r\n\r
 401                                 line.replace(line.length() - 2, line.length(), LINEEND);\r
 402                                 return;\r
 403                         }\r
 404                         line.replace(line.length() - 1, line.length(), LINEEND);\r
 405                         return;\r
 406                 }\r
 407                 if (last == '\r') {\r
 408                         line.replace(line.length() - 1, line.length(), LINEEND);\r
 409                         return;\r
 410                 }\r
 411                 line.append(LINEEND);\r
 412                 return;\r
 413         }\r
 414 \r
 415 \r
 416         \r
 417         /**\r
 418          * @param string\r
 419          * @return the MD5 sum of the string using the default charset. Null if\r
 420          *         there was an error in calculating the hash.\r
 421          * @author Sam Halliday\r
 422          */\r
 423         public static String md5Hash(String string) {\r
 424                 MessageDigest md5 = null;\r
 425                 try {\r
 426                         md5 = MessageDigest.getInstance("MD5");\r
 427                 } catch (NoSuchAlgorithmException e) {\r
 428                         // ignore this exception, we know MD5 exists\r
 429                 }\r
 430                 md5.update(string.getBytes());\r
 431                 BigInteger hash = new BigInteger(1, md5.digest());\r
 432                 return hash.toString(16);\r
 433         }\r
 434 \r
 435         /**\r
 436          * Removes HTML-style tags from a string.\r
 437          * \r
 438          * @param s\r
 439          *            a String from which to remove tags\r
 440          * @return a string with all instances of <.*> removed.\r
 441          */\r
 442         public static String removeTags(String s) {\r
 443                 StringBuffer sb = new StringBuffer();\r
 444                 boolean inTag = false;\r
 445                 for (int i = 0; i < s.length(); i++) {\r
 446                         char c = s.charAt(i);\r
 447                         if (c == '<')\r
 448                                 inTag = true;\r
 449                         if (!inTag)\r
 450                                 sb.append(c);\r
 451                         if (c == '>')\r
 452                                 inTag = false;\r
 453                 }\r
 454                 return sb.toString();\r
 455         }\r
 456 \r
 457         /**\r
 458          * Repeat a character.\r
 459          * \r
 460          * @param c\r
 461          * @param i\r
 462          * @return A String consisting of i x c.\r
 463          * @example assert repeat('-', 5).equals("-----");\r
 464          */\r
 465         public static String repeat(Character c, int i) {\r
 466                 StringBuilder dashes = new StringBuilder(i);\r
 467                 for (int j = 0; j < i; j++)\r
 468                         dashes.append(c);\r
 469                 return dashes.toString();\r
 470         }\r
 471 \r
 472         /**\r
 473          * Split a piece of text into separate lines. The line breaks are left at\r
 474          * the end of each line.\r
 475          * \r
 476          * @param text\r
 477          * @return The individual lines in the text.\r
 478          */\r
 479         public static List<String> splitLines(String text) {\r
 480                 List<String> lines = new ArrayList<String>();\r
 481                 // Search for lines\r
 482                 int start = 0;\r
 483                 for (int i = 0; i < text.length(); i++) {\r
 484                         char c = text.charAt(i);\r
 485                         if (c == '\r' || c == '\n') {\r
 486                                 // Handle MS Windows 2 character \r\n line breaks\r
 487                                 if (i + 1 < text.length()) {\r
 488                                         char c2 = text.charAt(i + 1);\r
 489                                         if (c == '\r' && c2 == '\n')\r
 490                                                 i++;\r
 491                                 }\r
 492                                 // Get the line, with the line break\r
 493                                 String line = text.substring(start, i + 1);\r
 494                                 lines.add(line);\r
 495                                 start = i + 1;\r
 496                         }\r
 497                 }\r
 498                 // Last one\r
 499                 if (start != text.length()) {\r
 500                         String line = text.substring(start);\r
 501                         lines.add(line);\r
 502                 }\r
 503                 return lines;\r
 504         }\r
 505 \r
 506         /**\r
 507          * Remove <i>trailing</i> whitespace. c.f. String#trim() which removes\r
 508          * leading and trailing whitespace.\r
 509          * \r
 510          * @param sb\r
 511          */\r
 512         private static void trimEnd(StringBuilder sb) {\r
 513                 while (true) {\r
 514                         // Get the last character\r
 515                         int i = sb.length() - 1;\r
 516                         if (i == -1)\r
 517                                 return; // Quit if sb is empty\r
 518                         char c = sb.charAt(i);\r
 519                         if (!Character.isWhitespace(c))\r
 520                                 return; // Finish?\r
 521                         sb.deleteCharAt(i); // Remove and continue\r
 522                 }\r
 523         }\r
 524 \r
 525         /**\r
 526          * Returns true if the string is just whitespace, or empty, or null.\r
 527          * \r
 528          * @param s\r
 529          */\r
 530         public static final boolean whitespace(final String s) {\r
 531                 if (s == null) {\r
 532                         return true;\r
 533                 }\r
 534                 for (int i = 0; i < s.length(); i++) {\r
 535                         final char c = s.charAt(i);\r
 536                         if (!Character.isWhitespace(c)) {\r
 537                                 return false;\r
 538                         }\r
 539                 }\r
 540                 return true;\r
 541         }\r
 542 \r
 543         /**\r
 544          * @param text\r
 545          * @return the number of words in text. Uses a crude whitespace\r
 546          * measure.\r
 547          */\r
 548         public static int wordCount(String text) {\r
 549                 String[] bits = text.split("\\W+");\r
 550                 int wc = 0;\r
 551                 for (String string : bits) {\r
 552                         if (!whitespace(string)) wc++;\r
 553                 }\r
 554                 return wc;\r
 555         }\r
 556 \r
 557 }\r