+/**\r
+ * Basic String manipulation utilities.\r
+ * (c) Winterwell 2010 and ThinkTank Mathematics 2007\r
+ */\r
+package winterwell.markdown;\r
+\r
+import java.math.BigInteger;\r
+import java.security.MessageDigest;\r
+import java.security.NoSuchAlgorithmException;\r
+import java.util.ArrayList;\r
+import java.util.List;\r
+import java.util.regex.Pattern;\r
+\r
+import winterwell.utils.Mutable;\r
+import winterwell.utils.containers.Pair;\r
+\r
+/**\r
+ * A collection of general-purpose String handling methods.\r
+ * \r
+ * @author daniel.winterstein\r
+ */\r
+public final class StringMethods {\r
+\r
+ /**\r
+ * Removes xml tags, comment blocks and script blocks.\r
+ * \r
+ * @param page\r
+ * @return the page with all xml tags removed.\r
+ */\r
+ public static String stripTags(String page) {\r
+ // This code is rather ugly, but it does the job\r
+ StringBuilder stripped = new StringBuilder(page.length());\r
+ boolean inTag = false;\r
+ // Comment blocks and script blocks are given special treatment\r
+ boolean inComment = false;\r
+ boolean inScript = false;\r
+ // Go through the text\r
+ for (int i = 0; i < page.length(); i++) {\r
+ char c = page.charAt(i);\r
+ // First check whether we are ignoring text\r
+ if (inTag) {\r
+ if (c == '>')\r
+ inTag = false;\r
+ } else if (inComment) {\r
+ if (c == '>' && page.charAt(i - 1) == '-'\r
+ && page.charAt(i - 1) == '-') {\r
+ inComment = false;\r
+ }\r
+ } else if (inScript) {\r
+ if (c == '>' && page.substring(i - 7, i).equals("/script")) {\r
+ inScript = false;\r
+ }\r
+ } else {\r
+ // Check for the start of a tag - looks for '<' followed by any\r
+ // non-whitespace character\r
+ if (c == '<' && !Character.isWhitespace(page.charAt(i + 1))) {\r
+ // Comment, script-block or tag?\r
+ if (page.charAt(i + 1) == '!' && page.charAt(i + 2) == '-'\r
+ && page.charAt(i + 3) == '-') {\r
+ inComment = true;\r
+ } else if (i + 8 < page.length()\r
+ && page.substring(i + 1, i + 7).equals("script")) {\r
+ inScript = true;\r
+ i += 7;\r
+ } else\r
+ inTag = true; // Normal tag by default\r
+ } else {\r
+ // Append all non-tag chars\r
+ stripped.append(c);\r
+ }\r
+ } // end if...\r
+ }\r
+ return stripped.toString();\r
+ }\r
+ \r
    /**
     * The local line-end string, read from the {@code line.separator} system
     * property when this class is initialised. \n on unix, \r\n on windows,
     * \r on (classic) mac.
     */
    public static final String LINEEND = System.getProperty("line.separator");
+\r
+ /**\r
+ * @param s\r
+ * @return A version of s where the first letter is uppercase and all others\r
+ * are lowercase\r
+ */\r
+ public static final String capitalise(final String s) {\r
+ return s.substring(0, 1).toUpperCase() + s.substring(1).toLowerCase();\r
+ }\r
+\r
+ /**\r
+ * Convert all line breaks into the system line break.\r
+ */\r
+ public static final String convertLineBreaks(String text) {\r
+ return convertLineBreaks(text, LINEEND);\r
+ }\r
+\r
+ /**\r
+ * Convert all line breaks into the specified line break.\r
+ */\r
+ public static final String convertLineBreaks(String text, String br) {\r
+ text = text.replaceAll("\r\n", br);\r
+ text = text.replaceAll("\r", br);\r
+ text = text.replaceAll("\n", br);\r
+ return text;\r
+ }\r
+\r
+ /**\r
+ * @param string\r
+ * @param character\r
+ * @return the number of times character appears in the string\r
+ * @author Sam Halliday\r
+ */\r
+ static public int countCharsInString(String string, char character) {\r
+ int count = 0;\r
+ for (char c : string.toCharArray()) {\r
+ if (c == character) {\r
+ count++;\r
+ }\r
+ }\r
+ return count;\r
+ }\r
+\r
+ /**\r
+ * \r
+ * E.g.\r
+ * <code>findEnclosingRegion("text with a [region] inside", 15, '[', ']')</code>\r
+ * is (??,??)\r
+ * \r
+ * @param text\r
+ * @param offset\r
+ * @param start\r
+ * @param end\r
+ * @return the smallest enclosed region (including start and end chars, the\r
+ * 1st number is inclusive, the 2nd exclusive), or null if none. So\r
+ * text.subString(start,end) is the specified region\r
+ */\r
+ public static Pair<Integer> findEnclosingRegion(String text, int offset,\r
+ char startMarker, char endMarker) {\r
+ // Forward\r
+ int end = findEnclosingRegion2(text, offset, endMarker, 1);\r
+ if (end == -1)\r
+ return null;\r
+ end++; // end is exclusive\r
+ // Backward\r
+ int start = findEnclosingRegion2(text, offset, startMarker, -1);\r
+ if (start == -1)\r
+ return null;\r
+ // Sanity\r
+ assert text.substring(start, end).charAt(0) == startMarker;\r
+ assert text.substring(start, end).endsWith("" + endMarker);\r
+ // Done\r
+ return new Pair<Integer>(start, end);\r
+ }\r
+\r
+ private static int findEnclosingRegion2(String text, int offset,\r
+ char endMarker, int direction) {\r
+ while (offset > -1 && offset < text.length()) {\r
+ char c = text.charAt(offset);\r
+ if (c == endMarker)\r
+ return offset;\r
+ offset += direction;\r
+ }\r
+ return -1;\r
+ }\r
+\r
+ /**\r
+ * A convenience wrapper for\r
+ * {@link #findEnclosingRegion(String, int, char, char)} E.g. <code>\r
+ findEnclosingRegion("text with a [region] inside", 15, '[', ']') .equals("[region]");\r
+ </code>\r
+ * \r
+ * @param text\r
+ * @param offset\r
+ * @param start\r
+ * @param end\r
+ * @return the smallest enclosed region (including start and end chars), or\r
+ * null if none.\r
+ */\r
+ public static String findEnclosingText(String text, int offset,\r
+ char startMarker, char endMarker) {\r
+ Pair<Integer> region = findEnclosingRegion(text, offset, startMarker,\r
+ endMarker);\r
+ if (region == null)\r
+ return null;\r
+ String s = text.substring(region.first, region.second);\r
+ return s;\r
+ }\r
+\r
+ /**\r
+ * Format a block of text to use the given line-width. I.e. adjust the line\r
+ * breaks. Also known as <i>hard</i> line-wrapping. Paragraphs are\r
+ * recognised by a line of blank space between them (e.g. two returns).\r
+ * <p>\r
+ * Note: a side-effect of this method is that it converts all line-breaks\r
+ * into the local system's line-breaks. E.g. on Windows, \n will become \r\n\r
+ * \r
+ * @param text\r
+ * The text to format\r
+ * @param lineWidth\r
+ * The number of columns in a line. Typically 78 or 80.\r
+ * @param respectLeadingCharacters\r
+ * Can be null. If set, the specified leading characters will be\r
+ * copied if the line is split. Use with " \t" to keep indented\r
+ * paragraphs properly indented. Use with "> \t" to also handle\r
+ * email-style quoting. Note that respected leading characters\r
+ * receive no special treatment when they are used inside a\r
+ * paragraph.\r
+ * @return A copy of text, formatted to the given line-width.\r
+ * <p>\r
+ * TODO: recognise paragraphs by changes in the respected leading\r
+ * characters\r
+ */\r
+ public static String format(String text, int lineWidth, int tabWidth,\r
+ String respectLeadingCharacters) {\r
+ // Switch to Linux line breaks for easier internal workings\r
+ text = convertLineBreaks(text, "\n");\r
+ // Find paragraphs\r
+ List<String> paras = format2_splitParagraphs(text,\r
+ respectLeadingCharacters);\r
+ // Rebuild text\r
+ StringBuilder sb = new StringBuilder(text.length() + 10);\r
+ for (String p : paras) {\r
+ String fp = format3_oneParagraph(p, lineWidth, tabWidth,\r
+ respectLeadingCharacters);\r
+ sb.append(fp);\r
+ // Paragraphs end with a double line break\r
+ sb.append("\n\n");\r
+ }\r
+ // Pop the last line breaks\r
+ sb.delete(sb.length() - 2, sb.length());\r
+ // Convert line breaks to system ones\r
+ text = convertLineBreaks(sb.toString());\r
+ // Done\r
+ return text;\r
+ }\r
+\r
+ private static List<String> format2_splitParagraphs(String text,\r
+ String respectLeadingCharacters) {\r
+ List<String> paras = new ArrayList<String>();\r
+ Mutable.Int index = new Mutable.Int(0);\r
+ // TODO The characters prefacing this paragraph\r
+ String leadingChars = "";\r
+ while (index.value < text.length()) {\r
+ // One paragraph\r
+ boolean inSpace = false;\r
+ int start = index.value;\r
+ while (index.value < text.length()) {\r
+ char c = text.charAt(index.value);\r
+ index.value++;\r
+ if (!Character.isWhitespace(c)) {\r
+ inSpace = false;\r
+ continue;\r
+ }\r
+ // Line end?\r
+ if (c == '\r' || c == '\n') {\r
+ // // Handle MS Windows 2 character \r\n line breaks\r
+ // if (index.value < text.length()) {\r
+ // char c2 = text.charAt(index.value);\r
+ // if (c=='\r' && c2=='\n') index.value++; // Push on past\r
+ // the 2nd line break char\r
+ // }\r
+ // Double line end - indicating a paragraph break\r
+ if (inSpace)\r
+ break;\r
+ inSpace = true;\r
+ }\r
+ // TODO Other paragraph markers, spotted by a change in\r
+ // leadingChars\r
+ }\r
+ String p = text.substring(start, index.value);\r
+ paras.add(p);\r
+ }\r
+ // Done\r
+ return paras;\r
+ }\r
+\r
+ /**\r
+ * Format a block of text to fit the given line width\r
+ * \r
+ * @param p\r
+ * @param lineWidth\r
+ * @param tabWidth\r
+ * @param respectLeadingCharacters\r
+ * @return\r
+ */\r
+ private static String format3_oneParagraph(String p, int lineWidth,\r
+ int tabWidth, String respectLeadingCharacters) {\r
+ // Collect the reformatted paragraph\r
+ StringBuilder sb = new StringBuilder(p.length() + 10); // Allow for\r
+ // some extra\r
+ // line-breaks\r
+ // Get respected leading chars\r
+ String leadingChars = format4_getLeadingChars(p,\r
+ respectLeadingCharacters);\r
+ // First Line\r
+ sb.append(leadingChars);\r
+ int lineLength = leadingChars.length();\r
+ int index = leadingChars.length();\r
+ // Loop\r
+ while (index < p.length()) {\r
+ // Get the next word\r
+ StringBuilder word = new StringBuilder();\r
+ char c = p.charAt(index);\r
+ index++;\r
+ while (!Character.isWhitespace(c)) {\r
+ word.append(c);\r
+ if (index == p.length())\r
+ break;\r
+ c = p.charAt(index);\r
+ index++;\r
+ }\r
+ // Break the line if the word will not fit\r
+ if (lineLength + word.length() > lineWidth && lineLength != 0) {\r
+ trimEnd(sb);\r
+ sb.append('\n'); // lineEnd(sb);\r
+ // New line\r
+ sb.append(leadingChars);\r
+ lineLength = leadingChars.length();\r
+ }\r
+ // Add word\r
+ sb.append(word);\r
+ lineLength += word.length();\r
+ // Add the whitespace\r
+ if (index != p.length() && lineLength < lineWidth) {\r
+ if (c == '\n') {\r
+ c = ' ';\r
+ }\r
+ sb.append(c);\r
+ lineLength += (c == '\t') ? tabWidth : 1;\r
+ }\r
+ }\r
+ // A final trim\r
+ trimEnd(sb);\r
+ // Done\r
+ return sb.toString();\r
+ }\r
+\r
+ /**\r
+ * \r
+ * @param text\r
+ * @param respectLeadingCharacters\r
+ * Can be null\r
+ * @return The characters at the beginning of text which are respected. E.g.\r
+ * ("> Hello", " \t>") --> "> "\r
+ */\r
+ private static String format4_getLeadingChars(String text,\r
+ String respectLeadingCharacters) {\r
+ if (respectLeadingCharacters == null)\r
+ return "";\r
+ // Line-breaks cannot be respected\r
+ assert respectLeadingCharacters.indexOf('\n') == -1;\r
+ // Look for the first non-respected char\r
+ for (int i = 0; i < text.length(); i++) {\r
+ char c = text.charAt(i);\r
+ if (respectLeadingCharacters.indexOf(c) == -1) {\r
+ // Return the previous chars\r
+ return text.substring(0, i);\r
+ }\r
+ }\r
+ // All chars are respected\r
+ return text;\r
+ }\r
+\r
+ /**\r
+ * Ensure that line ends with the right line-end character(s)\r
+ */\r
+ public static final String lineEnd(String line) {\r
+ // strip possibly inappropriate line-endings\r
+ if (line.endsWith("\n")) {\r
+ line = line.substring(0, line.length() - 1);\r
+ }\r
+ if (line.endsWith("\r\n")) {\r
+ line = line.substring(0, line.length() - 2);\r
+ }\r
+ if (line.endsWith("\r")) {\r
+ line = line.substring(0, line.length() - 1);\r
+ }\r
+ // add in proper line end\r
+ if (!line.endsWith(LINEEND)) {\r
+ line += LINEEND;\r
+ }\r
+ return line;\r
+ }\r
+\r
+ /**\r
+ * Ensure that line ends with the right line-end character(s). This is more\r
+ * efficient than the version for Strings.\r
+ * \r
+ * @param line\r
+ */\r
+ public static final void lineEnd(final StringBuilder line) {\r
+ if (line.length() == 0) {\r
+ line.append(LINEEND);\r
+ return;\r
+ }\r
+ // strip possibly inappropriate line-endings\r
+ final char last = line.charAt(line.length() - 1);\r
+ if (last == '\n') {\r
+ if ((line.length() > 1) && (line.charAt(line.length() - 2) == '\r')) {\r
+ // \r\n\r
+ line.replace(line.length() - 2, line.length(), LINEEND);\r
+ return;\r
+ }\r
+ line.replace(line.length() - 1, line.length(), LINEEND);\r
+ return;\r
+ }\r
+ if (last == '\r') {\r
+ line.replace(line.length() - 1, line.length(), LINEEND);\r
+ return;\r
+ }\r
+ line.append(LINEEND);\r
+ return;\r
+ }\r
+\r
+\r
+ \r
+ /**\r
+ * @param string\r
+ * @return the MD5 sum of the string using the default charset. Null if\r
+ * there was an error in calculating the hash.\r
+ * @author Sam Halliday\r
+ */\r
+ public static String md5Hash(String string) {\r
+ MessageDigest md5 = null;\r
+ try {\r
+ md5 = MessageDigest.getInstance("MD5");\r
+ } catch (NoSuchAlgorithmException e) {\r
+ // ignore this exception, we know MD5 exists\r
+ }\r
+ md5.update(string.getBytes());\r
+ BigInteger hash = new BigInteger(1, md5.digest());\r
+ return hash.toString(16);\r
+ }\r
+\r
+ /**\r
+ * Removes HTML-style tags from a string.\r
+ * \r
+ * @param s\r
+ * a String from which to remove tags\r
+ * @return a string with all instances of <.*> removed.\r
+ */\r
+ public static String removeTags(String s) {\r
+ StringBuffer sb = new StringBuffer();\r
+ boolean inTag = false;\r
+ for (int i = 0; i < s.length(); i++) {\r
+ char c = s.charAt(i);\r
+ if (c == '<')\r
+ inTag = true;\r
+ if (!inTag)\r
+ sb.append(c);\r
+ if (c == '>')\r
+ inTag = false;\r
+ }\r
+ return sb.toString();\r
+ }\r
+\r
+ /**\r
+ * Repeat a character.\r
+ * \r
+ * @param c\r
+ * @param i\r
+ * @return A String consisting of i x c.\r
+ * @example assert repeat('-', 5).equals("-----");\r
+ */\r
+ public static String repeat(Character c, int i) {\r
+ StringBuilder dashes = new StringBuilder(i);\r
+ for (int j = 0; j < i; j++)\r
+ dashes.append(c);\r
+ return dashes.toString();\r
+ }\r
+\r
+ /**\r
+ * Split a piece of text into separate lines. The line breaks are left at\r
+ * the end of each line.\r
+ * \r
+ * @param text\r
+ * @return The individual lines in the text.\r
+ */\r
+ public static List<String> splitLines(String text) {\r
+ List<String> lines = new ArrayList<String>();\r
+ // Search for lines\r
+ int start = 0;\r
+ for (int i = 0; i < text.length(); i++) {\r
+ char c = text.charAt(i);\r
+ if (c == '\r' || c == '\n') {\r
+ // Handle MS Windows 2 character \r\n line breaks\r
+ if (i + 1 < text.length()) {\r
+ char c2 = text.charAt(i + 1);\r
+ if (c == '\r' && c2 == '\n')\r
+ i++;\r
+ }\r
+ // Get the line, with the line break\r
+ String line = text.substring(start, i + 1);\r
+ lines.add(line);\r
+ start = i + 1;\r
+ }\r
+ }\r
+ // Last one\r
+ if (start != text.length()) {\r
+ String line = text.substring(start);\r
+ lines.add(line);\r
+ }\r
+ return lines;\r
+ }\r
+\r
+ /**\r
+ * Remove <i>trailing</i> whitespace. c.f. String#trim() which removes\r
+ * leading and trailing whitespace.\r
+ * \r
+ * @param sb\r
+ */\r
+ private static void trimEnd(StringBuilder sb) {\r
+ while (true) {\r
+ // Get the last character\r
+ int i = sb.length() - 1;\r
+ if (i == -1)\r
+ return; // Quit if sb is empty\r
+ char c = sb.charAt(i);\r
+ if (!Character.isWhitespace(c))\r
+ return; // Finish?\r
+ sb.deleteCharAt(i); // Remove and continue\r
+ }\r
+ }\r
+\r
+ /**\r
+ * Returns true if the string is just whitespace, or empty, or null.\r
+ * \r
+ * @param s\r
+ */\r
+ public static final boolean whitespace(final String s) {\r
+ if (s == null) {\r
+ return true;\r
+ }\r
+ for (int i = 0; i < s.length(); i++) {\r
+ final char c = s.charAt(i);\r
+ if (!Character.isWhitespace(c)) {\r
+ return false;\r
+ }\r
+ }\r
+ return true;\r
+ }\r
+\r
+ /**\r
+ * @param text\r
+ * @return the number of words in text. Uses a crude whitespace\r
+ * measure.\r
+ */\r
+ public static int wordCount(String text) {\r
+ String[] bits = text.split("\\W+");\r
+ int wc = 0;\r
+ for (String string : bits) {\r
+ if (!whitespace(string)) wc++;\r
+ }\r
+ return wc;\r
+ }\r
+\r
+}\r