X-Git-Url: https://gerrit.simantics.org/r/gitweb?p=simantics%2Fplatform.git;a=blobdiff_plain;f=bundles%2Fwinterwell.markdown%2Fsrc%2Fwinterwell%2Fmarkdown%2FStringMethods.java;fp=bundles%2Fwinterwell.markdown%2Fsrc%2Fwinterwell%2Fmarkdown%2FStringMethods.java;h=208e0ae9936606e37bfb05d1ddde41b802d3e328;hp=0000000000000000000000000000000000000000;hb=9a175feb652b2b7bba7afa540831b9076be3c10e;hpb=0b72d3e4ec886838314ffeba0fa201e32c0aae3e diff --git a/bundles/winterwell.markdown/src/winterwell/markdown/StringMethods.java b/bundles/winterwell.markdown/src/winterwell/markdown/StringMethods.java new file mode 100644 index 000000000..208e0ae99 --- /dev/null +++ b/bundles/winterwell.markdown/src/winterwell/markdown/StringMethods.java @@ -0,0 +1,557 @@ +/** + * Basic String manipulation utilities. + * (c) Winterwell 2010 and ThinkTank Mathematics 2007 + */ +package winterwell.markdown; + +import java.math.BigInteger; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +import winterwell.utils.Mutable; +import winterwell.utils.containers.Pair; + +/** + * A collection of general-purpose String handling methods. + * + * @author daniel.winterstein + */ +public final class StringMethods { + + /** + * Removes xml tags, comment blocks and script blocks. + * + * @param page + * @return the page with all xml tags removed. + */ + public static String stripTags(String page) { + // This code is rather ugly, but it does the job + StringBuilder stripped = new StringBuilder(page.length()); + boolean inTag = false; + // Comment blocks and script blocks are given special treatment + boolean inComment = false; + boolean inScript = false; + // Go through the text + for (int i = 0; i < page.length(); i++) { + char c = page.charAt(i); + // First check whether we are ignoring text + if (inTag) { + if (c == '>') + inTag = false; + } else if (inComment) { + if (c == '>' && page.charAt(i - 1) == '-' + && page.charAt(i - 1) == '-') { + inComment = false; + } + } else if (inScript) { + if (c == '>' && page.substring(i - 7, i).equals("/script")) { + inScript = false; + } + } else { + // Check for the start of a tag - looks for '<' followed by any + // non-whitespace character + if (c == '<' && !Character.isWhitespace(page.charAt(i + 1))) { + // Comment, script-block or tag? + if (page.charAt(i + 1) == '!' && page.charAt(i + 2) == '-' + && page.charAt(i + 3) == '-') { + inComment = true; + } else if (i + 8 < page.length() + && page.substring(i + 1, i + 7).equals("script")) { + inScript = true; + i += 7; + } else + inTag = true; // Normal tag by default + } else { + // Append all non-tag chars + stripped.append(c); + } + } // end if... + } + return stripped.toString(); + } + + /** + * The local line-end string. \n on unix, \r\n on windows, \r on mac. + */ + public static final String LINEEND = System.getProperty("line.separator"); + + /** + * @param s + * @return A version of s where the first letter is uppercase and all others + * are lowercase + */ + public static final String capitalise(final String s) { + return s.substring(0, 1).toUpperCase() + s.substring(1).toLowerCase(); + } + + /** + * Convert all line breaks into the system line break. + */ + public static final String convertLineBreaks(String text) { + return convertLineBreaks(text, LINEEND); + } + + /** + * Convert all line breaks into the specified line break. + */ + public static final String convertLineBreaks(String text, String br) { + text = text.replaceAll("\r\n", br); + text = text.replaceAll("\r", br); + text = text.replaceAll("\n", br); + return text; + } + + /** + * @param string + * @param character + * @return the number of times character appears in the string + * @author Sam Halliday + */ + static public int countCharsInString(String string, char character) { + int count = 0; + for (char c : string.toCharArray()) { + if (c == character) { + count++; + } + } + return count; + } + + /** + * + * E.g. + * findEnclosingRegion("text with a [region] inside", 15, '[', ']') + * is (??,??) + * + * @param text + * @param offset + * @param start + * @param end + * @return the smallest enclosed region (including start and end chars, the + * 1st number is inclusive, the 2nd exclusive), or null if none. So + * text.subString(start,end) is the specified region + */ + public static Pair findEnclosingRegion(String text, int offset, + char startMarker, char endMarker) { + // Forward + int end = findEnclosingRegion2(text, offset, endMarker, 1); + if (end == -1) + return null; + end++; // end is exclusive + // Backward + int start = findEnclosingRegion2(text, offset, startMarker, -1); + if (start == -1) + return null; + // Sanity + assert text.substring(start, end).charAt(0) == startMarker; + assert text.substring(start, end).endsWith("" + endMarker); + // Done + return new Pair(start, end); + } + + private static int findEnclosingRegion2(String text, int offset, + char endMarker, int direction) { + while (offset > -1 && offset < text.length()) { + char c = text.charAt(offset); + if (c == endMarker) + return offset; + offset += direction; + } + return -1; + } + + /** + * A convenience wrapper for + * {@link #findEnclosingRegion(String, int, char, char)} E.g. + findEnclosingRegion("text with a [region] inside", 15, '[', ']') .equals("[region]"); + + * + * @param text + * @param offset + * @param start + * @param end + * @return the smallest enclosed region (including start and end chars), or + * null if none. + */ + public static String findEnclosingText(String text, int offset, + char startMarker, char endMarker) { + Pair region = findEnclosingRegion(text, offset, startMarker, + endMarker); + if (region == null) + return null; + String s = text.substring(region.first, region.second); + return s; + } + + /** + * Format a block of text to use the given line-width. I.e. adjust the line + * breaks. Also known as hard line-wrapping. Paragraphs are + * recognised by a line of blank space between them (e.g. two returns). + *

+ * Note: a side-effect of this method is that it converts all line-breaks + * into the local system's line-breaks. E.g. on Windows, \n will become \r\n + * + * @param text + * The text to format + * @param lineWidth + * The number of columns in a line. Typically 78 or 80. + * @param respectLeadingCharacters + * Can be null. If set, the specified leading characters will be + * copied if the line is split. Use with " \t" to keep indented + * paragraphs properly indented. Use with "> \t" to also handle + * email-style quoting. Note that respected leading characters + * receive no special treatment when they are used inside a + * paragraph. + * @return A copy of text, formatted to the given line-width. + *

+ * TODO: recognise paragraphs by changes in the respected leading + * characters + */ + public static String format(String text, int lineWidth, int tabWidth, + String respectLeadingCharacters) { + // Switch to Linux line breaks for easier internal workings + text = convertLineBreaks(text, "\n"); + // Find paragraphs + List paras = format2_splitParagraphs(text, + respectLeadingCharacters); + // Rebuild text + StringBuilder sb = new StringBuilder(text.length() + 10); + for (String p : paras) { + String fp = format3_oneParagraph(p, lineWidth, tabWidth, + respectLeadingCharacters); + sb.append(fp); + // Paragraphs end with a double line break + sb.append("\n\n"); + } + // Pop the last line breaks + sb.delete(sb.length() - 2, sb.length()); + // Convert line breaks to system ones + text = convertLineBreaks(sb.toString()); + // Done + return text; + } + + private static List format2_splitParagraphs(String text, + String respectLeadingCharacters) { + List paras = new ArrayList(); + Mutable.Int index = new Mutable.Int(0); + // TODO The characters prefacing this paragraph + String leadingChars = ""; + while (index.value < text.length()) { + // One paragraph + boolean inSpace = false; + int start = index.value; + while (index.value < text.length()) { + char c = text.charAt(index.value); + index.value++; + if (!Character.isWhitespace(c)) { + inSpace = false; + continue; + } + // Line end? + if (c == '\r' || c == '\n') { + // // Handle MS Windows 2 character \r\n line breaks + // if (index.value < text.length()) { + // char c2 = text.charAt(index.value); + // if (c=='\r' && c2=='\n') index.value++; // Push on past + // the 2nd line break char + // } + // Double line end - indicating a paragraph break + if (inSpace) + break; + inSpace = true; + } + // TODO Other paragraph markers, spotted by a change in + // leadingChars + } + String p = text.substring(start, index.value); + paras.add(p); + } + // Done + return paras; + } + + /** + * Format a block of text to fit the given line width + * + * @param p + * @param lineWidth + * @param tabWidth + * @param respectLeadingCharacters + * @return + */ + private static String format3_oneParagraph(String p, int lineWidth, + int tabWidth, String respectLeadingCharacters) { + // Collect the reformatted paragraph + StringBuilder sb = new StringBuilder(p.length() + 10); // Allow for + // some extra + // line-breaks + // Get respected leading chars + String leadingChars = format4_getLeadingChars(p, + respectLeadingCharacters); + // First Line + sb.append(leadingChars); + int lineLength = leadingChars.length(); + int index = leadingChars.length(); + // Loop + while (index < p.length()) { + // Get the next word + StringBuilder word = new StringBuilder(); + char c = p.charAt(index); + index++; + while (!Character.isWhitespace(c)) { + word.append(c); + if (index == p.length()) + break; + c = p.charAt(index); + index++; + } + // Break the line if the word will not fit + if (lineLength + word.length() > lineWidth && lineLength != 0) { + trimEnd(sb); + sb.append('\n'); // lineEnd(sb); + // New line + sb.append(leadingChars); + lineLength = leadingChars.length(); + } + // Add word + sb.append(word); + lineLength += word.length(); + // Add the whitespace + if (index != p.length() && lineLength < lineWidth) { + if (c == '\n') { + c = ' '; + } + sb.append(c); + lineLength += (c == '\t') ? tabWidth : 1; + } + } + // A final trim + trimEnd(sb); + // Done + return sb.toString(); + } + + /** + * + * @param text + * @param respectLeadingCharacters + * Can be null + * @return The characters at the beginning of text which are respected. E.g. + * ("> Hello", " \t>") --> "> " + */ + private static String format4_getLeadingChars(String text, + String respectLeadingCharacters) { + if (respectLeadingCharacters == null) + return ""; + // Line-breaks cannot be respected + assert respectLeadingCharacters.indexOf('\n') == -1; + // Look for the first non-respected char + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + if (respectLeadingCharacters.indexOf(c) == -1) { + // Return the previous chars + return text.substring(0, i); + } + } + // All chars are respected + return text; + } + + /** + * Ensure that line ends with the right line-end character(s) + */ + public static final String lineEnd(String line) { + // strip possibly inappropriate line-endings + if (line.endsWith("\n")) { + line = line.substring(0, line.length() - 1); + } + if (line.endsWith("\r\n")) { + line = line.substring(0, line.length() - 2); + } + if (line.endsWith("\r")) { + line = line.substring(0, line.length() - 1); + } + // add in proper line end + if (!line.endsWith(LINEEND)) { + line += LINEEND; + } + return line; + } + + /** + * Ensure that line ends with the right line-end character(s). This is more + * efficient than the version for Strings. + * + * @param line + */ + public static final void lineEnd(final StringBuilder line) { + if (line.length() == 0) { + line.append(LINEEND); + return; + } + // strip possibly inappropriate line-endings + final char last = line.charAt(line.length() - 1); + if (last == '\n') { + if ((line.length() > 1) && (line.charAt(line.length() - 2) == '\r')) { + // \r\n + line.replace(line.length() - 2, line.length(), LINEEND); + return; + } + line.replace(line.length() - 1, line.length(), LINEEND); + return; + } + if (last == '\r') { + line.replace(line.length() - 1, line.length(), LINEEND); + return; + } + line.append(LINEEND); + return; + } + + + + /** + * @param string + * @return the MD5 sum of the string using the default charset. Null if + * there was an error in calculating the hash. + * @author Sam Halliday + */ + public static String md5Hash(String string) { + MessageDigest md5 = null; + try { + md5 = MessageDigest.getInstance("MD5"); + } catch (NoSuchAlgorithmException e) { + // ignore this exception, we know MD5 exists + } + md5.update(string.getBytes()); + BigInteger hash = new BigInteger(1, md5.digest()); + return hash.toString(16); + } + + /** + * Removes HTML-style tags from a string. + * + * @param s + * a String from which to remove tags + * @return a string with all instances of <.*> removed. + */ + public static String removeTags(String s) { + StringBuffer sb = new StringBuffer(); + boolean inTag = false; + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + if (c == '<') + inTag = true; + if (!inTag) + sb.append(c); + if (c == '>') + inTag = false; + } + return sb.toString(); + } + + /** + * Repeat a character. + * + * @param c + * @param i + * @return A String consisting of i x c. + * @example assert repeat('-', 5).equals("-----"); + */ + public static String repeat(Character c, int i) { + StringBuilder dashes = new StringBuilder(i); + for (int j = 0; j < i; j++) + dashes.append(c); + return dashes.toString(); + } + + /** + * Split a piece of text into separate lines. The line breaks are left at + * the end of each line. + * + * @param text + * @return The individual lines in the text. + */ + public static List splitLines(String text) { + List lines = new ArrayList(); + // Search for lines + int start = 0; + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + if (c == '\r' || c == '\n') { + // Handle MS Windows 2 character \r\n line breaks + if (i + 1 < text.length()) { + char c2 = text.charAt(i + 1); + if (c == '\r' && c2 == '\n') + i++; + } + // Get the line, with the line break + String line = text.substring(start, i + 1); + lines.add(line); + start = i + 1; + } + } + // Last one + if (start != text.length()) { + String line = text.substring(start); + lines.add(line); + } + return lines; + } + + /** + * Remove trailing whitespace. c.f. String#trim() which removes + * leading and trailing whitespace. + * + * @param sb + */ + private static void trimEnd(StringBuilder sb) { + while (true) { + // Get the last character + int i = sb.length() - 1; + if (i == -1) + return; // Quit if sb is empty + char c = sb.charAt(i); + if (!Character.isWhitespace(c)) + return; // Finish? + sb.deleteCharAt(i); // Remove and continue + } + } + + /** + * Returns true if the string is just whitespace, or empty, or null. + * + * @param s + */ + public static final boolean whitespace(final String s) { + if (s == null) { + return true; + } + for (int i = 0; i < s.length(); i++) { + final char c = s.charAt(i); + if (!Character.isWhitespace(c)) { + return false; + } + } + return true; + } + + /** + * @param text + * @return the number of words in text. Uses a crude whitespace + * measure. + */ + public static int wordCount(String text) { + String[] bits = text.split("\\W+"); + int wc = 0; + for (String string : bits) { + if (!whitespace(string)) wc++; + } + return wc; + } + +}