bundles/winterwell.markdown/src/winterwell/markdown/pagemodel/MarkdownPage.java

   1 /**
   2  * Copyright winterwell Mathematics Ltd.
   3  * @author Daniel Winterstein
   4  * 11 Jan 2007
   5  */
   6 package winterwell.markdown.pagemodel;
   7
   8 import java.io.File;
   9 import java.util.ArrayList;
  10 import java.util.Collections;
  11 import java.util.HashMap;
  12 import java.util.List;
  13 import java.util.Map;
  14 import java.util.regex.Matcher;
  15 import java.util.regex.Pattern;
  16
  17 import org.eclipse.jface.preference.IPreferenceStore;
  18
  19 import winterwell.markdown.Activator;
  20 import winterwell.markdown.StringMethods;
  21 import winterwell.markdown.preferences.MarkdownPreferencePage;
  22 import winterwell.utils.FailureException;
  23 import winterwell.utils.Process;
  24 import winterwell.utils.StrUtils;
  25 import winterwell.utils.Utils;
  26 import winterwell.utils.io.FileUtils;
  27
  28 import com.petebevin.markdown.MarkdownProcessor;
  29
  30 /**
  31  * Understands Markdown syntax.
  32  *
  33  * @author Daniel Winterstein
  34  */
  35 public class MarkdownPage {
  36
  37         /**
  38          * Strip leading and trailing #s and whitespace
  39          *
  40          * @param line
  41          * @return cleaned up line
  42          */
  43         private String cleanHeader(String line) {
  44                 for (int j = 0; j < line.length(); j++) {
  45                         char c = line.charAt(j);
  46                         if (c != '#' && !Character.isWhitespace(c)) {
  47                                 line = line.substring(j);
  48                                 break;
  49                         }
  50                 }
  51                 for (int j = line.length() - 1; j > 0; j--) {
  52                         char c = line.charAt(j);
  53                         if (c != '#' && !Character.isWhitespace(c)) {
  54                                 line = line.substring(0, j + 1);
  55                                 break;
  56                         }
  57                 }
  58                 return line;
  59         }
  60
  61         /**
  62          * Represents information about a section header. E.g. ## Misc Warblings
  63          *
  64          * @author daniel
  65          */
  66         public class Header {
  67                 /**
  68                  * 1 = top-level (i.e. #), 2= 2nd-level (i.e. ##), etc.
  69                  */
  70                 final int level;
  71                 /**
  72                  * The text of the Header
  73                  */
  74                 final String heading;
  75                 /**
  76                  * Sub-sections, if any
  77                  */
  78                 final List<Header> subHeaders = new ArrayList<Header>();
  79                 /**
  80                  * The line on which this header occurs.
  81                  */
  82                 final int lineNumber;
  83
  84                 public int getLineNumber() {
  85                         return lineNumber;
  86                 }
  87
  88                 /**
  89                  *
  90                  * @return the next section (at this depth if possible), null if none
  91                  */
  92                 public Header getNext() {
  93                         if (parent == null) {
  94                                 int ti = level1Headers.indexOf(this);
  95                                 if (ti == -1 || ti == level1Headers.size() - 1)
  96                                         return null;
  97                                 return level1Headers.get(ti + 1);
  98                         }
  99                         int i = parent.subHeaders.indexOf(this);
 100                         assert i != -1 : this;
 101                         if (i == parent.subHeaders.size() - 1)
 102                                 return parent.getNext();
 103                         return parent.subHeaders.get(i + 1);
 104                 }
 105                 /**
 106                  *
 107                  * @return the next section (at this depth if possible), null if none
 108                  */
 109                 public Header getPrevious() {
 110                         if (parent == null) {
 111                                 int ti = level1Headers.indexOf(this);
 112                                 if (ti == -1 || ti == 0)
 113                                         return null;
 114                                 return level1Headers.get(ti - 1);
 115                         }
 116                         int i = parent.subHeaders.indexOf(this);
 117                         assert i != -1 : this;
 118                         if (i == 0)
 119                                 return parent.getPrevious();
 120                         return parent.subHeaders.get(i - 1);
 121                 }
 122
 123
 124                 /**
 125                  * The parent section. Can be null.
 126                  */
 127                 private Header parent;
 128
 129                 /**
 130                  * Create a marker for a section Header
 131                  *
 132                  * @param level
 133                  *            1 = top-level (i.e. #), 2= 2nd-level (i.e. ##), etc.
 134                  * @param lineNumber
 135                  *            The line on which this header occurs
 136                  * @param heading
 137                  *            The text of the Header, trimmed of #s
 138                  * @param currentHeader
 139                  *            The previous Header. This is used to find the parent
 140                  *            section if there is one. Can be null.
 141                  */
 142                 Header(int level, int lineNumber, String heading, Header currentHeader) {
 143                         this.lineNumber = lineNumber;
 144                         this.level = level;
 145                         this.heading = cleanHeader(heading);
 146                         // Heading Tree
 147                         setParent(currentHeader);
 148                 }
 149
 150                 private void setParent(Header currentHeader) {
 151                         if (currentHeader == null) {
 152                                 parent = null;
 153                                 return;
 154                         }
 155                         if (currentHeader.level < level) {
 156                                 parent = currentHeader;
 157                                 parent.subHeaders.add(this);
 158                                 return;
 159                         }
 160                         setParent(currentHeader.parent);
 161                 }
 162
 163                 public Header getParent() {
 164                         return parent;
 165                 }
 166
 167                 /**
 168                  * Sub-sections. May be zero-length, never null.
 169                  */
 170                 public List<Header> getSubHeaders() {
 171                         return subHeaders;
 172                 }
 173
 174                 @Override
 175                 public String toString() {
 176                         return heading;
 177                 }
 178
 179                 public int getLevel() {
 180                         return level;
 181                 }
 182         }
 183
 184         /**
 185          * The raw text, broken up into individual lines.
 186          */
 187         private List<String> lines;
 188
 189         /**
 190          * The raw text, broken up into individual lines.
 191          */
 192         public List<String> getText() {
 193                 return Collections.unmodifiableList(lines);
 194         }
 195
 196         public enum KLineType {
 197                 NORMAL, H1, H2, H3, H4, H5, H6, BLANK,
 198                 // TODO LIST, BLOCKQUOTE,
 199                 /** A line marking Markdown info about the preceding line, e.g. ====== */
 200                 MARKER,
 201                 /** A line containing meta-data, e.g. title: My Page */
 202                 META
 203         }
 204
 205         /**
 206          * Information about each line.
 207          */
 208         private List<KLineType> lineTypes;
 209         private Map<Integer,Object> pageObjects = new HashMap<Integer, Object>();
 210
 211         // TODO meta-data, footnotes, tables, link & image attributes
 212         private static Pattern multiMarkdownTag = Pattern.compile("^([\\w].*):(.*)");
 213         private Map<String, String> multiMarkdownTags = new HashMap<String, String>();
 214
 215         // Regular expression for Github support
 216         private static Pattern githubURLDetection = Pattern.compile("((https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|])");
 217
 218         /**
 219          * The top-level headers. FIXME handle documents which have a 2nd level
 220          * header before any 1st level ones
 221          */
 222         private final List<Header> level1Headers = new ArrayList<Header>();
 223         private final IPreferenceStore pStore;
 224
 225         /**
 226          * Create a page.
 227          *
 228          * @param text
 229          */
 230         public MarkdownPage(String text) {
 231                 pStore = Activator.getDefault().getPreferenceStore();
 232                 setText(text);
 233         }
 234
 235         /**
 236          * Reset the text for this page.
 237          *
 238          * @param text
 239          */
 240         private void setText(String text) {
 241                 // Get lines
 242                 lines = StringMethods.splitLines(text);
 243                 // Clean out old
 244                 level1Headers.clear();
 245                 lineTypes = new ArrayList<KLineType>(lines.size());
 246                 pageObjects.clear();
 247                 // Dummy level-1 header in case there are none
 248                 Header dummyTopHeader = new Header(1, 0, "", null);
 249                 level1Headers.add(dummyTopHeader);
 250                 Header currentHeader = dummyTopHeader;
 251                 // Identify line types
 252                 int lineNum = 0;
 253
 254                 // Check if we should support the Multi-Markdown Metadata
 255                 boolean multiMarkdownMetadataSupport =
 256                                 pStore.getBoolean(MarkdownPreferencePage.PREF_MULTIMARKDOWN_METADATA);
 257
 258                 // Multi-markdown header
 259                 if (multiMarkdownMetadataSupport) {
 260                         // The key is the text before the colon, and the data is the text
 261                         // after the
 262                         // colon. In the above example, notice that there are two lines of
 263                         // information
 264                         // for the Author key. If you end a line with “space-space-newline”,
 265                         // the newline
 266                         // will be included when converted to other formats.
 267                         //
 268                         // There must not be any whitespace above the metadata, and the
 269                         // metadata block
 270                         // ends with the first whitespace only line. The metadata is
 271                         // stripped from the
 272                         // document before it is passed on to the syntax parser.
 273
 274                         //
 275                         // Check if the Metdatas are valid
 276                         //
 277                         boolean validMetadata = true;
 278                         for (lineNum = 0; lineNum < lines.size(); lineNum++) {
 279                                 String line = lines.get(lineNum);
 280                                 if (Utils.isBlank(line)) {
 281                                         break;
 282                                 }
 283                                 Matcher m = multiMarkdownTag.matcher(line);
 284                                 if (!m.find()) {
 285                                         if (lineNum == 0) {
 286                                                 // No MultiMarkdown metadata
 287                                                 validMetadata = false;
 288                                                 break;
 289                                         } else if (!line.matches("^\\s.*\n")) {
 290                                                 // The next line was not intended (ie. it does not start
 291                                                 // with a whitespace)
 292                                                 validMetadata = false;
 293                                                 break;
 294                                         }
 295                                 }
 296                         }
 297
 298                         // Valid Metadatas have been found. We need to retrieve these keys/values.
 299                         if (validMetadata) {
 300                                 String data = "";
 301                                 String tag = "";
 302                                 for (lineNum = 0; lineNum < lines.size(); lineNum++) {
 303                                         String line = lines.get(lineNum);
 304                                         if (Utils.isBlank(line)) {
 305                                                 break;
 306                                         }
 307                                         Matcher m = multiMarkdownTag.matcher(line);
 308                                         if (!m.find()) {
 309                                                 if (lineNum == 0) {
 310                                                         break;
 311                                                 }
 312                                                 // Multi-line tag
 313                                                 lineTypes.add(KLineType.META);
 314                                                 data += StrUtils.LINEEND + line.trim();
 315                                                 multiMarkdownTags.put(tag, data);
 316                                         } else {
 317                                                 lineTypes.add(KLineType.META);
 318                                                 tag = m.group(0);
 319                                                 data = m.group(1).trim();
 320                                                 if (m.group(1).endsWith(line))
 321                                                         multiMarkdownTags.put(tag, data);
 322                                         }
 323                                 }
 324                         } else {
 325                                 lineNum = 0;
 326                         }
 327                 }
 328                 for (; lineNum < lines.size(); lineNum++) {
 329                         String line = lines.get(lineNum);
 330                         // Headings
 331                         int h = numHash(line);
 332                         String hLine = line;
 333                         int hLineNum = lineNum;
 334                         int underline = -1;
 335                         if (lineNum != 0) {
 336                                 underline = just(line, '=') ? 1 : just(line, '-') ? 2 : -1;
 337                         }
 338                         if (underline != -1) {
 339                                 h = underline;
 340                                 hLineNum = lineNum - 1;
 341                                 hLine = lines.get(lineNum - 1);
 342                                 lineTypes.set(hLineNum, KLineType.values()[h]);
 343                                 lineTypes.add(KLineType.MARKER);
 344                         }
 345                         // Create a Header object
 346                         if (h > 0) {
 347                                 if (underline == -1)
 348                                         lineTypes.add(KLineType.values()[h]);
 349                                 Header header = new Header(h, hLineNum, hLine, currentHeader);
 350                                 if (h == 1) {
 351                                         level1Headers.add(header);
 352                                 }
 353                                 pageObjects.put(hLineNum, header);
 354                                 currentHeader = header;
 355                                 continue;
 356                         }
 357                         // TODO List
 358                         // TODO Block quote
 359                         // Blank line
 360                         if (Utils.isBlank(line)) {
 361                                 lineTypes.add(KLineType.BLANK);
 362                                 continue;
 363                         }
 364                         // Normal
 365                         lineTypes.add(KLineType.NORMAL);
 366                 } // end line-loop
 367                 // Remove dummy header?
 368                 if (dummyTopHeader.getSubHeaders().size() == 0) {
 369                         level1Headers.remove(dummyTopHeader);
 370                 }
 371
 372                 boolean githubSyntaxSupport =
 373                                 pStore.getBoolean(MarkdownPreferencePage.PREF_GITHUB_SYNTAX);
 374                 if (githubSyntaxSupport) {
 375                         /*
 376                          * Support Code block
 377                          */
 378                         boolean inCodeBlock = false;
 379                         for (lineNum = 0; lineNum < lines.size(); lineNum++) {
 380                                 String line = lines.get(lineNum);
 381                                 // Found the start or end of a code block
 382                                 if (line.matches("^```.*\n")) {
 383                                         // We reverse the boolean value
 384                                         inCodeBlock = !inCodeBlock;
 385
 386                                         // We force the line to be blank. But we mark it as normal
 387                                         // to prevent to be stripped
 388                                         lines.set(lineNum, "\n");
 389                                         lineTypes.set(lineNum, KLineType.NORMAL);
 390                                         continue;
 391                                 }
 392                                 if (inCodeBlock) {
 393                                         lines.set(lineNum, "    " + line);
 394                                 }
 395                         }
 396
 397                         /*
 398                          * Support for URL Detection
 399                          * We search for links that are not captured by Markdown syntax
 400                          */
 401                         for (lineNum = 0; lineNum < lines.size(); lineNum++) {
 402                                 String line = lines.get(lineNum);
 403                                 // When a link has been replaced we need to scan again the string
 404                                 // as the offsets have changed (we add '<' and '>' to the link to
 405                                 // be interpreted by the markdown library)
 406                                 boolean urlReplaced;
 407
 408                                 do {
 409                                         urlReplaced = false;
 410                                         Matcher m = githubURLDetection.matcher(line);
 411                                         while (m.find()) {
 412                                                 // Ignore the URL following the format <link>
 413                                                 if ((m.start() - 1 >= 0) && (m.end() < line.length()) &&
 414                                                         (line.charAt(m.start() - 1) == '<') &&
 415                                                         (line.charAt(m.end()) == '>'))
 416                                                 {
 417                                                         continue;
 418                                                 }
 419
 420                                                 // Ignore the URL following the format [description](link)
 421                                                 if ((m.start() - 2 >= 0) && (m.end() < line.length()) &&
 422                                                         (line.charAt(m.start() - 2) == ']') &&
 423                                                         (line.charAt(m.start() - 1) == '(') &&
 424                                                         (line.charAt(m.end()) == ')'))
 425                                                 {
 426                                                         continue;
 427                                                 }
 428
 429                                                 // Ignore the URL following the format [description](link "title")
 430                                                 if ((m.start() - 2 >= 0) && (m.end() + 1 < line.length()) &&
 431                                                         (line.charAt(m.start() - 2) == ']') &&
 432                                                         (line.charAt(m.start() - 1) == '(') &&
 433                                                         (line.charAt(m.end()) == ' ') &&
 434                                                         (line.charAt(m.end() + 1) == '"'))
 435                                                 {
 436                                                         continue;
 437                                                 }
 438
 439                                                 if (m.start() - 1 >= 0) {
 440                                                         // Case when the link is at the beginning of the string
 441                                                         line = line.substring(0, m.start()) + "<" + m.group(0) + ">" + line.substring(m.end());
 442                                                 } else {
 443                                                         line = "<" + m.group(0) + ">" + line.substring(m.end());
 444                                                 }
 445
 446                                                 // We replaced the string in the array
 447                                                 lines.set(lineNum, line);
 448                                                 urlReplaced = true;
 449                                                 break;
 450                                         }
 451                                 } while (urlReplaced);
 452                         }
 453                 }
 454         }
 455
 456         /**
 457          * @param line
 458          * @param c
 459          * @return true if line is just cs (and whitespace at the start/end)
 460          */
 461         boolean just(String line, char c) {
 462                 return line.matches("\\s*"+c+"+\\s*");
 463         }
 464
 465         /**
 466          * @param line
 467          * @return The number of # symbols prepending the line.
 468          */
 469         private int numHash(String line) {
 470                 for (int i = 0; i < line.length(); i++) {
 471                         if (line.charAt(i) != '#')
 472                                 return i;
 473                 }
 474                 return line.length();
 475         }
 476
 477         /**
 478          *
 479          * @param parent
 480          *            Can be null for top-level
 481          * @return List of sub-headers. Never null. FIXME handle documents which
 482          *         have a 2nd level header before any 1st level ones
 483          */
 484         public List<Header> getHeadings(Header parent) {
 485                 if (parent == null) {
 486                         return Collections.unmodifiableList(level1Headers);
 487                 }
 488                 return Collections.unmodifiableList(parent.subHeaders);
 489         }
 490
 491         // public WebPage getWebPage() {
 492         // WebPage page = new WebPage();
 493         // // Add the lines, one by one
 494         // boolean inParagraph = false;
 495         // for (int i=0; i<lines.size(); i++) {
 496         // String line = lines.get(i);
 497         // KLineType type = lineTypes.get(i);
 498         // switch(type) {
 499         // // Heading?
 500         // case H1: case H2: case H3:
 501         // case H4: case H5: case H6:
 502         // if (inParagraph) page.addText("</p>");
 503         // line = cleanHeader(line);
 504         // page.addText("<"+type+">"+line+"</"+type+">");
 505         // continue;
 506         // case MARKER: // Ignore
 507         // continue;
 508         // // TODO List?
 509         // // TODO Block quote?
 510         // }
 511         // // Paragraph end?
 512         // if (Utils.isBlank(line)) {
 513         // if (inParagraph) page.addText("</p>");
 514         // continue;
 515         // }
 516         // // Paragraph start?
 517         // if (!inParagraph) {
 518         // page.addText("<p>");
 519         // inParagraph = true;
 520         // }
 521         // // Plain text
 522         // page.addText(line);
 523         // }
 524         // return page;
 525         // }
 526
 527         /**
 528          * Get the HTML for this page. Uses the MarkdownJ project.
 529          */
 530         public String html() {
 531                 // Section numbers??
 532                 boolean sectionNumbers = pStore
 533                                 .getBoolean(MarkdownPreferencePage.PREF_SECTION_NUMBERS);
 534                 // Chop out multi-markdown header
 535                 StringBuilder sb = new StringBuilder();
 536                 assert lines.size() == lineTypes.size();
 537                 for (int i = 0, n = lines.size(); i < n; i++) {
 538                         KLineType type = lineTypes.get(i);
 539                         if (type == KLineType.META)
 540                                 continue;
 541                         String line = lines.get(i);
 542                         if (sectionNumbers && isHeader(type) && line.contains("$section")) {
 543                                 // TODO Header section = headers.get(i);
 544                                 // String secNum = section.getSectionNumber();
 545                                 // line.replace("$section", secNum);
 546                         }
 547                         sb.append(line);
 548                 }
 549                 String text = sb.toString();
 550                 // Use external converter?
 551                 final String cmd = pStore
 552                                 .getString(MarkdownPreferencePage.PREF_MARKDOWN_COMMAND);
 553                 if (Utils.isBlank(cmd)
 554                                 || (cmd.startsWith("(") && cmd.contains("MarkdownJ"))) {
 555                         // Use MarkdownJ
 556                         MarkdownProcessor markdown = new MarkdownProcessor();
 557                         // MarkdownJ doesn't convert £s for some reason
 558                         text = text.replace("£", "&pound;");
 559                         String html = markdown.markdown(text);
 560                         return html;
 561                 }
 562                 // Attempt to run external command
 563                 try {
 564                         final File md = File.createTempFile("tmp", ".md");
 565                         FileUtils.write(md, text);
 566                         Process process = new Process(cmd+" "+md.getAbsolutePath());
 567                         process.run();
 568                         int ok = process.waitFor(10000);
 569                         if (ok != 0) throw new FailureException(cmd+" failed:\n"+process.getError());
 570                         String html = process.getOutput();
 571                         FileUtils.delete(md);
 572                         return html;
 573                 } catch (Exception e) {
 574                         throw Utils.runtime(e);
 575                 }
 576         }
 577
 578         /**
 579          * @param type
 580          * @return
 581          */
 582         private boolean isHeader(KLineType type) {
 583                 return type == KLineType.H1 || type == KLineType.H2
 584                                 || type == KLineType.H3 || type == KLineType.H4
 585                                 || type == KLineType.H5 || type == KLineType.H6;
 586         }
 587
 588         /**
 589          * Return the raw text of this page.
 590          */
 591         @Override
 592         public String toString() {
 593                 StringBuilder sb = new StringBuilder();
 594                 for (String line : lines) {
 595                         sb.append(line);
 596                 }
 597                 return sb.toString();
 598         }
 599
 600         /**
 601          * Line type information for the raw text.
 602          *
 603          * @return
 604          */
 605         public List<KLineType> getLineTypes() {
 606                 return Collections.unmodifiableList(lineTypes);
 607         }
 608
 609         /**
 610          * @param line
 611          * @return
 612          */
 613         public Object getPageObject(int line) {
 614                 return pageObjects.get(line);
 615         }
 616
 617 }