2 * Copyright winterwell Mathematics Ltd.
\r
3 * @author Daniel Winterstein
\r
6 package winterwell.markdown.pagemodel;
\r
9 import java.util.ArrayList;
\r
10 import java.util.Collections;
\r
11 import java.util.HashMap;
\r
12 import java.util.List;
\r
13 import java.util.Map;
\r
14 import java.util.regex.Matcher;
\r
15 import java.util.regex.Pattern;
\r
17 import org.eclipse.jface.preference.IPreferenceStore;
\r
19 import winterwell.markdown.Activator;
\r
20 import winterwell.markdown.StringMethods;
\r
21 import winterwell.markdown.preferences.MarkdownPreferencePage;
\r
22 import winterwell.utils.FailureException;
\r
23 import winterwell.utils.Process;
\r
24 import winterwell.utils.StrUtils;
\r
25 import winterwell.utils.Utils;
\r
26 import winterwell.utils.io.FileUtils;
\r
28 import com.petebevin.markdown.MarkdownProcessor;
\r
31 * Understands Markdown syntax.
\r
33 * @author Daniel Winterstein
\r
35 public class MarkdownPage {
\r
38 * Strip leading and trailing #s and whitespace
\r
41 * @return cleaned up line
\r
43 private String cleanHeader(String line) {
\r
44 for (int j = 0; j < line.length(); j++) {
\r
45 char c = line.charAt(j);
\r
46 if (c != '#' && !Character.isWhitespace(c)) {
\r
47 line = line.substring(j);
\r
51 for (int j = line.length() - 1; j > 0; j--) {
\r
52 char c = line.charAt(j);
\r
53 if (c != '#' && !Character.isWhitespace(c)) {
\r
54 line = line.substring(0, j + 1);
\r
62 * Represents information about a section header. E.g. ## Misc Warblings
\r
66 public class Header {
\r
68 * 1 = top-level (i.e. #), 2= 2nd-level (i.e. ##), etc.
\r
72 * The text of the Header
\r
74 final String heading;
\r
76 * Sub-sections, if any
\r
78 final List<Header> subHeaders = new ArrayList<Header>();
\r
80 * The line on which this header occurs.
\r
82 final int lineNumber;
\r
84 public int getLineNumber() {
\r
90 * @return the next section (at this depth if possible), null if none
\r
92 public Header getNext() {
\r
93 if (parent == null) {
\r
94 int ti = level1Headers.indexOf(this);
\r
95 if (ti == -1 || ti == level1Headers.size() - 1)
\r
97 return level1Headers.get(ti + 1);
\r
99 int i = parent.subHeaders.indexOf(this);
\r
100 assert i != -1 : this;
\r
101 if (i == parent.subHeaders.size() - 1)
\r
102 return parent.getNext();
\r
103 return parent.subHeaders.get(i + 1);
\r
107 * @return the previous section (at this depth if possible), null if none
\r
109 public Header getPrevious() {
\r
110 if (parent == null) {
\r
111 int ti = level1Headers.indexOf(this);
\r
112 if (ti == -1 || ti == 0)
\r
114 return level1Headers.get(ti - 1);
\r
116 int i = parent.subHeaders.indexOf(this);
\r
117 assert i != -1 : this;
\r
119 return parent.getPrevious();
\r
120 return parent.subHeaders.get(i - 1);
\r
125 * The parent section. Can be null.
\r
127 private Header parent;
\r
130 * Create a marker for a section Header
\r
133 * 1 = top-level (i.e. #), 2= 2nd-level (i.e. ##), etc.
\r
134 * @param lineNumber
\r
135 * The line on which this header occurs
\r
137 * The text of the Header, trimmed of #s
\r
138 * @param currentHeader
\r
139 * The previous Header. This is used to find the parent
\r
140 * section if there is one. Can be null.
\r
142 Header(int level, int lineNumber, String heading, Header currentHeader) {
\r
143 this.lineNumber = lineNumber;
\r
144 this.level = level;
\r
145 this.heading = cleanHeader(heading);
\r
147 setParent(currentHeader);
\r
150 private void setParent(Header currentHeader) {
\r
151 if (currentHeader == null) {
\r
155 if (currentHeader.level < level) {
\r
156 parent = currentHeader;
\r
157 parent.subHeaders.add(this);
\r
160 setParent(currentHeader.parent);
\r
163 public Header getParent() {
\r
168 * Sub-sections. May be zero-length, never null.
\r
170 public List<Header> getSubHeaders() {
\r
175 public String toString() {
\r
179 public int getLevel() {
\r
185 * The raw text, broken up into individual lines.
\r
187 private List<String> lines;
\r
190 * The raw text, broken up into individual lines.
\r
192 public List<String> getText() {
\r
193 return Collections.unmodifiableList(lines);
\r
196 public enum KLineType {
\r
197 NORMAL, H1, H2, H3, H4, H5, H6, BLANK,
\r
198 // TODO LIST, BLOCKQUOTE,
\r
199 /** A line marking Markdown info about the preceding line, e.g. ====== */
\r
201 /** A line containing meta-data, e.g. title: My Page */
\r
206 * Information about each line.
\r
208 private List<KLineType> lineTypes;
\r
209 private Map<Integer,Object> pageObjects = new HashMap<Integer, Object>();
\r
211 // TODO meta-data, footnotes, tables, link & image attributes
\r
212 private static Pattern multiMarkdownTag = Pattern.compile("^([\\w].*):(.*)");
\r
213 private Map<String, String> multiMarkdownTags = new HashMap<String, String>();
\r
215 // Regular expression for Github support
\r
216 private static Pattern githubURLDetection = Pattern.compile("((https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|])");
\r
219 * The top-level headers. FIXME handle documents which have a 2nd level
\r
220 * header before any 1st level ones
\r
222 private final List<Header> level1Headers = new ArrayList<Header>();
\r
223 private final IPreferenceStore pStore;
\r
230 public MarkdownPage(String text) {
\r
231 pStore = Activator.getDefault().getPreferenceStore();
\r
236 * Reset the text for this page.
\r
240 private void setText(String text) {
\r
242 lines = StringMethods.splitLines(text);
\r
244 level1Headers.clear();
\r
245 lineTypes = new ArrayList<KLineType>(lines.size());
\r
246 pageObjects.clear();
\r
247 // Dummy level-1 header in case there are none
\r
248 Header dummyTopHeader = new Header(1, 0, "", null);
\r
249 level1Headers.add(dummyTopHeader);
\r
250 Header currentHeader = dummyTopHeader;
\r
251 // Identify line types
\r
254 // Check if we should support the Multi-Markdown Metadata
\r
255 boolean multiMarkdownMetadataSupport =
\r
256 pStore.getBoolean(MarkdownPreferencePage.PREF_MULTIMARKDOWN_METADATA);
\r
258 // Multi-markdown header
\r
259 if (multiMarkdownMetadataSupport) {
\r
260 // The key is the text before the colon, and the data is the text
\r
262 // colon. In the above example, notice that there are two lines of
\r
264 // for the Author key. If you end a line with “space-space-newline”,
\r
266 // will be included when converted to other formats.
\r
268 // There must not be any whitespace above the metadata, and the
\r
270 // ends with the first whitespace only line. The metadata is
\r
271 // stripped from the
\r
272 // document before it is passed on to the syntax parser.
\r
275 // Check if the metadata is valid
\r
277 boolean validMetadata = true;
\r
278 for (lineNum = 0; lineNum < lines.size(); lineNum++) {
\r
279 String line = lines.get(lineNum);
\r
280 if (Utils.isBlank(line)) {
\r
283 Matcher m = multiMarkdownTag.matcher(line);
\r
285 if (lineNum == 0) {
\r
286 // No MultiMarkdown metadata
\r
287 validMetadata = false;
\r
289 } else if (!line.matches("^\\s.*\n")) {
\r
290 // The next line was not indented (i.e. it does not start
\r
291 // with a whitespace)
\r
292 validMetadata = false;
\r
298 // Valid metadata has been found. We need to retrieve its keys/values.
\r
299 if (validMetadata) {
\r
302 for (lineNum = 0; lineNum < lines.size(); lineNum++) {
\r
303 String line = lines.get(lineNum);
\r
304 if (Utils.isBlank(line)) {
\r
307 Matcher m = multiMarkdownTag.matcher(line);
\r
309 if (lineNum == 0) {
\r
313 lineTypes.add(KLineType.META);
\r
314 data += StrUtils.LINEEND + line.trim();
\r
315 multiMarkdownTags.put(tag, data);
\r
317 lineTypes.add(KLineType.META);
\r
319 data = m.group(1).trim();
\r
320 if (m.group(1).endsWith(line))
\r
321 multiMarkdownTags.put(tag, data);
\r
328 for (; lineNum < lines.size(); lineNum++) {
\r
329 String line = lines.get(lineNum);
\r
331 int h = numHash(line);
\r
332 String hLine = line;
\r
333 int hLineNum = lineNum;
\r
334 int underline = -1;
\r
335 if (lineNum != 0) {
\r
336 underline = just(line, '=') ? 1 : just(line, '-') ? 2 : -1;
\r
338 if (underline != -1) {
\r
340 hLineNum = lineNum - 1;
\r
341 hLine = lines.get(lineNum - 1);
\r
342 lineTypes.set(hLineNum, KLineType.values()[h]);
\r
343 lineTypes.add(KLineType.MARKER);
\r
345 // Create a Header object
\r
347 if (underline == -1)
\r
348 lineTypes.add(KLineType.values()[h]);
\r
349 Header header = new Header(h, hLineNum, hLine, currentHeader);
\r
351 level1Headers.add(header);
\r
353 pageObjects.put(hLineNum, header);
\r
354 currentHeader = header;
\r
358 // TODO Block quote
\r
360 if (Utils.isBlank(line)) {
\r
361 lineTypes.add(KLineType.BLANK);
\r
365 lineTypes.add(KLineType.NORMAL);
\r
367 // Remove dummy header?
\r
368 if (dummyTopHeader.getSubHeaders().size() == 0) {
\r
369 level1Headers.remove(dummyTopHeader);
\r
372 boolean githubSyntaxSupport =
\r
373 pStore.getBoolean(MarkdownPreferencePage.PREF_GITHUB_SYNTAX);
\r
374 if (githubSyntaxSupport) {
\r
376 * Support Code block
\r
378 boolean inCodeBlock = false;
\r
379 for (lineNum = 0; lineNum < lines.size(); lineNum++) {
\r
380 String line = lines.get(lineNum);
\r
381 // Found the start or end of a code block
\r
382 if (line.matches("^```.*\n")) {
\r
383 // We reverse the boolean value
\r
384 inCodeBlock = !inCodeBlock;
\r
386 // We force the line to be blank. But we mark it as normal
\r
387 // to prevent it from being stripped
\r
388 lines.set(lineNum, "\n");
\r
389 lineTypes.set(lineNum, KLineType.NORMAL);
\r
393 lines.set(lineNum, " " + line);
\r
398 * Support for URL Detection
\r
399 * We search for links that are not captured by Markdown syntax
\r
401 for (lineNum = 0; lineNum < lines.size(); lineNum++) {
\r
402 String line = lines.get(lineNum);
\r
403 // When a link has been replaced we need to scan the string again
\r
404 // as the offsets have changed (we add '<' and '>' to the link to
\r
405 // be interpreted by the markdown library)
\r
406 boolean urlReplaced;
\r
409 urlReplaced = false;
\r
410 Matcher m = githubURLDetection.matcher(line);
\r
412 // Ignore the URL following the format <link>
\r
413 if ((m.start() - 1 >= 0) && (m.end() < line.length()) &&
\r
414 (line.charAt(m.start() - 1) == '<') &&
\r
415 (line.charAt(m.end()) == '>'))
\r
420 // Ignore the URL following the format [description](link)
\r
421 if ((m.start() - 2 >= 0) && (m.end() < line.length()) &&
\r
422 (line.charAt(m.start() - 2) == ']') &&
\r
423 (line.charAt(m.start() - 1) == '(') &&
\r
424 (line.charAt(m.end()) == ')'))
\r
429 // Ignore the URL following the format [description](link "title")
\r
430 if ((m.start() - 2 >= 0) && (m.end() + 1 < line.length()) &&
\r
431 (line.charAt(m.start() - 2) == ']') &&
\r
432 (line.charAt(m.start() - 1) == '(') &&
\r
433 (line.charAt(m.end()) == ' ') &&
\r
434 (line.charAt(m.end() + 1) == '"'))
\r
439 if (m.start() - 1 >= 0) {
\r
440 // Case when the link is not at the beginning of the string
\r
441 line = line.substring(0, m.start()) + "<" + m.group(0) + ">" + line.substring(m.end());
\r
443 line = "<" + m.group(0) + ">" + line.substring(m.end());
\r
446 // We replaced the string in the array
\r
447 lines.set(lineNum, line);
\r
448 urlReplaced = true;
\r
451 } while (urlReplaced);
\r
459 * @return true if line is just cs (and whitespace at the start/end)
\r
461 boolean just(String line, char c) {
\r
462 return line.matches("\\s*"+c+"+\\s*");
\r
467 * @return The number of # symbols prepending the line.
\r
469 private int numHash(String line) {
\r
470 for (int i = 0; i < line.length(); i++) {
\r
471 if (line.charAt(i) != '#')
\r
474 return line.length();
\r
480 * Can be null for top-level
\r
481 * @return List of sub-headers. Never null. FIXME handle documents which
\r
482 * have a 2nd level header before any 1st level ones
\r
484 public List<Header> getHeadings(Header parent) {
\r
485 if (parent == null) {
\r
486 return Collections.unmodifiableList(level1Headers);
\r
488 return Collections.unmodifiableList(parent.subHeaders);
\r
491 // public WebPage getWebPage() {
\r
492 // WebPage page = new WebPage();
\r
493 // // Add the lines, one by one
\r
494 // boolean inParagraph = false;
\r
495 // for (int i=0; i<lines.size(); i++) {
\r
496 // String line = lines.get(i);
\r
497 // KLineType type = lineTypes.get(i);
\r
500 // case H1: case H2: case H3:
\r
501 // case H4: case H5: case H6:
\r
502 // if (inParagraph) page.addText("</p>");
\r
503 // line = cleanHeader(line);
\r
504 // page.addText("<"+type+">"+line+"</"+type+">");
\r
506 // case MARKER: // Ignore
\r
509 // // TODO Block quote?
\r
511 // // Paragraph end?
\r
512 // if (Utils.isBlank(line)) {
\r
513 // if (inParagraph) page.addText("</p>");
\r
516 // // Paragraph start?
\r
517 // if (!inParagraph) {
\r
518 // page.addText("<p>");
\r
519 // inParagraph = true;
\r
522 // page.addText(line);
\r
528 * Get the HTML for this page. Uses the MarkdownJ project.
\r
530 public String html() {
\r
531 // Section numbers??
\r
532 boolean sectionNumbers = pStore
\r
533 .getBoolean(MarkdownPreferencePage.PREF_SECTION_NUMBERS);
\r
534 // Chop out multi-markdown header
\r
535 StringBuilder sb = new StringBuilder();
\r
536 assert lines.size() == lineTypes.size();
\r
537 for (int i = 0, n = lines.size(); i < n; i++) {
\r
538 KLineType type = lineTypes.get(i);
\r
539 if (type == KLineType.META)
\r
541 String line = lines.get(i);
\r
542 if (sectionNumbers && isHeader(type) && line.contains("$section")) {
\r
543 // TODO Header section = headers.get(i);
\r
544 // String secNum = section.getSectionNumber();
\r
545 // line.replace("$section", secNum);
\r
549 String text = sb.toString();
\r
550 // Use external converter?
\r
551 final String cmd = pStore
\r
552 .getString(MarkdownPreferencePage.PREF_MARKDOWN_COMMAND);
\r
553 if (Utils.isBlank(cmd)
\r
554 || (cmd.startsWith("(") && cmd.contains("MarkdownJ"))) {
\r
556 MarkdownProcessor markdown = new MarkdownProcessor();
\r
557 // MarkdownJ doesn't convert £s for some reason
\r
558 text = text.replace("£", "£");
\r
559 String html = markdown.markdown(text);
\r
562 // Attempt to run external command
\r
564 final File md = File.createTempFile("tmp", ".md");
\r
565 FileUtils.write(md, text);
\r
566 Process process = new Process(cmd+" "+md.getAbsolutePath());
\r
568 int ok = process.waitFor(10000);
\r
569 if (ok != 0) throw new FailureException(cmd+" failed:\n"+process.getError());
\r
570 String html = process.getOutput();
\r
571 FileUtils.delete(md);
\r
573 } catch (Exception e) {
\r
574 throw Utils.runtime(e);
\r
582 private boolean isHeader(KLineType type) {
\r
583 return type == KLineType.H1 || type == KLineType.H2
\r
584 || type == KLineType.H3 || type == KLineType.H4
\r
585 || type == KLineType.H5 || type == KLineType.H6;
\r
589 * Return the raw text of this page.
\r
592 public String toString() {
\r
593 StringBuilder sb = new StringBuilder();
\r
594 for (String line : lines) {
\r
597 return sb.toString();
\r
601 * Line type information for the raw text.
\r
605 public List<KLineType> getLineTypes() {
\r
606 return Collections.unmodifiableList(lineTypes);
\r
613 public Object getPageObject(int line) {
\r
614 return pageObjects.get(line);
\r