block.go (34509B)
1 // 2 // Blackfriday Markdown Processor 3 // Available at http://github.com/russross/blackfriday 4 // 5 // Copyright © 2011 Russ Ross <russ@russross.com>. 6 // Distributed under the Simplified BSD License. 7 // See README.md for details. 8 // 9 10 // 11 // Functions to parse block-level elements. 12 // 13 14 package blackfriday 15 16 import ( 17 "bytes" 18 "html" 19 "regexp" 20 "strings" 21 "unicode" 22 ) 23 24 const ( 25 charEntity = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});" 26 escapable = "[!\"#$%&'()*+,./:;<=>?@[\\\\\\]^_`{|}~-]" 27 ) 28 29 var ( 30 reBackslashOrAmp = regexp.MustCompile("[\\&]") 31 reEntityOrEscapedChar = regexp.MustCompile("(?i)\\\\" + escapable + "|" + charEntity) 32 ) 33 34 // Parse block-level data. 35 // Note: this function and many that it calls assume that 36 // the input buffer ends with a newline. 37 func (p *Markdown) block(data []byte) { 38 // this is called recursively: enforce a maximum depth 39 if p.nesting >= p.maxNesting { 40 return 41 } 42 p.nesting++ 43 44 // parse out one block-level construct at a time 45 for len(data) > 0 { 46 // prefixed heading: 47 // 48 // # Heading 1 49 // ## Heading 2 50 // ... 51 // ###### Heading 6 52 if p.isPrefixHeading(data) { 53 data = data[p.prefixHeading(data):] 54 continue 55 } 56 57 // block of preformatted HTML: 58 // 59 // <div> 60 // ... 61 // </div> 62 if data[0] == '<' { 63 if i := p.html(data, true); i > 0 { 64 data = data[i:] 65 continue 66 } 67 } 68 69 // title block 70 // 71 // % stuff 72 // % more stuff 73 // % even more stuff 74 if p.extensions&Titleblock != 0 { 75 if data[0] == '%' { 76 if i := p.titleBlock(data, true); i > 0 { 77 data = data[i:] 78 continue 79 } 80 } 81 } 82 83 // blank lines. note: returns the # of bytes to skip 84 if i := p.isEmpty(data); i > 0 { 85 data = data[i:] 86 continue 87 } 88 89 // indented code block: 90 // 91 // func max(a, b int) int { 92 // if a > b { 93 // return a 94 // } 95 // return b 96 // } 97 if p.codePrefix(data) > 0 { 98 data = data[p.code(data):] 99 continue 100 } 101 102 // fenced code block: 103 // 104 // ``` go 105 // func fact(n int) int { 106 // if n <= 1 { 107 // return n 108 // } 109 // return n * fact(n-1) 110 // } 111 // ``` 112 if p.extensions&FencedCode != 0 { 113 if i := p.fencedCodeBlock(data, true); i > 0 { 114 data = data[i:] 115 continue 116 } 117 } 118 119 // horizontal rule: 120 // 121 // ------ 122 // or 123 // ****** 124 // or 125 // ______ 126 if p.isHRule(data) { 127 p.addBlock(HorizontalRule, nil) 128 var i int 129 for i = 0; i < len(data) && data[i] != '\n'; i++ { 130 } 131 data = data[i:] 132 continue 133 } 134 135 // block quote: 136 // 137 // > A big quote I found somewhere 138 // > on the web 139 if p.quotePrefix(data) > 0 { 140 data = data[p.quote(data):] 141 continue 142 } 143 144 // table: 145 // 146 // Name | Age | Phone 147 // ------|-----|--------- 148 // Bob | 31 | 555-1234 149 // Alice | 27 | 555-4321 150 if p.extensions&Tables != 0 { 151 if i := p.table(data); i > 0 { 152 data = data[i:] 153 continue 154 } 155 } 156 157 // an itemized/unordered list: 158 // 159 // * Item 1 160 // * Item 2 161 // 162 // also works with + or - 163 if p.uliPrefix(data) > 0 { 164 data = data[p.list(data, 0):] 165 continue 166 } 167 168 // a numbered/ordered list: 169 // 170 // 1. Item 1 171 // 2. Item 2 172 if p.oliPrefix(data) > 0 { 173 data = data[p.list(data, ListTypeOrdered):] 174 continue 175 } 176 177 // definition lists: 178 // 179 // Term 1 180 // : Definition a 181 // : Definition b 182 // 183 // Term 2 184 // : Definition c 185 if p.extensions&DefinitionLists != 0 { 186 if p.dliPrefix(data) > 0 { 187 data = data[p.list(data, ListTypeDefinition):] 188 continue 189 } 190 } 191 192 // anything else must look like a normal paragraph 193 // note: this finds underlined headings, too 194 data = data[p.paragraph(data):] 195 } 196 197 p.nesting-- 198 } 199 200 func (p *Markdown) addBlock(typ NodeType, content []byte) *Node { 201 p.closeUnmatchedBlocks() 202 container := p.addChild(typ, 0) 203 container.content = content 204 return container 205 } 206 207 func (p *Markdown) isPrefixHeading(data []byte) bool { 208 if data[0] != '#' { 209 return false 210 } 211 212 if p.extensions&SpaceHeadings != 0 { 213 level := 0 214 for level < 6 && level < len(data) && data[level] == '#' { 215 level++ 216 } 217 if level == len(data) || data[level] != ' ' { 218 return false 219 } 220 } 221 return true 222 } 223 224 func (p *Markdown) prefixHeading(data []byte) int { 225 level := 0 226 for level < 6 && level < len(data) && data[level] == '#' { 227 level++ 228 } 229 i := skipChar(data, level, ' ') 230 end := skipUntilChar(data, i, '\n') 231 skip := end 232 id := "" 233 if p.extensions&HeadingIDs != 0 { 234 j, k := 0, 0 235 // find start/end of heading id 236 for j = i; j < end-1 && (data[j] != '{' || data[j+1] != '#'); j++ { 237 } 238 for k = j + 1; k < end && data[k] != '}'; k++ { 239 } 240 // extract heading id iff found 241 if j < end && k < end { 242 id = string(data[j+2 : k]) 243 end = j 244 skip = k + 1 245 for end > 0 && data[end-1] == ' ' { 246 end-- 247 } 248 } 249 } 250 for end > 0 && data[end-1] == '#' { 251 if isBackslashEscaped(data, end-1) { 252 break 253 } 254 end-- 255 } 256 for end > 0 && data[end-1] == ' ' { 257 end-- 258 } 259 if end > i { 260 if id == "" && p.extensions&AutoHeadingIDs != 0 { 261 id = SanitizedAnchorName(string(data[i:end])) 262 } 263 block := p.addBlock(Heading, data[i:end]) 264 block.HeadingID = id 265 block.Level = level 266 } 267 return skip 268 } 269 270 func (p *Markdown) isUnderlinedHeading(data []byte) int { 271 // test of level 1 heading 272 if data[0] == '=' { 273 i := skipChar(data, 1, '=') 274 i = skipChar(data, i, ' ') 275 if i < len(data) && data[i] == '\n' { 276 return 1 277 } 278 return 0 279 } 280 281 // test of level 2 heading 282 if data[0] == '-' { 283 i := skipChar(data, 1, '-') 284 i = skipChar(data, i, ' ') 285 if i < len(data) && data[i] == '\n' { 286 return 2 287 } 288 return 0 289 } 290 291 return 0 292 } 293 294 func (p *Markdown) titleBlock(data []byte, doRender bool) int { 295 if data[0] != '%' { 296 return 0 297 } 298 splitData := bytes.Split(data, []byte("\n")) 299 var i int 300 for idx, b := range splitData { 301 if !bytes.HasPrefix(b, []byte("%")) { 302 i = idx // - 1 303 break 304 } 305 } 306 307 data = bytes.Join(splitData[0:i], []byte("\n")) 308 consumed := len(data) 309 data = bytes.TrimPrefix(data, []byte("% ")) 310 data = bytes.Replace(data, []byte("\n% "), []byte("\n"), -1) 311 block := p.addBlock(Heading, data) 312 block.Level = 1 313 block.IsTitleblock = true 314 315 return consumed 316 } 317 318 func (p *Markdown) html(data []byte, doRender bool) int { 319 var i, j int 320 321 // identify the opening tag 322 if data[0] != '<' { 323 return 0 324 } 325 curtag, tagfound := p.htmlFindTag(data[1:]) 326 327 // handle special cases 328 if !tagfound { 329 // check for an HTML comment 330 if size := p.htmlComment(data, doRender); size > 0 { 331 return size 332 } 333 334 // check for an <hr> tag 335 if size := p.htmlHr(data, doRender); size > 0 { 336 return size 337 } 338 339 // no special case recognized 340 return 0 341 } 342 343 // look for an unindented matching closing tag 344 // followed by a blank line 345 found := false 346 /* 347 closetag := []byte("\n</" + curtag + ">") 348 j = len(curtag) + 1 349 for !found { 350 // scan for a closing tag at the beginning of a line 351 if skip := bytes.Index(data[j:], closetag); skip >= 0 { 352 j += skip + len(closetag) 353 } else { 354 break 355 } 356 357 // see if it is the only thing on the line 358 if skip := p.isEmpty(data[j:]); skip > 0 { 359 // see if it is followed by a blank line/eof 360 j += skip 361 if j >= len(data) { 362 found = true 363 i = j 364 } else { 365 if skip := p.isEmpty(data[j:]); skip > 0 { 366 j += skip 367 found = true 368 i = j 369 } 370 } 371 } 372 } 373 */ 374 375 // if not found, try a second pass looking for indented match 376 // but not if tag is "ins" or "del" (following original Markdown.pl) 377 if !found && curtag != "ins" && curtag != "del" { 378 i = 1 379 for i < len(data) { 380 i++ 381 for i < len(data) && !(data[i-1] == '<' && data[i] == '/') { 382 i++ 383 } 384 385 if i+2+len(curtag) >= len(data) { 386 break 387 } 388 389 j = p.htmlFindEnd(curtag, data[i-1:]) 390 391 if j > 0 { 392 i += j - 1 393 found = true 394 break 395 } 396 } 397 } 398 399 if !found { 400 return 0 401 } 402 403 // the end of the block has been found 404 if doRender { 405 // trim newlines 406 end := i 407 for end > 0 && data[end-1] == '\n' { 408 end-- 409 } 410 finalizeHTMLBlock(p.addBlock(HTMLBlock, data[:end])) 411 } 412 413 return i 414 } 415 416 func finalizeHTMLBlock(block *Node) { 417 block.Literal = block.content 418 block.content = nil 419 } 420 421 // HTML comment, lax form 422 func (p *Markdown) htmlComment(data []byte, doRender bool) int { 423 i := p.inlineHTMLComment(data) 424 // needs to end with a blank line 425 if j := p.isEmpty(data[i:]); j > 0 { 426 size := i + j 427 if doRender { 428 // trim trailing newlines 429 end := size 430 for end > 0 && data[end-1] == '\n' { 431 end-- 432 } 433 block := p.addBlock(HTMLBlock, data[:end]) 434 finalizeHTMLBlock(block) 435 } 436 return size 437 } 438 return 0 439 } 440 441 // HR, which is the only self-closing block tag considered 442 func (p *Markdown) htmlHr(data []byte, doRender bool) int { 443 if len(data) < 4 { 444 return 0 445 } 446 if data[0] != '<' || (data[1] != 'h' && data[1] != 'H') || (data[2] != 'r' && data[2] != 'R') { 447 return 0 448 } 449 if data[3] != ' ' && data[3] != '/' && data[3] != '>' { 450 // not an <hr> tag after all; at least not a valid one 451 return 0 452 } 453 i := 3 454 for i < len(data) && data[i] != '>' && data[i] != '\n' { 455 i++ 456 } 457 if i < len(data) && data[i] == '>' { 458 i++ 459 if j := p.isEmpty(data[i:]); j > 0 { 460 size := i + j 461 if doRender { 462 // trim newlines 463 end := size 464 for end > 0 && data[end-1] == '\n' { 465 end-- 466 } 467 finalizeHTMLBlock(p.addBlock(HTMLBlock, data[:end])) 468 } 469 return size 470 } 471 } 472 return 0 473 } 474 475 func (p *Markdown) htmlFindTag(data []byte) (string, bool) { 476 i := 0 477 for i < len(data) && isalnum(data[i]) { 478 i++ 479 } 480 key := string(data[:i]) 481 if _, ok := blockTags[key]; ok { 482 return key, true 483 } 484 return "", false 485 } 486 487 func (p *Markdown) htmlFindEnd(tag string, data []byte) int { 488 // assume data[0] == '<' && data[1] == '/' already tested 489 if tag == "hr" { 490 return 2 491 } 492 // check if tag is a match 493 closetag := []byte("</" + tag + ">") 494 if !bytes.HasPrefix(data, closetag) { 495 return 0 496 } 497 i := len(closetag) 498 499 // check that the rest of the line is blank 500 skip := 0 501 if skip = p.isEmpty(data[i:]); skip == 0 { 502 return 0 503 } 504 i += skip 505 skip = 0 506 507 if i >= len(data) { 508 return i 509 } 510 511 if p.extensions&LaxHTMLBlocks != 0 { 512 return i 513 } 514 if skip = p.isEmpty(data[i:]); skip == 0 { 515 // following line must be blank 516 return 0 517 } 518 519 return i + skip 520 } 521 522 func (*Markdown) isEmpty(data []byte) int { 523 // it is okay to call isEmpty on an empty buffer 524 if len(data) == 0 { 525 return 0 526 } 527 528 var i int 529 for i = 0; i < len(data) && data[i] != '\n'; i++ { 530 if data[i] != ' ' && data[i] != '\t' { 531 return 0 532 } 533 } 534 if i < len(data) && data[i] == '\n' { 535 i++ 536 } 537 return i 538 } 539 540 func (*Markdown) isHRule(data []byte) bool { 541 i := 0 542 543 // skip up to three spaces 544 for i < 3 && data[i] == ' ' { 545 i++ 546 } 547 548 // look at the hrule char 549 if data[i] != '*' && data[i] != '-' && data[i] != '_' { 550 return false 551 } 552 c := data[i] 553 554 // the whole line must be the char or whitespace 555 n := 0 556 for i < len(data) && data[i] != '\n' { 557 switch { 558 case data[i] == c: 559 n++ 560 case data[i] != ' ': 561 return false 562 } 563 i++ 564 } 565 566 return n >= 3 567 } 568 569 // isFenceLine checks if there's a fence line (e.g., ``` or ``` go) at the beginning of data, 570 // and returns the end index if so, or 0 otherwise. It also returns the marker found. 571 // If info is not nil, it gets set to the syntax specified in the fence line. 572 func isFenceLine(data []byte, info *string, oldmarker string) (end int, marker string) { 573 i, size := 0, 0 574 575 // skip up to three spaces 576 for i < len(data) && i < 3 && data[i] == ' ' { 577 i++ 578 } 579 580 // check for the marker characters: ~ or ` 581 if i >= len(data) { 582 return 0, "" 583 } 584 if data[i] != '~' && data[i] != '`' { 585 return 0, "" 586 } 587 588 c := data[i] 589 590 // the whole line must be the same char or whitespace 591 for i < len(data) && data[i] == c { 592 size++ 593 i++ 594 } 595 596 // the marker char must occur at least 3 times 597 if size < 3 { 598 return 0, "" 599 } 600 marker = string(data[i-size : i]) 601 602 // if this is the end marker, it must match the beginning marker 603 if oldmarker != "" && marker != oldmarker { 604 return 0, "" 605 } 606 607 // TODO(shurcooL): It's probably a good idea to simplify the 2 code paths here 608 // into one, always get the info string, and discard it if the caller doesn't care. 609 if info != nil { 610 infoLength := 0 611 i = skipChar(data, i, ' ') 612 613 if i >= len(data) { 614 if i == len(data) { 615 return i, marker 616 } 617 return 0, "" 618 } 619 620 infoStart := i 621 622 if data[i] == '{' { 623 i++ 624 infoStart++ 625 626 for i < len(data) && data[i] != '}' && data[i] != '\n' { 627 infoLength++ 628 i++ 629 } 630 631 if i >= len(data) || data[i] != '}' { 632 return 0, "" 633 } 634 635 // strip all whitespace at the beginning and the end 636 // of the {} block 637 for infoLength > 0 && isspace(data[infoStart]) { 638 infoStart++ 639 infoLength-- 640 } 641 642 for infoLength > 0 && isspace(data[infoStart+infoLength-1]) { 643 infoLength-- 644 } 645 i++ 646 i = skipChar(data, i, ' ') 647 } else { 648 for i < len(data) && !isverticalspace(data[i]) { 649 infoLength++ 650 i++ 651 } 652 } 653 654 *info = strings.TrimSpace(string(data[infoStart : infoStart+infoLength])) 655 } 656 657 if i == len(data) { 658 return i, marker 659 } 660 if i > len(data) || data[i] != '\n' { 661 return 0, "" 662 } 663 return i + 1, marker // Take newline into account. 664 } 665 666 // fencedCodeBlock returns the end index if data contains a fenced code block at the beginning, 667 // or 0 otherwise. It writes to out if doRender is true, otherwise it has no side effects. 668 // If doRender is true, a final newline is mandatory to recognize the fenced code block. 669 func (p *Markdown) fencedCodeBlock(data []byte, doRender bool) int { 670 var info string 671 beg, marker := isFenceLine(data, &info, "") 672 if beg == 0 || beg >= len(data) { 673 return 0 674 } 675 fenceLength := beg - 1 676 677 var work bytes.Buffer 678 work.Write([]byte(info)) 679 work.WriteByte('\n') 680 681 for { 682 // safe to assume beg < len(data) 683 684 // check for the end of the code block 685 fenceEnd, _ := isFenceLine(data[beg:], nil, marker) 686 if fenceEnd != 0 { 687 beg += fenceEnd 688 break 689 } 690 691 // copy the current line 692 end := skipUntilChar(data, beg, '\n') + 1 693 694 // did we reach the end of the buffer without a closing marker? 695 if end >= len(data) { 696 return 0 697 } 698 699 // verbatim copy to the working buffer 700 if doRender { 701 work.Write(data[beg:end]) 702 } 703 beg = end 704 } 705 706 if doRender { 707 block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer 708 block.IsFenced = true 709 block.FenceLength = fenceLength 710 finalizeCodeBlock(block) 711 } 712 713 return beg 714 } 715 716 func unescapeChar(str []byte) []byte { 717 if str[0] == '\\' { 718 return []byte{str[1]} 719 } 720 return []byte(html.UnescapeString(string(str))) 721 } 722 723 func unescapeString(str []byte) []byte { 724 if reBackslashOrAmp.Match(str) { 725 return reEntityOrEscapedChar.ReplaceAllFunc(str, unescapeChar) 726 } 727 return str 728 } 729 730 func finalizeCodeBlock(block *Node) { 731 if block.IsFenced { 732 newlinePos := bytes.IndexByte(block.content, '\n') 733 firstLine := block.content[:newlinePos] 734 rest := block.content[newlinePos+1:] 735 block.Info = unescapeString(bytes.Trim(firstLine, "\n")) 736 block.Literal = rest 737 } else { 738 block.Literal = block.content 739 } 740 block.content = nil 741 } 742 743 func (p *Markdown) table(data []byte) int { 744 table := p.addBlock(Table, nil) 745 i, columns := p.tableHeader(data) 746 if i == 0 { 747 p.tip = table.Parent 748 table.Unlink() 749 return 0 750 } 751 752 p.addBlock(TableBody, nil) 753 754 for i < len(data) { 755 pipes, rowStart := 0, i 756 for ; i < len(data) && data[i] != '\n'; i++ { 757 if data[i] == '|' { 758 pipes++ 759 } 760 } 761 762 if pipes == 0 { 763 i = rowStart 764 break 765 } 766 767 // include the newline in data sent to tableRow 768 if i < len(data) && data[i] == '\n' { 769 i++ 770 } 771 p.tableRow(data[rowStart:i], columns, false) 772 } 773 774 return i 775 } 776 777 // check if the specified position is preceded by an odd number of backslashes 778 func isBackslashEscaped(data []byte, i int) bool { 779 backslashes := 0 780 for i-backslashes-1 >= 0 && data[i-backslashes-1] == '\\' { 781 backslashes++ 782 } 783 return backslashes&1 == 1 784 } 785 786 func (p *Markdown) tableHeader(data []byte) (size int, columns []CellAlignFlags) { 787 i := 0 788 colCount := 1 789 for i = 0; i < len(data) && data[i] != '\n'; i++ { 790 if data[i] == '|' && !isBackslashEscaped(data, i) { 791 colCount++ 792 } 793 } 794 795 // doesn't look like a table header 796 if colCount == 1 { 797 return 798 } 799 800 // include the newline in the data sent to tableRow 801 j := i 802 if j < len(data) && data[j] == '\n' { 803 j++ 804 } 805 header := data[:j] 806 807 // column count ignores pipes at beginning or end of line 808 if data[0] == '|' { 809 colCount-- 810 } 811 if i > 2 && data[i-1] == '|' && !isBackslashEscaped(data, i-1) { 812 colCount-- 813 } 814 815 columns = make([]CellAlignFlags, colCount) 816 817 // move on to the header underline 818 i++ 819 if i >= len(data) { 820 return 821 } 822 823 if data[i] == '|' && !isBackslashEscaped(data, i) { 824 i++ 825 } 826 i = skipChar(data, i, ' ') 827 828 // each column header is of form: / *:?-+:? *|/ with # dashes + # colons >= 3 829 // and trailing | optional on last column 830 col := 0 831 for i < len(data) && data[i] != '\n' { 832 dashes := 0 833 834 if data[i] == ':' { 835 i++ 836 columns[col] |= TableAlignmentLeft 837 dashes++ 838 } 839 for i < len(data) && data[i] == '-' { 840 i++ 841 dashes++ 842 } 843 if i < len(data) && data[i] == ':' { 844 i++ 845 columns[col] |= TableAlignmentRight 846 dashes++ 847 } 848 for i < len(data) && data[i] == ' ' { 849 i++ 850 } 851 if i == len(data) { 852 return 853 } 854 // end of column test is messy 855 switch { 856 case dashes < 3: 857 // not a valid column 858 return 859 860 case data[i] == '|' && !isBackslashEscaped(data, i): 861 // marker found, now skip past trailing whitespace 862 col++ 863 i++ 864 for i < len(data) && data[i] == ' ' { 865 i++ 866 } 867 868 // trailing junk found after last column 869 if col >= colCount && i < len(data) && data[i] != '\n' { 870 return 871 } 872 873 case (data[i] != '|' || isBackslashEscaped(data, i)) && col+1 < colCount: 874 // something else found where marker was required 875 return 876 877 case data[i] == '\n': 878 // marker is optional for the last column 879 col++ 880 881 default: 882 // trailing junk found after last column 883 return 884 } 885 } 886 if col != colCount { 887 return 888 } 889 890 p.addBlock(TableHead, nil) 891 p.tableRow(header, columns, true) 892 size = i 893 if size < len(data) && data[size] == '\n' { 894 size++ 895 } 896 return 897 } 898 899 func (p *Markdown) tableRow(data []byte, columns []CellAlignFlags, header bool) { 900 p.addBlock(TableRow, nil) 901 i, col := 0, 0 902 903 if data[i] == '|' && !isBackslashEscaped(data, i) { 904 i++ 905 } 906 907 for col = 0; col < len(columns) && i < len(data); col++ { 908 for i < len(data) && data[i] == ' ' { 909 i++ 910 } 911 912 cellStart := i 913 914 for i < len(data) && (data[i] != '|' || isBackslashEscaped(data, i)) && data[i] != '\n' { 915 i++ 916 } 917 918 cellEnd := i 919 920 // skip the end-of-cell marker, possibly taking us past end of buffer 921 i++ 922 923 for cellEnd > cellStart && cellEnd-1 < len(data) && data[cellEnd-1] == ' ' { 924 cellEnd-- 925 } 926 927 cell := p.addBlock(TableCell, data[cellStart:cellEnd]) 928 cell.IsHeader = header 929 cell.Align = columns[col] 930 } 931 932 // pad it out with empty columns to get the right number 933 for ; col < len(columns); col++ { 934 cell := p.addBlock(TableCell, nil) 935 cell.IsHeader = header 936 cell.Align = columns[col] 937 } 938 939 // silently ignore rows with too many cells 940 } 941 942 // returns blockquote prefix length 943 func (p *Markdown) quotePrefix(data []byte) int { 944 i := 0 945 for i < 3 && i < len(data) && data[i] == ' ' { 946 i++ 947 } 948 if i < len(data) && data[i] == '>' { 949 if i+1 < len(data) && data[i+1] == ' ' { 950 return i + 2 951 } 952 return i + 1 953 } 954 return 0 955 } 956 957 // blockquote ends with at least one blank line 958 // followed by something without a blockquote prefix 959 func (p *Markdown) terminateBlockquote(data []byte, beg, end int) bool { 960 if p.isEmpty(data[beg:]) <= 0 { 961 return false 962 } 963 if end >= len(data) { 964 return true 965 } 966 return p.quotePrefix(data[end:]) == 0 && p.isEmpty(data[end:]) == 0 967 } 968 969 // parse a blockquote fragment 970 func (p *Markdown) quote(data []byte) int { 971 block := p.addBlock(BlockQuote, nil) 972 var raw bytes.Buffer 973 beg, end := 0, 0 974 for beg < len(data) { 975 end = beg 976 // Step over whole lines, collecting them. While doing that, check for 977 // fenced code and if one's found, incorporate it altogether, 978 // irregardless of any contents inside it 979 for end < len(data) && data[end] != '\n' { 980 if p.extensions&FencedCode != 0 { 981 if i := p.fencedCodeBlock(data[end:], false); i > 0 { 982 // -1 to compensate for the extra end++ after the loop: 983 end += i - 1 984 break 985 } 986 } 987 end++ 988 } 989 if end < len(data) && data[end] == '\n' { 990 end++ 991 } 992 if pre := p.quotePrefix(data[beg:]); pre > 0 { 993 // skip the prefix 994 beg += pre 995 } else if p.terminateBlockquote(data, beg, end) { 996 break 997 } 998 // this line is part of the blockquote 999 raw.Write(data[beg:end]) 1000 beg = end 1001 } 1002 p.block(raw.Bytes()) 1003 p.finalize(block) 1004 return end 1005 } 1006 1007 // returns prefix length for block code 1008 func (p *Markdown) codePrefix(data []byte) int { 1009 if len(data) >= 1 && data[0] == '\t' { 1010 return 1 1011 } 1012 if len(data) >= 4 && data[0] == ' ' && data[1] == ' ' && data[2] == ' ' && data[3] == ' ' { 1013 return 4 1014 } 1015 return 0 1016 } 1017 1018 func (p *Markdown) code(data []byte) int { 1019 var work bytes.Buffer 1020 1021 i := 0 1022 for i < len(data) { 1023 beg := i 1024 for i < len(data) && data[i] != '\n' { 1025 i++ 1026 } 1027 if i < len(data) && data[i] == '\n' { 1028 i++ 1029 } 1030 1031 blankline := p.isEmpty(data[beg:i]) > 0 1032 if pre := p.codePrefix(data[beg:i]); pre > 0 { 1033 beg += pre 1034 } else if !blankline { 1035 // non-empty, non-prefixed line breaks the pre 1036 i = beg 1037 break 1038 } 1039 1040 // verbatim copy to the working buffer 1041 if blankline { 1042 work.WriteByte('\n') 1043 } else { 1044 work.Write(data[beg:i]) 1045 } 1046 } 1047 1048 // trim all the \n off the end of work 1049 workbytes := work.Bytes() 1050 eol := len(workbytes) 1051 for eol > 0 && workbytes[eol-1] == '\n' { 1052 eol-- 1053 } 1054 if eol != len(workbytes) { 1055 work.Truncate(eol) 1056 } 1057 1058 work.WriteByte('\n') 1059 1060 block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer 1061 block.IsFenced = false 1062 finalizeCodeBlock(block) 1063 1064 return i 1065 } 1066 1067 // returns unordered list item prefix 1068 func (p *Markdown) uliPrefix(data []byte) int { 1069 i := 0 1070 // start with up to 3 spaces 1071 for i < len(data) && i < 3 && data[i] == ' ' { 1072 i++ 1073 } 1074 if i >= len(data)-1 { 1075 return 0 1076 } 1077 // need one of {'*', '+', '-'} followed by a space or a tab 1078 if (data[i] != '*' && data[i] != '+' && data[i] != '-') || 1079 (data[i+1] != ' ' && data[i+1] != '\t') { 1080 return 0 1081 } 1082 return i + 2 1083 } 1084 1085 // returns ordered list item prefix 1086 func (p *Markdown) oliPrefix(data []byte) int { 1087 i := 0 1088 1089 // start with up to 3 spaces 1090 for i < 3 && i < len(data) && data[i] == ' ' { 1091 i++ 1092 } 1093 1094 // count the digits 1095 start := i 1096 for i < len(data) && data[i] >= '0' && data[i] <= '9' { 1097 i++ 1098 } 1099 if start == i || i >= len(data)-1 { 1100 return 0 1101 } 1102 1103 // we need >= 1 digits followed by a dot and a space or a tab 1104 if data[i] != '.' || !(data[i+1] == ' ' || data[i+1] == '\t') { 1105 return 0 1106 } 1107 return i + 2 1108 } 1109 1110 // returns definition list item prefix 1111 func (p *Markdown) dliPrefix(data []byte) int { 1112 if len(data) < 2 { 1113 return 0 1114 } 1115 i := 0 1116 // need a ':' followed by a space or a tab 1117 if data[i] != ':' || !(data[i+1] == ' ' || data[i+1] == '\t') { 1118 return 0 1119 } 1120 for i < len(data) && data[i] == ' ' { 1121 i++ 1122 } 1123 return i + 2 1124 } 1125 1126 // parse ordered or unordered list block 1127 func (p *Markdown) list(data []byte, flags ListType) int { 1128 i := 0 1129 flags |= ListItemBeginningOfList 1130 block := p.addBlock(List, nil) 1131 block.ListFlags = flags 1132 block.Tight = true 1133 1134 for i < len(data) { 1135 skip := p.listItem(data[i:], &flags) 1136 if flags&ListItemContainsBlock != 0 { 1137 block.ListData.Tight = false 1138 } 1139 i += skip 1140 if skip == 0 || flags&ListItemEndOfList != 0 { 1141 break 1142 } 1143 flags &= ^ListItemBeginningOfList 1144 } 1145 1146 above := block.Parent 1147 finalizeList(block) 1148 p.tip = above 1149 return i 1150 } 1151 1152 // Returns true if the list item is not the same type as its parent list 1153 func (p *Markdown) listTypeChanged(data []byte, flags *ListType) bool { 1154 if p.dliPrefix(data) > 0 && *flags&ListTypeDefinition == 0 { 1155 return true 1156 } else if p.oliPrefix(data) > 0 && *flags&ListTypeOrdered == 0 { 1157 return true 1158 } else if p.uliPrefix(data) > 0 && (*flags&ListTypeOrdered != 0 || *flags&ListTypeDefinition != 0) { 1159 return true 1160 } 1161 return false 1162 } 1163 1164 // Returns true if block ends with a blank line, descending if needed 1165 // into lists and sublists. 1166 func endsWithBlankLine(block *Node) bool { 1167 // TODO: figure this out. Always false now. 1168 for block != nil { 1169 //if block.lastLineBlank { 1170 //return true 1171 //} 1172 t := block.Type 1173 if t == List || t == Item { 1174 block = block.LastChild 1175 } else { 1176 break 1177 } 1178 } 1179 return false 1180 } 1181 1182 func finalizeList(block *Node) { 1183 block.open = false 1184 item := block.FirstChild 1185 for item != nil { 1186 // check for non-final list item ending with blank line: 1187 if endsWithBlankLine(item) && item.Next != nil { 1188 block.ListData.Tight = false 1189 break 1190 } 1191 // recurse into children of list item, to see if there are spaces 1192 // between any of them: 1193 subItem := item.FirstChild 1194 for subItem != nil { 1195 if endsWithBlankLine(subItem) && (item.Next != nil || subItem.Next != nil) { 1196 block.ListData.Tight = false 1197 break 1198 } 1199 subItem = subItem.Next 1200 } 1201 item = item.Next 1202 } 1203 } 1204 1205 // Parse a single list item. 1206 // Assumes initial prefix is already removed if this is a sublist. 1207 func (p *Markdown) listItem(data []byte, flags *ListType) int { 1208 // keep track of the indentation of the first line 1209 itemIndent := 0 1210 if data[0] == '\t' { 1211 itemIndent += 4 1212 } else { 1213 for itemIndent < 3 && data[itemIndent] == ' ' { 1214 itemIndent++ 1215 } 1216 } 1217 1218 var bulletChar byte = '*' 1219 i := p.uliPrefix(data) 1220 if i == 0 { 1221 i = p.oliPrefix(data) 1222 } else { 1223 bulletChar = data[i-2] 1224 } 1225 if i == 0 { 1226 i = p.dliPrefix(data) 1227 // reset definition term flag 1228 if i > 0 { 1229 *flags &= ^ListTypeTerm 1230 } 1231 } 1232 if i == 0 { 1233 // if in definition list, set term flag and continue 1234 if *flags&ListTypeDefinition != 0 { 1235 *flags |= ListTypeTerm 1236 } else { 1237 return 0 1238 } 1239 } 1240 1241 // skip leading whitespace on first line 1242 for i < len(data) && data[i] == ' ' { 1243 i++ 1244 } 1245 1246 // find the end of the line 1247 line := i 1248 for i > 0 && i < len(data) && data[i-1] != '\n' { 1249 i++ 1250 } 1251 1252 // get working buffer 1253 var raw bytes.Buffer 1254 1255 // put the first line into the working buffer 1256 raw.Write(data[line:i]) 1257 line = i 1258 1259 // process the following lines 1260 containsBlankLine := false 1261 sublist := 0 1262 codeBlockMarker := "" 1263 1264 gatherlines: 1265 for line < len(data) { 1266 i++ 1267 1268 // find the end of this line 1269 for i < len(data) && data[i-1] != '\n' { 1270 i++ 1271 } 1272 1273 // if it is an empty line, guess that it is part of this item 1274 // and move on to the next line 1275 if p.isEmpty(data[line:i]) > 0 { 1276 containsBlankLine = true 1277 line = i 1278 continue 1279 } 1280 1281 // calculate the indentation 1282 indent := 0 1283 indentIndex := 0 1284 if data[line] == '\t' { 1285 indentIndex++ 1286 indent += 4 1287 } else { 1288 for indent < 4 && line+indent < i && data[line+indent] == ' ' { 1289 indent++ 1290 indentIndex++ 1291 } 1292 } 1293 1294 chunk := data[line+indentIndex : i] 1295 1296 if p.extensions&FencedCode != 0 { 1297 // determine if in or out of codeblock 1298 // if in codeblock, ignore normal list processing 1299 _, marker := isFenceLine(chunk, nil, codeBlockMarker) 1300 if marker != "" { 1301 if codeBlockMarker == "" { 1302 // start of codeblock 1303 codeBlockMarker = marker 1304 } else { 1305 // end of codeblock. 1306 codeBlockMarker = "" 1307 } 1308 } 1309 // we are in a codeblock, write line, and continue 1310 if codeBlockMarker != "" || marker != "" { 1311 raw.Write(data[line+indentIndex : i]) 1312 line = i 1313 continue gatherlines 1314 } 1315 } 1316 1317 // evaluate how this line fits in 1318 switch { 1319 // is this a nested list item? 1320 case (p.uliPrefix(chunk) > 0 && !p.isHRule(chunk)) || 1321 p.oliPrefix(chunk) > 0 || 1322 p.dliPrefix(chunk) > 0: 1323 1324 // to be a nested list, it must be indented more 1325 // if not, it is either a different kind of list 1326 // or the next item in the same list 1327 if indent <= itemIndent { 1328 if p.listTypeChanged(chunk, flags) { 1329 *flags |= ListItemEndOfList 1330 } else if containsBlankLine { 1331 *flags |= ListItemContainsBlock 1332 } 1333 1334 break gatherlines 1335 } 1336 1337 if containsBlankLine { 1338 *flags |= ListItemContainsBlock 1339 } 1340 1341 // is this the first item in the nested list? 1342 if sublist == 0 { 1343 sublist = raw.Len() 1344 } 1345 1346 // is this a nested prefix heading? 1347 case p.isPrefixHeading(chunk): 1348 // if the heading is not indented, it is not nested in the list 1349 // and thus ends the list 1350 if containsBlankLine && indent < 4 { 1351 *flags |= ListItemEndOfList 1352 break gatherlines 1353 } 1354 *flags |= ListItemContainsBlock 1355 1356 // anything following an empty line is only part 1357 // of this item if it is indented 4 spaces 1358 // (regardless of the indentation of the beginning of the item) 1359 case containsBlankLine && indent < 4: 1360 if *flags&ListTypeDefinition != 0 && i < len(data)-1 { 1361 // is the next item still a part of this list? 1362 next := i 1363 for next < len(data) && data[next] != '\n' { 1364 next++ 1365 } 1366 for next < len(data)-1 && data[next] == '\n' { 1367 next++ 1368 } 1369 if i < len(data)-1 && data[i] != ':' && data[next] != ':' { 1370 *flags |= ListItemEndOfList 1371 } 1372 } else { 1373 *flags |= ListItemEndOfList 1374 } 1375 break gatherlines 1376 1377 // a blank line means this should be parsed as a block 1378 case containsBlankLine: 1379 raw.WriteByte('\n') 1380 *flags |= ListItemContainsBlock 1381 } 1382 1383 // if this line was preceded by one or more blanks, 1384 // re-introduce the blank into the buffer 1385 if containsBlankLine { 1386 containsBlankLine = false 1387 raw.WriteByte('\n') 1388 } 1389 1390 // add the line into the working buffer without prefix 1391 raw.Write(data[line+indentIndex : i]) 1392 1393 line = i 1394 } 1395 1396 rawBytes := raw.Bytes() 1397 1398 block := p.addBlock(Item, nil) 1399 block.ListFlags = *flags 1400 block.Tight = false 1401 block.BulletChar = bulletChar 1402 block.Delimiter = '.' // Only '.' is possible in Markdown, but ')' will also be possible in CommonMark 1403 1404 // render the contents of the list item 1405 if *flags&ListItemContainsBlock != 0 && *flags&ListTypeTerm == 0 { 1406 // intermediate render of block item, except for definition term 1407 if sublist > 0 { 1408 p.block(rawBytes[:sublist]) 1409 p.block(rawBytes[sublist:]) 1410 } else { 1411 p.block(rawBytes) 1412 } 1413 } else { 1414 // intermediate render of inline item 1415 if sublist > 0 { 1416 child := p.addChild(Paragraph, 0) 1417 child.content = rawBytes[:sublist] 1418 p.block(rawBytes[sublist:]) 1419 } else { 1420 child := p.addChild(Paragraph, 0) 1421 child.content = rawBytes 1422 } 1423 } 1424 return line 1425 } 1426 1427 // render a single paragraph that has already been parsed out 1428 func (p *Markdown) renderParagraph(data []byte) { 1429 if len(data) == 0 { 1430 return 1431 } 1432 1433 // trim leading spaces 1434 beg := 0 1435 for data[beg] == ' ' { 1436 beg++ 1437 } 1438 1439 end := len(data) 1440 // trim trailing newline 1441 if data[len(data)-1] == '\n' { 1442 end-- 1443 } 1444 1445 // trim trailing spaces 1446 for end > beg && data[end-1] == ' ' { 1447 end-- 1448 } 1449 1450 p.addBlock(Paragraph, data[beg:end]) 1451 } 1452 1453 func (p *Markdown) paragraph(data []byte) int { 1454 // prev: index of 1st char of previous line 1455 // line: index of 1st char of current line 1456 // i: index of cursor/end of current line 1457 var prev, line, i int 1458 tabSize := TabSizeDefault 1459 if p.extensions&TabSizeEight != 0 { 1460 tabSize = TabSizeDouble 1461 } 1462 // keep going until we find something to mark the end of the paragraph 1463 for i < len(data) { 1464 // mark the beginning of the current line 1465 prev = line 1466 current := data[i:] 1467 line = i 1468 1469 // did we find a reference or a footnote? If so, end a paragraph 1470 // preceding it and report that we have consumed up to the end of that 1471 // reference: 1472 if refEnd := isReference(p, current, tabSize); refEnd > 0 { 1473 p.renderParagraph(data[:i]) 1474 return i + refEnd 1475 } 1476 1477 // did we find a blank line marking the end of the paragraph? 1478 if n := p.isEmpty(current); n > 0 { 1479 // did this blank line followed by a definition list item? 1480 if p.extensions&DefinitionLists != 0 { 1481 if i < len(data)-1 && data[i+1] == ':' { 1482 return p.list(data[prev:], ListTypeDefinition) 1483 } 1484 } 1485 1486 p.renderParagraph(data[:i]) 1487 return i + n 1488 } 1489 1490 // an underline under some text marks a heading, so our paragraph ended on prev line 1491 if i > 0 { 1492 if level := p.isUnderlinedHeading(current); level > 0 { 1493 // render the paragraph 1494 p.renderParagraph(data[:prev]) 1495 1496 // ignore leading and trailing whitespace 1497 eol := i - 1 1498 for prev < eol && data[prev] == ' ' { 1499 prev++ 1500 } 1501 for eol > prev && data[eol-1] == ' ' { 1502 eol-- 1503 } 1504 1505 id := "" 1506 if p.extensions&AutoHeadingIDs != 0 { 1507 id = SanitizedAnchorName(string(data[prev:eol])) 1508 } 1509 1510 block := p.addBlock(Heading, data[prev:eol]) 1511 block.Level = level 1512 block.HeadingID = id 1513 1514 // find the end of the underline 1515 for i < len(data) && data[i] != '\n' { 1516 i++ 1517 } 1518 return i 1519 } 1520 } 1521 1522 // if the next line starts a block of HTML, then the paragraph ends here 1523 if p.extensions&LaxHTMLBlocks != 0 { 1524 if data[i] == '<' && p.html(current, false) > 0 { 1525 // rewind to before the HTML block 1526 p.renderParagraph(data[:i]) 1527 return i 1528 } 1529 } 1530 1531 // if there's a prefixed heading or a horizontal rule after this, paragraph is over 1532 if p.isPrefixHeading(current) || p.isHRule(current) { 1533 p.renderParagraph(data[:i]) 1534 return i 1535 } 1536 1537 // if there's a fenced code block, paragraph is over 1538 if p.extensions&FencedCode != 0 { 1539 if p.fencedCodeBlock(current, false) > 0 { 1540 p.renderParagraph(data[:i]) 1541 return i 1542 } 1543 } 1544 1545 // if there's a definition list item, prev line is a definition term 1546 if p.extensions&DefinitionLists != 0 { 1547 if p.dliPrefix(current) != 0 { 1548 ret := p.list(data[prev:], ListTypeDefinition) 1549 return ret 1550 } 1551 } 1552 1553 // if there's a list after this, paragraph is over 1554 if p.extensions&NoEmptyLineBeforeBlock != 0 { 1555 if p.uliPrefix(current) != 0 || 1556 p.oliPrefix(current) != 0 || 1557 p.quotePrefix(current) != 0 || 1558 p.codePrefix(current) != 0 { 1559 p.renderParagraph(data[:i]) 1560 return i 1561 } 1562 } 1563 1564 // otherwise, scan to the beginning of the next line 1565 nl := bytes.IndexByte(data[i:], '\n') 1566 if nl >= 0 { 1567 i += nl + 1 1568 } else { 1569 i += len(data[i:]) 1570 } 1571 } 1572 1573 p.renderParagraph(data[:i]) 1574 return i 1575 } 1576 1577 func skipChar(data []byte, start int, char byte) int { 1578 i := start 1579 for i < len(data) && data[i] == char { 1580 i++ 1581 } 1582 return i 1583 } 1584 1585 func skipUntilChar(text []byte, start int, char byte) int { 1586 i := start 1587 for i < len(text) && text[i] != char { 1588 i++ 1589 } 1590 return i 1591 } 1592 1593 // SanitizedAnchorName returns a sanitized anchor name for the given text. 1594 // 1595 // It implements the algorithm specified in the package comment. 1596 func SanitizedAnchorName(text string) string { 1597 var anchorName []rune 1598 futureDash := false 1599 for _, r := range text { 1600 switch { 1601 case unicode.IsLetter(r) || unicode.IsNumber(r): 1602 if futureDash && len(anchorName) > 0 { 1603 anchorName = append(anchorName, '-') 1604 } 1605 futureDash = false 1606 anchorName = append(anchorName, unicode.ToLower(r)) 1607 default: 1608 futureDash = true 1609 } 1610 } 1611 return string(anchorName) 1612 }