// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"bytes"
	"io"
	"io/ioutil"
	"reflect"
	"runtime"
	"strings"
	"testing"
)

type tokenTest struct {
	// A short description of the test case.
	desc string
	// The HTML to parse.
	html string
	// The string representations of the expected tokens, joined by '$'.
	golden string
}

var tokenTests = []tokenTest{
	{
		"empty",
		"",
		"",
	},
	// A single text node. The tokenizer should not break text nodes on whitespace,
	// nor should it normalize whitespace within a text node.
	{
		"text",
		"foo bar",
		"foo bar",
	},
	// An entity.
	{
		"entity",
		"one &lt; two",
		"one &lt; two",
	},
	// A start, self-closing and end tag. The tokenizer does not care if the start
	// and end tokens don't match; that is the job of the parser.
	{
		"tags",
		"<a>b<c/>d</e>",
		"<a>$b$<c/>$d$</e>",
	},
	// Angle brackets that aren't a tag.
	{
		"not a tag #0",
		"<",
		"&lt;",
	},
	{
		"not a tag #1",
		"</",
		"&lt;/",
	},
	{
		"not a tag #2",
		"</>",
		"<!---->",
	},
	{
		"not a tag #3",
		"a</>b",
		"a$<!---->$b",
	},
	{
		"not a tag #4",
		"</ >",
		"<!-- -->",
	},
	{
		"not a tag #5",
		"</.",
		"<!--.-->",
	},
	{
		"not a tag #6",
		"</.>",
		"<!--.-->",
	},
	{
		"not a tag #7",
		"a < b",
		"a &lt; b",
	},
	{
		"not a tag #8",
		"<.>",
		"&lt;.&gt;",
	},
	{
		"not a tag #9",
		"a<<<b>>>c",
		"a&lt;&lt;$<b>$&gt;&gt;c",
	},
	{
		"not a tag #10",
		"if x<0 and y < 0 then x*y>0",
		"if x&lt;0 and y &lt; 0 then x*y&gt;0",
	},
	{
		"not a tag #11",
		"<<p>",
		"&lt;$<p>",
	},
	// EOF in a tag name.
	{
		"tag name eof #0",
		"<a",
		"",
	},
	{
		"tag name eof #1",
		"<a ",
		"",
	},
	{
		"tag name eof #2",
		"a<b",
		"a",
	},
	{
		"tag name eof #3",
		"<a><b",
		"<a>",
	},
	{
		"tag name eof #4",
		`<a x`,
		``,
	},
	// Some malformed tags that are missing a '>'.
	{
		"malformed tag #0",
		`<p</p>`,
		`<p< p="">`,
	},
	{
		"malformed tag #1",
		`<p </p>`,
		`<p <="" p="">`,
	},
	{
		"malformed tag #2",
		`<p id`,
		``,
	},
	{
		"malformed tag #3",
		`<p id=`,
		``,
	},
	{
		"malformed tag #4",
		`<p id=>`,
		`<p id="">`,
	},
	{
		"malformed tag #5",
		`<p id=0`,
		``,
	},
	{
		"malformed tag #6",
		`<p id=0</p>`,
		`<p id="0&lt;/p">`,
	},
	{
		"malformed tag #7",
		`<p id="0</p>`,
		``,
	},
	{
		"malformed tag #8",
		`<p id="0"</p>`,
		`<p id="0" <="" p="">`,
	},
	{
		"malformed tag #9",
		`<p></p id`,
		`<p>`,
	},
	// Raw text and RCDATA.
	{
		"basic raw text",
		"<script><a></b></script>",
		"<script>$&lt;a&gt;&lt;/b&gt;$</script>",
	},
	{
		"unfinished script end tag",
		"<SCRIPT>a</SCR",
		"<script>$a&lt;/SCR",
	},
	{
		"broken script end tag",
		"<SCRIPT>a</SCR ipt>",
		"<script>$a&lt;/SCR ipt&gt;",
	},
	{
		"EOF in script end tag",
		"<SCRIPT>a</SCRipt",
		"<script>$a&lt;/SCRipt",
	},
	{
		"scriptx end tag",
		"<SCRIPT>a</SCRiptx",
		"<script>$a&lt;/SCRiptx",
	},
	{
		"' ' completes script end tag",
		"<SCRIPT>a</SCRipt ",
		"<script>$a",
	},
	{
		"'>' completes script end tag",
		"<SCRIPT>a</SCRipt>",
		"<script>$a$</script>",
	},
	{
		"self-closing script end tag",
		"<SCRIPT>a</SCRipt/>",
		"<script>$a$</script>",
	},
	{
		"nested script tag",
		"<SCRIPT>a</SCRipt<script>",
		"<script>$a&lt;/SCRipt&lt;script&gt;",
	},
	{
		"script end tag after unfinished",
		"<SCRIPT>a</SCRipt</script>",
		"<script>$a&lt;/SCRipt$</script>",
	},
	{
		"script/style mismatched tags",
		"<script>a</style>",
		"<script>$a&lt;/style&gt;",
	},
	{
		"style element with entity",
		"<style>&apos;",
		"<style>$&amp;apos;",
	},
	{
		"textarea with tag",
		"<textarea><div></textarea>",
		"<textarea>$&lt;div&gt;$</textarea>",
	},
	{
		"title with tag and entity",
		"<title><b>K&amp;R C</b></title>",
		"<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
	},
	// DOCTYPE tests.
	{
		"Proper DOCTYPE",
		"<!DOCTYPE html>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with no space",
		"<!doctypehtml>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with two spaces",
		"<!doctype  html>",
		"<!DOCTYPE html>",
	},
	{
		"looks like DOCTYPE but isn't",
		"<!DOCUMENT html>",
		"<!--DOCUMENT html-->",
	},
	{
		"DOCTYPE at EOF",
		"<!DOCtype",
		"<!DOCTYPE >",
	},
	// XML processing instructions.
	{
		"XML processing instruction",
		"<?xml?>",
		"<!--?xml?-->",
	},
	// Comments.
	{
		"comment0",
		"abc<b><!-- skipme --></b>def",
		"abc$<b>$<!-- skipme -->$</b>$def",
	},
	{
		"comment1",
		"a<!-->z",
		"a$<!---->$z",
	},
	{
		"comment2",
		"a<!--->z",
		"a$<!---->$z",
	},
	{
		"comment3",
		"a<!--x>-->z",
		"a$<!--x>-->$z",
	},
	{
		"comment4",
		"a<!--x->-->z",
		"a$<!--x->-->$z",
	},
	{
		"comment5",
		"a<!>z",
		"a$<!---->$z",
	},
	{
		"comment6",
		"a<!->z",
		"a$<!----->$z",
	},
	{
		"comment7",
		"a<!---<>z",
		"a$<!---<>z-->",
	},
	{
		"comment8",
		"a<!--z",
		"a$<!--z-->",
	},
	{
		"comment9",
		"a<!--z-",
		"a$<!--z-->",
	},
	{
		"comment10",
		"a<!--z--",
		"a$<!--z-->",
	},
	{
		"comment11",
		"a<!--z---",
		"a$<!--z--->",
	},
	{
		"comment12",
		"a<!--z----",
		"a$<!--z---->",
	},
	{
		"comment13",
		"a<!--x--!>z",
		"a$<!--x-->$z",
	},
	// An attribute with a backslash.
	{
		"backslash",
		`<p id="a\"b">`,
		`<p id="a\" b"="">`,
	},
	// Entities, tag name and attribute key lower-casing, and whitespace
	// normalization within a tag.
	{
		"tricky",
		"<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
		`<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
	},
	// A nonexistent entity. Tokenizing and converting back to a string should
	// escape the "&" to become "&amp;".
	{
		"noSuchEntity",
		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
		`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
	},
	{
		"entity without semicolon",
		`&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
		`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
	},
	{
		"entity with digits",
		"&frac12;",
		"½",
	},
	// Attribute tests:
	// http://dev.w3.org/html5/pf-summary/Overview.html#attributes
	{
		"Empty attribute",
		`<input disabled FOO>`,
		`<input disabled="" foo="">`,
	},
	{
		"Empty attribute, whitespace",
		`<input disabled FOO >`,
		`<input disabled="" foo="">`,
	},
	{
		"Unquoted attribute value",
		`<input value=yes FOO=BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, spaces",
		`<input value = yes FOO = BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, trailing space",
		`<input value=yes FOO=BAR >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value",
		`<input value='yes' FOO='BAR'>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value, trailing space",
		`<input value='yes' FOO='BAR' >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Double-quoted attribute value",
		`<input value="I'm an attribute" FOO="BAR">`,
		`<input value="I&#39;m an attribute" foo="BAR">`,
	},
	{
		"Attribute name characters",
		`<meta http-equiv="content-type">`,
		`<meta http-equiv="content-type">`,
	},
	{
		"Mixed attributes",
		`a<P V="0 1" w='2' X=3 y>z`,
		`a$<p v="0 1" w="2" x="3" y="">$z`,
	},
	{
		"Attributes with a solitary single quote",
		`<p id=can't><p id=won't>`,
		`<p id="can&#39;t">$<p id="won&#39;t">`,
	},
}

func TestTokenizer(t *testing.T) {
loop:
	for _, tt := range tokenTests {
		z := NewTokenizer(strings.NewReader(tt.html))
		if tt.golden != "" {
			for i, s := range strings.Split(tt.golden, "$") {
				if z.Next() == ErrorToken {
					t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err())
					continue loop
				}
				actual := z.Token().String()
				if s != actual {
					t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
					continue loop
				}
			}
		}
		z.Next()
		if z.Err() != io.EOF {
			t.Errorf("%s: want EOF got %q", tt.desc, z.Err())
		}
	}
}

func TestMaxBuffer(t *testing.T) {
	// Exceeding the maximum buffer size generates ErrBufferExceeded.
	z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10)))
	z.SetMaxBuf(5)
	tt := z.Next()
	if got, want := tt, ErrorToken; got != want {
		t.Fatalf("token type: got: %v want: %v", got, want)
	}
	if got, want := z.Err(), ErrBufferExceeded; got != want {
		t.Errorf("error type: got: %v want: %v", got, want)
	}
	if got, want := string(z.Raw()), "<tttt"; got != want {
		t.Fatalf("buffered before overflow: got: %q want: %q", got, want)
	}
}

func TestMaxBufferReconstruction(t *testing.T) {
	// Exceeding the maximum buffer size at any point while tokenizing permits
	// reconstructing the original input.
tests:
	for _, test := range tokenTests {
		for maxBuf := 1; ; maxBuf++ {
			r := strings.NewReader(test.html)
			z := NewTokenizer(r)
			z.SetMaxBuf(maxBuf)
			var tokenized bytes.Buffer
			for {
				tt := z.Next()
				tokenized.Write(z.Raw())
				if tt == ErrorToken {
					if err := z.Err(); err != io.EOF && err != ErrBufferExceeded {
						t.Errorf("%s: unexpected error: %v", test.desc, err)
					}
					break
				}
			}
			// Anything tokenized, plus any input still buffered by the tokenizer or
			// left unread in the reader, should reassemble into the original input.
			assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r))
			if err != nil {
				t.Errorf("%s: ReadAll: %v", test.desc, err)
				continue tests
			}
			if got, want := string(assembled), test.html; got != want {
				t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
				continue tests
			}
			// EOF indicates that we completed tokenization and hence found the max
			// maxBuf that generates ErrBufferExceeded, so continue to the next test.
			if z.Err() == io.EOF {
				break
			}
		} // buffer sizes
	} // tests
}

func TestPassthrough(t *testing.T) {
	// Accumulating the raw output for each parse event should reconstruct the
	// original input.
	for _, test := range tokenTests {
		z := NewTokenizer(strings.NewReader(test.html))
		var parsed bytes.Buffer
		for {
			tt := z.Next()
			parsed.Write(z.Raw())
			if tt == ErrorToken {
				break
			}
		}
		if got, want := parsed.String(), test.html; got != want {
			t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want)
		}
	}
}

func TestBufAPI(t *testing.T) {
	s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
	z := NewTokenizer(bytes.NewBufferString(s))
	var result bytes.Buffer
	depth := 0
loop:
	for {
		tt := z.Next()
		switch tt {
		case ErrorToken:
			if z.Err() != io.EOF {
				t.Error(z.Err())
			}
			break loop
		case TextToken:
			if depth > 0 {
				result.Write(z.Text())
			}
		case StartTagToken, EndTagToken:
			tn, _ := z.TagName()
			if len(tn) == 1 && tn[0] == 'a' {
				if tt == StartTagToken {
					depth++
				} else {
					depth--
				}
			}
		}
	}
	u := "14567"
	v := string(result.Bytes())
	if u != v {
		t.Errorf("TestBufAPI: want %q got %q", u, v)
	}
}

func TestConvertNewlines(t *testing.T) {
	testCases := map[string]string{
		"Mac\rDOS\r\nUnix\n":    "Mac\nDOS\nUnix\n",
		"Unix\nMac\rDOS\r\n":    "Unix\nMac\nDOS\n",
		"DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
		"":         "",
		"\n":       "\n",
		"\n\r":     "\n\n",
		"\r":       "\n",
		"\r\n":     "\n",
		"\r\n\n":   "\n\n",
		"\r\n\r":   "\n\n",
		"\r\n\r\n": "\n\n",
		"\r\r":     "\n\n",
		"\r\r\n":   "\n\n",
		"\r\r\n\n": "\n\n\n",
		"\r\r\r\n": "\n\n\n",
		"\r \n":    "\n \n",
		"xyz":      "xyz",
	}
	for in, want := range testCases {
		if got := string(convertNewlines([]byte(in))); got != want {
			t.Errorf("input %q: got %q, want %q", in, got, want)
		}
	}
}

func TestReaderEdgeCases(t *testing.T) {
	const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
	testCases := []io.Reader{
		&zeroOneByteReader{s: s},
		&eofStringsReader{s: s},
		&stuckReader{},
	}
	for i, tc := range testCases {
		got := []TokenType{}
		z := NewTokenizer(tc)
		for {
			tt := z.Next()
			if tt == ErrorToken {
				break
			}
			got = append(got, tt)
		}
		if err := z.Err(); err != nil && err != io.EOF {
			if err != io.ErrNoProgress {
				t.Errorf("i=%d: %v", i, err)
			}
			continue
		}
		want := []TokenType{
			StartTagToken,
			TextToken,
			EndTagToken,
		}
		if !reflect.DeepEqual(got, want) {
			t.Errorf("i=%d: got %v, want %v", i, got, want)
			continue
		}
	}
}

// zeroOneByteReader is like a strings.Reader that alternates between
// returning 0 bytes and 1 byte at a time.
type zeroOneByteReader struct {
	s string
	n int
}

func (r *zeroOneByteReader) Read(p []byte) (int, error) {
	if len(p) == 0 {
		return 0, nil
	}
	if len(r.s) == 0 {
		return 0, io.EOF
	}
	r.n++
	if r.n%2 != 0 {
		return 0, nil
	}
	p[0], r.s = r.s[0], r.s[1:]
	return 1, nil
}

// eofStringsReader is like a strings.Reader but can return an (n, err) where
// n > 0 && err != nil.
type eofStringsReader struct {
	s string
}

func (r *eofStringsReader) Read(p []byte) (int, error) {
	n := copy(p, r.s)
	r.s = r.s[n:]
	if r.s != "" {
		return n, nil
	}
	return n, io.EOF
}

// stuckReader is an io.Reader that always returns no data and no error.
// The tokenizer is expected to give up on such a reader with
// io.ErrNoProgress rather than loop forever (see TestReaderEdgeCases).
type stuckReader struct{}

func (*stuckReader) Read(p []byte) (int, error) {
	return 0, nil
}
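
// An illustrative addition, not in the original file: compile-time checks
// that each of the test readers above satisfies io.Reader.
var (
	_ io.Reader = (*zeroOneByteReader)(nil)
	_ io.Reader = (*eofStringsReader)(nil)
	_ io.Reader = (*stuckReader)(nil)
)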

const (
	rawLevel = iota
	lowLevel
	highLevel
)

func benchmarkTokenizer(b *testing.B, level int) {
	buf, err := ioutil.ReadFile("testdata/go1.html")
	if err != nil {
		b.Fatalf("could not read testdata/go1.html: %v", err)
	}
	b.SetBytes(int64(len(buf)))
	runtime.GC()
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		z := NewTokenizer(bytes.NewBuffer(buf))
		for {
			tt := z.Next()
			if tt == ErrorToken {
				if err := z.Err(); err != nil && err != io.EOF {
					b.Fatalf("tokenizer error: %v", err)
				}
				break
			}
			switch level {
			case rawLevel:
				// Calling z.Raw just returns the raw bytes of the token. It does
				// not unescape &lt; to <, or lower-case tag names and attribute keys.
				z.Raw()
			case lowLevel:
				// Calling z.Text, z.TagName and z.TagAttr returns []byte values
				// whose contents may change on the next call to z.Next.
				switch tt {
				case TextToken, CommentToken, DoctypeToken:
					z.Text()
				case StartTagToken, SelfClosingTagToken:
					_, more := z.TagName()
					for more {
						_, _, more = z.TagAttr()
					}
				case EndTagToken:
					z.TagName()
				}
			case highLevel:
				// Calling z.Token converts []byte values to strings whose validity
				// extends beyond the next call to z.Next.
				z.Token()
			}
		}
	}
}

func BenchmarkRawLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, rawLevel) }
func BenchmarkLowLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, lowLevel) }
func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }
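
// An illustrative sketch, not part of the original test file: the benchmark
// comments above note that z.Text, z.TagName and z.TagAttr return []byte
// values that are only valid until the next call to z.Next, so a caller
// that retains them must copy first, e.g. by converting to string as below.
func ExampleTokenizer_lowLevel() {
	z := NewTokenizer(strings.NewReader("<p>one <b>two</b></p>"))
	var tags []string
	for {
		tt := z.Next()
		if tt == ErrorToken {
			// z.Err() is io.EOF once the input is exhausted.
			break
		}
		if tt == StartTagToken || tt == EndTagToken {
			name, _ := z.TagName()
			// string(name) copies the bytes, so the value stays valid
			// after the next call to z.Next.
			tags = append(tags, string(name))
		}
	}
	_ = tags // tags is now []string{"p", "b", "b", "p"}.
}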