You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

388 lines
9.9 KiB

  1. // Copyright 2010 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package html
  5. import (
  6. "bufio"
  7. "bytes"
  8. "errors"
  9. "fmt"
  10. "io"
  11. "io/ioutil"
  12. "os"
  13. "path/filepath"
  14. "runtime"
  15. "sort"
  16. "strings"
  17. "testing"
  18. "golang.org/x/net/html/atom"
  19. )
  20. // readParseTest reads a single test case from r.
  21. func readParseTest(r *bufio.Reader) (text, want, context string, err error) {
  22. line, err := r.ReadSlice('\n')
  23. if err != nil {
  24. return "", "", "", err
  25. }
  26. var b []byte
  27. // Read the HTML.
  28. if string(line) != "#data\n" {
  29. return "", "", "", fmt.Errorf(`got %q want "#data\n"`, line)
  30. }
  31. for {
  32. line, err = r.ReadSlice('\n')
  33. if err != nil {
  34. return "", "", "", err
  35. }
  36. if line[0] == '#' {
  37. break
  38. }
  39. b = append(b, line...)
  40. }
  41. text = strings.TrimSuffix(string(b), "\n")
  42. b = b[:0]
  43. // Skip the error list.
  44. if string(line) != "#errors\n" {
  45. return "", "", "", fmt.Errorf(`got %q want "#errors\n"`, line)
  46. }
  47. for {
  48. line, err = r.ReadSlice('\n')
  49. if err != nil {
  50. return "", "", "", err
  51. }
  52. if line[0] == '#' {
  53. break
  54. }
  55. }
  56. if string(line) == "#document-fragment\n" {
  57. line, err = r.ReadSlice('\n')
  58. if err != nil {
  59. return "", "", "", err
  60. }
  61. context = strings.TrimSpace(string(line))
  62. line, err = r.ReadSlice('\n')
  63. if err != nil {
  64. return "", "", "", err
  65. }
  66. }
  67. // Read the dump of what the parse tree should be.
  68. if string(line) != "#document\n" {
  69. return "", "", "", fmt.Errorf(`got %q want "#document\n"`, line)
  70. }
  71. inQuote := false
  72. for {
  73. line, err = r.ReadSlice('\n')
  74. if err != nil && err != io.EOF {
  75. return "", "", "", err
  76. }
  77. trimmed := bytes.Trim(line, "| \n")
  78. if len(trimmed) > 0 {
  79. if line[0] == '|' && trimmed[0] == '"' {
  80. inQuote = true
  81. }
  82. if trimmed[len(trimmed)-1] == '"' && !(line[0] == '|' && len(trimmed) == 1) {
  83. inQuote = false
  84. }
  85. }
  86. if len(line) == 0 || len(line) == 1 && line[0] == '\n' && !inQuote {
  87. break
  88. }
  89. b = append(b, line...)
  90. }
  91. return text, string(b), context, nil
  92. }
  93. func dumpIndent(w io.Writer, level int) {
  94. io.WriteString(w, "| ")
  95. for i := 0; i < level; i++ {
  96. io.WriteString(w, " ")
  97. }
  98. }
  99. type sortedAttributes []Attribute
  100. func (a sortedAttributes) Len() int {
  101. return len(a)
  102. }
  103. func (a sortedAttributes) Less(i, j int) bool {
  104. if a[i].Namespace != a[j].Namespace {
  105. return a[i].Namespace < a[j].Namespace
  106. }
  107. return a[i].Key < a[j].Key
  108. }
  109. func (a sortedAttributes) Swap(i, j int) {
  110. a[i], a[j] = a[j], a[i]
  111. }
  112. func dumpLevel(w io.Writer, n *Node, level int) error {
  113. dumpIndent(w, level)
  114. switch n.Type {
  115. case ErrorNode:
  116. return errors.New("unexpected ErrorNode")
  117. case DocumentNode:
  118. return errors.New("unexpected DocumentNode")
  119. case ElementNode:
  120. if n.Namespace != "" {
  121. fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data)
  122. } else {
  123. fmt.Fprintf(w, "<%s>", n.Data)
  124. }
  125. attr := sortedAttributes(n.Attr)
  126. sort.Sort(attr)
  127. for _, a := range attr {
  128. io.WriteString(w, "\n")
  129. dumpIndent(w, level+1)
  130. if a.Namespace != "" {
  131. fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val)
  132. } else {
  133. fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val)
  134. }
  135. }
  136. case TextNode:
  137. fmt.Fprintf(w, `"%s"`, n.Data)
  138. case CommentNode:
  139. fmt.Fprintf(w, "<!-- %s -->", n.Data)
  140. case DoctypeNode:
  141. fmt.Fprintf(w, "<!DOCTYPE %s", n.Data)
  142. if n.Attr != nil {
  143. var p, s string
  144. for _, a := range n.Attr {
  145. switch a.Key {
  146. case "public":
  147. p = a.Val
  148. case "system":
  149. s = a.Val
  150. }
  151. }
  152. if p != "" || s != "" {
  153. fmt.Fprintf(w, ` "%s"`, p)
  154. fmt.Fprintf(w, ` "%s"`, s)
  155. }
  156. }
  157. io.WriteString(w, ">")
  158. case scopeMarkerNode:
  159. return errors.New("unexpected scopeMarkerNode")
  160. default:
  161. return errors.New("unknown node type")
  162. }
  163. io.WriteString(w, "\n")
  164. for c := n.FirstChild; c != nil; c = c.NextSibling {
  165. if err := dumpLevel(w, c, level+1); err != nil {
  166. return err
  167. }
  168. }
  169. return nil
  170. }
  171. func dump(n *Node) (string, error) {
  172. if n == nil || n.FirstChild == nil {
  173. return "", nil
  174. }
  175. var b bytes.Buffer
  176. for c := n.FirstChild; c != nil; c = c.NextSibling {
  177. if err := dumpLevel(&b, c, 0); err != nil {
  178. return "", err
  179. }
  180. }
  181. return b.String(), nil
  182. }
  183. const testDataDir = "testdata/webkit/"
  184. func TestParser(t *testing.T) {
  185. testFiles, err := filepath.Glob(testDataDir + "*.dat")
  186. if err != nil {
  187. t.Fatal(err)
  188. }
  189. for _, tf := range testFiles {
  190. f, err := os.Open(tf)
  191. if err != nil {
  192. t.Fatal(err)
  193. }
  194. defer f.Close()
  195. r := bufio.NewReader(f)
  196. for i := 0; ; i++ {
  197. text, want, context, err := readParseTest(r)
  198. if err == io.EOF {
  199. break
  200. }
  201. if err != nil {
  202. t.Fatal(err)
  203. }
  204. err = testParseCase(text, want, context)
  205. if err != nil {
  206. t.Errorf("%s test #%d %q, %s", tf, i, text, err)
  207. }
  208. }
  209. }
  210. }
  211. // testParseCase tests one test case from the test files. If the test does not
  212. // pass, it returns an error that explains the failure.
  213. // text is the HTML to be parsed, want is a dump of the correct parse tree,
  214. // and context is the name of the context node, if any.
  215. func testParseCase(text, want, context string) (err error) {
  216. defer func() {
  217. if x := recover(); x != nil {
  218. switch e := x.(type) {
  219. case error:
  220. err = e
  221. default:
  222. err = fmt.Errorf("%v", e)
  223. }
  224. }
  225. }()
  226. var doc *Node
  227. if context == "" {
  228. doc, err = Parse(strings.NewReader(text))
  229. if err != nil {
  230. return err
  231. }
  232. } else {
  233. contextNode := &Node{
  234. Type: ElementNode,
  235. DataAtom: atom.Lookup([]byte(context)),
  236. Data: context,
  237. }
  238. nodes, err := ParseFragment(strings.NewReader(text), contextNode)
  239. if err != nil {
  240. return err
  241. }
  242. doc = &Node{
  243. Type: DocumentNode,
  244. }
  245. for _, n := range nodes {
  246. doc.AppendChild(n)
  247. }
  248. }
  249. if err := checkTreeConsistency(doc); err != nil {
  250. return err
  251. }
  252. got, err := dump(doc)
  253. if err != nil {
  254. return err
  255. }
  256. // Compare the parsed tree to the #document section.
  257. if got != want {
  258. return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want)
  259. }
  260. if renderTestBlacklist[text] || context != "" {
  261. return nil
  262. }
  263. // Check that rendering and re-parsing results in an identical tree.
  264. pr, pw := io.Pipe()
  265. go func() {
  266. pw.CloseWithError(Render(pw, doc))
  267. }()
  268. doc1, err := Parse(pr)
  269. if err != nil {
  270. return err
  271. }
  272. got1, err := dump(doc1)
  273. if err != nil {
  274. return err
  275. }
  276. if got != got1 {
  277. return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1)
  278. }
  279. return nil
  280. }
  281. // Some test input result in parse trees are not 'well-formed' despite
  282. // following the HTML5 recovery algorithms. Rendering and re-parsing such a
  283. // tree will not result in an exact clone of that tree. We blacklist such
  284. // inputs from the render test.
  285. var renderTestBlacklist = map[string]bool{
  286. // The second <a> will be reparented to the first <table>'s parent. This
  287. // results in an <a> whose parent is an <a>, which is not 'well-formed'.
  288. `<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true,
  289. // The same thing with a <p>:
  290. `<p><table></p>`: true,
  291. // More cases of <a> being reparented:
  292. `<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true,
  293. `<a><table><a></table><p><a><div><a>`: true,
  294. `<a><table><td><a><table></table><a></tr><a></table><a>`: true,
  295. // A similar reparenting situation involving <nobr>:
  296. `<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true,
  297. // A <plaintext> element is reparented, putting it before a table.
  298. // A <plaintext> element can't have anything after it in HTML.
  299. `<table><plaintext><td>`: true,
  300. `<!doctype html><table><plaintext></plaintext>`: true,
  301. `<!doctype html><table><tbody><plaintext></plaintext>`: true,
  302. `<!doctype html><table><tbody><tr><plaintext></plaintext>`: true,
  303. // A form inside a table inside a form doesn't work either.
  304. `<!doctype html><form><table></form><form></table></form>`: true,
  305. // A script that ends at EOF may escape its own closing tag when rendered.
  306. `<!doctype html><script><!--<script `: true,
  307. `<!doctype html><script><!--<script <`: true,
  308. `<!doctype html><script><!--<script <a`: true,
  309. `<!doctype html><script><!--<script </`: true,
  310. `<!doctype html><script><!--<script </s`: true,
  311. `<!doctype html><script><!--<script </script`: true,
  312. `<!doctype html><script><!--<script </scripta`: true,
  313. `<!doctype html><script><!--<script -`: true,
  314. `<!doctype html><script><!--<script -a`: true,
  315. `<!doctype html><script><!--<script -<`: true,
  316. `<!doctype html><script><!--<script --`: true,
  317. `<!doctype html><script><!--<script --a`: true,
  318. `<!doctype html><script><!--<script --<`: true,
  319. `<script><!--<script `: true,
  320. `<script><!--<script <a`: true,
  321. `<script><!--<script </script`: true,
  322. `<script><!--<script </scripta`: true,
  323. `<script><!--<script -`: true,
  324. `<script><!--<script -a`: true,
  325. `<script><!--<script --`: true,
  326. `<script><!--<script --a`: true,
  327. `<script><!--<script <`: true,
  328. `<script><!--<script </`: true,
  329. `<script><!--<script </s`: true,
  330. // Reconstructing the active formatting elements results in a <plaintext>
  331. // element that contains an <a> element.
  332. `<!doctype html><p><a><plaintext>b`: true,
  333. }
  334. func TestNodeConsistency(t *testing.T) {
  335. // inconsistentNode is a Node whose DataAtom and Data do not agree.
  336. inconsistentNode := &Node{
  337. Type: ElementNode,
  338. DataAtom: atom.Frameset,
  339. Data: "table",
  340. }
  341. _, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode)
  342. if err == nil {
  343. t.Errorf("got nil error, want non-nil")
  344. }
  345. }
  346. func BenchmarkParser(b *testing.B) {
  347. buf, err := ioutil.ReadFile("testdata/go1.html")
  348. if err != nil {
  349. b.Fatalf("could not read testdata/go1.html: %v", err)
  350. }
  351. b.SetBytes(int64(len(buf)))
  352. runtime.GC()
  353. b.ReportAllocs()
  354. b.ResetTimer()
  355. for i := 0; i < b.N; i++ {
  356. Parse(bytes.NewBuffer(buf))
  357. }
  358. }