You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

331 lines
6.7 KiB

  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // This program takes an HTML file and outputs a corresponding article file in
  5. // present format. See: golang.org/x/tools/present
  6. package main // import "golang.org/x/tools/cmd/html2article"
  7. import (
  8. "bytes"
  9. "errors"
  10. "flag"
  11. "fmt"
  12. "io"
  13. "log"
  14. "net/url"
  15. "os"
  16. "regexp"
  17. "strings"
  18. "golang.org/x/net/html"
  19. "golang.org/x/net/html/atom"
  20. )
  21. func main() {
  22. flag.Parse()
  23. err := convert(os.Stdout, os.Stdin)
  24. if err != nil {
  25. log.Fatal(err)
  26. }
  27. }
  28. func convert(w io.Writer, r io.Reader) error {
  29. root, err := html.Parse(r)
  30. if err != nil {
  31. return err
  32. }
  33. style := find(root, isTag(atom.Style))
  34. if err := parseStyles(style); err != nil {
  35. log.Printf("couldn't parse all styles: %v", err)
  36. }
  37. body := find(root, isTag(atom.Body))
  38. if body == nil {
  39. return errors.New("couldn't find body")
  40. }
  41. article := limitNewlineRuns(makeHeadings(strings.TrimSpace(text(body))))
  42. _, err = fmt.Fprintf(w, "Title\n\n%s", article)
  43. return err
  44. }
  45. type Style string
  46. const (
  47. Bold Style = "*"
  48. Italic Style = "_"
  49. Code Style = "`"
  50. )
  51. var cssRules = make(map[string]Style)
  52. func parseStyles(style *html.Node) error {
  53. if style == nil || style.FirstChild == nil {
  54. return errors.New("couldn't find styles")
  55. }
  56. styles := style.FirstChild.Data
  57. readUntil := func(end rune) (string, bool) {
  58. i := strings.IndexRune(styles, end)
  59. if i < 0 {
  60. return "", false
  61. }
  62. s := styles[:i]
  63. styles = styles[i:]
  64. return s, true
  65. }
  66. for {
  67. sel, ok := readUntil('{')
  68. if !ok && sel == "" {
  69. break
  70. } else if !ok {
  71. return fmt.Errorf("could not parse selector %q", styles)
  72. }
  73. value, ok := readUntil('}')
  74. if !ok {
  75. return fmt.Errorf("couldn't parse style body for %s", sel)
  76. }
  77. switch {
  78. case strings.Contains(value, "italic"):
  79. cssRules[sel] = Italic
  80. case strings.Contains(value, "bold"):
  81. cssRules[sel] = Bold
  82. case strings.Contains(value, "Consolas") || strings.Contains(value, "Courier New"):
  83. cssRules[sel] = Code
  84. }
  85. }
  86. return nil
  87. }
  88. var newlineRun = regexp.MustCompile(`\n\n+`)
  89. func limitNewlineRuns(s string) string {
  90. return newlineRun.ReplaceAllString(s, "\n\n")
  91. }
  92. func makeHeadings(body string) string {
  93. buf := new(bytes.Buffer)
  94. lines := strings.Split(body, "\n")
  95. for i, s := range lines {
  96. if i == 0 && !isBoldTitle(s) {
  97. buf.WriteString("* Introduction\n\n")
  98. }
  99. if isBoldTitle(s) {
  100. s = strings.TrimSpace(strings.Replace(s, "*", " ", -1))
  101. s = "* " + s
  102. }
  103. buf.WriteString(s)
  104. buf.WriteByte('\n')
  105. }
  106. return buf.String()
  107. }
  108. func isBoldTitle(s string) bool {
  109. return !strings.Contains(s, " ") &&
  110. strings.HasPrefix(s, "*") &&
  111. strings.HasSuffix(s, "*")
  112. }
  113. func indent(buf *bytes.Buffer, s string) {
  114. for _, l := range strings.Split(s, "\n") {
  115. if l != "" {
  116. buf.WriteByte('\t')
  117. buf.WriteString(l)
  118. }
  119. buf.WriteByte('\n')
  120. }
  121. }
  122. func unwrap(buf *bytes.Buffer, s string) {
  123. var cont bool
  124. for _, l := range strings.Split(s, "\n") {
  125. l = strings.TrimSpace(l)
  126. if len(l) == 0 {
  127. if cont {
  128. buf.WriteByte('\n')
  129. buf.WriteByte('\n')
  130. }
  131. cont = false
  132. } else {
  133. if cont {
  134. buf.WriteByte(' ')
  135. }
  136. buf.WriteString(l)
  137. cont = true
  138. }
  139. }
  140. }
  141. func text(n *html.Node) string {
  142. var buf bytes.Buffer
  143. walk(n, func(n *html.Node) bool {
  144. switch n.Type {
  145. case html.TextNode:
  146. buf.WriteString(n.Data)
  147. return false
  148. case html.ElementNode:
  149. // no-op
  150. default:
  151. return true
  152. }
  153. a := n.DataAtom
  154. if a == atom.Span {
  155. switch {
  156. case hasStyle(Code)(n):
  157. a = atom.Code
  158. case hasStyle(Bold)(n):
  159. a = atom.B
  160. case hasStyle(Italic)(n):
  161. a = atom.I
  162. }
  163. }
  164. switch a {
  165. case atom.Br:
  166. buf.WriteByte('\n')
  167. case atom.P:
  168. unwrap(&buf, childText(n))
  169. buf.WriteString("\n\n")
  170. case atom.Li:
  171. buf.WriteString("- ")
  172. unwrap(&buf, childText(n))
  173. buf.WriteByte('\n')
  174. case atom.Pre:
  175. indent(&buf, childText(n))
  176. buf.WriteByte('\n')
  177. case atom.A:
  178. href, text := attr(n, "href"), childText(n)
  179. // Skip links with no text.
  180. if strings.TrimSpace(text) == "" {
  181. break
  182. }
  183. // Don't emit empty links.
  184. if strings.TrimSpace(href) == "" {
  185. buf.WriteString(text)
  186. break
  187. }
  188. // Use original url for Google Docs redirections.
  189. if u, err := url.Parse(href); err != nil {
  190. log.Printf("parsing url %q: %v", href, err)
  191. } else if u.Host == "www.google.com" && u.Path == "/url" {
  192. href = u.Query().Get("q")
  193. }
  194. fmt.Fprintf(&buf, "[[%s][%s]]", href, text)
  195. case atom.Code:
  196. buf.WriteString(highlight(n, "`"))
  197. case atom.B:
  198. buf.WriteString(highlight(n, "*"))
  199. case atom.I:
  200. buf.WriteString(highlight(n, "_"))
  201. case atom.Img:
  202. src := attr(n, "src")
  203. fmt.Fprintf(&buf, ".image %s\n", src)
  204. case atom.Iframe:
  205. src, w, h := attr(n, "src"), attr(n, "width"), attr(n, "height")
  206. fmt.Fprintf(&buf, "\n.iframe %s %s %s\n", src, h, w)
  207. case atom.Param:
  208. if attr(n, "name") == "movie" {
  209. // Old style YouTube embed.
  210. u := attr(n, "value")
  211. u = strings.Replace(u, "/v/", "/embed/", 1)
  212. if i := strings.Index(u, "&"); i >= 0 {
  213. u = u[:i]
  214. }
  215. fmt.Fprintf(&buf, "\n.iframe %s 540 304\n", u)
  216. }
  217. case atom.Title:
  218. default:
  219. return true
  220. }
  221. return false
  222. })
  223. return buf.String()
  224. }
  225. func childText(node *html.Node) string {
  226. var buf bytes.Buffer
  227. for n := node.FirstChild; n != nil; n = n.NextSibling {
  228. fmt.Fprint(&buf, text(n))
  229. }
  230. return buf.String()
  231. }
  232. func highlight(node *html.Node, char string) string {
  233. t := strings.Replace(childText(node), " ", char, -1)
  234. return fmt.Sprintf("%s%s%s", char, t, char)
  235. }
  236. type selector func(*html.Node) bool
  237. func isTag(a atom.Atom) selector {
  238. return func(n *html.Node) bool {
  239. return n.DataAtom == a
  240. }
  241. }
  242. func hasClass(name string) selector {
  243. return func(n *html.Node) bool {
  244. for _, a := range n.Attr {
  245. if a.Key == "class" {
  246. for _, c := range strings.Fields(a.Val) {
  247. if c == name {
  248. return true
  249. }
  250. }
  251. }
  252. }
  253. return false
  254. }
  255. }
  256. func hasStyle(s Style) selector {
  257. return func(n *html.Node) bool {
  258. for rule, s2 := range cssRules {
  259. if s2 != s {
  260. continue
  261. }
  262. if strings.HasPrefix(rule, ".") && hasClass(rule[1:])(n) {
  263. return true
  264. }
  265. if n.DataAtom.String() == rule {
  266. return true
  267. }
  268. }
  269. return false
  270. }
  271. }
  272. func attr(node *html.Node, key string) (value string) {
  273. for _, attr := range node.Attr {
  274. if attr.Key == key {
  275. return attr.Val
  276. }
  277. }
  278. return ""
  279. }
  280. func find(n *html.Node, fn selector) *html.Node {
  281. var result *html.Node
  282. walk(n, func(n *html.Node) bool {
  283. if result != nil {
  284. return false
  285. }
  286. if fn(n) {
  287. result = n
  288. return false
  289. }
  290. return true
  291. })
  292. return result
  293. }
  294. func walk(n *html.Node, fn selector) {
  295. if fn(n) {
  296. for c := n.FirstChild; c != nil; c = c.NextSibling {
  297. walk(c, fn)
  298. }
  299. }
  300. }