You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

335 lines
9.5 KiB

  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package encoding defines an interface for character encodings, such as Shift
  5. // JIS and Windows 1252, that can convert to and from UTF-8.
  6. //
  7. // Encoding implementations are provided in other packages, such as
  8. // golang.org/x/text/encoding/charmap and
  9. // golang.org/x/text/encoding/japanese.
  10. package encoding // import "golang.org/x/text/encoding"
  11. import (
  12. "errors"
  13. "io"
  14. "strconv"
  15. "unicode/utf8"
  16. "golang.org/x/text/encoding/internal/identifier"
  17. "golang.org/x/text/transform"
  18. )
  19. // TODO:
  20. // - There seems to be some inconsistency in when decoders return errors
  21. // and when not. Also documentation seems to suggest they shouldn't return
  22. // errors at all (except for UTF-16).
  23. // - Encoders seem to rely on or at least benefit from the input being in NFC
  24. // normal form. Perhaps add an example how users could prepare their output.
  25. // Encoding is a character set encoding that can be transformed to and from
  26. // UTF-8.
  27. type Encoding interface {
  28. // NewDecoder returns a Decoder.
  29. NewDecoder() *Decoder
  30. // NewEncoder returns an Encoder.
  31. NewEncoder() *Encoder
  32. }
  33. // A Decoder converts bytes to UTF-8. It implements transform.Transformer.
  34. //
  35. // Transforming source bytes that are not of that encoding will not result in an
  36. // error per se. Each byte that cannot be transcoded will be represented in the
  37. // output by the UTF-8 encoding of '\uFFFD', the replacement rune.
  38. type Decoder struct {
  39. transform.Transformer
  40. // This forces external creators of Decoders to use names in struct
  41. // initializers, allowing for future extendibility without having to break
  42. // code.
  43. _ struct{}
  44. }
  45. // Bytes converts the given encoded bytes to UTF-8. It returns the converted
  46. // bytes or nil, err if any error occurred.
  47. func (d *Decoder) Bytes(b []byte) ([]byte, error) {
  48. b, _, err := transform.Bytes(d, b)
  49. if err != nil {
  50. return nil, err
  51. }
  52. return b, nil
  53. }
  54. // String converts the given encoded string to UTF-8. It returns the converted
  55. // string or "", err if any error occurred.
  56. func (d *Decoder) String(s string) (string, error) {
  57. s, _, err := transform.String(d, s)
  58. if err != nil {
  59. return "", err
  60. }
  61. return s, nil
  62. }
  63. // Reader wraps another Reader to decode its bytes.
  64. //
  65. // The Decoder may not be used for any other operation as long as the returned
  66. // Reader is in use.
  67. func (d *Decoder) Reader(r io.Reader) io.Reader {
  68. return transform.NewReader(r, d)
  69. }
  70. // An Encoder converts bytes from UTF-8. It implements transform.Transformer.
  71. //
  72. // Each rune that cannot be transcoded will result in an error. In this case,
  73. // the transform will consume all source byte up to, not including the offending
  74. // rune. Transforming source bytes that are not valid UTF-8 will be replaced by
  75. // `\uFFFD`. To return early with an error instead, use transform.Chain to
  76. // preprocess the data with a UTF8Validator.
  77. type Encoder struct {
  78. transform.Transformer
  79. // This forces external creators of Encoders to use names in struct
  80. // initializers, allowing for future extendibility without having to break
  81. // code.
  82. _ struct{}
  83. }
  84. // Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if
  85. // any error occurred.
  86. func (e *Encoder) Bytes(b []byte) ([]byte, error) {
  87. b, _, err := transform.Bytes(e, b)
  88. if err != nil {
  89. return nil, err
  90. }
  91. return b, nil
  92. }
  93. // String converts a string from UTF-8. It returns the converted string or
  94. // "", err if any error occurred.
  95. func (e *Encoder) String(s string) (string, error) {
  96. s, _, err := transform.String(e, s)
  97. if err != nil {
  98. return "", err
  99. }
  100. return s, nil
  101. }
  102. // Writer wraps another Writer to encode its UTF-8 output.
  103. //
  104. // The Encoder may not be used for any other operation as long as the returned
  105. // Writer is in use.
  106. func (e *Encoder) Writer(w io.Writer) io.Writer {
  107. return transform.NewWriter(w, e)
  108. }
  // ASCIISub is the ASCII substitute character (0x1A), as recommended by
  // https://unicode.org/reports/tr36/#Text_Comparison
  const ASCIISub = '\x1a'
  112. // Nop is the nop encoding. Its transformed bytes are the same as the source
  113. // bytes; it does not replace invalid UTF-8 sequences.
  114. var Nop Encoding = nop{}
  115. type nop struct{}
  116. func (nop) NewDecoder() *Decoder {
  117. return &Decoder{Transformer: transform.Nop}
  118. }
  119. func (nop) NewEncoder() *Encoder {
  120. return &Encoder{Transformer: transform.Nop}
  121. }
  122. // Replacement is the replacement encoding. Decoding from the replacement
  123. // encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
  124. // the replacement encoding yields the same as the source bytes except that
  125. // invalid UTF-8 is converted to '\uFFFD'.
  126. //
  127. // It is defined at http://encoding.spec.whatwg.org/#replacement
  128. var Replacement Encoding = replacement{}
  129. type replacement struct{}
  130. func (replacement) NewDecoder() *Decoder {
  131. return &Decoder{Transformer: replacementDecoder{}}
  132. }
  133. func (replacement) NewEncoder() *Encoder {
  134. return &Encoder{Transformer: replacementEncoder{}}
  135. }
  136. func (replacement) ID() (mib identifier.MIB, other string) {
  137. return identifier.Replacement, ""
  138. }
  139. type replacementDecoder struct{ transform.NopResetter }
  140. func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  141. if len(dst) < 3 {
  142. return 0, 0, transform.ErrShortDst
  143. }
  144. if atEOF {
  145. const fffd = "\ufffd"
  146. dst[0] = fffd[0]
  147. dst[1] = fffd[1]
  148. dst[2] = fffd[2]
  149. nDst = 3
  150. }
  151. return nDst, len(src), nil
  152. }
  153. type replacementEncoder struct{ transform.NopResetter }
  154. func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  155. r, size := rune(0), 0
  156. for ; nSrc < len(src); nSrc += size {
  157. r = rune(src[nSrc])
  158. // Decode a 1-byte rune.
  159. if r < utf8.RuneSelf {
  160. size = 1
  161. } else {
  162. // Decode a multi-byte rune.
  163. r, size = utf8.DecodeRune(src[nSrc:])
  164. if size == 1 {
  165. // All valid runes of size 1 (those below utf8.RuneSelf) were
  166. // handled above. We have invalid UTF-8 or we haven't seen the
  167. // full character yet.
  168. if !atEOF && !utf8.FullRune(src[nSrc:]) {
  169. err = transform.ErrShortSrc
  170. break
  171. }
  172. r = '\ufffd'
  173. }
  174. }
  175. if nDst+utf8.RuneLen(r) > len(dst) {
  176. err = transform.ErrShortDst
  177. break
  178. }
  179. nDst += utf8.EncodeRune(dst[nDst:], r)
  180. }
  181. return nDst, nSrc, err
  182. }
  183. // HTMLEscapeUnsupported wraps encoders to replace source runes outside the
  184. // repertoire of the destination encoding with HTML escape sequences.
  185. //
  186. // This wrapper exists to comply to URL and HTML forms requiring a
  187. // non-terminating legacy encoder. The produced sequences may lead to data
  188. // loss as they are indistinguishable from legitimate input. To avoid this
  189. // issue, use UTF-8 encodings whenever possible.
  190. func HTMLEscapeUnsupported(e *Encoder) *Encoder {
  191. return &Encoder{Transformer: &errorHandler{e, errorToHTML}}
  192. }
  193. // ReplaceUnsupported wraps encoders to replace source runes outside the
  194. // repertoire of the destination encoding with an encoding-specific
  195. // replacement.
  196. //
  197. // This wrapper is only provided for backwards compatibility and legacy
  198. // handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
  199. func ReplaceUnsupported(e *Encoder) *Encoder {
  200. return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}
  201. }
  202. type errorHandler struct {
  203. *Encoder
  204. handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
  205. }
  206. // TODO: consider making this error public in some form.
  207. type repertoireError interface {
  208. Replacement() byte
  209. }
  210. func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  211. nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)
  212. for err != nil {
  213. rerr, ok := err.(repertoireError)
  214. if !ok {
  215. return nDst, nSrc, err
  216. }
  217. r, sz := utf8.DecodeRune(src[nSrc:])
  218. n, ok := h.handler(dst[nDst:], r, rerr)
  219. if !ok {
  220. return nDst, nSrc, transform.ErrShortDst
  221. }
  222. err = nil
  223. nDst += n
  224. if nSrc += sz; nSrc < len(src) {
  225. var dn, sn int
  226. dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)
  227. nDst += dn
  228. nSrc += sn
  229. }
  230. }
  231. return nDst, nSrc, err
  232. }
  233. func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {
  234. buf := [8]byte{}
  235. b := strconv.AppendUint(buf[:0], uint64(r), 10)
  236. if n = len(b) + len("&#;"); n >= len(dst) {
  237. return 0, false
  238. }
  239. dst[0] = '&'
  240. dst[1] = '#'
  241. dst[copy(dst[2:], b)+2] = ';'
  242. return n, true
  243. }
  244. func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {
  245. if len(dst) == 0 {
  246. return 0, false
  247. }
  248. dst[0] = err.Replacement()
  249. return 1, true
  250. }
  // ErrInvalidUTF8 is returned by a transformer that encountered invalid
  // UTF-8 in its input.
  var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
  253. // UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
  254. // input byte that is not valid UTF-8.
  255. var UTF8Validator transform.Transformer = utf8Validator{}
  256. type utf8Validator struct{ transform.NopResetter }
  257. func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  258. n := len(src)
  259. if n > len(dst) {
  260. n = len(dst)
  261. }
  262. for i := 0; i < n; {
  263. if c := src[i]; c < utf8.RuneSelf {
  264. dst[i] = c
  265. i++
  266. continue
  267. }
  268. _, size := utf8.DecodeRune(src[i:])
  269. if size == 1 {
  270. // All valid runes of size 1 (those below utf8.RuneSelf) were
  271. // handled above. We have invalid UTF-8 or we haven't seen the
  272. // full character yet.
  273. err = ErrInvalidUTF8
  274. if !atEOF && !utf8.FullRune(src[i:]) {
  275. err = transform.ErrShortSrc
  276. }
  277. return i, i, err
  278. }
  279. if i+size > len(dst) {
  280. return i, i, transform.ErrShortDst
  281. }
  282. for ; size > 0; size-- {
  283. dst[i] = src[i]
  284. i++
  285. }
  286. }
  287. if len(src) > len(dst) {
  288. err = transform.ErrShortDst
  289. }
  290. return n, n, err
  291. }