You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

239 lines
6.1 KiB

  1. /*! http://mths.be/utf8js v2.0.0 by @mathias */
  2. ;(function(root) {
  // Environment probing. Each `free*` variable ends up holding either the
  // detected host object or a falsy value when that object is absent.

  // CommonJS-style `exports` object, if one is in scope.
  var freeExports = typeof exports == 'object' && exports;

  // CommonJS-style `module` object; it only counts when its `exports`
  // property matches the `exports` object detected above.
  var freeModule = typeof module == 'object' && module;
  if (freeModule && freeModule.exports != freeExports) {
    freeModule = false;
  }

  // `global` (Node.js or Browserified code). When it refers back to itself
  // through `.global` or `.window`, it replaces the `root` passed to the IIFE.
  var freeGlobal = typeof global == 'object' && global;
  var refersToItself =
    freeGlobal.global === freeGlobal || freeGlobal.window === freeGlobal;
  if (refersToItself) {
    root = freeGlobal;
  }
  14. /*--------------------------------------------------------------------------*/
  15. var stringFromCharCode = String.fromCharCode;
  16. // Taken from http://mths.be/punycode
  // Turns a JavaScript string into an array of numeric code points.
  // A high surrogate immediately followed by a low surrogate is merged into
  // one supplementary code point; any other code unit (including an unmatched
  // surrogate) is emitted unchanged.
  function ucs2decode(string) {
    var codePoints = [];
    var total = string.length;
    for (var pos = 0; pos < total; pos++) {
      var unit = string.charCodeAt(pos);
      var isHighSurrogate = unit >= 0xD800 && unit <= 0xDBFF;
      if (isHighSurrogate && pos + 1 < total) {
        var next = string.charCodeAt(pos + 1);
        if ((next & 0xFC00) == 0xDC00) {
          // Valid pair: consume both code units.
          codePoints.push(((unit & 0x3FF) << 10) + (next & 0x3FF) + 0x10000);
          pos++;
          continue;
        }
        // Not a low surrogate: leave `next` for the following iteration, as
        // it may itself start a surrogate pair.
      }
      codePoints.push(unit);
    }
    return codePoints;
  }
  42. // Taken from http://mths.be/punycode
  // Builds a JavaScript string from an array of numeric code points.
  // Values above 0xFFFF are written as a high/low surrogate pair; every other
  // value is passed straight to `stringFromCharCode`.
  function ucs2encode(array) {
    var result = '';
    for (var i = 0, count = array.length; i < count; i++) {
      var codePoint = array[i];
      if (codePoint > 0xFFFF) {
        var offset = codePoint - 0x10000;
        // High surrogate carries the top 10 bits of the offset ...
        result += stringFromCharCode(0xD800 | ((offset >>> 10) & 0x3FF));
        // ... and the low surrogate carries the bottom 10 bits.
        codePoint = 0xDC00 | (offset & 0x3FF);
      }
      result += stringFromCharCode(codePoint);
    }
    return result;
  }
  59. /*--------------------------------------------------------------------------*/
  // Produces one UTF-8 continuation byte (as a one-character string): the six
  // bits of `codePoint` found after shifting right by `shift`, tagged with the
  // 10xxxxxx marker.
  function createByte(codePoint, shift) {
    var sixBits = (codePoint >> shift) & 0x3F;
    return stringFromCharCode(0x80 | sixBits);
  }
  // Encodes a single code point as a string of UTF-8 bytes (one character per
  // byte). The sequence length is chosen by testing which high bits are set.
  function encodeCodePoint(codePoint) {
    // 1-byte sequence: the value is emitted as-is.
    if ((codePoint & 0xFFFFFF80) == 0) {
      return stringFromCharCode(codePoint);
    }

    // Everything except the final continuation byte.
    var head = '';
    if ((codePoint & 0xFFFFF800) == 0) {
      // 2-byte sequence: 110xxxxx lead byte.
      head = stringFromCharCode(0xC0 | ((codePoint >> 6) & 0x1F));
    } else if ((codePoint & 0xFFFF0000) == 0) {
      // 3-byte sequence: 1110xxxx lead byte plus one continuation byte.
      head = stringFromCharCode(0xE0 | ((codePoint >> 12) & 0x0F)) +
        createByte(codePoint, 6);
    } else if ((codePoint & 0xFFE00000) == 0) {
      // 4-byte sequence: 11110xxx lead byte plus two continuation bytes.
      head = stringFromCharCode(0xF0 | ((codePoint >> 18) & 0x07)) +
        createByte(codePoint, 12) +
        createByte(codePoint, 6);
    }

    // The last byte always carries the lowest six bits.
    return head + stringFromCharCode(0x80 | (codePoint & 0x3F));
  }
  // Encodes a JavaScript string as UTF-8 and returns the result as a "byte
  // string": the input is split into code points with `ucs2decode`, and the
  // bytes produced by `encodeCodePoint` for each one are concatenated.
  function utf8encode(string) {
    var codePoints = ucs2decode(string);
    var encoded = '';
    for (var i = 0, count = codePoints.length; i < count; i++) {
      encoded += encodeCodePoint(codePoints[i]);
    }
    return encoded;
  }
  98. /*--------------------------------------------------------------------------*/
  // Consumes the byte at `byteIndex` from `byteArray` and returns its six
  // payload bits. The index is advanced before the byte is validated.
  // Throws Error('Invalid byte index') when no byte is left to read, and
  // Error('Invalid continuation byte') when the byte is not of the form
  // 10xxxxxx.
  function readContinuationByte() {
    if (byteIndex >= byteCount) {
      throw Error('Invalid byte index');
    }

    var octet = byteArray[byteIndex++] & 0xFF;
    if ((octet & 0xC0) != 0x80) {
      throw Error('Invalid continuation byte');
    }
    return octet & 0x3F;
  }
  // Decodes the next UTF-8 sequence starting at `byteIndex` in `byteArray`
  // and advances `byteIndex` past the bytes it consumed.
  //
  // Returns the decoded code point, or `false` once `byteIndex` has reached
  // `byteCount` (end of input).
  //
  // Throws Error('Invalid byte index') when `byteIndex` is past the end,
  // Error('Invalid continuation byte') for an overlong 2- or 3-byte sequence
  // (and from `readContinuationByte` for a malformed or missing trailing
  // byte), and Error('Invalid UTF-8 detected') for an invalid lead byte or a
  // 4-byte sequence outside U+10000..U+10FFFF.
  function decodeSymbol() {
    var byte1;
    var byte2;
    var byte3;
    var byte4;
    var codePoint;

    if (byteIndex > byteCount) {
      throw Error('Invalid byte index');
    }

    if (byteIndex == byteCount) {
      return false;
    }

    // Read first byte
    byte1 = byteArray[byteIndex] & 0xFF;
    byteIndex++;

    // 1-byte sequence (no continuation bytes)
    if ((byte1 & 0x80) == 0) {
      return byte1;
    }

    // 2-byte sequence
    if ((byte1 & 0xE0) == 0xC0) {
      // `byte2` is already declared above; assign rather than re-declare.
      byte2 = readContinuationByte();
      codePoint = ((byte1 & 0x1F) << 6) | byte2;
      if (codePoint >= 0x80) {
        return codePoint;
      }
      // Overlong encoding of a value that fits in one byte.
      throw Error('Invalid continuation byte');
    }

    // 3-byte sequence (may include unpaired surrogates)
    if ((byte1 & 0xF0) == 0xE0) {
      byte2 = readContinuationByte();
      byte3 = readContinuationByte();
      codePoint = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3;
      if (codePoint >= 0x0800) {
        return codePoint;
      }
      // Overlong encoding of a value that fits in two bytes.
      throw Error('Invalid continuation byte');
    }

    // 4-byte sequence
    if ((byte1 & 0xF8) == 0xF0) {
      byte2 = readContinuationByte();
      byte3 = readContinuationByte();
      byte4 = readContinuationByte();
      // A 11110xxx lead byte carries three payload bits, hence the 0x07 mask.
      codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) |
        (byte3 << 0x06) | byte4;
      if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
        return codePoint;
      }
      // Overlong or out-of-range values fall through to the generic error.
    }

    throw Error('Invalid UTF-8 detected');
  }
  // Decoder state shared by `utf8decode`, `decodeSymbol` and
  // `readContinuationByte`; it is reset at the start of every decode call.
  var byteArray;
  var byteCount;
  var byteIndex;

  // Decodes a UTF-8 "byte string" into a regular JavaScript string.
  // The input is split into numeric values with `ucs2decode`, symbols are
  // pulled one by one with `decodeSymbol` until it returns `false`, and the
  // collected code points are turned back into a string with `ucs2encode`.
  // Errors thrown by `decodeSymbol` propagate to the caller.
  function utf8decode(byteString) {
    byteArray = ucs2decode(byteString);
    byteCount = byteArray.length;
    byteIndex = 0;

    var decoded = [];
    for (var symbol = decodeSymbol(); symbol !== false; symbol = decodeSymbol()) {
      decoded.push(symbol);
    }
    return ucs2encode(decoded);
  }
  178. /*--------------------------------------------------------------------------*/
  // Public API object handed to whichever module system is detected below.
  var utf8 = {
    version: '2.0.0',
    encode: utf8encode,
    decode: utf8decode
  };

  // Some AMD build optimizers, like r.js, check for specific condition
  // patterns like the following, so this test is kept in exactly this shape.
  if (
    typeof define == 'function' &&
    typeof define.amd == 'object' &&
    define.amd
  ) {
    define(function() {
      return utf8;
    });
  } else if (freeExports && !freeExports.nodeType) {
    if (freeModule) {
      // In Node.js or RingoJS v0.8.0+: replace the module's exports.
      freeModule.exports = utf8;
    } else {
      // In Narwhal or RingoJS v0.7.0-: copy own properties onto `exports`.
      var ownsProperty = {}.hasOwnProperty;
      for (var member in utf8) {
        if (ownsProperty.call(utf8, member)) {
          freeExports[member] = utf8[member];
        }
      }
    }
  } else {
    // In Rhino or a web browser: expose as a property of the root object.
    root.utf8 = utf8;
  }
  207. }(this));