You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

47 lines
1.3 KiB

  1. #!/usr/bin/env python
  2. import re
  3. import json
  4. # http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
  5. # http://stackoverflow.com/a/13436167/96656
  6. def unisymbol(codePoint):
  7. if codePoint >= 0x0000 and codePoint <= 0xFFFF:
  8. return unichr(codePoint)
  9. elif codePoint >= 0x010000 and codePoint <= 0x10FFFF:
  10. highSurrogate = int((codePoint - 0x10000) / 0x400) + 0xD800
  11. lowSurrogate = int((codePoint - 0x10000) % 0x400) + 0xDC00
  12. return unichr(highSurrogate) + unichr(lowSurrogate)
  13. else:
  14. return 'Error'
  15. def hexify(codePoint):
  16. return 'U+' + hex(codePoint)[2:].upper().zfill(6)
  17. def writeFile(filename, contents):
  18. print filename
  19. with open(filename, 'w') as f:
  20. f.write(contents.strip() + '\n')
  21. data = []
  22. for codePoint in range(0x000000, 0x10FFFF + 1):
  23. symbol = unisymbol(codePoint)
  24. # http://stackoverflow.com/a/17199950/96656
  25. bytes = symbol.encode('utf8').decode('latin1')
  26. data.append({
  27. 'codePoint': codePoint,
  28. 'decoded': symbol,
  29. 'encoded': bytes
  30. });
  31. jsonData = json.dumps(data, sort_keys=False, indent=2, separators=(',', ': '))
  32. # Use tabs instead of double spaces for indentation
  33. jsonData = jsonData.replace(' ', '\t')
  34. # Escape hexadecimal digits in escape sequences
  35. jsonData = re.sub(
  36. r'\\u([a-fA-F0-9]{4})',
  37. lambda match: r'\u{}'.format(match.group(1).upper()),
  38. jsonData
  39. )
  40. writeFile('data.json', jsonData)