|
#!/usr/bin/env python
|
|
|
|
import re
|
|
import json
|
|
|
|
# http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
|
|
# http://stackoverflow.com/a/13436167/96656
|
|
try:
    _unichr = unichr  # Python 2 spelling of "ordinal -> text character"
except NameError:
    _unichr = chr  # Python 3 has no `unichr`; `chr` does the same job


def unisymbol(codePoint):
    """Return the UTF-16 code unit string for a Unicode code point.

    Code points U+0000..U+FFFF (lone surrogates included) map to a single
    character.  Code points U+10000..U+10FFFF are always returned as an
    explicit high/low surrogate pair, built with the surrogate formulae
    instead of asking the interpreter for an astral character directly.

    Args:
        codePoint: integer code point.

    Returns:
        A one- or two-character text string, or the literal string
        ``'Error'`` when ``codePoint`` lies outside U+0000..U+10FFFF.
    """
    if 0x0000 <= codePoint <= 0xFFFF:
        return _unichr(codePoint)
    if 0x010000 <= codePoint <= 0x10FFFF:
        # Split the 20-bit offset into its upper and lower 10 bits.  Floor
        # division keeps this integer-only on every Python version.
        offset = codePoint - 0x10000
        highSurrogate = offset // 0x400 + 0xD800
        lowSurrogate = offset % 0x400 + 0xDC00
        return _unichr(highSurrogate) + _unichr(lowSurrogate)
    # Kept for backward compatibility: out-of-range input yields a sentinel
    # string rather than an exception.
    return 'Error'
|
|
|
|
def hexify(codePoint):
    """Format an integer code point in ``U+XXXXXX`` notation.

    The value is rendered as uppercase hexadecimal, zero-padded to at least
    six digits (wider values are not truncated).

    Args:
        codePoint: integer code point.

    Returns:
        A string such as ``'U+000041'`` or ``'U+10FFFF'``.
    """
    # Use the format mini-language instead of slicing `hex()` output: the
    # slice kept the trailing 'L' of Python 2 longs and mangled the sign of
    # negative numbers.
    return 'U+{:06X}'.format(codePoint)
|
|
|
|
def writeFile(filename, contents):
    """Write ``contents`` to ``filename``, ending in exactly one newline.

    The file name is echoed to standard output first.  Leading and trailing
    whitespace is stripped from ``contents`` before a single ``'\\n'`` is
    appended.  An existing file is overwritten.

    Args:
        filename: path of the file to create or overwrite.
        contents: text to write.
    """
    # The parenthesised single-argument form prints the same thing whether
    # `print` is the Python 2 statement or the Python 3 function.
    print(filename)
    with open(filename, 'w') as f:
        f.write(contents.strip() + '\n')
|
|
|
|
# Build one record per Unicode code point, U+0000 through U+10FFFF.
# NOTE(review): the encode step below is written for Python 2; Python 3's
# strict UTF-8 codec rejects surrogate code points, so it would need
# adapting before this script can run there.
data = []
for codePoint in range(0x000000, 0x10FFFF + 1):
    symbol = unisymbol(codePoint)
    # http://stackoverflow.com/a/17199950/96656
    # Encode to UTF-8, then map every resulting byte to the character with
    # the same ordinal (Latin-1) so the byte sequence can be stored as text.
    encoded = symbol.encode('utf8').decode('latin1')
    data.append({
        'codePoint': codePoint,
        'decoded': symbol,
        'encoded': encoded
    })

jsonData = json.dumps(data, sort_keys=False, indent=2, separators=(',', ': '))
# Use tabs instead of double spaces for indentation. Only the leading run of
# two-space units on each line is rewritten, so spaces inside string values
# (e.g. the U+0020 record) and after the ': ' separators are left alone.
jsonData = re.sub(
    r'^(?: {2})+',
    lambda match: '\t' * (len(match.group(0)) // 2),
    jsonData,
    flags=re.MULTILINE
)
# Uppercase the hexadecimal digits in \uXXXX escape sequences
jsonData = re.sub(
    r'\\u([a-fA-F0-9]{4})',
    lambda match: r'\u{}'.format(match.group(1).upper()),
    jsonData
)

writeFile('data.json', jsonData)
|