/**
* Class for encoding Acedia's `Text` value into UTF8 byte
* representation.
* See [wiki page](https://en.wikipedia.org/wiki/UTF-8) for details.
* Copyright 2021 Anton Tarasenko
*------------------------------------------------------------------------------
* This file is part of Acedia.
*
* Acedia is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, version 3 of the License, or
* (at your option) any later version.
*
* Acedia is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Acedia. If not, see <https://www.gnu.org/licenses/>.
*/
class Utf8Encoder extends AcediaObject;

//  Largest code point that is recorded as a 1-byte UTF8 sequence
var private int utfLimit1;
//  Largest code point that is recorded as a 2-byte UTF8 sequence
var private int utfLimit2;
//  Largest code point that is recorded as a 3-byte UTF8 sequence
var private int utfLimit3;
//  Largest code point that is recorded as a 4-byte UTF8 sequence
var private int utfLimit4;

//  Bit prefix of the leading byte of a 2-byte UTF8 sequence
var private int utfMask2;
//  Bit prefix of the leading byte of a 3-byte UTF8 sequence
var private int utfMask3;
//  Bit prefix of the leading byte of a 4-byte UTF8 sequence
var private int utfMask4;
//  Bit prefix of every non-leading (inner) byte of a multi-byte UTF8 sequence
var private int utfMaskIn;

//  Integer in which only the lowest 6 bits are set.
//  Combined with the `&` operator it discards everything except the 6 lowest
//  bits of an `int`.
var private int lastSixBits;
/**
 *  Encodes passed `Text` object into UTF8 byte representation.
 *
 *  Every character of `text` is converted into a sequence of 1 to 4 bytes,
 *  depending on the value of its code point. Bytes of all characters are
 *  appended to the result in the order the characters appear in `text`.
 *
 *  In case passed `text` is somehow broken and contains invalid Unicode
 *  code points (negative ones or ones above `utfLimit4`) - this method will
 *  return empty array.
 *
 *  @param  text    `Text` object to encode.
 *  @return UTF8 representation of passed `text` as an array of `byte`s.
 *      Empty array if `text == none` or `text` contains invalid Unicode
 *      code points.
 */
public final function array<byte> Encode(Text text)
{
    local int           i, nextCodepoint, textLength;
    local array<byte>   buffer;
    if (__().text.IsEmpty(text)) {
        return buffer;  // empty array
    }
    textLength = text.GetLength();
    for (i = 0; i < textLength; i += 1)
    {
        nextCodepoint = text.GetCharacter(i).codePoint;
        //  Outside of known Unicode range.
        //  Negative values must be rejected explicitly: otherwise they would
        //  pass the `<= utfLimit1` check below and get recorded as a single
        //  (truncated) byte.
        //  NOTE(review): surrogate code points (U+D800 - U+DFFF) are still
        //  encoded as-is, although strict UTF8 forbids them - confirm whether
        //  `Text` can ever contain them.
        if (nextCodepoint < 0 || nextCodepoint > utfLimit4)
        {
            buffer.length = 0;
            break;
        }
        if (nextCodepoint <= utfLimit1)
        {
            //  Fits into a single byte and is recorded as-is
            buffer[buffer.length] = nextCodepoint;
        }
        else if (nextCodepoint <= utfLimit2)
        {
            //  Drop 6 bits that will be recorded inside second byte and
            //  add 2-byte sequence mask
            buffer[buffer.length] = utfMask2 | (nextCodepoint >> 6);
            //  Take only last 6 bits for the second (last) byte
            //  + add inner-byte sequence mask
            buffer[buffer.length] = utfMaskIn | (nextCodepoint & lastSixBits);
        }
        else if (nextCodepoint <= utfLimit3)
        {
            //  Drop 12 bits that will be recorded inside second and third
            //  bytes and add 3-byte sequence mask
            buffer[buffer.length] = utfMask3 | (nextCodepoint >> 12);
            //  Drop 6 bits that will be recorded inside third byte and
            //  add inner-byte sequence mask
            buffer[buffer.length] =
                utfMaskIn | ((nextCodepoint >> 6) & lastSixBits);
            //  Take only last 6 bits for the third (last) byte
            //  + add inner-byte sequence mask
            buffer[buffer.length] = utfMaskIn | (nextCodepoint & lastSixBits);
        }
        else
        {
            //  Here `utfLimit3 < nextCodepoint <= utfLimit4`.
            //  Drop 18 bits that will be recorded inside second, third and
            //  fourth bytes, then add 4-byte sequence mask
            buffer[buffer.length] = utfMask4 | (nextCodepoint >> 18);
            //  Drop 12 bits that will be recorded inside third and fourth
            //  bytes and add inner-byte sequence mask
            buffer[buffer.length] =
                utfMaskIn | ((nextCodepoint >> 12) & lastSixBits);
            //  Drop 6 bits that will be recorded inside fourth byte
            //  and add inner-byte sequence mask
            buffer[buffer.length] =
                utfMaskIn | ((nextCodepoint >> 6) & lastSixBits);
            //  Take only last 6 bits for the fourth (last) byte
            //  + add inner-byte sequence mask
            buffer[buffer.length] = utfMaskIn | (nextCodepoint & lastSixBits);
        }
    }
    return buffer;
}
//  Constants used by `Encode()`: upper bounds of code point ranges for each
//  UTF8 sequence length and bit patterns that get combined (via `|` / `&`)
//  with code point bits to build individual bytes.
defaultproperties
{
    utfLimit1 = 127 // 0x7F, largest code point recorded with 1 byte
    utfLimit2 = 2047 // 0x7FF, largest code point recorded with 2 bytes
    utfLimit3 = 65535 // 0xFFFF, largest code point recorded with 3 bytes
    utfLimit4 = 1114111 // 0x10FFFF, largest code point recorded with 4 bytes
    utfMask2 = 192 // 1 1 0 0 0 0 0 0 - leading byte of 2-byte sequence
    utfMask3 = 224 // 1 1 1 0 0 0 0 0 - leading byte of 3-byte sequence
    utfMask4 = 240 // 1 1 1 1 0 0 0 0 - leading byte of 4-byte sequence
    utfMaskIn = 128 // 1 0 0 0 0 0 0 0 - every non-leading (inner) byte
    lastSixBits = 63 // 0 0 1 1 1 1 1 1 - keeps only 6 lowest bits with `&`
}