You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
122 lines
4.9 KiB
122 lines
4.9 KiB
3 years ago
|
/**
|
||
|
* Class for encoding Acedia's `MutableText` value into UTF8 byte
|
||
|
* representation.
|
||
3 years ago
|
* This is a separate object instead of just a method to match design of
|
||
|
* `Utf8Decoder`.
|
||
3 years ago
|
* See [wiki page](https://en.wikipedia.org/wiki/UTF-8) for details.
|
||
|
* Copyright 2021 Anton Tarasenko
|
||
|
*------------------------------------------------------------------------------
|
||
|
* This file is part of Acedia.
|
||
|
*
|
||
|
* Acedia is free software: you can redistribute it and/or modify
|
||
|
* it under the terms of the GNU General Public License as published by
|
||
|
* the Free Software Foundation, version 3 of the License, or
|
||
|
* (at your option) any later version.
|
||
|
*
|
||
|
* Acedia is distributed in the hope that it will be useful,
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
* GNU General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU General Public License
|
||
|
* along with Acedia. If not, see <https://www.gnu.org/licenses/>.
|
||
|
*/
|
||
|
class Utf8Encoder extends AcediaObject;
|
||
|
|
||
|
// Limits on code point values that can be recorded with 1, 2, 3 and 4 bytes
|
||
|
// respectively
|
||
|
var private int utfLimit1, utfLimit2, utfLimit3, utfLimit4;
|
||
|
|
||
|
// Bit prefixes for UTF8 encoding
|
||
|
var private int utfMask2, utfMask3, utfMask4, utfMaskIn;
|
||
|
// This integer will have only 6 last bits be 1s.
|
||
|
// We need it to zero all but last 6 bits for `int`s (with `&` bit operator).
|
||
|
var private int lastSixBits;
|
||
|
|
||
|
/**
|
||
|
* Encodes passed `Text` object into UTF8 byte representation.
|
||
|
*
|
||
|
* In case passed `text` is somehow broken and contains invalid Unicode
|
||
|
* code points - this method will return empty array.
|
||
|
*
|
||
|
* @param text `Text` object to encode.
|
||
3 years ago
|
* @return UTF8 representation of passed `text` inside `ByteArrayRef`.
|
||
|
* `none` iff `text == none` or `text` contains invalid Unicode
|
||
3 years ago
|
* code points.
|
||
|
*/
|
||
3 years ago
|
public final function ByteArrayRef Encode(Text text)
|
||
3 years ago
|
{
|
||
|
local int i, nextCodepoint, textLength;
|
||
3 years ago
|
local ByteArrayRef buffer;
|
||
3 years ago
|
if (__().text.IsEmpty(text)) {
|
||
3 years ago
|
return none; // empty array
|
||
3 years ago
|
}
|
||
3 years ago
|
buffer = ByteArrayRef(_.memory.Allocate(class'ByteArrayRef'));
|
||
3 years ago
|
textLength = text.GetLength();
|
||
|
for (i = 0; i < textLength; i += 1)
|
||
|
{
|
||
|
nextCodepoint = text.GetCharacter(i).codePoint;
|
||
|
if (nextCodepoint <= utfLimit1) {
|
||
3 years ago
|
buffer.AddItem(nextCodepoint);
|
||
3 years ago
|
}
|
||
|
else if (nextCodepoint <= utfLimit2)
|
||
|
{
|
||
|
// Drop 6 bits that will be recorded inside second byte and
|
||
|
// add 2-byte sequence mask
|
||
3 years ago
|
buffer.AddItem(utfMask2 | (nextCodepoint >> 6));
|
||
3 years ago
|
// Take only last 6 bits for the second (last) byte
|
||
|
// + add inner-byte sequence mask
|
||
3 years ago
|
buffer.AddItem(utfMaskIn | (nextCodepoint & lastSixBits));
|
||
3 years ago
|
}
|
||
|
else if (nextCodepoint <= utfLimit3)
|
||
|
{
|
||
|
// Drop 12 bits that will be recorded inside second and third bytes
|
||
|
// and add 3-byte sequence mask
|
||
3 years ago
|
buffer.AddItem(utfMask3 | (nextCodepoint >> 12));
|
||
3 years ago
|
// Drop 6 bits that will be recorded inside third byte and
|
||
|
// add inner-byte sequence mask
|
||
3 years ago
|
buffer.AddItem(utfMaskIn | ((nextCodepoint >> 6) & lastSixBits));
|
||
3 years ago
|
// Take only last 6 bits for the third (last) byte
|
||
|
// + add inner-byte sequence mask
|
||
3 years ago
|
buffer.AddItem(utfMaskIn | (nextCodepoint & lastSixBits));
|
||
3 years ago
|
}
|
||
|
else if (nextCodepoint <= utfLimit4)
|
||
|
{
|
||
|
// Drop 18 bits that will be recorded inside second, third and
|
||
|
// fourth bytes, then add 4-byte sequence mask
|
||
3 years ago
|
buffer.AddItem(utfMask4 | (nextCodepoint >> 18));
|
||
3 years ago
|
// Drop 12 bits that will be recorded inside third and fourth bytes
|
||
|
// and add inner-byte sequence mask
|
||
3 years ago
|
buffer.AddItem(utfMaskIn | ((nextCodepoint >> 12) & lastSixBits));
|
||
3 years ago
|
// Drop 6 bits that will be recorded inside fourth byte
|
||
|
// and add inner-byte sequence mask
|
||
3 years ago
|
buffer.AddItem(utfMaskIn | ((nextCodepoint >> 6) & lastSixBits));
|
||
3 years ago
|
// Take only last 6 bits for the fourth (last) byte
|
||
|
// + add inner-byte sequence mask
|
||
3 years ago
|
buffer.AddItem(utfMaskIn | (nextCodepoint & lastSixBits));
|
||
3 years ago
|
}
|
||
|
else
|
||
|
{
|
||
|
// Outside of known Unicode range
|
||
|
// Should not be possible, since `Text` is expected to
|
||
|
// contain only correct Unicode
|
||
3 years ago
|
_.memory.Free(buffer);
|
||
|
buffer = none;
|
||
3 years ago
|
break;
|
||
|
}
|
||
|
}
|
||
|
return buffer;
|
||
|
}
|
||
|
|
||
|
defaultproperties
|
||
|
{
|
||
|
utfLimit1 = 127
|
||
|
utfLimit2 = 2047
|
||
|
utfLimit3 = 65535
|
||
|
utfLimit4 = 1114111
|
||
|
utfMask2 = 192 // 1 1 0 0 0 0 0 0
|
||
|
utfMask3 = 224 // 1 1 1 0 0 0 0 0
|
||
|
utfMask4 = 240 // 1 1 1 1 0 0 0 0
|
||
|
utfMaskIn = 128 // 1 0 0 0 0 0 0 0
|
||
|
lastSixBits = 63 // 0 0 1 1 1 1 1 1
|
||
|
}
|