|
|
|
/**
|
|
|
|
* Class for decoding UTF8 byte stream into Acedia's `MutableText` value.
|
|
|
|
* This is a separate object instead of just a method, because it allows
|
|
|
|
* to make code simpler by storing state variables related to
|
|
|
|
* the decoding process.
|
|
|
|
* This implementation should correctly convert any valid UTF8, but it is
|
|
|
|
* not guaranteed to reject any invalid UTF8. In particular, it accepts
|
|
|
|
* overlong code point encodings. It does check whether every byte has
|
|
|
|
* a correct bit prefix and does not attempt to repair input data if it finds
|
|
|
|
* invalid one.
|
|
|
|
* See [wiki page](https://en.wikipedia.org/wiki/UTF-8) for details.
|
|
|
|
* Copyright 2021 Anton Tarasenko
|
|
|
|
*------------------------------------------------------------------------------
|
|
|
|
* This file is part of Acedia.
|
|
|
|
*
|
|
|
|
* Acedia is free software: you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation, version 3 of the License, or
|
|
|
|
* (at your option) any later version.
|
|
|
|
*
|
|
|
|
* Acedia is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with Acedia. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
class Utf8Decoder extends AcediaObject;
|
|
|
|
|
|
|
|
// Variables for building a multi-byte code point.
|
|
|
|
// Stored as a class member variables to avoid copying them between methods.
|
|
|
|
var private MutableText builtText;
|
|
|
|
var private int nextCodePoint;
|
|
|
|
var private int innerBytesLeft;
|
|
|
|
|
|
|
|
// These masks (`maskDropN`) allow to turn into zero first `N` bits in
|
|
|
|
// the byte with `&` operator.
|
|
|
|
var private byte maskDrop1, maskDrop2, maskDrop3, maskDrop4, maskDrop5;
|
|
|
|
// These masks (`maskTakeN`) allow to turn into zero all but first `N` bits
|
|
|
|
// in the byte with `&` operator.
|
|
|
|
// `maskTakeN == ~maskDropN`.
|
|
|
|
var private byte maskTake1, maskTake2, maskTake3, maskTake4, maskTake5;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Decodes passed `byte` array (that contains utf8-encoded text) into
|
|
|
|
* the `MutableText` type.
|
|
|
|
*
|
|
|
|
* @param byteStream Byte stream to decode.
|
|
|
|
* @return `MutableText` that contains `byteStream`'s text data.
|
|
|
|
* `none` iff either `byteStream == none` or it's contents do not
|
|
|
|
* correspond to a (valid) utf8-encoded text.
|
|
|
|
*/
|
|
|
|
public final function MutableText Decode(ByteArrayRef byteStream)
|
|
|
|
{
|
|
|
|
local int i;
|
|
|
|
local int length;
|
|
|
|
local MutableText result;
|
|
|
|
if (byteStream == none) {
|
|
|
|
return none;
|
|
|
|
}
|
|
|
|
nextCodePoint = 0;
|
|
|
|
innerBytesLeft = 0;
|
|
|
|
builtText = _.text.Empty();
|
|
|
|
length = byteStream.GetLength();
|
|
|
|
for (i = 0; i < length; i += 1)
|
|
|
|
{
|
|
|
|
if (!PushByte(byteStream.GetItem(i)))
|
|
|
|
{
|
|
|
|
_.memory.Free(builtText);
|
|
|
|
return none;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (innerBytesLeft <= 0) {
|
|
|
|
result = builtText;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
_.memory.Free(builtText);
|
|
|
|
}
|
|
|
|
builtText = none;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
private final function bool PushByte(byte nextByte)
|
|
|
|
{
|
|
|
|
if (innerBytesLeft > 0) {
|
|
|
|
return PushInnerByte(nextByte);
|
|
|
|
}
|
|
|
|
// Form of 0xxxxxxx means 1 byte per code point
|
|
|
|
if ((nextByte & maskTake1) == 0)
|
|
|
|
{
|
|
|
|
AppendCodePoint(nextByte);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
// Form of 110xxxxx means 2 bytes per code point
|
|
|
|
if ((nextByte & maskTake3) == maskTake2) // maskTake2 == 1 1 0 0 0 0 0 0
|
|
|
|
{
|
|
|
|
nextCodePoint = nextByte & maskDrop3;
|
|
|
|
innerBytesLeft = 1;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
// Form of 1110xxxx means 3 bytes per code point
|
|
|
|
if ((nextByte & maskTake4) == maskTake3) // maskTake3 == 1 1 1 0 0 0 0 0
|
|
|
|
{
|
|
|
|
nextCodePoint = nextByte & maskDrop4;
|
|
|
|
innerBytesLeft = 2;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
// Form of 11110xxx means 4 bytes per code point
|
|
|
|
if ((nextByte & maskTake5) == maskTake4) // maskTake4 == 1 1 1 1 0 0 0 0
|
|
|
|
{
|
|
|
|
nextCodePoint = nextByte & maskDrop5;
|
|
|
|
innerBytesLeft = 3;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
// `nextByte` must have has one of the above forms
|
|
|
|
// (or 10xxxxxx that is handled in `PushInnerByte()`)
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// This method is responsible for pushing "inner" bytes: bytes that come
|
|
|
|
// after the first one when code point is encoded with multiple bytes.
|
|
|
|
// All of them are expected to have 10xxxxxx prefix.
|
|
|
|
// Assumes `innerBytesLeft > 0` to avoid needless checks.
|
|
|
|
private final function bool PushInnerByte(byte nextByte)
|
|
|
|
{
|
|
|
|
// Fail if `nextByte` does not have an expected form: 10xxxxxx
|
|
|
|
if ((nextByte & maskTake2) != maskTake1) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
// Since inner bytes have the form of 10xxxxxx, they all carry only 6 bits
|
|
|
|
// that actually encode code point, so to make space for those bits we must
|
|
|
|
// shift previously added code points by `6`
|
|
|
|
nextCodePoint = (nextCodePoint << 6) + (nextByte & maskDrop2);
|
|
|
|
innerBytesLeft -= 1;
|
|
|
|
if (innerBytesLeft <= 0) {
|
|
|
|
AppendCodePoint(nextCodePoint);
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
private final function AppendCodePoint(int codePoint)
|
|
|
|
{
|
|
|
|
local BaseText.Character nextCharacter;
|
|
|
|
nextCharacter.codePoint = codePoint;
|
|
|
|
builtText.AppendCharacter(nextCharacter);
|
|
|
|
}
|
|
|
|
|
|
|
|
defaultproperties
|
|
|
|
{
|
|
|
|
maskDrop1 = 127 // 0 1 1 1 1 1 1 1
|
|
|
|
maskDrop2 = 63 // 0 0 1 1 1 1 1 1
|
|
|
|
maskDrop3 = 31 // 0 0 0 1 1 1 1 1
|
|
|
|
maskDrop4 = 15 // 0 0 0 0 1 1 1 1
|
|
|
|
maskDrop5 = 7 // 0 0 0 0 0 1 1 1
|
|
|
|
maskTake1 = 128 // 1 0 0 0 0 0 0 0
|
|
|
|
maskTake2 = 192 // 1 1 0 0 0 0 0 0
|
|
|
|
maskTake3 = 224 // 1 1 1 0 0 0 0 0
|
|
|
|
maskTake4 = 240 // 1 1 1 1 0 0 0 0
|
|
|
|
maskTake5 = 248 // 1 1 1 1 1 0 0 0
|
|
|
|
}
|