You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
260 lines
9.3 KiB
260 lines
9.3 KiB
3 years ago
|
/**
|
||
|
* Class for decoding UTF8 byte stream into Acedia's `MutableText` value.
|
||
|
* It is made to work with incoming, and possibly incomplete, streams of
|
||
|
* bytes: instead of consuming the whole utf8 text, it is made to consume it
|
||
|
* byte-by-byte and store `MutableText`s that it parsed from the stream
|
||
|
* (assumes that separate `MutableText`s are separated by `0` byte).
|
||
|
* This implementation should correctly convert any valid UTF8, but it is
|
||
|
* not guaranteed to reject any invalid UTF8. In particular, it accepts
|
||
|
* overlong code point encodings (except overlong encoding of zero).
|
||
|
* It, however, does check whether every byte has a correct bit prefix and
|
||
|
* does not attempt to repair input data if it finds invalid one.
|
||
|
* See [wiki page](https://en.wikipedia.org/wiki/UTF-8) for details.
|
||
|
* Copyright 2021 Anton Tarasenko
|
||
|
*------------------------------------------------------------------------------
|
||
|
* This file is part of Acedia.
|
||
|
*
|
||
|
* Acedia is free software: you can redistribute it and/or modify
|
||
|
* it under the terms of the GNU General Public License as published by
|
||
|
* the Free Software Foundation, version 3 of the License, or
|
||
|
* (at your option) any later version.
|
||
|
*
|
||
|
* Acedia is distributed in the hope that it will be useful,
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
* GNU General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU General Public License
|
||
|
* along with Acedia. If not, see <https://www.gnu.org/licenses/>.
|
||
|
*/
|
||
|
class Utf8Decoder extends AcediaObject;
|
||
|
|
||
|
/**
|
||
|
* `Utf8Decoder` consumes byte by byte with `PushByte()` method and its
|
||
|
* algorithm is simple:
|
||
|
* 1. If it encounters a byte that encodes a singular code point by
|
||
|
* itself (starts with `0` bit) - it is added as a codepoint;
|
||
|
* 2. If it encounters byte which indicates that next code point is
|
||
|
* composed out of several bytes (starts with 110, 1110 or 11110) -
|
||
|
* remembers that it has to read several "inner" bytes belonging to
|
||
|
* the same code point and starts to expect them instead;
|
||
|
* 3. If it ever encounters a byte with unexpected (and thus invalid)
|
||
|
* bit prefix - enters a failed state;
|
||
|
* 4. If it ever encounters a `0` byte:
|
||
|
* * If it was not in a failed state - records `MutableText`
|
||
|
* accumulated so far;
|
||
|
* * Clears failed state.
|
||
|
*/
|
||
|
|
||
|
//  Raised when invalid UTF8 is detected in the text that is being built;
//  lowered again when the next `0` byte is pushed.
var private bool failedState;

//  State of the multi-byte code point that is being assembled:
//  bits collected so far and amount of "inner" bytes we still expect.
var private int nextCodePoint;
var private int innerBytesLeft;

//  `MutableText` that decoded characters are currently appended to
var private MutableText         nextText;
//  Completed `MutableText`s, stored in the order they were decoded
var private array<MutableText>  outputQueue;

//  `maskDropN`: applying it with `&` zeroes the first `N` bits of a byte.
var private byte maskDrop1, maskDrop2, maskDrop3, maskDrop4, maskDrop5;
//  `maskTakeN`: applying it with `&` zeroes everything but the first `N` bits
//  of a byte, i.e. `maskTakeN == ~maskDropN`.
var private byte maskTake1, maskTake2, maskTake3, maskTake4, maskTake5;
|
||
|
|
||
|
protected function Constructor()
{
    //  Begin with an empty `MutableText` that decoded characters will be
    //  appended to
    nextText = _.text.Empty();
}
|
||
|
|
||
|
protected function Finalizer()
{
    //  Release the text that was still being built...
    _.memory.Free(nextText);
    nextText = none;
    //  ...and every finished text that was never popped
    _.memory.FreeMany(outputQueue);
    outputQueue.length = 0;
    //  Return decoding state to its initial values
    nextCodePoint   = 0;
    innerBytesLeft  = 0;
    failedState     = false;
}
|
||
|
|
||
|
/**
|
||
|
* Checks whether data in the `MutableText` that caller `Utf8Decoder` is
|
||
|
* currently filling was detected to be invalid.
|
||
|
*
|
||
|
* This state can be reset by pushing `0` byte into caller `Utf8Decoder`.
|
||
|
* See `PushByte()` for more info.
|
||
|
*
|
||
|
* @return `true` iff caller `Utf8Decoder` is in a failed state.
|
||
|
*/
|
||
|
public final function bool Failed()
{
    //  `failedState` is raised on invalid input and lowered by a `0` byte
    return failedState;
}
|
||
|
|
||
|
/**
|
||
|
* Checks whether caller `Utf8Decoder` has any data put in
|
||
|
* the `MutableText` it is currently building.
|
||
|
* Result is guaranteed to be `false` after `self.PushByte(0)` call, since
|
||
|
* it starts a brand new `MutableText`.
|
||
|
*/
|
||
|
public final function bool HasUnfinishedData()
{
    //  Unfinished data is either a partially read multi-byte code point or
    //  characters already appended to the text under construction.
    //  `||` short-circuits, so the text is only queried when no inner bytes
    //  are pending.
    return (innerBytesLeft > 0 || nextText.GetLength() > 0);
}
|
||
|
|
||
|
/**
|
||
|
* Returns next `MutableText` that was successfully decoded by
|
||
|
* the caller `Utf8Decoder`, removing it from the output queue.
|
||
|
*
|
||
|
* @return Next `MutableText` in the caller `Utf8Decoder`'s output queue.
|
||
|
* `none` iff output queue is empty. `MutableText`s are returned in order
|
||
|
* they were decoded.
|
||
|
*/
|
||
|
public final function MutableText PopText()
{
    //  Stays `none` unless the queue has something to give
    local MutableText oldestText;

    if (outputQueue.length > 0)
    {
        //  Texts are queued at the end, so the oldest one sits at the front
        oldestText = outputQueue[0];
        outputQueue.Remove(0, 1);
    }
    return oldestText;
}
|
||
|
|
||
|
/**
|
||
|
* Adds next `byte` from the byte stream that is supposed to encode UTF8 text.
|
||
|
* To finish building `MutableText` pass `0` byte into this method, which will
|
||
|
* put `MutableText` built so far into an "output queue" (accessible with
|
||
|
* `PopText()`) and start building a new one.
|
||
|
*
|
||
|
* This method expects `byte`s, in order, from a sequence that has correct
|
||
|
* UTF8 encoding. If method detects incorrect UTF8 sequence - it will be put
|
||
|
* into a "failed state", discarding `MutableText` it was currently building,
|
||
|
* along with any further input (except `0` byte).
|
||
|
* Pushing `0` byte will restore `Utf8Decoder` from a failed state and it
|
||
|
* will start building a new `MutableText`.
|
||
|
*
|
||
|
* @param nextByte next byte from byte stream that is supposed to encode
|
||
|
* UTF8 text. `0` will make caller `Utf8Decoder` start building new
|
||
|
* `MutableText`.
|
||
|
* @return `true` iff caller `Utf8Decoder` was not in a failed state and
|
||
|
* operation was successful.
|
||
|
*/
|
||
|
public final function bool PushByte(byte nextByte)
{
    //  `0` byte always terminates current text, even in a failed state
    if (nextByte == 0) {
        return QueueCurrentText();
    }
    //  Everything else is ignored until failed state is cleared
    if (failedState) {
        return false;
    }
    //  We are in the middle of a multi-byte code point
    if (innerBytesLeft > 0) {
        return PushInnerByte(nextByte);
    }
    //  Otherwise `nextByte` starts a new code point and its bit prefix tells
    //  us how many bytes that code point occupies
    if ((nextByte & maskTake1) == 0)
    {
        //  0xxxxxxx: the byte is a complete code point by itself
        AppendCodePoint(nextByte);
    }
    else if ((nextByte & maskTake3) == maskTake2)
    {
        //  110xxxxx (`maskTake2` is 11000000): 2 bytes, 1 more to read
        nextCodePoint   = nextByte & maskDrop3;
        innerBytesLeft  = 1;
    }
    else if ((nextByte & maskTake4) == maskTake3)
    {
        //  1110xxxx (`maskTake3` is 11100000): 3 bytes, 2 more to read
        nextCodePoint   = nextByte & maskDrop4;
        innerBytesLeft  = 2;
    }
    else if ((nextByte & maskTake5) == maskTake4)
    {
        //  11110xxx (`maskTake4` is 11110000): 4 bytes, 3 more to read
        nextCodePoint   = nextByte & maskDrop5;
        innerBytesLeft  = 3;
    }
    else
    {
        //  `nextByte` must have one of the above forms
        //  (10xxxxxx is only valid as an inner byte and those are handled
        //  by `PushInnerByte()`)
        failedState = true;
        return false;
    }
    return true;
}
|
||
|
|
||
|
// This method is responsible for pushing "inner" bytes: bytes that come
|
||
|
// after the first one when code point is encoded with multiple bytes.
|
||
|
// All of them are expected to have 10xxxxxx prefix.
|
||
|
// Assumes `innerBytesLeft > 0` and `failedState == false`
|
||
|
// to avoid needless checks.
|
||
|
private final function bool PushInnerByte(byte nextByte)
{
    //  Inner bytes must look like 10xxxxxx: checking first two bits
    //  (`maskTake2`) must leave only the very first one (`maskTake1`)
    if ((nextByte & maskTake2) != maskTake1)
    {
        failedState = true;
        return false;
    }
    //  Each inner byte contributes 6 payload bits: move what we have gathered
    //  so far 6 bits to the left and place new bits into the freed space
    nextCodePoint   = (nextCodePoint << 6) | (nextByte & maskDrop2);
    innerBytesLeft  = innerBytesLeft - 1;
    //  Code point is not complete yet - wait for more bytes
    if (innerBytesLeft > 0) {
        return true;
    }
    //  Multi-byte (overlong) encoding of `0` is rejected
    //  (as the Unicode standard demands)
    if (nextCodePoint == 0)
    {
        failedState = true;
        return false;
    }
    AppendCodePoint(nextCodePoint);
    return true;
}
|
||
|
|
||
|
private final function AppendCodePoint(int codePoint)
{
    local Text.Character decodedCharacter;

    //  Only the code point is filled in; all other fields of the character
    //  keep their default values
    decodedCharacter.codePoint = codePoint;
    nextText.AppendCharacter(decodedCharacter);
}
|
||
|
|
||
|
// Return `true` if `MutableText` was added to the queue
|
||
|
// (there were no encoding errors)
|
||
|
private final function bool QueueCurrentText()
{
    local bool textIsValid;

    //  Text is good only if no error was detected before and we are not
    //  stopping in the middle of a multi-byte code point
    textIsValid = (!failedState && (innerBytesLeft <= 0));
    if (textIsValid) {
        outputQueue[outputQueue.length] = nextText;
    }
    else {
        _.memory.Free(nextText);
    }
    //  Either way we start over with a clean state and a fresh text
    nextText        = _.text.Empty();
    innerBytesLeft  = 0;
    failedState     = false;
    return textIsValid;
}
|
||
|
|
||
|
defaultproperties
{
    maskTake1 = 128 // 1000 0000
    maskDrop1 = 127 // 0111 1111
    maskTake2 = 192 // 1100 0000
    maskDrop2 = 63  // 0011 1111
    maskTake3 = 224 // 1110 0000
    maskDrop3 = 31  // 0001 1111
    maskTake4 = 240 // 1111 0000
    maskDrop4 = 15  // 0000 1111
    maskTake5 = 248 // 1111 1000
    maskDrop5 = 7   // 0000 0111
}
|