Browse Source

Refactor utf8 encoder and decoder

Make decoder work with byte arrays containing a single complete text,
instead of byte streams with several separate texts.

Moved codecs into "Text" category.
pull/8/head
Anton Tarasenko 3 years ago
parent
commit
323bf71e70
  1. BIN
      sources/Avarice/Tests/TEST_UTF8EncoderDecoder.uc
  2. 260
      sources/Avarice/Utf8Decoder.uc
  3. 161
      sources/Text/Codecs/Utf8Decoder.uc
  4. 39
      sources/Text/Codecs/Utf8Encoder.uc
  5. BIN
      sources/Text/Tests/TEST_UTF8EncoderDecoder.uc

BIN
sources/Avarice/Tests/TEST_UTF8EncoderDecoder.uc

Binary file not shown.

260
sources/Avarice/Utf8Decoder.uc

@ -1,260 +0,0 @@
/**
* Class for decoding UTF8 byte stream into Acedia's `MutableText` value.
* It is made to work with incoming, and possibly incomplete, streams of
* bytes: instead of consuming the whole utf8 text, it is made to consume it
* byte-by-byte and store `MutableText`s that it parsed from the stream
* (assumes that separate `MutableText`s are separated by `0` byte).
* This implementation should correctly convert any valid UTF8, but it is
* not guaranteed to reject any invalid UTF8. In particular, it accepts
* overlong code point encodings (except overlong encoding of zero).
* It, however, does check whether every byte has a correct bit prefix and
* does not attempt to repair input data if it finds invalid one.
* See [wiki page](https://en.wikipedia.org/wiki/UTF-8) for details.
* Copyright 2021 Anton Tarasenko
*------------------------------------------------------------------------------
* This file is part of Acedia.
*
* Acedia is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, version 3 of the License, or
* (at your option) any later version.
*
* Acedia is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Acedia. If not, see <https://www.gnu.org/licenses/>.
*/
class Utf8Decoder extends AcediaObject;
/**
* `Utf8Decoder` consumes byte by byte with `PushByte()` method and its
* algorithm is simple:
* 1. If it encounters a byte that encodes a singular code point by
* itself (starts with `0` bit) - it is added as a codepoint;
* 2. If it encounters byte which indicates that next code point is
* composed out of several bytes (starts with 110, 1110 or 11110) -
* remembers that it has to read several "inner" bytes belonging to
* the same code point and starts to expect them instead;
* 3. If it ever encounters a byte with unexpected (and thus invalid)
* bit prefix - enters a failed state;
* 4. If it ever encounters a `0` byte:
* * If it was not in a failed state - records `MutableText`
* accumulated so far;
* * Clears failed state.
*/
// `true` while the text currently being built is known to be invalid.
// Set when a byte with an unexpected bit prefix is pushed and cleared again
// once a `0` byte is pushed (see `QueueCurrentText()`).
var private bool failedState;
// Variables for building a multi-byte code point:
// bits of the code point that were collected so far
var private int nextCodePoint;
// amount of "inner" (10xxxxxx) bytes that still must be read to complete
// the code point
var private int innerBytesLeft;
// `MutableText` we are building right now
var private MutableText nextText;
// `MutableText`s we have already built, in the order they were decoded
// (new items are appended to the end, `PopText()` removes from the start)
var private array<MutableText> outputQueue;
// These masks (`maskDropN`) allow to turn into zero the first (highest) `N`
// bits of a byte with the `&` operator.
var private byte maskDrop1, maskDrop2, maskDrop3, maskDrop4, maskDrop5;
// These masks (`maskTakeN`) allow to turn into zero all but the first
// (highest) `N` bits of a byte with the `&` operator.
// `maskTakeN == ~maskDropN`.
var private byte maskTake1, maskTake2, maskTake3, maskTake4, maskTake5;
// Prepares the text object that pushed bytes are decoded into.
// NOTE(review): presumably invoked by Acedia when the object is allocated -
// confirm against `AcediaObject`.
protected function Constructor()
{
// `AppendCodePoint()` appends to `nextText` without checking it for `none`,
// so it has to exist before the first byte is pushed
nextText = _.text.Empty();
}
// Releases every text this decoder still holds and returns all decoding state
// to its initial values.
protected function Finalizer()
{
    // Text that was still being built
    _.memory.Free(nextText);
    nextText = none;
    // Texts that were decoded, but never claimed through `PopText()`
    _.memory.FreeMany(outputQueue);
    outputQueue.length = 0;
    // Forget any partially decoded code point and the failure flag
    nextCodePoint   = 0;
    innerBytesLeft  = 0;
    failedState     = false;
}
/**
 * Checks whether data in the `MutableText` that caller `Utf8Decoder` is
 * currently filling was detected to be invalid.
 *
 * This state can be reset by pushing `0` byte into caller `Utf8Decoder`.
 * See `PushByte()` for more info.
 *
 * @return `true` iff caller `Utf8Decoder` is in a failed state, i.e. it has
 *      detected invalid UTF8 input since the last `0` byte was pushed.
 */
public final function bool Failed()
{
return failedState;
}
/**
 * Tells whether caller `Utf8Decoder` has consumed any input for
 * the `MutableText` it is building at the moment: either complete code points
 * or the beginning of a multi-byte one.
 *
 * Pushing `0` byte starts a brand new `MutableText`, so right after
 * `self.PushByte(0)` this method returns `false`.
 *
 * @return `true` iff some data was already put into the unfinished
 *      `MutableText`.
 */
public final function bool HasUnfinishedData()
{
    // A multi-byte code point was started, but not all its bytes arrived yet
    if (innerBytesLeft > 0) {
        return true;
    }
    // Otherwise the answer depends only on already decoded characters
    return (nextText.GetLength() > 0);
}
/**
 * Takes the oldest successfully decoded `MutableText` out of the caller
 * `Utf8Decoder`'s output queue and hands it over to the caller.
 *
 * `MutableText`s are returned in the same order they were decoded in.
 *
 * @return Oldest `MutableText` in the output queue (it is removed from
 *      the queue). `none` iff the output queue is empty.
 */
public final function MutableText PopText()
{
    // Stays `none` unless there is something to take from the queue
    local MutableText oldestText;

    if (outputQueue.length > 0)
    {
        oldestText = outputQueue[0];
        outputQueue.Remove(0, 1);
    }
    return oldestText;
}
/**
 * Feeds the next `byte` of a (supposedly) UTF8-encoded byte stream into
 * the caller `Utf8Decoder`.
 *
 * Pushing `0` byte finishes the `MutableText` that is being built: if no
 * errors were detected, it is put into the "output queue" (accessible with
 * `PopText()`). In any case a new `MutableText` is started afterwards.
 *
 * Bytes are expected to arrive in order and to form a correct UTF8 sequence.
 * Once an incorrect sequence is detected, the caller `Utf8Decoder` enters
 * a "failed state": the `MutableText` it was building is going to be
 * discarded and every further byte (except `0`) is ignored.
 * Pushing `0` byte brings `Utf8Decoder` out of the failed state.
 *
 * @param   nextByte    Next byte of the byte stream. `0` ends the current
 *      `MutableText` and makes caller `Utf8Decoder` start a new one.
 * @return `true` iff caller `Utf8Decoder` was not in a failed state and
 *      `nextByte` was successfully processed.
 */
public final function bool PushByte(byte nextByte)
{
    // Text separator: handled even in a failed state, since it resets it
    if (nextByte == 0) {
        return QueueCurrentText();
    }
    if (failedState) {
        return false;
    }
    // In the middle of a multi-byte code point
    if (innerBytesLeft > 0) {
        return PushInnerByte(nextByte);
    }
    // Otherwise `nextByte` must start a new code point and its leading bits
    // tell how many bytes that code point takes
    if ((nextByte & maskTake1) == 0)
    {
        // 0xxxxxxx: the whole code point is inside this byte
        AppendCodePoint(nextByte);
    }
    else if ((nextByte & maskTake3) == maskTake2)
    {
        // 110xxxxx: 2 bytes per code point, 5 payload bits here
        innerBytesLeft = 1;
        nextCodePoint = nextByte & maskDrop3;
    }
    else if ((nextByte & maskTake4) == maskTake3)
    {
        // 1110xxxx: 3 bytes per code point, 4 payload bits here
        innerBytesLeft = 2;
        nextCodePoint = nextByte & maskDrop4;
    }
    else if ((nextByte & maskTake5) == maskTake4)
    {
        // 11110xxx: 4 bytes per code point, 3 payload bits here
        innerBytesLeft = 3;
        nextCodePoint = nextByte & maskDrop5;
    }
    else
    {
        // Any other prefix cannot start a code point
        // (10xxxxxx is only valid as an inner byte, see `PushInnerByte()`)
        failedState = true;
    }
    // `failedState` was `false` on entry to this part of the method,
    // so it is now `true` only if the prefix was rejected above
    return !failedState;
}
// Consumes an "inner" byte: any byte after the first one in a code point
// that is encoded with several bytes. Such bytes must look like 10xxxxxx.
// To avoid needless checks, caller must guarantee that
// `innerBytesLeft > 0` and `failedState == false`.
// Returns `false` (and enters a failed state) iff `nextByte` is invalid.
private final function bool PushInnerByte(byte nextByte)
{
    // Only 10xxxxxx is acceptable here
    if ((nextByte & maskTake2) == maskTake1)
    {
        // Every inner byte carries 6 payload bits: move bits collected so far
        // 6 positions to the left to make room for them
        nextCodePoint = (nextCodePoint << 6) + (nextByte & maskDrop2);
        innerBytesLeft -= 1;
        // More inner bytes are still to come
        if (innerBytesLeft > 0) {
            return true;
        }
        // Code point is complete: accept anything, except overlong encoding
        // of `0` (Unicode standard forbids it as well)
        if (nextCodePoint != 0)
        {
            AppendCodePoint(nextCodePoint);
            return true;
        }
    }
    failedState = true;
    return false;
}
// Adds a character with the given `codePoint` to the end of the text that is
// being built right now. Only the code point is filled in; all other fields
// of the character keep their default values.
private final function AppendCodePoint(int codePoint)
{
    local Text.Character decodedCharacter;

    decodedCharacter.codePoint = codePoint;
    nextText.AppendCharacter(decodedCharacter);
}
// Finishes the text that was being built and starts a new, empty one.
// Finished text goes into the output queue if it had no encoding errors and
// is discarded otherwise. Failed state is always cleared.
// Returns `true` iff text was added to the output queue.
private final function bool QueueCurrentText()
{
    local bool textIsValid;

    // Text is invalid if an error was detected earlier or if it ends in
    // the middle of a multi-byte code point
    textIsValid = (!failedState) && (innerBytesLeft <= 0);
    if (textIsValid) {
        outputQueue[outputQueue.length] = nextText;
    }
    else {
        _.memory.Free(nextText);
    }
    // Start over with a clean state
    nextText        = _.text.Empty();
    innerBytesLeft  = 0;
    failedState     = false;
    return textIsValid;
}
// Bit masks used to inspect UTF8 byte prefixes. Binary form of every mask is
// given next to it, highest bit first:
//  * `maskDropN` has `N` highest bits unset and the rest set;
//  * `maskTakeN` has `N` highest bits set and the rest unset.
defaultproperties
{
maskDrop1 = 127 // 0 1 1 1 1 1 1 1
maskDrop2 = 63 // 0 0 1 1 1 1 1 1
maskDrop3 = 31 // 0 0 0 1 1 1 1 1
maskDrop4 = 15 // 0 0 0 0 1 1 1 1
maskDrop5 = 7 // 0 0 0 0 0 1 1 1
maskTake1 = 128 // 1 0 0 0 0 0 0 0
maskTake2 = 192 // 1 1 0 0 0 0 0 0
maskTake3 = 224 // 1 1 1 0 0 0 0 0
maskTake4 = 240 // 1 1 1 1 0 0 0 0
maskTake5 = 248 // 1 1 1 1 1 0 0 0
}

161
sources/Text/Codecs/Utf8Decoder.uc

@ -0,0 +1,161 @@
/**
* Class for decoding UTF8 byte stream into Acedia's `MutableText` value.
* This is a separate object instead of just a method, because it allows
* to make code simpler by storing state variables related to
* the decoding process.
* This implementation should correctly convert any valid UTF8, but it is
* not guaranteed to reject any invalid UTF8. In particular, it accepts
* overlong code point encodings. It does check whether every byte has
* a correct bit prefix and does not attempt to repair input data if it finds
* invalid one.
* See [wiki page](https://en.wikipedia.org/wiki/UTF-8) for details.
* Copyright 2021 Anton Tarasenko
*------------------------------------------------------------------------------
* This file is part of Acedia.
*
* Acedia is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, version 3 of the License, or
* (at your option) any later version.
*
* Acedia is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Acedia. If not, see <https://www.gnu.org/licenses/>.
*/
class Utf8Decoder extends AcediaObject;
// State of the decoding process.
// Stored as class member variables to avoid passing them between methods.
// Text that `Decode()` is filling at the moment
var private MutableText builtText;
// Bits collected so far for the multi-byte code point that is being read
var private int nextCodePoint;
// Amount of "inner" (10xxxxxx) bytes that still must be read to complete
// the code point
var private int innerBytesLeft;
// These masks (`maskDropN`) allow to turn into zero the first (highest) `N`
// bits of a byte with the `&` operator.
var private byte maskDrop1, maskDrop2, maskDrop3, maskDrop4, maskDrop5;
// These masks (`maskTakeN`) allow to turn into zero all but the first
// (highest) `N` bits of a byte with the `&` operator.
// `maskTakeN == ~maskDropN`.
var private byte maskTake1, maskTake2, maskTake3, maskTake4, maskTake5;
/**
 * Decodes passed `byte` array (that contains utf8-encoded text) into
 * the `MutableText` type.
 *
 * The whole array is treated as a single complete text: decoding fails if
 * any byte has an unexpected bit prefix or if the array ends in the middle
 * of a multi-byte code point.
 *
 * @param   byteStream  Byte stream to decode.
 * @return `MutableText` that contains `byteStream`'s text data.
 *      `none` iff either `byteStream == none` or its contents do not
 *      correspond to a (valid) utf8-encoded text.
 */
public final function MutableText Decode(ByteArrayRef byteStream)
{
    local int           i;
    local int           length;
    local bool          allBytesAccepted;
    local MutableText   result;

    if (byteStream == none) {
        return none;
    }
    // Start from a clean state, regardless of how previous call has ended
    nextCodePoint   = 0;
    innerBytesLeft  = 0;
    builtText       = _.text.Empty();
    length          = byteStream.GetLength();
    allBytesAccepted = true;
    for (i = 0; i < length; i += 1)
    {
        if (!PushByte(byteStream.GetItem(i)))
        {
            allBytesAccepted = false;
            break;
        }
    }
    // `innerBytesLeft > 0` here means that input was cut off in the middle of
    // a multi-byte code point
    if (allBytesAccepted && innerBytesLeft <= 0) {
        result = builtText;
    }
    else {
        _.memory.Free(builtText);
    }
    // Single cleanup point for every outcome: never keep a reference to
    // the text that was either handed over to the caller or deallocated
    builtText = none;
    return result;
}
// Processes the next byte of the decoded array: either starts a new code point
// or, if a multi-byte code point is being read, passes the byte to
// `PushInnerByte()`.
// Returns `false` iff `nextByte` cannot appear at this position in a valid
// UTF8 sequence.
private final function bool PushByte(byte nextByte)
{
    // In the middle of a multi-byte code point
    if (innerBytesLeft > 0) {
        return PushInnerByte(nextByte);
    }
    // Otherwise `nextByte` must start a new code point and its leading bits
    // tell how many bytes that code point takes
    if ((nextByte & maskTake1) == 0)
    {
        // 0xxxxxxx: the whole code point is inside this byte
        AppendCodePoint(nextByte);
    }
    else if ((nextByte & maskTake3) == maskTake2)
    {
        // 110xxxxx: 2 bytes per code point, 5 payload bits here
        innerBytesLeft = 1;
        nextCodePoint = nextByte & maskDrop3;
    }
    else if ((nextByte & maskTake4) == maskTake3)
    {
        // 1110xxxx: 3 bytes per code point, 4 payload bits here
        innerBytesLeft = 2;
        nextCodePoint = nextByte & maskDrop4;
    }
    else if ((nextByte & maskTake5) == maskTake4)
    {
        // 11110xxx: 4 bytes per code point, 3 payload bits here
        innerBytesLeft = 3;
        nextCodePoint = nextByte & maskDrop5;
    }
    else
    {
        // Any other prefix cannot start a code point
        // (10xxxxxx is only valid as an inner byte, see `PushInnerByte()`)
        return false;
    }
    return true;
}
// Consumes an "inner" byte: any byte after the first one in a code point
// that is encoded with several bytes. Such bytes must look like 10xxxxxx.
// To avoid needless checks, caller must guarantee that `innerBytesLeft > 0`.
// Returns `false` iff `nextByte` does not have the 10xxxxxx form.
private final function bool PushInnerByte(byte nextByte)
{
    local bool hasInnerPrefix;

    hasInnerPrefix = ((nextByte & maskTake2) == maskTake1);
    if (hasInnerPrefix)
    {
        // Every inner byte carries 6 payload bits: move bits collected so far
        // 6 positions to the left to make room for them
        nextCodePoint = (nextCodePoint << 6) + (nextByte & maskDrop2);
        innerBytesLeft -= 1;
        // That was the last byte of the code point
        if (innerBytesLeft <= 0) {
            AppendCodePoint(nextCodePoint);
        }
    }
    return hasInnerPrefix;
}
// Adds a character with the given `codePoint` to the end of the text that is
// being built right now. Only the code point is filled in; all other fields
// of the character keep their default values.
private final function AppendCodePoint(int codePoint)
{
    local Text.Character decodedCharacter;

    decodedCharacter.codePoint = codePoint;
    builtText.AppendCharacter(decodedCharacter);
}
// Bit masks used to inspect UTF8 byte prefixes. Binary form of every mask is
// given next to it, highest bit first:
//  * `maskDropN` has `N` highest bits unset and the rest set;
//  * `maskTakeN` has `N` highest bits set and the rest unset.
defaultproperties
{
maskDrop1 = 127 // 0 1 1 1 1 1 1 1
maskDrop2 = 63 // 0 0 1 1 1 1 1 1
maskDrop3 = 31 // 0 0 0 1 1 1 1 1
maskDrop4 = 15 // 0 0 0 0 1 1 1 1
maskDrop5 = 7 // 0 0 0 0 0 1 1 1
maskTake1 = 128 // 1 0 0 0 0 0 0 0
maskTake2 = 192 // 1 1 0 0 0 0 0 0
maskTake3 = 224 // 1 1 1 0 0 0 0 0
maskTake4 = 240 // 1 1 1 1 0 0 0 0
maskTake5 = 248 // 1 1 1 1 1 0 0 0
}

39
sources/Avarice/Utf8Encoder.uc → sources/Text/Codecs/Utf8Encoder.uc

@ -1,6 +1,8 @@
/**
* Class for encoding Acedia's `MutableText` value into UTF8 byte
* representation.
* This is a separate object instead of just a method to match design of
* `Utf8Decoder`.
* See [wiki page](https://en.wikipedia.org/wiki/UTF-8) for details.
* Copyright 2021 Anton Tarasenko
*------------------------------------------------------------------------------
@ -38,69 +40,68 @@ var private int lastSixBits;
* code points - this method will return empty array.
*
* @param text `Text` object to encode.
* @return UTF8 representation of passed `text` as an array of `byte`s.
* Empty array if `text == none` or `text` contains invalid Unicode
* @return UTF8 representation of passed `text` inside `ByteArrayRef`.
* `none` iff `text == none` or `text` contains invalid Unicode
* code points.
*/
public final function array<byte> Encode(Text text)
public final function ByteArrayRef Encode(Text text)
{
local int i, nextCodepoint, textLength;
local array<byte> buffer;
local ByteArrayRef buffer;
if (__().text.IsEmpty(text)) {
return buffer; // empty array
return none; // empty array
}
buffer = ByteArrayRef(_.memory.Allocate(class'ByteArrayRef'));
textLength = text.GetLength();
for (i = 0; i < textLength; i += 1)
{
nextCodepoint = text.GetCharacter(i).codePoint;
if (nextCodepoint <= utfLimit1) {
buffer[buffer.length] = nextCodepoint;
buffer.AddItem(nextCodepoint);
}
else if (nextCodepoint <= utfLimit2)
{
// Drop 6 bits that will be recorded inside second byte and
// add 2-byte sequence mask
buffer[buffer.length] = utfMask2 | (nextCodepoint >> 6);
buffer.AddItem(utfMask2 | (nextCodepoint >> 6));
// Take only last 6 bits for the second (last) byte
// + add inner-byte sequence mask
buffer[buffer.length] = utfMaskIn | (nextCodepoint & lastSixBits);
buffer.AddItem(utfMaskIn | (nextCodepoint & lastSixBits));
}
else if (nextCodepoint <= utfLimit3)
{
// Drop 12 bits that will be recorded inside second and third bytes
// and add 3-byte sequence mask
buffer[buffer.length] = utfMask3 | (nextCodepoint >> 12);
buffer.AddItem(utfMask3 | (nextCodepoint >> 12));
// Drop 6 bits that will be recorded inside third byte and
// add inner-byte sequence mask
buffer[buffer.length] =
utfMaskIn | ((nextCodepoint >> 6) & lastSixBits);
buffer.AddItem(utfMaskIn | ((nextCodepoint >> 6) & lastSixBits));
// Take only last 6 bits for the third (last) byte
// + add inner-byte sequence mask
buffer[buffer.length] = utfMaskIn | (nextCodepoint & lastSixBits);
buffer.AddItem(utfMaskIn | (nextCodepoint & lastSixBits));
}
else if (nextCodepoint <= utfLimit4)
{
// Drop 18 bits that will be recorded inside second, third and
// fourth bytes, then add 4-byte sequence mask
buffer[buffer.length] = utfMask4 | (nextCodepoint >> 18);
buffer.AddItem(utfMask4 | (nextCodepoint >> 18));
// Drop 12 bits that will be recorded inside third and fourth bytes
// and add inner-byte sequence mask
buffer[buffer.length] =
utfMaskIn | ((nextCodepoint >> 12) & lastSixBits);
buffer.AddItem(utfMaskIn | ((nextCodepoint >> 12) & lastSixBits));
// Drop 6 bits that will be recorded inside fourth byte
// and add inner-byte sequence mask
buffer[buffer.length] =
utfMaskIn | ((nextCodepoint >> 6) & lastSixBits);
buffer.AddItem(utfMaskIn | ((nextCodepoint >> 6) & lastSixBits));
// Take only last 6 bits for the fourth (last) byte
// + add inner-byte sequence mask
buffer[buffer.length] = utfMaskIn | (nextCodepoint & lastSixBits);
buffer.AddItem(utfMaskIn | (nextCodepoint & lastSixBits));
}
else
{
// Outside of known Unicode range
// Should not be possible, since `Text` is expected to
// contain only correct Unicode
buffer.length = 0;
_.memory.Free(buffer);
buffer = none;
break;
}
}

BIN
sources/Text/Tests/TEST_UTF8EncoderDecoder.uc

Binary file not shown.
Loading…
Cancel
Save