Refactor utf8 encoder and decoder

Make decoder work with byte arrays containing a single complete text, instead of byte streams with several separate texts. Moved codecs into "Text" category.
3 years ago · 323bf71e70
5 changed files with 181 additions and 279 deletions
--- a/sources/Avarice/Tests/TEST_UTF8EncoderDecoder.uc
+++ b/sources/Avarice/Tests/TEST_UTF8EncoderDecoder.uc
--- a/sources/Avarice/Utf8Decoder.uc
+++ b/sources/Avarice/Utf8Decoder.uc
@ -1,260 +0,0 @@
-/**
- *      Class for decoding UTF8 byte stream into Acedia's `MutableText` value.
- *      It is made to work with incoming, and possibly incomplete, streams of
- *  bytes: instead of consuming the whole utf8 text, it is made to consume it
- *  byte-by-byte and store `MutableText`s that it parsed from the stream
- *  (assumes that separate `MutableText`s are separated by `0` byte).
- *      This implementation should correctly convert any valid UTF8, but it is
- *  not guaranteed to reject any invalid UTF8. In particular, it accepts
- *  overlong code point encodings (except overlong encoding of zero).
- *  It, however, does check whether every byte has a correct bit prefix and
- *  does not attempt to repair input data if it finds invalid one.
- *      See [wiki page](https://en.wikipedia.org/wiki/UTF-8) for details.
- *      Copyright 2021 Anton Tarasenko
- *------------------------------------------------------------------------------
- * This file is part of Acedia.
- *
- * Acedia is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, version 3 of the License, or
- * (at your option) any later version.
- *
- * Acedia is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Acedia.  If not, see <https://www.gnu.org/licenses/>.
- */
-class Utf8Decoder extends AcediaObject;
-
-/**
- *  `Utf8Decoder` consumes byte by byte with `PushByte()` method and it's
- *  algorithm is simple:
- *      1.  If it encounters a byte that encodes a singular code point by
- *              itself (starts with `0` bit) - it is added as a codepoint;
- *      2.  If it encounters byte which indicates that next code point is
- *              composed out of several bytes (starts with 110, 1110 or 11110) -
- *              remembers that it has to read several "inner" bytes belonging to
- *              the same code point and starts to expect them instead;
- *      3.  If it ever encounters a byte with unexpected (and thus invalid)
- *              bit prefix - enters a failed state;
- *      4.  If it ever encounters a `0` byte:
- *          *   If it was not in a failed state - records `MutableText`
- *                  accumulated so far;
- *          *   Clears failed state.
- */
-
-var private bool failedState;
-
-//  Variables for building a multi-byte code point
-var private int nextCodePoint;
-var private int innerBytesLeft;
-
-//  `MutableText` we are building right now
-var private MutableText         nextText;
-//  `MutableText`s we have already built
-var private array<MutableText>  outputQueue;
-
-//  These masks (`maskDropN`) allow to turn into zero first `N` bits in
-//  the byte with `&` operator.
-var private byte maskDrop1, maskDrop2, maskDrop3, maskDrop4, maskDrop5;
-//      These masks (`maskTakeN`) allow to turn into zero all but first `N` bits
-//  in the byte with `&` operator.
-//      `maskTakeN == ~maskDropN`.
-var private byte maskTake1, maskTake2, maskTake3, maskTake4, maskTake5;
-
-protected function Constructor()
-{
-    nextText = _.text.Empty();
-}
-
-protected function Finalizer()
-{
-    _.memory.Free(nextText);
-    _.memory.FreeMany(outputQueue);
-    nextText            = none;
-    failedState         = false;
-    outputQueue.length  = 0;
-    innerBytesLeft      = 0;
-    nextCodePoint       = 0;
-}
-
-/**
- *  Checks whether data in the `MutableText` that caller `Utf8Decoder` is
- *  currently filling was detected to be invalid.
- *
- *  This state can be reset by pushing `0` byte into caller `Utf8Decoder`.
- *  See `PushByte()` for more info.
- *
- *  @return `true` iff  caller `Utf8Decoder` is not in a failed state.
- */
-public final function bool Failed()
-{
-    return failedState;
-}
-
-/**
- *      Checks whether caller `Utf8Decoder` has any data put in
- *  the `MutableText` it is currently building.
- *      Result is guaranteed to be `false` after `self.PushByte(0)` call, since
- *  it starts a brand new `MutableText`.
- */
-public final function bool HasUnfinishedData()
-{
-    if (innerBytesLeft > 0)         return true;
-    if (nextText.GetLength() > 0)   return true;
-    return false;
-}
-
-/**
- *  Returns next `MutableText` that was successfully decoded by
- *  the caller `Utf8Decoder`, removing it from the output queue.
- *
- *  @return Next `MutableText` in the caller `Utf8Decoder`'s output queue.
- *      `none` iff output queue is empty. `MutableText`s are returned in order
- *      they were decoded.
- */
-public final function MutableText PopText()
-{
-    local MutableText result;
-    if (outputQueue.length <= 0) {
-        return none;
-    }
-    result = outputQueue[0];
-    outputQueue.Remove(0, 1);
-    return result;
-}
-
-/**
- *  Adds next `byte` from the byte stream that is supposed to encode UTF8 text.
- *  To finish building `MutableText` pass `0` byte into this method, which will
- *  `MutableText` built so far into an "output queue" (accessible with
- *  `PopText()`) and start building a new one.
- *
- *      This method expects `byte`s, in order, from a sequence that has correct
- *  UTF8 encoding. If method detects incorrect UTF8 sequence - it will be put
- *  into a "failed state", discarding `MutableText` it was currently building,
- *  along with any further input (except `0` byte).
- *      Pushing `0` byte will restore `Utf8Decoder` from a failed state and it
- *  will start building a new `MutableText`.
- *
- *  @param  nextByte    next byte from byte stream that is supposed to encode
- *      UTF8 text. `0` will make caller `Utf8Decoder` start building new
- *      `MutableText`.
- *  @return `true` iff caller `Utf8Decoder` was not in a failed state and
- *      operation was successful.
- */
-public final function bool PushByte(byte nextByte)
-{
-    if (nextByte == 0)      return QueueCurrentText();
-    if (failedState)        return false;
-    if (innerBytesLeft > 0) return PushInnerByte(nextByte);
-
-    //  Form of 0xxxxxxx means 1 byte per code point
-    if ((nextByte & maskTake1) == 0)
-    {
-        AppendCodePoint(nextByte);
-        return true;
-    }
-    //  Form of 110xxxxx means 2 bytes per code point
-    if ((nextByte & maskTake3) == maskTake2)    //  maskTake2 == 1 1 0 0 0 0 0 0
-    {
-        nextCodePoint = nextByte & maskDrop3;
-        innerBytesLeft = 1;
-        return true;
-    }
-    //  Form of 1110xxxx means 3 bytes per code point
-    if ((nextByte & maskTake4) == maskTake3)    //  maskTake3 == 1 1 1 0 0 0 0 0
-    {
-        nextCodePoint = nextByte & maskDrop4;
-        innerBytesLeft = 2;
-        return true;
-    }
-    //  Form of 11110xxx means 4 bytes per code point
-    if ((nextByte & maskTake5) == maskTake4)    //  maskTake4 == 1 1 1 1 0 0 0 0
-    {
-        nextCodePoint = nextByte & maskDrop5;
-        innerBytesLeft = 3;
-        return true;
-    }
-    //  `nextByte` must have has one of the above forms
-    //  (or 10xxxxxx that is handled in `PushInnerByte()`)
-    failedState = true;
-    return false;
-}
-
-//      This method is responsible for pushing "inner" bytes: bytes that come
-//  after the first one when code point is encoded with multiple bytes.
-//  All of them are expected to have 10xxxxxx prefix.
-//      Assumes `innerBytesLeft > 0` and `failedState == false`
-//  to avoid needless checks.
-private final function bool PushInnerByte(byte nextByte)
-{
-    //  Fail if `nextByte` does not have an expected form: 10xxxxxx
-    if ((nextByte & maskTake2) != maskTake1)
-    {
-        failedState = true;
-        return false;
-    }
-    //  Since inner bytes have the form of 10xxxxxx, they all carry only 6 bits
-    //  that actually encode code point, so to make space for those bits we must
-    //  shift previously added code points by `6`
-    nextCodePoint = (nextCodePoint << 6) + (nextByte & maskDrop2);
-    innerBytesLeft -= 1;
-    if (innerBytesLeft <= 0)
-    {
-        //  We forbid overlong encoding of `0`
-        //  (as does the Unicode standard)
-        if (nextCodePoint == 0)
-        {
-            failedState = true;
-            return false;
-        }
-        AppendCodePoint(nextCodePoint);
-    }
-    return true;
-}
-
-private final function AppendCodePoint(int codePoint)
-{
-    local Text.Character nextCharacter;
-    nextCharacter.codePoint = codePoint;
-    nextText.AppendCharacter(nextCharacter);
-}
-
-//  Return `true` if `MutableText` was added to the queue
-//  (there were no encoding errors)
-private final function bool QueueCurrentText()
-{
-    local bool result;
-    //  If we still do not have all bytes for the character we were building -
-    //  then passed UTF8 was invalid
-    failedState = failedState || innerBytesLeft > 0;
-    result = !failedState;
-    if (failedState) {
-        _.memory.Free(nextText);
-    }
-    else {
-        outputQueue[outputQueue.length] = nextText;
-    }
-    failedState = false;
-    innerBytesLeft = 0;
-    nextText = _.text.Empty();
-    return result;
-}
-
-defaultproperties
-{
-    maskDrop1 = 127 //  0 1 1 1 1 1 1 1
-    maskDrop2 = 63  //  0 0 1 1 1 1 1 1
-    maskDrop3 = 31  //  0 0 0 1 1 1 1 1
-    maskDrop4 = 15  //  0 0 0 0 1 1 1 1
-    maskDrop5 = 7   //  0 0 0 0 0 1 1 1
-    maskTake1 = 128 //  1 0 0 0 0 0 0 0
-    maskTake2 = 192 //  1 1 0 0 0 0 0 0
-    maskTake3 = 224 //  1 1 1 0 0 0 0 0
-    maskTake4 = 240 //  1 1 1 1 0 0 0 0
-    maskTake5 = 248 //  1 1 1 1 1 0 0 0
-}
--- a/sources/Text/Codecs/Utf8Decoder.uc
+++ b/sources/Text/Codecs/Utf8Decoder.uc
@ -0,0 +1,161 @@
+/**
+ *      Class for decoding UTF8 byte stream into Acedia's `MutableText` value.
+ *      This is a separate object instead of just a method, because it allows
+ *  to make code simpler by storing state variables related to
+ *  the decoding process.
+ *      This implementation should correctly convert any valid UTF8, but it is
+ *  not guaranteed to reject any invalid UTF8. In particular, it accepts
+ *  overlong code point encodings. It does check whether every byte has
+ *  a correct bit prefix and does not attempt to repair input data if it finds
+ *  invalid one.
+ *      See [wiki page](https://en.wikipedia.org/wiki/UTF-8) for details.
+ *      Copyright 2021 Anton Tarasenko
+ *------------------------------------------------------------------------------
+ * This file is part of Acedia.
+ *
+ * Acedia is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Acedia is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Acedia.  If not, see <https://www.gnu.org/licenses/>.
+ */
+class Utf8Decoder extends AcediaObject;
+
+//  Variables for building a multi-byte code point.
+//  Stored as a class member variables to avoid copying them between methods.
+var private MutableText builtText;
+var private int         nextCodePoint;
+var private int         innerBytesLeft;
+
+//  These masks (`maskDropN`) allow to turn into zero first `N` bits in
+//  the byte with `&` operator.
+var private byte maskDrop1, maskDrop2, maskDrop3, maskDrop4, maskDrop5;
+//      These masks (`maskTakeN`) allow to turn into zero all but first `N` bits
+//  in the byte with `&` operator.
+//      `maskTakeN == ~maskDropN`.
+var private byte maskTake1, maskTake2, maskTake3, maskTake4, maskTake5;
+
+/**
+ *  Decodes passed `byte` array (that contains utf8-encoded text) into
+ *  the `MutableText` type.
+ *
+ *  @param  byteStream  Byte stream to decode.
+ *  @return `MutableText` that contains `byteStream`'s text data.
+ *      `none` iff either `byteStream == none` or it's contents do not
+ *      correspond to a (valid) utf8-encoded text.
+ */
+public final function MutableText Decode(ByteArrayRef byteStream)
+{
+    local int           i;
+    local int           length;
+    local MutableText   result;
+    if (byteStream == none) {
+        return none;
+    }
+    nextCodePoint   = 0;
+    innerBytesLeft  = 0;
+    builtText       = _.text.Empty();
+    length = byteStream.GetLength();
+    for (i = 0; i < length; i += 1)
+    {
+        if (!PushByte(byteStream.GetItem(i)))
+        {
+            _.memory.Free(builtText);
+            return none;
+        }
+    }
+    if (innerBytesLeft <= 0) {
+        result = builtText;
+    }
+    else {
+        _.memory.Free(builtText);
+    }
+    builtText = none;
+    return result;
+}
+
+private final function bool PushByte(byte nextByte)
+{
+    if (innerBytesLeft > 0) {
+        return PushInnerByte(nextByte);
+    }
+    //  Form of 0xxxxxxx means 1 byte per code point
+    if ((nextByte & maskTake1) == 0)
+    {
+        AppendCodePoint(nextByte);
+        return true;
+    }
+    //  Form of 110xxxxx means 2 bytes per code point
+    if ((nextByte & maskTake3) == maskTake2)    //  maskTake2 == 1 1 0 0 0 0 0 0
+    {
+        nextCodePoint = nextByte & maskDrop3;
+        innerBytesLeft = 1;
+        return true;
+    }
+    //  Form of 1110xxxx means 3 bytes per code point
+    if ((nextByte & maskTake4) == maskTake3)    //  maskTake3 == 1 1 1 0 0 0 0 0
+    {
+        nextCodePoint = nextByte & maskDrop4;
+        innerBytesLeft = 2;
+        return true;
+    }
+    //  Form of 11110xxx means 4 bytes per code point
+    if ((nextByte & maskTake5) == maskTake4)    //  maskTake4 == 1 1 1 1 0 0 0 0
+    {
+        nextCodePoint = nextByte & maskDrop5;
+        innerBytesLeft = 3;
+        return true;
+    }
+    //  `nextByte` must have has one of the above forms
+    //  (or 10xxxxxx that is handled in `PushInnerByte()`)
+    return false;
+}
+
+//      This method is responsible for pushing "inner" bytes: bytes that come
+//  after the first one when code point is encoded with multiple bytes.
+//  All of them are expected to have 10xxxxxx prefix.
+//      Assumes `innerBytesLeft > 0` to avoid needless checks.
+private final function bool PushInnerByte(byte nextByte)
+{
+    //  Fail if `nextByte` does not have an expected form: 10xxxxxx
+    if ((nextByte & maskTake2) != maskTake1) {
+        return false;
+    }
+    //  Since inner bytes have the form of 10xxxxxx, they all carry only 6 bits
+    //  that actually encode code point, so to make space for those bits we must
+    //  shift previously added code points by `6`
+    nextCodePoint = (nextCodePoint << 6) + (nextByte & maskDrop2);
+    innerBytesLeft -= 1;
+    if (innerBytesLeft <= 0) {
+        AppendCodePoint(nextCodePoint);
+    }
+    return true;
+}
+
+private final function AppendCodePoint(int codePoint)
+{
+    local Text.Character nextCharacter;
+    nextCharacter.codePoint = codePoint;
+    builtText.AppendCharacter(nextCharacter);
+}
+
+defaultproperties
+{
+    maskDrop1 = 127 //  0 1 1 1 1 1 1 1
+    maskDrop2 = 63  //  0 0 1 1 1 1 1 1
+    maskDrop3 = 31  //  0 0 0 1 1 1 1 1
+    maskDrop4 = 15  //  0 0 0 0 1 1 1 1
+    maskDrop5 = 7   //  0 0 0 0 0 1 1 1
+    maskTake1 = 128 //  1 0 0 0 0 0 0 0
+    maskTake2 = 192 //  1 1 0 0 0 0 0 0
+    maskTake3 = 224 //  1 1 1 0 0 0 0 0
+    maskTake4 = 240 //  1 1 1 1 0 0 0 0
+    maskTake5 = 248 //  1 1 1 1 1 0 0 0
+}
--- a/sources/Text/Codecs/Utf8Encoder.uc
+++ b/sources/Text/Codecs/Utf8Encoder.uc
@ -1,6 +1,8 @@
 /**
 *      Class for encoding Acedia's `MutableText` value into UTF8 byte
 *  representation.
+ *      This is a separate object instead of just a method to match design of
+ *  `Utf8Decoder`.
 *      See [wiki page](https://en.wikipedia.org/wiki/UTF-8) for details.
 *      Copyright 2021 Anton Tarasenko
 *------------------------------------------------------------------------------
@ -38,69 +40,68 @@ var private int lastSixBits;
 *  code points - this method will return empty array.
 *
 *  @param  text    `Text` object to encode.
- *  @return UTF8 representation of passed `text` as an array of `byte`s.
- *      Empty array if `text == none` or `text` contains invalid Unicode
+ *  @return UTF8 representation of passed `text` inside `ByteArrayRef`.
+ *      `none` iff `text == none` or `text` contains invalid Unicode
 *      code points.
 */
-public final function array<byte> Encode(Text text)
+public final function ByteArrayRef Encode(Text text)
 {
    local int           i, nextCodepoint, textLength;
-    local array<byte>   buffer;
+    local ByteArrayRef  buffer;
    if (__().text.IsEmpty(text)) {
-        return buffer; // empty array
+        return none; // empty array
    }
+    buffer = ByteArrayRef(_.memory.Allocate(class'ByteArrayRef'));
    textLength = text.GetLength();
    for (i = 0; i < textLength; i += 1)
    {
        nextCodepoint = text.GetCharacter(i).codePoint;
        if (nextCodepoint <= utfLimit1) {
-            buffer[buffer.length] = nextCodepoint;
+            buffer.AddItem(nextCodepoint);
        }
        else if (nextCodepoint <= utfLimit2)
        {
            //  Drop 6 bits that will be recorded inside second byte and
            //  add 2-byte sequence mask
-            buffer[buffer.length] = utfMask2 | (nextCodepoint >> 6);
+            buffer.AddItem(utfMask2 | (nextCodepoint >> 6));
            //  Take only last 6 bits for the second (last) byte
            //  + add inner-byte sequence mask
-            buffer[buffer.length] = utfMaskIn | (nextCodepoint & lastSixBits);
+            buffer.AddItem(utfMaskIn | (nextCodepoint & lastSixBits));
        }
        else if (nextCodepoint <= utfLimit3)
        {
            //  Drop 12 bits that will be recorded inside second and third bytes
            //  and add 3-byte sequence mask
-            buffer[buffer.length] = utfMask3 | (nextCodepoint >> 12);
+            buffer.AddItem(utfMask3 | (nextCodepoint >> 12));
            //  Drop 6 bits that will be recorded inside third byte and
            //  add inner-byte sequence mask
-            buffer[buffer.length] =
-                utfMaskIn | ((nextCodepoint >> 6) & lastSixBits);
+            buffer.AddItem(utfMaskIn | ((nextCodepoint >> 6) & lastSixBits));
            //  Take only last 6 bits for the third (last) byte
            //  + add inner-byte sequence mask
-            buffer[buffer.length] = utfMaskIn | (nextCodepoint & lastSixBits);
+            buffer.AddItem(utfMaskIn | (nextCodepoint & lastSixBits));
        }
        else if (nextCodepoint <= utfLimit4)
        {
            //  Drop 18 bits that will be recorded inside second, third and
            //  fourth bytes, then add 4-byte sequence mask
-            buffer[buffer.length] = utfMask4 | (nextCodepoint >> 18);
+            buffer.AddItem(utfMask4 | (nextCodepoint >> 18));
            //  Drop 12 bits that will be recorded inside third and fourth bytes
            //  and add inner-byte sequence mask
-            buffer[buffer.length] =
-                utfMaskIn | ((nextCodepoint >> 12) & lastSixBits);
+            buffer.AddItem(utfMaskIn | ((nextCodepoint >> 12) & lastSixBits));
            //  Drop 6 bits that will be recorded inside fourth byte
            //  and add inner-byte sequence mask
-            buffer[buffer.length] =
-                utfMaskIn | ((nextCodepoint >> 6) & lastSixBits);
+            buffer.AddItem(utfMaskIn | ((nextCodepoint >> 6) & lastSixBits));
            //  Take only last 6 bits for the fourth (last) byte
            //  + add inner-byte sequence mask
-            buffer[buffer.length] = utfMaskIn | (nextCodepoint & lastSixBits);
+            buffer.AddItem(utfMaskIn | (nextCodepoint & lastSixBits));
        }
        else
        {
            //      Outside of known Unicode range
            //      Should not be possible, since `Text` is expected to
            //  contain only correct Unicode
-            buffer.length = 0;
+            _.memory.Free(buffer);
+            buffer = none;
            break;
        }
    }
--- a/sources/Text/Tests/TEST_UTF8EncoderDecoder.uc
+++ b/sources/Text/Tests/TEST_UTF8EncoderDecoder.uc