diff --git a/sources/Avarice/Tests/TEST_UTF8EncoderDecoder.uc b/sources/Avarice/Tests/TEST_UTF8EncoderDecoder.uc
deleted file mode 100644
index 950d0da..0000000
Binary files a/sources/Avarice/Tests/TEST_UTF8EncoderDecoder.uc and /dev/null differ
diff --git a/sources/Avarice/Utf8Decoder.uc b/sources/Avarice/Utf8Decoder.uc
deleted file mode 100644
index 89f677a..0000000
--- a/sources/Avarice/Utf8Decoder.uc
+++ /dev/null
@@ -1,260 +0,0 @@
-/**
- * Class for decoding UTF8 byte stream into Acedia's `MutableText` value.
- * It is made to work with incoming, and possibly incomplete, streams of
- * bytes: instead of consuming the whole utf8 text, it is made to consume it
- * byte-by-byte and store `MutableText`s that it parsed from the stream
- * (assumes that separate `MutableText`s are separated by `0` byte).
- * This implementation should correctly convert any valid UTF8, but it is
- * not guaranteed to reject any invalid UTF8. In particular, it accepts
- * overlong code point encodings (except overlong encoding of zero).
- * It, however, does check whether every byte has a correct bit prefix and
- * does not attempt to repair input data if it finds invalid one.
- * See [wiki page](https://en.wikipedia.org/wiki/UTF-8) for details.
- * Copyright 2021 Anton Tarasenko
- *------------------------------------------------------------------------------
- * This file is part of Acedia.
- *
- * Acedia is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, version 3 of the License, or
- * (at your option) any later version.
- *
- * Acedia is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Acedia. If not, see .
- */
-class Utf8Decoder extends AcediaObject;
-
-/**
- * `Utf8Decoder` consumes byte by byte with `PushByte()` method and it's
- * algorithm is simple:
- * 1. If it encounters a byte that encodes a singular code point by
- * itself (starts with `0` bit) - it is added as a codepoint;
- * 2. If it encounters byte which indicates that next code point is
- * composed out of several bytes (starts with 110, 1110 or 11110) -
- * remembers that it has to read several "inner" bytes belonging to
- * the same code point and starts to expect them instead;
- * 3. If it ever encounters a byte with unexpected (and thus invalid)
- * bit prefix - enters a failed state;
- * 4. If it ever encounters a `0` byte:
- * * If it was not in a failed state - records `MutableText`
- * accumulated so far;
- * * Clears failed state.
- */
-
-var private bool failedState;
-
-// Variables for building a multi-byte code point
-var private int nextCodePoint;
-var private int innerBytesLeft;
-
-// `MutableText` we are building right now
-var private MutableText nextText;
-// `MutableText`s we have already built
-var private array outputQueue;
-
-// These masks (`maskDropN`) allow to turn into zero first `N` bits in
-// the byte with `&` operator.
-var private byte maskDrop1, maskDrop2, maskDrop3, maskDrop4, maskDrop5;
-// These masks (`maskTakeN`) allow to turn into zero all but first `N` bits
-// in the byte with `&` operator.
-// `maskTakeN == ~maskDropN`.
-var private byte maskTake1, maskTake2, maskTake3, maskTake4, maskTake5;
-
-protected function Constructor()
-{
- nextText = _.text.Empty();
-}
-
-protected function Finalizer()
-{
- _.memory.Free(nextText);
- _.memory.FreeMany(outputQueue);
- nextText = none;
- failedState = false;
- outputQueue.length = 0;
- innerBytesLeft = 0;
- nextCodePoint = 0;
-}
-
-/**
- * Checks whether data in the `MutableText` that caller `Utf8Decoder` is
- * currently filling was detected to be invalid.
- *
- * This state can be reset by pushing `0` byte into caller `Utf8Decoder`.
- * See `PushByte()` for more info.
- *
- * @return `true` iff caller `Utf8Decoder` is not in a failed state.
- */
-public final function bool Failed()
-{
- return failedState;
-}
-
-/**
- * Checks whether caller `Utf8Decoder` has any data put in
- * the `MutableText` it is currently building.
- * Result is guaranteed to be `false` after `self.PushByte(0)` call, since
- * it starts a brand new `MutableText`.
- */
-public final function bool HasUnfinishedData()
-{
- if (innerBytesLeft > 0) return true;
- if (nextText.GetLength() > 0) return true;
- return false;
-}
-
-/**
- * Returns next `MutableText` that was successfully decoded by
- * the caller `Utf8Decoder`, removing it from the output queue.
- *
- * @return Next `MutableText` in the caller `Utf8Decoder`'s output queue.
- * `none` iff output queue is empty. `MutableText`s are returned in order
- * they were decoded.
- */
-public final function MutableText PopText()
-{
- local MutableText result;
- if (outputQueue.length <= 0) {
- return none;
- }
- result = outputQueue[0];
- outputQueue.Remove(0, 1);
- return result;
-}
-
-/**
- * Adds next `byte` from the byte stream that is supposed to encode UTF8 text.
- * To finish building `MutableText` pass `0` byte into this method, which will
- * `MutableText` built so far into an "output queue" (accessible with
- * `PopText()`) and start building a new one.
- *
- * This method expects `byte`s, in order, from a sequence that has correct
- * UTF8 encoding. If method detects incorrect UTF8 sequence - it will be put
- * into a "failed state", discarding `MutableText` it was currently building,
- * along with any further input (except `0` byte).
- * Pushing `0` byte will restore `Utf8Decoder` from a failed state and it
- * will start building a new `MutableText`.
- *
- * @param nextByte next byte from byte stream that is supposed to encode
- * UTF8 text. `0` will make caller `Utf8Decoder` start building new
- * `MutableText`.
- * @return `true` iff caller `Utf8Decoder` was not in a failed state and
- * operation was successful.
- */
-public final function bool PushByte(byte nextByte)
-{
- if (nextByte == 0) return QueueCurrentText();
- if (failedState) return false;
- if (innerBytesLeft > 0) return PushInnerByte(nextByte);
-
- // Form of 0xxxxxxx means 1 byte per code point
- if ((nextByte & maskTake1) == 0)
- {
- AppendCodePoint(nextByte);
- return true;
- }
- // Form of 110xxxxx means 2 bytes per code point
- if ((nextByte & maskTake3) == maskTake2) // maskTake2 == 1 1 0 0 0 0 0 0
- {
- nextCodePoint = nextByte & maskDrop3;
- innerBytesLeft = 1;
- return true;
- }
- // Form of 1110xxxx means 3 bytes per code point
- if ((nextByte & maskTake4) == maskTake3) // maskTake3 == 1 1 1 0 0 0 0 0
- {
- nextCodePoint = nextByte & maskDrop4;
- innerBytesLeft = 2;
- return true;
- }
- // Form of 11110xxx means 4 bytes per code point
- if ((nextByte & maskTake5) == maskTake4) // maskTake4 == 1 1 1 1 0 0 0 0
- {
- nextCodePoint = nextByte & maskDrop5;
- innerBytesLeft = 3;
- return true;
- }
- // `nextByte` must have has one of the above forms
- // (or 10xxxxxx that is handled in `PushInnerByte()`)
- failedState = true;
- return false;
-}
-
-// This method is responsible for pushing "inner" bytes: bytes that come
-// after the first one when code point is encoded with multiple bytes.
-// All of them are expected to have 10xxxxxx prefix.
-// Assumes `innerBytesLeft > 0` and `failedState == false`
-// to avoid needless checks.
-private final function bool PushInnerByte(byte nextByte)
-{
- // Fail if `nextByte` does not have an expected form: 10xxxxxx
- if ((nextByte & maskTake2) != maskTake1)
- {
- failedState = true;
- return false;
- }
- // Since inner bytes have the form of 10xxxxxx, they all carry only 6 bits
- // that actually encode code point, so to make space for those bits we must
- // shift previously added code points by `6`
- nextCodePoint = (nextCodePoint << 6) + (nextByte & maskDrop2);
- innerBytesLeft -= 1;
- if (innerBytesLeft <= 0)
- {
- // We forbid overlong encoding of `0`
- // (as does the Unicode standard)
- if (nextCodePoint == 0)
- {
- failedState = true;
- return false;
- }
- AppendCodePoint(nextCodePoint);
- }
- return true;
-}
-
-private final function AppendCodePoint(int codePoint)
-{
- local Text.Character nextCharacter;
- nextCharacter.codePoint = codePoint;
- nextText.AppendCharacter(nextCharacter);
-}
-
-// Return `true` if `MutableText` was added to the queue
-// (there were no encoding errors)
-private final function bool QueueCurrentText()
-{
- local bool result;
- // If we still do not have all bytes for the character we were building -
- // then passed UTF8 was invalid
- failedState = failedState || innerBytesLeft > 0;
- result = !failedState;
- if (failedState) {
- _.memory.Free(nextText);
- }
- else {
- outputQueue[outputQueue.length] = nextText;
- }
- failedState = false;
- innerBytesLeft = 0;
- nextText = _.text.Empty();
- return result;
-}
-
-defaultproperties
-{
- maskDrop1 = 127 // 0 1 1 1 1 1 1 1
- maskDrop2 = 63 // 0 0 1 1 1 1 1 1
- maskDrop3 = 31 // 0 0 0 1 1 1 1 1
- maskDrop4 = 15 // 0 0 0 0 1 1 1 1
- maskDrop5 = 7 // 0 0 0 0 0 1 1 1
- maskTake1 = 128 // 1 0 0 0 0 0 0 0
- maskTake2 = 192 // 1 1 0 0 0 0 0 0
- maskTake3 = 224 // 1 1 1 0 0 0 0 0
- maskTake4 = 240 // 1 1 1 1 0 0 0 0
- maskTake5 = 248 // 1 1 1 1 1 0 0 0
-}
\ No newline at end of file
diff --git a/sources/Text/Codecs/Utf8Decoder.uc b/sources/Text/Codecs/Utf8Decoder.uc
new file mode 100644
index 0000000..5f86e8f
--- /dev/null
+++ b/sources/Text/Codecs/Utf8Decoder.uc
@@ -0,0 +1,161 @@
+/**
+ * Class for decoding UTF8 byte stream into Acedia's `MutableText` value.
+ * This is a separate object instead of just a method, because it allows
+ * to make code simpler by storing state variables related to
+ * the decoding process.
+ * This implementation should correctly convert any valid UTF8, but it is
+ * not guaranteed to reject any invalid UTF8. In particular, it accepts
+ * overlong code point encodings. It does check whether every byte has
+ * a correct bit prefix and does not attempt to repair input data if it finds
+ * invalid one.
+ * See [wiki page](https://en.wikipedia.org/wiki/UTF-8) for details.
+ * Copyright 2021 Anton Tarasenko
+ *------------------------------------------------------------------------------
+ * This file is part of Acedia.
+ *
+ * Acedia is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Acedia is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Acedia. If not, see <https://www.gnu.org/licenses/>.
+ */
+class Utf8Decoder extends AcediaObject;
+
+// Decoding state: text decoded so far and a partially read multi-byte code
+// point. Stored as class member variables to avoid copying between methods.
+var private MutableText builtText;
+var private int nextCodePoint;
+var private int innerBytesLeft;
+
+// These masks (`maskDropN`) zero out the first (most significant) `N` bits
+// of a byte when applied with the `&` operator.
+var private byte maskDrop1, maskDrop2, maskDrop3, maskDrop4, maskDrop5;
+// These masks (`maskTakeN`) zero out everything except the first `N` bits
+// of a byte when applied with the `&` operator.
+// `maskTakeN == ~maskDropN` (as 8-bit values).
+var private byte maskTake1, maskTake2, maskTake3, maskTake4, maskTake5;
+
+/**
+ *  Decodes passed `byte` array (that contains utf8-encoded text) into
+ *  the `MutableText` type.
+ *
+ *  @param  byteStream  Byte stream to decode.
+ *  @return `MutableText` that contains `byteStream`'s text data.
+ *      `none` iff either `byteStream == none` or its contents do not
+ *      correspond to a (valid) utf8-encoded text, including input that
+ *      ends in the middle of a multi-byte code point.
+ */
+public final function MutableText Decode(ByteArrayRef byteStream)
+{
+    local int           i, length;
+    local bool          isValid;
+    local MutableText   result;
+    if (byteStream == none) {
+        return none;
+    }
+    nextCodePoint = 0;
+    innerBytesLeft = 0;
+    builtText = _.text.Empty();
+    length = byteStream.GetLength();
+    isValid = true;
+    for (i = 0; isValid && i < length; i += 1) {
+        isValid = PushByte(byteStream.GetItem(i));
+    }
+    // Unfinished multi-byte code point also makes input invalid
+    if (isValid && innerBytesLeft <= 0) {
+        result = builtText;
+    }
+    else {
+        _.memory.Free(builtText);
+    }
+    // Drop the reference on every path (including failure) so that this
+    // decoder never keeps pointing at a returned or deallocated object
+    builtText = none;
+    return result;
+}
+
+private final function bool PushByte(byte nextByte)
+{
+    // Mid-sequence: `PushInnerByte()` validates and consumes inner bytes
+    if (innerBytesLeft > 0) {
+        return PushInnerByte(nextByte);
+    }
+    // Otherwise `nextByte` starts a new code point and its bit prefix
+    // tells us how many bytes that code point occupies
+    if ((nextByte & maskTake1) == 0)
+    {
+        // 0xxxxxxx: the byte is the whole code point
+        AppendCodePoint(nextByte);
+    }
+    else if ((nextByte & maskTake3) == maskTake2)
+    {
+        // 110xxxxx: 5 payload bits here + 1 inner byte
+        nextCodePoint = nextByte & maskDrop3;
+        innerBytesLeft = 1;
+    }
+    else if ((nextByte & maskTake4) == maskTake3)
+    {
+        // 1110xxxx: 4 payload bits here + 2 inner bytes
+        nextCodePoint = nextByte & maskDrop4;
+        innerBytesLeft = 2;
+    }
+    else if ((nextByte & maskTake5) == maskTake4)
+    {
+        // 11110xxx: 3 payload bits here + 3 inner bytes
+        nextCodePoint = nextByte & maskDrop5;
+        innerBytesLeft = 3;
+    }
+    else {
+        return false;   // stray 10xxxxxx or a 11111xxx byte
+    }
+    return true;
+}
+
+// Consumes an "inner" byte: any byte after the first one in a code point
+// encoded with several bytes. Such bytes must look like 10xxxxxx.
+// Caller must guarantee `innerBytesLeft > 0`, it is not checked here.
+// Returns `false` iff `nextByte` does not have the 10xxxxxx form.
+private final function bool PushInnerByte(byte nextByte)
+{
+    local bool hasInnerPrefix;
+    hasInnerPrefix = ((nextByte & maskTake2) == maskTake1);
+    if (hasInnerPrefix)
+    {
+        // Inner byte carries 6 payload bits: shift what we have collected
+        // so far to the left by `6` and put them into the freed space
+        nextCodePoint = (nextCodePoint << 6) + (nextByte & maskDrop2);
+        innerBytesLeft -= 1;
+        if (innerBytesLeft <= 0) {
+            AppendCodePoint(nextCodePoint);
+        }
+    }
+    return hasInnerPrefix;
+}
+
+private final function AppendCodePoint(int codePoint)
+{
+    local Text.Character decodedCharacter;  // wraps the raw code point
+    decodedCharacter.codePoint = codePoint;
+    builtText.AppendCharacter(decodedCharacter);
+}
+
+defaultproperties
+{
+ maskDrop1 = 127 // 0 1 1 1 1 1 1 1 (0x7F), not used by this class
+ maskDrop2 = 63 // 0 0 1 1 1 1 1 1 (0x3F), payload of an inner byte
+ maskDrop3 = 31 // 0 0 0 1 1 1 1 1 (0x1F), payload of a 110xxxxx byte
+ maskDrop4 = 15 // 0 0 0 0 1 1 1 1 (0x0F), payload of a 1110xxxx byte
+ maskDrop5 = 7 // 0 0 0 0 0 1 1 1 (0x07), payload of a 11110xxx byte
+ maskTake1 = 128 // 1 0 0 0 0 0 0 0 (0x80)
+ maskTake2 = 192 // 1 1 0 0 0 0 0 0 (0xC0)
+ maskTake3 = 224 // 1 1 1 0 0 0 0 0 (0xE0)
+ maskTake4 = 240 // 1 1 1 1 0 0 0 0 (0xF0)
+ maskTake5 = 248 // 1 1 1 1 1 0 0 0 (0xF8)
+}
\ No newline at end of file
diff --git a/sources/Avarice/Utf8Encoder.uc b/sources/Text/Codecs/Utf8Encoder.uc
similarity index 76%
rename from sources/Avarice/Utf8Encoder.uc
rename to sources/Text/Codecs/Utf8Encoder.uc
index e66321d..af0042b 100644
--- a/sources/Avarice/Utf8Encoder.uc
+++ b/sources/Text/Codecs/Utf8Encoder.uc
@@ -1,6 +1,8 @@
/**
* Class for encoding Acedia's `MutableText` value into UTF8 byte
* representation.
+ * This is a separate object instead of just a method to match design of
+ * `Utf8Decoder`.
* See [wiki page](https://en.wikipedia.org/wiki/UTF-8) for details.
* Copyright 2021 Anton Tarasenko
*------------------------------------------------------------------------------
@@ -38,69 +40,68 @@ var private int lastSixBits;
* code points - this method will return empty array.
*
* @param text `Text` object to encode.
- * @return UTF8 representation of passed `text` as an array of `byte`s.
- * Empty array if `text == none` or `text` contains invalid Unicode
+ * @return UTF8 representation of passed `text` inside `ByteArrayRef`.
+ * `none` iff `text == none` or `text` contains invalid Unicode
* code points.
*/
-public final function array Encode(Text text)
+public final function ByteArrayRef Encode(Text text)
{
local int i, nextCodepoint, textLength;
- local array buffer;
+ local ByteArrayRef buffer;
if (__().text.IsEmpty(text)) {
- return buffer; // empty array
+ return none; // nothing to encode
}
+ buffer = ByteArrayRef(_.memory.Allocate(class'ByteArrayRef'));
textLength = text.GetLength();
for (i = 0; i < textLength; i += 1)
{
nextCodepoint = text.GetCharacter(i).codePoint;
if (nextCodepoint <= utfLimit1) {
- buffer[buffer.length] = nextCodepoint;
+ buffer.AddItem(nextCodepoint);
}
else if (nextCodepoint <= utfLimit2)
{
// Drop 6 bits that will be recorded inside second byte and
// add 2-byte sequence mask
- buffer[buffer.length] = utfMask2 | (nextCodepoint >> 6);
+ buffer.AddItem(utfMask2 | (nextCodepoint >> 6));
// Take only last 6 bits for the second (last) byte
// + add inner-byte sequence mask
- buffer[buffer.length] = utfMaskIn | (nextCodepoint & lastSixBits);
+ buffer.AddItem(utfMaskIn | (nextCodepoint & lastSixBits));
}
else if (nextCodepoint <= utfLimit3)
{
// Drop 12 bits that will be recorded inside second and third bytes
// and add 3-byte sequence mask
- buffer[buffer.length] = utfMask3 | (nextCodepoint >> 12);
+ buffer.AddItem(utfMask3 | (nextCodepoint >> 12));
// Drop 6 bits that will be recorded inside third byte and
// add inner-byte sequence mask
- buffer[buffer.length] =
- utfMaskIn | ((nextCodepoint >> 6) & lastSixBits);
+ buffer.AddItem(utfMaskIn | ((nextCodepoint >> 6) & lastSixBits));
// Take only last 6 bits for the third (last) byte
// + add inner-byte sequence mask
- buffer[buffer.length] = utfMaskIn | (nextCodepoint & lastSixBits);
+ buffer.AddItem(utfMaskIn | (nextCodepoint & lastSixBits));
}
else if (nextCodepoint <= utfLimit4)
{
// Drop 18 bits that will be recorded inside second, third and
// fourth bytes, then add 4-byte sequence mask
- buffer[buffer.length] = utfMask4 | (nextCodepoint >> 18);
+ buffer.AddItem(utfMask4 | (nextCodepoint >> 18));
// Drop 12 bits that will be recorded inside third and fourth bytes
// and add inner-byte sequence mask
- buffer[buffer.length] =
- utfMaskIn | ((nextCodepoint >> 12) & lastSixBits);
+ buffer.AddItem(utfMaskIn | ((nextCodepoint >> 12) & lastSixBits));
// Drop 6 bits that will be recorded inside fourth byte
// and add inner-byte sequence mask
- buffer[buffer.length] =
- utfMaskIn | ((nextCodepoint >> 6) & lastSixBits);
+ buffer.AddItem(utfMaskIn | ((nextCodepoint >> 6) & lastSixBits));
// Take only last 6 bits for the fourth (last) byte
// + add inner-byte sequence mask
- buffer[buffer.length] = utfMaskIn | (nextCodepoint & lastSixBits);
+ buffer.AddItem(utfMaskIn | (nextCodepoint & lastSixBits));
}
else
{
// Outside of known Unicode range
// Should not be possible, since `Text` is expected to
// contain only correct Unicode
- buffer.length = 0;
+ _.memory.Free(buffer);
+ buffer = none;
break;
}
}
diff --git a/sources/Text/Tests/TEST_UTF8EncoderDecoder.uc b/sources/Text/Tests/TEST_UTF8EncoderDecoder.uc
new file mode 100644
index 0000000..63daa53
Binary files /dev/null and b/sources/Text/Tests/TEST_UTF8EncoderDecoder.uc differ