From 323bf71e709ffb2a56135bfd6c82f1817dab4526 Mon Sep 17 00:00:00 2001
From: Anton Tarasenko <dkanus@gmail.com>
Date: Wed, 28 Jul 2021 04:47:01 +0700
Subject: [PATCH] Refactor utf8 encoder and decoder

Make decoder work with byte arrays containing a single complete text,
instead of byte streams with several separate texts.

Moved codecs into "Text" category.
---
 .../Avarice/Tests/TEST_UTF8EncoderDecoder.uc  | Bin 16540 -> 0 bytes
 sources/Avarice/Utf8Decoder.uc                | 260 ------------------
 sources/Text/Codecs/Utf8Decoder.uc            | 161 +++++++++++
 .../{Avarice => Text/Codecs}/Utf8Encoder.uc   |  39 +--
 sources/Text/Tests/TEST_UTF8EncoderDecoder.uc | Bin 0 -> 10184 bytes
 5 files changed, 181 insertions(+), 279 deletions(-)
 delete mode 100644 sources/Avarice/Tests/TEST_UTF8EncoderDecoder.uc
 delete mode 100644 sources/Avarice/Utf8Decoder.uc
 create mode 100644 sources/Text/Codecs/Utf8Decoder.uc
 rename sources/{Avarice => Text/Codecs}/Utf8Encoder.uc (76%)
 create mode 100644 sources/Text/Tests/TEST_UTF8EncoderDecoder.uc
diff --git a/sources/Avarice/Tests/TEST_UTF8EncoderDecoder.uc b/sources/Avarice/Tests/TEST_UTF8EncoderDecoder.uc
deleted file mode 100644
index 950d0daa77987d328e49cd106741cee99777b141..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 16540
zcmeHOTWDOz8J@FRxp9J9gK-}WK3U05EI0CMb&c#gwq(hQ4T@|mxhAz8Yjs&$Sdo;~
z%JQXl3N7?Xq3MfLNSnqgen|;U`_MokZJ|);gCEimm)Z(7A*Q4+g+7(i^!w)D(VW>m
zd(p1e7+MZz_spD`fByOJ|ID2Ke5zX17jSJ*kE$5{W4MOZq{`rbR;BQpL1`Yp^LS3<
zd0HiP-GqKOi{DXo95pG`t@`nP9Q`KnpGJKQZD&vlwmFEJV4G&VT>$h+-EK@>Lb+F6
zK+m+ALd!`Ncd#Rdy!tTC99py0S#=H=#8jI~;BLiJFGk6r#SETDG_0f^V@91vt(omE
z_3&N>N8*1Db0qy!_@4$vV%hSXo|)xUucp@o-X;T7vyg)C4UClLDq+rpGAU4PQA3?n
zdr)@?bqn|(M;&QN2_Sq*1Svm;x0Db<BOb!<7W7J^^dkP*It4fu3u(Ckcp@j1v2z**
zA*Ioab2ZW@Z=O{{kg}M17A<CUt0X89Q%5oW7@jC8F*T_F*@E&!YzS#?rOmt^hw?}{
zvlyB22lV{db6AgWX%SNcs1ckg|5GT<D9Y{yjm><vCZ^g^=i=^S#Xh8t9lZ&7l6FFC
z(v6aB#@LB6sY<;ty2m!|xK>9j+%Rs3@s9ME!U&cR=QIYS11Tfe3D*|Ua#~Y<OjB4i
zHwDT}gBo^hk&Bo*p$<cS52+*I-U#6Jf{)MR{{)_n;bM6NHD5#>VRJmvne>>{a}44&
z1=*OxNK<%6tVL4Onm!p#AyVFA@tit{G5RpRjMJ+Q0G~mP6R{8}9nj?=v>nDpP9H%p
z;!61<o+D_{hZzj&5^Im6_9)s7qgNd5#A+IfO$KnNOmb6M+^9j+xtP|K3}#CD&xdOF
z^O|zhW1-=+rV;r^SrOf(J)P2J(l3ddwwQJ_S86N4+fts|Newv*xQ4H!D91PUiBvlb
ze#Ovk796K$P;ZIrUd$nbYYsJg;G<dqg;yL9XCNPO^ho2mMN0#riq6lYW?z6hEt(VK
zXhmB<YcQhuJfhBMi5!5}ay^pk!zhIpfd}DFWc!Oh?9&hpr^P2FVOPX5CNa|tq?#1r
zE&hzQh*py&v6{w?3Z-a8M8YQ_Sz=-NhLj<n%)5)uTRRG&lalnp{3VY$f>^9^l&Gon
zzs9;)dJL;3wNvAIMgPi-Xme)KUaW1npBLG*K3(uJ{b}RvyO??gScJoA0#D9pEaI5y
zByRHf61dq5EDRo;R}&x_j_xP`Z3kwvGs^*kOCKOIE)bqJsYXbHaFh0rvTCt4HMCol
z67d=AD>S86CV^udP;$pHP|xH<MD$kFPJum)(l{<+FpoLevFQcNhe`dNTT_^aFtd5k
zbq4Z6T<P!WhbJ%}%2Ki<w(eXWJlA8|W{AzBP02ViU5Gyem>KPBAzp}m6KN_fOSBmx
z`9_w4_HPGh%D9-g(87$O#CD8YsUg)!UC^TS1HREeYPe|URZS1hr!t!%TC+O&4%hmi
z&WYR;E7t}c2Ce3yE!6x*9rIZ&3hJe$U$LAdmE|(&zo_M!F^0r9uJ07T6sF<yNR)#&
za35DsA@4}QO10%k86ygMnuFl*oaRFY<#cu?u8zc4=5+uzk9MXBnapn00iGGI89Qcu
zSmltrfNefxGq3G|d9?wDpyXFSdb{c>JoftCD`W3QG`)L`$8N*CYe&;sLGD(ag((Kf
z6G^3wW^NF)Dr?KRL}4s$s}B!lIoAqpXa|+M*PM49fSfP2)k|b6u&ovEI-tk3!MUg&
z#oo+vgR3IF$Y0i05~xRETD%s#Xs;QUpDt$g-anP4SU<F_vj!AvS5MbS5;|ZfYa?Tw
zp<Fh8f{_NZj0a<JY8AB<i=ZoyT1>_=4f;F>pU=3NYgLmgF`uliGWu)PIUc<}OKmxM
zOK$uV#Fw-Whd!z!Os?-Sb7Z^lxiIpaKpVz%CKs1Xm8*BmQJC`(GZVuw6KHhoiHS^l
z(6SNpWbVdHlj}IL8deO;un5c0f~&dA8S|qhQo%&eHnKICnEq`Q_7AsXLzqnlYgWu7
z8KId`XyM(wcXj<4i<R47MlMYc12Jg#!zD>p3W{+khB#SPQx~;7Nw&<qLh@<;a#boU
z7cL$JTMEYY>TU*hmt%2NPgXI~pfkOGd?k-Lig6{@r)*F<T4AMxhKB#fLRAx?mxmj9
zUko?#(p=#*m%*)?D7!q|NRwi?wINRH((#GGFN<4MQFuOXgU}Yr4A*z5tA=mwcrHio
zZPsL9w_#q~4X8|Bx-e^cV@5DC^X$B<j$+HhuawMmYrZfd&EZQ`QC)esm6DlGM9S1Q
z>ZC)vs^TYuTk8s6OUq1c;(>BjzrsCfu9vEGJ(W=c_k7OhtRK3X;@a{9+G0vHMh~$F
z9V_PXr?j`{D&`!lX<B<SMsl+LL<=mt2Cj{lH7c%C)Bl9o`$D)s_%*<w-wXG~T<_y<
z3O#utyyMtCD!m$7ZtQI%hr;90ySb5#v5~<Ux!ga9h)ebijV}+E&{1%>R-7+Q<J_2~
zHk8dBr4oAfLnh3sDOXhNJ}5PVak8rytSL@c*)Qd;2KmHYjr>?CA9L9+uq%Ql>vk<c
zIOVO+%!OMS-E*Os-3*IeE4-F?g`lK9JlIM0|8USQSI1qfO4GXYfm6CR-q!B9gj=j~
ze5z{;Dcu)ze!v|)<_Xl*G%nc}4el6l_hVA;a$Q8*{4Ch)6%_NOVVWE+^Fe=8n!{YD
z?naDdqAX?)CUa;-%q|yq9uj!73wI~(9jM=d|1r#*Iaz5<Tpd;-cg7=`j3#`(g)NZb
zI=vP10Ru0Ht;<zoZOBjZbriFvH{(ej#*Sg!BX4E*uNeQhD{j_EEWgdZd3cmlntSEi
z!D7^3J(UVad}hBf-y4#O`I1lCWw3^m2E9o=?l3T}Wa}g-Mw-(z8j7)HIIY5E!$v~+
zw9K`QrKRKH)%n6YU?e)|Kka^>(X0or^I=kFu`y4^5*^ZVK>Z8bL6xY7x@69c$%z8@
z+)TKH6w{1w+A(u2WG%%`2CXGm23Kz_3-hx)x><f6&}+h6S)(Qur{EZLZbIAg|652K
zm&b83M^0>%???VuE#wiM&!>^^(vH)E=ATb7rw+^Ut=z|;OqvygRF)I=#Mx@RL1C@R
z*L223^sYfow_4@KMTs)vpVf@rigmBtm@GJU(8@gQW3Se}=Fi;t0Pgn(ub_<zx9x>#
zzIu%|5|SGk@@x;i4lOIsBlDSOPeuC8*;hs%-0iYE3i+{+oRJFdg)y68P7xOS=f^X-
zS$w&%VPPtnIfus%rRl#q-WzSJ4sIdNu2o)H?9>}Ezv9?};O;DCf;*LA*Hq5_M6llo
zWQY5;Vg2M3`<(Xirk$JPmkWuJHb*&zf5}&w(-31h*Gqd|7&3R#xR-VbXR1xEv@eUX
zDOU#B<m*AYGb7+-eG6lo!q^!Y{pex+y1;Pt$c5p;ch{C(e5yTF>_Q7WRqW#F;^NM$
zT3lRcVYrkdD}tdtYi;^4nvy&?c=lY*!locW{Kg{#i&yBBy2;XvHk2b>f{V+|xN{=k
z?y(n>(%i94B+~?YFJmP1UKLAwahHQzT>B2QYssB1_I@h6-^@MCD~?4m^LFJS>|AF#
zqo2fRr*(`Iw7Oy|%O5G<(}dY2p-?{Fxp}y&gYNo_l}+PegItv}Ld*mC{R3$_3i_4f
zY28yNT!}E&lo0qO#WW;yHL)gF9)f2$!YGm}Po<)mFc@*1WRPz*XoG{NMT$%E+KfGd
zv5$f_%#UWA??r<9@8o$Ux*blBO4f9&_jhrTn2RTlxHri!zGN-a)j58dLT~u2wp<JF
zkD?nq6VixNkHPOq-4jcu=MW(9YZIeW($n}Qw@)F<C;HQfokD&eOTW&O<%|y*aW<$}
z^zmp<v^&}s?TjAaNw4T$lzR9kF`W28;(Q{Rm>v53!r-Ugsn5Oi?Q8!$wHMe2_1;9<
zL!gA5$D-AyK2eM0n+&eSaZL9+LB^g*=-XuWD<bYIb1jcO`OdC8u$H^VT(co{+_$p3
z#Su9M^X)MFWN%EqUf`J=`F4@ozo<vD-w$#vQT8f{5!>-A7WyCl@(WhZkh60m12adn
zBNxPi5k&YmQsl1YtXCHuzukE^RzKta)_d987I}QrPj7eL`HuHDZ=b*a=Jz8lw>y_x
zQS$;?{SnU{k*?dFAATM8BXxg3{i~>3hx-QB*?D@V1FP;`I7!lt6PO8bv`1}I+tm~5
z1@$Dpr>3s$!WRI~z;B#{zc{U~!Vm4iy3;9exEa1Ej_=i4)P8jhFdy*&`R*IIXV*>m
ze*K2%R__+-4|><@yPL1$>G1lG>wbdwA4LX%%e}~!=#%*Wd2|%N_q|_4wnxuye#`p|
zTKvQRJ?e(h=hVh`fc@JjoxuHFZ?E6y9gglp-S<#_Wo!B-=DR%Qe}Z1Oyam)@j(2|O
zU0&-MMmPGA<|T~s0dD^P`R40O_al#DwDrK|iKV~dsR#Fh_jh3QGM@UqFX49p_giTH
zOVn<~?Js@eT||!t{{~8p-dFMN*ZyPvyIW@(0B!j+d3yI`bWhzoeqa4W^JVl0HI~~@
zOYUFy&Z1__zv_Prwe=hCd)L-{t|>r%$@|9U<+>XUt>EO+x+F&45Z!{2|APCD_gdt$
zX!9#P_xoSQ?>&FUzwEue=}$}G#PT<B{|s~N+H_0Z--wz!$I$;>+`sX6;P-c^*%o~*
z@}j>Vr86isq4fx!$NZBh{Tr6%UnsTvU)$M?H)%Y-x7M=+%-8E)!Dv4MwvXY?;Q4*8
zG2;8smAeZ|A4GnP+OIThUjH%vG1t2v)%Psj^XI(x8YWQM()=cRK$e$}<8DNaIazx3
Ezb943E&u=k

diff --git a/sources/Avarice/Utf8Decoder.uc b/sources/Avarice/Utf8Decoder.uc
deleted file mode 100644
index 89f677a..0000000
--- a/sources/Avarice/Utf8Decoder.uc
+++ /dev/null
@@ -1,260 +0,0 @@
-/**
- *      Class for decoding UTF8 byte stream into Acedia's `MutableText` value.
- *      It is made to work with incoming, and possibly incomplete, streams of
- *  bytes: instead of consuming the whole utf8 text, it is made to consume it
- *  byte-by-byte and store `MutableText`s that it parsed from the stream
- *  (assumes that separate `MutableText`s are separated by `0` byte).
- *      This implementation should correctly convert any valid UTF8, but it is
- *  not guaranteed to reject any invalid UTF8. In particular, it accepts
- *  overlong code point encodings (except overlong encoding of zero).
- *  It, however, does check whether every byte has a correct bit prefix and
- *  does not attempt to repair input data if it finds invalid one.
- *      See [wiki page](https://en.wikipedia.org/wiki/UTF-8) for details.
- *      Copyright 2021 Anton Tarasenko
- *------------------------------------------------------------------------------
- * This file is part of Acedia.
- *
- * Acedia is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, version 3 of the License, or
- * (at your option) any later version.
- *
- * Acedia is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Acedia.  If not, see <https://www.gnu.org/licenses/>.
- */
-class Utf8Decoder extends AcediaObject;
-
-/**
- *  `Utf8Decoder` consumes byte by byte with `PushByte()` method and it's
- *  algorithm is simple:
- *      1.  If it encounters a byte that encodes a singular code point by
- *              itself (starts with `0` bit) - it is added as a codepoint;
- *      2.  If it encounters byte which indicates that next code point is
- *              composed out of several bytes (starts with 110, 1110 or 11110) -
- *              remembers that it has to read several "inner" bytes belonging to
- *              the same code point and starts to expect them instead;
- *      3.  If it ever encounters a byte with unexpected (and thus invalid)
- *              bit prefix - enters a failed state;
- *      4.  If it ever encounters a `0` byte:
- *          *   If it was not in a failed state - records `MutableText`
- *                  accumulated so far;
- *          *   Clears failed state.
- */
-
-var private bool failedState;
-
-//  Variables for building a multi-byte code point
-var private int nextCodePoint;
-var private int innerBytesLeft;
-
-//  `MutableText` we are building right now
-var private MutableText         nextText;
-//  `MutableText`s we have already built
-var private array<MutableText>  outputQueue;
-
-//  These masks (`maskDropN`) allow to turn into zero first `N` bits in
-//  the byte with `&` operator.
-var private byte maskDrop1, maskDrop2, maskDrop3, maskDrop4, maskDrop5;
-//      These masks (`maskTakeN`) allow to turn into zero all but first `N` bits
-//  in the byte with `&` operator.
-//      `maskTakeN == ~maskDropN`.
-var private byte maskTake1, maskTake2, maskTake3, maskTake4, maskTake5;
-
-protected function Constructor()
-{
-    nextText = _.text.Empty();
-}
-
-protected function Finalizer()
-{
-    _.memory.Free(nextText);
-    _.memory.FreeMany(outputQueue);
-    nextText            = none;
-    failedState         = false;
-    outputQueue.length  = 0;
-    innerBytesLeft      = 0;
-    nextCodePoint       = 0;
-}
-
-/**
- *  Checks whether data in the `MutableText` that caller `Utf8Decoder` is
- *  currently filling was detected to be invalid.
- *
- *  This state can be reset by pushing `0` byte into caller `Utf8Decoder`.
- *  See `PushByte()` for more info.
- *
- *  @return `true` iff  caller `Utf8Decoder` is not in a failed state.
- */
-public final function bool Failed()
-{
-    return failedState;
-}
-
-/**
- *      Checks whether caller `Utf8Decoder` has any data put in
- *  the `MutableText` it is currently building.
- *      Result is guaranteed to be `false` after `self.PushByte(0)` call, since
- *  it starts a brand new `MutableText`.
- */
-public final function bool HasUnfinishedData()
-{
-    if (innerBytesLeft > 0)         return true;
-    if (nextText.GetLength() > 0)   return true;
-    return false;
-}
-
-/**
- *  Returns next `MutableText` that was successfully decoded by
- *  the caller `Utf8Decoder`, removing it from the output queue.
- *
- *  @return Next `MutableText` in the caller `Utf8Decoder`'s output queue.
- *      `none` iff output queue is empty. `MutableText`s are returned in order
- *      they were decoded.
- */
-public final function MutableText PopText()
-{
-    local MutableText result;
-    if (outputQueue.length <= 0) {
-        return none;
-    }
-    result = outputQueue[0];
-    outputQueue.Remove(0, 1);
-    return result;
-}
-
-/**
- *  Adds next `byte` from the byte stream that is supposed to encode UTF8 text.
- *  To finish building `MutableText` pass `0` byte into this method, which will
- *  `MutableText` built so far into an "output queue" (accessible with
- *  `PopText()`) and start building a new one.
- *
- *      This method expects `byte`s, in order, from a sequence that has correct
- *  UTF8 encoding. If method detects incorrect UTF8 sequence - it will be put
- *  into a "failed state", discarding `MutableText` it was currently building,
- *  along with any further input (except `0` byte).
- *      Pushing `0` byte will restore `Utf8Decoder` from a failed state and it
- *  will start building a new `MutableText`.
- *
- *  @param  nextByte    next byte from byte stream that is supposed to encode
- *      UTF8 text. `0` will make caller `Utf8Decoder` start building new
- *      `MutableText`.
- *  @return `true` iff caller `Utf8Decoder` was not in a failed state and
- *      operation was successful.
- */
-public final function bool PushByte(byte nextByte)
-{
-    if (nextByte == 0)      return QueueCurrentText();
-    if (failedState)        return false;
-    if (innerBytesLeft > 0) return PushInnerByte(nextByte);
-
-    //  Form of 0xxxxxxx means 1 byte per code point
-    if ((nextByte & maskTake1) == 0)
-    {
-        AppendCodePoint(nextByte);
-        return true;
-    }
-    //  Form of 110xxxxx means 2 bytes per code point
-    if ((nextByte & maskTake3) == maskTake2)    //  maskTake2 == 1 1 0 0 0 0 0 0
-    {
-        nextCodePoint = nextByte & maskDrop3;
-        innerBytesLeft = 1;
-        return true;
-    }
-    //  Form of 1110xxxx means 3 bytes per code point
-    if ((nextByte & maskTake4) == maskTake3)    //  maskTake3 == 1 1 1 0 0 0 0 0
-    {
-        nextCodePoint = nextByte & maskDrop4;
-        innerBytesLeft = 2;
-        return true;
-    }
-    //  Form of 11110xxx means 4 bytes per code point
-    if ((nextByte & maskTake5) == maskTake4)    //  maskTake4 == 1 1 1 1 0 0 0 0
-    {
-        nextCodePoint = nextByte & maskDrop5;
-        innerBytesLeft = 3;
-        return true;
-    }
-    //  `nextByte` must have has one of the above forms
-    //  (or 10xxxxxx that is handled in `PushInnerByte()`)
-    failedState = true;
-    return false;
-}
-
-//      This method is responsible for pushing "inner" bytes: bytes that come
-//  after the first one when code point is encoded with multiple bytes.
-//  All of them are expected to have 10xxxxxx prefix.
-//      Assumes `innerBytesLeft > 0` and `failedState == false`
-//  to avoid needless checks.
-private final function bool PushInnerByte(byte nextByte)
-{
-    //  Fail if `nextByte` does not have an expected form: 10xxxxxx
-    if ((nextByte & maskTake2) != maskTake1)
-    {
-        failedState = true;
-        return false;
-    }
-    //  Since inner bytes have the form of 10xxxxxx, they all carry only 6 bits
-    //  that actually encode code point, so to make space for those bits we must
-    //  shift previously added code points by `6`
-    nextCodePoint = (nextCodePoint << 6) + (nextByte & maskDrop2);
-    innerBytesLeft -= 1;
-    if (innerBytesLeft <= 0)
-    {
-        //  We forbid overlong encoding of `0`
-        //  (as does the Unicode standard)
-        if (nextCodePoint == 0)
-        {
-            failedState = true;
-            return false;
-        }
-        AppendCodePoint(nextCodePoint);
-    }
-    return true;
-}
-
-private final function AppendCodePoint(int codePoint)
-{
-    local Text.Character nextCharacter;
-    nextCharacter.codePoint = codePoint;
-    nextText.AppendCharacter(nextCharacter);
-}
-
-//  Return `true` if `MutableText` was added to the queue
-//  (there were no encoding errors)
-private final function bool QueueCurrentText()
-{
-    local bool result;
-    //  If we still do not have all bytes for the character we were building -
-    //  then passed UTF8 was invalid
-    failedState = failedState || innerBytesLeft > 0;
-    result = !failedState;
-    if (failedState) {
-        _.memory.Free(nextText);
-    }
-    else {
-        outputQueue[outputQueue.length] = nextText;
-    }
-    failedState = false;
-    innerBytesLeft = 0;
-    nextText = _.text.Empty();
-    return result;
-}
-
-defaultproperties
-{
-    maskDrop1 = 127 //  0 1 1 1 1 1 1 1
-    maskDrop2 = 63  //  0 0 1 1 1 1 1 1
-    maskDrop3 = 31  //  0 0 0 1 1 1 1 1
-    maskDrop4 = 15  //  0 0 0 0 1 1 1 1
-    maskDrop5 = 7   //  0 0 0 0 0 1 1 1
-    maskTake1 = 128 //  1 0 0 0 0 0 0 0
-    maskTake2 = 192 //  1 1 0 0 0 0 0 0
-    maskTake3 = 224 //  1 1 1 0 0 0 0 0
-    maskTake4 = 240 //  1 1 1 1 0 0 0 0
-    maskTake5 = 248 //  1 1 1 1 1 0 0 0
-}
\ No newline at end of file
diff --git a/sources/Text/Codecs/Utf8Decoder.uc b/sources/Text/Codecs/Utf8Decoder.uc
new file mode 100644
index 0000000..5f86e8f
--- /dev/null
+++ b/sources/Text/Codecs/Utf8Decoder.uc
@@ -0,0 +1,161 @@
+/**
+ *      Class for decoding UTF8-encoded byte array, that contains a single
+ *  complete text, into Acedia's `MutableText` value.
+ *      This is a separate object instead of just a method, because storing
+ *  decoding-related state in variables makes the code simpler.
+ *      This implementation should correctly convert any valid UTF8, but it is
+ *  not guaranteed to reject any invalid UTF8. In particular, it accepts
+ *  overlong code point encodings. It does check whether every byte has
+ *  a correct bit prefix and does not attempt to repair input data if it finds
+ *  invalid one.
+ *      See [wiki page](https://en.wikipedia.org/wiki/UTF-8) for details.
+ *      Copyright 2021 Anton Tarasenko
+ *------------------------------------------------------------------------------
+ * This file is part of Acedia.
+ *
+ * Acedia is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Acedia is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Acedia.  If not, see <https://www.gnu.org/licenses/>.
+ */
+class Utf8Decoder extends AcediaObject;
+
+//  Decoding state: the text built so far and the multi-byte code point in
+//  progress. Kept as class members to avoid passing it between methods.
+var private MutableText builtText;
+var private int         nextCodePoint;
+var private int         innerBytesLeft;
+
+//  These masks (`maskDropN`) zero out the first (highest) `N` bits of
+//  a byte when applied with the `&` operator.
+var private byte maskDrop1, maskDrop2, maskDrop3, maskDrop4, maskDrop5;
+//      These masks (`maskTakeN`) zero out all but the first (highest) `N`
+//  bits of a byte when applied with the `&` operator.
+//      `maskTakeN == ~maskDropN`.
+var private byte maskTake1, maskTake2, maskTake3, maskTake4, maskTake5;
+
+/**
+ *  Decodes passed `byte` array (that contains utf8-encoded text) into
+ *  the `MutableText` type.
+ *
+ *  @param  byteStream  Byte array to decode, treated as one complete text.
+ *  @return `MutableText` that contains `byteStream`'s text data.
+ *      `none` iff either `byteStream == none` or its contents do not
+ *      correspond to a (valid) utf8-encoded text.
+ */
+public final function MutableText Decode(ByteArrayRef byteStream)
+{
+    local int           i;
+    local int           length;
+    local MutableText   result;
+    if (byteStream == none) {
+        return none;
+    }
+    nextCodePoint   = 0;
+    innerBytesLeft  = 0;
+    builtText       = _.text.Empty();
+    length = byteStream.GetLength();
+    for (i = 0; i < length; i += 1)
+    {
+        if (!PushByte(byteStream.GetItem(i))) {
+            break;
+        }
+    }
+    //  Succeed only if all bytes were accepted and no code point is cut short.
+    //  `builtText` is always reset below, so no freed object stays referenced.
+    if (i >= length && innerBytesLeft <= 0) {
+        result = builtText;
+    }
+    else {
+        _.memory.Free(builtText);
+    }
+    builtText = none;
+    return result;
+}
+
+//  Consumes next input byte; returns `false` iff it cannot be valid there.
+private final function bool PushByte(byte nextByte)
+{
+    //  In the middle of a multi-byte code point: expect an inner byte
+    if (innerBytesLeft > 0) {
+        return PushInnerByte(nextByte);
+    }
+    //  0xxxxxxx: the byte is a complete code point on its own
+    if ((nextByte & maskTake1) == 0)
+    {
+        AppendCodePoint(nextByte);
+        return true;
+    }
+    //  Otherwise it has to be a leading byte: store its payload bits and
+    //  the amount of 10xxxxxx inner bytes that must follow it
+    if ((nextByte & maskTake3) == maskTake2)        //  110xxxxx: 2 bytes
+    {
+        nextCodePoint   = nextByte & maskDrop3;
+        innerBytesLeft  = 1;
+    }
+    else if ((nextByte & maskTake4) == maskTake3)   //  1110xxxx: 3 bytes
+    {
+        nextCodePoint   = nextByte & maskDrop4;
+        innerBytesLeft  = 2;
+    }
+    else if ((nextByte & maskTake5) == maskTake4)   //  11110xxx: 4 bytes
+    {
+        nextCodePoint   = nextByte & maskDrop5;
+        innerBytesLeft  = 3;
+    }
+    else {
+        //  Stray 10xxxxxx inner byte or an invalid 11111xxx prefix
+        return false;
+    }
+    return true;
+}
+
+//      Consumes an "inner" byte: any byte after the leading one in a code
+//  point that is encoded with multiple bytes. It must look like 10xxxxxx.
+//      Assumes `innerBytesLeft > 0`; this is not re-checked here.
+//      Returns `false` (leaving decoding state untouched) on a malformed byte.
+private final function bool PushInnerByte(byte nextByte)
+{
+    local int payload;
+    if ((nextByte & maskTake2) != maskTake1) {
+        return false;   //  prefix is not `10`
+    }
+    //  Each inner byte contributes only its 6 lowest bits: shift what was
+    //  collected so far by `6` and put the new bits into the freed space
+    payload         = nextByte & maskDrop2;
+    nextCodePoint   = (nextCodePoint << 6) + payload;
+    innerBytesLeft  -= 1;
+    if (innerBytesLeft <= 0) {
+        AppendCodePoint(nextCodePoint);
+    }
+    return true;
+}
+
+private final function AppendCodePoint(int codePoint)
+{
+    local Text.Character decoded;   //  wrapper `AppendCharacter()` expects
+    decoded.codePoint = codePoint;
+    builtText.AppendCharacter(decoded);
+}
+
+defaultproperties
+{
+    maskDrop1 = 127 //  0 1 1 1 1 1 1 1 (0x7F), not referenced by the decoder
+    maskDrop2 = 63  //  0 0 1 1 1 1 1 1 (0x3F), payload of an inner byte
+    maskDrop3 = 31  //  0 0 0 1 1 1 1 1 (0x1F), payload of 2-byte leading byte
+    maskDrop4 = 15  //  0 0 0 0 1 1 1 1 (0x0F), payload of 3-byte leading byte
+    maskDrop5 = 7   //  0 0 0 0 0 1 1 1 (0x07), payload of 4-byte leading byte
+    maskTake1 = 128 //  1 0 0 0 0 0 0 0 (0x80), also `10` inner byte prefix
+    maskTake2 = 192 //  1 1 0 0 0 0 0 0 (0xC0), also `110` leading prefix
+    maskTake3 = 224 //  1 1 1 0 0 0 0 0 (0xE0), also `1110` leading prefix
+    maskTake4 = 240 //  1 1 1 1 0 0 0 0 (0xF0), also `11110` leading prefix
+    maskTake5 = 248 //  1 1 1 1 1 0 0 0 (0xF8)
+}
\ No newline at end of file
diff --git a/sources/Avarice/Utf8Encoder.uc b/sources/Text/Codecs/Utf8Encoder.uc
similarity index 76%
rename from sources/Avarice/Utf8Encoder.uc
rename to sources/Text/Codecs/Utf8Encoder.uc
index e66321d..af0042b 100644
--- a/sources/Avarice/Utf8Encoder.uc
+++ b/sources/Text/Codecs/Utf8Encoder.uc
@@ -1,6 +1,8 @@
 /**
  *      Class for encoding Acedia's `MutableText` value into UTF8 byte
  *  representation.
+ *      This is a separate object instead of just a method to match design of
+ *  `Utf8Decoder`.
  *      See [wiki page](https://en.wikipedia.org/wiki/UTF-8) for details.
  *      Copyright 2021 Anton Tarasenko
  *------------------------------------------------------------------------------
@@ -38,69 +40,68 @@ var private int lastSixBits;
  *  code points - this method will return empty array.
  *
  *  @param  text    `Text` object to encode.
- *  @return UTF8 representation of passed `text` as an array of `byte`s.
- *      Empty array if `text == none` or `text` contains invalid Unicode
+ *  @return UTF8 representation of passed `text` inside `ByteArrayRef`.
+ *      `none` iff `text` is `none` or empty, or contains invalid Unicode
  *      code points.
  */
-public final function array<byte> Encode(Text text)
+public final function ByteArrayRef Encode(Text text)
 {
     local int           i, nextCodepoint, textLength;
-    local array<byte>   buffer;
+    local ByteArrayRef  buffer;
     if (__().text.IsEmpty(text)) {
-        return buffer; // empty array
+        return none; // nothing to encode
     }
+    buffer = ByteArrayRef(_.memory.Allocate(class'ByteArrayRef'));
     textLength = text.GetLength();
     for (i = 0; i < textLength; i += 1)
     {
         nextCodepoint = text.GetCharacter(i).codePoint;
         if (nextCodepoint <= utfLimit1) {
-            buffer[buffer.length] = nextCodepoint;
+            buffer.AddItem(nextCodepoint);
         }
         else if (nextCodepoint <= utfLimit2)
         {
             //  Drop 6 bits that will be recorded inside second byte and
             //  add 2-byte sequence mask
-            buffer[buffer.length] = utfMask2 | (nextCodepoint >> 6);
+            buffer.AddItem(utfMask2 | (nextCodepoint >> 6));
             //  Take only last 6 bits for the second (last) byte
             //  + add inner-byte sequence mask
-            buffer[buffer.length] = utfMaskIn | (nextCodepoint & lastSixBits);
+            buffer.AddItem(utfMaskIn | (nextCodepoint & lastSixBits));
         }
         else if (nextCodepoint <= utfLimit3)
         {
             //  Drop 12 bits that will be recorded inside second and third bytes
             //  and add 3-byte sequence mask
-            buffer[buffer.length] = utfMask3 | (nextCodepoint >> 12);
+            buffer.AddItem(utfMask3 | (nextCodepoint >> 12));
             //  Drop 6 bits that will be recorded inside third byte and
             //  add inner-byte sequence mask
-            buffer[buffer.length] =
-                utfMaskIn | ((nextCodepoint >> 6) & lastSixBits);
+            buffer.AddItem(utfMaskIn | ((nextCodepoint >> 6) & lastSixBits));
             //  Take only last 6 bits for the third (last) byte
             //  + add inner-byte sequence mask
-            buffer[buffer.length] = utfMaskIn | (nextCodepoint & lastSixBits);
+            buffer.AddItem(utfMaskIn | (nextCodepoint & lastSixBits));
         }
         else if (nextCodepoint <= utfLimit4)
         {
             //  Drop 18 bits that will be recorded inside second, third and
             //  fourth bytes, then add 4-byte sequence mask
-            buffer[buffer.length] = utfMask4 | (nextCodepoint >> 18);
+            buffer.AddItem(utfMask4 | (nextCodepoint >> 18));
             //  Drop 12 bits that will be recorded inside third and fourth bytes
             //  and add inner-byte sequence mask
-            buffer[buffer.length] =
-                utfMaskIn | ((nextCodepoint >> 12) & lastSixBits);
+            buffer.AddItem(utfMaskIn | ((nextCodepoint >> 12) & lastSixBits));
             //  Drop 6 bits that will be recorded inside fourth byte
             //  and add inner-byte sequence mask
-            buffer[buffer.length] =
-                utfMaskIn | ((nextCodepoint >> 6) & lastSixBits);
+            buffer.AddItem(utfMaskIn | ((nextCodepoint >> 6) & lastSixBits));
             //  Take only last 6 bits for the fourth (last) byte
             //  + add inner-byte sequence mask
-            buffer[buffer.length] = utfMaskIn | (nextCodepoint & lastSixBits);
+            buffer.AddItem(utfMaskIn | (nextCodepoint & lastSixBits));
         }
         else
         {
             //      Outside of known Unicode range
             //      Should not be possible, since `Text` is expected to
             //  contain only correct Unicode
-            buffer.length = 0;
+            _.memory.Free(buffer);
+            buffer = none;
             break;
         }
     }
diff --git a/sources/Text/Tests/TEST_UTF8EncoderDecoder.uc b/sources/Text/Tests/TEST_UTF8EncoderDecoder.uc
new file mode 100644
index 0000000000000000000000000000000000000000..63daa53c947bb8e886f30a4850629f20d15a80a5
GIT binary patch
literal 10184
zcmd5?TWDOz8J@FRxp9J9gK-}WK3U00B)9TPT4@!@b!^G9WeXI=MC+Q=POO!7R}zs|
z%4+3XYp2jcuN0cTIEA!ntm2oH(6kQ?6w($7g+BNp4N<A3P!nQG`cmjqDNVm`{xdo=
z>$9ti6%&?onK?86eDmLDX8-rGYE@srZ<~5hCGbq(H=^>YjOUCh;9f>)4)-~{m+(HV
zvbrv(zs=x2s!pP&pt{u&d>==@9G)fAC(yQtlHKMgYV0;G_&N{jdEIVIT|&7}ok!1-
znnKIGN;>F>kiUM6GmF-2HK8VfAfZyK4Octf`Y=ivEsA*0Xj)l4Mo~SFS~J^y>eKr(
z7>R!pb0q%@cus>OiJ&~IXBNt;PxC8>uUU(0Atn2}p;6FMCCxc-rU0&8&{XqkKk6=_
zZXVBZ)RC9e0Me&Mkn>~sN(~`3;t+YaqE`u}b9l0K0dztX(sCa3L{F$=lbQx8mC%cG
zHQFa%j;Ud2SwbB{i=u9o1t${f1jZl38#N`NhV+vys82*gO0z3%=JYt!N9tLK$c#Ur
z=f|ESdi;<V2{nKk!A$+1LaC^zyXQ5Uxu7PYI#B0gcTuqqxf71w44mYh@S1$1W}7kg
zqD-#RE{yH5jXQ2=BO%==X-CP9{FuTBp&TYP0`h^J5p*K83urm5IX|X3ES6gUXQsi8
zaBR_wggT`Lp})t~Q;=Q;^!gyjXYf3Qx2N%AIfI%np^mgU9{Egu<n<hFTvO1ES&TG=
zZ$vGcTGIR}YYvg~A&Mu}QH;@#@nxJobr^VtG$x`DEgjJ1VYD5=kCJ{0y@-|iMVuM5
z=*J9(bcwYmQF{XIM$jvXcH%V+$MP12jcINnz(xzA%_X$0lrdBCe=gFxpV6G79SaYq
zHIFDi>WbJd{ppl0lYd!U^u_d}m0G(I+#%;_owSe%&^2-;M>)RnPvqJl<ds0X8AzO#
zLAxc^1DHb@zgg7mM~rF(75<W-ScHBg(W8X-R;>-BDmFidnu8X1TD2s`(Tcu+-XNpp
zoKa`BMh+lqxe>{YVbsElz(M-Cpufb!eofIxT4GWbenmWE7BekFtH}{w63^(1=rvgq
zuW9_KaEe|;G&~Q@5)aES<P7CxzFl;_hNF-=Imsx@J7vrf#AA)4L`!A-HQpuU$B1fH
zdo|Xt>buN{K4%8)#oI>5dC|=<rVAd^pFTePmQYUsMKqme$mFa>k;F{%xG3XGkY*oH
z7#y5eGbkE~HdUe4gxTy3Bw(=ggCg?+k!iDPgf@sY>Hny!AzD*IzeO#Pn8CinQ(9#f
z7?Yq<IgX)v*2;*4Ub8g<V_jv>aD0Bf|J+JwTN7VFTP!=e5H&C28~`Wim#WQ9{DMef
zbs5msL}QE$Y`@W@t#DLJj+QwJF0mc6J!;myNXzyFwjZEnh*n?G{NQ{xW^1BtQF;+=
z6ShW*zHsd77J3-=LLKdcR2Ua8Ax`DBk9D=ywNYbR)8OEfI{MO+G1H{P$z_opDYMkB
zJ(4M<Dy&R=nplTua5YpLwfJiEqHKN;`Z9+((Q7p7e39|dXcqH8&co%7tx>gBH-$j&
z8CNcVDmB++!fq6n2p8omY;Muh<?tKSn9c!lyXsM$(7vcIU0CZ-FO7GLelm9Q%WJq!
zsxM-N)P_+~j$PNr(4o54Fi!O7Qfn|gVqr*OUbbJTjUffZ)%j5EOSej|AP+EX?ODOl
zgIvAG!C*9+D@evE*Bd7-Z%%Ep5gY$txDaL;M`1Cu&<w`hVfZMya}Cq1r7ZST+25v}
zM25o1&kQ&8L}sL5UW@Ae<H5|@A>j^itJ|5IOR-YV$lbOode&K$^PSMoXx%hAvpxw^
z&!RNFqbjz?EgJ|pk$64e19t=ZkhiY~Lx+023WiSTN;JB(Wgh{Cch3YEjz9y`6&Q?`
zrO@VnYMFQoa>M3`>mTi?$Lw*mN<0|VQJ8BB@qNrYC4*w*mK=+Bu2iD(Bd)cWgWJ}T
zLBtY2Z8+49-dypQ!9kA2h$fkB36>!GDw#qao_!|B1IShKwH<PCJ!BxAn!YgFa@UJZ
z?4mk0V;{zfP2QDIwS45}U?WeGL53B@w-mfbm$qyZTLQk-^quR+Hl*X_1f)Bu{hE<m
z2k^Q1$y#Zr?nz&|Fl*X>80~_2uj*Sj2j6Ns)2-#gSXUvJnjUX+u&t&uX~YOx8*Nh1
zt;G|rAKU6Wv-ugiyTbXT^K?5-yH;R-k;qXk55E6-^>KLS^D9|3vupZ3u0bU8u~*Mr
zIdM%gh0KUK)(lFFB9bF>k3v?H_Njp#mw2)e6;*8h+?_=*!|B$ZuQK;Z2U=@K#c0iB
z{wA5*dNM=f&20K^)crJY&&0-dcz*eDrO*bs?8BAD)r9&cJjWn8Ms|BXc5mrt_hzP3
zt9Nb5)mj7f1QFiG@=3^Uv%E^J9R|*!`pP~7D+yQPVcx(u*)iuiM|ow%?)naUg<p%3
z$-KC0Aa=@KAZBv1he-XePPv*tB1W^)-WX-$b#4A}e~drkGnrPY20BB-^(8{XwP$yR
zE`9O5hL($GqkcD<TGVcai_^v8&g)*WxYVMstVLG@VYshr`XsSu$Th<`XD3_s59$AT
zo>YdHueKv@e5y!qZQ3PRTxoJ`9coXwBCn;Tm19R)pN!>7t8)KfEf!HZmK!%h8JYbo
zdv>yZ6W=P|M`0~@$6MLS%z@`TN0Ac`vtuD=2<~nxPcNqRN-Zi1SNFd5&PNo_D(4za
zDO`y~(MV#YDCZ<xE!ul5wI#YaV`nh-3GjybOR++;d5$aVQfkF|bUchbZcmXLB~^m0
z@Jv)<vKvEqc0(V2OxxQ$yt>#7SIBby!Kl*cM03+;7=3vXW2{X2-0iJIyIdO8ELSX}
z5ofaeFA3uk_fDBB^1QM^CE^dqd*j{lR6HF&%zftg0hD_ArER3`M{UoyW!q+kKR-Y8
zv3KTkFMRjvzt0?~KyPx9VemvWnqG%?K<#H7%i_(Pu-a$pJk8GH94H^0xbfWGWTkS_
zZFYBAdjV8<3QlghU#CDvc9c0U+K`=}7<{sKA}1Eyt1$nbAV0|S@T{FVr&yx=3nL3D
z8tL;q(VQl_{wAEGImNe8Fk(iZCsNv4(q3JBd@;S0s4x1z_g?h()9^TqV7?|J|5
z4*G{~{4mzKm|kv2&2wn=XS{dDx)#&#e-qaOb$>$r%c$Fe>pC(=?kaR5kL<z<yBm9H
zZIEcM+JpU$N7ZxcG5q(4dz<^PC-ek%sZS#ldRARQG~bWR@eCx~f-~VH&URbXA$1ir
zAMinW=`~z4TXMc%zb(GoyNUXv-nIJfmTPz$-1=eN&+z?&*buPXj_ry+hUYKiqqyJk
zeieHpKC$x+?=xueFaM9I8$q8l+us8EH&Hr;>-*jTzuy~-A4J^`P=0B5=?3PzJmr6c
zUN^mY)M1Xde(YV|>>0+l`>~b<jPf2XKL2{-)rC8;2Qk`Kpm}uR?|AFQHShfsh+f3o
z5%0^m58!$O?SGBh-MIXPkGzZM(coW4snPp7zWvsJ$bWlxu>sVUpQTKfPRIAxz2*1U
z=UOhKKe(}+LM^3#&6_~Yn199p4r=SS-|?<)`dka3{DSwboy&FC8`>esg)Lc(ye+;9
zBmWK8E$@}sXVK<2ct7HQ1^3&2(ZB4yx#KSjki_!0as2{w?Amct-Pw+sTTi3^+qiz`
zH{t#V)a;4B5<BM~Lg_3@&1jv$`<Q<krT@ay{0F5D{~LQ-@TG+JcQ$*Lp!sUuOBn4Z
zK>HA`GTz_y8e_f>TUnZ4crW%-)PAjD=hhGL#9Ws?sPA34?azAeG~`g))$%%eK$n+K
N;%Y=qF7SL;{tpwt-JAdb

literal 0
HcmV?d00001