Module: stdgo.unicode.utf8
Overview
Package utf8 implements functions and constants to support text encoded in UTF-8. It includes functions to translate between runes and UTF-8 byte sequences. See https://en.wikipedia.org/wiki/UTF-8
Index
-
function appendRune(_p:stdgo.Slice<stdgo.GoByte>, _r:stdgo.GoRune):stdgo.Slice<stdgo.GoByte>
-
function decodeLastRune(_p:stdgo.Slice<stdgo.GoByte>):{ _1:stdgo.GoInt; _0:stdgo.GoRune; }
-
function decodeLastRuneInString(_s:stdgo.GoString):{ _1:stdgo.GoInt; _0:stdgo.GoRune; }
-
function decodeRune(_p:stdgo.Slice<stdgo.GoByte>):{ _1:stdgo.GoInt; _0:stdgo.GoRune; }
-
function decodeRuneInString(_s:stdgo.GoString):{ _1:stdgo.GoInt; _0:stdgo.GoRune; }
-
function encodeRune(_p:stdgo.Slice<stdgo.GoByte>, _r:stdgo.GoRune):stdgo.GoInt
-
function runeCount(_p:stdgo.Slice<stdgo.GoByte>):stdgo.GoInt
Examples
Constants
import stdgo.unicode.utf8.Utf8
final _as:stdgo.GoUInt64 = ((240i64 : stdgo.GoUInt64))
ASCII: size 1
final _hicb:stdgo.GoUInt64 = ((191i64 : stdgo.GoUInt64))
final _locb:stdgo.GoUInt64 = ((128i64 : stdgo.GoUInt64))
The default lowest and highest continuation byte.
final _mask2:stdgo.GoUInt64 = ((31i64 : stdgo.GoUInt64))
final _mask3:stdgo.GoUInt64 = ((15i64 : stdgo.GoUInt64))
final _mask4:stdgo.GoUInt64 = ((7i64 : stdgo.GoUInt64))
final _maskx:stdgo.GoUInt64 = ((63i64 : stdgo.GoUInt64))
final _rune1Max:stdgo.GoUInt64 = ((127i64 : stdgo.GoUInt64))
final _rune2Max:stdgo.GoUInt64 = ((2047i64 : stdgo.GoUInt64))
final _rune3Max:stdgo.GoUInt64 = ((65535i64 : stdgo.GoUInt64))
final _s1:stdgo.GoUInt64 = ((2i64 : stdgo.GoUInt64))
accept 0, size 2
final _s2:stdgo.GoUInt64 = ((19i64 : stdgo.GoUInt64))
accept 1, size 3
final _s3:stdgo.GoUInt64 = ((3i64 : stdgo.GoUInt64))
accept 0, size 3
final _s4:stdgo.GoUInt64 = ((35i64 : stdgo.GoUInt64))
accept 2, size 3
final _s5:stdgo.GoUInt64 = ((52i64 : stdgo.GoUInt64))
accept 3, size 4
final _s6:stdgo.GoUInt64 = ((4i64 : stdgo.GoUInt64))
accept 0, size 4
final _s7:stdgo.GoUInt64 = ((68i64 : stdgo.GoUInt64))
accept 4, size 4
final _surrogateMax:stdgo.GoUInt64 = ((57343i64 : stdgo.GoUInt64))
Code points in the surrogate range are not valid for UTF-8.
final _surrogateMin:stdgo.GoUInt64 = ((55296i64 : stdgo.GoUInt64))
Code points in the surrogate range are not valid for UTF-8.
final _t1:stdgo.GoUInt64 = ((0i64 : stdgo.GoUInt64))
final _t2:stdgo.GoUInt64 = ((192i64 : stdgo.GoUInt64))
final _t3:stdgo.GoUInt64 = ((224i64 : stdgo.GoUInt64))
final _t4:stdgo.GoUInt64 = ((240i64 : stdgo.GoUInt64))
final _t5:stdgo.GoUInt64 = ((248i64 : stdgo.GoUInt64))
final _tx:stdgo.GoUInt64 = ((128i64 : stdgo.GoUInt64))
final _xx:stdgo.GoUInt64 = ((241i64 : stdgo.GoUInt64))
The names of these constants are chosen to give nice alignment in the table below. The first nibble is an index into acceptRanges or F for special one-byte cases. The second nibble is the Rune length or the Status for the special one-byte case.
invalid: size 1
final maxRune:stdgo.GoInt32 = ((1114111 : stdgo.GoInt32))
Numbers fundamental to the encoding.
Maximum valid Unicode code point.
final runeError:stdgo.GoInt32 = ((65533 : stdgo.GoInt32))
Numbers fundamental to the encoding.
the "error" Rune or "Unicode replacement character"
final runeSelf:stdgo.GoUInt64 = ((128i64 : stdgo.GoUInt64))
Numbers fundamental to the encoding.
characters below RuneSelf are represented as themselves in a single byte.
final utfmax:stdgo.GoUInt64 = ((4i64 : stdgo.GoUInt64))
Numbers fundamental to the encoding.
maximum number of bytes of a UTF-8 encoded Unicode character.
Variables
import stdgo.unicode.utf8.Utf8
var _acceptRanges:stdgo.GoArray<stdgo.unicode.utf8.T_acceptRange>
acceptRanges has size 16 to avoid bounds checks in the code that uses it.
var _first:stdgo.GoArray<stdgo.GoUInt8>
first is information about the first byte in a UTF-8 sequence.
Functions
import stdgo.unicode.utf8.Utf8
function _appendRuneNonASCII
function _appendRuneNonASCII(_p:stdgo.Slice<stdgo.GoByte>, _r:stdgo.GoRune):stdgo.Slice<stdgo.GoByte>
function appendRune
function appendRune(_p:stdgo.Slice<stdgo.GoByte>, _r:stdgo.GoRune):stdgo.Slice<stdgo.GoByte>
AppendRune appends the UTF-8 encoding of r to the end of p and returns the extended buffer. If the rune is out of range, it appends the encoding of RuneError.
exampleAppendRune
function exampleAppendRune():Void {
    // The rune 65536 (0x10000) is appended twice: once to a null slice and
    // once to a slice that already holds the bytes of "init".
    var _rune:stdgo.GoInt32 = (65536 : stdgo.GoInt32);
    var _fromNil = stdgo.unicode.utf8.Utf8.appendRune((null : stdgo.Slice<stdgo.GoUInt8>), _rune);
    var _seed = (("init" : stdgo.GoString) : stdgo.Slice<stdgo.GoByte>);
    var _fromSeed = stdgo.unicode.utf8.Utf8.appendRune(_seed, _rune);
    // Print both extended buffers converted back to strings.
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface((_fromNil : stdgo.GoString)));
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface((_fromSeed : stdgo.GoString)));
}
function decodeLastRune
function decodeLastRune(_p:stdgo.Slice<stdgo.GoByte>):{
_1:stdgo.GoInt;
_0:stdgo.GoRune;
}
DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, it returns (RuneError, 1). Both are impossible results for correct, non-empty UTF-8.
An encoding is invalid if it is incorrect UTF-8, encodes a rune that is out of range, or is not the shortest possible UTF-8 encoding for the value. No other validation is performed.
exampleDecodeLastRune
function exampleDecodeLastRune():Void {
    var _rest = (("Hello, 世界" : stdgo.GoString) : stdgo.Slice<stdgo.GoByte>);
    // Walk the buffer back to front: decode the trailing rune, print it with
    // its width, then drop those bytes from the end.
    while ((_rest.length) > (0 : stdgo.GoInt)) {
        var _decoded = stdgo.unicode.utf8.Utf8.decodeLastRune(_rest);
        var _rune:stdgo.GoInt32 = _decoded._0;
        var _width:stdgo.GoInt = _decoded._1;
        stdgo.fmt.Fmt.printf(("%c %v\n" : stdgo.GoString), stdgo.Go.toInterface(_rune), stdgo.Go.toInterface(_width));
        _rest = (_rest.__slice__(0, (_rest.length) - _width) : stdgo.Slice<stdgo.GoUInt8>);
    };
}
function decodeLastRuneInString
function decodeLastRuneInString(_s:stdgo.GoString):{
_1:stdgo.GoInt;
_0:stdgo.GoRune;
}
DecodeLastRuneInString is like DecodeLastRune but its input is a string. If s is empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, it returns (RuneError, 1). Both are impossible results for correct, non-empty UTF-8.
An encoding is invalid if it is incorrect UTF-8, encodes a rune that is out of range, or is not the shortest possible UTF-8 encoding for the value. No other validation is performed.
exampleDecodeLastRuneInString
function exampleDecodeLastRuneInString():Void {
    var _rest:stdgo.GoString = ("Hello, 世界" : stdgo.GoString);
    // Walk the string back to front: decode the trailing rune, print it with
    // its width, then keep only the part before it.
    while ((_rest.length) > (0 : stdgo.GoInt)) {
        var _decoded = stdgo.unicode.utf8.Utf8.decodeLastRuneInString(_rest?.__copy__());
        var _rune:stdgo.GoInt32 = _decoded._0;
        var _width:stdgo.GoInt = _decoded._1;
        stdgo.fmt.Fmt.printf(("%c %v\n" : stdgo.GoString), stdgo.Go.toInterface(_rune), stdgo.Go.toInterface(_width));
        _rest = (_rest.__slice__(0, (_rest.length) - _width) : stdgo.GoString)?.__copy__();
    };
}
function decodeRune
function decodeRune(_p:stdgo.Slice<stdgo.GoByte>):{
_1:stdgo.GoInt;
_0:stdgo.GoRune;
}
DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, it returns (RuneError, 1). Both are impossible results for correct, non-empty UTF-8.
An encoding is invalid if it is incorrect UTF-8, encodes a rune that is out of range, or is not the shortest possible UTF-8 encoding for the value. No other validation is performed.
exampleDecodeRune
function exampleDecodeRune():Void {
    var _rest = (("Hello, 世界" : stdgo.GoString) : stdgo.Slice<stdgo.GoByte>);
    // Walk the buffer front to back: decode the leading rune, print it with
    // its width, then advance past those bytes.
    while ((_rest.length) > (0 : stdgo.GoInt)) {
        var _decoded = stdgo.unicode.utf8.Utf8.decodeRune(_rest);
        var _rune:stdgo.GoInt32 = _decoded._0;
        var _width:stdgo.GoInt = _decoded._1;
        stdgo.fmt.Fmt.printf(("%c %v\n" : stdgo.GoString), stdgo.Go.toInterface(_rune), stdgo.Go.toInterface(_width));
        _rest = (_rest.__slice__(_width) : stdgo.Slice<stdgo.GoUInt8>);
    };
}
function decodeRuneInString
function decodeRuneInString(_s:stdgo.GoString):{
_1:stdgo.GoInt;
_0:stdgo.GoRune;
}
DecodeRuneInString is like DecodeRune but its input is a string. If s is empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, it returns (RuneError, 1). Both are impossible results for correct, non-empty UTF-8.
An encoding is invalid if it is incorrect UTF-8, encodes a rune that is out of range, or is not the shortest possible UTF-8 encoding for the value. No other validation is performed.
exampleDecodeRuneInString
function exampleDecodeRuneInString():Void {
    var _rest:stdgo.GoString = ("Hello, 世界" : stdgo.GoString);
    // Walk the string front to back: decode the leading rune, print it with
    // its width, then continue with the remainder.
    while ((_rest.length) > (0 : stdgo.GoInt)) {
        var _decoded = stdgo.unicode.utf8.Utf8.decodeRuneInString(_rest?.__copy__());
        var _rune:stdgo.GoInt32 = _decoded._0;
        var _width:stdgo.GoInt = _decoded._1;
        stdgo.fmt.Fmt.printf(("%c %v\n" : stdgo.GoString), stdgo.Go.toInterface(_rune), stdgo.Go.toInterface(_width));
        _rest = (_rest.__slice__(_width) : stdgo.GoString)?.__copy__();
    };
}
function encodeRune
function encodeRune(_p:stdgo.Slice<stdgo.GoByte>, _r:stdgo.GoRune):stdgo.GoInt
EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune. If the rune is out of range, it writes the encoding of RuneError. It returns the number of bytes written.
exampleEncodeRune
// Example: encode one rune into a byte buffer with encodeRune, then print the
// buffer and the value encodeRune returned.
function exampleEncodeRune():Void {
// 19990 is 0x4E16.
var _r:stdgo.GoInt32 = (19990 : stdgo.GoInt32);
// Destination buffer handed to encodeRune.
// NOTE(review): presumably a 3-element byte slice (size argument 3) — confirm against stdgo.Slice's constructor and __setNumber32__.
var _buf = new stdgo.Slice<stdgo.GoUInt8>((3 : stdgo.GoInt).toBasic(), 0).__setNumber32__();
// Per the encodeRune documentation, the result is the number of bytes written into _buf.
var _n:stdgo.GoInt = stdgo.unicode.utf8.Utf8.encodeRune(_buf, _r);
stdgo.fmt.Fmt.println(stdgo.Go.toInterface(_buf));
stdgo.fmt.Fmt.println(stdgo.Go.toInterface(_n));
}
exampleEncodeRune_outOfRange
// Example: call encodeRune on runes outside the valid range and on the
// runeError value itself, printing the resulting buffer and size for each.
function exampleEncodeRune_outOfRange():Void {
// -1 is negative, 1114112 is one past maxRune (1114111), and 65533 equals the runeError constant.
var _runes = (new stdgo.Slice<stdgo.GoInt32>(3, 3, (-1 : stdgo.GoInt32), (1114112 : stdgo.GoInt32), (65533 : stdgo.GoInt32)) : stdgo.Slice<stdgo.GoInt32>);
// Key/value iteration: _i is passed to the first %d below, _c is the rune to encode.
for (_i => _c in _runes) {
// A new buffer is created for every rune.
// NOTE(review): presumably a 3-element byte slice — confirm against stdgo.Slice's constructor.
var _buf = new stdgo.Slice<stdgo.GoUInt8>((3 : stdgo.GoInt).toBasic(), 0).__setNumber32__();
var _size:stdgo.GoInt = stdgo.unicode.utf8.Utf8.encodeRune(_buf, _c);
// Under Go fmt's explicit argument index rules, %[2]s formats _buf a second time (as a string) and the final %d then takes _size.
stdgo.fmt.Fmt.printf(("%d: %d %[2]s %d\n" : stdgo.GoString), stdgo.Go.toInterface(_i), stdgo.Go.toInterface(_buf), stdgo.Go.toInterface(_size));
};
}
function fullRune
function fullRune(_p:stdgo.Slice<stdgo.GoByte>):Bool
FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune. An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
exampleFullRune
function exampleFullRune():Void {
    // Three bytes: 228, 184, 150.
    var _bytes = (new stdgo.Slice<stdgo.GoUInt8>(3, 3, (228 : stdgo.GoUInt8), (184 : stdgo.GoUInt8), (150 : stdgo.GoUInt8)) : stdgo.Slice<stdgo.GoUInt8>);
    // First check the whole buffer...
    var _whole = stdgo.unicode.utf8.Utf8.fullRune(_bytes);
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface(_whole));
    // ...then only its first two bytes.
    var _head = (_bytes.__slice__(0, (2 : stdgo.GoInt)) : stdgo.Slice<stdgo.GoUInt8>);
    var _partial = stdgo.unicode.utf8.Utf8.fullRune(_head);
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface(_partial));
}
function fullRuneInString
function fullRuneInString(_s:stdgo.GoString):Bool
FullRuneInString is like FullRune but its input is a string.
exampleFullRuneInString
function exampleFullRuneInString():Void {
    var _text:stdgo.GoString = ("世" : stdgo.GoString);
    // First check the whole string...
    var _whole = stdgo.unicode.utf8.Utf8.fullRuneInString(_text?.__copy__());
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface(_whole));
    // ...then only the part before index 2.
    var _partial = stdgo.unicode.utf8.Utf8.fullRuneInString((_text.__slice__(0, (2 : stdgo.GoInt)) : stdgo.GoString)?.__copy__());
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface(_partial));
}
function runeCount
function runeCount(_p:stdgo.Slice<stdgo.GoByte>):stdgo.GoInt
RuneCount returns the number of runes in p. Erroneous and short encodings are treated as single runes of width 1 byte.
exampleRuneCount
function exampleRuneCount():Void {
    var _data = (("Hello, 世界" : stdgo.GoString) : stdgo.Slice<stdgo.GoByte>);
    // Report the slice length first, then the rune count of the same slice.
    var _byteTotal = (_data.length);
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface(("bytes =" : stdgo.GoString)), stdgo.Go.toInterface(_byteTotal));
    var _runeTotal = stdgo.unicode.utf8.Utf8.runeCount(_data);
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface(("runes =" : stdgo.GoString)), stdgo.Go.toInterface(_runeTotal));
}
function runeCountInString
function runeCountInString(_s:stdgo.GoString):stdgo.GoInt
RuneCountInString is like RuneCount but its input is a string.
exampleRuneCountInString
function exampleRuneCountInString():Void {
    var _text:stdgo.GoString = ("Hello, 世界" : stdgo.GoString);
    // Report the string length first, then the rune count of the same string.
    var _byteTotal = (_text.length);
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface(("bytes =" : stdgo.GoString)), stdgo.Go.toInterface(_byteTotal));
    var _runeTotal = stdgo.unicode.utf8.Utf8.runeCountInString(_text?.__copy__());
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface(("runes =" : stdgo.GoString)), stdgo.Go.toInterface(_runeTotal));
}
function runeLen
function runeLen(_r:stdgo.GoRune):stdgo.GoInt
RuneLen returns the number of bytes required to encode the rune. It returns -1 if the rune is not a valid value to encode in UTF-8.
exampleRuneLen
function exampleRuneLen():Void {
    // 97 is at most _rune1Max (127).
    var _lenSmall = stdgo.unicode.utf8.Utf8.runeLen((97 : stdgo.GoInt32));
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface(_lenSmall));
    // 30028 is above _rune2Max (2047) and at most _rune3Max (65535).
    var _lenLarge = stdgo.unicode.utf8.Utf8.runeLen((30028 : stdgo.GoInt32));
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface(_lenLarge));
}
function runeStart
function runeStart(_b:stdgo.GoByte):Bool
RuneStart reports whether the byte could be the first byte of an encoded, possibly invalid rune. Second and subsequent bytes always have the top two bits set to 10.
exampleRuneStart
function exampleRuneStart():Void {
    var _bytes = (("a界" : stdgo.GoString) : stdgo.Slice<stdgo.GoByte>);
    // Probe the bytes at indexes 0, 1 and 2 in turn and print each answer.
    var _at0 = stdgo.unicode.utf8.Utf8.runeStart(_bytes[(0 : stdgo.GoInt)]);
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface(_at0));
    var _at1 = stdgo.unicode.utf8.Utf8.runeStart(_bytes[(1 : stdgo.GoInt)]);
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface(_at1));
    var _at2 = stdgo.unicode.utf8.Utf8.runeStart(_bytes[(2 : stdgo.GoInt)]);
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface(_at2));
}
function valid
function valid(_p:stdgo.Slice<stdgo.GoByte>):Bool
Valid reports whether p consists entirely of valid UTF-8-encoded runes.
exampleValid
function exampleValid():Void {
    // One input built from a string literal, one from the raw bytes 255, 254, 253.
    var _fromText = (("Hello, 世界" : stdgo.GoString) : stdgo.Slice<stdgo.GoByte>);
    var _rawBytes = (new stdgo.Slice<stdgo.GoUInt8>(3, 3, (255 : stdgo.GoUInt8), (254 : stdgo.GoUInt8), (253 : stdgo.GoUInt8)) : stdgo.Slice<stdgo.GoUInt8>);
    // Print the verdict for each input, text-derived slice first.
    var _textOk = stdgo.unicode.utf8.Utf8.valid(_fromText);
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface(_textOk));
    var _rawOk = stdgo.unicode.utf8.Utf8.valid(_rawBytes);
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface(_rawOk));
}
function validRune
function validRune(_r:stdgo.GoRune):Bool
ValidRune reports whether r can be legally encoded as UTF-8. Code points that are out of range or a surrogate half are illegal.
exampleValidRune
function exampleValidRune():Void {
    // 97 is within range; 268435455 (0xFFFFFFF) is above maxRune (1114111).
    var _inRange:stdgo.GoInt32 = (97 : stdgo.GoInt32);
    var _tooLarge:stdgo.GoInt32 = ((268435455 : stdgo.GoInt32) : stdgo.GoRune);
    // Print the verdict for each rune, in-range value first.
    var _inRangeOk = stdgo.unicode.utf8.Utf8.validRune(_inRange);
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface(_inRangeOk));
    var _tooLargeOk = stdgo.unicode.utf8.Utf8.validRune(_tooLarge);
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface(_tooLargeOk));
}
function validString
function validString(_s:stdgo.GoString):Bool
ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
exampleValidString
function exampleValidString():Void {
    // One string from a literal, one converted from the raw bytes 255, 254, 253.
    var _fromText:stdgo.GoString = ("Hello, 世界" : stdgo.GoString);
    var _fromRaw:stdgo.GoString = ((new stdgo.Slice<stdgo.GoUInt8>(3, 3, (255 : stdgo.GoUInt8), (254 : stdgo.GoUInt8), (253 : stdgo.GoUInt8)) : stdgo.Slice<stdgo.GoUInt8>) : stdgo.GoString);
    // Print the verdict for each string, literal first.
    var _textOk = stdgo.unicode.utf8.Utf8.validString(_fromText?.__copy__());
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface(_textOk));
    var _rawOk = stdgo.unicode.utf8.Utf8.validString(_fromRaw?.__copy__());
    stdgo.fmt.Fmt.println(stdgo.Go.toInterface(_rawOk));
}