summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Maxime Coste <mawww@kakoune.org> 2024-12-05 18:40:55 +1100
committer: Maxime Coste <mawww@kakoune.org> 2024-12-05 18:40:55 +1100
commit: 98ed5847de20f04d0595d5ac08da06ff8aa5c1b2 (patch)
tree: 3fdfeedba8d5f7463b203ec0c2b6fa90e100dfb6 /src
parent: dac922e24c48d297de063786be3065554409cc71 (diff)
Split utf8::read_codepoint between single byte and multibyte code
Make read_codepoint_multibyte noinline so that the common single-byte case gets inlined.
Diffstat (limited to 'src')
-rw-r--r-- src/utf8.hh 28
1 files changed, 17 insertions, 11 deletions
diff --git a/src/utf8.hh b/src/utf8.hh
index 0ba8bdef..e5643214 100644
--- a/src/utf8.hh
+++ b/src/utf8.hh
@@ -39,22 +39,13 @@ struct Pass
}
-// returns the codepoint of the character whose first byte
-// is pointed by it
template<typename InvalidPolicy = utf8::InvalidPolicy::Pass,
typename Iterator, typename Sentinel>
-Codepoint read_codepoint(Iterator& it, const Sentinel& end)
+[[gnu::noinline]]
+Codepoint read_codepoint_multibyte(Iterator& it, const Sentinel& end, char byte)
noexcept(noexcept(InvalidPolicy{}(0)))
{
if (it == end)
- return InvalidPolicy{}(-1);
- // According to rfc3629, UTF-8 allows only up to 4 bytes.
- // (21 bits codepoint)
- unsigned char byte = read(it);
- if ((byte & 0x80) == 0) // 0xxxxxxx
- return byte;
-
- if (it == end)
return InvalidPolicy{}(byte);
if ((byte & 0xE0) == 0xC0) // 110xxxxx
@@ -81,6 +72,21 @@ Codepoint read_codepoint(Iterator& it, const Sentinel& end)
return InvalidPolicy{}(byte);
}
+// returns the codepoint of the character whose first byte
+// is pointed by it
+template<typename InvalidPolicy = utf8::InvalidPolicy::Pass,
+ typename Iterator, typename Sentinel>
+Codepoint read_codepoint(Iterator& it, const Sentinel& end)
+ noexcept(noexcept(InvalidPolicy{}(0)))
+{
+ if (it == end)
+ return InvalidPolicy{}(-1);
+ unsigned char byte = read(it);
+ if ((byte & 0x80) == 0) [[likely]] // 0xxxxxxx
+ return byte;
+ return read_codepoint_multibyte(it, end, byte);
+}
+
template<typename InvalidPolicy = utf8::InvalidPolicy::Pass,
typename Iterator, typename Sentinel>
Codepoint codepoint(Iterator it, const Sentinel& end)