summaryrefslogtreecommitdiff
path: root/src/utf8.hh
diff options
context:
space:
mode:
authorMaxime Coste <mawww@kakoune.org>2017-04-20 16:18:49 +0100
committerMaxime Coste <mawww@kakoune.org>2017-04-20 16:18:49 +0100
commitdbcddafbfdc8808e8823812b1a5c40d4aedcdf90 (patch)
treef849d94304e2fbc064741bdaf3d45f93fd89da67 /src/utf8.hh
parent30e6387071b6aee2239d155822091dc834090b7f (diff)
Change utf8::to_next/to_previous so that they are more symmetrical
The previous implementation could yield different positions when iterating forward and backward, leading to confusion in boost regex. This makes an existing problem a bit more visible: iterating with to_next and with read_codepoint won't behave the same way, as read_codepoint will put the iterator onto the byte following the utf8 codepoint, whereas to_next will put it on the next utf8 character start byte, which might be different if the buffer content is not valid utf8. Fixes #1195
Diffstat (limited to 'src/utf8.hh')
-rw-r--r--src/utf8.hh31
1 files changed, 14 insertions, 17 deletions
diff --git a/src/utf8.hh b/src/utf8.hh
index bbf34ae3..34cecc81 100644
--- a/src/utf8.hh
+++ b/src/utf8.hh
@@ -51,7 +51,7 @@ Codepoint read_codepoint(Iterator& it, const Iterator& end)
// According to rfc3629, UTF-8 allows only up to 4 bytes.
// (21 bits codepoint)
unsigned char byte = read(it);
- if (not (byte & 0x80)) // 0xxxxxxx
+ if ((byte & 0x80) == 0) // 0xxxxxxx
return byte;
if (it == end)
@@ -91,7 +91,7 @@ Codepoint codepoint(Iterator it, const Iterator& end)
template<typename InvalidPolicy = utf8::InvalidPolicy::Pass>
ByteCount codepoint_size(char byte)
{
- if (not (byte & 0x80)) // 0xxxxxxx
+ if ((byte & 0x80) == 0) // 0xxxxxxx
return 1;
else if ((byte & 0xE0) == 0xC0) // 110xxxxx
return 2;
@@ -125,9 +125,10 @@ inline ByteCount codepoint_size(Codepoint cp)
template<typename Iterator>
void to_next(Iterator& it, const Iterator& end)
{
- if (it != end and read(it) & 0x80)
- while (it != end and (*(it) & 0xC0) == 0x80)
- ++it;
+ if (it != end)
+ ++it;
+ while (it != end and not is_character_start(*it))
+ ++it;
}
// returns an iterator to next character first byte
@@ -151,8 +152,10 @@ Iterator finish(Iterator it, const Iterator& end)
template<typename Iterator>
void to_previous(Iterator& it, const Iterator& begin)
{
- while (it != begin and (*(--it) & 0xC0) == 0x80)
- ;
+ if (it != begin)
+ --it;
+ while (not is_character_start(*it))
+ --it;
}
// returns an iterator to the previous character first byte
template<typename Iterator>
@@ -173,19 +176,13 @@ Iterator advance(Iterator it, const Iterator& end, CharCount d)
if (d < 0)
{
- while (it != end and d != 0)
- {
- if (is_character_start(*--it))
- ++d;
- }
+ while (it != end and d++ != 0)
+ to_previous(it, end);
}
else if (d > 0)
{
- while (it != end and d != 0)
- {
- if (is_character_start(*++it))
- --d;
- }
+ while (it != end and d-- != 0)
+ to_next(it, end);
}
return it;
}