diff options
| author | Maxime Coste <mawww@kakoune.org> | 2017-04-20 16:18:49 +0100 |
|---|---|---|
| committer | Maxime Coste <mawww@kakoune.org> | 2017-04-20 16:18:49 +0100 |
| commit | dbcddafbfdc8808e8823812b1a5c40d4aedcdf90 (patch) | |
| tree | f849d94304e2fbc064741bdaf3d45f93fd89da67 /src | |
| parent | 30e6387071b6aee2239d155822091dc834090b7f (diff) | |
Change utf8::to_next/to_previous so that they are more symmetrical
The previous implementation could yield different positions when
iterating forward and backward, leading to confusion in boost regex.
This makes an existing problem a bit more visible: iterating with
to_next and with read_codepoint won't behave the same way, as
read_codepoint will put the iterator onto the byte following the
utf8 codepoint, whereas to_next will put it on the next utf8
character start byte, which might be different if the buffer content
is not valid utf8.
Fixes #1195
Diffstat (limited to 'src')
| -rw-r--r-- | src/utf8.hh | 31 |
1 files changed, 14 insertions, 17 deletions
diff --git a/src/utf8.hh b/src/utf8.hh index bbf34ae3..34cecc81 100644 --- a/src/utf8.hh +++ b/src/utf8.hh @@ -51,7 +51,7 @@ Codepoint read_codepoint(Iterator& it, const Iterator& end) // According to rfc3629, UTF-8 allows only up to 4 bytes. // (21 bits codepoint) unsigned char byte = read(it); - if (not (byte & 0x80)) // 0xxxxxxx + if ((byte & 0x80) == 0) // 0xxxxxxx return byte; if (it == end) @@ -91,7 +91,7 @@ Codepoint codepoint(Iterator it, const Iterator& end) template<typename InvalidPolicy = utf8::InvalidPolicy::Pass> ByteCount codepoint_size(char byte) { - if (not (byte & 0x80)) // 0xxxxxxx + if ((byte & 0x80) == 0) // 0xxxxxxx return 1; else if ((byte & 0xE0) == 0xC0) // 110xxxxx return 2; @@ -125,9 +125,10 @@ inline ByteCount codepoint_size(Codepoint cp) template<typename Iterator> void to_next(Iterator& it, const Iterator& end) { - if (it != end and read(it) & 0x80) - while (it != end and (*(it) & 0xC0) == 0x80) - ++it; + if (it != end) + ++it; + while (it != end and not is_character_start(*it)) + ++it; } // returns an iterator to next character first byte @@ -151,8 +152,10 @@ Iterator finish(Iterator it, const Iterator& end) template<typename Iterator> void to_previous(Iterator& it, const Iterator& begin) { - while (it != begin and (*(--it) & 0xC0) == 0x80) - ; + if (it != begin) + --it; + while (not is_character_start(*it)) + --it; } // returns an iterator to the previous character first byte template<typename Iterator> @@ -173,19 +176,13 @@ Iterator advance(Iterator it, const Iterator& end, CharCount d) if (d < 0) { - while (it != end and d != 0) - { - if (is_character_start(*--it)) - ++d; - } + while (it != end and d++ != 0) + to_previous(it, end); } else if (d > 0) { - while (it != end and d != 0) - { - if (is_character_start(*++it)) - --d; - } + while (it != end and d-- != 0) + to_next(it, end); } return it; } |
