Change utf8::to_next/to_previous so that they are more symmetrical

The previous implementation could yield different positions when iterating forward and backward, leading to confusion in boost regex. This makes an existing problem a bit more visible: iterating with to_next and with read_codepoint won't behave the same way, as read_codepoint will put the iterator onto the byte following the utf8 codepoint, whereas to_next will put it on the next utf8 character start byte, which might be different if the buffer content is not valid utf8. Fixes #1195
author: Maxime Coste <mawww@kakoune.org> 2017-04-20 16:18:49 +0100
committer: Maxime Coste <mawww@kakoune.org> 2017-04-20 16:18:49 +0100
commit: dbcddafbfdc8808e8823812b1a5c40d4aedcdf90 (patch)
tree: f849d94304e2fbc064741bdaf3d45f93fd89da67 /src/utf8.hh
parent: 30e6387071b6aee2239d155822091dc834090b7f (diff)
1 files changed, 14 insertions, 17 deletions
diff --git a/src/utf8.hh b/src/utf8.hh
index bbf34ae3..34cecc81 100644
--- a/src/utf8.hh
+++ b/src/utf8.hh
@@ -51,7 +51,7 @@ Codepoint read_codepoint(Iterator& it, const Iterator& end)
     // According to rfc3629, UTF-8 allows only up to 4 bytes.
     // (21 bits codepoint)
     unsigned char byte = read(it);
-    if (not (byte & 0x80)) // 0xxxxxxx
+    if ((byte & 0x80) == 0) // 0xxxxxxx
         return byte;
 
     if (it == end)
@@ -91,7 +91,7 @@ Codepoint codepoint(Iterator it, const Iterator& end)
 template<typename InvalidPolicy = utf8::InvalidPolicy::Pass>
 ByteCount codepoint_size(char byte)
 {
-    if (not (byte & 0x80)) // 0xxxxxxx
+    if ((byte & 0x80) == 0) // 0xxxxxxx
         return 1;
     else if ((byte & 0xE0) == 0xC0) // 110xxxxx
         return 2;
@@ -125,9 +125,10 @@ inline ByteCount codepoint_size(Codepoint cp)
 template<typename Iterator>
 void to_next(Iterator& it, const Iterator& end)
 {
-    if (it != end and read(it) & 0x80)
-        while (it != end and (*(it) & 0xC0) == 0x80)
-            ++it;
+    if (it != end)
+        ++it;
+    while (it != end and not is_character_start(*it))
+        ++it;
 }
 
 // returns an iterator to next character first byte
@@ -151,8 +152,10 @@ Iterator finish(Iterator it, const Iterator& end)
 template<typename Iterator>
 void to_previous(Iterator& it, const Iterator& begin)
 {
-    while (it != begin and (*(--it) & 0xC0) == 0x80)
-           ;
+    if (it != begin)
+        --it;
+    while (not is_character_start(*it))
+        --it;
 }
 // returns an iterator to the previous character first byte
 template<typename Iterator>
@@ -173,19 +176,13 @@ Iterator advance(Iterator it, const Iterator& end, CharCount d)
 
     if (d < 0)
     {
-        while (it != end and d != 0)
-        {
-            if (is_character_start(*--it))
-                ++d;
-        }
+        while (it != end and d++ != 0)
+            to_previous(it, end);
     }
     else if (d > 0)
     {
-        while (it != end and d != 0)
-        {
-            if (is_character_start(*++it))
-                --d;
-        }
+        while (it != end and d-- != 0)
+            to_next(it, end);
     }
     return it;
 }
author	Maxime Coste <mawww@kakoune.org>	2017-04-20 16:18:49 +0100
committer	Maxime Coste <mawww@kakoune.org>	2017-04-20 16:18:49 +0100
commit	dbcddafbfdc8808e8823812b1a5c40d4aedcdf90 (patch)
tree	f849d94304e2fbc064741bdaf3d45f93fd89da67 /src/utf8.hh
parent	30e6387071b6aee2239d155822091dc834090b7f (diff)