utf8: use end of sequence iterators for more security

author: Maxime Coste <frrrwww@gmail.com> 2014-07-02 21:14:01 +0100
committer: Maxime Coste <frrrwww@gmail.com> 2014-07-05 12:10:06 +0100
commit: ed68d1ff287d43c5293abb4d41e908aa8e50afec (patch)
tree: b5ab042ba67259a63b2aa0ae99ad74a5bc646a7f /src/utf8.hh
parent: 3f70d91f8c716ef2dbc76abb9c878f86ecb946f7 (diff)
1 file changed, 42 insertions, 35 deletions
diff --git a/src/utf8.hh b/src/utf8.hh
index 60460a7a..116c225e 100644
--- a/src/utf8.hh
+++ b/src/utf8.hh
@@ -15,10 +15,10 @@ namespace utf8
 
 // returns an iterator to next character first byte
 template<typename Iterator>
-Iterator next(Iterator it)
+Iterator next(Iterator it, Iterator end)
 {
-    if (*it++ & 0x80)
-        while ((*(it) & 0xC0) == 0x80)
+    if (it != end and *it++ & 0x80)
+        while (it != end and (*(it) & 0xC0) == 0x80)
             ++it;
     return it;
 }
@@ -26,18 +26,18 @@ Iterator next(Iterator it)
 // returns it's parameter if it points to a character first byte,
 // or else returns next character first byte
 template<typename Iterator>
-Iterator finish(Iterator it)
+Iterator finish(Iterator it, Iterator end)
 {
-    while ((*(it) & 0xC0) == 0x80)
+    while (it != end and (*(it) & 0xC0) == 0x80)
         ++it;
     return it;
 }
 
 // returns an iterator to the previous character first byte
 template<typename Iterator>
-Iterator previous(Iterator it)
+Iterator previous(Iterator it, Iterator begin)
 {
-    while ((*(--it) & 0xC0) == 0x80)
+    while (it != begin and (*(--it) & 0xC0) == 0x80)
            ;
     return it;
 }
@@ -51,12 +51,12 @@ Iterator advance(Iterator it, Iterator end, CharCount d)
     if (d < 0)
     {
        while (it != end and d++)
-           it = utf8::previous(it);
+           it = utf8::previous(it, end);
     }
     else
     {
         while (it != end and d--)
-           it = utf8::next(it);
+           it = utf8::next(it, end);
     }
     return it;
 }
@@ -83,65 +83,72 @@ inline bool is_character_start(char c)
 
 // returns an iterator to the first byte of the character it is into
 template<typename Iterator>
-Iterator character_start(Iterator it)
+Iterator character_start(Iterator it, Iterator begin)
 {
-    while (not is_character_start(*it))
+    while (it != begin and not is_character_start(*it))
         --it;
     return it;
 }
 
-namespace InvalidBytePolicy
+namespace InvalidPolicy
 {
 
 struct Assert
 {
-    Codepoint operator()(unsigned char byte) const { kak_assert(false); return byte; }
+    Codepoint operator()(Codepoint cp) const { kak_assert(false); return cp; }
 };
 
 struct Pass
 {
-    Codepoint operator()(unsigned char byte) const { return byte; }
+    Codepoint operator()(Codepoint cp) const { return cp; }
 };
 
 }
 
 // returns the codepoint of the character whose first byte
 // is pointed by it
-template<typename InvalidPolicy = InvalidBytePolicy::Assert,
+template<typename InvalidPolicy = utf8::InvalidPolicy::Assert,
          typename Iterator>
-Codepoint codepoint(Iterator it)
+Codepoint codepoint(Iterator it, Iterator end)
 {
+    if (it == end)
+        return InvalidPolicy{}(-1);
     // According to rfc3629, UTF-8 allows only up to 4 bytes.
     // (21 bits codepoint)
-    Codepoint cp;
     unsigned char byte = *it++;
     if (not (byte & 0x80)) // 0xxxxxxx
-        cp = byte;
-    else if ((byte & 0xE0) == 0xC0) // 110xxxxx
-    {
-        cp = ((byte & 0x1F) << 6) | (*it & 0x3F);
-    }
-    else if ((byte & 0xF0) == 0xE0) // 1110xxxx
+        return byte;
+
+    if (it == end)
+        return InvalidPolicy{}(byte);
+
+    if ((byte & 0xE0) == 0xC0) // 110xxxxx
+        return ((byte & 0x1F) << 6) | (*it & 0x3F);
+
+    if ((byte & 0xF0) == 0xE0) // 1110xxxx
     {
-        cp = ((byte & 0x0F) << 12) | ((*it++ & 0x3F) << 6);
-        cp |= (*it & 0x3F);
+        Codepoint cp = ((byte & 0x0F) << 12) | ((*it++ & 0x3F) << 6);
+        if (it == end)
+            return InvalidPolicy{}(cp);
+        return cp | (*it & 0x3F);
     }
-    else if ((byte & 0xF8) == 0xF0) // 11110xxx
+
+    if ((byte & 0xF8) == 0xF0) // 11110xxx
     {
-        cp = ((byte & 0x0F) << 18) | ((*it++ & 0x3F) << 12);
+        Codepoint cp = ((byte & 0x0F) << 18) | ((*it++ & 0x3F) << 12);
+        if (it == end)
+            return InvalidPolicy{}(cp);
         cp |= (*it++ & 0x3F) << 6;
-        cp |= (*it & 0x3F);
+        if (it == end)
+            return InvalidPolicy{}(cp);
+        return cp | (*it & 0x3F);
     }
-    else
-        cp = InvalidPolicy{}(byte);
-    return cp;
+    return InvalidPolicy{}(byte);
 }
 
-template<typename InvalidPolicy = InvalidBytePolicy::Assert,
-         typename Iterator>
-ByteCount codepoint_size(Iterator it)
+template<typename InvalidPolicy = utf8::InvalidPolicy::Assert>
+ByteCount codepoint_size(char byte)
 {
-    unsigned char byte = *it;
     if (not (byte & 0x80)) // 0xxxxxxx
         return 1;
     else if ((byte & 0xE0) == 0xC0) // 110xxxxx
author	Maxime Coste <frrrwww@gmail.com>	2014-07-02 21:14:01 +0100
committer	Maxime Coste <frrrwww@gmail.com>	2014-07-05 12:10:06 +0100
commit	ed68d1ff287d43c5293abb4d41e908aa8e50afec (patch)
tree	b5ab042ba67259a63b2aa0ae99ad74a5bc646a7f /src/utf8.hh
parent	3f70d91f8c716ef2dbc76abb9c878f86ecb946f7 (diff)