diff options
| author | Maxime Coste <frrrwww@gmail.com> | 2014-07-02 21:14:01 +0100 |
|---|---|---|
| committer | Maxime Coste <frrrwww@gmail.com> | 2014-07-05 12:10:06 +0100 |
| commit | ed68d1ff287d43c5293abb4d41e908aa8e50afec (patch) | |
| tree | b5ab042ba67259a63b2aa0ae99ad74a5bc646a7f /src/utf8.hh | |
| parent | 3f70d91f8c716ef2dbc76abb9c878f86ecb946f7 (diff) | |
utf8: use end of sequence iterators for more security
Diffstat (limited to 'src/utf8.hh')
| -rw-r--r-- | src/utf8.hh | 77 |
1 files changed, 42 insertions, 35 deletions
diff --git a/src/utf8.hh b/src/utf8.hh index 60460a7a..116c225e 100644 --- a/src/utf8.hh +++ b/src/utf8.hh @@ -15,10 +15,10 @@ namespace utf8 // returns an iterator to next character first byte template<typename Iterator> -Iterator next(Iterator it) +Iterator next(Iterator it, Iterator end) { - if (*it++ & 0x80) - while ((*(it) & 0xC0) == 0x80) + if (it != end and *it++ & 0x80) + while (it != end and (*(it) & 0xC0) == 0x80) ++it; return it; } @@ -26,18 +26,18 @@ Iterator next(Iterator it) // returns it's parameter if it points to a character first byte, // or else returns next character first byte template<typename Iterator> -Iterator finish(Iterator it) +Iterator finish(Iterator it, Iterator end) { - while ((*(it) & 0xC0) == 0x80) + while (it != end and (*(it) & 0xC0) == 0x80) ++it; return it; } // returns an iterator to the previous character first byte template<typename Iterator> -Iterator previous(Iterator it) +Iterator previous(Iterator it, Iterator begin) { - while ((*(--it) & 0xC0) == 0x80) + while (it != begin and (*(--it) & 0xC0) == 0x80) ; return it; } @@ -51,12 +51,12 @@ Iterator advance(Iterator it, Iterator end, CharCount d) if (d < 0) { while (it != end and d++) - it = utf8::previous(it); + it = utf8::previous(it, end); } else { while (it != end and d--) - it = utf8::next(it); + it = utf8::next(it, end); } return it; } @@ -83,65 +83,72 @@ inline bool is_character_start(char c) // returns an iterator to the first byte of the character it is into template<typename Iterator> -Iterator character_start(Iterator it) +Iterator character_start(Iterator it, Iterator begin) { - while (not is_character_start(*it)) + while (it != begin and not is_character_start(*it)) --it; return it; } -namespace InvalidBytePolicy +namespace InvalidPolicy { struct Assert { - Codepoint operator()(unsigned char byte) const { kak_assert(false); return byte; } + Codepoint operator()(Codepoint cp) const { kak_assert(false); return cp; } }; struct Pass { - Codepoint operator()(unsigned char byte) const { return byte; } + Codepoint operator()(Codepoint cp) const { return cp; } }; } // returns the codepoint of the character whose first byte // is pointed by it -template<typename InvalidPolicy = InvalidBytePolicy::Assert, +template<typename InvalidPolicy = utf8::InvalidPolicy::Assert, typename Iterator> -Codepoint codepoint(Iterator it) +Codepoint codepoint(Iterator it, Iterator end) { + if (it == end) + return InvalidPolicy{}(-1); // According to rfc3629, UTF-8 allows only up to 4 bytes. // (21 bits codepoint) - Codepoint cp; unsigned char byte = *it++; if (not (byte & 0x80)) // 0xxxxxxx - cp = byte; - else if ((byte & 0xE0) == 0xC0) // 110xxxxx - { - cp = ((byte & 0x1F) << 6) | (*it & 0x3F); - } - else if ((byte & 0xF0) == 0xE0) // 1110xxxx + return byte; + + if (it == end) + return InvalidPolicy{}(byte); + + if ((byte & 0xE0) == 0xC0) // 110xxxxx + return ((byte & 0x1F) << 6) | (*it & 0x3F); + + if ((byte & 0xF0) == 0xE0) // 1110xxxx { - cp = ((byte & 0x0F) << 12) | ((*it++ & 0x3F) << 6); - cp |= (*it & 0x3F); + Codepoint cp = ((byte & 0x0F) << 12) | ((*it++ & 0x3F) << 6); + if (it == end) + return InvalidPolicy{}(cp); + return cp | (*it & 0x3F); } - else if ((byte & 0xF8) == 0xF0) // 11110xxx + + if ((byte & 0xF8) == 0xF0) // 11110xxx { - cp = ((byte & 0x0F) << 18) | ((*it++ & 0x3F) << 12); + Codepoint cp = ((byte & 0x0F) << 18) | ((*it++ & 0x3F) << 12); + if (it == end) + return InvalidPolicy{}(cp); cp |= (*it++ & 0x3F) << 6; - cp |= (*it & 0x3F); + if (it == end) + return InvalidPolicy{}(cp); + return cp | (*it & 0x3F); } - else - cp = InvalidPolicy{}(byte); - return cp; + return InvalidPolicy{}(byte); } -template<typename InvalidPolicy = InvalidBytePolicy::Assert, - typename Iterator> -ByteCount codepoint_size(Iterator it) +template<typename InvalidPolicy = utf8::InvalidPolicy::Assert> +ByteCount codepoint_size(char byte) { - unsigned char byte = *it; if (not (byte & 0x80)) // 0xxxxxxx return 1; else if ((byte & 0xE0) == 0xC0) // 110xxxxx |
