diff options
| author | Maxime Coste <mawww@kakoune.org> | 2023-10-25 12:52:14 +1100 |
|---|---|---|
| committer | Maxime Coste <mawww@kakoune.org> | 2023-10-25 12:52:14 +1100 |
| commit | be33dee211f08570072ae4ef5cea951aa4aa44ef (patch) | |
| tree | 14a87d499e2bfcf2e2cb12ab04f64b5169dff899 /src/word_db.cc | |
| parent | b33b673f106ca938681356dca159634eec01d570 (diff) | |
Speed up WordSplitter
Decode each UTF-8 codepoint only once instead of twice, and limit
word length by its size in bytes rather than by its number of codepoints.
Diffstat (limited to 'src/word_db.cc')
| -rw-r--r-- | src/word_db.cc | 29 |
1 file changed, 10 insertions, 19 deletions
```diff
diff --git a/src/word_db.cc b/src/word_db.cc
index 56b7fedf..2ea35513 100644
--- a/src/word_db.cc
+++ b/src/word_db.cc
@@ -34,30 +34,21 @@ struct WordSplitter
             const auto* end = m_splitter->m_content.end();
             auto extra_chars = m_splitter->m_extra_word_chars;
 
-            while (true)
+            do
             {
-                m_word_begin = m_word_end;
-                while (m_word_begin != end and not is_word(utf8::codepoint(m_word_begin, end), extra_chars))
-                    utf8::to_next(m_word_begin, end);
-                m_word_end = m_word_begin;
-                CharCount word_len = 0;
-                while (m_word_end != end and is_word(utf8::codepoint(m_word_end, end), extra_chars))
-                {
-                    utf8::to_next(m_word_end, end);
-                    ++word_len;
-                }
-                if (m_word_begin == end or word_len < WordDB::max_word_len)
-                    break;
-            }
+                auto it = m_word_begin = m_word_end;
+                while (it != end and not is_word(utf8::read_codepoint(it, end), extra_chars))
+                    m_word_begin = it;
+
+                m_word_end = it;
+                while (it != end and is_word(utf8::read_codepoint(it, end), extra_chars))
+                    m_word_end = it;
+            } while (m_word_begin != end and (m_word_end - m_word_begin) > WordDB::max_word_len);
 
             return *this;
         }
 
-        friend bool operator==(const Iterator& lhs, const Iterator& rhs)
-        { return lhs.m_word_begin == rhs.m_word_begin and lhs.m_word_end == rhs.m_word_end; }
-
-        friend bool operator!=(const Iterator& lhs, const Iterator& rhs)
-        { return not (lhs == rhs); }
+        friend bool operator==(const Iterator& lhs, const Iterator& rhs) = default;
 
         const char* m_word_begin;
         const char* m_word_end;
```
