summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Maxime Coste <mawww@kakoune.org> 2023-10-25 12:52:14 +1100
committer: Maxime Coste <mawww@kakoune.org> 2023-10-25 12:52:14 +1100
commit: be33dee211f08570072ae4ef5cea951aa4aa44ef (patch)
tree: 14a87d499e2bfcf2e2cb12ab04f64b5169dff899
parent: b33b673f106ca938681356dca159634eec01d570 (diff)
Speed up WordSplitter
Decode each UTF-8 codepoint only once instead of twice, and limit word length in bytes rather than in codepoints.
-rw-r--r-- src/word_db.cc 29
-rw-r--r-- src/word_db.hh 2
2 files changed, 11 insertions, 20 deletions
diff --git a/src/word_db.cc b/src/word_db.cc
index 56b7fedf..2ea35513 100644
--- a/src/word_db.cc
+++ b/src/word_db.cc
@@ -34,30 +34,21 @@ struct WordSplitter
const auto* end = m_splitter->m_content.end();
auto extra_chars = m_splitter->m_extra_word_chars;
- while (true)
+ do
{
- m_word_begin = m_word_end;
- while (m_word_begin != end and not is_word(utf8::codepoint(m_word_begin, end), extra_chars))
- utf8::to_next(m_word_begin, end);
- m_word_end = m_word_begin;
- CharCount word_len = 0;
- while (m_word_end != end and is_word(utf8::codepoint(m_word_end, end), extra_chars))
- {
- utf8::to_next(m_word_end, end);
- ++word_len;
- }
- if (m_word_begin == end or word_len < WordDB::max_word_len)
- break;
- }
+ auto it = m_word_begin = m_word_end;
+ while (it != end and not is_word(utf8::read_codepoint(it, end), extra_chars))
+ m_word_begin = it;
+
+ m_word_end = it;
+ while (it != end and is_word(utf8::read_codepoint(it, end), extra_chars))
+ m_word_end = it;
+ } while (m_word_begin != end and (m_word_end - m_word_begin) > WordDB::max_word_len);
return *this;
}
- friend bool operator==(const Iterator& lhs, const Iterator& rhs)
- { return lhs.m_word_begin == rhs.m_word_begin and lhs.m_word_end == rhs.m_word_end; }
-
- friend bool operator!=(const Iterator& lhs, const Iterator& rhs)
- { return not (lhs == rhs); }
+ friend bool operator==(const Iterator& lhs, const Iterator& rhs) = default;
const char* m_word_begin;
const char* m_word_end;
diff --git a/src/word_db.hh b/src/word_db.hh
index ac7b8c01..dcb46653 100644
--- a/src/word_db.hh
+++ b/src/word_db.hh
@@ -18,7 +18,7 @@ class Buffer;
class WordDB : public OptionManagerWatcher
{
public:
- static constexpr CharCount max_word_len = 50;
+ static constexpr ByteCount max_word_len = 50;
WordDB(const Buffer& buffer);
~WordDB();