summaryrefslogtreecommitdiff
path: root/src/word_splitter.hh
diff options
context:
space:
mode:
authorMaxime Coste <mawww@kakoune.org>2025-04-02 17:35:23 +1100
committerMaxime Coste <mawww@kakoune.org>2025-04-02 17:35:23 +1100
commit63efcc06d5bee7f05a1ee9539b2391c80e5d6205 (patch)
tree8ad15a23836f02421ff98796b7a2c994fe00165d /src/word_splitter.hh
parentc7d688f578c7b58989fc04e7bb1e9b5a939a5730 (diff)
Tweak ranked match behaviour to consider the number of full words
Tracking the number of query words that appear as full words in the candidate seems to fix a few cases where the existing fuzzy matching algorithm was not great. I have been running with this for a while and did not notice any annoyances. The whole RankedMatch code probably deserves more attention, but this seems to go in the right direction.
Diffstat (limited to 'src/word_splitter.hh')
-rw-r--r--src/word_splitter.hh57
1 files changed, 57 insertions, 0 deletions
diff --git a/src/word_splitter.hh b/src/word_splitter.hh
new file mode 100644
index 00000000..97af606d
--- /dev/null
+++ b/src/word_splitter.hh
@@ -0,0 +1,57 @@
+#ifndef word_splitter_hh_INCLUDED
+#define word_splitter_hh_INCLUDED
+
+#include "string.hh"
+#include "array_view.hh"
+
+namespace Kakoune
+{
+
+/// Splits a StringView into the words it contains and exposes them through
+/// begin()/end() so they can be walked with a range-based for loop.
+///
+/// A word is a maximal run of codepoints for which is_word() returns true
+/// when given m_extra_word_chars; runs spanning more than max_word_len
+/// bytes are skipped rather than reported.
+struct WordSplitter
+{
+    // Longest word, in bytes, that iteration will yield.
+    static constexpr ByteCount max_word_len = 100;
+
+    /// Forward cursor over the words of a WordSplitter's content.
+    ///
+    /// Only a pointer to the splitter is kept, so the splitter has to stay
+    /// alive for as long as the iterator is used.
+    struct Iterator
+    {
+        // Starts scanning at `begin` and immediately moves to the first
+        // word found from there (or to the end of the content if none).
+        Iterator(const char* begin, const WordSplitter& splitter)
+            : m_word_begin{begin},
+              m_word_end{begin},
+              m_splitter{&splitter}
+        {
+            ++*this;
+        }
+
+        // The current word, as a view into the splitter's content.
+        StringView operator*() const
+        {
+            return StringView{m_word_begin, m_word_end};
+        }
+
+        // Moves to the next word that fits within max_word_len. When no
+        // such word remains, both word pointers end up at the content's end.
+        Iterator& operator++()
+        {
+            const char* const content_end = m_splitter->m_content.end();
+            const auto word_chars = m_splitter->m_extra_word_chars;
+
+            // Decodes the codepoint at `pos` and classifies it. The word
+            // boundaries below rely on read_codepoint leaving `pos` just
+            // past the codepoint it consumed.
+            auto consume_word_char = [&](const char*& pos) {
+                return is_word(utf8::read_codepoint(pos, content_end), word_chars);
+            };
+
+            for (;;)
+            {
+                const char* cursor = m_word_end;
+                m_word_begin = cursor;
+
+                // Skip separators; m_word_begin trails the cursor until
+                // the first word codepoint has been consumed.
+                while (cursor != content_end)
+                {
+                    if (consume_word_char(cursor))
+                        break;
+                    m_word_begin = cursor;
+                }
+
+                // Grow the word; m_word_end stops before the first
+                // non-word codepoint even though the cursor moves past it.
+                m_word_end = cursor;
+                while (cursor != content_end)
+                {
+                    if (not consume_word_char(cursor))
+                        break;
+                    m_word_end = cursor;
+                }
+
+                const bool exhausted = (m_word_begin == content_end);
+                if (exhausted or not ((m_word_end - m_word_begin) > max_word_len))
+                    return *this;
+                // Otherwise the word is too long: resume from its end.
+            }
+        }
+
+        // Two iterators are equal when they designate the same word span
+        // of the same splitter.
+        friend bool operator==(const Iterator& lhs, const Iterator& rhs) = default;
+
+        const char* m_word_begin;
+        const char* m_word_end;
+        const WordSplitter* m_splitter;
+    };
+
+    // Text to split; yielded words are views into it.
+    StringView m_content;
+    // Extra codepoints handed to is_word() when classifying.
+    ConstArrayView<Codepoint> m_extra_word_chars;
+
+    Iterator begin() const { return Iterator{m_content.begin(), *this}; }
+    Iterator end() const { return Iterator{m_content.end(), *this}; }
+};
+
+}
+
+#endif // word_splitter_hh_INCLUDED