Tweak ranked match behaviour to consider the number of full words

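Tracking the number of query words that appear as full words in the candidate seems to fix a few cases where the existing fuzzy matching algorithm was not great. I have been running with this for a while and did not notice any annoyances. The whole RankedMatch code probably deserves more attention, but this seems to go in the right direction. To make this possible, WordSplitter is moved out of word_db.cc into its own word_splitter.hh header; note that max_word_len moves with it and is raised from 50 to 100, which also affects word completion in insert_completer.cc.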
author: Maxime Coste <mawww@kakoune.org> 2025-04-02 17:35:23 +1100
committer: Maxime Coste <mawww@kakoune.org> 2025-04-02 17:35:23 +1100
commit: 63efcc06d5bee7f05a1ee9539b2391c80e5d6205 (patch)
tree: 8ad15a23836f02421ff98796b7a2c994fe00165d /src
parent: c7d688f578c7b58989fc04e7bb1e9b5a939a5730 (diff)
6 files changed, 88 insertions, 48 deletions
diff --git a/src/insert_completer.cc b/src/insert_completer.cc
index 49bbd4ca..01058012 100644
--- a/src/insert_completer.cc
+++ b/src/insert_completer.cc
@@ -3,7 +3,6 @@
 #include "buffer_manager.hh"
 #include "buffer_utils.hh"
 #include "debug.hh"
-#include "client.hh"
 #include "command_manager.hh"
 #include "changes.hh"
 #include "context.hh"
@@ -13,6 +12,7 @@
 #include "regex.hh"
 #include "window.hh"
 #include "word_db.hh"
+#include "word_splitter.hh"
 #include "option_types.hh"
 #include "utf8_iterator.hh"
 #include "user_interface.hh"
@@ -87,7 +87,7 @@ InsertCompletion complete_word(const SelectionList& sels,
     for (int i = 0; i < sels.size(); ++i)
     {
         int len = 0;
-        auto is_short_enough_word = [&] (Codepoint c) { return len++ < WordDB::max_word_len && is_word_pred(c); };
+        auto is_short_enough_word = [&] (Codepoint c) { return len++ < WordSplitter::max_word_len && is_word_pred(c); };
 
         Utf8It end{buffer.iterator_at(sels[i].cursor()), buffer};
         Utf8It begin = end-1;
@@ -103,7 +103,7 @@ InsertCompletion complete_word(const SelectionList& sels,
 
         skip_while(end, buffer.end(), is_short_enough_word);
 
-        if (len <= WordDB::max_word_len)
+        if (len <= WordSplitter::max_word_len)
         {
             StringView word = buffer.substr(begin.base().coord(), end.base().coord());
             ++sel_word_counts[word];
diff --git a/src/ranked_match.cc b/src/ranked_match.cc
index 87bcece7..6918809c 100644
--- a/src/ranked_match.cc
+++ b/src/ranked_match.cc
@@ -5,6 +5,7 @@
 #include "utf8_iterator.hh"
 #include "optional.hh"
 #include "ranges.hh"
+#include "word_splitter.hh"
 
 #include <algorithm>
 
@@ -70,6 +71,25 @@ static int count_word_boundaries_match(StringView candidate, StringView query)
     return count;
 }
 
+static int count_full_word_match(StringView candidate, StringView query)
+{
+    int count = 0;
+    WordSplitter query_words{query, {}};
+    WordSplitter candidate_words{candidate, {}};
+    for (auto query_word : query_words)
+    {
+        for (auto word : candidate_words)
+        {
+            if (word == query_word)
+            {
+                ++count;
+                break;
+            }
+        }
+    }
+    return count;
+}
+
 static bool smartcase_eq(Codepoint candidate, Codepoint query)
 {
     return query == (is_lower(query) ? to_lower(candidate) : candidate);
@@ -168,6 +188,7 @@ RankedMatch::RankedMatch(StringView candidate, StringView query, TestFunc func)
         }
     }
 
+    m_full_word_match_count = count_full_word_match(candidate, query);
     m_word_boundary_match_count = count_word_boundaries_match(candidate, query);
     if (m_word_boundary_match_count == query.length())
         m_flags |= Flags::OnlyWordBoundary;
@@ -207,6 +228,9 @@ bool RankedMatch::operator<(const RankedMatch& other) const
         m_word_boundary_match_count != other.m_word_boundary_match_count)
         return m_word_boundary_match_count > other.m_word_boundary_match_count;
 
+    if (m_full_word_match_count != other.m_full_word_match_count)
+        return m_full_word_match_count > other.m_full_word_match_count;
+
     if (m_max_index != other.m_max_index)
         return m_max_index < other.m_max_index;
 
@@ -288,6 +312,8 @@ UnitTest test_ranked_match{[] {
     kak_assert(preferred("foo_bar", "test_foo_bar", "foo_test_bar"));
     kak_assert(preferred("rm.cc", "src/ranked_match.cc", "test/README.asciidoc"));
     kak_assert(preferred("luaremote", "src/script/LuaRemote.cpp", "tests/TestLuaRemote.cpp"));
+    kak_assert(preferred("lang/haystack/needle.c", "git.evilcorp.com/language/haystack/aaa/needle.c", "git.evilcorp.com/aaa/ng/wrong-haystack/needle.cpp"));
+    kak_assert(preferred("evilcorp-lint/bar.go", "scripts/evilcorp-lint/foo/bar.go", "src/evilcorp-client/foo/bar.go"));
 }};
 
 UnitTest test_used_letters{[]()
diff --git a/src/ranked_match.hh b/src/ranked_match.hh
index 5a58defb..1d877d53 100644
--- a/src/ranked_match.hh
+++ b/src/ranked_match.hh
@@ -54,6 +54,7 @@ private:
     StringView m_candidate{};
     bool m_matches = false;
     Flags m_flags = Flags::None;
+    int m_full_word_match_count = 0;
     int m_word_boundary_match_count = 0;
     int m_max_index = 0;
     size_t m_input_sequence_number = 0;
diff --git a/src/word_db.cc b/src/word_db.cc
index 0a1873e7..57821da8 100644
--- a/src/word_db.cc
+++ b/src/word_db.cc
@@ -4,6 +4,7 @@
 #include "line_modification.hh"
 #include "unit_tests.hh"
 #include "value.hh"
+#include "word_splitter.hh"
 
 namespace Kakoune
 {
@@ -17,49 +18,6 @@ WordDB& get_word_db(const Buffer& buffer)
     return cache_val.as<WordDB>();
 }
 
-struct WordSplitter
-{
-    struct Iterator
-    {
-        Iterator(const char* begin, const WordSplitter& splitter)
-            : m_word_begin{begin}, m_word_end{begin}, m_splitter{&splitter}
-        { operator++(); }
-
-        StringView operator*() const { return {m_word_begin, m_word_end}; }
-
-        Iterator& operator++()
-        {
-            const auto* end = m_splitter->m_content.end();
-            auto extra_chars = m_splitter->m_extra_word_chars;
-
-            do
-            {
-                auto it = m_word_begin = m_word_end;
-                while (it != end and not is_word(utf8::read_codepoint(it, end), extra_chars))
-                    m_word_begin = it;
-
-                m_word_end = it;
-                while (it != end and is_word(utf8::read_codepoint(it, end), extra_chars))
-                    m_word_end = it;
-            } while (m_word_begin != end and (m_word_end - m_word_begin) > WordDB::max_word_len);
-
-            return *this;
-        }
-
-        friend bool operator==(const Iterator& lhs, const Iterator& rhs) = default;
-
-        const char* m_word_begin;
-        const char* m_word_end;
-        const WordSplitter* m_splitter;
-    };
-
-    StringView m_content;
-    ConstArrayView<Codepoint> m_extra_word_chars;
-
-    Iterator begin() const { return {m_content.begin(), *this}; }
-    Iterator end()   const { return {m_content.end(), *this}; }
-};
-
 static ConstArrayView<Codepoint> get_extra_word_chars(const Buffer& buffer)
 {
     return buffer.options()["extra_word_chars"].get<Vector<Codepoint, MemoryDomain::Options>>();
diff --git a/src/word_db.hh b/src/word_db.hh
index dcb46653..dab65d4c 100644
--- a/src/word_db.hh
+++ b/src/word_db.hh
@@ -18,8 +18,6 @@ class Buffer;
 class WordDB : public OptionManagerWatcher
 {
 public:
-    static constexpr ByteCount max_word_len = 50;
-
     WordDB(const Buffer& buffer);
     ~WordDB();
     WordDB(const WordDB&) = delete;
diff --git a/src/word_splitter.hh b/src/word_splitter.hh
new file mode 100644
index 00000000..97af606d
--- /dev/null
+++ b/src/word_splitter.hh
@@ -0,0 +1,57 @@
+#ifndef word_splitter_hh_INCLUDED
+#define word_splitter_hh_INCLUDED
+
+#include "string.hh"
+#include "array_view.hh"
+
+namespace Kakoune
+{
+
+struct WordSplitter
+{
+    static constexpr ByteCount max_word_len = 100;
+
+    struct Iterator
+    {
+        Iterator(const char* begin, const WordSplitter& splitter)
+            : m_word_begin{begin}, m_word_end{begin}, m_splitter{&splitter}
+        { operator++(); }
+
+        StringView operator*() const { return {m_word_begin, m_word_end}; }
+
+        Iterator& operator++()
+        {
+            const auto* end = m_splitter->m_content.end();
+            auto extra_chars = m_splitter->m_extra_word_chars;
+
+            do
+            {
+                auto it = m_word_begin = m_word_end;
+                while (it != end and not is_word(utf8::read_codepoint(it, end), extra_chars))
+                    m_word_begin = it;
+
+                m_word_end = it;
+                while (it != end and is_word(utf8::read_codepoint(it, end), extra_chars))
+                    m_word_end = it;
+            } while (m_word_begin != end and (m_word_end - m_word_begin) > max_word_len);
+
+            return *this;
+        }
+
+        friend bool operator==(const Iterator& lhs, const Iterator& rhs) = default;
+
+        const char* m_word_begin;
+        const char* m_word_end;
+        const WordSplitter* m_splitter;
+    };
+
+    StringView m_content;
+    ConstArrayView<Codepoint> m_extra_word_chars;
+
+    Iterator begin() const { return {m_content.begin(), *this}; }
+    Iterator end()   const { return {m_content.end(), *this}; }
+};
+
+}
+
+#endif // word_splitter_hh_INCLUDED
author	Maxime Coste <mawww@kakoune.org>	2025-04-02 17:35:23 +1100
committer	Maxime Coste <mawww@kakoune.org>	2025-04-02 17:35:23 +1100
commit	63efcc06d5bee7f05a1ee9539b2391c80e5d6205 (patch)
tree	8ad15a23836f02421ff98796b7a2c994fe00165d /src
parent	c7d688f578c7b58989fc04e7bb1e9b5a939a5730 (diff)