Simplify and accelerate start desc map

Store a value for every possible byte, and fill in the UTF-8 multi-byte start bytes when necessary.
author: Maxime Coste <mawww@kakoune.org> 2024-03-13 17:29:05 +1100
committer: Maxime Coste <mawww@kakoune.org> 2024-03-13 17:29:05 +1100
commit: c4df0fac52c83fe68b7f583de1f419c976645dc0 (patch)
tree: bc95671103c778995f75267a6c04f8e65bda1354 /src/regex_impl.cc
parent: c956413046ae1a57ab4dab9dee6986ec3e0eb2a6 (diff)
1 files changed, 16 insertions, 11 deletions
diff --git a/src/regex_impl.cc b/src/regex_impl.cc
index 33d7a88a..b1630a1f 100644
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@@ -17,7 +17,6 @@
 namespace Kakoune
 {
 
-constexpr Codepoint CompiledRegex::StartDesc::other;
 constexpr Codepoint CompiledRegex::StartDesc::count;
 
 struct ParsedRegex
@@ -893,11 +892,17 @@ private:
     bool compute_start_desc(ParsedRegex::NodeIndex index,
                              CompiledRegex::StartDesc& start_desc) const
     {
+        // fill all bytes that mark the start of an utf8 multi byte sequence
+        auto add_multi_byte_utf8 = [&] {
+            std::fill(start_desc.map + 0b11000000, start_desc.map + 0b11111000, true);
+        };
+        static constexpr Codepoint single_byte_limit = 128;
+
         auto& node = get_node(index);
         switch (node.op)
         {
             case ParsedRegex::Literal:
-                if (node.value < CompiledRegex::StartDesc::count)
+                if (node.value < single_byte_limit)
                 {
                     if (node.ignore_case)
                     {
@@ -908,14 +913,14 @@ private:
                         start_desc.map[node.value] = true;
                 }
                 else
-                    start_desc.map[CompiledRegex::StartDesc::other] = true;
+                    add_multi_byte_utf8();
                 return node.quantifier.allows_none();
             case ParsedRegex::AnyChar:
                 for (auto& b : start_desc.map)
                     b = true;
                return node.quantifier.allows_none();
             case ParsedRegex::AnyCharExceptNewLine:
-                for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp)
+                for (Codepoint cp = 0; cp < single_byte_limit; ++cp)
                 {
                     if (cp != '\n')
                         start_desc.map[cp] = true;
@@ -930,33 +935,33 @@ private:
                 {
                     for (auto& range : character_class.ranges)
                     {
-                        const auto clamp = [](Codepoint cp) { return std::min(CompiledRegex::StartDesc::count, cp); };
+                        const auto clamp = [](Codepoint cp) { return std::min(single_byte_limit, cp); };
                         for (auto cp = clamp(range.min), end = clamp(range.max + 1); cp < end; ++cp)
                             start_desc.map[cp] = true;
-                        if (range.max >= CompiledRegex::StartDesc::count)
-                            start_desc.map[CompiledRegex::StartDesc::other] = true;
+                        if (range.max >= single_byte_limit)
+                            add_multi_byte_utf8();
                     }
                 }
                 else
                 {
-                    for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp)
+                    for (Codepoint cp = 0; cp < single_byte_limit; ++cp)
                     {
                         if (start_desc.map[cp] or character_class.matches(cp))
                             start_desc.map[cp] = true;
                     }
                 }
-                start_desc.map[CompiledRegex::StartDesc::other] = true;
+                add_multi_byte_utf8();
                 return node.quantifier.allows_none();
             }
             case ParsedRegex::CharType:
             {
                 const CharacterType ctype = (CharacterType)node.value;
-                for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp)
+                for (Codepoint cp = 0; cp < single_byte_limit; ++cp)
                 {
                     if (is_ctype(ctype, cp))
                         start_desc.map[cp] = true;
                 }
-                start_desc.map[CompiledRegex::StartDesc::other] = true;
+                add_multi_byte_utf8();
                 return node.quantifier.allows_none();
             }
             case ParsedRegex::Sequence:
author	Maxime Coste <mawww@kakoune.org>	2024-03-13 17:29:05 +1100
committer	Maxime Coste <mawww@kakoune.org>	2024-03-13 17:29:05 +1100
commit	c4df0fac52c83fe68b7f583de1f419c976645dc0 (patch)
tree	bc95671103c778995f75267a6c04f8e65bda1354 /src/regex_impl.cc
parent	c956413046ae1a57ab4dab9dee6986ec3e0eb2a6 (diff)