Add a CharRange regex op to optimize the common simple range case

Instead of jumping into the general CharClass code, detect simple [a-z] style ranges and use a specific op. Also detect when a range can be converted to ignore case
author: Maxime Coste <mawww@kakoune.org> 2025-07-07 10:15:11 +1000
committer: Maxime Coste <mawww@kakoune.org> 2025-07-07 12:15:19 +1000
commit: 3c92a6650d721fdc8fb98e83b30795e6d2984f20 (patch)
tree: 03163b991a085e7f65afd313b5f2c7339c40896e /src
parent: cb6cbb4e17b1080cc18a0195773ab763e7e11e64 (diff)
2 files changed, 58 insertions, 11 deletions
diff --git a/src/regex_impl.cc b/src/regex_impl.cc
index dd900125..8a46d833 100644
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@@ -505,6 +505,18 @@ private:
             parse_error("unclosed character class");
         ++m_pos;
 
+        if (not character_class.ignore_case)
+        {
+            bool could_ignore_case = true;
+            for (const auto& [min, max] : character_class.ranges)
+            {
+                if (not contains(character_class.ranges, CharacterClass::Range{to_lower(min), to_lower(max)}) or
+                    not contains(character_class.ranges, CharacterClass::Range{to_upper(min), to_upper(max)}))
+                    could_ignore_case = false;
+            }
+            character_class.ignore_case = could_ignore_case;
+        }
+
         if (character_class.ignore_case)
         {
             for (auto& range : character_class.ranges)
@@ -521,7 +533,7 @@ private:
         if (character_class.ctypes == CharacterType::None and not character_class.negative and
             character_class.ranges.size() == 1 and
             character_class.ranges.front().min == character_class.ranges.front().max)
-            return add_node(ParsedRegex::Literal, character_class.ranges.front().min);
+            return add_node(ParsedRegex::Literal, character_class.ranges.front().min, {1,1}, character_class.ignore_case);
 
         if (character_class.ctypes != CharacterType::None and not character_class.negative and
             character_class.ranges.empty())
@@ -585,14 +597,14 @@ private:
         }
     }
 
-    NodeIndex add_node(ParsedRegex::Op op, Codepoint value = -1, ParsedRegex::Quantifier quantifier = {1, 1})
+    NodeIndex add_node(ParsedRegex::Op op, Codepoint value = -1, ParsedRegex::Quantifier quantifier = {1, 1}, bool ignore_case = false)
     {
         constexpr auto max_nodes = std::numeric_limits<int16_t>::max();
         const NodeIndex res = m_parsed_regex.nodes.size();
         if (res == max_nodes)
             parse_error(format("regex parsed to more than {} ast nodes", max_nodes));
         const NodeIndex next = res+1;
-        m_parsed_regex.nodes.push_back({op, m_flags & Flags::IgnoreCase, next, value, quantifier});
+        m_parsed_regex.nodes.push_back({op, ignore_case or (m_flags & Flags::IgnoreCase), next, value, quantifier});
         return res;
     }
 
@@ -703,6 +715,7 @@ struct RegexCompiler
 private:
 
     template<RegexMode direction>
+    [[gnu::noinline]]
     OpIndex compile_node_inner(ParsedRegex::NodeIndex index)
     {
         auto& node = get_node(index);
@@ -729,7 +742,15 @@ private:
                 push_inst(CompiledRegex::AnyCharExceptNewLine);
                 break;
             case ParsedRegex::CharClass:
-                push_inst(CompiledRegex::CharClass, {.character_class_index=int16_t(node.value)});
+                if (auto& char_class = m_parsed_regex.character_classes[node.value];
+                    char_class.ranges.size() == 1 and char_class.ctypes == CharacterType::None and
+                    char_class.ranges[0].max <= std::numeric_limits<uint8_t>::max())
+                    push_inst(CompiledRegex::CharRange, {.range={.min=uint8_t(char_class.ranges[0].min),
+                                                                 .max=uint8_t(char_class.ranges[0].max),
+                                                                 .ignore_case=char_class.ignore_case,
+                                                                 .negative=char_class.negative}});
+                else
+                    push_inst(CompiledRegex::CharClass, {.character_class_index=int16_t(node.value)});
                 break;
             case ParsedRegex::CharType:
                 push_inst(CompiledRegex::CharType, {.character_type=CharacterType{(unsigned char)node.value}});
@@ -1107,12 +1128,18 @@ String dump_regex(const CompiledRegex& program)
             case CompiledRegex::AnyCharExceptNewLine:
                 res += "anything but newline\n";
                 break;
-            case CompiledRegex::CharClass:
-                res += format("character class {}\n", inst.param.character_class_index);
+            case CompiledRegex::CharRange:
+                res += format("character range {}[{}{}-{}]\n",
+                              inst.param.range.ignore_case ? "(ignore case) " : "",
+                              inst.param.range.negative ? "^" : "",
+                              inst.param.range.min, inst.param.range.max);
                 break;
             case CompiledRegex::CharType:
                 res += format("character type {}\n", to_underlying(inst.param.character_type));
                 break;
+            case CompiledRegex::CharClass:
+                res += format("character class {}\n", inst.param.character_class_index);
+                break;
             case CompiledRegex::Jump:
                 res += format("jump {} ({:03})\n", inst.param.jump_offset, index + inst.param.jump_offset);
                 break;
@@ -1259,6 +1286,12 @@ auto test_regex = UnitTest{[]{
     }
 
     {
+        TestVM<> vm{R"([aA])"};
+        kak_assert(vm.exec("a"));
+        kak_assert(vm.exec("A"));
+    }
+
+    {
         TestVM<> vm{R"(a{3,5}b)"};
         kak_assert(not vm.exec("aab"));
         kak_assert(vm.exec("aaab"));
diff --git a/src/regex_impl.hh b/src/regex_impl.hh
index 5faa7c94..2f5cbcfa 100644
--- a/src/regex_impl.hh
+++ b/src/regex_impl.hh
@@ -76,8 +76,9 @@ struct CompiledRegex : UseMemoryDomain<MemoryDomain::Regex>
         Literal,
         AnyChar,
         AnyCharExceptNewLine,
-        CharClass,
+        CharRange,
         CharType,
+        CharClass,
         Jump,
         Split,
         Save,
@@ -105,8 +106,15 @@ struct CompiledRegex : UseMemoryDomain<MemoryDomain::Regex>
             uint32_t codepoint : 24;
             bool ignore_case : 1;
         } literal;
-        int16_t character_class_index;
+        struct CharRange
+        {
+            uint8_t min;
+            uint8_t max;
+            bool ignore_case : 1;
+            bool negative;
+        } range;
         CharacterType character_type;
+        int16_t character_class_index;
         int16_t jump_offset;
         int16_t save_index;
         struct Split
@@ -399,15 +407,21 @@ private:
                     if (pos != config.end and cp != '\n')
                         return consumed();
                     return failed();
-                case CompiledRegex::CharClass:
-                    if (pos != config.end and
-                        m_program.character_classes[inst.param.character_class_index].matches(cp))
+                case CompiledRegex::CharRange:
+                    if (auto actual_cp = (inst.param.range.ignore_case ? to_lower(cp) : cp);
+                        pos != config.end and
+                        (actual_cp >= inst.param.range.min and actual_cp <= inst.param.range.max) != inst.param.range.negative)
                         return consumed();
                     return failed();
                 case CompiledRegex::CharType:
                     if (pos != config.end and is_ctype(inst.param.character_type, cp))
                         return consumed();
                     return failed();
+                case CompiledRegex::CharClass:
+                    if (pos != config.end and
+                        m_program.character_classes[inst.param.character_class_index].matches(cp))
+                        return consumed();
+                    return failed();
                 case CompiledRegex::Jump:
                     thread.inst = &inst + inst.param.jump_offset;
                     break;
author	Maxime Coste <mawww@kakoune.org>	2025-07-07 10:15:11 +1000
committer	Maxime Coste <mawww@kakoune.org>	2025-07-07 12:15:19 +1000
commit	3c92a6650d721fdc8fb98e83b30795e6d2984f20 (patch)
tree	03163b991a085e7f65afd313b5f2c7339c40896e /src
parent	cb6cbb4e17b1080cc18a0195773ab763e7e11e64 (diff)