diff options
| author | Maxime Coste <mawww@kakoune.org> | 2025-07-07 10:15:11 +1000 |
|---|---|---|
| committer | Maxime Coste <mawww@kakoune.org> | 2025-07-07 12:15:19 +1000 |
| commit | 3c92a6650d721fdc8fb98e83b30795e6d2984f20 (patch) | |
| tree | 03163b991a085e7f65afd313b5f2c7339c40896e /src | |
| parent | cb6cbb4e17b1080cc18a0195773ab763e7e11e64 (diff) | |
Add a CharRange regex op to optimize the common simple range case
Instead of jumping into the general CharClass code, detect simple
[a-z] style ranges and use a specific op.
Also detect when a range can be converted to ignore case
Diffstat (limited to 'src')
| -rw-r--r-- | src/regex_impl.cc | 45 | ||||
| -rw-r--r-- | src/regex_impl.hh | 24 |
2 files changed, 58 insertions, 11 deletions
diff --git a/src/regex_impl.cc b/src/regex_impl.cc index dd900125..8a46d833 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -505,6 +505,18 @@ private: parse_error("unclosed character class"); ++m_pos; + if (not character_class.ignore_case) + { + bool could_ignore_case = true; + for (const auto& [min, max] : character_class.ranges) + { + if (not contains(character_class.ranges, CharacterClass::Range{to_lower(min), to_lower(max)}) or + not contains(character_class.ranges, CharacterClass::Range{to_upper(min), to_upper(max)})) + could_ignore_case = false; + } + character_class.ignore_case = could_ignore_case; + } + if (character_class.ignore_case) { for (auto& range : character_class.ranges) @@ -521,7 +533,7 @@ private: if (character_class.ctypes == CharacterType::None and not character_class.negative and character_class.ranges.size() == 1 and character_class.ranges.front().min == character_class.ranges.front().max) - return add_node(ParsedRegex::Literal, character_class.ranges.front().min); + return add_node(ParsedRegex::Literal, character_class.ranges.front().min, {1,1}, character_class.ignore_case); if (character_class.ctypes != CharacterType::None and not character_class.negative and character_class.ranges.empty()) @@ -585,14 +597,14 @@ private: } } - NodeIndex add_node(ParsedRegex::Op op, Codepoint value = -1, ParsedRegex::Quantifier quantifier = {1, 1}) + NodeIndex add_node(ParsedRegex::Op op, Codepoint value = -1, ParsedRegex::Quantifier quantifier = {1, 1}, bool ignore_case = false) { constexpr auto max_nodes = std::numeric_limits<int16_t>::max(); const NodeIndex res = m_parsed_regex.nodes.size(); if (res == max_nodes) parse_error(format("regex parsed to more than {} ast nodes", max_nodes)); const NodeIndex next = res+1; - m_parsed_regex.nodes.push_back({op, m_flags & Flags::IgnoreCase, next, value, quantifier}); + m_parsed_regex.nodes.push_back({op, ignore_case or (m_flags & Flags::IgnoreCase), next, value, quantifier}); return res; } @@ -703,6 +715,7 @@ struct RegexCompiler private: template<RegexMode direction> + [[gnu::noinline]] OpIndex compile_node_inner(ParsedRegex::NodeIndex index) { auto& node = get_node(index); @@ -729,7 +742,15 @@ private: push_inst(CompiledRegex::AnyCharExceptNewLine); break; case ParsedRegex::CharClass: - push_inst(CompiledRegex::CharClass, {.character_class_index=int16_t(node.value)}); + if (auto& char_class = m_parsed_regex.character_classes[node.value]; + char_class.ranges.size() == 1 and char_class.ctypes == CharacterType::None and + char_class.ranges[0].max <= std::numeric_limits<uint8_t>::max()) + push_inst(CompiledRegex::CharRange, {.range={.min=uint8_t(char_class.ranges[0].min), + .max=uint8_t(char_class.ranges[0].max), + .ignore_case=char_class.ignore_case, + .negative=char_class.negative}}); + else + push_inst(CompiledRegex::CharClass, {.character_class_index=int16_t(node.value)}); break; case ParsedRegex::CharType: push_inst(CompiledRegex::CharType, {.character_type=CharacterType{(unsigned char)node.value}}); @@ -1107,12 +1128,18 @@ String dump_regex(const CompiledRegex& program) case CompiledRegex::AnyCharExceptNewLine: res += "anything but newline\n"; break; - case CompiledRegex::CharClass: - res += format("character class {}\n", inst.param.character_class_index); + case CompiledRegex::CharRange: + res += format("character range {}[{}{}-{}]\n", + inst.param.range.ignore_case ? "(ignore case) " : "", + inst.param.range.negative ? "^" : "", + inst.param.range.min, inst.param.range.max); break; case CompiledRegex::CharType: res += format("character type {}\n", to_underlying(inst.param.character_type)); break; + case CompiledRegex::CharClass: + res += format("character class {}\n", inst.param.character_class_index); + break; case CompiledRegex::Jump: res += format("jump {} ({:03})\n", inst.param.jump_offset, index + inst.param.jump_offset); break; @@ -1259,6 +1286,12 @@ auto test_regex = UnitTest{[]{ } { + TestVM<> vm{R"([aA])"}; + kak_assert(vm.exec("a")); + kak_assert(vm.exec("A")); + } + + { TestVM<> vm{R"(a{3,5}b)"}; kak_assert(not vm.exec("aab")); kak_assert(vm.exec("aaab")); diff --git a/src/regex_impl.hh b/src/regex_impl.hh index 5faa7c94..2f5cbcfa 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -76,8 +76,9 @@ struct CompiledRegex : UseMemoryDomain<MemoryDomain::Regex> Literal, AnyChar, AnyCharExceptNewLine, - CharClass, + CharRange, CharType, + CharClass, Jump, Split, Save, @@ -105,8 +106,15 @@ struct CompiledRegex : UseMemoryDomain<MemoryDomain::Regex> uint32_t codepoint : 24; bool ignore_case : 1; } literal; - int16_t character_class_index; + struct CharRange + { + uint8_t min; + uint8_t max; + bool ignore_case : 1; + bool negative; + } range; CharacterType character_type; + int16_t character_class_index; int16_t jump_offset; int16_t save_index; struct Split @@ -399,15 +407,21 @@ private: if (pos != config.end and cp != '\n') return consumed(); return failed(); - case CompiledRegex::CharClass: - if (pos != config.end and - m_program.character_classes[inst.param.character_class_index].matches(cp)) + case CompiledRegex::CharRange: + if (auto actual_cp = (inst.param.range.ignore_case ? to_lower(cp) : cp); + pos != config.end and + (actual_cp >= inst.param.range.min and actual_cp <= inst.param.range.max) != inst.param.range.negative) return consumed(); return failed(); case CompiledRegex::CharType: if (pos != config.end and is_ctype(inst.param.character_type, cp)) return consumed(); return failed(); + case CompiledRegex::CharClass: + if (pos != config.end and + m_program.character_classes[inst.param.character_class_index].matches(cp)) + return consumed(); + return failed(); case CompiledRegex::Jump: thread.inst = &inst + inst.param.jump_offset; break; |
