summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Maxime Coste <mawww@kakoune.org>, 2023-02-19 11:15:31 +1100
committer: Maxime Coste <mawww@kakoune.org>, 2023-02-19 11:16:14 +1100
commit: f115af7a572dd28dc757031256c49796b3a6d8e6 (patch)
tree: 8b08c42bb25e9637868443b6970bdfcadbd070fa /src
parent: afaa47e93fb937fbedb60bcdbc768bb937108f86 (diff)
Optimize Regex CharacterClass matching
Take advantage of the ranges being sorted to exit the lookup early, and make the logic inline.
Diffstat (limited to 'src')
-rw-r--r-- src/regex_impl.cc 16
-rw-r--r-- src/regex_impl.hh 26
2 files changed, 22 insertions, 20 deletions
diff --git a/src/regex_impl.cc b/src/regex_impl.cc
index 23321af8..9ada6fd5 100644
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@@ -959,7 +959,7 @@ private:
{
for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp)
{
- if (start_desc.map[cp] or is_character_class(character_class, cp))
+ if (start_desc.map[cp] or character_class.matches(cp))
start_desc.map[cp] = true;
}
}
@@ -1165,20 +1165,6 @@ CompiledRegex compile_regex(StringView re, RegexCompileFlags flags)
return RegexCompiler{RegexParser::parse(re), flags}.get_compiled_regex();
}
-bool is_character_class(const CharacterClass& character_class, Codepoint cp)
-{
- if (character_class.ignore_case)
- cp = to_lower(cp);
-
- auto it = std::find_if(character_class.ranges.begin(),
- character_class.ranges.end(),
- [cp](auto& range) { return range.min <= cp and cp <= range.max; });
-
- bool found = it != character_class.ranges.end() or (character_class.ctypes != CharacterType::None and
- is_ctype(character_class.ctypes, cp));
- return found != character_class.negative;
-}
-
bool is_ctype(CharacterType ctype, Codepoint cp)
{
auto check = [&](CharacterType bit, CharacterType not_bit, auto&& func) {
diff --git a/src/regex_impl.hh b/src/regex_impl.hh
index fd99ea1e..48788094 100644
--- a/src/regex_impl.hh
+++ b/src/regex_impl.hh
@@ -31,6 +31,8 @@ enum class CharacterType : unsigned char
};
constexpr bool with_bit_ops(Meta::Type<CharacterType>) { return true; }
+bool is_ctype(CharacterType ctype, Codepoint cp);
+
struct CharacterClass
{
struct Range
@@ -45,10 +47,24 @@ struct CharacterClass
bool ignore_case = false;
friend bool operator==(const CharacterClass&, const CharacterClass&) = default;
-};
-bool is_character_class(const CharacterClass& character_class, Codepoint cp);
-bool is_ctype(CharacterType ctype, Codepoint cp);
+ bool matches(Codepoint cp) const
+ {
+ if (ignore_case)
+ cp = to_lower(cp);
+
+ for (auto& range : ranges)
+ {
+ if (cp < range.min)
+ break;
+ else if (cp <= range.max)
+ return not negative;
+ }
+
+ return (ctypes != CharacterType::None and is_ctype(ctypes, cp)) != negative;
+ }
+
+};
struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
{
@@ -418,7 +434,7 @@ private:
case CompiledRegex::CharClass:
if (pos == config.end)
return failed();
- return is_character_class(m_program.character_classes[inst.param.character_class_index], codepoint(pos, config)) ?
+ return m_program.character_classes[inst.param.character_class_index].matches(codepoint(pos, config)) ?
consumed() : failed();
case CompiledRegex::CharType:
if (pos == config.end)
@@ -552,7 +568,7 @@ private:
else if (op >= Lookaround::CharacterClass and op < Lookaround::CharacterType)
{
auto index = to_underlying(op) - to_underlying(Lookaround::CharacterClass);
- if (not is_character_class(m_program.character_classes[index], cp))
+ if (not m_program.character_classes[index].matches(cp))
return false;
}
else if (op >= Lookaround::CharacterType and op < Lookaround::OpEnd)