summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorMaxime Coste <mawww@kakoune.org>2017-10-14 22:10:56 +0800
committerMaxime Coste <mawww@kakoune.org>2017-11-01 14:05:14 +0800
commit9ec175f2f8880a63e169506cab88c41e7637bc40 (patch)
tree1108c867d355fed594919f2762dc126fc9bc9d96 /src
parent6e65589a344dc6ab566d76aae44a32b61128cd82 (diff)
Regex: use binary search to for character class ranges check
Diffstat (limited to 'src')
-rw-r--r--src/regex_impl.cc43
1 files changed, 39 insertions, 4 deletions
diff --git a/src/regex_impl.cc b/src/regex_impl.cc
index df9e9d06..fee7dcb3 100644
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@@ -315,13 +315,38 @@ private:
parse_error(format("unknown atom escape '{}'", cp));
}
+ struct CharRange { Codepoint min, max; };
+
+ void normalize_ranges(Vector<CharRange>& ranges)
+ {
+ if (ranges.empty())
+ return;
+
+ // Sort ranges so that we can use binary search
+ std::sort(ranges.begin(), ranges.end(),
+ [](auto& lhs, auto& rhs) { return lhs.min < rhs.min; });
+
+ // merge overlapping ranges
+ auto pos = ranges.begin();
+ for (auto next = pos+1; next != ranges.end(); ++next)
+ {
+ if (pos->max + 1 >= next->min)
+ {
+ if (next->max > pos->max)
+ pos->max = next->max;
+ }
+ else
+ *++pos = *next;
+ }
+ ranges.erase(pos+1, ranges.end());
+ }
+
AstNodePtr character_class()
{
const bool negative = m_pos != m_regex.end() and *m_pos == '^';
if (negative)
++m_pos;
- struct CharRange { Codepoint min, max; };
Vector<CharRange> ranges;
Vector<Codepoint> excluded;
Vector<std::pair<wctype_t, bool>> ctypes;
@@ -404,6 +429,8 @@ private:
cp = to_lower(cp);
}
+ normalize_ranges(ranges);
+
// Optimize the relatively common case of using a character class to
// escape a character, such as [*]
if (ctypes.empty() and excluded.empty() and not negative and
@@ -417,9 +444,12 @@ private:
if (ignore_case)
cp = to_lower(cp);
- auto found = contains_that(ranges, [cp](auto& r) {
- return r.min <= cp and cp <= r.max;
- }) or contains_that(ctypes, [cp](auto& c) {
+ auto it = std::lower_bound(ranges.begin(), ranges.end(), cp,
+ [](auto& range, Codepoint cp)
+ { return range.max < cp; });
+
+ auto found = (it != ranges.end() and it->min <= cp) or
+ contains_that(ctypes, [cp](auto& c) {
return (bool)iswctype(cp, c.first) == c.second;
}) or (not excluded.empty() and not contains(excluded, cp));
return negative ? not found : found;
@@ -1239,6 +1269,11 @@ auto test_regex = UnitTest{[]{
kak_assert(vm.exec("fOO", RegexExecFlags::Search));
kak_assert(*vm.captures()[0] == 'f');
}
+
+ {
+ TestVM<> vm{R"([d-ea-dcf-k]+)"};
+ kak_assert(vm.exec("abcde"));
+ }
}};
}