summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorMaxime Coste <mawww@kakoune.org>2017-10-06 13:40:27 +0800
committerMaxime Coste <mawww@kakoune.org>2017-11-01 14:05:14 +0800
commit3b69dda04efb1e5ed9109d9fdebe2a73ee177b41 (patch)
treeeb21382bcdf2b8575b75dd32331e7e7f236e8d6a /src
parent741772aef9d28e755e586e7eb6daf3025824085a (diff)
Regex: Find potential start position using a map of valid start chars
With this optimization we get close to performance parity with boost regex on the common use cases in Kakoune.
Diffstat (limited to 'src')
-rw-r--r--src/regex.cc3
-rw-r--r--src/regex.hh12
-rw-r--r--src/regex_impl.cc93
-rw-r--r--src/regex_impl.hh29
4 files changed, 127 insertions, 10 deletions
diff --git a/src/regex.cc b/src/regex.cc
index 749dced8..a622269b 100644
--- a/src/regex.cc
+++ b/src/regex.cc
@@ -11,7 +11,8 @@ using Utf8It = RegexUtf8It<const char*>;
Regex::Regex(StringView re, flag_type flags) try
: RegexBase{Utf8It{re.begin(), re}, Utf8It{re.end(), re}, flags}, m_str{re.str()}
{
- m_impl = compile_regex(re);
+ if (auto compiled_regex = compile_regex(re))
+ m_impl = new CompiledRegex{std::move(compiled_regex)};
} catch (std::runtime_error& err) { throw regex_error(err.what()); }
String option_to_string(const Regex& re)
diff --git a/src/regex.hh b/src/regex.hh
index 5e2f4b01..64e24304 100644
--- a/src/regex.hh
+++ b/src/regex.hh
@@ -36,11 +36,11 @@ public:
static constexpr const char* option_type_name = "regex";
- const CompiledRegex& impl() const { return m_impl; }
+ const CompiledRegex* impl() const { return m_impl.get(); }
private:
String m_str;
- CompiledRegex m_impl;
+ RefPtr<CompiledRegex> m_impl;
};
template<typename It>
@@ -143,7 +143,7 @@ bool regex_match(It begin, It end, const Regex& re)
try
{
bool matched = boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, re);
- if (re.impl() and matched != regex_match(begin, end, re.impl()))
+ if (re.impl() and matched != regex_match(begin, end, *re.impl()))
regex_mismatch(re);
return matched;
}
@@ -160,7 +160,7 @@ bool regex_match(It begin, It end, MatchResults<It>& res, const Regex& re)
{
bool matched = boost::regex_match<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, res, re);
Vector<It> captures;
- if (re.impl() and matched != regex_match(begin, end, captures, re.impl()))
+ if (re.impl() and matched != regex_match(begin, end, captures, *re.impl()))
regex_mismatch(re);
if (re.impl() and matched)
check_captures(re, res, captures);
@@ -179,7 +179,7 @@ bool regex_search(It begin, It end, const Regex& re,
try
{
bool matched = boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, re, flags);
- if (re.impl() and matched != regex_search(begin, end, re.impl(), convert_flags(flags)))
+ if (re.impl() and matched != regex_search(begin, end, *re.impl(), convert_flags(flags)))
regex_mismatch(re);
return matched;
}
@@ -197,7 +197,7 @@ bool regex_search(It begin, It end, MatchResults<It>& res, const Regex& re,
{
bool matched = boost::regex_search<RegexUtf8It<It>>({begin, begin, end}, {end, begin, end}, res, re, flags);
Vector<It> captures;
- if (re.impl() and matched != regex_search(begin, end, captures, re.impl(), convert_flags(flags)))
+ if (re.impl() and matched != regex_search(begin, end, captures, *re.impl(), convert_flags(flags)))
regex_mismatch(re);
if (re.impl() and matched)
check_captures(re, res, captures);
diff --git a/src/regex_impl.cc b/src/regex_impl.cc
index 52c45d61..de00ce87 100644
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@@ -515,6 +515,7 @@ struct RegexCompiler
push_op(CompiledRegex::Match);
m_program.matchers = m_parsed_regex.matchers;
m_program.save_count = m_parsed_regex.capture_count * 2;
+ m_program.start_chars = compute_start_chars();
}
CompiledRegex get_compiled_regex() { return std::move(m_program); }
@@ -708,6 +709,87 @@ private:
push_codepoint(cp->value);
}
+ // Fills accepted and rejected according to which chars can start the given node,
+ // returns true if the node did not consume the char, hence a following node in
+ // sequence would be still relevant for the parent node start chars computation.
+ bool compute_start_chars(const ParsedRegex::AstNodePtr& node,
+ bool (&accepted)[256], bool (&rejected)[256]) const
+ {
+ switch (node->op)
+ {
+ case ParsedRegex::Literal:
+ if (node->value < 256)
+ accepted[node->value] = true;
+ return node->quantifier.allows_none();
+ case ParsedRegex::AnyChar:
+ for (auto& b : accepted)
+ b = true;
+ return node->quantifier.allows_none();
+ case ParsedRegex::Matcher:
+ for (auto& b : accepted) // treat matcher as everything can match for now
+ b = true;
+ return node->quantifier.allows_none();
+ case ParsedRegex::Sequence:
+ {
+ bool consumed = false;
+ for (auto& child : node->children)
+ {
+ if (not compute_start_chars(child, accepted, rejected))
+ {
+ consumed = true;
+ break;
+ }
+ }
+ return not consumed or node->quantifier.allows_none();
+ }
+ case ParsedRegex::Alternation:
+ {
+ bool all_consumed = not node->quantifier.allows_none();
+ for (auto& child : node->children)
+ {
+ if (compute_start_chars(child, accepted, rejected))
+ all_consumed = false;
+ }
+ return not all_consumed;
+ }
+ case ParsedRegex::LineStart:
+ case ParsedRegex::LineEnd:
+ case ParsedRegex::WordBoundary:
+ case ParsedRegex::NotWordBoundary:
+ case ParsedRegex::SubjectBegin:
+ case ParsedRegex::SubjectEnd:
+ case ParsedRegex::ResetStart:
+ return true;
+ case ParsedRegex::LookAhead:
+ if (node->children.empty())
+ compute_start_chars(node->children.front(), accepted, rejected);
+ return true;
+ case ParsedRegex::NegativeLookAhead:
+ if (node->children.empty())
+ compute_start_chars(node->children.front(), rejected, accepted);
+ return true;
+ case ParsedRegex::LookBehind:
+ return true;
+ case ParsedRegex::NegativeLookBehind:
+ return true;
+ }
+ return false;
+ }
+
+ std::unique_ptr<CompiledRegex::StartChars> compute_start_chars() const
+ {
+ bool accepted[256] = {};
+ bool rejected[256] = {};
+ if (compute_start_chars(m_parsed_regex.ast, accepted, rejected))
+ return nullptr;
+
+ auto start_chars = std::make_unique<CompiledRegex::StartChars>();
+ for (int i = 0; i < 256; ++i)
+ start_chars->map[i] = accepted[i] and not rejected[i];
+
+ return start_chars;
+ }
+
CompiledRegex m_program;
const ParsedRegex& m_parsed_regex;
};
@@ -1020,6 +1102,17 @@ auto test_regex = UnitTest{[]{
kak_assert(vm.exec("foofoofoo"));
kak_assert(not vm.exec("barbarbar"));
}
+
+ {
+ TestVM vm{R"((?<!\\)(?:\\\\)*")"};
+ kak_assert(vm.exec("foo\"", false));
+ }
+
+ {
+ TestVM vm{R"($)"};
+ kak_assert(vm.exec("foo\n", false, true));
+ kak_assert(*vm.m_captures->pos[0] == '\n');
+ }
}};
}
diff --git a/src/regex_impl.hh b/src/regex_impl.hh
index 0c15b57a..9e81a701 100644
--- a/src/regex_impl.hh
+++ b/src/regex_impl.hh
@@ -6,11 +6,12 @@
#include "utf8_iterator.hh"
#include "vector.hh"
#include "flags.hh"
+#include "ref_ptr.hh"
namespace Kakoune
{
-struct CompiledRegex
+struct CompiledRegex : RefCountable
{
enum Op : char
{
@@ -41,6 +42,9 @@ struct CompiledRegex
Vector<char> bytecode;
Vector<std::function<bool (Codepoint)>> matchers;
size_t save_count;
+
+ struct StartChars { bool map[256]; };
+ std::unique_ptr<StartChars> start_chars;
};
CompiledRegex compile_regex(StringView re);
@@ -311,6 +315,16 @@ struct ThreadedRegexVM
return false;
}
+ void to_next_start(Utf8It& start, const Iterator& end, const bool* start_chars)
+ {
+ if (not start_chars)
+ return;
+
+ while (start != end and *start >= 0 and *start < 256 and
+ not start_chars[*start])
+ ++start;
+ }
+
bool exec(Iterator begin, Iterator end, RegexExecFlags flags)
{
m_begin = begin;
@@ -324,6 +338,12 @@ struct ThreadedRegexVM
const bool no_saves = (m_flags & RegexExecFlags::NoSaves);
Utf8It start{m_begin, m_begin, m_end};
+
+ const bool* start_chars = m_program.start_chars ? m_program.start_chars->map : nullptr;
+
+ if (flags & RegexExecFlags::Search)
+ to_next_start(start, end, start_chars);
+
if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
current_threads, next_threads))
return true;
@@ -331,12 +351,15 @@ struct ThreadedRegexVM
if (not (flags & RegexExecFlags::Search))
return false;
- while (start != end)
+ do
{
- if (exec_from(++start, no_saves ? nullptr : new_saves<false>(nullptr),
+ to_next_start(++start, end, start_chars);
+ if (exec_from(start, no_saves ? nullptr : new_saves<false>(nullptr),
current_threads, next_threads))
return true;
}
+ while (start != end);
+
return false;
}