summaryrefslogtreecommitdiff
path: root/src/regex_impl.hh
diff options
context:
space:
mode:
authorEnrico Borba <enricozb@users.noreply.github.com>2024-12-23 09:23:58 +0100
committerGitHub <noreply@github.com>2024-12-23 09:23:58 +0100
commit52125e6336d596aebdd4da91080b3178ddca7449 (patch)
tree27d3e5c01660d567f22fee621c97753f294256b0 /src/regex_impl.hh
parent14cb35f62b36b2f1aa530adb5e31c05ff1347bfc (diff)
parent9c458c50661446fc6e7295787b06422137af099d (diff)
Merge branch 'master' into enricozb/daemon-stdin
Diffstat (limited to 'src/regex_impl.hh')
-rw-r--r--src/regex_impl.hh146
1 files changed, 91 insertions, 55 deletions
diff --git a/src/regex_impl.hh b/src/regex_impl.hh
index 07936027..2f7fad4e 100644
--- a/src/regex_impl.hh
+++ b/src/regex_impl.hh
@@ -9,6 +9,7 @@
#include "utils.hh"
#include <bit>
+#include <algorithm>
namespace Kakoune
{
@@ -155,6 +156,7 @@ struct CompiledRegex : UseMemoryDomain<MemoryDomain::Regex>
{
static constexpr Codepoint count = 256;
using OffsetLimits = std::numeric_limits<uint8_t>;
+ char start_byte = 0;
uint8_t offset = 0;
bool map[count];
};
@@ -210,11 +212,12 @@ constexpr bool is_direction(RegexMode mode)
(mode & ~(RegexMode::Forward | RegexMode::Backward)) == RegexMode{0};
}
-template<typename It, typename=void>
+template<typename It>
struct SentinelType { using Type = It; };
template<typename It>
-struct SentinelType<It, void_t<typename It::Sentinel>> { using Type = typename It::Sentinel; };
+ requires requires { typename It::Sentinel; }
+struct SentinelType<It> { using Type = typename It::Sentinel; };
template<typename Iterator, RegexMode mode>
requires (has_direction(mode))
@@ -257,8 +260,6 @@ public:
if (flags & RegexExecFlags::NotInitialNull and begin == end)
return false;
- constexpr bool search = (mode & RegexMode::Search);
-
const ExecConfig config{
Sentinel{forward ? begin : end},
Sentinel{forward ? end : begin},
@@ -267,24 +268,11 @@ public:
flags
};
- Iterator start = forward ? begin : end;
- if (const auto& start_desc = forward ? m_program.forward_start_desc : m_program.backward_start_desc)
- {
- if (search)
- {
- start = find_next_start(start, config, *start_desc);
- if (start == config.end) // If start_desc is not null, it means we consume at least one char
- return false;
- }
- else if (start != config.end)
- {
- const unsigned char c = forward ? *start : *utf8::previous(start, config.end);
- if (not start_desc->map[c])
- return false;
- }
- }
+ exec_program(forward ? begin : end, config, idle_func);
- return exec_program(std::move(start), config, idle_func);
+ while (not m_threads.next_is_empty())
+ release_saves(m_threads.pop_next().saves);
+ return m_found_match;
}
ArrayView<const Iterator> captures() const
@@ -370,8 +358,9 @@ private:
// Steps a thread until it consumes the current character, matches or fail
[[gnu::always_inline]]
- void step_thread(const Iterator& pos, Codepoint cp, uint16_t current_step, Thread thread, const ExecConfig& config)
+ void step_next_thread(const Iterator& pos, Codepoint cp, uint16_t current_step, const ExecConfig& config)
{
+ Thread thread = m_threads.pop_current();
auto failed = [this, &thread]() {
release_saves(thread.saves);
};
@@ -471,33 +460,48 @@ private:
return failed();
}
- bool exec_program(Iterator pos, const ExecConfig& config, auto&& idle_func)
+ void exec_program(const Iterator& start, const ExecConfig& config, auto&& idle_func)
{
kak_assert(m_threads.current_is_empty() and m_threads.next_is_empty());
+ m_threads.ensure_initial_capacity();
release_saves(m_captures);
m_captures = -1;
- m_threads.ensure_initial_capacity();
-
- ConstArrayView<CompiledRegex::Instruction> insts{m_program.instructions};
- const auto* first_inst = insts.begin() + (forward ? 0 : m_program.first_backward_inst);
- m_threads.push_current({first_inst, -1});
+ m_found_match = false;
const auto* start_desc = (forward ? m_program.forward_start_desc : m_program.backward_start_desc).get();
- auto next_start = pos;
+ Iterator next_start = start;
+ if (start_desc)
+ {
+ if (mode & RegexMode::Search)
+ {
+ next_start = find_next_start(start, config.end, *start_desc);
+ if (next_start == config.end) // If start_desc is not null, it means we consume at least one char
+ return;
+ }
+ else if (start != config.end)
+ {
+ const unsigned char c = forward ? *start : *utf8::previous(start, config.end);
+ if (not start_desc->map[c])
+ return;
+ }
+ }
+
+ const auto insts = forward ? ArrayView(m_program.instructions).subrange(0, m_program.first_backward_inst)
+ : ArrayView(m_program.instructions).subrange(m_program.first_backward_inst);
+ m_threads.push_current({insts.begin(), -1});
- constexpr bool search = mode & RegexMode::Search;
- constexpr bool any_match = mode & RegexMode::AnyMatch;
uint16_t current_step = -1;
- m_found_match = false;
- while (true) // Iterate on all codepoints and once at the end
+ uint8_t idle_count = 0; // Run idle loop every 256 * 65536 == 16M codepoints
+ auto pos = next_start;
+ while (pos != config.end)
{
if (++current_step == 0)
{
- idle_func();
+ if (++idle_count == 0)
+ idle_func();
// We wrapped, avoid potential collision on inst.last_step by resetting them
- for (auto& inst : forward ? insts.subrange(0, m_program.first_backward_inst)
- : insts.subrange(m_program.first_backward_inst))
+ for (auto& inst : insts)
inst.last_step = 0;
current_step = 1; // step 0 is never valid
}
@@ -506,38 +510,61 @@ private:
Codepoint cp = codepoint(next, config);
while (not m_threads.current_is_empty())
- step_thread(pos, cp, current_step, m_threads.pop_current(), config);
+ step_next_thread(pos, cp, current_step, config);
- if (pos == config.end or
- (m_threads.next_is_empty() and (not search or m_found_match)) or
- (m_found_match and any_match))
- {
- while (not m_threads.next_is_empty())
- release_saves(m_threads.pop_next().saves);
- return m_found_match;
- }
-
- if (search and not m_found_match)
+ if ((mode & RegexMode::Search) and not m_found_match)
{
if (start_desc)
{
if (pos == next_start)
- next_start = find_next_start(next, config, *start_desc);
+ next_start = find_next_start(next, config.end, *start_desc);
if (m_threads.next_is_empty())
next = next_start;
}
if (not start_desc or next == next_start)
- m_threads.push_next({first_inst, -1});
+ m_threads.push_next({insts.begin(), -1});
}
+ else if (m_threads.next_is_empty() or (m_found_match and (mode & RegexMode::AnyMatch)))
+ return;
+
pos = next;
m_threads.swap_next();
}
+
+ if (++current_step == 0)
+ {
+ for (auto& inst : insts)
+ inst.last_step = 0;
+ current_step = 1; // step 0 is never valid
+ }
+ while (not m_threads.current_is_empty())
+ step_next_thread(pos, -1, current_step, config);
}
- static Iterator find_next_start(Iterator start, const ExecConfig& config, const StartDesc& start_desc)
+ static Iterator find_next_start(const Iterator& start, const Sentinel& end, const StartDesc& start_desc)
{
auto pos = start;
- while (pos != config.end)
+ if (char start_byte = start_desc.start_byte)
+ {
+ while (pos != end)
+ {
+ if constexpr (forward)
+ {
+ if (*pos == start_byte)
+ return utf8::advance(pos, start, -CharCount(start_desc.offset));
+ ++pos;
+ }
+ else
+ {
+ auto prev = utf8::previous(pos, end);
+ if (*prev == start_byte)
+ return utf8::advance(pos, start, CharCount(start_desc.offset));
+ pos = prev;
+ }
+ }
+ }
+
+ while (pos != end)
{
static_assert(StartDesc::count <= 256, "start desc should be ascii only");
if constexpr (forward)
@@ -548,9 +575,9 @@ private:
}
else
{
- auto prev = utf8::previous(pos, config.end);
+ auto prev = utf8::previous(pos, end);
if (start_desc.map[static_cast<unsigned char>(*prev)])
- return pos;
+ return utf8::advance(pos, start, CharCount(start_desc.offset));
pos = prev;
}
}
@@ -652,10 +679,14 @@ private:
bool current_is_empty() const { return m_current == m_next_begin; }
bool next_is_empty() const { return m_next_end == m_next_begin; }
+ [[gnu::always_inline]]
void push_current(Thread thread) { m_data[decrement(m_current)] = thread; grow_ifn(true); }
+ [[gnu::always_inline]]
Thread pop_current() { return m_data[post_increment(m_current)]; }
+ [[gnu::always_inline]]
void push_next(Thread thread) { m_data[post_increment(m_next_end)] = thread; grow_ifn(false); }
+ [[gnu::always_inline]]
Thread pop_next() { return m_data[decrement(m_next_end)]; }
void swap_next()
@@ -675,11 +706,16 @@ private:
m_capacity = initial_capacity;
}
+ [[gnu::always_inline]]
void grow_ifn(bool pushed_current)
{
- if (m_current != m_next_end)
- return;
+ if (m_current == m_next_end)
+ grow(pushed_current);
+ }
+ [[gnu::noinline]]
+ void grow(bool pushed_current)
+ {
const auto new_capacity = m_capacity * 2;
Thread* new_data = new Thread[new_capacity];
Thread* old_data = m_data.get();