summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Maxime Coste <mawww@kakoune.org> 2024-03-13 17:29:05 +1100
committer Maxime Coste <mawww@kakoune.org> 2024-03-13 17:29:05 +1100
commit c4df0fac52c83fe68b7f583de1f419c976645dc0 (patch)
tree bc95671103c778995f75267a6c04f8e65bda1354 /src
parent c956413046ae1a57ab4dab9dee6986ec3e0eb2a6 (diff)
Simplify and accelerate start desc map
Store values for all possible bytes and fill utf8 multi byte start values when necessary.
Diffstat (limited to 'src')
-rw-r--r-- src/regex_impl.cc 27
-rw-r--r-- src/regex_impl.hh 11
2 files changed, 21 insertions, 17 deletions
diff --git a/src/regex_impl.cc b/src/regex_impl.cc
index 33d7a88a..b1630a1f 100644
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@@ -17,7 +17,6 @@
namespace Kakoune
{
-constexpr Codepoint CompiledRegex::StartDesc::other;
constexpr Codepoint CompiledRegex::StartDesc::count;
struct ParsedRegex
@@ -893,11 +892,17 @@ private:
bool compute_start_desc(ParsedRegex::NodeIndex index,
CompiledRegex::StartDesc& start_desc) const
{
+ // fill all bytes that mark the start of an utf8 multi byte sequence
+ auto add_multi_byte_utf8 = [&] {
+ std::fill(start_desc.map + 0b11000000, start_desc.map + 0b11111000, true);
+ };
+ static constexpr Codepoint single_byte_limit = 128;
+
auto& node = get_node(index);
switch (node.op)
{
case ParsedRegex::Literal:
- if (node.value < CompiledRegex::StartDesc::count)
+ if (node.value < single_byte_limit)
{
if (node.ignore_case)
{
@@ -908,14 +913,14 @@ private:
start_desc.map[node.value] = true;
}
else
- start_desc.map[CompiledRegex::StartDesc::other] = true;
+ add_multi_byte_utf8();
return node.quantifier.allows_none();
case ParsedRegex::AnyChar:
for (auto& b : start_desc.map)
b = true;
return node.quantifier.allows_none();
case ParsedRegex::AnyCharExceptNewLine:
- for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp)
+ for (Codepoint cp = 0; cp < single_byte_limit; ++cp)
{
if (cp != '\n')
start_desc.map[cp] = true;
@@ -930,33 +935,33 @@ private:
{
for (auto& range : character_class.ranges)
{
- const auto clamp = [](Codepoint cp) { return std::min(CompiledRegex::StartDesc::count, cp); };
+ const auto clamp = [](Codepoint cp) { return std::min(single_byte_limit, cp); };
for (auto cp = clamp(range.min), end = clamp(range.max + 1); cp < end; ++cp)
start_desc.map[cp] = true;
- if (range.max >= CompiledRegex::StartDesc::count)
- start_desc.map[CompiledRegex::StartDesc::other] = true;
+ if (range.max >= single_byte_limit)
+ add_multi_byte_utf8();
}
}
else
{
- for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp)
+ for (Codepoint cp = 0; cp < single_byte_limit; ++cp)
{
if (start_desc.map[cp] or character_class.matches(cp))
start_desc.map[cp] = true;
}
}
- start_desc.map[CompiledRegex::StartDesc::other] = true;
+ add_multi_byte_utf8();
return node.quantifier.allows_none();
}
case ParsedRegex::CharType:
{
const CharacterType ctype = (CharacterType)node.value;
- for (Codepoint cp = 0; cp < CompiledRegex::StartDesc::count; ++cp)
+ for (Codepoint cp = 0; cp < single_byte_limit; ++cp)
{
if (is_ctype(ctype, cp))
start_desc.map[cp] = true;
}
- start_desc.map[CompiledRegex::StartDesc::other] = true;
+ add_multi_byte_utf8();
return node.quantifier.allows_none();
}
case ParsedRegex::Sequence:
diff --git a/src/regex_impl.hh b/src/regex_impl.hh
index 90b797c8..7997994c 100644
--- a/src/regex_impl.hh
+++ b/src/regex_impl.hh
@@ -152,8 +152,7 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
struct StartDesc : UseMemoryDomain<MemoryDomain::Regex>
{
- static constexpr Codepoint count = 128;
- static constexpr Codepoint other = 0;
+ static constexpr Codepoint count = 256;
bool map[count];
};
@@ -277,7 +276,7 @@ public:
else if (start != config.end)
{
const unsigned char c = forward ? *start : *utf8::previous(start, config.end);
- if (not start_desc->map[(c < StartDesc::count) ? c : StartDesc::other])
+ if (not start_desc->map[c])
return false;
}
}
@@ -519,11 +518,11 @@ private:
{
while (start != config.end)
{
- static_assert(StartDesc::count <= 128, "start desc should be ascii only");
+ static_assert(StartDesc::count <= 256, "start desc should be ascii only");
if constexpr (forward)
{
const unsigned char c = *start;
- if (start_desc.map[(c < StartDesc::count) ? c : StartDesc::other])
+ if (start_desc.map[c])
return;
++start;
}
@@ -531,7 +530,7 @@ private:
{
auto prev = utf8::previous(start, config.end);
const unsigned char c = *prev;
- if (start_desc.map[(c < StartDesc::count) ? c : StartDesc::other])
+ if (start_desc.map[c])
return;
start = prev;
}