summary refs log tree commit diff
path: root/src/regex_impl.cc
diff options
context:
space:
mode:
author Maxime Coste <mawww@kakoune.org> 2019-01-03 22:52:15 +1100
committer Maxime Coste <mawww@kakoune.org> 2019-01-03 22:55:50 +1100
commit 328c497be248faf4e13aaececaf849c844c59efe (patch)
tree eb3d225fc81389afc8938593f065cf715804d75d /src/regex_impl.cc
parent 56ee329d79d076742e60c10974c471cc8119ed05 (diff)
Add support for named captures to the regex impl and regex highlighter
ECMAScript is adding support for it, and it is a pretty isolated change to do. Fixes #2293
Diffstat (limited to 'src/regex_impl.cc')
-rw-r--r-- src/regex_impl.cc 52
1 files changed, 41 insertions, 11 deletions
diff --git a/src/regex_impl.cc b/src/regex_impl.cc
index a86f2824..7596ba40 100644
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@@ -85,7 +85,8 @@ struct ParsedRegex
Vector<Node, MemoryDomain::Regex> nodes;
Vector<CharacterClass, MemoryDomain::Regex> character_classes;
- size_t capture_count;
+ Vector<CompiledRegex::NamedCapture, MemoryDomain::Regex> named_captures;
+ uint32_t capture_count;
};
namespace
@@ -166,7 +167,7 @@ private:
using Iterator = utf8::iterator<const char*, const char*, Codepoint, int, InvalidPolicy>;
using NodeIndex = ParsedRegex::NodeIndex;
- NodeIndex disjunction(unsigned capture = -1)
+ NodeIndex disjunction(uint32_t capture = -1)
{
NodeIndex index = new_node(ParsedRegex::Alternation);
get_node(index).value = capture;
@@ -301,15 +302,25 @@ private:
return new_node(ParsedRegex::AnyCharExceptNewLine);
case '(':
{
- auto captures = [this, it = (++m_pos).base()]() mutable {
- if (m_regex.end() - it >= 2 and *it++ == '?' and *it++ == ':')
- {
- m_pos = Iterator{it, m_regex};
- return false;
- }
- return true;
- };
- NodeIndex content = disjunction(captures() ? m_parsed_regex.capture_count++ : -1);
+ uint32_t capture_group = -1;
+ const char* it = (++m_pos).base();
+ if (m_regex.end() - it < 2 or *it++ != '?')
+ capture_group = m_parsed_regex.capture_count++;
+ else if (*it == ':')
+ m_pos = Iterator{++it, m_regex};
+ else if (*it == '<')
+ {
+ const auto name_start = ++it;
+ while (it != m_regex.end() and is_word(*it))
+ ++it;
+ if (it == m_regex.end() or *it != '>')
+ parse_error("named captures should be only ascii word characters");
+ capture_group = m_parsed_regex.capture_count++;
+ m_parsed_regex.named_captures.push_back({{name_start, it}, capture_group});
+ m_pos = Iterator{++it, m_regex};
+ }
+
+ NodeIndex content = disjunction(capture_group);
if (at_end() or *m_pos++ != ')')
parse_error("unclosed parenthesis");
return content;
@@ -682,6 +693,7 @@ struct RegexCompiler
m_program.first_backward_inst = -1;
m_program.character_classes = std::move(m_parsed_regex.character_classes);
+ m_program.named_captures = std::move(m_parsed_regex.named_captures);
m_program.save_count = m_parsed_regex.capture_count * 2;
}
@@ -1526,6 +1538,24 @@ auto test_regex = UnitTest{[]{
const char str[] = "\0\n☎☏"; // work around the null byte in the literal
kak_assert(vm.exec({str, str + sizeof(str)-1}));
}
+
+ {
+ auto eq = [](const CompiledRegex::NamedCapture& lhs,
+ const CompiledRegex::NamedCapture& rhs) {
+ return lhs.name == rhs.name and
+ lhs.index == rhs.index;
+ };
+
+ TestVM<> vm{R"((?<year>\d+)-(?<month>\d+)-(?<day>\d+))"};
+ kak_assert(vm.exec("2019-01-03", RegexExecFlags::None));
+ kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "2019");
+ kak_assert(StringView{vm.captures()[4], vm.captures()[5]} == "01");
+ kak_assert(StringView{vm.captures()[6], vm.captures()[7]} == "03");
+ kak_assert(vm.named_captures.size() == 3);
+ kak_assert(eq(vm.named_captures[0], {"year", 1}));
+ kak_assert(eq(vm.named_captures[1], {"month", 2}));
+ kak_assert(eq(vm.named_captures[2], {"day", 3}));
+ }
}};
}