diff options
| author | Maxime Coste <mawww@kakoune.org> | 2019-01-03 22:52:15 +1100 |
|---|---|---|
| committer | Maxime Coste <mawww@kakoune.org> | 2019-01-03 22:55:50 +1100 |
| commit | 328c497be248faf4e13aaececaf849c844c59efe (patch) | |
| tree | eb3d225fc81389afc8938593f065cf715804d75d /src/regex_impl.cc | |
| parent | 56ee329d79d076742e60c10974c471cc8119ed05 (diff) | |
Add support for named captures to the regex impl and regex highlighter
ECMAScript is adding support for them, and it is a pretty isolated
change to make.
Fixes #2293
Diffstat (limited to 'src/regex_impl.cc')
| -rw-r--r-- | src/regex_impl.cc | 52 |
1 file changed, 41 insertions, 11 deletions
diff --git a/src/regex_impl.cc b/src/regex_impl.cc index a86f2824..7596ba40 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -85,7 +85,8 @@ struct ParsedRegex Vector<Node, MemoryDomain::Regex> nodes; Vector<CharacterClass, MemoryDomain::Regex> character_classes; - size_t capture_count; + Vector<CompiledRegex::NamedCapture, MemoryDomain::Regex> named_captures; + uint32_t capture_count; }; namespace @@ -166,7 +167,7 @@ private: using Iterator = utf8::iterator<const char*, const char*, Codepoint, int, InvalidPolicy>; using NodeIndex = ParsedRegex::NodeIndex; - NodeIndex disjunction(unsigned capture = -1) + NodeIndex disjunction(uint32_t capture = -1) { NodeIndex index = new_node(ParsedRegex::Alternation); get_node(index).value = capture; @@ -301,15 +302,25 @@ private: return new_node(ParsedRegex::AnyCharExceptNewLine); case '(': { - auto captures = [this, it = (++m_pos).base()]() mutable { - if (m_regex.end() - it >= 2 and *it++ == '?' and *it++ == ':') - { - m_pos = Iterator{it, m_regex}; - return false; - } - return true; - }; - NodeIndex content = disjunction(captures() ? 
m_parsed_regex.capture_count++ : -1); + uint32_t capture_group = -1; + const char* it = (++m_pos).base(); + if (m_regex.end() - it < 2 or *it++ != '?') + capture_group = m_parsed_regex.capture_count++; + else if (*it == ':') + m_pos = Iterator{++it, m_regex}; + else if (*it == '<') + { + const auto name_start = ++it; + while (it != m_regex.end() and is_word(*it)) + ++it; + if (it == m_regex.end() or *it != '>') + parse_error("named captures should be only ascii word characters"); + capture_group = m_parsed_regex.capture_count++; + m_parsed_regex.named_captures.push_back({{name_start, it}, capture_group}); + m_pos = Iterator{++it, m_regex}; + } + + NodeIndex content = disjunction(capture_group); if (at_end() or *m_pos++ != ')') parse_error("unclosed parenthesis"); return content; @@ -682,6 +693,7 @@ struct RegexCompiler m_program.first_backward_inst = -1; m_program.character_classes = std::move(m_parsed_regex.character_classes); + m_program.named_captures = std::move(m_parsed_regex.named_captures); m_program.save_count = m_parsed_regex.capture_count * 2; } @@ -1526,6 +1538,24 @@ auto test_regex = UnitTest{[]{ const char str[] = "\0\nāā"; // work around the null byte in the literal kak_assert(vm.exec({str, str + sizeof(str)-1})); } + + { + auto eq = [](const CompiledRegex::NamedCapture& lhs, + const CompiledRegex::NamedCapture& rhs) { + return lhs.name == rhs.name and + lhs.index == rhs.index; + }; + + TestVM<> vm{R"((?<year>\d+)-(?<month>\d+)-(?<day>\d+))"}; + kak_assert(vm.exec("2019-01-03", RegexExecFlags::None)); + kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "2019"); + kak_assert(StringView{vm.captures()[4], vm.captures()[5]} == "01"); + kak_assert(StringView{vm.captures()[6], vm.captures()[7]} == "03"); + kak_assert(vm.named_captures.size() == 3); + kak_assert(eq(vm.named_captures[0], {"year", 1})); + kak_assert(eq(vm.named_captures[1], {"month", 2})); + kak_assert(eq(vm.named_captures[2], {"day", 3})); + } }}; } |
