summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMaxime Coste <mawww@kakoune.org>2019-01-03 22:52:15 +1100
committerMaxime Coste <mawww@kakoune.org>2019-01-03 22:55:50 +1100
commit328c497be248faf4e13aaececaf849c844c59efe (patch)
treeeb3d225fc81389afc8938593f065cf715804d75d
parent56ee329d79d076742e60c10974c471cc8119ed05 (diff)
Add support for named captures to the regex impl and regex highlighter
ECMAScript is adding support for it, and it is a pretty isolated change to do. Fixes #2293
-rw-r--r--doc/pages/highlighters.asciidoc4
-rw-r--r--doc/pages/regex.asciidoc16
-rw-r--r--src/highlighters.cc15
-rw-r--r--src/regex.cc7
-rw-r--r--src/regex.hh1
-rw-r--r--src/regex_impl.cc52
-rw-r--r--src/regex_impl.hh7
-rw-r--r--test/highlight/named-captures/cmd1
-rw-r--r--test/highlight/named-captures/in1
-rw-r--r--test/highlight/named-captures/rc1
-rw-r--r--test/highlight/named-captures/ui-out7
11 files changed, 90 insertions, 22 deletions
diff --git a/doc/pages/highlighters.asciidoc b/doc/pages/highlighters.asciidoc
index ea6500cd..18504b3c 100644
--- a/doc/pages/highlighters.asciidoc
+++ b/doc/pages/highlighters.asciidoc
@@ -99,6 +99,10 @@ from the remaining parameters.
add-highlighter window/ regex //\h*(TODO:)[^\n]* 0:cyan 1:yellow,red
--------------------------------------------------------------------
+ capture_id can be either the capture number, or its name if a
+ named capture is used in the regex (See
+ <<regex#Groups, `:doc regex Groups`>>)
+
*dynregex* <expression> <capture_id>:<face> ...::
similar to regex, but expand (like a command parameter would) the
given expression before building a regex from the result.
diff --git a/doc/pages/regex.asciidoc b/doc/pages/regex.asciidoc
index e228c25f..a43fc589 100644
--- a/doc/pages/regex.asciidoc
+++ b/doc/pages/regex.asciidoc
@@ -78,17 +78,21 @@ Regex atoms can be grouped using `(` and `)` or `(?:` and `)`. If `(` is
used, the group will be a capturing group, which means the positions from
the subject strings that matched between `(` and `)` will be recorded.
-Capture groups are numbered starting at 1. They are numbered in the order of
-appearance of their `(` in the regex. A special capture group 0 is
-for the whole sequence that matched.
+Capture groups are numbered starting at 1. They are numbered in the
+order of appearance of their `(` in the regex. A special capture group
+0 is for the whole sequence that matched.
-`(?:` introduces a non capturing group, which will not record the
+* `(?:` introduces a non-capturing group, which will not record the
matching positions.
+* `(?<name>` introduces a named capturing group, which, in addition to
+being referred to by number, can be, in certain contexts, referred to by
+the given name.
+
== Alternations
-`|` introduces an alternation, which will either match its left-hand side,
-or its right-hand side (preferring the left-hand side)
+The `|` character introduces an alternation, which will either match
+its left-hand side, or its right-hand side (preferring the left-hand side)
For example, `foo|bar` matches either `foo` or `bar`, `foo(bar|baz|qux)`
matches `foo` followed by either `bar`, `baz` or `qux`.
diff --git a/src/highlighters.cc b/src/highlighters.cc
index 21418466..a45b5ee7 100644
--- a/src/highlighters.cc
+++ b/src/highlighters.cc
@@ -307,19 +307,25 @@ public:
if (params.size() < 2)
throw runtime_error("wrong parameter count");
+ Regex re{params[0], RegexCompileFlags::Optimize};
+
FacesSpec faces;
for (auto& spec : params.subrange(1))
{
auto colon = find(spec, ':');
if (colon == spec.end())
throw runtime_error(format("wrong face spec: '{}' expected <capture>:<facespec>", spec));
- int capture = str_to_int({spec.begin(), colon});
+ const StringView capture_name{spec.begin(), colon};
+ const int capture = str_to_int_ifp(capture_name).value_or_compute([&] {
+ return re.named_capture_index(capture_name);
+ });
+ if (capture < 0)
+ throw runtime_error(format("capture name {} is neither a capture index, nor an existing capture name",
+ capture_name));
faces.emplace_back(capture, String{colon+1, spec.end()});
}
- Regex ex{params[0], RegexCompileFlags::Optimize};
-
- return std::make_unique<RegexHighlighter>(std::move(ex), std::move(faces));
+ return std::make_unique<RegexHighlighter>(std::move(re), std::move(faces));
}
private:
@@ -492,7 +498,6 @@ std::unique_ptr<Highlighter> create_dynamic_regex_highlighter(HighlighterParamet
faces.emplace_back(capture, String{colon+1, spec.end()});
}
-
auto make_hl = [](auto& regex_getter, auto& face_getter) {
return std::make_unique<DynamicRegexHighlighter<std::decay_t<decltype(regex_getter)>,
std::decay_t<decltype(face_getter)>>>(
diff --git a/src/regex.cc b/src/regex.cc
index 970721c7..389b2376 100644
--- a/src/regex.cc
+++ b/src/regex.cc
@@ -1,4 +1,5 @@
#include "regex.hh"
+#include "ranges.hh"
namespace Kakoune
{
@@ -8,6 +9,12 @@ Regex::Regex(StringView re, RegexCompileFlags flags)
m_str{re.str()}
{}
+int Regex::named_capture_index(StringView name) const // index stored for the capture called `name`, or -1 if no capture has that name
+{
+ auto it = find_if(m_impl->named_captures, [&](auto& c) { return c.name == name; }); // search the compiled regex's named captures for an exact name match
+ return it != m_impl->named_captures.end() ? it->index : -1; // -1 is the "not found" result; a found index is converted from uint32_t to int
+}
+
String option_to_string(const Regex& re)
{
return re.str();
diff --git a/src/regex.hh b/src/regex.hh
index aac66921..4d7cc5f0 100644
--- a/src/regex.hh
+++ b/src/regex.hh
@@ -21,6 +21,7 @@ public:
const String& str() const { return m_str; }
size_t mark_count() const { return m_impl->save_count / 2 - 1; }
+ int named_capture_index(StringView name) const;
static constexpr const char* option_type_name = "regex";
diff --git a/src/regex_impl.cc b/src/regex_impl.cc
index a86f2824..7596ba40 100644
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@@ -85,7 +85,8 @@ struct ParsedRegex
Vector<Node, MemoryDomain::Regex> nodes;
Vector<CharacterClass, MemoryDomain::Regex> character_classes;
- size_t capture_count;
+ Vector<CompiledRegex::NamedCapture, MemoryDomain::Regex> named_captures;
+ uint32_t capture_count;
};
namespace
@@ -166,7 +167,7 @@ private:
using Iterator = utf8::iterator<const char*, const char*, Codepoint, int, InvalidPolicy>;
using NodeIndex = ParsedRegex::NodeIndex;
- NodeIndex disjunction(unsigned capture = -1)
+ NodeIndex disjunction(uint32_t capture = -1)
{
NodeIndex index = new_node(ParsedRegex::Alternation);
get_node(index).value = capture;
@@ -301,15 +302,25 @@ private:
return new_node(ParsedRegex::AnyCharExceptNewLine);
case '(':
{
- auto captures = [this, it = (++m_pos).base()]() mutable {
- if (m_regex.end() - it >= 2 and *it++ == '?' and *it++ == ':')
- {
- m_pos = Iterator{it, m_regex};
- return false;
- }
- return true;
- };
- NodeIndex content = disjunction(captures() ? m_parsed_regex.capture_count++ : -1);
+ uint32_t capture_group = -1;
+ const char* it = (++m_pos).base();
+ if (m_regex.end() - it < 2 or *it++ != '?')
+ capture_group = m_parsed_regex.capture_count++;
+ else if (*it == ':')
+ m_pos = Iterator{++it, m_regex};
+ else if (*it == '<')
+ {
+ const auto name_start = ++it;
+ while (it != m_regex.end() and is_word(*it))
+ ++it;
+ if (it == m_regex.end() or *it != '>')
+ parse_error("named captures should be only ascii word characters");
+ capture_group = m_parsed_regex.capture_count++;
+ m_parsed_regex.named_captures.push_back({{name_start, it}, capture_group});
+ m_pos = Iterator{++it, m_regex};
+ }
+
+ NodeIndex content = disjunction(capture_group);
if (at_end() or *m_pos++ != ')')
parse_error("unclosed parenthesis");
return content;
@@ -682,6 +693,7 @@ struct RegexCompiler
m_program.first_backward_inst = -1;
m_program.character_classes = std::move(m_parsed_regex.character_classes);
+ m_program.named_captures = std::move(m_parsed_regex.named_captures);
m_program.save_count = m_parsed_regex.capture_count * 2;
}
@@ -1526,6 +1538,24 @@ auto test_regex = UnitTest{[]{
const char str[] = "\0\nā˜Žā˜"; // work around the null byte in the literal
kak_assert(vm.exec({str, str + sizeof(str)-1}));
}
+
+ {
+ auto eq = [](const CompiledRegex::NamedCapture& lhs,
+ const CompiledRegex::NamedCapture& rhs) {
+ return lhs.name == rhs.name and
+ lhs.index == rhs.index;
+ };
+
+ TestVM<> vm{R"((?<year>\d+)-(?<month>\d+)-(?<day>\d+))"};
+ kak_assert(vm.exec("2019-01-03", RegexExecFlags::None));
+ kak_assert(StringView{vm.captures()[2], vm.captures()[3]} == "2019");
+ kak_assert(StringView{vm.captures()[4], vm.captures()[5]} == "01");
+ kak_assert(StringView{vm.captures()[6], vm.captures()[7]} == "03");
+ kak_assert(vm.named_captures.size() == 3);
+ kak_assert(eq(vm.named_captures[0], {"year", 1}));
+ kak_assert(eq(vm.named_captures[1], {"month", 2}));
+ kak_assert(eq(vm.named_captures[2], {"day", 3}));
+ }
}};
}
diff --git a/src/regex_impl.hh b/src/regex_impl.hh
index 86d65ddd..86ae81a8 100644
--- a/src/regex_impl.hh
+++ b/src/regex_impl.hh
@@ -107,9 +107,16 @@ struct CompiledRegex : RefCountable, UseMemoryDomain<MemoryDomain::Regex>
explicit operator bool() const { return not instructions.empty(); }
+ struct NamedCapture // pairs the name of a `(?<name>...)` group with its capture index
+ {
+ String name; // the name written between `(?<` and `>` in the regex
+ uint32_t index; // capture group index taken from the parser's capture counter when the group is opened
+ };
+
Vector<Instruction, MemoryDomain::Regex> instructions;
Vector<CharacterClass, MemoryDomain::Regex> character_classes;
Vector<Lookaround, MemoryDomain::Regex> lookarounds;
+ Vector<NamedCapture, MemoryDomain::Regex> named_captures;
uint32_t first_backward_inst; // -1 if no backward support, 0 if only backward, >0 if both forward and backward
uint32_t save_count;
diff --git a/test/highlight/named-captures/cmd b/test/highlight/named-captures/cmd
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/test/highlight/named-captures/cmd
@@ -0,0 +1 @@
+
diff --git a/test/highlight/named-captures/in b/test/highlight/named-captures/in
new file mode 100644
index 00000000..57ed1021
--- /dev/null
+++ b/test/highlight/named-captures/in
@@ -0,0 +1 @@
+2018-01-03
diff --git a/test/highlight/named-captures/rc b/test/highlight/named-captures/rc
new file mode 100644
index 00000000..14905790
--- /dev/null
+++ b/test/highlight/named-captures/rc
@@ -0,0 +1 @@
+add-highlighter window/ regex (?<year>\d+)-(?<month>\d+)-(?<day>\d+) year:red month:green day:yellow
diff --git a/test/highlight/named-captures/ui-out b/test/highlight/named-captures/ui-out
new file mode 100644
index 00000000..7350d79a
--- /dev/null
+++ b/test/highlight/named-captures/ui-out
@@ -0,0 +1,7 @@
+{ "jsonrpc": "2.0", "method": "set_ui_options", "params": [{}] }
+{ "jsonrpc": "2.0", "method": "draw", "params": [[[{ "face": { "fg": "black", "bg": "white", "attributes": [] }, "contents": "2" }, { "face": { "fg": "red", "bg": "default", "attributes": [] }, "contents": "018" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "-" }, { "face": { "fg": "green", "bg": "default", "attributes": [] }, "contents": "01" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "-" }, { "face": { "fg": "yellow", "bg": "default", "attributes": [] }, "contents": "03" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "\u000a" }]], { "fg": "default", "bg": "default", "attributes": [] }, { "fg": "blue", "bg": "default", "attributes": [] }] }
+{ "jsonrpc": "2.0", "method": "menu_hide", "params": [] }
+{ "jsonrpc": "2.0", "method": "info_hide", "params": [] }
+{ "jsonrpc": "2.0", "method": "draw_status", "params": [[], [{ "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": "out 1:1 " }, { "face": { "fg": "black", "bg": "yellow", "attributes": [] }, "contents": "" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " " }, { "face": { "fg": "blue", "bg": "default", "attributes": [] }, "contents": "1 sel" }, { "face": { "fg": "default", "bg": "default", "attributes": [] }, "contents": " - client0@[kak-tests]" }], { "fg": "cyan", "bg": "default", "attributes": [] }] }
+{ "jsonrpc": "2.0", "method": "set_cursor", "params": ["buffer", { "line": 0, "column": 0 }] }
+{ "jsonrpc": "2.0", "method": "refresh", "params": [true] }