summaryrefslogtreecommitdiff
path: root/src/regex_impl.cc
diff options
context:
space:
mode:
author Maxime Coste <mawww@kakoune.org> 2019-11-06 20:48:48 +1100
committer Maxime Coste <mawww@kakoune.org> 2019-11-06 20:48:48 +1100
commit 3e7301ede7fd302b361727734d1906cca896dede (patch)
tree 06b60c18197db07d6477cff7d9af5d0c594862eb /src/regex_impl.cc
parent 3816f1fa43cbab05b292e062f0d35a121154e0cc (diff)
Support \x and \u escapes in regex character classes
Change \u to use 6 digits to cover the full unicode range. Fixes #3172
Diffstat (limited to 'src/regex_impl.cc')
-rw-r--r-- src/regex_impl.cc 97
1 files changed, 63 insertions, 34 deletions
diff --git a/src/regex_impl.cc b/src/regex_impl.cc
index 02db75a0..4ad80c29 100644
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@@ -349,6 +349,29 @@ private:
}
}
+ Codepoint read_hex(size_t count)
+ {
+ Codepoint res = 0;
+ for (int i = 0; i < count; ++i)
+ {
+ if (at_end())
+ parse_error("unterminated hex sequence");
+ Codepoint digit = *m_pos++;
+ Codepoint digit_value;
+ if ('0' <= digit and digit <= '9')
+ digit_value = digit - '0';
+ else if ('a' <= digit and digit <= 'f')
+ digit_value = 0xa + digit - 'a';
+ else if ('A' <= digit and digit <= 'F')
+ digit_value = 0xa + digit - 'A';
+ else
+ parse_error(format("invalid hex digit '{}'", digit));
+
+ res = res * 16 + digit_value;
+ }
+ return res;
+ }
+
NodeIndex atom_escape()
{
const Codepoint cp = *m_pos++;
@@ -381,29 +404,6 @@ private:
return new_node(ParsedRegex::Literal, control.value);
}
- auto read_hex = [this](size_t count)
- {
- Codepoint res = 0;
- for (int i = 0; i < count; ++i)
- {
- if (at_end())
- parse_error("unterminated hex sequence");
- Codepoint digit = *m_pos++;
- Codepoint digit_value;
- if ('0' <= digit and digit <= '9')
- digit_value = digit - '0';
- else if ('a' <= digit and digit <= 'f')
- digit_value = 0xa + digit - 'a';
- else if ('A' <= digit and digit <= 'F')
- digit_value = 0xa + digit - 'A';
- else
- parse_error(format("invalid hex digit '{}'", digit));
-
- res = res * 16 + digit_value;
- }
- return res;
- };
-
if (cp == '0')
return new_node(ParsedRegex::Literal, '\0');
else if (cp == 'c')
@@ -418,7 +418,7 @@ private:
else if (cp == 'x')
return new_node(ParsedRegex::Literal, read_hex(2));
else if (cp == 'u')
- return new_node(ParsedRegex::Literal, read_hex(4));
+ return new_node(ParsedRegex::Literal, read_hex(6));
if (contains("^$\\.*+?()[]{}|", cp)) // SyntaxCharacter
return new_node(ParsedRegex::Literal, cp);
@@ -470,6 +470,20 @@ private:
if (at_end())
break;
+ auto read_escaped_char = [this]() {
+ Codepoint cp = *m_pos++;
+ auto it = find_if(control_escapes, [cp](auto&& t) { return t.name == cp; });
+ if (it != std::end(control_escapes))
+ return it->value;
+ if (cp == 'x')
+ return read_hex(2);
+ if (cp == 'u')
+ return read_hex(6);
+ if (not contains("^$\\.*+?()[]{}|-", cp)) // SyntaxCharacter and -
+ parse_error(format("unknown character class escape '{}'", cp));
+ return cp;
+ };
+
if (cp == '\\')
{
auto it = find_if(character_class_escapes,
@@ -481,14 +495,7 @@ private:
continue;
}
else // its an escaped character
- {
- cp = *m_pos++;
- auto it = find_if(control_escapes, [cp](auto&& t) { return t.name == cp; });
- if (it != std::end(control_escapes))
- cp = it->value;
- else if (not contains("^$\\.*+?()[]{}|-", cp)) // SyntaxCharacter and -
- parse_error(format("unknown character class escape '{}'", cp));
- }
+ cp = read_escaped_char();
}
CharacterClass::Range range = { cp, cp };
@@ -498,7 +505,10 @@ private:
break;
if (*m_pos != ']')
{
- range.max = *m_pos++;
+ cp = *m_pos++;
+ if (cp == '\\')
+ cp = read_escaped_char();
+ range.max = cp;
if (range.min > range.max)
parse_error("invalid range specified");
}
@@ -1523,12 +1533,31 @@ auto test_regex = UnitTest{[]{
}
{
+ TestVM<> vm{R"([\t-\r]+)"};
+ kak_assert(vm.exec("\t\n\v\f\r"));
+ }
+
+ {
+ TestVM<> vm{R"([^\x00-\x7F]+)"};
+ kak_assert(not vm.exec("ascii"));
+ kak_assert(vm.exec("←↑→↓"));
+ kak_assert(vm.exec("😄😊😉"));
+ }
+
+ {
+ TestVM<> vm{R"([^\u000000-\u00ffff]+)"};
+ kak_assert(not vm.exec("ascii"));
+ kak_assert(not vm.exec("←↑→↓"));
+ kak_assert(vm.exec("😄😊😉"));
+ }
+
+ {
TestVM<RegexMode::Forward | RegexMode::Search> vm{R"(д)"};
kak_assert(vm.exec("д", RegexExecFlags::None));
}
{
- TestVM<> vm{R"(\0\x0A\u260e\u260F)"};
+ TestVM<> vm{R"(\0\x0A\u00260e\u00260F)"};
const char str[] = "\0\n☎☏"; // work around the null byte in the literal
kak_assert(vm.exec({str, str + sizeof(str)-1}));
}