Regex: Add support for \h and \H "horizontal blank" character classes

author: Maxime Coste <mawww@kakoune.org> 2017-09-27 14:04:05 +0800
committer: Maxime Coste <mawww@kakoune.org> 2017-11-01 14:05:14 +0800
commit: e4004a7b7fbbc2fb903394dcb172c701f7733847 (patch)
tree: a1780b18c2e2d29b401538107e49632a3242ed83 /src
parent: 4ac0d35d1e00dec9461d3ecfa2057cfda1dab31e (diff)
1 file changed, 26 insertions, 10 deletions
diff --git a/src/regex_impl.cc b/src/regex_impl.cc
index d3ca59d9..868b319b 100644
--- a/src/regex_impl.cc
+++ b/src/regex_impl.cc
@@ -222,9 +222,9 @@ private:
             {
                 auto matcher_id = m_parsed_regex.matchers.size();
                 m_parsed_regex.matchers.push_back(
-                    [ctype = wctype(character_class.ctype),
+                    [ctype = character_class.ctype ? wctype(character_class.ctype) : (wctype_t)0,
                      chars = character_class.additional_chars] (Codepoint cp) {
-                        return iswctype(cp, ctype) or contains(chars, cp);
+                        return (ctype != 0 and iswctype(cp, ctype)) or contains(chars, cp);
                     });
                 return new_node(ParsedRegex::Matcher, matcher_id);
             }
@@ -255,6 +255,7 @@ private:
 
         struct CharRange { Codepoint min, max; };
         Vector<CharRange> ranges;
+        Vector<Codepoint> excluded;
         Vector<std::pair<wctype_t, bool>> ctypes;
         while (m_pos != m_regex.end() and *m_pos != ']')
         {
@@ -274,9 +275,15 @@ private:
                                   [cp = *m_pos](auto& t) { return t.cp == cp; });
                 if (it != std::end(character_class_escapes))
                 {
-                    ctypes.push_back({wctype(it->ctype), not it->neg});
-                    for (auto& c : it->additional_chars)
-                        ranges.push_back({(Codepoint)c, (Codepoint)c});
+                    if (it->ctype)
+                        ctypes.push_back({wctype(it->ctype), not it->neg});
+                    for (auto& c : it->additional_chars) // TODO: handle negative case
+                    {
+                        if (it->neg)
+                            excluded.push_back((Codepoint)c);
+                        else
+                            ranges.push_back({(Codepoint)c, (Codepoint)c});
+                    }
                     ++m_pos;
                     continue;
                 }
@@ -306,12 +313,13 @@ private:
         ++m_pos;
 
         auto matcher = [ranges = std::move(ranges),
-                        ctypes = std::move(ctypes), negative] (Codepoint cp) {
+                        ctypes = std::move(ctypes),
+                        excluded = std::move(excluded), negative] (Codepoint cp) {
             auto found = contains_that(ranges, [cp](auto& r) {
                 return r.min <= cp and cp <= r.max;
             }) or contains_that(ctypes, [cp](auto& c) {
                 return (bool)iswctype(cp, c.first) == c.second;
-            });
+            }) or (not excluded.empty() and not contains(excluded, cp));
             return negative ? not found : found;
         };
 
@@ -390,17 +398,19 @@ private:
         bool neg;
     };
 
-    static const CharacterClassEscape character_class_escapes[6];
+    static const CharacterClassEscape character_class_escapes[8];
 };
 
 // For some reason Gcc fails to link if this is constexpr
-const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[6] = {
+const RegexParser::CharacterClassEscape RegexParser::character_class_escapes[8] = {
     { 'd', "digit", "", false },
     { 'D', "digit", "", true },
     { 'w', "alnum", "_", false },
     { 'W', "alnum", "_", true },
     { 's', "space", "", false },
-    { 's', "space", "", true },
+    { 'S', "space", "", true },
+    { 'h', nullptr, " \t", false },
+    { 'H', nullptr, " \t", true },
 };
 
 struct CompiledRegex
@@ -983,6 +993,12 @@ auto test_regex = UnitTest{[]{
     }
 
     {
+        TestVM vm{R"([ \H]+)"};
+        kak_assert(vm.exec("abc "));
+        kak_assert(not vm.exec("a \t"));
+    }
+
+    {
         TestVM vm{R"(\Q{}[]*+?\Ea+)"};
         kak_assert(vm.exec("{}[]*+?aa"));
     }
author	Maxime Coste <mawww@kakoune.org>	2017-09-27 14:04:05 +0800
committer	Maxime Coste <mawww@kakoune.org>	2017-11-01 14:05:14 +0800
commit	e4004a7b7fbbc2fb903394dcb172c701f7733847 (patch)
tree	a1780b18c2e2d29b401538107e49632a3242ed83 /src
parent	4ac0d35d1e00dec9461d3ecfa2057cfda1dab31e (diff)