Refactor command line parsing

Command line parsing now works as follows: * Quoted strings ('...', "..." and %~...~ with '~' non nestable) use 'doubling-up' for escaping their delimiter: if the delimiter appears twice in a row, it is considered part of the string and represents one delimiter character. So 'abc''def' == "abc'def". No other escaping takes place in those strings. * Balanced strings (%{...}) do not support any kind of escaping, but find the matching closing delimiter by taking nesting into account. So %{abc{def}} == "abc{def}". * Non-quoted words support escaping of `;` and whitespace with `\`. `%`, `'` and `"` can be escaped with `\` at the start of the word; they do not need escaping (and will not be escaped) elsewhere in a word, where they are treated literally. Any other use of '\' is a literal '\'. So \%abc%\;\ def == "%abc%; def". As discussed in #2046, this should make our command line syntax more robust, provide a simple programmatic way to escape a string's content (s/<delim>/<delim><delim>/g), be well defined instead of relying on ad-hoc undocumented behaviour, and interact nicely with other common escaping schemes by avoiding escaping hell (:grep <regex> can in most cases be written with the regex unquoted).
author: Maxime Coste <mawww@kakoune.org> 2018-05-21 22:22:34 +1000
committer: Maxime Coste <mawww@kakoune.org> 2018-07-05 07:54:28 +1000
commit: c829595d017eb2bddb059dd984d047819827723b (patch)
tree: 50f7cf5b5cd8f14ada48d5b8b29b76ad19ca5c17 /src/command_manager.cc
parent: cad5f37efd4d4178d0f6942df063074e9ab7e686 (diff)
1 files changed, 106 insertions, 43 deletions
diff --git a/src/command_manager.cc b/src/command_manager.cc
index 9429b334..a4af3e62 100644
--- a/src/command_manager.cc
+++ b/src/command_manager.cc
@@ -11,6 +11,7 @@
 #include "register_manager.hh"
 #include "shell_manager.hh"
 #include "utils.hh"
+#include "unit_tests.hh"
 
 #include <algorithm>
 
@@ -67,44 +68,41 @@ bool is_command_separator(Codepoint c)
     return c == ';' or c == '\n';
 }
 
-template<typename Func, typename UnescapeFunc>
-String get_until_delimiter(Reader& reader, Func is_delimiter,
-                           UnescapeFunc unescape = [](Codepoint) { return false; })
+struct QuotedResult
+{
+    String content;
+    bool terminated;
+};
+
+QuotedResult parse_quoted(Reader& reader, Codepoint delimiter)
 {
     auto beg = reader.pos;
     String str;
-    bool was_antislash = false;
 
     while (reader)
     {
         const Codepoint c = *reader;
-        if (is_delimiter(c) or (was_antislash and unescape(c)))
+        if (c == delimiter)
         {
             str += reader.substr_from(beg);
-            if (was_antislash)
+            ++reader;
+            if (reader and *reader == delimiter)
             {
-                str.back() = c;
+                str += String{c};
                 beg = reader.pos+1;
             }
             else
-                return str;
+                return {str, true};
         }
-        was_antislash = c == '\\';
         ++reader;
     }
     if (beg < reader.str.end())
         str += reader.substr_from(beg);
-    return str;
+    return {str, false};
 }
 
-[[gnu::always_inline]]
-inline String get_until_delimiter(Reader& reader, Codepoint c)
-{
-    return get_until_delimiter(reader, [c](Codepoint ch) { return c == ch; }, [](Codepoint) { return false; });
-}
-
-StringView get_until_closing_delimiter(Reader& reader, Codepoint opening_delimiter,
-                                       Codepoint closing_delimiter)
+QuotedResult parse_quoted_balanced(Reader& reader, Codepoint opening_delimiter,
+                                   Codepoint closing_delimiter)
 {
     kak_assert(utf8::codepoint(utf8::previous(reader.pos, reader.str.begin()),
                                reader.str.end()) == opening_delimiter);
@@ -115,16 +113,43 @@ StringView get_until_closing_delimiter(Reader& reader, Codepoint opening_delimit
         const Codepoint c = *reader;
         if (c == opening_delimiter)
             ++level;
-        else if (c == closing_delimiter)
+        else if (c == closing_delimiter and level-- == 0)
+        {
+            auto content = reader.substr_from(start);
+            ++reader;
+            return {content.str(), true};
+        }
+        ++reader;
+    }
+    return {reader.substr_from(start).str(), false};
+}
+
+String parse_unquoted(Reader& reader)
+{
+    auto beg = reader.pos;
+    String str;
+    bool was_antislash = false;
+
+    while (reader)
+    {
+        const Codepoint c = *reader;
+        if (is_command_separator(c) or is_horizontal_blank(c))
         {
-            if (level > 0)
-                --level;
+            str += reader.substr_from(beg);
+            if (was_antislash)
+            {
+                str.back() = c;
+                beg = reader.pos+1;
+            }
             else
-                break;
+                return str;
         }
+        was_antislash = c == '\\';
         ++reader;
     }
-    return reader.substr_from(start);
+    if (beg < reader.str.end())
+        str += reader.substr_from(beg);
+    return str;
 }
 
 Token::Type token_type(StringView type_name, bool throw_on_invalid)
@@ -203,25 +228,24 @@ Token parse_percent_token(Reader& reader, bool throw_on_unterminated)
     if (it != std::end(matching_pairs))
     {
         const Codepoint closing_delimiter = it->closing;
-        auto token = get_until_closing_delimiter(reader, opening_delimiter,
-                                                 closing_delimiter);
-        if (throw_on_unterminated and not reader)
+        auto quoted = parse_quoted_balanced(reader, opening_delimiter, closing_delimiter);
+        if (throw_on_unterminated and not quoted.terminated)
             throw parse_error{format("{}:{}: unterminated string '%{}{}...{}'",
                                      coord.line, coord.column, type_name,
                                      opening_delimiter, closing_delimiter)};
 
-        return {type, start - str_beg, coord, token.str()};
+        return {type, start - str_beg, coord, std::move(quoted.content)};
     }
     else
     {
-        String token = get_until_delimiter(reader, opening_delimiter);
+        auto quoted = parse_quoted(reader, opening_delimiter);
 
-        if (throw_on_unterminated and not reader)
+        if (throw_on_unterminated and not quoted.terminated)
             throw parse_error{format("{}:{}: unterminated string '%{}{}...{}'",
                                      coord.line, coord.column, type_name,
                                      opening_delimiter, opening_delimiter)};
 
-        return {type, start - str_beg, coord, std::move(token)};
+        return {type, start - str_beg, coord, std::move(quoted.content)};
     }
 }
 
@@ -297,20 +321,16 @@ Optional<Token> CommandParser::read_token(bool throw_on_unterminated)
     if (c == '"' or c == '\'')
     {
         start = (++m_reader).pos;
-        String token = get_until_delimiter(m_reader, c);
-        if (throw_on_unterminated and not m_reader)
+        QuotedResult quoted = parse_quoted(m_reader, c);
+        if (throw_on_unterminated and not quoted.terminated)
             throw parse_error{format("unterminated string {0}...{0}", c)};
-        if (m_reader)
-            ++m_reader;
         return Token{c == '"' ? Token::Type::RawEval
                               : Token::Type::RawQuoted,
-                     start - line.begin(), coord, std::move(token)};
+                     start - line.begin(), coord, std::move(quoted.content)};
     }
     else if (c == '%')
     {
         auto token = parse_percent_token(m_reader, throw_on_unterminated);
-        if (m_reader)
-            ++m_reader;
         return token;
     }
     else if (is_command_separator(*m_reader))
@@ -321,11 +341,14 @@ Optional<Token> CommandParser::read_token(bool throw_on_unterminated)
     }
     else
     {
-        String str = get_until_delimiter(m_reader, [](Codepoint c) {
-            return is_command_separator(c) or is_horizontal_blank(c);
-        }, [](Codepoint c) { return c == '%'; });
+        if (c == '\\')
+        {
+            auto next = utf8::codepoint(utf8::next(m_reader.pos, m_reader.str.end()), m_reader.str.end());
+            if (next == '%' or next == '\'' or next == '"')
+                ++m_reader;
+        }
         return Token{Token::Type::Raw, start - line.begin(),
-                     coord, std::move(str)};
+                     coord, parse_unquoted(m_reader)};
     }
     return {};
 }
@@ -350,7 +373,7 @@ String expand_impl(StringView str, const Context& context,
             {
                 res += reader.substr_from(beg);
                 res.back() = c;
-                beg = (++reader).pos;
+                beg = reader.pos;
             }
         }
         else if (c == '%')
@@ -358,7 +381,7 @@ String expand_impl(StringView str, const Context& context,
             res += reader.substr_from(beg);
             res += postprocess(expand_token(parse_percent_token(reader, true),
                                             context, shell_context));
-            beg = (++reader).pos;
+            beg = reader.pos;
         }
         else
             ++reader;
@@ -660,4 +683,44 @@ Completions CommandManager::complete(const Context& context,
     return Completions{};
 }
 
+UnitTest test_command_parsing{[]
+{
+    auto check_quoted = [](StringView str, bool terminated, StringView content)
+    {
+        Reader reader{str};
+        const Codepoint delimiter = *reader;
+        auto quoted = parse_quoted(++reader, delimiter);
+        kak_assert(quoted.terminated == terminated);
+        kak_assert(quoted.content == content);
+    };
+
+    check_quoted("'abc'", true, "abc");
+    check_quoted("'abc''def", false, "abc'def");
+    check_quoted("'abc''def'''", true, "abc'def'");
+
+    auto check_balanced = [](StringView str, Codepoint opening, Codepoint closing, bool terminated, StringView content)
+    {
+        Reader reader{str};
+        auto quoted = parse_quoted_balanced(++reader, opening, closing);
+        kak_assert(quoted.terminated == terminated);
+        kak_assert(quoted.content == content);
+    };
+
+    check_balanced("{abc}", '{', '}', true, "abc");
+    check_balanced("{abc{def}}", '{', '}', true, "abc{def}");
+    check_balanced("{{abc}{def}", '{', '}', false, "{abc}{def}");
+
+    auto check_unquoted = [](StringView str, StringView content)
+    {
+        Reader reader{str};
+        auto res = parse_unquoted(reader);
+        kak_assert(res == content);
+    };
+
+    check_unquoted("abc def", "abc");
+    check_unquoted("abc; def", "abc");
+    check_unquoted("abc\\; def", "abc;");
+    check_unquoted("abc\\;\\ def", "abc; def");
+}};
+
 }
author	Maxime Coste <mawww@kakoune.org>	2018-05-21 22:22:34 +1000
committer	Maxime Coste <mawww@kakoune.org>	2018-07-05 07:54:28 +1000
commit	c829595d017eb2bddb059dd984d047819827723b (patch)
tree	50f7cf5b5cd8f14ada48d5b8b29b76ad19ca5c17 /src/command_manager.cc
parent	cad5f37efd4d4178d0f6942df063074e9ab7e686 (diff)