diff options
| author | Maxime Coste <mawww@kakoune.org> | 2017-09-26 18:03:12 +0900 |
|---|---|---|
| committer | Maxime Coste <mawww@kakoune.org> | 2017-11-01 14:05:14 +0800 |
| commit | be157453ad4cb0a86ca7dd8ee4b20a5b0199cf5c (patch) | |
| tree | 5f9b99ad03e65535a74c9eee2765451e0b2c8828 /src | |
| parent | eb1015cdfbf78837453727113f532a961b193925 (diff) | |
Regex: Use a std::function based "Matcher" op to implement character classes
This is more extensible and should allow easier support for non ranges
classes.
Diffstat (limited to 'src')
| -rw-r--r-- | src/regex_impl.cc | 102 |
1 files changed, 26 insertions, 76 deletions
diff --git a/src/regex_impl.cc b/src/regex_impl.cc index 8a61b760..ad96846c 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -20,8 +20,7 @@ struct CompiledRegex Match, Literal, AnyChar, - CharRange, - NegativeCharRange, + Matcher, Jump, Split_PrioritizeParent, Split_PrioritizeChild, @@ -37,6 +36,7 @@ struct CompiledRegex using Offset = unsigned; Vector<char> bytecode; + Vector<std::function<bool (Codepoint)>> matchers; size_t save_count; }; @@ -75,8 +75,7 @@ enum class Op { Literal, AnyChar, - CharRange, - NegativeCharRange, + Matcher, Sequence, Alternation, LineStart, @@ -103,7 +102,7 @@ struct ParsedRegex { AstNodePtr ast; size_t capture_count; - Vector<Vector<CharRange>> ranges; + Vector<std::function<bool (Codepoint)>> matchers; }; AstNodePtr make_ast_node(Op op, Codepoint value = -1, @@ -261,14 +260,14 @@ private: const auto cp = *pos++; if (cp == '-') { - ranges.push_back({ '-', 0 }); + ranges.push_back({ '-', '-' }); continue; } if (pos == end) break; - CharRange range = { cp, 0 }; + CharRange range = { cp, cp }; if (*pos == '-') { if (++pos == end) @@ -283,10 +282,17 @@ private: throw runtime_error{"Unclosed character class"}; ++pos; - auto ranges_id = parsed_regex.ranges.size(); - parsed_regex.ranges.push_back(std::move(ranges)); + auto matcher = [negative, ranges = std::move(ranges)](Codepoint cp) { + auto found = contains_that(ranges, [cp](auto& r) { + return r.min <= cp and cp <= r.max; + }); + return negative ? not found : found; + }; + + auto matcher_id = parsed_regex.matchers.size(); + parsed_regex.matchers.push_back(std::move(matcher)); - return make_ast_node(negative ? Op::NegativeCharRange : Op::CharRange, ranges_id); + return make_ast_node(Op::Matcher, matcher_id); } static Quantifier quantifier(ParsedRegex& parsed_regex, Iterator& pos, Iterator end) @@ -371,32 +377,9 @@ CompiledRegex::Offset compile_node_inner(CompiledRegex& program, const ParsedReg case Op::AnyChar: program.bytecode.push_back(CompiledRegex::AnyChar); break; - case Op::CharRange: case Op::NegativeCharRange: - { - auto& ranges = parsed_regex.ranges[node->value]; - size_t single_count = std::count_if(ranges.begin(), ranges.end(), - [](auto& r) { return r.max == 0; }); - program.bytecode.push_back(node->op == Op::CharRange ? - CompiledRegex::CharRange - : CompiledRegex::NegativeCharRange); - - program.bytecode.push_back((char)single_count); - program.bytecode.push_back((char)(ranges.size() - single_count)); - for (auto& r : ranges) - { - if (r.max == 0) - push_codepoint(program, r.min); - } - for (auto& r : ranges) - { - if (r.max != 0) - { - push_codepoint(program, r.min); - push_codepoint(program, r.max); - } - } - break; - } + case Op::Matcher: + program.bytecode.push_back(CompiledRegex::Matcher); + program.bytecode.push_back(node->value); case Op::Sequence: for (auto& child : node->children) compile_node(program, parsed_regex, child); @@ -505,6 +488,7 @@ CompiledRegex compile(const ParsedRegex& parsed_regex) write_search_prefix(res); compile_node(res, parsed_regex, parsed_regex.ast); res.bytecode.push_back(CompiledRegex::Match); + res.matchers = parsed_regex.matchers; res.save_count = parsed_regex.capture_count * 2; return res; } @@ -547,24 +531,9 @@ void dump(const CompiledRegex& program) case CompiledRegex::Save: printf("save %d\n", *pos++); break; - case CompiledRegex::CharRange: case CompiledRegex::NegativeCharRange: - { - printf("%schar range, [", op == CompiledRegex::NegativeCharRange ? "negative " : ""); - auto single_count = *pos++; - auto range_count = *pos++; - for (int i = 0; i < single_count; ++i) - printf("%lc", utf8::read_codepoint(pos, (const char*)nullptr)); - printf("]"); - - for (int i = 0; i < range_count; ++i) - { - Codepoint min = utf8::read_codepoint(pos, (const char*)nullptr); - Codepoint max = utf8::read_codepoint(pos, (const char*)nullptr); - printf(" [%lc-%lc]", min, max); - } - printf("\n"); + case CompiledRegex::Matcher: + printf("matcher %d\n", *pos++); break; - } case CompiledRegex::LineStart: printf("line start\n"); break; @@ -649,30 +618,11 @@ struct ThreadedRegexVM thread.saves[index] = m_pos.base(); break; } - case CompiledRegex::CharRange: case CompiledRegex::NegativeCharRange: + case CompiledRegex::Matcher: { - const int single_count = *thread.inst++; - const int range_count = *thread.inst++; - for (int i = 0; i < single_count; ++i) - { - auto candidate = utf8::read_codepoint(thread.inst, prog_end); - if (cp == candidate) - { - thread.inst = utf8::advance(thread.inst, prog_end, CharCount{single_count - (i + 1) + range_count * 2}); - return op == CompiledRegex::CharRange ? StepResult::Consumed : StepResult::Failed; - } - } - for (int i = 0; i < range_count; ++i) - { - auto min = utf8::read_codepoint(thread.inst, prog_end); - auto max = utf8::read_codepoint(thread.inst, prog_end); - if (min <= cp and cp <= max) - { - thread.inst = utf8::advance(thread.inst, prog_end, CharCount{(range_count - (i + 1)) * 2}); - return op == CompiledRegex::CharRange ? StepResult::Consumed : StepResult::Failed; - } - } - return op == CompiledRegex::CharRange ? StepResult::Failed : StepResult::Consumed; + const int matcher_id = *thread.inst++; + return m_program.matchers[matcher_id](*m_pos) ? + StepResult::Consumed : StepResult::Failed; } case CompiledRegex::LineStart: if (not is_line_start()) |
