summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJacob Collins <jaco3collins@gmail.com>2021-02-03 00:04:05 -0500
committerJacob Collins <jaco3collins@gmail.com>2021-02-25 11:03:18 -0500
commit9dfab2f1fb383478db4116a1cf63d81e1c53bd0d (patch)
treecf440ee95d259cc69a2584b0e6f00990d0d5f3f7
parent0e37ef649cdbba94974cd56c64dbab35062883a3 (diff)
Follow ECMA specification for regex whitespace
Changes the behaviour of the \s and \h character classes to include all WhiteSpace and LineTerminator characters defined in the ECMA specification.
- <https://262.ecma-international.org/11.0/#sec-white-space>
- <https://262.ecma-international.org/11.0/#sec-line-terminators>
- <https://262.ecma-international.org/11.0/#sec-characterclassescape>
Fixes #4034
-rw-r--r--src/unicode.hh36
1 files changed, 34 insertions, 2 deletions
diff --git a/src/unicode.hh b/src/unicode.hh
index 486fc381..a385b6eb 100644
--- a/src/unicode.hh
+++ b/src/unicode.hh
@@ -20,12 +20,44 @@ inline bool is_eol(Codepoint c) noexcept
inline bool is_horizontal_blank(Codepoint c) noexcept
{
- return c == ' ' or c == '\t';
+ // True for the code points ECMAScript classes as WhiteSpace, minus vertical tab;
+ // also true for U+2028 and U+2029, which that spec lists under LineTerminator.
+ // <https://262.ecma-international.org/11.0/#sec-white-space>
+ return c == '\t' or
+ c == '\f' or
+ c == ' ' or
+ c == U'\u00A0' or
+ c == U'\uFEFF' or
+ c == U'\u1680' or
+ c == U'\u2000' or
+ c == U'\u2001' or
+ c == U'\u2002' or
+ c == U'\u2003' or
+ c == U'\u2004' or
+ c == U'\u2005' or
+ c == U'\u2006' or
+ c == U'\u2007' or
+ c == U'\u2008' or
+ c == U'\u2009' or
+ c == U'\u200A' or
+ c == U'\u2028' or
+ c == U'\u2029' or
+ c == U'\u202F' or
+ c == U'\u205F' or
+ c == U'\u3000' ;
}
inline bool is_blank(Codepoint c) noexcept
{
- return c == ' ' or c == '\t' or c == '\n';
+ // True for the ECMAScript LineTerminator code points (LF, CR, U+2028, U+2029), for vertical tab,
+ // and for anything is_horizontal_blank() accepts (which itself already covers U+2028/U+2029).
+ // <https://262.ecma-international.org/11.0/#sec-line-terminators>
+ return c == '\n' or
+ c == '\r' or
+ c == '\v' or
+ c == U'\u2028' or
+ c == U'\u2029' or
+ is_horizontal_blank(c) ;
}
enum WordType { Word, WORD };