summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Jacob Collins <jaco3collins@gmail.com> 2021-02-03 00:04:05 -0500
committer Jacob Collins <jaco3collins@gmail.com> 2021-02-25 11:03:18 -0500
commit 9dfab2f1fb383478db4116a1cf63d81e1c53bd0d (patch)
tree cf440ee95d259cc69a2584b0e6f00990d0d5f3f7 /src
parent 0e37ef649cdbba94974cd56c64dbab35062883a3 (diff)
Follow ECMA specification for regex whitespace
Changes the behaviour of the \s and \h character classes to include all WhiteSpace and LineTerminator characters defined in the ECMA specification.
- <https://262.ecma-international.org/11.0/#sec-white-space>
- <https://262.ecma-international.org/11.0/#sec-line-terminators>
- <https://262.ecma-international.org/11.0/#sec-characterclassescape>
Fixes #4034
Diffstat (limited to 'src')
-rw-r--r-- src/unicode.hh 36
1 files changed, 34 insertions, 2 deletions
diff --git a/src/unicode.hh b/src/unicode.hh
index 486fc381..a385b6eb 100644
--- a/src/unicode.hh
+++ b/src/unicode.hh
@@ -20,12 +20,44 @@ inline bool is_eol(Codepoint c) noexcept
inline bool is_horizontal_blank(Codepoint c) noexcept
{
- return c == ' ' or c == '\t';
+ // Returns true when c equals one of the code points compared below: the characters the ECMA regex spec treats as whitespace (tab, form feed, space, U+00A0, U+FEFF and the listed U+1680..U+3000 space characters)
+ // minus vertical tab. NOTE(review): U+2028 and U+2029 are also accepted here, although the spec section linked below appears to list them as LineTerminator rather than WhiteSpace -- confirm they are meant to count as "horizontal" blanks.
+ // Reference: <https://262.ecma-international.org/11.0/#sec-white-space>
+ return c == '\t' or
+ c == '\f' or
+ c == ' ' or
+ c == U'\u00A0' or
+ c == U'\uFEFF' or
+ c == U'\u1680' or
+ c == U'\u2000' or
+ c == U'\u2001' or
+ c == U'\u2002' or
+ c == U'\u2003' or
+ c == U'\u2004' or
+ c == U'\u2005' or
+ c == U'\u2006' or
+ c == U'\u2007' or
+ c == U'\u2008' or
+ c == U'\u2009' or
+ c == U'\u200A' or
+ c == U'\u2028' or
+ c == U'\u2029' or
+ c == U'\u202F' or
+ c == U'\u205F' or
+ c == U'\u3000' ;
}
inline bool is_blank(Codepoint c) noexcept
{
- return c == ' ' or c == '\t' or c == '\n';
+ // Returns true when c is any blank: one of the characters the ECMA regex spec treats as line terminators (line feed, carriage return, U+2028, U+2029)
+ // plus vertical tab, or anything is_horizontal_blank(c) accepts. NOTE(review): is_horizontal_blank as written already tests U+2028 and U+2029, so the two explicit tests here look redundant -- confirm which function should own them.
+ // Reference: <https://262.ecma-international.org/11.0/#sec-line-terminators>
+ return c == '\n' or
+ c == '\r' or
+ c == '\v' or
+ c == U'\u2028' or
+ c == U'\u2029' or
+ is_horizontal_blank(c) ;
}
enum WordType { Word, WORD };