summary | refs | log | tree | commit | diff
path: root/src/unicode.hh
diff options
context:
space:
mode:
author Jacob Collins <jaco3collins@gmail.com> 2021-02-03 00:04:05 -0500
committer Jacob Collins <jaco3collins@gmail.com> 2021-02-25 11:03:18 -0500
commit 9dfab2f1fb383478db4116a1cf63d81e1c53bd0d (patch)
tree cf440ee95d259cc69a2584b0e6f00990d0d5f3f7 /src/unicode.hh
parent 0e37ef649cdbba94974cd56c64dbab35062883a3 (diff)
Follow ECMA specification for regex whitespace
Changes the behaviour of the \s and \h character classes to include all WhiteSpace and LineTerminator characters defined in the ECMA specification.

- <https://262.ecma-international.org/11.0/#sec-white-space>
- <https://262.ecma-international.org/11.0/#sec-line-terminators>
- <https://262.ecma-international.org/11.0/#sec-characterclassescape>

Fixes #4034
Diffstat (limited to 'src/unicode.hh')
-rw-r--r-- src/unicode.hh 36
1 file changed, 34 insertions, 2 deletions
diff --git a/src/unicode.hh b/src/unicode.hh
index 486fc381..a385b6eb 100644
--- a/src/unicode.hh
+++ b/src/unicode.hh
@@ -20,12 +20,44 @@ inline bool is_eol(Codepoint c) noexcept
inline bool is_horizontal_blank(Codepoint c) noexcept
{
- return c == ' ' or c == '\t';
+ // Characters considered whitespace by ECMA Regex Spec
+ // minus vertical tab
+ // <https://262.ecma-international.org/11.0/#sec-white-space>
+ return c == '\t' or
+ c == '\f' or
+ c == ' ' or
+ c == U'\u00A0' or
+ c == U'\uFEFF' or
+ c == U'\u1680' or
+ c == U'\u2000' or
+ c == U'\u2001' or
+ c == U'\u2002' or
+ c == U'\u2003' or
+ c == U'\u2004' or
+ c == U'\u2005' or
+ c == U'\u2006' or
+ c == U'\u2007' or
+ c == U'\u2008' or
+ c == U'\u2009' or
+ c == U'\u200A' or
+ c == U'\u2028' or
+ c == U'\u2029' or
+ c == U'\u202F' or
+ c == U'\u205F' or
+ c == U'\u3000' ;
}
inline bool is_blank(Codepoint c) noexcept
{
- return c == ' ' or c == '\t' or c == '\n';
+ // Characters considered Line Terminators by ECMA Regex Spec
+ // plus vertical tab
+ // <https://262.ecma-international.org/11.0/#sec-line-terminators>
+ return c == '\n' or
+ c == '\r' or
+ c == '\v' or
+ c == U'\u2028' or
+ c == U'\u2029' or
+ is_horizontal_blank(c) ;
}
enum WordType { Word, WORD };