diff options
| author | Jacob Collins <jaco3collins@gmail.com> | 2021-02-03 00:04:05 -0500 |
|---|---|---|
| committer | Jacob Collins <jaco3collins@gmail.com> | 2021-02-25 11:03:18 -0500 |
| commit | 9dfab2f1fb383478db4116a1cf63d81e1c53bd0d (patch) | |
| tree | cf440ee95d259cc69a2584b0e6f00990d0d5f3f7 /src/unicode.hh | |
| parent | 0e37ef649cdbba94974cd56c64dbab35062883a3 (diff) | |
Follow ECMA specification for regex whitespace
Changes the behaviour of the \s and \h character classes to include
all WhiteSpace and LineTerminator characters defined in the ECMA
specification.
- <https://262.ecma-international.org/11.0/#sec-white-space>
- <https://262.ecma-international.org/11.0/#sec-line-terminators>
- <https://262.ecma-international.org/11.0/#sec-characterclassescape>
Fixes #4034
Diffstat (limited to 'src/unicode.hh')
| -rw-r--r-- | src/unicode.hh | 36 |
1 file changed, 34 insertions, 2 deletions
```diff
diff --git a/src/unicode.hh b/src/unicode.hh
index 486fc381..a385b6eb 100644
--- a/src/unicode.hh
+++ b/src/unicode.hh
@@ -20,12 +20,44 @@ inline bool is_eol(Codepoint c) noexcept
 
 inline bool is_horizontal_blank(Codepoint c) noexcept
 {
-    return c == ' ' or c == '\t';
+    // Characters considered whitespace by ECMA Regex Spec
+    // minus vertical tab
+    // <https://262.ecma-international.org/11.0/#sec-white-space>
+    return c == '\t' or
+           c == '\f' or
+           c == ' ' or
+           c == U'\u00A0' or
+           c == U'\uFEFF' or
+           c == U'\u1680' or
+           c == U'\u2000' or
+           c == U'\u2001' or
+           c == U'\u2002' or
+           c == U'\u2003' or
+           c == U'\u2004' or
+           c == U'\u2005' or
+           c == U'\u2006' or
+           c == U'\u2007' or
+           c == U'\u2008' or
+           c == U'\u2009' or
+           c == U'\u200A' or
+           c == U'\u2028' or
+           c == U'\u2029' or
+           c == U'\u202F' or
+           c == U'\u205F' or
+           c == U'\u3000' ;
 }
 
 inline bool is_blank(Codepoint c) noexcept
 {
-    return c == ' ' or c == '\t' or c == '\n';
+    // Characters considered Line Terminators by ECMA Regex Spec
+    // plus vertical tab
+    // <https://262.ecma-international.org/11.0/#sec-line-terminators>
+    return c == '\n' or
+           c == '\r' or
+           c == '\v' or
+           c == U'\u2028' or
+           c == U'\u2029' or
+           is_horizontal_blank(c) ;
 }
 
 enum WordType { Word, WORD };
```
