From 2754e27cf2a85fd4004dd7daabe052b7df1b425a Mon Sep 17 00:00:00 2001 From: Ben Judd Date: Fri, 7 Jun 2024 17:33:32 -0700 Subject: Increase SSO from 22 to 23 chars. --- src/string.cc | 61 ++++++++++++++++++++++++++++++++++++++++++++--------------- src/string.hh | 51 ++++++++++++++++++++++++++++++++++--------------- 2 files changed, 82 insertions(+), 30 deletions(-) (limited to 'src') diff --git a/src/string.cc b/src/string.cc index fe763165..a6befa13 100644 --- a/src/string.cc +++ b/src/string.cc @@ -2,6 +2,8 @@ #include #include +#include "assert.hh" +#include "unit_tests.hh" namespace Kakoune { @@ -10,13 +12,10 @@ String::Data::Data(const char* data, size_t size, size_t capacity) { if (capacity > Short::capacity) { - if (capacity & 1) - ++capacity; - - kak_assert(capacity < Long::max_capacity); + kak_assert(capacity <= Long::max_capacity); u.l.ptr = Alloc{}.allocate(capacity+1); u.l.size = size; - u.l.capacity = capacity; + u.l.set_capacity(capacity); if (data != nullptr) memcpy(u.l.ptr, data, size); @@ -60,26 +59,27 @@ String::Data& String::Data::operator=(Data&& other) noexcept template void String::Data::reserve(size_t new_capacity) { - if (capacity() != 0 and new_capacity <= capacity()) + auto const current_capacity = capacity(); + if (current_capacity != 0 and new_capacity <= current_capacity) return; - if (is_long()) - new_capacity = std::max(u.l.capacity * 2, new_capacity); + if (!is_long() and new_capacity <= Short::capacity) + return; - if (new_capacity & 1) - ++new_capacity; + kak_assert(new_capacity <= Long::max_capacity); + new_capacity = std::max(new_capacity, // Do not upgrade new_capacity to be over limit. + std::min(current_capacity * 2, Long::max_capacity)); - kak_assert(new_capacity < Long::max_capacity); char* new_ptr = Alloc{}.allocate(new_capacity+1); if (copy) { memcpy(new_ptr, data(), size()+1); - u.l.size = size(); } release(); + u.l.size = size(); u.l.ptr = new_ptr; - u.l.capacity = new_capacity; + u.l.set_capacity(new_capacity); } template void String::Data::reserve(size_t); @@ -89,6 +89,7 @@ void String::Data::force_size(size_t new_size) { reserve(new_size); set_size(new_size); + data()[new_size] = 0; } void String::Data::append(const char* str, size_t len) @@ -131,17 +132,47 @@ void String::Data::set_size(size_t size) if (is_long()) u.l.size = size; else - u.s.size = (size << 1) | 1; + u.s.remaining_size = Short::capacity - size; } void String::Data::set_short(const char* data, size_t size) { - u.s.size = (size << 1) | 1; + kak_assert(size <= Short::capacity); + u.s.remaining_size = Short::capacity - size; if (data != nullptr) memcpy(u.s.string, data, size); u.s.string[size] = 0; } +UnitTest test_data{[]{ + using Data = String::Data; + { // Basic data usage. + Data data; + kak_assert(data.size() == 0); + kak_assert(not data.is_long()); + kak_assert(data.capacity() == 23); + + // Should be SSO-ed. + data.append("test", 4); + kak_assert(data.size() == 4); + kak_assert(data.capacity() == 23); + kak_assert(not data.is_long()); + kak_assert(data.data() == StringView("test")); + } + { + char large_buf[2048]; + memset(large_buf, 'x', 2048); + Data data(large_buf, 2048); + kak_assert(data.size() == 2048); + kak_assert(data.capacity() >= 2048); + + data.clear(); + kak_assert(data.size() == 0); + kak_assert(not data.is_long()); + kak_assert(data.capacity() == 23); + } +}}; + const String String::ms_empty; } diff --git a/src/string.hh b/src/string.hh index 004be04c..3bbb907e 100644 --- a/src/string.hh +++ b/src/string.hh @@ -1,6 +1,7 @@ #ifndef string_hh_INCLUDED #define string_hh_INCLUDED +#include #include "memory.hh" #include "hash.hh" #include "units.hh" @@ -156,17 +157,17 @@ public: // String data storage using small string optimization. // - // the LSB of the last byte is used to flag if we are using the small buffer - // or an allocated one. On big endian systems that means the allocated + // the MSB of the last byte is used to flag if we are using the allocated buffer + // (1) or a small one (0). On big endian systems that means the allocated // capacity must be pair, on little endian systems that means the allocated // capacity cannot use its most significant byte, so we effectively limit - // capacity to 2^24 on 32bit arch, and 2^60 on 64. + // capacity to 2^24 on 32bit arch, and 2^56 on 64. struct Data { using Alloc = Allocator; Data() { set_empty(); } - Data(NoCopy, const char* data, size_t size) : u{Long{const_cast(data), size, 0}} {} + Data(NoCopy, const char* data, size_t size) : u{Long{const_cast(data), size, {0}, Short::inactive_mask}} {} Data(const char* data, size_t size, size_t capacity); Data(const char* data, size_t size) : Data(data, size, size) {} @@ -177,9 +178,9 @@ public: Data& operator=(const Data& other); Data& operator=(Data&& other) noexcept; - bool is_long() const { return (u.s.size & 1) == 0; } - size_t size() const { return is_long() ? u.l.size : (u.s.size >> 1); } - size_t capacity() const { return is_long() ? u.l.capacity : Short::capacity; } + bool is_long() const { return (u.s.remaining_size & Short::inactive_mask) > 0; } + size_t size() const { return is_long() ? u.l.size : (Short::capacity - u.s.remaining_size); } + size_t capacity() const { return is_long() ? u.l.capacity() : Short::capacity; } const char* data() const { return is_long() ? u.l.ptr : u.s.string; } char* data() { return is_long() ? u.l.ptr : u.s.string; } @@ -195,18 +196,37 @@ public: struct Long { static constexpr size_t max_capacity = - (size_t)1 << 8 * (sizeof(size_t) - 1); + ((size_t)1 << (CHAR_BIT * (sizeof(size_t) - 1))) - 1; char* ptr; size_t size; - size_t capacity; + unsigned char m_capacity[sizeof(size_t) - 1]; + unsigned char mode; + size_t capacity() const + { + size_t ret{}; + auto* dest = ((unsigned char*)&ret) + + (std::endian::native == std::endian::big ? 1: 0); + memcpy(dest, m_capacity, sizeof m_capacity); + return ret; + } + void set_capacity(size_t cap) + { + auto* src = ((unsigned char*)&cap) + + (std::endian::native == std::endian::big ? 1: 0); + memcpy(m_capacity, src, sizeof m_capacity); + mode = Short::inactive_mask; + } }; struct Short { - static constexpr size_t capacity = sizeof(Long) - 2; - char string[capacity+1]; - unsigned char size; + static constexpr size_t capacity = sizeof(Long) - 1; + char string[capacity]; + // When string is full remaining_size will be 0 and be the null terminator. + // When string is empty remaining size will be 0b00010111 and not collide with inactive_mask. + unsigned char remaining_size; + static constexpr unsigned char inactive_mask = 0b1000'0000; }; union @@ -217,11 +237,12 @@ public: void release() { - if (is_long() and u.l.capacity != 0) - Alloc{}.deallocate(u.l.ptr, u.l.capacity+1); + auto const cap = u.l.capacity(); + if (is_long() and (cap != 0)) + Alloc{}.deallocate(u.l.ptr, cap+1); } - void set_empty() { u.s.size = 1; u.s.string[0] = 0; } + void set_empty() { u.s.remaining_size = Short::capacity; u.s.string[0] = '\0'; } void set_short(const char* data, size_t size); }; -- cgit v1.2.3 From 9c185249a2943bb81fcdff47c0d9423d0cc8caa6 Mon Sep 17 00:00:00 2001 From: Ben Judd Date: Fri, 7 Jun 2024 17:53:34 -0700 Subject: Fix build by moving include. --- src/string.cc | 1 - src/string.hh | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/string.cc b/src/string.cc index a6befa13..7be39715 100644 --- a/src/string.cc +++ b/src/string.cc @@ -1,7 +1,6 @@ #include "string.hh" #include -#include #include "assert.hh" #include "unit_tests.hh" diff --git a/src/string.hh b/src/string.hh index 3bbb907e..b3102272 100644 --- a/src/string.hh +++ b/src/string.hh @@ -2,6 +2,7 @@ #define string_hh_INCLUDED #include +#include #include "memory.hh" #include "hash.hh" #include "units.hh" -- cgit v1.2.3 From faf83b10e25fe237c9e780d09386420c98110a5e Mon Sep 17 00:00:00 2001 From: Ben Judd Date: Tue, 11 Jun 2024 08:33:35 -0700 Subject: Switch to bitfield. --- src/string.cc | 18 +++++++++++++----- src/string.hh | 53 +++++++++++++++++++++-------------------------------- 2 files changed, 34 insertions(+), 37 deletions(-) (limited to 'src') diff --git a/src/string.cc b/src/string.cc index 7be39715..a672433b 100644 --- a/src/string.cc +++ b/src/string.cc @@ -1,11 +1,17 @@ #include "string.hh" -#include +#include #include "assert.hh" #include "unit_tests.hh" namespace Kakoune { +namespace +{ +// Avoid including all of just for this. +constexpr auto max(auto lhs, auto rhs) { return lhs > rhs ? lhs : rhs;} +constexpr auto min(auto lhs, auto rhs) { return lhs < rhs ? lhs : rhs;} +} String::Data::Data(const char* data, size_t size, size_t capacity) { @@ -14,7 +20,8 @@ String::Data::Data(const char* data, size_t size, size_t capacity) kak_assert(capacity <= Long::max_capacity); u.l.ptr = Alloc{}.allocate(capacity+1); u.l.size = size; - u.l.set_capacity(capacity); + u.l.capacity = (capacity & Long::max_capacity); + u.l.mode = Long::active_mask; if (data != nullptr) memcpy(u.l.ptr, data, size); @@ -66,8 +73,8 @@ void String::Data::reserve(size_t new_capacity) return; kak_assert(new_capacity <= Long::max_capacity); - new_capacity = std::max(new_capacity, // Do not upgrade new_capacity to be over limit. - std::min(current_capacity * 2, Long::max_capacity)); + new_capacity = max(new_capacity, // Do not upgrade new_capacity to be over limit. + min(current_capacity * 2, Long::max_capacity)); char* new_ptr = Alloc{}.allocate(new_capacity+1); if (copy) @@ -78,7 +85,8 @@ void String::Data::reserve(size_t new_capacity) u.l.size = size(); u.l.ptr = new_ptr; - u.l.set_capacity(new_capacity); + u.l.capacity = (new_capacity & Long::max_capacity); + u.l.mode = Long::active_mask; } template void String::Data::reserve(size_t); diff --git a/src/string.hh b/src/string.hh index b3102272..857a1b19 100644 --- a/src/string.hh +++ b/src/string.hh @@ -1,14 +1,13 @@ #ifndef string_hh_INCLUDED #define string_hh_INCLUDED -#include -#include +#include +#include #include "memory.hh" #include "hash.hh" #include "units.hh" #include "utf8.hh" -#include namespace Kakoune { @@ -158,17 +157,22 @@ public: // String data storage using small string optimization. // - // the MSB of the last byte is used to flag if we are using the allocated buffer - // (1) or a small one (0). On big endian systems that means the allocated - // capacity must be pair, on little endian systems that means the allocated - // capacity cannot use its most significant byte, so we effectively limit - // capacity to 2^24 on 32bit arch, and 2^56 on 64. + // The MSB of the last byte is used to flag if we are using the allocated buffer + // (1) or in-situ storage, the small one (0). That means the allocated capacity + // cannot use its most significant byte, so we effectively limit capacity to + // 2^24 on 32bit arch, and 2^56 on 64bit. + // + // There is also a special NoCopy mode in which the data referred to is un-owned. + // It is indicated by being in Long mode with capacity == 0. struct Data { using Alloc = Allocator; Data() { set_empty(); } - Data(NoCopy, const char* data, size_t size) : u{Long{const_cast(data), size, {0}, Short::inactive_mask}} {} + Data(NoCopy, const char* data, size_t size) : u{Long{const_cast(data), + size, + /*capacity=*/0, + /*mode=*/Long::active_mask}} {} Data(const char* data, size_t size, size_t capacity); Data(const char* data, size_t size) : Data(data, size, size) {} @@ -179,9 +183,9 @@ public: Data& operator=(const Data& other); Data& operator=(Data&& other) noexcept; - bool is_long() const { return (u.s.remaining_size & Short::inactive_mask) > 0; } + bool is_long() const { return (u.l.mode& Long::active_mask) > 0; } size_t size() const { return is_long() ? u.l.size : (Short::capacity - u.s.remaining_size); } - size_t capacity() const { return is_long() ? u.l.capacity() : Short::capacity; } + size_t capacity() const { return is_long() ? u.l.capacity : Short::capacity; } const char* data() const { return is_long() ? u.l.ptr : u.s.string; } char* data() { return is_long() ? u.l.ptr : u.s.string; } @@ -201,23 +205,9 @@ public: char* ptr; size_t size; - unsigned char m_capacity[sizeof(size_t) - 1]; + size_t capacity: (sizeof(size_t) - 1) *CHAR_BIT; unsigned char mode; - size_t capacity() const - { - size_t ret{}; - auto* dest = ((unsigned char*)&ret) + - (std::endian::native == std::endian::big ? 1: 0); - memcpy(dest, m_capacity, sizeof m_capacity); - return ret; - } - void set_capacity(size_t cap) - { - auto* src = ((unsigned char*)&cap) + - (std::endian::native == std::endian::big ? 1: 0); - memcpy(m_capacity, src, sizeof m_capacity); - mode = Short::inactive_mask; - } + static constexpr unsigned char active_mask = 0b1000'0000; }; struct Short @@ -225,9 +215,9 @@ public: static constexpr size_t capacity = sizeof(Long) - 1; char string[capacity]; // When string is full remaining_size will be 0 and be the null terminator. - // When string is empty remaining size will be 0b00010111 and not collide with inactive_mask. + // When string is empty remaining size will be 23 (0b00010111) + // and not collide with Long::active_mask. unsigned char remaining_size; - static constexpr unsigned char inactive_mask = 0b1000'0000; }; union @@ -238,9 +228,8 @@ public: void release() { - auto const cap = u.l.capacity(); - if (is_long() and (cap != 0)) - Alloc{}.deallocate(u.l.ptr, cap+1); + if (is_long() and (u.l.capacity != 0)) + Alloc{}.deallocate(u.l.ptr, u.l.capacity+1); } void set_empty() { u.s.remaining_size = Short::capacity; u.s.string[0] = '\0'; } -- cgit v1.2.3