C++ 中文周刊第38期

/// Builds a string made of `str` repeated `size` times.
///
/// Uses C++23 `std::string::resize_and_overwrite`, so the destination buffer
/// is filled directly by the callback.
///
/// @param str   piece to repeat
/// @param size  number of repetitions
/// @return      `size` copies of `str` concatenated
std::string resize_and_overwrite(const std::string& str, std::size_t size) {
   std::string ret;
   const std::size_t step = std::size(str);
   ret.resize_and_overwrite(step * size, [&](char* buf, std::size_t) {
       // The index is std::size_t on purpose: an `unsigned int` counter could
       // wrap around before reaching a `size` larger than UINT_MAX.
       for (std::size_t i = 0; i < size; ++i) {
         std::memcpy(buf + i * step, std::data(str), step);
       }
       // All requested bytes were written, so keep the full length.
       return step * size;
   });

   return ret;
}

int main(){
  // prints quantlabquantlabquantlabquantlab
  const auto repeated = resize_and_overwrite("quantlab", 4);
  std::cout << repeated;
}

Converting integers to fix-digit representations quickly

Daniel Lemire大神新活，如何转int到字符串最快

写了几种常规操作

最常规

/// Writes the 16 lowest decimal digits of `x` into out[0..15], most
/// significant digit first and zero-padded on the left.
/// No terminator is written; `out` must have room for 16 bytes.
void to_string_backlinear(uint64_t x, char *out) {
    // Fill the buffer from its last byte towards the first.
    char *cursor = out + 16;
    while (cursor != out) {
        --cursor;
        *cursor = (x % 10) + 0x30;  // 0x30 is the character code of '0'
        x /= 10;
    }
}

查表

/// Writes `x` as exactly 16 zero-padded decimal digits into out[0..15],
/// two digits at a time via a 100-entry lookup table.
/// The table only covers indices for values 0..99, so `x` must be below
/// 10^16. No terminator is written.
void to_string_tree_table(uint64_t x, char *out) {
  // Entry v (0..99) occupies characters [2*v, 2*v+1] and spells v with a
  // leading zero.
  static const char digit_pairs[] =
      "00010203040506070809"
      "10111213141516171819"
      "20212223242526272829"
      "30313233343536373839"
      "40414243444546474849"
      "50515253545556575859"
      "60616263646566676869"
      "70717273747576777879"
      "80818283848586878889"
      "90919293949596979899";

  // Split into two 8-digit halves, then into four 4-digit groups ordered
  // from most to least significant.
  const uint64_t upper8 = x / 100000000;
  const uint64_t lower8 = x % 100000000;
  const uint64_t quads[4] = {upper8 / 10000, upper8 % 10000,
                             lower8 / 10000, lower8 % 10000};

  // Each 4-digit group yields two table lookups (high pair, low pair).
  for (int q = 0; q < 4; ++q) {
    const uint64_t high_pair = quads[q] / 100;
    const uint64_t low_pair = quads[q] % 100;
    memcpy(out + 4 * q, &digit_pairs[2 * high_pair], 2);
    memcpy(out + 4 * q + 2, &digit_pairs[2 * low_pair], 2);
  }
}

查一个巨大巨大的表

/// Writes `x` into out[0..15] four bytes at a time, using the `bigtable`
/// lookup table pulled in from "bigtable.h".
/// NOTE(review): presumably `bigtable` stores 4 ASCII digits for each value
/// 0..9999 — confirm against the header.
void to_string_tree_bigtable(uint64_t x, char *out) {
  #include "bigtable.h"

  // Two 8-digit halves, then four 4-digit groups, most significant first.
  const uint64_t upper8 = x / 100000000;
  const uint64_t lower8 = x % 100000000;
  const uint64_t quads[4] = {upper8 / 10000, upper8 % 10000,
                             lower8 / 10000, lower8 % 10000};

  // One 4-byte table copy per group.
  for (int q = 0; q < 4; ++q) {
    memcpy(out + 4 * q, &bigtable[4 * quads[q]], 4);
  }
}

SIMD方法参考这个

#ifdef __SSE2__
// mula
#include <x86intrin.h>
// Converts a 16-digit number `v` into 16 ASCII digit bytes using SSE2.
// The value is split into two 8-digit halves with scalar arithmetic, then
// repeatedly divided (by 10^4, 100 and 10) inside one 128-bit register using
// multiply-and-shift, and finally stored to `out` with an unaligned 16-byte
// store. `out` must have room for 16 bytes; no terminator is written.
void to_string_sse2(uint64_t v, char *out) {

  // v is 16-digit number = abcdefghijklmnop
  // Multiplier/shift pairs that replace division by 10000, 100 and 10.
  const __m128i div_10000 = _mm_set1_epi32(0xd1b71759);
  const __m128i mul_10000 = _mm_set1_epi32(10000);
  const int div_10000_shift = 45;

  const __m128i div_100 = _mm_set1_epi16(0x147b);
  const __m128i mul_100 = _mm_set1_epi16(100);
  const int div_100_shift = 3;

  const __m128i div_10 = _mm_set1_epi16(0x199a);
  const __m128i mul_10 = _mm_set1_epi16(10);

  const __m128i ascii0 = _mm_set1_epi8('0');

  // can't be easily done in SSE
  const uint32_t a = v / 100000000; // 8-digit number: abcdefgh
  const uint32_t b = v % 100000000; // 8-digit number: ijklmnop

  //                [ 3 | 2 | 1 | 0 | 3 | 2 | 1 | 0 | 3 | 2 | 1 | 0 | 3 | 2 | 1
  //                | 0 ]
  // x            = [       0       |      ijklmnop |       0       | abcdefgh ]
  __m128i x = _mm_set_epi64x(b, a);

  // x div 10^4   = [       0       |          ijkl |       0       | abcd ]
  __m128i x_div_10000;
  x_div_10000 = _mm_mul_epu32(x, div_10000);
  x_div_10000 = _mm_srli_epi64(x_div_10000, div_10000_shift);

  // x mod 10^4   = [       0       |          mnop |       0       | efgh ]
  // (remainder computed as x - (x div 10^4) * 10^4)
  __m128i x_mod_10000;
  x_mod_10000 = _mm_mul_epu32(x_div_10000, mul_10000);
  x_mod_10000 = _mm_sub_epi32(x, x_mod_10000);

  // y            = [          mnop |          ijkl |          efgh | abcd ]
  __m128i y = _mm_or_si128(x_div_10000, _mm_slli_epi64(x_mod_10000, 32));

  // y_div_100    = [   0   |    mn |   0   |    ij |   0   |    ef |   0   | ab
  // ]
  __m128i y_div_100;
  y_div_100 = _mm_mulhi_epu16(y, div_100);
  y_div_100 = _mm_srli_epi16(y_div_100, div_100_shift);

  // y_mod_100    = [   0   |    op |   0   |    kl |   0   |    gh |   0   | cd
  // ]
  __m128i y_mod_100;
  y_mod_100 = _mm_mullo_epi16(y_div_100, mul_100);
  y_mod_100 = _mm_sub_epi16(y, y_mod_100);

  // z            = [    mn |    op |    ij |    kl |    ef |    gh |    ab | cd
  // ]
  __m128i z = _mm_or_si128(y_div_100, _mm_slli_epi32(y_mod_100, 16));

  // z_div_10     = [ 0 | m | 0 | o | 0 | i | 0 | k | 0 | e | 0 | g | 0 | a | 0
  // | c ]
  __m128i z_div_10 = _mm_mulhi_epu16(z, div_10);

  // z_mod_10     = [ 0 | n | 0 | p | 0 | j | 0 | l | 0 | f | 0 | h | 0 | b | 0
  // | d ]
  __m128i z_mod_10;
  z_mod_10 = _mm_mullo_epi16(z_div_10, mul_10);
  z_mod_10 = _mm_sub_epi16(z, z_mod_10);

  // interleave z_mod_10 and z_div_10 -
  // tmp          = [ m | n | o | p | i | j | k | l | e | f | g | h | a | b | c
  // | d ]
  __m128i tmp = _mm_or_si128(z_div_10, _mm_slli_epi16(z_mod_10, 8));

  // convert to ascii
  tmp = _mm_add_epi8(tmp, ascii0);

  // and save result
  _mm_storeu_si128((__m128i *)out, tmp);
}
#endif

如果你用不了SIMD，有个Packed SIMD方案，又叫做SIMD within a register (SWAR)，原理看这个帖子

// credit: Paul Khuong
//
// SWAR helper: takes two values below 10000 (`hi`, `lo`) and returns their
// eight decimal digits as one digit value (0..9) per byte. The least
// significant byte holds the leading digit of `hi`, the most significant
// byte the last digit of `lo`.
uint64_t encode_ten_thousands(uint64_t hi, uint64_t lo) {
  // One 7-bit field per 32-bit half; one 4-bit field per 16-bit lane.
  const uint64_t pair_mask = (0x7FULL << 32) | 0x7FULL;
  const uint64_t digit_mask =
      (0xFULL << 48) | (0xFULL << 32) | (0xFULL << 16) | 0xFULL;

  // Both inputs side by side, each in its own 32-bit half.
  const uint64_t packed = (lo << 32) | hi;

  // Truncating division by 100 in both halves: 10486 / 2**20 ~= 1/100.
  const uint64_t leading_pairs = ((packed * 10486ULL) >> 20) & pair_mask;
  // The trailing two digits of each 1e4 chunk.
  const uint64_t trailing_pairs = packed - leading_pairs * 100ULL;

  // Four radix-100 digits in little-endian order, one per 16-bit lane.
  const uint64_t radix100 = leading_pairs + (trailing_pairs << 16);

  // Divide and mod by 10 in all four lanes at once: 103 / 2**10 ~= 1/10.
  const uint64_t tens_digits = ((radix100 * 103ULL) >> 10) & digit_mask;
  const uint64_t ones_digits = radix100 - tens_digits * 10ULL;

  // Tens digit stays in the low byte of its lane, ones digit goes above it.
  return tens_digits + (ones_digits << 8);
}

/// Writes `x` as 16 digit characters into out[0..15] using the SWAR helper
/// encode_ten_thousands(): each 8-digit half becomes one 64-bit word of
/// digit bytes that is copied out with memcpy.
/// NOTE(review): the words are copied in host byte order, so the digits
/// presumably only come out in reading order on little-endian machines.
void to_string_khuong(uint64_t x, char *out) {
  // Adding 0x30 to every byte turns digit values 0..9 into '0'..'9'.
  const uint64_t ascii_zeros = 0x3030303030303030;
  const uint64_t halves[2] = {x / 100000000, x % 100000000};

  for (int h = 0; h < 2; ++h) {
    const uint64_t word =
        ascii_zeros + encode_ten_thousands(halves[h] / 10000, halves[h] % 10000);
    memcpy(out + 8 * h, &word, sizeof(word));
  }
}

简单bench，查表无敌，压测代码在这里

内核中用哪个实现？

/* Based on code by Douglas W. Jones found at
 * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour>
 * (with permission from the author).
 * Performs no 64-bit division and hence should be fast on 32-bit machines.
 *
 * Writes the decimal digits of n into buf and returns the position just past
 * the last character written. Values below 10^8 are handed straight to
 * put_dec_trunc8(); larger values are split into four 16-bit limbs d3..d0 and
 * emitted in groups of four digits.
 * NOTE(review): put_dec_helper4() and put_dec_trunc8() are not visible here;
 * presumably helper4 writes four digits and returns the carry into the next
 * group, and digits are stored least-significant first -- confirm.
 */
static
char *put_dec(char *buf, unsigned long long n)
{
	uint32_t d3, d2, d1, q, h;

	/* Small values need none of the limb arithmetic below. */
	if (n < 100*1000*1000)
		return put_dec_trunc8(buf, n);

	/* Split n into 16-bit limbs; d0 is taken inline further down. */
	d1  = ((uint32_t)n >> 16); /* implicit "& 0xffff" */
	h   = (n >> 32);
	d2  = (h      ) & 0xffff;
	d3  = (h >> 16); /* implicit "& 0xffff" */

	/* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0
	     = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */
	/* Each step combines the matching 4-digit slice of those constants. */
	q   = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
	q = put_dec_helper4(buf, q);

	q += 7671 * d3 + 9496 * d2 + 6 * d1;
	q = put_dec_helper4(buf+4, q);

	q += 4749 * d3 + 42 * d2;
	q = put_dec_helper4(buf+8, q);

	q += 281 * d3;
	buf += 12;
	/* Emit what is left, or step back over '0' characters already written. */
	if (q)
		buf = put_dec_trunc8(buf, q);
	else while (buf[-1] == '0')
		--buf;

	return buf;
}

另外多说一句，从字符串转成数字，最快的是std::from_chars, gcc8之后可用

simd和swar的实现说实话得研究研究才看得懂。暂时标记个TODO后面看吧

Virtual Inheritance in C++

讨论了一下菱形继承下的内存布局

Beware of fast-math

也说过很多次了，这个-ffast-math有很多优化，可能会引入bug

In GCC, -ffast-math (or -Ofast) enables the following options: -fno-math-errno, -funsafe-math-optimizations, -ffinite-math-only, -fno-rounding-math, -fno-signaling-nans, -fcx-limited-range and -fexcess-precision=fast. Note that -funsafe-math-optimizations is itself a collection of options -fno-signed-zeros, -fno-trapping-math, -fassociative-math and -freciprocal-math, plus some extra ones, which we will discuss further below.

比如-fno-math-errno 会引起malloc不报错

比如-ffinite-math-only会导致isnan函数直接失效（编译器假定不会出现NaN/Inf，isnan(x)可能被直接优化成false）

你一定要知道你的场景里会不会有这些判断，不然没有意义

目前c++还是没有TU级别的优化限定的。

要么全都优化，要么全都不优化，不能这个.o优化那个.o不优化，所以还是要注意优化项的使用

A Close Look at a Spinlock

手把手教你看spinlock汇编

C++20: Heterogeneous Lookup in (Un)ordered Containers

首先回顾一下Heterogeneous Lookup 的概念

一般来说容器的find/contains只能查相同的类型，就会有这种坑爹的场景

// Illustrative fragment: the `...` initialisers are placeholders.
std::map<std::string, int> m = ...;
absl::string_view some_key = ...;
// Construct a temporary `std::string` to do the query.
// The allocation + copy + deallocation might dominate the find() call.
auto it = m.find(std::string(some_key));

白白多了一个复制，如何避免这种浪费呢，引入Heterogeneous Lookup，具体的做法就是定制一个transparent 比较器

// Comparator that orders its arguments as `absl::string_view`s.
// The nested `is_transparent` type is the tag that lets ordered containers
// accept lookup keys of other types without converting them to the key type.
struct StringCmp {
  using is_transparent = void;
  bool operator()(absl::string_view a, absl::string_view b) const {
    // Lexicographic "less than".
    return a.compare(b) < 0;
  }
};

// Illustrative fragment: the `...` initialisers are placeholders.
std::map<std::string, int, StringCmp> m = ...;
absl::string_view some_key = ...;
// The comparator `StringCmp` will accept any type that is implicitly
// convertible to `absl::string_view` and says so by declaring the
// `is_transparent` tag.
// We can pass `some_key` to `find()` without converting it first to
// `std::string`. In this case, that avoids the unnecessary memory allocation
// required to construct the `std::string` instance.
auto it = m.find(some_key);

再来看这篇文章的例子 @Compiler Explorer

#include <cstdlib>
#include <functional>
#include <iostream>
#include <map>
#include <new>
#include <set>
#include <string>
#include <string_view>

// Simple global new/delete replacements, so we can check if some memory was
// allocated: every call to operator new logs the requested size.
void* operator new(std::size_t sz){
    std::cout << "Allocating: " << sz << '\n';
    // malloc(0) is allowed to return nullptr, but operator new must always
    // hand back a valid pointer, so request at least one byte.
    if (void* ptr = std::malloc(sz == 0 ? 1 : sz))
        return ptr;
    // A replacement operator new reports failure by throwing; returning
    // nullptr here would make callers dereference a null pointer.
    throw std::bad_alloc{};
}

// Matches the malloc-based operator new above.
void operator delete(void* ptr) noexcept{
    std::free(ptr);
}

// Demo: compares lookups in a std::map using the default comparator with
// lookups in one using the transparent comparator std::less<>. The logging
// operator new defined in this program prints a line for every heap
// allocation, so any temporary std::string built for a lookup shows up in
// the output.
int main()
{      
    // Same contents in both maps; only the comparator differs.
    // NOTE(review): the keys look deliberately long, presumably so they do not
    // fit a small-string buffer and each std::string allocates — confirm.
    std::map<std::string, int> intMap { { "Hello Super Long String", 1 }, { "Another Longish String", 2 }, {"This cannot fall into SSO buffer", 3 }};
    std::map<std::string, int, std::less<>> trIntMap { { "Hello Super Long String", 1 }, { "Another Longish String", 2 }, {"This cannot fall into SSO buffer", 3 }};
    
    // Default comparator: the literal has to be converted to the key type.
    std::cout << "Lookup in intMap with by const char*:\n";
    std::cout << intMap.contains("Hello Super Long String") << '\n';

    /*std::cout << "Lookup in intMap with string_view:\n";
    std::string_view sv("Another Longish String");
    std::cout << intMap.contains(sv) << '\n';*/
    
    // Transparent comparator: the argument is compared as-is.
    std::cout << "Lookup in trIntMap by const char*: \n";
    std::cout << trIntMap.contains("Hello Super Long String") << '\n';
        
    std::cout << "Lookup in trIntMap by string_view: \n";
    std::string_view sv2("Another Longish String");
    std::cout << trIntMap.contains(sv2) << '\n';
}

能看到trIntMap没有多余的内存分配，std::less<>和std::less<std::string>差别这么大？

其实是特化了 transparent比较

// Simplified view of the standard library's transparent specialisation of
// `less`. (In a real implementation this lives inside namespace std, next to
// the primary template.)
template <>
struct less<void> {
    // The presence of this nested type is what enables heterogeneous lookup
    // in the ordered containers.
    using is_transparent = int;

    // simplified version...
    // Forwards both operands unchanged and compares them with operator<, so
    // no conversion to a common key type takes place here.
    template <class _Ty1, class _Ty2>
    constexpr auto operator()(_Ty1&& _Left, _Ty2&& _Right) const {
        return static_cast<_Ty1&&>(_Left) < static_cast<_Ty2&&>(_Right);
    }
};

这个带来的性能提升(省拷贝)还是很可观的，但是会引入转换问题，一定要注意

另外就是这个是c++14引入的,（void做模版参数）可能旧的库使用上会有ABI问题

c++23状态

简单说execution是板上钉钉了。一堆range相关的，network ts没影。（据说两者冲突，总之ASIO又没进）

Ray框架介绍

一个分布式计算框架

ELF Science Part 1

教你用python分析一下二进制ELF机器码之类的，有空可以玩玩

视频

C++ Weekly - Ep 298 - Detecting ABI Changes With abidiff

介绍abidiff的，挺有意思

Making Iterators, Views and Containers Easier to Write with Boost.STLInterfaces - Zach Laine CppCon 介绍 Boost.STLInterfaces的，一个接口类库

Meeting cpp 2021

Ivica Bogosavljevic Performance Optimization Techniques for Mobile Devices

介绍手机硬件用c++的一些经验，比如不用多线程之类费电的操作，这个人的博客google权重挺高的，很容易搜到，可以看看

Marc Mutz - QAnyStringView Variant String Views and Why You Care - Meeting C++ 2021 lightning talks

介绍QAnyStringView 的，qt string相关的东西太多了，QStringView不太行，又加个QAnyStringView，更牛逼一点

Deniz Bahadir - compile time checking user defined literals - Meeting C++ 2021 lightning talks

IP地址很容易和UDL结合，比如"192.168.0.1"_ipv4

那么，如何保证这个字符串是合法的ip呢，如何用UDL来校验ip合法性，作者给了段代码。编译期判定。很有趣, godbolt体验

// With friendly permission of BENOCS GmbH (www.benocs.com)

#include <cstdio>  // Needed for the print-function only!
#include <cstdint>
#include <variant>
#include <system_error>



/// Simplified IPv4 address type: the whole address packed into one 32-bit
/// value, zero by default.
struct IPv4Address
{
    std::uint32_t addr{0};
};


/// Parses and extracts a decimal number from the given string
///
/// Reads consecutive digits from `str` starting at `start_index` and stops at
/// the first non-digit, at `length`, or as soon as the value exceeds `max`.
///
/// @param str          characters to parse (need not be null-terminated)
/// @param length       number of valid characters in `str`
/// @param start_index  in: position to start at; out: position just past the
///                     parsed digits (only updated on success)
/// @param max          largest value that is accepted
/// @return the parsed number, or
///         `std::errc::result_out_of_range` if no digit was found, or
///         `std::errc::value_too_large` if the number exceeds `max` or does
///         not fit into 64 bits
constexpr auto extract_dec_number(const char* str, unsigned long length,
                                  unsigned long& start_index,
                                  std::uint64_t max)
        noexcept
        -> std::variant<std::uint64_t, std::errc>
{
    constexpr auto is_digit  = [](char c) { return '0' <= c && c <= '9'; };
    constexpr auto get_digit = [](char c) -> std::uint8_t { return c - '0'; };

    // Parse number.
    std::uint64_t num = 0;  // Will hold the parsed number.
    bool overflow = false;  // Set when the next digit would not fit.
    auto index = start_index;
    for (; index < length && num <= max; ++index)
    {
        const char c = str[index];
        if (! is_digit(c)) break;
        const std::uint64_t digit = get_digit(c);
        // Check *before* multiplying: comparing the wrapped result with the
        // previous value misses some overflows (e.g. 2.1e18 * 10 wraps to a
        // value that is larger than 2.1e18).
        if (num > (UINT64_MAX - digit) / 10)
        {
            overflow = true;
            break;
        }
        num = num * 10 + digit;
    }

    // No digit parsed at all?
    if (start_index == index)
        return std::errc::result_out_of_range;
    // Parsed number is too large?
    if (num > max)
        return std::errc::value_too_large;
    // Overflow occurred?
    if (overflow)
        return std::errc::value_too_large;

    // Return new index and extracted number.
    start_index = index;
    return num;
}


/// Parses and extracts an IPv4 address from the given string
///
/// Expects four dot-separated decimal groups, each at most 255 and at most
/// three characters long, starting at `start_index`. The groups are packed
/// into the result with the first group in the most significant byte.
/// On success `start_index` is advanced past the address; every failure is
/// reported by throwing a `const char*` message.
#if __cpp_consteval >= 201811
consteval
#else
constexpr
#endif
auto extract_ipv4_address(const char* str, unsigned long length,
                          unsigned long& start_index)
        -> std::uint32_t
{
    // Nothing to parse at all.
    if (length == start_index)
        throw "Invalid IPv4 address. [Reason: Empty string-literal]";
    // The address has to open with a digit.
    const char first = str[start_index];
    if (first < '0' || '9' < first)
        throw "Invalid IPv4 address. [Reason: Illegal token]";

    std::uint32_t packed = 0;        // Groups collected so far.
    unsigned long pos = start_index; // Current read position.
    unsigned long groups_read = 0;
    while (groups_read < 4 && pos < length)
    {
        // Every group except the first must be preceded by a '.'.
        if (groups_read != 0)
        {
            const char separator = str[pos];
            ++pos;
            if (separator != '.')
                throw "Invalid IPv4 address. [Reason: Illegal token]";
        }

        // Parse the next group.
        const unsigned long group_begin = pos;
        const auto parsed = extract_dec_number(str, length, pos, 255u);

        // Translate parse errors into messages.
        if (const std::errc* failure = std::get_if<std::errc>(&parsed))
        {
            if (*failure == std::errc::value_too_large)
                throw "Invalid IPv4 address. [Reason: Invalid number in group]";
            if (*failure == std::errc::result_out_of_range)
                throw "Invalid IPv4 address. [Reason: Missing number in group]";
            throw "Invalid IPv4 address.";
        }
        // At most 3 characters per group.
        if (pos - group_begin > 3)
            throw "Invalid IPv4 address. [Reason: Too many leading zeros in group]";

        // Shift earlier groups up and append this one.
        packed = (packed << 8)
               | static_cast<std::uint32_t>(std::get<std::uint64_t>(parsed));
        ++groups_read;
    }

    // The input ended before all 4 groups were read.
    if (groups_read != 4)
        throw "Invalid IPv4 address. [Reason: Too few groups]";

    // Hand back the new position together with the address.
    start_index = pos;
    return packed;
}


/// A literal operator for user-defined literal representing an IPv4 address type
///
/// Parses the whole literal with extract_ipv4_address() and throws a
/// `const char*` message if parsing fails or if characters are left over
/// after the address.
#if __cpp_consteval >= 201811
consteval
#else
constexpr
#endif
IPv4Address operator "" _ipv4(const char* str, std::size_t length)
{
    unsigned long consumed = 0;
    const std::uint32_t parsed = extract_ipv4_address(str, length, consumed);
    // The address must span the entire literal.
    if (consumed != length)
        throw "Invalid IPv4 address. [Reason: Additional tokens at end]";
    return IPv4Address{ parsed };
}



/// Prints the address in dotted-decimal form followed by a newline, taking
/// the most significant byte of `ip.addr` as the first group.
void print(IPv4Address ip)
{
    // Each argument is an unsigned 32-bit expression, so the matching
    // conversion is %u (%d expects a signed int).
    std::printf( "%u.%u.%u.%u\n", ((ip.addr >> 24) & 0xff)
                                , ((ip.addr >> 16) & 0xff)
                                , ((ip.addr >> 8)  & 0xff)
                                , ((ip.addr >> 0)  & 0xff) );
}

// Demo driver: prints one valid address, then feeds a series of malformed
// literals to the _ipv4 operator and prints the message each one throws.
// NOTE(review): when the operator is declared consteval (see the
// __cpp_consteval check on its definition), the malformed literals below
// presumably fail to compile instead of throwing at run time — confirm.
int main()
{
    // Well-formed address.
    print( "192.168.0.1"_ipv4 );

    // Last group is above 255.
    try {
        print( "192.168.0.256"_ipv4 );
    }
    catch(const char* error_msg) { std::printf("%s\n", error_msg); }

    // Trailing dot without a fourth group.
    try {
        print( "192.168.0."_ipv4 );
    }
    catch(const char* error_msg) { std::printf("%s\n", error_msg); }

    // Only three groups.
    try {
        print( "192.168.0"_ipv4 );
    }
    catch(const char* error_msg) { std::printf("%s\n", error_msg); }

    // Extra characters after the fourth group.
    try {
        print( "192.168.0.1.123"_ipv4 );
    }
    catch(const char* error_msg) { std::printf("%s\n", error_msg); }

    // Empty literal.
    try {
        print( ""_ipv4 );
    }
    catch(const char* error_msg) { std::printf("%s\n", error_msg); }

    // Group with more than three characters.
    try {
        print( "192.0168.0.1"_ipv4 );
    }
    catch(const char* error_msg) { std::printf("%s\n", error_msg); }



    // Important:
    // In order to guarantee compile-time evaluation of constexpr, it
    // must be called from a compile-time context, e.g. a static_assert!

    // Note:
    //         192  168   0   1
    //  <==>  0xc0 0xa8 0x0 0x1

/*
    static_assert(("192.168.0.1"_ipv4).addr == 0xc0a80001);
    static_assert(("192.168.0.256"_ipv4).addr == 0xc0a80001);
    static_assert(("192.168.0."_ipv4).addr == 0xc0a80001);
    static_assert(("192.168.0"_ipv4).addr == 0xc0a80001);
    static_assert(("192.168.0.1.123"_ipv4).addr == 0xc0a80001);
    static_assert((""_ipv4).addr == 0x0);
    static_assert(("192.0168.0.1"_ipv4).addr == 0xc0a80001);
*/

    return 0;
}