#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>

typedef __int128 int128_t;

#define cat_(a, b)	a##b
#define cat(a, b)	cat_(a, b)
#define cat3(a, b, c)	cat(a, cat(b, c))

#define likely(x)	(__builtin_expect((int)(x), 1))
#define unlikely(x)	(__builtin_expect((int)(x), 0))

#define ASM_PREVENT_CSE	__asm__ volatile ("":::"memory")

__attribute__((noreturn)) void escape(void)
{
	fprintf(stderr, "escape called\n");
	abort();
}

/*
 * This is a simplified bytecode interpreter that shows a clang-19 and clang-20
 * performance degradation.
 *
 * "fp" is a frame pointer. There are byte tags at the beginning of the frame
 * and 64-bit values after the tags.
 * "ip" is a pointer to the byte code (or, more precisely, uint16_t code,
 * because the codes are 16-bit).
 *
 * There are 10 operations (+ - * & | ^ == != < <=) and 5 types (int8_t,
 * int16_t, int32_t, int64_t, int128_t), for a total of 50 instructions. 0xff
 * is a special instruction that exits the interpreter.
 *
 * The macro "insn" is a template for a 3-address instruction. We load the
 * indices of the arguments into "arg1" and "arg2". Then we check the tags; if
 * either tag is non-zero, we escape from the interpreter. Then we load the
 * index of the result variable into "ret".
 *
 * Next, we perform the arithmetic operation. If there was signed overflow, we
 * escape from the interpreter. Then we store the result, advance "ip" by 8
 * bytes, and load the next instruction code into the "next_code" variable. We
 * set "next_label" to point to the machine code for the next instruction.
 * Finally, we jump to that machine code with "goto *next_label".
 *
 * The problem with clang-19 and clang-20 is that they merge all the "goto
 * *next_label" statements into just one machine-code instruction. This reduces
 * the size of the "run" function, but the unfortunate consequence is that the
 * indirect branch predictor cannot be trained to follow the instruction flow,
 * because it can store only one target per instruction. We always get a
 * misprediction.
 *
 * With clang-18 and older, there are multiple "jmp *(%r12,%rax,8)"
 * instructions in the machine code, and each of them can be trained
 * independently to jump to the next instruction.
 *
 * It seems that AMD Zen 4 has smarter prediction, so it can predict even the
 * case where one instruction jumps to multiple targets.
 */
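/*
 * Illustration only, not part of the original reproducer: each instruction
 * occupies four consecutive uint16_t words: { opcode, arg1 slot, arg2 slot,
 * result slot }. The hypothetical emit_insn() helper below merely sketches
 * that encoding; main() at the bottom of this file builds the same layout
 * inline.
 */
static inline uint16_t *emit_insn(uint16_t *p, uint16_t opcode,
				  uint16_t arg1, uint16_t arg2, uint16_t ret)
{
	p[0] = opcode;	/* e.g. 20 selects lbl_add_int32_t in the dispatch table */
	p[1] = arg1;	/* frame slot of the first operand */
	p[2] = arg2;	/* frame slot of the second operand */
	p[3] = ret;	/* frame slot that receives the result */
	return p + 4;	/* the next instruction starts 8 bytes later */
}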
#define insn(lbl, typ, op) \
cat3(lbl,_,typ): \
	arg1 = ip[1]; \
	arg2 = ip[2]; \
	if (unlikely(((char *)fp)[arg1] | ((char *)fp)[arg2])) { \
		ASM_PREVENT_CSE; \
		escape(); \
	} \
	ret = ip[3]; \
	if (unlikely(op(*(typ *)&fp[arg1], *(typ *)&fp[arg2], &cat(result_,typ)))) { \
		ASM_PREVENT_CSE; \
		escape(); \
	} \
	*(typ *)&fp[ret] = cat(result_,typ); \
	ip += 4; \
	next_code = ip[0]; \
	next_label = dispatch[next_code]; \
	goto *next_label;

#define op_and(a, b, c)	((*(c)) = (a) & (b), false)
#define op_or(a, b, c)	((*(c)) = (a) | (b), false)
#define op_xor(a, b, c)	((*(c)) = (a) ^ (b), false)
#define op_eq(a, b, c)	((*(c)) = (a) == (b), false)
#define op_neq(a, b, c)	((*(c)) = (a) != (b), false)
#define op_lt(a, b, c)	((*(c)) = (a) < (b), false)
#define op_le(a, b, c)	((*(c)) = (a) <= (b), false)

void run(int64_t *fp, const uint16_t *ip)
{
	unsigned arg1, arg2, ret;
	uint16_t next_code;
	int8_t result_int8_t;
	int16_t result_int16_t;
	int32_t result_int32_t;
	int64_t result_int64_t;
	int128_t result_int128_t;
	const void *next_label;
	static const void *dispatch[256] = {
		[0] = &&lbl_add_int8_t,
		[1] = &&lbl_sub_int8_t,
		[2] = &&lbl_mul_int8_t,
		[3] = &&lbl_and_int8_t,
		[4] = &&lbl_or_int8_t,
		[5] = &&lbl_xor_int8_t,
		[6] = &&lbl_eq_int8_t,
		[7] = &&lbl_neq_int8_t,
		[8] = &&lbl_lt_int8_t,
		[9] = &&lbl_le_int8_t,

		[10] = &&lbl_add_int16_t,
		[11] = &&lbl_sub_int16_t,
		[12] = &&lbl_mul_int16_t,
		[13] = &&lbl_and_int16_t,
		[14] = &&lbl_or_int16_t,
		[15] = &&lbl_xor_int16_t,
		[16] = &&lbl_eq_int16_t,
		[17] = &&lbl_neq_int16_t,
		[18] = &&lbl_lt_int16_t,
		[19] = &&lbl_le_int16_t,

		[20] = &&lbl_add_int32_t,
		[21] = &&lbl_sub_int32_t,
		[22] = &&lbl_mul_int32_t,
		[23] = &&lbl_and_int32_t,
		[24] = &&lbl_or_int32_t,
		[25] = &&lbl_xor_int32_t,
		[26] = &&lbl_eq_int32_t,
		[27] = &&lbl_neq_int32_t,
		[28] = &&lbl_lt_int32_t,
		[29] = &&lbl_le_int32_t,

		[30] = &&lbl_add_int64_t,
		[31] = &&lbl_sub_int64_t,
		[32] = &&lbl_mul_int64_t,
		[33] = &&lbl_and_int64_t,
		[34] = &&lbl_or_int64_t,
		[35] = &&lbl_xor_int64_t,
		[36] = &&lbl_eq_int64_t,
		[37] = &&lbl_neq_int64_t,
		[38] = &&lbl_lt_int64_t,
		[39] = &&lbl_le_int64_t,

		[40] = &&lbl_add_int128_t,
		[41] = &&lbl_sub_int128_t,
		[42] = &&lbl_mul_int128_t,
		[43] = &&lbl_and_int128_t,
		[44] = &&lbl_or_int128_t,
		[45] = &&lbl_xor_int128_t,
		[46] = &&lbl_eq_int128_t,
		[47] = &&lbl_neq_int128_t,
		[48] = &&lbl_lt_int128_t,
		[49] = &&lbl_le_int128_t,

		[255] = &&lbl_ex,
	};

	next_code = ip[0];
	next_label = dispatch[next_code];
	goto *next_label;

	insn(lbl_add, int8_t, __builtin_add_overflow);
	insn(lbl_sub, int8_t, __builtin_sub_overflow);
	insn(lbl_mul, int8_t, __builtin_mul_overflow);
	insn(lbl_and, int8_t, op_and);
	insn(lbl_or, int8_t, op_or);
	insn(lbl_xor, int8_t, op_xor);
	insn(lbl_eq, int8_t, op_eq);
	insn(lbl_neq, int8_t, op_neq);
	insn(lbl_lt, int8_t, op_lt);
	insn(lbl_le, int8_t, op_le);

	insn(lbl_add, int16_t, __builtin_add_overflow);
	insn(lbl_sub, int16_t, __builtin_sub_overflow);
	insn(lbl_mul, int16_t, __builtin_mul_overflow);
	insn(lbl_and, int16_t, op_and);
	insn(lbl_or, int16_t, op_or);
	insn(lbl_xor, int16_t, op_xor);
	insn(lbl_eq, int16_t, op_eq);
	insn(lbl_neq, int16_t, op_neq);
	insn(lbl_lt, int16_t, op_lt);
	insn(lbl_le, int16_t, op_le);

	insn(lbl_add, int32_t, __builtin_add_overflow);
	insn(lbl_sub, int32_t, __builtin_sub_overflow);
	insn(lbl_mul, int32_t, __builtin_mul_overflow);
	insn(lbl_and, int32_t, op_and);
	insn(lbl_or, int32_t, op_or);
	insn(lbl_xor, int32_t, op_xor);
	insn(lbl_eq, int32_t, op_eq);
	insn(lbl_neq, int32_t, op_neq);
	insn(lbl_lt, int32_t, op_lt);
	insn(lbl_le, int32_t, op_le);

	insn(lbl_add, int64_t, __builtin_add_overflow);
	insn(lbl_sub, int64_t, __builtin_sub_overflow);
	insn(lbl_mul, int64_t, __builtin_mul_overflow);
	insn(lbl_and, int64_t, op_and);
	insn(lbl_or, int64_t, op_or);
	insn(lbl_xor, int64_t, op_xor);
	insn(lbl_eq, int64_t, op_eq);
	insn(lbl_neq, int64_t, op_neq);
	insn(lbl_lt, int64_t, op_lt);
	insn(lbl_le, int64_t, op_le);

	insn(lbl_add, int128_t, __builtin_add_overflow);
	insn(lbl_sub, int128_t, __builtin_sub_overflow);
	insn(lbl_mul, int128_t, __builtin_mul_overflow);
	insn(lbl_and, int128_t, op_and);
	insn(lbl_or, int128_t, op_or);
	insn(lbl_xor, int128_t, op_xor);
	insn(lbl_eq, int128_t, op_eq);
	insn(lbl_neq, int128_t, op_neq);
	insn(lbl_lt, int128_t, op_lt);
	insn(lbl_le, int128_t, op_le);

lbl_ex:
	return;
}

#define CODE_SIZE 5000

uint16_t code[CODE_SIZE];
int64_t frame[8] __attribute__((aligned(16)));

int main(void)
{
	int i;

	/*
	 * Fill the code array, cycling through all 50 opcodes; the operands
	 * are in frame slots 2 and 4 and the result goes to slot 6. The frame
	 * is zero-initialized, so no tag check or overflow check ever fires.
	 */
	for (i = 0; i + 1 < CODE_SIZE / 4; i++) {
		code[i * 4] = i % 50;
		code[i * 4 + 1] = 2;
		code[i * 4 + 2] = 4;
		code[i * 4 + 3] = 6;
	}
	/* terminate the program with the exit instruction */
	code[i * 4] = 0xff;

	for (i = 0; i < 1000000; i++) {
		run(frame, code);
	}

	return 0;
}
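/*
 * Sketch of one way to observe the regression described in the header comment
 * (this workflow is an assumption, not part of the original report): compile
 * the file with each compiler version, count the indirect jumps left in the
 * generated assembly of run(), and compare run times, e.g.
 *
 *	clang-18 -O2 -S test.c -o test-18.s
 *	clang-19 -O2 -S test.c -o test-19.s
 *	grep -c 'jmp.*\*' test-18.s test-19.s
 *	clang-19 -O2 test.c && time ./a.out
 *
 * ("test.c" is a placeholder file name.)
 */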