#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>

typedef __int128 int128_t;

#define cat_(a, b)	a##b
#define cat(a, b)	cat_(a, b)
#define cat3(a, b, c)	cat(a, cat(b, c))

#define likely(x)	(__builtin_expect((int)(x), 1))
#define unlikely(x)	(__builtin_expect((int)(x), 0))

#define ASM_PREVENT_CSE	__asm__ volatile ("":::"memory")

__attribute__((noreturn)) void escape(void)
{
	fprintf(stderr, "escape called\n");
	abort();
}

/*
 * This is a simplified bytecode interpreter that shows a clang-19 and clang-20
 * performance degradation.
 *
 * "fp" is a frame pointer. There are byte tags at the beginning of the frame
 * and 64-bit values after the tags.
 * "ip" is a pointer to the byte code (or, more precisely, uint16_t code,
 * because the codes are 16-bit).
 *
 * There are 10 operations (+ - * & | ^ == != < <=) and 5 types (int8_t,
 * int16_t, int32_t, int64_t, int128_t), for a total of 50 instructions. 0xff
 * is a special instruction that exits the interpreter.
 *
 * The macro "insn" is a template for a 3-address instruction. We load the
 * indices of the arguments into "arg1" and "arg2". Then we check the tags; if
 * either tag is non-zero, we escape from the interpreter. Then we load the
 * index of the result variable into "ret".
 *
 * Next, we perform the arithmetic operation. If there was signed overflow, we
 * escape from the interpreter. Then we store the result, advance "ip" by 8
 * bytes, and load the next instruction code into the "next_code" variable. We
 * set "next_label" to point to the machine code for the next instruction.
 * Finally, we jump to that machine code with "goto *next_label".
 *
 * The problem with clang-19 and clang-20 is that they merge all the "goto
 * *next_label" statements into just one machine-code instruction. This reduces
 * the size of the "run" function, but the unfortunate consequence is that the
 * indirect branch predictor cannot be trained to follow the instruction flow,
 * because it can store only one target per instruction. We always get a
 * misprediction.
 *
 * With clang-18 and older, there are multiple "jmp *(%r12,%rax,8)"
 * instructions in the machine code, and each of them can be trained
 * independently to jump to the next instruction.
 *
 * It seems that AMD Zen 4 has smarter prediction, so it can predict even the
 * case where one instruction jumps to multiple targets.
 */
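/*
 * Illustration only, not part of the original reproducer: each instruction
 * occupies four consecutive uint16_t words: { opcode, arg1 slot, arg2 slot,
 * result slot }. The hypothetical emit_insn() helper below merely sketches
 * that encoding; main() at the bottom of this file builds the same layout
 * inline.
 */
static inline uint16_t *emit_insn(uint16_t *p, uint16_t opcode,
				  uint16_t arg1, uint16_t arg2, uint16_t ret)
{
	p[0] = opcode;	/* e.g. 20 selects lbl_add_int32_t in the dispatch table */
	p[1] = arg1;	/* frame slot of the first operand */
	p[2] = arg2;	/* frame slot of the second operand */
	p[3] = ret;	/* frame slot that receives the result */
	return p + 4;	/* the next instruction starts 8 bytes later */
}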
#define insn(lbl, typ, op) \
cat3(lbl,_,typ): \
	arg1 = ip[1]; \
	arg2 = ip[2]; \
	if (unlikely(((char *)fp)[arg1] | ((char *)fp)[arg2])) { \
		ASM_PREVENT_CSE; \
		escape(); \
	} \
	ret = ip[3]; \
	if (unlikely(op(*(typ *)&fp[arg1], *(typ *)&fp[arg2], &cat(result_,typ)))) { \
		ASM_PREVENT_CSE; \
		escape(); \
	} \
	*(typ *)&fp[ret] = cat(result_,typ); \
	ip += 4; \
	next_code = ip[0]; \
	next_label = dispatch[next_code]; \
	goto *next_label;

#define op_and(a, b, c)	((*(c)) = (a) & (b), false)
#define op_or(a, b, c)	((*(c)) = (a) | (b), false)
#define op_xor(a, b, c)	((*(c)) = (a) ^ (b), false)
#define op_eq(a, b, c)	((*(c)) = (a) == (b), false)
#define op_neq(a, b, c)	((*(c)) = (a) != (b), false)
#define op_lt(a, b, c)	((*(c)) = (a) < (b), false)
#define op_le(a, b, c)	((*(c)) = (a) <= (b), false)

void run(int64_t *fp, const uint16_t *ip)
{
	unsigned arg1, arg2, ret;
	uint16_t next_code;
	int8_t result_int8_t;
	int16_t result_int16_t;
	int32_t result_int32_t;
	int64_t result_int64_t;
	int128_t result_int128_t;
	const void *next_label;
	static const void *dispatch[256] = {
		[0] = &&lbl_add_int8_t,
		[1] = &&lbl_sub_int8_t,
		[2] = &&lbl_mul_int8_t,
		[3] = &&lbl_and_int8_t,
		[4] = &&lbl_or_int8_t,
		[5] = &&lbl_xor_int8_t,
		[6] = &&lbl_eq_int8_t,
		[7] = &&lbl_neq_int8_t,
		[8] = &&lbl_lt_int8_t,
		[9] = &&lbl_le_int8_t,

		[10] = &&lbl_add_int16_t,
		[11] = &&lbl_sub_int16_t,
		[12] = &&lbl_mul_int16_t,
		[13] = &&lbl_and_int16_t,
		[14] = &&lbl_or_int16_t,
		[15] = &&lbl_xor_int16_t,
		[16] = &&lbl_eq_int16_t,
		[17] = &&lbl_neq_int16_t,
		[18] = &&lbl_lt_int16_t,
		[19] = &&lbl_le_int16_t,

		[20] = &&lbl_add_int32_t,
		[21] = &&lbl_sub_int32_t,
		[22] = &&lbl_mul_int32_t,
		[23] = &&lbl_and_int32_t,
		[24] = &&lbl_or_int32_t,
		[25] = &&lbl_xor_int32_t,
		[26] = &&lbl_eq_int32_t,
		[27] = &&lbl_neq_int32_t,
		[28] = &&lbl_lt_int32_t,
		[29] = &&lbl_le_int32_t,

		[30] = &&lbl_add_int64_t,
		[31] = &&lbl_sub_int64_t,
		[32] = &&lbl_mul_int64_t,
		[33] = &&lbl_and_int64_t,
		[34] = &&lbl_or_int64_t,
		[35] = &&lbl_xor_int64_t,
		[36] = &&lbl_eq_int64_t,
		[37] = &&lbl_neq_int64_t,
		[38] = &&lbl_lt_int64_t,
		[39] = &&lbl_le_int64_t,

		[40] = &&lbl_add_int128_t,
		[41] = &&lbl_sub_int128_t,
		[42] = &&lbl_mul_int128_t,
		[43] = &&lbl_and_int128_t,
		[44] = &&lbl_or_int128_t,
		[45] = &&lbl_xor_int128_t,
		[46] = &&lbl_eq_int128_t,
		[47] = &&lbl_neq_int128_t,
		[48] = &&lbl_lt_int128_t,
		[49] = &&lbl_le_int128_t,

		[255] = &&lbl_ex,
	};

	next_code = ip[0];
	next_label = dispatch[next_code];
	goto *next_label;

	insn(lbl_add, int8_t, __builtin_add_overflow);
	insn(lbl_sub, int8_t, __builtin_sub_overflow);
	insn(lbl_mul, int8_t, __builtin_mul_overflow);
	insn(lbl_and, int8_t, op_and);
	insn(lbl_or, int8_t, op_or);
	insn(lbl_xor, int8_t, op_xor);
	insn(lbl_eq, int8_t, op_eq);
	insn(lbl_neq, int8_t, op_neq);
	insn(lbl_lt, int8_t, op_lt);
	insn(lbl_le, int8_t, op_le);

	insn(lbl_add, int16_t, __builtin_add_overflow);
	insn(lbl_sub, int16_t, __builtin_sub_overflow);
	insn(lbl_mul, int16_t, __builtin_mul_overflow);
	insn(lbl_and, int16_t, op_and);
	insn(lbl_or, int16_t, op_or);
	insn(lbl_xor, int16_t, op_xor);
	insn(lbl_eq, int16_t, op_eq);
	insn(lbl_neq, int16_t, op_neq);
	insn(lbl_lt, int16_t, op_lt);
	insn(lbl_le, int16_t, op_le);

	insn(lbl_add, int32_t, __builtin_add_overflow);
	insn(lbl_sub, int32_t, __builtin_sub_overflow);
	insn(lbl_mul, int32_t, __builtin_mul_overflow);
	insn(lbl_and, int32_t, op_and);
	insn(lbl_or, int32_t, op_or);
	insn(lbl_xor, int32_t, op_xor);
	insn(lbl_eq, int32_t, op_eq);
	insn(lbl_neq, int32_t, op_neq);
	insn(lbl_lt, int32_t, op_lt);
	insn(lbl_le, int32_t, op_le);

	insn(lbl_add, int64_t, __builtin_add_overflow);
	insn(lbl_sub, int64_t, __builtin_sub_overflow);
	insn(lbl_mul, int64_t, __builtin_mul_overflow);
	insn(lbl_and, int64_t, op_and);
	insn(lbl_or, int64_t, op_or);
	insn(lbl_xor, int64_t, op_xor);
	insn(lbl_eq, int64_t, op_eq);
	insn(lbl_neq, int64_t, op_neq);
	insn(lbl_lt, int64_t, op_lt);
	insn(lbl_le, int64_t, op_le);

	insn(lbl_add, int128_t, __builtin_add_overflow);
	insn(lbl_sub, int128_t, __builtin_sub_overflow);
	insn(lbl_mul, int128_t, __builtin_mul_overflow);
	insn(lbl_and, int128_t, op_and);
	insn(lbl_or, int128_t, op_or);
	insn(lbl_xor, int128_t, op_xor);
	insn(lbl_eq, int128_t, op_eq);
	insn(lbl_neq, int128_t, op_neq);
	insn(lbl_lt, int128_t, op_lt);
	insn(lbl_le, int128_t, op_le);

lbl_ex:
	return;
}

#define CODE_SIZE 5000

uint16_t code[CODE_SIZE];
int64_t frame[8] __attribute__((aligned(16)));

int main(void)
{
	int i;

	/*
	 * Fill the code array, cycling through all 50 opcodes; the operands
	 * are in frame slots 2 and 4 and the result goes to slot 6. The frame
	 * is zero-initialized, so no tag check or overflow check ever fires.
	 */
	for (i = 0; i + 1 < CODE_SIZE / 4; i++) {
		code[i * 4] = i % 50;
		code[i * 4 + 1] = 2;
		code[i * 4 + 2] = 4;
		code[i * 4 + 3] = 6;
	}
	/* terminate the program with the exit instruction */
	code[i * 4] = 0xff;

	for (i = 0; i < 1000000; i++) {
		run(frame, code);
	}

	return 0;
}
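/*
 * Sketch of one way to observe the regression described in the header comment
 * (this workflow is an assumption, not part of the original report): compile
 * the file with each compiler version, count the indirect jumps left in the
 * generated assembly of run(), and compare run times, e.g.
 *
 *	clang-18 -O2 -S test.c -o test-18.s
 *	clang-19 -O2 -S test.c -o test-19.s
 *	grep -c 'jmp.*\*' test-18.s test-19.s
 *	clang-19 -O2 test.c && time ./a.out
 *
 * ("test.c" is a placeholder file name.)
 */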