CompilerX64.hpp
1// SPDX-FileCopyrightText: 2025 Contributors to TPDE <https://tpde.org>
2//
3// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4#pragma once
5
6#include "tpde/AssemblerElf.hpp"
7#include "tpde/AssignmentPartRef.hpp"
8#include "tpde/CompilerBase.hpp"
9#include "tpde/base.hpp"
10#include "tpde/x64/FunctionWriterX64.hpp"
11
12#include <bit>
13
14// Helper macros for assembling in the compiler
15#if defined(ASM) || defined(ASMF) || defined(ASMNC) || defined(ASME)
16 #error Got definition for ASM macros from somewhere else. Maybe you included compilers for multiple architectures?
17#endif
18
19// Use a helper; the parameters might call ASM themselves, so evaluate
20// text_cur_ptr after the arguments.
21#define ASM_FULL(compiler, reserve, op, ...) \
22 ((compiler)->asm_helper(fe64_##op).encode(reserve, __VA_ARGS__))
23
24#define ASM(op, ...) ASM_FULL(this, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
25#define ASMC(compiler, op, ...) \
26 ASM_FULL(compiler, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
27#define ASMF(op, flag, ...) \
28 ASM_FULL(this, 16, op, flag __VA_OPT__(, ) __VA_ARGS__)
29#define ASMNCF(op, flag, ...) \
30 ASM_FULL(this, 0, op, flag __VA_OPT__(, ) __VA_ARGS__)
31#define ASMNC(op, ...) ASM_FULL(this, 0, op, 0 __VA_OPT__(, ) __VA_ARGS__)
32
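// Usage sketch, mirroring the call sites further down in this file:
//   ASM(MOV64rr, FE_BP, FE_SP);           // reserve 16 bytes, encode mov rbp, rsp
//   ASMNC(MOV64mr, mem, reg);             // no reservation; caller ensured space
//   ASMF(SETcc8r, jump_to_cond(cc), dst); // pass an extra encoder flag (condition)
// ASMC takes an explicit compiler pointer instead of `this` (used in CallBuilder).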
33namespace tpde::x64 {
34
35struct AsmReg : Reg {
36 enum REG : u8 {
37 AX = 0,
38 CX,
39 DX,
40 BX,
41 SP,
42 BP,
43 SI,
44 DI,
45 R8,
46 R9,
47 R10,
48 R11,
49 R12,
50 R13,
51 R14,
52 R15,
53
54 XMM0 = 32,
55 XMM1,
56 XMM2,
57 XMM3,
58 XMM4,
59 XMM5,
60 XMM6,
61 XMM7,
62 XMM8,
63 XMM9,
64 XMM10,
65 XMM11,
66 XMM12,
67 XMM13,
68 XMM14,
69 XMM15,
70 // TODO(ts): optional support for AVX registers with compiler flag
71 };
72
73 constexpr explicit AsmReg() noexcept : Reg((u8)0xFF) {}
74
75 constexpr AsmReg(const REG id) noexcept : Reg((u8)id) {}
76
77 constexpr AsmReg(const Reg base) noexcept : Reg(base) {}
78
79 constexpr explicit AsmReg(const u8 id) noexcept : Reg(id) {
80 assert(id <= R15 || (id >= XMM0 && id <= XMM15));
81 }
82
83 constexpr explicit AsmReg(const u64 id) noexcept : Reg(id) {
84 assert(id <= R15 || (id >= XMM0 && id <= XMM15));
85 }
86
87 constexpr operator FeRegGP() const noexcept {
88 assert(reg_id <= R15);
89 return FeRegGP{reg_id};
90 }
91
92 operator FeRegGPLH() const noexcept {
93 assert(reg_id <= R15);
94 return FeRegGP{reg_id};
95 }
96
97 constexpr operator FeRegXMM() const noexcept {
98 assert(reg_id >= XMM0 && reg_id <= XMM15);
99 return FeRegXMM{static_cast<u8>(reg_id & 0x1F)};
100 }
101};
102
103constexpr static u64
104 create_bitmask(const std::initializer_list<AsmReg::REG> regs) {
105 u64 set = 0;
106 for (const auto reg : regs) {
107 set |= 1ull << reg;
108 }
109 return set;
110}
111
112template <size_t N>
113constexpr static u64 create_bitmask(const std::array<AsmReg, N> regs) {
114 u64 set = 0;
115 for (const auto reg : regs) {
116 set |= 1ull << reg.id();
117 }
118 return set;
119}
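// Example: create_bitmask({AsmReg::BP, AsmReg::SP}) sets bits 5 and 4 and thus
// yields 0x30; see the masks in CCAssignerSysV::Info below.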
120
121class CCAssignerSysV : public CCAssigner {
122public:
123 static constexpr CCInfo Info{
124 .allocatable_regs =
125 0xFFFF'0000'FFFF & ~create_bitmask({AsmReg::BP, AsmReg::SP}),
126 .callee_saved_regs = create_bitmask({
127 AsmReg::BX,
128 AsmReg::R12,
129 AsmReg::R13,
130 AsmReg::R14,
131 AsmReg::R15,
132 }),
133 .arg_regs = create_bitmask({
134 AsmReg::DI,
135 AsmReg::SI,
136 AsmReg::DX,
137 AsmReg::CX,
138 AsmReg::R8,
139 AsmReg::R9,
140 AsmReg::XMM0,
141 AsmReg::XMM1,
142 AsmReg::XMM2,
143 AsmReg::XMM3,
144 AsmReg::XMM4,
145 AsmReg::XMM5,
146 AsmReg::XMM6,
147 AsmReg::XMM7,
148 }),
149 .red_zone_size = 128,
150 };
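  // Bit layout of the masks above: bits 0-15 cover the GP registers (AX..R15)
  // and bits 32-47 cover XMM0..XMM15 (AsmReg::XMM0 == 32), hence the
  // 0xFFFF'0000'FFFF base mask for allocatable_regs.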
151
152private:
153 u32 gp_cnt = 0, xmm_cnt = 0, stack = 0;
154 // The next N assignments must go to the stack.
155 unsigned must_assign_stack = 0;
156 bool vararg;
157 u32 ret_gp_cnt = 0, ret_xmm_cnt = 0;
158
159public:
160 CCAssignerSysV(bool vararg = false) noexcept
161 : CCAssigner(Info), vararg(vararg) {}
162
163 void reset() noexcept override {
164 gp_cnt = xmm_cnt = stack = 0;
165 must_assign_stack = 0;
166 vararg = false;
167 ret_gp_cnt = ret_xmm_cnt = 0;
168 }
169
170 void assign_arg(CCAssignment &arg) noexcept override {
171 if (arg.byval) {
172 stack = util::align_up(stack, arg.align < 8 ? 8 : arg.align);
173 arg.stack_off = stack;
174 stack += arg.size;
175 return;
176 }
177
178 if (arg.bank == RegBank{0}) {
179 static constexpr std::array<AsmReg, 6> gp_arg_regs{
180 AsmReg::DI,
181 AsmReg::SI,
182 AsmReg::DX,
183 AsmReg::CX,
184 AsmReg::R8,
185 AsmReg::R9,
186 };
187 if (!must_assign_stack && gp_cnt + arg.consecutive < gp_arg_regs.size()) {
188 arg.reg = gp_arg_regs[gp_cnt];
189 gp_cnt += 1;
190 } else {
191 // Next N arguments must also be assigned to the stack
192 // Increment by one, the value is immediately decremented below.
193 must_assign_stack = arg.consecutive + 1;
194 stack = util::align_up(stack, arg.align < 8 ? 8 : arg.align);
195 arg.stack_off = stack;
196 stack += 8;
197 }
198 } else {
199 if (!must_assign_stack && xmm_cnt < 8) {
200 arg.reg = Reg{AsmReg::XMM0 + xmm_cnt};
201 xmm_cnt += 1;
202 } else {
203 // Next N arguments must also be assigned to the stack
204 // Increment by one, the value is immediately decremented below.
205 must_assign_stack = arg.consecutive + 1;
206 u32 size = util::align_up(arg.size, 8);
207 stack = util::align_up(stack, size);
208 arg.stack_off = stack;
209 stack += size;
210 }
211 }
212
213 if (must_assign_stack > 0) {
214 must_assign_stack -= 1;
215 }
216 }
217
218 u32 get_stack_size() noexcept override { return stack; }
219
220 bool is_vararg() const noexcept override { return vararg; }
221
222 void assign_ret(CCAssignment &arg) noexcept override {
223 assert(!arg.byval && !arg.sret);
224 if (arg.bank == RegBank{0}) {
225 if (ret_gp_cnt + arg.consecutive < 2) {
226 arg.reg = Reg{ret_gp_cnt == 0 ? AsmReg::AX : AsmReg::DX};
227 ret_gp_cnt += 1;
228 } else {
229 assert(false);
230 }
231 } else {
232 if (ret_xmm_cnt + arg.consecutive < 2) {
233 arg.reg = Reg{ret_xmm_cnt == 0 ? AsmReg::XMM0 : AsmReg::XMM1};
234 ret_xmm_cnt += 1;
235 } else {
236 assert(false);
237 }
238 }
239 }
240};
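// Assignment sketch (standard SysV behavior, as implemented by assign_arg
// above): for a call like f(int a, double b, void *c), the assigner hands out
// DI for a, XMM0 for b and SI for c; once the six GP or eight XMM argument
// registers are exhausted, further arguments of that bank are placed in
// 8-byte-aligned stack slots.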
241
242struct PlatformConfig : CompilerConfigDefault {
243 using Assembler = AssemblerElfX64;
244 using AsmReg = tpde::x64::AsmReg;
245 using DefaultCCAssigner = CCAssignerSysV;
246 using FunctionWriter = FunctionWriterX64;
247
248 static constexpr RegBank GP_BANK{0};
249 static constexpr RegBank FP_BANK{1};
250 static constexpr bool FRAME_INDEXING_NEGATIVE = true;
251 static constexpr u32 PLATFORM_POINTER_SIZE = 8;
252 static constexpr u32 NUM_BANKS = 2;
253};
254
255namespace concepts {
256template <typename T, typename Config>
257concept Compiler = tpde::Compiler<T, Config> && requires(T a) {
258 {
259 a.arg_is_int128(std::declval<typename T::IRValueRef>())
260 } -> std::convertible_to<bool>;
261
262 {
263 a.arg_allow_split_reg_stack_passing(std::declval<typename T::IRValueRef>())
264 } -> std::convertible_to<bool>;
265};
266} // namespace concepts
267
268template <IRAdaptor Adaptor,
269 typename Derived,
270 template <typename, typename, typename> typename BaseTy =
271 CompilerBase,
272 typename Config = PlatformConfig>
273struct CompilerX64 : BaseTy<Adaptor, Derived, Config> {
274 using Base = BaseTy<Adaptor, Derived, Config>;
275
276 using IRValueRef = typename Base::IRValueRef;
277 using IRBlockRef = typename Base::IRBlockRef;
278 using IRFuncRef = typename Base::IRFuncRef;
279
280 using ScratchReg = typename Base::ScratchReg;
281 using ValuePartRef = typename Base::ValuePartRef;
282 using ValuePart = typename Base::ValuePart;
283 using GenericValuePart = typename Base::GenericValuePart;
284
285 using Assembler = typename PlatformConfig::Assembler;
286 using RegisterFile = typename Base::RegisterFile;
287
288 using CallArg = typename Base::CallArg;
289
290 using Base::derived;
291
292
293 // TODO(ts): make this dependent on the number of callee-saved regs of the
294 // current function or if there is a call in the function?
295 static constexpr u32 NUM_FIXED_ASSIGNMENTS[PlatformConfig::NUM_BANKS] = {5,
296 6};
297
298 enum CPU_FEATURES : u32 {
299 CPU_BASELINE = 0, // x86-64-v1
300 CPU_CMPXCHG16B = (1 << 0),
301 CPU_POPCNT = (1 << 1),
302 CPU_SSE3 = (1 << 2),
303 CPU_SSSE3 = (1 << 3),
304 CPU_SSE4_1 = (1 << 4),
305 CPU_SSE4_2 = (1 << 5),
306 CPU_AVX = (1 << 6),
307 CPU_AVX2 = (1 << 7),
308 CPU_BMI1 = (1 << 8),
309 CPU_BMI2 = (1 << 9),
310 CPU_F16C = (1 << 10),
311 CPU_FMA = (1 << 11),
312 CPU_LZCNT = (1 << 12),
313 CPU_MOVBE = (1 << 13),
314 CPU_AVX512F = (1 << 14),
315 CPU_AVX512BW = (1 << 15),
316 CPU_AVX512CD = (1 << 16),
317 CPU_AVX512DQ = (1 << 17),
318 CPU_AVX512VL = (1 << 18),
319
320 CPU_V2 = CPU_BASELINE | CPU_CMPXCHG16B | CPU_POPCNT | CPU_SSE3 | CPU_SSSE3 |
321 CPU_SSE4_1 | CPU_SSE4_2,
322 CPU_V3 = CPU_V2 | CPU_AVX | CPU_AVX2 | CPU_BMI1 | CPU_BMI2 | CPU_F16C |
323 CPU_FMA | CPU_LZCNT | CPU_MOVBE,
324 CPU_V4 = CPU_V3 | CPU_AVX512F | CPU_AVX512BW | CPU_AVX512CD | CPU_AVX512DQ |
325 CPU_AVX512VL,
326 };
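  // Feature sets combine bitwise and are queried as a whole, e.g.
  // has_cpu_feats(CPU_V3) (defined below) is true only if every bit of the
  // x86-64-v3 set is present in cpu_feats.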
327
328 CPU_FEATURES cpu_feats = CPU_BASELINE;
329
330 // When handling function arguments, we need to prevent argument registers
331 // from being handed out as fixed registers
332 //
333 // Additionally, for now we prevent AX, DX and CX from being fixed so we do
334 // not run into issues with instructions that need them as implicit operands.
335 // AX and DX can also never be fixed when exception handling is used, since
336 // they are clobbered there.
337 u64 fixed_assignment_nonallocatable_mask =
338 create_bitmask({AsmReg::AX, AsmReg::DX, AsmReg::CX});
339 u32 func_start_off = 0u, func_reg_save_off = 0u, func_reg_save_alloc = 0u,
340 func_reg_restore_alloc = 0u;
341 /// For vararg functions only: number of scalar and xmm registers used.
342 // TODO: this information should be obtained from the CCAssigner.
343 u32 scalar_arg_count = 0xFFFF'FFFF, vec_arg_count = 0xFFFF'FFFF;
344 u32 reg_save_frame_off = 0;
345 u32 var_arg_stack_off = 0;
346 util::SmallVector<u32, 8> func_ret_offs = {};
347 /// For functions without dynamic allocas, the largest size used for arguments
348 /// passed on the stack to callees. This size is added to the stack pointer
349 /// subtraction/addition in prologue/epilogue to avoid stack pointer
350 /// adjustments at call sites.
351 u32 max_callee_stack_arg_size;
352
353 /// Whether flags must be preserved when materializing constants etc.
354 bool preserve_flags;
355
356 /// Symbol for __tls_get_addr.
357 SymRef sym_tls_get_addr;
358
359 class CallBuilder : public Base::template CallBuilderBase<CallBuilder> {
360 u32 stack_adjust_off = 0;
361
362 void set_stack_used() noexcept;
363
364 public:
365 CallBuilder(Derived &compiler, CCAssigner &assigner) noexcept
366 : Base::template CallBuilderBase<CallBuilder>(compiler, assigner) {}
367
368 void add_arg_byval(ValuePart &vp, CCAssignment &cca) noexcept;
369 void add_arg_stack(ValuePart &vp, CCAssignment &cca) noexcept;
370 void call_impl(std::variant<SymRef, ValuePart> &&target) noexcept;
371 void reset_stack() noexcept;
372 };
373
374 // for now, always generate an object
375 explicit CompilerX64(Adaptor *adaptor,
376 const CPU_FEATURES cpu_features = CPU_BASELINE)
377 : Base{adaptor}, cpu_feats(cpu_features) {
378 static_assert(std::is_base_of_v<CompilerX64, Derived>);
379 static_assert(concepts::Compiler<Derived, PlatformConfig>);
380 }
381
382 template <typename... Args>
383 auto asm_helper(unsigned (*enc_fn)(u8 *, int, Args...)) {
384 struct Helper {
385 CompilerX64 *compiler;
386 decltype(enc_fn) fn;
387 void encode(unsigned reserve, int flags, Args... args) {
388 if (reserve) {
389 compiler->text_writer.ensure_space(reserve);
390 }
391 unsigned n = fn(compiler->text_writer.cur_ptr(), flags, args...);
392 assert(n != 0);
393 compiler->text_writer.cur_ptr() += n;
394 }
395 };
396 return Helper{this, enc_fn};
397 }
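  // Expansion sketch: ASM(ADD64rr, dst, src) becomes
  //   this->asm_helper(fe64_ADD64rr).encode(16, 0, dst, src);
  // i.e. reserve 16 bytes, invoke the fe64_* encoder at the current write
  // position, assert that encoding succeeded and advance the write pointer.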
398
399 void start_func(u32 func_idx) noexcept;
400
401 void gen_func_prolog_and_args(CCAssigner *) noexcept;
402
403 void finish_func(u32 func_idx) noexcept;
404
405 void reset() noexcept;
406
407 // helpers
408
409 void gen_func_epilog() noexcept;
410
411 void set_preserve_flags(bool preserve) noexcept { preserve_flags = preserve; }
412 bool may_clobber_flags() noexcept { return !preserve_flags; }
413
414 void
415 spill_reg(const AsmReg reg, const i32 frame_off, const u32 size) noexcept;
416
417 void load_from_stack(AsmReg dst,
418 i32 frame_off,
419 u32 size,
420 bool sign_extend = false) noexcept;
421
422 void load_address_of_stack_var(AsmReg dst, AssignmentPartRef ap) noexcept;
423
424 void mov(AsmReg dst, AsmReg src, u32 size) noexcept;
425
426 GenericValuePart val_spill_slot(AssignmentPartRef ap) noexcept {
427 assert(ap.stack_valid() && !ap.variable_ref());
428 return typename GenericValuePart::Expr(AsmReg::BP, ap.frame_off());
429 }
430
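  /// Materialize an address expression (base + index * scale + disp) from a
  /// GenericValuePart into a single register, reusing a scratch register owned
  /// by the expression where possible. In the common case (power-of-two scale
  /// below 16, 32-bit displacement) this collapses to a single lea, e.g.
  /// lea dst, [base + 8*index + disp].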
431 AsmReg gval_expr_as_reg(GenericValuePart &gv) noexcept;
432
433 /// Dynamic alloca of a fixed-size region.
434 void alloca_fixed(u64 size, u32 align, ValuePart &res) noexcept;
435
436 /// Dynamic alloca of a dynamically-sized region (elem_size * count bytes).
437 /// count must be a 64-bit value.
438 void alloca_dynamic(u64 elem_size,
439 ValuePart &&count,
440 u32 align,
441 ValuePart &res) noexcept;
442
443 void materialize_constant(const u64 *data,
444 RegBank bank,
445 u32 size,
446 AsmReg dst) noexcept;
447
448 AsmReg select_fixed_assignment_reg(AssignmentPartRef, IRValueRef) noexcept;
449
450 /// Jump conditions.
451 enum class Jump {
452 jo = 0, ///< Jump if overflow (OF=1).
453 jno, ///< Jump if not overflow (OF=0).
454 jb, ///< Jump if below/if carry (CF=1).
455 jae, ///< Jump if above or equal/if not carry (CF=0).
456 je, ///< Jump if equal/if zero (ZF=1).
457 jne, ///< Jump if not equal/if not zero (ZF=0).
458 jbe, ///< Jump if below or equal (CF=1 or ZF=1).
459 ja, ///< Jump if above (CF=0 and ZF=0).
460 js, ///< Jump if sign (SF=1).
461 jns, ///< Jump if not sign (SF=0).
462 jp, ///< Jump if parity even (PF=1).
463 jnp, ///< Jump if parity odd (PF=0).
464 jl, ///< Jump if less (SF!=OF).
465 jge, ///< Jump if greater or equal (SF=OF).
466 jle, ///< Jump if less or equal (ZF=1 or SF!=OF).
467 jg, ///< Jump if greater (ZF=0 and SF=OF).
468 jmp, ///< Unconditional jump
469 };
470
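  // invert_jump negates the condition (e.g. jl -> jge), swap_jump mirrors it
  // as if the comparison operands were exchanged (e.g. jl -> jg); see the
  // switch tables in the implementations below.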
471 Jump invert_jump(Jump jmp) noexcept;
472 Jump swap_jump(Jump jmp) noexcept;
473
474 FeCond jump_to_cond(Jump jmp) noexcept;
475
476 void generate_branch_to_block(Jump jmp,
477 IRBlockRef target,
478 bool needs_split,
479 bool last_inst) noexcept;
480
481 void generate_raw_jump(Jump jmp, Label target) noexcept;
482
483 /// Set dst to 1 if cc is true, otherwise set it to zero. If zext is false,
484 /// only the lowest 8 bits are set. Flags are not clobbered.
485 void generate_raw_set(Jump cc, AsmReg dst, bool zext = true) noexcept;
486 /// Set all bits of dst to 1 if cc is true, otherwise set it to zero.
487 void generate_raw_mask(Jump cc, AsmReg dst) noexcept;
488 /// Move src into dst if cc is true, otherwise do nothing.
489 void generate_raw_cmov(Jump cc, AsmReg dst, AsmReg src, bool is_64) noexcept;
490
491 void generate_raw_intext(
492 AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept;
493
494 /// Bitfield insert. Needs a temporary register, src is not modified.
495 void generate_raw_bfi(AsmReg dst, AsmReg src, u32 lsb, u32 width) noexcept;
496 /// Bitfield insert in zero. src is not modified, but src and dst must be
497 /// different.
498 void generate_raw_bfiz(AsmReg dst, AsmReg src, u32 lsb, u32 width) noexcept;
499
500 /// Generate a function call
501 ///
502 /// This will get the arguments into the correct registers according to the
503 /// calling convention, clear non-callee-saved registers from the register
504 /// file (make sure you do not have any fixed assignments left over) and
505 /// fill the result registers (the u8 in the ScratchReg pair indicates the
506 /// register bank)
507 ///
508 /// Targets can be a symbol (call to PLT with relocation), or an indirect
509 /// call to a ValuePart. Result is an optional reference.
510 void generate_call(std::variant<SymRef, ValuePart> &&target,
511 std::span<CallArg> arguments,
512 typename Base::ValueRef *result,
513 bool variable_args = false);
514
515private:
516 /// Internal function, don't use. Emit compare of cmp_reg with case_value.
517 void switch_emit_cmp(AsmReg cmp_reg,
518 AsmReg tmp_reg,
519 u64 case_value,
520 bool width_is_32) noexcept;
521
522public:
523 /// Internal function, don't use. Jump if cmp_reg equals case_value.
524 void switch_emit_cmpeq(Label case_label,
525 AsmReg cmp_reg,
526 AsmReg tmp_reg,
527 u64 case_value,
528 bool width_is_32) noexcept;
529 /// Internal function, don't use. Emit bounds check and jump table.
530 bool switch_emit_jump_table(Label default_label,
531 std::span<const Label> labels,
532 AsmReg cmp_reg,
533 AsmReg tmp_reg,
534 u64 low_bound,
535 u64 high_bound,
536 bool width_is_32) noexcept;
537 /// Internal function, don't use. Jump if cmp_reg is greater than case_value.
538 void switch_emit_binary_step(Label case_label,
539 Label gt_label,
540 AsmReg cmp_reg,
541 AsmReg tmp_reg,
542 u64 case_value,
543 bool width_is_32) noexcept;
544
545 /// Generate code sequence to load address of sym into a register. This will
546 /// generate a function call for dynamic TLS access models.
547 ScratchReg tls_get_addr(SymRef sym, TLSModel model) noexcept;
548
549 bool has_cpu_feats(CPU_FEATURES feats) const noexcept {
550 return ((cpu_feats & feats) == feats);
551 }
552};
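// Usage sketch with hypothetical names (MyIRAdaptor, MyCompiler): concrete
// backends derive from CompilerX64 in CRTP style, as enforced by the
// static_asserts in the constructor:
//   struct MyCompiler : CompilerX64<MyIRAdaptor, MyCompiler> {
//     // ...implement the hooks required by tpde::x64::concepts::Compiler,
//     // e.g. arg_is_int128() and arg_allow_split_reg_stack_passing()...
//   };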
553
554template <IRAdaptor Adaptor,
555 typename Derived,
556 template <typename, typename, typename> class BaseTy,
557 typename Config>
558void CompilerX64<Adaptor, Derived, BaseTy, Config>::start_func(
559 const u32 /*func_idx*/) noexcept {
560 this->text_writer.align(16);
561 this->assembler.except_begin_func();
562 this->preserve_flags = false;
563}
564
565template <IRAdaptor Adaptor,
566 typename Derived,
567 template <typename, typename, typename> typename BaseTy,
568 typename Config>
569void CompilerX64<Adaptor, Derived, BaseTy, Config>::gen_func_prolog_and_args(
570 CCAssigner *cc_assigner) noexcept {
571 // prologue:
572 // push rbp
573 // mov rbp, rsp
574 // optionally create vararg save-area
575 // reserve space for callee-saved regs
576 // = 1 byte for each of the lower 8 regs and 2
577 // bytes for the higher 8 regs
578 // sub rsp, #<frame_size>+<largest_call_frame_usage>
579
580 // TODO(ts): technically we only need rbp if there
581 // is a dynamic alloca but then we need to make the
582 // frame indexing dynamic in CompilerBase and the
583 // unwind info needs to take the dynamic sub rsp for
584 // calls into account
585
586 func_ret_offs.clear();
587 func_start_off = this->text_writer.offset();
588 scalar_arg_count = vec_arg_count = 0xFFFF'FFFF;
589
590 const CCInfo &cc_info = cc_assigner->get_ccinfo();
591
592 ASM(PUSHr, FE_BP);
593 ASM(MOV64rr, FE_BP, FE_SP);
594
595 func_reg_save_off = this->text_writer.offset();
596
597 auto csr = cc_info.callee_saved_regs;
598 assert(!(csr & ~this->register_file.bank_regs(Config::GP_BANK)) &&
599 "non-gp callee-saved registers not implemented");
600
601 u32 csr_logp = std::popcount((csr >> AsmReg::AX) & 0xff);
602 u32 csr_higp = std::popcount((csr >> AsmReg::R8) & 0xff);
603 // R8 and higher need a REX prefix; 7 bytes for sub rsp.
604 u32 reg_save_size = 1 * csr_logp + 2 * csr_higp + 7;
605 this->stack.frame_size = 8 * (csr_logp + csr_higp);
606 max_callee_stack_arg_size = 0;
607
608 this->text_writer.ensure_space(reg_save_size);
609 this->text_writer.cur_ptr() += reg_save_size;
610 func_reg_save_alloc = reg_save_size;
611 // pop uses the same number of bytes as push
612 func_reg_restore_alloc = reg_save_size;
613
614 // TODO(ts): support larger stack alignments?
615
616 if (this->adaptor->cur_is_vararg()) {
617 this->stack.frame_size += 6 * 8 + 8 * 16;
618 reg_save_frame_off = this->stack.frame_size;
619 auto mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off);
620 ASM(MOV64mr, mem, FE_DI);
621 mem.off += 8;
622 ASM(MOV64mr, mem, FE_SI);
623 mem.off += 8;
624 ASM(MOV64mr, mem, FE_DX);
625 mem.off += 8;
626 ASM(MOV64mr, mem, FE_CX);
627 mem.off += 8;
628 ASM(MOV64mr, mem, FE_R8);
629 mem.off += 8;
630 ASM(MOV64mr, mem, FE_R9);
631 auto skip_fp = this->text_writer.label_create();
632 ASM(TEST8rr, FE_AX, FE_AX);
633 generate_raw_jump(Jump::je, skip_fp);
634 mem.off += 8;
635 ASM(SSE_MOVDQUmr, mem, FE_XMM0);
636 mem.off += 16;
637 ASM(SSE_MOVDQUmr, mem, FE_XMM1);
638 mem.off += 16;
639 ASM(SSE_MOVDQUmr, mem, FE_XMM2);
640 mem.off += 16;
641 ASM(SSE_MOVDQUmr, mem, FE_XMM3);
642 mem.off += 16;
643 ASM(SSE_MOVDQUmr, mem, FE_XMM4);
644 mem.off += 16;
645 ASM(SSE_MOVDQUmr, mem, FE_XMM5);
646 mem.off += 16;
647 ASM(SSE_MOVDQUmr, mem, FE_XMM6);
648 mem.off += 16;
649 ASM(SSE_MOVDQUmr, mem, FE_XMM7);
650 this->label_place(skip_fp);
651 }
652
653 // Temporarily prevent argument registers from being assigned.
654 assert((cc_info.allocatable_regs & cc_info.arg_regs) == cc_info.arg_regs &&
655 "argument registers must also be allocatable");
656 this->register_file.allocatable &= ~cc_info.arg_regs;
657
658 u32 arg_idx = 0;
659 for (const IRValueRef arg : this->adaptor->cur_args()) {
660 derived()->handle_func_arg(
661 arg_idx,
662 arg,
663 [&](ValuePart &&vp, CCAssignment cca) -> std::optional<i32> {
664 if (!cca.byval) {
665 cca.bank = vp.bank();
666 cca.size = vp.part_size();
667 }
668
669 cc_assigner->assign_arg(cca);
670
671 if (cca.reg.valid()) [[likely]] {
672 vp.set_value_reg(this, cca.reg);
673 // Mark register as allocatable as soon as it is assigned. If the
674 // argument is unused, the register will be freed immediately and
675 // can be used for later stack arguments.
676 this->register_file.allocatable |= u64{1} << cca.reg.id();
677 return {};
678 }
679
680 if (vp.is_owned()) {
681 // no need to handle unused arguments
682 return {};
683 }
684
685 if (cca.byval) {
686 // Return byval frame_off.
687 return 0x10 + cca.stack_off;
688 } else {
689 // TODO(ts): maybe allow negative frame offsets for value
690 // assignments so we can simply reference this?
691 // but this probably doesn't work with multi-part values
692 // since the offsets are different
693 AsmReg dst = vp.alloc_reg(this);
694 this->load_from_stack(dst, 0x10 + cca.stack_off, cca.size);
695 }
696 return {};
697 });
698
699 arg_idx += 1;
700 }
701
702 if (this->adaptor->cur_is_vararg()) [[unlikely]] {
703 // TODO: get this from CCAssigner?
704 auto arg_regs = this->register_file.allocatable & cc_info.arg_regs;
705 u64 gp_regs = arg_regs & this->register_file.bank_regs(Config::GP_BANK);
706 u64 xmm_regs = arg_regs & this->register_file.bank_regs(Config::FP_BANK);
707 this->scalar_arg_count = std::popcount(gp_regs);
708 this->vec_arg_count = std::popcount(xmm_regs);
709 this->var_arg_stack_off = 0x10 + cc_assigner->get_stack_size();
710 }
711
712 this->register_file.allocatable |= cc_info.arg_regs;
713}
714
715template <IRAdaptor Adaptor,
716 typename Derived,
717 template <typename, typename, typename> typename BaseTy,
718 typename Config>
719void CompilerX64<Adaptor, Derived, BaseTy, Config>::finish_func(
720 u32 func_idx) noexcept {
721 // NB: code alignment factor 1, data alignment factor -8.
722 auto fde_off = this->assembler.eh_begin_fde(this->get_personality_sym());
723 // push rbp
724 this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 1);
725 this->assembler.eh_write_inst(dwarf::DW_CFA_def_cfa_offset, 16);
726 this->assembler.eh_write_inst(
727 dwarf::DW_CFA_offset, dwarf::x64::DW_reg_rbp, 2);
728 // mov rbp, rsp
729 this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 3);
730 this->assembler.eh_write_inst(dwarf::DW_CFA_def_cfa_register,
731 dwarf::x64::DW_reg_rbp);
732
733 // Patched below
734 auto fde_prologue_adv_off = this->assembler.eh_writer.size();
735 this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 0);
736
737 auto *write_ptr = this->text_writer.begin_ptr() + func_reg_save_off;
738 const CCInfo &ccinfo = derived()->cur_cc_assigner()->get_ccinfo();
739 auto csr = ccinfo.callee_saved_regs;
740 u64 saved_regs = this->register_file.clobbered & csr;
741 u32 num_saved_regs = 0u;
742 for (auto reg : util::BitSetIterator{saved_regs}) {
743 assert(reg <= AsmReg::R15);
744 write_ptr +=
745 fe64_PUSHr(write_ptr, 0, AsmReg{static_cast<AsmReg::REG>(reg)});
746 ++num_saved_regs;
747
748 // DWARF register ordering is subtly different from the encoding:
749 // x86 is: ax, cx, dx, bx, sp, bp, si, di, r8, ...
750 // DWARF is: ax, dx, cx, bx, si, di, bp, sp, r8, ...
751 static const u8 gpreg_to_dwarf[] = {
752 dwarf::x64::DW_reg_rax,
753 dwarf::x64::DW_reg_rcx,
754 dwarf::x64::DW_reg_rdx,
755 dwarf::x64::DW_reg_rbx,
756 dwarf::x64::DW_reg_rsp,
757 dwarf::x64::DW_reg_rbp,
758 dwarf::x64::DW_reg_rsi,
759 dwarf::x64::DW_reg_rdi,
760 dwarf::x64::DW_reg_r8,
761 dwarf::x64::DW_reg_r9,
762 dwarf::x64::DW_reg_r10,
763 dwarf::x64::DW_reg_r11,
764 dwarf::x64::DW_reg_r12,
765 dwarf::x64::DW_reg_r13,
766 dwarf::x64::DW_reg_r14,
767 dwarf::x64::DW_reg_r15,
768 };
769 u8 dwarf_reg = gpreg_to_dwarf[reg];
770 auto cfa_off = num_saved_regs + 2;
771 this->assembler.eh_write_inst(dwarf::DW_CFA_offset, dwarf_reg, cfa_off);
772 }
773
774 assert((!this->stack.has_dynamic_alloca || max_callee_stack_arg_size == 0) &&
775 "stack with dynamic alloca must adjust stack pointer at call sites");
776 // The frame_size contains the reserved frame size so we need to subtract
777 // the stack space we used for the saved registers
778 u32 final_frame_size =
779 util::align_up(this->stack.frame_size + max_callee_stack_arg_size, 16);
780 u32 rsp_adjustment = final_frame_size - num_saved_regs * 8;
781 bool needs_rsp_adjustment = this->stack.generated_call ||
782 this->stack.has_dynamic_alloca ||
783 rsp_adjustment > ccinfo.red_zone_size;
784
785 if (needs_rsp_adjustment) {
786 write_ptr += fe64_SUB64ri(write_ptr, 0, FE_SP, rsp_adjustment);
787 }
788
789 u32 prologue_size =
790 write_ptr - (this->text_writer.begin_ptr() + func_start_off);
791 assert(prologue_size < 0x44);
792 this->assembler.eh_writer.data()[fde_prologue_adv_off] =
793 dwarf::DW_CFA_advance_loc | (prologue_size - 4);
794
795 // nop out the rest
796 const auto reg_save_end =
797 this->text_writer.begin_ptr() + func_reg_save_off + func_reg_save_alloc;
798 assert(reg_save_end >= write_ptr);
799 const u32 nop_len = reg_save_end - write_ptr;
800 if (nop_len) {
801 fe64_NOP(write_ptr, nop_len);
802 }
803
804 auto func_sym = this->func_syms[func_idx];
805 auto func_sec = this->text_writer.get_sec_ref();
806 if (func_ret_offs.empty()) {
807 // TODO(ts): honor cur_needs_unwind_info
808 auto func_size = this->text_writer.offset() - func_start_off;
809 this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
810 this->assembler.eh_end_fde(fde_off, func_sym);
811 this->assembler.except_encode_func(func_sym,
812 this->text_writer.label_offsets.data());
813 return;
814 }
815
816 auto *text_data = this->text_writer.begin_ptr();
817 u32 first_ret_off = func_ret_offs[0];
818 u32 ret_size = 0;
819 u32 epilogue_size = 7 + 1 + 1 + func_reg_restore_alloc; // add + pop + ret
820 u32 func_end_ret_off = this->text_writer.offset() - epilogue_size;
821 {
822 write_ptr = text_data + first_ret_off;
823 const auto ret_start = write_ptr;
824 if (this->stack.has_dynamic_alloca) {
825 if (num_saved_regs == 0) {
826 write_ptr += fe64_MOV64rr(write_ptr, 0, FE_SP, FE_BP);
827 } else {
828 write_ptr +=
829 fe64_LEA64rm(write_ptr,
830 0,
831 FE_SP,
832 FE_MEM(FE_BP, 0, FE_NOREG, -(i32)num_saved_regs * 8));
833 }
834 } else if (needs_rsp_adjustment) {
835 write_ptr += fe64_ADD64ri(write_ptr, 0, FE_SP, rsp_adjustment);
836 }
837 for (auto reg : util::BitSetIterator<true>{saved_regs}) {
838 assert(reg <= AsmReg::R15);
839 write_ptr +=
840 fe64_POPr(write_ptr, 0, AsmReg{static_cast<AsmReg::REG>(reg)});
841 }
842 write_ptr += fe64_POPr(write_ptr, 0, FE_BP);
843 write_ptr += fe64_RET(write_ptr, 0);
844 ret_size = write_ptr - ret_start;
845 assert(ret_size <= epilogue_size && "function epilogue too long");
846
847 // write NOP for better disassembly
848 if (epilogue_size > ret_size) {
849 fe64_NOP(write_ptr, epilogue_size - ret_size);
850 if (first_ret_off == func_end_ret_off) {
851 this->text_writer.cur_ptr() -= epilogue_size - ret_size;
852 }
853 }
854 }
855
856 for (u32 i = 1; i < func_ret_offs.size(); ++i) {
857 std::memcpy(
858 text_data + func_ret_offs[i], text_data + first_ret_off, epilogue_size);
859 if (func_ret_offs[i] == func_end_ret_off) {
860 this->text_writer.cur_ptr() -= epilogue_size - ret_size;
861 }
862 }
863
864 // Do sym_def at the very end; we shorten the function here again, so only at
865 // this point do we know the actual size of the function.
866 // TODO(ts): honor cur_needs_unwind_info
867 auto func_size = this->text_writer.offset() - func_start_off;
868 this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
869 this->assembler.eh_end_fde(fde_off, func_sym);
870 this->assembler.except_encode_func(func_sym,
871 this->text_writer.label_offsets.data());
872}
873
874template <IRAdaptor Adaptor,
875 typename Derived,
876 template <typename, typename, typename> typename BaseTy,
877 typename Config>
878void CompilerX64<Adaptor, Derived, BaseTy, Config>::reset() noexcept {
879 func_ret_offs.clear();
880 sym_tls_get_addr = {};
881 Base::reset();
882}
883
884template <IRAdaptor Adaptor,
885 typename Derived,
886 template <typename, typename, typename> typename BaseTy,
887 typename Config>
888void CompilerX64<Adaptor, Derived, BaseTy, Config>::gen_func_epilog() noexcept {
889 // epilogue:
890 // if !func_has_dynamic_alloca:
891 // add rsp, #<frame_size>+<largest_call_frame_usage>
892 // else:
893 // lea rsp, [rbp - <size_of_reg_save_area>]
894 // for each saved reg:
895 // pop <reg>
896 // pop rbp
897 // ret
898 //
899 // however, since we will later patch this, we only
900 // reserve the space for now
901
902 func_ret_offs.push_back(this->text_writer.offset());
903
904 // add reg, imm32
905 // and
906 // lea rsp, [rbp - imm32]
907 // both take 7 bytes
908 u32 epilogue_size =
909 7 + 1 + 1 +
910 func_reg_restore_alloc; // add/lea + pop + ret + size of reg restore
911
912 this->text_writer.ensure_space(epilogue_size);
913 this->text_writer.cur_ptr() += epilogue_size;
914}
915
916template <IRAdaptor Adaptor,
917 typename Derived,
918 template <typename, typename, typename> typename BaseTy,
919 typename Config>
920void CompilerX64<Adaptor, Derived, BaseTy, Config>::spill_reg(
921 const AsmReg reg, const i32 frame_off, const u32 size) noexcept {
922 this->text_writer.ensure_space(16);
923 assert(frame_off < 0);
924 const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);
925 if (reg.id() <= AsmReg::R15) {
926 switch (size) {
927 case 1: ASMNC(MOV8mr, mem, reg); break;
928 case 2: ASMNC(MOV16mr, mem, reg); break;
929 case 4: ASMNC(MOV32mr, mem, reg); break;
930 case 8: ASMNC(MOV64mr, mem, reg); break;
931 default: TPDE_UNREACHABLE("invalid spill size");
932 }
933 return;
934 }
935
936 switch (size) {
937 case 4: ASMNC(SSE_MOVD_X2Gmr, mem, reg); break;
938 case 8: ASMNC(SSE_MOVQ_X2Gmr, mem, reg); break;
939 case 16: ASMNC(SSE_MOVAPDmr, mem, reg); break;
940 default: TPDE_UNREACHABLE("invalid spill size");
941 }
942}
943
944template <IRAdaptor Adaptor,
945 typename Derived,
946 template <typename, typename, typename> typename BaseTy,
947 typename Config>
948void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_from_stack(
949 const AsmReg dst,
950 const i32 frame_off,
951 const u32 size,
952 const bool sign_extend) noexcept {
953 this->text_writer.ensure_space(16);
954 const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);
955
956 if (dst.id() <= AsmReg::R15) {
957 if (!sign_extend) {
958 switch (size) {
959 case 1: ASMNC(MOVZXr32m8, dst, mem); break;
960 case 2: ASMNC(MOVZXr32m16, dst, mem); break;
961 case 4: ASMNC(MOV32rm, dst, mem); break;
962 case 8: ASMNC(MOV64rm, dst, mem); break;
963 default: TPDE_UNREACHABLE("invalid spill size");
964 }
965 } else {
966 switch (size) {
967 case 1: ASMNC(MOVSXr64m8, dst, mem); break;
968 case 2: ASMNC(MOVSXr64m16, dst, mem); break;
969 case 4: ASMNC(MOVSXr64m32, dst, mem); break;
970 case 8: ASMNC(MOV64rm, dst, mem); break;
971 default: TPDE_UNREACHABLE("invalid spill size");
972 }
973 }
974 return;
975 }
976
977 assert(!sign_extend);
978
979 switch (size) {
980 case 4: ASMNC(SSE_MOVD_G2Xrm, dst, mem); break;
981 case 8: ASMNC(SSE_MOVQ_G2Xrm, dst, mem); break;
982 case 16: ASMNC(SSE_MOVAPDrm, dst, mem); break;
983 default: TPDE_UNREACHABLE("invalid spill size");
984 }
985}
986
987template <IRAdaptor Adaptor,
988 typename Derived,
989 template <typename, typename, typename> typename BaseTy,
990 typename Config>
991void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_address_of_stack_var(
992 const AsmReg dst, const AssignmentPartRef ap) noexcept {
993 ASM(LEA64rm, dst, FE_MEM(FE_BP, 0, FE_NOREG, ap.variable_stack_off()));
994}
995
996template <IRAdaptor Adaptor,
997 typename Derived,
998 template <typename, typename, typename> typename BaseTy,
999 typename Config>
1000void CompilerX64<Adaptor, Derived, BaseTy, Config>::mov(
1001 const AsmReg dst, const AsmReg src, const u32 size) noexcept {
1002 this->text_writer.ensure_space(16);
1003 assert(dst.valid());
1004 assert(src.valid());
1005 if (dst.id() <= AsmReg::R15 && src.id() <= AsmReg::R15) {
1006 if (size > 4) {
1007 ASMNC(MOV64rr, dst, src);
1008 } else {
1009 ASMNC(MOV32rr, dst, src);
1010 }
1011 } else if (dst.id() >= AsmReg::XMM0 && src.id() >= AsmReg::XMM0) {
1012 if (size <= 16) {
1013 if (dst.id() > AsmReg::XMM15 || src.id() > AsmReg::XMM15) {
1014 assert(has_cpu_feats(CPU_AVX512F));
1015 ASMNC(VMOVAPD128rr, dst, src);
1016 } else {
1017 ASMNC(SSE_MOVAPDrr, dst, src);
1018 }
1019 } else if (size <= 32) {
1020 assert(has_cpu_feats(CPU_AVX));
1021 assert((dst.id() <= AsmReg::XMM15 && src.id() <= AsmReg::XMM15) ||
1022 has_cpu_feats(CPU_AVX512F));
1023 ASMNC(VMOVAPD256rr, dst, src);
1024 } else {
1025 assert(size <= 64);
1026 assert(has_cpu_feats(CPU_AVX512F));
1027 ASMNC(VMOVAPD512rr, dst, src);
1028 }
1029 } else if (dst.id() <= AsmReg::R15) {
1030 // gp<-xmm
1031 assert(src.id() >= AsmReg::XMM0);
1032 assert(size <= 8);
1033 if (src.id() > AsmReg::XMM15) {
1034 assert(has_cpu_feats(CPU_AVX512F));
1035 if (size <= 4) {
1036 ASMNC(VMOVD_X2Grr, dst, src);
1037 } else {
1038 ASMNC(VMOVQ_X2Grr, dst, src);
1039 }
1040 } else {
1041 if (size <= 4) {
1042 ASMNC(SSE_MOVD_X2Grr, dst, src);
1043 } else {
1044 ASMNC(SSE_MOVQ_X2Grr, dst, src);
1045 }
1046 }
1047 } else {
1048 // xmm<-gp
1049 assert(src.id() <= AsmReg::R15);
1050 assert(dst.id() >= AsmReg::XMM0);
1051 assert(size <= 8);
1052 if (dst.id() > AsmReg::XMM15) {
1053 assert(has_cpu_feats(CPU_AVX512F));
1054 if (size <= 4) {
1055 ASMNC(VMOVD_G2Xrr, dst, src);
1056 } else {
1057 ASMNC(VMOVQ_G2Xrr, dst, src);
1058 }
1059 } else {
1060 if (size <= 4) {
1061 ASMNC(SSE_MOVD_G2Xrr, dst, src);
1062 } else {
1063 ASMNC(SSE_MOVQ_G2Xrr, dst, src);
1064 }
1065 }
1066 }
1067}
1068
1069template <IRAdaptor Adaptor,
1070 typename Derived,
1071 template <typename, typename, typename> typename BaseTy,
1072 typename Config>
1073AsmReg CompilerX64<Adaptor, Derived, BaseTy, Config>::gval_expr_as_reg(
1074 GenericValuePart &gv) noexcept {
1075 auto &expr = std::get<typename GenericValuePart::Expr>(gv.state);
1076
1077 ScratchReg scratch{derived()};
1078 bool disp32 = i32(expr.disp) == expr.disp;
1079 AsmReg base = expr.has_base() ? expr.base_reg() : AsmReg::make_invalid();
1080 AsmReg idx = expr.has_index() ? expr.index_reg() : AsmReg::make_invalid();
1081 if (std::holds_alternative<ScratchReg>(expr.base)) {
1082 scratch = std::move(std::get<ScratchReg>(expr.base));
1083 } else if (std::holds_alternative<ScratchReg>(expr.index)) {
1084 scratch = std::move(std::get<ScratchReg>(expr.index));
1085 } else {
1086 (void)scratch.alloc_gp();
1087 }
1088 auto dst = scratch.cur_reg();
1089 if (idx.valid()) {
1090 if ((expr.scale & (expr.scale - 1)) == 0 && expr.scale < 16) {
1091 u8 sc = expr.scale;
1092 if (base.valid() && disp32) {
1093 ASM(LEA64rm, dst, FE_MEM(base, sc, idx, i32(expr.disp)));
1094 expr.disp = 0;
1095 } else if (base.valid()) {
1096 ASM(LEA64rm, dst, FE_MEM(base, sc, idx, 0));
1097 } else if (disp32) {
1098 ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, i32(expr.disp)));
1099 } else {
1100 ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, 0));
1101 }
1102 } else {
1103 assert(may_clobber_flags());
1104 u64 scale = expr.scale;
1105 if (base == idx) {
1106 base = AsmReg::make_invalid();
1107 scale += 1;
1108 }
1109
1110 ScratchReg idx_scratch{derived()};
1111 // We need a register to compute the scaled index.
1112 AsmReg idx_tmp = dst;
1113 if (dst == base && std::holds_alternative<ScratchReg>(expr.index)) {
1114 // We can't use dst, it'd clobber base, so use the other
1115 // register we currently own.
1116 idx_tmp = std::get<ScratchReg>(expr.index).cur_reg();
1117 } else if (dst == base) {
1118 idx_tmp = idx_scratch.alloc_gp();
1119 }
1120
1121 if ((scale & (scale - 1)) == 0) {
1122 if (idx_tmp != idx) {
1123 ASM(MOV64rr, idx_tmp, idx);
1124 }
1125 ASM(SHL64ri, idx_tmp, util::cnt_tz(scale));
1126 } else {
1127 if (i32(scale) == i64(scale)) {
1128 ASM(IMUL64rri, idx_tmp, idx, scale);
1129 } else {
1130 ScratchReg scratch2{derived()};
1131 auto tmp2 = scratch2.alloc_gp();
1132 ASM(MOV64ri, tmp2, scale);
1133 if (idx_tmp != idx) {
1134 ASM(MOV64rr, idx_tmp, idx);
1135 }
1136 ASM(IMUL64rr, idx_tmp, tmp2);
1137 }
1138 }
1139 if (base.valid()) {
1140 if (disp32 || (idx_tmp != dst && base != dst)) {
1141 ASM(LEA64rm, dst, FE_MEM(base, 1, idx_tmp, i32(expr.disp)));
1142 expr.disp = 0;
1143 } else if (dst == base) {
1144 ASM(ADD64rr, dst, idx_tmp);
1145 } else {
1146 ASM(ADD64rr, dst, base);
1147 }
1148 }
1149 }
1150 } else if (base.valid()) {
1151 if (expr.disp && disp32) {
1152 ASM(LEA64rm, dst, FE_MEM(base, 0, FE_NOREG, i32(expr.disp)));
1153 expr.disp = 0;
1154 } else if (dst != base) {
1155 ASM(MOV64rr, dst, base);
1156 }
1157 }
1158 if (expr.disp) {
1159 ScratchReg scratch2{derived()};
1160 auto tmp2 = scratch2.alloc_gp();
1161 ASM(MOV64ri, tmp2, expr.disp);
1162 if (may_clobber_flags()) {
1163 ASM(ADD64rr, dst, tmp2);
1164 } else {
1165 ASM(LEA64rm, dst, FE_MEM(dst, 1, tmp2, 0));
1166 }
1167 }
1168 gv.state = std::move(scratch);
1169 return dst;
1170}
1171
1172template <IRAdaptor Adaptor,
1173 typename Derived,
1174 template <typename, typename, typename> typename BaseTy,
1175 typename Config>
1176void CompilerX64<Adaptor, Derived, BaseTy, Config>::alloca_fixed(
1177 u64 size, u32 align, ValuePart &res) noexcept {
1178 assert(this->stack.has_dynamic_alloca &&
1179 "function marked as not having dynamic allocas can't have alloca");
1180 assert(align != 0 && (align & (align - 1)) == 0 && "invalid alignment");
1181 assert(may_clobber_flags());
1182 size = tpde::util::align_up(size, 16);
1183 if (size > 0) {
1184 assert(size < 0x8000'0000);
1185 ASM(SUB64ri, FE_SP, size);
1186 }
1187 if (align > 16) {
1188 assert(align < u32{1} << 31 && "alignment >= 2**31 not implemented");
1189 ASM(AND64ri, FE_SP, ~(align - 1));
1190 }
1191 ASM(MOV64rr, res.alloc_reg(this), FE_SP);
1192}
1193
1194template <IRAdaptor Adaptor,
1195 typename Derived,
1196 template <typename, typename, typename> typename BaseTy,
1197 typename Config>
1198void CompilerX64<Adaptor, Derived, BaseTy, Config>::alloca_dynamic(
1199 u64 elem_size, ValuePart &&count, u32 align, ValuePart &res) noexcept {
1200 assert(this->stack.has_dynamic_alloca &&
1201 "function marked as not having dynamic allocas can't have alloca");
1202 assert(align != 0 && (align & (align - 1)) == 0 && "invalid alignment");
1203 assert(may_clobber_flags());
1204 AsmReg size_reg = count.has_reg() ? count.cur_reg() : count.load_to_reg(this);
1205 AsmReg res_reg = res.alloc_try_reuse(this, count);
1206
1207 if (elem_size == 0) {
1208 ASM(XOR32rr, res_reg, res_reg);
1209 } else if ((elem_size & (elem_size - 1)) == 0) {
1210 // elem_size is power of two
1211 const auto shift = util::cnt_tz(elem_size);
1212 if (shift > 0 && shift < 4) {
1213 ASM(LEA64rm, res_reg, FE_MEM(FE_NOREG, u8(1 << shift), size_reg, 0));
1214 } else {
1215 if (size_reg != res_reg) {
1216 ASM(MOV64rr, res_reg, size_reg);
1217 }
1218 if (elem_size != 1) {
1219 ASM(SHL64ri, res_reg, shift);
1220 }
1221 }
1222 } else {
1223 if (elem_size <= 0x7FFF'FFFF) [[likely]] {
1224 ASM(IMUL64rri, res_reg, size_reg, elem_size);
1225 } else {
1226 ScratchReg scratch{this};
1227 auto tmp = scratch.alloc_gp();
1228 ASM(MOV64ri, tmp, elem_size);
1229 if (size_reg != res_reg) {
1230 ASM(MOV64rr, res_reg, size_reg);
1231 }
1232 ASM(IMUL64rr, res_reg, tmp);
1233 }
1234 }
1235
1236 ASM(SUB64rr, FE_SP, res_reg);
1237
1238 align = align > 16 ? align : 16;
1239 if (elem_size & (align - 1)) {
1240 assert(align < u32{1} << 31 && "alignment >= 2**31 not implemented");
1241 ASM(AND64ri, FE_SP, ~(align - 1));
1242 }
1243
1244 ASM(MOV64rr, res_reg, FE_SP);
1245}
1246
1247template <IRAdaptor Adaptor,
1248 typename Derived,
1249 template <typename, typename, typename> typename BaseTy,
1250 typename Config>
1251void CompilerX64<Adaptor, Derived, BaseTy, Config>::materialize_constant(
1252 const u64 *data, const RegBank bank, const u32 size, AsmReg dst) noexcept {
1253 const auto const_u64 = data[0];
1254 if (bank == Config::GP_BANK) {
1255 assert(size <= 8);
1256 if (const_u64 == 0) {
1257 if (may_clobber_flags()) {
1258 ASM(XOR32rr, dst, dst);
1259 } else {
1260 ASM(MOV32ri, dst, 0);
1261 }
1262 return;
1263 }
1264
1265 if (size <= 4 || u32(const_u64) == const_u64) {
1266 ASM(MOV32ri, dst, const_u64);
1267 } else {
1268 ASM(MOV64ri, dst, const_u64);
1269 }
1270 return;
1271 }
1272
1273 assert(bank == Config::FP_BANK);
1274 const auto high_u64 = size <= 8 ? 0 : data[1];
1275 if (const_u64 == 0 && (size <= 8 || (high_u64 == 0 && size <= 16))) {
1276 if (has_cpu_feats(CPU_AVX)) {
1277 ASM(VPXOR128rrr, dst, dst, dst);
1278 } else {
1279 ASM(SSE_PXORrr, dst, dst);
1280 }
1281 return;
1282 }
1283 const u64 ones = -u64{1};
1284 if (const_u64 == ones && (size <= 8 || (high_u64 == ones && size <= 16))) {
1285 if (has_cpu_feats(CPU_AVX)) {
1286 ASM(VPCMPEQB128rrr, dst, dst, dst);
1287 } else {
1288 ASM(SSE_PCMPEQBrr, dst, dst);
1289 }
1290 return;
1291 }
1292
1293 if (size <= 8) {
1294 // We must not evict registers here (might be used within branching code),
1295 // so only use free registers and load from memory otherwise.
1296 AsmReg tmp =
1297 this->register_file.find_first_free_excluding(Config::GP_BANK, 0);
1298 if (tmp.valid()) {
1299 this->register_file.mark_clobbered(tmp);
1300 materialize_constant(data, Config::GP_BANK, size, tmp);
1301 if (size <= 4) {
1302 if (has_cpu_feats(CPU_AVX)) {
1303 ASM(VMOVD_G2Xrr, dst, tmp);
1304 } else {
1305 ASM(SSE_MOVD_G2Xrr, dst, tmp);
1306 }
1307 } else {
1308 if (has_cpu_feats(CPU_AVX)) {
1309 ASM(VMOVQ_G2Xrr, dst, tmp);
1310 } else {
1311 ASM(SSE_MOVQ_G2Xrr, dst, tmp);
1312 }
1313 }
1314 return;
1315 }
1316 }
1317
1318 // TODO: round to next power of two but at least 4 bytes
1319 // We store constants in 8-byte units.
1320 auto alloc_size = util::align_up(size, 8);
1321 std::span<const u8> raw_data{reinterpret_cast<const u8 *>(data), alloc_size};
1322 // TODO: deduplicate/pool constants?
1323 auto rodata = this->assembler.get_data_section(true, false);
1324 auto sym = this->assembler.sym_def_data(
1325 rodata, "", raw_data, alloc_size, Assembler::SymBinding::LOCAL);
1326 if (size <= 4) {
1327 if (has_cpu_feats(CPU_AVX)) {
1328 ASM(VMOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1329 } else {
1330 ASM(SSE_MOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1331 }
1332 } else if (size <= 8) {
1333 if (has_cpu_feats(CPU_AVX)) {
1334 ASM(VMOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1335 } else {
1336 ASM(SSE_MOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1337 }
1338 } else if (size <= 16) {
1339 if (has_cpu_feats(CPU_AVX)) {
1340 ASM(VMOVAPS128rm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1341 } else {
1342 ASM(SSE_MOVAPSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1343 }
1344 } else {
1345 // TODO: implement for AVX/AVX-512.
1346 TPDE_FATAL("unable to materialize constant");
1347 }
1348
1349 this->reloc_text(sym, R_X86_64_PC32, this->text_writer.offset() - 4, -4);
1350}
1351
1352template <IRAdaptor Adaptor,
1353 typename Derived,
1354 template <typename, typename, typename> typename BaseTy,
1355 typename Config>
1356AsmReg
1357 CompilerX64<Adaptor, Derived, BaseTy, Config>::select_fixed_assignment_reg(
1358 AssignmentPartRef ap, IRValueRef) noexcept {
1359 RegBank bank = ap.bank();
1360 assert(bank.id() <= Config::NUM_BANKS);
1361 auto reg_mask = this->register_file.bank_regs(bank);
1362 reg_mask &= ~fixed_assignment_nonallocatable_mask;
1363
1364 const auto find_possible_regs = [this,
1365 reg_mask](const u64 preferred_regs) -> u64 {
1366 // try to first get an unused reg, otherwise an unfixed reg
1367 u64 free_regs = this->register_file.allocatable & ~this->register_file.used;
1368 return free_regs & preferred_regs & reg_mask;
1369 };
1370
1371 u64 possible_regs;
1372 auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
1373 if (!this->stack.is_leaf_function) {
1374 // we can only allocate fixed assignments from the callee-saved regs
1375 possible_regs = find_possible_regs(csr);
1376 } else {
1377 // try allocating any non-callee saved register first, except the result
1378 // registers
1379 possible_regs = find_possible_regs(~csr);
1380 if (possible_regs == 0) {
1381 // otherwise fallback to callee-saved regs
1382 possible_regs = find_possible_regs(csr);
1383 }
1384 }
1385
1386 if (possible_regs == 0) {
1387 return AsmReg::make_invalid();
1388 }
1389
1390 // try to first get an unused reg, otherwise an unfixed reg
1391 if ((possible_regs & ~this->register_file.used) != 0) {
1392 return AsmReg{util::cnt_tz(possible_regs & ~this->register_file.used)};
1393 }
1394
1395 for (const auto reg_id : util::BitSetIterator<>{possible_regs}) {
1396 const auto reg = AsmReg{reg_id};
1397
1398 if (this->register_file.is_fixed(reg)) {
1399 continue;
1400 }
1401
1402 const auto local_idx = this->register_file.reg_local_idx(reg);
1403 const auto part = this->register_file.reg_part(reg);
1404
1405 if (local_idx == Base::INVALID_VAL_LOCAL_IDX) {
1406 continue;
1407 }
1408 auto *assignment = this->val_assignment(local_idx);
1409 auto ap = AssignmentPartRef{assignment, part};
1410 if (ap.modified()) {
1411 continue;
1412 }
1413
1414 return reg;
1415 }
1416
1417 return AsmReg::make_invalid();
1418}
1419
1420template <IRAdaptor Adaptor,
1421 typename Derived,
1422 template <typename, typename, typename> typename BaseTy,
1423 typename Config>
1424typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
1425 CompilerX64<Adaptor, Derived, BaseTy, Config>::invert_jump(
1426 Jump jmp) noexcept {
1427 switch (jmp) {
1428 case Jump::ja: return Jump::jbe;
1429 case Jump::jae: return Jump::jb;
1430 case Jump::jb: return Jump::jae;
1431 case Jump::jbe: return Jump::ja;
1432 case Jump::je: return Jump::jne;
1433 case Jump::jg: return Jump::jle;
1434 case Jump::jge: return Jump::jl;
1435 case Jump::jl: return Jump::jge;
1436 case Jump::jle: return Jump::jg;
1437 case Jump::jne: return Jump::je;
1438 case Jump::jno: return Jump::jo;
1439 case Jump::jo: return Jump::jno;
1440 case Jump::js: return Jump::jns;
1441 case Jump::jns: return Jump::js;
1442 case Jump::jp: return Jump::jnp;
1443 case Jump::jnp: return Jump::jp;
1444 default: TPDE_UNREACHABLE("invalid jump kind for invert_jump");
1445 }
1446}
1447
1448template <IRAdaptor Adaptor,
1449 typename Derived,
1450 template <typename, typename, typename> class BaseTy,
1451 typename Config>
1452typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
1453 CompilerX64<Adaptor, Derived, BaseTy, Config>::swap_jump(
1454 Jump jmp) noexcept {
1455 switch (jmp) {
1456 case Jump::ja: return Jump::jb;
1457 case Jump::jae: return Jump::jbe;
1458 case Jump::jb: return Jump::ja;
1459 case Jump::jbe: return Jump::jae;
1460 case Jump::je: return Jump::je;
1461 case Jump::jne: return Jump::jne;
1462 case Jump::jg: return Jump::jl;
1463 case Jump::jge: return Jump::jle;
1464 case Jump::jl: return Jump::jg;
1465 case Jump::jle: return Jump::jge;
1466 default: TPDE_UNREACHABLE("invalid jump kind for swap_jump");
1467 }
1468}
1469
1470template <IRAdaptor Adaptor,
1471 typename Derived,
1472 template <typename, typename, typename> class BaseTy,
1473 typename Config>
1474FeCond CompilerX64<Adaptor, Derived, BaseTy, Config>::jump_to_cond(
1475 Jump jmp) noexcept {
1476 // LLVM won't transform the switch into a shift.
1477 FeCond res = FeCond(u32(jmp) << 16);
1478 switch (jmp) {
1479 case Jump::ja: assert(res == FE_CC_A && "FeCond value mismatch?"); break;
1480 case Jump::jae: assert(res == FE_CC_AE && "FeCond value mismatch?"); break;
1481 case Jump::jb: assert(res == FE_CC_B && "FeCond value mismatch?"); break;
1482 case Jump::jbe: assert(res == FE_CC_BE && "FeCond value mismatch?"); break;
1483 case Jump::je: assert(res == FE_CC_E && "FeCond value mismatch?"); break;
1484 case Jump::jg: assert(res == FE_CC_G && "FeCond value mismatch?"); break;
1485 case Jump::jge: assert(res == FE_CC_GE && "FeCond value mismatch?"); break;
1486 case Jump::jl: assert(res == FE_CC_L && "FeCond value mismatch?"); break;
1487 case Jump::jle: assert(res == FE_CC_LE && "FeCond value mismatch?"); break;
1488 case Jump::jne: assert(res == FE_CC_NE && "FeCond value mismatch?"); break;
1489 case Jump::jno: assert(res == FE_CC_NO && "FeCond value mismatch?"); break;
1490 case Jump::jo: assert(res == FE_CC_O && "FeCond value mismatch?"); break;
1491 case Jump::js: assert(res == FE_CC_S && "FeCond value mismatch?"); break;
1492 case Jump::jns: assert(res == FE_CC_NS && "FeCond value mismatch?"); break;
1493 case Jump::jp: assert(res == FE_CC_P && "FeCond value mismatch?"); break;
1494 case Jump::jnp: assert(res == FE_CC_NP && "FeCond value mismatch?"); break;
1495 default: TPDE_UNREACHABLE("invalid conditional jump");
1496 }
1497 return res;
1498}
1499
1500template <IRAdaptor Adaptor,
1501 typename Derived,
1502 template <typename, typename, typename> typename BaseTy,
1503 typename Config>
1504void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_branch_to_block(
1505 const Jump jmp,
1506 IRBlockRef target,
1507 const bool needs_split,
1508 const bool last_inst) noexcept {
1509 const auto target_idx = this->analyzer.block_idx(target);
1510 if (!needs_split || jmp == Jump::jmp) {
1511 this->derived()->move_to_phi_nodes(target_idx);
1512
1513 if (!last_inst || this->analyzer.block_idx(target) != this->next_block()) {
1514 generate_raw_jump(jmp, this->block_labels[(u32)target_idx]);
1515 }
1516 } else {
1517 auto tmp_label = this->text_writer.label_create();
1518 generate_raw_jump(invert_jump(jmp), tmp_label);
1519
1520 this->derived()->move_to_phi_nodes(target_idx);
1521
1522 generate_raw_jump(Jump::jmp, this->block_labels[(u32)target_idx]);
1523
1524 this->label_place(tmp_label);
1525 }
1526}
1527
1528template <IRAdaptor Adaptor,
1529 typename Derived,
1530 template <typename, typename, typename> typename BaseTy,
1531 typename Config>
1532void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_jump(
1533 Jump jmp, Label target_label) noexcept {
1534 this->text_writer.ensure_space(6); // For safe ptr arithmetic on code buffer.
1535 bool pending = this->text_writer.label_is_pending(target_label);
1536 void *target = this->text_writer.cur_ptr();
1537 if (!pending) {
1538 target = this->text_writer.begin_ptr() +
1539 this->text_writer.label_offset(target_label);
1540 }
1541
1542 if (jmp == Jump::jmp) {
1543 ASMNCF(JMP, pending ? FE_JMPL : 0, target);
1544 } else {
1545 ASMNCF(Jcc, (pending ? FE_JMPL : 0) | jump_to_cond(jmp), target);
1546 }
1547
1548 if (pending) {
1549 this->text_writer.label_ref(target_label,
1550 this->text_writer.offset() - 4,
1551 LabelFixupKind::X64_JMP_OR_MEM_DISP);
1552 }
1553}
1554
1555template <IRAdaptor Adaptor,
1556 typename Derived,
1557 template <typename, typename, typename> class BaseTy,
1558 typename Config>
1559void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_set(
1560 Jump cc, AsmReg dst, bool zext) noexcept {
1561 if (zext) {
1562 ASM(MOV32ri, dst, 0);
1563 }
1564 ASMF(SETcc8r, jump_to_cond(cc), dst);
1565}
1566
1567template <IRAdaptor Adaptor,
1568 typename Derived,
1569 template <typename, typename, typename> class BaseTy,
1570 typename Config>
1571void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_mask(
1572 Jump cc, AsmReg dst) noexcept {
1573 // TODO: use sbb dst,dst / adc dst,-1 for the carry flag
1574 generate_raw_set(cc, dst);
1575 ASM(NEG64r, dst);
1576}
1577template <IRAdaptor Adaptor,
1578 typename Derived,
1579 template <typename, typename, typename> class BaseTy,
1580 typename Config>
1581void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_cmov(
1582 Jump cc, AsmReg dst, AsmReg src, bool is_64) noexcept {
1583 if (is_64) {
1584 ASMF(CMOVcc64rr, jump_to_cond(cc), dst, src);
1585 } else {
1586 ASMF(CMOVcc32rr, jump_to_cond(cc), dst, src);
1587 }
1588}
1589
1590template <IRAdaptor Adaptor,
1591 typename Derived,
1592 template <typename, typename, typename> class BaseTy,
1593 typename Config>
1594void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_intext(
1595 AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept {
1596 assert(from < to && to <= 64);
1597 assert(may_clobber_flags());
1598 if (!sign) {
1599 switch (from) {
1600 case 8: ASM(MOVZXr32r8, dst, src); break;
1601 case 16: ASM(MOVZXr32r16, dst, src); break;
1602 case 32: ASM(MOV32rr, dst, src); break;
1603 default:
1604 if (from < 32) {
1605 if (dst != src) {
1606 ASM(MOV32rr, dst, src);
1607 }
1608 ASM(AND32ri, dst, (uint32_t{1} << from) - 1);
1609 } else if (dst != src) {
1610 ASM(MOV64ri, dst, (uint64_t{1} << from) - 1);
1611 ASM(AND64rr, dst, src);
1612 } else {
1613 ScratchReg tmp{this};
1614 AsmReg tmp_reg = tmp.alloc_gp();
1615 ASM(MOV64ri, tmp_reg, (uint64_t{1} << from) - 1);
1616 ASM(AND64rr, dst, tmp_reg);
1617 }
1618 }
1619 } else if (to <= 32) {
1620 switch (from) {
1621 case 8: ASM(MOVSXr32r8, dst, src); break;
1622 case 16: ASM(MOVSXr32r16, dst, src); break;
1623 default:
1624 if (dst != src) {
1625 ASM(MOV32rr, dst, src);
1626 }
1627 ASM(SHL32ri, dst, 32 - from);
1628 ASM(SAR32ri, dst, 32 - from);
1629 }
1630 } else {
1631 switch (from) {
1632 case 8: ASM(MOVSXr64r8, dst, src); break;
1633 case 16: ASM(MOVSXr64r16, dst, src); break;
1634 case 32: ASM(MOVSXr64r32, dst, src); break;
1635 default:
1636 if (dst != src) {
1637 ASM(MOV64rr, dst, src);
1638 }
1639 ASM(SHL64ri, dst, 64 - from);
1640 ASM(SAR64ri, dst, 64 - from);
1641 }
1642 }
1643}
1644
1645template <IRAdaptor Adaptor,
1646 typename Derived,
1647 template <typename, typename, typename> class BaseTy,
1648 typename Config>
1649void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_bfi(
1650 AsmReg dst, AsmReg src, u32 lsb, u32 width) noexcept {
1651 assert(lsb < 63 && width < 64 && lsb + width <= 64 && width != 0);
1652 assert(may_clobber_flags());
1653 ScratchReg tmp1{this};
1654 AsmReg tmp1_reg = tmp1.alloc_gp();
1655 // First, clear relevant bits in dst.
1656 if (width == 1) {
1657 ASM(BTR64ri, dst, lsb);
1658 } else if (lsb + width <= 31) {
1659 ASM(AND64ri, dst, ~(((u64{1} << width) - 1) << lsb));
1660 } else {
1661 ASM(MOV64ri, tmp1_reg, ~(((u64{1} << width) - 1) << lsb));
1662 ASM(AND64rr, dst, tmp1_reg);
1663 }
1664 // Second, clear irrelevant bits in src; result is in tmp1_reg.
1665 if (width == 8) {
1666 ASM(MOVZXr32r8, tmp1_reg, src);
1667 } else if (width == 16) {
1668 ASM(MOVZXr32r16, tmp1_reg, src);
1669 } else if (width <= 32) {
1670 ASM(MOV32rr, tmp1_reg, src);
1671 if (width < 32) {
1672 ASM(AND32ri, tmp1_reg, (u32{1} << width) - 1);
1673 }
1674 } else {
1675 ASM(MOV64ri, tmp1_reg, (u64{1} << width) - 1);
1676 ASM(AND64rr, tmp1_reg, src);
1677 }
1678 // Third, merge. Bits are disjoint, so addition is possible.
1679 if (lsb >= 1 && lsb <= 3) {
1680 ASM(LEA64rm, dst, FE_MEM(dst, u8(1 << lsb), tmp1_reg, 0));
1681 } else {
1682 if (lsb > 0 && lsb + width <= 32) {
1683 ASM(SHL32ri, tmp1_reg, lsb);
1684 } else if (lsb > 0) {
1685 ASM(SHL64ri, tmp1_reg, lsb);
1686 }
1687 ASM(OR64rr, dst, tmp1_reg);
1688 }
1689}
1690
1691template <IRAdaptor Adaptor,
1692 typename Derived,
1693 template <typename, typename, typename> class BaseTy,
1694 typename Config>
1695void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_bfiz(
1696 AsmReg dst, AsmReg src, u32 lsb, u32 width) noexcept {
1697 assert(lsb < 63 && width < 64 && lsb + width <= 64 && width != 0);
1698 assert(dst != src);
1699 assert(may_clobber_flags());
1700 // Clear irrelevant bits in src and move to dst.
1701 if (width == 8) {
1702 ASM(MOVZXr32r8, dst, src);
1703 } else if (width == 16) {
1704 ASM(MOVZXr32r16, dst, src);
1705 } else if (width <= 32) {
1706 ASM(MOV32rr, dst, src);
1707 if (width < 32) {
1708 ASM(AND32ri, dst, (u32{1} << width) - 1);
1709 }
1710 } else {
1711 ASM(MOV64ri, dst, (u64{1} << width) - 1);
1712 ASM(AND64rr, dst, src);
1713 }
1714 // Shift into place.
1715 if (lsb > 0 && lsb + width <= 32) {
1716 ASM(SHL32ri, dst, lsb);
1717 } else if (lsb > 0) {
1718 ASM(SHL64ri, dst, lsb);
1719 }
1720}
1721
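// Mark that the call passes data on the stack. In functions with dynamic
// allocas this emits a patchable SUB64ri on first use; call_impl later fills
// in the immediate with the final argument-area size.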
1722template <IRAdaptor Adaptor,
1723 typename Derived,
1724 template <typename, typename, typename> class BaseTy,
1725 typename Config>
1726void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::
1727 set_stack_used() noexcept {
1728 if (this->compiler.stack.has_dynamic_alloca && stack_adjust_off == 0) {
1729 stack_adjust_off = this->compiler.text_writer.offset();
1730    // Always use a 32-bit immediate (7-byte encoding) so call_impl can patch the final size in place.
1731 ASMC(&this->compiler, SUB64ri, FE_SP, 0x100);
1732 assert(this->compiler.text_writer.offset() == stack_adjust_off + 7);
1733 }
1734}
1735
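// Copy a byval argument from the memory the value points to into its assigned
// slot in the outgoing argument area, in 8/4/2/1-byte chunks via a scratch
// register.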
1736template <IRAdaptor Adaptor,
1737 typename Derived,
1738 template <typename, typename, typename> class BaseTy,
1739 typename Config>
1740void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_byval(
1741 ValuePart &vp, CCAssignment &cca) noexcept {
1742 AsmReg ptr = vp.load_to_reg(&this->compiler);
1743 ScratchReg scratch{&this->compiler};
1744 AsmReg tmp = scratch.alloc_gp();
1745
1746 auto size = cca.size;
1747 set_stack_used();
1748 i32 off = 0;
1749 while (size >= 8) {
1750 ASMC(&this->compiler, MOV64rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1751 ASMC(&this->compiler,
1752 MOV64mr,
1753 FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
1754 tmp);
1755 off += 8;
1756 size -= 8;
1757 }
1758 if (size >= 4) {
1759 ASMC(&this->compiler, MOV32rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1760 ASMC(&this->compiler,
1761 MOV32mr,
1762 FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
1763 tmp);
1764 off += 4;
1765 size -= 4;
1766 }
1767 if (size >= 2) {
1768 ASMC(&this->compiler, MOVZXr32m16, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1769 ASMC(&this->compiler,
1770 MOV16mr,
1771 FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
1772 tmp);
1773 off += 2;
1774 size -= 2;
1775 }
1776 if (size >= 1) {
1777 ASMC(&this->compiler, MOVZXr32m8, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1778 ASMC(&this->compiler,
1779 MOV8mr,
1780 FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
1781 tmp);
1782 }
1783}
1784
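// Store a stack-assigned argument into the outgoing argument area, selecting
// the store instruction from the argument size and register bank.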
1785template <IRAdaptor Adaptor,
1786 typename Derived,
1787 template <typename, typename, typename> class BaseTy,
1788 typename Config>
1789void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_stack(
1790 ValuePart &vp, CCAssignment &cca) noexcept {
1791 set_stack_used();
1792
1793 auto reg = vp.has_reg() ? vp.cur_reg() : vp.load_to_reg(&this->compiler);
1794 if (this->compiler.register_file.reg_bank(reg) == Config::GP_BANK) {
1795 switch (cca.size) {
1796 case 1:
1797 ASMC(&this->compiler,
1798 MOV8mr,
1799 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1800 reg);
1801 break;
1802 case 2:
1803 ASMC(&this->compiler,
1804 MOV16mr,
1805 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1806 reg);
1807 break;
1808 case 4:
1809 ASMC(&this->compiler,
1810 MOV32mr,
1811 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1812 reg);
1813 break;
1814 case 8:
1815 ASMC(&this->compiler,
1816 MOV64mr,
1817 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1818 reg);
1819 break;
1820 default: TPDE_UNREACHABLE("invalid GP reg size");
1821 }
1822 } else {
1823 assert(this->compiler.register_file.reg_bank(reg) == Config::FP_BANK);
1824 switch (cca.size) {
1825 case 4:
1826 ASMC(&this->compiler,
1827 SSE_MOVSSmr,
1828 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1829 reg);
1830 break;
1831 case 8:
1832 ASMC(&this->compiler,
1833 SSE_MOVSDmr,
1834 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1835 reg);
1836 break;
1837 case 16:
1838 ASMC(&this->compiler,
1839 SSE_MOVDQAmr,
1840 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1841 reg);
1842 break;
1843    default: TPDE_UNREACHABLE("invalid FP reg size");
1844 }
1845 }
1846}
1847
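// Emit the call itself. For vararg calls, AL is set to an upper bound on the
// number of XMM registers used for arguments, as required by the SysV ABI.
// A previously emitted patchable stack adjustment gets its final size filled
// in and is undone after the call; otherwise only the maximum callee argument
// stack size is recorded. The target is either a symbol (called via a PLT32
// relocation) or a value called indirectly through a register or memory.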
1848template <IRAdaptor Adaptor,
1849 typename Derived,
1850 template <typename, typename, typename> class BaseTy,
1851 typename Config>
1852void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::call_impl(
1853 std::variant<SymRef, ValuePart> &&target) noexcept {
1854 if (this->assigner.is_vararg()) {
1855 if (this->compiler.register_file.is_used(Reg{AsmReg::AX})) {
1856 this->compiler.evict_reg(Reg{AsmReg::AX});
1857 }
1858 Reg next_xmm = this->compiler.register_file.find_first_free_excluding(
1859 Config::FP_BANK, 0);
1860 unsigned xmm_cnt = 8;
1861 if (next_xmm.valid() && next_xmm.id() - AsmReg::XMM0 < 8) {
1862 xmm_cnt = next_xmm.id() - AsmReg::XMM0;
1863 }
1864 if (xmm_cnt != 0) {
1865 ASMC(&this->compiler, MOV32ri, FE_AX, xmm_cnt);
1866 } else {
1867 ASMC(&this->compiler, XOR32rr, FE_AX, FE_AX);
1868 }
1869 }
1870
1871 u32 sub = 0;
1872 if (stack_adjust_off != 0) {
1873 auto *inst_ptr = this->compiler.text_writer.begin_ptr() + stack_adjust_off;
1874 sub = util::align_up(this->assigner.get_stack_size(), 0x10);
1875 memcpy(inst_ptr + 3, &sub, sizeof(u32));
1876 } else {
1877 auto &max_stack_size = this->compiler.max_callee_stack_arg_size;
1878 max_stack_size = std::max(max_stack_size, this->assigner.get_stack_size());
1879 }
1880
1881 if (auto *sym = std::get_if<SymRef>(&target)) {
1882 this->compiler.text_writer.ensure_space(16);
1883 ASMC(&this->compiler, CALL, this->compiler.text_writer.cur_ptr());
1884 this->compiler.reloc_text(
1885 *sym, R_X86_64_PLT32, this->compiler.text_writer.offset() - 4, -4);
1886 } else {
1887 ValuePart &tvp = std::get<ValuePart>(target);
1888 if (tvp.has_assignment() && !tvp.assignment().register_valid()) {
1889 assert(tvp.assignment().stack_valid());
1890 auto off = tvp.assignment().frame_off();
1891 ASMC(&this->compiler, CALLm, FE_MEM(FE_BP, 0, FE_NOREG, off));
1892 } else if (tvp.can_salvage()) {
1893 ASMC(&this->compiler, CALLr, tvp.salvage(&this->compiler));
1894 } else {
1895 assert(!this->compiler.register_file.is_used(Reg{AsmReg::R10}));
1896 AsmReg reg = tvp.reload_into_specific_fixed(&this->compiler, AsmReg::R10);
1897 ASMC(&this->compiler, CALLr, reg);
1898 }
1899 tvp.reset(&this->compiler);
1900 }
1901
1902 if (stack_adjust_off != 0) {
1903 ASMC(&this->compiler, ADD64ri, FE_SP, sub);
1904 }
1905}
1906
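// Generic call lowering: assign arguments with the SysV calling convention,
// emit the call, and fetch the result (if any).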
1907template <IRAdaptor Adaptor,
1908 typename Derived,
1909 template <typename, typename, typename> typename BaseTy,
1910 typename Config>
1911void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_call(
1912 std::variant<SymRef, ValuePart> &&target,
1913 std::span<CallArg> arguments,
1914 typename Base::ValueRef *result,
1915 const bool variable_args) {
1916 CCAssignerSysV assigner{variable_args};
1917 CallBuilder cb{*derived(), assigner};
1918 for (auto &arg : arguments) {
1919 cb.add_arg(std::move(arg));
1920 }
1921 cb.call(std::move(target));
1922 if (result) {
1923 cb.add_ret(*result);
1924 }
1925}
1926
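// Compare the switch operand against a case value, materializing the constant
// in tmp_reg when it does not fit a sign-extended 32-bit immediate.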
1927template <IRAdaptor Adaptor,
1928 typename Derived,
1929 template <typename, typename, typename> typename BaseTy,
1930 typename Config>
1931void CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_emit_cmp(
1932 const AsmReg cmp_reg,
1933 const AsmReg tmp_reg,
1934 const u64 case_value,
1935 const bool width_is_32) noexcept {
1936 if (width_is_32) {
1937 ASM(CMP32ri, cmp_reg, case_value);
1938 } else {
1939 if ((i64)((i32)case_value) == (i64)case_value) {
1940 ASM(CMP64ri, cmp_reg, case_value);
1941 } else {
1942 this->materialize_constant(&case_value, Config::GP_BANK, 8, tmp_reg);
1943 ASM(CMP64rr, cmp_reg, tmp_reg);
1944 }
1945 }
1946}
1947
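// Compare against a case value and branch to the case label on equality.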
1948template <IRAdaptor Adaptor,
1949 typename Derived,
1950 template <typename, typename, typename> typename BaseTy,
1951 typename Config>
1952void CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_emit_cmpeq(
1953 const Label case_label,
1954 const AsmReg cmp_reg,
1955 const AsmReg tmp_reg,
1956 const u64 case_value,
1957 const bool width_is_32) noexcept {
1958 switch_emit_cmp(cmp_reg, tmp_reg, case_value, width_is_32);
1959 generate_raw_jump(Jump::je, case_label);
1960}
1961
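// Emit a jump table for a dense switch. The emitted sequence is roughly:
//   cmp  cmp_reg, low_bound  ; jb default    (if low_bound != 0)
//   cmp  cmp_reg, high_bound ; ja default
//   sub  cmp_reg, low_bound                  (if low_bound != 0)
//   lea  tmp_reg, [rip + jump_table]
//   movsxd cmp_reg, dword [tmp_reg + 4*cmp_reg]
//   add  tmp_reg, cmp_reg
//   jmp  tmp_reg
// followed by a table of 32-bit offsets relative to the start of the table.
// For 32-bit switch values, cmp_reg is zero-extended before indexing.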
1962template <IRAdaptor Adaptor,
1963 typename Derived,
1964 template <typename, typename, typename> typename BaseTy,
1965 typename Config>
1966bool CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_emit_jump_table(
1967 Label default_label,
1968 std::span<const Label> labels,
1969 AsmReg cmp_reg,
1970 AsmReg tmp_reg,
1971 u64 low_bound,
1972 u64 high_bound,
1973 bool width_is_32) noexcept {
1974 // NB: we must not evict any registers here.
1975 if (low_bound != 0) {
1976 switch_emit_cmp(cmp_reg, tmp_reg, low_bound, width_is_32);
1977 generate_raw_jump(Jump::jb, default_label);
1978 }
1979 switch_emit_cmp(cmp_reg, tmp_reg, high_bound, width_is_32);
1980 generate_raw_jump(Jump::ja, default_label);
1981
1982 if (width_is_32) {
1983    // zero-extend cmp_reg since the full 64-bit register is used for indexing below
1984 ASM(MOV32rr, cmp_reg, cmp_reg);
1985 }
1986
1987 if (low_bound != 0) {
1988 if (i32(low_bound) == i64(low_bound)) {
1989 ASM(SUB64ri, cmp_reg, low_bound);
1990 } else {
1991 this->materialize_constant(&low_bound, Config::GP_BANK, 8, tmp_reg);
1992 ASM(SUB64rr, cmp_reg, tmp_reg);
1993 }
1994 }
1995
1996 Label jump_table = this->text_writer.label_create();
1997 ASM(LEA64rm, tmp_reg, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1998  // reuse the jump-offset fixup mechanism since the patch procedure is the same
1999 this->text_writer.label_ref(jump_table,
2000 this->text_writer.offset() - 4,
2001 LabelFixupKind::X64_JMP_OR_MEM_DISP);
2002  // load the 4-byte displacement from the jump table
2003 ASM(MOVSXr64m32, cmp_reg, FE_MEM(tmp_reg, 4, cmp_reg, 0));
2004 ASM(ADD64rr, tmp_reg, cmp_reg);
2005 ASM(JMPr, tmp_reg);
2006
2007 this->text_writer.align(4);
2008 this->text_writer.ensure_space(4 + 4 * labels.size());
2009 this->label_place(jump_table);
2010 const u32 table_off = this->text_writer.offset();
2011 for (u32 i = 0; i < labels.size(); i++) {
2012 if (this->text_writer.label_is_pending(labels[i])) {
2013 this->text_writer.label_ref(labels[i],
2014 this->text_writer.offset(),
2015 LabelFixupKind::X64_JUMP_TABLE);
2016 this->text_writer.write(table_off);
2017 } else {
2018 const auto label_off = this->text_writer.label_offset(labels[i]);
2019 this->text_writer.write((i32)label_off - (i32)table_off);
2020 }
2021 }
2022 return true;
2023}
2024
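// One step of a binary-search switch lowering: branch to the case label on
// equality, and to gt_label when the operand is above the case value.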
2025template <IRAdaptor Adaptor,
2026 typename Derived,
2027 template <typename, typename, typename> typename BaseTy,
2028 typename Config>
2029void CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_emit_binary_step(
2030 const Label case_label,
2031 const Label gt_label,
2032 const AsmReg cmp_reg,
2033 const AsmReg tmp_reg,
2034 const u64 case_value,
2035 const bool width_is_32) noexcept {
2036 switch_emit_cmpeq(case_label, cmp_reg, tmp_reg, case_value, width_is_32);
2037 generate_raw_jump(Jump::ja, gt_label);
2038}
2039
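// Compute the address of a thread-local variable. All TLS models are
// currently lowered as a global-dynamic call to __tls_get_addr; the returned
// scratch register holds the address in RAX.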
2040template <IRAdaptor Adaptor,
2041 typename Derived,
2042 template <typename, typename, typename> typename BaseTy,
2043 typename Config>
2044CompilerX64<Adaptor, Derived, BaseTy, Config>::ScratchReg
2045 CompilerX64<Adaptor, Derived, BaseTy, Config>::tls_get_addr(
2046 SymRef sym, TLSModel model) noexcept {
2047 switch (model) {
2048  default: // TODO: implement optimized access for non-GD TLS models
2049 case TLSModel::GlobalDynamic: {
2050 // Generate function call to __tls_get_addr; on x86-64, this takes a single
2051 // parameter in rdi.
2052 assert(!this->stack.is_leaf_function);
2053 assert(may_clobber_flags());
2054 this->stack.generated_call = true;
2055 auto csr = CCAssignerSysV::Info.callee_saved_regs;
2056 for (auto reg : util::BitSetIterator<>{this->register_file.used & ~csr}) {
2057 this->evict_reg(Reg{reg});
2058 }
2059 ScratchReg arg{this};
2060 AsmReg arg_reg = arg.alloc_specific(AsmReg::DI);
2061
2062 // Call sequence with extra prefixes for linker relaxation. Code sequence
2063 // taken from "ELF Handling For Thread-Local Storage".
2064 this->text_writer.ensure_space(0x10);
2065 *this->text_writer.cur_ptr()++ = 0x66;
2066 ASMNC(LEA64rm, arg_reg, FE_MEM(FE_IP, 0, FE_NOREG, 0));
2067 this->reloc_text(sym, R_X86_64_TLSGD, this->text_writer.offset() - 4, -4);
2068 *this->text_writer.cur_ptr()++ = 0x66;
2069 *this->text_writer.cur_ptr()++ = 0x66;
2070 *this->text_writer.cur_ptr()++ = 0x48;
2071 ASMNC(CALL, this->text_writer.cur_ptr());
2072 if (!this->sym_tls_get_addr.valid()) [[unlikely]] {
2073 this->sym_tls_get_addr = this->assembler.sym_add_undef(
2074 "__tls_get_addr", Assembler::SymBinding::GLOBAL);
2075 }
2076 this->reloc_text(this->sym_tls_get_addr,
2077 R_X86_64_PLT32,
2078 this->text_writer.offset() - 4,
2079 -4);
2080 arg.reset();
2081
2082 ScratchReg res{this};
2083 res.alloc_specific(AsmReg::AX);
2084 return res;
2085 }
2086 }
2087}
2088
2089} // namespace tpde::x64