TPDE
CompilerX64.hpp
1// SPDX-FileCopyrightText: 2025 Contributors to TPDE <https://tpde.org>
2//
3// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4#pragma once
5
6#include "tpde/AssemblerElf.hpp"
7#include "tpde/AssignmentPartRef.hpp"
8#include "tpde/CompilerBase.hpp"
9#include "tpde/base.hpp"
10#include "tpde/x64/FunctionWriterX64.hpp"
11
12#include <bit>
13
14// Helper macros for assembling in the compiler
15#if defined(ASM) || defined(ASMF) || defined(ASMNC) || defined(ASME)
  #error ASM macros are already defined elsewhere. Did you include compilers for multiple architectures?
17#endif
18
// Use a helper because the macro arguments might themselves call ASM; the
// current text pointer must therefore only be read after the arguments have
// been evaluated.
21#define ASM_FULL(compiler, reserve, op, ...) \
22 ((compiler)->asm_helper(fe64_##op).encode(reserve, __VA_ARGS__))
23
24#define ASM(op, ...) ASM_FULL(this, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
25#define ASMC(compiler, op, ...) \
26 ASM_FULL(compiler, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
27#define ASMF(op, flag, ...) \
28 ASM_FULL(this, 16, op, flag __VA_OPT__(, ) __VA_ARGS__)
29#define ASMNCF(op, flag, ...) \
30 ASM_FULL(this, 0, op, flag __VA_OPT__(, ) __VA_ARGS__)
31#define ASMNC(op, ...) ASM_FULL(this, 0, op, 0 __VA_OPT__(, ) __VA_ARGS__)
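// Usage (illustrative): ASM(ADD64rr, dst, src) expands to
// this->asm_helper(fe64_ADD64rr).encode(16, 0, dst, src), i.e. it reserves
// 16 bytes in the text writer and encodes the instruction at the current
// position. ASMNC skips the reservation (the caller must already have ensured
// space), ASMF/ASMNCF additionally pass encoding flags, and ASMC takes an
// explicit compiler expression instead of `this`.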
32
33namespace tpde::x64 {
34
35struct AsmReg : Reg {
36 enum REG : u8 {
37 AX = 0,
38 CX,
39 DX,
40 BX,
41 SP,
42 BP,
43 SI,
44 DI,
45 R8,
46 R9,
47 R10,
48 R11,
49 R12,
50 R13,
51 R14,
52 R15,
53
54 XMM0 = 32,
55 XMM1,
56 XMM2,
57 XMM3,
58 XMM4,
59 XMM5,
60 XMM6,
61 XMM7,
62 XMM8,
63 XMM9,
64 XMM10,
65 XMM11,
66 XMM12,
67 XMM13,
68 XMM14,
69 XMM15,
70 // TODO(ts): optional support for AVX registers with compiler flag
71 };
72
73 constexpr explicit AsmReg() noexcept : Reg((u8)0xFF) {}
74
75 constexpr AsmReg(const REG id) noexcept : Reg((u8)id) {}
76
77 constexpr AsmReg(const Reg base) noexcept : Reg(base) {}
78
79 constexpr explicit AsmReg(const u8 id) noexcept : Reg(id) {
80 assert(id <= R15 || (id >= XMM0 && id <= XMM15));
81 }
82
83 constexpr explicit AsmReg(const u64 id) noexcept : Reg(id) {
84 assert(id <= R15 || (id >= XMM0 && id <= XMM15));
85 }
86
87 constexpr operator FeRegGP() const noexcept {
88 assert(reg_id <= R15);
89 return FeRegGP{reg_id};
90 }
91
92 operator FeRegGPLH() const noexcept {
93 assert(reg_id <= R15);
94 return FeRegGP{reg_id};
95 }
96
97 constexpr operator FeRegXMM() const noexcept {
98 assert(reg_id >= XMM0 && reg_id <= XMM15);
99 return FeRegXMM{static_cast<u8>(reg_id & 0x1F)};
100 }
101};
102
103constexpr static u64
104 create_bitmask(const std::initializer_list<AsmReg::REG> regs) {
105 u64 set = 0;
106 for (const auto reg : regs) {
107 set |= 1ull << reg;
108 }
109 return set;
110}
111
112template <size_t N>
113constexpr static u64 create_bitmask(const std::array<AsmReg, N> regs) {
114 u64 set = 0;
115 for (const auto reg : regs) {
116 set |= 1ull << reg.id();
117 }
118 return set;
119}
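// Example: create_bitmask({AsmReg::AX, AsmReg::DX}) == 0b101, i.e. bit i is
// set iff register id i appears in the list.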
120
121/// x86-64 System V calling convention.
122class CCAssignerSysV : public CCAssigner {
123public:
124 static constexpr CCInfo Info{
125 .allocatable_regs =
126 0xFFFF'0000'FFFF & ~create_bitmask({AsmReg::BP, AsmReg::SP}),
127 .callee_saved_regs = create_bitmask({
128 AsmReg::BX,
129 AsmReg::R12,
130 AsmReg::R13,
131 AsmReg::R14,
132 AsmReg::R15,
133 }),
134 .arg_regs = create_bitmask({
135 AsmReg::DI,
136 AsmReg::SI,
137 AsmReg::DX,
138 AsmReg::CX,
139 AsmReg::R8,
140 AsmReg::R9,
141 AsmReg::XMM0,
142 AsmReg::XMM1,
143 AsmReg::XMM2,
144 AsmReg::XMM3,
145 AsmReg::XMM4,
146 AsmReg::XMM5,
147 AsmReg::XMM6,
148 AsmReg::XMM7,
149 }),
150 .red_zone_size = 128,
151 };
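  // Note: allocatable_regs above sets bits 0-15 (the GP registers) and bits
  // 32-47 (XMM0-XMM15), matching the AsmReg ids; SP and BP are excluded since
  // they serve as stack and frame pointer.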
152
153private:
154 u32 gp_cnt = 0, xmm_cnt = 0, stack = 0;
155 // The next N assignments must go to the stack.
156 unsigned must_assign_stack = 0;
157 bool vararg;
158 u32 ret_gp_cnt = 0, ret_xmm_cnt = 0;
159
160public:
161 CCAssignerSysV(bool vararg = false) noexcept
162 : CCAssigner(Info), vararg(vararg) {}
163
164 void reset() noexcept override {
165 gp_cnt = xmm_cnt = stack = 0;
166 must_assign_stack = 0;
167 vararg = false;
168 ret_gp_cnt = ret_xmm_cnt = 0;
169 }
170
171 void assign_arg(CCAssignment &arg) noexcept override {
172 if (arg.byval) {
173 stack = util::align_up(stack, arg.align < 8 ? 8 : arg.align);
174 arg.stack_off = stack;
175 stack += arg.size;
176 return;
177 }
178
179 if (arg.bank == RegBank{0}) {
180 static constexpr std::array<AsmReg, 6> gp_arg_regs{
181 AsmReg::DI,
182 AsmReg::SI,
183 AsmReg::DX,
184 AsmReg::CX,
185 AsmReg::R8,
186 AsmReg::R9,
187 };
188 if (!must_assign_stack && gp_cnt + arg.consecutive < gp_arg_regs.size()) {
189 arg.reg = gp_arg_regs[gp_cnt];
190 gp_cnt += 1;
191 } else {
192 // Next N arguments must also be assigned to the stack
193 // Increment by one, the value is immediately decremented below.
194 must_assign_stack = arg.consecutive + 1;
195 stack = util::align_up(stack, arg.align < 8 ? 8 : arg.align);
196 arg.stack_off = stack;
197 stack += 8;
198 }
199 } else if (arg.bank == RegBank{1}) {
200 if (!must_assign_stack && xmm_cnt < 8) {
201 arg.reg = Reg{AsmReg::XMM0 + xmm_cnt};
202 xmm_cnt += 1;
203 } else {
204 // Next N arguments must also be assigned to the stack
205 // Increment by one, the value is immediately decremented below.
206 must_assign_stack = arg.consecutive + 1;
207 u32 size = util::align_up(arg.size, 8);
208 stack = util::align_up(stack, size);
209 arg.stack_off = stack;
210 stack += size;
211 }
212 } else {
213 // Argument without valid register bank, pass on stack.
214 stack = util::align_up(stack, arg.align < 8 ? 8 : arg.align);
215 arg.stack_off = stack;
216 stack += util::align_up(arg.size, 8);
217 }
218
219 if (must_assign_stack > 0) {
220 must_assign_stack -= 1;
221 }
222 }
223
224 u32 get_stack_size() noexcept override { return stack; }
225
226 bool is_vararg() const noexcept override { return vararg; }
227
228 void assign_ret(CCAssignment &arg) noexcept override {
229 assert(!arg.byval && !arg.sret);
230 if (arg.bank == RegBank{0}) {
231 if (ret_gp_cnt + arg.consecutive < 2) {
232 arg.reg = Reg{ret_gp_cnt == 0 ? AsmReg::AX : AsmReg::DX};
233 ret_gp_cnt += 1;
234 } else {
235 TPDE_UNREACHABLE("too many return values");
236 }
237 } else if (arg.bank == RegBank{1}) {
238 if (ret_xmm_cnt + arg.consecutive < 2) {
239 arg.reg = Reg{ret_xmm_cnt == 0 ? AsmReg::XMM0 : AsmReg::XMM1};
240 ret_xmm_cnt += 1;
241 } else {
242 TPDE_UNREACHABLE("too many return values");
243 }
244 } else {
245 TPDE_UNREACHABLE("return value must have valid register bank");
246 }
247 }
248};
249
250struct PlatformConfig : CompilerConfigDefault {
251 using Assembler = tpde::elf::AssemblerElfX64;
252 using AsmReg = tpde::x64::AsmReg;
253 using DefaultCCAssigner = CCAssignerSysV;
255
256 static constexpr RegBank GP_BANK{0};
257 static constexpr RegBank FP_BANK{1};
258 static constexpr bool FRAME_INDEXING_NEGATIVE = true;
259 static constexpr u32 PLATFORM_POINTER_SIZE = 8;
260 static constexpr u32 NUM_BANKS = 2;
261};
262
263/// Compiler mixin for targeting x86-64.
264template <IRAdaptor Adaptor,
265 typename Derived,
266 template <typename, typename, typename> typename BaseTy =
267 CompilerBase,
268 typename Config = PlatformConfig>
269struct CompilerX64 : BaseTy<Adaptor, Derived, Config> {
270 using Base = BaseTy<Adaptor, Derived, Config>;
271
272 using IRValueRef = typename Base::IRValueRef;
273 using IRBlockRef = typename Base::IRBlockRef;
274 using IRFuncRef = typename Base::IRFuncRef;
275
276 using ScratchReg = typename Base::ScratchReg;
277 using ValuePartRef = typename Base::ValuePartRef;
278 using ValuePart = typename Base::ValuePart;
279 using GenericValuePart = typename Base::GenericValuePart;
280
281 using RegisterFile = typename Base::RegisterFile;
282
283 using CallArg = typename Base::CallArg;
284
285 using Base::derived;
286
287
288 // TODO(ts): make this dependent on the number of callee-saved regs of the
289 // current function or if there is a call in the function?
290 static constexpr u32 NUM_FIXED_ASSIGNMENTS[PlatformConfig::NUM_BANKS] = {5,
291 6};
292
293 enum CPU_FEATURES : u32 {
294 CPU_BASELINE = 0, // x86-64-v1
295 CPU_CMPXCHG16B = (1 << 0),
296 CPU_POPCNT = (1 << 1),
297 CPU_SSE3 = (1 << 2),
298 CPU_SSSE3 = (1 << 3),
299 CPU_SSE4_1 = (1 << 4),
300 CPU_SSE4_2 = (1 << 5),
301 CPU_AVX = (1 << 6),
302 CPU_AVX2 = (1 << 7),
303 CPU_BMI1 = (1 << 8),
304 CPU_BMI2 = (1 << 9),
305 CPU_F16C = (1 << 10),
306 CPU_FMA = (1 << 11),
307 CPU_LZCNT = (1 << 12),
308 CPU_MOVBE = (1 << 13),
309 CPU_AVX512F = (1 << 14),
310 CPU_AVX512BW = (1 << 15),
311 CPU_AVX512CD = (1 << 16),
312 CPU_AVX512DQ = (1 << 17),
313 CPU_AVX512VL = (1 << 18),
314
315 CPU_V2 = CPU_BASELINE | CPU_CMPXCHG16B | CPU_POPCNT | CPU_SSE3 | CPU_SSSE3 |
316 CPU_SSE4_1 | CPU_SSE4_2,
317 CPU_V3 = CPU_V2 | CPU_AVX | CPU_AVX2 | CPU_BMI1 | CPU_BMI2 | CPU_F16C |
318 CPU_FMA | CPU_LZCNT | CPU_MOVBE,
319 CPU_V4 = CPU_V3 | CPU_AVX512F | CPU_AVX512BW | CPU_AVX512CD | CPU_AVX512DQ |
320 CPU_AVX512VL,
321 };
322
323 CPU_FEATURES cpu_feats = CPU_BASELINE;
324
  // When handling function arguments, we need to prevent argument registers
  // from being handed out as fixed registers.
  //
  // Additionally, for now we prevent AX, DX, and CX from being fixed so that we
  // do not run into issues with instructions that need them as implicit
  // operands. AX and DX can also never be fixed when exception handling is
  // used, since they are clobbered there.
332 u64 fixed_assignment_nonallocatable_mask =
333 create_bitmask({AsmReg::AX, AsmReg::DX, AsmReg::CX});
334 u32 func_start_off = 0u, func_prologue_alloc = 0u;
335 /// For vararg functions only: number of scalar and xmm registers used.
336 // TODO: this information should be obtained from the CCAssigner.
337 u32 scalar_arg_count = 0xFFFF'FFFF, vec_arg_count = 0xFFFF'FFFF;
338 u32 reg_save_frame_off = 0;
339 u32 var_arg_stack_off = 0;
340 util::SmallVector<u32, 8> func_ret_offs = {};
341 /// For functions without dynamic allocas, the largest size used for arguments
342 /// passed on the stack to callees. This size is added to the stack pointer
343 /// subtraction/addition in prologue/epilogue to avoid stack pointer
344 /// adjustments at call sites.
  u32 max_callee_stack_arg_size = 0;

347 /// Whether flags must be preserved when materializing constants etc.
  bool preserve_flags = false;

350 /// Symbol for __tls_get_addr.
  SymRef sym_tls_get_addr{};

353 /// Helper class for building call sequences.
354 class CallBuilder : public Base::template CallBuilderBase<CallBuilder> {
355 u32 stack_adjust_off = 0;
356
357 void set_stack_used() noexcept;
358
359 public:
360 /// Constructor.
361 CallBuilder(Derived &compiler, CCAssigner &assigner) noexcept
362 : Base::template CallBuilderBase<CallBuilder>(compiler, assigner) {}
363
364 void add_arg_byval(ValuePart &vp, CCAssignment &cca) noexcept;
365 void add_arg_stack(ValuePart &vp, CCAssignment &cca) noexcept;
366 void call_impl(std::variant<SymRef, ValuePart> &&target) noexcept;
367 void reset_stack() noexcept;
368 };
369
370 // for now, always generate an object
371 explicit CompilerX64(Adaptor *adaptor,
372 const CPU_FEATURES cpu_features = CPU_BASELINE)
373 : Base{adaptor}, cpu_feats(cpu_features) {
374 static_assert(std::is_base_of_v<CompilerX64, Derived>);
375 }
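  // Illustrative sketch (not part of this header): a backend uses this mixin
  // via CRTP, e.g.
  //
  //   struct MyCompiler : CompilerX64<MyAdaptor, MyCompiler> {
  //     explicit MyCompiler(MyAdaptor *a)
  //         : CompilerX64<MyAdaptor, MyCompiler>(a) {}
  //     // ... IR-specific hooks required by CompilerBase/Derived ...
  //   };
  //
  // where MyAdaptor is a hypothetical type satisfying the IRAdaptor concept.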
376
377 template <typename... Args>
378 auto asm_helper(unsigned (*enc_fn)(u8 *, int, Args...)) {
379 struct Helper {
380 CompilerX64 *compiler;
381 decltype(enc_fn) fn;
382 void encode(unsigned reserve, int flags, Args... args) {
383 if (reserve) {
384 compiler->text_writer.ensure_space(reserve);
385 }
386 unsigned n = fn(compiler->text_writer.cur_ptr(), flags, args...);
387 assert(n != 0);
388 compiler->text_writer.cur_ptr() += n;
389 }
390 };
391 return Helper{this, enc_fn};
392 }
393
394 void start_func(u32 func_idx) noexcept;
395
396 /// Begin prologue, prepare for assigning arguments.
397 void prologue_begin(CCAssigner *cc_assigner) noexcept;
398 /// Assign argument part. Returns the stack offset if the value should be
399 /// initialized as stack variable.
400 std::optional<i32> prologue_assign_arg_part(ValuePart &&vp,
401 CCAssignment cca) noexcept;
402 /// Finish prologue.
403 void prologue_end(CCAssigner *cc_assigner) noexcept;
404
405 void finish_func(u32 func_idx) noexcept;
406
407 void reset() noexcept;
408
409 // helpers
410
411 void gen_func_epilog() noexcept;
412
413 void set_preserve_flags(bool preserve) noexcept { preserve_flags = preserve; }
414 bool may_clobber_flags() noexcept { return !preserve_flags; }
415
416 void
417 spill_reg(const AsmReg reg, const i32 frame_off, const u32 size) noexcept;
418
419 void load_from_stack(AsmReg dst,
420 i32 frame_off,
421 u32 size,
422 bool sign_extend = false) noexcept;
423
424 void load_address_of_stack_var(AsmReg dst, AssignmentPartRef ap) noexcept;
425
426 void mov(AsmReg dst, AsmReg src, u32 size) noexcept;
427
428 GenericValuePart val_spill_slot(AssignmentPartRef ap) noexcept {
429 assert(ap.stack_valid() && !ap.variable_ref());
430 return typename GenericValuePart::Expr(AsmReg::BP, ap.frame_off());
431 }
432
433 AsmReg gval_expr_as_reg(GenericValuePart &gv) noexcept;
434
435 /// Dynamic alloca of a fixed-size region.
436 void alloca_fixed(u64 size, u32 align, ValuePart &res) noexcept;
437
  /// Dynamic alloca of a dynamically-sized region (elem_size * count bytes).
  /// count must be a 64-bit value.
440 void alloca_dynamic(u64 elem_size,
441 ValuePart &&count,
442 u32 align,
443 ValuePart &res) noexcept;
444
445 /// Materialize constant into a register.
446 void materialize_constant(const u64 *data,
447 RegBank bank,
448 u32 size,
449 AsmReg dst) noexcept;
450
451 AsmReg select_fixed_assignment_reg(AssignmentPartRef, IRValueRef) noexcept;
452
453 /// Jump conditions.
454 enum class Jump {
455 jo = 0, ///< Jump if overflow (OF=1).
456 jno, ///< Jump if not overflow (OF=0).
457 jb, ///< Jump if below/if carry (CF=1).
458 jae, ///< Jump if above or equal/if not carry (CF=0).
459 je, ///< Jump if equal/if zero (ZF=1).
460 jne, ///< Jump if not equal/if not zero (ZF=0).
461 jbe, ///< Jump if below or equal (CF=1 or ZF=1).
462 ja, ///< Jump if above (CF=0 and ZF=0).
463 js, ///< Jump if sign (SF=1).
464 jns, ///< Jump if not sign (SF=0).
465 jp, ///< Jump if parity even (PF=1).
466 jnp, ///< Jump if parity odd (PF=0).
467 jl, ///< Jump if less (SF!=OF).
468 jge, ///< Jump if greater or equal (SF=OF).
469 jle, ///< Jump if less or equal (ZF=1 or SF!=OF).
470 jg, ///< Jump if greater (ZF=0 and SF=OF).
471 jmp, ///< Unconditional jump
472 };
473
474 Jump invert_jump(Jump jmp) noexcept;
475 Jump swap_jump(Jump jmp) noexcept;
476
477 FeCond jump_to_cond(Jump jmp) noexcept;
478
479 /// Generate jump instruction to target label.
480 void generate_raw_jump(Jump jmp, Label target) noexcept;
481
  /// Set dst to 1 if cc is true, otherwise set it to zero. If zext is false,
  /// only the lowest 8 bits are set. Flags are not clobbered.
484 void generate_raw_set(Jump cc, AsmReg dst, bool zext = true) noexcept;
485 /// Set all bits of dst to 1 if cc is true, otherwise set it to zero
486 void generate_raw_mask(Jump cc, AsmReg dst) noexcept;
487 /// Move src into dst if cc is true, otherwise do nothing
488 void generate_raw_cmov(Jump cc, AsmReg dst, AsmReg src, bool is_64) noexcept;
489
490 /// Integer extension. Might need a temporary register, src is not modified,
491 /// might clobber flags.
  void generate_raw_intext(
      AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept;
494
495 /// Bitfield insert. Needs a temporary register, src is not modified.
496 void generate_raw_bfi(AsmReg dst, AsmReg src, u32 lsb, u32 width) noexcept;
497 /// Bitfield insert in zero. src is not modified, but src and dst must be
498 /// different.
499 void generate_raw_bfiz(AsmReg dst, AsmReg src, u32 lsb, u32 width) noexcept;
500
  /// Generate a function call.
  ///
  /// This will move the arguments into the correct registers according to the
  /// calling convention, clear non-callee-saved registers from the register
  /// file (make sure you do not have any fixed assignments left over), and
  /// fill the result registers (the u8 in the ScratchReg pair indicates the
  /// register bank).
  ///
  /// Targets can be a symbol (call to the PLT with a relocation) or an
  /// indirect call to a ValuePart. Result is an optional reference.
511 void generate_call(std::variant<SymRef, ValuePart> &&target,
512 std::span<CallArg> arguments,
513 typename Base::ValueRef *result,
514 bool variable_args = false);
515
516private:
517 /// @internal Emit compare of cmp_reg with case_value.
518 void switch_emit_cmp(AsmReg cmp_reg,
519 AsmReg tmp_reg,
520 u64 case_value,
521 bool width_is_32) noexcept;
522
523public:
524 /// @internal Jump if cmp_reg equals case_value.
525 void switch_emit_cmpeq(Label case_label,
526 AsmReg cmp_reg,
527 AsmReg tmp_reg,
528 u64 case_value,
529 bool width_is_32) noexcept;
530 /// @internal Emit bounds check and jump table.
531 bool switch_emit_jump_table(Label default_label,
532 std::span<const Label> labels,
533 AsmReg cmp_reg,
534 AsmReg tmp_reg,
535 u64 low_bound,
536 u64 high_bound,
537 bool width_is_32) noexcept;
538 /// @internal Jump if cmp_reg is greater than case_value.
539 void switch_emit_binary_step(Label case_label,
540 Label gt_label,
541 AsmReg cmp_reg,
542 AsmReg tmp_reg,
543 u64 case_value,
544 bool width_is_32) noexcept;
545
546 /// Generate code sequence to load address of sym into a register. This will
547 /// generate a function call for dynamic TLS access models.
548 ScratchReg tls_get_addr(SymRef sym, TLSModel model) noexcept;
549
550 bool has_cpu_feats(CPU_FEATURES feats) const noexcept {
551 return ((cpu_feats & feats) == feats);
552 }
553};
554
555template <IRAdaptor Adaptor,
556 typename Derived,
557 template <typename, typename, typename> class BaseTy,
558 typename Config>
559void CompilerX64<Adaptor, Derived, BaseTy, Config>::start_func(
560 const u32 /*func_idx*/) noexcept {
561 this->preserve_flags = false;
562}
563
564template <IRAdaptor Adaptor,
565 typename Derived,
566 template <typename, typename, typename> typename BaseTy,
567 typename Config>
569 CCAssigner *cc_assigner) noexcept {
570 func_ret_offs.clear();
571 func_start_off = this->text_writer.offset();
572 scalar_arg_count = vec_arg_count = 0xFFFF'FFFF;
573
574 const CCInfo &cc_info = cc_assigner->get_ccinfo();
575
576 auto csr = cc_info.callee_saved_regs;
577 assert(!(csr & ~this->register_file.bank_regs(Config::GP_BANK)) &&
578 "non-gp callee-saved registers not implemented");
579
580 u32 csr_logp = std::popcount((csr >> AsmReg::AX) & 0xff);
581 u32 csr_higp = std::popcount((csr >> AsmReg::R8) & 0xff);
582 // R8 and higher need a REX prefix
583 u32 reg_save_size = 1 * csr_logp + 2 * csr_higp;
584 this->stack.frame_size = 8 * (csr_logp + csr_higp);
586
587 // 11 bytes for push rbp/mov rbp, rsp/sub rsp
588 func_prologue_alloc = reg_save_size + 11;
589 this->text_writer.ensure_space(func_prologue_alloc);
590 this->text_writer.cur_ptr() += func_prologue_alloc;
591
592 // TODO(ts): support larger stack alignments?
593
594 if (this->adaptor->cur_is_vararg()) {
595 this->stack.frame_used = true;
596 this->stack.frame_size += 6 * 8 + 8 * 16;
597 reg_save_frame_off = this->stack.frame_size;
598 auto mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off);
599 ASM(MOV64mr, mem, FE_DI);
600 mem.off += 8;
601 ASM(MOV64mr, mem, FE_SI);
602 mem.off += 8;
603 ASM(MOV64mr, mem, FE_DX);
604 mem.off += 8;
605 ASM(MOV64mr, mem, FE_CX);
606 mem.off += 8;
607 ASM(MOV64mr, mem, FE_R8);
608 mem.off += 8;
609 ASM(MOV64mr, mem, FE_R9);
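    // Per the SysV ABI, the caller passes in AL an upper bound on the number
    // of XMM registers used for variadic arguments; if it is zero, saving the
    // XMM argument registers into the register save area can be skipped.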
610 auto skip_fp = this->text_writer.label_create();
611 ASM(TEST8rr, FE_AX, FE_AX);
612 generate_raw_jump(Jump::je, skip_fp);
613 mem.off += 8;
614 ASM(SSE_MOVDQUmr, mem, FE_XMM0);
615 mem.off += 16;
616 ASM(SSE_MOVDQUmr, mem, FE_XMM1);
617 mem.off += 16;
618 ASM(SSE_MOVDQUmr, mem, FE_XMM2);
619 mem.off += 16;
620 ASM(SSE_MOVDQUmr, mem, FE_XMM3);
621 mem.off += 16;
622 ASM(SSE_MOVDQUmr, mem, FE_XMM4);
623 mem.off += 16;
624 ASM(SSE_MOVDQUmr, mem, FE_XMM5);
625 mem.off += 16;
626 ASM(SSE_MOVDQUmr, mem, FE_XMM6);
627 mem.off += 16;
628 ASM(SSE_MOVDQUmr, mem, FE_XMM7);
629 this->label_place(skip_fp);
630 }
631}
632
633template <IRAdaptor Adaptor,
634 typename Derived,
635 template <typename, typename, typename> typename BaseTy,
636 typename Config>
637std::optional<i32>
    CompilerX64<Adaptor, Derived, BaseTy, Config>::prologue_assign_arg_part(
        ValuePart &&vp, CCAssignment cca) noexcept {
640 if (cca.reg.valid()) [[likely]] {
641 vp.set_value_reg(this, cca.reg);
642 // Mark register as allocatable as soon as it is assigned. If the argument
643 // is unused, the register will be freed immediately and can be used for
644 // later stack arguments.
645 this->register_file.allocatable |= u64{1} << cca.reg.id();
646 return {};
647 }
648
649 if (vp.is_owned()) {
650 // No need to handle unused arguments.
651 return {};
652 }
653
654 this->stack.frame_used = true;
655 i32 frame_off = 0x10 + cca.stack_off;
656 if (cca.byval) {
657 return frame_off; // Return byval frame_off.
658 } else if (vp.assignment().assignment()->part_count == 1 &&
659 !vp.assignment().register_valid()) {
660 // For single-part values, the in-memory layout is equal to our
661 // layout in the spill slot. Reuse argument area as spill slot.
662 // However, don't do this for fixed assignments (indicated by
663 // register_valid()).
664 // TODO: consider doing this for two-part values when possible.
665 vp.assignment().set_stack_valid();
666 vp.assignment().assignment()->frame_off = frame_off;
667 } else {
668 AsmReg dst = vp.alloc_reg(this);
669 this->load_from_stack(dst, frame_off, cca.size);
670 }
671 return {};
672}
673
674template <IRAdaptor Adaptor,
675 typename Derived,
676 template <typename, typename, typename> typename BaseTy,
677 typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::prologue_end(
    CCAssigner *cc_assigner) noexcept {
680 if (this->adaptor->cur_is_vararg()) [[unlikely]] {
681 // TODO: get this from CCAssigner?
682 const CCInfo &cc_info = cc_assigner->get_ccinfo();
683 auto arg_regs = this->register_file.allocatable & cc_info.arg_regs;
684 u64 gp_regs = arg_regs & this->register_file.bank_regs(Config::GP_BANK);
685 u64 xmm_regs = arg_regs & this->register_file.bank_regs(Config::FP_BANK);
686 this->scalar_arg_count = std::popcount(gp_regs);
687 this->vec_arg_count = std::popcount(xmm_regs);
688 this->var_arg_stack_off = 0x10 + cc_assigner->get_stack_size();
689 }
690}
691
692template <IRAdaptor Adaptor,
693 typename Derived,
694 template <typename, typename, typename> typename BaseTy,
695 typename Config>
696void CompilerX64<Adaptor, Derived, BaseTy, Config>::finish_func(
697 u32 func_idx) noexcept {
698 const CCInfo &ccinfo = derived()->cur_cc_assigner()->get_ccinfo();
699 auto csr = ccinfo.callee_saved_regs;
700 u64 saved_regs = this->register_file.clobbered & csr;
701
702 bool needs_stack_frame = this->stack.frame_used ||
703 this->stack.generated_call ||
704 this->stack.has_dynamic_alloca || saved_regs != 0;
705
706 u32 prologue_size = 0;
707 u32 num_saved_regs = 0;
708 u32 rsp_adjustment = 0;
709
710 // NB: code alignment factor 1, data alignment factor -8.
711 this->text_writer.eh_begin_fde(this->get_personality_sym());
712
713 if (needs_stack_frame) {
714 if (!func_ret_offs.empty()) {
715 this->text_writer.eh_write_inst(dwarf::DW_CFA_remember_state);
716 }
717 // push rbp
718 this->text_writer.eh_write_inst(dwarf::DW_CFA_advance_loc, 1);
719 this->text_writer.eh_write_inst(dwarf::DW_CFA_def_cfa_offset, 16);
720 this->text_writer.eh_write_inst(
721 dwarf::DW_CFA_offset, dwarf::x64::DW_reg_rbp, 2);
722 // mov rbp, rsp
723 this->text_writer.eh_write_inst(dwarf::DW_CFA_advance_loc, 3);
724 this->text_writer.eh_write_inst(dwarf::DW_CFA_def_cfa_register,
725 dwarf::x64::DW_reg_rbp);
726
727 // Patched below
728 auto fde_prologue_adv_off = this->text_writer.eh_writer.size();
729 if (saved_regs != 0) {
730 this->text_writer.eh_write_inst(dwarf::DW_CFA_advance_loc, 0);
731 }
732
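    // The prologue was only reserved in prologue_begin; now that the clobbered
    // callee-saved registers and the final frame size are known, write the
    // actual push/mov/sub sequence into the reserved bytes.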
733 auto *write_ptr = this->text_writer.begin_ptr() + func_start_off;
734 write_ptr += fe64_PUSHr(write_ptr, 0, FE_BP);
735 write_ptr += fe64_MOV64rr(write_ptr, 0, FE_BP, FE_SP);
736 for (auto reg : util::BitSetIterator{saved_regs}) {
737 assert(reg <= AsmReg::R15);
738 write_ptr +=
739 fe64_PUSHr(write_ptr, 0, AsmReg{static_cast<AsmReg::REG>(reg)});
740 ++num_saved_regs;
741
742 // DWARF register ordering is subtly different from the encoding:
743 // x86 is: ax, cx, dx, bx, sp, bp, si, di, r8, ...
744 // DWARF is: ax, dx, cx, bx, si, di, bp, sp, r8, ...
745 static const u8 gpreg_to_dwarf[] = {
746 dwarf::x64::DW_reg_rax,
747 dwarf::x64::DW_reg_rcx,
748 dwarf::x64::DW_reg_rdx,
749 dwarf::x64::DW_reg_rbx,
750 dwarf::x64::DW_reg_rsp,
751 dwarf::x64::DW_reg_rbp,
752 dwarf::x64::DW_reg_rsi,
753 dwarf::x64::DW_reg_rdi,
754 dwarf::x64::DW_reg_r8,
755 dwarf::x64::DW_reg_r9,
756 dwarf::x64::DW_reg_r10,
757 dwarf::x64::DW_reg_r11,
758 dwarf::x64::DW_reg_r12,
759 dwarf::x64::DW_reg_r13,
760 dwarf::x64::DW_reg_r14,
761 dwarf::x64::DW_reg_r15,
762 };
763 u8 dwarf_reg = gpreg_to_dwarf[reg];
764 auto cfa_off = num_saved_regs + 2;
765 this->text_writer.eh_write_inst(dwarf::DW_CFA_offset, dwarf_reg, cfa_off);
766 }
767
768 assert(
769 (!this->stack.has_dynamic_alloca || max_callee_stack_arg_size == 0) &&
770 "stack with dynamic alloca must adjust stack pointer at call sites");
    // frame_size already includes the space for the saved registers; the
    // pushes above have allocated that, so subtract it from the explicit
    // stack-pointer adjustment.
773 u32 final_frame_size =
774 util::align_up(this->stack.frame_size + max_callee_stack_arg_size, 16);
775 rsp_adjustment = final_frame_size - num_saved_regs * 8;
776 bool needs_rsp_adjustment = this->stack.generated_call ||
777 this->stack.has_dynamic_alloca ||
778 rsp_adjustment > ccinfo.red_zone_size;
779
780 if (needs_rsp_adjustment) {
781 write_ptr += fe64_SUB64ri(write_ptr, 0, FE_SP, rsp_adjustment);
782 } else {
783 rsp_adjustment = 0;
784 }
785
786 prologue_size =
787 write_ptr - (this->text_writer.begin_ptr() + func_start_off);
788 assert(prologue_size <= func_prologue_alloc);
789 if (saved_regs != 0) {
790 assert(prologue_size < 0x44 && "cannot encode too large prologue in CFI");
791 this->text_writer.eh_writer.data()[fde_prologue_adv_off] =
792 dwarf::DW_CFA_advance_loc | (prologue_size - 4);
793 }
794 }
795
796 if (!func_ret_offs.empty()) {
797 u8 *text_data = this->text_writer.begin_ptr();
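    // If the last recorded epilogue jump is the final instruction, drop it so
    // that the epilogue is emitted in its place (fall-through instead of jmp).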
798 if (func_ret_offs.back() == this->text_writer.offset() - 5) {
799 this->text_writer.cur_ptr() -= 5;
800 func_ret_offs.pop_back();
801 }
802 for (auto ret_off : func_ret_offs) {
803 fe64_JMP(text_data + ret_off, FE_JMPL, this->text_writer.cur_ptr());
804 }
805
806 // Epilogue mirrors prologue (POP has the same size, ADD/LEA/MOV is not
807 // larger than SUB), but RET is 2B shorter than MOV RSP,RBP. However,
808 // prologue_size might be zero; for simplicity, over-allocate a few bytes.
809 this->text_writer.ensure_space(prologue_size + 1);
810 if (needs_stack_frame) {
811 if (this->stack.has_dynamic_alloca) {
812 if (num_saved_regs == 0) {
813 ASMNC(MOV64rr, FE_SP, FE_BP);
814 } else {
815 i32 reg_save_size = num_saved_regs * 8;
816 ASMNC(LEA64rm, FE_SP, FE_MEM(FE_BP, 0, FE_NOREG, -reg_save_size));
817 }
818 } else if (rsp_adjustment != 0) {
819 ASMNC(ADD64ri, FE_SP, rsp_adjustment);
820 }
821 // TODO: if the calling convention doesn't guarantee a red zone, we must
822 // emit a CFA_restore for every single pop instruction.
823 assert(ccinfo.red_zone_size >= num_saved_regs * 8 &&
824 "unwind info incorrect for calling conv without red zone");
825 for (auto reg : util::BitSetIterator<true>{saved_regs}) {
826 ASMNC(POPr, AsmReg(reg));
827 }
828 ASMNC(POPr, FE_BP);
829
830 u32 body_start = func_start_off + func_prologue_alloc;
831 this->text_writer.eh_advance(this->text_writer.offset() - body_start);
832 this->text_writer.eh_write_inst(dwarf::DW_CFA_restore_state);
833 }
834 ASMNC(RET);
835 }
836
  // Do sym_def at the very end; we shorten the function here again, so only at
  // this point do we know the actual size of the function.
  // TODO(ts): honor cur_needs_unwind_info
840 this->text_writer.remove_prologue_bytes(func_start_off + prologue_size,
841 func_prologue_alloc - prologue_size);
842 auto func_size = this->text_writer.offset() - func_start_off;
843 auto func_sym = this->func_syms[func_idx];
844 auto func_sec = this->text_writer.get_sec_ref();
845 this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
846 this->text_writer.eh_end_fde();
847 this->text_writer.except_encode_func();
848}
849
850template <IRAdaptor Adaptor,
851 typename Derived,
852 template <typename, typename, typename> typename BaseTy,
853 typename Config>
854void CompilerX64<Adaptor, Derived, BaseTy, Config>::reset() noexcept {
855 func_ret_offs.clear();
856 sym_tls_get_addr = {};
857 Base::reset();
858}
859
860template <IRAdaptor Adaptor,
861 typename Derived,
862 template <typename, typename, typename> typename BaseTy,
863 typename Config>
864void CompilerX64<Adaptor, Derived, BaseTy, Config>::gen_func_epilog() noexcept {
865 // Patched at the end, just reserve the space here.
866 func_ret_offs.push_back(this->text_writer.offset());
867 this->text_writer.ensure_space(5); // JMP is 5 bytes
868 this->text_writer.cur_ptr() += 5;
869}
870
871template <IRAdaptor Adaptor,
872 typename Derived,
873 template <typename, typename, typename> typename BaseTy,
874 typename Config>
875void CompilerX64<Adaptor, Derived, BaseTy, Config>::spill_reg(
876 const AsmReg reg, const i32 frame_off, const u32 size) noexcept {
877 assert(this->stack.frame_used);
878 this->text_writer.ensure_space(16);
879 assert(frame_off < 0);
880 const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);
881 if (reg.id() <= AsmReg::R15) {
882 switch (size) {
883 case 1: ASMNC(MOV8mr, mem, reg); break;
884 case 2: ASMNC(MOV16mr, mem, reg); break;
885 case 4: ASMNC(MOV32mr, mem, reg); break;
886 case 8: ASMNC(MOV64mr, mem, reg); break;
887 default: TPDE_UNREACHABLE("invalid spill size");
888 }
889 return;
890 }
891
892 switch (size) {
893 case 4: ASMNC(SSE_MOVD_X2Gmr, mem, reg); break;
894 case 8: ASMNC(SSE_MOVQ_X2Gmr, mem, reg); break;
895 case 16: ASMNC(SSE_MOVAPDmr, mem, reg); break;
896 default: TPDE_UNREACHABLE("invalid spill size");
897 }
898}
899
900template <IRAdaptor Adaptor,
901 typename Derived,
902 template <typename, typename, typename> typename BaseTy,
903 typename Config>
904void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_from_stack(
905 const AsmReg dst,
906 const i32 frame_off,
907 const u32 size,
908 const bool sign_extend) noexcept {
909 assert(this->stack.frame_used);
910 this->text_writer.ensure_space(16);
911 const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);
912
913 if (dst.id() <= AsmReg::R15) {
914 if (!sign_extend) {
915 switch (size) {
916 case 1: ASMNC(MOVZXr32m8, dst, mem); break;
917 case 2: ASMNC(MOVZXr32m16, dst, mem); break;
918 case 4: ASMNC(MOV32rm, dst, mem); break;
919 case 8: ASMNC(MOV64rm, dst, mem); break;
920 default: TPDE_UNREACHABLE("invalid spill size");
921 }
922 } else {
923 switch (size) {
924 case 1: ASMNC(MOVSXr64m8, dst, mem); break;
925 case 2: ASMNC(MOVSXr64m16, dst, mem); break;
926 case 4: ASMNC(MOVSXr64m32, dst, mem); break;
927 case 8: ASMNC(MOV64rm, dst, mem); break;
928 default: TPDE_UNREACHABLE("invalid spill size");
929 }
930 }
931 return;
932 }
933
934 assert(!sign_extend);
935
936 switch (size) {
937 case 4: ASMNC(SSE_MOVD_G2Xrm, dst, mem); break;
938 case 8: ASMNC(SSE_MOVQ_G2Xrm, dst, mem); break;
939 case 16: ASMNC(SSE_MOVAPDrm, dst, mem); break;
940 default: TPDE_UNREACHABLE("invalid spill size");
941 }
942}
943
944template <IRAdaptor Adaptor,
945 typename Derived,
946 template <typename, typename, typename> typename BaseTy,
947 typename Config>
948void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_address_of_stack_var(
949 const AsmReg dst, const AssignmentPartRef ap) noexcept {
950 assert(this->stack.frame_used);
951 ASM(LEA64rm, dst, FE_MEM(FE_BP, 0, FE_NOREG, ap.variable_stack_off()));
952}
953
954template <IRAdaptor Adaptor,
955 typename Derived,
956 template <typename, typename, typename> typename BaseTy,
957 typename Config>
958void CompilerX64<Adaptor, Derived, BaseTy, Config>::mov(
959 const AsmReg dst, const AsmReg src, const u32 size) noexcept {
960 this->text_writer.ensure_space(16);
961 assert(dst.valid());
962 assert(src.valid());
963 if (dst.id() <= AsmReg::R15 && src.id() <= AsmReg::R15) {
964 if (size > 4) {
965 ASMNC(MOV64rr, dst, src);
966 } else {
967 ASMNC(MOV32rr, dst, src);
968 }
969 } else if (dst.id() >= AsmReg::XMM0 && src.id() >= AsmReg::XMM0) {
970 if (size <= 16) {
971 if (dst.id() > AsmReg::XMM15 || src.id() > AsmReg::XMM15) {
972 assert(has_cpu_feats(CPU_AVX512F));
973 ASMNC(VMOVAPD128rr, dst, src);
974 } else {
975 ASMNC(SSE_MOVAPDrr, dst, src);
976 }
977 } else if (size <= 32) {
978 assert(has_cpu_feats(CPU_AVX));
979 assert((dst.id() <= AsmReg::XMM15 && src.id() <= AsmReg::XMM15) ||
980 has_cpu_feats(CPU_AVX512F));
981 ASMNC(VMOVAPD256rr, dst, src);
982 } else {
983 assert(size <= 64);
984 assert(has_cpu_feats(CPU_AVX512F));
985 ASMNC(VMOVAPD512rr, dst, src);
986 }
987 } else if (dst.id() <= AsmReg::R15) {
988 // gp<-xmm
989 assert(src.id() >= AsmReg::XMM0);
990 assert(size <= 8);
991 if (src.id() > AsmReg::XMM15) {
992 assert(has_cpu_feats(CPU_AVX512F));
993 if (size <= 4) {
994 ASMNC(VMOVD_X2Grr, dst, src);
995 } else {
996 ASMNC(VMOVQ_X2Grr, dst, src);
997 }
998 } else {
999 if (size <= 4) {
1000 ASMNC(SSE_MOVD_X2Grr, dst, src);
1001 } else {
1002 ASMNC(SSE_MOVQ_X2Grr, dst, src);
1003 }
1004 }
1005 } else {
1006 // xmm<-gp
1007 assert(src.id() <= AsmReg::R15);
1008 assert(dst.id() >= AsmReg::XMM0);
1009 assert(size <= 8);
1010 if (dst.id() > AsmReg::XMM15) {
1011 assert(has_cpu_feats(CPU_AVX512F));
1012 if (size <= 4) {
1013 ASMNC(VMOVD_G2Xrr, dst, src);
1014 } else {
1015 ASMNC(VMOVQ_G2Xrr, dst, src);
1016 }
1017 } else {
1018 if (size <= 4) {
1019 ASMNC(SSE_MOVD_G2Xrr, dst, src);
1020 } else {
1021 ASMNC(SSE_MOVQ_G2Xrr, dst, src);
1022 }
1023 }
1024 }
1025}
1026
1027template <IRAdaptor Adaptor,
1028 typename Derived,
1029 template <typename, typename, typename> typename BaseTy,
1030 typename Config>
1031AsmReg CompilerX64<Adaptor, Derived, BaseTy, Config>::gval_expr_as_reg(
1032 GenericValuePart &gv) noexcept {
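  // Materialize base + index*scale + disp into a single register. LEA is
  // preferred where scale and displacement allow it; otherwise the index is
  // scaled explicitly (which may clobber flags). A ScratchReg owned by the
  // expression is reused as the destination register when possible.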
1033 auto &expr = std::get<typename GenericValuePart::Expr>(gv.state);
1034
1035 ScratchReg scratch{derived()};
1036 bool disp32 = i32(expr.disp) == expr.disp;
1037 AsmReg base = expr.has_base() ? expr.base_reg() : AsmReg::make_invalid();
1038 AsmReg idx = expr.has_index() ? expr.index_reg() : AsmReg::make_invalid();
1039 if (std::holds_alternative<ScratchReg>(expr.base)) {
1040 scratch = std::move(std::get<ScratchReg>(expr.base));
1041 } else if (std::holds_alternative<ScratchReg>(expr.index)) {
1042 scratch = std::move(std::get<ScratchReg>(expr.index));
1043 } else {
1044 (void)scratch.alloc_gp();
1045 }
1046 auto dst = scratch.cur_reg();
1047 if (idx.valid()) {
1048 if ((expr.scale & (expr.scale - 1)) == 0 && expr.scale < 16) {
1049 u8 sc = expr.scale;
1050 if (base.valid() && disp32) {
1051 ASM(LEA64rm, dst, FE_MEM(base, sc, idx, i32(expr.disp)));
1052 expr.disp = 0;
1053 } else if (base.valid()) {
1054 ASM(LEA64rm, dst, FE_MEM(base, sc, idx, 0));
1055 } else if (disp32) {
1056 ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, i32(expr.disp)));
1057 } else {
1058 ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, 0));
1059 }
1060 } else {
1061 assert(may_clobber_flags());
1062 u64 scale = expr.scale;
1063 if (base == idx) {
1064 base = AsmReg::make_invalid();
1065 scale += 1;
1066 }
1067
1068 ScratchReg idx_scratch{derived()};
1069 // We need a register to compute the scaled index.
1070 AsmReg idx_tmp = dst;
1071 if (dst == base && std::holds_alternative<ScratchReg>(expr.index)) {
1072 // We can't use dst, it'd clobber base, so use the other
1073 // register we currently own.
1074 idx_tmp = std::get<ScratchReg>(expr.index).cur_reg();
1075 } else if (dst == base) {
1076 idx_tmp = idx_scratch.alloc_gp();
1077 }
1078
1079 if ((scale & (scale - 1)) == 0) {
1080 if (idx_tmp != idx) {
1081 ASM(MOV64rr, idx_tmp, idx);
1082 }
1083 ASM(SHL64ri, idx_tmp, util::cnt_tz(scale));
1084 } else {
1085 if (i32(scale) == i64(scale)) {
1086 ASM(IMUL64rri, idx_tmp, idx, scale);
1087 } else {
1088 ScratchReg scratch2{derived()};
1089 auto tmp2 = scratch2.alloc_gp();
1090 ASM(MOV64ri, tmp2, scale);
1091 if (idx_tmp != idx) {
1092 ASM(MOV64rr, idx_tmp, idx);
1093 }
1094 ASM(IMUL64rr, idx_tmp, tmp2);
1095 }
1096 }
1097 if (base.valid()) {
1098 if (disp32 || (idx_tmp != dst && base != dst)) {
1099 ASM(LEA64rm, dst, FE_MEM(base, 1, idx_tmp, i32(expr.disp)));
1100 expr.disp = 0;
1101 } else if (dst == base) {
1102 ASM(ADD64rr, dst, idx_tmp);
1103 } else {
1104 ASM(ADD64rr, dst, base);
1105 }
1106 }
1107 }
1108 } else if (base.valid()) {
1109 if (expr.disp && disp32) {
1110 ASM(LEA64rm, dst, FE_MEM(base, 0, FE_NOREG, i32(expr.disp)));
1111 expr.disp = 0;
1112 } else if (dst != base) {
1113 ASM(MOV64rr, dst, base);
1114 }
1115 }
1116 if (expr.disp) {
1117 ScratchReg scratch2{derived()};
1118 auto tmp2 = scratch2.alloc_gp();
1119 ASM(MOV64ri, tmp2, expr.disp);
1120 if (may_clobber_flags()) {
1121 ASM(ADD64rr, dst, tmp2);
1122 } else {
1123 ASM(LEA64rm, dst, FE_MEM(dst, 1, tmp2, 0));
1124 }
1125 }
1126 gv.state = std::move(scratch);
1127 return dst;
1128}
1129
1130template <IRAdaptor Adaptor,
1131 typename Derived,
1132 template <typename, typename, typename> typename BaseTy,
1133 typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::alloca_fixed(
    u64 size, u32 align, ValuePart &res) noexcept {
1136 assert(this->stack.has_dynamic_alloca &&
1137 "function marked as not having dynamic allocas can't have alloca");
1138 assert(align != 0 && (align & (align - 1)) == 0 && "invalid alignment");
1139 assert(may_clobber_flags());
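  // Round the allocation up to 16 bytes so that RSP stays 16-byte aligned, as
  // required at call sites by the SysV calling convention.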
1140 size = tpde::util::align_up(size, 16);
1141 if (size > 0) {
1142 assert(size < 0x8000'0000);
1143 ASM(SUB64ri, FE_SP, size);
1144 }
1145 if (align > 16) {
1146 assert(align < u32{1} << 31 && "alignment >= 2**31 not implemented");
1147 ASM(AND64ri, FE_SP, ~(align - 1));
1148 }
1149 ASM(MOV64rr, res.alloc_reg(this), FE_SP);
1150}
1151
1152template <IRAdaptor Adaptor,
1153 typename Derived,
1154 template <typename, typename, typename> typename BaseTy,
1155 typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::alloca_dynamic(
    u64 elem_size, ValuePart &&count, u32 align, ValuePart &res) noexcept {
1158 assert(this->stack.has_dynamic_alloca &&
1159 "function marked as not having dynamic allocas can't have alloca");
1160 assert(align != 0 && (align & (align - 1)) == 0 && "invalid alignment");
1161 assert(may_clobber_flags());
1162 AsmReg size_reg = count.has_reg() ? count.cur_reg() : count.load_to_reg(this);
1163 AsmReg res_reg = res.alloc_try_reuse(this, count);
1164
1165 if (elem_size == 0) {
1166 ASM(XOR32rr, res_reg, res_reg);
1167 } else if ((elem_size & (elem_size - 1)) == 0) {
1168 // elem_size is power of two
1169 const auto shift = util::cnt_tz(elem_size);
1170 if (shift > 0 && shift < 4) {
1171 ASM(LEA64rm, res_reg, FE_MEM(FE_NOREG, u8(1 << shift), size_reg, 0));
1172 } else {
1173 if (size_reg != res_reg) {
1174 ASM(MOV64rr, res_reg, size_reg);
1175 }
1176 if (elem_size != 1) {
1177 ASM(SHL64ri, res_reg, shift);
1178 }
1179 }
1180 } else {
1181 if (elem_size <= 0x7FFF'FFFF) [[likely]] {
1182 ASM(IMUL64rri, res_reg, size_reg, elem_size);
1183 } else {
1184 ScratchReg scratch{this};
1185 auto tmp = scratch.alloc_gp();
1186 ASM(MOV64ri, tmp, elem_size);
1187 if (size_reg != res_reg) {
1188 ASM(MOV64rr, res_reg, size_reg);
1189 }
1190 ASM(IMUL64rr, res_reg, tmp);
1191 }
1192 }
1193
1194 ASM(SUB64rr, FE_SP, res_reg);
1195
1196 align = align > 16 ? align : 16;
1197 if (elem_size & (align - 1)) {
1198 assert(align < u32{1} << 31 && "alignment >= 2**31 not implemented");
1199 ASM(AND64ri, FE_SP, ~(align - 1));
1200 }
1201
1202 ASM(MOV64rr, res_reg, FE_SP);
1203}
1204
1205template <IRAdaptor Adaptor,
1206 typename Derived,
1207 template <typename, typename, typename> typename BaseTy,
1208 typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::materialize_constant(
    const u64 *data, const RegBank bank, const u32 size, AsmReg dst) noexcept {
1211 const auto const_u64 = data[0];
1212 if (bank == Config::GP_BANK) {
1213 assert(size <= 8);
1214 if (const_u64 == 0) {
1215 if (may_clobber_flags()) {
1216 ASM(XOR32rr, dst, dst);
1217 } else {
1218 ASM(MOV32ri, dst, 0);
1219 }
1220 return;
1221 }
1222
1223 if (size <= 4 || u32(const_u64) == const_u64) {
1224 ASM(MOV32ri, dst, const_u64);
1225 } else {
1226 ASM(MOV64ri, dst, const_u64);
1227 }
1228 return;
1229 }
1230
1231 assert(bank == Config::FP_BANK);
1232 const auto high_u64 = size <= 8 ? 0 : data[1];
1233 if (const_u64 == 0 && (size <= 8 || (high_u64 == 0 && size <= 16))) {
1234 if (has_cpu_feats(CPU_AVX)) {
1235 ASM(VPXOR128rrr, dst, dst, dst);
1236 } else {
1237 ASM(SSE_PXORrr, dst, dst);
1238 }
1239 return;
1240 }
1241 const u64 ones = -u64{1};
1242 if (const_u64 == ones && (size <= 8 || (high_u64 == ones && size <= 16))) {
1243 if (has_cpu_feats(CPU_AVX)) {
1244 ASM(VPCMPEQB128rrr, dst, dst, dst);
1245 } else {
1246 ASM(SSE_PCMPEQBrr, dst, dst);
1247 }
1248 return;
1249 }
1250
1251 if (size <= 8) {
1252 // We must not evict registers here (might be used within branching code),
1253 // so only use free registers and load from memory otherwise.
1254 AsmReg tmp =
1255 this->register_file.find_first_free_excluding(Config::GP_BANK, 0);
1256 if (tmp.valid()) {
1257 this->register_file.mark_clobbered(tmp);
1258 materialize_constant(data, Config::GP_BANK, size, tmp);
1259 if (size <= 4) {
1260 if (has_cpu_feats(CPU_AVX)) {
1261 ASM(VMOVD_G2Xrr, dst, tmp);
1262 } else {
1263 ASM(SSE_MOVD_G2Xrr, dst, tmp);
1264 }
1265 } else {
1266 if (has_cpu_feats(CPU_AVX)) {
1267 ASM(VMOVQ_G2Xrr, dst, tmp);
1268 } else {
1269 ASM(SSE_MOVQ_G2Xrr, dst, tmp);
1270 }
1271 }
1272 return;
1273 }
1274 }
1275
1276 // TODO: round to next power of two but at least 4 byte
1277 // We store constants in 8-byte units.
1278 auto alloc_size = util::align_up(size, 8);
1279 std::span<const u8> raw_data{reinterpret_cast<const u8 *>(data), alloc_size};
1280 // TODO: deduplicate/pool constants?
1281 auto rodata = this->assembler.get_data_section(true, false);
1282 auto sym = this->assembler.sym_def_data(
1283 rodata, "", raw_data, alloc_size, Assembler::SymBinding::LOCAL);
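  // Load the constant RIP-relative with a dummy displacement; the
  // R_X86_64_PC32 relocation emitted below patches the 4-byte displacement to
  // point at the constant defined just above.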
1284 if (size <= 4) {
1285 if (has_cpu_feats(CPU_AVX)) {
1286 ASM(VMOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1287 } else {
1288 ASM(SSE_MOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1289 }
1290 } else if (size <= 8) {
1291 if (has_cpu_feats(CPU_AVX)) {
1292 ASM(VMOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1293 } else {
1294 ASM(SSE_MOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1295 }
1296 } else if (size <= 16) {
1297 if (has_cpu_feats(CPU_AVX)) {
1298 ASM(VMOVAPS128rm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1299 } else {
1300 ASM(SSE_MOVAPSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1301 }
1302 } else {
1303 // TODO: implement for AVX/AVX-512.
1304 TPDE_FATAL("unable to materialize constant");
1305 }
1306
1307 this->reloc_text(sym, elf::R_X86_64_PC32, this->text_writer.offset() - 4, -4);
1308}
1309
1310template <IRAdaptor Adaptor,
1311 typename Derived,
1312 template <typename, typename, typename> typename BaseTy,
1313 typename Config>
1314AsmReg
1315 CompilerX64<Adaptor, Derived, BaseTy, Config>::select_fixed_assignment_reg(
1316 AssignmentPartRef ap, IRValueRef) noexcept {
1317 RegBank bank = ap.bank();
1318 assert(bank.id() <= Config::NUM_BANKS);
1319 auto reg_mask = this->register_file.bank_regs(bank);
1320 reg_mask &= ~fixed_assignment_nonallocatable_mask;
1321
1322 const auto find_possible_regs = [this,
1323 reg_mask](const u64 preferred_regs) -> u64 {
1324 // try to first get an unused reg, otherwise an unfixed reg
1325 u64 free_regs = this->register_file.allocatable & ~this->register_file.used;
1326 return free_regs & preferred_regs & reg_mask;
1327 };
1328
1329 u64 possible_regs;
1330 auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
1331 if (!this->stack.is_leaf_function) {
    // we can only allocate fixed assignments from the callee-saved regs
1333 possible_regs = find_possible_regs(csr);
1334 } else {
1335 // try allocating any non-callee saved register first, except the result
1336 // registers
1337 possible_regs = find_possible_regs(~csr);
1338 if (possible_regs == 0) {
1339 // otherwise fallback to callee-saved regs
1340 possible_regs = find_possible_regs(csr);
1341 }
1342 }
1343
1344 if (possible_regs == 0) {
1345 return AsmReg::make_invalid();
1346 }
1347
1348 // try to first get an unused reg, otherwise an unfixed reg
1349 if ((possible_regs & ~this->register_file.used) != 0) {
1350 return AsmReg{util::cnt_tz(possible_regs & ~this->register_file.used)};
1351 }
1352
1353 for (const auto reg_id : util::BitSetIterator<>{possible_regs}) {
1354 const auto reg = AsmReg{reg_id};
1355
1356 if (this->register_file.is_fixed(reg)) {
1357 continue;
1358 }
1359
1360 const auto local_idx = this->register_file.reg_local_idx(reg);
1361 const auto part = this->register_file.reg_part(reg);
1362
1363 if (local_idx == Base::INVALID_VAL_LOCAL_IDX) {
1364 continue;
1365 }
1366 auto *assignment = this->val_assignment(local_idx);
1367 auto ap = AssignmentPartRef{assignment, part};
1368 if (ap.modified()) {
1369 continue;
1370 }
1371
1372 return reg;
1373 }
1374
1375 return AsmReg::make_invalid();
1376}
1377
1378template <IRAdaptor Adaptor,
1379 typename Derived,
1380 template <typename, typename, typename> typename BaseTy,
1381 typename Config>
1382typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
1383 CompilerX64<Adaptor, Derived, BaseTy, Config>::invert_jump(
1384 Jump jmp) noexcept {
1385 switch (jmp) {
1386 case Jump::ja: return Jump::jbe;
1387 case Jump::jae: return Jump::jb;
1388 case Jump::jb: return Jump::jae;
1389 case Jump::jbe: return Jump::ja;
1390 case Jump::je: return Jump::jne;
1391 case Jump::jg: return Jump::jle;
1392 case Jump::jge: return Jump::jl;
1393 case Jump::jl: return Jump::jge;
1394 case Jump::jle: return Jump::jg;
1395 case Jump::jne: return Jump::je;
1396 case Jump::jno: return Jump::jo;
1397 case Jump::jo: return Jump::jno;
1398 case Jump::js: return Jump::jns;
1399 case Jump::jns: return Jump::js;
1400 case Jump::jp: return Jump::jnp;
1401 case Jump::jnp: return Jump::jp;
1402 default: TPDE_UNREACHABLE("invalid jump kind for invert_jump");
1403 }
1404}
1405
1406template <IRAdaptor Adaptor,
1407 typename Derived,
1408 template <typename, typename, typename> class BaseTy,
1409 typename Config>
1410typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
1411 CompilerX64<Adaptor, Derived, BaseTy, Config>::swap_jump(
1412 Jump jmp) noexcept {
1413 switch (jmp) {
1414 case Jump::ja: return Jump::jb;
1415 case Jump::jae: return Jump::jbe;
1416 case Jump::jb: return Jump::ja;
1417 case Jump::jbe: return Jump::jae;
1418 case Jump::je: return Jump::je;
1419 case Jump::jne: return Jump::jne;
1420 case Jump::jg: return Jump::jl;
1421 case Jump::jge: return Jump::jle;
1422 case Jump::jl: return Jump::jg;
1423 case Jump::jle: return Jump::jge;
1424 default: TPDE_UNREACHABLE("invalid jump kind for swap_jump");
1425 }
1426}
1427
1428template <IRAdaptor Adaptor,
1429 typename Derived,
1430 template <typename, typename, typename> class BaseTy,
1431 typename Config>
1432FeCond CompilerX64<Adaptor, Derived, BaseTy, Config>::jump_to_cond(
1433 Jump jmp) noexcept {
1434 // LLVM won't transform the switch into a shift.
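  // Jump is declared such that u32(jmp) << 16 equals the corresponding fadec
  // FE_CC_* value; the switch below merely asserts that this mapping holds.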
1435 FeCond res = FeCond(u32(jmp) << 16);
1436 switch (jmp) {
1437 case Jump::ja: assert(res == FE_CC_A && "FeCond value mismatch?"); break;
1438 case Jump::jae: assert(res == FE_CC_AE && "FeCond value mismatch?"); break;
1439 case Jump::jb: assert(res == FE_CC_B && "FeCond value mismatch?"); break;
1440 case Jump::jbe: assert(res == FE_CC_BE && "FeCond value mismatch?"); break;
1441 case Jump::je: assert(res == FE_CC_E && "FeCond value mismatch?"); break;
1442 case Jump::jg: assert(res == FE_CC_G && "FeCond value mismatch?"); break;
1443 case Jump::jge: assert(res == FE_CC_GE && "FeCond value mismatch?"); break;
1444 case Jump::jl: assert(res == FE_CC_L && "FeCond value mismatch?"); break;
1445 case Jump::jle: assert(res == FE_CC_LE && "FeCond value mismatch?"); break;
1446 case Jump::jne: assert(res == FE_CC_NE && "FeCond value mismatch?"); break;
1447 case Jump::jno: assert(res == FE_CC_NO && "FeCond value mismatch?"); break;
1448 case Jump::jo: assert(res == FE_CC_O && "FeCond value mismatch?"); break;
1449 case Jump::js: assert(res == FE_CC_S && "FeCond value mismatch?"); break;
1450 case Jump::jns: assert(res == FE_CC_NS && "FeCond value mismatch?"); break;
1451 case Jump::jp: assert(res == FE_CC_P && "FeCond value mismatch?"); break;
1452 case Jump::jnp: assert(res == FE_CC_NP && "FeCond value mismatch?"); break;
1453 default: TPDE_UNREACHABLE("invalid conditional jump");
1454 }
1455 return res;
1456}
1457
1458template <IRAdaptor Adaptor,
1459 typename Derived,
1460 template <typename, typename, typename> typename BaseTy,
1461 typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_jump(
    Jump jmp, Label target_label) noexcept {
1464 this->text_writer.ensure_space(6); // For safe ptr arithmetic on code buffer.
1465 bool pending = this->text_writer.label_is_pending(target_label);
1466 void *target = this->text_writer.cur_ptr();
1467 if (!pending) {
1468 target = this->text_writer.begin_ptr() +
1469 this->text_writer.label_offset(target_label);
1470 }
1471
1472 if (jmp == Jump::jmp) {
1473 ASMNCF(JMP, pending ? FE_JMPL : 0, target);
1474 } else {
1475 ASMNCF(Jcc, (pending ? FE_JMPL : 0) | jump_to_cond(jmp), target);
1476 }
1477
1478 if (pending) {
1479 this->text_writer.label_ref(target_label,
1480 this->text_writer.offset() - 4,
1481 LabelFixupKind::X64_JMP_OR_MEM_DISP);
1482 }
1483}
1484
1485template <IRAdaptor Adaptor,
1486 typename Derived,
1487 template <typename, typename, typename> class BaseTy,
1488 typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_set(
    Jump cc, AsmReg dst, bool zext) noexcept {
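  // Zero dst with mov rather than xor so that the flags consumed by the setcc
  // below are not clobbered.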
1491 if (zext) {
1492 ASM(MOV32ri, dst, 0);
1493 }
1494 ASMF(SETcc8r, jump_to_cond(cc), dst);
1495}
1496
1497template <IRAdaptor Adaptor,
1498 typename Derived,
1499 template <typename, typename, typename> class BaseTy,
1500 typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_mask(
    Jump cc, AsmReg dst) noexcept {
1503 // TODO: use sbb dst,dst/adc dest,-1 for carry flag
1504 generate_raw_set(cc, dst);
1505 ASM(NEG64r, dst);
1506}
1507template <IRAdaptor Adaptor,
1508 typename Derived,
1509 template <typename, typename, typename> class BaseTy,
1510 typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_cmov(
    Jump cc, AsmReg dst, AsmReg src, bool is_64) noexcept {
1513 if (is_64) {
1514 ASMF(CMOVcc64rr, jump_to_cond(cc), dst, src);
1515 } else {
1516 ASMF(CMOVcc32rr, jump_to_cond(cc), dst, src);
1517 }
1518}
1519
1520template <IRAdaptor Adaptor,
1521 typename Derived,
1522 template <typename, typename, typename> class BaseTy,
1523 typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_intext(
    AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept {
1526 assert(from < to && to <= 64);
1527 assert(may_clobber_flags());
1528 if (!sign) {
1529 switch (from) {
1530 case 8: ASM(MOVZXr32r8, dst, src); break;
1531 case 16: ASM(MOVZXr32r16, dst, src); break;
1532 case 32: ASM(MOV32rr, dst, src); break;
1533 default:
1534 if (from < 32) {
1535 if (dst != src) {
1536 ASM(MOV32rr, dst, src);
1537 }
1538 ASM(AND32ri, dst, (uint32_t{1} << from) - 1);
1539 } else if (dst != src) {
1540 ASM(MOV64ri, dst, (uint64_t{1} << from) - 1);
1541 ASM(AND64rr, dst, src);
1542 } else {
1543 ScratchReg tmp{this};
1544 AsmReg tmp_reg = tmp.alloc_gp();
1545 ASM(MOV64ri, tmp_reg, (uint64_t{1} << from) - 1);
1546 ASM(AND64rr, dst, tmp_reg);
1547 }
1548 }
1549 } else if (to <= 32) {
1550 switch (from) {
1551 case 8: ASM(MOVSXr32r8, dst, src); break;
1552 case 16: ASM(MOVSXr32r16, dst, src); break;
1553 default:
1554 if (dst != src) {
1555 ASM(MOV32rr, dst, src);
1556 }
1557 ASM(SHL32ri, dst, 32 - from);
1558 ASM(SAR32ri, dst, 32 - from);
1559 }
1560 } else {
1561 switch (from) {
1562 case 8: ASM(MOVSXr64r8, dst, src); break;
1563 case 16: ASM(MOVSXr64r16, dst, src); break;
1564 case 32: ASM(MOVSXr64r32, dst, src); break;
1565 default:
1566 if (dst != src) {
1567 ASM(MOV64rr, dst, src);
1568 }
1569 ASM(SHL64ri, dst, 64 - from);
1570 ASM(SAR64ri, dst, 64 - from);
1571 }
1572 }
1573}
1574
1575template <IRAdaptor Adaptor,
1576 typename Derived,
1577 template <typename, typename, typename> class BaseTy,
1578 typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_bfi(
    AsmReg dst, AsmReg src, u32 lsb, u32 width) noexcept {
1581 assert(lsb < 63 && width < 64 && lsb + width <= 64 && width != 0);
1582 assert(may_clobber_flags());
1583 ScratchReg tmp1{this};
1584 AsmReg tmp1_reg = tmp1.alloc_gp();
1585 // First, clear relevant bits in dst.
1586 if (width == 1) {
1587 ASM(BTR64ri, dst, lsb);
1588 } else if (lsb + width <= 31) {
1589 ASM(AND64ri, dst, ~(((u64{1} << width) - 1) << lsb));
1590 } else {
1591 ASM(MOV64ri, tmp1_reg, ~(((u64{1} << width) - 1) << lsb));
1592 ASM(AND64rr, dst, tmp1_reg);
1593 }
1594 // Second, clear irrelevant bits in src; result is in tmp1_reg.
1595 if (width == 8) {
1596 ASM(MOVZXr32r8, tmp1_reg, src);
1597 } else if (width == 16) {
1598 ASM(MOVZXr32r16, tmp1_reg, src);
1599 } else if (width <= 32) {
1600 ASM(MOV32rr, tmp1_reg, src);
1601 if (width < 32) {
1602 ASM(AND32ri, tmp1_reg, (u32{1} << width) - 1);
1603 }
1604 } else {
1605 ASM(MOV64ri, tmp1_reg, (u64{1} << width) - 1);
1606 ASM(AND64rr, tmp1_reg, src);
1607 }
1608 // Third, merge. Bits are disjoint, so addition is possible.
1609 if (lsb >= 1 && lsb <= 3) {
1610 ASM(LEA64rm, dst, FE_MEM(dst, u8(1 << lsb), tmp1_reg, 0));
1611 } else {
1612 if (lsb > 0 && lsb + width <= 32) {
1613 ASM(SHL32ri, tmp1_reg, lsb);
1614 } else if (lsb > 0) {
1615 ASM(SHL64ri, tmp1_reg, lsb);
1616 }
1617 ASM(OR64rr, dst, tmp1_reg);
1618 }
1619}
1620
1621template <IRAdaptor Adaptor,
1622 typename Derived,
1623 template <typename, typename, typename> class BaseTy,
1624 typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_bfiz(
    AsmReg dst, AsmReg src, u32 lsb, u32 width) noexcept {
1627 assert(lsb < 63 && width < 64 && lsb + width <= 64 && width != 0);
1628 assert(dst != src);
1629 assert(may_clobber_flags());
1630 // Clear irrelevant bits in src and move to dst.
1631 if (width == 8) {
1632 ASM(MOVZXr32r8, dst, src);
1633 } else if (width == 16) {
1634 ASM(MOVZXr32r16, dst, src);
1635 } else if (width <= 32) {
1636 ASM(MOV32rr, dst, src);
1637 if (width < 32) {
1638 ASM(AND32ri, dst, (u32{1} << width) - 1);
1639 }
1640 } else {
1641 ASM(MOV64ri, dst, (u64{1} << width) - 1);
1642 ASM(AND64rr, dst, src);
1643 }
1644 // Shift into place.
1645 if (lsb > 0 && lsb + width <= 32) {
1646 ASM(SHL32ri, dst, lsb);
1647 } else if (lsb > 0) {
1648 ASM(SHL64ri, dst, lsb);
1649 }
1650}
1651
1652template <IRAdaptor Adaptor,
1653 typename Derived,
1654 template <typename, typename, typename> class BaseTy,
1655 typename Config>
void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::
    set_stack_used() noexcept {
1658 if (this->compiler.stack.has_dynamic_alloca && stack_adjust_off == 0) {
1659 stack_adjust_off = this->compiler.text_writer.offset();
1660 // Always use 32-bit immediate
1661 ASMC(&this->compiler, SUB64ri, FE_SP, 0x100);
1662 assert(this->compiler.text_writer.offset() == stack_adjust_off + 7);
1663 }
1664}
1665
1666template <IRAdaptor Adaptor,
1667 typename Derived,
1668 template <typename, typename, typename> class BaseTy,
1669 typename Config>
1670void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_byval(
1671 ValuePart &vp, CCAssignment &cca) noexcept {
1672 AsmReg ptr = vp.load_to_reg(&this->compiler);
1673 ScratchReg scratch{&this->compiler};
1674 AsmReg tmp = scratch.alloc_gp();
1675
1676 auto size = cca.size;
1677 set_stack_used();
1678 i32 off = 0;
1679 i32 soff = cca.stack_off;
1680 while (size >= 8) {
1681 ASMC(&this->compiler, MOV64rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1682 ASMC(&this->compiler, MOV64mr, FE_MEM(FE_SP, 0, FE_NOREG, soff + off), tmp);
1683 off += 8;
1684 size -= 8;
1685 }
1686 if (size >= 4) {
1687 ASMC(&this->compiler, MOV32rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1688 ASMC(&this->compiler, MOV32mr, FE_MEM(FE_SP, 0, FE_NOREG, soff + off), tmp);
1689 off += 4;
1690 size -= 4;
1691 }
1692 if (size >= 2) {
1693 ASMC(&this->compiler, MOVZXr32m16, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1694 ASMC(&this->compiler, MOV16mr, FE_MEM(FE_SP, 0, FE_NOREG, soff + off), tmp);
1695 off += 2;
1696 size -= 2;
1697 }
1698 if (size >= 1) {
1699 ASMC(&this->compiler, MOVZXr32m8, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1700 ASMC(&this->compiler, MOV8mr, FE_MEM(FE_SP, 0, FE_NOREG, soff + off), tmp);
1701 }
1702}
1703
1704template <IRAdaptor Adaptor,
1705 typename Derived,
1706 template <typename, typename, typename> class BaseTy,
1707 typename Config>
1708void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_stack(
1709 ValuePart &vp, CCAssignment &cca) noexcept {
1710 set_stack_used();
1711
1712 auto reg = vp.has_reg() ? vp.cur_reg() : vp.load_to_reg(&this->compiler);
1713 FeMem mem_op = FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off));
1714 if (this->compiler.register_file.reg_bank(reg) == Config::GP_BANK) {
1715 switch (cca.size) {
1716 case 1: ASMC(&this->compiler, MOV8mr, mem_op, reg); break;
1717 case 2: ASMC(&this->compiler, MOV16mr, mem_op, reg); break;
1718 case 4: ASMC(&this->compiler, MOV32mr, mem_op, reg); break;
1719 case 8: ASMC(&this->compiler, MOV64mr, mem_op, reg); break;
1720 default: TPDE_UNREACHABLE("invalid GP reg size");
1721 }
1722 } else {
1723 assert(this->compiler.register_file.reg_bank(reg) == Config::FP_BANK);
1724 switch (cca.size) {
1725 case 4: ASMC(&this->compiler, SSE_MOVSSmr, mem_op, reg); break;
1726 case 8: ASMC(&this->compiler, SSE_MOVSDmr, mem_op, reg); break;
1727 case 16: ASMC(&this->compiler, SSE_MOVDQAmr, mem_op, reg); break;
1728 default: TPDE_UNREACHABLE("invalid SSE reg size");
1729 }
1730 }
1731}
1732
1733template <IRAdaptor Adaptor,
1734 typename Derived,
1735 template <typename, typename, typename> class BaseTy,
1736 typename Config>
1737void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::call_impl(
1738 std::variant<SymRef, ValuePart> &&target) noexcept {
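// SysV vararg calls pass an upper bound on the number of XMM argument registers in AL.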
1739 if (this->assigner.is_vararg()) {
1740 if (this->compiler.register_file.is_used(Reg{AsmReg::AX})) {
1741 this->compiler.evict_reg(Reg{AsmReg::AX});
1742 }
1743 Reg next_xmm = this->compiler.register_file.find_first_free_excluding(
1744 Config::FP_BANK, 0);
1745 unsigned xmm_cnt = 8;
1746 if (next_xmm.valid() && next_xmm.id() - AsmReg::XMM0 < 8) {
1747 xmm_cnt = next_xmm.id() - AsmReg::XMM0;
1748 }
1749 if (xmm_cnt != 0) {
1750 ASMC(&this->compiler, MOV32ri, FE_AX, xmm_cnt);
1751 } else {
1752 ASMC(&this->compiler, XOR32rr, FE_AX, FE_AX);
1753 }
1754 }
1755
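// With dynamic allocas, patch the 32-bit immediate of the placeholder SUB emitted by
// set_stack_used() with the final, 16-byte-aligned stack-argument size; otherwise track
// the maximum stack-argument size across calls.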
1756 u32 sub = 0;
1757 if (stack_adjust_off != 0) {
1758 auto *inst_ptr = this->compiler.text_writer.begin_ptr() + stack_adjust_off;
1759 sub = util::align_up(this->assigner.get_stack_size(), 0x10);
1760 memcpy(inst_ptr + 3, &sub, sizeof(u32));
1761 } else {
1762 auto &max_stack_size = this->compiler.max_callee_stack_arg_size;
1763 max_stack_size = std::max(max_stack_size, this->assigner.get_stack_size());
1764 }
1765
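// Direct calls go through a PLT32 relocation; indirect targets are called from their
// stack slot, a salvaged register, or reloaded into R10.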
1766 if (auto *sym = std::get_if<SymRef>(&target)) {
1767 this->compiler.text_writer.ensure_space(16);
1768 ASMC(&this->compiler, CALL, this->compiler.text_writer.cur_ptr());
1769 this->compiler.reloc_text(
1770 *sym, elf::R_X86_64_PLT32, this->compiler.text_writer.offset() - 4, -4);
1771 } else {
1772 ValuePart &tvp = std::get<ValuePart>(target);
1773 if (tvp.has_assignment() && !tvp.assignment().register_valid()) {
1774 assert(tvp.assignment().stack_valid());
1775 auto off = tvp.assignment().frame_off();
1776 ASMC(&this->compiler, CALLm, FE_MEM(FE_BP, 0, FE_NOREG, off));
1777 } else if (tvp.can_salvage()) {
1778 ASMC(&this->compiler, CALLr, tvp.salvage(&this->compiler));
1779 } else {
1780 assert(!this->compiler.register_file.is_used(Reg{AsmReg::R10}));
1781 AsmReg reg = tvp.reload_into_specific_fixed(&this->compiler, AsmReg::R10);
1782 ASMC(&this->compiler, CALLr, reg);
1783 }
1784 tvp.reset(&this->compiler);
1785 }
1786
1787 if (stack_adjust_off != 0) {
1788 ASMC(&this->compiler, ADD64ri, FE_SP, sub);
1789 }
1790}
1791
1792template <IRAdaptor Adaptor,
1793 typename Derived,
1794 template <typename, typename, typename> typename BaseTy,
1795 typename Config>
1796 void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_call(
1797 std::variant<SymRef, ValuePart> &&target,
1798 std::span<CallArg> arguments,
1799 typename Base::ValueRef *result,
1800 const bool variable_args) {
1801 CCAssignerSysV assigner{variable_args};
1802 CallBuilder cb{*derived(), assigner};
1803 for (auto &arg : arguments) {
1804 cb.add_arg(std::move(arg));
1805 }
1806 cb.call(std::move(target));
1807 if (result) {
1808 cb.add_ret(*result);
1809 }
1810}
1811
1812template <IRAdaptor Adaptor,
1813 typename Derived,
1814 template <typename, typename, typename> typename BaseTy,
1815 typename Config>
1816void CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_emit_cmp(
1817 const AsmReg cmp_reg,
1818 const AsmReg tmp_reg,
1819 const u64 case_value,
1820 const bool width_is_32) noexcept {
1821 if (width_is_32) {
1822 ASM(CMP32ri, cmp_reg, case_value);
1823 } else {
1824 if ((i64)((i32)case_value) == (i64)case_value) {
1825 ASM(CMP64ri, cmp_reg, case_value);
1826 } else {
1827 this->materialize_constant(&case_value, Config::GP_BANK, 8, tmp_reg);
1828 ASM(CMP64rr, cmp_reg, tmp_reg);
1829 }
1830 }
1831}
1832
1833template <IRAdaptor Adaptor,
1834 typename Derived,
1835 template <typename, typename, typename> typename BaseTy,
1836 typename Config>
1837void CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_emit_cmpeq(
1838 const Label case_label,
1839 const AsmReg cmp_reg,
1840 const AsmReg tmp_reg,
1841 const u64 case_value,
1842 const bool width_is_32) noexcept {
1843 switch_emit_cmp(cmp_reg, tmp_reg, case_value, width_is_32);
1844 generate_raw_jump(Jump::je, case_label);
1845}
1846
1847template <IRAdaptor Adaptor,
1848 typename Derived,
1849 template <typename, typename, typename> typename BaseTy,
1850 typename Config>
1851bool CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_emit_jump_table(
1852 Label default_label,
1853 std::span<const Label> labels,
1854 AsmReg cmp_reg,
1855 AsmReg tmp_reg,
1856 u64 low_bound,
1857 u64 high_bound,
1858 bool width_is_32) noexcept {
1859 // NB: we must not evict any registers here.
1860 if (low_bound != 0) {
1861 switch_emit_cmp(cmp_reg, tmp_reg, low_bound, width_is_32);
1862 generate_raw_jump(Jump::jb, default_label);
1863 }
1864 switch_emit_cmp(cmp_reg, tmp_reg, high_bound, width_is_32);
1865 generate_raw_jump(Jump::ja, default_label);
1866
1867 if (width_is_32) {
1868 // zero-extend cmp_reg since the index computation below uses the full 64-bit register
1869 ASM(MOV32rr, cmp_reg, cmp_reg);
1870 }
1871
1872 if (low_bound != 0) {
1873 if (i32(low_bound) == i64(low_bound)) {
1874 ASM(SUB64ri, cmp_reg, low_bound);
1875 } else {
1876 this->materialize_constant(&low_bound, Config::GP_BANK, 8, tmp_reg);
1877 ASM(SUB64rr, cmp_reg, tmp_reg);
1878 }
1879 }
1880
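// Build a jump table of 32-bit entries, each holding the displacement from the start of
// the table to its case label; the indirect jump adds the loaded entry to the table address.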
1881 Label jump_table = this->text_writer.label_create();
1882 ASM(LEA64rm, tmp_reg, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1883 // reuse the jump-offset fixup machinery since the patch procedure is the same
1884 this->text_writer.label_ref(jump_table,
1885 this->text_writer.offset() - 4,
1886 LabelFixupKind::X64_JMP_OR_MEM_DISP);
1887 // load the 4-byte displacement from the jump table
1888 ASM(MOVSXr64m32, cmp_reg, FE_MEM(tmp_reg, 4, cmp_reg, 0));
1889 ASM(ADD64rr, tmp_reg, cmp_reg);
1890 ASM(JMPr, tmp_reg);
1891
1892 this->text_writer.align(4);
1893 this->text_writer.ensure_space(4 + 4 * labels.size());
1894 this->label_place(jump_table);
1895 const u32 table_off = this->text_writer.offset();
1896 for (u32 i = 0; i < labels.size(); i++) {
1897 if (this->text_writer.label_is_pending(labels[i])) {
1898 this->text_writer.label_ref(labels[i],
1899 this->text_writer.offset(),
1900 LabelFixupKind::X64_JUMP_TABLE);
1901 this->text_writer.write(table_off);
1902 } else {
1903 const auto label_off = this->text_writer.label_offset(labels[i]);
1904 this->text_writer.write((i32)label_off - (i32)table_off);
1905 }
1906 }
1907 return true;
1908}
1909
1910template <IRAdaptor Adaptor,
1911 typename Derived,
1912 template <typename, typename, typename> typename BaseTy,
1913 typename Config>
1914void CompilerX64<Adaptor, Derived, BaseTy, Config>::switch_emit_binary_step(
1915 const Label case_label,
1916 const Label gt_label,
1917 const AsmReg cmp_reg,
1918 const AsmReg tmp_reg,
1919 const u64 case_value,
1920 const bool width_is_32) noexcept {
1921 switch_emit_cmpeq(case_label, cmp_reg, tmp_reg, case_value, width_is_32);
1922 generate_raw_jump(Jump::ja, gt_label);
1923}
1924
1925template <IRAdaptor Adaptor,
1926 typename Derived,
1927 template <typename, typename, typename> typename BaseTy,
1928 typename Config>
1929CompilerX64<Adaptor, Derived, BaseTy, Config>::ScratchReg
1930 CompilerX64<Adaptor, Derived, BaseTy, Config>::tls_get_addr(
1931 SymRef sym, TLSModel model) noexcept {
1932 switch (model) {
1933 default: // TODO: implement optimized access for non-gd-model
1934 case TLSModel::GlobalDynamic: {
1935 // Generate function call to __tls_get_addr; on x86-64, this takes a single
1936 // parameter in rdi.
1937 assert(!this->stack.is_leaf_function);
1938 assert(may_clobber_flags());
1939 this->stack.generated_call = true;
1940 auto csr = CCAssignerSysV::Info.callee_saved_regs;
1941 for (auto reg : util::BitSetIterator<>{this->register_file.used & ~csr}) {
1942 this->evict_reg(Reg{reg});
1943 }
1944 ScratchReg arg{this};
1945 AsmReg arg_reg = arg.alloc_specific(AsmReg::DI);
1946
1947 // Call sequence with extra prefixes for linker relaxation. Code sequence
1948 // taken from "ELF Handling For Thread-Local Storage".
1949 this->text_writer.ensure_space(0x10);
1950 *this->text_writer.cur_ptr()++ = 0x66;
1951 ASMNC(LEA64rm, arg_reg, FE_MEM(FE_IP, 0, FE_NOREG, 0));
1952 this->reloc_text(
1953 sym, elf::R_X86_64_TLSGD, this->text_writer.offset() - 4, -4);
1954 *this->text_writer.cur_ptr()++ = 0x66;
1955 *this->text_writer.cur_ptr()++ = 0x66;
1956 *this->text_writer.cur_ptr()++ = 0x48;
1957 ASMNC(CALL, this->text_writer.cur_ptr());
1958 if (!this->sym_tls_get_addr.valid()) [[unlikely]] {
1959 this->sym_tls_get_addr = this->assembler.sym_add_undef(
1960 "__tls_get_addr", Assembler::SymBinding::GLOBAL);
1961 }
1962 this->reloc_text(this->sym_tls_get_addr,
1963 elf::R_X86_64_PLT32,
1964 this->text_writer.offset() - 4,
1965 -4);
1966 arg.reset();
1967
1968 ScratchReg res{this};
1969 res.alloc_specific(AsmReg::AX);
1970 return res;
1971 }
1972 }
1973}
1974
1975} // namespace tpde::x64