TPDE
CompilerX64.hpp
1// SPDX-FileCopyrightText: 2025 Contributors to TPDE <https://tpde.org>
2//
3// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4#pragma once
5
6#include "AssemblerElfX64.hpp"
7#include "tpde/CompilerBase.hpp"
8#include "tpde/ValLocalIdx.hpp"
9#include "tpde/ValueAssignment.hpp"
10#include "tpde/base.hpp"
11
12#include <bit>
13
14#ifdef TPDE_ASSERTS
15 #include <fadec.h>
16#endif
17
18// Helper macros for assembling in the compiler
19#if defined(ASM) || defined(ASMF) || defined(ASMNC) || defined(ASME)
 20 #error ASM macros are already defined elsewhere. Maybe you included compilers for multiple architectures?
21#endif
22
23// Use a helper: the parameters might call ASM themselves, so the current text
24// pointer must only be evaluated after the arguments.
25#define ASM_FULL(compiler, reserve, op, ...) \
26 ((compiler)->asm_helper(fe64_##op).encode(reserve, __VA_ARGS__))
27
28#define ASM(op, ...) ASM_FULL(this, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
29#define ASMC(compiler, op, ...) \
30 ASM_FULL(compiler, 16, op, 0 __VA_OPT__(, ) __VA_ARGS__)
31#define ASMF(op, flag, ...) \
32 ASM_FULL(this, 16, op, flag __VA_OPT__(, ) __VA_ARGS__)
33#define ASMNCF(op, flag, ...) \
34 ASM_FULL(this, 0, op, flag __VA_OPT__(, ) __VA_ARGS__)
35#define ASMNC(op, ...) ASM_FULL(this, 0, op, 0 __VA_OPT__(, ) __VA_ARGS__)
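// Usage sketch (not part of the original macros): inside a compiler member
// function, ASM(ADD64rr, dst, src) expands to roughly
//   this->asm_helper(fe64_ADD64rr).encode(/*reserve=*/16, /*flags=*/0, dst, src)
// i.e. it reserves 16 bytes of text space and encodes the instruction at the
// current write position. The ASMNC/ASMNCF variants skip the reservation (the
// caller must have ensured space already), and the *F variants pass an explicit
// encoder flag such as FE_JMPL.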
36
37namespace tpde::x64 {
38
39struct AsmReg : Reg {
40 enum REG : u8 {
41 AX = 0,
42 CX,
43 DX,
44 BX,
45 SP,
46 BP,
47 SI,
48 DI,
49 R8,
50 R9,
51 R10,
52 R11,
53 R12,
54 R13,
55 R14,
56 R15,
57
58 XMM0 = 32,
59 XMM1,
60 XMM2,
61 XMM3,
62 XMM4,
63 XMM5,
64 XMM6,
65 XMM7,
66 XMM8,
67 XMM9,
68 XMM10,
69 XMM11,
70 XMM12,
71 XMM13,
72 XMM14,
73 XMM15,
74 // TODO(ts): optional support for AVX registers with compiler flag
75 };
76
77 constexpr explicit AsmReg() noexcept : Reg((u8)0xFF) {}
78
79 constexpr AsmReg(const REG id) noexcept : Reg((u8)id) {}
80
81 constexpr AsmReg(const Reg base) noexcept : Reg(base) {}
82
83 constexpr explicit AsmReg(const u8 id) noexcept : Reg(id) {
84 assert(id <= R15 || (id >= XMM0 && id <= XMM15));
85 }
86
87 constexpr explicit AsmReg(const u64 id) noexcept : Reg(id) {
88 assert(id <= R15 || (id >= XMM0 && id <= XMM15));
89 }
90
91 constexpr operator FeRegGP() const noexcept {
92 assert(reg_id <= R15);
93 return FeRegGP{reg_id};
94 }
95
96 operator FeRegGPLH() const noexcept {
97 assert(reg_id <= R15);
98 return FeRegGP{reg_id};
99 }
100
101 constexpr operator FeRegXMM() const noexcept {
102 assert(reg_id >= XMM0 && reg_id <= XMM15);
103 return FeRegXMM{static_cast<u8>(reg_id & 0x1F)};
104 }
105};
106
107constexpr static u64
108 create_bitmask(const std::initializer_list<AsmReg::REG> regs) {
109 u64 set = 0;
110 for (const auto reg : regs) {
111 set |= 1ull << reg;
112 }
113 return set;
114}
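// For example, create_bitmask({AsmReg::AX, AsmReg::DX}) yields 0b101 = 0x5
// (bit 0 for AX, bit 2 for DX); XMM registers occupy bits 32-47 because
// AsmReg::XMM0 == 32.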
115
116template <size_t N>
117constexpr static u64 create_bitmask(const std::array<AsmReg, N> regs) {
118 u64 set = 0;
119 for (const auto reg : regs) {
120 set |= 1ull << reg.id();
121 }
122 return set;
123}
124
125class CCAssignerSysV : public CCAssigner {
126public:
127 static constexpr CCInfo Info{
128 .allocatable_regs =
129 0xFFFF'0000'FFFF & ~create_bitmask({AsmReg::BP, AsmReg::SP}),
130 .callee_saved_regs = create_bitmask({
131 AsmReg::BX,
132 AsmReg::R12,
133 AsmReg::R13,
134 AsmReg::R14,
135 AsmReg::R15,
136 }),
137 .arg_regs = create_bitmask({
138 AsmReg::DI,
139 AsmReg::SI,
140 AsmReg::DX,
141 AsmReg::CX,
142 AsmReg::R8,
143 AsmReg::R9,
144 AsmReg::XMM0,
145 AsmReg::XMM1,
146 AsmReg::XMM2,
147 AsmReg::XMM3,
148 AsmReg::XMM4,
149 AsmReg::XMM5,
150 AsmReg::XMM6,
151 AsmReg::XMM7,
152 }),
153 };
154
155private:
156 u32 gp_cnt = 0, xmm_cnt = 0, stack = 0;
157 // The next N assignments must go to the stack.
158 unsigned must_assign_stack = 0;
159 bool vararg;
160 u32 ret_gp_cnt = 0, ret_xmm_cnt = 0;
161
162public:
163 CCAssignerSysV(bool vararg = false) noexcept
164 : CCAssigner(Info), vararg(vararg) {}
165
166 void reset() noexcept override {
167 gp_cnt = xmm_cnt = stack = 0;
168 must_assign_stack = 0;
169 vararg = false;
170 ret_gp_cnt = ret_xmm_cnt = 0;
171 }
172
173 void assign_arg(CCAssignment &arg) noexcept override {
174 if (arg.byval) {
175 stack = util::align_up(stack, arg.byval_align < 8 ? 8 : arg.byval_align);
176 arg.stack_off = stack;
177 stack += arg.byval_size;
178 return;
179 }
180
181 if (arg.bank == RegBank{0}) {
182 static constexpr std::array<AsmReg, 6> gp_arg_regs{
183 AsmReg::DI,
184 AsmReg::SI,
185 AsmReg::DX,
186 AsmReg::CX,
187 AsmReg::R8,
188 AsmReg::R9,
189 };
190 if (!must_assign_stack && gp_cnt + arg.consecutive < gp_arg_regs.size()) {
191 arg.reg = gp_arg_regs[gp_cnt];
192 gp_cnt += 1;
193 } else {
194 // Next N arguments must also be assigned to the stack
195 // Increment by one, the value is immediately decremented below.
196 must_assign_stack = arg.consecutive + 1;
197 stack = util::align_up(stack, arg.align < 8 ? 8 : arg.align);
198 arg.stack_off = stack;
199 stack += 8;
200 }
201 } else {
202 if (!must_assign_stack && xmm_cnt < 8) {
203 arg.reg = Reg{AsmReg::XMM0 + xmm_cnt};
204 xmm_cnt += 1;
205 } else {
206 // Next N arguments must also be assigned to the stack
207 // Increment by one, the value is immediately decremented below.
208 must_assign_stack = arg.consecutive + 1;
209 u32 size = util::align_up(arg.size, 8);
210 stack = util::align_up(stack, size);
211 arg.stack_off = stack;
212 stack += size;
213 }
214 }
215
216 if (must_assign_stack > 0) {
217 must_assign_stack -= 1;
218 }
219 }
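  // Assignment sketch for a call such as f(i64 a, double b, i64 c): a goes to
  // DI, b to XMM0, c to SI; once the six GP or eight XMM argument registers
  // are exhausted (or must_assign_stack forces it), arguments are placed at
  // 8-byte-aligned stack offsets instead.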
220
221 u32 get_stack_size() noexcept override { return stack; }
222
223 bool is_vararg() const noexcept override { return vararg; }
224
225 void assign_ret(CCAssignment &arg) noexcept override {
226 assert(!arg.byval && !arg.sret);
227 if (arg.bank == RegBank{0}) {
228 if (ret_gp_cnt + arg.consecutive < 2) {
229 arg.reg = Reg{ret_gp_cnt == 0 ? AsmReg::AX : AsmReg::DX};
230 ret_gp_cnt += 1;
231 } else {
232 assert(false);
233 }
234 } else {
235 if (ret_xmm_cnt + arg.consecutive < 2) {
236 arg.reg = Reg{ret_xmm_cnt == 0 ? AsmReg::XMM0 : AsmReg::XMM1};
237 ret_xmm_cnt += 1;
238 } else {
239 assert(false);
240 }
241 }
242 }
243};
244
245struct PlatformConfig : CompilerConfigDefault {
246 using Assembler = AssemblerElfX64;
247 using AsmReg = tpde::x64::AsmReg;
248 using DefaultCCAssigner = CCAssignerSysV;
249
250 static constexpr RegBank GP_BANK{0};
251 static constexpr RegBank FP_BANK{1};
252 static constexpr bool FRAME_INDEXING_NEGATIVE = true;
253 static constexpr u32 PLATFORM_POINTER_SIZE = 8;
254 static constexpr u32 NUM_BANKS = 2;
255};
256
257namespace concepts {
258template <typename T, typename Config>
259concept Compiler = tpde::Compiler<T, Config> && requires(T a) {
260 {
261 a.arg_is_int128(std::declval<typename T::IRValueRef>())
262 } -> std::convertible_to<bool>;
263
264 {
265 a.arg_allow_split_reg_stack_passing(std::declval<typename T::IRValueRef>())
266 } -> std::convertible_to<bool>;
267};
268} // namespace concepts
269
270template <IRAdaptor Adaptor,
271 typename Derived,
272 template <typename, typename, typename> typename BaseTy =
273 CompilerBase,
274 typename Config = PlatformConfig>
275struct CompilerX64 : BaseTy<Adaptor, Derived, Config> {
276 using Base = BaseTy<Adaptor, Derived, Config>;
277
278 using IRValueRef = typename Base::IRValueRef;
279 using IRBlockRef = typename Base::IRBlockRef;
280 using IRFuncRef = typename Base::IRFuncRef;
281
282 using ScratchReg = typename Base::ScratchReg;
283 using ValuePartRef = typename Base::ValuePartRef;
284 using ValuePart = typename Base::ValuePart;
285 using GenericValuePart = typename Base::GenericValuePart;
286
287 using Assembler = typename PlatformConfig::Assembler;
288 using RegisterFile = typename Base::RegisterFile;
289
290 using CallArg = typename Base::CallArg;
291
292 using Base::derived;
293
294
 295 // TODO(ts): make this dependent on the number of callee-saved regs of the
 296 // current function, or on whether there is a call in the function?
297 static constexpr u32 NUM_FIXED_ASSIGNMENTS[PlatformConfig::NUM_BANKS] = {5,
298 6};
299
300 enum CPU_FEATURES : u32 {
301 CPU_BASELINE = 0, // x86-64-v1
302 CPU_CMPXCHG16B = (1 << 0),
303 CPU_POPCNT = (1 << 1),
304 CPU_SSE3 = (1 << 2),
305 CPU_SSSE3 = (1 << 3),
306 CPU_SSE4_1 = (1 << 4),
307 CPU_SSE4_2 = (1 << 5),
308 CPU_AVX = (1 << 6),
309 CPU_AVX2 = (1 << 7),
310 CPU_BMI1 = (1 << 8),
311 CPU_BMI2 = (1 << 9),
312 CPU_F16C = (1 << 10),
313 CPU_FMA = (1 << 11),
314 CPU_LZCNT = (1 << 12),
315 CPU_MOVBE = (1 << 13),
316 CPU_AVX512F = (1 << 14),
317 CPU_AVX512BW = (1 << 15),
318 CPU_AVX512CD = (1 << 16),
319 CPU_AVX512DQ = (1 << 17),
320 CPU_AVX512VL = (1 << 18),
321
322 CPU_V2 = CPU_BASELINE | CPU_CMPXCHG16B | CPU_POPCNT | CPU_SSE3 | CPU_SSSE3 |
323 CPU_SSE4_1 | CPU_SSE4_2,
324 CPU_V3 = CPU_V2 | CPU_AVX | CPU_AVX2 | CPU_BMI1 | CPU_BMI2 | CPU_F16C |
325 CPU_FMA | CPU_LZCNT | CPU_MOVBE,
326 CPU_V4 = CPU_V3 | CPU_AVX512F | CPU_AVX512BW | CPU_AVX512CD | CPU_AVX512DQ |
327 CPU_AVX512VL,
328 };
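  // Usage sketch: feature checks are cumulative, e.g. has_cpu_feats(CPU_V3)
  // only returns true if every v2 and v3 feature bit (AVX2, BMI2, LZCNT, ...)
  // was passed to the constructor.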
329
330 CPU_FEATURES cpu_feats = CPU_BASELINE;
331
 332 // When handling function arguments, we need to prevent argument registers
 333 // from being handed out as fixed registers.
 334 //
 335 // Additionally, for now we prevent AX, DX and CX from being fixed so we do
 336 // not run into issues with instructions that need them as implicit operands.
 337 // AX and DX can also never be fixed if exception handling is used, since
 338 // they are clobbered there.
339 u64 fixed_assignment_nonallocatable_mask =
340 create_bitmask({AsmReg::AX, AsmReg::DX, AsmReg::CX});
341 u32 func_start_off = 0u, func_reg_save_off = 0u, func_reg_save_alloc = 0u,
342 func_reg_restore_alloc = 0u;
343 /// Offset to the `sub rsp, XXX` instruction that sets up the frame
344 u32 frame_size_setup_offset = 0u;
345 /// For vararg functions only: number of scalar and xmm registers used.
346 // TODO: this information should be obtained from the CCAssigner.
347 u32 scalar_arg_count = 0xFFFF'FFFF, vec_arg_count = 0xFFFF'FFFF;
348 u32 reg_save_frame_off = 0;
349 u32 var_arg_stack_off = 0;
350 util::SmallVector<u32, 8> func_ret_offs = {};
351
352 /// Symbol for __tls_get_addr.
353 Assembler::SymRef sym_tls_get_addr;
354
355 class CallBuilder : public Base::template CallBuilderBase<CallBuilder> {
356 u32 stack_adjust_off = 0;
357
358 void set_stack_used() noexcept;
359
360 public:
361 CallBuilder(Derived &compiler, CCAssigner &assigner) noexcept
362 : Base::template CallBuilderBase<CallBuilder>(compiler, assigner) {}
363
364 void add_arg_byval(ValuePart &vp, CCAssignment &cca) noexcept;
365 void add_arg_stack(ValuePart &vp, CCAssignment &cca) noexcept;
366 void call_impl(
367 std::variant<typename Assembler::SymRef, ValuePart> &&target) noexcept;
368 void reset_stack() noexcept;
369 };
370
371 // for now, always generate an object
372 explicit CompilerX64(Adaptor *adaptor,
373 const CPU_FEATURES cpu_features = CPU_BASELINE)
374 : Base{adaptor}, cpu_feats(cpu_features) {
375 static_assert(std::is_base_of_v<CompilerX64, Derived>);
376 static_assert(concepts::Compiler<Derived, PlatformConfig>);
377 }
378
379 template <typename... Args>
380 auto asm_helper(unsigned (*fn)(u8 *, int, Args...)) {
381 struct Helper {
382 CompilerX64 *compiler;
383 decltype(fn) fn;
384 void encode(unsigned reserve, int flags, Args... args) {
385 if (reserve) {
386 compiler->text_writer.ensure_space(reserve);
387 }
388 unsigned n = fn(compiler->text_writer.cur_ptr(), flags, args...);
389 assert(n != 0);
390 compiler->text_writer.cur_ptr() += n;
391 }
392 };
393 return Helper{this, fn};
394 }
395
396 void start_func(u32 func_idx) noexcept;
397
398 void gen_func_prolog_and_args(CCAssigner *) noexcept;
399
400 void finish_func(u32 func_idx) noexcept;
401
402 void reset() noexcept;
403
404 // helpers
405
406 void gen_func_epilog() noexcept;
407
408 void
409 spill_reg(const AsmReg reg, const i32 frame_off, const u32 size) noexcept;
410
411 void load_from_stack(AsmReg dst,
412 i32 frame_off,
413 u32 size,
414 bool sign_extend = false) noexcept;
415
416 void load_address_of_stack_var(AsmReg dst, AssignmentPartRef ap) noexcept;
417
418 void mov(AsmReg dst, AsmReg src, u32 size) noexcept;
419
420 GenericValuePart val_spill_slot(ValuePart &val_ref) noexcept {
421 const auto ap = val_ref.assignment();
422 assert(ap.stack_valid() && !ap.variable_ref());
423 return typename GenericValuePart::Expr(AsmReg::BP, ap.frame_off());
424 }
425
426 AsmReg gval_expr_as_reg(GenericValuePart &gv) noexcept;
427
428 void materialize_constant(const u64 *data,
429 RegBank bank,
430 u32 size,
431 AsmReg dst) noexcept;
432
433 AsmReg select_fixed_assignment_reg(RegBank bank, IRValueRef) noexcept;
434
435 enum class Jump {
436 ja,
437 jae,
438 jb,
439 jbe,
440 je,
441 jg,
442 jge,
443 jl,
444 jle,
445 jmp,
446 jne,
447 jno,
448 jo,
449 js,
450 jns,
451 jp,
452 jnp,
453 };
454
455 Jump invert_jump(Jump jmp) noexcept;
456 Jump swap_jump(Jump jmp) noexcept;
457
458 void generate_branch_to_block(Jump jmp,
459 IRBlockRef target,
460 bool needs_split,
461 bool last_inst) noexcept;
462
463 void generate_raw_jump(Jump jmp, Assembler::Label target) noexcept;
464
465 void generate_raw_set(Jump jmp, AsmReg dst) noexcept;
466 void generate_raw_mask(Jump jmp, AsmReg dst) noexcept;
467
468 void generate_raw_intext(
469 AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept;
470
471 /// Generate a function call
472 ///
473 /// This will get the arguments into the correct registers according to the
474 /// calling convention, clear non-callee-saved registers from the register
475 /// file (make sure you do not have any fixed assignments left over) and
476 /// fill the result registers (the u8 in the ScratchReg pair indicates the
477 /// register bank)
478 ///
479 /// Targets can be a symbol (call to PLT with relocation), or an indirect
480 /// call to a ValuePart. Result is an optional reference.
481 void generate_call(std::variant<Assembler::SymRef, ValuePart> &&target,
482 std::span<CallArg> arguments,
483 typename Base::ValueRef *result,
484 bool variable_args = false);
485
486 /// Generate code sequence to load address of sym into a register. This will
487 /// generate a function call for dynamic TLS access models.
488 ScratchReg tls_get_addr(Assembler::SymRef sym, TLSModel model) noexcept;
489
490 bool has_cpu_feats(CPU_FEATURES feats) const noexcept {
491 return ((cpu_feats & feats) == feats);
492 }
493};
494
495template <IRAdaptor Adaptor,
496 typename Derived,
497 template <typename, typename, typename> class BaseTy,
498 typename Config>
499void CompilerX64<Adaptor, Derived, BaseTy, Config>::start_func(
500 const u32 /*func_idx*/) noexcept {
501 this->text_writer.align(16);
502 this->assembler.except_begin_func();
503}
504
505template <IRAdaptor Adaptor,
506 typename Derived,
507 template <typename, typename, typename> typename BaseTy,
508 typename Config>
509void CompilerX64<Adaptor, Derived, BaseTy, Config>::gen_func_prolog_and_args(
510 CCAssigner *cc_assigner) noexcept {
511 // prologue:
512 // push rbp
513 // mov rbp, rsp
514 // optionally create vararg save-area
515 // reserve space for callee-saved regs
516 // = 1 byte for each of the lower 8 regs and 2
517 // bytes for the higher 8 regs
518 // sub rsp, #<frame_size>+<largest_call_frame_usage>
519
520 // TODO(ts): technically we only need rbp if there
521 // is a dynamic alloca but then we need to make the
522 // frame indexing dynamic in CompilerBase and the
523 // unwind info needs to take the dynamic sub rsp for
524 // calls into account
525
526 func_ret_offs.clear();
527 func_start_off = this->text_writer.offset();
528 scalar_arg_count = vec_arg_count = 0xFFFF'FFFF;
529
530 const CCInfo &cc_info = cc_assigner->get_ccinfo();
531
532 ASM(PUSHr, FE_BP);
533 ASM(MOV64rr, FE_BP, FE_SP);
534
535 func_reg_save_off = this->text_writer.offset();
536
537 auto csr = cc_info.callee_saved_regs;
538 assert(!(csr & ~this->register_file.bank_regs(Config::GP_BANK)) &&
539 "non-gp callee-saved registers not implemented");
540
541 u32 csr_logp = std::popcount((csr >> AsmReg::AX) & 0xff);
542 u32 csr_higp = std::popcount((csr >> AsmReg::R8) & 0xff);
543 // R8 and higher need a REX prefix.
544 u32 reg_save_size = 1 * csr_logp + 2 * csr_higp;
545 this->stack.frame_size = 8 * (csr_logp + csr_higp);
546
547 this->text_writer.ensure_space(reg_save_size);
548 this->text_writer.cur_ptr() += reg_save_size;
549 func_reg_save_alloc = reg_save_size;
 550 // pop uses the same number of bytes as push
551 func_reg_restore_alloc = reg_save_size;
552
553 // TODO(ts): support larger stack alignments?
554
555 // placeholder for later
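  // The 0x7FFF'FFFF placeholder forces the 7-byte encoding (REX.W + 0x81 /5 +
  // imm32); finish_func() later patches the 32-bit immediate at
  // frame_size_setup_offset + 3 with the final frame size.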
556 frame_size_setup_offset = this->text_writer.offset();
557 ASM(SUB64ri, FE_SP, 0x7FFF'FFFF);
558#ifdef TPDE_ASSERTS
559 assert((this->text_writer.offset() - frame_size_setup_offset) == 7);
560#endif
561
562 if (this->adaptor->cur_is_vararg()) {
563 this->stack.frame_size += 6 * 8 + 8 * 16;
564 reg_save_frame_off = this->stack.frame_size;
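    // Register save area layout (mirroring the SysV va_list ABI): 6 x 8 bytes
    // for the GP argument registers rdi, rsi, rdx, rcx, r8, r9, followed by
    // 8 x 16 bytes for xmm0-xmm7. AL holds the number of vector registers
    // used, so the xmm stores are skipped when it is zero.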
565 auto mem = FE_MEM(FE_BP, 0, FE_NOREG, -(i32)reg_save_frame_off);
566 ASM(MOV64mr, mem, FE_DI);
567 mem.off += 8;
568 ASM(MOV64mr, mem, FE_SI);
569 mem.off += 8;
570 ASM(MOV64mr, mem, FE_DX);
571 mem.off += 8;
572 ASM(MOV64mr, mem, FE_CX);
573 mem.off += 8;
574 ASM(MOV64mr, mem, FE_R8);
575 mem.off += 8;
576 ASM(MOV64mr, mem, FE_R9);
577 auto skip_fp = this->assembler.label_create();
578 ASM(TEST8rr, FE_AX, FE_AX);
579 generate_raw_jump(Jump::je, skip_fp);
580 mem.off += 8;
581 ASM(SSE_MOVDQUmr, mem, FE_XMM0);
582 mem.off += 16;
583 ASM(SSE_MOVDQUmr, mem, FE_XMM1);
584 mem.off += 16;
585 ASM(SSE_MOVDQUmr, mem, FE_XMM2);
586 mem.off += 16;
587 ASM(SSE_MOVDQUmr, mem, FE_XMM3);
588 mem.off += 16;
589 ASM(SSE_MOVDQUmr, mem, FE_XMM4);
590 mem.off += 16;
591 ASM(SSE_MOVDQUmr, mem, FE_XMM5);
592 mem.off += 16;
593 ASM(SSE_MOVDQUmr, mem, FE_XMM6);
594 mem.off += 16;
595 ASM(SSE_MOVDQUmr, mem, FE_XMM7);
596 this->label_place(skip_fp);
597 }
598
599 // Temporarily prevent argument registers from being assigned.
600 assert((cc_info.allocatable_regs & cc_info.arg_regs) == cc_info.arg_regs &&
601 "argument registers must also be allocatable");
602 this->register_file.allocatable &= ~cc_info.arg_regs;
603
604 u32 arg_idx = 0;
605 for (const IRValueRef arg : this->adaptor->cur_args()) {
606 derived()->handle_func_arg(
607 arg_idx, arg, [&](ValuePart &&vp, CCAssignment cca) {
608 cca.bank = vp.bank();
609 cca.size = vp.part_size();
610
611 cc_assigner->assign_arg(cca);
612
613 if (cca.reg.valid()) [[likely]] {
614 vp.set_value_reg(this, cca.reg);
615 // Mark register as allocatable as soon as it is assigned. If the
616 // argument is unused, the register will be freed immediately and
617 // can be used for later stack arguments.
618 this->register_file.allocatable |= u64{1} << cca.reg.id();
619 return;
620 }
621
622 if (vp.is_owned()) {
623 // no need to handle unused arguments
624 return;
625 }
626
627 if (cca.byval) {
628 ValLocalIdx local_idx = this->val_idx(arg);
629 // Ugly hack, we shouldn't create the assignment in the first place.
630 this->assignments.value_ptrs[u32(local_idx)] = nullptr;
631 this->init_variable_ref(local_idx, 0);
632 ValueAssignment *assignment = this->val_assignment(local_idx);
633 assignment->stack_variable = true;
634 assignment->frame_off = 0x10 + cca.stack_off;
635 } else {
636 // TODO(ts): maybe allow negative frame offsets for value
637 // assignments so we can simply reference this?
638 // but this probably doesn't work with multi-part values
639 // since the offsets are different
640 AsmReg dst = vp.alloc_reg(this);
641 this->load_from_stack(dst, 0x10 + cca.stack_off, cca.size);
642 }
643 });
644
645 arg_idx += 1;
646 }
647
648 if (this->adaptor->cur_is_vararg()) [[unlikely]] {
649 // TODO: get this from CCAssigner?
650 auto arg_regs = this->register_file.allocatable & cc_info.arg_regs;
651 u64 gp_regs = arg_regs & this->register_file.bank_regs(Config::GP_BANK);
652 u64 xmm_regs = arg_regs & this->register_file.bank_regs(Config::FP_BANK);
653 this->scalar_arg_count = std::popcount(gp_regs);
654 this->vec_arg_count = std::popcount(xmm_regs);
655 this->var_arg_stack_off = 0x10 + cc_assigner->get_stack_size();
656 }
657
658 this->register_file.allocatable |= cc_info.arg_regs;
659}
660
661template <IRAdaptor Adaptor,
662 typename Derived,
663 template <typename, typename, typename> typename BaseTy,
664 typename Config>
665void CompilerX64<Adaptor, Derived, BaseTy, Config>::finish_func(
666 u32 func_idx) noexcept {
667 // NB: code alignment factor 1, data alignment factor -8.
668 auto fde_off = this->assembler.eh_begin_fde(this->get_personality_sym());
669 // push rbp
670 this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 1);
671 this->assembler.eh_write_inst(dwarf::DW_CFA_def_cfa_offset, 16);
672 this->assembler.eh_write_inst(
673 dwarf::DW_CFA_offset, dwarf::x64::DW_reg_rbp, 2);
674 // mov rbp, rsp
675 this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 3);
676 this->assembler.eh_write_inst(dwarf::DW_CFA_def_cfa_register,
677 dwarf::x64::DW_reg_rbp);
678
679 // Patched below
680 auto fde_prologue_adv_off = this->assembler.eh_writer.size();
681 this->assembler.eh_write_inst(dwarf::DW_CFA_advance_loc, 0);
682
683 auto *write_ptr = this->text_writer.begin_ptr() + func_reg_save_off;
684 auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
685 u64 saved_regs = this->register_file.clobbered & csr;
686 u32 num_saved_regs = 0u;
687 for (auto reg : util::BitSetIterator{saved_regs}) {
688 assert(reg <= AsmReg::R15);
689 write_ptr +=
690 fe64_PUSHr(write_ptr, 0, AsmReg{static_cast<AsmReg::REG>(reg)});
691 ++num_saved_regs;
692
693 // DWARF register ordering is subtly different from the encoding:
694 // x86 is: ax, cx, dx, bx, sp, bp, si, di, r8, ...
695 // DWARF is: ax, dx, cx, bx, si, di, bp, sp, r8, ...
696 static const u8 gpreg_to_dwarf[] = {
697 dwarf::x64::DW_reg_rax,
698 dwarf::x64::DW_reg_rcx,
699 dwarf::x64::DW_reg_rdx,
700 dwarf::x64::DW_reg_rbx,
701 dwarf::x64::DW_reg_rsp,
702 dwarf::x64::DW_reg_rbp,
703 dwarf::x64::DW_reg_rsi,
704 dwarf::x64::DW_reg_rdi,
705 dwarf::x64::DW_reg_r8,
706 dwarf::x64::DW_reg_r9,
707 dwarf::x64::DW_reg_r10,
708 dwarf::x64::DW_reg_r11,
709 dwarf::x64::DW_reg_r12,
710 dwarf::x64::DW_reg_r13,
711 dwarf::x64::DW_reg_r14,
712 dwarf::x64::DW_reg_r15,
713 };
714 u8 dwarf_reg = gpreg_to_dwarf[reg];
715 auto cfa_off = num_saved_regs + 2;
716 this->assembler.eh_write_inst(dwarf::DW_CFA_offset, dwarf_reg, cfa_off);
717 }
718
719 u32 prologue_size =
720 write_ptr - (this->text_writer.begin_ptr() + func_start_off);
721 assert(prologue_size < 0x44);
722 this->assembler.eh_writer.data()[fde_prologue_adv_off] =
723 dwarf::DW_CFA_advance_loc | (prologue_size - 4);
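  // DW_CFA_advance_loc stores its delta in the low 6 bits of the opcode; the
  // prologue_size < 0x44 assertion above guarantees that prologue_size - 4
  // (the two advances already emitted cover the 4 bytes of push rbp and
  // mov rbp, rsp) still fits.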
724
725 // The frame_size contains the reserved frame size so we need to subtract
726 // the stack space we used for the saved registers
727 const auto final_frame_size =
728 util::align_up(this->stack.frame_size, 16) - num_saved_regs * 8;
729 *reinterpret_cast<u32 *>(this->text_writer.begin_ptr() +
730 frame_size_setup_offset + 3) = final_frame_size;
731#ifdef TPDE_ASSERTS
732 FdInstr instr = {};
733 assert(fd_decode(this->text_writer.begin_ptr() + frame_size_setup_offset,
734 7,
735 64,
736 0,
737 &instr) == 7);
738 assert(FD_TYPE(&instr) == FDI_SUB);
739 assert(FD_OP_TYPE(&instr, 0) == FD_OT_REG);
740 assert(FD_OP_TYPE(&instr, 1) == FD_OT_IMM);
741 assert(FD_OP_SIZE(&instr, 0) == 8);
742 assert(FD_OP_SIZE(&instr, 1) == 8);
743 assert(FD_OP_IMM(&instr, 1) == final_frame_size);
744#endif
745
746 // nop out the rest
747 const auto reg_save_end =
748 this->text_writer.begin_ptr() + func_reg_save_off + func_reg_save_alloc;
749 assert(reg_save_end >= write_ptr);
750 const u32 nop_len = reg_save_end - write_ptr;
751 if (nop_len) {
752 fe64_NOP(write_ptr, nop_len);
753 }
754
755 auto func_sym = this->func_syms[func_idx];
756 auto func_sec = this->text_writer.get_sec_ref();
757 if (func_ret_offs.empty()) {
758 // TODO(ts): honor cur_needs_unwind_info
759 auto func_size = this->text_writer.offset() - func_start_off;
760 this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
761 this->assembler.eh_end_fde(fde_off, func_sym);
762 this->assembler.except_encode_func(func_sym);
763 return;
764 }
765
766 auto *text_data = this->text_writer.begin_ptr();
767 u32 first_ret_off = func_ret_offs[0];
768 u32 ret_size = 0;
769 u32 epilogue_size = 7 + 1 + 1 + func_reg_restore_alloc; // add + pop + ret
770 u32 func_end_ret_off = this->text_writer.offset() - epilogue_size;
771 {
772 write_ptr = text_data + first_ret_off;
773 const auto ret_start = write_ptr;
774 if (this->adaptor->cur_has_dynamic_alloca()) {
775 if (num_saved_regs == 0) {
776 write_ptr += fe64_MOV64rr(write_ptr, 0, FE_SP, FE_BP);
777 } else {
778 write_ptr +=
779 fe64_LEA64rm(write_ptr,
780 0,
781 FE_SP,
782 FE_MEM(FE_BP, 0, FE_NOREG, -(i32)num_saved_regs * 8));
783 }
784 } else {
785 write_ptr += fe64_ADD64ri(write_ptr, 0, FE_SP, final_frame_size);
786 }
787 for (auto reg : util::BitSetIterator<true>{saved_regs}) {
788 assert(reg <= AsmReg::R15);
789 write_ptr +=
790 fe64_POPr(write_ptr, 0, AsmReg{static_cast<AsmReg::REG>(reg)});
791 }
792 write_ptr += fe64_POPr(write_ptr, 0, FE_BP);
793 write_ptr += fe64_RET(write_ptr, 0);
794 ret_size = write_ptr - ret_start;
795 assert(ret_size <= epilogue_size && "function epilogue too long");
796
797 // write NOP for better disassembly
798 if (epilogue_size > ret_size) {
799 fe64_NOP(write_ptr, epilogue_size - ret_size);
800 if (first_ret_off == func_end_ret_off) {
801 this->text_writer.cur_ptr() -= epilogue_size - ret_size;
802 }
803 }
804 }
805
806 for (u32 i = 1; i < func_ret_offs.size(); ++i) {
807 std::memcpy(
808 text_data + func_ret_offs[i], text_data + first_ret_off, epilogue_size);
809 if (func_ret_offs[i] == func_end_ret_off) {
810 this->text_writer.cur_ptr() -= epilogue_size - ret_size;
811 }
812 }
813
 814 // Do sym_def at the very end; we shorten the function here again, so only at
 815 // this point do we know the actual size of the function.
816 // TODO(ts): honor cur_needs_unwind_info
817 auto func_size = this->text_writer.offset() - func_start_off;
818 this->assembler.sym_def(func_sym, func_sec, func_start_off, func_size);
819 this->assembler.eh_end_fde(fde_off, func_sym);
820 this->assembler.except_encode_func(func_sym);
821}
822
823template <IRAdaptor Adaptor,
824 typename Derived,
825 template <typename, typename, typename> typename BaseTy,
826 typename Config>
827void CompilerX64<Adaptor, Derived, BaseTy, Config>::reset() noexcept {
828 func_ret_offs.clear();
829 sym_tls_get_addr = {};
830 Base::reset();
831}
832
833template <IRAdaptor Adaptor,
834 typename Derived,
835 template <typename, typename, typename> typename BaseTy,
836 typename Config>
837void CompilerX64<Adaptor, Derived, BaseTy, Config>::gen_func_epilog() noexcept {
838 // epilogue:
839 // if !func_has_dynamic_alloca:
840 // add rsp, #<frame_size>+<largest_call_frame_usage>
841 // else:
842 // lea rsp, [rbp - <size_of_reg_save_area>]
843 // for each saved reg:
844 // pop <reg>
845 // pop rbp
846 // ret
847 //
848 // however, since we will later patch this, we only
849 // reserve the space for now
850
851 func_ret_offs.push_back(this->text_writer.offset());
852
 853 // add rsp, imm32
 854 // and
 855 // lea rsp, [rbp - imm32]
 856 // both take 7 bytes
857 u32 epilogue_size =
858 7 + 1 + 1 +
859 func_reg_restore_alloc; // add/lea + pop + ret + size of reg restore
860
861 this->text_writer.ensure_space(epilogue_size);
862 this->text_writer.cur_ptr() += epilogue_size;
863}
864
865template <IRAdaptor Adaptor,
866 typename Derived,
867 template <typename, typename, typename> typename BaseTy,
868 typename Config>
869void CompilerX64<Adaptor, Derived, BaseTy, Config>::spill_reg(
870 const AsmReg reg, const i32 frame_off, const u32 size) noexcept {
871 this->text_writer.ensure_space(16);
872 assert(frame_off < 0);
873 const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);
874 if (reg.id() <= AsmReg::R15) {
875 switch (size) {
876 case 1: ASMNC(MOV8mr, mem, reg); break;
877 case 2: ASMNC(MOV16mr, mem, reg); break;
878 case 4: ASMNC(MOV32mr, mem, reg); break;
879 case 8: ASMNC(MOV64mr, mem, reg); break;
880 default: TPDE_UNREACHABLE("invalid spill size");
881 }
882 return;
883 }
884
885 switch (size) {
886 case 4: ASMNC(SSE_MOVD_X2Gmr, mem, reg); break;
887 case 8: ASMNC(SSE_MOVQ_X2Gmr, mem, reg); break;
888 case 16: ASMNC(SSE_MOVAPDmr, mem, reg); break;
889 default: TPDE_UNREACHABLE("invalid spill size");
890 }
891}
892
893template <IRAdaptor Adaptor,
894 typename Derived,
895 template <typename, typename, typename> typename BaseTy,
896 typename Config>
897void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_from_stack(
898 const AsmReg dst,
899 const i32 frame_off,
900 const u32 size,
901 const bool sign_extend) noexcept {
902 this->text_writer.ensure_space(16);
903 const auto mem = FE_MEM(FE_BP, 0, FE_NOREG, frame_off);
904
905 if (dst.id() <= AsmReg::R15) {
906 if (!sign_extend) {
907 switch (size) {
908 case 1: ASMNC(MOVZXr32m8, dst, mem); break;
909 case 2: ASMNC(MOVZXr32m16, dst, mem); break;
910 case 4: ASMNC(MOV32rm, dst, mem); break;
911 case 8: ASMNC(MOV64rm, dst, mem); break;
912 default: TPDE_UNREACHABLE("invalid spill size");
913 }
914 } else {
915 switch (size) {
916 case 1: ASMNC(MOVSXr64m8, dst, mem); break;
917 case 2: ASMNC(MOVSXr64m16, dst, mem); break;
918 case 4: ASMNC(MOVSXr64m32, dst, mem); break;
919 case 8: ASMNC(MOV64rm, dst, mem); break;
920 default: TPDE_UNREACHABLE("invalid spill size");
921 }
922 }
923 return;
924 }
925
926 assert(!sign_extend);
927
928 switch (size) {
929 case 4: ASMNC(SSE_MOVD_G2Xrm, dst, mem); break;
930 case 8: ASMNC(SSE_MOVQ_G2Xrm, dst, mem); break;
931 case 16: ASMNC(SSE_MOVAPDrm, dst, mem); break;
932 default: TPDE_UNREACHABLE("invalid spill size");
933 }
934}
935
936template <IRAdaptor Adaptor,
937 typename Derived,
938 template <typename, typename, typename> typename BaseTy,
939 typename Config>
940void CompilerX64<Adaptor, Derived, BaseTy, Config>::load_address_of_stack_var(
941 const AsmReg dst, const AssignmentPartRef ap) noexcept {
942 ASM(LEA64rm, dst, FE_MEM(FE_BP, 0, FE_NOREG, ap.variable_stack_off()));
943}
944
945template <IRAdaptor Adaptor,
946 typename Derived,
947 template <typename, typename, typename> typename BaseTy,
948 typename Config>
949void CompilerX64<Adaptor, Derived, BaseTy, Config>::mov(
950 const AsmReg dst, const AsmReg src, const u32 size) noexcept {
951 assert(dst.valid());
952 assert(src.valid());
953 if (dst.id() <= AsmReg::R15 && src.id() <= AsmReg::R15) {
954 if (size > 4) {
955 ASM(MOV64rr, dst, src);
956 } else {
957 ASM(MOV32rr, dst, src);
958 }
959 } else if (dst.id() >= AsmReg::XMM0 && src.id() >= AsmReg::XMM0) {
960 if (size <= 16) {
961 if (dst.id() > AsmReg::XMM15 || src.id() > AsmReg::XMM15) {
962 assert(has_cpu_feats(CPU_AVX512F));
963 ASM(VMOVAPD128rr, dst, src);
964 } else {
965 ASM(SSE_MOVAPDrr, dst, src);
966 }
967 } else if (size <= 32) {
968 assert(has_cpu_feats(CPU_AVX));
969 assert((dst.id() <= AsmReg::XMM15 && src.id() <= AsmReg::XMM15) ||
970 has_cpu_feats(CPU_AVX512F));
971 ASM(VMOVAPD256rr, dst, src);
972 } else {
973 assert(size <= 64);
974 assert(has_cpu_feats(CPU_AVX512F));
975 ASM(VMOVAPD512rr, dst, src);
976 }
977 } else if (dst.id() <= AsmReg::R15) {
978 // gp<-xmm
979 assert(src.id() >= AsmReg::XMM0);
980 assert(size <= 8);
981 if (src.id() > AsmReg::XMM15) {
982 assert(has_cpu_feats(CPU_AVX512F));
983 if (size <= 4) {
984 ASM(VMOVD_X2Grr, dst, src);
985 } else {
986 ASM(VMOVQ_X2Grr, dst, src);
987 }
988 } else {
989 if (size <= 4) {
990 ASM(SSE_MOVD_X2Grr, dst, src);
991 } else {
992 ASM(SSE_MOVQ_X2Grr, dst, src);
993 }
994 }
995 } else {
996 // xmm<-gp
997 assert(src.id() <= AsmReg::R15);
998 assert(dst.id() >= AsmReg::XMM0);
999 assert(size <= 8);
1000 if (dst.id() > AsmReg::XMM15) {
1001 assert(has_cpu_feats(CPU_AVX512F));
1002 if (size <= 4) {
1003 ASM(VMOVD_G2Xrr, dst, src);
1004 } else {
1005 ASM(VMOVQ_G2Xrr, dst, src);
1006 }
1007 } else {
1008 if (size <= 4) {
1009 ASM(SSE_MOVD_G2Xrr, dst, src);
1010 } else {
1011 ASM(SSE_MOVQ_G2Xrr, dst, src);
1012 }
1013 }
1014 }
1015}
1016
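// Lowering sketch for the address expressions handled below: an expression
// base + index*scale + disp is materialized into a single register, preferring
// one LEA (e.g. lea dst, [base + 4*index + disp]) when the scale is 1/2/4/8
// and the displacement fits into 32 bits, and falling back to explicit
// shl/imul/add sequences otherwise.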
1017template <IRAdaptor Adaptor,
1018 typename Derived,
1019 template <typename, typename, typename> typename BaseTy,
1020 typename Config>
1021AsmReg CompilerX64<Adaptor, Derived, BaseTy, Config>::gval_expr_as_reg(
1022 GenericValuePart &gv) noexcept {
1023 auto &expr = std::get<typename GenericValuePart::Expr>(gv.state);
1024
1025 ScratchReg scratch{derived()};
1026 bool disp32 = i32(expr.disp) == expr.disp;
1027 AsmReg base = expr.has_base() ? expr.base_reg() : AsmReg::make_invalid();
1028 AsmReg idx = expr.has_index() ? expr.index_reg() : AsmReg::make_invalid();
1029 if (std::holds_alternative<ScratchReg>(expr.base)) {
1030 scratch = std::move(std::get<ScratchReg>(expr.base));
1031 } else if (std::holds_alternative<ScratchReg>(expr.index)) {
1032 scratch = std::move(std::get<ScratchReg>(expr.index));
1033 } else {
1034 (void)scratch.alloc_gp();
1035 }
1036 auto dst = scratch.cur_reg();
1037 if (idx.valid()) {
1038 if ((expr.scale & (expr.scale - 1)) == 0 && expr.scale < 16) {
1039 u8 sc = expr.scale;
1040 if (base.valid() && disp32) {
1041 ASM(LEA64rm, dst, FE_MEM(base, sc, idx, i32(expr.disp)));
1042 expr.disp = 0;
1043 } else if (base.valid()) {
1044 ASM(LEA64rm, dst, FE_MEM(base, sc, idx, 0));
1045 } else if (disp32) {
1046 ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, i32(expr.disp)));
1047 } else {
1048 ASM(LEA64rm, dst, FE_MEM(FE_NOREG, sc, idx, 0));
1049 }
1050 } else {
1051 u64 scale = expr.scale;
1052 if (base == idx) {
1053 base = AsmReg::make_invalid();
1054 scale += 1;
1055 }
1056
1057 ScratchReg idx_scratch{derived()};
1058 // We need a register to compute the scaled index.
1059 AsmReg idx_tmp = dst;
1060 if (dst == base && std::holds_alternative<ScratchReg>(expr.index)) {
1061 // We can't use dst, it'd clobber base, so use the other
1062 // register we currently own.
1063 idx_tmp = std::get<ScratchReg>(expr.index).cur_reg();
1064 } else if (dst == base) {
1065 idx_tmp = idx_scratch.alloc_gp();
1066 }
1067
1068 if ((scale & (scale - 1)) == 0) {
1069 if (idx_tmp != idx) {
1070 ASM(MOV64rr, idx_tmp, idx);
1071 }
1072 ASM(SHL64ri, idx_tmp, util::cnt_tz(scale));
1073 } else {
1074 if (i32(scale) == i64(scale)) {
1075 ASM(IMUL64rri, idx_tmp, idx, scale);
1076 } else {
1077 ScratchReg scratch2{derived()};
1078 auto tmp2 = scratch2.alloc_gp();
1079 ASM(MOV64ri, tmp2, scale);
1080 if (idx_tmp != idx) {
1081 ASM(MOV64rr, idx_tmp, idx);
1082 }
1083 ASM(IMUL64rr, idx_tmp, tmp2);
1084 }
1085 }
1086 if (base.valid()) {
1087 if (disp32 || (idx_tmp != dst && base != dst)) {
1088 ASM(LEA64rm, dst, FE_MEM(base, 1, idx_tmp, i32(expr.disp)));
1089 expr.disp = 0;
1090 } else if (dst == base) {
1091 ASM(ADD64rr, dst, idx_tmp);
1092 } else {
1093 ASM(ADD64rr, dst, base);
1094 }
1095 }
1096 }
1097 } else if (base.valid()) {
1098 if (expr.disp && disp32) {
1099 ASM(LEA64rm, dst, FE_MEM(base, 0, FE_NOREG, i32(expr.disp)));
1100 expr.disp = 0;
1101 } else if (dst != base) {
1102 ASM(MOV64rr, dst, base);
1103 }
1104 }
1105 if (expr.disp) {
1106 ScratchReg scratch2{derived()};
1107 auto tmp2 = scratch2.alloc_gp();
1108 ASM(MOV64ri, tmp2, expr.disp);
1109 ASM(ADD64rr, dst, tmp2);
1110 }
1111 gv.state = std::move(scratch);
1112 return dst;
1113}
1114
1115template <IRAdaptor Adaptor,
1116 typename Derived,
1117 template <typename, typename, typename> typename BaseTy,
1118 typename Config>
1119void CompilerX64<Adaptor, Derived, BaseTy, Config>::materialize_constant(
1120 const u64 *data, const RegBank bank, const u32 size, AsmReg dst) noexcept {
1121 const auto const_u64 = data[0];
1122 if (bank == Config::GP_BANK) {
1123 assert(size <= 8);
1124 if (const_u64 == 0) {
1125 // note: cannot use XOR here since this might be called in-between
1126 // instructions that rely on the flags being preserved
1127 // ASM(XOR32rr, dst, dst);
1128 ASM(MOV32ri, dst, 0);
1129 return;
1130 }
1131
1132 if (size <= 4) {
1133 ASM(MOV32ri, dst, const_u64);
1134 } else {
1135 ASM(MOV64ri, dst, const_u64);
1136 }
1137 return;
1138 }
1139
1140 assert(bank == Config::FP_BANK);
1141 const auto high_u64 = size <= 8 ? 0 : data[1];
1142 if (const_u64 == 0 && (size <= 8 || (high_u64 == 0 && size <= 16))) {
1143 if (has_cpu_feats(CPU_AVX)) {
1144 ASM(VPXOR128rrr, dst, dst, dst);
1145 } else {
1146 ASM(SSE_PXORrr, dst, dst);
1147 }
1148 return;
1149 }
1150 const u64 ones = -u64{1};
1151 if (const_u64 == ones && (size <= 8 || (high_u64 == ones && size <= 16))) {
1152 if (has_cpu_feats(CPU_AVX)) {
1153 ASM(VPCMPEQB128rrr, dst, dst, dst);
1154 } else {
1155 ASM(SSE_PCMPEQBrr, dst, dst);
1156 }
1157 return;
1158 }
1159
1160 if (size <= 8) {
1161 // We must not evict registers here (might be used within branching code),
1162 // so only use free registers and load from memory otherwise.
1163 AsmReg tmp =
1164 this->register_file.find_first_free_excluding(Config::GP_BANK, 0);
1165 if (tmp.valid()) {
1166 this->register_file.mark_clobbered(tmp);
1167 materialize_constant(data, Config::GP_BANK, size, tmp);
1168 if (size <= 4) {
1169 if (has_cpu_feats(CPU_AVX)) {
1170 ASM(VMOVD_G2Xrr, dst, tmp);
1171 } else {
1172 ASM(SSE_MOVD_G2Xrr, dst, tmp);
1173 }
1174 } else {
1175 if (has_cpu_feats(CPU_AVX)) {
1176 ASM(VMOVQ_G2Xrr, dst, tmp);
1177 } else {
1178 ASM(SSE_MOVQ_G2Xrr, dst, tmp);
1179 }
1180 }
1181 return;
1182 }
1183 }
1184
 1185 // TODO: round to next power of two, but at least 4 bytes
1186 // We store constants in 8-byte units.
1187 auto alloc_size = util::align_up(size, 8);
1188 std::span<const u8> raw_data{reinterpret_cast<const u8 *>(data), alloc_size};
1189 // TODO: deduplicate/pool constants?
1190 auto rodata = this->assembler.get_data_section(true, false);
1191 auto sym = this->assembler.sym_def_data(
1192 rodata, "", raw_data, alloc_size, Assembler::SymBinding::LOCAL);
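  // The -1 displacement in the rip-relative loads below is only a placeholder;
  // it is fixed up by the R_X86_64_PC32 relocation emitted at the end.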
1193 if (size <= 4) {
1194 if (has_cpu_feats(CPU_AVX)) {
1195 ASM(VMOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1196 } else {
1197 ASM(SSE_MOVSSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1198 }
1199 } else if (size <= 8) {
1200 if (has_cpu_feats(CPU_AVX)) {
1201 ASM(VMOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1202 } else {
1203 ASM(SSE_MOVSDrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1204 }
1205 } else if (size <= 16) {
1206 if (has_cpu_feats(CPU_AVX)) {
1207 ASM(VMOVAPS128rm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1208 } else {
1209 ASM(SSE_MOVAPSrm, dst, FE_MEM(FE_IP, 0, FE_NOREG, -1));
1210 }
1211 } else {
1212 // TODO: implement for AVX/AVX-512.
1213 TPDE_FATAL("unable to materialize constant");
1214 }
1215
1216 this->reloc_text(sym, R_X86_64_PC32, this->text_writer.offset() - 4, -4);
1217}
1218
1219template <IRAdaptor Adaptor,
1220 typename Derived,
1221 template <typename, typename, typename> typename BaseTy,
1222 typename Config>
1223AsmReg
1224 CompilerX64<Adaptor, Derived, BaseTy, Config>::select_fixed_assignment_reg(
1225 const RegBank bank, IRValueRef) noexcept {
1226 assert(bank.id() <= Config::NUM_BANKS);
1227 auto reg_mask = this->register_file.bank_regs(bank);
1228 reg_mask &= ~fixed_assignment_nonallocatable_mask;
1229
1230 const auto find_possible_regs = [this,
1231 reg_mask](const u64 preferred_regs) -> u64 {
1232 // try to first get an unused reg, otherwise an unfixed reg
1233 u64 free_regs = this->register_file.allocatable & ~this->register_file.used;
1234 u64 possible_regs = free_regs & preferred_regs & reg_mask;
1235 if (possible_regs == 0) {
1236 possible_regs = (this->register_file.used & ~this->register_file.fixed) &
1237 preferred_regs & reg_mask;
1238 }
1239 return possible_regs;
1240 };
1241
1242 u64 possible_regs;
1243 auto csr = derived()->cur_cc_assigner()->get_ccinfo().callee_saved_regs;
1244 if (derived()->cur_func_may_emit_calls()) {
 1245 // we can only allocate fixed assignments from the callee-saved regs
1246 possible_regs = find_possible_regs(csr);
1247 } else {
1248 // try allocating any non-callee saved register first, except the result
1249 // registers
1250 possible_regs = find_possible_regs(~csr);
1251 if (possible_regs == 0) {
1252 // otherwise fallback to callee-saved regs
1253 possible_regs = find_possible_regs(csr);
1254 }
1255 }
1256
1257 if (possible_regs == 0) {
1258 return AsmReg::make_invalid();
1259 }
1260
1261 // try to first get an unused reg, otherwise an unfixed reg
1262 if ((possible_regs & ~this->register_file.used) != 0) {
1263 return AsmReg{util::cnt_tz(possible_regs & ~this->register_file.used)};
1264 }
1265
1266 for (const auto reg_id : util::BitSetIterator<>{possible_regs}) {
1267 const auto reg = AsmReg{reg_id};
1268
1269 if (this->register_file.is_fixed(reg)) {
1270 continue;
1271 }
1272
1273 const auto local_idx = this->register_file.reg_local_idx(reg);
1274 const auto part = this->register_file.reg_part(reg);
1275
1276 if (local_idx == Base::INVALID_VAL_LOCAL_IDX) {
1277 continue;
1278 }
1279 auto *assignment = this->val_assignment(local_idx);
1280 auto ap = AssignmentPartRef{assignment, part};
1281 if (ap.modified()) {
1282 continue;
1283 }
1284
1285 return reg;
1286 }
1287
1288 return AsmReg::make_invalid();
1289}
1290
1291template <IRAdaptor Adaptor,
1292 typename Derived,
1293 template <typename, typename, typename> typename BaseTy,
1294 typename Config>
1295typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
1296 CompilerX64<Adaptor, Derived, BaseTy, Config>::invert_jump(
1297 Jump jmp) noexcept {
1298 switch (jmp) {
1299 case Jump::ja: return Jump::jbe;
1300 case Jump::jae: return Jump::jb;
1301 case Jump::jb: return Jump::jae;
1302 case Jump::jbe: return Jump::ja;
1303 case Jump::je: return Jump::jne;
1304 case Jump::jg: return Jump::jle;
1305 case Jump::jge: return Jump::jl;
1306 case Jump::jl: return Jump::jge;
1307 case Jump::jle: return Jump::jg;
1308 case Jump::jmp: return Jump::jmp;
1309 case Jump::jne: return Jump::je;
1310 case Jump::jno: return Jump::jo;
1311 case Jump::jo: return Jump::jno;
1312 case Jump::js: return Jump::jns;
1313 case Jump::jns: return Jump::js;
1314 case Jump::jp: return Jump::jnp;
1315 case Jump::jnp: return Jump::jp;
1316 default: TPDE_UNREACHABLE("invalid jump condition");
1317 }
1318}
1319
1320template <IRAdaptor Adaptor,
1321 typename Derived,
1322 template <typename, typename, typename> class BaseTy,
1323 typename Config>
1324typename CompilerX64<Adaptor, Derived, BaseTy, Config>::Jump
1325 CompilerX64<Adaptor, Derived, BaseTy, Config>::swap_jump(
1326 Jump jmp) noexcept {
1327 switch (jmp) {
1328 case Jump::ja: return Jump::jb;
1329 case Jump::jae: return Jump::jbe;
1330 case Jump::jb: return Jump::ja;
1331 case Jump::jbe: return Jump::jae;
1332 case Jump::je: return Jump::je;
1333 case Jump::jg: return Jump::jl;
1334 case Jump::jge: return Jump::jle;
1335 case Jump::jl: return Jump::jg;
1336 case Jump::jle: return Jump::jge;
1337 case Jump::jmp: return Jump::jmp;
1338 case Jump::jne: return Jump::jne;
1339 case Jump::jno: return Jump::jno;
1340 case Jump::jo: return Jump::jo;
1341 case Jump::js: return Jump::js;
1342 case Jump::jns: return Jump::jns;
1343 case Jump::jp: return Jump::jp;
1344 case Jump::jnp: return Jump::jnp;
1345 default: TPDE_UNREACHABLE("invalid jump condition");
1346 }
1347}
1348
1349template <IRAdaptor Adaptor,
1350 typename Derived,
1351 template <typename, typename, typename> typename BaseTy,
1352 typename Config>
1353void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_branch_to_block(
1354 const Jump jmp,
1355 IRBlockRef target,
1356 const bool needs_split,
1357 const bool last_inst) noexcept {
1358 const auto target_idx = this->analyzer.block_idx(target);
1359 if (!needs_split || jmp == Jump::jmp) {
1360 this->derived()->move_to_phi_nodes(target_idx);
1361
1362 if (!last_inst || this->analyzer.block_idx(target) != this->next_block()) {
1363 generate_raw_jump(jmp, this->block_labels[(u32)target_idx]);
1364 }
1365 } else {
1366 auto tmp_label = this->assembler.label_create();
1367 generate_raw_jump(invert_jump(jmp), tmp_label);
1368
1369 this->derived()->move_to_phi_nodes(target_idx);
1370
1371 generate_raw_jump(Jump::jmp, this->block_labels[(u32)target_idx]);
1372
1373 this->label_place(tmp_label);
1374 }
1375}
1376
1377template <IRAdaptor Adaptor,
1378 typename Derived,
1379 template <typename, typename, typename> typename BaseTy,
1380 typename Config>
1381void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_jump(
1382 Jump jmp, Assembler::Label target_label) noexcept {
1383 if (this->assembler.label_is_pending(target_label)) {
1384 this->text_writer.ensure_space(6);
1385 auto *target = this->text_writer.cur_ptr();
1386 switch (jmp) {
1387 case Jump::ja: ASMNCF(JA, FE_JMPL, target); break;
1388 case Jump::jae: ASMNCF(JNC, FE_JMPL, target); break;
1389 case Jump::jb: ASMNCF(JC, FE_JMPL, target); break;
1390 case Jump::jbe: ASMNCF(JBE, FE_JMPL, target); break;
1391 case Jump::je: ASMNCF(JZ, FE_JMPL, target); break;
1392 case Jump::jg: ASMNCF(JG, FE_JMPL, target); break;
1393 case Jump::jge: ASMNCF(JGE, FE_JMPL, target); break;
1394 case Jump::jl: ASMNCF(JL, FE_JMPL, target); break;
1395 case Jump::jle: ASMNCF(JLE, FE_JMPL, target); break;
1396 case Jump::jmp: ASMNCF(JMP, FE_JMPL, target); break;
1397 case Jump::jne: ASMNCF(JNZ, FE_JMPL, target); break;
1398 case Jump::jno: ASMNCF(JNO, FE_JMPL, target); break;
1399 case Jump::jo: ASMNCF(JO, FE_JMPL, target); break;
1400 case Jump::js: ASMNCF(JS, FE_JMPL, target); break;
1401 case Jump::jns: ASMNCF(JNS, FE_JMPL, target); break;
1402 case Jump::jp: ASMNCF(JP, FE_JMPL, target); break;
1403 case Jump::jnp: ASMNCF(JNP, FE_JMPL, target); break;
1404 }
1405
1406 this->assembler.add_unresolved_entry(
1407 target_label,
1408 this->text_writer.get_sec_ref(),
1409 this->text_writer.offset() - 4,
1410 Assembler::UnresolvedEntryKind::JMP_OR_MEM_DISP);
1411 } else {
1412 this->text_writer.ensure_space(6);
1413 auto *target = this->text_writer.begin_ptr() +
1414 this->assembler.label_offset(target_label);
1415 switch (jmp) {
1416 case Jump::ja: ASMNC(JA, target); break;
1417 case Jump::jae: ASMNC(JNC, target); break;
1418 case Jump::jb: ASMNC(JC, target); break;
1419 case Jump::jbe: ASMNC(JBE, target); break;
1420 case Jump::je: ASMNC(JZ, target); break;
1421 case Jump::jg: ASMNC(JG, target); break;
1422 case Jump::jge: ASMNC(JGE, target); break;
1423 case Jump::jl: ASMNC(JL, target); break;
1424 case Jump::jle: ASMNC(JLE, target); break;
1425 case Jump::jmp: ASMNC(JMP, target); break;
1426 case Jump::jne: ASMNC(JNZ, target); break;
1427 case Jump::jno: ASMNC(JNO, target); break;
1428 case Jump::jo: ASMNC(JO, target); break;
1429 case Jump::js: ASMNC(JS, target); break;
1430 case Jump::jns: ASMNC(JNS, target); break;
1431 case Jump::jp: ASMNC(JP, target); break;
1432 case Jump::jnp: ASMNC(JNP, target); break;
1433 }
1434 }
1435}
1436
1437template <IRAdaptor Adaptor,
1438 typename Derived,
1439 template <typename, typename, typename> class BaseTy,
1440 typename Config>
1441void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_set(
1442 Jump jmp, AsmReg dst) noexcept {
1443 ASM(MOV32ri, dst, 0);
1444 switch (jmp) {
1445 case Jump::ja: ASM(SETA8r, dst); break;
1446 case Jump::jae: ASM(SETNC8r, dst); break;
1447 case Jump::jb: ASM(SETC8r, dst); break;
1448 case Jump::jbe: ASM(SETBE8r, dst); break;
1449 case Jump::je: ASM(SETZ8r, dst); break;
1450 case Jump::jg: ASM(SETG8r, dst); break;
1451 case Jump::jge: ASM(SETGE8r, dst); break;
1452 case Jump::jl: ASM(SETL8r, dst); break;
1453 case Jump::jle: ASM(SETLE8r, dst); break;
1454 case Jump::jmp: ASM(MOV32ri, dst, 1); break;
1455 case Jump::jne: ASM(SETNZ8r, dst); break;
1456 case Jump::jno: ASM(SETNO8r, dst); break;
1457 case Jump::jo: ASM(SETO8r, dst); break;
1458 case Jump::js: ASM(SETS8r, dst); break;
1459 case Jump::jns: ASM(SETNS8r, dst); break;
1460 case Jump::jp: ASM(SETP8r, dst); break;
1461 case Jump::jnp: ASM(SETNP8r, dst); break;
1462 }
1463}
1464
1465template <IRAdaptor Adaptor,
1466 typename Derived,
1467 template <typename, typename, typename> class BaseTy,
1468 typename Config>
1469void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_mask(
1470 Jump jmp, AsmReg dst) noexcept {
 1471 // TODO: use sbb dst,dst / adc dst,-1 for carry flag
1472 generate_raw_set(jmp, dst);
1473 ASM(NEG64r, dst);
1474}
1475
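// Example: generate_raw_intext(dst, src, /*sign=*/true, /*from=*/13, /*to=*/32)
// sign-extends the low 13 bits of src into a 32-bit dst via shl dst, 19 and
// sar dst, 19, while from = 8/16 use the dedicated movsx/movzx forms.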
1476template <IRAdaptor Adaptor,
1477 typename Derived,
1478 template <typename, typename, typename> class BaseTy,
1479 typename Config>
1480void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_raw_intext(
1481 AsmReg dst, AsmReg src, bool sign, u32 from, u32 to) noexcept {
1482 assert(from < to && to <= 64);
1483 if (!sign) {
1484 switch (from) {
1485 case 8: ASM(MOVZXr32r8, dst, src); break;
1486 case 16: ASM(MOVZXr32r16, dst, src); break;
1487 case 32: ASM(MOV32rr, dst, src); break;
1488 default:
1489 if (from < 32) {
1490 if (dst != src) {
1491 ASM(MOV32rr, dst, src);
1492 }
1493 ASM(AND32ri, dst, (uint32_t{1} << from) - 1);
1494 } else if (dst != src) {
1495 ASM(MOV64ri, dst, (uint64_t{1} << from) - 1);
1496 ASM(AND64rr, dst, src);
1497 } else {
1498 ScratchReg tmp{this};
1499 AsmReg tmp_reg = tmp.alloc_gp();
1500 ASM(MOV64ri, tmp_reg, (uint64_t{1} << from) - 1);
1501 ASM(AND64rr, dst, tmp_reg);
1502 }
1503 }
1504 } else if (to <= 32) {
1505 switch (from) {
1506 case 8: ASM(MOVSXr32r8, dst, src); break;
1507 case 16: ASM(MOVSXr32r16, dst, src); break;
1508 default:
1509 if (dst != src) {
1510 ASM(MOV32rr, dst, src);
1511 }
1512 ASM(SHL32ri, dst, 32 - from);
1513 ASM(SAR32ri, dst, 32 - from);
1514 }
1515 } else {
1516 switch (from) {
1517 case 8: ASM(MOVSXr64r8, dst, src); break;
1518 case 16: ASM(MOVSXr64r16, dst, src); break;
1519 case 32: ASM(MOVSXr64r32, dst, src); break;
1520 default:
1521 if (dst != src) {
1522 ASM(MOV64rr, dst, src);
1523 }
1524 ASM(SHL64ri, dst, 64 - from);
1525 ASM(SAR64ri, dst, 64 - from);
1526 }
1527 }
1528}
1529
1530template <IRAdaptor Adaptor,
1531 typename Derived,
1532 template <typename, typename, typename> class BaseTy,
1533 typename Config>
1534void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::
1535 set_stack_used() noexcept {
1536 if (stack_adjust_off == 0) {
1537 stack_adjust_off = this->compiler.text_writer.offset();
1538 // Always use 32-bit immediate
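    // (0x100 is only a placeholder; call_impl() patches the imm32 at
    // stack_adjust_off + 3 with the final, 16-byte-aligned stack size.)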
1539 ASMC(&this->compiler, SUB64ri, FE_SP, 0x100);
1540 assert(this->compiler.text_writer.offset() == stack_adjust_off + 7);
1541 }
1542}
1543
1544template <IRAdaptor Adaptor,
1545 typename Derived,
1546 template <typename, typename, typename> class BaseTy,
1547 typename Config>
1548void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_byval(
1549 ValuePart &vp, CCAssignment &cca) noexcept {
1550 AsmReg ptr = vp.load_to_reg(&this->compiler);
1551 ScratchReg scratch{&this->compiler};
1552 AsmReg tmp = scratch.alloc_gp();
1553
1554 auto size = cca.byval_size;
1555 set_stack_used();
1556 i32 off = 0;
1557 while (size >= 8) {
1558 ASMC(&this->compiler, MOV64rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1559 ASMC(&this->compiler,
1560 MOV64mr,
1561 FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
1562 tmp);
1563 off += 8;
1564 size -= 8;
1565 }
1566 if (size >= 4) {
1567 ASMC(&this->compiler, MOV32rm, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1568 ASMC(&this->compiler,
1569 MOV32mr,
1570 FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
1571 tmp);
1572 off += 4;
1573 size -= 4;
1574 }
1575 if (size >= 2) {
1576 ASMC(&this->compiler, MOVZXr32m16, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1577 ASMC(&this->compiler,
1578 MOV16mr,
1579 FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
1580 tmp);
1581 off += 2;
1582 size -= 2;
1583 }
1584 if (size >= 1) {
1585 ASMC(&this->compiler, MOVZXr32m8, tmp, FE_MEM(ptr, 0, FE_NOREG, off));
1586 ASMC(&this->compiler,
1587 MOV8mr,
1588 FE_MEM(FE_SP, 0, FE_NOREG, (i32)(cca.stack_off + off)),
1589 tmp);
1590 }
1591}
1592
1593template <IRAdaptor Adaptor,
1594 typename Derived,
1595 template <typename, typename, typename> class BaseTy,
1596 typename Config>
1597void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::add_arg_stack(
1598 ValuePart &vp, CCAssignment &cca) noexcept {
1599 set_stack_used();
1600
1601 auto reg = vp.load_to_reg(&this->compiler);
1602 if (this->compiler.register_file.reg_bank(reg) == Config::GP_BANK) {
1603 switch (cca.size) {
1604 case 1:
1605 ASMC(&this->compiler,
1606 MOV8mr,
1607 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1608 reg);
1609 break;
1610 case 2:
1611 ASMC(&this->compiler,
1612 MOV16mr,
1613 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1614 reg);
1615 break;
1616 case 4:
1617 ASMC(&this->compiler,
1618 MOV32mr,
1619 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1620 reg);
1621 break;
1622 case 8:
1623 ASMC(&this->compiler,
1624 MOV64mr,
1625 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1626 reg);
1627 break;
1628 default: TPDE_UNREACHABLE("invalid GP reg size");
1629 }
1630 } else {
1631 assert(this->compiler.register_file.reg_bank(reg) == Config::FP_BANK);
1632 switch (cca.size) {
1633 case 4:
1634 ASMC(&this->compiler,
1635 SSE_MOVSSmr,
1636 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1637 reg);
1638 break;
1639 case 8:
1640 ASMC(&this->compiler,
1641 SSE_MOVSDmr,
1642 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1643 reg);
1644 break;
1645 case 16:
1646 ASMC(&this->compiler,
1647 SSE_MOVDQAmr,
1648 FE_MEM(FE_SP, 0, FE_NOREG, i32(cca.stack_off)),
1649 reg);
1650 break;
 1651 default: TPDE_UNREACHABLE("invalid FP reg size");
1652 }
1653 }
1654}
1655
1656template <IRAdaptor Adaptor,
1657 typename Derived,
1658 template <typename, typename, typename> class BaseTy,
1659 typename Config>
1660void CompilerX64<Adaptor, Derived, BaseTy, Config>::CallBuilder::call_impl(
1661 std::variant<typename Assembler::SymRef, ValuePart> &&target) noexcept {
1662 if (this->assigner.is_vararg()) {
1663 if (this->compiler.register_file.is_used(Reg{AsmReg::AX})) {
1664 this->compiler.evict_reg(Reg{AsmReg::AX});
1665 }
1666 Reg next_xmm = this->compiler.register_file.find_first_free_excluding(
1667 Config::FP_BANK, 0);
1668 unsigned xmm_cnt = 8;
1669 if (next_xmm.valid() && next_xmm.id() - AsmReg::XMM0 < 8) {
1670 xmm_cnt = next_xmm.id() - AsmReg::XMM0;
1671 }
1672 ASMC(&this->compiler, MOV32ri, FE_AX, xmm_cnt);
1673 }
1674
1675 u32 sub = 0;
1676 if (stack_adjust_off != 0) {
1677 auto *inst_ptr = this->compiler.text_writer.begin_ptr() + stack_adjust_off;
1678 sub = util::align_up(this->assigner.get_stack_size(), 0x10);
1679 memcpy(inst_ptr + 3, &sub, sizeof(u32));
1680 } else {
1681 assert(this->assigner.get_stack_size() == 0);
1682 }
1683
1684 if (auto *sym = std::get_if<typename Assembler::SymRef>(&target)) {
1685 this->compiler.text_writer.ensure_space(16);
1686 ASMC(&this->compiler, CALL, this->compiler.text_writer.cur_ptr());
1687 this->compiler.reloc_text(
1688 *sym, R_X86_64_PLT32, this->compiler.text_writer.offset() - 4, -4);
1689 } else {
1690 ValuePart &tvp = std::get<ValuePart>(target);
1691 if (AsmReg reg = tvp.cur_reg_unlocked(); reg.valid()) {
1692 ASMC(&this->compiler, CALLr, reg);
1693 } else if (tvp.has_assignment() && tvp.assignment().stack_valid()) {
1694 auto off = tvp.assignment().frame_off();
1695 ASMC(&this->compiler, CALLm, FE_MEM(FE_BP, 0, FE_NOREG, off));
1696 } else {
1697 assert(!this->compiler.register_file.is_used(Reg{AsmReg::R10}));
1698 AsmReg reg = tvp.reload_into_specific_fixed(&this->compiler, AsmReg::R10);
1699 ASMC(&this->compiler, CALLr, reg);
1700 }
1701 tvp.reset(&this->compiler);
1702 }
1703
1704 if (stack_adjust_off != 0) {
1705 ASMC(&this->compiler, ADD64ri, FE_SP, sub);
1706 }
1707}
1708
1709template <IRAdaptor Adaptor,
1710 typename Derived,
1711 template <typename, typename, typename> typename BaseTy,
1712 typename Config>
1713void CompilerX64<Adaptor, Derived, BaseTy, Config>::generate_call(
1714 std::variant<Assembler::SymRef, ValuePart> &&target,
1715 std::span<CallArg> arguments,
1716 typename Base::ValueRef *result,
1717 const bool variable_args) {
1718 CCAssignerSysV assigner{variable_args};
1719 CallBuilder cb{*derived(), assigner};
1720 for (auto &arg : arguments) {
1721 cb.add_arg(std::move(arg));
1722 }
1723 cb.call(std::move(target));
1724 if (result) {
1725 cb.add_ret(*result);
1726 }
1727}
1728
1729template <IRAdaptor Adaptor,
1730 typename Derived,
1731 template <typename, typename, typename> typename BaseTy,
1732 typename Config>
1733typename CompilerX64<Adaptor, Derived, BaseTy, Config>::ScratchReg
1734 CompilerX64<Adaptor, Derived, BaseTy, Config>::tls_get_addr(
1735 Assembler::SymRef sym, TLSModel model) noexcept {
1736 switch (model) {
1737 default: // TODO: implement optimized access for non-gd-model
1738 case TLSModel::GlobalDynamic: {
1739 // Generate function call to __tls_get_addr; on x86-64, this takes a single
1740 // parameter in rdi.
1741 auto csr = CCAssignerSysV::Info.callee_saved_regs;
1742 for (auto reg : util::BitSetIterator<>{this->register_file.used & ~csr}) {
1743 this->evict_reg(Reg{reg});
1744 }
1745 ScratchReg arg{this};
1746 AsmReg arg_reg = arg.alloc_specific(AsmReg::DI);
1747
1748 // Call sequence with extra prefixes for linker relaxation. Code sequence
1749 // taken from "ELF Handling For Thread-Local Storage".
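    // The emitted bytes follow the canonical global-dynamic form
    //   .byte 0x66; leaq sym@tlsgd(%rip), %rdi
    //   .word 0x6666; rex64; call __tls_get_addr@plt
    // which linkers recognize and can relax to initial-exec/local-exec access.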
1750 this->text_writer.ensure_space(0x10);
1751 *this->text_writer.cur_ptr()++ = 0x66;
1752 ASMNC(LEA64rm, arg_reg, FE_MEM(FE_IP, 0, FE_NOREG, 0));
1753 this->reloc_text(sym, R_X86_64_TLSGD, this->text_writer.offset() - 4, -4);
1754 *this->text_writer.cur_ptr()++ = 0x66;
1755 *this->text_writer.cur_ptr()++ = 0x66;
1756 *this->text_writer.cur_ptr()++ = 0x48;
1757 ASMNC(CALL, this->text_writer.cur_ptr());
1758 if (!this->sym_tls_get_addr.valid()) [[unlikely]] {
1759 this->sym_tls_get_addr = this->assembler.sym_add_undef(
1760 "__tls_get_addr", Assembler::SymBinding::GLOBAL);
1761 }
1762 this->reloc_text(this->sym_tls_get_addr,
1763 R_X86_64_PLT32,
1764 this->text_writer.offset() - 4,
1765 -4);
1766 arg.reset();
1767
1768 ScratchReg res{this};
1769 res.alloc_specific(AsmReg::AX);
1770 return res;
1771 }
1772 }
1773}
1774
1775} // namespace tpde::x64