File _service:tar_scm:8231441-3-AArch64-Initial-SVE-backend-support.patch of Package openjdk-11
diff --git a/make/hotspot/gensrc/GensrcAdlc.gmk b/make/hotspot/gensrc/GensrcAdlc.gmk index 2af2f9a..f23b972 100644 --- a/make/hotspot/gensrc/GensrcAdlc.gmk +++ b/make/hotspot/gensrc/GensrcAdlc.gmk @@ -156,6 +156,12 @@ ifeq ($(call check-jvm-feature, compiler2), true) ))) endif + ifeq ($(HOTSPOT_TARGET_CPU_ARCH), aarch64) + AD_SRC_FILES += $(call uniq, $(wildcard $(foreach d, $(AD_SRC_ROOTS), \ + $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_CPU_ARCH)_sve.ad \ + ))) + endif + ifeq ($(call check-jvm-feature, shenandoahgc), true) AD_SRC_FILES += $(call uniq, $(wildcard $(foreach d, $(AD_SRC_ROOTS), \ $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/gc/shenandoah/shenandoah_$(HOTSPOT_TARGET_CPU).ad \ diff --git a/src/hotspot/cpu/aarch64/aarch64-asmtest.py b/src/hotspot/cpu/aarch64/aarch64-asmtest.py index 31c6965..e621402 100644 --- a/src/hotspot/cpu/aarch64/aarch64-asmtest.py +++ b/src/hotspot/cpu/aarch64/aarch64-asmtest.py @@ -73,6 +73,48 @@ class GeneralRegisterOrSp(Register): return self.astr() else: return self.astr("r") +class SVEVectorRegister(FloatRegister): + def __str__(self): + return self.astr("z") + +class SVEPRegister(Register): + def __str__(self): + return self.astr("p") + + def generate(self): + self.number = random.randint(0, 15) + return self + +class SVEGoverningPRegister(Register): + def __str__(self): + return self.astr("p") + def generate(self): + self.number = random.randint(0, 7) + return self + +class RegVariant(object): + def __init__(self, low, high): + self.number = random.randint(low, high) + + def astr(self): + nameMap = { + 0: ".b", + 1: ".h", + 2: ".s", + 3: ".d", + 4: ".q" + } + return nameMap.get(self.number) + + def cstr(self): + nameMap = { + 0: "__ B", + 1: "__ H", + 2: "__ S", + 3: "__ D", + 4: "__ Q" + } + return nameMap.get(self.number) class FloatZero(Operand): @@ -88,7 +130,10 @@ class OperandFactory: 'w' : GeneralRegister, 's' : FloatRegister, 'd' : FloatRegister, - 'z' : FloatZero} + 'z' : FloatZero, + 'p' : SVEPRegister, + 'P' : SVEGoverningPRegister, + 'Z' : SVEVectorRegister} @classmethod def create(cls, mode): @@ -834,6 +879,100 @@ class FloatInstruction(Instruction): % tuple([Instruction.astr(self)] + [(self.reg[i].astr(self.modes[i])) for i in range(self.numRegs)])) +class SVEVectorOp(Instruction): + def __init__(self, args): + name = args[0] + regTypes = args[1] + regs = [] + for c in regTypes: + regs.append(OperandFactory.create(c).generate()) + self.reg = regs + self.numRegs = len(regs) + if regTypes[0] != "p" and regTypes[1] == 'P': + self._isPredicated = True + self._merge = "/m" + else: + self._isPredicated = False + self._merge ="" + + self._bitwiseop = False + if name[0] == 'f': + self._width = RegVariant(2, 3) + elif not self._isPredicated and (name in ["and", "eor", "orr", "bic"]): + self._width = RegVariant(3, 3) + self._bitwiseop = True + else: + self._width = RegVariant(0, 3) + if len(args) > 2: + self._dnm = args[2] + else: + self._dnm = None + Instruction.__init__(self, name) + + def cstr(self): + formatStr = "%s%s" + ''.join([", %s" for i in range(0, self.numRegs)] + [");"]) + if self._bitwiseop: + width = [] + formatStr = "%s%s" + ''.join([", %s" for i in range(1, self.numRegs)] + [");"]) + else: + width = [self._width.cstr()] + return (formatStr + % tuple(["__ sve_" + self._name + "("] + + [str(self.reg[0])] + + width + + [str(self.reg[i]) for i in range(1, self.numRegs)])) + def astr(self): + formatStr = "%s%s" + ''.join([", %s" for i in range(1, self.numRegs)]) + if self._dnm == 'dn': + formatStr += ", %s" + dnReg = [str(self.reg[0]) 
+ self._width.astr()] + else: + dnReg = [] + + if self._isPredicated: + restRegs = [str(self.reg[1]) + self._merge] + dnReg + [str(self.reg[i]) + self._width.astr() for i in range(2, self.numRegs)] + else: + restRegs = dnReg + [str(self.reg[i]) + self._width.astr() for i in range(1, self.numRegs)] + return (formatStr + % tuple([Instruction.astr(self)] + + [str(self.reg[0]) + self._width.astr()] + + restRegs)) + def generate(self): + return self + +class SVEReductionOp(Instruction): + def __init__(self, args): + name = args[0] + lowRegType = args[1] + self.reg = [] + Instruction.__init__(self, name) + self.reg.append(OperandFactory.create('s').generate()) + self.reg.append(OperandFactory.create('P').generate()) + self.reg.append(OperandFactory.create('Z').generate()) + self._width = RegVariant(lowRegType, 3) + def cstr(self): + return "__ sve_%s(%s, %s, %s, %s);" % (self.name(), + str(self.reg[0]), + self._width.cstr(), + str(self.reg[1]), + str(self.reg[2])) + def astr(self): + if self.name() == "uaddv": + dstRegName = "d" + str(self.reg[0].number) + else: + dstRegName = self._width.astr()[1] + str(self.reg[0].number) + formatStr = "%s %s, %s, %s" + if self.name() == "fadda": + formatStr += ", %s" + moreReg = [dstRegName] + else: + moreReg = [] + return formatStr % tuple([self.name()] + + [dstRegName] + + [str(self.reg[1])] + + moreReg + + [str(self.reg[2]) + self._width.astr()]) + class LdStSIMDOp(Instruction): def __init__(self, args): self._name, self.regnum, self.arrangement, self.addresskind = args @@ -1120,7 +1259,42 @@ generate(SpecialCases, [["ccmn", "__ ccmn(zr, zr, 3u, Assembler::LE);", ["mov", "__ mov(v1, __ T2S, 1, zr);", "mov\tv1.s[1], wzr"], ["mov", "__ mov(v1, __ T4H, 2, zr);", "mov\tv1.h[2], wzr"], ["mov", "__ mov(v1, __ T8B, 3, zr);", "mov\tv1.b[3], wzr"], - ["ld1", "__ ld1(v31, v0, __ T2D, Address(__ post(r1, r0)));", "ld1\t{v31.2d, v0.2d}, [x1], x0"]]) + ["ld1", "__ ld1(v31, v0, __ T2D, Address(__ post(r1, r0)));", "ld1\t{v31.2d, v0.2d}, [x1], x0"], + # SVE instructions + ["cpy", "__ sve_cpy(z0, __ S, p0, v1);", "mov\tz0.s, p0/m, s1"], + ["inc", "__ sve_inc(r0, __ S);", "incw\tx0"], + ["dec", "__ sve_dec(r1, __ H);", "dech\tx1"], + ["lsl", "__ sve_lsl(z0, __ B, z1, 7);", "lsl\tz0.b, z1.b, #7"], + ["lsl", "__ sve_lsl(z21, __ H, z1, 15);", "lsl\tz21.h, z1.h, #15"], + ["lsl", "__ sve_lsl(z0, __ S, z1, 31);", "lsl\tz0.s, z1.s, #31"], + ["lsl", "__ sve_lsl(z0, __ D, z1, 63);", "lsl\tz0.d, z1.d, #63"], + ["lsr", "__ sve_lsr(z0, __ B, z1, 7);", "lsr\tz0.b, z1.b, #7"], + ["asr", "__ sve_asr(z0, __ H, z11, 15);", "asr\tz0.h, z11.h, #15"], + ["lsr", "__ sve_lsr(z30, __ S, z1, 31);", "lsr\tz30.s, z1.s, #31"], + ["asr", "__ sve_asr(z0, __ D, z1, 63);", "asr\tz0.d, z1.d, #63"], + ["addvl", "__ sve_addvl(sp, r0, 31);", "addvl\tsp, x0, #31"], + ["addpl", "__ sve_addpl(r1, sp, -32);", "addpl\tx1, sp, -32"], + ["cntp", "__ sve_cntp(r8, __ B, p0, p1);", "cntp\tx8, p0, p1.b"], + ["dup", "__ sve_dup(z0, __ B, 127);", "dup\tz0.b, 127"], + ["dup", "__ sve_dup(z1, __ H, -128);", "dup\tz1.h, -128"], + ["dup", "__ sve_dup(z2, __ S, 32512);", "dup\tz2.s, 32512"], + ["dup", "__ sve_dup(z7, __ D, -32768);", "dup\tz7.d, -32768"], + ["ld1b", "__ sve_ld1b(z0, __ B, p0, Address(sp));", "ld1b\t{z0.b}, p0/z, [sp]"], + ["ld1h", "__ sve_ld1h(z10, __ H, p1, Address(sp, -8));", "ld1h\t{z10.h}, p1/z, [sp, #-8, MUL VL]"], + ["ld1w", "__ sve_ld1w(z20, __ S, p2, Address(r0, 7));", "ld1w\t{z20.s}, p2/z, [x0, #7, MUL VL]"], + ["ld1b", "__ sve_ld1b(z30, __ B, p3, Address(sp, r8));", "ld1b\t{z30.b}, p3/z, [sp, 
x8]"], + ["ld1w", "__ sve_ld1w(z0, __ S, p4, Address(sp, r28));", "ld1w\t{z0.s}, p4/z, [sp, x28, LSL #2]"], + ["ld1d", "__ sve_ld1d(z11, __ D, p5, Address(r0, r1));", "ld1d\t{z11.d}, p5/z, [x0, x1, LSL #3]"], + ["st1b", "__ sve_st1b(z22, __ B, p6, Address(sp));", "st1b\t{z22.b}, p6, [sp]"], + ["st1b", "__ sve_st1b(z31, __ B, p7, Address(sp, -8));", "st1b\t{z31.b}, p7, [sp, #-8, MUL VL]"], + ["st1w", "__ sve_st1w(z0, __ S, p1, Address(r0, 7));", "st1w\t{z0.s}, p1, [x0, #7, MUL VL]"], + ["st1b", "__ sve_st1b(z0, __ B, p2, Address(sp, r1));", "st1b\t{z0.b}, p2, [sp, x1]"], + ["st1h", "__ sve_st1h(z0, __ H, p3, Address(sp, r8));", "st1h\t{z0.h}, p3, [sp, x8, LSL #1]"], + ["st1d", "__ sve_st1d(z0, __ D, p4, Address(r0, r8));", "st1d\t{z0.d}, p4, [x0, x8, LSL #3]"], + ["ldr", "__ sve_ldr(z0, Address(sp));", "ldr\tz0, [sp]"], + ["ldr", "__ sve_ldr(z31, Address(sp, -256));", "ldr\tz31, [sp, #-256, MUL VL]"], + ["str", "__ sve_str(z8, Address(r8, 255));", "str\tz8, [x8, #255, MUL VL]"], +]) print "\n// FloatImmediateOp" for float in ("2.0", "2.125", "4.0", "4.25", "8.0", "8.5", "16.0", "17.0", "0.125", @@ -1145,6 +1319,50 @@ for size in ("x", "w"): ["ldumin", "ldumin", size, suffix], ["ldumax", "ldumax", size, suffix]]); + +generate(SVEVectorOp, [["add", "ZZZ"], + ["sub", "ZZZ"], + ["fadd", "ZZZ"], + ["fmul", "ZZZ"], + ["fsub", "ZZZ"], + ["abs", "ZPZ"], + ["add", "ZPZ", "dn"], + ["asr", "ZPZ", "dn"], + ["cnt", "ZPZ"], + ["lsl", "ZPZ", "dn"], + ["lsr", "ZPZ", "dn"], + ["mul", "ZPZ", "dn"], + ["neg", "ZPZ"], + ["not", "ZPZ"], + ["smax", "ZPZ", "dn"], + ["smin", "ZPZ", "dn"], + ["sub", "ZPZ", "dn"], + ["fabs", "ZPZ"], + ["fadd", "ZPZ", "dn"], + ["fdiv", "ZPZ", "dn"], + ["fmax", "ZPZ", "dn"], + ["fmin", "ZPZ", "dn"], + ["fmul", "ZPZ", "dn"], + ["fneg", "ZPZ"], + ["frintm", "ZPZ"], + ["frintn", "ZPZ"], + ["frintp", "ZPZ"], + ["fsqrt", "ZPZ"], + ["fsub", "ZPZ", "dn"], + ["fmla", "ZPZZ"], + ["fmls", "ZPZZ"], + ["fnmla", "ZPZZ"], + ["fnmls", "ZPZZ"], + ["mla", "ZPZZ"], + ["mls", "ZPZZ"], + ["and", "ZZZ"], + ["eor", "ZZZ"], + ["orr", "ZZZ"], + ]) + +generate(SVEReductionOp, [["andv", 0], ["orv", 0], ["eorv", 0], ["smaxv", 0], ["sminv", 0], + ["fminv", 2], ["fmaxv", 2], ["fadda", 2], ["uaddv", 0]]) + print "\n __ bind(forth);" outfile.write("forth:\n") @@ -1153,8 +1371,8 @@ outfile.close() import subprocess import sys -# compile for 8.1 because of lse atomics -subprocess.check_call([AARCH64_AS, "-march=armv8.1-a", "aarch64ops.s", "-o", "aarch64ops.o"]) +# compile for sve with 8.1 and sha2 because of lse atomics and sha512 crypto extension. 
+subprocess.check_call([AARCH64_AS, "-march=armv8.1-a+sha2+sve", "aarch64ops.s", "-o", "aarch64ops.o"]) print print "/*", diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad index f126488..8a92ff2 100644 --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -2006,6 +2006,10 @@ void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { // branch if we need to invalidate the method later __ nop(); + if (UseSVE > 0 && C->max_vector_size() >= 16) { + __ reinitialize_ptrue(); + } + int bangsize = C->bang_size_in_bytes(); if (C->need_stack_bang(bangsize) && UseStackBanging) __ generate_stack_overflow_check(bangsize); @@ -2172,8 +2176,28 @@ uint MachSpillCopyNode::implementation(CodeBuffer *cbuf, PhaseRegAlloc *ra_, boo if (bottom_type()->isa_vect() != NULL) { uint ireg = ideal_reg(); - assert(ireg == Op_VecD || ireg == Op_VecX, "must be 64 bit or 128 bit vector"); - if (cbuf) { + if (ireg == Op_VecA && cbuf) { + MacroAssembler _masm(cbuf); + int sve_vector_reg_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE); + if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) { + // stack->stack + __ spill_copy_sve_vector_stack_to_stack(src_offset, dst_offset, + sve_vector_reg_size_in_bytes); + } else if (src_lo_rc == rc_float && dst_lo_rc == rc_stack) { + __ spill_sve_vector(as_FloatRegister(Matcher::_regEncode[src_lo]), ra_->reg2offset(dst_lo), + sve_vector_reg_size_in_bytes); + } else if (src_lo_rc == rc_stack && dst_lo_rc == rc_float) { + __ unspill_sve_vector(as_FloatRegister(Matcher::_regEncode[dst_lo]), ra_->reg2offset(src_lo), + sve_vector_reg_size_in_bytes); + } else if (src_lo_rc == rc_float && dst_lo_rc == rc_float) { + __ sve_orr(as_FloatRegister(Matcher::_regEncode[dst_lo]), + as_FloatRegister(Matcher::_regEncode[src_lo]), + as_FloatRegister(Matcher::_regEncode[src_lo])); + } else { + ShouldNotReachHere(); + } + } else if (cbuf) { + assert(ireg == Op_VecD || ireg == Op_VecX, "must be 64 bit or 128 bit vector"); MacroAssembler _masm(cbuf); assert((src_lo_rc != rc_int && dst_lo_rc != rc_int), "sanity"); if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) { @@ -2452,15 +2476,28 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. } -const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { - - // TODO - // identify extra cases that we might want to provide match rules for - // e.g. Op_ vector nodes and other intrinsics while guarding with vlen - bool ret_value = match_rule_supported(opcode); - // Add rules here. - - return ret_value; // Per default match rules are supported. + // Identify extra cases that we might want to provide match rules for vector nodes and + // other intrinsics guarded with vector length (vlen) and element type (bt). + const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { + if (!match_rule_supported(opcode) || !vector_size_supported(bt, vlen)) { + return false; + } + int bit_size = vlen * type2aelembytes(bt) * 8; + if (UseSVE == 0 && bit_size > 128) { + return false; + } + if (UseSVE > 0) { + return op_sve_supported(opcode); + } else { // NEON + // Special cases + switch (opcode) { + case Op_MulVL: + return false; + default: + break; + } + } + return true; // Per default match rules are supported. 
} const bool Matcher::has_predicated_vectors(void) { @@ -3812,6 +3849,12 @@ encode %{ return; } } + if (UseSVE > 0 && Compile::current()->max_vector_size() >= 16) { + // Only non uncommon_trap calls need to reinitialize ptrue. + if (uncommon_trap_request() == 0) { + __ reinitialize_ptrue(); + } + } %} enc_class aarch64_enc_java_dynamic_call(method meth) %{ @@ -3821,6 +3864,8 @@ encode %{ if (call == NULL) { ciEnv::current()->record_failure("CodeCache is full"); return; + } else if (UseSVE > 0 && Compile::current()->max_vector_size() >= 16) { + __ reinitialize_ptrue(); } %} @@ -3857,6 +3902,9 @@ encode %{ __ bind(retaddr); __ add(sp, sp, 2 * wordSize); } + if (UseSVE > 0 && Compile::current()->max_vector_size() >= 16) { + __ reinitialize_ptrue(); + } %} enc_class aarch64_enc_rethrow() %{ @@ -3866,6 +3914,11 @@ encode %{ enc_class aarch64_enc_ret() %{ MacroAssembler _masm(&cbuf); +#ifdef ASSERT + if (UseSVE > 0 && Compile::current()->max_vector_size() >= 16) { + __ verify_ptrue(); + } +#endif __ ret(lr); %} @@ -4607,6 +4660,41 @@ operand immLoffset16() interface(CONST_INTER); %} +// 8 bit signed value. +operand immI8() +%{ + predicate(n->get_int() <= 127 && n->get_int() >= -128); + match(ConI); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +// 8 bit signed value (simm8), or #simm8 LSL 8. +operand immI8_shift8() +%{ + predicate((n->get_int() <= 127 && n->get_int() >= -128) || + (n->get_int() <= 32512 && n->get_int() >= -32768 && (n->get_int() & 0xff) == 0)); + match(ConI); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +// 8 bit signed value (simm8), or #simm8 LSL 8. +operand immL8_shift8() +%{ + predicate((n->get_long() <= 127 && n->get_long() >= -128) || + (n->get_long() <= 32512 && n->get_long() >= -32768 && (n->get_long() & 0xff) == 0)); + match(ConL); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + // 32 bit integer valid for add sub immediate operand immIAddSub() %{ @@ -16433,7 +16521,7 @@ instruct loadV8(vecD dst, vmem8 mem) // Load Vector (128 bits) instruct loadV16(vecX dst, vmem16 mem) %{ - predicate(n->as_LoadVector()->memory_size() == 16); + predicate(UseSVE == 0 && n->as_LoadVector()->memory_size() == 16); match(Set dst (LoadVector mem)); ins_cost(4 * INSN_COST); format %{ "ldrq $dst,$mem\t# vector (128 bits)" %} @@ -16489,7 +16577,7 @@ instruct replicate8B(vecD dst, iRegIorL2I src) instruct replicate16B(vecX dst, iRegIorL2I src) %{ - predicate(n->as_Vector()->length() == 16); + predicate(UseSVE == 0 && n->as_Vector()->length() == 16); match(Set dst (ReplicateB src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (16B)" %} @@ -16514,7 +16602,7 @@ instruct replicate8B_imm(vecD dst, immI con) instruct replicate16B_imm(vecX dst, immI con) %{ - predicate(n->as_Vector()->length() == 16); + predicate(UseSVE == 0 && n->as_Vector()->length() == 16); match(Set dst (ReplicateB con)); ins_cost(INSN_COST); format %{ "movi $dst, $con\t# vector(16B)" %} @@ -16539,7 +16627,7 @@ instruct replicate4S(vecD dst, iRegIorL2I src) instruct replicate8S(vecX dst, iRegIorL2I src) %{ - predicate(n->as_Vector()->length() == 8); + predicate(UseSVE == 0 && n->as_Vector()->length() == 8); match(Set dst (ReplicateS src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (8S)" %} @@ -16564,7 +16652,7 @@ instruct replicate4S_imm(vecD dst, immI con) instruct replicate8S_imm(vecX dst, immI con) %{ - predicate(n->as_Vector()->length() == 8); + predicate(UseSVE == 0 && n->as_Vector()->length() == 8); match(Set dst (ReplicateS con)); 
ins_cost(INSN_COST); format %{ "movi $dst, $con\t# vector(8H)" %} @@ -16588,7 +16676,7 @@ instruct replicate2I(vecD dst, iRegIorL2I src) instruct replicate4I(vecX dst, iRegIorL2I src) %{ - predicate(n->as_Vector()->length() == 4); + predicate(UseSVE == 0 && n->as_Vector()->length() == 4); match(Set dst (ReplicateI src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (4I)" %} @@ -16612,7 +16700,7 @@ instruct replicate2I_imm(vecD dst, immI con) instruct replicate4I_imm(vecX dst, immI con) %{ - predicate(n->as_Vector()->length() == 4); + predicate(UseSVE == 0 && n->as_Vector()->length() == 4); match(Set dst (ReplicateI con)); ins_cost(INSN_COST); format %{ "movi $dst, $con\t# vector(4I)" %} @@ -16624,7 +16712,7 @@ instruct replicate4I_imm(vecX dst, immI con) instruct replicate2L(vecX dst, iRegL src) %{ - predicate(n->as_Vector()->length() == 2); + predicate(UseSVE == 0 && n->as_Vector()->length() == 2); match(Set dst (ReplicateL src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (2L)" %} @@ -16636,7 +16724,7 @@ instruct replicate2L(vecX dst, iRegL src) instruct replicate2L_zero(vecX dst, immI0 zero) %{ - predicate(n->as_Vector()->length() == 2); + predicate(UseSVE == 0 && n->as_Vector()->length() == 2); match(Set dst (ReplicateI zero)); ins_cost(INSN_COST); format %{ "movi $dst, $zero\t# vector(4I)" %} @@ -16663,7 +16751,7 @@ instruct replicate2F(vecD dst, vRegF src) instruct replicate4F(vecX dst, vRegF src) %{ - predicate(n->as_Vector()->length() == 4); + predicate(UseSVE == 0 && n->as_Vector()->length() == 4); match(Set dst (ReplicateF src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (4F)" %} @@ -16676,7 +16764,7 @@ instruct replicate4F(vecX dst, vRegF src) instruct replicate2D(vecX dst, vRegD src) %{ - predicate(n->as_Vector()->length() == 2); + predicate(UseSVE == 0 && n->as_Vector()->length() == 2); match(Set dst (ReplicateD src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (2D)" %} diff --git a/src/hotspot/cpu/aarch64/aarch64_sve.ad b/src/hotspot/cpu/aarch64/aarch64_sve.ad new file mode 100644 index 0000000..8d80cb3 --- /dev/null +++ b/src/hotspot/cpu/aarch64/aarch64_sve.ad @@ -0,0 +1,1366 @@ +// +// Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2020, Arm Limited. All rights reserved. +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +// +// This code is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License version 2 only, as +// published by the Free Software Foundation. +// +// This code is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// version 2 for more details (a copy is included in the LICENSE file that +// accompanied this code). +// +// You should have received a copy of the GNU General Public License version +// 2 along with this work; if not, write to the Free Software Foundation, +// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +// +// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +// or visit www.oracle.com if you need additional information or have any +// questions. +// +// + +// This file is automatically generated by running "m4 aarch64_sve_ad.m4". 
Do not edit ---- + +// AArch64 SVE Architecture Description File + + +// 4 bit signed offset -- for predicated load/store + +operand vmemA_immIOffset4() +%{ + predicate(Address::offset_ok_for_sve_immed(n->get_int(), 4, + Matcher::scalable_vector_reg_size(T_BYTE))); + match(ConI); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +operand vmemA_immLOffset4() +%{ + predicate(Address::offset_ok_for_sve_immed(n->get_long(), 4, + Matcher::scalable_vector_reg_size(T_BYTE))); + match(ConL); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + + +operand vmemA_indOffI4(iRegP reg, vmemA_immIOffset4 off) +%{ + constraint(ALLOC_IN_RC(ptr_reg)); + match(AddP reg off); + op_cost(0); + format %{ "[$reg, $off, MUL VL]" %} + interface(MEMORY_INTER) %{ + base($reg); + index(0xffffffff); + scale(0x0); + disp($off); + %} +%} + +operand vmemA_indOffL4(iRegP reg, vmemA_immLOffset4 off) +%{ + constraint(ALLOC_IN_RC(ptr_reg)); + match(AddP reg off); + op_cost(0); + format %{ "[$reg, $off, MUL VL]" %} + interface(MEMORY_INTER) %{ + base($reg); + index(0xffffffff); + scale(0x0); + disp($off); + %} +%} + +opclass vmemA(indirect, vmemA_indOffI4, vmemA_indOffL4); + +source_hpp %{ + bool op_sve_supported(int opcode); +%} + +source %{ + + static inline BasicType vector_element_basic_type(const MachNode* n) { + const TypeVect* vt = n->bottom_type()->is_vect(); + return vt->element_basic_type(); + } + + static inline BasicType vector_element_basic_type(const MachNode* use, const MachOper* opnd) { + int def_idx = use->operand_index(opnd); + Node* def = use->in(def_idx); + const TypeVect* vt = def->bottom_type()->is_vect(); + return vt->element_basic_type(); + } + + typedef void (MacroAssembler::* sve_mem_insn_predicate)(FloatRegister Rt, Assembler::SIMD_RegVariant T, + PRegister Pg, const Address &adr); + + // Predicated load/store, with optional ptrue to all elements of given predicate register. + static void loadStoreA_predicate(MacroAssembler masm, bool is_store, + FloatRegister reg, PRegister pg, BasicType bt, + int opcode, Register base, int index, int size, int disp) { + sve_mem_insn_predicate insn = NULL; + Assembler::SIMD_RegVariant type = Assembler::B; + int esize = type2aelembytes(bt); + if (index == -1) { + assert(size == 0, "unsupported address mode: scale size = %d", size); + switch(esize) { + case 1: + insn = is_store ? &MacroAssembler::sve_st1b : &MacroAssembler::sve_ld1b; + type = Assembler::B; + break; + case 2: + insn = is_store ? &MacroAssembler::sve_st1h : &MacroAssembler::sve_ld1h; + type = Assembler::H; + break; + case 4: + insn = is_store ? &MacroAssembler::sve_st1w : &MacroAssembler::sve_ld1w; + type = Assembler::S; + break; + case 8: + insn = is_store ? 
&MacroAssembler::sve_st1d : &MacroAssembler::sve_ld1d; + type = Assembler::D; + break; + default: + assert(false, "unsupported"); + ShouldNotReachHere(); + } + (masm.*insn)(reg, type, pg, Address(base, disp / Matcher::scalable_vector_reg_size(T_BYTE))); + } else { + assert(false, "unimplemented"); + ShouldNotReachHere(); + } + } + + bool op_sve_supported(int opcode) { + switch (opcode) { + // No multiply reduction instructions + case Op_MulReductionVD: + case Op_MulReductionVF: + case Op_MulReductionVI: + case Op_MulReductionVL: + // Others + case Op_Extract: + case Op_ExtractB: + case Op_ExtractC: + case Op_ExtractD: + case Op_ExtractF: + case Op_ExtractI: + case Op_ExtractL: + case Op_ExtractS: + case Op_ExtractUB: + return false; + default: + return true; + } + } + +%} + +definitions %{ + int_def SVE_COST (200, 200); +%} + + + + +// All SVE instructions + +// vector load/store + +// Use predicated vector load/store +instruct loadV(vReg dst, vmemA mem) %{ + predicate(UseSVE > 0 && n->as_LoadVector()->memory_size() >= 16); + match(Set dst (LoadVector mem)); + ins_cost(SVE_COST); + format %{ "sve_ldr $dst, $mem\t # vector (sve)" %} + ins_encode %{ + FloatRegister dst_reg = as_FloatRegister($dst$$reg); + loadStoreA_predicate(MacroAssembler(&cbuf), false, dst_reg, ptrue, + vector_element_basic_type(this), $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); + %} + ins_pipe(pipe_slow); +%} + +instruct storeV(vReg src, vmemA mem) %{ + predicate(UseSVE > 0 && n->as_StoreVector()->memory_size() >= 16); + match(Set mem (StoreVector mem src)); + ins_cost(SVE_COST); + format %{ "sve_str $mem, $src\t # vector (sve)" %} + ins_encode %{ + FloatRegister src_reg = as_FloatRegister($src$$reg); + loadStoreA_predicate(MacroAssembler(&cbuf), true, src_reg, ptrue, + vector_element_basic_type(this, $src), $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); + %} + ins_pipe(pipe_slow); +%} + +// vector add + +instruct vaddB(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (AddVB src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_add $dst, $src1, $src2\t # vector (sve) (B)" %} + ins_encode %{ + __ sve_add(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vaddS(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (AddVS src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_add $dst, $src1, $src2\t # vector (sve) (H)" %} + ins_encode %{ + __ sve_add(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vaddI(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (AddVI src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_add $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_add(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vaddL(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (AddVL src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_add $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_add(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); 
+%} + +instruct vaddF(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (AddVF src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fadd $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fadd(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vaddD(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (AddVD src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fadd $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fadd(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector and + +instruct vand(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (AndV src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_and $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + __ sve_and(as_FloatRegister($dst$$reg), + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector or + +instruct vor(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (OrV src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_orr $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + __ sve_orr(as_FloatRegister($dst$$reg), + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector xor + +instruct vxor(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (XorV src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_eor $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + __ sve_eor(as_FloatRegister($dst$$reg), + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector float div + +instruct vdivF(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (DivVF dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fdiv $dst_src1, $dst_src1, $src2\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_fdiv(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vdivD(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (DivVD dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fdiv $dst_src1, $dst_src1, $src2\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_fdiv(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector fmla + +// dst_src1 = dst_src1 + src2 * src3 +instruct vfmlaF(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (FmaVF dst_src1 (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_fmla $dst_src1, $src2, $src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fmla(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 + src2 * src3 +instruct vfmlaD(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (FmaVD dst_src1 (Binary src2 src3))); + 
ins_cost(SVE_COST); + format %{ "sve_fmla $dst_src1, $src2, $src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fmla(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector fmls + +// dst_src1 = dst_src1 + -src2 * src3 +// dst_src1 = dst_src1 + src2 * -src3 +instruct vfmlsF(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (FmaVF dst_src1 (Binary (NegVF src2) src3))); + match(Set dst_src1 (FmaVF dst_src1 (Binary src2 (NegVF src3)))); + ins_cost(SVE_COST); + format %{ "sve_fmls $dst_src1, $src2, $src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fmls(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 + -src2 * src3 +// dst_src1 = dst_src1 + src2 * -src3 +instruct vfmlsD(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (FmaVD dst_src1 (Binary (NegVD src2) src3))); + match(Set dst_src1 (FmaVD dst_src1 (Binary src2 (NegVD src3)))); + ins_cost(SVE_COST); + format %{ "sve_fmls $dst_src1, $src2, $src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fmls(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector fnmla + +// dst_src1 = -dst_src1 + -src2 * src3 +// dst_src1 = -dst_src1 + src2 * -src3 +instruct vfnmlaF(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (FmaVF (NegVF dst_src1) (Binary (NegVF src2) src3))); + match(Set dst_src1 (FmaVF (NegVF dst_src1) (Binary src2 (NegVF src3)))); + ins_cost(SVE_COST); + format %{ "sve_fnmla $dst_src1, $src2, $src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fnmla(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = -dst_src1 + -src2 * src3 +// dst_src1 = -dst_src1 + src2 * -src3 +instruct vfnmlaD(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (FmaVD (NegVD dst_src1) (Binary (NegVD src2) src3))); + match(Set dst_src1 (FmaVD (NegVD dst_src1) (Binary src2 (NegVD src3)))); + ins_cost(SVE_COST); + format %{ "sve_fnmla $dst_src1, $src2, $src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fnmla(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector fnmls + +// dst_src1 = -dst_src1 + src2 * src3 +instruct vfnmlsF(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (FmaVF (NegVF dst_src1) (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_fnmls $dst_src1, $src2, $src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fnmls(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = -dst_src1 + src2 * src3 +instruct vfnmlsD(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (FmaVD (NegVD dst_src1) (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ 
"sve_fnmls $dst_src1, $src2, $src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fnmls(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector mla + +// dst_src1 = dst_src1 + src2 * src3 +instruct vmlaS(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst_src1 (AddVS dst_src1 (MulVS src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mla $dst_src1, src2, src3\t # vector (sve) (H)" %} + ins_encode %{ + __ sve_mla(as_FloatRegister($dst_src1$$reg), __ H, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 + src2 * src3 +instruct vmlaI(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (AddVI dst_src1 (MulVI src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mla $dst_src1, src2, src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_mla(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 + src2 * src3 +instruct vmlaL(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (AddVL dst_src1 (MulVL src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mla $dst_src1, src2, src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_mla(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector mls + +// dst_src1 = dst_src1 - src2 * src3 +instruct vmlsS(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst_src1 (SubVS dst_src1 (MulVS src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mls $dst_src1, src2, src3\t # vector (sve) (H)" %} + ins_encode %{ + __ sve_mls(as_FloatRegister($dst_src1$$reg), __ H, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 - src2 * src3 +instruct vmlsI(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (SubVI dst_src1 (MulVI src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mls $dst_src1, src2, src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_mls(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 - src2 * src3 +instruct vmlsL(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (SubVL dst_src1 (MulVL src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mls $dst_src1, src2, src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_mls(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + + +// vector mul + +instruct vmulS(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst_src1 (MulVS dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_mul $dst_src1, $dst_src1, $src2\t # vector (sve) (H)" %} + ins_encode %{ + __ sve_mul(as_FloatRegister($dst_src1$$reg), __ H, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmulI(vReg dst_src1, vReg 
src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (MulVI dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_mul $dst_src1, $dst_src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_mul(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmulL(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (MulVL dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_mul $dst_src1, $dst_src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_mul(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmulF(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (MulVF src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fmul $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fmul(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmulD(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (MulVD src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fmul $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fmul(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector fneg + +instruct vnegF(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (NegVF src)); + ins_cost(SVE_COST); + format %{ "sve_fneg $dst, $src\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_fneg(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vnegD(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (NegVD src)); + ins_cost(SVE_COST); + format %{ "sve_fneg $dst, $src\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_fneg(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// popcount vector + +instruct vpopcountI(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (PopCountVI src)); + format %{ "sve_cnt $dst, $src\t# vector (sve) (S)\n\t" %} + ins_encode %{ + __ sve_cnt(as_FloatRegister($dst$$reg), __ S, ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector add reduction + +instruct reduce_addI(iRegINoSp dst, iRegIorL2I src1, vReg src2, vRegD tmp) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16 && + (n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT)); + match(Set dst (AddReductionVI src1 src2)); + effect(TEMP_DEF dst, TEMP tmp); + ins_cost(SVE_COST); + format %{ "sve_uaddv $tmp, $src2\t# vector (sve) (S)\n\t" + "umov $dst, $tmp, S, 0\n\t" + "addw $dst, $dst, $src1\t # add reduction S" %} + ins_encode %{ + __ sve_uaddv(as_FloatRegister($tmp$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + __ umov($dst$$Register, as_FloatRegister($tmp$$reg), __ S, 0); + __ addw($dst$$Register, $dst$$Register, $src1$$Register); + %} + ins_pipe(pipe_slow); +%} + +instruct reduce_addL(iRegLNoSp dst, iRegL src1, vReg src2, vRegD tmp) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16 && + 
(n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG)); + match(Set dst (AddReductionVL src1 src2)); + effect(TEMP_DEF dst, TEMP tmp); + ins_cost(SVE_COST); + format %{ "sve_uaddv $tmp, $src2\t# vector (sve) (D)\n\t" + "umov $dst, $tmp, D, 0\n\t" + "add $dst, $dst, $src1\t # add reduction D" %} + ins_encode %{ + __ sve_uaddv(as_FloatRegister($tmp$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg)); + __ umov($dst$$Register, as_FloatRegister($tmp$$reg), __ D, 0); + __ add($dst$$Register, $dst$$Register, $src1$$Register); + %} + ins_pipe(pipe_slow); +%} + +instruct reduce_addF(vRegF src1_dst, vReg src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set src1_dst (AddReductionVF src1_dst src2)); + ins_cost(SVE_COST); + format %{ "sve_fadda $src1_dst, $src1_dst, $src2\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_fadda(as_FloatRegister($src1_dst$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct reduce_addD(vRegD src1_dst, vReg src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set src1_dst (AddReductionVD src1_dst src2)); + ins_cost(SVE_COST); + format %{ "sve_fadda $src1_dst, $src1_dst, $src2\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_fadda(as_FloatRegister($src1_dst$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector replicate + +instruct replicateB(vReg dst, iRegIorL2I src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (ReplicateB src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ B, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateS(vReg dst, iRegIorL2I src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (ReplicateS src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ H, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateI(vReg dst, iRegIorL2I src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (ReplicateI src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ S, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateL(vReg dst, iRegL src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (ReplicateL src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ D, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + + +instruct replicateB_imm8(vReg dst, immI8 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (ReplicateB con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ B, $con$$constant); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateS_imm8(vReg dst, immI8_shift8 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (ReplicateS con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ H, $con$$constant); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateI_imm8(vReg 
dst, immI8_shift8 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (ReplicateI con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ S, $con$$constant); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateL_imm8(vReg dst, immL8_shift8 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (ReplicateL con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ D, $con$$constant); + %} + ins_pipe(pipe_slow); +%} + + +instruct replicateF(vReg dst, vRegF src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (ReplicateF src)); + ins_cost(SVE_COST); + format %{ "sve_cpy $dst, $src\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_cpy(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateD(vReg dst, vRegD src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (ReplicateD src)); + ins_cost(SVE_COST); + format %{ "sve_cpy $dst, $src\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_cpy(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector shift + +instruct vasrB(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (RShiftVB dst shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $dst, $shift\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_asr(as_FloatRegister($dst$$reg), __ B, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrS(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (RShiftVS dst shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $dst, $shift\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_asr(as_FloatRegister($dst$$reg), __ H, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrI(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (RShiftVI dst shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $dst, $shift\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_asr(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrL(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (RShiftVL dst shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $dst, $shift\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_asr(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslB(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (LShiftVB dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $dst, $shift\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_lsl(as_FloatRegister($dst$$reg), __ B, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslS(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (LShiftVS dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $dst, $shift\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_lsl(as_FloatRegister($dst$$reg), __ H, + ptrue, as_FloatRegister($shift$$reg)); + %} + 
ins_pipe(pipe_slow); +%} + +instruct vlslI(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (LShiftVI dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $dst, $shift\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_lsl(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslL(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (LShiftVL dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $dst, $shift\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_lsl(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrB(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (URShiftVB dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $dst, $shift\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_lsr(as_FloatRegister($dst$$reg), __ B, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrS(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (URShiftVS dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $dst, $shift\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_lsr(as_FloatRegister($dst$$reg), __ H, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrI(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (URShiftVI dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $dst, $shift\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_lsr(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrL(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (URShiftVL dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $dst, $shift\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_lsr(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrB_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (RShiftVB src shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $src, $shift\t# vector (sve) (B)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + if (con >= 8) con = 7; + __ sve_asr(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrS_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (RShiftVS src shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $src, $shift\t# vector (sve) (H)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + if (con >= 16) con = 15; + __ sve_asr(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrI_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (RShiftVI src shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, 
$src, $shift\t# vector (sve) (S)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_asr(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrL_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (RShiftVL src shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $src, $shift\t# vector (sve) (D)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_asr(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrB_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (URShiftVB src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $src, $shift\t# vector (sve) (B)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsr(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrS_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (URShiftVS src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $src, $shift\t# vector (sve) (H)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsr(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrI_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (URShiftVI src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $src, $shift\t# vector (sve) (S)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsr(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrL_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (URShiftVL src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $src, $shift\t# vector (sve) (D)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsr(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslB_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (LShiftVB src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $src, $shift\t# 
vector (sve) (B)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsl(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslS_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (LShiftVS src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $src, $shift\t# vector (sve) (H)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsl(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslI_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (LShiftVI src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $src, $shift\t# vector (sve) (S)" %} + ins_encode %{ + int con = (int)$shift$$constant; + __ sve_lsl(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslL_imm(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (LShiftVL src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $src, $shift\t# vector (sve) (D)" %} + ins_encode %{ + int con = (int)$shift$$constant; + __ sve_lsl(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vshiftcntB(vReg dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16 && + (n->bottom_type()->is_vect()->element_basic_type() == T_BYTE)); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) (B)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ B, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vshiftcntS(vReg dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8 && + (n->bottom_type()->is_vect()->element_basic_type() == T_SHORT || + (n->bottom_type()->is_vect()->element_basic_type() == T_CHAR))); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) (H)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ H, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vshiftcntI(vReg dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4 && + (n->bottom_type()->is_vect()->element_basic_type() == T_INT)); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) (S)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ S, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vshiftcntL(vReg dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2 && + (n->bottom_type()->is_vect()->element_basic_type() == T_LONG)); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) (D)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ D, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector sqrt + +instruct vsqrtF(vReg dst, vReg src) %{ 
+ predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (SqrtVF src)); + ins_cost(SVE_COST); + format %{ "sve_fsqrt $dst, $src\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_fsqrt(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsqrtD(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (SqrtVD src)); + ins_cost(SVE_COST); + format %{ "sve_fsqrt $dst, $src\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_fsqrt(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// vector sub + +instruct vsubB(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (SubVB src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_sub $dst, $src1, $src2\t # vector (sve) (B)" %} + ins_encode %{ + __ sve_sub(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubS(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (SubVS src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_sub $dst, $src1, $src2\t # vector (sve) (H)" %} + ins_encode %{ + __ sve_sub(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubI(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (SubVI src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_sub $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_sub(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubL(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (SubVL src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_sub $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_sub(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubF(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (SubVF src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fsub $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fsub(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubD(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (SubVD src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fsub $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fsub(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} diff --git a/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4 new file mode 100644 index 0000000..0323f2f --- /dev/null +++ b/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4 @@ -0,0 +1,727 @@ +// +// Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2020, Arm Limited. All rights reserved. +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+// +// This code is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License version 2 only, as +// published by the Free Software Foundation. +// +// This code is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// version 2 for more details (a copy is included in the LICENSE file that +// accompanied this code). +// +// You should have received a copy of the GNU General Public License version +// 2 along with this work; if not, write to the Free Software Foundation, +// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +// +// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +// or visit www.oracle.com if you need additional information or have any +// questions. +// +// + +dnl Generate the warning +// This file is automatically generated by running "m4 aarch64_sve_ad.m4". Do not edit ---- +dnl + +// AArch64 SVE Architecture Description File + +dnl +dnl OPERAND_VMEMORYA_IMMEDIATE_OFFSET($1, $2, $3 ) +dnl OPERAND_VMEMORYA_IMMEDIATE_OFFSET(imm_type_abbr, imm_type, imm_len) +define(`OPERAND_VMEMORYA_IMMEDIATE_OFFSET', ` +operand vmemA_imm$1Offset$3() +%{ + predicate(Address::offset_ok_for_sve_immed(n->get_$2(), $3, + Matcher::scalable_vector_reg_size(T_BYTE))); + match(Con$1); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%}') +dnl +// 4 bit signed offset -- for predicated load/store +OPERAND_VMEMORYA_IMMEDIATE_OFFSET(I, int, 4) +OPERAND_VMEMORYA_IMMEDIATE_OFFSET(L, long, 4) +dnl +dnl OPERAND_VMEMORYA_INDIRECT_OFFSET($1, $2 ) +dnl OPERAND_VMEMORYA_INDIRECT_OFFSET(imm_type_abbr, imm_len) +define(`OPERAND_VMEMORYA_INDIRECT_OFFSET', ` +operand vmemA_indOff$1$2(iRegP reg, vmemA_imm$1Offset$2 off) +%{ + constraint(ALLOC_IN_RC(ptr_reg)); + match(AddP reg off); + op_cost(0); + format %{ "[$reg, $off, MUL VL]" %} + interface(MEMORY_INTER) %{ + base($reg); + `index'(0xffffffff); + scale(0x0); + disp($off); + %} +%}') +dnl +OPERAND_VMEMORYA_INDIRECT_OFFSET(I, 4) +OPERAND_VMEMORYA_INDIRECT_OFFSET(L, 4) + +opclass vmemA(indirect, vmemA_indOffI4, vmemA_indOffL4); + +source_hpp %{ + bool op_sve_supported(int opcode); +%} + +source %{ + + static inline BasicType vector_element_basic_type(const MachNode* n) { + const TypeVect* vt = n->bottom_type()->is_vect(); + return vt->element_basic_type(); + } + + static inline BasicType vector_element_basic_type(const MachNode* use, const MachOper* opnd) { + int def_idx = use->operand_index(opnd); + Node* def = use->in(def_idx); + const TypeVect* vt = def->bottom_type()->is_vect(); + return vt->element_basic_type(); + } + + typedef void (MacroAssembler::* sve_mem_insn_predicate)(FloatRegister Rt, Assembler::SIMD_RegVariant T, + PRegister Pg, const Address &adr); + + // Predicated load/store, with optional ptrue to all elements of given predicate register. + static void loadStoreA_predicate(MacroAssembler masm, bool is_store, + FloatRegister reg, PRegister pg, BasicType bt, + int opcode, Register base, int index, int size, int disp) { + sve_mem_insn_predicate insn; + Assembler::SIMD_RegVariant type; + int esize = type2aelembytes(bt); + if (index == -1) { + assert(size == 0, "unsupported address mode: scale size = %d", size); + switch(esize) { + case 1: + insn = is_store ? &MacroAssembler::sve_st1b : &MacroAssembler::sve_ld1b; + type = Assembler::B; + break; + case 2: + insn = is_store ? 
&MacroAssembler::sve_st1h : &MacroAssembler::sve_ld1h; + type = Assembler::H; + break; + case 4: + insn = is_store ? &MacroAssembler::sve_st1w : &MacroAssembler::sve_ld1w; + type = Assembler::S; + break; + case 8: + insn = is_store ? &MacroAssembler::sve_st1d : &MacroAssembler::sve_ld1d; + type = Assembler::D; + break; + default: + assert(false, "unsupported"); + ShouldNotReachHere(); + } + (masm.*insn)(reg, type, pg, Address(base, disp / Matcher::scalable_vector_reg_size(T_BYTE))); + } else { + assert(false, "unimplemented"); + ShouldNotReachHere(); + } + } + + bool op_sve_supported(int opcode) { + switch (opcode) { + // No multiply reduction instructions + case Op_MulReductionVD: + case Op_MulReductionVF: + case Op_MulReductionVI: + case Op_MulReductionVL: + // Others + case Op_Extract: + case Op_ExtractB: + case Op_ExtractC: + case Op_ExtractD: + case Op_ExtractF: + case Op_ExtractI: + case Op_ExtractL: + case Op_ExtractS: + case Op_ExtractUB: + return false; + default: + return true; + } + } + +%} + +definitions %{ + int_def SVE_COST (200, 200); +%} + + +dnl +dnl ELEMENT_SHORT_CHART($1, $2) +dnl ELEMENT_SHORT_CHART(etype, node) +define(`ELEMENT_SHORT_CHAR',`ifelse(`$1', `T_SHORT', + `($2->bottom_type()->is_vect()->element_basic_type() == T_SHORT || + ($2->bottom_type()->is_vect()->element_basic_type() == T_CHAR))', + `($2->bottom_type()->is_vect()->element_basic_type() == $1)')') +dnl + +// All SVE instructions + +// vector load/store + +// Use predicated vector load/store +instruct loadV(vReg dst, vmemA mem) %{ + predicate(UseSVE > 0 && n->as_LoadVector()->memory_size() >= 16); + match(Set dst (LoadVector mem)); + ins_cost(SVE_COST); + format %{ "sve_ldr $dst, $mem\t # vector (sve)" %} + ins_encode %{ + FloatRegister dst_reg = as_FloatRegister($dst$$reg); + loadStoreA_predicate(MacroAssembler(&cbuf), false, dst_reg, ptrue, + vector_element_basic_type(this), $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); + %} + ins_pipe(pipe_slow); +%} + +instruct storeV(vReg src, vmemA mem) %{ + predicate(UseSVE > 0 && n->as_StoreVector()->memory_size() >= 16); + match(Set mem (StoreVector mem src)); + ins_cost(SVE_COST); + format %{ "sve_str $mem, $src\t # vector (sve)" %} + ins_encode %{ + FloatRegister src_reg = as_FloatRegister($src$$reg); + loadStoreA_predicate(MacroAssembler(&cbuf), true, src_reg, ptrue, + vector_element_basic_type(this, $src), $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); + %} + ins_pipe(pipe_slow); +%} + +dnl +dnl UNARY_OP_TRUE_PREDICATE_ETYPE($1, $2, $3, $4, $5, %6 ) +dnl UNARY_OP_TRUE_PREDICATE_ETYPE(insn_name, op_name, element_type, size, min_vec_len, insn) +define(`UNARY_OP_TRUE_PREDICATE_ETYPE', ` +instruct $1(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5 && + n->bottom_type()->is_vect()->element_basic_type() == $3); + match(Set dst ($2 src)); + ins_cost(SVE_COST); + format %{ "$6 $dst, $src\t# vector (sve) ($4)" %} + ins_encode %{ + __ $6(as_FloatRegister($dst$$reg), __ $4, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +dnl +dnl BINARY_OP_UNPREDICATED($1, $2 $3, $4 $5 ) +dnl BINARY_OP_UNPREDICATED(insn_name, op_name, size, min_vec_len, insn) +define(`BINARY_OP_UNPREDICATED', ` +instruct $1(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $4); + match(Set dst ($2 src1 src2)); + ins_cost(SVE_COST); + format %{ "$5 $dst, $src1, $src2\t # vector (sve) ($3)" %} + ins_encode %{ + __ 
$5(as_FloatRegister($dst$$reg), __ $3, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// vector add +BINARY_OP_UNPREDICATED(vaddB, AddVB, B, 16, sve_add) +BINARY_OP_UNPREDICATED(vaddS, AddVS, H, 8, sve_add) +BINARY_OP_UNPREDICATED(vaddI, AddVI, S, 4, sve_add) +BINARY_OP_UNPREDICATED(vaddL, AddVL, D, 2, sve_add) +BINARY_OP_UNPREDICATED(vaddF, AddVF, S, 4, sve_fadd) +BINARY_OP_UNPREDICATED(vaddD, AddVD, D, 2, sve_fadd) +dnl +dnl BINARY_OP_UNSIZED($1, $2, $3, $4 ) +dnl BINARY_OP_UNSIZED(insn_name, op_name, min_vec_len, insn) +define(`BINARY_OP_UNSIZED', ` +instruct $1(vReg dst, vReg src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= $3); + match(Set dst ($2 src1 src2)); + ins_cost(SVE_COST); + format %{ "$4 $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + __ $4(as_FloatRegister($dst$$reg), + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// vector and +BINARY_OP_UNSIZED(vand, AndV, 16, sve_and) + +// vector or +BINARY_OP_UNSIZED(vor, OrV, 16, sve_orr) + +// vector xor +BINARY_OP_UNSIZED(vxor, XorV, 16, sve_eor) +dnl +dnl VDIVF($1, $2 , $3 ) +dnl VDIVF(name_suffix, size, min_vec_len) +define(`VDIVF', ` +instruct vdiv$1(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (DivV$1 dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fdiv $dst_src1, $dst_src1, $src2\t# vector (sve) ($2)" %} + ins_encode %{ + __ sve_fdiv(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// vector float div +VDIVF(F, S, 4) +VDIVF(D, D, 2) + +dnl +dnl BINARY_OP_TRUE_PREDICATE_ETYPE($1, $2, $3, $4, $5, $6 ) +dnl BINARY_OP_TRUE_PREDICATE_ETYPE(insn_name, op_name, element_type, size, min_vec_len, insn) +define(`BINARY_OP_TRUE_PREDICATE_ETYPE', ` +instruct $1(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5 && + n->bottom_type()->is_vect()->element_basic_type() == $3); + match(Set dst_src1 ($2 dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "$6 $dst_src1, $dst_src1, $src2\t # vector (sve) ($4)" %} + ins_encode %{ + __ $6(as_FloatRegister($dst_src1$$reg), __ $4, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl + +dnl +dnl VFMLA($1 $2 $3 ) +dnl VFMLA(name_suffix, size, min_vec_len) +define(`VFMLA', ` +// dst_src1 = dst_src1 + src2 * src3 +instruct vfmla$1(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (FmaV$1 dst_src1 (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_fmla $dst_src1, $src2, $src3\t # vector (sve) ($2)" %} + ins_encode %{ + __ sve_fmla(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// vector fmla +VFMLA(F, S, 4) +VFMLA(D, D, 2) + +dnl +dnl VFMLS($1 $2 $3 ) +dnl VFMLS(name_suffix, size, min_vec_len) +define(`VFMLS', ` +// dst_src1 = dst_src1 + -src2 * src3 +// dst_src1 = dst_src1 + src2 * -src3 +instruct vfmls$1(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (FmaV$1 dst_src1 (Binary (NegV$1 src2) src3))); + match(Set dst_src1 (FmaV$1 dst_src1 (Binary src2 (NegV$1 src3)))); + ins_cost(SVE_COST); + format %{ "sve_fmls $dst_src1, $src2, $src3\t # vector (sve) ($2)" %} + ins_encode 
%{ + __ sve_fmls(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// vector fmls +VFMLS(F, S, 4) +VFMLS(D, D, 2) + +dnl +dnl VFNMLA($1 $2 $3 ) +dnl VFNMLA(name_suffix, size, min_vec_len) +define(`VFNMLA', ` +// dst_src1 = -dst_src1 + -src2 * src3 +// dst_src1 = -dst_src1 + src2 * -src3 +instruct vfnmla$1(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (FmaV$1 (NegV$1 dst_src1) (Binary (NegV$1 src2) src3))); + match(Set dst_src1 (FmaV$1 (NegV$1 dst_src1) (Binary src2 (NegV$1 src3)))); + ins_cost(SVE_COST); + format %{ "sve_fnmla $dst_src1, $src2, $src3\t # vector (sve) ($2)" %} + ins_encode %{ + __ sve_fnmla(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// vector fnmla +VFNMLA(F, S, 4) +VFNMLA(D, D, 2) + +dnl +dnl VFNMLS($1 $2 $3 ) +dnl VFNMLS(name_suffix, size, min_vec_len) +define(`VFNMLS', ` +// dst_src1 = -dst_src1 + src2 * src3 +instruct vfnmls$1(vReg dst_src1, vReg src2, vReg src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (FmaV$1 (NegV$1 dst_src1) (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_fnmls $dst_src1, $src2, $src3\t # vector (sve) ($2)" %} + ins_encode %{ + __ sve_fnmls(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// vector fnmls +VFNMLS(F, S, 4) +VFNMLS(D, D, 2) + +dnl +dnl VMLA($1 $2 $3 ) +dnl VMLA(name_suffix, size, min_vec_len) +define(`VMLA', ` +// dst_src1 = dst_src1 + src2 * src3 +instruct vmla$1(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (AddV$1 dst_src1 (MulV$1 src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mla $dst_src1, src2, src3\t # vector (sve) ($2)" %} + ins_encode %{ + __ sve_mla(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// vector mla +VMLA(B, B, 16) +VMLA(S, H, 8) +VMLA(I, S, 4) +VMLA(L, D, 2) + +dnl +dnl VMLS($1 $2 $3 ) +dnl VMLS(name_suffix, size, min_vec_len) +define(`VMLS', ` +// dst_src1 = dst_src1 - src2 * src3 +instruct vmls$1(vReg dst_src1, vReg src2, vReg src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (SubV$1 dst_src1 (MulV$1 src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mls $dst_src1, src2, src3\t # vector (sve) ($2)" %} + ins_encode %{ + __ sve_mls(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// vector mls +VMLS(B, B, 16) +VMLS(S, H, 8) +VMLS(I, S, 4) +VMLS(L, D, 2) + +dnl +dnl BINARY_OP_TRUE_PREDICATE($1, $2, $3, $4, $5 ) +dnl BINARY_OP_TRUE_PREDICATE(insn_name, op_name, size, min_vec_len, insn) +define(`BINARY_OP_TRUE_PREDICATE', ` +instruct $1(vReg dst_src1, vReg src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $4); + match(Set dst_src1 ($2 dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "$5 $dst_src1, $dst_src1, $src2\t # vector (sve) ($3)" %} + ins_encode %{ + __ $5(as_FloatRegister($dst_src1$$reg), __ $3, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// vector mul 
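+dnl A rough sketch of what m4 produces here, for readability only (dnl lines are
+dnl discarded, so this adds nothing to the generated aarch64_sve.ad): the first
+dnl invocation below, BINARY_OP_TRUE_PREDICATE(vmulS, MulVS, H, 8, sve_mul),
+dnl expands to approximately:
+dnl
+dnl   instruct vmulS(vReg dst_src1, vReg src2) %{
+dnl     predicate(UseSVE > 0 && n->as_Vector()->length() >= 8);
+dnl     match(Set dst_src1 (MulVS dst_src1 src2));
+dnl     ins_cost(SVE_COST);
+dnl     format %{ "sve_mul $dst_src1, $dst_src1, $src2\t # vector (sve) (H)" %}
+dnl     ins_encode %{
+dnl       __ sve_mul(as_FloatRegister($dst_src1$$reg), __ H,
+dnl                  ptrue, as_FloatRegister($src2$$reg));
+dnl     %}
+dnl     ins_pipe(pipe_slow);
+dnl   %}
+dnl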
+BINARY_OP_TRUE_PREDICATE(vmulS, MulVS, H, 8, sve_mul) +BINARY_OP_TRUE_PREDICATE(vmulI, MulVI, S, 4, sve_mul) +BINARY_OP_TRUE_PREDICATE(vmulL, MulVL, D, 2, sve_mul) +BINARY_OP_UNPREDICATED(vmulF, MulVF, S, 4, sve_fmul) +BINARY_OP_UNPREDICATED(vmulD, MulVD, D, 2, sve_fmul) + +dnl +dnl UNARY_OP_TRUE_PREDICATE($1, $2, $3, $4, $5 ) +dnl UNARY_OP_TRUE_PREDICATE(insn_name, op_name, size, min_vec_bytes, insn) +define(`UNARY_OP_TRUE_PREDICATE', ` +instruct $1(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= $4); + match(Set dst ($2 src)); + ins_cost(SVE_COST); + format %{ "$5 $dst, $src\t# vector (sve) ($3)" %} + ins_encode %{ + __ $5(as_FloatRegister($dst$$reg), __ $3, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// vector fneg +UNARY_OP_TRUE_PREDICATE(vnegF, NegVF, S, 16, sve_fneg) +UNARY_OP_TRUE_PREDICATE(vnegD, NegVD, D, 16, sve_fneg) + +// popcount vector + +instruct vpopcountI(vReg dst, vReg src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (PopCountVI src)); + format %{ "sve_cnt $dst, $src\t# vector (sve) (S)\n\t" %} + ins_encode %{ + __ sve_cnt(as_FloatRegister($dst$$reg), __ S, ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +dnl +dnl REDUCE_ADD($1, $2, $3, $4, $5, $6, $7 ) +dnl REDUCE_ADD(insn_name, op_name, reg_dst, reg_src, size, elem_type, insn1) +define(`REDUCE_ADD', ` +instruct $1($3 dst, $4 src1, vReg src2, vRegD tmp) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16 && + ELEMENT_SHORT_CHAR($6, n->in(2))); + match(Set dst ($2 src1 src2)); + effect(TEMP_DEF dst, TEMP tmp); + ins_cost(SVE_COST); + format %{ "sve_uaddv $tmp, $src2\t# vector (sve) ($5)\n\t" + "umov $dst, $tmp, $5, 0\n\t" + "$7 $dst, $dst, $src1\t # add reduction $5" %} + ins_encode %{ + __ sve_uaddv(as_FloatRegister($tmp$$reg), __ $5, + ptrue, as_FloatRegister($src2$$reg)); + __ umov($dst$$Register, as_FloatRegister($tmp$$reg), __ $5, 0); + __ $7($dst$$Register, $dst$$Register, $src1$$Register); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl REDUCE_ADDF($1, $2, $3, $4 ) +dnl REDUCE_ADDF(insn_name, op_name, reg_dst, size) +define(`REDUCE_ADDF', ` +instruct $1($3 src1_dst, vReg src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set src1_dst ($2 src1_dst src2)); + ins_cost(SVE_COST); + format %{ "sve_fadda $src1_dst, $src1_dst, $src2\t# vector (sve) ($4)" %} + ins_encode %{ + __ sve_fadda(as_FloatRegister($src1_dst$$reg), __ $4, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// vector add reduction +REDUCE_ADD(reduce_addI, AddReductionVI, iRegINoSp, iRegIorL2I, S, T_INT, addw) +REDUCE_ADD(reduce_addL, AddReductionVL, iRegLNoSp, iRegL, D, T_LONG, add) +REDUCE_ADDF(reduce_addF, AddReductionVF, vRegF, S) +REDUCE_ADDF(reduce_addD, AddReductionVD, vRegD, D) + +dnl +dnl REDUCE_FMINMAX($1, $2, $3, $4, $5 ) +dnl REDUCE_FMINMAX(min_max, name_suffix, element_type, size, reg_src_dst) +define(`REDUCE_FMINMAX', ` +instruct reduce_$1$2($5 dst, $5 src1, vReg src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == $3 && + n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set dst (translit($1, `m', `M')ReductionV src1 src2)); + ins_cost(INSN_COST); + effect(TEMP_DEF dst); + format %{ "sve_f$1v $dst, $src2 # vector (sve) (S)\n\t" + "f$1s $dst, $dst, $src1\t # $1 reduction $2" %} + ins_encode %{ + __ 
sve_f$1v(as_FloatRegister($dst$$reg), __ $4, + ptrue, as_FloatRegister($src2$$reg)); + __ f`$1'translit($4, `SD', `sd')(as_FloatRegister($dst$$reg), as_FloatRegister($dst$$reg), as_FloatRegister($src1$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +// vector max reduction +REDUCE_FMINMAX(max, F, T_FLOAT, S, vRegF) +REDUCE_FMINMAX(max, D, T_DOUBLE, D, vRegD) + +// vector min reduction +REDUCE_FMINMAX(min, F, T_FLOAT, S, vRegF) +REDUCE_FMINMAX(min, D, T_DOUBLE, D, vRegD) + +dnl +dnl REPLICATE($1, $2, $3, $4, $5 ) +dnl REPLICATE(insn_name, op_name, reg_src, size, min_vec_len) +define(`REPLICATE', ` +instruct $1(vReg dst, $3 src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5); + match(Set dst ($2 src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) ($4)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ $4, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl REPLICATE_IMM8($1, $2, $3, $4, $5 ) +dnl REPLICATE_IMM8(insn_name, op_name, imm_type, size, min_vec_len) +define(`REPLICATE_IMM8', ` +instruct $1(vReg dst, $3 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5); + match(Set dst ($2 con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) ($4)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ $4, $con$$constant); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl FREPLICATE($1, $2, $3, $4, $5 ) +dnl FREPLICATE(insn_name, op_name, reg_src, size, min_vec_len) +define(`FREPLICATE', ` +instruct $1(vReg dst, $3 src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5); + match(Set dst ($2 src)); + ins_cost(SVE_COST); + format %{ "sve_cpy $dst, $src\t# vector (sve) ($4)" %} + ins_encode %{ + __ sve_cpy(as_FloatRegister($dst$$reg), __ $4, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// vector replicate +REPLICATE(replicateB, ReplicateB, iRegIorL2I, B, 16) +REPLICATE(replicateS, ReplicateS, iRegIorL2I, H, 8) +REPLICATE(replicateI, ReplicateI, iRegIorL2I, S, 4) +REPLICATE(replicateL, ReplicateL, iRegL, D, 2) + +REPLICATE_IMM8(replicateB_imm8, ReplicateB, immI8, B, 16) +REPLICATE_IMM8(replicateS_imm8, ReplicateS, immI8_shift8, H, 8) +REPLICATE_IMM8(replicateI_imm8, ReplicateI, immI8_shift8, S, 4) +REPLICATE_IMM8(replicateL_imm8, ReplicateL, immL8_shift8, D, 2) + +FREPLICATE(replicateF, ReplicateF, vRegF, S, 4) +FREPLICATE(replicateD, ReplicateD, vRegD, D, 2) +dnl +dnl VSHIFT_TRUE_PREDICATE($1, $2, $3, $4, $5 ) +dnl VSHIFT_TRUE_PREDICATE(insn_name, op_name, size, min_vec_len, insn) +define(`VSHIFT_TRUE_PREDICATE', ` +instruct $1(vReg dst, vReg shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $4); + match(Set dst ($2 dst shift)); + ins_cost(SVE_COST); + format %{ "$5 $dst, $dst, $shift\t# vector (sve) ($3)" %} + ins_encode %{ + __ $5(as_FloatRegister($dst$$reg), __ $3, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl VSHIFT_IMM_UNPREDICATE($1, $2, $3, $4, $5 ) +dnl VSHIFT_IMM_UNPREDICATE(insn_name, op_name, size, min_vec_len, insn) +define(`VSHIFT_IMM_UNPREDICATE', ` +instruct $1(vReg dst, vReg src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $4); + match(Set dst ($2 src shift)); + ins_cost(SVE_COST); + format %{ "$5 $dst, $src, $shift\t# vector (sve) ($3)" %} + ins_encode %{ + int con = (int)$shift$$constant;dnl +ifelse(eval(index(`$1', `vasr') == 0 || index(`$1', `vlsr') == 0), 1, ` + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), 
as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + }')dnl +ifelse(eval(index(`$1', `vasr') == 0), 1, `ifelse(eval(index(`$3', `B') == 0), 1, ` + if (con >= 8) con = 7;')ifelse(eval(index(`$3', `H') == 0), 1, ` + if (con >= 16) con = 15;')')dnl +ifelse(eval((index(`$1', `vlsl') == 0 || index(`$1', `vlsr') == 0) && (index(`$3', `B') == 0 || index(`$3', `H') == 0)), 1, ` + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + }') + __ $5(as_FloatRegister($dst$$reg), __ $3, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl VSHIFT_COUNT($1, $2, $3, $4 ) +dnl VSHIFT_COUNT(insn_name, size, min_vec_len, type) +define(`VSHIFT_COUNT', ` +instruct $1(vReg dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $3 && + ELEMENT_SHORT_CHAR($4, n)); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) ($2)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ $2, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// vector shift +VSHIFT_TRUE_PREDICATE(vasrB, RShiftVB, B, 16, sve_asr) +VSHIFT_TRUE_PREDICATE(vasrS, RShiftVS, H, 8, sve_asr) +VSHIFT_TRUE_PREDICATE(vasrI, RShiftVI, S, 4, sve_asr) +VSHIFT_TRUE_PREDICATE(vasrL, RShiftVL, D, 2, sve_asr) +VSHIFT_TRUE_PREDICATE(vlslB, LShiftVB, B, 16, sve_lsl) +VSHIFT_TRUE_PREDICATE(vlslS, LShiftVS, H, 8, sve_lsl) +VSHIFT_TRUE_PREDICATE(vlslI, LShiftVI, S, 4, sve_lsl) +VSHIFT_TRUE_PREDICATE(vlslL, LShiftVL, D, 2, sve_lsl) +VSHIFT_TRUE_PREDICATE(vlsrB, URShiftVB, B, 16, sve_lsr) +VSHIFT_TRUE_PREDICATE(vlsrS, URShiftVS, H, 8, sve_lsr) +VSHIFT_TRUE_PREDICATE(vlsrI, URShiftVI, S, 4, sve_lsr) +VSHIFT_TRUE_PREDICATE(vlsrL, URShiftVL, D, 2, sve_lsr) +VSHIFT_IMM_UNPREDICATE(vasrB_imm, RShiftVB, B, 16, sve_asr) +VSHIFT_IMM_UNPREDICATE(vasrS_imm, RShiftVS, H, 8, sve_asr) +VSHIFT_IMM_UNPREDICATE(vasrI_imm, RShiftVI, S, 4, sve_asr) +VSHIFT_IMM_UNPREDICATE(vasrL_imm, RShiftVL, D, 2, sve_asr) +VSHIFT_IMM_UNPREDICATE(vlsrB_imm, URShiftVB, B, 16, sve_lsr) +VSHIFT_IMM_UNPREDICATE(vlsrS_imm, URShiftVS, H, 8, sve_lsr) +VSHIFT_IMM_UNPREDICATE(vlsrI_imm, URShiftVI, S, 4, sve_lsr) +VSHIFT_IMM_UNPREDICATE(vlsrL_imm, URShiftVL, D, 2, sve_lsr) +VSHIFT_IMM_UNPREDICATE(vlslB_imm, LShiftVB, B, 16, sve_lsl) +VSHIFT_IMM_UNPREDICATE(vlslS_imm, LShiftVS, H, 8, sve_lsl) +VSHIFT_IMM_UNPREDICATE(vlslI_imm, LShiftVI, S, 4, sve_lsl) +VSHIFT_IMM_UNPREDICATE(vlslL_imm, LShiftVL, D, 2, sve_lsl) +VSHIFT_COUNT(vshiftcntB, B, 16, T_BYTE) +VSHIFT_COUNT(vshiftcntS, H, 8, T_SHORT) +VSHIFT_COUNT(vshiftcntI, S, 4, T_INT) +VSHIFT_COUNT(vshiftcntL, D, 2, T_LONG) + +// vector sqrt +UNARY_OP_TRUE_PREDICATE(vsqrtF, SqrtVF, S, 16, sve_fsqrt) +UNARY_OP_TRUE_PREDICATE(vsqrtD, SqrtVD, D, 16, sve_fsqrt) + +// vector sub +BINARY_OP_UNPREDICATED(vsubB, SubVB, B, 16, sve_sub) +BINARY_OP_UNPREDICATED(vsubS, SubVS, H, 8, sve_sub) +BINARY_OP_UNPREDICATED(vsubI, SubVI, S, 4, sve_sub) +BINARY_OP_UNPREDICATED(vsubL, SubVL, D, 2, sve_sub) +BINARY_OP_UNPREDICATED(vsubF, SubVF, S, 4, sve_fsub) +BINARY_OP_UNPREDICATED(vsubD, SubVD, D, 2, sve_fsub) diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.cpp b/src/hotspot/cpu/aarch64/assembler_aarch64.cpp index 8047ed8..32e5333 100644 --- a/src/hotspot/cpu/aarch64/assembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.cpp @@ -96,662 +96,746 @@ void entry(CodeBuffer *cb) { __ bind(back); // ArithOp - __ add(r15, r0, r24, 
Assembler::LSL, 59); // add x15, x0, x24, LSL #59 - __ sub(r17, r22, r22, Assembler::ASR, 13); // sub x17, x22, x22, ASR #13 - __ adds(r10, r26, r28, Assembler::LSL, 57); // adds x10, x26, x28, LSL #57 - __ subs(r25, r16, r24, Assembler::LSL, 18); // subs x25, x16, x24, LSL #18 - __ addw(r8, r5, r28, Assembler::LSL, 7); // add w8, w5, w28, LSL #7 - __ subw(r8, r28, r1, Assembler::ASR, 28); // sub w8, w28, w1, ASR #28 - __ addsw(r12, r2, r1, Assembler::LSL, 0); // adds w12, w2, w1, LSL #0 - __ subsw(r23, r5, r17, Assembler::LSR, 25); // subs w23, w5, w17, LSR #25 - __ andr(r21, r12, r13, Assembler::LSL, 21); // and x21, x12, x13, LSL #21 - __ orr(r21, r15, r23, Assembler::ASR, 36); // orr x21, x15, x23, ASR #36 - __ eor(r22, r24, r27, Assembler::ASR, 48); // eor x22, x24, x27, ASR #48 - __ ands(r22, r15, r2, Assembler::ASR, 52); // ands x22, x15, x2, ASR #52 - __ andw(r1, r17, r24, Assembler::ASR, 3); // and w1, w17, w24, ASR #3 - __ orrw(r5, r2, r6, Assembler::ASR, 11); // orr w5, w2, w6, ASR #11 - __ eorw(r23, r1, r5, Assembler::LSR, 12); // eor w23, w1, w5, LSR #12 - __ andsw(r0, r12, r14, Assembler::ASR, 20); // ands w0, w12, w14, ASR #20 - __ bic(r1, r6, r2, Assembler::LSR, 7); // bic x1, x6, x2, LSR #7 - __ orn(r30, r8, r4, Assembler::LSL, 47); // orn x30, x8, x4, LSL #47 - __ eon(r17, r22, r20, Assembler::ASR, 53); // eon x17, x22, x20, ASR #53 - __ bics(r29, r15, r5, Assembler::ASR, 36); // bics x29, x15, x5, ASR #36 - __ bicw(r30, r23, r29, Assembler::LSR, 27); // bic w30, w23, w29, LSR #27 - __ ornw(r12, r29, r2, Assembler::LSL, 20); // orn w12, w29, w2, LSL #20 - __ eonw(r7, r12, r6, Assembler::ASR, 4); // eon w7, w12, w6, ASR #4 - __ bicsw(r16, r13, r7, Assembler::ASR, 21); // bics w16, w13, w7, ASR #21 + __ add(r27, r27, r14, Assembler::ASR, 25); // add x27, x27, x14, ASR #25 + __ sub(r4, r11, r17, Assembler::LSR, 10); // sub x4, x11, x17, LSR #10 + __ adds(r7, r17, r25, Assembler::ASR, 33); // adds x7, x17, x25, ASR #33 + __ subs(r13, r22, r20, Assembler::ASR, 5); // subs x13, x22, x20, ASR #5 + __ addw(r10, r28, r3, Assembler::ASR, 16); // add w10, w28, w3, ASR #16 + __ subw(r21, r2, r6, Assembler::LSR, 15); // sub w21, w2, w6, LSR #15 + __ addsw(r6, r0, r27, Assembler::ASR, 9); // adds w6, w0, w27, ASR #9 + __ subsw(r5, r27, r8, Assembler::ASR, 10); // subs w5, w27, w8, ASR #10 + __ andr(r12, r4, r7, Assembler::ASR, 39); // and x12, x4, x7, ASR #39 + __ orr(r21, r27, r22, Assembler::LSL, 50); // orr x21, x27, x22, LSL #50 + __ eor(r3, r21, r0, Assembler::ASR, 46); // eor x3, x21, x0, ASR #46 + __ ands(r21, r10, r5, Assembler::ASR, 22); // ands x21, x10, x5, ASR #22 + __ andw(r13, r21, r29, Assembler::LSL, 22); // and w13, w21, w29, LSL #22 + __ orrw(r17, r10, r16, Assembler::LSR, 18); // orr w17, w10, w16, LSR #18 + __ eorw(r16, r7, r23, Assembler::ASR, 27); // eor w16, w7, w23, ASR #27 + __ andsw(r10, r6, r12, Assembler::ASR, 12); // ands w10, w6, w12, ASR #12 + __ bic(r19, r25, r7, Assembler::LSL, 22); // bic x19, x25, x7, LSL #22 + __ orn(r25, r2, r7, Assembler::LSL, 53); // orn x25, x2, x7, LSL #53 + __ eon(r9, r23, r23, Assembler::ASR, 3); // eon x9, x23, x23, ASR #3 + __ bics(r5, r6, r13, Assembler::ASR, 50); // bics x5, x6, x13, ASR #50 + __ bicw(r15, r21, r10, Assembler::LSL, 9); // bic w15, w21, w10, LSL #9 + __ ornw(r17, r21, r30, Assembler::ASR, 1); // orn w17, w21, w30, ASR #1 + __ eonw(r7, r28, r29, Assembler::LSL, 19); // eon w7, w28, w29, LSL #19 + __ bicsw(r25, r22, r22, Assembler::ASR, 12); // bics w25, w22, w22, ASR #12 // AddSubImmOp - __ addw(r5, r17, 
726u); // add w5, w17, #726 - __ addsw(r10, r16, 347u); // adds w10, w16, #347 - __ subw(r26, r5, 978u); // sub w26, w5, #978 - __ subsw(r21, r24, 689u); // subs w21, w24, #689 - __ add(r10, r16, 987u); // add x10, x16, #987 - __ adds(r15, r15, 665u); // adds x15, x15, #665 - __ sub(r24, r20, 39u); // sub x24, x20, #39 - __ subs(r10, r13, 76u); // subs x10, x13, #76 + __ addw(r6, r26, 788u); // add w6, w26, #788 + __ addsw(r3, r17, 490u); // adds w3, w17, #490 + __ subw(r5, r21, 507u); // sub w5, w21, #507 + __ subsw(r22, r27, 883u); // subs w22, w27, #883 + __ add(r12, r8, 244u); // add x12, x8, #244 + __ adds(r29, r8, 928u); // adds x29, x8, #928 + __ sub(r26, r3, 642u); // sub x26, x3, #642 + __ subs(r29, r15, 628u); // subs x29, x15, #628 // LogicalImmOp - __ andw(r7, r19, 8388600ull); // and w7, w19, #0x7ffff8 - __ orrw(r5, r17, 4026535935ull); // orr w5, w17, #0xf0000fff - __ eorw(r16, r28, 4186112ull); // eor w16, w28, #0x3fe000 - __ andsw(r14, r24, 7168ull); // ands w14, w24, #0x1c00 - __ andr(r14, r27, 18446744073709543551ull); // and x14, x27, #0xffffffffffffe07f - __ orr(r12, r11, 576456354256912384ull); // orr x12, x11, #0x7fffc0000000000 - __ eor(r2, r0, 18437736874454811647ull); // eor x2, x0, #0xffe00000000003ff - __ ands(r13, r20, 18446744073642573823ull); // ands x13, x20, #0xfffffffffc01ffff + __ andw(r21, r30, 4287102855ull); // and w21, w30, #0xff87ff87 + __ orrw(r21, r12, 2139127680ull); // orr w21, w12, #0x7f807f80 + __ eorw(r11, r17, 3233857728ull); // eor w11, w17, #0xc0c0c0c0 + __ andsw(r26, r30, 1056980736ull); // ands w26, w30, #0x3f003f00 + __ andr(r25, r23, 18445618178097414144ull); // and x25, x23, #0xfffc0000fffc0000 + __ orr(r30, r14, 16429131440647569407ull); // orr x30, x14, #0xe3ffffffffffffff + __ eor(r26, r4, 18446744073172942847ull); // eor x26, x4, #0xffffffffe003ffff + __ ands(r26, r0, 18446181398634037247ull); // ands x26, x0, #0xfffe003fffffffff // AbsOp - __ b(__ pc()); // b . - __ b(back); // b back - __ b(forth); // b forth - __ bl(__ pc()); // bl . - __ bl(back); // bl back - __ bl(forth); // bl forth + __ b(__ pc()); // b . + __ b(back); // b back + __ b(forth); // b forth + __ bl(__ pc()); // bl . + __ bl(back); // bl back + __ bl(forth); // bl forth // RegAndAbsOp - __ cbzw(r15, __ pc()); // cbz w15, . - __ cbzw(r15, back); // cbz w15, back - __ cbzw(r15, forth); // cbz w15, forth - __ cbnzw(r28, __ pc()); // cbnz w28, . - __ cbnzw(r28, back); // cbnz w28, back - __ cbnzw(r28, forth); // cbnz w28, forth - __ cbz(r27, __ pc()); // cbz x27, . - __ cbz(r27, back); // cbz x27, back - __ cbz(r27, forth); // cbz x27, forth - __ cbnz(r0, __ pc()); // cbnz x0, . - __ cbnz(r0, back); // cbnz x0, back - __ cbnz(r0, forth); // cbnz x0, forth - __ adr(r13, __ pc()); // adr x13, . - __ adr(r13, back); // adr x13, back - __ adr(r13, forth); // adr x13, forth - __ _adrp(r3, __ pc()); // adrp x3, . + __ cbzw(r28, __ pc()); // cbz w28, . + __ cbzw(r28, back); // cbz w28, back + __ cbzw(r28, forth); // cbz w28, forth + __ cbnzw(r17, __ pc()); // cbnz w17, . + __ cbnzw(r17, back); // cbnz w17, back + __ cbnzw(r17, forth); // cbnz w17, forth + __ cbz(r25, __ pc()); // cbz x25, . + __ cbz(r25, back); // cbz x25, back + __ cbz(r25, forth); // cbz x25, forth + __ cbnz(r2, __ pc()); // cbnz x2, . + __ cbnz(r2, back); // cbnz x2, back + __ cbnz(r2, forth); // cbnz x2, forth + __ adr(r29, __ pc()); // adr x29, . + __ adr(r29, back); // adr x29, back + __ adr(r29, forth); // adr x29, forth + __ _adrp(r29, __ pc()); // adrp x29, . 
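+    // Differences throughout this generated block simply come from re-running
+    // aarch64-asmtest.py (which picks registers and immediates at random) after
+    // the SVE additions; the set of instruction forms exercised is unchanged
+    // apart from the new SVE entries added further down in this file.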
// RegImmAbsOp - __ tbz(r21, 7, __ pc()); // tbz x21, #7, . - __ tbz(r21, 7, back); // tbz x21, #7, back - __ tbz(r21, 7, forth); // tbz x21, #7, forth - __ tbnz(r15, 9, __ pc()); // tbnz x15, #9, . - __ tbnz(r15, 9, back); // tbnz x15, #9, back - __ tbnz(r15, 9, forth); // tbnz x15, #9, forth + __ tbz(r6, 6, __ pc()); // tbz x6, #6, . + __ tbz(r6, 6, back); // tbz x6, #6, back + __ tbz(r6, 6, forth); // tbz x6, #6, forth + __ tbnz(r21, 2, __ pc()); // tbnz x21, #2, . + __ tbnz(r21, 2, back); // tbnz x21, #2, back + __ tbnz(r21, 2, forth); // tbnz x21, #2, forth // MoveWideImmOp - __ movnw(r14, 2655, 16); // movn w14, #2655, lsl 16 - __ movzw(r17, 7642, 0); // movz w17, #7642, lsl 0 - __ movkw(r27, 11381, 0); // movk w27, #11381, lsl 0 - __ movn(r1, 19524, 32); // movn x1, #19524, lsl 32 - __ movz(r20, 21126, 16); // movz x20, #21126, lsl 16 - __ movk(r20, 32462, 16); // movk x20, #32462, lsl 16 + __ movnw(r8, 2735, 0); // movn w8, #2735, lsl 0 + __ movzw(r11, 11185, 16); // movz w11, #11185, lsl 16 + __ movkw(r26, 26028, 16); // movk w26, #26028, lsl 16 + __ movn(r13, 13140, 0); // movn x13, #13140, lsl 0 + __ movz(r6, 5829, 48); // movz x6, #5829, lsl 48 + __ movk(r16, 10786, 32); // movk x16, #10786, lsl 32 // BitfieldOp - __ sbfm(r13, r2, 28, 20); // sbfm x13, x2, #28, #20 - __ bfmw(r16, r20, 19, 15); // bfm w16, w20, #19, #15 - __ ubfmw(r11, r11, 9, 6); // ubfm w11, w11, #9, #6 - __ sbfm(r2, r4, 25, 21); // sbfm x2, x4, #25, #21 - __ bfm(r13, r16, 2, 19); // bfm x13, x16, #2, #19 - __ ubfm(r8, r25, 8, 5); // ubfm x8, x25, #8, #5 + __ sbfm(r30, r30, 17, 26); // sbfm x30, x30, #17, #26 + __ bfmw(r4, r9, 15, 12); // bfm w4, w9, #15, #12 + __ ubfmw(r15, r20, 1, 5); // ubfm w15, w20, #1, #5 + __ sbfm(r27, r8, 19, 14); // sbfm x27, x8, #19, #14 + __ bfm(r30, r0, 21, 29); // bfm x30, x0, #21, #29 + __ ubfm(r27, r26, 22, 11); // ubfm x27, x26, #22, #11 // ExtractOp - __ extrw(r29, r27, r10, 14); // extr w29, w27, w10, #14 - __ extr(r6, r20, r6, 24); // extr x6, x20, x6, #24 + __ extrw(r12, r12, r6, 27); // extr w12, w12, w6, #27 + __ extr(r19, r13, r22, 45); // extr x19, x13, x22, #45 // CondBranchOp - __ br(Assembler::EQ, __ pc()); // b.EQ . - __ br(Assembler::EQ, back); // b.EQ back - __ br(Assembler::EQ, forth); // b.EQ forth - __ br(Assembler::NE, __ pc()); // b.NE . - __ br(Assembler::NE, back); // b.NE back - __ br(Assembler::NE, forth); // b.NE forth - __ br(Assembler::HS, __ pc()); // b.HS . - __ br(Assembler::HS, back); // b.HS back - __ br(Assembler::HS, forth); // b.HS forth - __ br(Assembler::CS, __ pc()); // b.CS . - __ br(Assembler::CS, back); // b.CS back - __ br(Assembler::CS, forth); // b.CS forth - __ br(Assembler::LO, __ pc()); // b.LO . - __ br(Assembler::LO, back); // b.LO back - __ br(Assembler::LO, forth); // b.LO forth - __ br(Assembler::CC, __ pc()); // b.CC . - __ br(Assembler::CC, back); // b.CC back - __ br(Assembler::CC, forth); // b.CC forth - __ br(Assembler::MI, __ pc()); // b.MI . - __ br(Assembler::MI, back); // b.MI back - __ br(Assembler::MI, forth); // b.MI forth - __ br(Assembler::PL, __ pc()); // b.PL . - __ br(Assembler::PL, back); // b.PL back - __ br(Assembler::PL, forth); // b.PL forth - __ br(Assembler::VS, __ pc()); // b.VS . - __ br(Assembler::VS, back); // b.VS back - __ br(Assembler::VS, forth); // b.VS forth - __ br(Assembler::VC, __ pc()); // b.VC . - __ br(Assembler::VC, back); // b.VC back - __ br(Assembler::VC, forth); // b.VC forth - __ br(Assembler::HI, __ pc()); // b.HI . 
- __ br(Assembler::HI, back); // b.HI back - __ br(Assembler::HI, forth); // b.HI forth - __ br(Assembler::LS, __ pc()); // b.LS . - __ br(Assembler::LS, back); // b.LS back - __ br(Assembler::LS, forth); // b.LS forth - __ br(Assembler::GE, __ pc()); // b.GE . - __ br(Assembler::GE, back); // b.GE back - __ br(Assembler::GE, forth); // b.GE forth - __ br(Assembler::LT, __ pc()); // b.LT . - __ br(Assembler::LT, back); // b.LT back - __ br(Assembler::LT, forth); // b.LT forth - __ br(Assembler::GT, __ pc()); // b.GT . - __ br(Assembler::GT, back); // b.GT back - __ br(Assembler::GT, forth); // b.GT forth - __ br(Assembler::LE, __ pc()); // b.LE . - __ br(Assembler::LE, back); // b.LE back - __ br(Assembler::LE, forth); // b.LE forth - __ br(Assembler::AL, __ pc()); // b.AL . - __ br(Assembler::AL, back); // b.AL back - __ br(Assembler::AL, forth); // b.AL forth - __ br(Assembler::NV, __ pc()); // b.NV . - __ br(Assembler::NV, back); // b.NV back - __ br(Assembler::NV, forth); // b.NV forth + __ br(Assembler::EQ, __ pc()); // b.EQ . + __ br(Assembler::EQ, back); // b.EQ back + __ br(Assembler::EQ, forth); // b.EQ forth + __ br(Assembler::NE, __ pc()); // b.NE . + __ br(Assembler::NE, back); // b.NE back + __ br(Assembler::NE, forth); // b.NE forth + __ br(Assembler::HS, __ pc()); // b.HS . + __ br(Assembler::HS, back); // b.HS back + __ br(Assembler::HS, forth); // b.HS forth + __ br(Assembler::CS, __ pc()); // b.CS . + __ br(Assembler::CS, back); // b.CS back + __ br(Assembler::CS, forth); // b.CS forth + __ br(Assembler::LO, __ pc()); // b.LO . + __ br(Assembler::LO, back); // b.LO back + __ br(Assembler::LO, forth); // b.LO forth + __ br(Assembler::CC, __ pc()); // b.CC . + __ br(Assembler::CC, back); // b.CC back + __ br(Assembler::CC, forth); // b.CC forth + __ br(Assembler::MI, __ pc()); // b.MI . + __ br(Assembler::MI, back); // b.MI back + __ br(Assembler::MI, forth); // b.MI forth + __ br(Assembler::PL, __ pc()); // b.PL . + __ br(Assembler::PL, back); // b.PL back + __ br(Assembler::PL, forth); // b.PL forth + __ br(Assembler::VS, __ pc()); // b.VS . + __ br(Assembler::VS, back); // b.VS back + __ br(Assembler::VS, forth); // b.VS forth + __ br(Assembler::VC, __ pc()); // b.VC . + __ br(Assembler::VC, back); // b.VC back + __ br(Assembler::VC, forth); // b.VC forth + __ br(Assembler::HI, __ pc()); // b.HI . + __ br(Assembler::HI, back); // b.HI back + __ br(Assembler::HI, forth); // b.HI forth + __ br(Assembler::LS, __ pc()); // b.LS . + __ br(Assembler::LS, back); // b.LS back + __ br(Assembler::LS, forth); // b.LS forth + __ br(Assembler::GE, __ pc()); // b.GE . + __ br(Assembler::GE, back); // b.GE back + __ br(Assembler::GE, forth); // b.GE forth + __ br(Assembler::LT, __ pc()); // b.LT . + __ br(Assembler::LT, back); // b.LT back + __ br(Assembler::LT, forth); // b.LT forth + __ br(Assembler::GT, __ pc()); // b.GT . + __ br(Assembler::GT, back); // b.GT back + __ br(Assembler::GT, forth); // b.GT forth + __ br(Assembler::LE, __ pc()); // b.LE . + __ br(Assembler::LE, back); // b.LE back + __ br(Assembler::LE, forth); // b.LE forth + __ br(Assembler::AL, __ pc()); // b.AL . + __ br(Assembler::AL, back); // b.AL back + __ br(Assembler::AL, forth); // b.AL forth + __ br(Assembler::NV, __ pc()); // b.NV . 
+ __ br(Assembler::NV, back); // b.NV back + __ br(Assembler::NV, forth); // b.NV forth // ImmOp - __ svc(26948); // svc #26948 - __ hvc(29998); // hvc #29998 - __ smc(10437); // smc #10437 - __ brk(30290); // brk #30290 - __ hlt(20851); // hlt #20851 + __ svc(16084); // svc #16084 + __ hvc(5802); // hvc #5802 + __ smc(14039); // smc #14039 + __ brk(11389); // brk #11389 + __ hlt(27339); // hlt #27339 // Op - __ nop(); // nop - __ eret(); // eret - __ drps(); // drps - __ isb(); // isb + __ nop(); // nop + __ eret(); // eret + __ drps(); // drps + __ isb(); // isb // SystemOp - __ dsb(Assembler::LD); // dsb LD - __ dmb(Assembler::ISH); // dmb ISH + __ dsb(Assembler::OSH); // dsb OSH + __ dmb(Assembler::NSHST); // dmb NSHST // OneRegOp - __ br(r9); // br x9 - __ blr(r9); // blr x9 + __ br(r11); // br x11 + __ blr(r25); // blr x25 // LoadStoreExclusiveOp - __ stxr(r2, r29, r11); // stxr w2, x29, [x11] - __ stlxr(r22, r5, r28); // stlxr w22, x5, [x28] - __ ldxr(r14, r20); // ldxr x14, [x20] - __ ldaxr(r29, r19); // ldaxr x29, [x19] - __ stlr(r6, r21); // stlr x6, [x21] - __ ldar(r19, r3); // ldar x19, [x3] + __ stxr(r14, r15, r13); // stxr w14, x15, [x13] + __ stlxr(r30, r25, r1); // stlxr w30, x25, [x1] + __ ldxr(r13, r3); // ldxr x13, [x3] + __ ldaxr(r8, r21); // ldaxr x8, [x21] + __ stlr(r13, r28); // stlr x13, [x28] + __ ldar(r8, r30); // ldar x8, [x30] // LoadStoreExclusiveOp - __ stxrw(r12, r3, r27); // stxr w12, w3, [x27] - __ stlxrw(r17, r26, r15); // stlxr w17, w26, [x15] - __ ldxrw(r13, r14); // ldxr w13, [x14] - __ ldaxrw(r12, r26); // ldaxr w12, [x26] - __ stlrw(r8, r17); // stlr w8, [x17] - __ ldarw(r21, r30); // ldar w21, [x30] + __ stxrw(r13, r17, r28); // stxr w13, w17, [x28] + __ stlxrw(r21, r17, r19); // stlxr w21, w17, [x19] + __ ldxrw(r3, r8); // ldxr w3, [x8] + __ ldaxrw(r29, r21); // ldaxr w29, [x21] + __ stlrw(r9, r24); // stlr w9, [x24] + __ ldarw(r2, r6); // ldar w2, [x6] // LoadStoreExclusiveOp - __ stxrh(r0, r15, r11); // stxrh w0, w15, [x11] - __ stlxrh(r17, r20, r1); // stlxrh w17, w20, [x1] - __ ldxrh(r29, r8); // ldxrh w29, [x8] - __ ldaxrh(r17, r12); // ldaxrh w17, [x12] - __ stlrh(r11, r4); // stlrh w11, [x4] - __ ldarh(r16, r4); // ldarh w16, [x4] + __ stxrh(r12, r20, r16); // stxrh w12, w20, [x16] + __ stlxrh(r2, r28, r5); // stlxrh w2, w28, [x5] + __ ldxrh(r1, r3); // ldxrh w1, [x3] + __ ldaxrh(r24, r13); // ldaxrh w24, [x13] + __ stlrh(r15, r25); // stlrh w15, [x25] + __ ldarh(r10, r20); // ldarh w10, [x20] // LoadStoreExclusiveOp - __ stxrb(r14, r5, r4); // stxrb w14, w5, [x4] - __ stlxrb(r27, r17, r16); // stlxrb w27, w17, [x16] - __ ldxrb(r6, r27); // ldxrb w6, [x27] - __ ldaxrb(r27, r24); // ldaxrb w27, [x24] - __ stlrb(r10, r20); // stlrb w10, [x20] - __ ldarb(r9, r26); // ldarb w9, [x26] + __ stxrb(r5, r16, r13); // stxrb w5, w16, [x13] + __ stlxrb(r10, r15, r17); // stlxrb w10, w15, [x17] + __ ldxrb(r17, r19); // ldxrb w17, [x19] + __ ldaxrb(r30, r9); // ldaxrb w30, [x9] + __ stlrb(r20, r24); // stlrb w20, [x24] + __ ldarb(r10, r4); // ldarb w10, [x4] // LoadStoreExclusiveOp - __ ldxp(r5, r30, r28); // ldxp x5, x30, [x28] - __ ldaxp(r10, r9, r19); // ldaxp x10, x9, [x19] - __ stxp(r11, r16, r21, r12); // stxp w11, x16, x21, [x12] - __ stlxp(r10, r20, r23, r4); // stlxp w10, x20, x23, [x4] + __ ldxp(r25, r8, r9); // ldxp x25, x8, [x9] + __ ldaxp(r7, r10, r16); // ldaxp x7, x10, [x16] + __ stxp(r25, r16, r11, r9); // stxp w25, x16, x11, [x9] + __ stlxp(r7, r5, r9, r15); // stlxp w7, x5, x9, [x15] // LoadStoreExclusiveOp - __ ldxpw(r22, r1, r0); // 
ldxp w22, w1, [x0] - __ ldaxpw(r3, r1, r8); // ldaxp w3, w1, [x8] - __ stxpw(r0, r9, r23, r30); // stxp w0, w9, w23, [x30] - __ stlxpw(r23, r0, r17, r11); // stlxp w23, w0, w17, [x11] + __ ldxpw(r12, r4, r3); // ldxp w12, w4, [x3] + __ ldaxpw(r17, r2, r5); // ldaxp w17, w2, [x5] + __ stxpw(r4, r8, r24, r6); // stxp w4, w8, w24, [x6] + __ stlxpw(r4, r12, r25, r16); // stlxp w4, w12, w25, [x16] -// base_plus_unscaled_offset +// base_plus_unscaled_offset // LoadStoreOp - __ str(r6, Address(r10, -31)); // str x6, [x10, -31] - __ strw(r7, Address(r0, -5)); // str w7, [x0, -5] - __ strb(r5, Address(r16, -13)); // strb w5, [x16, -13] - __ strh(r30, Address(r19, 31)); // strh w30, [x19, 31] - __ ldr(r16, Address(r9, 119)); // ldr x16, [x9, 119] - __ ldrw(r8, Address(r16, 59)); // ldr w8, [x16, 59] - __ ldrb(r10, Address(r12, -7)); // ldrb w10, [x12, -7] - __ ldrh(r14, Address(r9, -38)); // ldrh w14, [x9, -38] - __ ldrsb(r24, Address(r30, -8)); // ldrsb x24, [x30, -8] - __ ldrsh(r7, Address(r4, 23)); // ldrsh x7, [x4, 23] - __ ldrshw(r17, Address(r14, -39)); // ldrsh w17, [x14, -39] - __ ldrsw(r11, Address(r27, -31)); // ldrsw x11, [x27, -31] - __ ldrd(v12, Address(r7, 65)); // ldr d12, [x7, 65] - __ ldrs(v0, Address(r16, -2)); // ldr s0, [x16, -2] - __ strd(v13, Address(r23, -161)); // str d13, [x23, -161] - __ strs(v21, Address(r3, -62)); // str s21, [x3, -62] - -// pre + __ str(r14, Address(r30, 11)); // str x14, [x30, 11] + __ strw(r6, Address(r29, -97)); // str w6, [x29, -97] + __ strb(r2, Address(r11, -7)); // strb w2, [x11, -7] + __ strh(r20, Address(r8, -22)); // strh w20, [x8, -22] + __ ldr(r20, Address(r29, -29)); // ldr x20, [x29, -29] + __ ldrw(r9, Address(r0, -26)); // ldr w9, [x0, -26] + __ ldrb(r14, Address(r2, 8)); // ldrb w14, [x2, 8] + __ ldrh(r13, Address(r1, -24)); // ldrh w13, [x1, -24] + __ ldrsb(r13, Address(r17, -7)); // ldrsb x13, [x17, -7] + __ ldrsh(r17, Address(r7, -11)); // ldrsh x17, [x7, -11] + __ ldrshw(r3, Address(r8, -60)); // ldrsh w3, [x8, -60] + __ ldrsw(r14, Address(r12, 12)); // ldrsw x14, [x12, 12] + __ ldrd(v5, Address(r21, -235)); // ldr d5, [x21, -235] + __ ldrs(v9, Address(r0, -54)); // ldr s9, [x0, -54] + __ strd(v15, Address(r8, 95)); // str d15, [x8, 95] + __ strs(v22, Address(r0, -16)); // str s22, [x0, -16] + +// pre // LoadStoreOp - __ str(r2, Address(__ pre(r5, 100))); // str x2, [x5, 100]! - __ strw(r9, Address(__ pre(r1, -92))); // str w9, [x1, -92]! - __ strb(r27, Address(__ pre(r30, -5))); // strb w27, [x30, -5]! - __ strh(r27, Address(__ pre(r15, 12))); // strh w27, [x15, 12]! - __ ldr(r4, Address(__ pre(r17, -212))); // ldr x4, [x17, -212]! - __ ldrw(r21, Address(__ pre(r23, 30))); // ldr w21, [x23, 30]! - __ ldrb(r13, Address(__ pre(r17, -7))); // ldrb w13, [x17, -7]! - __ ldrh(r25, Address(__ pre(r0, -50))); // ldrh w25, [x0, -50]! - __ ldrsb(r1, Address(__ pre(r21, -21))); // ldrsb x1, [x21, -21]! - __ ldrsh(r28, Address(__ pre(r21, -54))); // ldrsh x28, [x21, -54]! - __ ldrshw(r11, Address(__ pre(r4, 2))); // ldrsh w11, [x4, 2]! - __ ldrsw(r17, Address(__ pre(r9, 61))); // ldrsw x17, [x9, 61]! - __ ldrd(v29, Address(__ pre(r19, 39))); // ldr d29, [x19, 39]! - __ ldrs(v22, Address(__ pre(r22, -85))); // ldr s22, [x22, -85]! - __ strd(v9, Address(__ pre(r25, -225))); // str d9, [x25, -225]! - __ strs(v9, Address(__ pre(r2, -15))); // str s9, [x2, -15]! - -// post + __ str(r23, Address(__ pre(r4, -239))); // str x23, [x4, -239]! + __ strw(r17, Address(__ pre(r0, -122))); // str w17, [x0, -122]! 
+ __ strb(r26, Address(__ pre(r9, -5))); // strb w26, [x9, -5]! + __ strh(r21, Address(__ pre(r14, -8))); // strh w21, [x14, -8]! + __ ldr(r8, Address(__ pre(r7, 23))); // ldr x8, [x7, 23]! + __ ldrw(r12, Address(__ pre(r8, 22))); // ldr w12, [x8, 22]! + __ ldrb(r27, Address(__ pre(r28, 6))); // ldrb w27, [x28, 6]! + __ ldrh(r6, Address(__ pre(r19, -58))); // ldrh w6, [x19, -58]! + __ ldrsb(r7, Address(__ pre(r5, -20))); // ldrsb x7, [x5, -20]! + __ ldrsh(r22, Address(__ pre(r17, -32))); // ldrsh x22, [x17, -32]! + __ ldrshw(r17, Address(__ pre(r13, -2))); // ldrsh w17, [x13, -2]! + __ ldrsw(r29, Address(__ pre(r4, 22))); // ldrsw x29, [x4, 22]! + __ ldrd(v8, Address(__ pre(r28, -78))); // ldr d8, [x28, -78]! + __ ldrs(v23, Address(__ pre(r11, -5))); // ldr s23, [x11, -5]! + __ strd(v9, Address(__ pre(r20, -23))); // str d9, [x20, -23]! + __ strs(v5, Address(__ pre(r3, -103))); // str s5, [x3, -103]! + +// post // LoadStoreOp - __ str(r13, Address(__ post(r23, -66))); // str x13, [x23], -66 - __ strw(r17, Address(__ post(r16, 10))); // str w17, [x16], 10 - __ strb(r1, Address(__ post(r14, -32))); // strb w1, [x14], -32 - __ strh(r17, Address(__ post(r0, 6))); // strh w17, [x0], 6 - __ ldr(r27, Address(__ post(r25, -172))); // ldr x27, [x25], -172 - __ ldrw(r13, Address(__ post(r25, -38))); // ldr w13, [x25], -38 - __ ldrb(r11, Address(__ post(r25, -29))); // ldrb w11, [x25], -29 - __ ldrh(r30, Address(__ post(r5, 20))); // ldrh w30, [x5], 20 - __ ldrsb(r9, Address(__ post(r7, -7))); // ldrsb x9, [x7], -7 - __ ldrsh(r0, Address(__ post(r3, -62))); // ldrsh x0, [x3], -62 - __ ldrshw(r7, Address(__ post(r14, 31))); // ldrsh w7, [x14], 31 - __ ldrsw(r17, Address(__ post(r27, 39))); // ldrsw x17, [x27], 39 - __ ldrd(v17, Address(__ post(r4, -235))); // ldr d17, [x4], -235 - __ ldrs(v26, Address(__ post(r21, 34))); // ldr s26, [x21], 34 - __ strd(v5, Address(__ post(r17, -57))); // str d5, [x17], -57 - __ strs(v13, Address(__ post(r13, -109))); // str s13, [x13], -109 - -// base_plus_reg + __ str(r13, Address(__ post(r2, 32))); // str x13, [x2], 32 + __ strw(r30, Address(__ post(r19, 57))); // str w30, [x19], 57 + __ strb(r29, Address(__ post(r1, 3))); // strb w29, [x1], 3 + __ strh(r10, Address(__ post(r26, -17))); // strh w10, [x26], -17 + __ ldr(r15, Address(__ post(r1, -12))); // ldr x15, [x1], -12 + __ ldrw(r1, Address(__ post(r5, -6))); // ldr w1, [x5], -6 + __ ldrb(r24, Address(__ post(r15, -14))); // ldrb w24, [x15], -14 + __ ldrh(r29, Address(__ post(r17, -5))); // ldrh w29, [x17], -5 + __ ldrsb(r4, Address(__ post(r15, -17))); // ldrsb x4, [x15], -17 + __ ldrsh(r6, Address(__ post(r2, -54))); // ldrsh x6, [x2], -54 + __ ldrshw(r27, Address(__ post(r27, 18))); // ldrsh w27, [x27], 18 + __ ldrsw(r9, Address(__ post(r25, -77))); // ldrsw x9, [x25], -77 + __ ldrd(v21, Address(__ post(r29, -176))); // ldr d21, [x29], -176 + __ ldrs(v30, Address(__ post(r9, -50))); // ldr s30, [x9], -50 + __ strd(v2, Address(__ post(r12, -46))); // str d2, [x12], -46 + __ strs(v7, Address(__ post(r21, -59))); // str s7, [x21], -59 + +// base_plus_reg // LoadStoreOp - __ str(r6, Address(r16, r4, Address::sxtw(3))); // str x6, [x16, w4, sxtw #3] - __ strw(r9, Address(r24, r20, Address::sxtw(2))); // str w9, [x24, w20, sxtw #2] - __ strb(r3, Address(r29, r3, Address::lsl(0))); // strb w3, [x29, x3, lsl #0] - __ strh(r10, Address(r17, r30, Address::lsl(0))); // strh w10, [x17, x30, lsl #0] - __ ldr(r27, Address(r11, r7, Address::uxtw(0))); // ldr x27, [x11, w7, uxtw #0] - __ ldrw(r14, Address(r15, r25, 
Address::uxtw(0))); // ldr w14, [x15, w25, uxtw #0] - __ ldrb(r24, Address(r14, r19, Address::lsl(0))); // ldrb w24, [x14, x19, lsl #0] - __ ldrh(r16, Address(r0, r6, Address::sxtw(0))); // ldrh w16, [x0, w6, sxtw #0] - __ ldrsb(r10, Address(r12, r12, Address::sxtw(0))); // ldrsb x10, [x12, w12, sxtw #0] - __ ldrsh(r26, Address(r12, r16, Address::uxtw(0))); // ldrsh x26, [x12, w16, uxtw #0] - __ ldrshw(r26, Address(r0, r14, Address::lsl(1))); // ldrsh w26, [x0, x14, lsl #1] - __ ldrsw(r17, Address(r11, r27, Address::sxtx(2))); // ldrsw x17, [x11, x27, sxtx #2] - __ ldrd(v3, Address(r0, r19, Address::sxtw(3))); // ldr d3, [x0, w19, sxtw #3] - __ ldrs(v26, Address(r15, r9, Address::lsl(2))); // ldr s26, [x15, x9, lsl #2] - __ strd(v11, Address(r13, r16, Address::sxtx(0))); // str d11, [x13, x16, sxtx #0] - __ strs(v26, Address(r19, r21, Address::uxtw(2))); // str s26, [x19, w21, uxtw #2] - -// base_plus_scaled_offset + __ str(r12, Address(r12, r23, Address::uxtw(3))); // str x12, [x12, w23, uxtw #3] + __ strw(r14, Address(r30, r22, Address::uxtw(0))); // str w14, [x30, w22, uxtw #0] + __ strb(r12, Address(r3, r26, Address::uxtw(0))); // strb w12, [x3, w26, uxtw #0] + __ strh(r11, Address(r14, r30, Address::uxtw(0))); // strh w11, [x14, w30, uxtw #0] + __ ldr(r24, Address(r8, r24, Address::lsl(3))); // ldr x24, [x8, x24, lsl #3] + __ ldrw(r12, Address(r13, r20, Address::lsl(0))); // ldr w12, [x13, x20, lsl #0] + __ ldrb(r22, Address(r4, r5, Address::uxtw(0))); // ldrb w22, [x4, w5, uxtw #0] + __ ldrh(r28, Address(r5, r6, Address::uxtw(1))); // ldrh w28, [x5, w6, uxtw #1] + __ ldrsb(r2, Address(r11, r25, Address::lsl(0))); // ldrsb x2, [x11, x25, lsl #0] + __ ldrsh(r23, Address(r22, r25, Address::sxtx(0))); // ldrsh x23, [x22, x25, sxtx #0] + __ ldrshw(r3, Address(r20, r22, Address::sxtw(1))); // ldrsh w3, [x20, w22, sxtw #1] + __ ldrsw(r9, Address(r29, r14, Address::sxtx(2))); // ldrsw x9, [x29, x14, sxtx #2] + __ ldrd(v16, Address(r23, r29, Address::sxtx(3))); // ldr d16, [x23, x29, sxtx #3] + __ ldrs(v7, Address(r28, r20, Address::lsl(2))); // ldr s7, [x28, x20, lsl #2] + __ strd(v20, Address(r20, r24, Address::lsl(3))); // str d20, [x20, x24, lsl #3] + __ strs(v25, Address(r21, r23, Address::lsl(2))); // str s25, [x21, x23, lsl #2] + +// base_plus_scaled_offset // LoadStoreOp - __ str(r8, Address(r21, 12552)); // str x8, [x21, 12552] - __ strw(r10, Address(r27, 6380)); // str w10, [x27, 6380] - __ strb(r27, Address(r14, 1733)); // strb w27, [x14, 1733] - __ strh(r16, Address(r7, 3424)); // strh w16, [x7, 3424] - __ ldr(r27, Address(r9, 12520)); // ldr x27, [x9, 12520] - __ ldrw(r24, Address(r10, 6680)); // ldr w24, [x10, 6680] - __ ldrb(r24, Address(r24, 1743)); // ldrb w24, [x24, 1743] - __ ldrh(r20, Address(r5, 3072)); // ldrh w20, [x5, 3072] - __ ldrsb(r17, Address(r4, 1570)); // ldrsb x17, [x4, 1570] - __ ldrsh(r14, Address(r13, 3392)); // ldrsh x14, [x13, 3392] - __ ldrshw(r10, Address(r25, 3722)); // ldrsh w10, [x25, 3722] - __ ldrsw(r2, Address(r26, 6160)); // ldrsw x2, [x26, 6160] - __ ldrd(v26, Address(r14, 14912)); // ldr d26, [x14, 14912] - __ ldrs(v28, Address(r4, 7804)); // ldr s28, [x4, 7804] - __ strd(v14, Address(r19, 13984)); // str d14, [x19, 13984] - __ strs(v23, Address(r28, 6364)); // str s23, [x28, 6364] - -// pcrel + __ str(r17, Address(r2, 12312)); // str x17, [x2, 12312] + __ strw(r30, Address(r7, 6968)); // str w30, [x7, 6968] + __ strb(r3, Address(r7, 1833)); // strb w3, [x7, 1833] + __ strh(r7, Address(r1, 3366)); // strh w7, [x1, 3366] + __ ldr(r7, Address(r27, 
14664)); // ldr x7, [x27, 14664] + __ ldrw(r8, Address(r17, 6156)); // ldr w8, [x17, 6156] + __ ldrb(r0, Address(r17, 1594)); // ldrb w0, [x17, 1594] + __ ldrh(r0, Address(r20, 3562)); // ldrh w0, [x20, 3562] + __ ldrsb(r19, Address(r17, 1681)); // ldrsb x19, [x17, 1681] + __ ldrsh(r19, Address(r6, 3776)); // ldrsh x19, [x6, 3776] + __ ldrshw(r4, Address(r10, 3708)); // ldrsh w4, [x10, 3708] + __ ldrsw(r29, Address(r4, 6948)); // ldrsw x29, [x4, 6948] + __ ldrd(v29, Address(r1, 13352)); // ldr d29, [x1, 13352] + __ ldrs(v15, Address(r28, 6544)); // ldr s15, [x28, 6544] + __ strd(v7, Address(r26, 14112)); // str d7, [x26, 14112] + __ strs(v0, Address(r30, 6820)); // str s0, [x30, 6820] + +// pcrel // LoadStoreOp - __ ldr(r8, forth); // ldr x8, forth - __ ldrw(r17, back); // ldr w17, back + __ ldr(r25, __ pc()); // ldr x25, . + __ ldrw(r9, __ pc()); // ldr w9, . // LoadStoreOp - __ prfm(Address(r4, -175)); // prfm PLDL1KEEP, [x4, -175] + __ prfm(Address(r22, 105)); // prfm PLDL1KEEP, [x22, 105] // LoadStoreOp - __ prfm(__ pc()); // prfm PLDL1KEEP, . + __ prfm(back); // prfm PLDL1KEEP, back // LoadStoreOp - __ prfm(Address(r8, r4, Address::sxtw(0))); // prfm PLDL1KEEP, [x8, w4, sxtw #0] + __ prfm(Address(r28, r30, Address::lsl(3))); // prfm PLDL1KEEP, [x28, x30, lsl #3] // LoadStoreOp - __ prfm(Address(r12, 13248)); // prfm PLDL1KEEP, [x12, 13248] + __ prfm(Address(r19, 14592)); // prfm PLDL1KEEP, [x19, 14592] // AddSubCarryOp - __ adcw(r20, r27, r21); // adc w20, w27, w21 - __ adcsw(r7, r17, r6); // adcs w7, w17, w6 - __ sbcw(r5, r6, r25); // sbc w5, w6, w25 - __ sbcsw(r30, r11, r14); // sbcs w30, w11, w14 - __ adc(r3, r17, r11); // adc x3, x17, x11 - __ adcs(r25, r10, r17); // adcs x25, x10, x17 - __ sbc(r7, r16, r23); // sbc x7, x16, x23 - __ sbcs(r4, r10, r5); // sbcs x4, x10, x5 + __ adcw(r17, r8, r24); // adc w17, w8, w24 + __ adcsw(r14, r17, r9); // adcs w14, w17, w9 + __ sbcw(r22, r1, r25); // sbc w22, w1, w25 + __ sbcsw(r15, r9, r19); // sbcs w15, w9, w19 + __ adc(r15, r20, r11); // adc x15, x20, x11 + __ adcs(r4, r11, r30); // adcs x4, x11, x30 + __ sbc(r20, r8, r6); // sbc x20, x8, x6 + __ sbcs(r10, r21, r15); // sbcs x10, x21, x15 // AddSubExtendedOp - __ addw(r9, r30, r9, ext::uxtx, 4); // add w9, w30, w9, uxtx #4 - __ addsw(r0, r5, r16, ext::sxth, 2); // adds w0, w5, w16, sxth #2 - __ sub(r15, r29, r27, ext::sxtb, 2); // sub x15, x29, x27, sxtb #2 - __ subsw(r11, r9, r1, ext::sxtx, 4); // subs w11, w9, w1, sxtx #4 - __ add(r2, r24, r6, ext::uxtw, 3); // add x2, x24, x6, uxtw #3 - __ adds(r19, r6, r26, ext::uxtx, 4); // adds x19, x6, x26, uxtx #4 - __ sub(r8, r26, r25, ext::sxtx, 3); // sub x8, x26, x25, sxtx #3 - __ subs(r26, r20, r9, ext::uxth, 4); // subs x26, x20, x9, uxth #4 + __ addw(r1, r11, r9, ext::sxtb, 3); // add w1, w11, w9, sxtb #3 + __ addsw(r4, r17, r28, ext::sxtx, 3); // adds w4, w17, w28, sxtx #3 + __ sub(r21, r29, r28, ext::sxth, 1); // sub x21, x29, x28, sxth #1 + __ subsw(r11, r28, r23, ext::sxtw, 4); // subs w11, w28, w23, sxtw #4 + __ add(r12, r26, r5, ext::sxtx, 4); // add x12, x26, x5, sxtx #4 + __ adds(r17, r6, r30, ext::uxtx, 2); // adds x17, x6, x30, uxtx #2 + __ sub(r7, r20, r1, ext::uxtb, 2); // sub x7, x20, x1, uxtb #2 + __ subs(r30, r9, r30, ext::sxtx, 3); // subs x30, x9, x30, sxtx #3 // ConditionalCompareOp - __ ccmnw(r13, r26, 7u, Assembler::MI); // ccmn w13, w26, #7, MI - __ ccmpw(r8, r20, 15u, Assembler::LO); // ccmp w8, w20, #15, LO - __ ccmn(r22, r3, 8u, Assembler::EQ); // ccmn x22, x3, #8, EQ - __ ccmp(r2, r24, 10u, Assembler::GE); // ccmp 
x2, x24, #10, GE + __ ccmnw(r4, r11, 7u, Assembler::EQ); // ccmn w4, w11, #7, EQ + __ ccmpw(r15, r5, 11u, Assembler::VC); // ccmp w15, w5, #11, VC + __ ccmn(r23, r17, 6u, Assembler::CS); // ccmn x23, x17, #6, CS + __ ccmp(r11, r11, 10u, Assembler::GT); // ccmp x11, x11, #10, GT // ConditionalCompareImmedOp - __ ccmnw(r8, 16, 13, Assembler::MI); // ccmn w8, #16, #13, MI - __ ccmpw(r16, 12, 1, Assembler::EQ); // ccmp w16, #12, #1, EQ - __ ccmn(r15, 31, 3, Assembler::VC); // ccmn x15, #31, #3, VC - __ ccmp(r23, 12, 15, Assembler::EQ); // ccmp x23, #12, #15, EQ + __ ccmnw(r14, 5, 12, Assembler::NE); // ccmn w14, #5, #12, NE + __ ccmpw(r23, 28, 15, Assembler::NE); // ccmp w23, #28, #15, NE + __ ccmn(r17, 30, 7, Assembler::LO); // ccmn x17, #30, #7, LO + __ ccmp(r30, 12, 0, Assembler::HI); // ccmp x30, #12, #0, HI // ConditionalSelectOp - __ cselw(r14, r7, r26, Assembler::LO); // csel w14, w7, w26, LO - __ csincw(r3, r27, r30, Assembler::LE); // csinc w3, w27, w30, LE - __ csinvw(r11, r21, r23, Assembler::EQ); // csinv w11, w21, w23, EQ - __ csnegw(r26, r30, r21, Assembler::GT); // csneg w26, w30, w21, GT - __ csel(r28, r26, r13, Assembler::HI); // csel x28, x26, x13, HI - __ csinc(r17, r3, r16, Assembler::LS); // csinc x17, x3, x16, LS - __ csinv(r11, r5, r3, Assembler::HI); // csinv x11, x5, x3, HI - __ csneg(r1, r3, r19, Assembler::GT); // csneg x1, x3, x19, GT + __ cselw(r26, r27, r1, Assembler::PL); // csel w26, w27, w1, PL + __ csincw(r14, r11, r21, Assembler::LE); // csinc w14, w11, w21, LE + __ csinvw(r30, r6, r15, Assembler::VS); // csinv w30, w6, w15, VS + __ csnegw(r17, r2, r25, Assembler::PL); // csneg w17, w2, w25, PL + __ csel(r16, r5, r7, Assembler::HI); // csel x16, x5, x7, HI + __ csinc(r10, r20, r28, Assembler::GT); // csinc x10, x20, x28, GT + __ csinv(r6, r7, r1, Assembler::HI); // csinv x6, x7, x1, HI + __ csneg(r22, r26, r17, Assembler::CS); // csneg x22, x26, x17, CS // TwoRegOp - __ rbitw(r0, r9); // rbit w0, w9 - __ rev16w(r26, r14); // rev16 w26, w14 - __ revw(r13, r17); // rev w13, w17 - __ clzw(r11, r20); // clz w11, w20 - __ clsw(r28, r17); // cls w28, w17 - __ rbit(r13, r4); // rbit x13, x4 - __ rev16(r1, r30); // rev16 x1, x30 - __ rev32(r13, r14); // rev32 x13, x14 - __ rev(r5, r8); // rev x5, x8 - __ clz(r2, r25); // clz x2, x25 - __ cls(r20, r8); // cls x20, x8 + __ rbitw(r11, r6); // rbit w11, w6 + __ rev16w(r0, r27); // rev16 w0, w27 + __ revw(r1, r29); // rev w1, w29 + __ clzw(r20, r21); // clz w20, w21 + __ clsw(r12, r12); // cls w12, w12 + __ rbit(r24, r19); // rbit x24, x19 + __ rev16(r23, r15); // rev16 x23, x15 + __ rev32(r17, r1); // rev32 x17, x1 + __ rev(r27, r3); // rev x27, x3 + __ clz(r30, r5); // clz x30, x5 + __ cls(r15, r0); // cls x15, x0 // ThreeRegOp - __ udivw(r21, r25, r27); // udiv w21, w25, w27 - __ sdivw(r13, r10, r16); // sdiv w13, w10, w16 - __ lslvw(r28, r1, r17); // lslv w28, w1, w17 - __ lsrvw(r25, r23, r10); // lsrv w25, w23, w10 - __ asrvw(r7, r3, r7); // asrv w7, w3, w7 - __ rorvw(r14, r30, r14); // rorv w14, w30, w14 - __ udiv(r12, r22, r15); // udiv x12, x22, x15 - __ sdiv(r2, r25, r13); // sdiv x2, x25, x13 - __ lslv(r7, r23, r21); // lslv x7, x23, x21 - __ lsrv(r11, r12, r0); // lsrv x11, x12, x0 - __ asrv(r30, r9, r28); // asrv x30, x9, x28 - __ rorv(r13, r5, r22); // rorv x13, x5, x22 - __ umulh(r5, r21, r4); // umulh x5, x21, x4 - __ smulh(r17, r2, r7); // smulh x17, x2, x7 + __ udivw(r14, r0, r20); // udiv w14, w0, w20 + __ sdivw(r27, r12, r21); // sdiv w27, w12, w21 + __ lslvw(r12, r10, r26); // lslv w12, w10, w26 + __ 
lsrvw(r14, r19, r6); // lsrv w14, w19, w6 + __ asrvw(r27, r19, r30); // asrv w27, w19, w30 + __ rorvw(r6, r14, r16); // rorv w6, w14, w16 + __ udiv(r17, r13, r2); // udiv x17, x13, x2 + __ sdiv(r0, r29, r2); // sdiv x0, x29, x2 + __ lslv(r12, r16, r2); // lslv x12, x16, x2 + __ lsrv(r9, r23, r29); // lsrv x9, x23, x29 + __ asrv(r6, r17, r29); // asrv x6, x17, x29 + __ rorv(r14, r30, r26); // rorv x14, x30, x26 + __ umulh(r17, r24, r26); // umulh x17, x24, x26 + __ smulh(r20, r26, r14); // smulh x20, x26, x14 // FourRegMulOp - __ maddw(r12, r12, r17, r12); // madd w12, w12, w17, w12 - __ msubw(r30, r15, r1, r27); // msub w30, w15, w1, w27 - __ madd(r2, r19, r17, r29); // madd x2, x19, x17, x29 - __ msub(r4, r23, r3, r30); // msub x4, x23, x3, x30 - __ smaddl(r15, r23, r17, r15); // smaddl x15, w23, w17, x15 - __ smsubl(r27, r12, r1, r13); // smsubl x27, w12, w1, x13 - __ umaddl(r6, r13, r12, r17); // umaddl x6, w13, w12, x17 - __ umsubl(r25, r1, r6, r10); // umsubl x25, w1, w6, x10 + __ maddw(r27, r8, r2, r7); // madd w27, w8, w2, w7 + __ msubw(r28, r13, r25, r12); // msub w28, w13, w25, w12 + __ madd(r4, r9, r10, r27); // madd x4, x9, x10, x27 + __ msub(r22, r10, r8, r30); // msub x22, x10, x8, x30 + __ smaddl(r20, r20, r25, r5); // smaddl x20, w20, w25, x5 + __ smsubl(r22, r22, r11, r27); // smsubl x22, w22, w11, x27 + __ umaddl(r4, r6, r12, r19); // umaddl x4, w6, w12, x19 + __ umsubl(r17, r15, r8, r0); // umsubl x17, w15, w8, x0 // ThreeRegFloatOp - __ fmuls(v17, v3, v4); // fmul s17, s3, s4 - __ fdivs(v16, v5, v21); // fdiv s16, s5, s21 - __ fadds(v3, v27, v17); // fadd s3, s27, s17 - __ fsubs(v25, v10, v15); // fsub s25, s10, s15 - __ fmuls(v10, v17, v0); // fmul s10, s17, s0 - __ fmuld(v28, v26, v3); // fmul d28, d26, d3 - __ fdivd(v4, v0, v27); // fdiv d4, d0, d27 - __ faddd(v28, v14, v2); // fadd d28, d14, d2 - __ fsubd(v12, v26, v23); // fsub d12, d26, d23 - __ fmuld(v15, v30, v1); // fmul d15, d30, d1 + __ fmuls(v22, v5, v10); // fmul s22, s5, s10 + __ fdivs(v4, v8, v16); // fdiv s4, s8, s16 + __ fadds(v25, v8, v6); // fadd s25, s8, s6 + __ fsubs(v6, v27, v25); // fsub s6, s27, s25 + __ fmuls(v10, v23, v9); // fmul s10, s23, s9 + __ fmuld(v22, v17, v12); // fmul d22, d17, d12 + __ fdivd(v11, v0, v20); // fdiv d11, d0, d20 + __ faddd(v0, v12, v15); // fadd d0, d12, d15 + __ fsubd(v15, v22, v4); // fsub d15, d22, d4 + __ fmuld(v29, v1, v25); // fmul d29, d1, d25 // FourRegFloatOp - __ fmadds(v4, v5, v5, v13); // fmadd s4, s5, s5, s13 - __ fmsubs(v21, v13, v28, v1); // fmsub s21, s13, s28, s1 - __ fnmadds(v17, v3, v29, v7); // fnmadd s17, s3, s29, s7 - __ fnmadds(v23, v25, v29, v26); // fnmadd s23, s25, s29, s26 - __ fmaddd(v14, v7, v30, v26); // fmadd d14, d7, d30, d26 - __ fmsubd(v22, v7, v10, v9); // fmsub d22, d7, d10, d9 - __ fnmaddd(v7, v7, v14, v9); // fnmadd d7, d7, d14, d9 - __ fnmaddd(v14, v24, v15, v24); // fnmadd d14, d24, d15, d24 + __ fmadds(v9, v27, v19, v5); // fmadd s9, s27, s19, s5 + __ fmsubs(v26, v3, v6, v26); // fmsub s26, s3, s6, s26 + __ fnmadds(v4, v12, v8, v20); // fnmadd s4, s12, s8, s20 + __ fnmadds(v25, v9, v21, v17); // fnmadd s25, s9, s21, s17 + __ fmaddd(v7, v3, v30, v22); // fmadd d7, d3, d30, d22 + __ fmsubd(v1, v27, v10, v10); // fmsub d1, d27, d10, d10 + __ fnmaddd(v17, v8, v22, v1); // fnmadd d17, d8, d22, d1 + __ fnmaddd(v14, v28, v2, v27); // fnmadd d14, d28, d2, d27 // TwoRegFloatOp - __ fmovs(v22, v2); // fmov s22, s2 - __ fabss(v0, v3); // fabs s0, s3 - __ fnegs(v9, v17); // fneg s9, s17 - __ fsqrts(v24, v11); // fsqrt s24, s11 - __ 
fcvts(v15, v25); // fcvt d15, s25 - __ fmovd(v4, v3); // fmov d4, d3 - __ fabsd(v26, v22); // fabs d26, d22 - __ fnegd(v30, v19); // fneg d30, d19 - __ fsqrtd(v12, v14); // fsqrt d12, d14 - __ fcvtd(v17, v7); // fcvt s17, d7 + __ fmovs(v27, v30); // fmov s27, s30 + __ fabss(v5, v1); // fabs s5, s1 + __ fnegs(v23, v19); // fneg s23, s19 + __ fsqrts(v28, v17); // fsqrt s28, s17 + __ fcvts(v25, v6); // fcvt d25, s6 + __ fmovd(v20, v14); // fmov d20, d14 + __ fabsd(v17, v10); // fabs d17, d10 + __ fnegd(v10, v17); // fneg d10, d17 + __ fsqrtd(v21, v17); // fsqrt d21, d17 + __ fcvtd(v21, v15); // fcvt s21, d15 // FloatConvertOp - __ fcvtzsw(r24, v14); // fcvtzs w24, s14 - __ fcvtzs(r13, v26); // fcvtzs x13, s26 - __ fcvtzdw(r2, v1); // fcvtzs w2, d1 - __ fcvtzd(r5, v11); // fcvtzs x5, d11 - __ scvtfws(v14, r19); // scvtf s14, w19 - __ scvtfs(v1, r22); // scvtf s1, x22 - __ scvtfwd(v27, r17); // scvtf d27, w17 - __ scvtfd(v22, r9); // scvtf d22, x9 - __ fmovs(r14, v3); // fmov w14, s3 - __ fmovd(r12, v17); // fmov x12, d17 - __ fmovs(v8, r27); // fmov s8, w27 - __ fmovd(v29, r28); // fmov d29, x28 + __ fcvtzsw(r7, v11); // fcvtzs w7, s11 + __ fcvtzs(r2, v29); // fcvtzs x2, s29 + __ fcvtzdw(r3, v25); // fcvtzs w3, d25 + __ fcvtzd(r28, v8); // fcvtzs x28, d8 + __ scvtfws(v11, r3); // scvtf s11, w3 + __ scvtfs(v2, r21); // scvtf s2, x21 + __ scvtfwd(v29, r25); // scvtf d29, w25 + __ scvtfd(v19, r3); // scvtf d19, x3 + __ fmovs(r20, v29); // fmov w20, s29 + __ fmovd(r23, v17); // fmov x23, d17 + __ fmovs(v0, r28); // fmov s0, w28 + __ fmovd(v9, r20); // fmov d9, x20 // TwoRegFloatOp - __ fcmps(v0, v30); // fcmp s0, s30 - __ fcmpd(v12, v9); // fcmp d12, d9 - __ fcmps(v10, 0.0); // fcmp s10, #0.0 - __ fcmpd(v25, 0.0); // fcmp d25, #0.0 + __ fcmps(v7, v12); // fcmp s7, s12 + __ fcmpd(v13, v14); // fcmp d13, d14 + __ fcmps(v12, 0.0); // fcmp s12, #0.0 + __ fcmpd(v1, 0.0); // fcmp d1, #0.0 // LoadStorePairOp - __ stpw(r8, r30, Address(r27, -144)); // stp w8, w30, [x27, #-144] - __ ldpw(r21, r19, Address(r24, 80)); // ldp w21, w19, [x24, #80] - __ ldpsw(r16, r27, Address(r2, -240)); // ldpsw x16, x27, [x2, #-240] - __ stp(r21, r5, Address(r6, -128)); // stp x21, x5, [x6, #-128] - __ ldp(r29, r25, Address(r28, -32)); // ldp x29, x25, [x28, #-32] + __ stpw(r12, r2, Address(r22, -64)); // stp w12, w2, [x22, #-64] + __ ldpw(r27, r9, Address(r24, -208)); // ldp w27, w9, [x24, #-208] + __ ldpsw(r15, r4, Address(r24, -176)); // ldpsw x15, x4, [x24, #-176] + __ stp(r5, r21, Address(r0, 16)); // stp x5, x21, [x0, #16] + __ ldp(r6, r23, Address(r9, -208)); // ldp x6, x23, [x9, #-208] // LoadStorePairOp - __ stpw(r8, r13, Address(__ pre(r0, 128))); // stp w8, w13, [x0, #128]! - __ ldpw(r25, r20, Address(__ pre(r1, -160))); // ldp w25, w20, [x1, #-160]! - __ ldpsw(r14, r24, Address(__ pre(r22, -32))); // ldpsw x14, x24, [x22, #-32]! - __ stp(r17, r1, Address(__ pre(r6, 80))); // stp x17, x1, [x6, #80]! - __ ldp(r21, r17, Address(__ pre(r25, -64))); // ldp x21, x17, [x25, #-64]! + __ stpw(r0, r3, Address(__ pre(r29, 16))); // stp w0, w3, [x29, #16]! + __ ldpw(r29, r16, Address(__ pre(r1, -144))); // ldp w29, w16, [x1, #-144]! + __ ldpsw(r27, r19, Address(__ pre(r21, 16))); // ldpsw x27, x19, [x21, #16]! + __ stp(r6, r17, Address(__ pre(r13, -176))); // stp x6, x17, [x13, #-176]! + __ ldp(r0, r24, Address(__ pre(r1, 16))); // ldp x0, x24, [x1, #16]! 
// LoadStorePairOp - __ stpw(r17, r21, Address(__ post(r20, -128))); // stp w17, w21, [x20], #-128 - __ ldpw(r28, r28, Address(__ post(r2, 64))); // ldp w28, w28, [x2], #64 - __ ldpsw(r19, r30, Address(__ post(r10, -256))); // ldpsw x19, x30, [x10], #-256 - __ stp(r17, r15, Address(__ post(r17, -16))); // stp x17, x15, [x17], #-16 - __ ldp(r17, r0, Address(__ post(r25, -32))); // ldp x17, x0, [x25], #-32 + __ stpw(r0, r20, Address(__ post(r22, 0))); // stp w0, w20, [x22], #0 + __ ldpw(r17, r12, Address(__ post(r14, -48))); // ldp w17, w12, [x14], #-48 + __ ldpsw(r10, r26, Address(__ post(r1, 112))); // ldpsw x10, x26, [x1], #112 + __ stp(r20, r24, Address(__ post(r13, -96))); // stp x20, x24, [x13], #-96 + __ ldp(r1, r12, Address(__ post(r7, 48))); // ldp x1, x12, [x7], #48 // LoadStorePairOp - __ stnpw(r14, r5, Address(r24, -32)); // stnp w14, w5, [x24, #-32] - __ ldnpw(r23, r19, Address(r1, 112)); // ldnp w23, w19, [x1, #112] - __ stnp(r11, r6, Address(r14, 64)); // stnp x11, x6, [x14, #64] - __ ldnp(r2, r11, Address(r27, -224)); // ldnp x2, x11, [x27, #-224] + __ stnpw(r5, r10, Address(r23, -80)); // stnp w5, w10, [x23, #-80] + __ ldnpw(r8, r10, Address(r24, -48)); // ldnp w8, w10, [x24, #-48] + __ stnp(r11, r15, Address(r11, 64)); // stnp x11, x15, [x11, #64] + __ ldnp(r9, r28, Address(r5, 64)); // ldnp x9, x28, [x5, #64] // LdStSIMDOp - __ ld1(v16, __ T8B, Address(r17)); // ld1 {v16.8B}, [x17] - __ ld1(v29, v30, __ T16B, Address(__ post(r9, 32))); // ld1 {v29.16B, v30.16B}, [x9], 32 - __ ld1(v30, v31, v0, __ T1D, Address(__ post(r24, r21))); // ld1 {v30.1D, v31.1D, v0.1D}, [x24], x21 - __ ld1(v0, v1, v2, v3, __ T8H, Address(__ post(r2, 64))); // ld1 {v0.8H, v1.8H, v2.8H, v3.8H}, [x2], 64 - __ ld1r(v20, __ T8B, Address(r9)); // ld1r {v20.8B}, [x9] - __ ld1r(v17, __ T4S, Address(__ post(r0, 4))); // ld1r {v17.4S}, [x0], 4 - __ ld1r(v21, __ T1D, Address(__ post(r22, r26))); // ld1r {v21.1D}, [x22], x26 - __ ld2(v19, v20, __ T2D, Address(r25)); // ld2 {v19.2D, v20.2D}, [x25] - __ ld2(v10, v11, __ T4H, Address(__ post(r5, 16))); // ld2 {v10.4H, v11.4H}, [x5], 16 - __ ld2r(v10, v11, __ T16B, Address(r24)); // ld2r {v10.16B, v11.16B}, [x24] - __ ld2r(v13, v14, __ T2S, Address(__ post(r29, 8))); // ld2r {v13.2S, v14.2S}, [x29], 8 - __ ld2r(v22, v23, __ T2D, Address(__ post(r28, r2))); // ld2r {v22.2D, v23.2D}, [x28], x2 - __ ld3(v30, v31, v0, __ T4S, Address(__ post(r4, r11))); // ld3 {v30.4S, v31.4S, v0.4S}, [x4], x11 - __ ld3(v29, v30, v31, __ T2S, Address(r0)); // ld3 {v29.2S, v30.2S, v31.2S}, [x0] - __ ld3r(v23, v24, v25, __ T8H, Address(r27)); // ld3r {v23.8H, v24.8H, v25.8H}, [x27] - __ ld3r(v3, v4, v5, __ T4S, Address(__ post(r10, 12))); // ld3r {v3.4S, v4.4S, v5.4S}, [x10], 12 - __ ld3r(v19, v20, v21, __ T1D, Address(__ post(r14, r22))); // ld3r {v19.1D, v20.1D, v21.1D}, [x14], x22 - __ ld4(v14, v15, v16, v17, __ T8H, Address(__ post(r0, 64))); // ld4 {v14.8H, v15.8H, v16.8H, v17.8H}, [x0], 64 - __ ld4(v30, v31, v0, v1, __ T8B, Address(__ post(r22, r25))); // ld4 {v30.8B, v31.8B, v0.8B, v1.8B}, [x22], x25 - __ ld4r(v25, v26, v27, v28, __ T8B, Address(r0)); // ld4r {v25.8B, v26.8B, v27.8B, v28.8B}, [x0] - __ ld4r(v10, v11, v12, v13, __ T4H, Address(__ post(r8, 8))); // ld4r {v10.4H, v11.4H, v12.4H, v13.4H}, [x8], 8 - __ ld4r(v1, v2, v3, v4, __ T2S, Address(__ post(r6, r28))); // ld4r {v1.2S, v2.2S, v3.2S, v4.2S}, [x6], x28 + __ ld1(v11, __ T8B, Address(r20)); // ld1 {v11.8B}, [x20] + __ ld1(v19, v20, __ T16B, Address(__ post(r8, 32))); // ld1 {v19.16B, v20.16B}, [x8], 32 + __ ld1(v3, 
v4, v5, __ T1D, Address(__ post(r2, r3))); // ld1 {v3.1D, v4.1D, v5.1D}, [x2], x3 + __ ld1(v21, v22, v23, v24, __ T8H, Address(__ post(r3, 64))); // ld1 {v21.8H, v22.8H, v23.8H, v24.8H}, [x3], 64 + __ ld1r(v14, __ T8B, Address(r5)); // ld1r {v14.8B}, [x5] + __ ld1r(v13, __ T4S, Address(__ post(r27, 4))); // ld1r {v13.4S}, [x27], 4 + __ ld1r(v17, __ T1D, Address(__ post(r19, r0))); // ld1r {v17.1D}, [x19], x0 + __ ld2(v27, v28, __ T2D, Address(r5)); // ld2 {v27.2D, v28.2D}, [x5] + __ ld2(v26, v27, __ T4H, Address(__ post(r4, 16))); // ld2 {v26.4H, v27.4H}, [x4], 16 + __ ld2r(v8, v9, __ T16B, Address(r23)); // ld2r {v8.16B, v9.16B}, [x23] + __ ld2r(v14, v15, __ T2S, Address(__ post(r10, 8))); // ld2r {v14.2S, v15.2S}, [x10], 8 + __ ld2r(v10, v11, __ T2D, Address(__ post(r21, r19))); // ld2r {v10.2D, v11.2D}, [x21], x19 + __ ld3(v17, v18, v19, __ T4S, Address(__ post(r14, r30))); // ld3 {v17.4S, v18.4S, v19.4S}, [x14], x30 + __ ld3(v20, v21, v22, __ T2S, Address(r20)); // ld3 {v20.2S, v21.2S, v22.2S}, [x20] + __ ld3r(v24, v25, v26, __ T8H, Address(r21)); // ld3r {v24.8H, v25.8H, v26.8H}, [x21] + __ ld3r(v26, v27, v28, __ T4S, Address(__ post(r1, 12))); // ld3r {v26.4S, v27.4S, v28.4S}, [x1], 12 + __ ld3r(v12, v13, v14, __ T1D, Address(__ post(r2, r0))); // ld3r {v12.1D, v13.1D, v14.1D}, [x2], x0 + __ ld4(v21, v22, v23, v24, __ T8H, Address(__ post(r6, 64))); // ld4 {v21.8H, v22.8H, v23.8H, v24.8H}, [x6], 64 + __ ld4(v17, v18, v19, v20, __ T8B, Address(__ post(r28, r22))); // ld4 {v17.8B, v18.8B, v19.8B, v20.8B}, [x28], x22 + __ ld4r(v19, v20, v21, v22, __ T8B, Address(r25)); // ld4r {v19.8B, v20.8B, v21.8B, v22.8B}, [x25] + __ ld4r(v6, v7, v8, v9, __ T4H, Address(__ post(r23, 8))); // ld4r {v6.4H, v7.4H, v8.4H, v9.4H}, [x23], 8 + __ ld4r(v8, v9, v10, v11, __ T2S, Address(__ post(r9, r26))); // ld4r {v8.2S, v9.2S, v10.2S, v11.2S}, [x9], x26 // SpecialCases - __ ccmn(zr, zr, 3u, Assembler::LE); // ccmn xzr, xzr, #3, LE - __ ccmnw(zr, zr, 5u, Assembler::EQ); // ccmn wzr, wzr, #5, EQ - __ ccmp(zr, 1, 4u, Assembler::NE); // ccmp xzr, 1, #4, NE - __ ccmpw(zr, 2, 2, Assembler::GT); // ccmp wzr, 2, #2, GT - __ extr(zr, zr, zr, 0); // extr xzr, xzr, xzr, 0 - __ stlxp(r0, zr, zr, sp); // stlxp w0, xzr, xzr, [sp] - __ stlxpw(r2, zr, zr, r3); // stlxp w2, wzr, wzr, [x3] - __ stxp(r4, zr, zr, r5); // stxp w4, xzr, xzr, [x5] - __ stxpw(r6, zr, zr, sp); // stxp w6, wzr, wzr, [sp] - __ dup(v0, __ T16B, zr); // dup v0.16b, wzr - __ mov(v1, __ T1D, 0, zr); // mov v1.d[0], xzr - __ mov(v1, __ T2S, 1, zr); // mov v1.s[1], wzr - __ mov(v1, __ T4H, 2, zr); // mov v1.h[2], wzr - __ mov(v1, __ T8B, 3, zr); // mov v1.b[3], wzr - __ ld1(v31, v0, __ T2D, Address(__ post(r1, r0))); // ld1 {v31.2d, v0.2d}, [x1], x0 + __ ccmn(zr, zr, 3u, Assembler::LE); // ccmn xzr, xzr, #3, LE + __ ccmnw(zr, zr, 5u, Assembler::EQ); // ccmn wzr, wzr, #5, EQ + __ ccmp(zr, 1, 4u, Assembler::NE); // ccmp xzr, 1, #4, NE + __ ccmpw(zr, 2, 2, Assembler::GT); // ccmp wzr, 2, #2, GT + __ extr(zr, zr, zr, 0); // extr xzr, xzr, xzr, 0 + __ stlxp(r0, zr, zr, sp); // stlxp w0, xzr, xzr, [sp] + __ stlxpw(r2, zr, zr, r3); // stlxp w2, wzr, wzr, [x3] + __ stxp(r4, zr, zr, r5); // stxp w4, xzr, xzr, [x5] + __ stxpw(r6, zr, zr, sp); // stxp w6, wzr, wzr, [sp] + __ dup(v0, __ T16B, zr); // dup v0.16b, wzr + __ mov(v1, __ T1D, 0, zr); // mov v1.d[0], xzr + __ mov(v1, __ T2S, 1, zr); // mov v1.s[1], wzr + __ mov(v1, __ T4H, 2, zr); // mov v1.h[2], wzr + __ mov(v1, __ T8B, 3, zr); // mov v1.b[3], wzr + __ ld1(v31, v0, __ T2D, Address(__ post(r1, r0))); // ld1 
{v31.2d, v0.2d}, [x1], x0 + __ sve_cpy(z0, __ S, p0, v1); // mov z0.s, p0/m, s1 + __ sve_inc(r0, __ S); // incw x0 + __ sve_dec(r1, __ H); // dech x1 + __ sve_lsl(z0, __ B, z1, 7); // lsl z0.b, z1.b, #7 + __ sve_lsl(z21, __ H, z1, 15); // lsl z21.h, z1.h, #15 + __ sve_lsl(z0, __ S, z1, 31); // lsl z0.s, z1.s, #31 + __ sve_lsl(z0, __ D, z1, 63); // lsl z0.d, z1.d, #63 + __ sve_lsr(z0, __ B, z1, 7); // lsr z0.b, z1.b, #7 + __ sve_asr(z0, __ H, z11, 15); // asr z0.h, z11.h, #15 + __ sve_lsr(z30, __ S, z1, 31); // lsr z30.s, z1.s, #31 + __ sve_asr(z0, __ D, z1, 63); // asr z0.d, z1.d, #63 + __ sve_addvl(sp, r0, 31); // addvl sp, x0, #31 + __ sve_addpl(r1, sp, -32); // addpl x1, sp, -32 + __ sve_cntp(r8, __ B, p0, p1); // cntp x8, p0, p1.b + __ sve_dup(z0, __ B, 127); // dup z0.b, 127 + __ sve_dup(z1, __ H, -128); // dup z1.h, -128 + __ sve_dup(z2, __ S, 32512); // dup z2.s, 32512 + __ sve_dup(z7, __ D, -32768); // dup z7.d, -32768 + __ sve_ld1b(z0, __ B, p0, Address(sp)); // ld1b {z0.b}, p0/z, [sp] + __ sve_ld1h(z10, __ H, p1, Address(sp, -8)); // ld1h {z10.h}, p1/z, [sp, #-8, MUL VL] + __ sve_ld1w(z20, __ S, p2, Address(r0, 7)); // ld1w {z20.s}, p2/z, [x0, #7, MUL VL] + __ sve_ld1b(z30, __ B, p3, Address(sp, r8)); // ld1b {z30.b}, p3/z, [sp, x8] + __ sve_ld1w(z0, __ S, p4, Address(sp, r28)); // ld1w {z0.s}, p4/z, [sp, x28, LSL #2] + __ sve_ld1d(z11, __ D, p5, Address(r0, r1)); // ld1d {z11.d}, p5/z, [x0, x1, LSL #3] + __ sve_st1b(z22, __ B, p6, Address(sp)); // st1b {z22.b}, p6, [sp] + __ sve_st1b(z31, __ B, p7, Address(sp, -8)); // st1b {z31.b}, p7, [sp, #-8, MUL VL] + __ sve_st1w(z0, __ S, p1, Address(r0, 7)); // st1w {z0.s}, p1, [x0, #7, MUL VL] + __ sve_st1b(z0, __ B, p2, Address(sp, r1)); // st1b {z0.b}, p2, [sp, x1] + __ sve_st1h(z0, __ H, p3, Address(sp, r8)); // st1h {z0.h}, p3, [sp, x8, LSL #1] + __ sve_st1d(z0, __ D, p4, Address(r0, r8)); // st1d {z0.d}, p4, [x0, x8, LSL #3] + __ sve_ldr(z0, Address(sp)); // ldr z0, [sp] + __ sve_ldr(z31, Address(sp, -256)); // ldr z31, [sp, #-256, MUL VL] + __ sve_str(z8, Address(r8, 255)); // str z8, [x8, #255, MUL VL] // FloatImmediateOp - __ fmovd(v0, 2.0); // fmov d0, #2.0 - __ fmovd(v0, 2.125); // fmov d0, #2.125 - __ fmovd(v0, 4.0); // fmov d0, #4.0 - __ fmovd(v0, 4.25); // fmov d0, #4.25 - __ fmovd(v0, 8.0); // fmov d0, #8.0 - __ fmovd(v0, 8.5); // fmov d0, #8.5 - __ fmovd(v0, 16.0); // fmov d0, #16.0 - __ fmovd(v0, 17.0); // fmov d0, #17.0 - __ fmovd(v0, 0.125); // fmov d0, #0.125 - __ fmovd(v0, 0.1328125); // fmov d0, #0.1328125 - __ fmovd(v0, 0.25); // fmov d0, #0.25 - __ fmovd(v0, 0.265625); // fmov d0, #0.265625 - __ fmovd(v0, 0.5); // fmov d0, #0.5 - __ fmovd(v0, 0.53125); // fmov d0, #0.53125 - __ fmovd(v0, 1.0); // fmov d0, #1.0 - __ fmovd(v0, 1.0625); // fmov d0, #1.0625 - __ fmovd(v0, -2.0); // fmov d0, #-2.0 - __ fmovd(v0, -2.125); // fmov d0, #-2.125 - __ fmovd(v0, -4.0); // fmov d0, #-4.0 - __ fmovd(v0, -4.25); // fmov d0, #-4.25 - __ fmovd(v0, -8.0); // fmov d0, #-8.0 - __ fmovd(v0, -8.5); // fmov d0, #-8.5 - __ fmovd(v0, -16.0); // fmov d0, #-16.0 - __ fmovd(v0, -17.0); // fmov d0, #-17.0 - __ fmovd(v0, -0.125); // fmov d0, #-0.125 - __ fmovd(v0, -0.1328125); // fmov d0, #-0.1328125 - __ fmovd(v0, -0.25); // fmov d0, #-0.25 - __ fmovd(v0, -0.265625); // fmov d0, #-0.265625 - __ fmovd(v0, -0.5); // fmov d0, #-0.5 - __ fmovd(v0, -0.53125); // fmov d0, #-0.53125 - __ fmovd(v0, -1.0); // fmov d0, #-1.0 - __ fmovd(v0, -1.0625); // fmov d0, #-1.0625 + __ fmovd(v0, 2.0); // fmov d0, #2.0 + __ fmovd(v0, 2.125); // fmov d0, #2.125 + __ 
fmovd(v0, 4.0); // fmov d0, #4.0 + __ fmovd(v0, 4.25); // fmov d0, #4.25 + __ fmovd(v0, 8.0); // fmov d0, #8.0 + __ fmovd(v0, 8.5); // fmov d0, #8.5 + __ fmovd(v0, 16.0); // fmov d0, #16.0 + __ fmovd(v0, 17.0); // fmov d0, #17.0 + __ fmovd(v0, 0.125); // fmov d0, #0.125 + __ fmovd(v0, 0.1328125); // fmov d0, #0.1328125 + __ fmovd(v0, 0.25); // fmov d0, #0.25 + __ fmovd(v0, 0.265625); // fmov d0, #0.265625 + __ fmovd(v0, 0.5); // fmov d0, #0.5 + __ fmovd(v0, 0.53125); // fmov d0, #0.53125 + __ fmovd(v0, 1.0); // fmov d0, #1.0 + __ fmovd(v0, 1.0625); // fmov d0, #1.0625 + __ fmovd(v0, -2.0); // fmov d0, #-2.0 + __ fmovd(v0, -2.125); // fmov d0, #-2.125 + __ fmovd(v0, -4.0); // fmov d0, #-4.0 + __ fmovd(v0, -4.25); // fmov d0, #-4.25 + __ fmovd(v0, -8.0); // fmov d0, #-8.0 + __ fmovd(v0, -8.5); // fmov d0, #-8.5 + __ fmovd(v0, -16.0); // fmov d0, #-16.0 + __ fmovd(v0, -17.0); // fmov d0, #-17.0 + __ fmovd(v0, -0.125); // fmov d0, #-0.125 + __ fmovd(v0, -0.1328125); // fmov d0, #-0.1328125 + __ fmovd(v0, -0.25); // fmov d0, #-0.25 + __ fmovd(v0, -0.265625); // fmov d0, #-0.265625 + __ fmovd(v0, -0.5); // fmov d0, #-0.5 + __ fmovd(v0, -0.53125); // fmov d0, #-0.53125 + __ fmovd(v0, -1.0); // fmov d0, #-1.0 + __ fmovd(v0, -1.0625); // fmov d0, #-1.0625 // LSEOp - __ swp(Assembler::xword, r16, r20, r15); // swp x16, x20, [x15] - __ ldadd(Assembler::xword, r2, r7, r28); // ldadd x2, x7, [x28] - __ ldbic(Assembler::xword, r20, r10, r25); // ldclr x20, x10, [x25] - __ ldeor(Assembler::xword, r22, r11, r2); // ldeor x22, x11, [x2] - __ ldorr(Assembler::xword, r1, r10, r19); // ldset x1, x10, [x19] - __ ldsmin(Assembler::xword, r14, r21, r3); // ldsmin x14, x21, [x3] - __ ldsmax(Assembler::xword, r28, r27, r13); // ldsmax x28, x27, [x13] - __ ldumin(Assembler::xword, r17, r30, r21); // ldumin x17, x30, [x21] - __ ldumax(Assembler::xword, r27, r16, r29); // ldumax x27, x16, [x29] + __ swp(Assembler::xword, r11, r15, r21); // swp x11, x15, [x21] + __ ldadd(Assembler::xword, r23, r8, r5); // ldadd x23, x8, [x5] + __ ldbic(Assembler::xword, r7, r6, r8); // ldclr x7, x6, [x8] + __ ldeor(Assembler::xword, r14, r14, r23); // ldeor x14, x14, [x23] + __ ldorr(Assembler::xword, r10, r25, r0); // ldset x10, x25, [x0] + __ ldsmin(Assembler::xword, r5, r9, r21); // ldsmin x5, x9, [x21] + __ ldsmax(Assembler::xword, r4, r27, r17); // ldsmax x4, x27, [x17] + __ ldumin(Assembler::xword, r10, r6, r13); // ldumin x10, x6, [x13] + __ ldumax(Assembler::xword, r3, r3, r16); // ldumax x3, x3, [x16] // LSEOp - __ swpa(Assembler::xword, r30, r9, r0); // swpa x30, x9, [x0] - __ ldadda(Assembler::xword, r28, r27, r28); // ldadda x28, x27, [x28] - __ ldbica(Assembler::xword, r21, r25, r10); // ldclra x21, x25, [x10] - __ ldeora(Assembler::xword, zr, r20, r15); // ldeora xzr, x20, [x15] - __ ldorra(Assembler::xword, r1, r25, r14); // ldseta x1, x25, [x14] - __ ldsmina(Assembler::xword, r21, r26, r29); // ldsmina x21, x26, [x29] - __ ldsmaxa(Assembler::xword, r8, r29, r25); // ldsmaxa x8, x29, [x25] - __ ldumina(Assembler::xword, r13, r2, r25); // ldumina x13, x2, [x25] - __ ldumaxa(Assembler::xword, r15, r23, r0); // ldumaxa x15, x23, [x0] + __ swpa(Assembler::xword, r9, r28, r2); // swpa x9, x28, [x2] + __ ldadda(Assembler::xword, r23, r2, r1); // ldadda x23, x2, [x1] + __ ldbica(Assembler::xword, r4, r26, r7); // ldclra x4, x26, [x7] + __ ldeora(Assembler::xword, r0, r3, r10); // ldeora x0, x3, [x10] + __ ldorra(Assembler::xword, r24, r25, r3); // ldseta x24, x25, [x3] + __ ldsmina(Assembler::xword, r11, r8, r1); // ldsmina 
x11, x8, [x1] + __ ldsmaxa(Assembler::xword, r16, r13, r29); // ldsmaxa x16, x13, [x29] + __ ldumina(Assembler::xword, r6, r0, r5); // ldumina x6, x0, [x5] + __ ldumaxa(Assembler::xword, r16, r17, r13); // ldumaxa x16, x17, [x13] // LSEOp - __ swpal(Assembler::xword, r3, r1, r2); // swpal x3, x1, [x2] - __ ldaddal(Assembler::xword, r28, r3, r20); // ldaddal x28, x3, [x20] - __ ldbical(Assembler::xword, r14, zr, r14); // ldclral x14, xzr, [x14] - __ ldeoral(Assembler::xword, r7, r28, r2); // ldeoral x7, x28, [x2] - __ ldorral(Assembler::xword, r0, r11, r5); // ldsetal x0, x11, [x5] - __ ldsminal(Assembler::xword, r11, r14, r20); // ldsminal x11, x14, [x20] - __ ldsmaxal(Assembler::xword, zr, r4, r2); // ldsmaxal xzr, x4, [x2] - __ lduminal(Assembler::xword, r26, r0, r22); // lduminal x26, x0, [x22] - __ ldumaxal(Assembler::xword, r17, r1, r13); // ldumaxal x17, x1, [x13] + __ swpal(Assembler::xword, r11, r27, r14); // swpal x11, x27, [x14] + __ ldaddal(Assembler::xword, r2, r13, r21); // ldaddal x2, x13, [x21] + __ ldbical(Assembler::xword, r22, zr, r12); // ldclral x22, xzr, [x12] + __ ldeoral(Assembler::xword, r7, r30, r15); // ldeoral x7, x30, [x15] + __ ldorral(Assembler::xword, r7, r16, r15); // ldsetal x7, x16, [x15] + __ ldsminal(Assembler::xword, r16, r26, r13); // ldsminal x16, x26, [x13] + __ ldsmaxal(Assembler::xword, r23, r25, r27); // ldsmaxal x23, x25, [x27] + __ lduminal(Assembler::xword, r4, r14, sp); // lduminal x4, x14, [sp] + __ ldumaxal(Assembler::xword, r24, r1, r17); // ldumaxal x24, x1, [x17] // LSEOp - __ swpl(Assembler::xword, r23, r26, r20); // swpl x23, x26, [x20] - __ ldaddl(Assembler::xword, r14, r11, r12); // ldaddl x14, x11, [x12] - __ ldbicl(Assembler::xword, r12, zr, r15); // ldclrl x12, xzr, [x15] - __ ldeorl(Assembler::xword, r27, r14, r8); // ldeorl x27, x14, [x8] - __ ldorrl(Assembler::xword, r10, r30, r25); // ldsetl x10, x30, [x25] - __ ldsminl(Assembler::xword, r22, r7, r16); // ldsminl x22, x7, [x16] - __ ldsmaxl(Assembler::xword, r1, r16, r8); // ldsmaxl x1, x16, [x8] - __ lduminl(Assembler::xword, r1, r1, r26); // lduminl x1, x1, [x26] - __ ldumaxl(Assembler::xword, r0, r23, r15); // ldumaxl x0, x23, [x15] + __ swpl(Assembler::xword, r2, r8, r24); // swpl x2, x8, [x24] + __ ldaddl(Assembler::xword, r20, r27, r19); // ldaddl x20, x27, [x19] + __ ldbicl(Assembler::xword, r19, r17, r6); // ldclrl x19, x17, [x6] + __ ldeorl(Assembler::xword, r14, r28, r26); // ldeorl x14, x28, [x26] + __ ldorrl(Assembler::xword, r2, r16, r19); // ldsetl x2, x16, [x19] + __ ldsminl(Assembler::xword, r14, r16, r4); // ldsminl x14, x16, [x4] + __ ldsmaxl(Assembler::xword, r25, r8, r9); // ldsmaxl x25, x8, [x9] + __ lduminl(Assembler::xword, r10, r5, r29); // lduminl x10, x5, [x29] + __ ldumaxl(Assembler::xword, r6, r2, r14); // ldumaxl x6, x2, [x14] // LSEOp - __ swp(Assembler::word, r11, r16, r8); // swp w11, w16, [x8] - __ ldadd(Assembler::word, r1, r7, r14); // ldadd w1, w7, [x14] - __ ldbic(Assembler::word, r16, zr, r9); // ldclr w16, wzr, [x9] - __ ldeor(Assembler::word, r22, r6, r13); // ldeor w22, w6, [x13] - __ ldorr(Assembler::word, r11, r13, r4); // ldset w11, w13, [x4] - __ ldsmin(Assembler::word, r16, r22, r0); // ldsmin w16, w22, [x0] - __ ldsmax(Assembler::word, r28, zr, r10); // ldsmax w28, wzr, [x10] - __ ldumin(Assembler::word, r16, r5, r8); // ldumin w16, w5, [x8] - __ ldumax(Assembler::word, r26, r20, r15); // ldumax w26, w20, [x15] + __ swp(Assembler::word, r17, r11, r4); // swp w17, w11, [x4] + __ ldadd(Assembler::word, r7, r16, r15); // ldadd w7, 
w16, [x15] + __ ldbic(Assembler::word, r11, r25, r9); // ldclr w11, w25, [x9] + __ ldeor(Assembler::word, r3, r14, r0); // ldeor w3, w14, [x0] + __ ldorr(Assembler::word, r0, r30, r0); // ldset w0, w30, [x0] + __ ldsmin(Assembler::word, r6, r10, r28); // ldsmin w6, w10, [x28] + __ ldsmax(Assembler::word, r7, r14, r6); // ldsmax w7, w14, [x6] + __ ldumin(Assembler::word, r6, r30, r0); // ldumin w6, w30, [x0] + __ ldumax(Assembler::word, r22, r30, r29); // ldumax w22, w30, [x29] // LSEOp - __ swpa(Assembler::word, r27, r6, r16); // swpa w27, w6, [x16] - __ ldadda(Assembler::word, zr, zr, r2); // ldadda wzr, wzr, [x2] - __ ldbica(Assembler::word, r24, r28, r8); // ldclra w24, w28, [x8] - __ ldeora(Assembler::word, r15, r9, r23); // ldeora w15, w9, [x23] - __ ldorra(Assembler::word, r26, r2, r7); // ldseta w26, w2, [x7] - __ ldsmina(Assembler::word, r3, r17, r15); // ldsmina w3, w17, [x15] - __ ldsmaxa(Assembler::word, r19, r5, r21); // ldsmaxa w19, w5, [x21] - __ ldumina(Assembler::word, r7, r26, r12); // ldumina w7, w26, [x12] - __ ldumaxa(Assembler::word, r12, r7, r29); // ldumaxa w12, w7, [x29] + __ swpa(Assembler::word, r16, r14, r19); // swpa w16, w14, [x19] + __ ldadda(Assembler::word, r21, r3, r25); // ldadda w21, w3, [x25] + __ ldbica(Assembler::word, r2, r16, r19); // ldclra w2, w16, [x19] + __ ldeora(Assembler::word, r26, r20, r23); // ldeora w26, w20, [x23] + __ ldorra(Assembler::word, r17, r6, sp); // ldseta w17, w6, [sp] + __ ldsmina(Assembler::word, r5, r23, r30); // ldsmina w5, w23, [x30] + __ ldsmaxa(Assembler::word, r11, r12, r14); // ldsmaxa w11, w12, [x14] + __ ldumina(Assembler::word, r2, r20, r13); // ldumina w2, w20, [x13] + __ ldumaxa(Assembler::word, r15, r17, r20); // ldumaxa w15, w17, [x20] // LSEOp - __ swpal(Assembler::word, r9, r8, r20); // swpal w9, w8, [x20] - __ ldaddal(Assembler::word, r8, zr, r30); // ldaddal w8, wzr, [x30] - __ ldbical(Assembler::word, r0, r6, r12); // ldclral w0, w6, [x12] - __ ldeoral(Assembler::word, r17, r23, r2); // ldeoral w17, w23, [x2] - __ ldorral(Assembler::word, r0, r30, r1); // ldsetal w0, w30, [x1] - __ ldsminal(Assembler::word, r22, r3, r15); // ldsminal w22, w3, [x15] - __ ldsmaxal(Assembler::word, r25, r21, r13); // ldsmaxal w25, w21, [x13] - __ lduminal(Assembler::word, r13, r24, r27); // lduminal w13, w24, [x27] - __ ldumaxal(Assembler::word, r20, r3, r11); // ldumaxal w20, w3, [x11] + __ swpal(Assembler::word, r6, r28, r23); // swpal w6, w28, [x23] + __ ldaddal(Assembler::word, r27, r16, r13); // ldaddal w27, w16, [x13] + __ ldbical(Assembler::word, r2, r23, r24); // ldclral w2, w23, [x24] + __ ldeoral(Assembler::word, r0, r28, r10); // ldeoral w0, w28, [x10] + __ ldorral(Assembler::word, r3, r15, r5); // ldsetal w3, w15, [x5] + __ ldsminal(Assembler::word, r3, r11, r29); // ldsminal w3, w11, [x29] + __ ldsmaxal(Assembler::word, r22, r27, r6); // ldsmaxal w22, w27, [x6] + __ lduminal(Assembler::word, r17, r20, r16); // lduminal w17, w20, [x16] + __ ldumaxal(Assembler::word, r23, r15, r7); // ldumaxal w23, w15, [x7] // LSEOp - __ swpl(Assembler::word, r3, r13, r21); // swpl w3, w13, [x21] - __ ldaddl(Assembler::word, r26, r15, r26); // ldaddl w26, w15, [x26] - __ ldbicl(Assembler::word, r9, r19, r2); // ldclrl w9, w19, [x2] - __ ldeorl(Assembler::word, r24, r29, r7); // ldeorl w24, w29, [x7] - __ ldorrl(Assembler::word, r29, r25, r15); // ldsetl w29, w25, [x15] - __ ldsminl(Assembler::word, r11, r30, r7); // ldsminl w11, w30, [x7] - __ ldsmaxl(Assembler::word, r11, r2, r6); // ldsmaxl w11, w2, [x6] - __ 
lduminl(Assembler::word, r16, r11, r14); // lduminl w16, w11, [x14] - __ ldumaxl(Assembler::word, r5, r8, r11); // ldumaxl w5, w8, [x11] + __ swpl(Assembler::word, r8, r16, r14); // swpl w8, w16, [x14] + __ ldaddl(Assembler::word, r23, r16, r23); // ldaddl w23, w16, [x23] + __ ldbicl(Assembler::word, r28, r12, r7); // ldclrl w28, w12, [x7] + __ ldeorl(Assembler::word, r28, r7, r19); // ldeorl w28, w7, [x19] + __ ldorrl(Assembler::word, r7, r12, r11); // ldsetl w7, w12, [x11] + __ ldsminl(Assembler::word, r10, zr, r20); // ldsminl w10, wzr, [x20] + __ ldsmaxl(Assembler::word, r9, r8, sp); // ldsmaxl w9, w8, [sp] + __ lduminl(Assembler::word, r10, r8, r2); // lduminl w10, w8, [x2] + __ ldumaxl(Assembler::word, r17, zr, sp); // ldumaxl w17, wzr, [sp] + +// SVEVectorOp + __ sve_add(z2, __ H, z7, z22); // add z2.h, z7.h, z22.h + __ sve_sub(z30, __ S, z22, z30); // sub z30.s, z22.s, z30.s + __ sve_fadd(z10, __ D, z22, z25); // fadd z10.d, z22.d, z25.d + __ sve_fmul(z23, __ D, z16, z12); // fmul z23.d, z16.d, z12.d + __ sve_fsub(z3, __ D, z17, z25); // fsub z3.d, z17.d, z25.d + __ sve_abs(z25, __ S, p0, z4); // abs z25.s, p0/m, z4.s + __ sve_add(z23, __ H, p6, z26); // add z23.h, p6/m, z23.h, z26.h + __ sve_asr(z6, __ D, p0, z17); // asr z6.d, p0/m, z6.d, z17.d + __ sve_cnt(z23, __ D, p3, z3); // cnt z23.d, p3/m, z3.d + __ sve_lsl(z11, __ S, p7, z9); // lsl z11.s, p7/m, z11.s, z9.s + __ sve_lsr(z27, __ S, p7, z3); // lsr z27.s, p7/m, z27.s, z3.s + __ sve_mul(z9, __ S, p4, z2); // mul z9.s, p4/m, z9.s, z2.s + __ sve_neg(z16, __ B, p2, z15); // neg z16.b, p2/m, z15.b + __ sve_not(z9, __ D, p2, z9); // not z9.d, p2/m, z9.d + __ sve_smax(z10, __ S, p5, z23); // smax z10.s, p5/m, z10.s, z23.s + __ sve_smin(z13, __ B, p5, z25); // smin z13.b, p5/m, z13.b, z25.b + __ sve_sub(z19, __ S, p5, z0); // sub z19.s, p5/m, z19.s, z0.s + __ sve_fabs(z17, __ D, p0, z22); // fabs z17.d, p0/m, z22.d + __ sve_fadd(z9, __ S, p2, z16); // fadd z9.s, p2/m, z9.s, z16.s + __ sve_fdiv(z17, __ S, p5, z0); // fdiv z17.s, p5/m, z17.s, z0.s + __ sve_fmax(z29, __ S, p5, z3); // fmax z29.s, p5/m, z29.s, z3.s + __ sve_fmin(z1, __ S, p3, z17); // fmin z1.s, p3/m, z1.s, z17.s + __ sve_fmul(z14, __ D, p2, z0); // fmul z14.d, p2/m, z14.d, z0.d + __ sve_fneg(z19, __ D, p4, z22); // fneg z19.d, p4/m, z22.d + __ sve_frintm(z17, __ D, p1, z15); // frintm z17.d, p1/m, z15.d + __ sve_frintn(z8, __ D, p4, z4); // frintn z8.d, p4/m, z4.d + __ sve_frintp(z5, __ D, p4, z29); // frintp z5.d, p4/m, z29.d + __ sve_fsqrt(z11, __ D, p0, z19); // fsqrt z11.d, p0/m, z19.d + __ sve_fsub(z10, __ D, p4, z28); // fsub z10.d, p4/m, z10.d, z28.d + __ sve_fmla(z13, __ D, p3, z15, z11); // fmla z13.d, p3/m, z15.d, z11.d + __ sve_fmls(z6, __ S, p7, z20, z15); // fmls z6.s, p7/m, z20.s, z15.s + __ sve_fnmla(z30, __ S, p2, z13, z7); // fnmla z30.s, p2/m, z13.s, z7.s + __ sve_fnmls(z22, __ D, p6, z14, z19); // fnmls z22.d, p6/m, z14.d, z19.d + __ sve_mla(z30, __ H, p3, z25, z0); // mla z30.h, p3/m, z25.h, z0.h + __ sve_mls(z10, __ D, p2, z24, z1); // mls z10.d, p2/m, z24.d, z1.d + __ sve_and(z6, z17, z22); // and z6.d, z17.d, z22.d + __ sve_eor(z10, z9, z17); // eor z10.d, z9.d, z17.d + __ sve_orr(z2, z12, z21); // orr z2.d, z12.d, z21.d + +// SVEReductionOp + __ sve_andv(v15, __ S, p6, z14); // andv s15, p6, z14.s + __ sve_orv(v9, __ D, p3, z7); // orv d9, p3, z7.d + __ sve_eorv(v30, __ H, p5, z9); // eorv h30, p5, z9.h + __ sve_smaxv(v7, __ S, p4, z26); // smaxv s7, p4, z26.s + __ sve_sminv(v20, __ S, p3, z29); // sminv s20, p3, z29.s + __ sve_fminv(v28, __ S, 
p3, z16); // fminv s28, p3, z16.s + __ sve_fmaxv(v6, __ D, p3, z9); // fmaxv d6, p3, z9.d + __ sve_fadda(v10, __ S, p5, z3); // fadda s10, p5, s10, z3.s + __ sve_uaddv(v21, __ B, p6, z8); // uaddv d21, p6, z8.b __ bind(forth); @@ -762,680 +846,780 @@ aarch64ops.o: file format elf64-littleaarch64 Disassembly of section .text: 0000000000000000 <back>: - 0: 8b18ec0f add x15, x0, x24, lsl #59 - 4: cb9636d1 sub x17, x22, x22, asr #13 - 8: ab1ce74a adds x10, x26, x28, lsl #57 - c: eb184a19 subs x25, x16, x24, lsl #18 - 10: 0b1c1ca8 add w8, w5, w28, lsl #7 - 14: 4b817388 sub w8, w28, w1, asr #28 - 18: 2b01004c adds w12, w2, w1 - 1c: 6b5164b7 subs w23, w5, w17, lsr #25 - 20: 8a0d5595 and x21, x12, x13, lsl #21 - 24: aa9791f5 orr x21, x15, x23, asr #36 - 28: ca9bc316 eor x22, x24, x27, asr #48 - 2c: ea82d1f6 ands x22, x15, x2, asr #52 - 30: 0a980e21 and w1, w17, w24, asr #3 - 34: 2a862c45 orr w5, w2, w6, asr #11 - 38: 4a453037 eor w23, w1, w5, lsr #12 - 3c: 6a8e5180 ands w0, w12, w14, asr #20 - 40: 8a621cc1 bic x1, x6, x2, lsr #7 - 44: aa24bd1e orn x30, x8, x4, lsl #47 - 48: cab4d6d1 eon x17, x22, x20, asr #53 - 4c: eaa591fd bics x29, x15, x5, asr #36 - 50: 0a7d6efe bic w30, w23, w29, lsr #27 - 54: 2a2253ac orn w12, w29, w2, lsl #20 - 58: 4aa61187 eon w7, w12, w6, asr #4 - 5c: 6aa755b0 bics w16, w13, w7, asr #21 - 60: 110b5a25 add w5, w17, #0x2d6 - 64: 31056e0a adds w10, w16, #0x15b - 68: 510f48ba sub w26, w5, #0x3d2 - 6c: 710ac715 subs w21, w24, #0x2b1 - 70: 910f6e0a add x10, x16, #0x3db - 74: b10a65ef adds x15, x15, #0x299 - 78: d1009e98 sub x24, x20, #0x27 - 7c: f10131aa subs x10, x13, #0x4c - 80: 121d4e67 and w7, w19, #0x7ffff8 - 84: 32043e25 orr w5, w17, #0xf0000fff - 88: 52132390 eor w16, w28, #0x3fe000 - 8c: 72160b0e ands w14, w24, #0x1c00 - 90: 9273e76e and x14, x27, #0xffffffffffffe07f - 94: b256416c orr x12, x11, #0x7fffc0000000000 - 98: d24b5002 eor x2, x0, #0xffe00000000003ff - 9c: f266da8d ands x13, x20, #0xfffffffffc01ffff - a0: 14000000 b a0 <back+0xa0> - a4: 17ffffd7 b 0 <back> - a8: 140001ee b 860 <forth> - ac: 94000000 bl ac <back+0xac> - b0: 97ffffd4 bl 0 <back> - b4: 940001eb bl 860 <forth> - b8: 3400000f cbz w15, b8 <back+0xb8> - bc: 34fffa2f cbz w15, 0 <back> - c0: 34003d0f cbz w15, 860 <forth> - c4: 3500001c cbnz w28, c4 <back+0xc4> - c8: 35fff9dc cbnz w28, 0 <back> - cc: 35003cbc cbnz w28, 860 <forth> - d0: b400001b cbz x27, d0 <back+0xd0> - d4: b4fff97b cbz x27, 0 <back> - d8: b4003c5b cbz x27, 860 <forth> - dc: b5000000 cbnz x0, dc <back+0xdc> - e0: b5fff900 cbnz x0, 0 <back> - e4: b5003be0 cbnz x0, 860 <forth> - e8: 1000000d adr x13, e8 <back+0xe8> - ec: 10fff8ad adr x13, 0 <back> - f0: 10003b8d adr x13, 860 <forth> - f4: 90000003 adrp x3, 0 <back> - f8: 36380015 tbz w21, #7, f8 <back+0xf8> - fc: 363ff835 tbz w21, #7, 0 <back> - 100: 36383b15 tbz w21, #7, 860 <forth> - 104: 3748000f tbnz w15, #9, 104 <back+0x104> - 108: 374ff7cf tbnz w15, #9, 0 <back> - 10c: 37483aaf tbnz w15, #9, 860 <forth> - 110: 12a14bee mov w14, #0xf5a0ffff // #-173998081 - 114: 5283bb51 mov w17, #0x1dda // #7642 - 118: 72858ebb movk w27, #0x2c75 - 11c: 92c98881 mov x1, #0xffffb3bbffffffff // #-83854941487105 - 120: d2aa50d4 mov x20, #0x52860000 // #1384513536 - 124: f2afd9d4 movk x20, #0x7ece, lsl #16 - 128: 935c504d sbfiz x13, x2, #36, #21 - 12c: 33133e90 bfi w16, w20, #13, #16 - 130: 5309196b ubfiz w11, w11, #23, #7 - 134: 93595482 sbfiz x2, x4, #39, #22 - 138: b3424e0d bfxil x13, x16, #2, #18 - 13c: d3481728 ubfiz x8, x25, #56, #6 - 140: 138a3b7d extr w29, w27, w10, #14 - 144: 93c66286 extr x6, 
x20, x6, #24 - 148: 54000000 b.eq 148 <back+0x148> // b.none - 14c: 54fff5a0 b.eq 0 <back> // b.none - 150: 54003880 b.eq 860 <forth> // b.none - 154: 54000001 b.ne 154 <back+0x154> // b.any - 158: 54fff541 b.ne 0 <back> // b.any - 15c: 54003821 b.ne 860 <forth> // b.any - 160: 54000002 b.cs 160 <back+0x160> // b.hs, b.nlast - 164: 54fff4e2 b.cs 0 <back> // b.hs, b.nlast - 168: 540037c2 b.cs 860 <forth> // b.hs, b.nlast - 16c: 54000002 b.cs 16c <back+0x16c> // b.hs, b.nlast - 170: 54fff482 b.cs 0 <back> // b.hs, b.nlast - 174: 54003762 b.cs 860 <forth> // b.hs, b.nlast - 178: 54000003 b.cc 178 <back+0x178> // b.lo, b.ul, b.last - 17c: 54fff423 b.cc 0 <back> // b.lo, b.ul, b.last - 180: 54003703 b.cc 860 <forth> // b.lo, b.ul, b.last - 184: 54000003 b.cc 184 <back+0x184> // b.lo, b.ul, b.last - 188: 54fff3c3 b.cc 0 <back> // b.lo, b.ul, b.last - 18c: 540036a3 b.cc 860 <forth> // b.lo, b.ul, b.last - 190: 54000004 b.mi 190 <back+0x190> // b.first - 194: 54fff364 b.mi 0 <back> // b.first - 198: 54003644 b.mi 860 <forth> // b.first - 19c: 54000005 b.pl 19c <back+0x19c> // b.nfrst - 1a0: 54fff305 b.pl 0 <back> // b.nfrst - 1a4: 540035e5 b.pl 860 <forth> // b.nfrst - 1a8: 54000006 b.vs 1a8 <back+0x1a8> - 1ac: 54fff2a6 b.vs 0 <back> - 1b0: 54003586 b.vs 860 <forth> - 1b4: 54000007 b.vc 1b4 <back+0x1b4> - 1b8: 54fff247 b.vc 0 <back> - 1bc: 54003527 b.vc 860 <forth> - 1c0: 54000008 b.hi 1c0 <back+0x1c0> // b.pmore - 1c4: 54fff1e8 b.hi 0 <back> // b.pmore - 1c8: 540034c8 b.hi 860 <forth> // b.pmore - 1cc: 54000009 b.ls 1cc <back+0x1cc> // b.plast - 1d0: 54fff189 b.ls 0 <back> // b.plast - 1d4: 54003469 b.ls 860 <forth> // b.plast - 1d8: 5400000a b.ge 1d8 <back+0x1d8> // b.tcont - 1dc: 54fff12a b.ge 0 <back> // b.tcont - 1e0: 5400340a b.ge 860 <forth> // b.tcont - 1e4: 5400000b b.lt 1e4 <back+0x1e4> // b.tstop - 1e8: 54fff0cb b.lt 0 <back> // b.tstop - 1ec: 540033ab b.lt 860 <forth> // b.tstop - 1f0: 5400000c b.gt 1f0 <back+0x1f0> - 1f4: 54fff06c b.gt 0 <back> - 1f8: 5400334c b.gt 860 <forth> - 1fc: 5400000d b.le 1fc <back+0x1fc> - 200: 54fff00d b.le 0 <back> - 204: 540032ed b.le 860 <forth> - 208: 5400000e b.al 208 <back+0x208> - 20c: 54ffefae b.al 0 <back> - 210: 5400328e b.al 860 <forth> - 214: 5400000f b.nv 214 <back+0x214> - 218: 54ffef4f b.nv 0 <back> - 21c: 5400322f b.nv 860 <forth> - 220: d40d2881 svc #0x6944 - 224: d40ea5c2 hvc #0x752e - 228: d40518a3 smc #0x28c5 - 22c: d42eca40 brk #0x7652 - 230: d44a2e60 hlt #0x5173 - 234: d503201f nop - 238: d69f03e0 eret - 23c: d6bf03e0 drps - 240: d5033fdf isb - 244: d5033d9f dsb ld - 248: d5033bbf dmb ish - 24c: d61f0120 br x9 - 250: d63f0120 blr x9 - 254: c8027d7d stxr w2, x29, [x11] - 258: c816ff85 stlxr w22, x5, [x28] - 25c: c85f7e8e ldxr x14, [x20] - 260: c85ffe7d ldaxr x29, [x19] - 264: c89ffea6 stlr x6, [x21] - 268: c8dffc73 ldar x19, [x3] - 26c: 880c7f63 stxr w12, w3, [x27] - 270: 8811fdfa stlxr w17, w26, [x15] - 274: 885f7dcd ldxr w13, [x14] - 278: 885fff4c ldaxr w12, [x26] - 27c: 889ffe28 stlr w8, [x17] - 280: 88dfffd5 ldar w21, [x30] - 284: 48007d6f stxrh w0, w15, [x11] - 288: 4811fc34 stlxrh w17, w20, [x1] - 28c: 485f7d1d ldxrh w29, [x8] - 290: 485ffd91 ldaxrh w17, [x12] - 294: 489ffc8b stlrh w11, [x4] - 298: 48dffc90 ldarh w16, [x4] - 29c: 080e7c85 stxrb w14, w5, [x4] - 2a0: 081bfe11 stlxrb w27, w17, [x16] - 2a4: 085f7f66 ldxrb w6, [x27] - 2a8: 085fff1b ldaxrb w27, [x24] - 2ac: 089ffe8a stlrb w10, [x20] - 2b0: 08dfff49 ldarb w9, [x26] - 2b4: c87f7b85 ldxp x5, x30, [x28] - 2b8: c87fa66a ldaxp x10, x9, [x19] - 2bc: c82b5590 stxp w11, x16, 
x21, [x12] - 2c0: c82adc94 stlxp w10, x20, x23, [x4] - 2c4: 887f0416 ldxp w22, w1, [x0] - 2c8: 887f8503 ldaxp w3, w1, [x8] - 2cc: 88205fc9 stxp w0, w9, w23, [x30] - 2d0: 8837c560 stlxp w23, w0, w17, [x11] - 2d4: f81e1146 stur x6, [x10, #-31] - 2d8: b81fb007 stur w7, [x0, #-5] - 2dc: 381f3205 sturb w5, [x16, #-13] - 2e0: 7801f27e sturh w30, [x19, #31] - 2e4: f8477130 ldur x16, [x9, #119] - 2e8: b843b208 ldur w8, [x16, #59] - 2ec: 385f918a ldurb w10, [x12, #-7] - 2f0: 785da12e ldurh w14, [x9, #-38] - 2f4: 389f83d8 ldursb x24, [x30, #-8] - 2f8: 78817087 ldursh x7, [x4, #23] - 2fc: 78dd91d1 ldursh w17, [x14, #-39] - 300: b89e136b ldursw x11, [x27, #-31] - 304: fc4410ec ldur d12, [x7, #65] - 308: bc5fe200 ldur s0, [x16, #-2] - 30c: fc15f2ed stur d13, [x23, #-161] - 310: bc1c2075 stur s21, [x3, #-62] - 314: f8064ca2 str x2, [x5, #100]! - 318: b81a4c29 str w9, [x1, #-92]! - 31c: 381fbfdb strb w27, [x30, #-5]! - 320: 7800cdfb strh w27, [x15, #12]! - 324: f852ce24 ldr x4, [x17, #-212]! - 328: b841eef5 ldr w21, [x23, #30]! - 32c: 385f9e2d ldrb w13, [x17, #-7]! - 330: 785cec19 ldrh w25, [x0, #-50]! - 334: 389ebea1 ldrsb x1, [x21, #-21]! - 338: 789caebc ldrsh x28, [x21, #-54]! - 33c: 78c02c8b ldrsh w11, [x4, #2]! - 340: b883dd31 ldrsw x17, [x9, #61]! - 344: fc427e7d ldr d29, [x19, #39]! - 348: bc5abed6 ldr s22, [x22, #-85]! - 34c: fc11ff29 str d9, [x25, #-225]! - 350: bc1f1c49 str s9, [x2, #-15]! - 354: f81be6ed str x13, [x23], #-66 - 358: b800a611 str w17, [x16], #10 - 35c: 381e05c1 strb w1, [x14], #-32 - 360: 78006411 strh w17, [x0], #6 - 364: f855473b ldr x27, [x25], #-172 - 368: b85da72d ldr w13, [x25], #-38 - 36c: 385e372b ldrb w11, [x25], #-29 - 370: 784144be ldrh w30, [x5], #20 - 374: 389f94e9 ldrsb x9, [x7], #-7 - 378: 789c2460 ldrsh x0, [x3], #-62 - 37c: 78c1f5c7 ldrsh w7, [x14], #31 - 380: b8827771 ldrsw x17, [x27], #39 - 384: fc515491 ldr d17, [x4], #-235 - 388: bc4226ba ldr s26, [x21], #34 - 38c: fc1c7625 str d5, [x17], #-57 - 390: bc1935ad str s13, [x13], #-109 - 394: f824da06 str x6, [x16, w4, sxtw #3] - 398: b834db09 str w9, [x24, w20, sxtw #2] - 39c: 38237ba3 strb w3, [x29, x3, lsl #0] - 3a0: 783e6a2a strh w10, [x17, x30] - 3a4: f867497b ldr x27, [x11, w7, uxtw] - 3a8: b87949ee ldr w14, [x15, w25, uxtw] - 3ac: 387379d8 ldrb w24, [x14, x19, lsl #0] - 3b0: 7866c810 ldrh w16, [x0, w6, sxtw] - 3b4: 38acd98a ldrsb x10, [x12, w12, sxtw #0] - 3b8: 78b0499a ldrsh x26, [x12, w16, uxtw] - 3bc: 78ee781a ldrsh w26, [x0, x14, lsl #1] - 3c0: b8bbf971 ldrsw x17, [x11, x27, sxtx #2] - 3c4: fc73d803 ldr d3, [x0, w19, sxtw #3] - 3c8: bc6979fa ldr s26, [x15, x9, lsl #2] - 3cc: fc30e9ab str d11, [x13, x16, sxtx] - 3d0: bc355a7a str s26, [x19, w21, uxtw #2] - 3d4: f91886a8 str x8, [x21, #12552] - 3d8: b918ef6a str w10, [x27, #6380] - 3dc: 391b15db strb w27, [x14, #1733] - 3e0: 791ac0f0 strh w16, [x7, #3424] - 3e4: f958753b ldr x27, [x9, #12520] - 3e8: b95a1958 ldr w24, [x10, #6680] - 3ec: 395b3f18 ldrb w24, [x24, #1743] - 3f0: 795800b4 ldrh w20, [x5, #3072] - 3f4: 39988891 ldrsb x17, [x4, #1570] - 3f8: 799a81ae ldrsh x14, [x13, #3392] - 3fc: 79dd172a ldrsh w10, [x25, #3722] - 400: b9981342 ldrsw x2, [x26, #6160] - 404: fd5d21da ldr d26, [x14, #14912] - 408: bd5e7c9c ldr s28, [x4, #7804] - 40c: fd1b526e str d14, [x19, #13984] - 410: bd18df97 str s23, [x28, #6364] - 414: 58002268 ldr x8, 860 <forth> - 418: 18ffdf51 ldr w17, 0 <back> - 41c: f8951080 prfum pldl1keep, [x4, #-175] - 420: d8000000 prfm pldl1keep, 420 <back+0x420> - 424: f8a4c900 prfm pldl1keep, [x8, w4, sxtw] - 428: f999e180 prfm pldl1keep, [x12, 
#13248] - 42c: 1a150374 adc w20, w27, w21 - 430: 3a060227 adcs w7, w17, w6 - 434: 5a1900c5 sbc w5, w6, w25 - 438: 7a0e017e sbcs w30, w11, w14 - 43c: 9a0b0223 adc x3, x17, x11 - 440: ba110159 adcs x25, x10, x17 - 444: da170207 sbc x7, x16, x23 - 448: fa050144 sbcs x4, x10, x5 - 44c: 0b2973c9 add w9, w30, w9, uxtx #4 - 450: 2b30a8a0 adds w0, w5, w16, sxth #2 - 454: cb3b8baf sub x15, x29, w27, sxtb #2 - 458: 6b21f12b subs w11, w9, w1, sxtx #4 - 45c: 8b264f02 add x2, x24, w6, uxtw #3 - 460: ab3a70d3 adds x19, x6, x26, uxtx #4 - 464: cb39ef48 sub x8, x26, x25, sxtx #3 - 468: eb29329a subs x26, x20, w9, uxth #4 - 46c: 3a5a41a7 ccmn w13, w26, #0x7, mi // mi = first - 470: 7a54310f ccmp w8, w20, #0xf, cc // cc = lo, ul, last - 474: ba4302c8 ccmn x22, x3, #0x8, eq // eq = none - 478: fa58a04a ccmp x2, x24, #0xa, ge // ge = tcont - 47c: 3a50490d ccmn w8, #0x10, #0xd, mi // mi = first - 480: 7a4c0a01 ccmp w16, #0xc, #0x1, eq // eq = none - 484: ba5f79e3 ccmn x15, #0x1f, #0x3, vc - 488: fa4c0aef ccmp x23, #0xc, #0xf, eq // eq = none - 48c: 1a9a30ee csel w14, w7, w26, cc // cc = lo, ul, last - 490: 1a9ed763 csinc w3, w27, w30, le - 494: 5a9702ab csinv w11, w21, w23, eq // eq = none - 498: 5a95c7da csneg w26, w30, w21, gt - 49c: 9a8d835c csel x28, x26, x13, hi // hi = pmore - 4a0: 9a909471 csinc x17, x3, x16, ls // ls = plast - 4a4: da8380ab csinv x11, x5, x3, hi // hi = pmore - 4a8: da93c461 csneg x1, x3, x19, gt - 4ac: 5ac00120 rbit w0, w9 - 4b0: 5ac005da rev16 w26, w14 - 4b4: 5ac00a2d rev w13, w17 - 4b8: 5ac0128b clz w11, w20 - 4bc: 5ac0163c cls w28, w17 - 4c0: dac0008d rbit x13, x4 - 4c4: dac007c1 rev16 x1, x30 - 4c8: dac009cd rev32 x13, x14 - 4cc: dac00d05 rev x5, x8 - 4d0: dac01322 clz x2, x25 - 4d4: dac01514 cls x20, x8 - 4d8: 1adb0b35 udiv w21, w25, w27 - 4dc: 1ad00d4d sdiv w13, w10, w16 - 4e0: 1ad1203c lsl w28, w1, w17 - 4e4: 1aca26f9 lsr w25, w23, w10 - 4e8: 1ac72867 asr w7, w3, w7 - 4ec: 1ace2fce ror w14, w30, w14 - 4f0: 9acf0acc udiv x12, x22, x15 - 4f4: 9acd0f22 sdiv x2, x25, x13 - 4f8: 9ad522e7 lsl x7, x23, x21 - 4fc: 9ac0258b lsr x11, x12, x0 - 500: 9adc293e asr x30, x9, x28 - 504: 9ad62cad ror x13, x5, x22 - 508: 9bc47ea5 umulh x5, x21, x4 - 50c: 9b477c51 smulh x17, x2, x7 - 510: 1b11318c madd w12, w12, w17, w12 - 514: 1b01edfe msub w30, w15, w1, w27 - 518: 9b117662 madd x2, x19, x17, x29 - 51c: 9b03fae4 msub x4, x23, x3, x30 - 520: 9b313eef smaddl x15, w23, w17, x15 - 524: 9b21b59b smsubl x27, w12, w1, x13 - 528: 9bac45a6 umaddl x6, w13, w12, x17 - 52c: 9ba6a839 umsubl x25, w1, w6, x10 - 530: 1e240871 fmul s17, s3, s4 - 534: 1e3518b0 fdiv s16, s5, s21 - 538: 1e312b63 fadd s3, s27, s17 - 53c: 1e2f3959 fsub s25, s10, s15 - 540: 1e200a2a fmul s10, s17, s0 - 544: 1e630b5c fmul d28, d26, d3 - 548: 1e7b1804 fdiv d4, d0, d27 - 54c: 1e6229dc fadd d28, d14, d2 - 550: 1e773b4c fsub d12, d26, d23 - 554: 1e610bcf fmul d15, d30, d1 - 558: 1f0534a4 fmadd s4, s5, s5, s13 - 55c: 1f1c85b5 fmsub s21, s13, s28, s1 - 560: 1f3d1c71 fnmadd s17, s3, s29, s7 - 564: 1f3d6b37 fnmadd s23, s25, s29, s26 - 568: 1f5e68ee fmadd d14, d7, d30, d26 - 56c: 1f4aa4f6 fmsub d22, d7, d10, d9 - 570: 1f6e24e7 fnmadd d7, d7, d14, d9 - 574: 1f6f630e fnmadd d14, d24, d15, d24 - 578: 1e204056 fmov s22, s2 - 57c: 1e20c060 fabs s0, s3 - 580: 1e214229 fneg s9, s17 - 584: 1e21c178 fsqrt s24, s11 - 588: 1e22c32f fcvt d15, s25 - 58c: 1e604064 fmov d4, d3 - 590: 1e60c2da fabs d26, d22 - 594: 1e61427e fneg d30, d19 - 598: 1e61c1cc fsqrt d12, d14 - 59c: 1e6240f1 fcvt s17, d7 - 5a0: 1e3801d8 fcvtzs w24, s14 - 5a4: 9e38034d fcvtzs x13, s26 - 
5a8: 1e780022 fcvtzs w2, d1 - 5ac: 9e780165 fcvtzs x5, d11 - 5b0: 1e22026e scvtf s14, w19 - 5b4: 9e2202c1 scvtf s1, x22 - 5b8: 1e62023b scvtf d27, w17 - 5bc: 9e620136 scvtf d22, x9 - 5c0: 1e26006e fmov w14, s3 - 5c4: 9e66022c fmov x12, d17 - 5c8: 1e270368 fmov s8, w27 - 5cc: 9e67039d fmov d29, x28 - 5d0: 1e3e2000 fcmp s0, s30 - 5d4: 1e692180 fcmp d12, d9 - 5d8: 1e202148 fcmp s10, #0.0 - 5dc: 1e602328 fcmp d25, #0.0 - 5e0: 292e7b68 stp w8, w30, [x27, #-144] - 5e4: 294a4f15 ldp w21, w19, [x24, #80] - 5e8: 69626c50 ldpsw x16, x27, [x2, #-240] - 5ec: a93814d5 stp x21, x5, [x6, #-128] - 5f0: a97e679d ldp x29, x25, [x28, #-32] - 5f4: 29903408 stp w8, w13, [x0, #128]! - 5f8: 29ec5039 ldp w25, w20, [x1, #-160]! - 5fc: 69fc62ce ldpsw x14, x24, [x22, #-32]! - 600: a98504d1 stp x17, x1, [x6, #80]! - 604: a9fc4735 ldp x21, x17, [x25, #-64]! - 608: 28b05691 stp w17, w21, [x20], #-128 - 60c: 28c8705c ldp w28, w28, [x2], #64 - 610: 68e07953 ldpsw x19, x30, [x10], #-256 - 614: a8bf3e31 stp x17, x15, [x17], #-16 - 618: a8fe0331 ldp x17, x0, [x25], #-32 - 61c: 283c170e stnp w14, w5, [x24, #-32] - 620: 284e4c37 ldnp w23, w19, [x1, #112] - 624: a80419cb stnp x11, x6, [x14, #64] - 628: a8722f62 ldnp x2, x11, [x27, #-224] - 62c: 0c407230 ld1 {v16.8b}, [x17] - 630: 4cdfa13d ld1 {v29.16b, v30.16b}, [x9], #32 - 634: 0cd56f1e ld1 {v30.1d, v31.1d, v0.1d}, [x24], x21 - 638: 4cdf2440 ld1 {v0.8h-v3.8h}, [x2], #64 - 63c: 0d40c134 ld1r {v20.8b}, [x9] - 640: 4ddfc811 ld1r {v17.4s}, [x0], #4 - 644: 0ddaced5 ld1r {v21.1d}, [x22], x26 - 648: 4c408f33 ld2 {v19.2d, v20.2d}, [x25] - 64c: 0cdf84aa ld2 {v10.4h, v11.4h}, [x5], #16 - 650: 4d60c30a ld2r {v10.16b, v11.16b}, [x24] - 654: 0dffcbad ld2r {v13.2s, v14.2s}, [x29], #8 - 658: 4de2cf96 ld2r {v22.2d, v23.2d}, [x28], x2 - 65c: 4ccb489e ld3 {v30.4s, v31.4s, v0.4s}, [x4], x11 - 660: 0c40481d ld3 {v29.2s-v31.2s}, [x0] - 664: 4d40e777 ld3r {v23.8h-v25.8h}, [x27] - 668: 4ddfe943 ld3r {v3.4s-v5.4s}, [x10], #12 - 66c: 0dd6edd3 ld3r {v19.1d-v21.1d}, [x14], x22 - 670: 4cdf040e ld4 {v14.8h-v17.8h}, [x0], #64 - 674: 0cd902de ld4 {v30.8b, v31.8b, v0.8b, v1.8b}, [x22], x25 - 678: 0d60e019 ld4r {v25.8b-v28.8b}, [x0] - 67c: 0dffe50a ld4r {v10.4h-v13.4h}, [x8], #8 - 680: 0dfce8c1 ld4r {v1.2s-v4.2s}, [x6], x28 - 684: ba5fd3e3 ccmn xzr, xzr, #0x3, le - 688: 3a5f03e5 ccmn wzr, wzr, #0x5, eq // eq = none - 68c: fa411be4 ccmp xzr, #0x1, #0x4, ne // ne = any - 690: 7a42cbe2 ccmp wzr, #0x2, #0x2, gt - 694: 93df03ff ror xzr, xzr, #0 - 698: c820ffff stlxp w0, xzr, xzr, [sp] - 69c: 8822fc7f stlxp w2, wzr, wzr, [x3] - 6a0: c8247cbf stxp w4, xzr, xzr, [x5] - 6a4: 88267fff stxp w6, wzr, wzr, [sp] - 6a8: 4e010fe0 dup v0.16b, wzr - 6ac: 4e081fe1 mov v1.d[0], xzr - 6b0: 4e0c1fe1 mov v1.s[1], wzr - 6b4: 4e0a1fe1 mov v1.h[2], wzr - 6b8: 4e071fe1 mov v1.b[3], wzr - 6bc: 4cc0ac3f ld1 {v31.2d, v0.2d}, [x1], x0 - 6c0: 1e601000 fmov d0, #2.000000000000000000e+00 - 6c4: 1e603000 fmov d0, #2.125000000000000000e+00 - 6c8: 1e621000 fmov d0, #4.000000000000000000e+00 - 6cc: 1e623000 fmov d0, #4.250000000000000000e+00 - 6d0: 1e641000 fmov d0, #8.000000000000000000e+00 - 6d4: 1e643000 fmov d0, #8.500000000000000000e+00 - 6d8: 1e661000 fmov d0, #1.600000000000000000e+01 - 6dc: 1e663000 fmov d0, #1.700000000000000000e+01 - 6e0: 1e681000 fmov d0, #1.250000000000000000e-01 - 6e4: 1e683000 fmov d0, #1.328125000000000000e-01 - 6e8: 1e6a1000 fmov d0, #2.500000000000000000e-01 - 6ec: 1e6a3000 fmov d0, #2.656250000000000000e-01 - 6f0: 1e6c1000 fmov d0, #5.000000000000000000e-01 - 6f4: 1e6c3000 fmov d0, #5.312500000000000000e-01 - 
6f8: 1e6e1000 fmov d0, #1.000000000000000000e+00 - 6fc: 1e6e3000 fmov d0, #1.062500000000000000e+00 - 700: 1e701000 fmov d0, #-2.000000000000000000e+00 - 704: 1e703000 fmov d0, #-2.125000000000000000e+00 - 708: 1e721000 fmov d0, #-4.000000000000000000e+00 - 70c: 1e723000 fmov d0, #-4.250000000000000000e+00 - 710: 1e741000 fmov d0, #-8.000000000000000000e+00 - 714: 1e743000 fmov d0, #-8.500000000000000000e+00 - 718: 1e761000 fmov d0, #-1.600000000000000000e+01 - 71c: 1e763000 fmov d0, #-1.700000000000000000e+01 - 720: 1e781000 fmov d0, #-1.250000000000000000e-01 - 724: 1e783000 fmov d0, #-1.328125000000000000e-01 - 728: 1e7a1000 fmov d0, #-2.500000000000000000e-01 - 72c: 1e7a3000 fmov d0, #-2.656250000000000000e-01 - 730: 1e7c1000 fmov d0, #-5.000000000000000000e-01 - 734: 1e7c3000 fmov d0, #-5.312500000000000000e-01 - 738: 1e7e1000 fmov d0, #-1.000000000000000000e+00 - 73c: 1e7e3000 fmov d0, #-1.062500000000000000e+00 - 740: f83081f4 swp x16, x20, [x15] - 744: f8220387 ldadd x2, x7, [x28] - 748: f834132a ldclr x20, x10, [x25] - 74c: f836204b ldeor x22, x11, [x2] - 750: f821326a ldset x1, x10, [x19] - 754: f82e5075 ldsmin x14, x21, [x3] - 758: f83c41bb ldsmax x28, x27, [x13] - 75c: f83172be ldumin x17, x30, [x21] - 760: f83b63b0 ldumax x27, x16, [x29] - 764: f8be8009 swpa x30, x9, [x0] - 768: f8bc039b ldadda x28, x27, [x28] - 76c: f8b51159 ldclra x21, x25, [x10] - 770: f8bf21f4 ldeora xzr, x20, [x15] - 774: f8a131d9 ldseta x1, x25, [x14] - 778: f8b553ba ldsmina x21, x26, [x29] - 77c: f8a8433d ldsmaxa x8, x29, [x25] - 780: f8ad7322 ldumina x13, x2, [x25] - 784: f8af6017 ldumaxa x15, x23, [x0] - 788: f8e38041 swpal x3, x1, [x2] - 78c: f8fc0283 ldaddal x28, x3, [x20] - 790: f8ee11df ldclral x14, xzr, [x14] - 794: f8e7205c ldeoral x7, x28, [x2] - 798: f8e030ab ldsetal x0, x11, [x5] - 79c: f8eb528e ldsminal x11, x14, [x20] - 7a0: f8ff4044 ldsmaxal xzr, x4, [x2] - 7a4: f8fa72c0 lduminal x26, x0, [x22] - 7a8: f8f161a1 ldumaxal x17, x1, [x13] - 7ac: f877829a swpl x23, x26, [x20] - 7b0: f86e018b ldaddl x14, x11, [x12] - 7b4: f86c11ff stclrl x12, [x15] - 7b8: f87b210e ldeorl x27, x14, [x8] - 7bc: f86a333e ldsetl x10, x30, [x25] - 7c0: f8765207 ldsminl x22, x7, [x16] - 7c4: f8614110 ldsmaxl x1, x16, [x8] - 7c8: f8617341 lduminl x1, x1, [x26] - 7cc: f86061f7 ldumaxl x0, x23, [x15] - 7d0: b82b8110 swp w11, w16, [x8] - 7d4: b82101c7 ldadd w1, w7, [x14] - 7d8: b830113f stclr w16, [x9] - 7dc: b83621a6 ldeor w22, w6, [x13] - 7e0: b82b308d ldset w11, w13, [x4] - 7e4: b8305016 ldsmin w16, w22, [x0] - 7e8: b83c415f stsmax w28, [x10] - 7ec: b8307105 ldumin w16, w5, [x8] - 7f0: b83a61f4 ldumax w26, w20, [x15] - 7f4: b8bb8206 swpa w27, w6, [x16] - 7f8: b8bf005f ldadda wzr, wzr, [x2] - 7fc: b8b8111c ldclra w24, w28, [x8] - 800: b8af22e9 ldeora w15, w9, [x23] - 804: b8ba30e2 ldseta w26, w2, [x7] - 808: b8a351f1 ldsmina w3, w17, [x15] - 80c: b8b342a5 ldsmaxa w19, w5, [x21] - 810: b8a7719a ldumina w7, w26, [x12] - 814: b8ac63a7 ldumaxa w12, w7, [x29] - 818: b8e98288 swpal w9, w8, [x20] - 81c: b8e803df ldaddal w8, wzr, [x30] - 820: b8e01186 ldclral w0, w6, [x12] - 824: b8f12057 ldeoral w17, w23, [x2] - 828: b8e0303e ldsetal w0, w30, [x1] - 82c: b8f651e3 ldsminal w22, w3, [x15] - 830: b8f941b5 ldsmaxal w25, w21, [x13] - 834: b8ed7378 lduminal w13, w24, [x27] - 838: b8f46163 ldumaxal w20, w3, [x11] - 83c: b86382ad swpl w3, w13, [x21] - 840: b87a034f ldaddl w26, w15, [x26] - 844: b8691053 ldclrl w9, w19, [x2] - 848: b87820fd ldeorl w24, w29, [x7] - 84c: b87d31f9 ldsetl w29, w25, [x15] - 850: b86b50fe ldsminl w11, w30, 
[x7] - 854: b86b40c2 ldsmaxl w11, w2, [x6] - 858: b87071cb lduminl w16, w11, [x14] - 85c: b8656168 ldumaxl w5, w8, [x11] + 0: 8b8e677b add x27, x27, x14, asr #25 + 4: cb512964 sub x4, x11, x17, lsr #10 + 8: ab998627 adds x7, x17, x25, asr #33 + c: eb9416cd subs x13, x22, x20, asr #5 + 10: 0b83438a add w10, w28, w3, asr #16 + 14: 4b463c55 sub w21, w2, w6, lsr #15 + 18: 2b9b2406 adds w6, w0, w27, asr #9 + 1c: 6b882b65 subs w5, w27, w8, asr #10 + 20: 8a879c8c and x12, x4, x7, asr #39 + 24: aa16cb75 orr x21, x27, x22, lsl #50 + 28: ca80baa3 eor x3, x21, x0, asr #46 + 2c: ea855955 ands x21, x10, x5, asr #22 + 30: 0a1d5aad and w13, w21, w29, lsl #22 + 34: 2a504951 orr w17, w10, w16, lsr #18 + 38: 4a976cf0 eor w16, w7, w23, asr #27 + 3c: 6a8c30ca ands w10, w6, w12, asr #12 + 40: 8a275b33 bic x19, x25, x7, lsl #22 + 44: aa27d459 orn x25, x2, x7, lsl #53 + 48: cab70ee9 eon x9, x23, x23, asr #3 + 4c: eaadc8c5 bics x5, x6, x13, asr #50 + 50: 0a2a26af bic w15, w21, w10, lsl #9 + 54: 2abe06b1 orn w17, w21, w30, asr #1 + 58: 4a3d4f87 eon w7, w28, w29, lsl #19 + 5c: 6ab632d9 bics w25, w22, w22, asr #12 + 60: 110c5346 add w6, w26, #0x314 + 64: 3107aa23 adds w3, w17, #0x1ea + 68: 5107eea5 sub w5, w21, #0x1fb + 6c: 710dcf76 subs w22, w27, #0x373 + 70: 9103d10c add x12, x8, #0xf4 + 74: b10e811d adds x29, x8, #0x3a0 + 78: d10a087a sub x26, x3, #0x282 + 7c: f109d1fd subs x29, x15, #0x274 + 80: 1209afd5 and w21, w30, #0xff87ff87 + 84: 32099d95 orr w21, w12, #0x7f807f80 + 88: 5202c62b eor w11, w17, #0xc0c0c0c0 + 8c: 720897da ands w26, w30, #0x3f003f00 + 90: 920e36f9 and x25, x23, #0xfffc0000fffc0000 + 94: b243f1de orr x30, x14, #0xe3ffffffffffffff + 98: d263d09a eor x26, x4, #0xffffffffe003ffff + 9c: f24fd01a ands x26, x0, #0xfffe003fffffffff + a0: 14000000 b a0 <back+0xa0> + a4: 17ffffd7 b 0 <back> + a8: 1400023e b 9a0 <forth> + ac: 94000000 bl ac <back+0xac> + b0: 97ffffd4 bl 0 <back> + b4: 9400023b bl 9a0 <forth> + b8: 3400001c cbz w28, b8 <back+0xb8> + bc: 34fffa3c cbz w28, 0 <back> + c0: 3400471c cbz w28, 9a0 <forth> + c4: 35000011 cbnz w17, c4 <back+0xc4> + c8: 35fff9d1 cbnz w17, 0 <back> + cc: 350046b1 cbnz w17, 9a0 <forth> + d0: b4000019 cbz x25, d0 <back+0xd0> + d4: b4fff979 cbz x25, 0 <back> + d8: b4004659 cbz x25, 9a0 <forth> + dc: b5000002 cbnz x2, dc <back+0xdc> + e0: b5fff902 cbnz x2, 0 <back> + e4: b50045e2 cbnz x2, 9a0 <forth> + e8: 1000001d adr x29, e8 <back+0xe8> + ec: 10fff8bd adr x29, 0 <back> + f0: 1000459d adr x29, 9a0 <forth> + f4: 9000001d adrp x29, 0 <back> + f8: 36300006 tbz w6, #6, f8 <back+0xf8> + fc: 3637f826 tbz w6, #6, 0 <back> + 100: 36304506 tbz w6, #6, 9a0 <forth> + 104: 37100015 tbnz w21, #2, 104 <back+0x104> + 108: 3717f7d5 tbnz w21, #2, 0 <back> + 10c: 371044b5 tbnz w21, #2, 9a0 <forth> + 110: 128155e8 mov w8, #0xfffff550 // #-2736 + 114: 52a5762b mov w11, #0x2bb10000 // #733020160 + 118: 72acb59a movk w26, #0x65ac, lsl #16 + 11c: 92866a8d mov x13, #0xffffffffffffccab // #-13141 + 120: d2e2d8a6 mov x6, #0x16c5000000000000 // #1640717639246413824 + 124: f2c54450 movk x16, #0x2a22, lsl #32 + 128: 93516bde sbfx x30, x30, #17, #10 + 12c: 330f3124 bfi w4, w9, #17, #13 + 130: 5301168f ubfx w15, w20, #1, #5 + 134: 9353391b sbfiz x27, x8, #45, #15 + 138: b355741e bfxil x30, x0, #21, #9 + 13c: d3562f5b ubfiz x27, x26, #42, #12 + 140: 13866d8c extr w12, w12, w6, #27 + 144: 93d6b5b3 extr x19, x13, x22, #45 + 148: 54000000 b.eq 148 <back+0x148> // b.none + 14c: 54fff5a0 b.eq 0 <back> // b.none + 150: 54004280 b.eq 9a0 <forth> // b.none + 154: 54000001 b.ne 154 <back+0x154> // b.any + 
158: 54fff541 b.ne 0 <back> // b.any + 15c: 54004221 b.ne 9a0 <forth> // b.any + 160: 54000002 b.cs 160 <back+0x160> // b.hs, b.nlast + 164: 54fff4e2 b.cs 0 <back> // b.hs, b.nlast + 168: 540041c2 b.cs 9a0 <forth> // b.hs, b.nlast + 16c: 54000002 b.cs 16c <back+0x16c> // b.hs, b.nlast + 170: 54fff482 b.cs 0 <back> // b.hs, b.nlast + 174: 54004162 b.cs 9a0 <forth> // b.hs, b.nlast + 178: 54000003 b.cc 178 <back+0x178> // b.lo, b.ul, b.last + 17c: 54fff423 b.cc 0 <back> // b.lo, b.ul, b.last + 180: 54004103 b.cc 9a0 <forth> // b.lo, b.ul, b.last + 184: 54000003 b.cc 184 <back+0x184> // b.lo, b.ul, b.last + 188: 54fff3c3 b.cc 0 <back> // b.lo, b.ul, b.last + 18c: 540040a3 b.cc 9a0 <forth> // b.lo, b.ul, b.last + 190: 54000004 b.mi 190 <back+0x190> // b.first + 194: 54fff364 b.mi 0 <back> // b.first + 198: 54004044 b.mi 9a0 <forth> // b.first + 19c: 54000005 b.pl 19c <back+0x19c> // b.nfrst + 1a0: 54fff305 b.pl 0 <back> // b.nfrst + 1a4: 54003fe5 b.pl 9a0 <forth> // b.nfrst + 1a8: 54000006 b.vs 1a8 <back+0x1a8> + 1ac: 54fff2a6 b.vs 0 <back> + 1b0: 54003f86 b.vs 9a0 <forth> + 1b4: 54000007 b.vc 1b4 <back+0x1b4> + 1b8: 54fff247 b.vc 0 <back> + 1bc: 54003f27 b.vc 9a0 <forth> + 1c0: 54000008 b.hi 1c0 <back+0x1c0> // b.pmore + 1c4: 54fff1e8 b.hi 0 <back> // b.pmore + 1c8: 54003ec8 b.hi 9a0 <forth> // b.pmore + 1cc: 54000009 b.ls 1cc <back+0x1cc> // b.plast + 1d0: 54fff189 b.ls 0 <back> // b.plast + 1d4: 54003e69 b.ls 9a0 <forth> // b.plast + 1d8: 5400000a b.ge 1d8 <back+0x1d8> // b.tcont + 1dc: 54fff12a b.ge 0 <back> // b.tcont + 1e0: 54003e0a b.ge 9a0 <forth> // b.tcont + 1e4: 5400000b b.lt 1e4 <back+0x1e4> // b.tstop + 1e8: 54fff0cb b.lt 0 <back> // b.tstop + 1ec: 54003dab b.lt 9a0 <forth> // b.tstop + 1f0: 5400000c b.gt 1f0 <back+0x1f0> + 1f4: 54fff06c b.gt 0 <back> + 1f8: 54003d4c b.gt 9a0 <forth> + 1fc: 5400000d b.le 1fc <back+0x1fc> + 200: 54fff00d b.le 0 <back> + 204: 54003ced b.le 9a0 <forth> + 208: 5400000e b.al 208 <back+0x208> + 20c: 54ffefae b.al 0 <back> + 210: 54003c8e b.al 9a0 <forth> + 214: 5400000f b.nv 214 <back+0x214> + 218: 54ffef4f b.nv 0 <back> + 21c: 54003c2f b.nv 9a0 <forth> + 220: d407da81 svc #0x3ed4 + 224: d402d542 hvc #0x16aa + 228: d406dae3 smc #0x36d7 + 22c: d4258fa0 brk #0x2c7d + 230: d44d5960 hlt #0x6acb + 234: d503201f nop + 238: d69f03e0 eret + 23c: d6bf03e0 drps + 240: d5033fdf isb + 244: d503339f dsb osh + 248: d50336bf dmb nshst + 24c: d61f0160 br x11 + 250: d63f0320 blr x25 + 254: c80e7daf stxr w14, x15, [x13] + 258: c81efc39 stlxr w30, x25, [x1] + 25c: c85f7c6d ldxr x13, [x3] + 260: c85ffea8 ldaxr x8, [x21] + 264: c89fff8d stlr x13, [x28] + 268: c8dfffc8 ldar x8, [x30] + 26c: 880d7f91 stxr w13, w17, [x28] + 270: 8815fe71 stlxr w21, w17, [x19] + 274: 885f7d03 ldxr w3, [x8] + 278: 885ffebd ldaxr w29, [x21] + 27c: 889fff09 stlr w9, [x24] + 280: 88dffcc2 ldar w2, [x6] + 284: 480c7e14 stxrh w12, w20, [x16] + 288: 4802fcbc stlxrh w2, w28, [x5] + 28c: 485f7c61 ldxrh w1, [x3] + 290: 485ffdb8 ldaxrh w24, [x13] + 294: 489fff2f stlrh w15, [x25] + 298: 48dffe8a ldarh w10, [x20] + 29c: 08057db0 stxrb w5, w16, [x13] + 2a0: 080afe2f stlxrb w10, w15, [x17] + 2a4: 085f7e71 ldxrb w17, [x19] + 2a8: 085ffd3e ldaxrb w30, [x9] + 2ac: 089fff14 stlrb w20, [x24] + 2b0: 08dffc8a ldarb w10, [x4] + 2b4: c87f2139 ldxp x25, x8, [x9] + 2b8: c87faa07 ldaxp x7, x10, [x16] + 2bc: c8392d30 stxp w25, x16, x11, [x9] + 2c0: c827a5e5 stlxp w7, x5, x9, [x15] + 2c4: 887f106c ldxp w12, w4, [x3] + 2c8: 887f88b1 ldaxp w17, w2, [x5] + 2cc: 882460c8 stxp w4, w8, w24, [x6] + 2d0: 8824e60c stlxp w4, w12, 
w25, [x16] + 2d4: f800b3ce stur x14, [x30, #11] + 2d8: b819f3a6 stur w6, [x29, #-97] + 2dc: 381f9162 sturb w2, [x11, #-7] + 2e0: 781ea114 sturh w20, [x8, #-22] + 2e4: f85e33b4 ldur x20, [x29, #-29] + 2e8: b85e6009 ldur w9, [x0, #-26] + 2ec: 3940204e ldrb w14, [x2, #8] + 2f0: 785e802d ldurh w13, [x1, #-24] + 2f4: 389f922d ldursb x13, [x17, #-7] + 2f8: 789f50f1 ldursh x17, [x7, #-11] + 2fc: 78dc4103 ldursh w3, [x8, #-60] + 300: b9800d8e ldrsw x14, [x12, #12] + 304: fc5152a5 ldur d5, [x21, #-235] + 308: bc5ca009 ldur s9, [x0, #-54] + 30c: fc05f10f stur d15, [x8, #95] + 310: bc1f0016 stur s22, [x0, #-16] + 314: f8111c97 str x23, [x4, #-239]! + 318: b8186c11 str w17, [x0, #-122]! + 31c: 381fbd3a strb w26, [x9, #-5]! + 320: 781f8dd5 strh w21, [x14, #-8]! + 324: f8417ce8 ldr x8, [x7, #23]! + 328: b8416d0c ldr w12, [x8, #22]! + 32c: 38406f9b ldrb w27, [x28, #6]! + 330: 785c6e66 ldrh w6, [x19, #-58]! + 334: 389ecca7 ldrsb x7, [x5, #-20]! + 338: 789e0e36 ldrsh x22, [x17, #-32]! + 33c: 78dfedb1 ldrsh w17, [x13, #-2]! + 340: b8816c9d ldrsw x29, [x4, #22]! + 344: fc5b2f88 ldr d8, [x28, #-78]! + 348: bc5fbd77 ldr s23, [x11, #-5]! + 34c: fc1e9e89 str d9, [x20, #-23]! + 350: bc199c65 str s5, [x3, #-103]! + 354: f802044d str x13, [x2], #32 + 358: b803967e str w30, [x19], #57 + 35c: 3800343d strb w29, [x1], #3 + 360: 781ef74a strh w10, [x26], #-17 + 364: f85f442f ldr x15, [x1], #-12 + 368: b85fa4a1 ldr w1, [x5], #-6 + 36c: 385f25f8 ldrb w24, [x15], #-14 + 370: 785fb63d ldrh w29, [x17], #-5 + 374: 389ef5e4 ldrsb x4, [x15], #-17 + 378: 789ca446 ldrsh x6, [x2], #-54 + 37c: 78c1277b ldrsh w27, [x27], #18 + 380: b89b3729 ldrsw x9, [x25], #-77 + 384: fc5507b5 ldr d21, [x29], #-176 + 388: bc5ce53e ldr s30, [x9], #-50 + 38c: fc1d2582 str d2, [x12], #-46 + 390: bc1c56a7 str s7, [x21], #-59 + 394: f837598c str x12, [x12, w23, uxtw #3] + 398: b8364bce str w14, [x30, w22, uxtw] + 39c: 383a586c strb w12, [x3, w26, uxtw #0] + 3a0: 783e49cb strh w11, [x14, w30, uxtw] + 3a4: f8787918 ldr x24, [x8, x24, lsl #3] + 3a8: b87469ac ldr w12, [x13, x20] + 3ac: 38655896 ldrb w22, [x4, w5, uxtw #0] + 3b0: 786658bc ldrh w28, [x5, w6, uxtw #1] + 3b4: 38b97962 ldrsb x2, [x11, x25, lsl #0] + 3b8: 78b9ead7 ldrsh x23, [x22, x25, sxtx] + 3bc: 78f6da83 ldrsh w3, [x20, w22, sxtw #1] + 3c0: b8aefba9 ldrsw x9, [x29, x14, sxtx #2] + 3c4: fc7dfaf0 ldr d16, [x23, x29, sxtx #3] + 3c8: bc747b87 ldr s7, [x28, x20, lsl #2] + 3cc: fc387a94 str d20, [x20, x24, lsl #3] + 3d0: bc377ab9 str s25, [x21, x23, lsl #2] + 3d4: f9180c51 str x17, [x2, #12312] + 3d8: b91b38fe str w30, [x7, #6968] + 3dc: 391ca4e3 strb w3, [x7, #1833] + 3e0: 791a4c27 strh w7, [x1, #3366] + 3e4: f95ca767 ldr x7, [x27, #14664] + 3e8: b9580e28 ldr w8, [x17, #6156] + 3ec: 3958ea20 ldrb w0, [x17, #1594] + 3f0: 795bd680 ldrh w0, [x20, #3562] + 3f4: 399a4633 ldrsb x19, [x17, #1681] + 3f8: 799d80d3 ldrsh x19, [x6, #3776] + 3fc: 79dcf944 ldrsh w4, [x10, #3708] + 400: b99b249d ldrsw x29, [x4, #6948] + 404: fd5a143d ldr d29, [x1, #13352] + 408: bd59938f ldr s15, [x28, #6544] + 40c: fd1b9347 str d7, [x26, #14112] + 410: bd1aa7c0 str s0, [x30, #6820] + 414: 58000019 ldr x25, 414 <back+0x414> + 418: 18000009 ldr w9, 418 <back+0x418> + 41c: f88692c0 prfum pldl1keep, [x22, #105] + 420: d8ffdf00 prfm pldl1keep, 0 <back> + 424: f8be7b80 prfm pldl1keep, [x28, x30, lsl #3] + 428: f99c8260 prfm pldl1keep, [x19, #14592] + 42c: 1a180111 adc w17, w8, w24 + 430: 3a09022e adcs w14, w17, w9 + 434: 5a190036 sbc w22, w1, w25 + 438: 7a13012f sbcs w15, w9, w19 + 43c: 9a0b028f adc x15, x20, x11 + 440: ba1e0164 
adcs x4, x11, x30 + 444: da060114 sbc x20, x8, x6 + 448: fa0f02aa sbcs x10, x21, x15 + 44c: 0b298d61 add w1, w11, w9, sxtb #3 + 450: 2b3cee24 adds w4, w17, w28, sxtx #3 + 454: cb3ca7b5 sub x21, x29, w28, sxth #1 + 458: 6b37d38b subs w11, w28, w23, sxtw #4 + 45c: 8b25f34c add x12, x26, x5, sxtx #4 + 460: ab3e68d1 adds x17, x6, x30, uxtx #2 + 464: cb210a87 sub x7, x20, w1, uxtb #2 + 468: eb3eed3e subs x30, x9, x30, sxtx #3 + 46c: 3a4b0087 ccmn w4, w11, #0x7, eq // eq = none + 470: 7a4571eb ccmp w15, w5, #0xb, vc + 474: ba5122e6 ccmn x23, x17, #0x6, cs // cs = hs, nlast + 478: fa4bc16a ccmp x11, x11, #0xa, gt + 47c: 3a4519cc ccmn w14, #0x5, #0xc, ne // ne = any + 480: 7a5c1aef ccmp w23, #0x1c, #0xf, ne // ne = any + 484: ba5e3a27 ccmn x17, #0x1e, #0x7, cc // cc = lo, ul, last + 488: fa4c8bc0 ccmp x30, #0xc, #0x0, hi // hi = pmore + 48c: 1a81537a csel w26, w27, w1, pl // pl = nfrst + 490: 1a95d56e csinc w14, w11, w21, le + 494: 5a8f60de csinv w30, w6, w15, vs + 498: 5a995451 csneg w17, w2, w25, pl // pl = nfrst + 49c: 9a8780b0 csel x16, x5, x7, hi // hi = pmore + 4a0: 9a9cc68a csinc x10, x20, x28, gt + 4a4: da8180e6 csinv x6, x7, x1, hi // hi = pmore + 4a8: da912756 csneg x22, x26, x17, cs // cs = hs, nlast + 4ac: 5ac000cb rbit w11, w6 + 4b0: 5ac00760 rev16 w0, w27 + 4b4: 5ac00ba1 rev w1, w29 + 4b8: 5ac012b4 clz w20, w21 + 4bc: 5ac0158c cls w12, w12 + 4c0: dac00278 rbit x24, x19 + 4c4: dac005f7 rev16 x23, x15 + 4c8: dac00831 rev32 x17, x1 + 4cc: dac00c7b rev x27, x3 + 4d0: dac010be clz x30, x5 + 4d4: dac0140f cls x15, x0 + 4d8: 1ad4080e udiv w14, w0, w20 + 4dc: 1ad50d9b sdiv w27, w12, w21 + 4e0: 1ada214c lsl w12, w10, w26 + 4e4: 1ac6266e lsr w14, w19, w6 + 4e8: 1ade2a7b asr w27, w19, w30 + 4ec: 1ad02dc6 ror w6, w14, w16 + 4f0: 9ac209b1 udiv x17, x13, x2 + 4f4: 9ac20fa0 sdiv x0, x29, x2 + 4f8: 9ac2220c lsl x12, x16, x2 + 4fc: 9add26e9 lsr x9, x23, x29 + 500: 9add2a26 asr x6, x17, x29 + 504: 9ada2fce ror x14, x30, x26 + 508: 9bda7f11 umulh x17, x24, x26 + 50c: 9b4e7f54 smulh x20, x26, x14 + 510: 1b021d1b madd w27, w8, w2, w7 + 514: 1b19b1bc msub w28, w13, w25, w12 + 518: 9b0a6d24 madd x4, x9, x10, x27 + 51c: 9b08f956 msub x22, x10, x8, x30 + 520: 9b391694 smaddl x20, w20, w25, x5 + 524: 9b2beed6 smsubl x22, w22, w11, x27 + 528: 9bac4cc4 umaddl x4, w6, w12, x19 + 52c: 9ba881f1 umsubl x17, w15, w8, x0 + 530: 1e2a08b6 fmul s22, s5, s10 + 534: 1e301904 fdiv s4, s8, s16 + 538: 1e262919 fadd s25, s8, s6 + 53c: 1e393b66 fsub s6, s27, s25 + 540: 1e290aea fmul s10, s23, s9 + 544: 1e6c0a36 fmul d22, d17, d12 + 548: 1e74180b fdiv d11, d0, d20 + 54c: 1e6f2980 fadd d0, d12, d15 + 550: 1e643acf fsub d15, d22, d4 + 554: 1e79083d fmul d29, d1, d25 + 558: 1f131769 fmadd s9, s27, s19, s5 + 55c: 1f06e87a fmsub s26, s3, s6, s26 + 560: 1f285184 fnmadd s4, s12, s8, s20 + 564: 1f354539 fnmadd s25, s9, s21, s17 + 568: 1f5e5867 fmadd d7, d3, d30, d22 + 56c: 1f4aab61 fmsub d1, d27, d10, d10 + 570: 1f760511 fnmadd d17, d8, d22, d1 + 574: 1f626f8e fnmadd d14, d28, d2, d27 + 578: 1e2043db fmov s27, s30 + 57c: 1e20c025 fabs s5, s1 + 580: 1e214277 fneg s23, s19 + 584: 1e21c23c fsqrt s28, s17 + 588: 1e22c0d9 fcvt d25, s6 + 58c: 1e6041d4 fmov d20, d14 + 590: 1e60c151 fabs d17, d10 + 594: 1e61422a fneg d10, d17 + 598: 1e61c235 fsqrt d21, d17 + 59c: 1e6241f5 fcvt s21, d15 + 5a0: 1e380167 fcvtzs w7, s11 + 5a4: 9e3803a2 fcvtzs x2, s29 + 5a8: 1e780323 fcvtzs w3, d25 + 5ac: 9e78011c fcvtzs x28, d8 + 5b0: 1e22006b scvtf s11, w3 + 5b4: 9e2202a2 scvtf s2, x21 + 5b8: 1e62033d scvtf d29, w25 + 5bc: 9e620073 scvtf d19, x3 + 5c0: 1e2603b4 
fmov w20, s29 + 5c4: 9e660237 fmov x23, d17 + 5c8: 1e270380 fmov s0, w28 + 5cc: 9e670289 fmov d9, x20 + 5d0: 1e2c20e0 fcmp s7, s12 + 5d4: 1e6e21a0 fcmp d13, d14 + 5d8: 1e202188 fcmp s12, #0.0 + 5dc: 1e602028 fcmp d1, #0.0 + 5e0: 29380acc stp w12, w2, [x22, #-64] + 5e4: 2966271b ldp w27, w9, [x24, #-208] + 5e8: 696a130f ldpsw x15, x4, [x24, #-176] + 5ec: a9015405 stp x5, x21, [x0, #16] + 5f0: a9735d26 ldp x6, x23, [x9, #-208] + 5f4: 29820fa0 stp w0, w3, [x29, #16]! + 5f8: 29ee403d ldp w29, w16, [x1, #-144]! + 5fc: 69c24ebb ldpsw x27, x19, [x21, #16]! + 600: a9b545a6 stp x6, x17, [x13, #-176]! + 604: a9c16020 ldp x0, x24, [x1, #16]! + 608: 288052c0 stp w0, w20, [x22], #0 + 60c: 28fa31d1 ldp w17, w12, [x14], #-48 + 610: 68ce682a ldpsw x10, x26, [x1], #112 + 614: a8ba61b4 stp x20, x24, [x13], #-96 + 618: a8c330e1 ldp x1, x12, [x7], #48 + 61c: 28362ae5 stnp w5, w10, [x23, #-80] + 620: 287a2b08 ldnp w8, w10, [x24, #-48] + 624: a8043d6b stnp x11, x15, [x11, #64] + 628: a84470a9 ldnp x9, x28, [x5, #64] + 62c: 0c40728b ld1 {v11.8b}, [x20] + 630: 4cdfa113 ld1 {v19.16b, v20.16b}, [x8], #32 + 634: 0cc36c43 ld1 {v3.1d-v5.1d}, [x2], x3 + 638: 4cdf2475 ld1 {v21.8h-v24.8h}, [x3], #64 + 63c: 0d40c0ae ld1r {v14.8b}, [x5] + 640: 4ddfcb6d ld1r {v13.4s}, [x27], #4 + 644: 0dc0ce71 ld1r {v17.1d}, [x19], x0 + 648: 4c408cbb ld2 {v27.2d, v28.2d}, [x5] + 64c: 0cdf849a ld2 {v26.4h, v27.4h}, [x4], #16 + 650: 4d60c2e8 ld2r {v8.16b, v9.16b}, [x23] + 654: 0dffc94e ld2r {v14.2s, v15.2s}, [x10], #8 + 658: 4df3ceaa ld2r {v10.2d, v11.2d}, [x21], x19 + 65c: 4cde49d1 ld3 {v17.4s-v19.4s}, [x14], x30 + 660: 0c404a94 ld3 {v20.2s-v22.2s}, [x20] + 664: 4d40e6b8 ld3r {v24.8h-v26.8h}, [x21] + 668: 4ddfe83a ld3r {v26.4s-v28.4s}, [x1], #12 + 66c: 0dc0ec4c ld3r {v12.1d-v14.1d}, [x2], x0 + 670: 4cdf04d5 ld4 {v21.8h-v24.8h}, [x6], #64 + 674: 0cd60391 ld4 {v17.8b-v20.8b}, [x28], x22 + 678: 0d60e333 ld4r {v19.8b-v22.8b}, [x25] + 67c: 0dffe6e6 ld4r {v6.4h-v9.4h}, [x23], #8 + 680: 0dfae928 ld4r {v8.2s-v11.2s}, [x9], x26 + 684: ba5fd3e3 ccmn xzr, xzr, #0x3, le + 688: 3a5f03e5 ccmn wzr, wzr, #0x5, eq // eq = none + 68c: fa411be4 ccmp xzr, #0x1, #0x4, ne // ne = any + 690: 7a42cbe2 ccmp wzr, #0x2, #0x2, gt + 694: 93df03ff ror xzr, xzr, #0 + 698: c820ffff stlxp w0, xzr, xzr, [sp] + 69c: 8822fc7f stlxp w2, wzr, wzr, [x3] + 6a0: c8247cbf stxp w4, xzr, xzr, [x5] + 6a4: 88267fff stxp w6, wzr, wzr, [sp] + 6a8: 4e010fe0 dup v0.16b, wzr + 6ac: 4e081fe1 mov v1.d[0], xzr + 6b0: 4e0c1fe1 mov v1.s[1], wzr + 6b4: 4e0a1fe1 mov v1.h[2], wzr + 6b8: 4e071fe1 mov v1.b[3], wzr + 6bc: 4cc0ac3f ld1 {v31.2d, v0.2d}, [x1], x0 + 6c0: 05a08020 mov z0.s, p0/m, s1 + 6c4: 04b0e3e0 incw x0 + 6c8: 0470e7e1 dech x1 + 6cc: 042f9c20 lsl z0.b, z1.b, #7 + 6d0: 043f9c35 lsl z21.h, z1.h, #15 + 6d4: 047f9c20 lsl z0.s, z1.s, #31 + 6d8: 04ff9c20 lsl z0.d, z1.d, #63 + 6dc: 04299420 lsr z0.b, z1.b, #7 + 6e0: 04319160 asr z0.h, z11.h, #15 + 6e4: 0461943e lsr z30.s, z1.s, #31 + 6e8: 04a19020 asr z0.d, z1.d, #63 + 6ec: 042053ff addvl sp, x0, #31 + 6f0: 047f5401 addpl x1, sp, #-32 + 6f4: 25208028 cntp x8, p0, p1.b + 6f8: 2538cfe0 mov z0.b, #127 + 6fc: 2578d001 mov z1.h, #-128 + 700: 25b8efe2 mov z2.s, #32512 + 704: 25f8f007 mov z7.d, #-32768 + 708: a400a3e0 ld1b {z0.b}, p0/z, [sp] + 70c: a4a8a7ea ld1h {z10.h}, p1/z, [sp, #-8, mul vl] + 710: a547a814 ld1w {z20.s}, p2/z, [x0, #7, mul vl] + 714: a4084ffe ld1b {z30.b}, p3/z, [sp, x8] + 718: a55c53e0 ld1w {z0.s}, p4/z, [sp, x28, lsl #2] + 71c: a5e1540b ld1d {z11.d}, p5/z, [x0, x1, lsl #3] + 720: e400fbf6 st1b {z22.b}, p6, [sp] + 724: e408ffff 
st1b {z31.b}, p7, [sp, #-8, mul vl] + 728: e547e400 st1w {z0.s}, p1, [x0, #7, mul vl] + 72c: e4014be0 st1b {z0.b}, p2, [sp, x1] + 730: e4a84fe0 st1h {z0.h}, p3, [sp, x8, lsl #1] + 734: e5e85000 st1d {z0.d}, p4, [x0, x8, lsl #3] + 738: 858043e0 ldr z0, [sp] + 73c: 85a043ff ldr z31, [sp, #-256, mul vl] + 740: e59f5d08 str z8, [x8, #255, mul vl] + 744: 1e601000 fmov d0, #2.000000000000000000e+00 + 748: 1e603000 fmov d0, #2.125000000000000000e+00 + 74c: 1e621000 fmov d0, #4.000000000000000000e+00 + 750: 1e623000 fmov d0, #4.250000000000000000e+00 + 754: 1e641000 fmov d0, #8.000000000000000000e+00 + 758: 1e643000 fmov d0, #8.500000000000000000e+00 + 75c: 1e661000 fmov d0, #1.600000000000000000e+01 + 760: 1e663000 fmov d0, #1.700000000000000000e+01 + 764: 1e681000 fmov d0, #1.250000000000000000e-01 + 768: 1e683000 fmov d0, #1.328125000000000000e-01 + 76c: 1e6a1000 fmov d0, #2.500000000000000000e-01 + 770: 1e6a3000 fmov d0, #2.656250000000000000e-01 + 774: 1e6c1000 fmov d0, #5.000000000000000000e-01 + 778: 1e6c3000 fmov d0, #5.312500000000000000e-01 + 77c: 1e6e1000 fmov d0, #1.000000000000000000e+00 + 780: 1e6e3000 fmov d0, #1.062500000000000000e+00 + 784: 1e701000 fmov d0, #-2.000000000000000000e+00 + 788: 1e703000 fmov d0, #-2.125000000000000000e+00 + 78c: 1e721000 fmov d0, #-4.000000000000000000e+00 + 790: 1e723000 fmov d0, #-4.250000000000000000e+00 + 794: 1e741000 fmov d0, #-8.000000000000000000e+00 + 798: 1e743000 fmov d0, #-8.500000000000000000e+00 + 79c: 1e761000 fmov d0, #-1.600000000000000000e+01 + 7a0: 1e763000 fmov d0, #-1.700000000000000000e+01 + 7a4: 1e781000 fmov d0, #-1.250000000000000000e-01 + 7a8: 1e783000 fmov d0, #-1.328125000000000000e-01 + 7ac: 1e7a1000 fmov d0, #-2.500000000000000000e-01 + 7b0: 1e7a3000 fmov d0, #-2.656250000000000000e-01 + 7b4: 1e7c1000 fmov d0, #-5.000000000000000000e-01 + 7b8: 1e7c3000 fmov d0, #-5.312500000000000000e-01 + 7bc: 1e7e1000 fmov d0, #-1.000000000000000000e+00 + 7c0: 1e7e3000 fmov d0, #-1.062500000000000000e+00 + 7c4: f82b82af swp x11, x15, [x21] + 7c8: f83700a8 ldadd x23, x8, [x5] + 7cc: f8271106 ldclr x7, x6, [x8] + 7d0: f82e22ee ldeor x14, x14, [x23] + 7d4: f82a3019 ldset x10, x25, [x0] + 7d8: f82552a9 ldsmin x5, x9, [x21] + 7dc: f824423b ldsmax x4, x27, [x17] + 7e0: f82a71a6 ldumin x10, x6, [x13] + 7e4: f8236203 ldumax x3, x3, [x16] + 7e8: f8a9805c swpa x9, x28, [x2] + 7ec: f8b70022 ldadda x23, x2, [x1] + 7f0: f8a410fa ldclra x4, x26, [x7] + 7f4: f8a02143 ldeora x0, x3, [x10] + 7f8: f8b83079 ldseta x24, x25, [x3] + 7fc: f8ab5028 ldsmina x11, x8, [x1] + 800: f8b043ad ldsmaxa x16, x13, [x29] + 804: f8a670a0 ldumina x6, x0, [x5] + 808: f8b061b1 ldumaxa x16, x17, [x13] + 80c: f8eb81db swpal x11, x27, [x14] + 810: f8e202ad ldaddal x2, x13, [x21] + 814: f8f6119f ldclral x22, xzr, [x12] + 818: f8e721fe ldeoral x7, x30, [x15] + 81c: f8e731f0 ldsetal x7, x16, [x15] + 820: f8f051ba ldsminal x16, x26, [x13] + 824: f8f74379 ldsmaxal x23, x25, [x27] + 828: f8e473ee lduminal x4, x14, [sp] + 82c: f8f86221 ldumaxal x24, x1, [x17] + 830: f8628308 swpl x2, x8, [x24] + 834: f874027b ldaddl x20, x27, [x19] + 838: f87310d1 ldclrl x19, x17, [x6] + 83c: f86e235c ldeorl x14, x28, [x26] + 840: f8623270 ldsetl x2, x16, [x19] + 844: f86e5090 ldsminl x14, x16, [x4] + 848: f8794128 ldsmaxl x25, x8, [x9] + 84c: f86a73a5 lduminl x10, x5, [x29] + 850: f86661c2 ldumaxl x6, x2, [x14] + 854: b831808b swp w17, w11, [x4] + 858: b82701f0 ldadd w7, w16, [x15] + 85c: b82b1139 ldclr w11, w25, [x9] + 860: b823200e ldeor w3, w14, [x0] + 864: b820301e ldset w0, w30, [x0] + 868: 
b826538a ldsmin w6, w10, [x28] + 86c: b82740ce ldsmax w7, w14, [x6] + 870: b826701e ldumin w6, w30, [x0] + 874: b83663be ldumax w22, w30, [x29] + 878: b8b0826e swpa w16, w14, [x19] + 87c: b8b50323 ldadda w21, w3, [x25] + 880: b8a21270 ldclra w2, w16, [x19] + 884: b8ba22f4 ldeora w26, w20, [x23] + 888: b8b133e6 ldseta w17, w6, [sp] + 88c: b8a553d7 ldsmina w5, w23, [x30] + 890: b8ab41cc ldsmaxa w11, w12, [x14] + 894: b8a271b4 ldumina w2, w20, [x13] + 898: b8af6291 ldumaxa w15, w17, [x20] + 89c: b8e682fc swpal w6, w28, [x23] + 8a0: b8fb01b0 ldaddal w27, w16, [x13] + 8a4: b8e21317 ldclral w2, w23, [x24] + 8a8: b8e0215c ldeoral w0, w28, [x10] + 8ac: b8e330af ldsetal w3, w15, [x5] + 8b0: b8e353ab ldsminal w3, w11, [x29] + 8b4: b8f640db ldsmaxal w22, w27, [x6] + 8b8: b8f17214 lduminal w17, w20, [x16] + 8bc: b8f760ef ldumaxal w23, w15, [x7] + 8c0: b86881d0 swpl w8, w16, [x14] + 8c4: b87702f0 ldaddl w23, w16, [x23] + 8c8: b87c10ec ldclrl w28, w12, [x7] + 8cc: b87c2267 ldeorl w28, w7, [x19] + 8d0: b867316c ldsetl w7, w12, [x11] + 8d4: b86a529f stsminl w10, [x20] + 8d8: b86943e8 ldsmaxl w9, w8, [sp] + 8dc: b86a7048 lduminl w10, w8, [x2] + 8e0: b87163ff stumaxl w17, [sp] + 8e4: 047600e2 add z2.h, z7.h, z22.h + 8e8: 04be06de sub z30.s, z22.s, z30.s + 8ec: 65d902ca fadd z10.d, z22.d, z25.d + 8f0: 65cc0a17 fmul z23.d, z16.d, z12.d + 8f4: 65d90623 fsub z3.d, z17.d, z25.d + 8f8: 0496a099 abs z25.s, p0/m, z4.s + 8fc: 04401b57 add z23.h, p6/m, z23.h, z26.h + 900: 04d08226 asr z6.d, p0/m, z6.d, z17.d + 904: 04daac77 cnt z23.d, p3/m, z3.d + 908: 04939d2b lsl z11.s, p7/m, z11.s, z9.s + 90c: 04919c7b lsr z27.s, p7/m, z27.s, z3.s + 910: 04901049 mul z9.s, p4/m, z9.s, z2.s + 914: 0417a9f0 neg z16.b, p2/m, z15.b + 918: 04dea929 not z9.d, p2/m, z9.d + 91c: 048816ea smax z10.s, p5/m, z10.s, z23.s + 920: 040a172d smin z13.b, p5/m, z13.b, z25.b + 924: 04811413 sub z19.s, p5/m, z19.s, z0.s + 928: 04dca2d1 fabs z17.d, p0/m, z22.d + 92c: 65808a09 fadd z9.s, p2/m, z9.s, z16.s + 930: 658d9411 fdiv z17.s, p5/m, z17.s, z0.s + 934: 6586947d fmax z29.s, p5/m, z29.s, z3.s + 938: 65878e21 fmin z1.s, p3/m, z1.s, z17.s + 93c: 65c2880e fmul z14.d, p2/m, z14.d, z0.d + 940: 04ddb2d3 fneg z19.d, p4/m, z22.d + 944: 65c2a5f1 frintm z17.d, p1/m, z15.d + 948: 65c0b088 frintn z8.d, p4/m, z4.d + 94c: 65c1b3a5 frintp z5.d, p4/m, z29.d + 950: 65cda26b fsqrt z11.d, p0/m, z19.d + 954: 65c1938a fsub z10.d, p4/m, z10.d, z28.d + 958: 65eb0ded fmla z13.d, p3/m, z15.d, z11.d + 95c: 65af3e86 fmls z6.s, p7/m, z20.s, z15.s + 960: 65a749be fnmla z30.s, p2/m, z13.s, z7.s + 964: 65f379d6 fnmls z22.d, p6/m, z14.d, z19.d + 968: 04404f3e mla z30.h, p3/m, z25.h, z0.h + 96c: 04c16b0a mls z10.d, p2/m, z24.d, z1.d + 970: 04363226 and z6.d, z17.d, z22.d + 974: 04b1312a eor z10.d, z9.d, z17.d + 978: 04753182 orr z2.d, z12.d, z21.d + 97c: 049a39cf andv s15, p6, z14.s + 980: 04d82ce9 orv d9, p3, z7.d + 984: 0459353e eorv h30, p5, z9.h + 988: 04883347 smaxv s7, p4, z26.s + 98c: 048a2fb4 sminv s20, p3, z29.s + 990: 65872e1c fminv s28, p3, z16.s + 994: 65c62d26 fmaxv d6, p3, z9.d + 998: 6598346a fadda s10, p5, s10, z3.s + 99c: 04013915 uaddv d21, p6, z8.b */ static const unsigned int insns[] = { - 0x8b18ec0f, 0xcb9636d1, 0xab1ce74a, 0xeb184a19, - 0x0b1c1ca8, 0x4b817388, 0x2b01004c, 0x6b5164b7, - 0x8a0d5595, 0xaa9791f5, 0xca9bc316, 0xea82d1f6, - 0x0a980e21, 0x2a862c45, 0x4a453037, 0x6a8e5180, - 0x8a621cc1, 0xaa24bd1e, 0xcab4d6d1, 0xeaa591fd, - 0x0a7d6efe, 0x2a2253ac, 0x4aa61187, 0x6aa755b0, - 0x110b5a25, 0x31056e0a, 0x510f48ba, 0x710ac715, - 0x910f6e0a, 0xb10a65ef, 
0xd1009e98, 0xf10131aa, - 0x121d4e67, 0x32043e25, 0x52132390, 0x72160b0e, - 0x9273e76e, 0xb256416c, 0xd24b5002, 0xf266da8d, - 0x14000000, 0x17ffffd7, 0x140001ee, 0x94000000, - 0x97ffffd4, 0x940001eb, 0x3400000f, 0x34fffa2f, - 0x34003d0f, 0x3500001c, 0x35fff9dc, 0x35003cbc, - 0xb400001b, 0xb4fff97b, 0xb4003c5b, 0xb5000000, - 0xb5fff900, 0xb5003be0, 0x1000000d, 0x10fff8ad, - 0x10003b8d, 0x90000003, 0x36380015, 0x363ff835, - 0x36383b15, 0x3748000f, 0x374ff7cf, 0x37483aaf, - 0x12a14bee, 0x5283bb51, 0x72858ebb, 0x92c98881, - 0xd2aa50d4, 0xf2afd9d4, 0x935c504d, 0x33133e90, - 0x5309196b, 0x93595482, 0xb3424e0d, 0xd3481728, - 0x138a3b7d, 0x93c66286, 0x54000000, 0x54fff5a0, - 0x54003880, 0x54000001, 0x54fff541, 0x54003821, - 0x54000002, 0x54fff4e2, 0x540037c2, 0x54000002, - 0x54fff482, 0x54003762, 0x54000003, 0x54fff423, - 0x54003703, 0x54000003, 0x54fff3c3, 0x540036a3, - 0x54000004, 0x54fff364, 0x54003644, 0x54000005, - 0x54fff305, 0x540035e5, 0x54000006, 0x54fff2a6, - 0x54003586, 0x54000007, 0x54fff247, 0x54003527, - 0x54000008, 0x54fff1e8, 0x540034c8, 0x54000009, - 0x54fff189, 0x54003469, 0x5400000a, 0x54fff12a, - 0x5400340a, 0x5400000b, 0x54fff0cb, 0x540033ab, - 0x5400000c, 0x54fff06c, 0x5400334c, 0x5400000d, - 0x54fff00d, 0x540032ed, 0x5400000e, 0x54ffefae, - 0x5400328e, 0x5400000f, 0x54ffef4f, 0x5400322f, - 0xd40d2881, 0xd40ea5c2, 0xd40518a3, 0xd42eca40, - 0xd44a2e60, 0xd503201f, 0xd69f03e0, 0xd6bf03e0, - 0xd5033fdf, 0xd5033d9f, 0xd5033bbf, 0xd61f0120, - 0xd63f0120, 0xc8027d7d, 0xc816ff85, 0xc85f7e8e, - 0xc85ffe7d, 0xc89ffea6, 0xc8dffc73, 0x880c7f63, - 0x8811fdfa, 0x885f7dcd, 0x885fff4c, 0x889ffe28, - 0x88dfffd5, 0x48007d6f, 0x4811fc34, 0x485f7d1d, - 0x485ffd91, 0x489ffc8b, 0x48dffc90, 0x080e7c85, - 0x081bfe11, 0x085f7f66, 0x085fff1b, 0x089ffe8a, - 0x08dfff49, 0xc87f7b85, 0xc87fa66a, 0xc82b5590, - 0xc82adc94, 0x887f0416, 0x887f8503, 0x88205fc9, - 0x8837c560, 0xf81e1146, 0xb81fb007, 0x381f3205, - 0x7801f27e, 0xf8477130, 0xb843b208, 0x385f918a, - 0x785da12e, 0x389f83d8, 0x78817087, 0x78dd91d1, - 0xb89e136b, 0xfc4410ec, 0xbc5fe200, 0xfc15f2ed, - 0xbc1c2075, 0xf8064ca2, 0xb81a4c29, 0x381fbfdb, - 0x7800cdfb, 0xf852ce24, 0xb841eef5, 0x385f9e2d, - 0x785cec19, 0x389ebea1, 0x789caebc, 0x78c02c8b, - 0xb883dd31, 0xfc427e7d, 0xbc5abed6, 0xfc11ff29, - 0xbc1f1c49, 0xf81be6ed, 0xb800a611, 0x381e05c1, - 0x78006411, 0xf855473b, 0xb85da72d, 0x385e372b, - 0x784144be, 0x389f94e9, 0x789c2460, 0x78c1f5c7, - 0xb8827771, 0xfc515491, 0xbc4226ba, 0xfc1c7625, - 0xbc1935ad, 0xf824da06, 0xb834db09, 0x38237ba3, - 0x783e6a2a, 0xf867497b, 0xb87949ee, 0x387379d8, - 0x7866c810, 0x38acd98a, 0x78b0499a, 0x78ee781a, - 0xb8bbf971, 0xfc73d803, 0xbc6979fa, 0xfc30e9ab, - 0xbc355a7a, 0xf91886a8, 0xb918ef6a, 0x391b15db, - 0x791ac0f0, 0xf958753b, 0xb95a1958, 0x395b3f18, - 0x795800b4, 0x39988891, 0x799a81ae, 0x79dd172a, - 0xb9981342, 0xfd5d21da, 0xbd5e7c9c, 0xfd1b526e, - 0xbd18df97, 0x58002268, 0x18ffdf51, 0xf8951080, - 0xd8000000, 0xf8a4c900, 0xf999e180, 0x1a150374, - 0x3a060227, 0x5a1900c5, 0x7a0e017e, 0x9a0b0223, - 0xba110159, 0xda170207, 0xfa050144, 0x0b2973c9, - 0x2b30a8a0, 0xcb3b8baf, 0x6b21f12b, 0x8b264f02, - 0xab3a70d3, 0xcb39ef48, 0xeb29329a, 0x3a5a41a7, - 0x7a54310f, 0xba4302c8, 0xfa58a04a, 0x3a50490d, - 0x7a4c0a01, 0xba5f79e3, 0xfa4c0aef, 0x1a9a30ee, - 0x1a9ed763, 0x5a9702ab, 0x5a95c7da, 0x9a8d835c, - 0x9a909471, 0xda8380ab, 0xda93c461, 0x5ac00120, - 0x5ac005da, 0x5ac00a2d, 0x5ac0128b, 0x5ac0163c, - 0xdac0008d, 0xdac007c1, 0xdac009cd, 0xdac00d05, - 0xdac01322, 0xdac01514, 0x1adb0b35, 0x1ad00d4d, - 0x1ad1203c, 0x1aca26f9, 
0x1ac72867, 0x1ace2fce, - 0x9acf0acc, 0x9acd0f22, 0x9ad522e7, 0x9ac0258b, - 0x9adc293e, 0x9ad62cad, 0x9bc47ea5, 0x9b477c51, - 0x1b11318c, 0x1b01edfe, 0x9b117662, 0x9b03fae4, - 0x9b313eef, 0x9b21b59b, 0x9bac45a6, 0x9ba6a839, - 0x1e240871, 0x1e3518b0, 0x1e312b63, 0x1e2f3959, - 0x1e200a2a, 0x1e630b5c, 0x1e7b1804, 0x1e6229dc, - 0x1e773b4c, 0x1e610bcf, 0x1f0534a4, 0x1f1c85b5, - 0x1f3d1c71, 0x1f3d6b37, 0x1f5e68ee, 0x1f4aa4f6, - 0x1f6e24e7, 0x1f6f630e, 0x1e204056, 0x1e20c060, - 0x1e214229, 0x1e21c178, 0x1e22c32f, 0x1e604064, - 0x1e60c2da, 0x1e61427e, 0x1e61c1cc, 0x1e6240f1, - 0x1e3801d8, 0x9e38034d, 0x1e780022, 0x9e780165, - 0x1e22026e, 0x9e2202c1, 0x1e62023b, 0x9e620136, - 0x1e26006e, 0x9e66022c, 0x1e270368, 0x9e67039d, - 0x1e3e2000, 0x1e692180, 0x1e202148, 0x1e602328, - 0x292e7b68, 0x294a4f15, 0x69626c50, 0xa93814d5, - 0xa97e679d, 0x29903408, 0x29ec5039, 0x69fc62ce, - 0xa98504d1, 0xa9fc4735, 0x28b05691, 0x28c8705c, - 0x68e07953, 0xa8bf3e31, 0xa8fe0331, 0x283c170e, - 0x284e4c37, 0xa80419cb, 0xa8722f62, 0x0c407230, - 0x4cdfa13d, 0x0cd56f1e, 0x4cdf2440, 0x0d40c134, - 0x4ddfc811, 0x0ddaced5, 0x4c408f33, 0x0cdf84aa, - 0x4d60c30a, 0x0dffcbad, 0x4de2cf96, 0x4ccb489e, - 0x0c40481d, 0x4d40e777, 0x4ddfe943, 0x0dd6edd3, - 0x4cdf040e, 0x0cd902de, 0x0d60e019, 0x0dffe50a, - 0x0dfce8c1, 0xba5fd3e3, 0x3a5f03e5, 0xfa411be4, + 0x8b8e677b, 0xcb512964, 0xab998627, 0xeb9416cd, + 0x0b83438a, 0x4b463c55, 0x2b9b2406, 0x6b882b65, + 0x8a879c8c, 0xaa16cb75, 0xca80baa3, 0xea855955, + 0x0a1d5aad, 0x2a504951, 0x4a976cf0, 0x6a8c30ca, + 0x8a275b33, 0xaa27d459, 0xcab70ee9, 0xeaadc8c5, + 0x0a2a26af, 0x2abe06b1, 0x4a3d4f87, 0x6ab632d9, + 0x110c5346, 0x3107aa23, 0x5107eea5, 0x710dcf76, + 0x9103d10c, 0xb10e811d, 0xd10a087a, 0xf109d1fd, + 0x1209afd5, 0x32099d95, 0x5202c62b, 0x720897da, + 0x920e36f9, 0xb243f1de, 0xd263d09a, 0xf24fd01a, + 0x14000000, 0x17ffffd7, 0x1400023e, 0x94000000, + 0x97ffffd4, 0x9400023b, 0x3400001c, 0x34fffa3c, + 0x3400471c, 0x35000011, 0x35fff9d1, 0x350046b1, + 0xb4000019, 0xb4fff979, 0xb4004659, 0xb5000002, + 0xb5fff902, 0xb50045e2, 0x1000001d, 0x10fff8bd, + 0x1000459d, 0x9000001d, 0x36300006, 0x3637f826, + 0x36304506, 0x37100015, 0x3717f7d5, 0x371044b5, + 0x128155e8, 0x52a5762b, 0x72acb59a, 0x92866a8d, + 0xd2e2d8a6, 0xf2c54450, 0x93516bde, 0x330f3124, + 0x5301168f, 0x9353391b, 0xb355741e, 0xd3562f5b, + 0x13866d8c, 0x93d6b5b3, 0x54000000, 0x54fff5a0, + 0x54004280, 0x54000001, 0x54fff541, 0x54004221, + 0x54000002, 0x54fff4e2, 0x540041c2, 0x54000002, + 0x54fff482, 0x54004162, 0x54000003, 0x54fff423, + 0x54004103, 0x54000003, 0x54fff3c3, 0x540040a3, + 0x54000004, 0x54fff364, 0x54004044, 0x54000005, + 0x54fff305, 0x54003fe5, 0x54000006, 0x54fff2a6, + 0x54003f86, 0x54000007, 0x54fff247, 0x54003f27, + 0x54000008, 0x54fff1e8, 0x54003ec8, 0x54000009, + 0x54fff189, 0x54003e69, 0x5400000a, 0x54fff12a, + 0x54003e0a, 0x5400000b, 0x54fff0cb, 0x54003dab, + 0x5400000c, 0x54fff06c, 0x54003d4c, 0x5400000d, + 0x54fff00d, 0x54003ced, 0x5400000e, 0x54ffefae, + 0x54003c8e, 0x5400000f, 0x54ffef4f, 0x54003c2f, + 0xd407da81, 0xd402d542, 0xd406dae3, 0xd4258fa0, + 0xd44d5960, 0xd503201f, 0xd69f03e0, 0xd6bf03e0, + 0xd5033fdf, 0xd503339f, 0xd50336bf, 0xd61f0160, + 0xd63f0320, 0xc80e7daf, 0xc81efc39, 0xc85f7c6d, + 0xc85ffea8, 0xc89fff8d, 0xc8dfffc8, 0x880d7f91, + 0x8815fe71, 0x885f7d03, 0x885ffebd, 0x889fff09, + 0x88dffcc2, 0x480c7e14, 0x4802fcbc, 0x485f7c61, + 0x485ffdb8, 0x489fff2f, 0x48dffe8a, 0x08057db0, + 0x080afe2f, 0x085f7e71, 0x085ffd3e, 0x089fff14, + 0x08dffc8a, 0xc87f2139, 0xc87faa07, 0xc8392d30, + 0xc827a5e5, 0x887f106c, 
0x887f88b1, 0x882460c8, + 0x8824e60c, 0xf800b3ce, 0xb819f3a6, 0x381f9162, + 0x781ea114, 0xf85e33b4, 0xb85e6009, 0x3940204e, + 0x785e802d, 0x389f922d, 0x789f50f1, 0x78dc4103, + 0xb9800d8e, 0xfc5152a5, 0xbc5ca009, 0xfc05f10f, + 0xbc1f0016, 0xf8111c97, 0xb8186c11, 0x381fbd3a, + 0x781f8dd5, 0xf8417ce8, 0xb8416d0c, 0x38406f9b, + 0x785c6e66, 0x389ecca7, 0x789e0e36, 0x78dfedb1, + 0xb8816c9d, 0xfc5b2f88, 0xbc5fbd77, 0xfc1e9e89, + 0xbc199c65, 0xf802044d, 0xb803967e, 0x3800343d, + 0x781ef74a, 0xf85f442f, 0xb85fa4a1, 0x385f25f8, + 0x785fb63d, 0x389ef5e4, 0x789ca446, 0x78c1277b, + 0xb89b3729, 0xfc5507b5, 0xbc5ce53e, 0xfc1d2582, + 0xbc1c56a7, 0xf837598c, 0xb8364bce, 0x383a586c, + 0x783e49cb, 0xf8787918, 0xb87469ac, 0x38655896, + 0x786658bc, 0x38b97962, 0x78b9ead7, 0x78f6da83, + 0xb8aefba9, 0xfc7dfaf0, 0xbc747b87, 0xfc387a94, + 0xbc377ab9, 0xf9180c51, 0xb91b38fe, 0x391ca4e3, + 0x791a4c27, 0xf95ca767, 0xb9580e28, 0x3958ea20, + 0x795bd680, 0x399a4633, 0x799d80d3, 0x79dcf944, + 0xb99b249d, 0xfd5a143d, 0xbd59938f, 0xfd1b9347, + 0xbd1aa7c0, 0x58000019, 0x18000009, 0xf88692c0, + 0xd8ffdf00, 0xf8be7b80, 0xf99c8260, 0x1a180111, + 0x3a09022e, 0x5a190036, 0x7a13012f, 0x9a0b028f, + 0xba1e0164, 0xda060114, 0xfa0f02aa, 0x0b298d61, + 0x2b3cee24, 0xcb3ca7b5, 0x6b37d38b, 0x8b25f34c, + 0xab3e68d1, 0xcb210a87, 0xeb3eed3e, 0x3a4b0087, + 0x7a4571eb, 0xba5122e6, 0xfa4bc16a, 0x3a4519cc, + 0x7a5c1aef, 0xba5e3a27, 0xfa4c8bc0, 0x1a81537a, + 0x1a95d56e, 0x5a8f60de, 0x5a995451, 0x9a8780b0, + 0x9a9cc68a, 0xda8180e6, 0xda912756, 0x5ac000cb, + 0x5ac00760, 0x5ac00ba1, 0x5ac012b4, 0x5ac0158c, + 0xdac00278, 0xdac005f7, 0xdac00831, 0xdac00c7b, + 0xdac010be, 0xdac0140f, 0x1ad4080e, 0x1ad50d9b, + 0x1ada214c, 0x1ac6266e, 0x1ade2a7b, 0x1ad02dc6, + 0x9ac209b1, 0x9ac20fa0, 0x9ac2220c, 0x9add26e9, + 0x9add2a26, 0x9ada2fce, 0x9bda7f11, 0x9b4e7f54, + 0x1b021d1b, 0x1b19b1bc, 0x9b0a6d24, 0x9b08f956, + 0x9b391694, 0x9b2beed6, 0x9bac4cc4, 0x9ba881f1, + 0x1e2a08b6, 0x1e301904, 0x1e262919, 0x1e393b66, + 0x1e290aea, 0x1e6c0a36, 0x1e74180b, 0x1e6f2980, + 0x1e643acf, 0x1e79083d, 0x1f131769, 0x1f06e87a, + 0x1f285184, 0x1f354539, 0x1f5e5867, 0x1f4aab61, + 0x1f760511, 0x1f626f8e, 0x1e2043db, 0x1e20c025, + 0x1e214277, 0x1e21c23c, 0x1e22c0d9, 0x1e6041d4, + 0x1e60c151, 0x1e61422a, 0x1e61c235, 0x1e6241f5, + 0x1e380167, 0x9e3803a2, 0x1e780323, 0x9e78011c, + 0x1e22006b, 0x9e2202a2, 0x1e62033d, 0x9e620073, + 0x1e2603b4, 0x9e660237, 0x1e270380, 0x9e670289, + 0x1e2c20e0, 0x1e6e21a0, 0x1e202188, 0x1e602028, + 0x29380acc, 0x2966271b, 0x696a130f, 0xa9015405, + 0xa9735d26, 0x29820fa0, 0x29ee403d, 0x69c24ebb, + 0xa9b545a6, 0xa9c16020, 0x288052c0, 0x28fa31d1, + 0x68ce682a, 0xa8ba61b4, 0xa8c330e1, 0x28362ae5, + 0x287a2b08, 0xa8043d6b, 0xa84470a9, 0x0c40728b, + 0x4cdfa113, 0x0cc36c43, 0x4cdf2475, 0x0d40c0ae, + 0x4ddfcb6d, 0x0dc0ce71, 0x4c408cbb, 0x0cdf849a, + 0x4d60c2e8, 0x0dffc94e, 0x4df3ceaa, 0x4cde49d1, + 0x0c404a94, 0x4d40e6b8, 0x4ddfe83a, 0x0dc0ec4c, + 0x4cdf04d5, 0x0cd60391, 0x0d60e333, 0x0dffe6e6, + 0x0dfae928, 0xba5fd3e3, 0x3a5f03e5, 0xfa411be4, 0x7a42cbe2, 0x93df03ff, 0xc820ffff, 0x8822fc7f, 0xc8247cbf, 0x88267fff, 0x4e010fe0, 0x4e081fe1, 0x4e0c1fe1, 0x4e0a1fe1, 0x4e071fe1, 0x4cc0ac3f, - 0x1e601000, 0x1e603000, 0x1e621000, 0x1e623000, - 0x1e641000, 0x1e643000, 0x1e661000, 0x1e663000, - 0x1e681000, 0x1e683000, 0x1e6a1000, 0x1e6a3000, - 0x1e6c1000, 0x1e6c3000, 0x1e6e1000, 0x1e6e3000, - 0x1e701000, 0x1e703000, 0x1e721000, 0x1e723000, - 0x1e741000, 0x1e743000, 0x1e761000, 0x1e763000, - 0x1e781000, 0x1e783000, 0x1e7a1000, 0x1e7a3000, - 0x1e7c1000, 0x1e7c3000, 
0x1e7e1000, 0x1e7e3000, - 0xf83081f4, 0xf8220387, 0xf834132a, 0xf836204b, - 0xf821326a, 0xf82e5075, 0xf83c41bb, 0xf83172be, - 0xf83b63b0, 0xf8be8009, 0xf8bc039b, 0xf8b51159, - 0xf8bf21f4, 0xf8a131d9, 0xf8b553ba, 0xf8a8433d, - 0xf8ad7322, 0xf8af6017, 0xf8e38041, 0xf8fc0283, - 0xf8ee11df, 0xf8e7205c, 0xf8e030ab, 0xf8eb528e, - 0xf8ff4044, 0xf8fa72c0, 0xf8f161a1, 0xf877829a, - 0xf86e018b, 0xf86c11ff, 0xf87b210e, 0xf86a333e, - 0xf8765207, 0xf8614110, 0xf8617341, 0xf86061f7, - 0xb82b8110, 0xb82101c7, 0xb830113f, 0xb83621a6, - 0xb82b308d, 0xb8305016, 0xb83c415f, 0xb8307105, - 0xb83a61f4, 0xb8bb8206, 0xb8bf005f, 0xb8b8111c, - 0xb8af22e9, 0xb8ba30e2, 0xb8a351f1, 0xb8b342a5, - 0xb8a7719a, 0xb8ac63a7, 0xb8e98288, 0xb8e803df, - 0xb8e01186, 0xb8f12057, 0xb8e0303e, 0xb8f651e3, - 0xb8f941b5, 0xb8ed7378, 0xb8f46163, 0xb86382ad, - 0xb87a034f, 0xb8691053, 0xb87820fd, 0xb87d31f9, - 0xb86b50fe, 0xb86b40c2, 0xb87071cb, 0xb8656168, + 0x05a08020, 0x04b0e3e0, 0x0470e7e1, 0x042f9c20, + 0x043f9c35, 0x047f9c20, 0x04ff9c20, 0x04299420, + 0x04319160, 0x0461943e, 0x04a19020, 0x042053ff, + 0x047f5401, 0x25208028, 0x2538cfe0, 0x2578d001, + 0x25b8efe2, 0x25f8f007, 0xa400a3e0, 0xa4a8a7ea, + 0xa547a814, 0xa4084ffe, 0xa55c53e0, 0xa5e1540b, + 0xe400fbf6, 0xe408ffff, 0xe547e400, 0xe4014be0, + 0xe4a84fe0, 0xe5e85000, 0x858043e0, 0x85a043ff, + 0xe59f5d08, 0x1e601000, 0x1e603000, 0x1e621000, + 0x1e623000, 0x1e641000, 0x1e643000, 0x1e661000, + 0x1e663000, 0x1e681000, 0x1e683000, 0x1e6a1000, + 0x1e6a3000, 0x1e6c1000, 0x1e6c3000, 0x1e6e1000, + 0x1e6e3000, 0x1e701000, 0x1e703000, 0x1e721000, + 0x1e723000, 0x1e741000, 0x1e743000, 0x1e761000, + 0x1e763000, 0x1e781000, 0x1e783000, 0x1e7a1000, + 0x1e7a3000, 0x1e7c1000, 0x1e7c3000, 0x1e7e1000, + 0x1e7e3000, 0xf82b82af, 0xf83700a8, 0xf8271106, + 0xf82e22ee, 0xf82a3019, 0xf82552a9, 0xf824423b, + 0xf82a71a6, 0xf8236203, 0xf8a9805c, 0xf8b70022, + 0xf8a410fa, 0xf8a02143, 0xf8b83079, 0xf8ab5028, + 0xf8b043ad, 0xf8a670a0, 0xf8b061b1, 0xf8eb81db, + 0xf8e202ad, 0xf8f6119f, 0xf8e721fe, 0xf8e731f0, + 0xf8f051ba, 0xf8f74379, 0xf8e473ee, 0xf8f86221, + 0xf8628308, 0xf874027b, 0xf87310d1, 0xf86e235c, + 0xf8623270, 0xf86e5090, 0xf8794128, 0xf86a73a5, + 0xf86661c2, 0xb831808b, 0xb82701f0, 0xb82b1139, + 0xb823200e, 0xb820301e, 0xb826538a, 0xb82740ce, + 0xb826701e, 0xb83663be, 0xb8b0826e, 0xb8b50323, + 0xb8a21270, 0xb8ba22f4, 0xb8b133e6, 0xb8a553d7, + 0xb8ab41cc, 0xb8a271b4, 0xb8af6291, 0xb8e682fc, + 0xb8fb01b0, 0xb8e21317, 0xb8e0215c, 0xb8e330af, + 0xb8e353ab, 0xb8f640db, 0xb8f17214, 0xb8f760ef, + 0xb86881d0, 0xb87702f0, 0xb87c10ec, 0xb87c2267, + 0xb867316c, 0xb86a529f, 0xb86943e8, 0xb86a7048, + 0xb87163ff, 0x047600e2, 0x04be06de, 0x65d902ca, + 0x65cc0a17, 0x65d90623, 0x0496a099, 0x04401b57, + 0x04d08226, 0x04daac77, 0x04939d2b, 0x04919c7b, + 0x04901049, 0x0417a9f0, 0x04dea929, 0x048816ea, + 0x040a172d, 0x04811413, 0x04dca2d1, 0x65808a09, + 0x658d9411, 0x6586947d, 0x65878e21, 0x65c2880e, + 0x04ddb2d3, 0x65c2a5f1, 0x65c0b088, 0x65c1b3a5, + 0x65cda26b, 0x65c1938a, 0x65eb0ded, 0x65af3e86, + 0x65a749be, 0x65f379d6, 0x04404f3e, 0x04c16b0a, + 0x04363226, 0x04b1312a, 0x04753182, 0x049a39cf, + 0x04d82ce9, 0x0459353e, 0x04883347, 0x048a2fb4, + 0x65872e1c, 0x65c62d26, 0x6598346a, 0x04013915, }; // END Generated code -- do not edit diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp index 8f0d7f5..13daa4e 100644 --- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp @@ -152,6 +152,9 @@ 
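The disassembly listing and the insns[] array above are regenerated together by aarch64-asmtest.py: each instruction added for SVE appears once as its expected machine encoding and once, in the generated test body, as the Assembler call that must produce that encoding. As a rough illustration only (the authoritative line is whatever the script emits, not hand-written here), the listing entry "25208028  cntp x8, p0, p1.b" would pair with a test call along the lines of

    __ sve_cntp(r8, __ B, p0, p1);   // expected to assemble to 0x25208028

which exercises the sve_cntp encoding this patch adds to assembler_aarch64.hpp.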
REGISTER_DECLARATION(Register, rdispatch, r21); // Java stack pointer REGISTER_DECLARATION(Register, esp, r20); +// Preserved predicate register with all elements set TRUE. +REGISTER_DECLARATION(PRegister, ptrue, p7); + #define assert_cond(ARG1) assert(ARG1, #ARG1) namespace asm_util { @@ -581,6 +584,18 @@ class Address { void lea(MacroAssembler *, Register) const; static bool offset_ok_for_immed(int64_t offset, uint shift = 0); + + static bool offset_ok_for_sve_immed(long offset, int shift, int vl /* sve vector length */) { + if (offset % vl == 0) { + // Convert address offset into sve imm offset (MUL VL). + int sve_offset = offset / vl; + if (((-(1 << (shift - 1))) <= sve_offset) && (sve_offset < (1 << (shift - 1)))) { + // sve_offset can be encoded + return true; + } + } + return false; + } }; // Convience classes @@ -2473,13 +2488,18 @@ public: f(sidx<<(int)T, 14, 11), f(1, 10), rf(Vn, 5), rf(Vd, 0); } - void umov(Register Rd, FloatRegister Vn, SIMD_RegVariant T, int idx) { - starti; - f(0, 31), f(T==D ? 1:0, 30), f(0b001110000, 29, 21); - f(((idx<<1)|1)<<(int)T, 20, 16), f(0b001111, 15, 10); - rf(Vn, 5), rf(Rd, 0); +#define INSN(NAME, op) \ + void NAME(Register Rd, FloatRegister Vn, SIMD_RegVariant T, int idx) { \ + starti; \ + f(0, 31), f(T==D ? 1:0, 30), f(0b001110000, 29, 21); \ + f(((idx<<1)|1)<<(int)T, 20, 16), f(op, 15, 10); \ + rf(Vn, 5), rf(Rd, 0); \ } + INSN(umov, 0b001111); + INSN(smov, 0b001011); +#undef INSN + #define INSN(NAME, opc, opc2, isSHR) \ void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, int shift){ \ starti; \ @@ -2721,6 +2741,240 @@ void ext(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister V f(0, 10), rf(Vn, 5), rf(Vd, 0); } +// SVE arithmetics - unpredicated +#define INSN(NAME, opcode) \ + void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) { \ + starti; \ + assert(T != Q, "invalid register variant"); \ + f(0b00000100, 31, 24), f(T, 23, 22), f(1, 21), \ + rf(Zm, 16), f(0, 15, 13), f(opcode, 12, 10), rf(Zn, 5), rf(Zd, 0); \ + } + INSN(sve_add, 0b000); + INSN(sve_sub, 0b001); +#undef INSN + +// SVE floating-point arithmetic - unpredicated +#define INSN(NAME, opcode) \ + void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) { \ + starti; \ + assert(T == S || T == D, "invalid register variant"); \ + f(0b01100101, 31, 24), f(T, 23, 22), f(0, 21), \ + rf(Zm, 16), f(0, 15, 13), f(opcode, 12, 10), rf(Zn, 5), rf(Zd, 0); \ + } + + INSN(sve_fadd, 0b000); + INSN(sve_fmul, 0b010); + INSN(sve_fsub, 0b001); +#undef INSN + +private: + void sve_predicate_reg_insn(unsigned op24, unsigned op13, + FloatRegister Zd_or_Vd, SIMD_RegVariant T, + PRegister Pg, FloatRegister Zn_or_Vn) { + starti; + f(op24, 31, 24), f(T, 23, 22), f(op13, 21, 13); + pgrf(Pg, 10), rf(Zn_or_Vn, 5), rf(Zd_or_Vd, 0); + } + +public: + +// SVE integer arithmetics - predicate +#define INSN(NAME, op1, op2) \ + void NAME(FloatRegister Zdn_or_Zd_or_Vd, SIMD_RegVariant T, PRegister Pg, FloatRegister Znm_or_Vn) { \ + assert(T != Q, "invalid register variant"); \ + sve_predicate_reg_insn(op1, op2, Zdn_or_Zd_or_Vd, T, Pg, Znm_or_Vn); \ + } + + INSN(sve_abs, 0b00000100, 0b010110101); // vector abs, unary + INSN(sve_add, 0b00000100, 0b000000000); // vector add + INSN(sve_andv, 0b00000100, 0b011010001); // bitwise and reduction to scalar + INSN(sve_asr, 0b00000100, 0b010000100); // vector arithmetic shift right + INSN(sve_cnt, 0b00000100, 0b011010101) // count non-zero bits + INSN(sve_cpy, 0b00000101, 0b100000100); // copy 
scalar to each active vector element + INSN(sve_eorv, 0b00000100, 0b011001001); // bitwise xor reduction to scalar + INSN(sve_lsl, 0b00000100, 0b010011100); // vector logical shift left + INSN(sve_lsr, 0b00000100, 0b010001100); // vector logical shift right + INSN(sve_mul, 0b00000100, 0b010000000); // vector mul + INSN(sve_neg, 0b00000100, 0b010111101); // vector neg, unary + INSN(sve_not, 0b00000100, 0b011110101); // bitwise invert vector, unary + INSN(sve_orv, 0b00000100, 0b011000001); // bitwise or reduction to scalar + INSN(sve_smax, 0b00000100, 0b001000000); // signed maximum vectors + INSN(sve_smaxv, 0b00000100, 0b001000001); // signed maximum reduction to scalar + INSN(sve_smin, 0b00000100, 0b001010000); // signed minimum vectors + INSN(sve_sminv, 0b00000100, 0b001010001); // signed minimum reduction to scalar + INSN(sve_sub, 0b00000100, 0b000001000); // vector sub + INSN(sve_uaddv, 0b00000100, 0b000001001); // unsigned add reduction to scalar +#undef INSN + +// SVE floating-point arithmetics - predicate +#define INSN(NAME, op1, op2) \ + void NAME(FloatRegister Zd_or_Zdn_or_Vd, SIMD_RegVariant T, PRegister Pg, FloatRegister Zn_or_Zm) { \ + assert(T == S || T == D, "invalid register variant"); \ + sve_predicate_reg_insn(op1, op2, Zd_or_Zdn_or_Vd, T, Pg, Zn_or_Zm); \ + } + + INSN(sve_fabs, 0b00000100, 0b011100101); + INSN(sve_fadd, 0b01100101, 0b000000100); + INSN(sve_fadda, 0b01100101, 0b011000001); // add strictly-ordered reduction to scalar Vd + INSN(sve_fdiv, 0b01100101, 0b001101100); + INSN(sve_fmax, 0b01100101, 0b000110100); // floating-point maximum + INSN(sve_fmaxv, 0b01100101, 0b000110001); // floating-point maximum recursive reduction to scalar + INSN(sve_fmin, 0b01100101, 0b000111100); // floating-point minimum + INSN(sve_fminv, 0b01100101, 0b000111001); // floating-point minimum recursive reduction to scalar + INSN(sve_fmul, 0b01100101, 0b000010100); + INSN(sve_fneg, 0b00000100, 0b011101101); + INSN(sve_frintm, 0b01100101, 0b000010101); // floating-point round to integral value, toward minus infinity + INSN(sve_frintn, 0b01100101, 0b000000101); // floating-point round to integral value, nearest with ties to even + INSN(sve_frintp, 0b01100101, 0b000001101); // floating-point round to integral value, toward plus infinity + INSN(sve_fsqrt, 0b01100101, 0b001101101); + INSN(sve_fsub, 0b01100101, 0b000001100); +#undef INSN + + // SVE multiple-add/sub - predicated +#define INSN(NAME, op0, op1, op2) \ + void NAME(FloatRegister Zda, SIMD_RegVariant T, PRegister Pg, FloatRegister Zn, FloatRegister Zm) { \ + starti; \ + assert(T != Q, "invalid size"); \ + f(op0, 31, 24), f(T, 23, 22), f(op1, 21), rf(Zm, 16); \ + f(op2, 15, 13), pgrf(Pg, 10), rf(Zn, 5), rf(Zda, 0); \ + } + + INSN(sve_fmla, 0b01100101, 1, 0b000); // floating-point fused multiply-add: Zda = Zda + Zn * Zm + INSN(sve_fmls, 0b01100101, 1, 0b001); // floating-point fused multiply-subtract: Zda = Zda + -Zn * Zm + INSN(sve_fnmla, 0b01100101, 1, 0b010); // floating-point negated fused multiply-add: Zda = -Zda + -Zn * Zm + INSN(sve_fnmls, 0b01100101, 1, 0b011); // floating-point negated fused multiply-subtract: Zda = -Zda + Zn * Zm + INSN(sve_mla, 0b00000100, 0, 0b010); // multiply-add: Zda = Zda + Zn*Zm + INSN(sve_mls, 0b00000100, 0, 0b011); // multiply-subtract: Zda = Zda + -Zn*Zm +#undef INSN + +// SVE bitwise logical - unpredicated +#define INSN(NAME, opc) \ + void NAME(FloatRegister Zd, FloatRegister Zn, FloatRegister Zm) { \ + starti; \ + f(0b00000100, 31, 24), f(opc, 23, 22), f(1, 21), \ + rf(Zm, 16), f(0b001100, 15, 
10), rf(Zn, 5), rf(Zd, 0); \ + } + INSN(sve_and, 0b00); + INSN(sve_eor, 0b10); + INSN(sve_orr, 0b01); +#undef INSN + +// SVE shift immediate - unpredicated +#define INSN(NAME, opc, isSHR) \ + void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, int shift) { \ + starti; \ + /* The encodings for the tszh:tszl:imm3 fields (bits 23:22 20:19 18:16) \ + * for shift right is calculated as: \ + * 0001 xxx B, shift = 16 - UInt(tszh:tszl:imm3) \ + * 001x xxx H, shift = 32 - UInt(tszh:tszl:imm3) \ + * 01xx xxx S, shift = 64 - UInt(tszh:tszl:imm3) \ + * 1xxx xxx D, shift = 128 - UInt(tszh:tszl:imm3) \ + * for shift left is calculated as: \ + * 0001 xxx B, shift = UInt(tszh:tszl:imm3) - 8 \ + * 001x xxx H, shift = UInt(tszh:tszl:imm3) - 16 \ + * 01xx xxx S, shift = UInt(tszh:tszl:imm3) - 32 \ + * 1xxx xxx D, shift = UInt(tszh:tszl:imm3) - 64 \ + */ \ + assert(T != Q, "Invalid register variant"); \ + if (isSHR) { \ + assert(((1 << (T + 3)) >= shift) && (shift > 0) , "Invalid shift value"); \ + } else { \ + assert(((1 << (T + 3)) > shift) && (shift >= 0) , "Invalid shift value"); \ + } \ + int cVal = (1 << ((T + 3) + (isSHR ? 1 : 0))); \ + int encodedShift = isSHR ? cVal - shift : cVal + shift; \ + int tszh = encodedShift >> 5; \ + int tszl_imm = encodedShift & 0x1f; \ + f(0b00000100, 31, 24); \ + f(tszh, 23, 22), f(1,21), f(tszl_imm, 20, 16); \ + f(0b100, 15, 13), f(opc, 12, 10), rf(Zn, 5), rf(Zd, 0); \ + } + + INSN(sve_asr, 0b100, /* isSHR = */ true); + INSN(sve_lsl, 0b111, /* isSHR = */ false); + INSN(sve_lsr, 0b101, /* isSHR = */ true); +#undef INSN + +private: + + // Scalar base + immediate index + void sve_ld_st1(FloatRegister Zt, Register Xn, int imm, PRegister Pg, + SIMD_RegVariant T, int op1, int type, int op2) { + starti; + assert_cond(T >= type); + f(op1, 31, 25), f(type, 24, 23), f(T, 22, 21); + f(0, 20), sf(imm, 19, 16), f(op2, 15, 13); + pgrf(Pg, 10), srf(Xn, 5), rf(Zt, 0); + } + + // Scalar base + scalar index + void sve_ld_st1(FloatRegister Zt, Register Xn, Register Xm, PRegister Pg, + SIMD_RegVariant T, int op1, int type, int op2) { + starti; + assert_cond(T >= type); + f(op1, 31, 25), f(type, 24, 23), f(T, 22, 21); + rf(Xm, 16), f(op2, 15, 13); + pgrf(Pg, 10), srf(Xn, 5), rf(Zt, 0); + } + + void sve_ld_st1(FloatRegister Zt, PRegister Pg, + SIMD_RegVariant T, const Address &a, + int op1, int type, int imm_op2, int scalar_op2) { + switch (a.getMode()) { + case Address::base_plus_offset: + sve_ld_st1(Zt, a.base(), a.offset(), Pg, T, op1, type, imm_op2); + break; + case Address::base_plus_offset_reg: + sve_ld_st1(Zt, a.base(), a.index(), Pg, T, op1, type, scalar_op2); + break; + default: + ShouldNotReachHere(); + } + } + +public: + +// SVE load/store - predicated +#define INSN(NAME, op1, type, imm_op2, scalar_op2) \ + void NAME(FloatRegister Zt, SIMD_RegVariant T, PRegister Pg, const Address &a) { \ + assert(T != Q, "invalid register variant"); \ + sve_ld_st1(Zt, Pg, T, a, op1, type, imm_op2, scalar_op2); \ + } + + INSN(sve_ld1b, 0b1010010, 0b00, 0b101, 0b010); + INSN(sve_st1b, 0b1110010, 0b00, 0b111, 0b010); + INSN(sve_ld1h, 0b1010010, 0b01, 0b101, 0b010); + INSN(sve_st1h, 0b1110010, 0b01, 0b111, 0b010); + INSN(sve_ld1w, 0b1010010, 0b10, 0b101, 0b010); + INSN(sve_st1w, 0b1110010, 0b10, 0b111, 0b010); + INSN(sve_ld1d, 0b1010010, 0b11, 0b101, 0b010); + INSN(sve_st1d, 0b1110010, 0b11, 0b111, 0b010); +#undef INSN + +// SVE load/store - unpredicated +#define INSN(NAME, op1) \ + void NAME(FloatRegister Zt, const Address &a) { \ + starti; \ + assert(a.index() == noreg, "invalid 
address variant"); \ + f(op1, 31, 29), f(0b0010110, 28, 22), sf(a.offset() >> 3, 21, 16), \ + f(0b010, 15, 13), f(a.offset() & 0x7, 12, 10), srf(a.base(), 5), rf(Zt, 0); \ + } + + INSN(sve_ldr, 0b100); // LDR (vector) + INSN(sve_str, 0b111); // STR (vector) +#undef INSN + +#define INSN(NAME, op) \ + void NAME(Register Xd, Register Xn, int imm6) { \ + starti; \ + f(0b000001000, 31, 23), f(op, 22, 21); \ + srf(Xn, 16), f(0b01010, 15, 11), sf(imm6, 10, 5), srf(Xd, 0); \ + } + + INSN(sve_addvl, 0b01); + INSN(sve_addpl, 0b11); +#undef INSN + // SVE inc/dec register by element count #define INSN(NAME, op) \ void NAME(Register Xdn, SIMD_RegVariant T, unsigned imm4 = 1, int pattern = 0b11111) { \ @@ -2734,6 +2988,45 @@ void ext(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister V INSN(sve_dec, 1); #undef INSN +// SVE predicate count + void sve_cntp(Register Xd, SIMD_RegVariant T, PRegister Pg, PRegister Pn) { + starti; + assert(T != Q, "invalid size"); + f(0b00100101, 31, 24), f(T, 23, 22), f(0b10000010, 21, 14); + prf(Pg, 10), f(0, 9), prf(Pn, 5), rf(Xd, 0); + } + + // SVE dup scalar + void sve_dup(FloatRegister Zd, SIMD_RegVariant T, Register Rn) { + starti; + assert(T != Q, "invalid size"); + f(0b00000101, 31, 24), f(T, 23, 22), f(0b100000001110, 21, 10); + srf(Rn, 5), rf(Zd, 0); + } + + // SVE dup imm + void sve_dup(FloatRegister Zd, SIMD_RegVariant T, int imm8) { + starti; + assert(T != Q, "invalid size"); + int sh = 0; + if (imm8 <= 127 && imm8 >= -128) { + sh = 0; + } else if (T != B && imm8 <= 32512 && imm8 >= -32768 && (imm8 & 0xff) == 0) { + sh = 1; + imm8 = (imm8 >> 8); + } else { + guarantee(false, "invalid immediate"); + } + f(0b00100101, 31, 24), f(T, 23, 22), f(0b11100011, 21, 14); + f(sh, 13), sf(imm8, 12, 5), rf(Zd, 0); + } + + void sve_ptrue(PRegister pd, SIMD_RegVariant esize, int pattern = 0b11111) { + starti; + f(0b00100101, 31, 24), f(esize, 23, 22), f(0b011000111000, 21, 10); + f(pattern, 9, 5), f(0b0, 4), prf(pd, 0); + } + Assembler(CodeBuffer* code) : AbstractAssembler(code) { } diff --git a/src/hotspot/cpu/aarch64/gc/z/zBarrierSetAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/gc/z/zBarrierSetAssembler_aarch64.cpp index 6ac54f2..a258528 100644 --- a/src/hotspot/cpu/aarch64/gc/z/zBarrierSetAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/gc/z/zBarrierSetAssembler_aarch64.cpp @@ -456,8 +456,12 @@ void ZBarrierSetAssembler::generate_c2_load_barrier_stub(MacroAssembler* masm, Z ZSetupArguments setup_arguments(masm, stub); __ mov(rscratch1, stub->slow_path()); __ blr(rscratch1); + if (UseSVE > 0) { + // Reinitialize the ptrue predicate register, in case the external runtime + // call clobbers ptrue reg, as we may return to SVE compiled code. + __ reinitialize_ptrue(); + } } - // Stub exit __ b(*stub->continuation()); } diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp index c70d424..7cfa70a 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp @@ -2131,8 +2131,17 @@ int MacroAssembler::pop(unsigned int bitset, Register stack) { } // Push lots of registers in the bit set supplied. Don't push sp. 
-// Return the number of words pushed +// Return the number of dwords pushed int MacroAssembler::push_fp(unsigned int bitset, Register stack) { + int words_pushed = 0; + bool use_sve = false; + int sve_vector_size_in_bytes = 0; + +#ifdef COMPILER2 + use_sve = Matcher::supports_scalable_vector(); + sve_vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE); +#endif + // Scan bitset to accumulate register pairs unsigned char regs[32]; int count = 0; @@ -2147,8 +2156,18 @@ int MacroAssembler::push_fp(unsigned int bitset, Register stack) { return 0; } + // SVE + if (use_sve && sve_vector_size_in_bytes > 16) { + sub(stack, stack, sve_vector_size_in_bytes * count); + for (int i = 0; i < count; i++) { + sve_str(as_FloatRegister(regs[i]), Address(stack, i)); + } + return count * sve_vector_size_in_bytes / 8; + } + add(stack, stack, -count * wordSize * 2); + // NEON if (count & 1) { strq(as_FloatRegister(regs[0]), Address(stack)); i += 1; @@ -2161,7 +2180,16 @@ int MacroAssembler::push_fp(unsigned int bitset, Register stack) { return count; } +// Return the number of dwords poped int MacroAssembler::pop_fp(unsigned int bitset, Register stack) { + int words_pushed = 0; + bool use_sve = false; + int sve_vector_size_in_bytes = 0; + +#ifdef COMPILER2 + use_sve = Matcher::supports_scalable_vector(); + sve_vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE); +#endif // Scan bitset to accumulate register pairs unsigned char regs[32]; int count = 0; @@ -2176,6 +2204,16 @@ int MacroAssembler::pop_fp(unsigned int bitset, Register stack) { return 0; } + // SVE + if (use_sve && sve_vector_size_in_bytes > 16) { + for (int i = count - 1; i >= 0; i--) { + sve_ldr(as_FloatRegister(regs[i]), Address(stack, i)); + } + add(stack, stack, sve_vector_size_in_bytes * count); + return count * sve_vector_size_in_bytes / 8; + } + + // NEON if (count & 1) { ldrq(as_FloatRegister(regs[0]), Address(stack)); i += 1; @@ -2659,23 +2697,39 @@ void MacroAssembler::pop_call_clobbered_registers() { pop(call_clobbered_registers() - RegSet::of(rscratch1, rscratch2), sp); } -void MacroAssembler::push_CPU_state(bool save_vectors) { - int step = (save_vectors ? 8 : 4) * wordSize; +void MacroAssembler::push_CPU_state(bool save_vectors, bool use_sve, + int sve_vector_size_in_bytes) { push(RegSet::range(r0, r29), sp); // integer registers except lr & sp - mov(rscratch1, -step); - sub(sp, sp, step); - for (int i = 28; i >= 4; i -= 4) { - st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), - as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1))); + if (save_vectors && use_sve && sve_vector_size_in_bytes > 16) { + sub(sp, sp, sve_vector_size_in_bytes * FloatRegisterImpl::number_of_registers); + for (int i = 0; i < FloatRegisterImpl::number_of_registers; i++) { + sve_str(as_FloatRegister(i), Address(sp, i)); + } + } else { + int step = (save_vectors ? 8 : 4) * wordSize; + mov(rscratch1, -step); + sub(sp, sp, step); + for (int i = 28; i >= 4; i -= 4) { + st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), + as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1))); + } + st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp); } - st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp); } -void MacroAssembler::pop_CPU_state(bool restore_vectors) { - int step = (restore_vectors ? 8 : 4) * wordSize; - for (int i = 0; i <= 28; i += 4) - ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), - as_FloatRegister(i+3), restore_vectors ? 
T2D : T1D, Address(post(sp, step))); +void MacroAssembler::pop_CPU_state(bool restore_vectors, bool use_sve, + int sve_vector_size_in_bytes) { + if (restore_vectors && use_sve && sve_vector_size_in_bytes > 16) { + for (int i = FloatRegisterImpl::number_of_registers - 1; i >= 0; i--) { + sve_ldr(as_FloatRegister(i), Address(sp, i)); + } + add(sp, sp, sve_vector_size_in_bytes * FloatRegisterImpl::number_of_registers); + } else { + int step = (restore_vectors ? 8 : 4) * wordSize; + for (int i = 0; i <= 28; i += 4) + ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), + as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step))); + } // integer registers except lr & sp pop(RegSet::range(r0, r17), sp); @@ -2732,6 +2786,21 @@ Address MacroAssembler::spill_address(int size, int offset, Register tmp) return Address(base, offset); } +Address MacroAssembler::sve_spill_address(int sve_reg_size_in_bytes, int offset, Register tmp) { + assert(offset >= 0, "spill to negative address?"); + + Register base = sp; + + // An immediate offset in the range 0 to 255 which is multiplied + // by the current vector or predicate register size in bytes. + if (offset % sve_reg_size_in_bytes == 0 && offset < ((1<<8)*sve_reg_size_in_bytes)) { + return Address(base, offset / sve_reg_size_in_bytes); + } + + add(tmp, base, offset); + return Address(tmp); +} + // Checks whether offset is aligned. // Returns true if it is, else false. bool MacroAssembler::merge_alignment_check(Register base, @@ -5930,3 +5999,13 @@ void MacroAssembler::verify_sve_vector_length() { stop("Error: SVE vector length has changed since jvm startup"); bind(verify_ok); } + +void MacroAssembler::verify_ptrue() { + Label verify_ok; + assert(UseSVE > 0, "should only be used for SVE"); + sve_cntp(rscratch1, B, ptrue, ptrue); // get true elements count. 
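  // If ptrue (p7) still has every element set, the sve_cntp above yields the
  // total number of byte-sized elements, i.e. the SVE vector length in bytes;
  // the sve_dec below subtracts exactly that element count, so rscratch1
  // reaches zero and the cbz branches to verify_ok. Any cleared element leaves
  // a non-zero remainder and falls through to the stop() message.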
+ sve_dec(rscratch1, B); + cbz(rscratch1, verify_ok); + stop("Error: the preserved predicate register (p7) elements are not all true"); + bind(verify_ok); +} diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp index ec9b3cc..07e3169 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp @@ -862,8 +862,10 @@ public: DEBUG_ONLY(void verify_heapbase(const char* msg);) - void push_CPU_state(bool save_vectors = false); - void pop_CPU_state(bool restore_vectors = false) ; + void push_CPU_state(bool save_vectors = false, bool use_sve = false, + int sve_vector_size_in_bytes = 0); + void pop_CPU_state(bool restore_vectors = false, bool use_sve = false, + int sve_vector_size_in_bytes = 0); // Round up to a power of two void round_to(Register reg, int modulus); @@ -939,6 +941,10 @@ public: Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0); void verify_sve_vector_length(); + void reinitialize_ptrue() { + sve_ptrue(ptrue, B); + } + void verify_ptrue(); // Debugging @@ -1338,6 +1344,7 @@ private: // Returns an address on the stack which is reachable with a ldr/str of size // Uses rscratch2 if the address is not directly reachable Address spill_address(int size, int offset, Register tmp=rscratch2); + Address sve_spill_address(int sve_reg_size_in_bytes, int offset, Register tmp=rscratch2); bool merge_alignment_check(Register base, size_t size, int64_t cur_offset, int64_t prev_offset) const; @@ -1361,6 +1368,9 @@ public: void spill(FloatRegister Vx, SIMD_RegVariant T, int offset) { str(Vx, T, spill_address(1 << (int)T, offset)); } + void spill_sve_vector(FloatRegister Zx, int offset, int vector_reg_size_in_bytes) { + sve_str(Zx, sve_spill_address(vector_reg_size_in_bytes, offset)); + } void unspill(Register Rx, bool is64, int offset) { if (is64) { ldr(Rx, spill_address(8, offset)); @@ -1371,6 +1381,9 @@ public: void unspill(FloatRegister Vx, SIMD_RegVariant T, int offset) { ldr(Vx, T, spill_address(1 << (int)T, offset)); } + void unspill_sve_vector(FloatRegister Zx, int offset, int vector_reg_size_in_bytes) { + sve_ldr(Zx, sve_spill_address(vector_reg_size_in_bytes, offset)); + } void spill_copy128(int src_offset, int dst_offset, Register tmp1=rscratch1, Register tmp2=rscratch2) { if (src_offset < 512 && (src_offset & 7) == 0 && @@ -1384,6 +1397,15 @@ public: spill(tmp1, true, dst_offset+8); } } + void spill_copy_sve_vector_stack_to_stack(int src_offset, int dst_offset, + int sve_vec_reg_size_in_bytes) { + assert(sve_vec_reg_size_in_bytes % 16 == 0, "unexpected sve vector reg size"); + for (int i = 0; i < sve_vec_reg_size_in_bytes / 16; i++) { + spill_copy128(src_offset, dst_offset); + src_offset += 16; + dst_offset += 16; + } + } }; #ifdef ASSERT diff --git a/src/hotspot/cpu/aarch64/register_definitions_aarch64.cpp b/src/hotspot/cpu/aarch64/register_definitions_aarch64.cpp index 1602a78..e476456 100644 --- a/src/hotspot/cpu/aarch64/register_definitions_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/register_definitions_aarch64.cpp @@ -196,3 +196,5 @@ REGISTER_DEFINITION(PRegister, p4); REGISTER_DEFINITION(PRegister, p5); REGISTER_DEFINITION(PRegister, p6); REGISTER_DEFINITION(PRegister, p7); + +REGISTER_DEFINITION(PRegister, ptrue); diff --git a/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp b/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp index 4b35aa6..491e29d 100644 --- a/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp +++ 
b/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp @@ -152,7 +152,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ // Save Integer and Float registers. __ enter(); - __ push_CPU_state(save_vectors); + __ push_CPU_state(save_vectors, use_sve, sve_vector_size_in_bytes); // Set an oopmap for the call site. This oopmap will map all // oop-registers and debug-info registers as callee-saved. This @@ -191,10 +191,15 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ } void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) { -#ifndef COMPILER2 +#ifdef COMPILER2 + __ pop_CPU_state(restore_vectors, Matcher::supports_scalable_vector(), + Matcher::scalable_vector_reg_size(T_BYTE)); +#else +#if !INCLUDE_JVMCI assert(!restore_vectors, "vectors are generated only by C2 and JVMCI"); #endif __ pop_CPU_state(restore_vectors); +#endif __ leave(); } @@ -2810,6 +2815,12 @@ SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_t __ maybe_isb(); __ membar(Assembler::LoadLoad | Assembler::LoadStore); + if (UseSVE > 0 && save_vectors) { + // Reinitialize the ptrue predicate register, in case the external runtime + // call clobbers ptrue reg, as we may return to SVE compiled code. + __ reinitialize_ptrue(); + } + __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); __ cbz(rscratch1, noException); diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index d307871..cd3f6f4 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -488,6 +488,11 @@ class StubGenerator: public StubCodeGenerator { __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); + if (UseSVE > 0 ) { + // Reinitialize the ptrue predicate register, in case the external runtime + // call clobbers ptrue reg, as we may return to SVE compiled code. + __ reinitialize_ptrue(); + } // we should not really care that lr is no longer the callee // address. we saved the value the handler needs in r19 so we can // just copy it to r3. however, the C2 handler will push its own @@ -5421,6 +5426,12 @@ class StubGenerator: public StubCodeGenerator { __ reset_last_Java_frame(true); __ maybe_isb(); + if (UseSVE > 0) { + // Reinitialize the ptrue predicate register, in case the external runtime + // call clobbers ptrue reg, as we may return to SVE compiled code. + __ reinitialize_ptrue(); + } + __ leave(); // check for pending exceptions diff --git a/src/hotspot/cpu/arm/arm.ad b/src/hotspot/cpu/arm/arm.ad index f142afa..98e6780 100644 --- a/src/hotspot/cpu/arm/arm.ad +++ b/src/hotspot/cpu/arm/arm.ad @@ -1093,7 +1093,7 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. } -const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { +const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { // TODO // identify extra cases that we might want to provide match rules for diff --git a/src/hotspot/cpu/ppc/ppc.ad b/src/hotspot/cpu/ppc/ppc.ad index b3bf64c..cc1e1a1 100644 --- a/src/hotspot/cpu/ppc/ppc.ad +++ b/src/hotspot/cpu/ppc/ppc.ad @@ -2242,7 +2242,7 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. 
} -const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { +const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { // TODO // identify extra cases that we might want to provide match rules for diff --git a/src/hotspot/cpu/s390/s390.ad b/src/hotspot/cpu/s390/s390.ad index ea09aaa..782c1c7 100644 --- a/src/hotspot/cpu/s390/s390.ad +++ b/src/hotspot/cpu/s390/s390.ad @@ -1522,7 +1522,7 @@ const bool Matcher::match_rule_supported(int opcode) { // BUT: make sure match rule is not disabled by a false predicate! } -const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { +const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { // TODO // Identify extra cases that we might want to provide match rules for // e.g. Op_ vector nodes and other intrinsics while guarding with vlen. diff --git a/src/hotspot/cpu/sparc/sparc.ad b/src/hotspot/cpu/sparc/sparc.ad index a09c795..3b1b104 100644 --- a/src/hotspot/cpu/sparc/sparc.ad +++ b/src/hotspot/cpu/sparc/sparc.ad @@ -1710,7 +1710,7 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. } -const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { +const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { // TODO // identify extra cases that we might want to provide match rules for diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index 4e1336f..b75d0ff 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -1379,7 +1379,7 @@ const bool Matcher::match_rule_supported(int opcode) { return ret_value; // Per default match rules are supported. } -const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { +const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { // identify extra cases that we might want to provide match rules for // e.g. Op_ vector nodes and other intrinsics while guarding with vlen bool ret_value = match_rule_supported(opcode); diff --git a/src/hotspot/share/opto/matcher.hpp b/src/hotspot/share/opto/matcher.hpp index ed890f8..9a83071 100644 --- a/src/hotspot/share/opto/matcher.hpp +++ b/src/hotspot/share/opto/matcher.hpp @@ -310,7 +310,7 @@ public: // identify extra cases that we might want to provide match rules for // e.g. Op_ vector nodes and other intrinsics while guarding with vlen - static const bool match_rule_supported_vector(int opcode, int vlen); + static const bool match_rule_supported_vector(int opcode, int vlen, BasicType bt); // Some microarchitectures have mask registers used on vectors static const bool has_predicated_vectors(void); diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index fed52e4..ee58323 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -96,8 +96,11 @@ static const bool _do_vector_loop_experimental = false; // Experimental vectoriz //------------------------------transform_loop--------------------------- void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) { assert(UseSuperWord, "should be"); - // Do vectors exist on this architecture? - if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return; + // SuperWord only works with power of two vector sizes. 
+ int vector_width = Matcher::vector_width_in_bytes(T_BYTE); + if (vector_width < 2 || !is_power_of_2(vector_width)) { + return; + } assert(lpt->_head->is_CountedLoop(), "must be"); CountedLoopNode *cl = lpt->_head->as_CountedLoop(); diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 1f2cf2c..6867177 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007, 2017, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2007, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -236,7 +236,7 @@ bool VectorNode::implemented(int opc, uint vlen, BasicType bt) { (vlen > 1) && is_power_of_2(vlen) && Matcher::vector_size_supported(bt, vlen)) { int vopc = VectorNode::opcode(opc, bt); - return vopc > 0 && Matcher::match_rule_supported_vector(vopc, vlen); + return vopc > 0 && Matcher::match_rule_supported_vector(vopc, vlen, bt); } return false; } @@ -653,7 +653,7 @@ bool ReductionNode::implemented(int opc, uint vlen, BasicType bt) { (vlen > 1) && is_power_of_2(vlen) && Matcher::vector_size_supported(bt, vlen)) { int vopc = ReductionNode::opcode(opc, bt); - return vopc != opc && Matcher::match_rule_supported(vopc); + return vopc != opc && Matcher::match_rule_supported_vector(vopc, vlen, bt); } return false; }
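For reference, the sve_spill_address() hunk above chooses between SVE's scaled-immediate addressing and a scratch-register fallback: the offset can be encoded directly only if it is an exact multiple of the vector or predicate register size and the scaled value fits in 8 bits. The standalone C++ sketch below models just that decision; it is not HotSpot code, and encode_sve_spill_offset() is a hypothetical helper invented for this illustration.

// Standalone sketch (not part of the patch): models the scaled-immediate
// check used by sve_spill_address() above. SVE LDR/STR take an immediate
// that is implicitly multiplied by the register size, so only offsets that
// are exact multiples of that size and whose scaled value fits in 0..255
// avoid materializing the address in a scratch register.
#include <cstdio>

// Hypothetical helper, for illustration only.
static bool encode_sve_spill_offset(int offset, int reg_size_in_bytes, int* scaled_imm) {
  if (offset < 0) return false;                       // spill offsets are non-negative
  if (offset % reg_size_in_bytes != 0) return false;  // must be an exact multiple
  int scaled = offset / reg_size_in_bytes;
  if (scaled >= (1 << 8)) return false;               // same bound as the patch: offset < (1<<8)*size
  *scaled_imm = scaled;
  return true;
}

int main() {
  int imm = -1;
  // With 256-bit (32-byte) vectors, offset 64 encodes as scaled immediate 2.
  if (encode_sve_spill_offset(64, 32, &imm)) {
    printf("offset 64, 32-byte regs: immediate form, imm=%d\n", imm);
  }
  // Offset 40 is not a multiple of 32, so the scratch-register path is needed.
  if (!encode_sve_spill_offset(40, 32, &imm)) {
    printf("offset 40, 32-byte regs: scratch-register path\n");
  }
  return 0;
}

Keeping spills on the immediate path lets a single sve_str/sve_ldr address the slot directly, which is why spill offsets are laid out as multiples of the register size whenever possible.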
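The verify_ptrue() hunk relies on a small piece of arithmetic: sve_cntp(rscratch1, B, ptrue, ptrue) yields the number of active byte-granule lanes in p7, and sve_dec(rscratch1, B) subtracts the total number of byte lanes per vector, so the result is zero exactly when every lane is still true. A minimal sketch of that check, with illustrative names only:

// Standalone sketch (not HotSpot code) of the verify_ptrue() arithmetic.
#include <cstdio>

static bool all_lanes_true(int active_byte_lanes, int vector_length_in_bytes) {
  // Mirrors: cnt = CNTP(p7); cnt -= VL_in_bytes; branch if cnt == 0.
  return (active_byte_lanes - vector_length_in_bytes) == 0;
}

int main() {
  printf("%d\n", all_lanes_true(32, 32)); // 1: predicate fully true for a 256-bit vector length
  printf("%d\n", all_lanes_true(16, 32)); // 0: predicate was clobbered, verify_ptrue() would stop()
  return 0;
}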