Projects
home:dingli:branches:openEuler:24.09-openjdk
openjdk-1.8.0
_service:tar_scm:8256488-Use-ldpq-stpq-instead-...
Sign Up
Log In
Username
Password
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File _service:tar_scm:8256488-Use-ldpq-stpq-instead-of-ld4-st4-for-small-c.patch of Package openjdk-1.8.0
Subject: 8256488: Use ldpq/stpq instead of ld4/st4 for small copies in StubGenerator::copy_memory -- .../cpu/aarch64/vm/stubGenerator_aarch64.cpp | 30 ++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp index f61028d5007..cf66df296e4 100644 --- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp +++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp @@ -1149,10 +1149,10 @@ class StubGenerator: public StubCodeGenerator { Register count, Register tmp, int step) { copy_direction direction = step < 0 ? copy_backwards : copy_forwards; bool is_backwards = step < 0; - int granularity = uabs(step); + unsigned granularity = uabs(step); const Register t0 = r3, t1 = r4; - // <= 96 bytes do inline. Direction doesn't matter because we always + // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always // load all the data before writing anything Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish; const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8; @@ -1207,9 +1207,31 @@ class StubGenerator: public StubCodeGenerator { // (96 bytes if SIMD because we do 32 byes per instruction) __ bind(copy80); if (UseSIMDForMemoryOps) { - __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0)); + __ ldpq(v0, v1, Address(s, 0)); + __ ldpq(v2, v3, Address(s, 32)); + // Unaligned pointers can be an issue for copying. + // The issue has more chances to happen when granularity of data is + // less than 4(sizeof(jint)). Pointers for arrays of jint are at least + // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned. + // The most performance drop has been seen for the range 65-80 bytes. + // For such cases using the pair of ldp/stp instead of the third pair of + // ldpq/stpq fixes the performance issue. + if (granularity < sizeof (jint)) { + Label copy96; + __ cmp(count, u1(80/granularity)); + __ br(Assembler::HI, copy96); + __ ldp(t0, t1, Address(send, -16)); + + __ stpq(v0, v1, Address(d, 0)); + __ stpq(v2, v3, Address(d, 32)); + __ stp(t0, t1, Address(dend, -16)); + __ b(finish); + + __ bind(copy96); + } __ ldpq(v4, v5, Address(send, -32)); - __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0)); + __ stpq(v0, v1, Address(d, 0)); + __ stpq(v2, v3, Address(d, 32)); __ stpq(v4, v5, Address(dend, -32)); } else { __ ldp(t0, t1, Address(s, 0)); -- 2.19.1
Locations
Projects
Search
Status Monitor
Help
Open Build Service
OBS Manuals
API Documentation
OBS Portal
Reporting a Bug
Contact
Mailing List
Forums
Chat (IRC)
Twitter
Open Build Service (OBS)
is an
openSUSE project
.
浙ICP备2022010568号-2