Projects
openEuler:24.03:SP1:Everything
gcc
_service:tar_scm:0049-Add-more-flexible-check-f...
Sign Up
Log In
Username
Password
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File _service:tar_scm:0049-Add-more-flexible-check-for-pointer-aliasing-during-.patch of Package gcc
From b5865aef36ebaac87ae30d51f08bfe081795ed67 Mon Sep 17 00:00:00 2001 From: Chernonog Viacheslav <chernonog.vyacheslav@huawei.com> Date: Tue, 12 Mar 2024 23:30:56 +0800 Subject: [PATCH 17/18] Add more flexible check for pointer aliasing during vectorization It takes the minimum of the number of iterations and the segment length; this helps to speed up loops with a small number of iterations when only the tail can be vectorized --- gcc/params.opt | 5 ++ .../sve/var_stride_flexible_segment_len_1.c | 23 +++++++ gcc/tree-data-ref.cc | 67 +++++++++++++------ gcc/tree-data-ref.h | 11 ++- gcc/tree-vect-data-refs.cc | 14 +++- 5 files changed, 95 insertions(+), 25 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c diff --git a/gcc/params.opt b/gcc/params.opt index 6176d4790..7e5c119cf 100644 --- a/gcc/params.opt +++ b/gcc/params.opt @@ -1180,6 +1180,11 @@ Maximum number of loop peels to enhance alignment of data references in a loop. Common Joined UInteger Var(param_vect_max_version_for_alias_checks) Init(10) Param Optimization Bound on number of runtime checks inserted by the vectorizer's loop versioning for alias check. +-param=vect-alias-flexible-segment-len= +Common Joined UInteger Var(param_flexible_seg_len) Init(0) IntegerRange(0, 1) Param Optimization +Use a minimum length of different segments. Currently the minimum between +iteration number and vectorization length is chosen by this param. + -param=vect-max-version-for-alignment-checks= Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check. 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c new file mode 100644 index 000000000..894f075f3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize --param=vect-alias-flexible-segment-len=1" } */ + +#define TYPE int +#define SIZE 257 + +void __attribute__ ((weak)) +f (TYPE *x, TYPE *y, unsigned short n, long m __attribute__((unused))) +{ + for (int i = 0; i < SIZE; ++i) + x[i * n] += y[i * n]; +} + +/* { dg-final { scan-assembler {\tld1w\tz[0-9]+} } } */ +/* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */ +/* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ +/* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ +/* Should use a WAR check that multiplies by (VF-2)*4 rather than + an overlap check that multiplies by (257-1)*4. */ +/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */ +/* One range check and a check for n being zero. */ +/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 2 } } */ +/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */ diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc index 397792c35..e6ae9e847 100644 --- a/gcc/tree-data-ref.cc +++ b/gcc/tree-data-ref.cc @@ -2329,31 +2329,15 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr, same arguments. Try to optimize cases in which the second access is a write and in which some overlap is valid. 
*/ -static bool -create_waw_or_war_checks (tree *cond_expr, +static void +create_waw_or_war_checks2 (tree *cond_expr, tree seg_len_a, const dr_with_seg_len_pair_t &alias_pair) { const dr_with_seg_len& dr_a = alias_pair.first; const dr_with_seg_len& dr_b = alias_pair.second; - /* Check for cases in which: - - (a) DR_B is always a write; - (b) the accesses are well-ordered in both the original and new code - (see the comment above the DR_ALIAS_* flags for details); and - (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */ - if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW)) - return false; - - /* Check for equal (but possibly variable) steps. */ tree step = DR_STEP (dr_a.dr); - if (!operand_equal_p (step, DR_STEP (dr_b.dr))) - return false; - - /* Make sure that we can operate on sizetype without loss of precision. */ tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr)); - if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype)) - return false; /* All addresses involved are known to have a common alignment ALIGN. We can therefore subtract ALIGN from an exclusive endpoint to get @@ -2370,9 +2354,6 @@ create_waw_or_war_checks (tree *cond_expr, fold_convert (ssizetype, indicator), ssize_int (0)); - /* Get lengths in sizetype. */ - tree seg_len_a - = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len)); step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step)); /* Each access has the following pattern: @@ -2479,6 +2460,50 @@ create_waw_or_war_checks (tree *cond_expr, *cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit); if (dump_enabled_p ()) dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n"); +} + +/* This is a wrapper function for create_waw_or_war_checks2. 
*/ +static bool +create_waw_or_war_checks (tree *cond_expr, + const dr_with_seg_len_pair_t &alias_pair) +{ + const dr_with_seg_len& dr_a = alias_pair.first; + const dr_with_seg_len& dr_b = alias_pair.second; + + /* Check for cases in which: + + (a) DR_B is always a write; + (b) the accesses are well-ordered in both the original and new code + (see the comment above the DR_ALIAS_* flags for details); and + (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */ + if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW)) + return false; + + /* Check for equal (but possibly variable) steps. */ + tree step = DR_STEP (dr_a.dr); + if (!operand_equal_p (step, DR_STEP (dr_b.dr))) + return false; + + /* Make sure that we can operate on sizetype without loss of precision. */ + tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr)); + if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype)) + return false; + + /* Get lengths in sizetype. */ + tree seg_len_a + = fold_convert (sizetype, + rewrite_to_non_trapping_overflow (dr_a.seg_len)); + create_waw_or_war_checks2 (cond_expr, seg_len_a, alias_pair); + if (param_flexible_seg_len && dr_a.seg_len != dr_a.seg_len2) + { + tree seg_len2_a + = fold_convert (sizetype, + rewrite_to_non_trapping_overflow (dr_a.seg_len2)); + tree cond_expr2; + create_waw_or_war_checks2 (&cond_expr2, seg_len2_a, alias_pair); + *cond_expr = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, + *cond_expr, cond_expr2); + } return true; } diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h index f643a95b2..9bc5f16ee 100644 --- a/gcc/tree-data-ref.h +++ b/gcc/tree-data-ref.h @@ -213,12 +213,19 @@ class dr_with_seg_len public: dr_with_seg_len (data_reference_p d, tree len, unsigned HOST_WIDE_INT size, unsigned int a) - : dr (d), seg_len (len), access_size (size), align (a) {} - + : dr (d), seg_len (len), seg_len2 (len), access_size (size), align (a) + {} + dr_with_seg_len (data_reference_p d, tree len, tree len2, + unsigned HOST_WIDE_INT size, 
unsigned int a) + : dr (d), seg_len (len), seg_len2 (len2), access_size (size), align (a) + {} data_reference_p dr; /* The offset of the last access that needs to be checked minus the offset of the first. */ tree seg_len; + /* The second version of segment length. Currently this is used to + soften checks for a small number of iterations. */ + tree seg_len2; /* A value that, when added to abs (SEG_LEN), gives the total number of bytes in the segment. */ poly_uint64 access_size; diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc index 4e615b80b..04e68f621 100644 --- a/gcc/tree-vect-data-refs.cc +++ b/gcc/tree-vect-data-refs.cc @@ -3646,6 +3646,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) { poly_uint64 lower_bound; tree segment_length_a, segment_length_b; + tree segment_length2_a, segment_length2_b; unsigned HOST_WIDE_INT access_size_a, access_size_b; unsigned int align_a, align_b; @@ -3751,6 +3752,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) { segment_length_a = size_zero_node; segment_length_b = size_zero_node; + segment_length2_a = size_zero_node; + segment_length2_b = size_zero_node; } else { @@ -3759,8 +3762,15 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) length_factor = scalar_loop_iters; else length_factor = size_int (vect_factor); + /* In any case we should remember scalar_loop_iters; + this helps to create a flexible aliasing check + for a small number of iterations. 
*/ segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor); segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor); + segment_length2_a + = vect_vfa_segment_size (dr_info_a, scalar_loop_iters); + segment_length2_b + = vect_vfa_segment_size (dr_info_b, scalar_loop_iters); } access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a); access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b); @@ -3805,9 +3815,9 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) } dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a, - access_size_a, align_a); + segment_length2_a, access_size_a, align_a); dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b, - access_size_b, align_b); + segment_length2_b, access_size_b, align_b); /* Canonicalize the order to be the one that's needed for accurate RAW, WAR and WAW flags, in cases where the data references are well-ordered. The order doesn't really matter otherwise, -- 2.33.0
Locations
Projects
Search
Status Monitor
Help
Open Build Service
OBS Manuals
API Documentation
OBS Portal
Reporting a Bug
Contact
Mailing List
Forums
Chat (IRC)
Twitter
Open Build Service (OBS)
is an
openSUSE project
.
浙ICP备2022010568号-2