File _service:tar_scm:0045-Transposed-SLP-Enable-Transposed-SLP.patch of Package gcc
From 639b5248cbab1806618545fc30215ed9d1a019e7 Mon Sep 17 00:00:00 2001
From: luohailing <luo_hailing@qq.com>
Date: Fri, 17 Jun 2022 22:38:55 +0800
Subject: [PATCH 11/12] [Transposed SLP] Enable Transposed SLP

Enable transposed SLP when the memory accesses are not contiguous,
controlled by -ftree-slp-transpose-vectorize.
---
 gcc/common.opt                          |    4 +
 gcc/testsuite/gcc.dg/vect/transpose-1.c |   53 ++
 gcc/testsuite/gcc.dg/vect/transpose-2.c |   50 ++
 gcc/testsuite/gcc.dg/vect/transpose-3.c |   54 ++
 gcc/testsuite/gcc.dg/vect/transpose-4.c |   53 ++
 gcc/testsuite/gcc.dg/vect/transpose-5.c |   73 ++
 gcc/testsuite/gcc.dg/vect/transpose-6.c |   67 ++
 gcc/testsuite/gcc.dg/vect/transpose-7.c |   53 ++
 gcc/testsuite/gcc.dg/vect/transpose-8.c |   53 ++
 gcc/testsuite/gcc.dg/vect/vect.exp      |    7 +
 gcc/tree-vect-data-refs.c               |  236 +++++
 gcc/tree-vect-slp.c                     | 1090 ++++++++++++++++++++++-
 gcc/tree-vect-stmts.c                   |  763 +++++++++++++++-
 gcc/tree-vectorizer.h                   |   89 ++
 14 files changed, 2641 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-4.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-5.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-6.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-7.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-8.c

diff --git a/gcc/common.opt b/gcc/common.opt
index 24834cf60..d38401b71 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3049,6 +3049,10 @@ ftree-vect-analyze-slp-group
 Common Report Var(flag_tree_slp_group) Init(0)
 Disable SLP vectorization for reduction chain on tree.

+ftree-slp-transpose-vectorize
+Common Report Var(flag_tree_slp_transpose_vectorize) Optimization Init(0)
+Enable basic block vectorization (SLP) for transposed stores and loads on trees.
+
 fvect-cost-model=
 Common Joined RejectNegative Enum(vect_cost_model) Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT) Optimization
 -fvect-cost-model=[unlimited|dynamic|cheap]	Specifies the cost model for vectorization.
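For orientation, the kernel shape the new option targets is sketched below. This is illustrative only and not part of the patch: it mirrors the transpose-*.c tests added further down, and the function name, parameter names and N are assumptions made for the example. Each loop iteration advances pix1/pix2 by a stride, so the stores of one iteration go into four separate column arrays and are not contiguous in memory. Built with -O3 -ftree-slp-transpose-vectorize -fdump-tree-slp-details (the flags the vect.exp hunk below uses), the SLP pass may merge and transpose these store groups instead of rejecting them, and the tests scan for "vectorized using transposed version" in the slp1 dump.

/* Minimal sketch of the targeted pattern; illustrative, not from the patch.  */
#define N 4

void
foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2,
     unsigned c0[N], unsigned c1[N], unsigned c2[N], unsigned c3[N])
{
  /* Strided row accesses feeding stores into separate column arrays:
     the four stores within one iteration are not adjacent in memory.  */
  for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
    {
      c0[i] = pix1[0] - pix2[0];
      c1[i] = pix1[1] - pix2[1];
      c2[i] = pix1[2] - pix2[2];
      c3[i] = pix1[3] - pix2[3];
    }
}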
diff --git a/gcc/testsuite/gcc.dg/vect/transpose-1.c b/gcc/testsuite/gcc.dg/vect/transpose-1.c new file mode 100644 index 000000000..8237a8b9e --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/transpose-1.c @@ -0,0 +1,53 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-require-effective-target vect_int } */ +#include <stdio.h> +#include <stdlib.h> +#include "tree-vect.h" + +#define N 4 +#define M 256 + +int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2) +{ + int i = 0; + int sum = 0; + unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N]; + for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) + { + c0[i] = pix1[0] - pix2[0]; + c1[i] = pix1[1] - pix2[1]; + c2[i] = pix1[2] - pix2[2]; + c3[i] = pix1[3] - pix2[3]; + c4[i] = pix1[4] - pix2[4]; + c5[i] = pix1[5] - pix2[5]; + c6[i] = pix1[6] - pix2[6]; + c7[i] = pix1[7] - pix2[7]; + } + for (int i = 0; i < N; i++) + { + sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i]; + } + return sum; +} + +int main (int argc, const char* argv[]) +{ + unsigned char input1[M]; + unsigned char input2[M]; + int i1 = 16; + int i2 = 8; + check_vect (); + for (int i = 0; i < M; i++) + { + input1[i] = i * 2; + input2[i] = i; + } + int sum = foo (input1, i1, input2, i2); + if (sum != 1264) + { + abort (); + } + return 0; +} + +/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/transpose-2.c b/gcc/testsuite/gcc.dg/vect/transpose-2.c new file mode 100644 index 000000000..b01a0410e --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/transpose-2.c @@ -0,0 +1,50 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-additional-options "-fno-tree-loop-vectorize" } */ +/* { dg-require-effective-target vect_int } */ +#include <stdio.h> +#include <stdlib.h> +#include "tree-vect.h" + +#define N 8 +#define M 256 + +int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2) +{ + int i = 0; + int sum = 0; + unsigned short c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N]; + for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) + { + c0[i] = pix1[0] - pix2[0]; + c1[i] = pix1[1] - pix2[1]; + c2[i] = pix1[2] - pix2[2]; + c3[i] = pix1[3] - pix2[3]; + } + for (int i = 0; i < N; i++) + { + sum += c0[i] + c1[i] + c2[i] + c3[i]; + } + return sum; +} + +int main (int argc, const char* argv[]) +{ + unsigned char input1[M]; + unsigned char input2[M]; + int i1 = 5; + int i2 = 4; + check_vect (); + for (int i = 0; i < M; i++) + { + input1[i] = i * 4; + input2[i] = i * 2; + } + int sum = foo (input1, i1, input2, i2); + if (sum != 1440) + { + abort (); + } + return 0; +} + +/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/transpose-3.c b/gcc/testsuite/gcc.dg/vect/transpose-3.c new file mode 100644 index 000000000..529581c59 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/transpose-3.c @@ -0,0 +1,54 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-additional-options "-fno-tree-loop-vectorize" } */ +/* { dg-require-effective-target vect_int } */ +#include <stdio.h> +#include <stdlib.h> +#include "tree-vect.h" + +#define N 4 +#define M 256 + +int foo (unsigned short *pix1, int i_pix1, unsigned short *pix2, int i_pix2) +{ + int i = 0; + int sum = 0; + unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N]; + for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) + { + c0[i] = pix1[0] - pix2[0]; + c1[i] = pix1[1] - pix2[1]; + 
c2[i] = pix1[2] - pix2[2]; + c3[i] = pix1[3] - pix2[3]; + c4[i] = pix1[4] - pix2[4]; + c5[i] = pix1[5] - pix2[5]; + c6[i] = pix1[6] - pix2[6]; + c7[i] = pix1[7] - pix2[7]; + } + for (int i = 0; i < N; i++) + { + sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i]; + } + return sum; +} + +int main (int argc, const char* argv[]) +{ + unsigned short input1[M]; + unsigned short input2[M]; + int i1 = 8; + int i2 = 4; + check_vect (); + for (int i = 0; i < M; i++) + { + input1[i] = i * 4; + input2[i] = i; + } + int sum = foo (input1, i1, input2, i2); + if (sum != 1680) + { + abort (); + } + return 0; +} + +/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/transpose-4.c b/gcc/testsuite/gcc.dg/vect/transpose-4.c new file mode 100644 index 000000000..0b4adea9b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/transpose-4.c @@ -0,0 +1,53 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-require-effective-target vect_int } */ +#include <stdio.h> +#include <stdlib.h> +#include "tree-vect.h" + +#define N 4 +#define M 256 + +int foo (unsigned *pix1, int i_pix1, unsigned *pix2, int i_pix2) +{ + int i = 0; + int sum = 0; + unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N]; + for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) + { + c0[i] = pix1[0] - pix2[0]; + c1[i] = pix1[1] - pix2[1]; + c2[i] = pix1[2] - pix2[2]; + c3[i] = pix1[3] - pix2[3]; + c4[i] = pix1[4] - pix2[4]; + c5[i] = pix1[5] - pix2[5]; + c6[i] = pix1[6] - pix2[6]; + c7[i] = pix1[7] - pix2[7]; + } + for (int i = 0; i < N; i++) + { + sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i]; + } + return sum; +} + +int main (int argc, const char* argv[]) +{ + unsigned input1[M]; + unsigned input2[M]; + int i1 = 12; + int i2 = 6; + check_vect (); + for (int i = 0; i < M; i++) + { + input1[i] = i * 7; + input2[i] = i * 3; + } + int sum = foo (input1, i1, input2, i2); + if (sum != 3616) + { + abort (); + } + return 0; +} + +/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/transpose-5.c b/gcc/testsuite/gcc.dg/vect/transpose-5.c new file mode 100644 index 000000000..81a248840 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/transpose-5.c @@ -0,0 +1,73 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-require-effective-target vect_int } */ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include "tree-vect.h" + +#define N 4 +#define M 256 +#define eps 1e-8 + +double foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2) +{ + unsigned a0[N]; + unsigned a1[N]; + unsigned a2[N]; + unsigned a3[N]; + + int b0[N]; + int b1[N]; + int b2[N]; + int b3[N]; + + for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) + { + a0[i] = (pix1[0] - pix2[0]) + ((pix1[4] + pix2[4]) << 16); + a1[i] = (pix1[1] - pix2[1]) + ((pix1[5] + pix2[5]) << 16); + a2[i] = (pix1[2] - pix2[2]) + ((pix1[6] + pix2[6]) << 16); + a3[i] = (pix1[3] - pix2[3]) + ((pix1[7] + pix2[7]) << 16); + } + + for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) + { + b0[i] = (pix1[0] - pix2[0]) + (pix1[4] + pix2[4]); + b1[i] = (pix1[1] - pix2[1]) + (pix1[5] + pix2[5]); + b2[i] = (pix1[2] - pix2[2]) + (pix1[6] + pix2[6]); + b3[i] = (pix1[3] - pix2[3]) + (pix1[7] + pix2[7]); + } + + double sum = 0; + for (int i = 0; i < N; i++) + { + sum += a0[i] + a1[i] + a2[i] + a3[i] + b0[i] + b1[i] + b2[i] + b3[i]; + } + return sum; +} + +int main (int 
argc, const char* argv[]) +{ + unsigned char input1[M]; + unsigned char input2[M]; + int i1 = 8; + int i2 = 3; + unsigned char m = 2; + unsigned short n = 12; + float t = 3.0; + double k = 4.2; + check_vect (); + for (int i = 0; i < M; i++) + { + input1[i] = i * 6; + input2[i] = i * 3; + } + double sum = foo (input1, i1, input2, i2); + if (fabs (sum - 78648144) > eps) + { + abort (); + } + return 0; +} + +/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ +/* { dg-final { scan-tree-dump-times "vectorizable_store for slp transpose" 2 "slp1" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/transpose-6.c b/gcc/testsuite/gcc.dg/vect/transpose-6.c new file mode 100644 index 000000000..3e134ac02 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/transpose-6.c @@ -0,0 +1,67 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target vect_float } */ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include "tree-vect.h" + +#define N 4 +#define M 256 +#define eps 1e-8 + +float foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2) +{ + unsigned a0[N]; + unsigned a1[N]; + unsigned a2[N]; + unsigned a3[N]; + + float c0[N]; + float c1[N]; + float c2[N]; + float c3[N]; + + for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) + { + a0[i] = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16); + a1[i] = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16); + a2[i] = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16); + a3[i] = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16); + + c0[i] = (pix1[0] * pix2[0]) + (pix1[4] * pix2[4]); + c1[i] = (pix1[1] * pix2[1]) + (pix1[5] * pix2[5]); + c2[i] = (pix1[2] * pix2[2]) + (pix1[6] * pix2[6]); + c3[i] = (pix1[3] * pix2[3]) + (pix1[7] * pix2[7]); + } + + float sum = 0; + for (int i = 0; i < N; i++) + { + sum += a0[i] + a1[i] + a2[i] + a3[i] + c0[i] + c1[i] + c2[i] + c3[i]; + } + return sum; +} + +int main (int argc, const char* argv[]) +{ + unsigned char input1[M]; + unsigned char input2[M]; + int i1 = 18; + int i2 = 6; + check_vect (); + for (int i = 0; i < M; i++) + { + input1[i] = i * 4; + input2[i] = i * 2; + } + float sum = foo (input1, i1, input2, i2); + if (fabs (sum - 106041168) > eps) + { + abort (); + } + return 0; +} + +/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ +/* { dg-final { scan-tree-dump-times "vectorizable_store for slp transpose" 2 "slp1" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/transpose-7.c b/gcc/testsuite/gcc.dg/vect/transpose-7.c new file mode 100644 index 000000000..2074d9aa8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/transpose-7.c @@ -0,0 +1,53 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-additional-options "-fno-tree-loop-vectorize" } */ +/* { dg-require-effective-target vect_int } */ +#include <stdio.h> +#include <stdlib.h> +#include "tree-vect.h" + +#define N 16 +#define M 256 + +int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2) +{ + int i = 0; + int sum = 0; + unsigned char c0[N], c1[N]; + for (int i = 0; i < N/2; i++, pix1 += i_pix1, pix2 += i_pix2) + { + c0[i] = pix1[0] - pix2[0]; + c1[i] = pix1[1] - pix2[1]; + } + for (int i = N/2; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) + { + c0[i] = pix1[0] - pix2[0]; + c1[i] = pix1[1] - pix2[1]; + } + for (int i = 0; i < N; i++) + { + sum += c0[i] + c1[i]; + } + return sum; +} + +int main (int argc, const char* argv[]) +{ + unsigned char input1[M]; + unsigned char 
input2[M]; + int i1 = 6; + int i2 = 4; + check_vect (); + for (int i = 0; i < M; i++) + { + input1[i] = i * 5; + input2[i] = i * 2; + } + int sum = foo (input1, i1, input2, i2); + if (sum != 3280) + { + abort (); + } + return 0; +} + +/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/transpose-8.c b/gcc/testsuite/gcc.dg/vect/transpose-8.c new file mode 100644 index 000000000..a154f012a --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/transpose-8.c @@ -0,0 +1,53 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-additional-options "-fno-tree-loop-vectorize" } */ +/* { dg-require-effective-target vect_int } */ +#include <stdio.h> +#include <stdlib.h> +#include "tree-vect.h" + +#define N 32 +#define M 256 + +int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2) +{ + int i = 0; + int sum = 0; + unsigned char c0[N], c1[N]; + for (int i = 0; i < N/2; i++, pix1 += i_pix1, pix2 += i_pix2) + { + c0[i] = pix1[0] - pix2[0]; + c1[i] = pix1[1] - pix2[1]; + } + for (int i = N/2; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) + { + c0[i] = pix1[0] - pix2[0]; + c1[i] = pix1[1] - pix2[1]; + } + for (int i = 0; i < N; i++) + { + sum += c0[i] + c1[i]; + } + return sum; +} + +int main (int argc, const char* argv[]) +{ + unsigned char input1[M]; + unsigned char input2[M]; + int i1 = 6; + int i2 = 4; + check_vect (); + for (int i = 0; i < M; i++) + { + input1[i] = i * 5; + input2[i] = i * 2; + } + int sum = foo (input1, i1, input2, i2); + if (sum != 7584) + { + abort (); + } + return 0; +} + +/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect.exp b/gcc/testsuite/gcc.dg/vect/vect.exp index efe17ac6f..d92e1ba5b 100644 --- a/gcc/testsuite/gcc.dg/vect/vect.exp +++ b/gcc/testsuite/gcc.dg/vect/vect.exp @@ -114,6 +114,13 @@ et-dg-runtest dg-runtest [lsort \ [glob -nocomplain $srcdir/$subdir/no-vfa-*.\[cS\]]] \ "" $DEFAULT_VECTCFLAGS +# -ftree-slp-transpose-vectorize SLP tests +set VECT_SLP_CFLAGS $SAVED_VECT_SLP_CFLAGS +lappend VECT_SLP_CFLAGS "-ftree-slp-transpose-vectorize" +et-dg-runtest dg-runtest [lsort \ + [glob -nocomplain $srcdir/$subdir/transpose-*.\[cS\]]] \ + "" "-ftree-slp-transpose-vectorize -fdump-tree-slp-details -O3" + # -ffast-math tests set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS lappend DEFAULT_VECTCFLAGS "-ffast-math" diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index fcc0726bd..d78b06455 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -2647,6 +2647,9 @@ vect_analyze_group_access_1 (dr_vec_info *dr_info) DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element; DR_GROUP_SIZE (stmt_info) = groupsize; + + DR_GROUP_SLP_TRANSPOSE (stmt_info) = false; + if (dump_enabled_p ()) { dump_printf_loc (MSG_NOTE, vect_location, @@ -2676,6 +2679,20 @@ vect_analyze_group_access_1 (dr_vec_info *dr_info) DR_GROUP_GAP (stmt_info)); } + /* SLP: create an SLP data structure for every interleaving group of + loads for further analysis in vect_analyse_slp. */ + if (DR_IS_READ (dr) && !slp_impossible) + { + if (loop_vinfo) + { + LOOP_VINFO_GROUPED_LOADS (loop_vinfo).safe_push (stmt_info); + } + if (bb_vinfo) + { + BB_VINFO_GROUPED_LOADS (bb_vinfo).safe_push (stmt_info); + } + } + /* SLP: create an SLP data structure for every interleaving group of stores for further analysis in vect_analyse_slp. 
*/ if (DR_IS_WRITE (dr) && !slp_impossible) @@ -5413,6 +5430,225 @@ vect_permute_store_chain (vec<tree> dr_chain, } } +/* Encoding the PERM_MASK_FIRST. */ + +static void +vect_indices_encoding_first (tree vectype, unsigned int array_num, + tree &perm_mask_high_first, + tree &perm_mask_low_first) +{ + unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); + vec_perm_builder sel (nelt, nelt, 1); + sel.quick_grow (nelt); + unsigned int group_num = nelt / array_num; + unsigned int index = 0; + unsigned int array = 0; + unsigned int group = 0; + + /* The encoding has 1 pattern in the fisrt stage. */ + for (array = 0; array < array_num / 2; array++) + { + for (group = 0; group < group_num * 2; group++) + { + sel[index++] = array + array_num * group; + } + } + vec_perm_indices indices (sel, 2, nelt); + perm_mask_high_first = vect_gen_perm_mask_checked (vectype, indices); + + index = 0; + for (array = array_num / 2; array < array_num; array++) + { + for (group = 0; group < group_num * 2; group++) + { + sel[index++] = array + array_num * group; + } + } + indices.new_vector (sel, 2, nelt); + perm_mask_low_first = vect_gen_perm_mask_checked (vectype, indices); +} + +/* Encoding the PERM_MASK. */ + +static void +vect_indices_encoding (tree vectype, unsigned int array_num, + tree &perm_mask_high, tree &perm_mask_low) +{ + unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); + vec_perm_builder sel (nelt, nelt, 1); + sel.quick_grow (nelt); + unsigned int group_num = nelt / array_num; + unsigned int index = 0; + unsigned int array = 0; + unsigned int group = 0; + + /* The encoding has 2 patterns in the folllowing stages. */ + for (array = 0; array < array_num / 2; array++) + { + for (group = 0; group < group_num; group++) + { + sel[index++] = group + group_num * array; + } + for (group = 0; group < group_num; group++) + { + sel[index++] = nelt + group + group_num * array; + } + } + vec_perm_indices indices (sel, 2, nelt); + perm_mask_high = vect_gen_perm_mask_checked (vectype, indices); + + index = 0; + for (array = array_num / 2; array < array_num; array++) + { + for (group = 0; group < group_num; group++) + { + sel[index++] = group + group_num * array; + } + for (group = 0; group < group_num; group++) + { + sel[index++] = nelt + group + group_num * array; + } + } + indices.new_vector (sel, 2, nelt); + perm_mask_low = vect_gen_perm_mask_checked (vectype, indices); +} + +/* Function vect_transpose_store_chain. + + Given a chain of interleaved stores in DR_CHAIN of LENGTH and ARRAY_NUM that + must be a power of 2. Generate interleave_high/low stmts to reorder + the data correctly for the stores. Return the final references for stores + in RESULT_CHAIN. This function is similar to vect_permute_store_chain (), + we interleave the contents of the vectors in their order. + + E.g., LENGTH is 4, the scalar type is short (i.e., VF is 8) and ARRAY_NUM + is 4. That is, the input is 4 vectors each containing 8 elements. + And 2 (VF / ARRAY_NUM) of 8 elements come from the same array. we interleave + the contents of the four vectors in their order. We assign a number to each + element, the input sequence is: + + 1st vec: 0 1 2 3 4 5 6 7 + 2nd vec: 8 9 10 11 12 13 14 15 + 3rd vec: 16 17 18 19 20 21 22 23 + 4th vec: 24 25 26 27 28 29 30 31 + + The output sequence should be: + + 1st vec: 0 4 8 12 16 20 24 28 + 2nd vec: 1 5 9 13 17 21 25 29 + 3rd vec: 2 6 10 14 18 22 26 30 + 4th vec: 3 7 11 15 19 23 27 31 + + In our example, + We get 2 (VF / ARRAY_NUM) elements together in every vector. 
+ + I1: 0 4 1 5 2 6 3 7 + I2: 8 12 9 13 10 14 11 15 + I3: 16 20 17 21 18 22 19 23 + I4: 24 28 25 29 26 30 27 31 + + Then, we use interleave_high/low instructions to create such output. + Every 2 (VF / ARRAY_NUM) elements are regarded as a whole. The permutation + is done in log LENGTH stages. + + I1: interleave_high (1st vec, 3rd vec) + I2: interleave_low (1st vec, 3rd vec) + I3: interleave_high (2nd vec, 4th vec) + I4: interleave_low (2nd vec, 4th vec) + + The first stage of the sequence should be: + + I1: 0 4 16 20 1 5 17 21 + I2: 2 6 18 22 3 7 19 23 + I3: 8 12 24 28 9 13 25 29 + I4: 10 14 26 30 11 15 27 31 + + The following stage sequence should be, i.e. the final result is: + + I1: 0 4 8 12 16 20 24 28 + I2: 1 5 9 13 17 21 25 29 + I3: 2 6 10 14 18 22 26 30 + I4: 3 7 11 15 19 23 27 31. */ + +void +vect_transpose_store_chain (vec<tree> dr_chain, unsigned int length, + unsigned int array_num, stmt_vec_info stmt_info, + gimple_stmt_iterator *gsi, vec<tree> *result_chain) +{ + gimple *perm_stmt = NULL; + tree vectype = STMT_VINFO_VECTYPE (stmt_info); + tree perm_mask_low_first = NULL; + tree perm_mask_high_first = NULL; + tree perm_mask_low = NULL; + tree perm_mask_high = NULL; + unsigned int log_length = exact_log2 (length); + + /* Only power of 2 is supported. */ + gcc_assert (pow2p_hwi (length)); + + /* The encoding has 2 types, one for the grouped pattern in the fisrt stage, + another for the interleaved patterns in the following stages. */ + gcc_assert (array_num != 0); + + /* Create grouped stmt (in the first stage): + group = nelt / array_num; + high_first = VEC_PERM_EXPR <vect1, vect2, + {0, array_num, 2*array_num, ..., (2*group-1)*array_num, + 1, 1+array_num, 1+2*array_num, ..., 1+(2*group-1)*array_num, + ..., + array_num/2-1, (array_num/2-1)+array_num, ..., + (array_num/2-1)+(2*group-1)*array_num}> + low_first = VEC_PERM_EXPR <vect1, vect2, + {array_num/2, array_num/2+array_num, array_num/2+2*array_num, + ..., array_num/2+(2*group-1)*array_num, + array_num/2+1, array_num/2+1+array_num, + ..., array_num/2+1+(2*group-1)*array_num, + ..., + array_num-1, array_num-1+array_num, + ..., array_num-1+(2*group-1)*array_num}> */ + vect_indices_encoding_first (vectype, array_num, perm_mask_high_first, + perm_mask_low_first); + + /* Create interleaving stmt (in the following stages): + high = VEC_PERM_EXPR <vect1, vect2, {0, 1, ..., group-1, + nelt, nelt+1, ..., nelt+group-1, + group, group+1, ..., 2*group-1, + nelt+group, nelt+group+1, ..., nelt+2*group-1, + ...}> + low = VEC_PERM_EXPR <vect1, vect2, + {nelt/2, nelt/2+1, ..., nelt/2+group-1, + nelt*3/2, nelt*3/2+1, ..., nelt*3/2+group-1, + nelt/2+group, nelt/2+group+1, ..., nelt/2+2*group-1, + nelt*3/2+group, nelt*3/2+group+1, ..., nelt*3/2+2*group-1, + ...}> */ + vect_indices_encoding (vectype, array_num, perm_mask_high, perm_mask_low); + + for (unsigned int perm_time = 0; perm_time < log_length; perm_time++) + { + for (unsigned int index = 0; index < length / 2; index++) + { + tree vect1 = dr_chain[index]; + tree vect2 = dr_chain[index + length / 2]; + + tree high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); + perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, vect2, + perm_time == 0 ? perm_mask_high_first + : perm_mask_high); + vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); + (*result_chain)[2 * index] = high; + + tree low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); + perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, vect2, + perm_time == 0 ? 
perm_mask_low_first + : perm_mask_low); + vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); + (*result_chain)[2 * index+1] = low; + } + memcpy (dr_chain.address (), result_chain->address (), + length * sizeof (tree)); + } +} + /* Function vect_setup_realignment This function is called when vectorizing an unaligned load using diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 476b32370..d30463b96 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -2414,11 +2414,13 @@ vect_analyze_slp_instance (vec_info *vinfo, /* For basic block SLP, try to break the group up into multiples of the vector size. */ + bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo); unsigned HOST_WIDE_INT const_nunits; if (is_a <bb_vec_info> (vinfo) && STMT_VINFO_GROUPED_ACCESS (stmt_info) && DR_GROUP_FIRST_ELEMENT (stmt_info) - && nunits.is_constant (&const_nunits)) + && nunits.is_constant (&const_nunits) + && !bb_vinfo->transposed) { /* We consider breaking the group only on VF boundaries from the existing start. */ @@ -2455,6 +2457,898 @@ vect_analyze_slp_instance (vec_info *vinfo, return false; } +static inline bool +is_const_assign (stmt_vec_info store_elem) +{ + if (store_elem == NULL) + { + gcc_unreachable (); + } + gimple *stmt = store_elem->stmt; + gimple_rhs_class rhs_class = gimple_assign_rhs_class (stmt); + return rhs_class == GIMPLE_SINGLE_RHS + && TREE_CONSTANT (gimple_assign_rhs1 (store_elem->stmt)); +} + +/* Push inits to INNERMOST_INITS and check const assign. */ + +static bool +record_innermost (vec<tree> &innermost_inits, + vec<tree> &innermost_offsets, + stmt_vec_info stmt_vinfo) +{ + if (!stmt_vinfo) + { + return false; + } + stmt_vec_info next_info = stmt_vinfo; + while (next_info) + { + /* No need to vectorize constant assign in a transposed version. */ + if (is_const_assign (next_info)) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "no need to vectorize, store is const assign: %G", + next_info->stmt); + } + return false; + } + innermost_inits.safe_push (STMT_VINFO_DR_INIT (next_info)); + innermost_offsets.safe_push (STMT_VINFO_DR_OFFSET (next_info)); + next_info = DR_GROUP_NEXT_ELEMENT (next_info); + } + return true; +} + +/* Compare inits to INNERMOST_INITS, return FALSE if inits do not match + the first grouped_store. And check const assign meanwhile. */ + +static bool +compare_innermost (const vec<tree> &innermost_inits, + const vec<tree> &innermost_offsets, + stmt_vec_info stmt_vinfo) +{ + if (!stmt_vinfo || innermost_inits.length () != stmt_vinfo->size) + { + return false; + } + stmt_vec_info next_info = stmt_vinfo; + unsigned int i = 0; + while (next_info) + { + if (is_const_assign (next_info)) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "no need to vectorize, store is const " + "assign: %G", next_info->stmt); + } + return false; + } + if (innermost_inits[i] != STMT_VINFO_DR_INIT (next_info) + || innermost_offsets[i] != STMT_VINFO_DR_OFFSET (next_info)) + { + return false; + } + next_info = DR_GROUP_NEXT_ELEMENT (next_info); + i++; + } + return true; +} + +/* Check if grouped stores are of same type. + input: t1/t2 = TREE_TYPE (gimple_assign_lhs (first_element->stmt)) + output: 0 if same, 1 or -1 else. */ + +static int +tree_type_cmp (const tree t1, const tree t2) +{ + gcc_checking_assert (t1 != NULL && t2 != NULL); + if (t1 != t2) + { + if (TREE_CODE (t1) != TREE_CODE (t2)) + { + return TREE_CODE (t1) > TREE_CODE (t2) ? 
1 : -1; + } + if (TYPE_UNSIGNED (t1) != TYPE_UNSIGNED (t2)) + { + return TYPE_UNSIGNED (t1) > TYPE_UNSIGNED (t2) ? 1 : -1; + } + if (TYPE_PRECISION (t1) != TYPE_PRECISION (t2)) + { + return TYPE_PRECISION (t1) > TYPE_PRECISION (t2) ? 1 : -1; + } + } + return 0; +} + +/* Check it if 2 grouped stores are of same type that + we can analyze them in a transpose group. */ +static int +check_same_store_type (stmt_vec_info grp1, stmt_vec_info grp2) +{ + if (grp1 == grp2) + { + return 0; + } + if (grp1->size != grp2->size) + { + return grp1->size > grp2->size ? 1 : -1; + } + tree lhs1 = gimple_assign_lhs (grp1->stmt); + tree lhs2 = gimple_assign_lhs (grp2->stmt); + if (TREE_CODE (lhs1) != TREE_CODE (lhs2)) + { + return TREE_CODE (lhs1) > TREE_CODE (lhs2) ? 1 : -1; + } + tree grp_type1 = TREE_TYPE (gimple_assign_lhs (grp1->stmt)); + tree grp_type2 = TREE_TYPE (gimple_assign_lhs (grp2->stmt)); + int cmp = tree_type_cmp (grp_type1, grp_type2); + return cmp; +} + +/* Sort grouped stores according to group_size and store_type. + output: 0 if same, 1 if grp1 > grp2, -1 otherwise. */ + +static int +grouped_store_cmp (const void *grp1_, const void *grp2_) +{ + stmt_vec_info grp1 = *(stmt_vec_info *)const_cast<void *>(grp1_); + stmt_vec_info grp2 = *(stmt_vec_info *)const_cast<void *>(grp2_); + return check_same_store_type (grp1, grp2); +} + +/* Transposing is based on permutation in registers. Permutation requires + vector length being power of 2 and satisfying the vector mode. */ + +static inline bool +check_filling_reg (stmt_vec_info current_element) +{ + if (current_element->size == 0) + { + return false; + } + /* If the gimple STMT was already vectorized in vect pass, it's unable to + conduct transpose analysis, skip it. */ + bool lhs_vectorized + = TREE_CODE (TREE_TYPE (gimple_get_lhs (current_element->stmt))) + == VECTOR_TYPE; + bool rhs_vectorized + = TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (current_element->stmt))) + == VECTOR_TYPE; + if (lhs_vectorized || rhs_vectorized) + { + return false; + } + unsigned int store_precision + = TYPE_PRECISION (TREE_TYPE (gimple_get_lhs (current_element->stmt))); + auto_vector_modes vector_modes; + targetm.vectorize.autovectorize_vector_modes (&vector_modes, false); + unsigned min_mode_size = -1u; + for (unsigned i = 0; i < vector_modes.length (); i++) + { + unsigned mode_bit_size = (GET_MODE_BITSIZE (vector_modes[i])).coeffs[0]; + min_mode_size = mode_bit_size < min_mode_size + ? mode_bit_size : min_mode_size; + } + return store_precision != 0 + && pow2p_hwi (current_element->size) + && (current_element->size * store_precision % min_mode_size == 0); +} + +/* Check if previous groups are suitable to transpose, if not, set their + group number to -1, reduce grp_num and clear current_groups. + Otherwise, just clear current_groups. */ + +static void +check_and_clear_groups (vec<stmt_vec_info> current_groups, + unsigned int &grp_num) +{ + stmt_vec_info first_element; + if (current_groups.length () == 1 + || (current_groups.length () != 0 + && !pow2p_hwi (current_groups.length ()))) + { + while (current_groups.length () != 0) + { + first_element = current_groups.pop (); + first_element->group_number = -1; + } + grp_num--; + } + else + { + while (current_groups.length ()) + { + current_groups.pop (); + } + } +} + + +/* Make sure that transpose slp vectorization is conducted only if grouped + stores are one dimension array ref. 
*/ + +static bool +is_store_one_dim_array (gimple *stmt) +{ + tree op = gimple_get_lhs (stmt); + if (TREE_CODE (op) != ARRAY_REF) + return false; + return TREE_OPERAND_LENGTH (op) > 0 + && TREE_OPERAND_LENGTH (TREE_OPERAND (op, 0)) == 0; +} + +/* Set grouped_stores with similar MEM_REF to the same group and mark their + grp_num. Groups with same grp_num consist the minimum unit to analyze + transpose. Return num of such units. */ + +static unsigned +vect_prepare_transpose (bb_vec_info bb_vinfo) +{ + stmt_vec_info current_element = NULL; + stmt_vec_info first_element = NULL; + unsigned int i = 0; + unsigned int grp_num = 0; + /* Use arrays to record MEM_REF data in different GROUPED_STORES. */ + auto_vec<tree> innermost_inits; + auto_vec<tree> innermost_offsets; + + /* A set of stmt_vec_info with same store type. Analyze them if their size + is suitable to transpose. */ + auto_vec<stmt_vec_info> current_groups; + + FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, current_element) + { + /* Compare current grouped_store to the first one if first_element exists, + push current_element to current_groups if they are similar on innermost + behavior of MEM_REF. */ + if (first_element != NULL + && !check_same_store_type (first_element, current_element) + && compare_innermost (innermost_inits, innermost_offsets, + current_element)) + { + current_groups.safe_push (current_element); + current_element->group_number = grp_num; + /* If current_element is the last element in grouped_stores, continue + will exit the loop and leave the last group unanalyzed. */ + if (i == bb_vinfo->grouped_stores.length () - 1) + { + check_and_clear_groups (current_groups, grp_num); + } + continue; + } + check_and_clear_groups (current_groups, grp_num); + innermost_inits.release (); + innermost_offsets.release (); + /* Beginning of a new group to analyze whether they are able to consist + a unit to conduct transpose analysis. */ + first_element = NULL; + if (is_store_one_dim_array (current_element->stmt) + && check_filling_reg (current_element) + && record_innermost (innermost_inits, innermost_offsets, + current_element)) + { + first_element = current_element; + current_groups.safe_push (current_element); + current_element->group_number = ++grp_num; + if (i == bb_vinfo->grouped_stores.length () - 1) + { + check_and_clear_groups (current_groups, grp_num); + } + continue; + } + current_element->group_number = -1; + } + return grp_num; +} + +/* Return a flag to transpose grouped stores before building slp tree. + Add bool may_transpose in class vec_info. */ + +static bool +vect_may_transpose (bb_vec_info bb_vinfo) +{ + if (targetm.vectorize.vec_perm_const == NULL) + { + return false; + } + if (bb_vinfo->grouped_stores.length () < 2) + { + return false; + } + DUMP_VECT_SCOPE ("analyze if grouped stores may transpose to slp"); + /* Sort grouped_stores according to size and type for function + vect_prepare_transpose (). */ + bb_vinfo->grouped_stores.qsort (grouped_store_cmp); + + int groups = vect_prepare_transpose (bb_vinfo); + BB_VINFO_TRANS_GROUPS (bb_vinfo) = groups; + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "%d groups to analyze transposed slp.\n", groups); + return groups != 0; +} + +/* Get the base address of STMT_INFO. 
*/ + +static tree +get_op_base_address (stmt_vec_info stmt_info) +{ + struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); + tree op = DR_BASE_ADDRESS (dr); + while (TREE_OPERAND_LENGTH (op) > 0) + { + op = TREE_OPERAND (op, 0); + } + return op; +} + +/* Compare the UID of the two stmt_info STMTINFO_A and STMTINFO_B. + Sorting them in ascending order. */ + +static int +dr_group_cmp (const void *stmtinfo_a_, const void *stmtinfo_b_) +{ + stmt_vec_info stmtinfo_a + = *(stmt_vec_info *) const_cast<void *> (stmtinfo_a_); + stmt_vec_info stmtinfo_b + = *(stmt_vec_info *) const_cast<void *> (stmtinfo_b_); + + /* Stabilize sort. */ + if (stmtinfo_a == stmtinfo_b) + { + return 0; + } + return gimple_uid (stmtinfo_a->stmt) < gimple_uid (stmtinfo_b->stmt) ? -1 : 1; +} + +/* Find the first elements of the grouped loads which are required to merge. */ + +static void +vect_slp_grouped_load_find (bb_vec_info bb_vinfo, vec<bool> &visited, + vec<stmt_vec_info> &res) +{ + unsigned int i = 0; + stmt_vec_info merge_first_element = NULL; + stmt_vec_info first_element = NULL; + tree opa = NULL; + unsigned int grp_size_a = 0; + FOR_EACH_VEC_ELT (bb_vinfo->grouped_loads, i, first_element) + { + if (visited[i]) + { + continue; + } + if (!STMT_VINFO_GROUPED_ACCESS (first_element) + || !pow2p_hwi (DR_GROUP_SIZE (first_element))) + { + /* Non-conforming grouped load should be grouped separately. */ + if (merge_first_element == NULL) + { + visited[i] = true; + res.safe_push (first_element); + return; + } + } + if (merge_first_element == NULL) + { + merge_first_element = first_element; + opa = get_op_base_address (first_element); + grp_size_a = DR_GROUP_SIZE (first_element); + res.safe_push (first_element); + visited[i] = true; + continue; + } + + /* If the two first elements are of the same base address and group size, + these two grouped loads need to be merged. */ + tree opb = get_op_base_address (first_element); + unsigned int grp_size_b = DR_GROUP_SIZE (first_element); + if (opa == opb && grp_size_a == grp_size_b) + { + res.safe_push (first_element); + visited[i] = true; + } + } +} + +/* Merge the grouped loads that are found from + vect_slp_grouped_load_find (). */ + +static stmt_vec_info +vect_slp_grouped_load_merge (vec<stmt_vec_info> res) +{ + stmt_vec_info stmt_info = res[0]; + if (res.length () == 1) + { + return stmt_info; + } + unsigned int i = 0; + unsigned int size = DR_GROUP_SIZE (res[0]); + unsigned int new_group_size = size * res.length (); + stmt_vec_info first_element = NULL; + stmt_vec_info merge_first_element = NULL; + stmt_vec_info last_element = NULL; + FOR_EACH_VEC_ELT (res, i, first_element) + { + if (merge_first_element == NULL) + { + merge_first_element = first_element; + last_element = merge_first_element; + size = DR_GROUP_SIZE (merge_first_element); + } + + if (last_element != first_element + && !DR_GROUP_NEXT_ELEMENT (last_element)) + { + DR_GROUP_NEXT_ELEMENT (last_element) = first_element; + /* Store the gap from the previous member of the group. If there is + no gap in the access, DR_GROUP_GAP is always 1. 
*/ + DR_GROUP_GAP_TRANS (first_element) = DR_GROUP_GAP (first_element); + DR_GROUP_GAP (first_element) = 1; + } + for (stmt_info = first_element; stmt_info; + stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info)) + { + DR_GROUP_FIRST_ELEMENT (stmt_info) = merge_first_element; + DR_GROUP_SIZE_TRANS (stmt_info) = DR_GROUP_SIZE (stmt_info); + DR_GROUP_SIZE (stmt_info) = new_group_size; + last_element = stmt_info; + } + } + DR_GROUP_SIZE (merge_first_element) = new_group_size; + DR_GROUP_SLP_TRANSPOSE (merge_first_element) = true; + DR_GROUP_NEXT_ELEMENT (last_element) = NULL; + return merge_first_element; +} + +/* Merge the grouped loads which have the same base address and group size. + For example, for grouped loads (opa_1, opa_2, opb_1, opb_2): + opa_1: a0->a1->a2->a3 + opa_2: a8->a9->a10->a11 + opb_1: b0->b1 + opb_2: b16->b17 + we can probably get two merged grouped loads: + opa: a0->a1->a2->a3->a8->a9->a10->a11 + opb: b0->b1->b16->b17. */ + +static bool +vect_merge_slp_grouped_loads (bb_vec_info bb_vinfo) +{ + if (bb_vinfo->grouped_loads.length () <= 0) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "The number of grouped loads is 0.\n"); + } + return false; + } + bb_vinfo->grouped_loads.qsort (dr_group_cmp); + auto_vec<bool> visited (bb_vinfo->grouped_loads.length ()); + auto_vec<stmt_vec_info> grouped_loads_merge; + for (unsigned int i = 0; i < bb_vinfo->grouped_loads.length (); i++) + { + visited.safe_push (false); + } + while (1) + { + /* Find grouped loads which are required to merge. */ + auto_vec<stmt_vec_info> res; + vect_slp_grouped_load_find (bb_vinfo, visited, res); + if (res.is_empty ()) + { + break; + } + /* Merge the required grouped loads into one group. */ + grouped_loads_merge.safe_push (vect_slp_grouped_load_merge (res)); + } + if (grouped_loads_merge.length () == bb_vinfo->grouped_loads.length ()) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "No grouped loads need to be merged.\n"); + } + return false; + } + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "Merging grouped loads successfully.\n"); + } + BB_VINFO_GROUPED_LOADS (bb_vinfo).release (); + for (unsigned int i = 0; i < grouped_loads_merge.length (); i++) + { + BB_VINFO_GROUPED_LOADS (bb_vinfo).safe_push (grouped_loads_merge[i]); + } + return true; +} + +/* Find the first elements of the grouped stores + which are required to transpose and merge. */ + +static void +vect_slp_grouped_store_find (bb_vec_info bb_vinfo, vec<bool> &visited, + vec<stmt_vec_info> &res) +{ + stmt_vec_info first_element = NULL; + stmt_vec_info merge_first_element = NULL; + unsigned int k = 0; + FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element) + { + if (visited[k]) + { + continue; + } + /* Non-conforming grouped store should be grouped separately. */ + if (!STMT_VINFO_GROUPED_ACCESS (first_element) + || first_element->group_number == -1) + { + if (merge_first_element == NULL) + { + visited[k] = true; + res.safe_push (first_element); + return; + } + } + if (first_element->group_number != -1 + && merge_first_element == NULL) + { + merge_first_element = first_element; + } + if (merge_first_element->group_number == first_element->group_number) + { + visited[k] = true; + res.safe_push (first_element); + } + } +} + +/* Transpose and merge the grouped stores that are found from + vect_slp_grouped_store_find (). 
*/ + +static stmt_vec_info +vect_slp_grouped_store_transform (vec<stmt_vec_info> res) +{ + stmt_vec_info stmt_info = res[0]; + if (res.length () == 1) + { + return stmt_info; + } + stmt_vec_info rearrange_first_element = stmt_info; + stmt_vec_info last_element = rearrange_first_element; + + unsigned int size = DR_GROUP_SIZE (rearrange_first_element); + unsigned int new_group_size = size * res.length (); + for (unsigned int i = 1; i < res.length (); i++) + { + /* Store the gap from the previous member of the group. If there is no + gap in the access, DR_GROUP_GAP is always 1. */ + DR_GROUP_GAP_TRANS (res[i]) = DR_GROUP_GAP (res[i]); + DR_GROUP_GAP (res[i]) = 1; + } + while (!res.is_empty ()) + { + stmt_info = res[0]; + res.ordered_remove (0); + if (DR_GROUP_NEXT_ELEMENT (stmt_info)) + { + res.safe_push (DR_GROUP_NEXT_ELEMENT (stmt_info)); + } + DR_GROUP_FIRST_ELEMENT (stmt_info) = rearrange_first_element; + DR_GROUP_NEXT_ELEMENT (last_element) = stmt_info; + DR_GROUP_SIZE_TRANS (stmt_info) = DR_GROUP_SIZE (stmt_info); + DR_GROUP_SIZE (stmt_info) = new_group_size; + last_element = stmt_info; + } + + DR_GROUP_SIZE (rearrange_first_element) = new_group_size; + DR_GROUP_SLP_TRANSPOSE (rearrange_first_element) = true; + DR_GROUP_NEXT_ELEMENT (last_element) = NULL; + return rearrange_first_element; +} + +/* Save the STMT_INFO in the grouped stores to BB_VINFO_SCALAR_STORES for + transposing back grouped stores. */ + +static void +get_scalar_stores (bb_vec_info bb_vinfo) +{ + unsigned int k = 0; + stmt_vec_info first_element = NULL; + FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element) + { + /* Filter the grouped store which is unnecessary for transposing. */ + if (!STMT_VINFO_GROUPED_ACCESS (first_element) + || first_element->group_number == -1) + { + continue; + } + vec<stmt_vec_info> tmp_scalar_store; + tmp_scalar_store.create (DR_GROUP_SIZE (first_element)); + for (stmt_vec_info stmt_info = first_element; stmt_info; + stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info)) + { + tmp_scalar_store.safe_push (stmt_info); + } + BB_VINFO_SCALAR_STORES (bb_vinfo).safe_push (tmp_scalar_store); + } +} + +/* Transpose and merge the grouped stores which have the same group number. + For example, for grouped stores (opa_0, opa_1, opa_2, opa_3): + opa_0: a00->a01->a02->a03 + opa_1: a10->a11->a12->a13 + opa_2: a20->a21->a22->a23 + opa_2: a30->a31->a32->a33 + we can probably get the merged grouped store: + opa: a00->a10->a20->a30 + ->a01->a11->a21->a31 + ->a02->a12->a22->a32 + ->a03->a13->a23->a33. */ + +static bool +vect_transform_slp_grouped_stores (bb_vec_info bb_vinfo) +{ + if (bb_vinfo->grouped_stores.length () <= 0) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "The number of grouped stores is 0.\n"); + } + return false; + } + + bb_vinfo->grouped_stores.qsort (dr_group_cmp); + auto_vec<stmt_vec_info> grouped_stores_merge; + auto_vec<bool> visited (bb_vinfo->grouped_stores.length ()); + unsigned int i = 0; + for (i = 0; i < bb_vinfo->grouped_stores.length (); i++) + { + visited.safe_push (false); + } + + /* Get scalar stores for the following transposition recovery. */ + get_scalar_stores (bb_vinfo); + + while (1) + { + /* Find grouped stores which are required to transpose and merge. */ + auto_vec<stmt_vec_info> res; + vect_slp_grouped_store_find (bb_vinfo, visited, res); + if (res.is_empty ()) + { + break; + } + /* Transpose and merge the required grouped stores into one group. 
*/ + grouped_stores_merge.safe_push (vect_slp_grouped_store_transform (res)); + } + + BB_VINFO_GROUPED_STORES (bb_vinfo).release (); + for (i = 0; i < grouped_stores_merge.length (); i++) + { + BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (grouped_stores_merge[i]); + } + + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "Transposing grouped stores successfully.\n"); + } + return true; +} + +/* A helpful function of vect_transform_back_slp_grouped_stores (). */ + +static auto_vec<stmt_vec_info> +vect_transform_back_slp_grouped_store (bb_vec_info bb_vinfo, + stmt_vec_info first_stmt_info) +{ + auto_vec<stmt_vec_info> grouped_stores_split; + for (unsigned int i = 0; i < bb_vinfo->scalar_stores.length (); i++) + { + vec<stmt_vec_info> scalar_tmp = bb_vinfo->scalar_stores[i]; + if (scalar_tmp.length () > 1 + && scalar_tmp[0]->group_number != first_stmt_info->group_number) + { + continue; + } + stmt_vec_info cur_stmt_info = NULL; + stmt_vec_info cur_first_stmt_info = NULL; + stmt_vec_info last_stmt_info = NULL; + unsigned int k = 0; + FOR_EACH_VEC_ELT (scalar_tmp, k, cur_stmt_info) + { + if (k == 0) + { + cur_first_stmt_info = cur_stmt_info; + last_stmt_info = cur_stmt_info; + } + DR_GROUP_FIRST_ELEMENT (cur_stmt_info) = cur_first_stmt_info; + DR_GROUP_NEXT_ELEMENT (last_stmt_info) = cur_stmt_info; + last_stmt_info = cur_stmt_info; + } + DR_GROUP_SIZE (cur_first_stmt_info) = k; + DR_GROUP_NEXT_ELEMENT (last_stmt_info) = NULL; + if (first_stmt_info != cur_first_stmt_info) + { + DR_GROUP_GAP (cur_first_stmt_info) + = DR_GROUP_GAP_TRANS (cur_first_stmt_info); + DR_GROUP_SLP_TRANSPOSE (cur_first_stmt_info) = false; + DR_GROUP_NUMBER (cur_first_stmt_info) = -1; + } + grouped_stores_split.safe_push (cur_first_stmt_info); + } + return grouped_stores_split; +} + +/* Transform the grouped store back. */ + +void +vect_transform_back_slp_grouped_stores (bb_vec_info bb_vinfo, + stmt_vec_info first_stmt_info) +{ + if (first_stmt_info->group_number == -1) + { + return; + } + /* Transform back. */ + auto_vec<stmt_vec_info> grouped_stores_split + = vect_transform_back_slp_grouped_store (bb_vinfo, first_stmt_info); + + /* Add the remaining grouped stores to grouped_stores_split. */ + stmt_vec_info first_element = NULL; + unsigned int i = 0; + FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, first_element) + { + if (first_element->group_number != first_stmt_info->group_number) + { + grouped_stores_split.safe_push (first_element); + } + } + DR_GROUP_SLP_TRANSPOSE (first_stmt_info) = false; + DR_GROUP_NUMBER (first_stmt_info) = -1; + BB_VINFO_GROUPED_STORES (bb_vinfo).release (); + for (i = 0; i < grouped_stores_split.length (); i++) + { + BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (grouped_stores_split[i]); + } +} + +/* Function check_for_slp_vectype + + Restriction for grouped stores by checking their vectype. + If the vectype of the grouped store is changed, it need transform back. + If all grouped stores need to be transformed back, return FALSE. 
*/ + +static bool +check_for_slp_vectype (bb_vec_info bb_vinfo) +{ + stmt_vec_info first_element = NULL; + unsigned int i = 0; + int count = 0; + auto_vec<stmt_vec_info> grouped_stores_check; + FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, first_element) + { + grouped_stores_check.safe_push (first_element); + } + FOR_EACH_VEC_ELT (grouped_stores_check, i, first_element) + { + if (STMT_VINFO_GROUPED_ACCESS (first_element) + && first_element->group_number != -1) + { + unsigned int group_size_b + = DR_GROUP_SIZE_TRANS (first_element); + tree vectype = STMT_VINFO_VECTYPE (first_element); + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); + if (nunits.to_constant () > group_size_b) + { + count++; + /* If the vectype is changed, this grouped store need + to be transformed back. */ + vect_transform_back_slp_grouped_stores (bb_vinfo, first_element); + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "No supported: only supported for" + " group_size geq than nunits.\n"); + } + } + } + } + if (count == BB_VINFO_TRANS_GROUPS (bb_vinfo)) + { + return false; + } + return true; +} + +/* Function check_for_dr_alignment + + Check the alignment of the slp instance loads. + Return FALSE if a load cannot be vectorized. */ + +static bool +check_for_dr_alignment (slp_instance instance) +{ + slp_tree node = NULL; + unsigned int i = 0; + FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node) + { + stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; + dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info); + enum dr_alignment_support supportable_dr_alignment + = vect_supportable_dr_alignment (first_dr_info, false); + if (supportable_dr_alignment == dr_explicit_realign_optimized + || supportable_dr_alignment == dr_explicit_realign) + { + return false; + } + } + return true; +} + +/* Initialize slp_transpose flag before transposing. */ + +static void +init_stmt_info_slp_transpose (bb_vec_info bb_vinfo) +{ + stmt_vec_info first_element = NULL; + unsigned int k = 0; + FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element) + { + if (STMT_VINFO_GROUPED_ACCESS (first_element)) + { + DR_GROUP_SLP_TRANSPOSE (first_element) = false; + } + } + FOR_EACH_VEC_ELT (bb_vinfo->grouped_loads, k, first_element) + { + if (STMT_VINFO_GROUPED_ACCESS (first_element)) + { + DR_GROUP_SLP_TRANSPOSE (first_element) = false; + } + } +} + +/* Analyze and transpose the stmts before building the SLP tree. */ + +static bool +vect_analyze_transpose (bb_vec_info bb_vinfo) +{ + DUMP_VECT_SCOPE ("vect_analyze_transpose"); + + if (!vect_may_transpose (bb_vinfo)) + { + return false; + } + + /* For basic block SLP, try to merge the grouped stores and loads + into one group. */ + init_stmt_info_slp_transpose (bb_vinfo); + if (vect_transform_slp_grouped_stores (bb_vinfo) + && vect_merge_slp_grouped_loads (bb_vinfo)) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "Analysis succeeded with SLP transposed.\n"); + } + return true; + } + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "Analysis failed with SLP transposed.\n"); + } + return false; +} /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP trees of packed scalar stmts if SLP is possible. 
*/ @@ -3124,7 +4018,11 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo) vec_outside_cost = vec_prologue_cost + vec_epilogue_cost; - if (dump_enabled_p ()) + BB_VINFO_VEC_INSIDE_COST (bb_vinfo) = vec_inside_cost; + BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo) = vec_outside_cost; + BB_VINFO_SCALAR_COST (bb_vinfo) = scalar_cost; + + if (!unlimited_cost_model (NULL) && dump_enabled_p ()) { dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n", @@ -3239,6 +4137,22 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal) vect_pattern_recog (bb_vinfo); + /* Transpose grouped stores and loads for better vectorizable version. */ + if (bb_vinfo->transposed) + { + if (!vect_analyze_transpose (bb_vinfo)) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: unhandled slp transposed in " + "basic block.\n"); + } + return false; + } + } + bb_vinfo->before_slp = true; + /* Check the SLP opportunities in the basic block, analyze and build SLP trees. */ if (!vect_analyze_slp (bb_vinfo, n_stmts)) @@ -3254,6 +4168,20 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal) return false; } + /* Check if the vectype is suitable for SLP transposed. */ + if (bb_vinfo->transposed && !check_for_slp_vectype (bb_vinfo)) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Failed to SLP transposed in the basic block.\n"); + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: vectype is not suitable for " + "SLP transposed in basic block.\n"); + } + return false; + } + vect_record_base_alignments (bb_vinfo); /* Analyze and verify the alignment of data references and the @@ -3286,6 +4214,27 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal) if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ()) return false; + /* Check if the alignment is suitable for SLP transposed. */ + if (bb_vinfo->transposed) + { + for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); i++) + { + if (!check_for_dr_alignment (instance)) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Failed to SLP transposed in the basic " + "block.\n"); + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: alignment is not suitable " + "for SLP transposed in basic block.\n"); + } + return false; + } + } + } + if (!vect_slp_analyze_operations (bb_vinfo)) { if (dump_enabled_p ()) @@ -3311,6 +4260,83 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal) return true; } +static bool +may_new_transpose_bbvinfo (bb_vec_info bb_vinfo_ori, bool res_ori) +{ + /* If the flag is false or the slp analysis is broken before + vect_analyze_slp, we don't try to analyze the transposed SLP version. */ + if (!flag_tree_slp_transpose_vectorize + || !BB_VINFO_BEFORE_SLP (bb_vinfo_ori)) + { + return false; + } + + /* If the original bb_vinfo can't be vectorized, try to new a bb_vinfo + of the transposed version. */ + if (!res_ori) + { + return true; + } + + /* Caculate the cost of the original bb_vinfo. */ + if (unlimited_cost_model (NULL)) + { + vect_bb_vectorization_profitable_p (bb_vinfo_ori); + } + /* If the vec cost and scalar cost are not much difference (here we set the + threshold to 4), we try to new a bb_vinfo of the transposed version. 
*/ + if (BB_VINFO_SCALAR_COST (bb_vinfo_ori) + < 4 * (BB_VINFO_VEC_INSIDE_COST (bb_vinfo_ori) + + BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_ori))) + { + return true; + } + return false; +} + +static bool +may_choose_transpose_bbvinfo (bb_vec_info bb_vinfo_trans, bool res_trans, + bb_vec_info bb_vinfo_ori, bool res_ori) +{ + /* The original bb_vinfo is chosen if the transposed bb_vinfo + can't be vectorized. */ + if (!res_trans) + { + return false; + } + /* Caculate the cost of the transposed bb_vinfo. */ + if (unlimited_cost_model (NULL)) + { + vect_bb_vectorization_profitable_p (bb_vinfo_trans); + } + int diff_bb_cost = -1; + int diff_bb_cost_trans = -1; + if (res_ori) + { + diff_bb_cost = BB_VINFO_SCALAR_COST (bb_vinfo_ori) + - BB_VINFO_VEC_INSIDE_COST (bb_vinfo_ori) + - BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_ori); + } + if (res_trans) + { + diff_bb_cost_trans = BB_VINFO_SCALAR_COST (bb_vinfo_trans) + - BB_VINFO_VEC_INSIDE_COST (bb_vinfo_trans) + - BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_trans); + } + /* The original bb_vinfo is chosen when one of the following conditions + is satisfied as follows: + 1) The cost of original version is better transposed version. + 2) The vec cost is similar to scalar cost in the transposed version. */ + if ((res_ori && res_trans && diff_bb_cost >= diff_bb_cost_trans) + || (res_trans && BB_VINFO_SCALAR_COST (bb_vinfo_trans) + <= (BB_VINFO_VEC_INSIDE_COST (bb_vinfo_trans) + + BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_trans)))) + { + return false; + } + return true; +} + /* Subroutine of vect_slp_bb. Try to vectorize the statements between REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true on success. The region has N_STMTS statements and has the datarefs @@ -3323,6 +4349,7 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin, unsigned int n_stmts) { bb_vec_info bb_vinfo; + bb_vec_info bb_vinfo_trans = NULL; auto_vector_modes vector_modes; /* Autodetect first vector size we try. */ @@ -3337,6 +4364,10 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin, { bool vectorized = false; bool fatal = false; + bool res_bb_vinfo_ori = false; + bool res_bb_vinfo_trans = false; + + /* New a bb_vinfo of the original version. */ bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared); bool first_time_p = shared.datarefs.is_empty (); @@ -3346,8 +4377,57 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin, else bb_vinfo->shared->check_datarefs (); bb_vinfo->vector_mode = next_vector_mode; + bb_vinfo->transposed = false; + bb_vinfo->before_slp = false; + + res_bb_vinfo_ori = vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal); + /* Analyze and new a transposed bb_vinfo. 
*/ + if (may_new_transpose_bbvinfo (bb_vinfo, res_bb_vinfo_ori)) + { + bool fatal_trans = false; + bb_vinfo_trans + = new _bb_vec_info (region_begin, region_end, &shared); + bool first_time_p = shared.datarefs.is_empty (); + BB_VINFO_DATAREFS (bb_vinfo_trans) = datarefs; + if (first_time_p) + { + bb_vinfo_trans->shared->save_datarefs (); + } + else + { + bb_vinfo_trans->shared->check_datarefs (); + } + bb_vinfo_trans->vector_mode = next_vector_mode; + bb_vinfo_trans->transposed = true; + bb_vinfo_trans->before_slp = false; + + res_bb_vinfo_trans + = vect_slp_analyze_bb_1 (bb_vinfo_trans, n_stmts, fatal_trans); + if (may_choose_transpose_bbvinfo (bb_vinfo_trans, + res_bb_vinfo_trans, + bb_vinfo, res_bb_vinfo_ori)) + { + bb_vinfo = bb_vinfo_trans; + fatal = fatal_trans; + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "Basic block part vectorized " + "using transposed version.\n"); + } + } + else + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "Basic block part vectorized " + "using original version.\n"); + } + } + } - if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal) + if ((res_bb_vinfo_ori || res_bb_vinfo_trans) && dbg_cnt (vect_slp)) { if (dump_enabled_p ()) @@ -3400,6 +4480,10 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin, } delete bb_vinfo; + if (bb_vinfo_trans) + { + bb_vinfo_trans = NULL; + } if (mode_i < vector_modes.length () && VECTOR_MODE_P (autodetected_vector_mode) diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 6418edb52..b872cfc8d 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -7329,6 +7329,153 @@ vectorizable_scan_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, return true; } +/* Function vect_permute_store_chains + + Call function vect_permute_store_chain (). + Given a chain of interleaved stores in DR_CHAIN, generate + interleave_high/low stmts to reorder the data correctly. + Return the final references for stores in RESULT_CHAIN. */ + +static void +vect_permute_store_chains (vec<tree> dr_chain, unsigned int num_each, + stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + vec<tree> *result_chain, unsigned int group) +{ + unsigned int k = 0; + unsigned int t = 0; + + /* Divide vectors into GROUP parts. And permute every NUM_EACH vectors + together. */ + for (k = 0; k < group; k++) + { + auto_vec<tree> dr_chain_transposed (num_each); + auto_vec<tree> result_chain_transposed (num_each); + for (t = k; t < dr_chain.length (); t = t + group) + { + dr_chain_transposed.quick_push (dr_chain[t]); + } + vect_permute_store_chain (dr_chain_transposed, num_each, stmt_info, + gsi, &result_chain_transposed); + for (t = 0; t < num_each; t++) + { + result_chain->quick_push (result_chain_transposed[t]); + } + } +} + +/* Function transpose_oprnd_store + + Calculate the transposed results from VEC_OPRNDS (VEC_STMT) + for vectorizable_store. */ + +static void +transpose_oprnd_store (vec<tree>vec_oprnds, vec<tree> *result_chain, + unsigned int vec_num, unsigned int const_nunits, + unsigned int array_num, stmt_vec_info first_stmt_info, + gimple_stmt_iterator *gsi) +{ + unsigned int group_for_transform = 0; + unsigned int num_each = 0; + + /* Transpose back for vec_oprnds. */ + /* vec = {vec1, vec2, ...} */ + if (array_num < const_nunits + && const_nunits % array_num == 0) + { + vect_transpose_store_chain (vec_oprnds, + vec_num, array_num, + first_stmt_info, + gsi, result_chain); + } + /* vec1 = {vec_part1}, vec2 = {vec_part2}, ... 
*/ + else if (array_num >= const_nunits + && array_num % const_nunits == 0) + { + group_for_transform = array_num / const_nunits; + num_each = vec_oprnds.length () / group_for_transform; + vect_permute_store_chains (vec_oprnds, + num_each, first_stmt_info, + gsi, result_chain, + group_for_transform); + } + else + { + gcc_unreachable (); + } +} + +static dr_vec_info * +get_dr_info (stmt_vec_info stmt_info) +{ + dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); + if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED) + { + SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN); + } + return dr_info; +} + +static unsigned +dr_align_vect_store (dr_vec_info *cur_first_dr_info, + unsigned HOST_WIDE_INT &align) +{ + unsigned misalign = 0; + align = known_alignment (DR_TARGET_ALIGNMENT (cur_first_dr_info)); + if (aligned_access_p (cur_first_dr_info)) + { + return misalign; + } + else if (DR_MISALIGNMENT (cur_first_dr_info) == -1) + { + align = dr_alignment (vect_dr_behavior (cur_first_dr_info)); + } + else + { + misalign = DR_MISALIGNMENT (cur_first_dr_info); + } + return misalign; +} + +static stmt_vec_info +add_new_stmt_vect_store (tree vectype, tree dataref_ptr, tree dataref_offset, + tree ref_type, dr_vec_info *cur_first_dr_info, + tree vec_oprnd, gimple_stmt_iterator *gsi, + stmt_vec_info stmt_info) +{ + /* Data align. */ + unsigned HOST_WIDE_INT align; + unsigned misalign = dr_align_vect_store (cur_first_dr_info, align); + + if (dataref_offset == NULL_TREE && TREE_CODE (dataref_ptr) == SSA_NAME) + { + set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, misalign); + } + + /* Get data_ref. */ + tree offset = dataref_offset ? dataref_offset : build_int_cst (ref_type, 0); + tree data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr, offset); + if (aligned_access_p (cur_first_dr_info)) + { + ; + } + else if (DR_MISALIGNMENT (cur_first_dr_info) == -1) + { + TREE_TYPE (data_ref) = build_aligned_type (TREE_TYPE (data_ref), + align * BITS_PER_UNIT); + } + else + { + tree elem_type = TREE_TYPE (vectype); + TREE_TYPE (data_ref) = build_aligned_type (TREE_TYPE (data_ref), + TYPE_ALIGN (elem_type)); + } + /* Add new stmt. */ + vect_copy_ref_info (data_ref, DR_REF (cur_first_dr_info->dr)); + gassign *new_stmt = gimple_build_assign (data_ref, vec_oprnd); + stmt_vec_info new_stmt_info + = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + return new_stmt_info; +} /* Function vectorizable_store. @@ -8208,6 +8355,16 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) vect_get_gather_scatter_ops (loop, stmt_info, &gs_info, &dataref_ptr, &vec_offset); + /* If the stmt_info need to be transposed recovery, dataref_ptr + will be caculated later. */ + else if (memory_access_type == VMAT_CONTIGUOUS + && is_a <bb_vec_info> (vinfo) + && STMT_VINFO_GROUPED_ACCESS (stmt_info) + && DR_GROUP_SLP_TRANSPOSE ( + DR_GROUP_FIRST_ELEMENT (stmt_info))) + { + dataref_ptr = NULL_TREE; + } else dataref_ptr = vect_create_data_ref_ptr (first_stmt_info, aggr_type, @@ -8299,6 +8456,75 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, } else { + /* group_size: the size of group after transposing and merging. + group_size_b: the size of group before transposing and merging, + and only group_size_b >= const_nunits is supported. + array_num: the number of arrays. + const_nunits: TYPE_VECTOR_SUBPARTS (vectype). + ncontinues: group_size_b / const_nunits, it means the number of + times an array is stored in memory. 
*/ + if (slp && is_a <bb_vec_info> (vinfo) + && STMT_VINFO_GROUPED_ACCESS (stmt_info) + && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info))) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "vectorizable_store for slp transpose.\n"); + } + /* Transpose back for grouped stores. */ + vect_transform_back_slp_grouped_stores (bb_vinfo, + first_stmt_info); + + result_chain.create (vec_oprnds.length ()); + unsigned int const_nunits = nunits.to_constant (); + unsigned int group_size_b = DR_GROUP_SIZE_TRANS (first_stmt_info); + unsigned int array_num = group_size / group_size_b; + transpose_oprnd_store (vec_oprnds, &result_chain, vec_num, + const_nunits, array_num, + first_stmt_info, gsi); + + /* For every store group, not for every vec, because transposing + and merging have changed the data reference access. */ + gcc_assert (group_size_b >= const_nunits); + unsigned int ncontinues = group_size_b / const_nunits; + + unsigned int k = 0; + for (i = 0; i < array_num; i++) + { + stmt_vec_info first_stmt_b; + BB_VINFO_GROUPED_STORES (vinfo).iterate (i, &first_stmt_b); + bool simd_lane_access_p + = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_b) != 0; + tree ref_type = get_group_alias_ptr_type (first_stmt_b); + dataref_ptr = vect_create_data_ref_ptr ( + first_stmt_b, aggr_type, + simd_lane_access_p ? loop : NULL, + offset, &dummy, gsi, &ptr_incr, + simd_lane_access_p, NULL_TREE, bump); + dr_vec_info *cur_first_dr_info = get_dr_info (first_stmt_b); + for (unsigned int t = 0; t < ncontinues; t++) + { + vec_oprnd = result_chain[k]; + k++; + if (t > 0) + { + /* Bump the vector pointer. */ + dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, + gsi, first_stmt_b, + bump); + } + new_stmt_info = add_new_stmt_vect_store ( + vectype, dataref_ptr, dataref_offset, + ref_type, cur_first_dr_info, vec_oprnd, + gsi, first_stmt_b); + } + } + oprnds.release (); + result_chain.release (); + vec_oprnds.release (); + return true; + } new_stmt_info = NULL; if (grouped_store) { @@ -8557,6 +8783,447 @@ hoist_defs_of_uses (stmt_vec_info stmt_info, class loop *loop) return true; } +static tree +calculate_new_type (tree vectype, unsigned int const_nunits, + unsigned int group_size_b, unsigned int &nloads, + unsigned int &ncontinues, tree &lvectype) +{ + tree ltype = TREE_TYPE (vectype); + /* nloads is the number of ARRAYs in a vector. + vectemp = {a[], b[], ...} */ + if (group_size_b < const_nunits) + { + tree ptype; + tree vtype + = vector_vector_composition_type (vectype, + const_nunits / group_size_b, + &ptype); + if (vtype != NULL_TREE) + { + nloads = const_nunits / group_size_b; + lvectype = vtype; + ltype = ptype; + ncontinues = 1; + } + } + /* ncontinues is the number of vectors from an ARRAY. + vectemp1 = {a[0], a[1], ...} + ... + vectempm = {a[k], a[k+1], ...} */ + else + { + nloads = 1; + ltype = vectype; + ncontinues = group_size_b / const_nunits; + } + ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype))); + return ltype; +} + +static void +generate_old_load_permutations (slp_tree slp_node, unsigned int group_size, + vec<unsigned> &old_load_permutation) +{ + /* Generate the old load permutations from the slp_node. */ + unsigned i = 0; + unsigned k = 0; + + /* If SLP_NODE has load_permutation, we copy it to old_load_permutation. + Otherwise, we generate a permutation sequentially. 
*/ + if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()) + { + FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), i, k) + { + old_load_permutation.safe_push (k); + } + } + else + { + for (unsigned i = 0; i < group_size; i++) + { + old_load_permutation.safe_push (i); + } + } +} + +static void +generate_new_load_permutation_mapping (unsigned slp_node_length, + vec<unsigned> &group_idx, + const vec<unsigned> &load_permutation, + unsigned int group_size_b, + unsigned &new_group_size, + vec<unsigned> &group_from) +{ + /* group_num_vec: only stores the group_loads IDs which are caculated from + load_permutation. */ + auto_vec<unsigned> group_num_vec; + + /* Caculate which group_loads are the stmts in SLP_NODE from. */ + unsigned i = 0; + unsigned k = 0; + FOR_EACH_VEC_ELT (load_permutation, i, k) + { + unsigned int t0 = k / group_size_b; + if (!group_num_vec.contains (t0)) + { + group_num_vec.safe_push (t0); + } + group_from.safe_push (t0); + } + group_num_vec.qsort (cmp_for_group_num); + /* n_groups: the number of group_loads. */ + unsigned int n_groups = group_num_vec.length (); + new_group_size = n_groups * group_size_b; + for (i = 0; i < n_groups; i++) + { + group_idx.safe_push (group_num_vec[i] * group_size_b); + } + /* A new mapping from group_ind_vec to group_from. + For example: + Origin: group_from = {1,1,3,3,5,5,7,7}; + After mapping: group_from = {0,0,1,1,2,2,2,2}; */ + auto_vec<unsigned> group_ind_vec (n_groups); + for (k = 0; k < n_groups; k++) + { + group_ind_vec.safe_push (k); + } + for (i = 0; i < slp_node_length; i++) + { + for (k = 0; k < n_groups; k++) + { + if (group_from[i] == group_num_vec[k]) + { + group_from[i] = group_ind_vec[k]; + break; + } + } + } +} + +static void +generate_new_load_permutation (vec<unsigned> &new_load_permutation, + const vec<unsigned> &old_load_permutation, + slp_tree slp_node, bool &this_load_permuted, + const vec<unsigned> &group_from, + unsigned int group_size_b) +{ + unsigned slp_node_length = SLP_TREE_SCALAR_STMTS (slp_node).length (); + /* Generate the new load permutation from the new mapping. */ + new_load_permutation.create (slp_node_length); + unsigned i = 0; + unsigned k = 0; + FOR_EACH_VEC_ELT (old_load_permutation, i, k) + { + /* t1 is the new permutation of k in the old permutation. + t1 = base_address + offset: + base_address = group_from[i] * group_size_b; + offset = k % group_size_b. */ + unsigned int t1 + = group_from[i] * group_size_b + k % group_size_b; + new_load_permutation.safe_push (t1); + if (t1 != k) + { + this_load_permuted = true; + } + } +} + +static bool +is_slp_perm (bool slp_perm, bool this_load_permuted, poly_uint64 nunits, + unsigned int group_size, stmt_vec_info first_stmt_info) +{ + /* Calculate the unrolling factor based on the smallest type. */ + poly_uint64 unrolling_factor + = exact_div (common_multiple (nunits, group_size), group_size); + /* The load requires permutation when unrolling exposes + a gap either because the group is larger than the SLP + group-size or because there is a gap between the groups. 
*/ + if (!slp_perm && !this_load_permuted + && (known_eq (unrolling_factor, 1U) + || (group_size == DR_GROUP_SIZE (first_stmt_info) + && DR_GROUP_GAP (first_stmt_info) == 0))) + { + return false; + } + else + { + return true; + } +} + +static void +generate_load_permutation (slp_tree slp_node, unsigned &new_group_size, + unsigned int group_size, unsigned int group_size_b, + bool &this_load_permuted, vec<unsigned> &group_idx, + vec<unsigned> &new_load_permutation) +{ + /* Generate the old load permutations from SLP_NODE. */ + vec<unsigned> old_load_permutation; + old_load_permutation.create (group_size); + generate_old_load_permutations (slp_node, group_size, old_load_permutation); + + /* Caculate which group_loads are the stmts in SLP_NODE from. */ + unsigned slp_node_length = SLP_TREE_SCALAR_STMTS (slp_node).length (); + /* group_from: stores the group_loads ID for every stmt in SLP_NODE. */ + vec<unsigned> group_from; + group_from.create (slp_node_length); + generate_new_load_permutation_mapping (slp_node_length, group_idx, + old_load_permutation, + group_size_b, new_group_size, + group_from); + + /* Generate the new load permutation from the new mapping and caculate + this_load_permuted flag. If this_load_permuted is true, we need execute + slp permutation by using new load permutation. */ + generate_new_load_permutation (new_load_permutation, old_load_permutation, + slp_node, this_load_permuted, group_from, + group_size_b); + old_load_permutation.release (); + group_from.release (); +} + +static unsigned int +dr_align_vect_load (dr_vec_info *cur_first_dr_info, + unsigned HOST_WIDE_INT &align, + enum dr_alignment_support alignment_support_scheme) +{ + unsigned int misalign = 0; + + align = known_alignment (DR_TARGET_ALIGNMENT (cur_first_dr_info)); + if (alignment_support_scheme == dr_aligned) + { + gcc_assert (aligned_access_p (cur_first_dr_info)); + } + else if (DR_MISALIGNMENT (cur_first_dr_info) == -1) + { + align = dr_alignment (vect_dr_behavior (cur_first_dr_info)); + } + else + { + misalign = DR_MISALIGNMENT (cur_first_dr_info); + } + return misalign; +} + +static stmt_vec_info +add_new_stmt_vect_load (tree vectype, tree dataref_ptr, tree dataref_offset, + tree ref_type, tree ltype, gassign *(&new_stmt), + dr_vec_info *cur_first_dr_info, + gimple_stmt_iterator *gsi, stmt_vec_info stmt_info) +{ + /* Data align. */ + enum dr_alignment_support alignment_support_scheme + = vect_supportable_dr_alignment (cur_first_dr_info, false); + unsigned HOST_WIDE_INT align; + unsigned int misalign = dr_align_vect_load (cur_first_dr_info, align, + alignment_support_scheme); + if (dataref_offset == NULL_TREE && TREE_CODE (dataref_ptr) == SSA_NAME) + { + set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, misalign); + } + + /* Get data_ref. */ + tree offset = dataref_offset ? dataref_offset : build_int_cst (ref_type, 0); + tree data_ref = fold_build2 (MEM_REF, ltype, dataref_ptr, offset); + if (alignment_support_scheme == dr_aligned) + { + ; + } + else if (DR_MISALIGNMENT (cur_first_dr_info) == -1) + { + TREE_TYPE (data_ref) + = build_aligned_type (TREE_TYPE (data_ref), align * BITS_PER_UNIT); + } + else + { + tree elem_type = TREE_TYPE (vectype); + TREE_TYPE (data_ref) + = build_aligned_type (TREE_TYPE (data_ref), TYPE_ALIGN (elem_type)); + } + + /* Add new stmt. 
*/ + vect_copy_ref_info (data_ref, DR_REF (cur_first_dr_info->dr)); + new_stmt = gimple_build_assign (make_ssa_name (ltype), data_ref); + stmt_vec_info new_stmt_info + = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + return new_stmt_info; +} + +static void +push_new_stmt_to_dr_chain (bool slp_perm, stmt_vec_info new_stmt_info, + vec<tree> &dr_chain, slp_tree slp_node) +{ + if (slp_perm) + { + dr_chain.quick_push (gimple_assign_lhs (new_stmt_info->stmt)); + } + else + { + SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info); + } +} + +static stmt_vec_info +get_first_stmt_info_before_transpose (stmt_vec_info first_stmt_info, + unsigned int group_el, + unsigned int group_size) +{ + stmt_vec_info last_stmt_info = first_stmt_info; + unsigned int count = 0; + gcc_assert (group_el < group_size); + while (count < group_el) + { + last_stmt_info = DR_GROUP_NEXT_ELEMENT (last_stmt_info); + count++; + } + return last_stmt_info; +} + +static stmt_vec_info +add_new_stmt_for_nloads_greater_than_one (tree lvectype, tree vectype, + vec<constructor_elt, va_gc> *v, + stmt_vec_info stmt_info, + gimple_stmt_iterator *gsi) +{ + tree vec_inv = build_constructor (lvectype, v); + tree new_temp = vect_init_vector (stmt_info, vec_inv, lvectype, gsi); + vec_info *vinfo = stmt_info->vinfo; + stmt_vec_info new_stmt_info = vinfo->lookup_def (new_temp); + if (lvectype != vectype) + { + gassign *new_stmt = gimple_build_assign (make_ssa_name (vectype), + VIEW_CONVERT_EXPR, + build1 (VIEW_CONVERT_EXPR, + vectype, new_temp)); + new_stmt_info = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + } + return new_stmt_info; +} + +/* Function new_vect_stmt_for_nloads. + + New a VEC_STMT when nloads Arrays are merged into a vector. + + ncopies is the number of vectors that need to be loaded from memmory. + nloads is the number of ARRAYs in a vector. + vectemp = {a[], b[], ...} */ + +static void +new_vect_stmt_for_nloads (unsigned int ncopies, unsigned int nloads, + vec<unsigned> group_idx, stmt_vec_info stmt_info, + offset_info *offset_info, vectype_info *vectype_info, + vect_memory_access_type memory_access_type, + bool slp_perm, vec<tree>& dr_chain, slp_tree slp_node, + gimple_stmt_iterator *gsi) +{ + vec<constructor_elt, va_gc> *v = NULL; + stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); + unsigned int group_size = DR_GROUP_SIZE (first_stmt_info); + stmt_vec_info first_stmt_info_b = NULL; + stmt_vec_info new_stmt_info = NULL; + tree dataref_ptr = NULL_TREE; + tree dummy; + gimple *ptr_incr = NULL; + unsigned int n = 0; + for (unsigned int i = 0; i < ncopies; i++) + { + vec_alloc (v, nloads); + for (unsigned int t = 0; t < nloads; t++) + { + first_stmt_info_b = get_first_stmt_info_before_transpose ( + first_stmt_info, group_idx[n++], group_size); + dr_vec_info* cur_first_dr_info = get_dr_info (first_stmt_info_b); + tree bump = vect_get_data_ptr_increment (cur_first_dr_info, + vectype_info->ltype, + memory_access_type); + bool simd_lane_access_p + = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_info_b) != 0; + + /* Create dataref_ptr which is point to init_address. 
*/ + dataref_ptr = vect_create_data_ref_ptr ( + first_stmt_info_b, vectype_info->ltype, NULL, + offset_info->offset, &dummy, gsi, &ptr_incr, + simd_lane_access_p, offset_info->byte_offset, bump); + + gassign *new_stmt = NULL; + new_stmt_info = add_new_stmt_vect_load ( + vectype_info->vectype, dataref_ptr, + offset_info->dataref_offset, vectype_info->ref_type, + vectype_info->ltype, new_stmt, cur_first_dr_info, + gsi, first_stmt_info_b); + + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_assign_lhs (new_stmt)); + } + new_stmt_info = add_new_stmt_for_nloads_greater_than_one ( + vectype_info->lvectype, vectype_info->vectype, + v, first_stmt_info_b, gsi); + push_new_stmt_to_dr_chain (slp_perm, new_stmt_info, + dr_chain, slp_node); + } +} + +/* Function new_vect_stmt_for_ncontinues. + + New a VEC_STMTs when an Array is divided into several vectors. + + n_groups is the number of ARRAYs. + ncontinues is the number of vectors from an ARRAY. + vectemp1 = {a[0], a[1], ...} + ... + vectempm = {a[k], a[k+1], ...} */ + +static void +new_vect_stmt_for_ncontinues (unsigned int ncontinues, vec<unsigned> group_idx, + stmt_vec_info stmt_info, offset_info* offset_info, + vectype_info* vectype_info, + vect_memory_access_type memory_access_type, + bool slp_perm, vec<tree>& dr_chain, + slp_tree slp_node, + gimple_stmt_iterator *gsi) +{ + stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); + unsigned int group_size = DR_GROUP_SIZE (first_stmt_info); + stmt_vec_info new_stmt_info = NULL; + tree dataref_ptr = NULL_TREE; + tree dummy; + gimple *ptr_incr = NULL; + unsigned int n_groups = group_idx.length (); + for (unsigned int i = 0; i < n_groups; i++) + { + stmt_vec_info first_stmt_info_b = get_first_stmt_info_before_transpose ( + first_stmt_info, group_idx[i], group_size); + dr_vec_info* cur_first_dr_info = get_dr_info (first_stmt_info_b); + tree bump = vect_get_data_ptr_increment (cur_first_dr_info, + vectype_info->ltype, memory_access_type); + bool simd_lane_access_p + = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_info_b) != 0; + for (unsigned int k = 0; k < ncontinues; k++) + { + /* Create dataref_ptr which is point to init_address. */ + if (k == 0) + { + dataref_ptr = vect_create_data_ref_ptr ( + first_stmt_info_b, vectype_info->ltype, NULL, + offset_info->offset, &dummy, gsi, &ptr_incr, + simd_lane_access_p, offset_info->byte_offset, bump); + } + else + { + dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, + gsi, first_stmt_info_b, bump); + } + gassign *new_stmt = NULL; + new_stmt_info = add_new_stmt_vect_load ( + vectype_info->vectype, dataref_ptr, + offset_info->dataref_offset, vectype_info->ref_type, + vectype_info->ltype, new_stmt, cur_first_dr_info, + gsi, first_stmt_info_b); + push_new_stmt_to_dr_chain (slp_perm, new_stmt_info, + dr_chain, slp_node); + } + } +} + /* vectorizable_load. Check if STMT_INFO reads a non scalar data-ref (array/pointer/structure) @@ -9364,6 +10031,9 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, tree vec_mask = NULL_TREE; prev_stmt_info = NULL; poly_uint64 group_elt = 0; + unsigned new_group_size = 0; + vec<unsigned> new_load_permutation; + for (j = 0; j < ncopies; j++) { stmt_vec_info new_stmt_info = NULL; @@ -9385,6 +10055,15 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, dataref_ptr = unshare_expr (DR_BASE_ADDRESS (first_dr_info->dr)); dataref_offset = build_int_cst (ref_type, 0); } + /* If the stmt_info need to be transposed recovery, dataref_ptr + will be caculated later. 
*/ + else if (slp && is_a <bb_vec_info> (vinfo) + && STMT_VINFO_GROUPED_ACCESS (stmt_info) + && DR_GROUP_SLP_TRANSPOSE ( + DR_GROUP_FIRST_ELEMENT (stmt_info))) + { + dataref_ptr = NULL_TREE; + } else if (diff_first_stmt_info) { dataref_ptr @@ -9501,6 +10180,63 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, /* Record that VEC_ARRAY is now dead. */ vect_clobber_variable (stmt_info, gsi, vec_array); } + else if (slp && is_a <bb_vec_info> (vinfo) + && STMT_VINFO_GROUPED_ACCESS (stmt_info) + && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info))) + { + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "vectorizable_load for slp transpose.\n"); + } + /* group_size: the size of group after merging. + group_size_b: the size of group before merging. + const_nunits: TYPE_VECTOR_SUBPARTS (vectype), it is the number of + elements in a vector. + nloads: const_nunits / group_size_b or 1, it means the number + of ARRAYs in a vector. + ncontinues: group_size_b / const_nunits or 1, it means the number + of vectors from an ARRAY. */ + unsigned int group_size_b = DR_GROUP_SIZE_TRANS (first_stmt_info); + unsigned int const_nunits = nunits.to_constant (); + unsigned int nloads = const_nunits; + unsigned int ncontinues = group_size_b; + tree lvectype = vectype; + tree ltype = calculate_new_type (vectype, const_nunits, + group_size_b, nloads, + ncontinues, lvectype); + bool this_load_permuted = false; + auto_vec<unsigned> group_idx; + generate_load_permutation (slp_node, new_group_size, group_size, + group_size_b, this_load_permuted, + group_idx, new_load_permutation); + slp_perm = is_slp_perm (slp_perm, this_load_permuted, nunits, + group_size, first_stmt_info); + + /* ncopies: the number of vectors that need to be loaded from + memmory. */ + unsigned int ncopies = new_group_size / const_nunits; + offset_info offset_info = {offset, byte_offset, dataref_offset}; + vectype_info vectype_info = {vectype, ltype, lvectype, ref_type}; + if (slp_perm) + { + dr_chain.create (ncopies); + } + if (nloads > 1 && ncontinues == 1) + { + new_vect_stmt_for_nloads (ncopies, nloads, group_idx, stmt_info, + &offset_info, &vectype_info, + memory_access_type, slp_perm, dr_chain, + slp_node, gsi); + } + else + { + new_vect_stmt_for_ncontinues (ncontinues, group_idx, stmt_info, + &offset_info, &vectype_info, + memory_access_type, slp_perm, + dr_chain, slp_node, gsi); + } + } else { for (i = 0; i < vec_num; i++) @@ -9840,7 +10576,32 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, if (slp && !slp_perm) continue; - if (slp_perm) + /* Using the new load permutation to generate vector permute statements + from a list of loads in DR_CHAIN. 
*/ + if (slp && slp_perm && is_a <bb_vec_info> (vinfo) + && STMT_VINFO_GROUPED_ACCESS (stmt_info) + && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info))) + { + unsigned n_perms; + stmt_vec_info stmt_info_ = SLP_TREE_SCALAR_STMTS (slp_node)[0]; + unsigned int old_size = DR_GROUP_SIZE (stmt_info); + DR_GROUP_SIZE (stmt_info_) = new_group_size; + vec<unsigned> old_load_permutation + = SLP_TREE_LOAD_PERMUTATION (slp_node); + SLP_TREE_LOAD_PERMUTATION (slp_node) = new_load_permutation; + bool perm_load_success = vect_transform_slp_perm_load ( + slp_node, dr_chain, gsi, vf, + slp_node_instance, false, &n_perms); + DR_GROUP_SIZE (stmt_info_) = old_size; + SLP_TREE_LOAD_PERMUTATION (slp_node) = old_load_permutation; + new_load_permutation.release (); + if (!perm_load_success) + { + dr_chain.release (); + return false; + } + } + else if (slp_perm) { unsigned n_perms; if (!vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf, diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index f7becb34a..1c4a6c421 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -297,6 +297,21 @@ public: vec<ddr_p> ddrs; }; +/* Information about offset in vectorizable_load. */ +struct offset_info { + tree offset; + tree byte_offset; + tree dataref_offset; +}; + +/* Information about vectype in vectorizable_load. */ +struct vectype_info { + tree vectype; + tree ltype; + tree lvectype; + tree ref_type; +}; + /* Vectorizer state common between loop and basic-block vectorization. */ class vec_info { public: @@ -335,6 +350,14 @@ public: stmt in the chain. */ auto_vec<stmt_vec_info> grouped_stores; + /* All interleaving chains of loads, represented by the first + stmt in the chain. */ + auto_vec<stmt_vec_info> grouped_loads; + + /* All interleaving chains of stores (before transposed), represented by all + stmt in the chain. */ + auto_vec<vec<stmt_vec_info> > scalar_stores; + /* Cost data used by the target cost model. */ void *target_cost_data; @@ -702,6 +725,8 @@ public: #define LOOP_VINFO_CHECK_NONZERO(L) (L)->check_nonzero #define LOOP_VINFO_LOWER_BOUNDS(L) (L)->lower_bounds #define LOOP_VINFO_GROUPED_STORES(L) (L)->grouped_stores +#define LOOP_VINFO_GROUPED_LOADS(L) (L)->grouped_loads +#define LOOP_VINFO_SCALAR_STORES(L) (L)->scalar_stores #define LOOP_VINFO_SLP_INSTANCES(L) (L)->slp_instances #define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor #define LOOP_VINFO_REDUCTIONS(L) (L)->reductions @@ -764,6 +789,25 @@ public: basic_block bb; gimple_stmt_iterator region_begin; gimple_stmt_iterator region_end; + + /* True, if bb_vinfo can goto vect_analyze_slp. */ + bool before_slp; + + /* True, if bb_vinfo is a transposed version. */ + bool transposed; + + /* The number of transposed groups. */ + int transposed_group; + + /* The cost of the scalar iterations. */ + int scalar_cost; + + /* The cost of the vector prologue and epilogue, including peeled + iterations and set-up code. */ + int vec_outside_cost; + + /* The cost of the vector loop body. 
*/ + int vec_inside_cost; } *bb_vec_info; #define BB_VINFO_BB(B) (B)->bb @@ -772,6 +816,14 @@ public: #define BB_VINFO_DATAREFS(B) (B)->shared->datarefs #define BB_VINFO_DDRS(B) (B)->shared->ddrs #define BB_VINFO_TARGET_COST_DATA(B) (B)->target_cost_data +#define BB_VINFO_GROUPED_LOADS(B) (B)->grouped_loads +#define BB_VINFO_SCALAR_STORES(B) (B)->scalar_stores +#define BB_VINFO_VEC_OUTSIDE_COST(B) (B)->vec_outside_cost +#define BB_VINFO_VEC_INSIDE_COST(B) (B)->vec_inside_cost +#define BB_VINFO_SCALAR_COST(B) (B)->scalar_cost +#define BB_VINFO_SLP_TRANSPOSED(B) (B)->transposed +#define BB_VINFO_BEFORE_SLP(B) (B)->before_slp +#define BB_VINFO_TRANS_GROUPS(B) (B)->transposed_group static inline bb_vec_info vec_info_for_bb (basic_block bb) @@ -1012,6 +1064,17 @@ public: stmt_vec_info next_element; /* The size of the group. */ unsigned int size; + + /* The size of the group before transposed. */ + unsigned int size_before_transpose; + + /* If true, the stmt_info is slp transposed. */ + bool slp_transpose; + + /* Mark the group store number for rebuild interleaving chain + during transpose phase. Value -1 represents unable to transpose. */ + int group_number; + /* For stores, number of stores from this group seen. We vectorize the last one. */ unsigned int store_count; @@ -1019,6 +1082,9 @@ public: is 1. */ unsigned int gap; + /* The gap before transposed. */ + unsigned int gap_before_transpose; + /* The minimum negative dependence distance this stmt participates in or zero if none. */ unsigned int min_neg_dist; @@ -1217,6 +1283,12 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo) #define STMT_VINFO_REDUC_VECTYPE_IN(S) (S)->reduc_vectype_in #define STMT_VINFO_SLP_VECT_ONLY(S) (S)->slp_vect_only_p +#define DR_GROUP_SLP_TRANSPOSE(S) \ + (gcc_checking_assert ((S)->dr_aux.dr), (S)->slp_transpose) +#define DR_GROUP_SIZE_TRANS(S) \ + (gcc_checking_assert ((S)->dr_aux.dr), (S)->size_before_transpose) +#define DR_GROUP_NUMBER(S) \ + (gcc_checking_assert ((S)->dr_aux.dr), (S)->group_number) #define DR_GROUP_FIRST_ELEMENT(S) \ (gcc_checking_assert ((S)->dr_aux.dr), (S)->first_element) #define DR_GROUP_NEXT_ELEMENT(S) \ @@ -1227,6 +1299,8 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo) (gcc_checking_assert ((S)->dr_aux.dr), (S)->store_count) #define DR_GROUP_GAP(S) \ (gcc_checking_assert ((S)->dr_aux.dr), (S)->gap) +#define DR_GROUP_GAP_TRANS(S) \ + (gcc_checking_assert ((S)->dr_aux.dr), (S)->gap_before_transpose) #define REDUC_GROUP_FIRST_ELEMENT(S) \ (gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element) @@ -1624,6 +1698,17 @@ vect_get_scalar_dr_size (dr_vec_info *dr_info) return tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_info->dr)))); } +/* Compare two unsigned int A and B. + Sorting them in ascending order. */ + +static inline int +cmp_for_group_num (const void *a_, const void *b_) +{ + unsigned int a = *(unsigned int *)const_cast<void *>(a_); + unsigned int b = *(unsigned int *)const_cast<void *>(b_); + return a < b ? -1 : 1; +} + /* Return true if LOOP_VINFO requires a runtime check for whether the vector loop is profitable. 
*/ @@ -1787,6 +1872,9 @@ extern bool vect_grouped_load_supported (tree, bool, unsigned HOST_WIDE_INT); extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool); extern void vect_permute_store_chain (vec<tree> ,unsigned int, stmt_vec_info, gimple_stmt_iterator *, vec<tree> *); +extern void vect_transpose_store_chain (vec<tree>, unsigned int, unsigned int, + stmt_vec_info, gimple_stmt_iterator *, + vec<tree> *); extern tree vect_setup_realignment (stmt_vec_info, gimple_stmt_iterator *, tree *, enum dr_alignment_support, tree, class loop **); @@ -1849,6 +1937,7 @@ extern void vect_free_slp_instance (slp_instance, bool); extern bool vect_transform_slp_perm_load (slp_tree, vec<tree> , gimple_stmt_iterator *, poly_uint64, slp_instance, bool, unsigned *); +extern void vect_transform_back_slp_grouped_stores (bb_vec_info, stmt_vec_info); extern bool vect_slp_analyze_operations (vec_info *); extern void vect_schedule_slp (vec_info *); extern opt_result vect_analyze_slp (vec_info *, unsigned); -- 2.27.0.windows.1
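For readers tracing the load side of the patch, the remapping performed by generate_new_load_permutation_mapping above can be illustrated outside of GCC with a small standalone sketch (not part of the patch; remap_groups, cmp_unsigned, load_perm and group_from are hypothetical names, and only group_size_b mirrors a quantity used in the patch). Each scalar load index is attributed to its source group (index / group_size_b), and the distinct group ids are then renumbered densely in ascending order.

/* Standalone sketch, assuming a plain C environment; compile with any C
   compiler and run.  It is an illustration of the dense-renumbering idea,
   not the GCC implementation.  */
#include <stdio.h>
#include <stdlib.h>

static int
cmp_unsigned (const void *a, const void *b)
{
  unsigned x = *(const unsigned *) a;
  unsigned y = *(const unsigned *) b;
  return x < y ? -1 : (x > y ? 1 : 0);
}

/* Map each entry of LOAD_PERM (N elements) to a dense group id, assuming
   every original group holds GROUP_SIZE_B consecutive lanes.  */
static void
remap_groups (const unsigned *load_perm, unsigned n,
	      unsigned group_size_b, unsigned *group_from)
{
  unsigned *groups = malloc (n * sizeof *groups);
  unsigned n_groups = 0;

  /* Collect the distinct source-group ids and record, per lane, which
     source group it came from.  */
  for (unsigned i = 0; i < n; i++)
    {
      unsigned g = load_perm[i] / group_size_b;
      unsigned seen = 0;
      for (unsigned k = 0; k < n_groups; k++)
	if (groups[k] == g)
	  {
	    seen = 1;
	    break;
	  }
      if (!seen)
	groups[n_groups++] = g;
      group_from[i] = g;
    }

  /* Sort the distinct ids and replace each original id by its rank,
     yielding a dense numbering 0, 1, 2, ...  */
  qsort (groups, n_groups, sizeof *groups, cmp_unsigned);
  for (unsigned i = 0; i < n; i++)
    for (unsigned k = 0; k < n_groups; k++)
      if (group_from[i] == groups[k])
	{
	  group_from[i] = k;
	  break;
	}
  free (groups);
}

int
main (void)
{
  /* Lanes drawn from source groups 1, 3, 5 and 7 (group_size_b == 2).  */
  unsigned load_perm[] = { 2, 3, 6, 7, 10, 11, 14, 15 };
  unsigned group_from[8];
  remap_groups (load_perm, 8, 2, group_from);
  for (unsigned i = 0; i < 8; i++)
    printf ("%u ", group_from[i]);   /* prints: 0 0 1 1 2 2 3 3 */
  printf ("\n");
  return 0;
}

In the sketch the group ids {1, 1, 3, 3, 5, 5, 7, 7} collapse to the dense ids {0, 0, 1, 1, 2, 2, 3, 3}; the patch then rebuilds the load permutation relative to these dense ids, which is what lets it reuse vect_transform_slp_perm_load with a temporarily adjusted DR_GROUP_SIZE.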