Projects
Mega:24.03
gcc
Sign Up
Log In
Username
Password
We truncated the diff of some files because they were too big. If you want to see the full diff for every file,
click here
.
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
Expand all
Collapse all
Changes of Revision 4
View file
_service:tar_scm:gcc.spec
Changed
@@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 19 +%global gcc_release 22 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -166,6 +166,33 @@ Patch26: 0026-GOMP-Enabling-moutline-atomics-improves-libgomp-perf.patch Patch27: 0027-LoopElim-Redundant-loop-elimination-optimization.patch Patch28: 0028-Array-widen-compare-Fix-the-return-value-match-after.patch +Patch29: 0029-Struct-Reorg-Add-Safe-Structure-Pointer-Compression.patch +Patch30: 0030-Struct-Reorg-Add-unsafe-structure-pointer-compressio.patch +Patch31: 0031-AutoBOLT-Support-saving-feedback-count-info-to-ELF-s.patch +Patch32: 0032-AutoBOLT-Add-bolt-linker-plugin-2-3.patch +Patch33: 0033-AutoBOLT-Enable-BOLT-linker-plugin-on-aarch64-3-3.patch +Patch34: 0034-Autofdo-Enable-discrimibator-and-MCF-algorithm-on-Au.patch +Patch35: 0035-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch +Patch36: 0036-rtl-ifcvt-introduce-rtl-ifcvt-enchancements.patch +Patch37: 0037-Perform-early-if-conversion-of-simple-arithmetic.patch +Patch38: 0038-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch +Patch39: 0039-Match-double-sized-mul-pattern.patch +Patch40: 0040-Port-icp-patch-to-GCC-12.patch +Patch41: 0041-Port-fixes-in-icp-to-GCC-12.patch +Patch42: 0042-Add-split-complex-instructions-pass.patch +Patch43: 0043-Extending-and-refactoring-of-pass_split_complex_inst.patch +Patch44: 0044-Port-maxmin-patch-to-GCC-12.patch +Patch45: 0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch +Patch46: 0046-Add-new-pattern-to-pass-the-maxmin-tests.patch +Patch47: 0047-AES-Implement-AES-pattern-matching.patch +Patch48: 0048-crypto-accel-add-optimization-level-requirement-to-t.patch +Patch49: 0049-Add-more-flexible-check-for-pointer-aliasing-during-.patch +Patch50: 0050-Port-IPA-prefetch-to-GCC-12.patch +Patch51: 
0051-Port-fixes-for-IPA-prefetch-to-GCC-12.patch +Patch52: 0052-Fix-fails-in-IPA-prefetch-src-openEuler-gcc-I96ID7.patch +Patch53: 0053-struct-reorg-Add-Semi-Relayout.patch +Patch54: 0054-Struct-Reorg-Bugfix-for-structure-pointer-compressio.patch +Patch55: 0055-Struct-Reorg-Port-bugfixes-to-GCC-12.3.1.patch # Part 3000 ~ 4999 %ifarch loongarch64 @@ -789,6 +816,33 @@ %patch26 -p1 %patch27 -p1 %patch28 -p1 +%patch29 -p1 +%patch30 -p1 +%patch31 -p1 +%patch32 -p1 +%patch33 -p1 +%patch34 -p1 +%patch35 -p1 +%patch36 -p1 +%patch37 -p1 +%patch38 -p1 +%patch39 -p1 +%patch40 -p1 +%patch41 -p1 +%patch42 -p1 +%patch43 -p1 +%patch44 -p1 +%patch45 -p1 +%patch46 -p1 +%patch47 -p1 +%patch48 -p1 +%patch49 -p1 +%patch50 -p1 +%patch51 -p1 +%patch52 -p1 +%patch53 -p1 +%patch54 -p1 +%patch55 -p1 %ifarch loongarch64 %patch3001 -p1 @@ -3174,6 +3228,18 @@ %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Fri Apr 12 2024 Zhengchen Hui <zhengchenhui1@huawei.com> - 12.3.1-22 +- Type: Sync +- DESC: Sync patch from openeuler/gcc + +* Thu Apr 11 2024 Zhengchen Hui <zhengchenhui1@huawei.com> - 12.3.1-21 +- Type: Sync +- DESC: Sync patch from openeuler/gcc + +* Thu Apr 11 2024 Zhenyu Zhao <zhaozhenyu17@huawei.com> - 12.3.1-20 +- Type: Sync +- DESC: Sync patch from openeuler/gcc + * Mon Apr 1 2024 Peng Fan <fanpeng@loongson.cn> 12.3.1-19 - Type: SPEC - DESC: fix libcc1 file path for LoongArch.
View file
_service:tar_scm:0029-Struct-Reorg-Add-Safe-Structure-Pointer-Compression.patch
Added
@@ -0,0 +1,1191 @@ +From 7930d75c9fd3f36cc2dce934569f00c71248bb31 Mon Sep 17 00:00:00 2001 +From: liyancheng <412998149@qq.com> +Date: Sat, 25 Nov 2023 10:28:48 +0800 +Subject: PATCH Struct Reorg Add Safe Structure Pointer Compression + +Safe structure pointer compression allows safely transfer pointers +stored in structure into the index of structure array with smaller +type to reduce the size of structure. +Add flag -fipa-struct-reorg=4 to enable safe structure pointer +compression. +Add param compressed-pointer-size=8,16,32 to control the compressed +pointer size. +--- + gcc/common.opt | 5 +- + gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 908 ++++++++++++++++++++++- + gcc/ipa-struct-reorg/ipa-struct-reorg.h | 4 + + gcc/params.opt | 4 + + 4 files changed, 882 insertions(+), 39 deletions(-) + +diff --git a/gcc/common.opt b/gcc/common.opt +index b01df919e..f6e20c1e8 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1993,8 +1993,9 @@ Common Var(flag_ipa_struct_reorg) Init(0) Optimization + Perform structure layout optimizations. + + fipa-struct-reorg= +-Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 3) +--fipa-struct-reorg=0,1,2,3 adding none, struct-reorg, reorder-fields, dfe optimizations. ++Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 4) ++-fipa-struct-reorg=0,1,2,3,4 adding none, struct-reorg, reorder-fields, ++dfe, safe-pointer-compression optimizations. + + fipa-vrp + Common Var(flag_ipa_vrp) Optimization +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc +index dcc6df496..5d451c4c8 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc +@@ -89,6 +89,7 @@ along with GCC; see the file COPYING3. If not see + #include "gimple-iterator.h" + #include "gimple-walk.h" + #include "cfg.h" ++#include "cfghooks.h" /* For split_block. 
*/ + #include "ssa.h" + #include "tree-dfa.h" + #include "fold-const.h" +@@ -147,7 +148,27 @@ using namespace struct_relayout; + #define VOID_POINTER_P(type) \ + (POINTER_TYPE_P (type) && VOID_TYPE_P (TREE_TYPE (type))) + +-/* Return true iff TYPE is stdarg va_list type. */ ++static void ++set_var_attributes (tree var) ++{ ++ if (!var) ++ return; ++ gcc_assert (TREE_CODE (var) == VAR_DECL); ++ ++ DECL_ARTIFICIAL (var) = 1; ++ DECL_EXTERNAL (var) = 0; ++ TREE_STATIC (var) = 1; ++ TREE_PUBLIC (var) = 0; ++ TREE_USED (var) = 1; ++ DECL_CONTEXT (var) = NULL; ++ TREE_THIS_VOLATILE (var) = 0; ++ TREE_ADDRESSABLE (var) = 0; ++ TREE_READONLY (var) = 0; ++ if (is_global_var (var)) ++ set_decl_tls_model (var, TLS_MODEL_NONE); ++} ++ ++/* Return true if TYPE is stdarg va_list type. */ + + static inline bool + is_va_list_type (tree type) +@@ -271,9 +292,15 @@ enum struct_layout_opt_level + STRUCT_SPLIT = 1 << 0, + COMPLETE_STRUCT_RELAYOUT = 1 << 1, + STRUCT_REORDER_FIELDS = 1 << 2, +- DEAD_FIELD_ELIMINATION = 1 << 3 ++ DEAD_FIELD_ELIMINATION = 1 << 3, ++ POINTER_COMPRESSION_SAFE = 1 << 4 + }; + ++/* Defines the target pointer size of compressed pointer, which should be 8, ++ 16, 32. */ ++ ++static int compressed_size = 32; ++ + static bool is_result_of_mult (tree arg, tree *num, tree struct_size); + static bool isptrptr (tree type); + void get_base (tree &base, tree expr); +@@ -394,7 +421,10 @@ srtype::srtype (tree type) + : type (type), + chain_type (false), + escapes (does_not_escape), ++ pc_gptr (NULL_TREE), + visited (false), ++ pc_candidate (false), ++ has_legal_alloc_num (false), + has_alloc_array (0) + { + for (int i = 0; i < max_split; i++) +@@ -476,6 +506,31 @@ srtype::mark_escape (escape_type e, gimple *stmt) + } + } + ++/* Create a global header for compressed struct. 
*/ ++ ++void ++srtype::create_global_ptr_for_pc () ++{ ++ if (!pc_candidate || pc_gptr != NULL_TREE) ++ return; ++ ++ const char *type_name = get_type_name (type); ++ gcc_assert (type_name != NULL); ++ ++ char *gptr_name = concat (type_name, "_pc", NULL); ++ tree new_name = get_identifier (gptr_name); ++ tree new_type = build_pointer_type (newtype0); ++ tree new_var = build_decl (UNKNOWN_LOCATION, VAR_DECL, new_name, new_type); ++ set_var_attributes (new_var); ++ pc_gptr = new_var; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nType: %s has create global header for pointer" ++ " compression: %s\n", type_name, gptr_name); ++ ++ free (gptr_name); ++} ++ + /* Add FIELD to the list of fields that use this type. */ + + void +@@ -798,15 +853,31 @@ srfield::create_new_reorder_fields (tree newtypemax_split, + fields.safe_push (field); + } + +- DECL_NAME (field) = DECL_NAME (fielddecl); + if (type == NULL) +- /* Common members do not need to reconstruct. ++ { ++ DECL_NAME (field) = DECL_NAME (fielddecl); ++ /* Common members do not need to reconstruct. + Otherwise, int* -> int** or void* -> void**. 
*/ +- TREE_TYPE (field) = nt; ++ TREE_TYPE (field) = nt; ++ SET_DECL_ALIGN (field, DECL_ALIGN (fielddecl)); ++ } ++ else if (type->pc_candidate) ++ { ++ const char *old_name = IDENTIFIER_POINTER (DECL_NAME (fielddecl)); ++ char *new_name = concat (old_name, "_pc", NULL); ++ DECL_NAME (field) = get_identifier (new_name); ++ free (new_name); ++ TREE_TYPE (field) = make_unsigned_type (compressed_size); ++ SET_DECL_ALIGN (field, compressed_size); ++ } + else +- TREE_TYPE (field) = reconstruct_complex_type (TREE_TYPE (fielddecl), nt); ++ { ++ TREE_TYPE (field) = reconstruct_complex_type (TREE_TYPE (fielddecl), nt); ++ DECL_NAME (field) = DECL_NAME (fielddecl); ++ SET_DECL_ALIGN (field, DECL_ALIGN (fielddecl)); ++ } ++ + DECL_SOURCE_LOCATION (field) = DECL_SOURCE_LOCATION (fielddecl); +- SET_DECL_ALIGN (field, DECL_ALIGN (fielddecl)); + DECL_USER_ALIGN (field) = DECL_USER_ALIGN (fielddecl); + TREE_ADDRESSABLE (field) = TREE_ADDRESSABLE (fielddecl); + DECL_NONADDRESSABLE_P (field) = !TREE_ADDRESSABLE (fielddecl); +@@ -925,6 +996,10 @@ srtype::create_new_type (void) + && has_dead_field ()) + fprintf (dump_file, "Dead field elimination.\n"); + } ++ ++ if (pc_candidate && pc_gptr == NULL_TREE) ++ create_global_ptr_for_pc (); ++ + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Created %d types:\n", maxclusters); +@@ -1338,6 +1413,30 @@ public: + + unsigned execute_struct_relayout (void); + bool remove_dead_field_stmt (tree lhs); ++ ++ // Pointer compression methods: ++ void check_and_prune_struct_for_pointer_compression (void); ++ void try_rewrite_with_pointer_compression (gassign *, gimple_stmt_iterator *, ++ tree, tree, tree &, tree &); ++ bool safe_void_cmp_p (tree, srtype *); ++ bool pc_candidate_st_type_p (tree); ++ bool pc_candidate_tree_p (tree); ++ bool pc_type_conversion_candidate_p (tree); ++ bool pc_direct_rewrite_chance_p (tree, tree &); ++ bool compress_candidate_with_check (gimple_stmt_iterator *, tree, tree &);
View file
_service:tar_scm:0030-Struct-Reorg-Add-unsafe-structure-pointer-compressio.patch
Added
@@ -0,0 +1,1232 @@ +From 82d6166cd29fb1c3474f29b28cb7e5478d3a551a Mon Sep 17 00:00:00 2001 +From: liyancheng <412998149@qq.com> +Date: Mon, 25 Dec 2023 11:17:04 +0800 +Subject: PATCH Struct Reorg Add unsafe structure pointer compression + +Unsafe structure pointer compression allows for some dangerous +conversions for better performance. +Add flag -fipa-struct-reorg=5 to enable unsafe structure pointer +compression. +--- + gcc/common.opt | 6 +- + gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 365 ++++++++++++++---- + gcc/symbol-summary.h | 22 +- + .../gcc.dg/struct/csr_skip_void_struct_name.c | 53 +++ + gcc/testsuite/gcc.dg/struct/pc_cast_int.c | 91 +++++ + .../gcc.dg/struct/pc_compress_and_decomress.c | 90 +++++ + gcc/testsuite/gcc.dg/struct/pc_ptr2void.c | 87 +++++ + .../gcc.dg/struct/pc_simple_rewrite_pc.c | 112 ++++++ + .../gcc.dg/struct/pc_skip_void_struct_name.c | 53 +++ + gcc/testsuite/gcc.dg/struct/struct-reorg.exp | 8 + + 10 files changed, 804 insertions(+), 83 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/struct/csr_skip_void_struct_name.c + create mode 100644 gcc/testsuite/gcc.dg/struct/pc_cast_int.c + create mode 100644 gcc/testsuite/gcc.dg/struct/pc_compress_and_decomress.c + create mode 100644 gcc/testsuite/gcc.dg/struct/pc_ptr2void.c + create mode 100644 gcc/testsuite/gcc.dg/struct/pc_simple_rewrite_pc.c + create mode 100644 gcc/testsuite/gcc.dg/struct/pc_skip_void_struct_name.c + +diff --git a/gcc/common.opt b/gcc/common.opt +index 56b547506..c7c6bc256 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1993,9 +1993,9 @@ Common Var(flag_ipa_struct_reorg) Init(0) Optimization + Perform structure layout optimizations. + + fipa-struct-reorg= +-Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 4) +--fipa-struct-reorg=0,1,2,3,4 adding none, struct-reorg, reorder-fields, +-dfe, safe-pointer-compression optimizations. 
++Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 5) ++-fipa-struct-reorg=0,1,2,3,4,5 adding none, struct-reorg, reorder-fields, ++dfe, safe-pointer-compression, unsafe-pointer-compression optimizations. + + fipa-vrp + Common Var(flag_ipa_vrp) Optimization +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc +index 5d451c4c8..fa33f2d35 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc +@@ -293,7 +293,8 @@ enum struct_layout_opt_level + COMPLETE_STRUCT_RELAYOUT = 1 << 1, + STRUCT_REORDER_FIELDS = 1 << 2, + DEAD_FIELD_ELIMINATION = 1 << 3, +- POINTER_COMPRESSION_SAFE = 1 << 4 ++ POINTER_COMPRESSION_SAFE = 1 << 4, ++ POINTER_COMPRESSION_UNSAFE = 1 << 5 + }; + + /* Defines the target pointer size of compressed pointer, which should be 8, +@@ -1267,10 +1268,10 @@ csrtype::init_type_info (void) + + /* Close enough to pad to improve performance. + 33~63 should pad to 64 but 33~48 (first half) are too far away, and +- 65~127 should pad to 128 but 65~96 (first half) are too far away. */ ++ 70~127 should pad to 128 but 65~70 (first half) are too far away. 
*/ + if (old_size > 48 && old_size < 64) + new_size = 64; +- if (old_size > 96 && old_size < 128) ++ if (old_size > 70 && old_size < 128) + new_size = 128; + + /* For performance reasons, only allow structure size +@@ -1423,8 +1424,12 @@ public: + bool pc_candidate_tree_p (tree); + bool pc_type_conversion_candidate_p (tree); + bool pc_direct_rewrite_chance_p (tree, tree &); ++ bool pc_simplify_chance_for_compress_p (gassign *, tree); ++ bool compress_candidate_without_check (gimple_stmt_iterator *, tree, tree &); + bool compress_candidate_with_check (gimple_stmt_iterator *, tree, tree &); + bool compress_candidate (gassign *, gimple_stmt_iterator *, tree, tree &); ++ bool decompress_candidate_without_check (gimple_stmt_iterator *, ++ tree, tree, tree &, tree &); + bool decompress_candidate_with_check (gimple_stmt_iterator *, tree, tree &); + bool decompress_candidate (gimple_stmt_iterator *, tree, tree, tree &, + tree &); +@@ -1924,7 +1929,6 @@ bool + ipa_struct_relayout::maybe_rewrite_cst (tree cst, gimple_stmt_iterator *gsi, + HOST_WIDE_INT ×) + { +- bool ret = false; + gcc_assert (TREE_CODE (cst) == INTEGER_CST); + + gimple *stmt = gsi_stmt (*gsi); +@@ -1948,27 +1952,95 @@ ipa_struct_relayout::maybe_rewrite_cst (tree cst, gimple_stmt_iterator *gsi, + { + if (gsi_one_before_end_p (*gsi)) + return false; +- gsi_next (gsi); +- gimple *stmt2 = gsi_stmt (*gsi); +- +- if (gimple_code (stmt2) == GIMPLE_ASSIGN +- && gimple_assign_rhs_code (stmt2) == POINTER_PLUS_EXPR) ++ // Check uses. 
++ imm_use_iterator imm_iter_lhs; ++ use_operand_p use_p_lhs; ++ FOR_EACH_IMM_USE_FAST (use_p_lhs, imm_iter_lhs, gimple_assign_lhs (stmt)) + { +- tree lhs = gimple_assign_lhs (stmt2); +- tree rhs1 = gimple_assign_rhs1 (stmt2); +- if (types_compatible_p (inner_type (TREE_TYPE (rhs1)), ctype.type) +- || types_compatible_p (inner_type (TREE_TYPE (lhs)), ctype.type)) ++ gimple *stmt2 = USE_STMT (use_p_lhs); ++ if (gimple_code (stmt2) != GIMPLE_ASSIGN) ++ continue; ++ if (gimple_assign_rhs_code (stmt2) == POINTER_PLUS_EXPR) + { +- tree num = NULL; +- if (is_result_of_mult (cst, &num, TYPE_SIZE_UNIT (ctype.type))) ++ tree lhs = gimple_assign_lhs (stmt2); ++ tree rhs1 = gimple_assign_rhs1 (stmt2); ++ if (types_compatible_p (inner_type (TREE_TYPE (rhs1)), ctype.type) ++ || types_compatible_p (inner_type (TREE_TYPE (lhs)), ++ ctype.type)) + { +- times = TREE_INT_CST_LOW (num); +- ret = true; ++ tree num = NULL; ++ if (is_result_of_mult (cst, &num, ++ TYPE_SIZE_UNIT (ctype.type))) ++ { ++ times = TREE_INT_CST_LOW (num); ++ return true; ++ } ++ } ++ } ++ // For pointer compression, handle plus stmt. ++ else if (gimple_assign_rhs_code (stmt2) == PLUS_EXPR) ++ { ++ // Check uses. ++ imm_use_iterator imm_iter_cast; ++ use_operand_p use_p_cast; ++ FOR_EACH_IMM_USE_FAST (use_p_cast, imm_iter_cast, ++ gimple_assign_lhs (stmt2)) ++ { ++ gimple *stmt_cast = USE_STMT (use_p_cast); ++ if (gimple_code (stmt_cast) != GIMPLE_ASSIGN) ++ continue; ++ if (gimple_assign_cast_p (stmt_cast)) ++ { ++ tree lhs_type = inner_type (TREE_TYPE ( ++ gimple_assign_lhs (stmt_cast))); ++ if (types_compatible_p (lhs_type, ctype.type)) ++ { ++ tree num = NULL; ++ if (is_result_of_mult (cst, &num, ++ TYPE_SIZE_UNIT (ctype.type))) ++ { ++ times = TREE_INT_CST_LOW (num); ++ return true; ++ } ++ } ++ } + } + } + } +- gsi_prev (gsi); +- return ret; ++ } ++ // For pointer compression, handle div stmt. 
++ if (gimple_assign_rhs_code (stmt) == TRUNC_DIV_EXPR) ++ { ++ imm_use_iterator imm_iter; ++ use_operand_p use_p; ++ tree lhs = gimple_assign_lhs (stmt); ++ if (lhs == NULL_TREE) ++ return false; ++ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs) ++ { ++ gimple *use_stmt = USE_STMT (use_p); ++ if (is_gimple_debug (use_stmt)) ++ continue; ++ if (gimple_code (use_stmt) != GIMPLE_ASSIGN) ++ continue; ++ if (gimple_assign_cast_p (use_stmt)) ++ { ++ tree lhs_type = inner_type (TREE_TYPE ( ++ gimple_assign_lhs (use_stmt))); ++ if (TYPE_UNSIGNED (lhs_type) ++ && TREE_CODE (lhs_type) == INTEGER_TYPE ++ && TYPE_PRECISION (lhs_type) == compressed_size) ++ { ++ tree num = NULL; ++ if (is_result_of_mult (cst, &num, ++ TYPE_SIZE_UNIT (ctype.type))) ++ { ++ times = TREE_INT_CST_LOW (num); ++ return true; ++ } ++ }
View file
_service:tar_scm:0031-AutoBOLT-Support-saving-feedback-count-info-to-ELF-s.patch
Added
@@ -0,0 +1,550 @@ +From 72531376df5ed93c2d945469368ba5514eca8407 Mon Sep 17 00:00:00 2001 +From: zhenyu--zhao_admin <zhaozhenyu17@huawei.com> +Date: Tue, 5 Dec 2023 15:33:08 +0800 +Subject: PATCH AutoBOLT Support saving feedback count info to ELF segment + 1/3 + +--- + gcc/common.opt | 8 + + gcc/final.cc | 405 ++++++++++++++++++++++++++++++++++++++++++++++++- + gcc/opts.cc | 61 ++++++++ + 3 files changed, 473 insertions(+), 1 deletion(-) + +diff --git a/gcc/common.opt b/gcc/common.opt +index b01df919e..e69947fc2 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -2546,6 +2546,14 @@ freorder-functions + Common Var(flag_reorder_functions) Optimization + Reorder functions to improve code placement. + ++fauto-bolt ++Common Var(flag_auto_bolt) ++Generate profile from AutoFDO or PGO and do BOLT optimization after linkage. ++ ++fauto-bolt= ++Common Joined RejectNegative ++Specify the feedback data directory required by BOLT-plugin. The default is the current directory. ++ + frerun-cse-after-loop + Common Var(flag_rerun_cse_after_loop) Optimization + Add a common subexpression elimination pass after loop optimizations. +diff --git a/gcc/final.cc b/gcc/final.cc +index a9868861b..d4c4fa08f 100644 +--- a/gcc/final.cc ++++ b/gcc/final.cc +@@ -81,6 +81,7 @@ along with GCC; see the file COPYING3. If not see + #include "rtl-iter.h" + #include "print-rtl.h" + #include "function-abi.h" ++#include "insn-codes.h" + #include "common/common-target.h" + + #ifdef XCOFF_DEBUGGING_INFO +@@ -4266,7 +4267,403 @@ leaf_renumber_regs_insn (rtx in_rtx) + } + } + #endif +- ++ ++#define ASM_FDO_SECTION_PREFIX ".text.fdo." ++ ++#define ASM_FDO_CALLER_FLAG ".fdo.caller " ++#define ASM_FDO_CALLER_SIZE_FLAG ".fdo.caller.size " ++#define ASM_FDO_CALLER_BIND_FLAG ".fdo.caller.bind" ++ ++#define ASM_FDO_CALLEE_FLAG ".fdo.callee" ++ ++/* Return the relative offset address of the start instruction of BB, ++ return -1 if it is empty instruction. 
*/ ++ ++static int ++get_bb_start_addr (basic_block bb) ++{ ++ rtx_insn *insn; ++ FOR_BB_INSNS (bb, insn) ++ { ++ if (!INSN_P (insn)) ++ { ++ continue; ++ } ++ /* The jump target of call is not in this function, so ++ it should be excluded. */ ++ if (CALL_P (insn)) ++ { ++ return -1; ++ } ++ ++ int insn_code = recog_memoized (insn); ++ ++ /* The instruction NOP in llvm-bolt belongs to the previous ++ BB, so it needs to be skipped. */ ++ if (insn_code != CODE_FOR_nop) ++ { ++ return INSN_ADDRESSES (INSN_UID (insn)); ++ } ++ } ++ return -1; ++} ++ ++/* Return the relative offet address of the end instruction of BB, ++ return -1 if it is empty or call instruction. */ ++ ++static int ++get_bb_end_addr (basic_block bb) ++{ ++ rtx_insn *insn; ++ int num_succs = EDGE_COUNT (bb->succs); ++ FOR_BB_INSNS_REVERSE (bb, insn) ++ { ++ if (!INSN_P (insn)) ++ { ++ continue; ++ } ++ /* The jump target of call is not in this function, so ++ it should be excluded. */ ++ if (CALL_P (insn)) ++ { ++ return -1; ++ } ++ if ((num_succs == 1) ++ || ((num_succs == 2) && any_condjump_p (insn))) ++ { ++ return INSN_ADDRESSES (INSN_UID (insn)); ++ } ++ else ++ { ++ return -1; ++ } ++ } ++ return -1; ++} ++ ++/* Return the end address of cfun. */ ++ ++static int ++get_function_end_addr () ++{ ++ rtx_insn *insn = get_last_insn (); ++ for (; insn != get_insns (); insn = PREV_INSN (insn)) ++ { ++ if (!INSN_P (insn)) ++ { ++ continue; ++ } ++ return INSN_ADDRESSES (INSN_UID (insn)); ++ } ++ ++ return -1; ++} ++ ++/* Return the function profile status string. */ ++ ++static const char * ++get_function_profile_status () ++{ ++ const char *profile_status = { ++ "PROFILE_ABSENT", ++ "PROFILE_GUESSED", ++ "PROFILE_READ", ++ "PROFILE_LAST" /* Last value, used by profile streaming. */ ++ }; ++ ++ return profile_statusprofile_status_for_fn (cfun); ++} ++ ++/* Return the count from the feedback data, such as PGO or ADDO. 
*/ ++ ++inline static gcov_type ++get_fdo_count (profile_count count) ++{ ++ return count.quality () >= GUESSED ++ ? count.to_gcov_type () : 0; ++} ++ ++/* Return the profile quality string. */ ++ ++static const char * ++get_fdo_count_quality (profile_count count) ++{ ++ const char *profile_quality = { ++ "UNINITIALIZED_PROFILE", ++ "GUESSED_LOCAL", ++ "GUESSED_GLOBAL0", ++ "GUESSED_GLOBAL0_ADJUSTED", ++ "GUESSED", ++ "AFDO", ++ "ADJUSTED", ++ "PRECISE" ++ }; ++ ++ return profile_qualitycount.quality (); ++} ++ ++static const char * ++alias_local_functions (const char *fnname) ++{ ++ if (TREE_PUBLIC (cfun->decl)) ++ { ++ return fnname; ++ } ++ return concat (fnname, "/", lbasename (dump_base_name), NULL); ++} ++ ++/* Return function bind type string. */ ++ ++static const char * ++simple_get_function_bind () ++{ ++ const char *function_bind = {
View file
_service:tar_scm:0032-AutoBOLT-Add-bolt-linker-plugin-2-3.patch
Added
@@ -0,0 +1,34094 @@ +From 82f9f48406955a6150def998b69b4eace4bd51eb Mon Sep 17 00:00:00 2001 +From: zhenyu--zhao_admin <zhaozhenyu17@huawei.com> +Date: Thu, 7 Dec 2023 11:43:08 +0800 +Subject: PATCH AutoBOLT Add bolt linker plugin 2/3 + +--- + bolt-plugin/Makefile | 675 ++ + bolt-plugin/Makefile.am | 43 + + bolt-plugin/Makefile.in | 675 ++ + bolt-plugin/aclocal.m4 | 10250 +++++++++++++++++ + bolt-plugin/bolt-plugin.cc | 1153 ++ + bolt-plugin/config.h.in | 179 + + bolt-plugin/configure | 20909 +++++++++++++++++++++++++++++++++++ + bolt-plugin/configure.ac | 60 + + gcc/common.opt | 16 + + gcc/opts.cc | 27 +- + 10 files changed, 33985 insertions(+), 2 deletions(-) + create mode 100644 bolt-plugin/Makefile + create mode 100644 bolt-plugin/Makefile.am + create mode 100644 bolt-plugin/Makefile.in + create mode 100644 bolt-plugin/aclocal.m4 + create mode 100644 bolt-plugin/bolt-plugin.cc + create mode 100644 bolt-plugin/config.h.in + create mode 100755 bolt-plugin/configure + create mode 100644 bolt-plugin/configure.ac + +diff --git a/bolt-plugin/Makefile b/bolt-plugin/Makefile +new file mode 100644 +index 000000000..82a4bc2c6 +--- /dev/null ++++ b/bolt-plugin/Makefile +@@ -0,0 +1,675 @@ ++# Makefile.in generated by automake 1.16.5 from Makefile.am. ++# Makefile. Generated from Makefile.in by configure. ++ ++# Copyright (C) 1994-2021 Free Software Foundation, Inc. ++ ++# This Makefile.in is free software; the Free Software Foundation ++# gives unlimited permission to copy and/or distribute it, ++# with or without modifications, as long as this notice is preserved. ++ ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY, to the extent permitted by law; without ++# even the implied warranty of MERCHANTABILITY or FITNESS FOR A ++# PARTICULAR PURPOSE. 
++ ++ ++ ++ ++am__is_gnu_make = { \ ++ if test -z '$(MAKELEVEL)'; then \ ++ false; \ ++ elif test -n '$(MAKE_HOST)'; then \ ++ true; \ ++ elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ ++ true; \ ++ else \ ++ false; \ ++ fi; \ ++} ++am__make_running_with_option = \ ++ case $${target_option-} in \ ++ ?) ;; \ ++ *) echo "am__make_running_with_option: internal error: invalid" \ ++ "target option '$${target_option-}' specified" >&2; \ ++ exit 1;; \ ++ esac; \ ++ has_opt=no; \ ++ sane_makeflags=$$MAKEFLAGS; \ ++ if $(am__is_gnu_make); then \ ++ sane_makeflags=$$MFLAGS; \ ++ else \ ++ case $$MAKEFLAGS in \ ++ *\\\ \ *) \ ++ bs=\\; \ ++ sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ ++ | sed "s/$$bs$$bs$$bs $$bs *//g"`;; \ ++ esac; \ ++ fi; \ ++ skip_next=no; \ ++ strip_trailopt () \ ++ { \ ++ flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ ++ }; \ ++ for flg in $$sane_makeflags; do \ ++ test $$skip_next = yes && { skip_next=no; continue; }; \ ++ case $$flg in \ ++ *=*|--*) continue;; \ ++ -*I) strip_trailopt 'I'; skip_next=yes;; \ ++ -*I?*) strip_trailopt 'I';; \ ++ -*O) strip_trailopt 'O'; skip_next=yes;; \ ++ -*O?*) strip_trailopt 'O';; \ ++ -*l) strip_trailopt 'l'; skip_next=yes;; \ ++ -*l?*) strip_trailopt 'l';; \ ++ -dEDm) skip_next=yes;; \ ++ -JT) skip_next=yes;; \ ++ esac; \ ++ case $$flg in \ ++ *$$target_option*) has_opt=yes; break;; \ ++ esac; \ ++ done; \ ++ test $$has_opt = yes ++am__make_dryrun = (target_option=n; $(am__make_running_with_option)) ++am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) ++pkgdatadir = $(datadir)/bolt-plugin ++pkgincludedir = $(includedir)/bolt-plugin ++pkglibdir = $(libdir)/bolt-plugin ++pkglibexecdir = $(libexecdir)/bolt-plugin ++am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd ++install_sh_DATA = $(install_sh) -c -m 644 ++install_sh_PROGRAM = $(install_sh) -c ++install_sh_SCRIPT = $(install_sh) -c ++INSTALL_HEADER = $(INSTALL_DATA) ++transform = $(program_transform_name) 
++NORMAL_INSTALL = : ++PRE_INSTALL = : ++POST_INSTALL = : ++NORMAL_UNINSTALL = : ++PRE_UNINSTALL = : ++POST_UNINSTALL = : ++build_triplet = aarch64-unknown-linux-gnu ++host_triplet = aarch64-unknown-linux-gnu ++target_triplet = aarch64-unknown-linux-gnu ++subdir = . ++ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 ++am__aclocal_m4_deps = $(top_srcdir)/configure.ac ++am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ ++ $(ACLOCAL_M4) ++DIST_COMMON = $(srcdir)/Makefile.am $(top_srcdir)/configure \ ++ $(am__configure_deps) ++am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \ ++ configure.lineno config.status.lineno ++mkinstalldirs = $(SHELL) $(top_srcdir)/../mkinstalldirs ++CONFIG_HEADER = config.h ++CONFIG_CLEAN_FILES = ++CONFIG_CLEAN_VPATH_FILES = ++am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; ++am__vpath_adj = case $$p in \ ++ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ ++ *) f=$$p;; \ ++ esac; ++am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; ++am__install_max = 40 ++am__nobase_strip_setup = \ ++ srcdirstrip=`echo "$(srcdir)" | sed 's/.^$$\\*|/\\\\&/g'` ++am__nobase_strip = \ ++ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" ++am__nobase_list = $(am__nobase_strip_setup); \ ++ for p in $$list; do echo "$$p $$p"; done | \ ++ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/^/*$$,\1,' | \ ++ $(AWK) 'BEGIN { files"." = "" } { files$$2 = files$$2 " " $$1; \ ++ if (++n$$2 == $(am__install_max)) \ ++ { print $$2, files$$2; n$$2 = 0; files$$2 = "" } } \ ++ END { for (dir in files) print dir, filesdir }' ++am__base_list = \ ++ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ ++ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' ++am__uninstall_files_from_dir = { \ ++ test -z "$$files" \ ++ || { test ! -d "$$dir" && test ! -f "$$dir" && test ! 
-r "$$dir"; } \ ++ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ ++ $(am__cd) "$$dir" && rm -f $$files; }; \ ++ } ++am__installdirs = "$(DESTDIR)$(libexecsubdir)" ++LTLIBRARIES = $(libexecsub_LTLIBRARIES) ++am_libbolt_plugin_la_OBJECTS = bolt-plugin.lo ++libbolt_plugin_la_OBJECTS = $(am_libbolt_plugin_la_OBJECTS) ++AM_V_P = $(am__v_P_$(V)) ++am__v_P_ = $(am__v_P_$(AM_DEFAULT_VERBOSITY)) ++am__v_P_0 = false ++am__v_P_1 = : ++AM_V_GEN = $(am__v_GEN_$(V)) ++am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) ++am__v_GEN_0 = @echo " GEN " $@; ++am__v_GEN_1 = ++AM_V_at = $(am__v_at_$(V)) ++am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY)) ++am__v_at_0 = @ ++am__v_at_1 = ++DEFAULT_INCLUDES = -I. ++depcomp = ++am__maybe_remake_depfiles = ++CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ ++ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) ++AM_V_lt = $(am__v_lt_$(V)) ++am__v_lt_ = $(am__v_lt_$(AM_DEFAULT_VERBOSITY)) ++am__v_lt_0 = --silent ++am__v_lt_1 = ++LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ ++ $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \ ++ $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ ++ $(AM_CXXFLAGS) $(CXXFLAGS) ++AM_V_CXX = $(am__v_CXX_$(V)) ++am__v_CXX_ = $(am__v_CXX_$(AM_DEFAULT_VERBOSITY)) ++am__v_CXX_0 = @echo " CXX " $@; ++am__v_CXX_1 = ++CXXLD = $(CXX) ++CXXLINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ ++ $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \
View file
_service:tar_scm:0033-AutoBOLT-Enable-BOLT-linker-plugin-on-aarch64-3-3.patch
Added
@@ -0,0 +1,345 @@ +From 94242286383a80e6ab83d824a4d7ea23ea311f75 Mon Sep 17 00:00:00 2001 +From: zhenyu--zhao_admin <zhaozhenyu17@huawei.com> +Date: Mon, 22 Jan 2024 15:38:24 +0800 +Subject: PATCH AutoBOLT Enable BOLT linker plugin on aarch64 3/3 + +--- + Makefile.def | 10 ++++++++++ + configure | 27 ++++++++++++++++++++++++++- + configure.ac | 22 +++++++++++++++++++++- + gcc/config.host | 1 + + gcc/config.in | 13 +++++++++++++ + gcc/configure | 10 ++++++++-- + gcc/configure.ac | 4 ++++ + gcc/gcc.cc | 23 +++++++++++++++++++++++ + 8 files changed, 106 insertions(+), 4 deletions(-) + +diff --git a/Makefile.def b/Makefile.def +index 72d585496..0ba868890 100644 +--- a/Makefile.def ++++ b/Makefile.def +@@ -145,6 +145,9 @@ host_modules= { module= gnattools; }; + host_modules= { module= lto-plugin; bootstrap=true; + extra_configure_flags='--enable-shared @extra_linker_plugin_flags@ @extra_linker_plugin_configure_flags@'; + extra_make_flags='@extra_linker_plugin_flags@'; }; ++host_modules= { module= bolt-plugin; bootstrap=true; ++ extra_configure_flags='--enable-shared @extra_linker_plugin_flags@ @extra_linker_plugin_configure_flags@'; ++ extra_make_flags='@extra_linker_plugin_flags@'; }; + host_modules= { module= libcc1; extra_configure_flags=--enable-shared; }; + host_modules= { module= gotools; }; + host_modules= { module= libctf; bootstrap=true; }; +@@ -349,6 +352,7 @@ dependencies = { module=configure-gcc; on=all-mpfr; }; + dependencies = { module=configure-gcc; on=all-mpc; }; + dependencies = { module=configure-gcc; on=all-isl; }; + dependencies = { module=configure-gcc; on=all-lto-plugin; }; ++dependencies = { module=configure-gcc; on=all-bolt-plugin; }; + dependencies = { module=configure-gcc; on=all-binutils; }; + dependencies = { module=configure-gcc; on=all-gas; }; + dependencies = { module=configure-gcc; on=all-ld; }; +@@ -374,6 +378,7 @@ dependencies = { module=all-gcc; on=all-libdecnumber; hard=true; }; + dependencies = { module=all-gcc; on=all-libiberty; }; 
+ dependencies = { module=all-gcc; on=all-fixincludes; }; + dependencies = { module=all-gcc; on=all-lto-plugin; }; ++dependencies = { module=all-gcc; on=all-bolt-plugin; }; + dependencies = { module=all-gcc; on=all-libiconv; }; + dependencies = { module=info-gcc; on=all-build-libiberty; }; + dependencies = { module=dvi-gcc; on=all-build-libiberty; }; +@@ -381,8 +386,10 @@ dependencies = { module=pdf-gcc; on=all-build-libiberty; }; + dependencies = { module=html-gcc; on=all-build-libiberty; }; + dependencies = { module=install-gcc ; on=install-fixincludes; }; + dependencies = { module=install-gcc ; on=install-lto-plugin; }; ++dependencies = { module=install-gcc ; on=install-bolt-plugin; }; + dependencies = { module=install-strip-gcc ; on=install-strip-fixincludes; }; + dependencies = { module=install-strip-gcc ; on=install-strip-lto-plugin; }; ++dependencies = { module=install-strip-gcc ; on=install-strip-bolt-plugin; }; + + dependencies = { module=configure-libcpp; on=configure-libiberty; hard=true; }; + dependencies = { module=configure-libcpp; on=configure-intl; }; +@@ -401,6 +408,9 @@ dependencies = { module=all-gnattools; on=all-target-libstdc++-v3; }; + dependencies = { module=all-lto-plugin; on=all-libiberty; }; + dependencies = { module=all-lto-plugin; on=all-libiberty-linker-plugin; }; + ++dependencies = { module=all-bolt-plugin; on=all-libiberty; }; ++dependencies = { module=all-bolt-plugin; on=all-libiberty-linker-plugin; }; ++ + dependencies = { module=configure-libcc1; on=configure-gcc; }; + dependencies = { module=all-libcc1; on=all-gcc; }; + +diff --git a/configure b/configure +index 5dcaab14a..aff62c464 100755 +--- a/configure ++++ b/configure +@@ -826,6 +826,7 @@ with_isl + with_isl_include + with_isl_lib + enable_isl_version_check ++enable_bolt + enable_lto + enable_linker_plugin_configure_flags + enable_linker_plugin_flags +@@ -1550,6 +1551,7 @@ Optional Features: + enable the PGO build + --disable-isl-version-check + disable check for isl version 
++ --enable-bolt enable bolt optimization support + --enable-lto enable link time optimization support + --enable-linker-plugin-configure-flags=FLAGS + additional flags for configuring linker plugins +@@ -8564,6 +8566,15 @@ fi + + + ++# Check for BOLT support. ++# Check whether --enable-bolt was given. ++if test "${enable_bolt+set}" = set; then : ++ enableval=$enable_bolt; enable_bolt=$enableval ++else ++ enable_bolt=no; default_enable_bolt=no ++fi ++ ++ + # Check for LTO support. + # Check whether --enable-lto was given. + if test "${enable_lto+set}" = set; then : +@@ -8593,6 +8604,16 @@ if test $target_elf = yes; then : + # ELF platforms build the lto-plugin always. + build_lto_plugin=yes + ++ # ELF platforms can build the bolt-plugin. ++ # NOT BUILD BOLT BY DEFAULT. ++ case $target in ++ aarch64*-*-linux*) ++ if test $enable_bolt = yes; then : ++ build_bolt_plugin=yes ++ fi ++ ;; ++ esac ++ + else + if test x"$default_enable_lto" = x"yes" ; then + case $target in +@@ -8780,6 +8801,10 @@ if test -d ${srcdir}/gcc; then + fi + fi + ++ if test "${build_bolt_plugin}" = "yes" ; then ++ configdirs="$configdirs bolt-plugin" ++ fi ++ + # If we're building an offloading compiler, add the LTO front end. + if test x"$enable_as_accelerator_for" != x ; then + case ,${enable_languages}, in +@@ -9202,7 +9227,7 @@ fi + extra_host_libiberty_configure_flags= + extra_host_zlib_configure_flags= + case " $configdirs " in +- *" lto-plugin "* | *" libcc1 "*) ++ *" lto-plugin "* | *" libcc1 "* | *" bolt-plugin "*) + # When these are to be built as shared libraries, the same applies to + # libiberty. + extra_host_libiberty_configure_flags=--enable-shared +diff --git a/configure.ac b/configure.ac +index 85977482a..f310d75ca 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -1863,6 +1863,12 @@ fi + AC_SUBST(isllibs) + AC_SUBST(islinc) + ++# Check for BOLT support. 
++AC_ARG_ENABLE(bolt, ++AS_HELP_STRING(--enable-bolt, enable bolt optimization support), ++enable_bolt=$enableval, ++enable_bolt=no; default_enable_bolt=no) ++ + # Check for LTO support. + AC_ARG_ENABLE(lto, + AS_HELP_STRING(--enable-lto, enable link time optimization support), +@@ -1871,6 +1877,16 @@ enable_lto=yes; default_enable_lto=yes) + + ACX_ELF_TARGET_IFELSE(# ELF platforms build the lto-plugin always. + build_lto_plugin=yes ++ ++ # ELF platforms can build the bolt-plugin. ++ # NOT BUILD BOLT BY DEFAULT. ++ case $target in ++ aarch64*-*-linux*) ++ if test $enable_bolt = yes; then : ++ build_bolt_plugin=yes ++ fi ++ ;; ++ esac + ,if test x"$default_enable_lto" = x"yes" ; then + case $target in + *-apple-darwin912* | *-cygwin* | *-mingw* | *djgpp*) ;; +@@ -2049,6 +2065,10 @@ if test -d ${srcdir}/gcc; then + fi + fi + ++ if test "${build_bolt_plugin}" = "yes" ; then ++ configdirs="$configdirs bolt-plugin" ++ fi ++ + # If we're building an offloading compiler, add the LTO front end. + if test x"$enable_as_accelerator_for" != x ; then + case ,${enable_languages}, in +@@ -2457,7 +2477,7 @@ fi + extra_host_libiberty_configure_flags= + extra_host_zlib_configure_flags= + case " $configdirs " in +- *" lto-plugin "* | *" libcc1 "*) ++ *" lto-plugin "* | *" libcc1 "* | *" bolt-plugin "*) + # When these are to be built as shared libraries, the same applies to + # libiberty. + extra_host_libiberty_configure_flags=--enable-shared +diff --git a/gcc/config.host b/gcc/config.host +index 4ca300f11..bf7dcb4cc 100644 +--- a/gcc/config.host ++++ b/gcc/config.host +@@ -75,6 +75,7 @@ out_host_hook_obj=host-default.o
View file
_service:tar_scm:0034-Autofdo-Enable-discrimibator-and-MCF-algorithm-on-Au.patch
Added
@@ -0,0 +1,312 @@ +From b020447c840c6e22440a9b9063298a06333fd2f1 Mon Sep 17 00:00:00 2001 +From: zhenyu--zhao <zhaozhenyu17@huawei.com> +Date: Sat, 23 Mar 2024 22:56:09 +0800 +Subject: PATCH AutofdoEnable discrimibator and MCF algorithm on Autofdo + +--- + gcc/auto-profile.cc | 171 +++++++++++++++++++++++++++++++++++++++++++- + gcc/cfghooks.cc | 7 ++ + gcc/opts.cc | 5 +- + gcc/tree-inline.cc | 14 ++++ + 4 files changed, 193 insertions(+), 4 deletions(-) + +diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc +index 2b34b80b8..f45f0ec66 100644 +--- a/gcc/auto-profile.cc ++++ b/gcc/auto-profile.cc +@@ -466,6 +466,17 @@ string_table::get_index (const char *name) const + if (name == NULL) + return -1; + string_index_map::const_iterator iter = map_.find (name); ++ /* Function name may be duplicate. Try to distinguish by the ++ #file_name#function_name defined by the autofdo tool chain. */ ++ if (iter == map_.end ()) ++ { ++ char* file_name = get_original_name (lbasename (dump_base_name)); ++ char* file_func_name ++ = concat ("#", file_name, "#", name, NULL); ++ iter = map_.find (file_func_name); ++ free (file_name); ++ free (file_func_name); ++ } + if (iter == map_.end ()) + return -1; + +@@ -654,7 +665,7 @@ function_instance::read_function_instance (function_instance_stack *stack, + + for (unsigned i = 0; i < num_pos_counts; i++) + { +- unsigned offset = gcov_read_unsigned () & 0xffff0000; ++ unsigned offset = gcov_read_unsigned (); + unsigned num_targets = gcov_read_unsigned (); + gcov_type count = gcov_read_counter (); + s->pos_countsoffset.count = count; +@@ -733,6 +744,10 @@ autofdo_source_profile::get_count_info (gimple *stmt, count_info *info) const + function_instance *s = get_function_instance_by_inline_stack (stack); + if (s == NULL) + return false; ++ if (s->get_count_info (stack0.second + stmt->bb->discriminator, info)) ++ { ++ return true; ++ } + return s->get_count_info (stack0.second, info); + } + +@@ -1395,6 +1410,66 @@ afdo_propagate (bb_set 
*annotated_bb) + } + } + ++/* Process the following scene when the branch probability ++ inversion when do function afdo_propagate (). E.g. ++ BB_NUM (sample count) ++ BB1 (1000) ++ / \ ++ BB2 (10) BB3 (0) ++ \ / ++ BB4 ++ In afdo_propagate ().count of BB3 is calculated by ++ COUNT (BB3) = 990 (990 = COUNT (BB1) - COUNT (BB2) = 1000 - 10) ++ In fact, BB3 may be colder than BB2 by sample count. ++ This function allocate source BB count to wach succ BB by sample ++ rate, E.g. ++ BB2_COUNT = BB1_COUNT * (BB2_COUNT / (BB2_COUNT + BB3_COUNT)) */ ++ ++static void ++afdo_preprocess_bb_count () ++{ ++ basic_block bb; ++ FOR_ALL_BB_FN (bb, cfun) ++ { ++ if (bb->count.ipa_p () && EDGE_COUNT (bb->succs) > 1 ++ && bb->count > profile_count::zero ().afdo ()) ++ { ++ basic_block bb1 = EDGE_SUCC (bb, 0)->dest; ++ basic_block bb2 = EDGE_SUCC (bb, 1)->dest; ++ if (single_succ_edge (bb1) && single_succ_edge (bb2) ++ && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest) ++ { ++ gcov_type max_count = 0; ++ gcov_type total_count = 0; ++ edge e; ++ edge_iterator ei; ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ if (!e->dest->count.ipa_p ()) ++ { ++ continue; ++ } ++ max_count = MAX (max_count, e->dest->count.to_gcov_type ()); ++ total_count += e->dest->count.to_gcov_type (); ++ } ++ /* Only bb_count > max_count * 2, branch probability will ++ inversion. */ ++ if (max_count > 0 && bb->count.to_gcov_type () > max_count * 2) ++ { ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ gcov_type target_count = bb->count.to_gcov_type () ++ * e->dest->count.to_gcov_type ()/ total_count; ++ e->dest->count ++ = profile_count::from_gcov_type ++ (target_count).afdo (); ++ } ++ } ++ } ++ } ++ } ++} ++ + /* Propagate counts on control flow graph and calculate branch + probabilities. 
*/ + +@@ -1420,6 +1495,7 @@ afdo_calculate_branch_prob (bb_set *annotated_bb) + } + + afdo_find_equiv_class (annotated_bb); ++ afdo_preprocess_bb_count (); + afdo_propagate (annotated_bb); + + FOR_EACH_BB_FN (bb, cfun) +@@ -1523,6 +1599,83 @@ afdo_vpt_for_early_inline (stmt_set *promoted_stmts) + return false; + } + ++/* Preparation before executing MCF algorithm. */ ++ ++static void ++afdo_init_mcf () ++{ ++ basic_block bb; ++ edge e; ++ edge_iterator ei; ++ ++ if (dump_file) ++ { ++ fprintf (dump_file, "\n init calling mcf_smooth_cfg (). \n"); ++ } ++ ++ /* Step1: when use mcf, BB id must be continous, ++ so we need compact_blocks (). */ ++ compact_blocks (); ++ ++ /* Step2: allocate memory for MCF input data. */ ++ bb_gcov_counts.safe_grow_cleared (cfun->cfg->x_last_basic_block); ++ edge_gcov_counts = new hash_map<edge, gcov_type>; ++ ++ /* Step3: init MCF input data from cfg. */ ++ FOR_ALL_BB_FN (bb, cfun) ++ { ++ /* Init BB count for MCF. */ ++ bb_gcov_count (bb) = bb->count.to_gcov_type (); ++ ++ gcov_type total_count = 0; ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ total_count += e->dest->count.to_gcov_type (); ++ } ++ ++ /* If there is no sample in each successor blocks, source ++ BB samples are allocated to each edge by branch static prob. */ ++ ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ if (total_count == 0) ++ { ++ edge_gcov_count (e) = e->src->count.to_gcov_type () ++ * e->probability.to_reg_br_prob_base () / REG_BR_PROB_BASE; ++ } ++ else ++ { ++ edge_gcov_count (e) = e->src->count.to_gcov_type () ++ * e->dest->count.to_gcov_type () / total_count; ++ } ++ } ++ } ++} ++ ++ ++/* Free the resources used by MCF and reset BB count from MCF result. ++ branch probability has been updated in mcf_smooth_cfg (). */ ++ ++static void ++afdo_process_after_mcf () ++{ ++ basic_block bb; ++ /* Reset BB count from MCF result. */ ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ if (bb_gcov_count (bb)) ++ {
View file
_service:tar_scm:0035-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch
Added
@@ -0,0 +1,194 @@ +From aa39a66f6029fe16a656d7c6339908b953fb1e04 Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia WX1215920 <diachkov.ilia1@huawei-partners.com> +Date: Thu, 22 Feb 2024 11:27:43 +0300 +Subject: PATCH 01/18 Add insn defs and correct costs for cmlt generation + +--- + gcc/config/aarch64/aarch64-simd.md | 48 +++++++++++++++++++++++++++++ + gcc/config/aarch64/aarch64.cc | 15 +++++++++ + gcc/config/aarch64/aarch64.opt | 4 +++ + gcc/config/aarch64/iterators.md | 3 +- + gcc/config/aarch64/predicates.md | 25 +++++++++++++++ + gcc/testsuite/gcc.dg/combine-cmlt.c | 20 ++++++++++++ + 6 files changed, 114 insertions(+), 1 deletion(-) + create mode 100755 gcc/testsuite/gcc.dg/combine-cmlt.c + +diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md +index ee7f0b89c..82f73805f 100644 +--- a/gcc/config/aarch64/aarch64-simd.md ++++ b/gcc/config/aarch64/aarch64-simd.md +@@ -6454,6 +6454,54 @@ + (set_attr "type" "neon_compare<q>, neon_compare_zero<q>") + ) + ++;; Use cmlt to replace vector arithmetic operations like this (SImode example): ++;; B = (((A >> 15) & 0x00010001) << 16) - ((A >> 15) & 0x00010001) ++;; TODO: maybe extend to scalar operations or other cm** instructions. ++ ++(define_insn "*aarch64_cmlt_as_arith<mode>" ++ (set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w") ++ (minus:<V_INT_EQUIV> ++ (ashift:<V_INT_EQUIV> ++ (and:<V_INT_EQUIV> ++ (lshiftrt:<V_INT_EQUIV> ++ (match_operand:VDQHSD 1 "register_operand" "w") ++ (match_operand:VDQHSD 2 "half_size_minus_one_operand")) ++ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")) ++ (match_operand:VDQHSD 4 "half_size_operand")) ++ (and:<V_INT_EQUIV> ++ (lshiftrt:<V_INT_EQUIV> ++ (match_dup 1) ++ (match_dup 2)) ++ (match_dup 3)))) ++ "TARGET_SIMD && flag_cmlt_arith" ++ "cmlt\t%<v>0.<V2ntype>, %<v>1.<V2ntype>, #0" ++ (set_attr "type" "neon_compare_zero") ++) ++ ++;; The helper definition that allows combiner to use the previous pattern. 
++ ++(define_insn_and_split "*arch64_cmlt_tmp<mode>" ++ (set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w") ++ (and:<V_INT_EQUIV> ++ (lshiftrt:<V_INT_EQUIV> ++ (match_operand:VDQHSD 1 "register_operand" "w") ++ (match_operand:VDQHSD 2 "half_size_minus_one_operand")) ++ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand"))) ++ "TARGET_SIMD && flag_cmlt_arith" ++ "#" ++ "&& reload_completed" ++ (set (match_operand:<V_INT_EQUIV> 0 "register_operand") ++ (lshiftrt:<V_INT_EQUIV> ++ (match_operand:VDQHSD 1 "register_operand") ++ (match_operand:VDQHSD 2 "half_size_minus_one_operand"))) ++ (set (match_dup 0) ++ (and:<V_INT_EQUIV> ++ (match_dup 0) ++ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand"))) ++ "" ++ (set_attr "type" "neon_compare_zero") ++) ++ + (define_insn_and_split "aarch64_cm<optab>di" + (set (match_operand:DI 0 "register_operand" "=w,w,r") + (neg:DI +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index a3da4ca30..04072ca25 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -14064,6 +14064,21 @@ cost_minus: + return true; + } + ++ /* Detect aarch64_cmlt_as_arith instruction. Now only this pattern ++ matches the condition. The costs of cmlt and sub instructions ++ are comparable, so we are not increasing the cost here. */ ++ if (flag_cmlt_arith && GET_CODE (op0) == ASHIFT ++ && GET_CODE (op1) == AND) ++ { ++ rtx op0_subop0 = XEXP (op0, 0); ++ if (rtx_equal_p (op0_subop0, op1)) ++ { ++ rtx lshrt_op = XEXP (op0_subop0, 0); ++ if (GET_CODE (lshrt_op) == LSHIFTRT) ++ return true; ++ } ++ } ++ + /* Look for SUB (extended register). */ + if (is_a <scalar_int_mode> (mode) + && aarch64_rtx_arith_op_extract_p (op1)) +diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt +index a64b927e9..101664c7c 100644 +--- a/gcc/config/aarch64/aarch64.opt ++++ b/gcc/config/aarch64/aarch64.opt +@@ -262,6 +262,10 @@ Use an immediate to offset from the stack protector guard register, sp_el0. 
+ This option is for use with fstack-protector-strong and not for use in + user-land code. + ++mcmlt-arith ++Target Var(flag_cmlt_arith) Optimization Init(0) ++Use SIMD cmlt instruction to perform some arithmetic/logic calculations. ++ + TargetVariable + long aarch64_stack_protector_guard_offset = 0 + +diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md +index 26a840d7f..967e6b0b1 100644 +--- a/gcc/config/aarch64/iterators.md ++++ b/gcc/config/aarch64/iterators.md +@@ -1485,7 +1485,8 @@ + (V2DI "2s")) + + ;; Register suffix narrowed modes for VQN. +-(define_mode_attr V2ntype (V8HI "16b") (V4SI "8h") ++(define_mode_attr V2ntype (V4HI "8b") (V2SI "4h") ++ (V8HI "16b") (V4SI "8h") + (V2DI "4s")) + + ;; Widened modes of vector modes. +diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md +index c308015ac..07c14aacb 100644 +--- a/gcc/config/aarch64/predicates.md ++++ b/gcc/config/aarch64/predicates.md +@@ -49,6 +49,31 @@ + return CONST_INT_P (op) && IN_RANGE (INTVAL (op), 1, 3); + }) + ++(define_predicate "half_size_minus_one_operand" ++ (match_code "const_vector") ++{ ++ op = unwrap_const_vec_duplicate (op); ++ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2; ++ return CONST_INT_P (op) && (UINTVAL (op) == size - 1); ++}) ++ ++(define_predicate "half_size_operand" ++ (match_code "const_vector") ++{ ++ op = unwrap_const_vec_duplicate (op); ++ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2; ++ return CONST_INT_P (op) && (UINTVAL (op) == size); ++}) ++ ++(define_predicate "cmlt_arith_mask_operand" ++ (match_code "const_vector") ++{ ++ op = unwrap_const_vec_duplicate (op); ++ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2; ++ unsigned long long mask = ((unsigned long long) 1 << size) | 1; ++ return CONST_INT_P (op) && (UINTVAL (op) == mask); ++}) ++ + (define_predicate "subreg_lowpart_operator" + (ior (match_code "truncate") + (and (match_code "subreg") +diff --git a/gcc/testsuite/gcc.dg/combine-cmlt.c 
b/gcc/testsuite/gcc.dg/combine-cmlt.c +new file mode 100755 +index 000000000..b4c9a37ff +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/combine-cmlt.c +@@ -0,0 +1,20 @@ ++/* { dg-do compile { target aarch64-*-* } } */ ++/* { dg-options "-O3 -mcmlt-arith" } */ ++ ++/* The test checks usage of cmlt insns for arithmetic/logic calculations ++ * in foo (). It's inspired by sources of x264 codec. */ ++ ++typedef unsigned short int uint16_t; ++typedef unsigned int uint32_t; ++ ++void foo( uint32_t *a, uint32_t *b) ++{ ++ for (unsigned i = 0; i < 4; i++) ++ { ++ uint32_t s = ((ai>>((8 * sizeof(uint16_t))-1)) ++ &(((uint32_t)1<<(8 * sizeof(uint16_t)))+1))*((uint16_t)-1); ++ bi = (ai+s)^s; ++ } ++} ++ ++/* { dg-final { scan-assembler-times {cmlt\t} 1 } } */ +-- +2.33.0 +
View file
_service:tar_scm:0036-rtl-ifcvt-introduce-rtl-ifcvt-enchancements.patch
Added
@@ -0,0 +1,560 @@ +From 4cae948c1c00ad7a59f0f234f809fbd9a0208eb4 Mon Sep 17 00:00:00 2001 +From: vchernon <chernonog.vyacheslav@huawei.com> +Date: Wed, 28 Feb 2024 23:05:12 +0800 +Subject: PATCH 02/18 rtl-ifcvt introduce rtl ifcvt enchancements new + option: -fifcvt-allow-complicated-cmps: allows ifcvt to deal + with complicated cmps like + + cmp reg1 (reg2 + reg3) + + can increase compilation time + new param: + -param=ifcvt-allow-register-renaming=0,1,2 + 1 : allows ifcvt to rename registers in then and else bb + 2 : allows to rename registers in condition and else/then bb + can increase compilation time and register pressure +--- + gcc/common.opt | 4 + + gcc/ifcvt.cc | 291 +++++++++++++++--- + gcc/params.opt | 4 + + .../gcc.c-torture/execute/ifcvt-renaming-1.c | 35 +++ + gcc/testsuite/gcc.dg/ifcvt-6.c | 27 ++ + 5 files changed, 311 insertions(+), 50 deletions(-) + create mode 100644 gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c + create mode 100644 gcc/testsuite/gcc.dg/ifcvt-6.c + +diff --git a/gcc/common.opt b/gcc/common.opt +index c7c6bc256..aa00fb7b0 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -3691,4 +3691,8 @@ fipa-ra + Common Var(flag_ipa_ra) Optimization + Use caller save register across calls if possible. + ++fifcvt-allow-complicated-cmps ++Common Var(flag_ifcvt_allow_complicated_cmps) Optimization ++Allow RTL if-conversion pass to deal with complicated cmps (can increase compilation time). ++ + ; This comment is to ensure we retain the blank line above. +diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc +index 2c1eba312..584db7b55 100644 +--- a/gcc/ifcvt.cc ++++ b/gcc/ifcvt.cc +@@ -886,7 +886,9 @@ noce_emit_store_flag (struct noce_if_info *if_info, rtx x, int reversep, + } + + /* Don't even try if the comparison operands or the mode of X are weird. 
*/ +- if (cond_complex || !SCALAR_INT_MODE_P (GET_MODE (x))) ++ if (!flag_ifcvt_allow_complicated_cmps ++ && (cond_complex ++ || !SCALAR_INT_MODE_P (GET_MODE (x)))) + return NULL_RTX; + + return emit_store_flag (x, code, XEXP (cond, 0), +@@ -1965,7 +1967,8 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc) + /* Currently support only simple single sets in test_bb. */ + if (!sset + || !noce_operand_ok (SET_DEST (sset)) +- || contains_ccmode_rtx_p (SET_DEST (sset)) ++ || (!flag_ifcvt_allow_complicated_cmps ++ && contains_ccmode_rtx_p (SET_DEST (sset))) + || !noce_operand_ok (SET_SRC (sset))) + return false; + +@@ -1979,13 +1982,17 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc) + in this function. */ + + static bool +-bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename) ++bbs_ok_for_cmove_arith (basic_block bb_a, ++ basic_block bb_b, ++ rtx to_rename, ++ bitmap conflict_regs) + { + rtx_insn *a_insn; + bitmap bba_sets = BITMAP_ALLOC (®_obstack); +- ++ bitmap intersections = BITMAP_ALLOC (®_obstack); + df_ref def; + df_ref use; ++ rtx_insn *last_a = last_active_insn (bb_a, FALSE); + + FOR_BB_INSNS (bb_a, a_insn) + { +@@ -1995,18 +2002,15 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename) + rtx sset_a = single_set (a_insn); + + if (!sset_a) +- { +- BITMAP_FREE (bba_sets); +- return false; +- } ++ goto end_cmove_arith_check_and_fail; + /* Record all registers that BB_A sets. 
*/ + FOR_EACH_INSN_DEF (def, a_insn) +- if (!(to_rename && DF_REF_REG (def) == to_rename)) ++ if (!(to_rename && DF_REF_REG (def) == to_rename && a_insn == last_a)) + bitmap_set_bit (bba_sets, DF_REF_REGNO (def)); + } + ++ bitmap_and (intersections, df_get_live_in (bb_b), bba_sets); + rtx_insn *b_insn; +- + FOR_BB_INSNS (bb_b, b_insn) + { + if (!active_insn_p (b_insn)) +@@ -2015,10 +2019,7 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename) + rtx sset_b = single_set (b_insn); + + if (!sset_b) +- { +- BITMAP_FREE (bba_sets); +- return false; +- } ++ goto end_cmove_arith_check_and_fail; + + /* Make sure this is a REG and not some instance + of ZERO_EXTRACT or SUBREG or other dangerous stuff. +@@ -2030,25 +2031,34 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename) + if (MEM_P (SET_DEST (sset_b))) + gcc_assert (rtx_equal_p (SET_DEST (sset_b), to_rename)); + else if (!REG_P (SET_DEST (sset_b))) +- { +- BITMAP_FREE (bba_sets); +- return false; +- } ++ goto end_cmove_arith_check_and_fail; + +- /* If the insn uses a reg set in BB_A return false. */ ++ /* If the insn uses a reg set in BB_A return false ++ or try to collect register list for renaming. */ + FOR_EACH_INSN_USE (use, b_insn) + { +- if (bitmap_bit_p (bba_sets, DF_REF_REGNO (use))) ++ if (bitmap_bit_p (intersections, DF_REF_REGNO (use))) + { +- BITMAP_FREE (bba_sets); +- return false; ++ if (param_ifcvt_allow_register_renaming < 1) ++ goto end_cmove_arith_check_and_fail; ++ ++ /* Those regs should be renamed. We can't rename CC reg, but ++ possibly we can provide combined comparison in the future. 
*/ ++ if (GET_MODE_CLASS (GET_MODE (DF_REF_REG (use))) == MODE_CC) ++ goto end_cmove_arith_check_and_fail; ++ bitmap_set_bit (conflict_regs, DF_REF_REGNO (use)); + } + } +- + } + + BITMAP_FREE (bba_sets); ++ BITMAP_FREE (intersections); + return true; ++ ++end_cmove_arith_check_and_fail: ++ BITMAP_FREE (bba_sets); ++ BITMAP_FREE (intersections); ++ return false; + } + + /* Emit copies of all the active instructions in BB except the last. +@@ -2103,6 +2113,142 @@ noce_emit_bb (rtx last_insn, basic_block bb, bool simple) + return true; + } + ++/* This function tries to rename regs that intersect with considered bb ++ inside condition expression. Condition expression will be moved down ++ if the optimization will be applied, so it is essential to be sure that ++ all intersected registers will be renamed otherwise transformation ++ can't be applied. Function returns true if renaming was successful ++ and optimization can proceed futher. */ ++ ++static bool ++noce_rename_regs_in_cond (struct noce_if_info *if_info, bitmap cond_rename_regs) ++{ ++ bool success = true; ++ if (bitmap_empty_p (cond_rename_regs)) ++ return true; ++ if (param_ifcvt_allow_register_renaming < 2) ++ return false; ++ df_ref use; ++ rtx_insn *cmp_insn = if_info->cond_earliest; ++ /* Jump instruction as a condion currently unsupported. */ ++ if (JUMP_P (cmp_insn)) ++ return false; ++ rtx_insn *before_cmp = PREV_INSN (cmp_insn); ++ start_sequence (); ++ rtx_insn *copy_of_cmp = as_a <rtx_insn *> (copy_rtx (cmp_insn)); ++ basic_block cmp_block = BLOCK_FOR_INSN (cmp_insn); ++ FOR_EACH_INSN_USE (use, cmp_insn) ++ { ++ if (bitmap_bit_p (cond_rename_regs, DF_REF_REGNO (use))) ++ { ++ rtx use_reg = DF_REF_REG (use); ++ rtx tmp = gen_reg_rtx (GET_MODE (use_reg)); ++ if (!validate_replace_rtx (use_reg, tmp, copy_of_cmp)) ++ { ++ end_sequence (); ++ return false;
View file
_service:tar_scm:0037-Perform-early-if-conversion-of-simple-arithmetic.patch
Added
@@ -0,0 +1,109 @@ +From 310eade1450995b55d9f8120561022fbf164b2ec Mon Sep 17 00:00:00 2001 +From: Pronin Alexander 00812787 <pronin.alexander@huawei.com> +Date: Thu, 12 Jan 2023 14:52:49 +0300 +Subject: PATCH 03/18 Perform early if-conversion of simple arithmetic + +--- + gcc/common.opt | 4 ++++ + gcc/match.pd | 25 +++++++++++++++++++ + gcc/testsuite/gcc.dg/ifcvt-gimple.c | 37 +++++++++++++++++++++++++++++ + 3 files changed, 66 insertions(+) + create mode 100644 gcc/testsuite/gcc.dg/ifcvt-gimple.c + +diff --git a/gcc/common.opt b/gcc/common.opt +index aa00fb7b0..dac477c04 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1821,6 +1821,10 @@ fif-conversion2 + Common Var(flag_if_conversion2) Optimization + Perform conversion of conditional jumps to conditional execution. + ++fif-conversion-gimple ++Common Var(flag_if_conversion_gimple) Optimization ++Perform conversion of conditional jumps to branchless equivalents during gimple transformations. ++ + fstack-reuse= + Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization + -fstack-reuse=all|named_vars|none Set stack reuse level for local variables. +diff --git a/gcc/match.pd b/gcc/match.pd +index 6f24d5079..3cbaf2a5b 100644 +--- a/gcc/match.pd ++++ b/gcc/match.pd +@@ -4278,6 +4278,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + ) + ) + ) ++ ++(if (flag_if_conversion_gimple) ++ (for simple_op (plus minus bit_and bit_ior bit_xor) ++ (simplify ++ (cond @0 (simple_op @1 INTEGER_CST@2) @1) ++ (switch ++ /* a = cond ? a + 1 : a -> a = a + ((int) cond) */ ++ (if (integer_onep (@2)) ++ (simple_op @1 (convert (convert:boolean_type_node @0)))) ++ /* a = cond ? 
a + powerof2cst : a -> ++ a = a + ((int) cond) << log2 (powerof2cst) */ ++ (if (INTEGRAL_TYPE_P (type) && integer_pow2p (@2)) ++ (with ++ { ++ tree shift = build_int_cst (integer_type_node, tree_log2 (@2)); ++ } ++ (simple_op @1 (lshift (convert (convert:boolean_type_node @0)) ++ { shift; }) ++ ) ++ ) ++ ) ++ ) ++ ) ++ ) ++) + #endif + + #if GIMPLE +diff --git a/gcc/testsuite/gcc.dg/ifcvt-gimple.c b/gcc/testsuite/gcc.dg/ifcvt-gimple.c +new file mode 100644 +index 000000000..0f7c87e5c +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/ifcvt-gimple.c +@@ -0,0 +1,37 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fif-conversion-gimple -fdump-tree-optimized" } */ ++ ++int test_int (int optimizable_int) { ++ if (optimizable_int > 5) ++ ++optimizable_int; ++ return optimizable_int; ++} ++ ++int test_int_pow2 (int optimizable_int_pow2) { ++ if (optimizable_int_pow2 <= 4) ++ optimizable_int_pow2 += 1024; ++ return optimizable_int_pow2; ++} ++ ++int test_int_non_pow2 (int not_optimizable_int_non_pow2) { ++ if (not_optimizable_int_non_pow2 == 1) ++ not_optimizable_int_non_pow2 += 513; ++ return not_optimizable_int_non_pow2; ++} ++ ++float test_float (float not_optimizable_float) { ++ if (not_optimizable_float > 5) ++ not_optimizable_float += 1; ++ return not_optimizable_float; ++} ++ ++/* Expecting if-else block in test_float and test_int_non_pow2 only. */ ++/* { dg-final { scan-tree-dump-not "if \\(optimizable" "optimized" } } */ ++/* { dg-final { scan-tree-dump "if \\(not_optimizable_int_non_pow2" "optimized" } } */ ++/* { dg-final { scan-tree-dump "if \\(not_optimizable_float" "optimized" } } */ ++/* { dg-final { scan-tree-dump-times "if " 2 "optimized" } } */ ++/* { dg-final { scan-tree-dump-times "else" 2 "optimized" } } */ ++ ++/* Expecting shifted result only for optimizable_int_pow2. */ ++/* { dg-final { scan-tree-dump-times " << " 1 "optimized" } } */ ++/* { dg-final { scan-tree-dump " << 10;" "optimized" } } */ +-- +2.33.0 +
View file
_service:tar_scm:0038-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch
Added
@@ -0,0 +1,252 @@ +From 6684509e81e4341675c73a7dc853180229a8abcb Mon Sep 17 00:00:00 2001 +From: Pronin Alexander 00812787 <pronin.alexander@huawei.com> +Date: Tue, 24 Jan 2023 16:43:40 +0300 +Subject: PATCH 04/18 Add option to allow matching uaddsub overflow for widen + ops too. + +--- + gcc/common.opt | 5 ++ + gcc/testsuite/gcc.dg/uaddsub.c | 143 +++++++++++++++++++++++++++++++++ + gcc/tree-ssa-math-opts.cc | 43 ++++++++-- + 3 files changed, 184 insertions(+), 7 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/uaddsub.c + +diff --git a/gcc/common.opt b/gcc/common.opt +index dac477c04..39c90604e 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -3106,6 +3106,11 @@ freciprocal-math + Common Var(flag_reciprocal_math) SetByCombined Optimization + Same as -fassociative-math for expressions which include division. + ++fuaddsub-overflow-match-all ++Common Var(flag_uaddsub_overflow_match_all) ++Match unsigned add/sub overflow even if the target does not support ++the corresponding instruction. ++ + ; Nonzero means that unsafe floating-point math optimizations are allowed + ; for the sake of speed. 
IEEE compliance is not guaranteed, and operations + ; are allowed to assume that their arguments and results are "normal" +diff --git a/gcc/testsuite/gcc.dg/uaddsub.c b/gcc/testsuite/gcc.dg/uaddsub.c +new file mode 100644 +index 000000000..96c26d308 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/uaddsub.c +@@ -0,0 +1,143 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fuaddsub-overflow-match-all -fdump-tree-optimized" } */ ++#include <stdint.h> ++ ++typedef unsigned __int128 uint128_t; ++typedef struct uint256_t ++{ ++ uint128_t lo; ++ uint128_t hi; ++} uint256_t; ++ ++uint16_t add16 (uint8_t a, uint8_t b) ++{ ++ uint8_t tmp = a + b; ++ uint8_t overflow = 0; ++ if (tmp < a) ++ overflow = 1; ++ ++ uint16_t res = overflow; ++ res <<= 8; ++ res += tmp; ++ return res; ++} ++ ++uint32_t add32 (uint16_t a, uint16_t b) ++{ ++ uint16_t tmp = a + b; ++ uint16_t overflow = 0; ++ if (tmp < a) ++ overflow = 1; ++ ++ uint32_t res = overflow; ++ res <<= 16; ++ res += tmp; ++ return res; ++} ++ ++uint64_t add64 (uint32_t a, uint32_t b) ++{ ++ uint32_t tmp = a + b; ++ uint32_t overflow = 0; ++ if (tmp < a) ++ overflow = 1; ++ ++ uint64_t res = overflow; ++ res <<= 32; ++ res += tmp; ++ return res; ++} ++ ++uint128_t add128 (uint64_t a, uint64_t b) ++{ ++ uint64_t tmp = a + b; ++ uint64_t overflow = 0; ++ if (tmp < a) ++ overflow = 1; ++ ++ uint128_t res = overflow; ++ res <<= 64; ++ res += tmp; ++ return res; ++} ++ ++uint256_t add256 (uint128_t a, uint128_t b) ++{ ++ uint128_t tmp = a + b; ++ uint128_t overflow = 0; ++ if (tmp < a) ++ overflow = 1; ++ ++ uint256_t res; ++ res.hi = overflow; ++ res.lo = tmp; ++ return res; ++} ++ ++uint16_t sub16 (uint8_t a, uint8_t b) ++{ ++ uint8_t tmp = a - b; ++ uint8_t overflow = 0; ++ if (tmp > a) ++ overflow = -1; ++ ++ uint16_t res = overflow; ++ res <<= 8; ++ res += tmp; ++ return res; ++} ++ ++uint32_t sub32 (uint16_t a, uint16_t b) ++{ ++ uint16_t tmp = a - b; ++ uint16_t overflow = 0; ++ if (tmp > a) ++ overflow = -1; ++ ++ uint32_t 
res = overflow; ++ res <<= 16; ++ res += tmp; ++ return res; ++} ++ ++uint64_t sub64 (uint32_t a, uint32_t b) ++{ ++ uint32_t tmp = a - b; ++ uint32_t overflow = 0; ++ if (tmp > a) ++ overflow = -1; ++ ++ uint64_t res = overflow; ++ res <<= 32; ++ res += tmp; ++ return res; ++} ++ ++uint128_t sub128 (uint64_t a, uint64_t b) ++{ ++ uint64_t tmp = a - b; ++ uint64_t overflow = 0; ++ if (tmp > a) ++ overflow = -1; ++ ++ uint128_t res = overflow; ++ res <<= 64; ++ res += tmp; ++ return res; ++} ++ ++uint256_t sub256 (uint128_t a, uint128_t b) ++{ ++ uint128_t tmp = a - b; ++ uint128_t overflow = 0; ++ if (tmp > a) ++ overflow = -1; ++ ++ uint256_t res; ++ res.hi = overflow; ++ res.lo = tmp; ++ return res; ++} ++ ++/* { dg-final { scan-tree-dump-times "= .ADD_OVERFLOW \\(a_\0-9\+\\(D\\), b_\0-9\+\\(D\\)\\)" 5 "optimized" } } */ ++/* { dg-final { scan-tree-dump-times "= .SUB_OVERFLOW \\(a_\0-9\+\\(D\\), b_\0-9\+\\(D\\)\\)" 5 "optimized" } } */ +diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc +index 232e903b0..55d6ee8ae 100644 +--- a/gcc/tree-ssa-math-opts.cc ++++ b/gcc/tree-ssa-math-opts.cc +@@ -3468,6 +3468,27 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2, + } + } + ++/* Check if the corresponding operation has wider equivalent on the target. */ ++ ++static bool ++wider_optab_check_p (optab op, machine_mode mode, int unsignedp) ++{ ++ machine_mode wider_mode; ++ FOR_EACH_WIDER_MODE (wider_mode, mode) ++ { ++ machine_mode next_mode; ++ if (optab_handler (op, wider_mode) != CODE_FOR_nothing ++ || (op == smul_optab ++ && GET_MODE_WIDER_MODE (wider_mode).exists (&next_mode) ++ && (find_widening_optab_handler ((unsignedp
View file
_service:tar_scm:0039-Match-double-sized-mul-pattern.patch
Added
@@ -0,0 +1,488 @@ +From e7b22f97f960b62e555dfd6f2e3ae43973fcbb3e Mon Sep 17 00:00:00 2001 +From: Pronin Alexander 00812787 <pronin.alexander@huawei.com> +Date: Wed, 25 Jan 2023 15:04:07 +0300 +Subject: PATCH 05/18 Match double sized mul pattern + +--- + gcc/match.pd | 136 +++++++++++++++++++++ + gcc/testsuite/gcc.dg/double_sized_mul-1.c | 141 ++++++++++++++++++++++ + gcc/testsuite/gcc.dg/double_sized_mul-2.c | 62 ++++++++++ + gcc/tree-ssa-math-opts.cc | 80 ++++++++++++ + 4 files changed, 419 insertions(+) + create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-1.c + create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-2.c + +diff --git a/gcc/match.pd b/gcc/match.pd +index 3cbaf2a5b..61866cb90 100644 +--- a/gcc/match.pd ++++ b/gcc/match.pd +@@ -7895,3 +7895,139 @@ and, + == TYPE_UNSIGNED (TREE_TYPE (@3)))) + && single_use (@4) + && single_use (@5)))) ++ ++/* Match multiplication with double sized result. ++ ++ Consider the following calculations: ++ arg0 * arg1 = (2^(bit_size/2) * arg0_hi + arg0_lo) ++ * (2^(bit_size/2) * arg1_hi + arg1_lo) ++ arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi ++ + 2^(bit_size/2) * (arg0_hi * arg1_lo + arg0_lo * arg1_hi) ++ + arg0_lo * arg1_lo ++ ++ The products of high and low parts fits in bit_size values, thus they are ++ placed in high and low parts of result respectively. ++ ++ The sum of the mixed products may overflow, so we need a detection for that. ++ Also it has a bit_size/2 offset, thus it intersects with both high and low ++ parts of result. Overflow detection constant is bit_size/2 due to this. 
++ ++ With this info: ++ arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi ++ + 2^(bit_size/2) * middle ++ + 2^bit_size * possible_middle_overflow ++ + arg0_lo * arg1_lo ++ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow) ++ + 2^(bit_size/2) * (2^(bit_size/2) * middle_hi + middle_lo) ++ + arg0_lo * arg1_lo ++ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + middle_hi ++ + possible_middle_overflow) ++ + 2^(bit_size/2) * middle_lo ++ + arg0_lo * arg1_lo ++ ++ The last sum can produce overflow for the high result part. With this: ++ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow ++ + possible_res_lo_overflow + middle_hi) ++ + res_lo ++ = res_hi + res_lo ++ ++ This formula is quite big to fit into one match pattern with all of the ++ combinations of terms inside it. There are many helpers for better code ++ readability. ++ ++ The simplification basis is res_hi: assuming that res_lo only is not ++ real practical case for such calculations. ++ ++ Overflow handling is done via matching complex calculations: ++ the realpart and imagpart are quite handy here. */ ++/* Match low and high parts of the argument. */ ++(match (double_size_mul_arg_lo @0 @1) ++ (bit_and @0 INTEGER_CST@1) ++ (if (wi::to_wide (@1) ++ == wi::mask (TYPE_PRECISION (type) / 2, false, TYPE_PRECISION (type))))) ++(match (double_size_mul_arg_hi @0 @1) ++ (rshift @0 INTEGER_CST@1) ++ (if (wi::to_wide (@1) == TYPE_PRECISION (type) / 2))) ++ ++/* Match various argument parts products. */ ++(match (double_size_mul_lolo @0 @1) ++ (mult@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_lo @1 @3)) ++ (if (single_use (@4)))) ++(match (double_size_mul_hihi @0 @1) ++ (mult@4 (double_size_mul_arg_hi @0 @2) (double_size_mul_arg_hi @1 @3)) ++ (if (single_use (@4)))) ++(match (double_size_mul_lohi @0 @1) ++ (mult:c@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_hi @1 @3)) ++ (if (single_use (@4)))) ++ ++/* Match complex middle sum. 
*/ ++(match (double_size_mul_middle_complex @0 @1) ++ (IFN_ADD_OVERFLOW@2 (double_size_mul_lohi @0 @1) (double_size_mul_lohi @1 @0)) ++ (if (num_imm_uses (@2) == 2))) ++ ++/* Match real middle results. */ ++(match (double_size_mul_middle @0 @1) ++ (realpart@2 (double_size_mul_middle_complex @0 @1)) ++ (if (num_imm_uses (@2) == 2))) ++(match (double_size_mul_middleres_lo @0 @1) ++ (lshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2) ++ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2 ++ && single_use (@3)))) ++(match (double_size_mul_middleres_hi @0 @1) ++ (rshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2) ++ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2 ++ && single_use (@3)))) ++ ++/* Match low result part. */ ++/* Number of uses may be < 2 in case when we are interested in ++ high part only. */ ++(match (double_size_mul_res_lo_complex @0 @1) ++ (IFN_ADD_OVERFLOW:c@2 ++ (double_size_mul_lolo:c @0 @1) (double_size_mul_middleres_lo @0 @1)) ++ (if (num_imm_uses (@2) <= 2))) ++(match (double_size_mul_res_lo @0 @1) ++ (realpart (double_size_mul_res_lo_complex @0 @1))) ++ ++/* Match overflow terms. */ ++(match (double_size_mul_overflow_check_lo @0 @1 @5) ++ (convert@4 (ne@3 ++ (imagpart@2 (double_size_mul_res_lo_complex@5 @0 @1)) integer_zerop)) ++ (if (single_use (@2) && single_use (@3) && single_use (@4)))) ++(match (double_size_mul_overflow_check_hi @0 @1) ++ (lshift@6 (convert@5 (ne@4 ++ (imagpart@3 (double_size_mul_middle_complex @0 @1)) integer_zerop)) ++ INTEGER_CST@2) ++ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2 ++ && single_use (@3) && single_use (@4) && single_use (@5) ++ && single_use (@6)))) ++ ++/* Match all possible permutations for high result part calculations. 
*/ ++(for op1 (double_size_mul_hihi ++ double_size_mul_overflow_check_hi ++ double_size_mul_middleres_hi) ++ op2 (double_size_mul_overflow_check_hi ++ double_size_mul_middleres_hi ++ double_size_mul_hihi) ++ op3 (double_size_mul_middleres_hi ++ double_size_mul_hihi ++ double_size_mul_overflow_check_hi) ++ (match (double_size_mul_candidate @0 @1 @2 @3) ++ (plus:c@2 ++ (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3) (op1:c @0 @1)) ++ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1))) ++ (if (single_use (@4) && single_use (@5)))) ++ (match (double_size_mul_candidate @0 @1 @2 @3) ++ (plus:c@2 (double_size_mul_overflow_check_lo @0 @1 @3) ++ (plus:c@4 (op1:c @0 @1) ++ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1)))) ++ (if (single_use (@4) && single_use (@5)))) ++ (match (double_size_mul_candidate @0 @1 @2 @3) ++ (plus:c@2 (op1:c @0 @1) ++ (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3) ++ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1)))) ++ (if (single_use (@4) && single_use (@5)))) ++ (match (double_size_mul_candidate @0 @1 @2 @3) ++ (plus:c@2 (op1:c @0 @1) ++ (plus:c@4 (op2:c @0 @1) ++ (plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1)))) ++ (if (single_use (@4) && single_use (@5))))) +diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-1.c b/gcc/testsuite/gcc.dg/double_sized_mul-1.c +new file mode 100644 +index 000000000..4d475cc8a +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/double_sized_mul-1.c +@@ -0,0 +1,141 @@ ++/* { dg-do compile } */ ++/* fif-conversion-gimple and fuaddsub-overflow-match-all are required for ++ proper overflow detection in some cases. 
*/ ++/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */ ++#include <stdint.h> ++ ++typedef unsigned __int128 uint128_t; ++ ++uint16_t mul16 (uint8_t a, uint8_t b) ++{ ++ uint8_t a_lo = a & 0xF; ++ uint8_t b_lo = b & 0xF; ++ uint8_t a_hi = a >> 4; ++ uint8_t b_hi = b >> 4; ++ uint8_t lolo = a_lo * b_lo; ++ uint8_t lohi = a_lo * b_hi; ++ uint8_t hilo = a_hi * b_lo; ++ uint8_t hihi = a_hi * b_hi; ++ uint8_t middle = hilo + lohi; ++ uint8_t middle_hi = middle >> 4; ++ uint8_t middle_lo = middle << 4; ++ uint8_t res_lo = lolo + middle_lo; ++ uint8_t res_hi = hihi + middle_hi; ++ res_hi += (res_lo < middle_lo ? 1 : 0); ++ res_hi += (middle < hilo ? 0x10 : 0); ++ uint16_t res = ((uint16_t) res_hi) << 8; ++ res += res_lo; ++ return res; ++} ++ ++uint32_t mul32 (uint16_t a, uint16_t b) ++{ ++ uint16_t a_lo = a & 0xFF; ++ uint16_t b_lo = b & 0xFF; ++ uint16_t a_hi = a >> 8;
View file
_service:tar_scm:0040-Port-icp-patch-to-GCC-12.patch
Added
@@ -0,0 +1,2387 @@ +From b73462757734c62f64e7a4379340679ec6f19669 Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com> +Date: Tue, 27 Feb 2024 07:28:12 +0800 +Subject: PATCH 06/18 Port icp patch to GCC 12 + +--- + gcc/common.opt | 8 + + gcc/dbgcnt.def | 1 + + gcc/ipa-devirt.cc | 1855 +++++++++++++++++++++++++++++++++++ + gcc/passes.def | 1 + + gcc/testsuite/gcc.dg/icp1.c | 40 + + gcc/testsuite/gcc.dg/icp2.c | 38 + + gcc/testsuite/gcc.dg/icp3.c | 52 + + gcc/testsuite/gcc.dg/icp4.c | 55 ++ + gcc/testsuite/gcc.dg/icp5.c | 66 ++ + gcc/testsuite/gcc.dg/icp6.c | 66 ++ + gcc/testsuite/gcc.dg/icp7.c | 48 + + gcc/timevar.def | 1 + + gcc/tree-pass.h | 1 + + 13 files changed, 2232 insertions(+) + create mode 100644 gcc/testsuite/gcc.dg/icp1.c + create mode 100644 gcc/testsuite/gcc.dg/icp2.c + create mode 100644 gcc/testsuite/gcc.dg/icp3.c + create mode 100644 gcc/testsuite/gcc.dg/icp4.c + create mode 100644 gcc/testsuite/gcc.dg/icp5.c + create mode 100644 gcc/testsuite/gcc.dg/icp6.c + create mode 100644 gcc/testsuite/gcc.dg/icp7.c + +diff --git a/gcc/common.opt b/gcc/common.opt +index 39c90604e..16aadccf6 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1316,6 +1316,14 @@ fdevirtualize + Common Var(flag_devirtualize) Optimization + Try to convert virtual calls to direct ones. + ++ficp ++Common Var(flag_icp) Optimization Init(0) ++Try to promote indirect calls to direct ones. ++ ++ficp-speculatively ++Common Var(flag_icp_speculatively) Optimization ++Promote indirect calls speculatively. ++ + fdiagnostics-show-location= + Common Joined RejectNegative Enum(diagnostic_prefixing_rule) + -fdiagnostics-show-location=once|every-line How often to emit source location at the beginning of line-wrapped diagnostics. 
+diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def +index 3aa18cd0c..a00bbc31b 100644 +--- a/gcc/dbgcnt.def ++++ b/gcc/dbgcnt.def +@@ -170,6 +170,7 @@ DEBUG_COUNTER (graphite_scop) + DEBUG_COUNTER (hoist) + DEBUG_COUNTER (hoist_insn) + DEBUG_COUNTER (ia64_sched2) ++DEBUG_COUNTER (icp) + DEBUG_COUNTER (if_after_combine) + DEBUG_COUNTER (if_after_reload) + DEBUG_COUNTER (if_conversion) +diff --git a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc +index 74fe65608..383839189 100644 +--- a/gcc/ipa-devirt.cc ++++ b/gcc/ipa-devirt.cc +@@ -103,9 +103,14 @@ along with GCC; see the file COPYING3. If not see + indirect polymorphic edge all possible polymorphic call targets of the call. + + pass_ipa_devirt performs simple speculative devirtualization. ++ pass_ipa_icp performs simple indirect call promotion. + */ + + #include "config.h" ++#define INCLUDE_ALGORITHM ++#define INCLUDE_SET ++#define INCLUDE_MAP ++#define INCLUDE_LIST + #include "system.h" + #include "coretypes.h" + #include "backend.h" +@@ -127,6 +132,7 @@ along with GCC; see the file COPYING3. If not see + #include "ipa-fnsummary.h" + #include "demangle.h" + #include "dbgcnt.h" ++#include "gimple-iterator.h" + #include "gimple-pretty-print.h" + #include "intl.h" + #include "stringpool.h" +@@ -4401,5 +4407,1854 @@ make_pass_ipa_odr (gcc::context *ctxt) + return new pass_ipa_odr (ctxt); + } + ++/* Function signature map used to look up function decl which corresponds to ++ the given function type. 
*/ ++typedef std::set<unsigned> type_set; ++typedef std::set<tree> decl_set; ++typedef std::map<unsigned, type_set*> type_alias_map; ++typedef std::map<unsigned, decl_set*> type_decl_map; ++typedef std::map<unsigned, tree> uid_to_type_map; ++typedef std::map<tree, tree> type_map; ++ ++static bool has_address_taken_functions_with_varargs = false; ++static type_set *unsafe_types = NULL; ++static type_alias_map *fta_map = NULL; ++static type_alias_map *ta_map = NULL; ++static type_map *ctype_map = NULL; ++static type_alias_map *cbase_to_ptype = NULL; ++static type_decl_map *fs_map = NULL; ++static uid_to_type_map *type_uid_map = NULL; ++ ++static void ++print_type_set(unsigned ftype_uid, type_alias_map *map) ++{ ++ if (!map->count (ftype_uid)) ++ return; ++ type_set* s = (*map)ftype_uid; ++ for (type_set::const_iterator it = s->begin (); it != s->end (); it++) ++ fprintf (dump_file, it == s->begin () ? "%d" : ", %d", *it); ++} ++ ++static void ++dump_type_with_uid (const char *msg, tree type, dump_flags_t flags = TDF_NONE) ++{ ++ fprintf (dump_file, msg); ++ print_generic_expr (dump_file, type, flags); ++ fprintf (dump_file, " (%d)\n", TYPE_UID (type)); ++} ++ ++/* Walk aggregate type and collect types of scalar elements. */ ++ ++static void ++collect_scalar_types (tree tp, std::list<tree> &types) ++{ ++ /* TODO: take into account different field offsets. ++ Also support array casts. 
*/ ++ if (tp && dump_file && (dump_flags & TDF_DETAILS)) ++ dump_type_with_uid ("Walk var's type: ", tp, TDF_UID); ++ if (RECORD_OR_UNION_TYPE_P (tp)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Record's fields {\n"); ++ for (tree field = TYPE_FIELDS (tp); field; ++ field = DECL_CHAIN (field)) ++ { ++ if (TREE_CODE (field) != FIELD_DECL) ++ continue; ++ collect_scalar_types (TREE_TYPE (field), types); ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "}\n"); ++ return; ++ } ++ if (TREE_CODE (tp) == ARRAY_TYPE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Array's innermost type:\n"); ++ /* Take the innermost component type. */ ++ tree elt; ++ for (elt = TREE_TYPE (tp); TREE_CODE (elt) == ARRAY_TYPE; ++ elt = TREE_TYPE (elt)) ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ print_generic_expr (dump_file, elt); ++ collect_scalar_types (elt, types); ++ return; ++ } ++ types.push_back (tp); ++} ++ ++static void maybe_register_aliases (tree type1, tree type2); ++ ++/* Walk type lists and maybe register type aliases. */ ++ ++static void ++compare_type_lists (std::list<tree> tlist1, std::list<tree> tlist2) ++{ ++ for (std::list<tree>::iterator ti1 = tlist1.begin (), ti2 = tlist2.begin (); ++ ti1 != tlist1.end (); ++ti1, ++ti2) ++ { ++ /* TODO: correct the analysis results if lists have different length. */ ++ if (ti2 == tlist2.end ()) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Type lists with different length!\n"); ++ break; ++ } ++ maybe_register_aliases (*ti1, *ti2); ++ } ++} ++ ++/* For two given types collect scalar element types and ++ compare the result lists to find type aliases. 
*/ ++ ++static void ++collect_scalar_types_and_find_aliases (tree t1, tree t2) ++{ ++ std::list<tree> tlist1; ++ std::list<tree> tlist2; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "First type list: "); ++ collect_scalar_types (t1, tlist1); ++ if (dump_file && (dump_flags & TDF_DETAILS))
View file
_service:tar_scm:0041-Port-fixes-in-icp-to-GCC-12.patch
Added
@@ -0,0 +1,100 @@ +From aaa117a9ff58fb208e8c8859e075ca425f995f63 Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com> +Date: Tue, 27 Feb 2024 07:43:57 +0800 +Subject: PATCH 07/18 Port fixes in icp to GCC 12 + +--- + gcc/ipa-devirt.cc | 37 ++++++++++++++++++++++++++++++------- + 1 file changed, 30 insertions(+), 7 deletions(-) + +diff --git a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc +index 383839189..318535d06 100644 +--- a/gcc/ipa-devirt.cc ++++ b/gcc/ipa-devirt.cc +@@ -4431,6 +4431,11 @@ print_type_set(unsigned ftype_uid, type_alias_map *map) + if (!map->count (ftype_uid)) + return; + type_set* s = (*map)ftype_uid; ++ if (!s) ++ { ++ fprintf (dump_file, "%d (no set)", ftype_uid); ++ return; ++ } + for (type_set::const_iterator it = s->begin (); it != s->end (); it++) + fprintf (dump_file, it == s->begin () ? "%d" : ", %d", *it); + } +@@ -4696,12 +4701,19 @@ maybe_register_aliases (tree type1, tree type2) + if (register_ailas_type (type1, type2, ta_map)) + analyze_pointees (type1, type2); + } ++ unsigned type1_uid = TYPE_UID (type1); ++ unsigned type2_uid = TYPE_UID (type2); ++ if (type_uid_map->count (type1_uid) == 0) ++ (*type_uid_map)type1_uid = type1; ++ if (type_uid_map->count (type2_uid) == 0) ++ (*type_uid_map)type2_uid = type2; ++ + /* If function and non-function type pointers alias, + the function type is unsafe. */ + if (FUNCTION_POINTER_TYPE_P (type1) && !FUNCTION_POINTER_TYPE_P (type2)) +- unsafe_types->insert (TYPE_UID (type1)); ++ unsafe_types->insert (type1_uid); + if (FUNCTION_POINTER_TYPE_P (type2) && !FUNCTION_POINTER_TYPE_P (type1)) +- unsafe_types->insert (TYPE_UID (type2)); ++ unsafe_types->insert (type2_uid); + + /* Try to figure out with pointers to incomplete types. 
*/ + if (POINTER_TYPE_P (type1) && POINTER_TYPE_P (type2)) +@@ -4825,10 +4837,12 @@ compare_block_and_init_type (tree block, tree t1) + static void + analyze_global_var (varpool_node *var) + { +- var->get_constructor(); + tree decl = var->decl; +- if (TREE_CODE (decl) == SSA_NAME || !DECL_INITIAL (decl) +- || integer_zerop (DECL_INITIAL (decl))) ++ if (decl || !DECL_INITIAL (decl)) ++ return; ++ var->get_constructor (); ++ if (TREE_CODE (decl) == SSA_NAME || integer_zerop (DECL_INITIAL (decl)) ++ || TREE_CODE (DECL_INITIAL (decl)) == ERROR_MARK) + return; + + if (dump_file && (dump_flags & TDF_DETAILS)) +@@ -4998,7 +5012,9 @@ analyze_assign_stmt (gimple *stmt) + { + rhs = TREE_OPERAND (rhs, 0); + if (VAR_OR_FUNCTION_DECL_P (rhs) || TREE_CODE (rhs) == STRING_CST +- || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL) ++ || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL ++ || TREE_CODE (rhs) == LABEL_DECL || TREE_CODE (rhs) == CONST_DECL ++ || TREE_CODE (rhs) == RESULT_DECL) + rhs_type = build_pointer_type (TREE_TYPE (rhs)); + else if (TREE_CODE (rhs) == COMPONENT_REF) + { +@@ -5012,7 +5028,12 @@ analyze_assign_stmt (gimple *stmt) + gcc_assert (POINTER_TYPE_P (rhs_type)); + } + else +- gcc_unreachable(); ++ { ++ fprintf (dump_file, "\nUnsupported rhs type %s in assign stmt: ", ++ get_tree_code_name (TREE_CODE (rhs))); ++ print_gimple_stmt (dump_file, stmt, 0); ++ gcc_unreachable (); ++ } + } + else + rhs_type = TREE_TYPE (rhs); +@@ -5710,6 +5731,8 @@ merge_fs_map_for_ftype_aliases () + decl_set *d_set = it1->second; + tree type = (*type_uid_map)it1->first; + type_set *set = (*fta_map)it1->first; ++ if (!set) ++ continue; + for (type_set::const_iterator it2 = set->begin (); + it2 != set->end (); it2++) + { +-- +2.33.0 +
View file
_service:tar_scm:0042-Add-split-complex-instructions-pass.patch
Added
@@ -0,0 +1,1245 @@ +From 9a8e5716543972dec36bae1f9d380d27bfbcdae1 Mon Sep 17 00:00:00 2001 +From: Agrachev Andrey WX1228450 <agrachev.andrey@huawei-partners.com> +Date: Mon, 21 Aug 2023 12:35:19 +0300 +Subject: PATCH 09/18 Add split-complex-instructions pass + + - Add option -fsplit-ldp-stp + - Add functionality to detect and split depended from store LDP instructions. + - Add -param=param-ldp-dependency-search-range= to configure ldp dependency search range + - Add RTL tests + +Co-authored-by: Chernonog Vyacheslav 00812786 <chernonog.vyacheslav@huawei.com> +Co-authored-by: Zinin Ivan WX1305386 <zinin.ivan@huawei-partners.com> +Co-authored-by: Gadzhiev Emin WX1195297 <gadzhiev.emin@huawei-partners.com> +--- + gcc/common.opt | 5 + + gcc/config/aarch64/aarch64.cc | 42 ++ + gcc/doc/tm.texi | 8 + + gcc/doc/tm.texi.in | 4 + + gcc/params.opt | 3 + + gcc/passes.def | 1 + + gcc/sched-rgn.cc | 704 +++++++++++++++++- + gcc/target.def | 10 + + .../gcc.dg/rtl/aarch64/test-ldp-dont-split.c | 74 ++ + .../rtl/aarch64/test-ldp-split-rearrange.c | 40 + + .../gcc.dg/rtl/aarch64/test-ldp-split.c | 174 +++++ + gcc/timevar.def | 1 + + gcc/tree-pass.h | 1 + + 13 files changed, 1066 insertions(+), 1 deletion(-) + create mode 100644 gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c + create mode 100644 gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c + create mode 100644 gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c + +diff --git a/gcc/common.opt b/gcc/common.opt +index a42bee250..c0e3f5687 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1797,6 +1797,11 @@ floop-nest-optimize + Common Var(flag_loop_nest_optimize) Optimization + Enable the loop nest optimizer. + ++fsplit-ldp-stp ++Common Var(flag_split_ldp_stp) Optimization ++Split load/store pair instructions into separate load/store operations ++for better performance. 
++ + fstrict-volatile-bitfields + Common Var(flag_strict_volatile_bitfields) Init(-1) Optimization + Force bitfield accesses to match their type width. +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 04072ca25..48e2eded0 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -27507,6 +27507,48 @@ aarch64_run_selftests (void) + + #endif /* #if CHECKING_P */ + ++/* TODO: refuse to use ranges intead of full list of an instruction codes. */ ++ ++bool ++is_aarch64_ldp_insn (int icode) ++{ ++ if ((icode >= CODE_FOR_load_pair_sw_sisi ++ && icode <= CODE_FOR_load_pair_dw_tftf) ++ || (icode >= CODE_FOR_loadwb_pairsi_si ++ && icode <= CODE_FOR_loadwb_pairtf_di) ++ || (icode >= CODE_FOR_load_pairv8qiv8qi ++ && icode <= CODE_FOR_load_pairdfdf) ++ || (icode >= CODE_FOR_load_pairv16qiv16qi ++ && icode <= CODE_FOR_load_pairv8bfv2df) ++ || (icode >= CODE_FOR_load_pair_lanesv8qi ++ && icode <= CODE_FOR_load_pair_lanesdf)) ++ return true; ++ return false; ++} ++ ++bool ++is_aarch64_stp_insn (int icode) ++{ ++ if ((icode >= CODE_FOR_store_pair_sw_sisi ++ && icode <= CODE_FOR_store_pair_dw_tftf) ++ || (icode >= CODE_FOR_storewb_pairsi_si ++ && icode <= CODE_FOR_storewb_pairtf_di) ++ || (icode >= CODE_FOR_vec_store_pairv8qiv8qi ++ && icode <= CODE_FOR_vec_store_pairdfdf) ++ || (icode >= CODE_FOR_vec_store_pairv16qiv16qi ++ && icode <= CODE_FOR_vec_store_pairv8bfv2df) ++ || (icode >= CODE_FOR_store_pair_lanesv8qi ++ && icode <= CODE_FOR_store_pair_lanesdf)) ++ return true; ++ return false; ++} ++ ++#undef TARGET_IS_LDP_INSN ++#define TARGET_IS_LDP_INSN is_aarch64_ldp_insn ++ ++#undef TARGET_IS_STP_INSN ++#define TARGET_IS_STP_INSN is_aarch64_stp_insn ++ + #undef TARGET_STACK_PROTECT_GUARD + #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard + +diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi +index c5006afc0..0c6415a9c 100644 +--- a/gcc/doc/tm.texi ++++ b/gcc/doc/tm.texi +@@ -12113,6 +12113,14 @@ object files 
that are not referenced from @code{main} and uses export + lists. + @end defmac + ++@deftypefn {Target Hook} bool TARGET_IS_LDP_INSN (int @var{icode}) ++Return true if icode is corresponding to any of the LDP instruction types. ++@end deftypefn ++ ++@deftypefn {Target Hook} bool TARGET_IS_STP_INSN (int @var{icode}) ++Return true if icode is corresponding to any of the STP instruction types. ++@end deftypefn ++ + @deftypefn {Target Hook} bool TARGET_CANNOT_MODIFY_JUMPS_P (void) + This target hook returns @code{true} past the point in which new jump + instructions could be created. On machines that require a register for +diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in +index f869ddd5e..6ff60e562 100644 +--- a/gcc/doc/tm.texi.in ++++ b/gcc/doc/tm.texi.in +@@ -7977,6 +7977,10 @@ object files that are not referenced from @code{main} and uses export + lists. + @end defmac + ++@hook TARGET_IS_LDP_INSN ++ ++@hook TARGET_IS_STP_INSN ++ + @hook TARGET_CANNOT_MODIFY_JUMPS_P + + @hook TARGET_HAVE_CONDITIONAL_EXECUTION +diff --git a/gcc/params.opt b/gcc/params.opt +index 7fcc2398d..6176d4790 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -1217,4 +1217,7 @@ Enum(vrp_mode) String(ranger) Value(VRP_MODE_RANGER) + Common Joined UInteger Var(param_pointer_compression_size) Init(32) IntegerRange(8, 32) Param Optimization + Target size of compressed pointer, which should be 8, 16 or 32. + ++-param=param-ldp-dependency-search-range= ++Common Joined UInteger Var(param_ldp_dependency_search_range) Init(16) IntegerRange(1, 32) Param Optimization ++Range for depended ldp search in split-ldp-stp path. + ; This comment is to ensure we retain the blank line above. +diff --git a/gcc/passes.def b/gcc/passes.def +index 941bbadf0..a30e05688 100644 +--- a/gcc/passes.def ++++ b/gcc/passes.def +@@ -514,6 +514,7 @@ along with GCC; see the file COPYING3. 
If not see + NEXT_PASS (pass_reorder_blocks); + NEXT_PASS (pass_leaf_regs); + NEXT_PASS (pass_split_before_sched2); ++ NEXT_PASS (pass_split_complex_instructions); + NEXT_PASS (pass_sched2); + NEXT_PASS (pass_stack_regs); + PUSH_INSERT_PASSES_WITHIN (pass_stack_regs) +diff --git a/gcc/sched-rgn.cc b/gcc/sched-rgn.cc +index a0dfdb788..b4df8bdc5 100644 +--- a/gcc/sched-rgn.cc ++++ b/gcc/sched-rgn.cc +@@ -44,6 +44,8 @@ along with GCC; see the file COPYING3. If not see + are actually scheduled. */ +  + #include "config.h" ++#define INCLUDE_SET ++#define INCLUDE_VECTOR + #include "system.h" + #include "coretypes.h" + #include "backend.h" +@@ -65,6 +67,7 @@ along with GCC; see the file COPYING3. If not see + #include "dbgcnt.h" + #include "pretty-print.h" + #include "print-rtl.h" ++#include "cfgrtl.h" + + /* Disable warnings about quoting issues in the pp_xxx calls below + that (intentionally) don't follow GCC diagnostic conventions. */ +@@ -3951,6 +3954,705 @@ make_pass_sched_fusion (gcc::context *ctxt) + return new pass_sched_fusion (ctxt); + } + ++namespace { ++ ++/* Def-use analisys special functions implementation. */ ++ ++static struct df_link * ++get_defs (rtx_insn *insn, rtx reg) ++{ ++ df_ref use; ++ struct df_link *ref_chain, *ref_link; ++ ++ FOR_EACH_INSN_USE (use, insn) ++ { ++ if (GET_CODE (DF_REF_REG (use)) == SUBREG) ++ return NULL; ++ if (REGNO (DF_REF_REG (use)) == REGNO (reg))
View file
_service:tar_scm:0043-Extending-and-refactoring-of-pass_split_complex_inst.patch
Added
@@ -0,0 +1,1426 @@ +From a49db831320ac70ca8f46b94ee60d7c6951f65c3 Mon Sep 17 00:00:00 2001 +From: Gadzhiev Emin WX1195297 <gadzhiev.emin@huawei-partners.com> +Date: Wed, 20 Dec 2023 21:36:07 +0300 +Subject: PATCH 10/18 Extending and refactoring of + pass_split_complex_instructions + +- Add flag parameter in is_ldp_insn and is_stp_insn to know + if instruction has writeback operation +- Add support of PRE_*, POST_* operands as a memory address + expression +- Split only LDPs that intersect with a dependent store + instruction +- Make the selection of dependent store instructions stricter + so it will be enough to check by BFS that dependent store + instruction appears in search range. +- Add helper methods to retrieve fields of rtx +- Remove redundant iterations in find_dependent_stores_candidates +- Refactor generation of instructions +- Add more test cases +--- + gcc/config/aarch64/aarch64.cc | 62 +- + gcc/doc/tm.texi | 12 +- + gcc/sched-rgn.cc | 771 +++++++++--------- + gcc/target.def | 14 +- + .../gcc.dg/rtl/aarch64/test-ldp-dont-split.c | 35 +- + .../rtl/aarch64/test-ldp-split-rearrange.c | 2 +- + .../gcc.dg/rtl/aarch64/test-ldp-split.c | 181 +++- + 7 files changed, 603 insertions(+), 474 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index 48e2eded0..fa566dd80 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -27507,39 +27507,59 @@ aarch64_run_selftests (void) + + #endif /* #if CHECKING_P */ + +-/* TODO: refuse to use ranges intead of full list of an instruction codes. */ ++/* TODO: refuse to use ranges instead of full list of an instruction codes. 
*/ + + bool +-is_aarch64_ldp_insn (int icode) ++is_aarch64_ldp_insn (int icode, bool *has_wb) + { + if ((icode >= CODE_FOR_load_pair_sw_sisi +- && icode <= CODE_FOR_load_pair_dw_tftf) ++ && icode <= CODE_FOR_load_pair_sw_sfsf) ++ || (icode >= CODE_FOR_load_pair_dw_didi ++ && icode <= CODE_FOR_load_pair_dw_dfdf) ++ || (icode == CODE_FOR_load_pair_dw_tftf) + || (icode >= CODE_FOR_loadwb_pairsi_si +- && icode <= CODE_FOR_loadwb_pairtf_di) +- || (icode >= CODE_FOR_load_pairv8qiv8qi +- && icode <= CODE_FOR_load_pairdfdf) +- || (icode >= CODE_FOR_load_pairv16qiv16qi +- && icode <= CODE_FOR_load_pairv8bfv2df) +- || (icode >= CODE_FOR_load_pair_lanesv8qi +- && icode <= CODE_FOR_load_pair_lanesdf)) +- return true; ++ && icode <= CODE_FOR_loadwb_pairdi_di) ++ || (icode >= CODE_FOR_loadwb_pairsf_si ++ && icode <= CODE_FOR_loadwb_pairdf_di) ++ || (icode >= CODE_FOR_loadwb_pairti_si ++ && icode <= CODE_FOR_loadwb_pairtf_di)) ++ { ++ if (has_wb) ++ *has_wb = ((icode >= CODE_FOR_loadwb_pairsi_si ++ && icode <= CODE_FOR_loadwb_pairdi_di) ++ || (icode >= CODE_FOR_loadwb_pairsf_si ++ && icode <= CODE_FOR_loadwb_pairdf_di) ++ || (icode >= CODE_FOR_loadwb_pairti_si ++ && icode <= CODE_FOR_loadwb_pairtf_di)); ++ return true; ++ } + return false; + } + + bool +-is_aarch64_stp_insn (int icode) ++is_aarch64_stp_insn (int icode, bool *has_wb) + { + if ((icode >= CODE_FOR_store_pair_sw_sisi +- && icode <= CODE_FOR_store_pair_dw_tftf) ++ && icode <= CODE_FOR_store_pair_sw_sfsf) ++ || (icode >= CODE_FOR_store_pair_dw_didi ++ && icode <= CODE_FOR_store_pair_dw_dfdf) ++ || (icode == CODE_FOR_store_pair_dw_tftf) + || (icode >= CODE_FOR_storewb_pairsi_si +- && icode <= CODE_FOR_storewb_pairtf_di) +- || (icode >= CODE_FOR_vec_store_pairv8qiv8qi +- && icode <= CODE_FOR_vec_store_pairdfdf) +- || (icode >= CODE_FOR_vec_store_pairv16qiv16qi +- && icode <= CODE_FOR_vec_store_pairv8bfv2df) +- || (icode >= CODE_FOR_store_pair_lanesv8qi +- && icode <= CODE_FOR_store_pair_lanesdf)) +- return true; ++ && 
icode <= CODE_FOR_storewb_pairdi_di) ++ || (icode >= CODE_FOR_storewb_pairsf_si ++ && icode <= CODE_FOR_storewb_pairdf_di) ++ || (icode >= CODE_FOR_storewb_pairti_si ++ && icode <= CODE_FOR_storewb_pairtf_di)) ++ { ++ if (has_wb) ++ *has_wb = ((icode >= CODE_FOR_storewb_pairsi_si ++ && icode <= CODE_FOR_storewb_pairdi_di) ++ || (icode >= CODE_FOR_storewb_pairsf_si ++ && icode <= CODE_FOR_storewb_pairdf_di) ++ || (icode >= CODE_FOR_storewb_pairti_si ++ && icode <= CODE_FOR_storewb_pairtf_di)); ++ return true; ++ } + return false; + } + +diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi +index 0c6415a9c..3b6e90bf2 100644 +--- a/gcc/doc/tm.texi ++++ b/gcc/doc/tm.texi +@@ -12113,12 +12113,16 @@ object files that are not referenced from @code{main} and uses export + lists. + @end defmac + +-@deftypefn {Target Hook} bool TARGET_IS_LDP_INSN (int @var{icode}) +-Return true if icode is corresponding to any of the LDP instruction types. ++@deftypefn {Target Hook} bool TARGET_IS_LDP_INSN (int @var{icode}, bool *@var{has_wb}) ++Return true if @var{icode} is corresponding to any of the LDP instruction ++types. If @var{has_wb} is not NULL then its value is set to true if LDP ++contains post-index or pre-index operation. + @end deftypefn + +-@deftypefn {Target Hook} bool TARGET_IS_STP_INSN (int @var{icode}) +-Return true if icode is corresponding to any of the STP instruction types. ++@deftypefn {Target Hook} bool TARGET_IS_STP_INSN (int @var{icode}, bool *@var{has_wb}) ++Return true if @var{icode} is corresponding to any of the STP instruction ++types. If @var{has_wb} is not NULL then its value is set to true if STP ++contains post-index or pre-index operation. 
+ @end deftypefn + + @deftypefn {Target Hook} bool TARGET_CANNOT_MODIFY_JUMPS_P (void) +diff --git a/gcc/sched-rgn.cc b/gcc/sched-rgn.cc +index b4df8bdc5..5f61de1c8 100644 +--- a/gcc/sched-rgn.cc ++++ b/gcc/sched-rgn.cc +@@ -3956,7 +3956,7 @@ make_pass_sched_fusion (gcc::context *ctxt) + + namespace { + +-/* Def-use analisys special functions implementation. */ ++/* Def-use analysis special functions implementation. */ + + static struct df_link * + get_defs (rtx_insn *insn, rtx reg) +@@ -4032,42 +4032,66 @@ const pass_data pass_data_split_complex_instructions = { + (TODO_df_verify | TODO_df_finish), /* Todo_flags_finish. */ + }; + ++/* Pass split_complex_instructions finds LOAD PAIR instructions (LDP) that can ++ be split into two LDR instructions. It splits only those LDP for which one ++ half of the requested memory is contained in the preceding STORE (STR/STP) ++ instruction whose base register has the same definition. This allows ++ to use hardware store-to-load forwarding mechanism and to get one half of ++ requested memory from the store queue of CPU. ++ ++ TODO: Add split of STP. ++ TODO: Add split of vector STP and LDP. */ + class pass_split_complex_instructions : public rtl_opt_pass + { + private: +- enum complex_instructions_t ++ enum mem_access_insn_t + { + UNDEFINED, + LDP, ++ /* LDP with post-index (see loadwb_pair in config/aarch64.md). */ ++ LDP_WB, ++ /* LDP that contains one destination register in RTL IR ++ (see movti_aarch64 in config/aarch64.md). */ + LDP_TI, + STP, ++ /* STP with pre-index (see storewb_pair in config/aarch64.md). */ ++ STP_WB, ++ /* STP that contains one source register in RTL IR ++ (see movti_aarch64 in config/aarch64.md). 
*/ ++ STP_TI, + STR + }; + +- void split_complex_insn (rtx_insn *insn); +- void split_ldp_ti (rtx_insn *insn); +- void split_ldp_with_offset (rtx_insn *ldp_insn); +- void split_simple_ldp (rtx_insn *ldp_insn); +- void split_ldp_stp (rtx_insn *insn); +- complex_instructions_t get_insn_type (rtx_insn *insn); +- +- basic_block bb; +- rtx_insn *insn; + std::set<rtx_insn *> dependent_stores_candidates; + std::set<rtx_insn *> ldp_to_split_list; +
View file
_service:tar_scm:0044-Port-maxmin-patch-to-GCC-12.patch
Added
@@ -0,0 +1,378 @@ +From a3013c074cd2ab5f71eb98a587a627f38c68656c Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com> +Date: Thu, 22 Feb 2024 17:07:24 +0800 +Subject: PATCH 12/18 Port maxmin patch to GCC 12 + +--- + gcc/config/aarch64/aarch64-simd.md | 256 ++++++++++++++++++++++++++ + gcc/config/aarch64/predicates.md | 19 ++ + gcc/testsuite/gcc.dg/combine-maxmin.c | 46 +++++ + 3 files changed, 321 insertions(+) + create mode 100755 gcc/testsuite/gcc.dg/combine-maxmin.c + +diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md +index 82f73805f..de92802f5 100644 +--- a/gcc/config/aarch64/aarch64-simd.md ++++ b/gcc/config/aarch64/aarch64-simd.md +@@ -1138,6 +1138,82 @@ + (set_attr "type" "neon_compare<q>,neon_shift_imm<q>") + ) + ++;; Simplify the extension with following truncation for shift+neg operation. ++ ++(define_insn_and_split "*aarch64_sshr_neg_v8hi" ++ (set (match_operand:V8HI 0 "register_operand" "=w") ++ (vec_concat:V8HI ++ (truncate:V4HI ++ (ashiftrt:V4SI ++ (neg:V4SI ++ (sign_extend:V4SI ++ (vec_select:V4HI ++ (match_operand:V8HI 1 "register_operand") ++ (match_operand:V8HI 3 "vect_par_cnst_lo_half")))) ++ (match_operand:V4SI 2 "maxmin_arith_shift_operand"))) ++ (truncate:V4HI ++ (ashiftrt:V4SI ++ (neg:V4SI ++ (sign_extend:V4SI ++ (vec_select:V4HI ++ (match_dup 1) ++ (match_operand:V8HI 4 "vect_par_cnst_hi_half")))) ++ (match_dup 2))))) ++ "TARGET_SIMD" ++ "#" ++ "&& true" ++ (set (match_operand:V8HI 0 "register_operand" "=w") ++ (ashiftrt:V8HI ++ (neg:V8HI ++ (match_operand:V8HI 1 "register_operand" "w")) ++ (match_operand:V8HI 2 "aarch64_simd_imm_minus_one"))) ++ { ++ /* Reduce the shift amount to smaller mode. 
*/ ++ int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands2, 0)) ++ - (GET_MODE_UNIT_BITSIZE (GET_MODE (operands2)) / 2); ++ operands2 = aarch64_simd_gen_const_vector_dup (V8HImode, val); ++ } ++ (set_attr "type" "multiple") ++) ++ ++;; The helper definition that allows combiner to use the previous pattern. ++ ++(define_insn_and_split "*aarch64_sshr_neg_tmpv8hi" ++ (set (match_operand:V8HI 0 "register_operand" "=w") ++ (vec_concat:V8HI ++ (truncate:V4HI ++ (ashiftrt:V4SI ++ (neg:V4SI ++ (match_operand:V4SI 1 "register_operand" "w")) ++ (match_operand:V4SI 2 "maxmin_arith_shift_operand"))) ++ (truncate:V4HI ++ (ashiftrt:V4SI ++ (neg:V4SI ++ (match_operand:V4SI 3 "register_operand" "w")) ++ (match_dup 2))))) ++ "TARGET_SIMD" ++ "#" ++ "&& true" ++ (set (match_operand:V4SI 1 "register_operand" "=w") ++ (ashiftrt:V4SI ++ (neg:V4SI ++ (match_dup 1)) ++ (match_operand:V4SI 2 "maxmin_arith_shift_operand"))) ++ (set (match_operand:V4SI 3 "register_operand" "=w") ++ (ashiftrt:V4SI ++ (neg:V4SI ++ (match_dup 3)) ++ (match_dup 2))) ++ (set (match_operand:V8HI 0 "register_operand" "=w") ++ (vec_concat:V8HI ++ (truncate:V4HI ++ (match_dup 1)) ++ (truncate:V4HI ++ (match_dup 3)))) ++ "" ++ (set_attr "type" "multiple") ++) ++ + (define_insn "*aarch64_simd_sra<mode>" + (set (match_operand:VDQ_I 0 "register_operand" "=w") + (plus:VDQ_I +@@ -1714,6 +1790,26 @@ + } + ) + ++(define_insn "vec_pack_trunc_shifted_<mode>" ++ (set (match_operand:<VNARROWQ2> 0 "register_operand" "=&w") ++ (vec_concat:<VNARROWQ2> ++ (truncate:<VNARROWQ> ++ (ashiftrt:VQN (match_operand:VQN 1 "register_operand" "w") ++ (match_operand:VQN 2 "half_size_operand" "w"))) ++ (truncate:<VNARROWQ> ++ (ashiftrt:VQN (match_operand:VQN 3 "register_operand" "w") ++ (match_operand:VQN 4 "half_size_operand" "w"))))) ++ "TARGET_SIMD" ++ { ++ if (BYTES_BIG_ENDIAN) ++ return "uzp2\\t%0.<V2ntype>, %3.<V2ntype>, %1.<V2ntype>"; ++ else ++ return "uzp2\\t%0.<V2ntype>, %1.<V2ntype>, %3.<V2ntype>"; ++ } ++ (set_attr "type" 
"neon_permute<q>") ++ (set_attr "length" "4") ++) ++ + (define_insn "aarch64_shrn<mode>_insn_le" + (set (match_operand:<VNARROWQ2> 0 "register_operand" "=w") + (vec_concat:<VNARROWQ2> +@@ -6652,6 +6748,166 @@ + (set_attr "type" "neon_tst<q>") + ) + ++;; Simplify the extension with following truncation for cmtst-like operation. ++ ++(define_insn_and_split "*aarch64_cmtst_arith_v8hi" ++ (set (match_operand:V8HI 0 "register_operand" "=w") ++ (vec_concat:V8HI ++ (plus:V4HI ++ (truncate:V4HI ++ (eq:V4SI ++ (sign_extend:V4SI ++ (vec_select:V4HI ++ (and:V8HI ++ (match_operand:V8HI 1 "register_operand") ++ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")) ++ (match_operand:V8HI 3 "vect_par_cnst_lo_half"))) ++ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))) ++ (match_operand:V4HI 5 "aarch64_simd_imm_minus_one")) ++ (plus:V4HI ++ (truncate:V4HI ++ (eq:V4SI ++ (sign_extend:V4SI ++ (vec_select:V4HI ++ (and:V8HI ++ (match_dup 1) ++ (match_dup 2)) ++ (match_operand:V8HI 6 "vect_par_cnst_hi_half"))) ++ (match_dup 4))) ++ (match_dup 5)))) ++ "TARGET_SIMD && !reload_completed" ++ "#" ++ "&& true" ++ (set (match_operand:V8HI 6 "register_operand" "=w") ++ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")) ++ (set (match_operand:V8HI 0 "register_operand" "=w") ++ (plus:V8HI ++ (eq:V8HI ++ (and:V8HI ++ (match_operand:V8HI 1 "register_operand" "w") ++ (match_dup 6)) ++ (match_operand:V8HI 4 "aarch64_simd_imm_zero")) ++ (match_operand:V8HI 5 "aarch64_simd_imm_minus_one"))) ++ { ++ if (can_create_pseudo_p ()) ++ { ++ int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands4, 0)); ++ operands4 = aarch64_simd_gen_const_vector_dup (V8HImode, val); ++ int val2 = INTVAL (CONST_VECTOR_ENCODED_ELT (operands5, 0)); ++ operands5 = aarch64_simd_gen_const_vector_dup (V8HImode, val2); ++ ++ operands6 = gen_reg_rtx (V8HImode); ++ } ++ else ++ FAIL; ++ } ++ (set_attr "type" "neon_tst_q") ++) ++ ++;; Three helper definitions that allow combiner to use the previous pattern. 
++ ++(define_insn_and_split "*aarch64_cmtst_arith_tmp_lo_v8hi" ++ (set (match_operand:V4SI 0 "register_operand" "=w") ++ (neg:V4SI ++ (eq:V4SI ++ (sign_extend:V4SI ++ (vec_select:V4HI ++ (and:V8HI ++ (match_operand:V8HI 1 "register_operand") ++ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")) ++ (match_operand:V8HI 3 "vect_par_cnst_lo_half"))) ++ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))
View file
_service:tar_scm:0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch
Added
@@ -0,0 +1,239 @@ +From 11da40d18e35219961226d40f11b0702b8649044 Mon Sep 17 00:00:00 2001 +From: Pronin Alexander 00812787 <pronin.alexander@huawei.com> +Date: Thu, 22 Feb 2024 17:13:27 +0800 +Subject: PATCH 13/18 Port moving minmask pattern to gimple to GCC 12 + +--- + gcc/common.opt | 4 + + gcc/match.pd | 104 ++++++++++++++++++++++++ + gcc/testsuite/gcc.dg/combine-maxmin-1.c | 15 ++++ + gcc/testsuite/gcc.dg/combine-maxmin-2.c | 14 ++++ + gcc/testsuite/gcc.dg/combine-maxmin.c | 19 +++-- + 5 files changed, 151 insertions(+), 5 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-1.c + create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-2.c + +diff --git a/gcc/common.opt b/gcc/common.opt +index 6c6fabb31..3a5004271 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1846,6 +1846,10 @@ fif-conversion-gimple + Common Var(flag_if_conversion_gimple) Optimization + Perform conversion of conditional jumps to branchless equivalents during gimple transformations. + ++fconvert-minmax ++Common Var(flag_convert_minmax) Optimization ++Convert saturating clipping to min max. ++ + fstack-reuse= + Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization + -fstack-reuse=all|named_vars|none Set stack reuse level for local variables. +diff --git a/gcc/match.pd b/gcc/match.pd +index 61866cb90..3a19e93b3 100644 +--- a/gcc/match.pd ++++ b/gcc/match.pd +@@ -8031,3 +8031,107 @@ and, + (plus:c@4 (op2:c @0 @1) + (plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1)))) + (if (single_use (@4) && single_use (@5))))) ++ ++/* MinMax pattern matching helpers. More info on the transformation below. */ ++ ++/* Match (a & 0b11..100..0) pattern. */ ++(match (minmax_cmp_arg @0 @1) ++ (bit_and @0 INTEGER_CST@1) ++ (if (wi::popcount (~wi::to_widest (@1) + 1) == 1))) ++ ++/* Match (inversed_sign_bit >> sign_bit_pos) pattern. ++ This statement is blocking for the transformation of unsigned integers. 
++ Do type check here to avoid unnecessary duplications. */ ++(match (minmax_sat_arg @0) ++ (rshift (negate @0) INTEGER_CST@1) ++ (if (!TYPE_UNSIGNED (TREE_TYPE (@0)) ++ && wi::eq_p (wi::to_widest (@1), TYPE_PRECISION (TREE_TYPE (@0)) - 1)))) ++ ++/* Transform ((x & ~mask) ? (-x)>>31 & mask : x) to (min (max (x, 0), mask)). ++ The matched pattern can be described as saturated clipping. ++ ++ The pattern supports truncation via both casts and bit_and. ++ Also there are patterns for possible inverted conditions. */ ++(if (flag_convert_minmax) ++/* Truncation via casts. Unfortunately convert? cannot be applied here ++ because convert and cond take different number of arguments. */ ++ (simplify ++ (convert ++ (cond ++ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? (minmax_sat_arg @0)) ++ (convert? @0))) ++ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type))) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ (simplify ++ (cond ++ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? (minmax_sat_arg @0)) ++ (convert? @0)) ++ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type))) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ ++ (simplify ++ (convert ++ (cond ++ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? @0) ++ (convert? (minmax_sat_arg @0)))) ++ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type))) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ (simplify ++ (cond ++ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? @0) ++ (convert? 
(minmax_sat_arg @0))) ++ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type))) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ ++ /* Truncation via bit_and with mask. Same concerns on convert? here. */ ++ (simplify ++ (convert ++ (cond ++ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2)) ++ (convert? @0))) ++ (if (wi::to_widest (@2) == ~wi::to_widest (@1)) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ (simplify ++ (cond ++ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2)) ++ (convert? @0)) ++ (if (wi::to_widest (@2) == ~wi::to_widest (@1)) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ ++ (simplify ++ (convert ++ (cond ++ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? @0) ++ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2)))) ++ (if (wi::to_widest (@2) == ~wi::to_widest (@1)) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ (simplify ++ (cond ++ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? @0) ++ (convert? 
(bit_and (minmax_sat_arg @0) INTEGER_CST@2))) ++ (if (wi::to_widest (@2) == ~wi::to_widest (@1)) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; })))))) +diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-1.c b/gcc/testsuite/gcc.dg/combine-maxmin-1.c +new file mode 100644 +index 000000000..859ff7df8 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/combine-maxmin-1.c +@@ -0,0 +1,15 @@ ++/* { dg-do compile { target aarch64-*-* } } */ ++/* { dg-options "-O3 -fconvert-minmax" } */ ++ ++#include <inttypes.h> ++ ++__attribute__((noinline)) ++void test (int32_t *restrict a, int32_t *restrict x) ++{ ++ for (int i = 0; i < 4; i++) ++ a[i] = ((((-x[i]) >> 31) ^ x[i]) ++ & (-((int32_t)((x[i] & (~((1 << 8)-1))) == 0)))) ^ ((-x[i]) >> 31); ++} ++ ++/* { dg-final { scan-assembler-not {smax\t} } } */ ++/* { dg-final { scan-assembler-not {smin\t} } } */ +diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-2.c b/gcc/testsuite/gcc.dg/combine-maxmin-2.c +new file mode 100644 +index 000000000..63d4d85b3 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/combine-maxmin-2.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile { target aarch64-*-* } } */ ++/* { dg-options "-O3 -fconvert-minmax" } */ ++ ++#include <inttypes.h> ++ ++__attribute__((noinline)) ++void test (int8_t *restrict a, int32_t *restrict x) ++{ ++ for (int i = 0; i < 8; i++) ++ a[i] = ((x[i] & ~((1 << 9)-1)) ? 
(-x[i])>>31 & ((1 << 9)-1) : x[i]); ++} ++ ++/* { dg-final { scan-assembler-times {smax\t} 4 } } */ ++/* { dg-final { scan-assembler-times {smin\t} 4 } } */ +diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c +index 06bce7029..a984fa560 100755 +--- a/gcc/testsuite/gcc.dg/combine-maxmin.c ++++ b/gcc/testsuite/gcc.dg/combine-maxmin.c +@@ -1,5 +1,5 @@ + /* { dg-do compile { target aarch64-*-* } } */ +-/* { dg-options "-O3 -fdump-rtl-combine-all" } */ ++/* { dg-options "-O3 -fconvert-minmax" } */ + + /* The test checks usage of smax/smin insns for clip evaluation and + * uzp1/uzp2 insns for vector element narrowing. It's inspired by +@@ -19,20 +19,26 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, + { + const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0; + for( int y = 0; y < height; y++ ) { ++ /* This loop is not being vectorized now. */
View file
_service:tar_scm:0046-Add-new-pattern-to-pass-the-maxmin-tests.patch
Added
@@ -0,0 +1,65 @@ +From dbcb2630c426c8dd2117b5ce625da8422dd8cd65 Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com> +Date: Thu, 22 Feb 2024 17:20:17 +0800 +Subject: PATCH 14/18 Add new pattern to pass the maxmin tests + +--- + gcc/match.pd | 24 ++++++++++++++++++++++++ + gcc/testsuite/gcc.dg/combine-maxmin.c | 2 +- + 2 files changed, 25 insertions(+), 1 deletion(-) + +diff --git a/gcc/match.pd b/gcc/match.pd +index 3a19e93b3..aee58e47b 100644 +--- a/gcc/match.pd ++++ b/gcc/match.pd +@@ -8038,6 +8038,10 @@ and, + (match (minmax_cmp_arg @0 @1) + (bit_and @0 INTEGER_CST@1) + (if (wi::popcount (~wi::to_widest (@1) + 1) == 1))) ++/* Match ((unsigned) a > 0b0..01..1) pattern. */ ++(match (minmax_cmp_arg1 @0 @1) ++ (gt @0 INTEGER_CST@1) ++ (if (wi::popcount (wi::to_widest (@1) + 1) == 1))) + + /* Match (inversed_sign_bit >> sign_bit_pos) pattern. + This statement is blocking for the transformation of unsigned integers. +@@ -8095,6 +8099,26 @@ and, + (convert (min (max @0 { integer_zero_node; }) + { mask; }))))) + ++ (simplify ++ (convert ++ (cond ++ (minmax_cmp_arg1 (convert? @0) INTEGER_CST@1) ++ (convert? (minmax_sat_arg @0)) ++ (convert? @0))) ++ (if (wi::geu_p (wi::to_widest (@1) + 1, TYPE_PRECISION (type))) ++ (with { tree mask = build_int_cst (integer_type_node, tree_to_shwi (@1)); } ++ (convert (min (max (convert:integer_type_node @0) { integer_zero_node; }) ++ { mask; }))))) ++ (simplify ++ (cond ++ (minmax_cmp_arg1 (convert? @0) INTEGER_CST@1) ++ (convert? (minmax_sat_arg @0)) ++ (convert? @0)) ++ (if (wi::geu_p (wi::to_widest (@1) + 1, TYPE_PRECISION (type))) ++ (with { tree mask = build_int_cst (integer_type_node, tree_to_shwi (@1)); } ++ (convert (min (max (convert:integer_type_node @0) { integer_zero_node; }) ++ { mask; }))))) ++ + /* Truncation via bit_and with mask. Same concerns on convert? here. 
*/ + (simplify + (convert +diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c +index a984fa560..5c0c9cc49 100755 +--- a/gcc/testsuite/gcc.dg/combine-maxmin.c ++++ b/gcc/testsuite/gcc.dg/combine-maxmin.c +@@ -52,4 +52,4 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, + /* { dg-final { scan-assembler-times {usmin\t} 6 } } */ + /* All of the vectorized patterns are expected to be matched. */ + /* { dg-final { scan-assembler-not {cmtst\t} } } */ +-/* { dg-final { scan-assembler-times {uzp1\t} 5 } } */ ++/* { dg-final { scan-assembler-times {uzp1\t} 2 } } */ +-- +2.33.0 +
View file
_service:tar_scm:0047-AES-Implement-AES-pattern-matching.patch
Added
@@ -0,0 +1,3968 @@ +From 53d321d2fe08f69a29527be157d4bcaaefea04ab Mon Sep 17 00:00:00 2001 +From: Pronin Alexander 00812787 <pronin.alexander@huawei.com> +Date: Wed, 6 Dec 2023 10:46:28 +0300 +Subject: PATCH 15/18 AES Implement AES pattern matching + +--- + gcc/Makefile.in | 1 + + gcc/common.opt | 4 + + gcc/config/aarch64/aarch64.cc | 24 + + gcc/crypto-accel.cc | 2415 +++++++++++++++++ + gcc/doc/tm.texi | 29 + + gcc/doc/tm.texi.in | 12 + + gcc/passes.def | 1 + + gcc/rtl-matcher.h | 367 +++ + gcc/target.def | 41 + + .../gcc.target/aarch64/aes-decrypt.c | 478 ++++ + .../gcc.target/aarch64/aes-encrypt.c | 443 +++ + gcc/timevar.def | 1 + + gcc/tree-pass.h | 1 + + 13 files changed, 3817 insertions(+) + create mode 100644 gcc/crypto-accel.cc + create mode 100644 gcc/rtl-matcher.h + create mode 100644 gcc/testsuite/gcc.target/aarch64/aes-decrypt.c + create mode 100644 gcc/testsuite/gcc.target/aarch64/aes-encrypt.c + +diff --git a/gcc/Makefile.in b/gcc/Makefile.in +index 45705c1f3..876000bda 100644 +--- a/gcc/Makefile.in ++++ b/gcc/Makefile.in +@@ -1332,6 +1332,7 @@ OBJS = \ + cgraphunit.o \ + cgraphclones.o \ + combine.o \ ++ crypto-accel.o \ + combine-stack-adj.o \ + compare-elim.o \ + context.o \ +diff --git a/gcc/common.opt b/gcc/common.opt +index 3a5004271..1eb62ada5 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1129,6 +1129,10 @@ Common Var(flag_array_widen_compare) Optimization + Extends types for pointers to arrays to improve array comparsion performance. + In some extreme situations this may result in unsafe behavior. + ++fcrypto-accel-aes ++Common Var(flag_crypto_accel_aes) Init(0) Optimization ++Perform crypto acceleration AES pattern matching. ++ + fauto-inc-dec + Common Var(flag_auto_inc_dec) Init(1) Optimization + Generate auto-inc/dec instructions. 
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc +index fa566dd80..9171d9d56 100644 +--- a/gcc/config/aarch64/aarch64.cc ++++ b/gcc/config/aarch64/aarch64.cc +@@ -27569,6 +27569,30 @@ is_aarch64_stp_insn (int icode, bool *has_wb) + #undef TARGET_IS_STP_INSN + #define TARGET_IS_STP_INSN is_aarch64_stp_insn + ++machine_mode ++aarch64_get_v16qi_mode () ++{ ++ return V16QImode; ++} ++ ++#undef TARGET_GET_V16QI_MODE ++#define TARGET_GET_V16QI_MODE aarch64_get_v16qi_mode ++ ++#undef TARGET_GEN_REV32V16QI ++#define TARGET_GEN_REV32V16QI gen_aarch64_rev32v16qi ++ ++#undef TARGET_GEN_AESEV16QI ++#define TARGET_GEN_AESEV16QI gen_aarch64_crypto_aesev16qi ++ ++#undef TARGET_GEN_AESDV16QI ++#define TARGET_GEN_AESDV16QI gen_aarch64_crypto_aesdv16qi ++ ++#undef TARGET_GEN_AESMCV16QI ++#define TARGET_GEN_AESMCV16QI gen_aarch64_crypto_aesmcv16qi ++ ++#undef TARGET_GEN_AESIMCV16QI ++#define TARGET_GEN_AESIMCV16QI gen_aarch64_crypto_aesimcv16qi ++ + #undef TARGET_STACK_PROTECT_GUARD + #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard + +diff --git a/gcc/crypto-accel.cc b/gcc/crypto-accel.cc +new file mode 100644 +index 000000000..f4e810a6b +--- /dev/null ++++ b/gcc/crypto-accel.cc +@@ -0,0 +1,2415 @@ ++/* Crypto-pattern optimizer. ++ Copyright (C) 2003-2023 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free ++Software Foundation; either version 3, or (at your option) any later ++version. ++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ANY ++WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++<http://www.gnu.org/licenses/>. 
*/ ++ ++#include "config.h" ++#define INCLUDE_VECTOR ++#define INCLUDE_MAP ++#define INCLUDE_SET ++#define INCLUDE_ALGORITHM ++#include "system.h" ++#include "coretypes.h" ++#include "backend.h" ++#include "target.h" ++#include "rtl.h" ++#include "tree.h" ++#include "df.h" ++#include "memmodel.h" ++#include "optabs.h" ++#include "regs.h" ++#include "emit-rtl.h" ++#include "recog.h" ++#include "cfgrtl.h" ++#include "cfgcleanup.h" ++#include "expr.h" ++#include "tree-pass.h" ++#include "rtl-matcher.h" ++ ++/* Basic AES table descryption. */ ++struct aes_table ++{ ++ /* Number of elements per table. */ ++ static const unsigned int table_nelts = 256; ++ /* Number of tables. */ ++ static const unsigned int basic_tables_num = 4; ++ /* Number of rounds. */ ++ static const unsigned int rounds_num = 4; ++ /* Common ID for wrong table. */ ++ static const unsigned int BAD_TABLE = -1; ++ ++ typedef const unsigned int table_type[table_nelts]; ++ typedef table_type *table_map[basic_tables_num]; ++ ++ template<typename T> ++ static bool is_basic_table (tree ctor, const T ethalon[table_nelts]) ++ { ++ if (TREE_CODE (ctor) != CONSTRUCTOR ++ ||CONSTRUCTOR_NELTS (ctor) != table_nelts) ++ return false; ++ ++ unsigned ix; ++ tree val; ++ FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (ctor), ix, val) ++ if (TREE_INT_CST_LOW (val) != ethalon[ix]) ++ return false; ++ return true; ++ } ++ ++ static unsigned check_table (tree ctor, ++ table_map tables) ++ { ++ for (unsigned i = 0; i < 4; ++i) ++ if (is_basic_table (ctor, *tables[i])) ++ return i; ++ return BAD_TABLE; ++ } ++}; ++ ++/* AES encryption info. 
*/ ++struct aes_encrypt_table : aes_table ++{ ++ typedef enum ++ { ++ TE0, ++ TE1, ++ TE2, ++ TE3, ++ BAD_TABLE = aes_table::BAD_TABLE ++ } table_entry; ++ ++ static table_type Te0; ++ static table_type Te1; ++ static table_type Te2; ++ static table_type Te3; ++ ++ static table_map tables; ++ static table_entry rounds[rounds_num]; ++ static table_entry final_rounds[rounds_num]; ++ ++ static table_entry get_table_id (tree ctor) ++ { ++ return static_cast<table_entry> (check_table (ctor, tables));
View file
_service:tar_scm:0048-crypto-accel-add-optimization-level-requirement-to-t.patch
Added
@@ -0,0 +1,27 @@ +From 915d549b03c10ab403538888149facd417a02ebc Mon Sep 17 00:00:00 2001 +From: vchernon <chernonog.vyacheslav@huawei.com> +Date: Wed, 27 Dec 2023 23:31:26 +0800 +Subject: PATCH 16/18 crypto-accel add optimization level requirement to + the gate + +fix issue (src-openEuler/gcc: I8RRDW) +--- + gcc/crypto-accel.cc | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/gcc/crypto-accel.cc b/gcc/crypto-accel.cc +index f4e810a6b..e7766a585 100644 +--- a/gcc/crypto-accel.cc ++++ b/gcc/crypto-accel.cc +@@ -2391,7 +2391,7 @@ public: + /* opt_pass methods: */ + virtual bool gate (function *) + { +- if (flag_crypto_accel_aes <= 0) ++ if (flag_crypto_accel_aes <= 0 || optimize < 1) + return false; + return targetm.get_v16qi_mode + && targetm.gen_rev32v16qi +-- +2.33.0 +
View file
_service:tar_scm:0049-Add-more-flexible-check-for-pointer-aliasing-during-.patch
Added
@@ -0,0 +1,239 @@ +From b5865aef36ebaac87ae30d51f08bfe081795ed67 Mon Sep 17 00:00:00 2001 +From: Chernonog Viacheslav <chernonog.vyacheslav@huawei.com> +Date: Tue, 12 Mar 2024 23:30:56 +0800 +Subject: PATCH 17/18 Add more flexible check for pointer aliasing during + vectorization It takes minimum between number of iteration and segment length + it helps to speed up loops with small number of iterations when only tail can + be vectorized + +--- + gcc/params.opt | 5 ++ + .../sve/var_stride_flexible_segment_len_1.c | 23 +++++++ + gcc/tree-data-ref.cc | 67 +++++++++++++------ + gcc/tree-data-ref.h | 11 ++- + gcc/tree-vect-data-refs.cc | 14 +++- + 5 files changed, 95 insertions(+), 25 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c + +diff --git a/gcc/params.opt b/gcc/params.opt +index 6176d4790..7e5c119cf 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -1180,6 +1180,11 @@ Maximum number of loop peels to enhance alignment of data references in a loop. + Common Joined UInteger Var(param_vect_max_version_for_alias_checks) Init(10) Param Optimization + Bound on number of runtime checks inserted by the vectorizer's loop versioning for alias check. + ++-param=vect-alias-flexible-segment-len= ++Common Joined UInteger Var(param_flexible_seg_len) Init(0) IntegerRange(0, 1) Param Optimization ++Use a minimum length of different segments. Currenlty the minimum between ++iteration number and vectorization length is chosen by this param. ++ + -param=vect-max-version-for-alignment-checks= + Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization + Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check. 
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c +new file mode 100644 +index 000000000..894f075f3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c +@@ -0,0 +1,23 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize --param=vect-alias-flexible-segment-len=1" } */ ++ ++#define TYPE int ++#define SIZE 257 ++ ++void __attribute__ ((weak)) ++f (TYPE *x, TYPE *y, unsigned short n, long m __attribute__((unused))) ++{ ++ for (int i = 0; i < SIZE; ++i) ++ xi * n += yi * n; ++} ++ ++/* { dg-final { scan-assembler {\tld1w\tz0-9+} } } */ ++/* { dg-final { scan-assembler {\tst1w\tz0-9+} } } */ ++/* { dg-final { scan-assembler {\tldr\tw0-9+} } } */ ++/* { dg-final { scan-assembler {\tstr\tw0-9+} } } */ ++/* Should use a WAR check that multiplies by (VF-2)*4 rather than ++ an overlap check that multiplies by (257-1)*4. */ ++/* { dg-final { scan-assembler {\tcntb\t(x0-9+)\n.*\tsub\tx0-9+, \1, #8\n.*\tmul\tx0-9+,^\n*\1} } } */ ++/* One range check and a check for n being zero. */ ++/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 2 } } */ ++/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */ +diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc +index 397792c35..e6ae9e847 100644 +--- a/gcc/tree-data-ref.cc ++++ b/gcc/tree-data-ref.cc +@@ -2329,31 +2329,15 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr, + same arguments. Try to optimize cases in which the second access + is a write and in which some overlap is valid. 
*/ + +-static bool +-create_waw_or_war_checks (tree *cond_expr, ++static void ++create_waw_or_war_checks2 (tree *cond_expr, tree seg_len_a, + const dr_with_seg_len_pair_t &alias_pair) + { + const dr_with_seg_len& dr_a = alias_pair.first; + const dr_with_seg_len& dr_b = alias_pair.second; + +- /* Check for cases in which: +- +- (a) DR_B is always a write; +- (b) the accesses are well-ordered in both the original and new code +- (see the comment above the DR_ALIAS_* flags for details); and +- (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */ +- if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW)) +- return false; +- +- /* Check for equal (but possibly variable) steps. */ + tree step = DR_STEP (dr_a.dr); +- if (!operand_equal_p (step, DR_STEP (dr_b.dr))) +- return false; +- +- /* Make sure that we can operate on sizetype without loss of precision. */ + tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr)); +- if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype)) +- return false; + + /* All addresses involved are known to have a common alignment ALIGN. + We can therefore subtract ALIGN from an exclusive endpoint to get +@@ -2370,9 +2354,6 @@ create_waw_or_war_checks (tree *cond_expr, + fold_convert (ssizetype, indicator), + ssize_int (0)); + +- /* Get lengths in sizetype. */ +- tree seg_len_a +- = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len)); + step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step)); + + /* Each access has the following pattern: +@@ -2479,6 +2460,50 @@ create_waw_or_war_checks (tree *cond_expr, + *cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit); + if (dump_enabled_p ()) + dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n"); ++} ++ ++/* This is a wrapper function for create_waw_or_war_checks2. 
*/ ++static bool ++create_waw_or_war_checks (tree *cond_expr, ++ const dr_with_seg_len_pair_t &alias_pair) ++{ ++ const dr_with_seg_len& dr_a = alias_pair.first; ++ const dr_with_seg_len& dr_b = alias_pair.second; ++ ++ /* Check for cases in which: ++ ++ (a) DR_B is always a write; ++ (b) the accesses are well-ordered in both the original and new code ++ (see the comment above the DR_ALIAS_* flags for details); and ++ (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */ ++ if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW)) ++ return false; ++ ++ /* Check for equal (but possibly variable) steps. */ ++ tree step = DR_STEP (dr_a.dr); ++ if (!operand_equal_p (step, DR_STEP (dr_b.dr))) ++ return false; ++ ++ /* Make sure that we can operate on sizetype without loss of precision. */ ++ tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr)); ++ if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype)) ++ return false; ++ ++ /* Get lengths in sizetype. */ ++ tree seg_len_a ++ = fold_convert (sizetype, ++ rewrite_to_non_trapping_overflow (dr_a.seg_len)); ++ create_waw_or_war_checks2 (cond_expr, seg_len_a, alias_pair); ++ if (param_flexible_seg_len && dr_a.seg_len != dr_a.seg_len2) ++ { ++ tree seg_len2_a ++ = fold_convert (sizetype, ++ rewrite_to_non_trapping_overflow (dr_a.seg_len2)); ++ tree cond_expr2; ++ create_waw_or_war_checks2 (&cond_expr2, seg_len2_a, alias_pair); ++ *cond_expr = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, ++ *cond_expr, cond_expr2); ++ } + return true; + } + +diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h +index f643a95b2..9bc5f16ee 100644 +--- a/gcc/tree-data-ref.h ++++ b/gcc/tree-data-ref.h +@@ -213,12 +213,19 @@ class dr_with_seg_len + public: + dr_with_seg_len (data_reference_p d, tree len, unsigned HOST_WIDE_INT size, + unsigned int a) +- : dr (d), seg_len (len), access_size (size), align (a) {} +- ++ : dr (d), seg_len (len), seg_len2 (len), access_size (size), align (a) ++ {} ++ dr_with_seg_len 
(data_reference_p d, tree len, tree len2, ++ unsigned HOST_WIDE_INT size, unsigned int a) ++ : dr (d), seg_len (len), seg_len2 (len2), access_size (size), align (a) ++ {} + data_reference_p dr; + /* The offset of the last access that needs to be checked minus + the offset of the first. */ + tree seg_len; ++ /* The second version of segment length. Currently this is used to ++ soften checks for a small number of iterations. */ ++ tree seg_len2; + /* A value that, when added to abs (SEG_LEN), gives the total number of + bytes in the segment. */ + poly_uint64 access_size; +diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc +index 4e615b80b..04e68f621 100644 +--- a/gcc/tree-vect-data-refs.cc ++++ b/gcc/tree-vect-data-refs.cc +@@ -3646,6 +3646,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) + { + poly_uint64 lower_bound; + tree segment_length_a, segment_length_b; ++ tree segment_length2_a, segment_length2_b; + unsigned HOST_WIDE_INT access_size_a, access_size_b; + unsigned int align_a, align_b; +
View file
_service:tar_scm:0050-Port-IPA-prefetch-to-GCC-12.patch
Added
@@ -0,0 +1,2071 @@ +From 7ee50ce44c652e21ca8ad33dc4e175f02b51b072 Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com> +Date: Fri, 8 Mar 2024 06:50:39 +0800 +Subject: PATCH 18/18 Port IPA prefetch to GCC 12 + +--- + gcc/Makefile.in | 1 + + gcc/cgraph.cc | 1 + + gcc/cgraph.h | 2 + + gcc/common.opt | 8 + + gcc/ipa-devirt.cc | 54 +- + gcc/ipa-prefetch.cc | 1819 +++++++++++++++++++++++++++++++++++++++++++ + gcc/ipa-sra.cc | 8 + + gcc/params.opt | 8 + + gcc/passes.def | 1 + + gcc/timevar.def | 1 + + gcc/tree-pass.h | 1 + + 11 files changed, 1902 insertions(+), 2 deletions(-) + create mode 100644 gcc/ipa-prefetch.cc + +diff --git a/gcc/Makefile.in b/gcc/Makefile.in +index 876000bda..10544e4a9 100644 +--- a/gcc/Makefile.in ++++ b/gcc/Makefile.in +@@ -1468,6 +1468,7 @@ OBJS = \ + ipa-modref.o \ + ipa-modref-tree.o \ + ipa-predicate.o \ ++ ipa-prefetch.o \ + ipa-profile.o \ + ipa-prop.o \ + ipa-param-manipulation.o \ +diff --git a/gcc/cgraph.cc b/gcc/cgraph.cc +index 3734c85db..7d738b891 100644 +--- a/gcc/cgraph.cc ++++ b/gcc/cgraph.cc +@@ -998,6 +998,7 @@ cgraph_node::create_indirect_edge (gcall *call_stmt, int ecf_flags, + edge->indirect_info = cgraph_allocate_init_indirect_info (); + edge->indirect_info->ecf_flags = ecf_flags; + edge->indirect_info->vptr_changed = true; ++ edge->indirect_info->targets = NULL; + + /* Record polymorphic call info. */ + if (!cloning_p +diff --git a/gcc/cgraph.h b/gcc/cgraph.h +index d96690326..b84ff2f98 100644 +--- a/gcc/cgraph.h ++++ b/gcc/cgraph.h +@@ -1659,6 +1659,8 @@ public: + int param_index; + /* ECF flags determined from the caller. */ + int ecf_flags; ++ /* Vector of potential call targets determined by analysis. */ ++ vec<cgraph_node *, va_gc_atomic> *targets; + + /* Number of speculative call targets, it's less than GCOV_TOPN_VALUES. 
*/ + unsigned num_speculative_call_targets : 16; +diff --git a/gcc/common.opt b/gcc/common.opt +index 1eb62ada5..e65a06af9 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1328,6 +1328,10 @@ fdevirtualize + Common Var(flag_devirtualize) Optimization + Try to convert virtual calls to direct ones. + ++fipa-ic ++Common Var(flag_ipa_ic) Optimization Init(0) ++Perform interprocedural analysis of indirect calls. ++ + ficp + Common Var(flag_icp) Optimization Init(0) + Try to promote indirect calls to direct ones. +@@ -2367,6 +2371,10 @@ fprefetch-loop-arrays + Common Var(flag_prefetch_loop_arrays) Init(-1) Optimization + Generate prefetch instructions, if available, for arrays in loops. + ++fipa-prefetch ++Common Var(flag_ipa_prefetch) Init(0) Optimization ++Generate prefetch instructions, if available, using IPA info. ++ + fprofile + Common Var(profile_flag) + Enable basic program profiling code. +diff --git a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc +index 318535d06..dd3562d56 100644 +--- a/gcc/ipa-devirt.cc ++++ b/gcc/ipa-devirt.cc +@@ -5758,6 +5758,54 @@ merge_fs_map_for_ftype_aliases () + } + } + ++/* Save results of indirect call analysis for the next passes. */ ++ ++static void ++save_analysis_results () ++{ ++ if (dump_file) ++ fprintf (dump_file, "\n\nSave results of indirect call analysis.\n"); ++ ++ struct cgraph_node *n; ++ FOR_EACH_FUNCTION (n) ++ { ++ cgraph_edge *e, *next; ++ for (e = n->indirect_calls; e; e = next) ++ { ++ next = e->next_callee; ++ if (e->indirect_info->polymorphic) ++ continue; ++ gcall *stmt = e->call_stmt; ++ gcc_assert (stmt != NULL); ++ tree call_fn = gimple_call_fn (stmt); ++ tree call_fn_ty = TREE_TYPE (call_fn); ++ if (!POINTER_TYPE_P (call_fn_ty)) ++ continue; ++ ++ tree ctype = TYPE_CANONICAL (TREE_TYPE (call_fn_ty)); ++ unsigned ctype_uid = ctype ? TYPE_UID (ctype) : 0; ++ if (!ctype_uid || unsafe_types->count (ctype_uid) ++ || !fs_map->count (ctype_uid)) ++ continue; ++ /* TODO: cleanup noninterposable aliases. 
*/ ++ decl_set *decls = (*fs_map)ctype_uid; ++ if (dump_file) ++ { ++ fprintf (dump_file, "For call "); ++ print_gimple_stmt (dump_file, stmt, 0); ++ } ++ vec_alloc (e->indirect_info->targets, decls->size ()); ++ for (decl_set::const_iterator it = decls->begin (); ++ it != decls->end (); it++) ++ { ++ struct cgraph_node *target = cgraph_node::get (*it); ++ /* TODO: maybe discard some targets. */ ++ e->indirect_info->targets->quick_push (target); ++ } ++ } ++ } ++} ++ + /* Dump function types with set of functions corresponding to it. */ + + static void +@@ -5822,6 +5870,8 @@ collect_function_signatures () + } + } + merge_fs_map_for_ftype_aliases (); ++ if (flag_ipa_ic) ++ save_analysis_results (); + if (dump_file) + dump_function_signature_sets (); + } +@@ -6217,7 +6267,7 @@ ipa_icp (void) + optimize indirect calls. */ + collect_function_type_aliases (); + collect_function_signatures (); +- bool optimized = optimize_indirect_calls (); ++ bool optimized = flag_icp ? optimize_indirect_calls () : false; + + remove_type_alias_map (ta_map); + remove_type_alias_map (fta_map); +@@ -6264,7 +6314,7 @@ public: + /* opt_pass methods: */ + virtual bool gate (function *) + { +- return (optimize && flag_icp && !seen_error () ++ return (optimize && (flag_icp || flag_ipa_ic) && !seen_error () + && (in_lto_p || flag_whole_program)); + } + +diff --git a/gcc/ipa-prefetch.cc b/gcc/ipa-prefetch.cc +new file mode 100644 +index 000000000..aeea51105 +--- /dev/null ++++ b/gcc/ipa-prefetch.cc +@@ -0,0 +1,1819 @@ ++/* IPA prefetch optimizations. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ Contributed by Ilia Diachkov. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free ++Software Foundation; either version 3, or (at your option) any later ++version. 
++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ANY ++WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++<http://www.gnu.org/licenses/>. */ ++ ++/* IPA prefetch is an interprocedural pass that detects cases of indirect ++ memory access potentially in loops and inserts prefetch instructions ++ to optimize cache usage during these indirect memory accesses. */ ++
View file
_service:tar_scm:0051-Port-fixes-for-IPA-prefetch-to-GCC-12.patch
Added
@@ -0,0 +1,2216 @@ +From 4c262af8e178ac7c81b32be5b159b4d09a5841c9 Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com> +Date: Fri, 8 Mar 2024 07:07:50 +0800 +Subject: PATCH 1/2 Port fixes for IPA prefetch to GCC 12 + +--- + gcc/ipa-devirt.cc | 9 +- + gcc/ipa-prefetch.cc | 174 +- + gcc/ipa-sra.cc | 7 + + gcc/params.opt | 4 +- + gcc/testsuite/gcc.dg/completion-1.c | 1 + + gcc/testsuite/gcc.dg/ipa/ipa-prefetch-xz.c | 1843 ++++++++++++++++++++ + 6 files changed, 1974 insertions(+), 64 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/ipa/ipa-prefetch-xz.c + +diff --git a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc +index dd3562d56..dd000b401 100644 +--- a/gcc/ipa-devirt.cc ++++ b/gcc/ipa-devirt.cc +@@ -5029,9 +5029,12 @@ analyze_assign_stmt (gimple *stmt) + } + else + { +- fprintf (dump_file, "\nUnsupported rhs type %s in assign stmt: ", +- get_tree_code_name (TREE_CODE (rhs))); +- print_gimple_stmt (dump_file, stmt, 0); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nUnsupported rhs type %s in assign stmt: ", ++ get_tree_code_name (TREE_CODE (rhs))); ++ print_gimple_stmt (dump_file, stmt, 0); ++ } + gcc_unreachable (); + } + } +diff --git a/gcc/ipa-prefetch.cc b/gcc/ipa-prefetch.cc +index aeea51105..9537e4835 100644 +--- a/gcc/ipa-prefetch.cc ++++ b/gcc/ipa-prefetch.cc +@@ -167,6 +167,7 @@ analyse_cgraph () + } + + /* TODO: maybe remove loop info here. */ ++ n->get_body (); + push_cfun (DECL_STRUCT_FUNCTION (n->decl)); + calculate_dominance_info (CDI_DOMINATORS); + loop_optimizer_init (LOOPS_NORMAL); +@@ -942,6 +943,9 @@ compare_memrefs (memref_t* mr, memref_t* mr2) + (*mr_candidate_map)mr = mr2; + return; + } ++ /* Probably we shouldn't leave nulls in the map. */ ++ if ((*mr_candidate_map)mr == NULL) ++ return; + /* TODO: support analysis with incrementation of different fields. 
*/ + if ((*mr_candidate_map)mr->offset != mr2->offset) + { +@@ -1090,6 +1094,15 @@ analyse_loops () + memref_t *mr = it->first, *mr2 = it->second; + if (mr2 == NULL || !(*fmrs_map)fn->count (mr)) + continue; ++ /* For now optimize only MRs that mem is MEM_REF. ++ TODO: support other MR types. */ ++ if (TREE_CODE (mr->mem) != MEM_REF) ++ { ++ if (dump_file) ++ fprintf (dump_file, "Skip MR %d: unsupported tree code = %s\n", ++ mr->mr_id, get_tree_code_name (TREE_CODE (mr->mem))); ++ continue; ++ } + if (!optimize_mrs_map->count (fn)) + (*optimize_mrs_map)fn = new memref_set; + (*optimize_mrs_map)fn->insert (mr); +@@ -1102,7 +1115,7 @@ analyse_loops () + it != (*optimize_mrs_map)fn->end (); it++) + { + memref_t *mr = *it, *mr2 = (*mr_candidate_map)mr; +- fprintf (dump_file, "MRs %d,%d with incremental offset ", ++ fprintf (dump_file, "MRs %d, %d with incremental offset ", + mr->mr_id, mr2->mr_id); + print_generic_expr (dump_file, mr2->offset); + fprintf (dump_file, "\n"); +@@ -1435,6 +1448,52 @@ remap_gimple_op_r (tree *tp, int *walk_subtrees, void *data) + return NULL_TREE; + } + ++/* Copy stmt and remap its operands. */ ++ ++static gimple * ++gimple_copy_and_remap (gimple *stmt) ++{ ++ gimple *copy = gimple_copy (stmt); ++ gcc_checking_assert (!is_gimple_debug (copy)); ++ ++ /* Remap all the operands in COPY. */ ++ struct walk_stmt_info wi; ++ memset (&wi, 0, sizeof (wi)); ++ wi.info = copy; ++ walk_gimple_op (copy, remap_gimple_op_r, &wi); ++ if (dump_file) ++ { ++ fprintf (dump_file, "Stmt copy after remap:\n"); ++ print_gimple_stmt (dump_file, copy, 0); ++ } ++ return copy; ++} ++ ++/* Copy and remap stmts listed in MR in reverse order to last_idx, skipping ++ processed ones. Insert new stmts to the sequence. 
*/ ++ ++static gimple * ++gimple_copy_and_remap_memref_stmts (memref_t *mr, gimple_seq &stmts, ++ int last_idx, stmt_set &processed) ++{ ++ gimple *last_stmt = NULL; ++ for (int i = mr->stmts.length () - 1; i >= last_idx ; i--) ++ { ++ if (processed.count (mr->stmtsi)) ++ continue; ++ processed.insert (mr->stmtsi); ++ if (dump_file) ++ { ++ fprintf (dump_file, "Copy stmt %d from used MR (%d):\n", ++ i, mr->mr_id); ++ print_gimple_stmt (dump_file, mr->stmtsi, 0); ++ } ++ last_stmt = gimple_copy_and_remap (mr->stmtsi); ++ gimple_seq_add_stmt (&stmts, last_stmt); ++ } ++ return last_stmt; ++} ++ + static void + create_cgraph_edge (cgraph_node *n, gimple *stmt) + { +@@ -1490,6 +1549,13 @@ optimize_function (cgraph_node *n, function *fn) + "Skip the case.\n"); + return 0; + } ++ if (!tree_fits_shwi_p (inc_mr->step)) ++ { ++ if (dump_file) ++ fprintf (dump_file, "Cannot represent incremental MR's step as " ++ "integer. Skip the case.\n"); ++ return 0; ++ } + if (dump_file && !used_mrs.empty ()) + print_mrs_ids (used_mrs, "Common list of used mrs:\n"); + +@@ -1539,16 +1605,44 @@ optimize_function (cgraph_node *n, function *fn) + return 0; + } + else if (dump_file) +- fprintf (dump_file, "Dominator bb %d for MRs\n", dom_bb->index); ++ { ++ fprintf (dump_file, "Dominator bb %d for MRs:\n", dom_bb->index); ++ gimple_dump_bb (dump_file, dom_bb, 0, dump_flags); ++ fprintf (dump_file, "\n"); ++ } + +- split_block (dom_bb, (gimple *) NULL); ++ /* Try to find comp_mr's stmt in the dominator bb. */ ++ gimple *last_used = NULL; ++ for (gimple_stmt_iterator si = gsi_last_bb (dom_bb); !gsi_end_p (si); ++ gsi_prev (&si)) ++ if (comp_mr->stmts0 == gsi_stmt (si)) ++ { ++ last_used = gsi_stmt (si); ++ if (dump_file) ++ { ++ fprintf (dump_file, "Last used stmt in dominator bb:\n"); ++ print_gimple_stmt (dump_file, last_used, 0); ++ } ++ break; ++ } ++ ++ split_block (dom_bb, last_used); + gimple_stmt_iterator gsi = gsi_last_bb (dom_bb); + + /* Create new inc var. 
Insert new_var = old_var + step * factor. */ + decl_map = new tree_map; + gcc_assert (comp_mr->stmts0 && gimple_assign_single_p (comp_mr->stmts0)); + tree inc_var = gimple_assign_lhs (comp_mr->stmts0); ++ /* If old_var definition dominates the current use, just use it, otherwise ++ evaluate it just before new inc var evaluation. */ + gimple_seq stmts = NULL; ++ stmt_set processed_stmts; ++ if (!dominated_by_p (CDI_DOMINATORS, dom_bb, gimple_bb (comp_mr->stmts0))) ++ { ++ gimple *tmp = gimple_copy_and_remap_memref_stmts (comp_mr, stmts, 0, ++ processed_stmts); ++ inc_var = gimple_assign_lhs (tmp); ++ } + tree var_type = TREE_TYPE (inc_var); + enum tree_code inc_code; + if (TREE_CODE (var_type) == POINTER_TYPE) +@@ -1556,52 +1650,28 @@ optimize_function (cgraph_node *n, function *fn) + else + inc_code = PLUS_EXPR;
View file
_service:tar_scm:0052-Fix-fails-in-IPA-prefetch-src-openEuler-gcc-I96ID7.patch
Added
@@ -0,0 +1,94 @@ +From 0263daa1312d0cdcdf9c770bcf5d982a2d4fc16b Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com> +Date: Fri, 29 Mar 2024 17:15:41 +0800 +Subject: PATCH 2/2 Fix fails in IPA prefetch (src-openEuler/gcc: I96ID7) + +--- + gcc/ipa-prefetch.cc | 28 ++++++++++++++++++++++++++-- + 1 file changed, 26 insertions(+), 2 deletions(-) + +diff --git a/gcc/ipa-prefetch.cc b/gcc/ipa-prefetch.cc +index 9537e4835..1ceb5137f 100644 +--- a/gcc/ipa-prefetch.cc ++++ b/gcc/ipa-prefetch.cc +@@ -366,6 +366,7 @@ typedef std::map<memref_t *, memref_t *> memref_map; + typedef std::map<memref_t *, tree> memref_tree_map; + + typedef std::set<gimple *> stmt_set; ++typedef std::set<tree> tree_set; + typedef std::map<tree, tree> tree_map; + + tree_memref_map *tm_map; +@@ -1124,8 +1125,21 @@ analyse_loops () + } + } + ++/* Compare memrefs by IDs; helper for qsort. */ ++ ++static int ++memref_id_cmp (const void *p1, const void *p2) ++{ ++ const memref_t *mr1 = *(const memref_t **) p1; ++ const memref_t *mr2 = *(const memref_t **) p2; ++ ++ if ((unsigned) mr1->mr_id > (unsigned) mr2->mr_id) ++ return 1; ++ return -1; ++} ++ + /* Reduce the set filtering out memrefs with the same memory references, +- return the result vector of memrefs. */ ++ sort and return the result vector of memrefs. */ + + static void + reduce_memref_set (memref_set *set, vec<memref_t *> &vec) +@@ -1162,6 +1176,7 @@ reduce_memref_set (memref_set *set, vec<memref_t *> &vec) + vec.safe_push (mr1); + } + } ++ vec.qsort (memref_id_cmp); + if (dump_file) + { + fprintf (dump_file, "MRs (%d) after filtering: ", vec.length ()); +@@ -1663,10 +1678,15 @@ optimize_function (cgraph_node *n, function *fn) + } + + /* Create other new vars. Insert new stmts. 
*/ ++ vec<memref_t *> used_mr_vec = vNULL; + for (memref_set::const_iterator it = used_mrs.begin (); + it != used_mrs.end (); it++) ++ used_mr_vec.safe_push (*it); ++ used_mr_vec.qsort (memref_id_cmp); ++ ++ for (unsigned int j = 0; j < used_mr_vec.length (); j++) + { +- memref_t *mr = *it; ++ memref_t *mr = used_mr_vecj; + if (mr == comp_mr) + continue; + gimple *last_stmt = gimple_copy_and_remap_memref_stmts (mr, stmts, 0, +@@ -1702,6 +1722,7 @@ optimize_function (cgraph_node *n, function *fn) + local = integer_three_node; + break; + } ++ tree_set prefetched_addrs; + for (unsigned int j = 0; j < vmrs.length (); j++) + { + memref_t *mr = vmrsj; +@@ -1714,10 +1735,13 @@ optimize_function (cgraph_node *n, function *fn) + tree addr = get_mem_ref_address_ssa_name (mr->mem, NULL_TREE); + if (decl_map->count (addr)) + addr = (*decl_map)addr; ++ if (prefetched_addrs.count (addr)) ++ continue; + last_stmt = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), + 3, addr, write_p, local); + pcalls.safe_push (last_stmt); + gimple_seq_add_stmt (&stmts, last_stmt); ++ prefetched_addrs.insert (addr); + if (dump_file) + { + fprintf (dump_file, "Insert %d prefetch stmt:\n", j); +-- +2.33.0 +
View file
_service:tar_scm:0053-struct-reorg-Add-Semi-Relayout.patch
Added
@@ -0,0 +1,1366 @@ +From c2a0dcc565e0f6274f26644bd389337db8f2940c Mon Sep 17 00:00:00 2001 +From: tiancheng-bao <baotiancheng1@huawei.com> +Date: Sat, 30 Mar 2024 11:04:23 +0800 +Subject: PATCH struct-reorg Add Semi Relayout + +--- + gcc/common.opt | 6 +- + gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 975 +++++++++++++++++- + gcc/ipa-struct-reorg/ipa-struct-reorg.h | 8 + + gcc/params.opt | 5 + + .../gcc.dg/struct/semi_relayout_rewrite.c | 86 ++ + gcc/testsuite/gcc.dg/struct/struct-reorg.exp | 4 + + 6 files changed, 1040 insertions(+), 44 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/struct/semi_relayout_rewrite.c + +diff --git a/gcc/common.opt b/gcc/common.opt +index 38f1e457d..9484df5ad 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -2010,9 +2010,9 @@ Common Var(flag_ipa_struct_reorg) Init(0) Optimization + Perform structure layout optimizations. + + fipa-struct-reorg= +-Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 5) +--fipa-struct-reorg=0,1,2,3,4,5 adding none, struct-reorg, reorder-fields, +-dfe, safe-pointer-compression, unsafe-pointer-compression optimizations. ++Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 6) ++-fipa-struct-reorg=0,1,2,3,4,5,6 adding none, struct-reorg, reorder-fields, ++dfe, safe-pointer-compression, unsafe-pointer-compression, semi-relayout optimizations. 
+ + fipa-vrp + Common Var(flag_ipa_vrp) Optimization +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc +index 3922873f3..6a202b4bd 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc +@@ -294,7 +294,8 @@ enum struct_layout_opt_level + STRUCT_REORDER_FIELDS = 1 << 2, + DEAD_FIELD_ELIMINATION = 1 << 3, + POINTER_COMPRESSION_SAFE = 1 << 4, +- POINTER_COMPRESSION_UNSAFE = 1 << 5 ++ POINTER_COMPRESSION_UNSAFE = 1 << 5, ++ SEMI_RELAYOUT = 1 << 6 + }; + + /* Defines the target pointer size of compressed pointer, which should be 8, +@@ -308,6 +309,7 @@ void get_base (tree &base, tree expr); + + static unsigned int current_layout_opt_level; + hash_map<tree, tree> replace_type_map; ++hash_map<tree, tree> semi_relayout_map; + + /* Return true if one of these types is created by struct-reorg. */ + +@@ -426,7 +428,9 @@ srtype::srtype (tree type) + visited (false), + pc_candidate (false), + has_legal_alloc_num (false), +- has_alloc_array (0) ++ has_alloc_array (0), ++ semi_relayout (false), ++ bucket_parts (0) + { + for (int i = 0; i < max_split; i++) + newtypei = NULL_TREE; +@@ -891,6 +895,66 @@ srfield::create_new_reorder_fields (tree newtypemax_split, + newfield0 = field; + } + ++/* Given a struct s whose fields has already reordered by size, we try to ++ combine fields less than 8 bytes together to 8 bytes. Example: ++ struct s { ++ uint64_t a, ++ uint32_t b, ++ uint32_t c, ++ uint32_t d, ++ uint16_t e, ++ uint8_t f ++ } ++ ++ We allocate memory for arrays of struct S, before semi-relayout, their ++ layout in memory is shown as below: ++ a,b,c,d,e,f,padding;a,b,c,d,e,f,padding;... ++ ++ During semi-relayout, we put a number of structs into a same region called ++ bucket. The number is determined by param realyout-bucket-capacity-level. ++ Using 1024 here as example. After semi-relayout, the layout in a bucket is ++ shown as below: ++ part1 a;a;a... ++ part2 b,c;b,c;b,c;... 
++ part3 d,e,f,pad;d,e,f,pad;d,e,f,pad;... ++ ++ In the last bucket, if the amount of rest structs is less than the capacity ++ of a bucket, the rest of allcated memory will be wasted as padding. */ ++ ++unsigned ++srtype::calculate_bucket_size () ++{ ++ unsigned parts = 0; ++ unsigned bit_sum = 0; ++ unsigned relayout_offset = 0; ++ /* Currently, limit each 8 bytes with less than 2 fields. */ ++ unsigned curr_part_num = 0; ++ unsigned field_num = 0; ++ for (tree f = TYPE_FIELDS (newtype0); f; f = DECL_CHAIN (f)) ++ { ++ unsigned size = TYPE_PRECISION (TREE_TYPE (f)); ++ bit_sum += size; ++ field_num++; ++ if (++curr_part_num > 2 || bit_sum > 64) ++ { ++ bit_sum = size; ++ parts++; ++ relayout_offset = relayout_part_size * parts; ++ curr_part_num = 1; ++ } ++ else ++ { ++ relayout_offset = relayout_part_size * parts + (bit_sum - size) / 8; ++ } ++ new_field_offsets.put (f, relayout_offset); ++ } ++ /* Donnot relayout a struct with only one field after DFE. */ ++ if (field_num == 1) ++ return 0; ++ bucket_parts = ++parts; ++ return parts * relayout_part_size; ++} ++ + /* Create the new TYPE corresponding to THIS type. 
*/ + + bool +@@ -1001,6 +1065,15 @@ srtype::create_new_type (void) + if (pc_candidate && pc_gptr == NULL_TREE) + create_global_ptr_for_pc (); + ++ if (semi_relayout) ++ { ++ bucket_size = calculate_bucket_size (); ++ if (bucket_size == 0) ++ return false; ++ if (semi_relayout_map.get (this->newtype0) == NULL) ++ semi_relayout_map.put (this->newtype0, this->type); ++ } ++ + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Created %d types:\n", maxclusters); +@@ -1393,7 +1466,7 @@ public: + bool should_create = false, bool can_escape = false); + bool wholeaccess (tree expr, tree base, tree accesstype, srtype *t); + +- void check_alloc_num (gimple *stmt, srtype *type); ++ void check_alloc_num (gimple *stmt, srtype *type, bool ptrptr); + void check_definition_assign (srdecl *decl, vec<srdecl *> &worklist); + void check_definition_call (srdecl *decl, vec<srdecl *> &worklist); + void check_definition (srdecl *decl, vec<srdecl *> &); +@@ -1442,6 +1515,33 @@ public: + tree &); + basic_block create_bb_for_compress_nullptr (basic_block, tree &); + basic_block create_bb_for_decompress_nullptr (basic_block, tree, tree &); ++ ++ // Semi-relayout methods: ++ bool is_semi_relayout_candidate (tree); ++ srtype *get_semi_relayout_candidate_type (tree); ++ void check_and_prune_struct_for_semi_relayout (void); ++ tree rewrite_pointer_diff (gimple_stmt_iterator *, tree, tree, srtype *); ++ tree rewrite_pointer_plus_integer (gimple *, gimple_stmt_iterator *, tree, ++ tree, srtype *); ++ tree build_div_expr (gimple_stmt_iterator *, tree, tree); ++ tree get_true_pointer_base (gimple_stmt_iterator *, tree, srtype *); ++ tree get_real_allocated_ptr (tree, gimple_stmt_iterator *); ++ tree set_ptr_for_use (tree, gimple_stmt_iterator *); ++ void record_allocated_size (tree, gimple_stmt_iterator *, tree); ++ tree read_allocated_size (tree, gimple_stmt_iterator *); ++ gimple *create_aligned_alloc (gimple_stmt_iterator *, srtype *, tree, ++ tree &); ++ void 
create_memset_zero (tree, gimple_stmt_iterator *, tree); ++ void create_memcpy (tree, tree, tree, gimple_stmt_iterator *); ++ void create_free (tree, gimple_stmt_iterator *); ++ void copy_to_lhs (tree, tree, gimple_stmt_iterator *); ++ srtype *get_relayout_candidate_type (tree); ++ long unsigned int get_true_field_offset (srfield *, srtype *); ++ tree rewrite_address (tree, srfield *, srtype *, gimple_stmt_iterator *); ++ bool check_sr_copy (gimple *); ++ void relayout_field_copy (gimple_stmt_iterator *, gimple *, tree, tree, ++ tree&, tree &); ++ bool do_semi_relayout (gimple_stmt_iterator *, gimple *, tree &, tree &); + }; + + struct ipa_struct_relayout +@@ -4355,7 +4455,7 @@ ipa_struct_reorg::check_type_and_push (tree newdecl, srdecl *decl, + } + + void +-ipa_struct_reorg::check_alloc_num (gimple *stmt, srtype *type) ++ipa_struct_reorg::check_alloc_num (gimple *stmt, srtype *type, bool ptrptr) + { + if (current_layout_opt_level >= COMPLETE_STRUCT_RELAYOUT
View file
_service:tar_scm:0054-Struct-Reorg-Bugfix-for-structure-pointer-compressio.patch
Added
@@ -0,0 +1,28 @@ +From 9dc3df938b9ed2c27498c8548087fee1ce930366 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E9=83=91=E6=99=A8=E5=8D=89?= <zhengchenhui1@huawei.com> +Date: Tue, 2 Apr 2024 11:08:30 +0800 +Subject: PATCH Struct Reorg Bugfix for structure pointer compression + +--- + gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc +index fa33f2d35..3922873f3 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc +@@ -7541,9 +7541,11 @@ ipa_struct_reorg::check_and_prune_struct_for_pointer_compression (void) + if (!type->has_legal_alloc_num) + { + if (current_layout_opt_level & POINTER_COMPRESSION_UNSAFE) ++ { + if (dump_file) + fprintf (dump_file, " has unknown alloc size, but" + " in unsafe mode, so"); ++ } + else + { + if (dump_file) +-- +2.33.0 +
View file
_service:tar_scm:0055-Struct-Reorg-Port-bugfixes-to-GCC-12.3.1.patch
Added
@@ -0,0 +1,420 @@ +From 55c547748af36ffc3f2d5ed154a91fb3fcb8431c Mon Sep 17 00:00:00 2001 +From: Mingchuan Wu <wumingchuan1992@foxmail.com> +Date: Thu, 11 Apr 2024 15:49:59 +0800 +Subject: PATCH Struct Reorg Port bugfixes to GCC 12.3.1 + +Migrated from commits in GCC10.3.1: +https://gitee.com/openeuler/gcc/commit/41af6d361a6d85ef4fce8a8438113d765596afdd +https://gitee.com/openeuler/gcc/commit/25d74b98caeaae881e374924886ee664aa1af5bc +https://gitee.com/openeuler/gcc/commit/b5a3bfe92f96cd0d2224d80ac4eaa80dab1bd6bf +https://gitee.com/openeuler/gcc/commit/708ffe6f132ee39441b66b6ab6b98847d35916b7 +https://gitee.com/openeuler/gcc/commit/e875e4e7f3716aa268ffbbf55ee199ec82b6aeba +--- + gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 97 ++++++++++--------- + gcc/testsuite/gcc.dg/struct/dfe_escape.c | 50 ++++++++++ + gcc/testsuite/gcc.dg/struct/dfe_func_ptr.c | 69 +++++++++++++ + gcc/testsuite/gcc.dg/struct/struct-reorg.exp | 2 + + gcc/testsuite/gcc.dg/struct/struct_reorg-10.c | 29 ++++++ + gcc/testsuite/gcc.dg/struct/struct_reorg-11.c | 16 +++ + gcc/testsuite/gcc.dg/struct/struct_reorg-12.c | 26 +++++ + 7 files changed, 243 insertions(+), 46 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/struct/dfe_escape.c + create mode 100644 gcc/testsuite/gcc.dg/struct/dfe_func_ptr.c + create mode 100644 gcc/testsuite/gcc.dg/struct/struct_reorg-10.c + create mode 100644 gcc/testsuite/gcc.dg/struct/struct_reorg-11.c + create mode 100644 gcc/testsuite/gcc.dg/struct/struct_reorg-12.c + +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc +index 6a202b4bd..f03d1d875 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc +@@ -466,10 +466,19 @@ srtype::has_dead_field (void) + unsigned i; + FOR_EACH_VEC_ELT (fields, i, this_field) + { +- if (!(this_field->field_access & READ_FIELD)) +- { +- may_dfe = true; +- break; ++ /* Function pointer members are not processed, because DFE ++ does not currently 
support accurate analysis of function ++ pointers, and we have not identified specific use cases. */ ++ if (!(this_field->field_access & READ_FIELD) ++ && !FUNCTION_POINTER_TYPE_P (this_field->fieldtype)) ++ { ++ /* Fields with escape risks should not be processed. */ ++ if (this_field->type == NULL ++ || (this_field->type->escapes == does_not_escape)) ++ { ++ may_dfe = true; ++ break; ++ } + } + } + return may_dfe; +@@ -1032,8 +1041,13 @@ srtype::create_new_type (void) + { + srfield *f = fieldsi; + if (current_layout_opt_level & DEAD_FIELD_ELIMINATION +- && !(f->field_access & READ_FIELD)) +- continue; ++ && !(f->field_access & READ_FIELD) ++ && !FUNCTION_POINTER_TYPE_P (f->fieldtype)) ++ { ++ /* Fields with escape risks should not be processed. */ ++ if (f->type == NULL || (f->type->escapes == does_not_escape)) ++ continue; ++ } + f->create_new_fields (newtype, newfields, newlast); + } + +@@ -3815,9 +3829,17 @@ ipa_struct_reorg::maybe_mark_or_record_other_side (tree side, tree other, + if (VOID_POINTER_P (TREE_TYPE (side)) + && TREE_CODE (side) == SSA_NAME) + { +- /* The type is other, the declaration is side. */ +- current_function->record_decl (type, side, -1, +- isptrptr (TREE_TYPE (other)) ? TREE_TYPE (other) : NULL); ++ tree inner = SSA_NAME_VAR (side); ++ if (inner) ++ { ++ srdecl *in = find_decl (inner); ++ if (in && !in->type->has_escaped ()) ++ { ++ /* The type is other, the declaration is side. */ ++ current_function->record_decl (type, side, -1, ++ isptrptr (TREE_TYPE (other)) ? TREE_TYPE (other) : NULL); ++ } ++ } + } + else + /* *_1 = &MEM(void *)&x + 8B. */ +@@ -3910,6 +3932,12 @@ ipa_struct_reorg::maybe_record_assign (cgraph_node *node, gassign *stmt) + maybe_mark_or_record_other_side (rhs, lhs, stmt); + if (TREE_CODE (lhs) == SSA_NAME) + maybe_mark_or_record_other_side (lhs, rhs, stmt); ++ ++ /* Handle missing ARRAY_REF cases. 
*/ ++ if (TREE_CODE (lhs) == ARRAY_REF) ++ mark_type_as_escape (TREE_TYPE (lhs), escape_array, stmt); ++ if (TREE_CODE (rhs) == ARRAY_REF) ++ mark_type_as_escape (TREE_TYPE (rhs), escape_array, stmt); + } + } + +@@ -5272,8 +5300,11 @@ ipa_struct_reorg::record_accesses (void) + record_function (cnode); + else + { +- tree return_type = TREE_TYPE (TREE_TYPE (cnode->decl)); +- mark_type_as_escape (return_type, escape_return, NULL); ++ if (cnode->externally_visible) ++ { ++ tree return_type = TREE_TYPE (TREE_TYPE (cnode->decl)); ++ mark_type_as_escape (return_type, escape_return, NULL); ++ } + } + + } +@@ -5889,6 +5920,7 @@ ipa_struct_reorg::rewrite_expr (tree expr, + bool escape_from_base = false; + + tree newbasemax_split; ++ memset (newbase, 0, sizeof (treemax_split)); + memset (newexpr, 0, sizeof (treemax_split)); + + if (TREE_CODE (expr) == CONSTRUCTOR) +@@ -6912,7 +6944,7 @@ create_bb_for_group_diff_ne_0 (basic_block new_bb, tree &phi, tree ptr, + } + + tree +-ipa_struct_reorg::rewrite_pointer_plus_integer (gimple *stmt, ++ipa_struct_reorg::rewrite_pointer_plus_integer (gimple *stmt ATTRIBUTE_UNUSED, + gimple_stmt_iterator *gsi, + tree ptr, tree offset, + srtype *type) +@@ -7889,41 +7921,14 @@ ipa_struct_reorg::rewrite_cond (gcond *stmt, + should be removed. */ + + bool +-ipa_struct_reorg::rewrite_debug (gimple *stmt, gimple_stmt_iterator *) ++ipa_struct_reorg::rewrite_debug (gimple *, gimple_stmt_iterator *) + { +- if (current_layout_opt_level >= STRUCT_REORDER_FIELDS) +- /* Delete debug gimple now. 
*/ +- return true; +- bool remove = false; +- if (gimple_debug_bind_p (stmt)) +- { +- tree var = gimple_debug_bind_get_var (stmt); +- tree newvarmax_split; +- if (rewrite_expr (var, newvar, true)) +- remove = true; +- if (gimple_debug_bind_has_value_p (stmt)) +- { +- var = gimple_debug_bind_get_value (stmt); +- if (TREE_CODE (var) == POINTER_PLUS_EXPR) +- var = TREE_OPERAND (var, 0); +- if (rewrite_expr (var, newvar, true)) +- remove = true; +- } +- } +- else if (gimple_debug_source_bind_p (stmt)) +- { +- tree var = gimple_debug_source_bind_get_var (stmt); +- tree newvarmax_split; +- if (rewrite_expr (var, newvar, true)) +- remove = true; +- var = gimple_debug_source_bind_get_value (stmt); +- if (TREE_CODE (var) == POINTER_PLUS_EXPR) +- var = TREE_OPERAND (var, 0); +- if (rewrite_expr (var, newvar, true)) +- remove = true; +- } +- +- return remove; ++ /* In debug statements, there might be some statements that have ++ been optimized out in gimple but left in debug gimple. Sometimes ++ these statements need to be analyzed to escape, but in rewrite ++ stage it shouldn't happen. It needs to care a lot to handle these ++ cases but seems useless. So now we just delete debug gimple. */ ++ return true; + } + + /* Rewrite PHI nodes, return true if the PHI was replaced. */ +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_escape.c b/gcc/testsuite/gcc.dg/struct/dfe_escape.c +new file mode 100644 +index 000000000..09efe8027 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_escape.c +@@ -0,0 +1,50 @@ ++/* { dg-do compile } */ ++ ++#include <stdio.h> ++#include <stdlib.h> ++ ++typedef struct arc arc_t; ++typedef struct arc *arc_p; ++ ++typedef struct network
Locations
Projects
Search
Status Monitor
Help
Open Build Service
OBS Manuals
API Documentation
OBS Portal
Reporting a Bug
Contact
Mailing List
Forums
Chat (IRC)
Twitter
Open Build Service (OBS)
is an
openSUSE project
.
浙ICP备2022010568号-2