Projects
Mega-LLVM:24.03
gcc
Sign Up
Log In
Username
Password
We truncated the diffs of some files because they were too big. To see the full diff for every file, click here.
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
Expand all
Collapse all
Difference Between Revision 1 and Mega:24.03 / gcc
View file
_service:tar_scm:gcc.spec
Changed
@@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 25 +%global gcc_release 19 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -23,7 +23,7 @@ %else %global build_libquadmath 0 %endif -%ifarch %{ix86} x86_64 ppc ppc64 ppc64le ppc64p7 s390 s390x %{arm} aarch64 loongarch64 riscv64 +%ifarch %{ix86} x86_64 ppc ppc64 ppc64le ppc64p7 s390 s390x %{arm} aarch64 loongarch64 %global build_libasan 1 %else %global build_libasan 0 @@ -38,7 +38,7 @@ %else %global build_liblsan 0 %endif -%ifarch %{ix86} x86_64 ppc ppc64 ppc64le ppc64p7 s390 s390x %{arm} aarch64 loongarch64 riscv64 +%ifarch %{ix86} x86_64 ppc ppc64 ppc64le ppc64p7 s390 s390x %{arm} aarch64 loongarch64 %global build_libubsan 1 %else %global build_libubsan 0 @@ -166,65 +166,6 @@ Patch26: 0026-GOMP-Enabling-moutline-atomics-improves-libgomp-perf.patch Patch27: 0027-LoopElim-Redundant-loop-elimination-optimization.patch Patch28: 0028-Array-widen-compare-Fix-the-return-value-match-after.patch -Patch29: 0029-Struct-Reorg-Add-Safe-Structure-Pointer-Compression.patch -Patch30: 0030-Struct-Reorg-Add-unsafe-structure-pointer-compressio.patch -Patch31: 0031-AutoBOLT-Support-saving-feedback-count-info-to-ELF-s.patch -Patch32: 0032-AutoBOLT-Add-bolt-linker-plugin-2-3.patch -Patch33: 0033-AutoBOLT-Enable-BOLT-linker-plugin-on-aarch64-3-3.patch -Patch34: 0034-Autofdo-Enable-discrimibator-and-MCF-algorithm-on-Au.patch -Patch35: 0035-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch -Patch36: 0036-rtl-ifcvt-introduce-rtl-ifcvt-enchancements.patch -Patch37: 0037-Perform-early-if-conversion-of-simple-arithmetic.patch -Patch38: 0038-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch -Patch39: 0039-Match-double-sized-mul-pattern.patch -Patch40: 0040-Port-icp-patch-to-GCC-12.patch -Patch41: 0041-Port-fixes-in-icp-to-GCC-12.patch -Patch42: 
0042-Add-split-complex-instructions-pass.patch -Patch43: 0043-Extending-and-refactoring-of-pass_split_complex_inst.patch -Patch44: 0044-Port-maxmin-patch-to-GCC-12.patch -Patch45: 0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch -Patch46: 0046-Add-new-pattern-to-pass-the-maxmin-tests.patch -Patch47: 0047-AES-Implement-AES-pattern-matching.patch -Patch48: 0048-crypto-accel-add-optimization-level-requirement-to-t.patch -Patch49: 0049-Add-more-flexible-check-for-pointer-aliasing-during-.patch -Patch50: 0050-Port-IPA-prefetch-to-GCC-12.patch -Patch51: 0051-Port-fixes-for-IPA-prefetch-to-GCC-12.patch -Patch52: 0052-Fix-fails-in-IPA-prefetch-src-openEuler-gcc-I96ID7.patch -Patch53: 0053-struct-reorg-Add-Semi-Relayout.patch -Patch54: 0054-Struct-Reorg-Bugfix-for-structure-pointer-compressio.patch -Patch55: 0055-Struct-Reorg-Port-bugfixes-to-GCC-12.3.1.patch -Patch56: 0056-Fix-bug-that-verifying-gimple-failed-when-reorg-leve.patch -Patch57: 0057-AutoFdo-Fix-memory-leaks-in-autofdo.patch -Patch58: 0058-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ-.patch -Patch59: 0059-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch -Patch60: 0060-Make-option-mvzeroupper-independent-of-optimization-.patch -Patch61: 0061-i386-Sync-tune_string-with-arch_string-for-target-at.patch -Patch62: 0062-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch -Patch63: 0063-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch -Patch64: 0064-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch -Patch65: 0065-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch -Patch66: 0066-Software-mitigation-Disable-gather-generation-in-vec.patch -Patch67: 0067-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch -Patch68: 0068-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch -Patch69: 0069-Disparage-slightly-for-the-alternative-which-move-DF.patch -Patch70: 0070-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch -Patch71: 
0071-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch -Patch72: 0072-Disable-FMADD-in-chains-for-Zen4-and-generic.patch -Patch73: 0073-Initial-Raptorlake-Support.patch -Patch74: 0074-Initial-Meteorlake-Support.patch -Patch75: 0075-Support-Intel-AMX-FP16-ISA.patch -Patch76: 0076-Support-Intel-prefetchit0-t1.patch -Patch77: 0077-Initial-Granite-Rapids-Support.patch -Patch78: 0078-Support-Intel-AMX-COMPLEX.patch -Patch79: 0079-i386-Add-AMX-COMPLEX-to-Granite-Rapids.patch -Patch80: 0080-Initial-Granite-Rapids-D-Support.patch -Patch81: 0081-Correct-Granite-Rapids-D-documentation.patch -Patch82: 0082-i386-Remove-Meteorlake-s-family_model.patch -Patch83: 0083-x86-Update-model-values-for-Alderlake-Rocketlake-and.patch -Patch84: 0084-x86-Update-model-values-for-Raptorlake.patch -Patch85: 0085-Fix-target_clone-arch-graniterapids-d.patch -Patch86: 0086-Modfify-cost-calculation-for-dealing-with-equivalenc.patch -Patch87: 0087-Add-cost-calculation-for-reg-equivalence-invariants.patch # Part 3000 ~ 4999 %ifarch loongarch64 @@ -353,10 +294,6 @@ Patch3124: libsanitizer-add-LoongArch-support.patch Patch3125: LoongArch-fix-error-building.patch Patch3126: libjccjit-do-not-link-objects-contained-same-element.patch -Patch3127: LoongArch-Use-finer-grained-DBAR-hints.patch -Patch3128: LoongArch-Add-LA664-support.patch -Patch3129: LoongArch-Fix-internal-error-running-gcc-march-nativ.patch -Patch3130: LoongArch-Fix-lsx-vshuf.c-and-lasx-xvshuf_b.c-tests-.patch %endif # On ARM EABI systems, we do want -gnueabi to be part of the @@ -852,65 +789,6 @@ %patch26 -p1 %patch27 -p1 %patch28 -p1 -%patch29 -p1 -%patch30 -p1 -%patch31 -p1 -%patch32 -p1 -%patch33 -p1 -%patch34 -p1 -%patch35 -p1 -%patch36 -p1 -%patch37 -p1 -%patch38 -p1 -%patch39 -p1 -%patch40 -p1 -%patch41 -p1 -%patch42 -p1 -%patch43 -p1 -%patch44 -p1 -%patch45 -p1 -%patch46 -p1 -%patch47 -p1 -%patch48 -p1 -%patch49 -p1 -%patch50 -p1 -%patch51 -p1 -%patch52 -p1 -%patch53 -p1 -%patch54 -p1 -%patch55 -p1 -%patch56 -p1 -%patch57 
-p1 -%patch58 -p1 -%patch59 -p1 -%patch60 -p1 -%patch61 -p1 -%patch62 -p1 -%patch63 -p1 -%patch64 -p1 -%patch65 -p1 -%patch66 -p1 -%patch67 -p1 -%patch68 -p1 -%patch69 -p1 -%patch70 -p1 -%patch71 -p1 -%patch72 -p1 -%patch73 -p1 -%patch74 -p1 -%patch75 -p1 -%patch76 -p1 -%patch77 -p1 -%patch78 -p1 -%patch79 -p1 -%patch80 -p1 -%patch81 -p1 -%patch82 -p1 -%patch83 -p1 -%patch84 -p1 -%patch85 -p1 -%patch86 -p1 -%patch87 -p1 %ifarch loongarch64 %patch3001 -p1 @@ -1038,10 +916,6 @@ %patch3124 -p1 %patch3125 -p1 %patch3126 -p1 -%patch3127 -p1 -%patch3128 -p1 -%patch3129 -p1 -%patch3130 -p1 %endif echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE @@ -2488,8 +2362,6 @@ %{_prefix}/lib/gcc/%{gcc_target_platform}/%{gcc_major}/plugin/include/config/loongarch/loongarch-def.h %{_prefix}/lib/gcc/%{gcc_target_platform}/%{gcc_major}/plugin/include/config/loongarch/loongarch-tune.h %{_prefix}/lib/gcc/%{gcc_target_platform}/%{gcc_major}/plugin/include/config/loongarch/loongarch-driver.h -%{_prefix}/lib/gcc/%{gcc_target_platform}/%{gcc_major}/include/lsxintrin.h -%{_prefix}/lib/gcc/%{gcc_target_platform}/%{gcc_major}/include/lasxintrin.h %endif %ifarch sparc sparcv9 sparc64 %{_prefix}/lib/gcc/%{gcc_target_platform}/%{gcc_major}/include/visintrin.h @@ -3302,29 +3174,6 @@ %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog -* Wed Apr 24 2024 Wang Ding <wangding16@huawei.com> - 12.3.1-25 -- Type: Sync -- DESC: Sync patch from openeuler/gcc - -* Tue Apr 23 2024 laokz <zhangkai@iscas.ac.cn> - 12.3.1-24 -- Type: SPEC
View file
_service:tar_scm:0029-Struct-Reorg-Add-Safe-Structure-Pointer-Compression.patch
Deleted
@@ -1,1191 +0,0 @@ -From 7930d75c9fd3f36cc2dce934569f00c71248bb31 Mon Sep 17 00:00:00 2001 -From: liyancheng <412998149@qq.com> -Date: Sat, 25 Nov 2023 10:28:48 +0800 -Subject: PATCH Struct Reorg Add Safe Structure Pointer Compression - -Safe structure pointer compression allows safely transfer pointers -stored in structure into the index of structure array with smaller -type to reduce the size of structure. -Add flag -fipa-struct-reorg=4 to enable safe structure pointer -compression. -Add param compressed-pointer-size=8,16,32 to control the compressed -pointer size. ---- - gcc/common.opt | 5 +- - gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 908 ++++++++++++++++++++++- - gcc/ipa-struct-reorg/ipa-struct-reorg.h | 4 + - gcc/params.opt | 4 + - 4 files changed, 882 insertions(+), 39 deletions(-) - -diff --git a/gcc/common.opt b/gcc/common.opt -index b01df919e..f6e20c1e8 100644 ---- a/gcc/common.opt -+++ b/gcc/common.opt -@@ -1993,8 +1993,9 @@ Common Var(flag_ipa_struct_reorg) Init(0) Optimization - Perform structure layout optimizations. - - fipa-struct-reorg= --Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 3) ---fipa-struct-reorg=0,1,2,3 adding none, struct-reorg, reorder-fields, dfe optimizations. -+Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 4) -+-fipa-struct-reorg=0,1,2,3,4 adding none, struct-reorg, reorder-fields, -+dfe, safe-pointer-compression optimizations. - - fipa-vrp - Common Var(flag_ipa_vrp) Optimization -diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc -index dcc6df496..5d451c4c8 100644 ---- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc -+++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc -@@ -89,6 +89,7 @@ along with GCC; see the file COPYING3. If not see - #include "gimple-iterator.h" - #include "gimple-walk.h" - #include "cfg.h" -+#include "cfghooks.h" /* For split_block. 
*/ - #include "ssa.h" - #include "tree-dfa.h" - #include "fold-const.h" -@@ -147,7 +148,27 @@ using namespace struct_relayout; - #define VOID_POINTER_P(type) \ - (POINTER_TYPE_P (type) && VOID_TYPE_P (TREE_TYPE (type))) - --/* Return true iff TYPE is stdarg va_list type. */ -+static void -+set_var_attributes (tree var) -+{ -+ if (!var) -+ return; -+ gcc_assert (TREE_CODE (var) == VAR_DECL); -+ -+ DECL_ARTIFICIAL (var) = 1; -+ DECL_EXTERNAL (var) = 0; -+ TREE_STATIC (var) = 1; -+ TREE_PUBLIC (var) = 0; -+ TREE_USED (var) = 1; -+ DECL_CONTEXT (var) = NULL; -+ TREE_THIS_VOLATILE (var) = 0; -+ TREE_ADDRESSABLE (var) = 0; -+ TREE_READONLY (var) = 0; -+ if (is_global_var (var)) -+ set_decl_tls_model (var, TLS_MODEL_NONE); -+} -+ -+/* Return true if TYPE is stdarg va_list type. */ - - static inline bool - is_va_list_type (tree type) -@@ -271,9 +292,15 @@ enum struct_layout_opt_level - STRUCT_SPLIT = 1 << 0, - COMPLETE_STRUCT_RELAYOUT = 1 << 1, - STRUCT_REORDER_FIELDS = 1 << 2, -- DEAD_FIELD_ELIMINATION = 1 << 3 -+ DEAD_FIELD_ELIMINATION = 1 << 3, -+ POINTER_COMPRESSION_SAFE = 1 << 4 - }; - -+/* Defines the target pointer size of compressed pointer, which should be 8, -+ 16, 32. */ -+ -+static int compressed_size = 32; -+ - static bool is_result_of_mult (tree arg, tree *num, tree struct_size); - static bool isptrptr (tree type); - void get_base (tree &base, tree expr); -@@ -394,7 +421,10 @@ srtype::srtype (tree type) - : type (type), - chain_type (false), - escapes (does_not_escape), -+ pc_gptr (NULL_TREE), - visited (false), -+ pc_candidate (false), -+ has_legal_alloc_num (false), - has_alloc_array (0) - { - for (int i = 0; i < max_split; i++) -@@ -476,6 +506,31 @@ srtype::mark_escape (escape_type e, gimple *stmt) - } - } - -+/* Create a global header for compressed struct. 
*/ -+ -+void -+srtype::create_global_ptr_for_pc () -+{ -+ if (!pc_candidate || pc_gptr != NULL_TREE) -+ return; -+ -+ const char *type_name = get_type_name (type); -+ gcc_assert (type_name != NULL); -+ -+ char *gptr_name = concat (type_name, "_pc", NULL); -+ tree new_name = get_identifier (gptr_name); -+ tree new_type = build_pointer_type (newtype0); -+ tree new_var = build_decl (UNKNOWN_LOCATION, VAR_DECL, new_name, new_type); -+ set_var_attributes (new_var); -+ pc_gptr = new_var; -+ -+ if (dump_file && (dump_flags & TDF_DETAILS)) -+ fprintf (dump_file, "\nType: %s has create global header for pointer" -+ " compression: %s\n", type_name, gptr_name); -+ -+ free (gptr_name); -+} -+ - /* Add FIELD to the list of fields that use this type. */ - - void -@@ -798,15 +853,31 @@ srfield::create_new_reorder_fields (tree newtypemax_split, - fields.safe_push (field); - } - -- DECL_NAME (field) = DECL_NAME (fielddecl); - if (type == NULL) -- /* Common members do not need to reconstruct. -+ { -+ DECL_NAME (field) = DECL_NAME (fielddecl); -+ /* Common members do not need to reconstruct. - Otherwise, int* -> int** or void* -> void**. 
*/ -- TREE_TYPE (field) = nt; -+ TREE_TYPE (field) = nt; -+ SET_DECL_ALIGN (field, DECL_ALIGN (fielddecl)); -+ } -+ else if (type->pc_candidate) -+ { -+ const char *old_name = IDENTIFIER_POINTER (DECL_NAME (fielddecl)); -+ char *new_name = concat (old_name, "_pc", NULL); -+ DECL_NAME (field) = get_identifier (new_name); -+ free (new_name); -+ TREE_TYPE (field) = make_unsigned_type (compressed_size); -+ SET_DECL_ALIGN (field, compressed_size); -+ } - else -- TREE_TYPE (field) = reconstruct_complex_type (TREE_TYPE (fielddecl), nt); -+ { -+ TREE_TYPE (field) = reconstruct_complex_type (TREE_TYPE (fielddecl), nt); -+ DECL_NAME (field) = DECL_NAME (fielddecl); -+ SET_DECL_ALIGN (field, DECL_ALIGN (fielddecl)); -+ } -+ - DECL_SOURCE_LOCATION (field) = DECL_SOURCE_LOCATION (fielddecl); -- SET_DECL_ALIGN (field, DECL_ALIGN (fielddecl)); - DECL_USER_ALIGN (field) = DECL_USER_ALIGN (fielddecl); - TREE_ADDRESSABLE (field) = TREE_ADDRESSABLE (fielddecl); - DECL_NONADDRESSABLE_P (field) = !TREE_ADDRESSABLE (fielddecl); -@@ -925,6 +996,10 @@ srtype::create_new_type (void) - && has_dead_field ()) - fprintf (dump_file, "Dead field elimination.\n"); - } -+ -+ if (pc_candidate && pc_gptr == NULL_TREE) -+ create_global_ptr_for_pc (); -+ - if (dump_file && (dump_flags & TDF_DETAILS)) - { - fprintf (dump_file, "Created %d types:\n", maxclusters); -@@ -1338,6 +1413,30 @@ public: - - unsigned execute_struct_relayout (void); - bool remove_dead_field_stmt (tree lhs); -+ -+ // Pointer compression methods: -+ void check_and_prune_struct_for_pointer_compression (void); -+ void try_rewrite_with_pointer_compression (gassign *, gimple_stmt_iterator *, -+ tree, tree, tree &, tree &); -+ bool safe_void_cmp_p (tree, srtype *); -+ bool pc_candidate_st_type_p (tree); -+ bool pc_candidate_tree_p (tree); -+ bool pc_type_conversion_candidate_p (tree); -+ bool pc_direct_rewrite_chance_p (tree, tree &); -+ bool compress_candidate_with_check (gimple_stmt_iterator *, tree, tree &);
View file
_service:tar_scm:0030-Struct-Reorg-Add-unsafe-structure-pointer-compressio.patch
Deleted
@@ -1,1232 +0,0 @@ -From 82d6166cd29fb1c3474f29b28cb7e5478d3a551a Mon Sep 17 00:00:00 2001 -From: liyancheng <412998149@qq.com> -Date: Mon, 25 Dec 2023 11:17:04 +0800 -Subject: PATCH Struct Reorg Add unsafe structure pointer compression - -Unsafe structure pointer compression allows for some dangerous -conversions for better performance. -Add flag -fipa-struct-reorg=5 to enable unsafe structure pointer -compression. ---- - gcc/common.opt | 6 +- - gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 365 ++++++++++++++---- - gcc/symbol-summary.h | 22 +- - .../gcc.dg/struct/csr_skip_void_struct_name.c | 53 +++ - gcc/testsuite/gcc.dg/struct/pc_cast_int.c | 91 +++++ - .../gcc.dg/struct/pc_compress_and_decomress.c | 90 +++++ - gcc/testsuite/gcc.dg/struct/pc_ptr2void.c | 87 +++++ - .../gcc.dg/struct/pc_simple_rewrite_pc.c | 112 ++++++ - .../gcc.dg/struct/pc_skip_void_struct_name.c | 53 +++ - gcc/testsuite/gcc.dg/struct/struct-reorg.exp | 8 + - 10 files changed, 804 insertions(+), 83 deletions(-) - create mode 100644 gcc/testsuite/gcc.dg/struct/csr_skip_void_struct_name.c - create mode 100644 gcc/testsuite/gcc.dg/struct/pc_cast_int.c - create mode 100644 gcc/testsuite/gcc.dg/struct/pc_compress_and_decomress.c - create mode 100644 gcc/testsuite/gcc.dg/struct/pc_ptr2void.c - create mode 100644 gcc/testsuite/gcc.dg/struct/pc_simple_rewrite_pc.c - create mode 100644 gcc/testsuite/gcc.dg/struct/pc_skip_void_struct_name.c - -diff --git a/gcc/common.opt b/gcc/common.opt -index 56b547506..c7c6bc256 100644 ---- a/gcc/common.opt -+++ b/gcc/common.opt -@@ -1993,9 +1993,9 @@ Common Var(flag_ipa_struct_reorg) Init(0) Optimization - Perform structure layout optimizations. - - fipa-struct-reorg= --Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 4) ---fipa-struct-reorg=0,1,2,3,4 adding none, struct-reorg, reorder-fields, --dfe, safe-pointer-compression optimizations. 
-+Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 5) -+-fipa-struct-reorg=0,1,2,3,4,5 adding none, struct-reorg, reorder-fields, -+dfe, safe-pointer-compression, unsafe-pointer-compression optimizations. - - fipa-vrp - Common Var(flag_ipa_vrp) Optimization -diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc -index 5d451c4c8..fa33f2d35 100644 ---- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc -+++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc -@@ -293,7 +293,8 @@ enum struct_layout_opt_level - COMPLETE_STRUCT_RELAYOUT = 1 << 1, - STRUCT_REORDER_FIELDS = 1 << 2, - DEAD_FIELD_ELIMINATION = 1 << 3, -- POINTER_COMPRESSION_SAFE = 1 << 4 -+ POINTER_COMPRESSION_SAFE = 1 << 4, -+ POINTER_COMPRESSION_UNSAFE = 1 << 5 - }; - - /* Defines the target pointer size of compressed pointer, which should be 8, -@@ -1267,10 +1268,10 @@ csrtype::init_type_info (void) - - /* Close enough to pad to improve performance. - 33~63 should pad to 64 but 33~48 (first half) are too far away, and -- 65~127 should pad to 128 but 65~96 (first half) are too far away. */ -+ 70~127 should pad to 128 but 65~70 (first half) are too far away. 
*/ - if (old_size > 48 && old_size < 64) - new_size = 64; -- if (old_size > 96 && old_size < 128) -+ if (old_size > 70 && old_size < 128) - new_size = 128; - - /* For performance reasons, only allow structure size -@@ -1423,8 +1424,12 @@ public: - bool pc_candidate_tree_p (tree); - bool pc_type_conversion_candidate_p (tree); - bool pc_direct_rewrite_chance_p (tree, tree &); -+ bool pc_simplify_chance_for_compress_p (gassign *, tree); -+ bool compress_candidate_without_check (gimple_stmt_iterator *, tree, tree &); - bool compress_candidate_with_check (gimple_stmt_iterator *, tree, tree &); - bool compress_candidate (gassign *, gimple_stmt_iterator *, tree, tree &); -+ bool decompress_candidate_without_check (gimple_stmt_iterator *, -+ tree, tree, tree &, tree &); - bool decompress_candidate_with_check (gimple_stmt_iterator *, tree, tree &); - bool decompress_candidate (gimple_stmt_iterator *, tree, tree, tree &, - tree &); -@@ -1924,7 +1929,6 @@ bool - ipa_struct_relayout::maybe_rewrite_cst (tree cst, gimple_stmt_iterator *gsi, - HOST_WIDE_INT ×) - { -- bool ret = false; - gcc_assert (TREE_CODE (cst) == INTEGER_CST); - - gimple *stmt = gsi_stmt (*gsi); -@@ -1948,27 +1952,95 @@ ipa_struct_relayout::maybe_rewrite_cst (tree cst, gimple_stmt_iterator *gsi, - { - if (gsi_one_before_end_p (*gsi)) - return false; -- gsi_next (gsi); -- gimple *stmt2 = gsi_stmt (*gsi); -- -- if (gimple_code (stmt2) == GIMPLE_ASSIGN -- && gimple_assign_rhs_code (stmt2) == POINTER_PLUS_EXPR) -+ // Check uses. 
-+ imm_use_iterator imm_iter_lhs; -+ use_operand_p use_p_lhs; -+ FOR_EACH_IMM_USE_FAST (use_p_lhs, imm_iter_lhs, gimple_assign_lhs (stmt)) - { -- tree lhs = gimple_assign_lhs (stmt2); -- tree rhs1 = gimple_assign_rhs1 (stmt2); -- if (types_compatible_p (inner_type (TREE_TYPE (rhs1)), ctype.type) -- || types_compatible_p (inner_type (TREE_TYPE (lhs)), ctype.type)) -+ gimple *stmt2 = USE_STMT (use_p_lhs); -+ if (gimple_code (stmt2) != GIMPLE_ASSIGN) -+ continue; -+ if (gimple_assign_rhs_code (stmt2) == POINTER_PLUS_EXPR) - { -- tree num = NULL; -- if (is_result_of_mult (cst, &num, TYPE_SIZE_UNIT (ctype.type))) -+ tree lhs = gimple_assign_lhs (stmt2); -+ tree rhs1 = gimple_assign_rhs1 (stmt2); -+ if (types_compatible_p (inner_type (TREE_TYPE (rhs1)), ctype.type) -+ || types_compatible_p (inner_type (TREE_TYPE (lhs)), -+ ctype.type)) - { -- times = TREE_INT_CST_LOW (num); -- ret = true; -+ tree num = NULL; -+ if (is_result_of_mult (cst, &num, -+ TYPE_SIZE_UNIT (ctype.type))) -+ { -+ times = TREE_INT_CST_LOW (num); -+ return true; -+ } -+ } -+ } -+ // For pointer compression, handle plus stmt. -+ else if (gimple_assign_rhs_code (stmt2) == PLUS_EXPR) -+ { -+ // Check uses. -+ imm_use_iterator imm_iter_cast; -+ use_operand_p use_p_cast; -+ FOR_EACH_IMM_USE_FAST (use_p_cast, imm_iter_cast, -+ gimple_assign_lhs (stmt2)) -+ { -+ gimple *stmt_cast = USE_STMT (use_p_cast); -+ if (gimple_code (stmt_cast) != GIMPLE_ASSIGN) -+ continue; -+ if (gimple_assign_cast_p (stmt_cast)) -+ { -+ tree lhs_type = inner_type (TREE_TYPE ( -+ gimple_assign_lhs (stmt_cast))); -+ if (types_compatible_p (lhs_type, ctype.type)) -+ { -+ tree num = NULL; -+ if (is_result_of_mult (cst, &num, -+ TYPE_SIZE_UNIT (ctype.type))) -+ { -+ times = TREE_INT_CST_LOW (num); -+ return true; -+ } -+ } -+ } - } - } - } -- gsi_prev (gsi); -- return ret; -+ } -+ // For pointer compression, handle div stmt. 
-+ if (gimple_assign_rhs_code (stmt) == TRUNC_DIV_EXPR) -+ { -+ imm_use_iterator imm_iter; -+ use_operand_p use_p; -+ tree lhs = gimple_assign_lhs (stmt); -+ if (lhs == NULL_TREE) -+ return false; -+ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs) -+ { -+ gimple *use_stmt = USE_STMT (use_p); -+ if (is_gimple_debug (use_stmt)) -+ continue; -+ if (gimple_code (use_stmt) != GIMPLE_ASSIGN) -+ continue; -+ if (gimple_assign_cast_p (use_stmt)) -+ { -+ tree lhs_type = inner_type (TREE_TYPE ( -+ gimple_assign_lhs (use_stmt))); -+ if (TYPE_UNSIGNED (lhs_type) -+ && TREE_CODE (lhs_type) == INTEGER_TYPE -+ && TYPE_PRECISION (lhs_type) == compressed_size) -+ { -+ tree num = NULL; -+ if (is_result_of_mult (cst, &num, -+ TYPE_SIZE_UNIT (ctype.type))) -+ { -+ times = TREE_INT_CST_LOW (num); -+ return true; -+ } -+ }
View file
_service:tar_scm:0031-AutoBOLT-Support-saving-feedback-count-info-to-ELF-s.patch
Deleted
@@ -1,550 +0,0 @@ -From 72531376df5ed93c2d945469368ba5514eca8407 Mon Sep 17 00:00:00 2001 -From: zhenyu--zhao_admin <zhaozhenyu17@huawei.com> -Date: Tue, 5 Dec 2023 15:33:08 +0800 -Subject: PATCH AutoBOLT Support saving feedback count info to ELF segment - 1/3 - ---- - gcc/common.opt | 8 + - gcc/final.cc | 405 ++++++++++++++++++++++++++++++++++++++++++++++++- - gcc/opts.cc | 61 ++++++++ - 3 files changed, 473 insertions(+), 1 deletion(-) - -diff --git a/gcc/common.opt b/gcc/common.opt -index b01df919e..e69947fc2 100644 ---- a/gcc/common.opt -+++ b/gcc/common.opt -@@ -2546,6 +2546,14 @@ freorder-functions - Common Var(flag_reorder_functions) Optimization - Reorder functions to improve code placement. - -+fauto-bolt -+Common Var(flag_auto_bolt) -+Generate profile from AutoFDO or PGO and do BOLT optimization after linkage. -+ -+fauto-bolt= -+Common Joined RejectNegative -+Specify the feedback data directory required by BOLT-plugin. The default is the current directory. -+ - frerun-cse-after-loop - Common Var(flag_rerun_cse_after_loop) Optimization - Add a common subexpression elimination pass after loop optimizations. -diff --git a/gcc/final.cc b/gcc/final.cc -index a9868861b..d4c4fa08f 100644 ---- a/gcc/final.cc -+++ b/gcc/final.cc -@@ -81,6 +81,7 @@ along with GCC; see the file COPYING3. If not see - #include "rtl-iter.h" - #include "print-rtl.h" - #include "function-abi.h" -+#include "insn-codes.h" - #include "common/common-target.h" - - #ifdef XCOFF_DEBUGGING_INFO -@@ -4266,7 +4267,403 @@ leaf_renumber_regs_insn (rtx in_rtx) - } - } - #endif -- -+ -+#define ASM_FDO_SECTION_PREFIX ".text.fdo." -+ -+#define ASM_FDO_CALLER_FLAG ".fdo.caller " -+#define ASM_FDO_CALLER_SIZE_FLAG ".fdo.caller.size " -+#define ASM_FDO_CALLER_BIND_FLAG ".fdo.caller.bind" -+ -+#define ASM_FDO_CALLEE_FLAG ".fdo.callee" -+ -+/* Return the relative offset address of the start instruction of BB, -+ return -1 if it is empty instruction. 
*/ -+ -+static int -+get_bb_start_addr (basic_block bb) -+{ -+ rtx_insn *insn; -+ FOR_BB_INSNS (bb, insn) -+ { -+ if (!INSN_P (insn)) -+ { -+ continue; -+ } -+ /* The jump target of call is not in this function, so -+ it should be excluded. */ -+ if (CALL_P (insn)) -+ { -+ return -1; -+ } -+ -+ int insn_code = recog_memoized (insn); -+ -+ /* The instruction NOP in llvm-bolt belongs to the previous -+ BB, so it needs to be skipped. */ -+ if (insn_code != CODE_FOR_nop) -+ { -+ return INSN_ADDRESSES (INSN_UID (insn)); -+ } -+ } -+ return -1; -+} -+ -+/* Return the relative offet address of the end instruction of BB, -+ return -1 if it is empty or call instruction. */ -+ -+static int -+get_bb_end_addr (basic_block bb) -+{ -+ rtx_insn *insn; -+ int num_succs = EDGE_COUNT (bb->succs); -+ FOR_BB_INSNS_REVERSE (bb, insn) -+ { -+ if (!INSN_P (insn)) -+ { -+ continue; -+ } -+ /* The jump target of call is not in this function, so -+ it should be excluded. */ -+ if (CALL_P (insn)) -+ { -+ return -1; -+ } -+ if ((num_succs == 1) -+ || ((num_succs == 2) && any_condjump_p (insn))) -+ { -+ return INSN_ADDRESSES (INSN_UID (insn)); -+ } -+ else -+ { -+ return -1; -+ } -+ } -+ return -1; -+} -+ -+/* Return the end address of cfun. */ -+ -+static int -+get_function_end_addr () -+{ -+ rtx_insn *insn = get_last_insn (); -+ for (; insn != get_insns (); insn = PREV_INSN (insn)) -+ { -+ if (!INSN_P (insn)) -+ { -+ continue; -+ } -+ return INSN_ADDRESSES (INSN_UID (insn)); -+ } -+ -+ return -1; -+} -+ -+/* Return the function profile status string. */ -+ -+static const char * -+get_function_profile_status () -+{ -+ const char *profile_status = { -+ "PROFILE_ABSENT", -+ "PROFILE_GUESSED", -+ "PROFILE_READ", -+ "PROFILE_LAST" /* Last value, used by profile streaming. */ -+ }; -+ -+ return profile_statusprofile_status_for_fn (cfun); -+} -+ -+/* Return the count from the feedback data, such as PGO or ADDO. 
*/ -+ -+inline static gcov_type -+get_fdo_count (profile_count count) -+{ -+ return count.quality () >= GUESSED -+ ? count.to_gcov_type () : 0; -+} -+ -+/* Return the profile quality string. */ -+ -+static const char * -+get_fdo_count_quality (profile_count count) -+{ -+ const char *profile_quality = { -+ "UNINITIALIZED_PROFILE", -+ "GUESSED_LOCAL", -+ "GUESSED_GLOBAL0", -+ "GUESSED_GLOBAL0_ADJUSTED", -+ "GUESSED", -+ "AFDO", -+ "ADJUSTED", -+ "PRECISE" -+ }; -+ -+ return profile_qualitycount.quality (); -+} -+ -+static const char * -+alias_local_functions (const char *fnname) -+{ -+ if (TREE_PUBLIC (cfun->decl)) -+ { -+ return fnname; -+ } -+ return concat (fnname, "/", lbasename (dump_base_name), NULL); -+} -+ -+/* Return function bind type string. */ -+ -+static const char * -+simple_get_function_bind () -+{ -+ const char *function_bind = {
View file
_service:tar_scm:0032-AutoBOLT-Add-bolt-linker-plugin-2-3.patch
Deleted
@@ -1,34094 +0,0 @@ -From 82f9f48406955a6150def998b69b4eace4bd51eb Mon Sep 17 00:00:00 2001 -From: zhenyu--zhao_admin <zhaozhenyu17@huawei.com> -Date: Thu, 7 Dec 2023 11:43:08 +0800 -Subject: PATCH AutoBOLT Add bolt linker plugin 2/3 - ---- - bolt-plugin/Makefile | 675 ++ - bolt-plugin/Makefile.am | 43 + - bolt-plugin/Makefile.in | 675 ++ - bolt-plugin/aclocal.m4 | 10250 +++++++++++++++++ - bolt-plugin/bolt-plugin.cc | 1153 ++ - bolt-plugin/config.h.in | 179 + - bolt-plugin/configure | 20909 +++++++++++++++++++++++++++++++++++ - bolt-plugin/configure.ac | 60 + - gcc/common.opt | 16 + - gcc/opts.cc | 27 +- - 10 files changed, 33985 insertions(+), 2 deletions(-) - create mode 100644 bolt-plugin/Makefile - create mode 100644 bolt-plugin/Makefile.am - create mode 100644 bolt-plugin/Makefile.in - create mode 100644 bolt-plugin/aclocal.m4 - create mode 100644 bolt-plugin/bolt-plugin.cc - create mode 100644 bolt-plugin/config.h.in - create mode 100755 bolt-plugin/configure - create mode 100644 bolt-plugin/configure.ac - -diff --git a/bolt-plugin/Makefile b/bolt-plugin/Makefile -new file mode 100644 -index 000000000..82a4bc2c6 ---- /dev/null -+++ b/bolt-plugin/Makefile -@@ -0,0 +1,675 @@ -+# Makefile.in generated by automake 1.16.5 from Makefile.am. -+# Makefile. Generated from Makefile.in by configure. -+ -+# Copyright (C) 1994-2021 Free Software Foundation, Inc. -+ -+# This Makefile.in is free software; the Free Software Foundation -+# gives unlimited permission to copy and/or distribute it, -+# with or without modifications, as long as this notice is preserved. -+ -+# This program is distributed in the hope that it will be useful, -+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without -+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A -+# PARTICULAR PURPOSE. 
-+ -+ -+ -+ -+am__is_gnu_make = { \ -+ if test -z '$(MAKELEVEL)'; then \ -+ false; \ -+ elif test -n '$(MAKE_HOST)'; then \ -+ true; \ -+ elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ -+ true; \ -+ else \ -+ false; \ -+ fi; \ -+} -+am__make_running_with_option = \ -+ case $${target_option-} in \ -+ ?) ;; \ -+ *) echo "am__make_running_with_option: internal error: invalid" \ -+ "target option '$${target_option-}' specified" >&2; \ -+ exit 1;; \ -+ esac; \ -+ has_opt=no; \ -+ sane_makeflags=$$MAKEFLAGS; \ -+ if $(am__is_gnu_make); then \ -+ sane_makeflags=$$MFLAGS; \ -+ else \ -+ case $$MAKEFLAGS in \ -+ *\\\ \ *) \ -+ bs=\\; \ -+ sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ -+ | sed "s/$$bs$$bs$$bs $$bs *//g"`;; \ -+ esac; \ -+ fi; \ -+ skip_next=no; \ -+ strip_trailopt () \ -+ { \ -+ flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ -+ }; \ -+ for flg in $$sane_makeflags; do \ -+ test $$skip_next = yes && { skip_next=no; continue; }; \ -+ case $$flg in \ -+ *=*|--*) continue;; \ -+ -*I) strip_trailopt 'I'; skip_next=yes;; \ -+ -*I?*) strip_trailopt 'I';; \ -+ -*O) strip_trailopt 'O'; skip_next=yes;; \ -+ -*O?*) strip_trailopt 'O';; \ -+ -*l) strip_trailopt 'l'; skip_next=yes;; \ -+ -*l?*) strip_trailopt 'l';; \ -+ -dEDm) skip_next=yes;; \ -+ -JT) skip_next=yes;; \ -+ esac; \ -+ case $$flg in \ -+ *$$target_option*) has_opt=yes; break;; \ -+ esac; \ -+ done; \ -+ test $$has_opt = yes -+am__make_dryrun = (target_option=n; $(am__make_running_with_option)) -+am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) -+pkgdatadir = $(datadir)/bolt-plugin -+pkgincludedir = $(includedir)/bolt-plugin -+pkglibdir = $(libdir)/bolt-plugin -+pkglibexecdir = $(libexecdir)/bolt-plugin -+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd -+install_sh_DATA = $(install_sh) -c -m 644 -+install_sh_PROGRAM = $(install_sh) -c -+install_sh_SCRIPT = $(install_sh) -c -+INSTALL_HEADER = $(INSTALL_DATA) -+transform = $(program_transform_name) 
-+NORMAL_INSTALL = : -+PRE_INSTALL = : -+POST_INSTALL = : -+NORMAL_UNINSTALL = : -+PRE_UNINSTALL = : -+POST_UNINSTALL = : -+build_triplet = aarch64-unknown-linux-gnu -+host_triplet = aarch64-unknown-linux-gnu -+target_triplet = aarch64-unknown-linux-gnu -+subdir = . -+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -+am__aclocal_m4_deps = $(top_srcdir)/configure.ac -+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ -+ $(ACLOCAL_M4) -+DIST_COMMON = $(srcdir)/Makefile.am $(top_srcdir)/configure \ -+ $(am__configure_deps) -+am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \ -+ configure.lineno config.status.lineno -+mkinstalldirs = $(SHELL) $(top_srcdir)/../mkinstalldirs -+CONFIG_HEADER = config.h -+CONFIG_CLEAN_FILES = -+CONFIG_CLEAN_VPATH_FILES = -+am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; -+am__vpath_adj = case $$p in \ -+ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ -+ *) f=$$p;; \ -+ esac; -+am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; -+am__install_max = 40 -+am__nobase_strip_setup = \ -+ srcdirstrip=`echo "$(srcdir)" | sed 's/.^$$\\*|/\\\\&/g'` -+am__nobase_strip = \ -+ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" -+am__nobase_list = $(am__nobase_strip_setup); \ -+ for p in $$list; do echo "$$p $$p"; done | \ -+ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/^/*$$,\1,' | \ -+ $(AWK) 'BEGIN { files"." = "" } { files$$2 = files$$2 " " $$1; \ -+ if (++n$$2 == $(am__install_max)) \ -+ { print $$2, files$$2; n$$2 = 0; files$$2 = "" } } \ -+ END { for (dir in files) print dir, filesdir }' -+am__base_list = \ -+ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ -+ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' -+am__uninstall_files_from_dir = { \ -+ test -z "$$files" \ -+ || { test ! -d "$$dir" && test ! -f "$$dir" && test ! 
-r "$$dir"; } \ -+ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ -+ $(am__cd) "$$dir" && rm -f $$files; }; \ -+ } -+am__installdirs = "$(DESTDIR)$(libexecsubdir)" -+LTLIBRARIES = $(libexecsub_LTLIBRARIES) -+am_libbolt_plugin_la_OBJECTS = bolt-plugin.lo -+libbolt_plugin_la_OBJECTS = $(am_libbolt_plugin_la_OBJECTS) -+AM_V_P = $(am__v_P_$(V)) -+am__v_P_ = $(am__v_P_$(AM_DEFAULT_VERBOSITY)) -+am__v_P_0 = false -+am__v_P_1 = : -+AM_V_GEN = $(am__v_GEN_$(V)) -+am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) -+am__v_GEN_0 = @echo " GEN " $@; -+am__v_GEN_1 = -+AM_V_at = $(am__v_at_$(V)) -+am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY)) -+am__v_at_0 = @ -+am__v_at_1 = -+DEFAULT_INCLUDES = -I. -+depcomp = -+am__maybe_remake_depfiles = -+CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ -+ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -+AM_V_lt = $(am__v_lt_$(V)) -+am__v_lt_ = $(am__v_lt_$(AM_DEFAULT_VERBOSITY)) -+am__v_lt_0 = --silent -+am__v_lt_1 = -+LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ -+ $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \ -+ $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ -+ $(AM_CXXFLAGS) $(CXXFLAGS) -+AM_V_CXX = $(am__v_CXX_$(V)) -+am__v_CXX_ = $(am__v_CXX_$(AM_DEFAULT_VERBOSITY)) -+am__v_CXX_0 = @echo " CXX " $@; -+am__v_CXX_1 = -+CXXLD = $(CXX) -+CXXLINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ -+ $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \
View file
_service:tar_scm:0033-AutoBOLT-Enable-BOLT-linker-plugin-on-aarch64-3-3.patch
Deleted
@@ -1,345 +0,0 @@ -From 94242286383a80e6ab83d824a4d7ea23ea311f75 Mon Sep 17 00:00:00 2001 -From: zhenyu--zhao_admin <zhaozhenyu17@huawei.com> -Date: Mon, 22 Jan 2024 15:38:24 +0800 -Subject: PATCH AutoBOLT Enable BOLT linker plugin on aarch64 3/3 - ---- - Makefile.def | 10 ++++++++++ - configure | 27 ++++++++++++++++++++++++++- - configure.ac | 22 +++++++++++++++++++++- - gcc/config.host | 1 + - gcc/config.in | 13 +++++++++++++ - gcc/configure | 10 ++++++++-- - gcc/configure.ac | 4 ++++ - gcc/gcc.cc | 23 +++++++++++++++++++++++ - 8 files changed, 106 insertions(+), 4 deletions(-) - -diff --git a/Makefile.def b/Makefile.def -index 72d585496..0ba868890 100644 ---- a/Makefile.def -+++ b/Makefile.def -@@ -145,6 +145,9 @@ host_modules= { module= gnattools; }; - host_modules= { module= lto-plugin; bootstrap=true; - extra_configure_flags='--enable-shared @extra_linker_plugin_flags@ @extra_linker_plugin_configure_flags@'; - extra_make_flags='@extra_linker_plugin_flags@'; }; -+host_modules= { module= bolt-plugin; bootstrap=true; -+ extra_configure_flags='--enable-shared @extra_linker_plugin_flags@ @extra_linker_plugin_configure_flags@'; -+ extra_make_flags='@extra_linker_plugin_flags@'; }; - host_modules= { module= libcc1; extra_configure_flags=--enable-shared; }; - host_modules= { module= gotools; }; - host_modules= { module= libctf; bootstrap=true; }; -@@ -349,6 +352,7 @@ dependencies = { module=configure-gcc; on=all-mpfr; }; - dependencies = { module=configure-gcc; on=all-mpc; }; - dependencies = { module=configure-gcc; on=all-isl; }; - dependencies = { module=configure-gcc; on=all-lto-plugin; }; -+dependencies = { module=configure-gcc; on=all-bolt-plugin; }; - dependencies = { module=configure-gcc; on=all-binutils; }; - dependencies = { module=configure-gcc; on=all-gas; }; - dependencies = { module=configure-gcc; on=all-ld; }; -@@ -374,6 +378,7 @@ dependencies = { module=all-gcc; on=all-libdecnumber; hard=true; }; - dependencies = { module=all-gcc; on=all-libiberty; }; 
- dependencies = { module=all-gcc; on=all-fixincludes; }; - dependencies = { module=all-gcc; on=all-lto-plugin; }; -+dependencies = { module=all-gcc; on=all-bolt-plugin; }; - dependencies = { module=all-gcc; on=all-libiconv; }; - dependencies = { module=info-gcc; on=all-build-libiberty; }; - dependencies = { module=dvi-gcc; on=all-build-libiberty; }; -@@ -381,8 +386,10 @@ dependencies = { module=pdf-gcc; on=all-build-libiberty; }; - dependencies = { module=html-gcc; on=all-build-libiberty; }; - dependencies = { module=install-gcc ; on=install-fixincludes; }; - dependencies = { module=install-gcc ; on=install-lto-plugin; }; -+dependencies = { module=install-gcc ; on=install-bolt-plugin; }; - dependencies = { module=install-strip-gcc ; on=install-strip-fixincludes; }; - dependencies = { module=install-strip-gcc ; on=install-strip-lto-plugin; }; -+dependencies = { module=install-strip-gcc ; on=install-strip-bolt-plugin; }; - - dependencies = { module=configure-libcpp; on=configure-libiberty; hard=true; }; - dependencies = { module=configure-libcpp; on=configure-intl; }; -@@ -401,6 +408,9 @@ dependencies = { module=all-gnattools; on=all-target-libstdc++-v3; }; - dependencies = { module=all-lto-plugin; on=all-libiberty; }; - dependencies = { module=all-lto-plugin; on=all-libiberty-linker-plugin; }; - -+dependencies = { module=all-bolt-plugin; on=all-libiberty; }; -+dependencies = { module=all-bolt-plugin; on=all-libiberty-linker-plugin; }; -+ - dependencies = { module=configure-libcc1; on=configure-gcc; }; - dependencies = { module=all-libcc1; on=all-gcc; }; - -diff --git a/configure b/configure -index 5dcaab14a..aff62c464 100755 ---- a/configure -+++ b/configure -@@ -826,6 +826,7 @@ with_isl - with_isl_include - with_isl_lib - enable_isl_version_check -+enable_bolt - enable_lto - enable_linker_plugin_configure_flags - enable_linker_plugin_flags -@@ -1550,6 +1551,7 @@ Optional Features: - enable the PGO build - --disable-isl-version-check - disable check for isl version 
-+ --enable-bolt enable bolt optimization support - --enable-lto enable link time optimization support - --enable-linker-plugin-configure-flags=FLAGS - additional flags for configuring linker plugins -@@ -8564,6 +8566,15 @@ fi - - - -+# Check for BOLT support. -+# Check whether --enable-bolt was given. -+if test "${enable_bolt+set}" = set; then : -+ enableval=$enable_bolt; enable_bolt=$enableval -+else -+ enable_bolt=no; default_enable_bolt=no -+fi -+ -+ - # Check for LTO support. - # Check whether --enable-lto was given. - if test "${enable_lto+set}" = set; then : -@@ -8593,6 +8604,16 @@ if test $target_elf = yes; then : - # ELF platforms build the lto-plugin always. - build_lto_plugin=yes - -+ # ELF platforms can build the bolt-plugin. -+ # NOT BUILD BOLT BY DEFAULT. -+ case $target in -+ aarch64*-*-linux*) -+ if test $enable_bolt = yes; then : -+ build_bolt_plugin=yes -+ fi -+ ;; -+ esac -+ - else - if test x"$default_enable_lto" = x"yes" ; then - case $target in -@@ -8780,6 +8801,10 @@ if test -d ${srcdir}/gcc; then - fi - fi - -+ if test "${build_bolt_plugin}" = "yes" ; then -+ configdirs="$configdirs bolt-plugin" -+ fi -+ - # If we're building an offloading compiler, add the LTO front end. - if test x"$enable_as_accelerator_for" != x ; then - case ,${enable_languages}, in -@@ -9202,7 +9227,7 @@ fi - extra_host_libiberty_configure_flags= - extra_host_zlib_configure_flags= - case " $configdirs " in -- *" lto-plugin "* | *" libcc1 "*) -+ *" lto-plugin "* | *" libcc1 "* | *" bolt-plugin "*) - # When these are to be built as shared libraries, the same applies to - # libiberty. - extra_host_libiberty_configure_flags=--enable-shared -diff --git a/configure.ac b/configure.ac -index 85977482a..f310d75ca 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -1863,6 +1863,12 @@ fi - AC_SUBST(isllibs) - AC_SUBST(islinc) - -+# Check for BOLT support. 
-+AC_ARG_ENABLE(bolt, -+AS_HELP_STRING(--enable-bolt, enable bolt optimization support), -+enable_bolt=$enableval, -+enable_bolt=no; default_enable_bolt=no) -+ - # Check for LTO support. - AC_ARG_ENABLE(lto, - AS_HELP_STRING(--enable-lto, enable link time optimization support), -@@ -1871,6 +1877,16 @@ enable_lto=yes; default_enable_lto=yes) - - ACX_ELF_TARGET_IFELSE(# ELF platforms build the lto-plugin always. - build_lto_plugin=yes -+ -+ # ELF platforms can build the bolt-plugin. -+ # NOT BUILD BOLT BY DEFAULT. -+ case $target in -+ aarch64*-*-linux*) -+ if test $enable_bolt = yes; then : -+ build_bolt_plugin=yes -+ fi -+ ;; -+ esac - ,if test x"$default_enable_lto" = x"yes" ; then - case $target in - *-apple-darwin912* | *-cygwin* | *-mingw* | *djgpp*) ;; -@@ -2049,6 +2065,10 @@ if test -d ${srcdir}/gcc; then - fi - fi - -+ if test "${build_bolt_plugin}" = "yes" ; then -+ configdirs="$configdirs bolt-plugin" -+ fi -+ - # If we're building an offloading compiler, add the LTO front end. - if test x"$enable_as_accelerator_for" != x ; then - case ,${enable_languages}, in -@@ -2457,7 +2477,7 @@ fi - extra_host_libiberty_configure_flags= - extra_host_zlib_configure_flags= - case " $configdirs " in -- *" lto-plugin "* | *" libcc1 "*) -+ *" lto-plugin "* | *" libcc1 "* | *" bolt-plugin "*) - # When these are to be built as shared libraries, the same applies to - # libiberty. - extra_host_libiberty_configure_flags=--enable-shared -diff --git a/gcc/config.host b/gcc/config.host -index 4ca300f11..bf7dcb4cc 100644 ---- a/gcc/config.host -+++ b/gcc/config.host -@@ -75,6 +75,7 @@ out_host_hook_obj=host-default.o
View file
_service:tar_scm:0034-Autofdo-Enable-discrimibator-and-MCF-algorithm-on-Au.patch
Deleted
@@ -1,312 +0,0 @@ -From b020447c840c6e22440a9b9063298a06333fd2f1 Mon Sep 17 00:00:00 2001 -From: zhenyu--zhao <zhaozhenyu17@huawei.com> -Date: Sat, 23 Mar 2024 22:56:09 +0800 -Subject: PATCH AutofdoEnable discrimibator and MCF algorithm on Autofdo - ---- - gcc/auto-profile.cc | 171 +++++++++++++++++++++++++++++++++++++++++++- - gcc/cfghooks.cc | 7 ++ - gcc/opts.cc | 5 +- - gcc/tree-inline.cc | 14 ++++ - 4 files changed, 193 insertions(+), 4 deletions(-) - -diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc -index 2b34b80b8..f45f0ec66 100644 ---- a/gcc/auto-profile.cc -+++ b/gcc/auto-profile.cc -@@ -466,6 +466,17 @@ string_table::get_index (const char *name) const - if (name == NULL) - return -1; - string_index_map::const_iterator iter = map_.find (name); -+ /* Function name may be duplicate. Try to distinguish by the -+ #file_name#function_name defined by the autofdo tool chain. */ -+ if (iter == map_.end ()) -+ { -+ char* file_name = get_original_name (lbasename (dump_base_name)); -+ char* file_func_name -+ = concat ("#", file_name, "#", name, NULL); -+ iter = map_.find (file_func_name); -+ free (file_name); -+ free (file_func_name); -+ } - if (iter == map_.end ()) - return -1; - -@@ -654,7 +665,7 @@ function_instance::read_function_instance (function_instance_stack *stack, - - for (unsigned i = 0; i < num_pos_counts; i++) - { -- unsigned offset = gcov_read_unsigned () & 0xffff0000; -+ unsigned offset = gcov_read_unsigned (); - unsigned num_targets = gcov_read_unsigned (); - gcov_type count = gcov_read_counter (); - s->pos_countsoffset.count = count; -@@ -733,6 +744,10 @@ autofdo_source_profile::get_count_info (gimple *stmt, count_info *info) const - function_instance *s = get_function_instance_by_inline_stack (stack); - if (s == NULL) - return false; -+ if (s->get_count_info (stack0.second + stmt->bb->discriminator, info)) -+ { -+ return true; -+ } - return s->get_count_info (stack0.second, info); - } - -@@ -1395,6 +1410,66 @@ afdo_propagate (bb_set 
*annotated_bb) - } - } - -+/* Process the following scene when the branch probability -+ inversion when do function afdo_propagate (). E.g. -+ BB_NUM (sample count) -+ BB1 (1000) -+ / \ -+ BB2 (10) BB3 (0) -+ \ / -+ BB4 -+ In afdo_propagate ().count of BB3 is calculated by -+ COUNT (BB3) = 990 (990 = COUNT (BB1) - COUNT (BB2) = 1000 - 10) -+ In fact, BB3 may be colder than BB2 by sample count. -+ This function allocate source BB count to wach succ BB by sample -+ rate, E.g. -+ BB2_COUNT = BB1_COUNT * (BB2_COUNT / (BB2_COUNT + BB3_COUNT)) */ -+ -+static void -+afdo_preprocess_bb_count () -+{ -+ basic_block bb; -+ FOR_ALL_BB_FN (bb, cfun) -+ { -+ if (bb->count.ipa_p () && EDGE_COUNT (bb->succs) > 1 -+ && bb->count > profile_count::zero ().afdo ()) -+ { -+ basic_block bb1 = EDGE_SUCC (bb, 0)->dest; -+ basic_block bb2 = EDGE_SUCC (bb, 1)->dest; -+ if (single_succ_edge (bb1) && single_succ_edge (bb2) -+ && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest) -+ { -+ gcov_type max_count = 0; -+ gcov_type total_count = 0; -+ edge e; -+ edge_iterator ei; -+ FOR_EACH_EDGE (e, ei, bb->succs) -+ { -+ if (!e->dest->count.ipa_p ()) -+ { -+ continue; -+ } -+ max_count = MAX (max_count, e->dest->count.to_gcov_type ()); -+ total_count += e->dest->count.to_gcov_type (); -+ } -+ /* Only bb_count > max_count * 2, branch probability will -+ inversion. */ -+ if (max_count > 0 && bb->count.to_gcov_type () > max_count * 2) -+ { -+ FOR_EACH_EDGE (e, ei, bb->succs) -+ { -+ gcov_type target_count = bb->count.to_gcov_type () -+ * e->dest->count.to_gcov_type ()/ total_count; -+ e->dest->count -+ = profile_count::from_gcov_type -+ (target_count).afdo (); -+ } -+ } -+ } -+ } -+ } -+} -+ - /* Propagate counts on control flow graph and calculate branch - probabilities. 
*/ - -@@ -1420,6 +1495,7 @@ afdo_calculate_branch_prob (bb_set *annotated_bb) - } - - afdo_find_equiv_class (annotated_bb); -+ afdo_preprocess_bb_count (); - afdo_propagate (annotated_bb); - - FOR_EACH_BB_FN (bb, cfun) -@@ -1523,6 +1599,83 @@ afdo_vpt_for_early_inline (stmt_set *promoted_stmts) - return false; - } - -+/* Preparation before executing MCF algorithm. */ -+ -+static void -+afdo_init_mcf () -+{ -+ basic_block bb; -+ edge e; -+ edge_iterator ei; -+ -+ if (dump_file) -+ { -+ fprintf (dump_file, "\n init calling mcf_smooth_cfg (). \n"); -+ } -+ -+ /* Step1: when use mcf, BB id must be continous, -+ so we need compact_blocks (). */ -+ compact_blocks (); -+ -+ /* Step2: allocate memory for MCF input data. */ -+ bb_gcov_counts.safe_grow_cleared (cfun->cfg->x_last_basic_block); -+ edge_gcov_counts = new hash_map<edge, gcov_type>; -+ -+ /* Step3: init MCF input data from cfg. */ -+ FOR_ALL_BB_FN (bb, cfun) -+ { -+ /* Init BB count for MCF. */ -+ bb_gcov_count (bb) = bb->count.to_gcov_type (); -+ -+ gcov_type total_count = 0; -+ FOR_EACH_EDGE (e, ei, bb->succs) -+ { -+ total_count += e->dest->count.to_gcov_type (); -+ } -+ -+ /* If there is no sample in each successor blocks, source -+ BB samples are allocated to each edge by branch static prob. */ -+ -+ FOR_EACH_EDGE (e, ei, bb->succs) -+ { -+ if (total_count == 0) -+ { -+ edge_gcov_count (e) = e->src->count.to_gcov_type () -+ * e->probability.to_reg_br_prob_base () / REG_BR_PROB_BASE; -+ } -+ else -+ { -+ edge_gcov_count (e) = e->src->count.to_gcov_type () -+ * e->dest->count.to_gcov_type () / total_count; -+ } -+ } -+ } -+} -+ -+ -+/* Free the resources used by MCF and reset BB count from MCF result. -+ branch probability has been updated in mcf_smooth_cfg (). */ -+ -+static void -+afdo_process_after_mcf () -+{ -+ basic_block bb; -+ /* Reset BB count from MCF result. */ -+ FOR_EACH_BB_FN (bb, cfun) -+ { -+ if (bb_gcov_count (bb)) -+ {
View file
_service:tar_scm:0035-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch
Deleted
@@ -1,194 +0,0 @@ -From aa39a66f6029fe16a656d7c6339908b953fb1e04 Mon Sep 17 00:00:00 2001 -From: Diachkov Ilia WX1215920 <diachkov.ilia1@huawei-partners.com> -Date: Thu, 22 Feb 2024 11:27:43 +0300 -Subject: PATCH 01/18 Add insn defs and correct costs for cmlt generation - ---- - gcc/config/aarch64/aarch64-simd.md | 48 +++++++++++++++++++++++++++++ - gcc/config/aarch64/aarch64.cc | 15 +++++++++ - gcc/config/aarch64/aarch64.opt | 4 +++ - gcc/config/aarch64/iterators.md | 3 +- - gcc/config/aarch64/predicates.md | 25 +++++++++++++++ - gcc/testsuite/gcc.dg/combine-cmlt.c | 20 ++++++++++++ - 6 files changed, 114 insertions(+), 1 deletion(-) - create mode 100755 gcc/testsuite/gcc.dg/combine-cmlt.c - -diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md -index ee7f0b89c..82f73805f 100644 ---- a/gcc/config/aarch64/aarch64-simd.md -+++ b/gcc/config/aarch64/aarch64-simd.md -@@ -6454,6 +6454,54 @@ - (set_attr "type" "neon_compare<q>, neon_compare_zero<q>") - ) - -+;; Use cmlt to replace vector arithmetic operations like this (SImode example): -+;; B = (((A >> 15) & 0x00010001) << 16) - ((A >> 15) & 0x00010001) -+;; TODO: maybe extend to scalar operations or other cm** instructions. -+ -+(define_insn "*aarch64_cmlt_as_arith<mode>" -+ (set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w") -+ (minus:<V_INT_EQUIV> -+ (ashift:<V_INT_EQUIV> -+ (and:<V_INT_EQUIV> -+ (lshiftrt:<V_INT_EQUIV> -+ (match_operand:VDQHSD 1 "register_operand" "w") -+ (match_operand:VDQHSD 2 "half_size_minus_one_operand")) -+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")) -+ (match_operand:VDQHSD 4 "half_size_operand")) -+ (and:<V_INT_EQUIV> -+ (lshiftrt:<V_INT_EQUIV> -+ (match_dup 1) -+ (match_dup 2)) -+ (match_dup 3)))) -+ "TARGET_SIMD && flag_cmlt_arith" -+ "cmlt\t%<v>0.<V2ntype>, %<v>1.<V2ntype>, #0" -+ (set_attr "type" "neon_compare_zero") -+) -+ -+;; The helper definition that allows combiner to use the previous pattern. 
-+ -+(define_insn_and_split "*arch64_cmlt_tmp<mode>" -+ (set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w") -+ (and:<V_INT_EQUIV> -+ (lshiftrt:<V_INT_EQUIV> -+ (match_operand:VDQHSD 1 "register_operand" "w") -+ (match_operand:VDQHSD 2 "half_size_minus_one_operand")) -+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand"))) -+ "TARGET_SIMD && flag_cmlt_arith" -+ "#" -+ "&& reload_completed" -+ (set (match_operand:<V_INT_EQUIV> 0 "register_operand") -+ (lshiftrt:<V_INT_EQUIV> -+ (match_operand:VDQHSD 1 "register_operand") -+ (match_operand:VDQHSD 2 "half_size_minus_one_operand"))) -+ (set (match_dup 0) -+ (and:<V_INT_EQUIV> -+ (match_dup 0) -+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand"))) -+ "" -+ (set_attr "type" "neon_compare_zero") -+) -+ - (define_insn_and_split "aarch64_cm<optab>di" - (set (match_operand:DI 0 "register_operand" "=w,w,r") - (neg:DI -diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc -index a3da4ca30..04072ca25 100644 ---- a/gcc/config/aarch64/aarch64.cc -+++ b/gcc/config/aarch64/aarch64.cc -@@ -14064,6 +14064,21 @@ cost_minus: - return true; - } - -+ /* Detect aarch64_cmlt_as_arith instruction. Now only this pattern -+ matches the condition. The costs of cmlt and sub instructions -+ are comparable, so we are not increasing the cost here. */ -+ if (flag_cmlt_arith && GET_CODE (op0) == ASHIFT -+ && GET_CODE (op1) == AND) -+ { -+ rtx op0_subop0 = XEXP (op0, 0); -+ if (rtx_equal_p (op0_subop0, op1)) -+ { -+ rtx lshrt_op = XEXP (op0_subop0, 0); -+ if (GET_CODE (lshrt_op) == LSHIFTRT) -+ return true; -+ } -+ } -+ - /* Look for SUB (extended register). */ - if (is_a <scalar_int_mode> (mode) - && aarch64_rtx_arith_op_extract_p (op1)) -diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt -index a64b927e9..101664c7c 100644 ---- a/gcc/config/aarch64/aarch64.opt -+++ b/gcc/config/aarch64/aarch64.opt -@@ -262,6 +262,10 @@ Use an immediate to offset from the stack protector guard register, sp_el0. 
- This option is for use with fstack-protector-strong and not for use in - user-land code. - -+mcmlt-arith -+Target Var(flag_cmlt_arith) Optimization Init(0) -+Use SIMD cmlt instruction to perform some arithmetic/logic calculations. -+ - TargetVariable - long aarch64_stack_protector_guard_offset = 0 - -diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md -index 26a840d7f..967e6b0b1 100644 ---- a/gcc/config/aarch64/iterators.md -+++ b/gcc/config/aarch64/iterators.md -@@ -1485,7 +1485,8 @@ - (V2DI "2s")) - - ;; Register suffix narrowed modes for VQN. --(define_mode_attr V2ntype (V8HI "16b") (V4SI "8h") -+(define_mode_attr V2ntype (V4HI "8b") (V2SI "4h") -+ (V8HI "16b") (V4SI "8h") - (V2DI "4s")) - - ;; Widened modes of vector modes. -diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md -index c308015ac..07c14aacb 100644 ---- a/gcc/config/aarch64/predicates.md -+++ b/gcc/config/aarch64/predicates.md -@@ -49,6 +49,31 @@ - return CONST_INT_P (op) && IN_RANGE (INTVAL (op), 1, 3); - }) - -+(define_predicate "half_size_minus_one_operand" -+ (match_code "const_vector") -+{ -+ op = unwrap_const_vec_duplicate (op); -+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2; -+ return CONST_INT_P (op) && (UINTVAL (op) == size - 1); -+}) -+ -+(define_predicate "half_size_operand" -+ (match_code "const_vector") -+{ -+ op = unwrap_const_vec_duplicate (op); -+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2; -+ return CONST_INT_P (op) && (UINTVAL (op) == size); -+}) -+ -+(define_predicate "cmlt_arith_mask_operand" -+ (match_code "const_vector") -+{ -+ op = unwrap_const_vec_duplicate (op); -+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2; -+ unsigned long long mask = ((unsigned long long) 1 << size) | 1; -+ return CONST_INT_P (op) && (UINTVAL (op) == mask); -+}) -+ - (define_predicate "subreg_lowpart_operator" - (ior (match_code "truncate") - (and (match_code "subreg") -diff --git a/gcc/testsuite/gcc.dg/combine-cmlt.c 
b/gcc/testsuite/gcc.dg/combine-cmlt.c -new file mode 100755 -index 000000000..b4c9a37ff ---- /dev/null -+++ b/gcc/testsuite/gcc.dg/combine-cmlt.c -@@ -0,0 +1,20 @@ -+/* { dg-do compile { target aarch64-*-* } } */ -+/* { dg-options "-O3 -mcmlt-arith" } */ -+ -+/* The test checks usage of cmlt insns for arithmetic/logic calculations -+ * in foo (). It's inspired by sources of x264 codec. */ -+ -+typedef unsigned short int uint16_t; -+typedef unsigned int uint32_t; -+ -+void foo( uint32_t *a, uint32_t *b) -+{ -+ for (unsigned i = 0; i < 4; i++) -+ { -+ uint32_t s = ((a[i]>>((8 * sizeof(uint16_t))-1)) -+ &(((uint32_t)1<<(8 * sizeof(uint16_t)))+1))*((uint16_t)-1); -+ b[i] = (a[i]+s)^s; -+ } -+} -+ -+/* { dg-final { scan-assembler-times {cmlt\t} 1 } } */ --- -2.33.0 -
View file
_service:tar_scm:0036-rtl-ifcvt-introduce-rtl-ifcvt-enchancements.patch
Deleted
@@ -1,560 +0,0 @@ -From 4cae948c1c00ad7a59f0f234f809fbd9a0208eb4 Mon Sep 17 00:00:00 2001 -From: vchernon <chernonog.vyacheslav@huawei.com> -Date: Wed, 28 Feb 2024 23:05:12 +0800 -Subject: PATCH 02/18 rtl-ifcvt introduce rtl ifcvt enchancements new - option: -fifcvt-allow-complicated-cmps: allows ifcvt to deal - with complicated cmps like - - cmp reg1 (reg2 + reg3) - - can increase compilation time - new param: - -param=ifcvt-allow-register-renaming=0,1,2 - 1 : allows ifcvt to rename registers in then and else bb - 2 : allows to rename registers in condition and else/then bb - can increase compilation time and register pressure ---- - gcc/common.opt | 4 + - gcc/ifcvt.cc | 291 +++++++++++++++--- - gcc/params.opt | 4 + - .../gcc.c-torture/execute/ifcvt-renaming-1.c | 35 +++ - gcc/testsuite/gcc.dg/ifcvt-6.c | 27 ++ - 5 files changed, 311 insertions(+), 50 deletions(-) - create mode 100644 gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c - create mode 100644 gcc/testsuite/gcc.dg/ifcvt-6.c - -diff --git a/gcc/common.opt b/gcc/common.opt -index c7c6bc256..aa00fb7b0 100644 ---- a/gcc/common.opt -+++ b/gcc/common.opt -@@ -3691,4 +3691,8 @@ fipa-ra - Common Var(flag_ipa_ra) Optimization - Use caller save register across calls if possible. - -+fifcvt-allow-complicated-cmps -+Common Var(flag_ifcvt_allow_complicated_cmps) Optimization -+Allow RTL if-conversion pass to deal with complicated cmps (can increase compilation time). -+ - ; This comment is to ensure we retain the blank line above. -diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc -index 2c1eba312..584db7b55 100644 ---- a/gcc/ifcvt.cc -+++ b/gcc/ifcvt.cc -@@ -886,7 +886,9 @@ noce_emit_store_flag (struct noce_if_info *if_info, rtx x, int reversep, - } - - /* Don't even try if the comparison operands or the mode of X are weird. 
*/ -- if (cond_complex || !SCALAR_INT_MODE_P (GET_MODE (x))) -+ if (!flag_ifcvt_allow_complicated_cmps -+ && (cond_complex -+ || !SCALAR_INT_MODE_P (GET_MODE (x)))) - return NULL_RTX; - - return emit_store_flag (x, code, XEXP (cond, 0), -@@ -1965,7 +1967,8 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc) - /* Currently support only simple single sets in test_bb. */ - if (!sset - || !noce_operand_ok (SET_DEST (sset)) -- || contains_ccmode_rtx_p (SET_DEST (sset)) -+ || (!flag_ifcvt_allow_complicated_cmps -+ && contains_ccmode_rtx_p (SET_DEST (sset))) - || !noce_operand_ok (SET_SRC (sset))) - return false; - -@@ -1979,13 +1982,17 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc) - in this function. */ - - static bool --bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename) -+bbs_ok_for_cmove_arith (basic_block bb_a, -+ basic_block bb_b, -+ rtx to_rename, -+ bitmap conflict_regs) - { - rtx_insn *a_insn; - bitmap bba_sets = BITMAP_ALLOC (®_obstack); -- -+ bitmap intersections = BITMAP_ALLOC (®_obstack); - df_ref def; - df_ref use; -+ rtx_insn *last_a = last_active_insn (bb_a, FALSE); - - FOR_BB_INSNS (bb_a, a_insn) - { -@@ -1995,18 +2002,15 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename) - rtx sset_a = single_set (a_insn); - - if (!sset_a) -- { -- BITMAP_FREE (bba_sets); -- return false; -- } -+ goto end_cmove_arith_check_and_fail; - /* Record all registers that BB_A sets. 
*/ - FOR_EACH_INSN_DEF (def, a_insn) -- if (!(to_rename && DF_REF_REG (def) == to_rename)) -+ if (!(to_rename && DF_REF_REG (def) == to_rename && a_insn == last_a)) - bitmap_set_bit (bba_sets, DF_REF_REGNO (def)); - } - -+ bitmap_and (intersections, df_get_live_in (bb_b), bba_sets); - rtx_insn *b_insn; -- - FOR_BB_INSNS (bb_b, b_insn) - { - if (!active_insn_p (b_insn)) -@@ -2015,10 +2019,7 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename) - rtx sset_b = single_set (b_insn); - - if (!sset_b) -- { -- BITMAP_FREE (bba_sets); -- return false; -- } -+ goto end_cmove_arith_check_and_fail; - - /* Make sure this is a REG and not some instance - of ZERO_EXTRACT or SUBREG or other dangerous stuff. -@@ -2030,25 +2031,34 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename) - if (MEM_P (SET_DEST (sset_b))) - gcc_assert (rtx_equal_p (SET_DEST (sset_b), to_rename)); - else if (!REG_P (SET_DEST (sset_b))) -- { -- BITMAP_FREE (bba_sets); -- return false; -- } -+ goto end_cmove_arith_check_and_fail; - -- /* If the insn uses a reg set in BB_A return false. */ -+ /* If the insn uses a reg set in BB_A return false -+ or try to collect register list for renaming. */ - FOR_EACH_INSN_USE (use, b_insn) - { -- if (bitmap_bit_p (bba_sets, DF_REF_REGNO (use))) -+ if (bitmap_bit_p (intersections, DF_REF_REGNO (use))) - { -- BITMAP_FREE (bba_sets); -- return false; -+ if (param_ifcvt_allow_register_renaming < 1) -+ goto end_cmove_arith_check_and_fail; -+ -+ /* Those regs should be renamed. We can't rename CC reg, but -+ possibly we can provide combined comparison in the future. 
*/ -+ if (GET_MODE_CLASS (GET_MODE (DF_REF_REG (use))) == MODE_CC) -+ goto end_cmove_arith_check_and_fail; -+ bitmap_set_bit (conflict_regs, DF_REF_REGNO (use)); - } - } -- - } - - BITMAP_FREE (bba_sets); -+ BITMAP_FREE (intersections); - return true; -+ -+end_cmove_arith_check_and_fail: -+ BITMAP_FREE (bba_sets); -+ BITMAP_FREE (intersections); -+ return false; - } - - /* Emit copies of all the active instructions in BB except the last. -@@ -2103,6 +2113,142 @@ noce_emit_bb (rtx last_insn, basic_block bb, bool simple) - return true; - } - -+/* This function tries to rename regs that intersect with considered bb -+ inside condition expression. Condition expression will be moved down -+ if the optimization will be applied, so it is essential to be sure that -+ all intersected registers will be renamed otherwise transformation -+ can't be applied. Function returns true if renaming was successful -+ and optimization can proceed futher. */ -+ -+static bool -+noce_rename_regs_in_cond (struct noce_if_info *if_info, bitmap cond_rename_regs) -+{ -+ bool success = true; -+ if (bitmap_empty_p (cond_rename_regs)) -+ return true; -+ if (param_ifcvt_allow_register_renaming < 2) -+ return false; -+ df_ref use; -+ rtx_insn *cmp_insn = if_info->cond_earliest; -+ /* Jump instruction as a condion currently unsupported. */ -+ if (JUMP_P (cmp_insn)) -+ return false; -+ rtx_insn *before_cmp = PREV_INSN (cmp_insn); -+ start_sequence (); -+ rtx_insn *copy_of_cmp = as_a <rtx_insn *> (copy_rtx (cmp_insn)); -+ basic_block cmp_block = BLOCK_FOR_INSN (cmp_insn); -+ FOR_EACH_INSN_USE (use, cmp_insn) -+ { -+ if (bitmap_bit_p (cond_rename_regs, DF_REF_REGNO (use))) -+ { -+ rtx use_reg = DF_REF_REG (use); -+ rtx tmp = gen_reg_rtx (GET_MODE (use_reg)); -+ if (!validate_replace_rtx (use_reg, tmp, copy_of_cmp)) -+ { -+ end_sequence (); -+ return false;
View file
_service:tar_scm:0037-Perform-early-if-conversion-of-simple-arithmetic.patch
Deleted
@@ -1,109 +0,0 @@ -From 310eade1450995b55d9f8120561022fbf164b2ec Mon Sep 17 00:00:00 2001 -From: Pronin Alexander 00812787 <pronin.alexander@huawei.com> -Date: Thu, 12 Jan 2023 14:52:49 +0300 -Subject: PATCH 03/18 Perform early if-conversion of simple arithmetic - ---- - gcc/common.opt | 4 ++++ - gcc/match.pd | 25 +++++++++++++++++++ - gcc/testsuite/gcc.dg/ifcvt-gimple.c | 37 +++++++++++++++++++++++++++++ - 3 files changed, 66 insertions(+) - create mode 100644 gcc/testsuite/gcc.dg/ifcvt-gimple.c - -diff --git a/gcc/common.opt b/gcc/common.opt -index aa00fb7b0..dac477c04 100644 ---- a/gcc/common.opt -+++ b/gcc/common.opt -@@ -1821,6 +1821,10 @@ fif-conversion2 - Common Var(flag_if_conversion2) Optimization - Perform conversion of conditional jumps to conditional execution. - -+fif-conversion-gimple -+Common Var(flag_if_conversion_gimple) Optimization -+Perform conversion of conditional jumps to branchless equivalents during gimple transformations. -+ - fstack-reuse= - Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization - -fstack-reuse=all|named_vars|none Set stack reuse level for local variables. -diff --git a/gcc/match.pd b/gcc/match.pd -index 6f24d5079..3cbaf2a5b 100644 ---- a/gcc/match.pd -+++ b/gcc/match.pd -@@ -4278,6 +4278,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) - ) - ) - ) -+ -+(if (flag_if_conversion_gimple) -+ (for simple_op (plus minus bit_and bit_ior bit_xor) -+ (simplify -+ (cond @0 (simple_op @1 INTEGER_CST@2) @1) -+ (switch -+ /* a = cond ? a + 1 : a -> a = a + ((int) cond) */ -+ (if (integer_onep (@2)) -+ (simple_op @1 (convert (convert:boolean_type_node @0)))) -+ /* a = cond ? 
a + powerof2cst : a -> -+ a = a + ((int) cond) << log2 (powerof2cst) */ -+ (if (INTEGRAL_TYPE_P (type) && integer_pow2p (@2)) -+ (with -+ { -+ tree shift = build_int_cst (integer_type_node, tree_log2 (@2)); -+ } -+ (simple_op @1 (lshift (convert (convert:boolean_type_node @0)) -+ { shift; }) -+ ) -+ ) -+ ) -+ ) -+ ) -+ ) -+) - #endif - - #if GIMPLE -diff --git a/gcc/testsuite/gcc.dg/ifcvt-gimple.c b/gcc/testsuite/gcc.dg/ifcvt-gimple.c -new file mode 100644 -index 000000000..0f7c87e5c ---- /dev/null -+++ b/gcc/testsuite/gcc.dg/ifcvt-gimple.c -@@ -0,0 +1,37 @@ -+/* { dg-do compile } */ -+/* { dg-options "-O2 -fif-conversion-gimple -fdump-tree-optimized" } */ -+ -+int test_int (int optimizable_int) { -+ if (optimizable_int > 5) -+ ++optimizable_int; -+ return optimizable_int; -+} -+ -+int test_int_pow2 (int optimizable_int_pow2) { -+ if (optimizable_int_pow2 <= 4) -+ optimizable_int_pow2 += 1024; -+ return optimizable_int_pow2; -+} -+ -+int test_int_non_pow2 (int not_optimizable_int_non_pow2) { -+ if (not_optimizable_int_non_pow2 == 1) -+ not_optimizable_int_non_pow2 += 513; -+ return not_optimizable_int_non_pow2; -+} -+ -+float test_float (float not_optimizable_float) { -+ if (not_optimizable_float > 5) -+ not_optimizable_float += 1; -+ return not_optimizable_float; -+} -+ -+/* Expecting if-else block in test_float and test_int_non_pow2 only. */ -+/* { dg-final { scan-tree-dump-not "if \\(optimizable" "optimized" } } */ -+/* { dg-final { scan-tree-dump "if \\(not_optimizable_int_non_pow2" "optimized" } } */ -+/* { dg-final { scan-tree-dump "if \\(not_optimizable_float" "optimized" } } */ -+/* { dg-final { scan-tree-dump-times "if " 2 "optimized" } } */ -+/* { dg-final { scan-tree-dump-times "else" 2 "optimized" } } */ -+ -+/* Expecting shifted result only for optimizable_int_pow2. */ -+/* { dg-final { scan-tree-dump-times " << " 1 "optimized" } } */ -+/* { dg-final { scan-tree-dump " << 10;" "optimized" } } */ --- -2.33.0 -
View file
_service:tar_scm:0038-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch
Deleted
@@ -1,252 +0,0 @@ -From 6684509e81e4341675c73a7dc853180229a8abcb Mon Sep 17 00:00:00 2001 -From: Pronin Alexander 00812787 <pronin.alexander@huawei.com> -Date: Tue, 24 Jan 2023 16:43:40 +0300 -Subject: PATCH 04/18 Add option to allow matching uaddsub overflow for widen - ops too. - ---- - gcc/common.opt | 5 ++ - gcc/testsuite/gcc.dg/uaddsub.c | 143 +++++++++++++++++++++++++++++++++ - gcc/tree-ssa-math-opts.cc | 43 ++++++++-- - 3 files changed, 184 insertions(+), 7 deletions(-) - create mode 100644 gcc/testsuite/gcc.dg/uaddsub.c - -diff --git a/gcc/common.opt b/gcc/common.opt -index dac477c04..39c90604e 100644 ---- a/gcc/common.opt -+++ b/gcc/common.opt -@@ -3106,6 +3106,11 @@ freciprocal-math - Common Var(flag_reciprocal_math) SetByCombined Optimization - Same as -fassociative-math for expressions which include division. - -+fuaddsub-overflow-match-all -+Common Var(flag_uaddsub_overflow_match_all) -+Match unsigned add/sub overflow even if the target does not support -+the corresponding instruction. -+ - ; Nonzero means that unsafe floating-point math optimizations are allowed - ; for the sake of speed. 
IEEE compliance is not guaranteed, and operations - ; are allowed to assume that their arguments and results are "normal" -diff --git a/gcc/testsuite/gcc.dg/uaddsub.c b/gcc/testsuite/gcc.dg/uaddsub.c -new file mode 100644 -index 000000000..96c26d308 ---- /dev/null -+++ b/gcc/testsuite/gcc.dg/uaddsub.c -@@ -0,0 +1,143 @@ -+/* { dg-do compile } */ -+/* { dg-options "-O2 -fuaddsub-overflow-match-all -fdump-tree-optimized" } */ -+#include <stdint.h> -+ -+typedef unsigned __int128 uint128_t; -+typedef struct uint256_t -+{ -+ uint128_t lo; -+ uint128_t hi; -+} uint256_t; -+ -+uint16_t add16 (uint8_t a, uint8_t b) -+{ -+ uint8_t tmp = a + b; -+ uint8_t overflow = 0; -+ if (tmp < a) -+ overflow = 1; -+ -+ uint16_t res = overflow; -+ res <<= 8; -+ res += tmp; -+ return res; -+} -+ -+uint32_t add32 (uint16_t a, uint16_t b) -+{ -+ uint16_t tmp = a + b; -+ uint16_t overflow = 0; -+ if (tmp < a) -+ overflow = 1; -+ -+ uint32_t res = overflow; -+ res <<= 16; -+ res += tmp; -+ return res; -+} -+ -+uint64_t add64 (uint32_t a, uint32_t b) -+{ -+ uint32_t tmp = a + b; -+ uint32_t overflow = 0; -+ if (tmp < a) -+ overflow = 1; -+ -+ uint64_t res = overflow; -+ res <<= 32; -+ res += tmp; -+ return res; -+} -+ -+uint128_t add128 (uint64_t a, uint64_t b) -+{ -+ uint64_t tmp = a + b; -+ uint64_t overflow = 0; -+ if (tmp < a) -+ overflow = 1; -+ -+ uint128_t res = overflow; -+ res <<= 64; -+ res += tmp; -+ return res; -+} -+ -+uint256_t add256 (uint128_t a, uint128_t b) -+{ -+ uint128_t tmp = a + b; -+ uint128_t overflow = 0; -+ if (tmp < a) -+ overflow = 1; -+ -+ uint256_t res; -+ res.hi = overflow; -+ res.lo = tmp; -+ return res; -+} -+ -+uint16_t sub16 (uint8_t a, uint8_t b) -+{ -+ uint8_t tmp = a - b; -+ uint8_t overflow = 0; -+ if (tmp > a) -+ overflow = -1; -+ -+ uint16_t res = overflow; -+ res <<= 8; -+ res += tmp; -+ return res; -+} -+ -+uint32_t sub32 (uint16_t a, uint16_t b) -+{ -+ uint16_t tmp = a - b; -+ uint16_t overflow = 0; -+ if (tmp > a) -+ overflow = -1; -+ -+ uint32_t 
res = overflow; -+ res <<= 16; -+ res += tmp; -+ return res; -+} -+ -+uint64_t sub64 (uint32_t a, uint32_t b) -+{ -+ uint32_t tmp = a - b; -+ uint32_t overflow = 0; -+ if (tmp > a) -+ overflow = -1; -+ -+ uint64_t res = overflow; -+ res <<= 32; -+ res += tmp; -+ return res; -+} -+ -+uint128_t sub128 (uint64_t a, uint64_t b) -+{ -+ uint64_t tmp = a - b; -+ uint64_t overflow = 0; -+ if (tmp > a) -+ overflow = -1; -+ -+ uint128_t res = overflow; -+ res <<= 64; -+ res += tmp; -+ return res; -+} -+ -+uint256_t sub256 (uint128_t a, uint128_t b) -+{ -+ uint128_t tmp = a - b; -+ uint128_t overflow = 0; -+ if (tmp > a) -+ overflow = -1; -+ -+ uint256_t res; -+ res.hi = overflow; -+ res.lo = tmp; -+ return res; -+} -+ -+/* { dg-final { scan-tree-dump-times "= .ADD_OVERFLOW \\(a_\0-9\+\\(D\\), b_\0-9\+\\(D\\)\\)" 5 "optimized" } } */ -+/* { dg-final { scan-tree-dump-times "= .SUB_OVERFLOW \\(a_\0-9\+\\(D\\), b_\0-9\+\\(D\\)\\)" 5 "optimized" } } */ -diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc -index 232e903b0..55d6ee8ae 100644 ---- a/gcc/tree-ssa-math-opts.cc -+++ b/gcc/tree-ssa-math-opts.cc -@@ -3468,6 +3468,27 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2, - } - } - -+/* Check if the corresponding operation has wider equivalent on the target. */ -+ -+static bool -+wider_optab_check_p (optab op, machine_mode mode, int unsignedp) -+{ -+ machine_mode wider_mode; -+ FOR_EACH_WIDER_MODE (wider_mode, mode) -+ { -+ machine_mode next_mode; -+ if (optab_handler (op, wider_mode) != CODE_FOR_nothing -+ || (op == smul_optab -+ && GET_MODE_WIDER_MODE (wider_mode).exists (&next_mode) -+ && (find_widening_optab_handler ((unsignedp
View file
_service:tar_scm:0039-Match-double-sized-mul-pattern.patch
Deleted
@@ -1,488 +0,0 @@ -From e7b22f97f960b62e555dfd6f2e3ae43973fcbb3e Mon Sep 17 00:00:00 2001 -From: Pronin Alexander 00812787 <pronin.alexander@huawei.com> -Date: Wed, 25 Jan 2023 15:04:07 +0300 -Subject: PATCH 05/18 Match double sized mul pattern - ---- - gcc/match.pd | 136 +++++++++++++++++++++ - gcc/testsuite/gcc.dg/double_sized_mul-1.c | 141 ++++++++++++++++++++++ - gcc/testsuite/gcc.dg/double_sized_mul-2.c | 62 ++++++++++ - gcc/tree-ssa-math-opts.cc | 80 ++++++++++++ - 4 files changed, 419 insertions(+) - create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-1.c - create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-2.c - -diff --git a/gcc/match.pd b/gcc/match.pd -index 3cbaf2a5b..61866cb90 100644 ---- a/gcc/match.pd -+++ b/gcc/match.pd -@@ -7895,3 +7895,139 @@ and, - == TYPE_UNSIGNED (TREE_TYPE (@3)))) - && single_use (@4) - && single_use (@5)))) -+ -+/* Match multiplication with double sized result. -+ -+ Consider the following calculations: -+ arg0 * arg1 = (2^(bit_size/2) * arg0_hi + arg0_lo) -+ * (2^(bit_size/2) * arg1_hi + arg1_lo) -+ arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi -+ + 2^(bit_size/2) * (arg0_hi * arg1_lo + arg0_lo * arg1_hi) -+ + arg0_lo * arg1_lo -+ -+ The products of high and low parts fits in bit_size values, thus they are -+ placed in high and low parts of result respectively. -+ -+ The sum of the mixed products may overflow, so we need a detection for that. -+ Also it has a bit_size/2 offset, thus it intersects with both high and low -+ parts of result. Overflow detection constant is bit_size/2 due to this. 
-+ -+ With this info: -+ arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi -+ + 2^(bit_size/2) * middle -+ + 2^bit_size * possible_middle_overflow -+ + arg0_lo * arg1_lo -+ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow) -+ + 2^(bit_size/2) * (2^(bit_size/2) * middle_hi + middle_lo) -+ + arg0_lo * arg1_lo -+ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + middle_hi -+ + possible_middle_overflow) -+ + 2^(bit_size/2) * middle_lo -+ + arg0_lo * arg1_lo -+ -+ The last sum can produce overflow for the high result part. With this: -+ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow -+ + possible_res_lo_overflow + middle_hi) -+ + res_lo -+ = res_hi + res_lo -+ -+ This formula is quite big to fit into one match pattern with all of the -+ combinations of terms inside it. There are many helpers for better code -+ readability. -+ -+ The simplification basis is res_hi: assuming that res_lo only is not -+ real practical case for such calculations. -+ -+ Overflow handling is done via matching complex calculations: -+ the realpart and imagpart are quite handy here. */ -+/* Match low and high parts of the argument. */ -+(match (double_size_mul_arg_lo @0 @1) -+ (bit_and @0 INTEGER_CST@1) -+ (if (wi::to_wide (@1) -+ == wi::mask (TYPE_PRECISION (type) / 2, false, TYPE_PRECISION (type))))) -+(match (double_size_mul_arg_hi @0 @1) -+ (rshift @0 INTEGER_CST@1) -+ (if (wi::to_wide (@1) == TYPE_PRECISION (type) / 2))) -+ -+/* Match various argument parts products. */ -+(match (double_size_mul_lolo @0 @1) -+ (mult@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_lo @1 @3)) -+ (if (single_use (@4)))) -+(match (double_size_mul_hihi @0 @1) -+ (mult@4 (double_size_mul_arg_hi @0 @2) (double_size_mul_arg_hi @1 @3)) -+ (if (single_use (@4)))) -+(match (double_size_mul_lohi @0 @1) -+ (mult:c@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_hi @1 @3)) -+ (if (single_use (@4)))) -+ -+/* Match complex middle sum. 
*/ -+(match (double_size_mul_middle_complex @0 @1) -+ (IFN_ADD_OVERFLOW@2 (double_size_mul_lohi @0 @1) (double_size_mul_lohi @1 @0)) -+ (if (num_imm_uses (@2) == 2))) -+ -+/* Match real middle results. */ -+(match (double_size_mul_middle @0 @1) -+ (realpart@2 (double_size_mul_middle_complex @0 @1)) -+ (if (num_imm_uses (@2) == 2))) -+(match (double_size_mul_middleres_lo @0 @1) -+ (lshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2) -+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2 -+ && single_use (@3)))) -+(match (double_size_mul_middleres_hi @0 @1) -+ (rshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2) -+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2 -+ && single_use (@3)))) -+ -+/* Match low result part. */ -+/* Number of uses may be < 2 in case when we are interested in -+ high part only. */ -+(match (double_size_mul_res_lo_complex @0 @1) -+ (IFN_ADD_OVERFLOW:c@2 -+ (double_size_mul_lolo:c @0 @1) (double_size_mul_middleres_lo @0 @1)) -+ (if (num_imm_uses (@2) <= 2))) -+(match (double_size_mul_res_lo @0 @1) -+ (realpart (double_size_mul_res_lo_complex @0 @1))) -+ -+/* Match overflow terms. */ -+(match (double_size_mul_overflow_check_lo @0 @1 @5) -+ (convert@4 (ne@3 -+ (imagpart@2 (double_size_mul_res_lo_complex@5 @0 @1)) integer_zerop)) -+ (if (single_use (@2) && single_use (@3) && single_use (@4)))) -+(match (double_size_mul_overflow_check_hi @0 @1) -+ (lshift@6 (convert@5 (ne@4 -+ (imagpart@3 (double_size_mul_middle_complex @0 @1)) integer_zerop)) -+ INTEGER_CST@2) -+ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2 -+ && single_use (@3) && single_use (@4) && single_use (@5) -+ && single_use (@6)))) -+ -+/* Match all possible permutations for high result part calculations. 
*/ -+(for op1 (double_size_mul_hihi -+ double_size_mul_overflow_check_hi -+ double_size_mul_middleres_hi) -+ op2 (double_size_mul_overflow_check_hi -+ double_size_mul_middleres_hi -+ double_size_mul_hihi) -+ op3 (double_size_mul_middleres_hi -+ double_size_mul_hihi -+ double_size_mul_overflow_check_hi) -+ (match (double_size_mul_candidate @0 @1 @2 @3) -+ (plus:c@2 -+ (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3) (op1:c @0 @1)) -+ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1))) -+ (if (single_use (@4) && single_use (@5)))) -+ (match (double_size_mul_candidate @0 @1 @2 @3) -+ (plus:c@2 (double_size_mul_overflow_check_lo @0 @1 @3) -+ (plus:c@4 (op1:c @0 @1) -+ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1)))) -+ (if (single_use (@4) && single_use (@5)))) -+ (match (double_size_mul_candidate @0 @1 @2 @3) -+ (plus:c@2 (op1:c @0 @1) -+ (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3) -+ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1)))) -+ (if (single_use (@4) && single_use (@5)))) -+ (match (double_size_mul_candidate @0 @1 @2 @3) -+ (plus:c@2 (op1:c @0 @1) -+ (plus:c@4 (op2:c @0 @1) -+ (plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1)))) -+ (if (single_use (@4) && single_use (@5))))) -diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-1.c b/gcc/testsuite/gcc.dg/double_sized_mul-1.c -new file mode 100644 -index 000000000..4d475cc8a ---- /dev/null -+++ b/gcc/testsuite/gcc.dg/double_sized_mul-1.c -@@ -0,0 +1,141 @@ -+/* { dg-do compile } */ -+/* fif-conversion-gimple and fuaddsub-overflow-match-all are required for -+ proper overflow detection in some cases. 
*/ -+/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */ -+#include <stdint.h> -+ -+typedef unsigned __int128 uint128_t; -+ -+uint16_t mul16 (uint8_t a, uint8_t b) -+{ -+ uint8_t a_lo = a & 0xF; -+ uint8_t b_lo = b & 0xF; -+ uint8_t a_hi = a >> 4; -+ uint8_t b_hi = b >> 4; -+ uint8_t lolo = a_lo * b_lo; -+ uint8_t lohi = a_lo * b_hi; -+ uint8_t hilo = a_hi * b_lo; -+ uint8_t hihi = a_hi * b_hi; -+ uint8_t middle = hilo + lohi; -+ uint8_t middle_hi = middle >> 4; -+ uint8_t middle_lo = middle << 4; -+ uint8_t res_lo = lolo + middle_lo; -+ uint8_t res_hi = hihi + middle_hi; -+ res_hi += (res_lo < middle_lo ? 1 : 0); -+ res_hi += (middle < hilo ? 0x10 : 0); -+ uint16_t res = ((uint16_t) res_hi) << 8; -+ res += res_lo; -+ return res; -+} -+ -+uint32_t mul32 (uint16_t a, uint16_t b) -+{ -+ uint16_t a_lo = a & 0xFF; -+ uint16_t b_lo = b & 0xFF; -+ uint16_t a_hi = a >> 8;
View file
_service:tar_scm:0040-Port-icp-patch-to-GCC-12.patch
Deleted
@@ -1,2387 +0,0 @@ -From b73462757734c62f64e7a4379340679ec6f19669 Mon Sep 17 00:00:00 2001 -From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com> -Date: Tue, 27 Feb 2024 07:28:12 +0800 -Subject: PATCH 06/18 Port icp patch to GCC 12 - ---- - gcc/common.opt | 8 + - gcc/dbgcnt.def | 1 + - gcc/ipa-devirt.cc | 1855 +++++++++++++++++++++++++++++++++++ - gcc/passes.def | 1 + - gcc/testsuite/gcc.dg/icp1.c | 40 + - gcc/testsuite/gcc.dg/icp2.c | 38 + - gcc/testsuite/gcc.dg/icp3.c | 52 + - gcc/testsuite/gcc.dg/icp4.c | 55 ++ - gcc/testsuite/gcc.dg/icp5.c | 66 ++ - gcc/testsuite/gcc.dg/icp6.c | 66 ++ - gcc/testsuite/gcc.dg/icp7.c | 48 + - gcc/timevar.def | 1 + - gcc/tree-pass.h | 1 + - 13 files changed, 2232 insertions(+) - create mode 100644 gcc/testsuite/gcc.dg/icp1.c - create mode 100644 gcc/testsuite/gcc.dg/icp2.c - create mode 100644 gcc/testsuite/gcc.dg/icp3.c - create mode 100644 gcc/testsuite/gcc.dg/icp4.c - create mode 100644 gcc/testsuite/gcc.dg/icp5.c - create mode 100644 gcc/testsuite/gcc.dg/icp6.c - create mode 100644 gcc/testsuite/gcc.dg/icp7.c - -diff --git a/gcc/common.opt b/gcc/common.opt -index 39c90604e..16aadccf6 100644 ---- a/gcc/common.opt -+++ b/gcc/common.opt -@@ -1316,6 +1316,14 @@ fdevirtualize - Common Var(flag_devirtualize) Optimization - Try to convert virtual calls to direct ones. - -+ficp -+Common Var(flag_icp) Optimization Init(0) -+Try to promote indirect calls to direct ones. -+ -+ficp-speculatively -+Common Var(flag_icp_speculatively) Optimization -+Promote indirect calls speculatively. -+ - fdiagnostics-show-location= - Common Joined RejectNegative Enum(diagnostic_prefixing_rule) - -fdiagnostics-show-location=once|every-line How often to emit source location at the beginning of line-wrapped diagnostics. 
-diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def -index 3aa18cd0c..a00bbc31b 100644 ---- a/gcc/dbgcnt.def -+++ b/gcc/dbgcnt.def -@@ -170,6 +170,7 @@ DEBUG_COUNTER (graphite_scop) - DEBUG_COUNTER (hoist) - DEBUG_COUNTER (hoist_insn) - DEBUG_COUNTER (ia64_sched2) -+DEBUG_COUNTER (icp) - DEBUG_COUNTER (if_after_combine) - DEBUG_COUNTER (if_after_reload) - DEBUG_COUNTER (if_conversion) -diff --git a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc -index 74fe65608..383839189 100644 ---- a/gcc/ipa-devirt.cc -+++ b/gcc/ipa-devirt.cc -@@ -103,9 +103,14 @@ along with GCC; see the file COPYING3. If not see - indirect polymorphic edge all possible polymorphic call targets of the call. - - pass_ipa_devirt performs simple speculative devirtualization. -+ pass_ipa_icp performs simple indirect call promotion. - */ - - #include "config.h" -+#define INCLUDE_ALGORITHM -+#define INCLUDE_SET -+#define INCLUDE_MAP -+#define INCLUDE_LIST - #include "system.h" - #include "coretypes.h" - #include "backend.h" -@@ -127,6 +132,7 @@ along with GCC; see the file COPYING3. If not see - #include "ipa-fnsummary.h" - #include "demangle.h" - #include "dbgcnt.h" -+#include "gimple-iterator.h" - #include "gimple-pretty-print.h" - #include "intl.h" - #include "stringpool.h" -@@ -4401,5 +4407,1854 @@ make_pass_ipa_odr (gcc::context *ctxt) - return new pass_ipa_odr (ctxt); - } - -+/* Function signature map used to look up function decl which corresponds to -+ the given function type. 
*/ -+typedef std::set<unsigned> type_set; -+typedef std::set<tree> decl_set; -+typedef std::map<unsigned, type_set*> type_alias_map; -+typedef std::map<unsigned, decl_set*> type_decl_map; -+typedef std::map<unsigned, tree> uid_to_type_map; -+typedef std::map<tree, tree> type_map; -+ -+static bool has_address_taken_functions_with_varargs = false; -+static type_set *unsafe_types = NULL; -+static type_alias_map *fta_map = NULL; -+static type_alias_map *ta_map = NULL; -+static type_map *ctype_map = NULL; -+static type_alias_map *cbase_to_ptype = NULL; -+static type_decl_map *fs_map = NULL; -+static uid_to_type_map *type_uid_map = NULL; -+ -+static void -+print_type_set(unsigned ftype_uid, type_alias_map *map) -+{ -+ if (!map->count (ftype_uid)) -+ return; -+ type_set* s = (*map)ftype_uid; -+ for (type_set::const_iterator it = s->begin (); it != s->end (); it++) -+ fprintf (dump_file, it == s->begin () ? "%d" : ", %d", *it); -+} -+ -+static void -+dump_type_with_uid (const char *msg, tree type, dump_flags_t flags = TDF_NONE) -+{ -+ fprintf (dump_file, msg); -+ print_generic_expr (dump_file, type, flags); -+ fprintf (dump_file, " (%d)\n", TYPE_UID (type)); -+} -+ -+/* Walk aggregate type and collect types of scalar elements. */ -+ -+static void -+collect_scalar_types (tree tp, std::list<tree> &types) -+{ -+ /* TODO: take into account different field offsets. -+ Also support array casts. 
*/ -+ if (tp && dump_file && (dump_flags & TDF_DETAILS)) -+ dump_type_with_uid ("Walk var's type: ", tp, TDF_UID); -+ if (RECORD_OR_UNION_TYPE_P (tp)) -+ { -+ if (dump_file && (dump_flags & TDF_DETAILS)) -+ fprintf (dump_file, "Record's fields {\n"); -+ for (tree field = TYPE_FIELDS (tp); field; -+ field = DECL_CHAIN (field)) -+ { -+ if (TREE_CODE (field) != FIELD_DECL) -+ continue; -+ collect_scalar_types (TREE_TYPE (field), types); -+ } -+ if (dump_file && (dump_flags & TDF_DETAILS)) -+ fprintf (dump_file, "}\n"); -+ return; -+ } -+ if (TREE_CODE (tp) == ARRAY_TYPE) -+ { -+ if (dump_file && (dump_flags & TDF_DETAILS)) -+ fprintf (dump_file, "Array's innermost type:\n"); -+ /* Take the innermost component type. */ -+ tree elt; -+ for (elt = TREE_TYPE (tp); TREE_CODE (elt) == ARRAY_TYPE; -+ elt = TREE_TYPE (elt)) -+ if (dump_file && (dump_flags & TDF_DETAILS)) -+ print_generic_expr (dump_file, elt); -+ collect_scalar_types (elt, types); -+ return; -+ } -+ types.push_back (tp); -+} -+ -+static void maybe_register_aliases (tree type1, tree type2); -+ -+/* Walk type lists and maybe register type aliases. */ -+ -+static void -+compare_type_lists (std::list<tree> tlist1, std::list<tree> tlist2) -+{ -+ for (std::list<tree>::iterator ti1 = tlist1.begin (), ti2 = tlist2.begin (); -+ ti1 != tlist1.end (); ++ti1, ++ti2) -+ { -+ /* TODO: correct the analysis results if lists have different length. */ -+ if (ti2 == tlist2.end ()) -+ { -+ if (dump_file && (dump_flags & TDF_DETAILS)) -+ fprintf (dump_file, "Type lists with different length!\n"); -+ break; -+ } -+ maybe_register_aliases (*ti1, *ti2); -+ } -+} -+ -+/* For two given types collect scalar element types and -+ compare the result lists to find type aliases. 
*/ -+ -+static void -+collect_scalar_types_and_find_aliases (tree t1, tree t2) -+{ -+ std::list<tree> tlist1; -+ std::list<tree> tlist2; -+ if (dump_file && (dump_flags & TDF_DETAILS)) -+ fprintf (dump_file, "First type list: "); -+ collect_scalar_types (t1, tlist1); -+ if (dump_file && (dump_flags & TDF_DETAILS))
View file
_service:tar_scm:0041-Port-fixes-in-icp-to-GCC-12.patch
Deleted
@@ -1,100 +0,0 @@ -From aaa117a9ff58fb208e8c8859e075ca425f995f63 Mon Sep 17 00:00:00 2001 -From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com> -Date: Tue, 27 Feb 2024 07:43:57 +0800 -Subject: PATCH 07/18 Port fixes in icp to GCC 12 - ---- - gcc/ipa-devirt.cc | 37 ++++++++++++++++++++++++++++++------- - 1 file changed, 30 insertions(+), 7 deletions(-) - -diff --git a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc -index 383839189..318535d06 100644 ---- a/gcc/ipa-devirt.cc -+++ b/gcc/ipa-devirt.cc -@@ -4431,6 +4431,11 @@ print_type_set(unsigned ftype_uid, type_alias_map *map) - if (!map->count (ftype_uid)) - return; - type_set* s = (*map)ftype_uid; -+ if (!s) -+ { -+ fprintf (dump_file, "%d (no set)", ftype_uid); -+ return; -+ } - for (type_set::const_iterator it = s->begin (); it != s->end (); it++) - fprintf (dump_file, it == s->begin () ? "%d" : ", %d", *it); - } -@@ -4696,12 +4701,19 @@ maybe_register_aliases (tree type1, tree type2) - if (register_ailas_type (type1, type2, ta_map)) - analyze_pointees (type1, type2); - } -+ unsigned type1_uid = TYPE_UID (type1); -+ unsigned type2_uid = TYPE_UID (type2); -+ if (type_uid_map->count (type1_uid) == 0) -+ (*type_uid_map)type1_uid = type1; -+ if (type_uid_map->count (type2_uid) == 0) -+ (*type_uid_map)type2_uid = type2; -+ - /* If function and non-function type pointers alias, - the function type is unsafe. */ - if (FUNCTION_POINTER_TYPE_P (type1) && !FUNCTION_POINTER_TYPE_P (type2)) -- unsafe_types->insert (TYPE_UID (type1)); -+ unsafe_types->insert (type1_uid); - if (FUNCTION_POINTER_TYPE_P (type2) && !FUNCTION_POINTER_TYPE_P (type1)) -- unsafe_types->insert (TYPE_UID (type2)); -+ unsafe_types->insert (type2_uid); - - /* Try to figure out with pointers to incomplete types. 
*/ - if (POINTER_TYPE_P (type1) && POINTER_TYPE_P (type2)) -@@ -4825,10 +4837,12 @@ compare_block_and_init_type (tree block, tree t1) - static void - analyze_global_var (varpool_node *var) - { -- var->get_constructor(); - tree decl = var->decl; -- if (TREE_CODE (decl) == SSA_NAME || !DECL_INITIAL (decl) -- || integer_zerop (DECL_INITIAL (decl))) -+ if (decl || !DECL_INITIAL (decl)) -+ return; -+ var->get_constructor (); -+ if (TREE_CODE (decl) == SSA_NAME || integer_zerop (DECL_INITIAL (decl)) -+ || TREE_CODE (DECL_INITIAL (decl)) == ERROR_MARK) - return; - - if (dump_file && (dump_flags & TDF_DETAILS)) -@@ -4998,7 +5012,9 @@ analyze_assign_stmt (gimple *stmt) - { - rhs = TREE_OPERAND (rhs, 0); - if (VAR_OR_FUNCTION_DECL_P (rhs) || TREE_CODE (rhs) == STRING_CST -- || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL) -+ || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL -+ || TREE_CODE (rhs) == LABEL_DECL || TREE_CODE (rhs) == CONST_DECL -+ || TREE_CODE (rhs) == RESULT_DECL) - rhs_type = build_pointer_type (TREE_TYPE (rhs)); - else if (TREE_CODE (rhs) == COMPONENT_REF) - { -@@ -5012,7 +5028,12 @@ analyze_assign_stmt (gimple *stmt) - gcc_assert (POINTER_TYPE_P (rhs_type)); - } - else -- gcc_unreachable(); -+ { -+ fprintf (dump_file, "\nUnsupported rhs type %s in assign stmt: ", -+ get_tree_code_name (TREE_CODE (rhs))); -+ print_gimple_stmt (dump_file, stmt, 0); -+ gcc_unreachable (); -+ } - } - else - rhs_type = TREE_TYPE (rhs); -@@ -5710,6 +5731,8 @@ merge_fs_map_for_ftype_aliases () - decl_set *d_set = it1->second; - tree type = (*type_uid_map)it1->first; - type_set *set = (*fta_map)it1->first; -+ if (!set) -+ continue; - for (type_set::const_iterator it2 = set->begin (); - it2 != set->end (); it2++) - { --- -2.33.0 -
View file
_service:tar_scm:0042-Add-split-complex-instructions-pass.patch
Deleted
@@ -1,1245 +0,0 @@ -From 9a8e5716543972dec36bae1f9d380d27bfbcdae1 Mon Sep 17 00:00:00 2001 -From: Agrachev Andrey WX1228450 <agrachev.andrey@huawei-partners.com> -Date: Mon, 21 Aug 2023 12:35:19 +0300 -Subject: PATCH 09/18 Add split-complex-instructions pass - - - Add option -fsplit-ldp-stp - - Add functionality to detect and split depended from store LDP instructions. - - Add -param=param-ldp-dependency-search-range= to configure ldp dependency search range - - Add RTL tests - -Co-authored-by: Chernonog Vyacheslav 00812786 <chernonog.vyacheslav@huawei.com> -Co-authored-by: Zinin Ivan WX1305386 <zinin.ivan@huawei-partners.com> -Co-authored-by: Gadzhiev Emin WX1195297 <gadzhiev.emin@huawei-partners.com> ---- - gcc/common.opt | 5 + - gcc/config/aarch64/aarch64.cc | 42 ++ - gcc/doc/tm.texi | 8 + - gcc/doc/tm.texi.in | 4 + - gcc/params.opt | 3 + - gcc/passes.def | 1 + - gcc/sched-rgn.cc | 704 +++++++++++++++++- - gcc/target.def | 10 + - .../gcc.dg/rtl/aarch64/test-ldp-dont-split.c | 74 ++ - .../rtl/aarch64/test-ldp-split-rearrange.c | 40 + - .../gcc.dg/rtl/aarch64/test-ldp-split.c | 174 +++++ - gcc/timevar.def | 1 + - gcc/tree-pass.h | 1 + - 13 files changed, 1066 insertions(+), 1 deletion(-) - create mode 100644 gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c - create mode 100644 gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c - create mode 100644 gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c - -diff --git a/gcc/common.opt b/gcc/common.opt -index a42bee250..c0e3f5687 100644 ---- a/gcc/common.opt -+++ b/gcc/common.opt -@@ -1797,6 +1797,11 @@ floop-nest-optimize - Common Var(flag_loop_nest_optimize) Optimization - Enable the loop nest optimizer. - -+fsplit-ldp-stp -+Common Var(flag_split_ldp_stp) Optimization -+Split load/store pair instructions into separate load/store operations -+for better performance. 
-+ - fstrict-volatile-bitfields - Common Var(flag_strict_volatile_bitfields) Init(-1) Optimization - Force bitfield accesses to match their type width. -diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc -index 04072ca25..48e2eded0 100644 ---- a/gcc/config/aarch64/aarch64.cc -+++ b/gcc/config/aarch64/aarch64.cc -@@ -27507,6 +27507,48 @@ aarch64_run_selftests (void) - - #endif /* #if CHECKING_P */ - -+/* TODO: refuse to use ranges intead of full list of an instruction codes. */ -+ -+bool -+is_aarch64_ldp_insn (int icode) -+{ -+ if ((icode >= CODE_FOR_load_pair_sw_sisi -+ && icode <= CODE_FOR_load_pair_dw_tftf) -+ || (icode >= CODE_FOR_loadwb_pairsi_si -+ && icode <= CODE_FOR_loadwb_pairtf_di) -+ || (icode >= CODE_FOR_load_pairv8qiv8qi -+ && icode <= CODE_FOR_load_pairdfdf) -+ || (icode >= CODE_FOR_load_pairv16qiv16qi -+ && icode <= CODE_FOR_load_pairv8bfv2df) -+ || (icode >= CODE_FOR_load_pair_lanesv8qi -+ && icode <= CODE_FOR_load_pair_lanesdf)) -+ return true; -+ return false; -+} -+ -+bool -+is_aarch64_stp_insn (int icode) -+{ -+ if ((icode >= CODE_FOR_store_pair_sw_sisi -+ && icode <= CODE_FOR_store_pair_dw_tftf) -+ || (icode >= CODE_FOR_storewb_pairsi_si -+ && icode <= CODE_FOR_storewb_pairtf_di) -+ || (icode >= CODE_FOR_vec_store_pairv8qiv8qi -+ && icode <= CODE_FOR_vec_store_pairdfdf) -+ || (icode >= CODE_FOR_vec_store_pairv16qiv16qi -+ && icode <= CODE_FOR_vec_store_pairv8bfv2df) -+ || (icode >= CODE_FOR_store_pair_lanesv8qi -+ && icode <= CODE_FOR_store_pair_lanesdf)) -+ return true; -+ return false; -+} -+ -+#undef TARGET_IS_LDP_INSN -+#define TARGET_IS_LDP_INSN is_aarch64_ldp_insn -+ -+#undef TARGET_IS_STP_INSN -+#define TARGET_IS_STP_INSN is_aarch64_stp_insn -+ - #undef TARGET_STACK_PROTECT_GUARD - #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard - -diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi -index c5006afc0..0c6415a9c 100644 ---- a/gcc/doc/tm.texi -+++ b/gcc/doc/tm.texi -@@ -12113,6 +12113,14 @@ object files 
that are not referenced from @code{main} and uses export - lists. - @end defmac - -+@deftypefn {Target Hook} bool TARGET_IS_LDP_INSN (int @var{icode}) -+Return true if icode is corresponding to any of the LDP instruction types. -+@end deftypefn -+ -+@deftypefn {Target Hook} bool TARGET_IS_STP_INSN (int @var{icode}) -+Return true if icode is corresponding to any of the STP instruction types. -+@end deftypefn -+ - @deftypefn {Target Hook} bool TARGET_CANNOT_MODIFY_JUMPS_P (void) - This target hook returns @code{true} past the point in which new jump - instructions could be created. On machines that require a register for -diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in -index f869ddd5e..6ff60e562 100644 ---- a/gcc/doc/tm.texi.in -+++ b/gcc/doc/tm.texi.in -@@ -7977,6 +7977,10 @@ object files that are not referenced from @code{main} and uses export - lists. - @end defmac - -+@hook TARGET_IS_LDP_INSN -+ -+@hook TARGET_IS_STP_INSN -+ - @hook TARGET_CANNOT_MODIFY_JUMPS_P - - @hook TARGET_HAVE_CONDITIONAL_EXECUTION -diff --git a/gcc/params.opt b/gcc/params.opt -index 7fcc2398d..6176d4790 100644 ---- a/gcc/params.opt -+++ b/gcc/params.opt -@@ -1217,4 +1217,7 @@ Enum(vrp_mode) String(ranger) Value(VRP_MODE_RANGER) - Common Joined UInteger Var(param_pointer_compression_size) Init(32) IntegerRange(8, 32) Param Optimization - Target size of compressed pointer, which should be 8, 16 or 32. - -+-param=param-ldp-dependency-search-range= -+Common Joined UInteger Var(param_ldp_dependency_search_range) Init(16) IntegerRange(1, 32) Param Optimization -+Range for depended ldp search in split-ldp-stp path. - ; This comment is to ensure we retain the blank line above. -diff --git a/gcc/passes.def b/gcc/passes.def -index 941bbadf0..a30e05688 100644 ---- a/gcc/passes.def -+++ b/gcc/passes.def -@@ -514,6 +514,7 @@ along with GCC; see the file COPYING3. 
If not see - NEXT_PASS (pass_reorder_blocks); - NEXT_PASS (pass_leaf_regs); - NEXT_PASS (pass_split_before_sched2); -+ NEXT_PASS (pass_split_complex_instructions); - NEXT_PASS (pass_sched2); - NEXT_PASS (pass_stack_regs); - PUSH_INSERT_PASSES_WITHIN (pass_stack_regs) -diff --git a/gcc/sched-rgn.cc b/gcc/sched-rgn.cc -index a0dfdb788..b4df8bdc5 100644 ---- a/gcc/sched-rgn.cc -+++ b/gcc/sched-rgn.cc -@@ -44,6 +44,8 @@ along with GCC; see the file COPYING3. If not see - are actually scheduled. */ -  - #include "config.h" -+#define INCLUDE_SET -+#define INCLUDE_VECTOR - #include "system.h" - #include "coretypes.h" - #include "backend.h" -@@ -65,6 +67,7 @@ along with GCC; see the file COPYING3. If not see - #include "dbgcnt.h" - #include "pretty-print.h" - #include "print-rtl.h" -+#include "cfgrtl.h" - - /* Disable warnings about quoting issues in the pp_xxx calls below - that (intentionally) don't follow GCC diagnostic conventions. */ -@@ -3951,6 +3954,705 @@ make_pass_sched_fusion (gcc::context *ctxt) - return new pass_sched_fusion (ctxt); - } - -+namespace { -+ -+/* Def-use analisys special functions implementation. */ -+ -+static struct df_link * -+get_defs (rtx_insn *insn, rtx reg) -+{ -+ df_ref use; -+ struct df_link *ref_chain, *ref_link; -+ -+ FOR_EACH_INSN_USE (use, insn) -+ { -+ if (GET_CODE (DF_REF_REG (use)) == SUBREG) -+ return NULL; -+ if (REGNO (DF_REF_REG (use)) == REGNO (reg))
View file
_service:tar_scm:0043-Extending-and-refactoring-of-pass_split_complex_inst.patch
Deleted
@@ -1,1426 +0,0 @@ -From a49db831320ac70ca8f46b94ee60d7c6951f65c3 Mon Sep 17 00:00:00 2001 -From: Gadzhiev Emin WX1195297 <gadzhiev.emin@huawei-partners.com> -Date: Wed, 20 Dec 2023 21:36:07 +0300 -Subject: PATCH 10/18 Extending and refactoring of - pass_split_complex_instructions - -- Add flag parameter in is_ldp_insn and is_stp_insn to know - if instruction has writeback operation -- Add support of PRE_*, POST_* operands as a memory address - expression -- Split only LDPs that intersect with a dependent store - instruction -- Make the selection of dependent store instructions stricter - so it will be enough to check by BFS that dependent store - instruction appears in search range. -- Add helper methods to retrieve fields of rtx -- Remove redundant iterations in find_dependent_stores_candidates -- Refactor generation of instructions -- Add more test cases ---- - gcc/config/aarch64/aarch64.cc | 62 +- - gcc/doc/tm.texi | 12 +- - gcc/sched-rgn.cc | 771 +++++++++--------- - gcc/target.def | 14 +- - .../gcc.dg/rtl/aarch64/test-ldp-dont-split.c | 35 +- - .../rtl/aarch64/test-ldp-split-rearrange.c | 2 +- - .../gcc.dg/rtl/aarch64/test-ldp-split.c | 181 +++- - 7 files changed, 603 insertions(+), 474 deletions(-) - -diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc -index 48e2eded0..fa566dd80 100644 ---- a/gcc/config/aarch64/aarch64.cc -+++ b/gcc/config/aarch64/aarch64.cc -@@ -27507,39 +27507,59 @@ aarch64_run_selftests (void) - - #endif /* #if CHECKING_P */ - --/* TODO: refuse to use ranges intead of full list of an instruction codes. */ -+/* TODO: refuse to use ranges instead of full list of an instruction codes. 
*/ - - bool --is_aarch64_ldp_insn (int icode) -+is_aarch64_ldp_insn (int icode, bool *has_wb) - { - if ((icode >= CODE_FOR_load_pair_sw_sisi -- && icode <= CODE_FOR_load_pair_dw_tftf) -+ && icode <= CODE_FOR_load_pair_sw_sfsf) -+ || (icode >= CODE_FOR_load_pair_dw_didi -+ && icode <= CODE_FOR_load_pair_dw_dfdf) -+ || (icode == CODE_FOR_load_pair_dw_tftf) - || (icode >= CODE_FOR_loadwb_pairsi_si -- && icode <= CODE_FOR_loadwb_pairtf_di) -- || (icode >= CODE_FOR_load_pairv8qiv8qi -- && icode <= CODE_FOR_load_pairdfdf) -- || (icode >= CODE_FOR_load_pairv16qiv16qi -- && icode <= CODE_FOR_load_pairv8bfv2df) -- || (icode >= CODE_FOR_load_pair_lanesv8qi -- && icode <= CODE_FOR_load_pair_lanesdf)) -- return true; -+ && icode <= CODE_FOR_loadwb_pairdi_di) -+ || (icode >= CODE_FOR_loadwb_pairsf_si -+ && icode <= CODE_FOR_loadwb_pairdf_di) -+ || (icode >= CODE_FOR_loadwb_pairti_si -+ && icode <= CODE_FOR_loadwb_pairtf_di)) -+ { -+ if (has_wb) -+ *has_wb = ((icode >= CODE_FOR_loadwb_pairsi_si -+ && icode <= CODE_FOR_loadwb_pairdi_di) -+ || (icode >= CODE_FOR_loadwb_pairsf_si -+ && icode <= CODE_FOR_loadwb_pairdf_di) -+ || (icode >= CODE_FOR_loadwb_pairti_si -+ && icode <= CODE_FOR_loadwb_pairtf_di)); -+ return true; -+ } - return false; - } - - bool --is_aarch64_stp_insn (int icode) -+is_aarch64_stp_insn (int icode, bool *has_wb) - { - if ((icode >= CODE_FOR_store_pair_sw_sisi -- && icode <= CODE_FOR_store_pair_dw_tftf) -+ && icode <= CODE_FOR_store_pair_sw_sfsf) -+ || (icode >= CODE_FOR_store_pair_dw_didi -+ && icode <= CODE_FOR_store_pair_dw_dfdf) -+ || (icode == CODE_FOR_store_pair_dw_tftf) - || (icode >= CODE_FOR_storewb_pairsi_si -- && icode <= CODE_FOR_storewb_pairtf_di) -- || (icode >= CODE_FOR_vec_store_pairv8qiv8qi -- && icode <= CODE_FOR_vec_store_pairdfdf) -- || (icode >= CODE_FOR_vec_store_pairv16qiv16qi -- && icode <= CODE_FOR_vec_store_pairv8bfv2df) -- || (icode >= CODE_FOR_store_pair_lanesv8qi -- && icode <= CODE_FOR_store_pair_lanesdf)) -- return true; -+ && 
icode <= CODE_FOR_storewb_pairdi_di) -+ || (icode >= CODE_FOR_storewb_pairsf_si -+ && icode <= CODE_FOR_storewb_pairdf_di) -+ || (icode >= CODE_FOR_storewb_pairti_si -+ && icode <= CODE_FOR_storewb_pairtf_di)) -+ { -+ if (has_wb) -+ *has_wb = ((icode >= CODE_FOR_storewb_pairsi_si -+ && icode <= CODE_FOR_storewb_pairdi_di) -+ || (icode >= CODE_FOR_storewb_pairsf_si -+ && icode <= CODE_FOR_storewb_pairdf_di) -+ || (icode >= CODE_FOR_storewb_pairti_si -+ && icode <= CODE_FOR_storewb_pairtf_di)); -+ return true; -+ } - return false; - } - -diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi -index 0c6415a9c..3b6e90bf2 100644 ---- a/gcc/doc/tm.texi -+++ b/gcc/doc/tm.texi -@@ -12113,12 +12113,16 @@ object files that are not referenced from @code{main} and uses export - lists. - @end defmac - --@deftypefn {Target Hook} bool TARGET_IS_LDP_INSN (int @var{icode}) --Return true if icode is corresponding to any of the LDP instruction types. -+@deftypefn {Target Hook} bool TARGET_IS_LDP_INSN (int @var{icode}, bool *@var{has_wb}) -+Return true if @var{icode} is corresponding to any of the LDP instruction -+types. If @var{has_wb} is not NULL then its value is set to true if LDP -+contains post-index or pre-index operation. - @end deftypefn - --@deftypefn {Target Hook} bool TARGET_IS_STP_INSN (int @var{icode}) --Return true if icode is corresponding to any of the STP instruction types. -+@deftypefn {Target Hook} bool TARGET_IS_STP_INSN (int @var{icode}, bool *@var{has_wb}) -+Return true if @var{icode} is corresponding to any of the STP instruction -+types. If @var{has_wb} is not NULL then its value is set to true if STP -+contains post-index or pre-index operation. 
- @end deftypefn - - @deftypefn {Target Hook} bool TARGET_CANNOT_MODIFY_JUMPS_P (void) -diff --git a/gcc/sched-rgn.cc b/gcc/sched-rgn.cc -index b4df8bdc5..5f61de1c8 100644 ---- a/gcc/sched-rgn.cc -+++ b/gcc/sched-rgn.cc -@@ -3956,7 +3956,7 @@ make_pass_sched_fusion (gcc::context *ctxt) - - namespace { - --/* Def-use analisys special functions implementation. */ -+/* Def-use analysis special functions implementation. */ - - static struct df_link * - get_defs (rtx_insn *insn, rtx reg) -@@ -4032,42 +4032,66 @@ const pass_data pass_data_split_complex_instructions = { - (TODO_df_verify | TODO_df_finish), /* Todo_flags_finish. */ - }; - -+/* Pass split_complex_instructions finds LOAD PAIR instructions (LDP) that can -+ be split into two LDR instructions. It splits only those LDP for which one -+ half of the requested memory is contained in the preceding STORE (STR/STP) -+ instruction whose base register has the same definition. This allows -+ to use hardware store-to-load forwarding mechanism and to get one half of -+ requested memory from the store queue of CPU. -+ -+ TODO: Add split of STP. -+ TODO: Add split of vector STP and LDP. */ - class pass_split_complex_instructions : public rtl_opt_pass - { - private: -- enum complex_instructions_t -+ enum mem_access_insn_t - { - UNDEFINED, - LDP, -+ /* LDP with post-index (see loadwb_pair in config/aarch64.md). */ -+ LDP_WB, -+ /* LDP that contains one destination register in RTL IR -+ (see movti_aarch64 in config/aarch64.md). */ - LDP_TI, - STP, -+ /* STP with pre-index (see storewb_pair in config/aarch64.md). */ -+ STP_WB, -+ /* STP that contains one source register in RTL IR -+ (see movti_aarch64 in config/aarch64.md). 
*/ -+ STP_TI, - STR - }; - -- void split_complex_insn (rtx_insn *insn); -- void split_ldp_ti (rtx_insn *insn); -- void split_ldp_with_offset (rtx_insn *ldp_insn); -- void split_simple_ldp (rtx_insn *ldp_insn); -- void split_ldp_stp (rtx_insn *insn); -- complex_instructions_t get_insn_type (rtx_insn *insn); -- -- basic_block bb; -- rtx_insn *insn; - std::set<rtx_insn *> dependent_stores_candidates; - std::set<rtx_insn *> ldp_to_split_list; -
View file
_service:tar_scm:0044-Port-maxmin-patch-to-GCC-12.patch
Deleted
@@ -1,378 +0,0 @@ -From a3013c074cd2ab5f71eb98a587a627f38c68656c Mon Sep 17 00:00:00 2001 -From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com> -Date: Thu, 22 Feb 2024 17:07:24 +0800 -Subject: PATCH 12/18 Port maxmin patch to GCC 12 - ---- - gcc/config/aarch64/aarch64-simd.md | 256 ++++++++++++++++++++++++++ - gcc/config/aarch64/predicates.md | 19 ++ - gcc/testsuite/gcc.dg/combine-maxmin.c | 46 +++++ - 3 files changed, 321 insertions(+) - create mode 100755 gcc/testsuite/gcc.dg/combine-maxmin.c - -diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md -index 82f73805f..de92802f5 100644 ---- a/gcc/config/aarch64/aarch64-simd.md -+++ b/gcc/config/aarch64/aarch64-simd.md -@@ -1138,6 +1138,82 @@ - (set_attr "type" "neon_compare<q>,neon_shift_imm<q>") - ) - -+;; Simplify the extension with following truncation for shift+neg operation. -+ -+(define_insn_and_split "*aarch64_sshr_neg_v8hi" -+ (set (match_operand:V8HI 0 "register_operand" "=w") -+ (vec_concat:V8HI -+ (truncate:V4HI -+ (ashiftrt:V4SI -+ (neg:V4SI -+ (sign_extend:V4SI -+ (vec_select:V4HI -+ (match_operand:V8HI 1 "register_operand") -+ (match_operand:V8HI 3 "vect_par_cnst_lo_half")))) -+ (match_operand:V4SI 2 "maxmin_arith_shift_operand"))) -+ (truncate:V4HI -+ (ashiftrt:V4SI -+ (neg:V4SI -+ (sign_extend:V4SI -+ (vec_select:V4HI -+ (match_dup 1) -+ (match_operand:V8HI 4 "vect_par_cnst_hi_half")))) -+ (match_dup 2))))) -+ "TARGET_SIMD" -+ "#" -+ "&& true" -+ (set (match_operand:V8HI 0 "register_operand" "=w") -+ (ashiftrt:V8HI -+ (neg:V8HI -+ (match_operand:V8HI 1 "register_operand" "w")) -+ (match_operand:V8HI 2 "aarch64_simd_imm_minus_one"))) -+ { -+ /* Reduce the shift amount to smaller mode. 
*/ -+ int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands2, 0)) -+ - (GET_MODE_UNIT_BITSIZE (GET_MODE (operands2)) / 2); -+ operands2 = aarch64_simd_gen_const_vector_dup (V8HImode, val); -+ } -+ (set_attr "type" "multiple") -+) -+ -+;; The helper definition that allows combiner to use the previous pattern. -+ -+(define_insn_and_split "*aarch64_sshr_neg_tmpv8hi" -+ (set (match_operand:V8HI 0 "register_operand" "=w") -+ (vec_concat:V8HI -+ (truncate:V4HI -+ (ashiftrt:V4SI -+ (neg:V4SI -+ (match_operand:V4SI 1 "register_operand" "w")) -+ (match_operand:V4SI 2 "maxmin_arith_shift_operand"))) -+ (truncate:V4HI -+ (ashiftrt:V4SI -+ (neg:V4SI -+ (match_operand:V4SI 3 "register_operand" "w")) -+ (match_dup 2))))) -+ "TARGET_SIMD" -+ "#" -+ "&& true" -+ (set (match_operand:V4SI 1 "register_operand" "=w") -+ (ashiftrt:V4SI -+ (neg:V4SI -+ (match_dup 1)) -+ (match_operand:V4SI 2 "maxmin_arith_shift_operand"))) -+ (set (match_operand:V4SI 3 "register_operand" "=w") -+ (ashiftrt:V4SI -+ (neg:V4SI -+ (match_dup 3)) -+ (match_dup 2))) -+ (set (match_operand:V8HI 0 "register_operand" "=w") -+ (vec_concat:V8HI -+ (truncate:V4HI -+ (match_dup 1)) -+ (truncate:V4HI -+ (match_dup 3)))) -+ "" -+ (set_attr "type" "multiple") -+) -+ - (define_insn "*aarch64_simd_sra<mode>" - (set (match_operand:VDQ_I 0 "register_operand" "=w") - (plus:VDQ_I -@@ -1714,6 +1790,26 @@ - } - ) - -+(define_insn "vec_pack_trunc_shifted_<mode>" -+ (set (match_operand:<VNARROWQ2> 0 "register_operand" "=&w") -+ (vec_concat:<VNARROWQ2> -+ (truncate:<VNARROWQ> -+ (ashiftrt:VQN (match_operand:VQN 1 "register_operand" "w") -+ (match_operand:VQN 2 "half_size_operand" "w"))) -+ (truncate:<VNARROWQ> -+ (ashiftrt:VQN (match_operand:VQN 3 "register_operand" "w") -+ (match_operand:VQN 4 "half_size_operand" "w"))))) -+ "TARGET_SIMD" -+ { -+ if (BYTES_BIG_ENDIAN) -+ return "uzp2\\t%0.<V2ntype>, %3.<V2ntype>, %1.<V2ntype>"; -+ else -+ return "uzp2\\t%0.<V2ntype>, %1.<V2ntype>, %3.<V2ntype>"; -+ } -+ (set_attr "type" 
"neon_permute<q>") -+ (set_attr "length" "4") -+) -+ - (define_insn "aarch64_shrn<mode>_insn_le" - (set (match_operand:<VNARROWQ2> 0 "register_operand" "=w") - (vec_concat:<VNARROWQ2> -@@ -6652,6 +6748,166 @@ - (set_attr "type" "neon_tst<q>") - ) - -+;; Simplify the extension with following truncation for cmtst-like operation. -+ -+(define_insn_and_split "*aarch64_cmtst_arith_v8hi" -+ (set (match_operand:V8HI 0 "register_operand" "=w") -+ (vec_concat:V8HI -+ (plus:V4HI -+ (truncate:V4HI -+ (eq:V4SI -+ (sign_extend:V4SI -+ (vec_select:V4HI -+ (and:V8HI -+ (match_operand:V8HI 1 "register_operand") -+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")) -+ (match_operand:V8HI 3 "vect_par_cnst_lo_half"))) -+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))) -+ (match_operand:V4HI 5 "aarch64_simd_imm_minus_one")) -+ (plus:V4HI -+ (truncate:V4HI -+ (eq:V4SI -+ (sign_extend:V4SI -+ (vec_select:V4HI -+ (and:V8HI -+ (match_dup 1) -+ (match_dup 2)) -+ (match_operand:V8HI 6 "vect_par_cnst_hi_half"))) -+ (match_dup 4))) -+ (match_dup 5)))) -+ "TARGET_SIMD && !reload_completed" -+ "#" -+ "&& true" -+ (set (match_operand:V8HI 6 "register_operand" "=w") -+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")) -+ (set (match_operand:V8HI 0 "register_operand" "=w") -+ (plus:V8HI -+ (eq:V8HI -+ (and:V8HI -+ (match_operand:V8HI 1 "register_operand" "w") -+ (match_dup 6)) -+ (match_operand:V8HI 4 "aarch64_simd_imm_zero")) -+ (match_operand:V8HI 5 "aarch64_simd_imm_minus_one"))) -+ { -+ if (can_create_pseudo_p ()) -+ { -+ int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands4, 0)); -+ operands4 = aarch64_simd_gen_const_vector_dup (V8HImode, val); -+ int val2 = INTVAL (CONST_VECTOR_ENCODED_ELT (operands5, 0)); -+ operands5 = aarch64_simd_gen_const_vector_dup (V8HImode, val2); -+ -+ operands6 = gen_reg_rtx (V8HImode); -+ } -+ else -+ FAIL; -+ } -+ (set_attr "type" "neon_tst_q") -+) -+ -+;; Three helper definitions that allow combiner to use the previous pattern. 
-+ -+(define_insn_and_split "*aarch64_cmtst_arith_tmp_lo_v8hi" -+ (set (match_operand:V4SI 0 "register_operand" "=w") -+ (neg:V4SI -+ (eq:V4SI -+ (sign_extend:V4SI -+ (vec_select:V4HI -+ (and:V8HI -+ (match_operand:V8HI 1 "register_operand") -+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")) -+ (match_operand:V8HI 3 "vect_par_cnst_lo_half"))) -+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))
View file
_service:tar_scm:0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch
Deleted
@@ -1,239 +0,0 @@ -From 11da40d18e35219961226d40f11b0702b8649044 Mon Sep 17 00:00:00 2001 -From: Pronin Alexander 00812787 <pronin.alexander@huawei.com> -Date: Thu, 22 Feb 2024 17:13:27 +0800 -Subject: PATCH 13/18 Port moving minmask pattern to gimple to GCC 12 - ---- - gcc/common.opt | 4 + - gcc/match.pd | 104 ++++++++++++++++++++++++ - gcc/testsuite/gcc.dg/combine-maxmin-1.c | 15 ++++ - gcc/testsuite/gcc.dg/combine-maxmin-2.c | 14 ++++ - gcc/testsuite/gcc.dg/combine-maxmin.c | 19 +++-- - 5 files changed, 151 insertions(+), 5 deletions(-) - create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-1.c - create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-2.c - -diff --git a/gcc/common.opt b/gcc/common.opt -index 6c6fabb31..3a5004271 100644 ---- a/gcc/common.opt -+++ b/gcc/common.opt -@@ -1846,6 +1846,10 @@ fif-conversion-gimple - Common Var(flag_if_conversion_gimple) Optimization - Perform conversion of conditional jumps to branchless equivalents during gimple transformations. - -+fconvert-minmax -+Common Var(flag_convert_minmax) Optimization -+Convert saturating clipping to min max. -+ - fstack-reuse= - Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization - -fstack-reuse=all|named_vars|none Set stack reuse level for local variables. -diff --git a/gcc/match.pd b/gcc/match.pd -index 61866cb90..3a19e93b3 100644 ---- a/gcc/match.pd -+++ b/gcc/match.pd -@@ -8031,3 +8031,107 @@ and, - (plus:c@4 (op2:c @0 @1) - (plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1)))) - (if (single_use (@4) && single_use (@5))))) -+ -+/* MinMax pattern matching helpers. More info on the transformation below. */ -+ -+/* Match (a & 0b11..100..0) pattern. */ -+(match (minmax_cmp_arg @0 @1) -+ (bit_and @0 INTEGER_CST@1) -+ (if (wi::popcount (~wi::to_widest (@1) + 1) == 1))) -+ -+/* Match (inversed_sign_bit >> sign_bit_pos) pattern. -+ This statement is blocking for the transformation of unsigned integers. 
-+ Do type check here to avoid unnecessary duplications. */ -+(match (minmax_sat_arg @0) -+ (rshift (negate @0) INTEGER_CST@1) -+ (if (!TYPE_UNSIGNED (TREE_TYPE (@0)) -+ && wi::eq_p (wi::to_widest (@1), TYPE_PRECISION (TREE_TYPE (@0)) - 1)))) -+ -+/* Transform ((x & ~mask) ? (-x)>>31 & mask : x) to (min (max (x, 0), mask)). -+ The matched pattern can be described as saturated clipping. -+ -+ The pattern supports truncation via both casts and bit_and. -+ Also there are patterns for possible inverted conditions. */ -+(if (flag_convert_minmax) -+/* Truncation via casts. Unfortunately convert? cannot be applied here -+ because convert and cond take different number of arguments. */ -+ (simplify -+ (convert -+ (cond -+ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) -+ (convert? (minmax_sat_arg @0)) -+ (convert? @0))) -+ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type))) -+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } -+ (convert (min (max @0 { integer_zero_node; }) -+ { mask; }))))) -+ (simplify -+ (cond -+ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) -+ (convert? (minmax_sat_arg @0)) -+ (convert? @0)) -+ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type))) -+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } -+ (convert (min (max @0 { integer_zero_node; }) -+ { mask; }))))) -+ -+ (simplify -+ (convert -+ (cond -+ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) -+ (convert? @0) -+ (convert? (minmax_sat_arg @0)))) -+ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type))) -+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } -+ (convert (min (max @0 { integer_zero_node; }) -+ { mask; }))))) -+ (simplify -+ (cond -+ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) -+ (convert? @0) -+ (convert? 
(minmax_sat_arg @0))) -+ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type))) -+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } -+ (convert (min (max @0 { integer_zero_node; }) -+ { mask; }))))) -+ -+ /* Truncation via bit_and with mask. Same concerns on convert? here. */ -+ (simplify -+ (convert -+ (cond -+ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) -+ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2)) -+ (convert? @0))) -+ (if (wi::to_widest (@2) == ~wi::to_widest (@1)) -+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } -+ (convert (min (max @0 { integer_zero_node; }) -+ { mask; }))))) -+ (simplify -+ (cond -+ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) -+ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2)) -+ (convert? @0)) -+ (if (wi::to_widest (@2) == ~wi::to_widest (@1)) -+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } -+ (convert (min (max @0 { integer_zero_node; }) -+ { mask; }))))) -+ -+ (simplify -+ (convert -+ (cond -+ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) -+ (convert? @0) -+ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2)))) -+ (if (wi::to_widest (@2) == ~wi::to_widest (@1)) -+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } -+ (convert (min (max @0 { integer_zero_node; }) -+ { mask; }))))) -+ (simplify -+ (cond -+ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) -+ (convert? @0) -+ (convert? 
(bit_and (minmax_sat_arg @0) INTEGER_CST@2))) -+ (if (wi::to_widest (@2) == ~wi::to_widest (@1)) -+ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } -+ (convert (min (max @0 { integer_zero_node; }) -+ { mask; })))))) -diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-1.c b/gcc/testsuite/gcc.dg/combine-maxmin-1.c -new file mode 100644 -index 000000000..859ff7df8 ---- /dev/null -+++ b/gcc/testsuite/gcc.dg/combine-maxmin-1.c -@@ -0,0 +1,15 @@ -+/* { dg-do compile { target aarch64-*-* } } */ -+/* { dg-options "-O3 -fconvert-minmax" } */ -+ -+#include <inttypes.h> -+ -+__attribute__((noinline)) -+void test (int32_t *restrict a, int32_t *restrict x) -+{ -+ for (int i = 0; i < 4; i++) -+ ai = ((((-xi) >> 31) ^ xi) -+ & (-((int32_t)((xi & (~((1 << 8)-1))) == 0)))) ^ ((-xi) >> 31); -+} -+ -+/* { dg-final { scan-assembler-not {smax\t} } } */ -+/* { dg-final { scan-assembler-not {smin\t} } } */ -diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-2.c b/gcc/testsuite/gcc.dg/combine-maxmin-2.c -new file mode 100644 -index 000000000..63d4d85b3 ---- /dev/null -+++ b/gcc/testsuite/gcc.dg/combine-maxmin-2.c -@@ -0,0 +1,14 @@ -+/* { dg-do compile { target aarch64-*-* } } */ -+/* { dg-options "-O3 -fconvert-minmax" } */ -+ -+#include <inttypes.h> -+ -+__attribute__((noinline)) -+void test (int8_t *restrict a, int32_t *restrict x) -+{ -+ for (int i = 0; i < 8; i++) -+ ai = ((xi & ~((1 << 9)-1)) ? 
(-xi)>>31 & ((1 << 9)-1) : xi); -+} -+ -+/* { dg-final { scan-assembler-times {smax\t} 4 } } */ -+/* { dg-final { scan-assembler-times {smin\t} 4 } } */ -diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c -index 06bce7029..a984fa560 100755 ---- a/gcc/testsuite/gcc.dg/combine-maxmin.c -+++ b/gcc/testsuite/gcc.dg/combine-maxmin.c -@@ -1,5 +1,5 @@ - /* { dg-do compile { target aarch64-*-* } } */ --/* { dg-options "-O3 -fdump-rtl-combine-all" } */ -+/* { dg-options "-O3 -fconvert-minmax" } */ - - /* The test checks usage of smax/smin insns for clip evaluation and - * uzp1/uzp2 insns for vector element narrowing. It's inspired by -@@ -19,20 +19,26 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, - { - const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0; - for( int y = 0; y < height; y++ ) { -+ /* This loop is not being vectorized now. */
View file
_service:tar_scm:0046-Add-new-pattern-to-pass-the-maxmin-tests.patch
Deleted
@@ -1,65 +0,0 @@ -From dbcb2630c426c8dd2117b5ce625da8422dd8cd65 Mon Sep 17 00:00:00 2001 -From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com> -Date: Thu, 22 Feb 2024 17:20:17 +0800 -Subject: PATCH 14/18 Add new pattern to pass the maxmin tests - ---- - gcc/match.pd | 24 ++++++++++++++++++++++++ - gcc/testsuite/gcc.dg/combine-maxmin.c | 2 +- - 2 files changed, 25 insertions(+), 1 deletion(-) - -diff --git a/gcc/match.pd b/gcc/match.pd -index 3a19e93b3..aee58e47b 100644 ---- a/gcc/match.pd -+++ b/gcc/match.pd -@@ -8038,6 +8038,10 @@ and, - (match (minmax_cmp_arg @0 @1) - (bit_and @0 INTEGER_CST@1) - (if (wi::popcount (~wi::to_widest (@1) + 1) == 1))) -+/* Match ((unsigned) a > 0b0..01..1) pattern. */ -+(match (minmax_cmp_arg1 @0 @1) -+ (gt @0 INTEGER_CST@1) -+ (if (wi::popcount (wi::to_widest (@1) + 1) == 1))) - - /* Match (inversed_sign_bit >> sign_bit_pos) pattern. - This statement is blocking for the transformation of unsigned integers. -@@ -8095,6 +8099,26 @@ and, - (convert (min (max @0 { integer_zero_node; }) - { mask; }))))) - -+ (simplify -+ (convert -+ (cond -+ (minmax_cmp_arg1 (convert? @0) INTEGER_CST@1) -+ (convert? (minmax_sat_arg @0)) -+ (convert? @0))) -+ (if (wi::geu_p (wi::to_widest (@1) + 1, TYPE_PRECISION (type))) -+ (with { tree mask = build_int_cst (integer_type_node, tree_to_shwi (@1)); } -+ (convert (min (max (convert:integer_type_node @0) { integer_zero_node; }) -+ { mask; }))))) -+ (simplify -+ (cond -+ (minmax_cmp_arg1 (convert? @0) INTEGER_CST@1) -+ (convert? (minmax_sat_arg @0)) -+ (convert? @0)) -+ (if (wi::geu_p (wi::to_widest (@1) + 1, TYPE_PRECISION (type))) -+ (with { tree mask = build_int_cst (integer_type_node, tree_to_shwi (@1)); } -+ (convert (min (max (convert:integer_type_node @0) { integer_zero_node; }) -+ { mask; }))))) -+ - /* Truncation via bit_and with mask. Same concerns on convert? here. 
*/ - (simplify - (convert -diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c -index a984fa560..5c0c9cc49 100755 ---- a/gcc/testsuite/gcc.dg/combine-maxmin.c -+++ b/gcc/testsuite/gcc.dg/combine-maxmin.c -@@ -52,4 +52,4 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, - /* { dg-final { scan-assembler-times {usmin\t} 6 } } */ - /* All of the vectorized patterns are expected to be matched. */ - /* { dg-final { scan-assembler-not {cmtst\t} } } */ --/* { dg-final { scan-assembler-times {uzp1\t} 5 } } */ -+/* { dg-final { scan-assembler-times {uzp1\t} 2 } } */ --- -2.33.0 -
View file
_service:tar_scm:0047-AES-Implement-AES-pattern-matching.patch
Deleted
@@ -1,3968 +0,0 @@ -From 53d321d2fe08f69a29527be157d4bcaaefea04ab Mon Sep 17 00:00:00 2001 -From: Pronin Alexander 00812787 <pronin.alexander@huawei.com> -Date: Wed, 6 Dec 2023 10:46:28 +0300 -Subject: PATCH 15/18 AES Implement AES pattern matching - ---- - gcc/Makefile.in | 1 + - gcc/common.opt | 4 + - gcc/config/aarch64/aarch64.cc | 24 + - gcc/crypto-accel.cc | 2415 +++++++++++++++++ - gcc/doc/tm.texi | 29 + - gcc/doc/tm.texi.in | 12 + - gcc/passes.def | 1 + - gcc/rtl-matcher.h | 367 +++ - gcc/target.def | 41 + - .../gcc.target/aarch64/aes-decrypt.c | 478 ++++ - .../gcc.target/aarch64/aes-encrypt.c | 443 +++ - gcc/timevar.def | 1 + - gcc/tree-pass.h | 1 + - 13 files changed, 3817 insertions(+) - create mode 100644 gcc/crypto-accel.cc - create mode 100644 gcc/rtl-matcher.h - create mode 100644 gcc/testsuite/gcc.target/aarch64/aes-decrypt.c - create mode 100644 gcc/testsuite/gcc.target/aarch64/aes-encrypt.c - -diff --git a/gcc/Makefile.in b/gcc/Makefile.in -index 45705c1f3..876000bda 100644 ---- a/gcc/Makefile.in -+++ b/gcc/Makefile.in -@@ -1332,6 +1332,7 @@ OBJS = \ - cgraphunit.o \ - cgraphclones.o \ - combine.o \ -+ crypto-accel.o \ - combine-stack-adj.o \ - compare-elim.o \ - context.o \ -diff --git a/gcc/common.opt b/gcc/common.opt -index 3a5004271..1eb62ada5 100644 ---- a/gcc/common.opt -+++ b/gcc/common.opt -@@ -1129,6 +1129,10 @@ Common Var(flag_array_widen_compare) Optimization - Extends types for pointers to arrays to improve array comparsion performance. - In some extreme situations this may result in unsafe behavior. - -+fcrypto-accel-aes -+Common Var(flag_crypto_accel_aes) Init(0) Optimization -+Perform crypto acceleration AES pattern matching. -+ - fauto-inc-dec - Common Var(flag_auto_inc_dec) Init(1) Optimization - Generate auto-inc/dec instructions. 
-diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc -index fa566dd80..9171d9d56 100644 ---- a/gcc/config/aarch64/aarch64.cc -+++ b/gcc/config/aarch64/aarch64.cc -@@ -27569,6 +27569,30 @@ is_aarch64_stp_insn (int icode, bool *has_wb) - #undef TARGET_IS_STP_INSN - #define TARGET_IS_STP_INSN is_aarch64_stp_insn - -+machine_mode -+aarch64_get_v16qi_mode () -+{ -+ return V16QImode; -+} -+ -+#undef TARGET_GET_V16QI_MODE -+#define TARGET_GET_V16QI_MODE aarch64_get_v16qi_mode -+ -+#undef TARGET_GEN_REV32V16QI -+#define TARGET_GEN_REV32V16QI gen_aarch64_rev32v16qi -+ -+#undef TARGET_GEN_AESEV16QI -+#define TARGET_GEN_AESEV16QI gen_aarch64_crypto_aesev16qi -+ -+#undef TARGET_GEN_AESDV16QI -+#define TARGET_GEN_AESDV16QI gen_aarch64_crypto_aesdv16qi -+ -+#undef TARGET_GEN_AESMCV16QI -+#define TARGET_GEN_AESMCV16QI gen_aarch64_crypto_aesmcv16qi -+ -+#undef TARGET_GEN_AESIMCV16QI -+#define TARGET_GEN_AESIMCV16QI gen_aarch64_crypto_aesimcv16qi -+ - #undef TARGET_STACK_PROTECT_GUARD - #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard - -diff --git a/gcc/crypto-accel.cc b/gcc/crypto-accel.cc -new file mode 100644 -index 000000000..f4e810a6b ---- /dev/null -+++ b/gcc/crypto-accel.cc -@@ -0,0 +1,2415 @@ -+/* Crypto-pattern optimizer. -+ Copyright (C) 2003-2023 Free Software Foundation, Inc. -+ -+This file is part of GCC. -+ -+GCC is free software; you can redistribute it and/or modify it under -+the terms of the GNU General Public License as published by the Free -+Software Foundation; either version 3, or (at your option) any later -+version. -+ -+GCC is distributed in the hope that it will be useful, but WITHOUT ANY -+WARRANTY; without even the implied warranty of MERCHANTABILITY or -+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+for more details. -+ -+You should have received a copy of the GNU General Public License -+along with GCC; see the file COPYING3. If not see -+<http://www.gnu.org/licenses/>. 
*/ -+ -+#include "config.h" -+#define INCLUDE_VECTOR -+#define INCLUDE_MAP -+#define INCLUDE_SET -+#define INCLUDE_ALGORITHM -+#include "system.h" -+#include "coretypes.h" -+#include "backend.h" -+#include "target.h" -+#include "rtl.h" -+#include "tree.h" -+#include "df.h" -+#include "memmodel.h" -+#include "optabs.h" -+#include "regs.h" -+#include "emit-rtl.h" -+#include "recog.h" -+#include "cfgrtl.h" -+#include "cfgcleanup.h" -+#include "expr.h" -+#include "tree-pass.h" -+#include "rtl-matcher.h" -+ -+/* Basic AES table descryption. */ -+struct aes_table -+{ -+ /* Number of elements per table. */ -+ static const unsigned int table_nelts = 256; -+ /* Number of tables. */ -+ static const unsigned int basic_tables_num = 4; -+ /* Number of rounds. */ -+ static const unsigned int rounds_num = 4; -+ /* Common ID for wrong table. */ -+ static const unsigned int BAD_TABLE = -1; -+ -+ typedef const unsigned int table_typetable_nelts; -+ typedef table_type *table_mapbasic_tables_num; -+ -+ template<typename T> -+ static bool is_basic_table (tree ctor, const T ethalontable_nelts) -+ { -+ if (TREE_CODE (ctor) != CONSTRUCTOR -+ ||CONSTRUCTOR_NELTS (ctor) != table_nelts) -+ return false; -+ -+ unsigned ix; -+ tree val; -+ FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (ctor), ix, val) -+ if (TREE_INT_CST_LOW (val) != ethalonix) -+ return false; -+ return true; -+ } -+ -+ static unsigned check_table (tree ctor, -+ table_map tables) -+ { -+ for (unsigned i = 0; i < 4; ++i) -+ if (is_basic_table (ctor, *tablesi)) -+ return i; -+ return BAD_TABLE; -+ } -+}; -+ -+/* AES encryption info. 
*/ -+struct aes_encrypt_table : aes_table -+{ -+ typedef enum -+ { -+ TE0, -+ TE1, -+ TE2, -+ TE3, -+ BAD_TABLE = aes_table::BAD_TABLE -+ } table_entry; -+ -+ static table_type Te0; -+ static table_type Te1; -+ static table_type Te2; -+ static table_type Te3; -+ -+ static table_map tables; -+ static table_entry roundsrounds_num; -+ static table_entry final_roundsrounds_num; -+ -+ static table_entry get_table_id (tree ctor) -+ { -+ return static_cast<table_entry> (check_table (ctor, tables));
View file
_service:tar_scm:0048-crypto-accel-add-optimization-level-requirement-to-t.patch
Deleted
@@ -1,27 +0,0 @@ -From 915d549b03c10ab403538888149facd417a02ebc Mon Sep 17 00:00:00 2001 -From: vchernon <chernonog.vyacheslav@huawei.com> -Date: Wed, 27 Dec 2023 23:31:26 +0800 -Subject: PATCH 16/18 crypto-accel add optimization level requirement to - the gate - -fix issue (src-openEuler/gcc: I8RRDW) ---- - gcc/crypto-accel.cc | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/gcc/crypto-accel.cc b/gcc/crypto-accel.cc -index f4e810a6b..e7766a585 100644 ---- a/gcc/crypto-accel.cc -+++ b/gcc/crypto-accel.cc -@@ -2391,7 +2391,7 @@ public: - /* opt_pass methods: */ - virtual bool gate (function *) - { -- if (flag_crypto_accel_aes <= 0) -+ if (flag_crypto_accel_aes <= 0 || optimize < 1) - return false; - return targetm.get_v16qi_mode - && targetm.gen_rev32v16qi --- -2.33.0 -
View file
_service:tar_scm:0049-Add-more-flexible-check-for-pointer-aliasing-during-.patch
Deleted
@@ -1,239 +0,0 @@ -From b5865aef36ebaac87ae30d51f08bfe081795ed67 Mon Sep 17 00:00:00 2001 -From: Chernonog Viacheslav <chernonog.vyacheslav@huawei.com> -Date: Tue, 12 Mar 2024 23:30:56 +0800 -Subject: PATCH 17/18 Add more flexible check for pointer aliasing during - vectorization It takes minimum between number of iteration and segment length - it helps to speed up loops with small number of iterations when only tail can - be vectorized - ---- - gcc/params.opt | 5 ++ - .../sve/var_stride_flexible_segment_len_1.c | 23 +++++++ - gcc/tree-data-ref.cc | 67 +++++++++++++------ - gcc/tree-data-ref.h | 11 ++- - gcc/tree-vect-data-refs.cc | 14 +++- - 5 files changed, 95 insertions(+), 25 deletions(-) - create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c - -diff --git a/gcc/params.opt b/gcc/params.opt -index 6176d4790..7e5c119cf 100644 ---- a/gcc/params.opt -+++ b/gcc/params.opt -@@ -1180,6 +1180,11 @@ Maximum number of loop peels to enhance alignment of data references in a loop. - Common Joined UInteger Var(param_vect_max_version_for_alias_checks) Init(10) Param Optimization - Bound on number of runtime checks inserted by the vectorizer's loop versioning for alias check. - -+-param=vect-alias-flexible-segment-len= -+Common Joined UInteger Var(param_flexible_seg_len) Init(0) IntegerRange(0, 1) Param Optimization -+Use a minimum length of different segments. Currenlty the minimum between -+iteration number and vectorization length is chosen by this param. -+ - -param=vect-max-version-for-alignment-checks= - Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization - Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check. 
-diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c -new file mode 100644 -index 000000000..894f075f3 ---- /dev/null -+++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c -@@ -0,0 +1,23 @@ -+/* { dg-do compile } */ -+/* { dg-options "-O2 -ftree-vectorize --param=vect-alias-flexible-segment-len=1" } */ -+ -+#define TYPE int -+#define SIZE 257 -+ -+void __attribute__ ((weak)) -+f (TYPE *x, TYPE *y, unsigned short n, long m __attribute__((unused))) -+{ -+ for (int i = 0; i < SIZE; ++i) -+ xi * n += yi * n; -+} -+ -+/* { dg-final { scan-assembler {\tld1w\tz0-9+} } } */ -+/* { dg-final { scan-assembler {\tst1w\tz0-9+} } } */ -+/* { dg-final { scan-assembler {\tldr\tw0-9+} } } */ -+/* { dg-final { scan-assembler {\tstr\tw0-9+} } } */ -+/* Should use a WAR check that multiplies by (VF-2)*4 rather than -+ an overlap check that multiplies by (257-1)*4. */ -+/* { dg-final { scan-assembler {\tcntb\t(x0-9+)\n.*\tsub\tx0-9+, \1, #8\n.*\tmul\tx0-9+,^\n*\1} } } */ -+/* One range check and a check for n being zero. */ -+/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 2 } } */ -+/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */ -diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc -index 397792c35..e6ae9e847 100644 ---- a/gcc/tree-data-ref.cc -+++ b/gcc/tree-data-ref.cc -@@ -2329,31 +2329,15 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr, - same arguments. Try to optimize cases in which the second access - is a write and in which some overlap is valid. 
*/ - --static bool --create_waw_or_war_checks (tree *cond_expr, -+static void -+create_waw_or_war_checks2 (tree *cond_expr, tree seg_len_a, - const dr_with_seg_len_pair_t &alias_pair) - { - const dr_with_seg_len& dr_a = alias_pair.first; - const dr_with_seg_len& dr_b = alias_pair.second; - -- /* Check for cases in which: -- -- (a) DR_B is always a write; -- (b) the accesses are well-ordered in both the original and new code -- (see the comment above the DR_ALIAS_* flags for details); and -- (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */ -- if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW)) -- return false; -- -- /* Check for equal (but possibly variable) steps. */ - tree step = DR_STEP (dr_a.dr); -- if (!operand_equal_p (step, DR_STEP (dr_b.dr))) -- return false; -- -- /* Make sure that we can operate on sizetype without loss of precision. */ - tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr)); -- if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype)) -- return false; - - /* All addresses involved are known to have a common alignment ALIGN. - We can therefore subtract ALIGN from an exclusive endpoint to get -@@ -2370,9 +2354,6 @@ create_waw_or_war_checks (tree *cond_expr, - fold_convert (ssizetype, indicator), - ssize_int (0)); - -- /* Get lengths in sizetype. */ -- tree seg_len_a -- = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len)); - step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step)); - - /* Each access has the following pattern: -@@ -2479,6 +2460,50 @@ create_waw_or_war_checks (tree *cond_expr, - *cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit); - if (dump_enabled_p ()) - dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n"); -+} -+ -+/* This is a wrapper function for create_waw_or_war_checks2. 
*/ -+static bool -+create_waw_or_war_checks (tree *cond_expr, -+ const dr_with_seg_len_pair_t &alias_pair) -+{ -+ const dr_with_seg_len& dr_a = alias_pair.first; -+ const dr_with_seg_len& dr_b = alias_pair.second; -+ -+ /* Check for cases in which: -+ -+ (a) DR_B is always a write; -+ (b) the accesses are well-ordered in both the original and new code -+ (see the comment above the DR_ALIAS_* flags for details); and -+ (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */ -+ if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW)) -+ return false; -+ -+ /* Check for equal (but possibly variable) steps. */ -+ tree step = DR_STEP (dr_a.dr); -+ if (!operand_equal_p (step, DR_STEP (dr_b.dr))) -+ return false; -+ -+ /* Make sure that we can operate on sizetype without loss of precision. */ -+ tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr)); -+ if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype)) -+ return false; -+ -+ /* Get lengths in sizetype. */ -+ tree seg_len_a -+ = fold_convert (sizetype, -+ rewrite_to_non_trapping_overflow (dr_a.seg_len)); -+ create_waw_or_war_checks2 (cond_expr, seg_len_a, alias_pair); -+ if (param_flexible_seg_len && dr_a.seg_len != dr_a.seg_len2) -+ { -+ tree seg_len2_a -+ = fold_convert (sizetype, -+ rewrite_to_non_trapping_overflow (dr_a.seg_len2)); -+ tree cond_expr2; -+ create_waw_or_war_checks2 (&cond_expr2, seg_len2_a, alias_pair); -+ *cond_expr = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, -+ *cond_expr, cond_expr2); -+ } - return true; - } - -diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h -index f643a95b2..9bc5f16ee 100644 ---- a/gcc/tree-data-ref.h -+++ b/gcc/tree-data-ref.h -@@ -213,12 +213,19 @@ class dr_with_seg_len - public: - dr_with_seg_len (data_reference_p d, tree len, unsigned HOST_WIDE_INT size, - unsigned int a) -- : dr (d), seg_len (len), access_size (size), align (a) {} -- -+ : dr (d), seg_len (len), seg_len2 (len), access_size (size), align (a) -+ {} -+ dr_with_seg_len 
(data_reference_p d, tree len, tree len2, -+ unsigned HOST_WIDE_INT size, unsigned int a) -+ : dr (d), seg_len (len), seg_len2 (len2), access_size (size), align (a) -+ {} - data_reference_p dr; - /* The offset of the last access that needs to be checked minus - the offset of the first. */ - tree seg_len; -+ /* The second version of segment length. Currently this is used to -+ soften checks for a small number of iterations. */ -+ tree seg_len2; - /* A value that, when added to abs (SEG_LEN), gives the total number of - bytes in the segment. */ - poly_uint64 access_size; -diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc -index 4e615b80b..04e68f621 100644 ---- a/gcc/tree-vect-data-refs.cc -+++ b/gcc/tree-vect-data-refs.cc -@@ -3646,6 +3646,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) - { - poly_uint64 lower_bound; - tree segment_length_a, segment_length_b; -+ tree segment_length2_a, segment_length2_b; - unsigned HOST_WIDE_INT access_size_a, access_size_b; - unsigned int align_a, align_b; -
View file
_service:tar_scm:0050-Port-IPA-prefetch-to-GCC-12.patch
Deleted
@@ -1,2071 +0,0 @@ -From 7ee50ce44c652e21ca8ad33dc4e175f02b51b072 Mon Sep 17 00:00:00 2001 -From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com> -Date: Fri, 8 Mar 2024 06:50:39 +0800 -Subject: PATCH 18/18 Port IPA prefetch to GCC 12 - ---- - gcc/Makefile.in | 1 + - gcc/cgraph.cc | 1 + - gcc/cgraph.h | 2 + - gcc/common.opt | 8 + - gcc/ipa-devirt.cc | 54 +- - gcc/ipa-prefetch.cc | 1819 +++++++++++++++++++++++++++++++++++++++++++ - gcc/ipa-sra.cc | 8 + - gcc/params.opt | 8 + - gcc/passes.def | 1 + - gcc/timevar.def | 1 + - gcc/tree-pass.h | 1 + - 11 files changed, 1902 insertions(+), 2 deletions(-) - create mode 100644 gcc/ipa-prefetch.cc - -diff --git a/gcc/Makefile.in b/gcc/Makefile.in -index 876000bda..10544e4a9 100644 ---- a/gcc/Makefile.in -+++ b/gcc/Makefile.in -@@ -1468,6 +1468,7 @@ OBJS = \ - ipa-modref.o \ - ipa-modref-tree.o \ - ipa-predicate.o \ -+ ipa-prefetch.o \ - ipa-profile.o \ - ipa-prop.o \ - ipa-param-manipulation.o \ -diff --git a/gcc/cgraph.cc b/gcc/cgraph.cc -index 3734c85db..7d738b891 100644 ---- a/gcc/cgraph.cc -+++ b/gcc/cgraph.cc -@@ -998,6 +998,7 @@ cgraph_node::create_indirect_edge (gcall *call_stmt, int ecf_flags, - edge->indirect_info = cgraph_allocate_init_indirect_info (); - edge->indirect_info->ecf_flags = ecf_flags; - edge->indirect_info->vptr_changed = true; -+ edge->indirect_info->targets = NULL; - - /* Record polymorphic call info. */ - if (!cloning_p -diff --git a/gcc/cgraph.h b/gcc/cgraph.h -index d96690326..b84ff2f98 100644 ---- a/gcc/cgraph.h -+++ b/gcc/cgraph.h -@@ -1659,6 +1659,8 @@ public: - int param_index; - /* ECF flags determined from the caller. */ - int ecf_flags; -+ /* Vector of potential call targets determined by analysis. */ -+ vec<cgraph_node *, va_gc_atomic> *targets; - - /* Number of speculative call targets, it's less than GCOV_TOPN_VALUES. 
*/ - unsigned num_speculative_call_targets : 16; -diff --git a/gcc/common.opt b/gcc/common.opt -index 1eb62ada5..e65a06af9 100644 ---- a/gcc/common.opt -+++ b/gcc/common.opt -@@ -1328,6 +1328,10 @@ fdevirtualize - Common Var(flag_devirtualize) Optimization - Try to convert virtual calls to direct ones. - -+fipa-ic -+Common Var(flag_ipa_ic) Optimization Init(0) -+Perform interprocedural analysis of indirect calls. -+ - ficp - Common Var(flag_icp) Optimization Init(0) - Try to promote indirect calls to direct ones. -@@ -2367,6 +2371,10 @@ fprefetch-loop-arrays - Common Var(flag_prefetch_loop_arrays) Init(-1) Optimization - Generate prefetch instructions, if available, for arrays in loops. - -+fipa-prefetch -+Common Var(flag_ipa_prefetch) Init(0) Optimization -+Generate prefetch instructions, if available, using IPA info. -+ - fprofile - Common Var(profile_flag) - Enable basic program profiling code. -diff --git a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc -index 318535d06..dd3562d56 100644 ---- a/gcc/ipa-devirt.cc -+++ b/gcc/ipa-devirt.cc -@@ -5758,6 +5758,54 @@ merge_fs_map_for_ftype_aliases () - } - } - -+/* Save results of indirect call analysis for the next passes. */ -+ -+static void -+save_analysis_results () -+{ -+ if (dump_file) -+ fprintf (dump_file, "\n\nSave results of indirect call analysis.\n"); -+ -+ struct cgraph_node *n; -+ FOR_EACH_FUNCTION (n) -+ { -+ cgraph_edge *e, *next; -+ for (e = n->indirect_calls; e; e = next) -+ { -+ next = e->next_callee; -+ if (e->indirect_info->polymorphic) -+ continue; -+ gcall *stmt = e->call_stmt; -+ gcc_assert (stmt != NULL); -+ tree call_fn = gimple_call_fn (stmt); -+ tree call_fn_ty = TREE_TYPE (call_fn); -+ if (!POINTER_TYPE_P (call_fn_ty)) -+ continue; -+ -+ tree ctype = TYPE_CANONICAL (TREE_TYPE (call_fn_ty)); -+ unsigned ctype_uid = ctype ? TYPE_UID (ctype) : 0; -+ if (!ctype_uid || unsafe_types->count (ctype_uid) -+ || !fs_map->count (ctype_uid)) -+ continue; -+ /* TODO: cleanup noninterposable aliases. 
*/ -+ decl_set *decls = (*fs_map)ctype_uid; -+ if (dump_file) -+ { -+ fprintf (dump_file, "For call "); -+ print_gimple_stmt (dump_file, stmt, 0); -+ } -+ vec_alloc (e->indirect_info->targets, decls->size ()); -+ for (decl_set::const_iterator it = decls->begin (); -+ it != decls->end (); it++) -+ { -+ struct cgraph_node *target = cgraph_node::get (*it); -+ /* TODO: maybe discard some targets. */ -+ e->indirect_info->targets->quick_push (target); -+ } -+ } -+ } -+} -+ - /* Dump function types with set of functions corresponding to it. */ - - static void -@@ -5822,6 +5870,8 @@ collect_function_signatures () - } - } - merge_fs_map_for_ftype_aliases (); -+ if (flag_ipa_ic) -+ save_analysis_results (); - if (dump_file) - dump_function_signature_sets (); - } -@@ -6217,7 +6267,7 @@ ipa_icp (void) - optimize indirect calls. */ - collect_function_type_aliases (); - collect_function_signatures (); -- bool optimized = optimize_indirect_calls (); -+ bool optimized = flag_icp ? optimize_indirect_calls () : false; - - remove_type_alias_map (ta_map); - remove_type_alias_map (fta_map); -@@ -6264,7 +6314,7 @@ public: - /* opt_pass methods: */ - virtual bool gate (function *) - { -- return (optimize && flag_icp && !seen_error () -+ return (optimize && (flag_icp || flag_ipa_ic) && !seen_error () - && (in_lto_p || flag_whole_program)); - } - -diff --git a/gcc/ipa-prefetch.cc b/gcc/ipa-prefetch.cc -new file mode 100644 -index 000000000..aeea51105 ---- /dev/null -+++ b/gcc/ipa-prefetch.cc -@@ -0,0 +1,1819 @@ -+/* IPA prefetch optimizations. -+ Copyright (C) 2023 Free Software Foundation, Inc. -+ Contributed by Ilia Diachkov. -+ -+This file is part of GCC. -+ -+GCC is free software; you can redistribute it and/or modify it under -+the terms of the GNU General Public License as published by the Free -+Software Foundation; either version 3, or (at your option) any later -+version. 
-+ -+GCC is distributed in the hope that it will be useful, but WITHOUT ANY -+WARRANTY; without even the implied warranty of MERCHANTABILITY or -+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+for more details. -+ -+You should have received a copy of the GNU General Public License -+along with GCC; see the file COPYING3. If not see -+<http://www.gnu.org/licenses/>. */ -+ -+/* IPA prefetch is an interprocedural pass that detects cases of indirect -+ memory access potentially in loops and inserts prefetch instructions -+ to optimize cache usage during these indirect memory accesses. */ -+
View file
_service:tar_scm:0051-Port-fixes-for-IPA-prefetch-to-GCC-12.patch
Deleted
@@ -1,2216 +0,0 @@ -From 4c262af8e178ac7c81b32be5b159b4d09a5841c9 Mon Sep 17 00:00:00 2001 -From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com> -Date: Fri, 8 Mar 2024 07:07:50 +0800 -Subject: PATCH 1/2 Port fixes for IPA prefetch to GCC 12 - ---- - gcc/ipa-devirt.cc | 9 +- - gcc/ipa-prefetch.cc | 174 +- - gcc/ipa-sra.cc | 7 + - gcc/params.opt | 4 +- - gcc/testsuite/gcc.dg/completion-1.c | 1 + - gcc/testsuite/gcc.dg/ipa/ipa-prefetch-xz.c | 1843 ++++++++++++++++++++ - 6 files changed, 1974 insertions(+), 64 deletions(-) - create mode 100644 gcc/testsuite/gcc.dg/ipa/ipa-prefetch-xz.c - -diff --git a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc -index dd3562d56..dd000b401 100644 ---- a/gcc/ipa-devirt.cc -+++ b/gcc/ipa-devirt.cc -@@ -5029,9 +5029,12 @@ analyze_assign_stmt (gimple *stmt) - } - else - { -- fprintf (dump_file, "\nUnsupported rhs type %s in assign stmt: ", -- get_tree_code_name (TREE_CODE (rhs))); -- print_gimple_stmt (dump_file, stmt, 0); -+ if (dump_file && (dump_flags & TDF_DETAILS)) -+ { -+ fprintf (dump_file, "\nUnsupported rhs type %s in assign stmt: ", -+ get_tree_code_name (TREE_CODE (rhs))); -+ print_gimple_stmt (dump_file, stmt, 0); -+ } - gcc_unreachable (); - } - } -diff --git a/gcc/ipa-prefetch.cc b/gcc/ipa-prefetch.cc -index aeea51105..9537e4835 100644 ---- a/gcc/ipa-prefetch.cc -+++ b/gcc/ipa-prefetch.cc -@@ -167,6 +167,7 @@ analyse_cgraph () - } - - /* TODO: maybe remove loop info here. */ -+ n->get_body (); - push_cfun (DECL_STRUCT_FUNCTION (n->decl)); - calculate_dominance_info (CDI_DOMINATORS); - loop_optimizer_init (LOOPS_NORMAL); -@@ -942,6 +943,9 @@ compare_memrefs (memref_t* mr, memref_t* mr2) - (*mr_candidate_map)mr = mr2; - return; - } -+ /* Probably we shouldn't leave nulls in the map. */ -+ if ((*mr_candidate_map)mr == NULL) -+ return; - /* TODO: support analysis with incrementation of different fields. 
*/ - if ((*mr_candidate_map)mr->offset != mr2->offset) - { -@@ -1090,6 +1094,15 @@ analyse_loops () - memref_t *mr = it->first, *mr2 = it->second; - if (mr2 == NULL || !(*fmrs_map)fn->count (mr)) - continue; -+ /* For now optimize only MRs that mem is MEM_REF. -+ TODO: support other MR types. */ -+ if (TREE_CODE (mr->mem) != MEM_REF) -+ { -+ if (dump_file) -+ fprintf (dump_file, "Skip MR %d: unsupported tree code = %s\n", -+ mr->mr_id, get_tree_code_name (TREE_CODE (mr->mem))); -+ continue; -+ } - if (!optimize_mrs_map->count (fn)) - (*optimize_mrs_map)fn = new memref_set; - (*optimize_mrs_map)fn->insert (mr); -@@ -1102,7 +1115,7 @@ analyse_loops () - it != (*optimize_mrs_map)fn->end (); it++) - { - memref_t *mr = *it, *mr2 = (*mr_candidate_map)mr; -- fprintf (dump_file, "MRs %d,%d with incremental offset ", -+ fprintf (dump_file, "MRs %d, %d with incremental offset ", - mr->mr_id, mr2->mr_id); - print_generic_expr (dump_file, mr2->offset); - fprintf (dump_file, "\n"); -@@ -1435,6 +1448,52 @@ remap_gimple_op_r (tree *tp, int *walk_subtrees, void *data) - return NULL_TREE; - } - -+/* Copy stmt and remap its operands. */ -+ -+static gimple * -+gimple_copy_and_remap (gimple *stmt) -+{ -+ gimple *copy = gimple_copy (stmt); -+ gcc_checking_assert (!is_gimple_debug (copy)); -+ -+ /* Remap all the operands in COPY. */ -+ struct walk_stmt_info wi; -+ memset (&wi, 0, sizeof (wi)); -+ wi.info = copy; -+ walk_gimple_op (copy, remap_gimple_op_r, &wi); -+ if (dump_file) -+ { -+ fprintf (dump_file, "Stmt copy after remap:\n"); -+ print_gimple_stmt (dump_file, copy, 0); -+ } -+ return copy; -+} -+ -+/* Copy and remap stmts listed in MR in reverse order to last_idx, skipping -+ processed ones. Insert new stmts to the sequence. 
*/ -+ -+static gimple * -+gimple_copy_and_remap_memref_stmts (memref_t *mr, gimple_seq &stmts, -+ int last_idx, stmt_set &processed) -+{ -+ gimple *last_stmt = NULL; -+ for (int i = mr->stmts.length () - 1; i >= last_idx ; i--) -+ { -+ if (processed.count (mr->stmtsi)) -+ continue; -+ processed.insert (mr->stmtsi); -+ if (dump_file) -+ { -+ fprintf (dump_file, "Copy stmt %d from used MR (%d):\n", -+ i, mr->mr_id); -+ print_gimple_stmt (dump_file, mr->stmtsi, 0); -+ } -+ last_stmt = gimple_copy_and_remap (mr->stmtsi); -+ gimple_seq_add_stmt (&stmts, last_stmt); -+ } -+ return last_stmt; -+} -+ - static void - create_cgraph_edge (cgraph_node *n, gimple *stmt) - { -@@ -1490,6 +1549,13 @@ optimize_function (cgraph_node *n, function *fn) - "Skip the case.\n"); - return 0; - } -+ if (!tree_fits_shwi_p (inc_mr->step)) -+ { -+ if (dump_file) -+ fprintf (dump_file, "Cannot represent incremental MR's step as " -+ "integer. Skip the case.\n"); -+ return 0; -+ } - if (dump_file && !used_mrs.empty ()) - print_mrs_ids (used_mrs, "Common list of used mrs:\n"); - -@@ -1539,16 +1605,44 @@ optimize_function (cgraph_node *n, function *fn) - return 0; - } - else if (dump_file) -- fprintf (dump_file, "Dominator bb %d for MRs\n", dom_bb->index); -+ { -+ fprintf (dump_file, "Dominator bb %d for MRs:\n", dom_bb->index); -+ gimple_dump_bb (dump_file, dom_bb, 0, dump_flags); -+ fprintf (dump_file, "\n"); -+ } - -- split_block (dom_bb, (gimple *) NULL); -+ /* Try to find comp_mr's stmt in the dominator bb. */ -+ gimple *last_used = NULL; -+ for (gimple_stmt_iterator si = gsi_last_bb (dom_bb); !gsi_end_p (si); -+ gsi_prev (&si)) -+ if (comp_mr->stmts0 == gsi_stmt (si)) -+ { -+ last_used = gsi_stmt (si); -+ if (dump_file) -+ { -+ fprintf (dump_file, "Last used stmt in dominator bb:\n"); -+ print_gimple_stmt (dump_file, last_used, 0); -+ } -+ break; -+ } -+ -+ split_block (dom_bb, last_used); - gimple_stmt_iterator gsi = gsi_last_bb (dom_bb); - - /* Create new inc var. 
Insert new_var = old_var + step * factor. */ - decl_map = new tree_map; - gcc_assert (comp_mr->stmts0 && gimple_assign_single_p (comp_mr->stmts0)); - tree inc_var = gimple_assign_lhs (comp_mr->stmts0); -+ /* If old_var definition dominates the current use, just use it, otherwise -+ evaluate it just before new inc var evaluation. */ - gimple_seq stmts = NULL; -+ stmt_set processed_stmts; -+ if (!dominated_by_p (CDI_DOMINATORS, dom_bb, gimple_bb (comp_mr->stmts0))) -+ { -+ gimple *tmp = gimple_copy_and_remap_memref_stmts (comp_mr, stmts, 0, -+ processed_stmts); -+ inc_var = gimple_assign_lhs (tmp); -+ } - tree var_type = TREE_TYPE (inc_var); - enum tree_code inc_code; - if (TREE_CODE (var_type) == POINTER_TYPE) -@@ -1556,52 +1650,28 @@ optimize_function (cgraph_node *n, function *fn) - else - inc_code = PLUS_EXPR;
View file
_service:tar_scm:0052-Fix-fails-in-IPA-prefetch-src-openEuler-gcc-I96ID7.patch
Deleted
@@ -1,94 +0,0 @@ -From 0263daa1312d0cdcdf9c770bcf5d982a2d4fc16b Mon Sep 17 00:00:00 2001 -From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com> -Date: Fri, 29 Mar 2024 17:15:41 +0800 -Subject: PATCH 2/2 Fix fails in IPA prefetch (src-openEuler/gcc: I96ID7) - ---- - gcc/ipa-prefetch.cc | 28 ++++++++++++++++++++++++++-- - 1 file changed, 26 insertions(+), 2 deletions(-) - -diff --git a/gcc/ipa-prefetch.cc b/gcc/ipa-prefetch.cc -index 9537e4835..1ceb5137f 100644 ---- a/gcc/ipa-prefetch.cc -+++ b/gcc/ipa-prefetch.cc -@@ -366,6 +366,7 @@ typedef std::map<memref_t *, memref_t *> memref_map; - typedef std::map<memref_t *, tree> memref_tree_map; - - typedef std::set<gimple *> stmt_set; -+typedef std::set<tree> tree_set; - typedef std::map<tree, tree> tree_map; - - tree_memref_map *tm_map; -@@ -1124,8 +1125,21 @@ analyse_loops () - } - } - -+/* Compare memrefs by IDs; helper for qsort. */ -+ -+static int -+memref_id_cmp (const void *p1, const void *p2) -+{ -+ const memref_t *mr1 = *(const memref_t **) p1; -+ const memref_t *mr2 = *(const memref_t **) p2; -+ -+ if ((unsigned) mr1->mr_id > (unsigned) mr2->mr_id) -+ return 1; -+ return -1; -+} -+ - /* Reduce the set filtering out memrefs with the same memory references, -- return the result vector of memrefs. */ -+ sort and return the result vector of memrefs. */ - - static void - reduce_memref_set (memref_set *set, vec<memref_t *> &vec) -@@ -1162,6 +1176,7 @@ reduce_memref_set (memref_set *set, vec<memref_t *> &vec) - vec.safe_push (mr1); - } - } -+ vec.qsort (memref_id_cmp); - if (dump_file) - { - fprintf (dump_file, "MRs (%d) after filtering: ", vec.length ()); -@@ -1663,10 +1678,15 @@ optimize_function (cgraph_node *n, function *fn) - } - - /* Create other new vars. Insert new stmts. 
*/ -+ vec<memref_t *> used_mr_vec = vNULL; - for (memref_set::const_iterator it = used_mrs.begin (); - it != used_mrs.end (); it++) -+ used_mr_vec.safe_push (*it); -+ used_mr_vec.qsort (memref_id_cmp); -+ -+ for (unsigned int j = 0; j < used_mr_vec.length (); j++) - { -- memref_t *mr = *it; -+ memref_t *mr = used_mr_vecj; - if (mr == comp_mr) - continue; - gimple *last_stmt = gimple_copy_and_remap_memref_stmts (mr, stmts, 0, -@@ -1702,6 +1722,7 @@ optimize_function (cgraph_node *n, function *fn) - local = integer_three_node; - break; - } -+ tree_set prefetched_addrs; - for (unsigned int j = 0; j < vmrs.length (); j++) - { - memref_t *mr = vmrsj; -@@ -1714,10 +1735,13 @@ optimize_function (cgraph_node *n, function *fn) - tree addr = get_mem_ref_address_ssa_name (mr->mem, NULL_TREE); - if (decl_map->count (addr)) - addr = (*decl_map)addr; -+ if (prefetched_addrs.count (addr)) -+ continue; - last_stmt = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), - 3, addr, write_p, local); - pcalls.safe_push (last_stmt); - gimple_seq_add_stmt (&stmts, last_stmt); -+ prefetched_addrs.insert (addr); - if (dump_file) - { - fprintf (dump_file, "Insert %d prefetch stmt:\n", j); --- -2.33.0 -
View file
_service:tar_scm:0053-struct-reorg-Add-Semi-Relayout.patch
Deleted
@@ -1,1366 +0,0 @@ -From c2a0dcc565e0f6274f26644bd389337db8f2940c Mon Sep 17 00:00:00 2001 -From: tiancheng-bao <baotiancheng1@huawei.com> -Date: Sat, 30 Mar 2024 11:04:23 +0800 -Subject: PATCH struct-reorg Add Semi Relayout - ---- - gcc/common.opt | 6 +- - gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 975 +++++++++++++++++- - gcc/ipa-struct-reorg/ipa-struct-reorg.h | 8 + - gcc/params.opt | 5 + - .../gcc.dg/struct/semi_relayout_rewrite.c | 86 ++ - gcc/testsuite/gcc.dg/struct/struct-reorg.exp | 4 + - 6 files changed, 1040 insertions(+), 44 deletions(-) - create mode 100644 gcc/testsuite/gcc.dg/struct/semi_relayout_rewrite.c - -diff --git a/gcc/common.opt b/gcc/common.opt -index 38f1e457d..9484df5ad 100644 ---- a/gcc/common.opt -+++ b/gcc/common.opt -@@ -2010,9 +2010,9 @@ Common Var(flag_ipa_struct_reorg) Init(0) Optimization - Perform structure layout optimizations. - - fipa-struct-reorg= --Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 5) ---fipa-struct-reorg=0,1,2,3,4,5 adding none, struct-reorg, reorder-fields, --dfe, safe-pointer-compression, unsafe-pointer-compression optimizations. -+Common RejectNegative Joined UInteger Var(struct_layout_optimize_level) Init(0) IntegerRange(0, 6) -+-fipa-struct-reorg=0,1,2,3,4,5,6 adding none, struct-reorg, reorder-fields, -+dfe, safe-pointer-compression, unsafe-pointer-compression, semi-relayout optimizations. 
- - fipa-vrp - Common Var(flag_ipa_vrp) Optimization -diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc -index 3922873f3..6a202b4bd 100644 ---- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc -+++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc -@@ -294,7 +294,8 @@ enum struct_layout_opt_level - STRUCT_REORDER_FIELDS = 1 << 2, - DEAD_FIELD_ELIMINATION = 1 << 3, - POINTER_COMPRESSION_SAFE = 1 << 4, -- POINTER_COMPRESSION_UNSAFE = 1 << 5 -+ POINTER_COMPRESSION_UNSAFE = 1 << 5, -+ SEMI_RELAYOUT = 1 << 6 - }; - - /* Defines the target pointer size of compressed pointer, which should be 8, -@@ -308,6 +309,7 @@ void get_base (tree &base, tree expr); - - static unsigned int current_layout_opt_level; - hash_map<tree, tree> replace_type_map; -+hash_map<tree, tree> semi_relayout_map; - - /* Return true if one of these types is created by struct-reorg. */ - -@@ -426,7 +428,9 @@ srtype::srtype (tree type) - visited (false), - pc_candidate (false), - has_legal_alloc_num (false), -- has_alloc_array (0) -+ has_alloc_array (0), -+ semi_relayout (false), -+ bucket_parts (0) - { - for (int i = 0; i < max_split; i++) - newtypei = NULL_TREE; -@@ -891,6 +895,66 @@ srfield::create_new_reorder_fields (tree newtypemax_split, - newfield0 = field; - } - -+/* Given a struct s whose fields has already reordered by size, we try to -+ combine fields less than 8 bytes together to 8 bytes. Example: -+ struct s { -+ uint64_t a, -+ uint32_t b, -+ uint32_t c, -+ uint32_t d, -+ uint16_t e, -+ uint8_t f -+ } -+ -+ We allocate memory for arrays of struct S, before semi-relayout, their -+ layout in memory is shown as below: -+ a,b,c,d,e,f,padding;a,b,c,d,e,f,padding;... -+ -+ During semi-relayout, we put a number of structs into a same region called -+ bucket. The number is determined by param realyout-bucket-capacity-level. -+ Using 1024 here as example. After semi-relayout, the layout in a bucket is -+ shown as below: -+ part1 a;a;a... -+ part2 b,c;b,c;b,c;... 
-+ part3 d,e,f,pad;d,e,f,pad;d,e,f,pad;... -+ -+ In the last bucket, if the amount of rest structs is less than the capacity -+ of a bucket, the rest of allcated memory will be wasted as padding. */ -+ -+unsigned -+srtype::calculate_bucket_size () -+{ -+ unsigned parts = 0; -+ unsigned bit_sum = 0; -+ unsigned relayout_offset = 0; -+ /* Currently, limit each 8 bytes with less than 2 fields. */ -+ unsigned curr_part_num = 0; -+ unsigned field_num = 0; -+ for (tree f = TYPE_FIELDS (newtype0); f; f = DECL_CHAIN (f)) -+ { -+ unsigned size = TYPE_PRECISION (TREE_TYPE (f)); -+ bit_sum += size; -+ field_num++; -+ if (++curr_part_num > 2 || bit_sum > 64) -+ { -+ bit_sum = size; -+ parts++; -+ relayout_offset = relayout_part_size * parts; -+ curr_part_num = 1; -+ } -+ else -+ { -+ relayout_offset = relayout_part_size * parts + (bit_sum - size) / 8; -+ } -+ new_field_offsets.put (f, relayout_offset); -+ } -+ /* Donnot relayout a struct with only one field after DFE. */ -+ if (field_num == 1) -+ return 0; -+ bucket_parts = ++parts; -+ return parts * relayout_part_size; -+} -+ - /* Create the new TYPE corresponding to THIS type. 
*/ - - bool -@@ -1001,6 +1065,15 @@ srtype::create_new_type (void) - if (pc_candidate && pc_gptr == NULL_TREE) - create_global_ptr_for_pc (); - -+ if (semi_relayout) -+ { -+ bucket_size = calculate_bucket_size (); -+ if (bucket_size == 0) -+ return false; -+ if (semi_relayout_map.get (this->newtype0) == NULL) -+ semi_relayout_map.put (this->newtype0, this->type); -+ } -+ - if (dump_file && (dump_flags & TDF_DETAILS)) - { - fprintf (dump_file, "Created %d types:\n", maxclusters); -@@ -1393,7 +1466,7 @@ public: - bool should_create = false, bool can_escape = false); - bool wholeaccess (tree expr, tree base, tree accesstype, srtype *t); - -- void check_alloc_num (gimple *stmt, srtype *type); -+ void check_alloc_num (gimple *stmt, srtype *type, bool ptrptr); - void check_definition_assign (srdecl *decl, vec<srdecl *> &worklist); - void check_definition_call (srdecl *decl, vec<srdecl *> &worklist); - void check_definition (srdecl *decl, vec<srdecl *> &); -@@ -1442,6 +1515,33 @@ public: - tree &); - basic_block create_bb_for_compress_nullptr (basic_block, tree &); - basic_block create_bb_for_decompress_nullptr (basic_block, tree, tree &); -+ -+ // Semi-relayout methods: -+ bool is_semi_relayout_candidate (tree); -+ srtype *get_semi_relayout_candidate_type (tree); -+ void check_and_prune_struct_for_semi_relayout (void); -+ tree rewrite_pointer_diff (gimple_stmt_iterator *, tree, tree, srtype *); -+ tree rewrite_pointer_plus_integer (gimple *, gimple_stmt_iterator *, tree, -+ tree, srtype *); -+ tree build_div_expr (gimple_stmt_iterator *, tree, tree); -+ tree get_true_pointer_base (gimple_stmt_iterator *, tree, srtype *); -+ tree get_real_allocated_ptr (tree, gimple_stmt_iterator *); -+ tree set_ptr_for_use (tree, gimple_stmt_iterator *); -+ void record_allocated_size (tree, gimple_stmt_iterator *, tree); -+ tree read_allocated_size (tree, gimple_stmt_iterator *); -+ gimple *create_aligned_alloc (gimple_stmt_iterator *, srtype *, tree, -+ tree &); -+ void 
create_memset_zero (tree, gimple_stmt_iterator *, tree); -+ void create_memcpy (tree, tree, tree, gimple_stmt_iterator *); -+ void create_free (tree, gimple_stmt_iterator *); -+ void copy_to_lhs (tree, tree, gimple_stmt_iterator *); -+ srtype *get_relayout_candidate_type (tree); -+ long unsigned int get_true_field_offset (srfield *, srtype *); -+ tree rewrite_address (tree, srfield *, srtype *, gimple_stmt_iterator *); -+ bool check_sr_copy (gimple *); -+ void relayout_field_copy (gimple_stmt_iterator *, gimple *, tree, tree, -+ tree&, tree &); -+ bool do_semi_relayout (gimple_stmt_iterator *, gimple *, tree &, tree &); - }; - - struct ipa_struct_relayout -@@ -4355,7 +4455,7 @@ ipa_struct_reorg::check_type_and_push (tree newdecl, srdecl *decl, - } - - void --ipa_struct_reorg::check_alloc_num (gimple *stmt, srtype *type) -+ipa_struct_reorg::check_alloc_num (gimple *stmt, srtype *type, bool ptrptr) - { - if (current_layout_opt_level >= COMPLETE_STRUCT_RELAYOUT
View file
_service:tar_scm:0054-Struct-Reorg-Bugfix-for-structure-pointer-compressio.patch
Deleted
@@ -1,28 +0,0 @@ -From 9dc3df938b9ed2c27498c8548087fee1ce930366 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?=E9=83=91=E6=99=A8=E5=8D=89?= <zhengchenhui1@huawei.com> -Date: Tue, 2 Apr 2024 11:08:30 +0800 -Subject: PATCH Struct Reorg Bugfix for structure pointer compression - ---- - gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc -index fa33f2d35..3922873f3 100644 ---- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc -+++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc -@@ -7541,9 +7541,11 @@ ipa_struct_reorg::check_and_prune_struct_for_pointer_compression (void) - if (!type->has_legal_alloc_num) - { - if (current_layout_opt_level & POINTER_COMPRESSION_UNSAFE) -+ { - if (dump_file) - fprintf (dump_file, " has unknown alloc size, but" - " in unsafe mode, so"); -+ } - else - { - if (dump_file) --- -2.33.0 -
View file
_service:tar_scm:0055-Struct-Reorg-Port-bugfixes-to-GCC-12.3.1.patch
Deleted
@@ -1,420 +0,0 @@ -From 55c547748af36ffc3f2d5ed154a91fb3fcb8431c Mon Sep 17 00:00:00 2001 -From: Mingchuan Wu <wumingchuan1992@foxmail.com> -Date: Thu, 11 Apr 2024 15:49:59 +0800 -Subject: PATCH Struct Reorg Port bugfixes to GCC 12.3.1 - -Migrated from commits in GCC10.3.1: -https://gitee.com/openeuler/gcc/commit/41af6d361a6d85ef4fce8a8438113d765596afdd -https://gitee.com/openeuler/gcc/commit/25d74b98caeaae881e374924886ee664aa1af5bc -https://gitee.com/openeuler/gcc/commit/b5a3bfe92f96cd0d2224d80ac4eaa80dab1bd6bf -https://gitee.com/openeuler/gcc/commit/708ffe6f132ee39441b66b6ab6b98847d35916b7 -https://gitee.com/openeuler/gcc/commit/e875e4e7f3716aa268ffbbf55ee199ec82b6aeba ---- - gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 97 ++++++++++--------- - gcc/testsuite/gcc.dg/struct/dfe_escape.c | 50 ++++++++++ - gcc/testsuite/gcc.dg/struct/dfe_func_ptr.c | 69 +++++++++++++ - gcc/testsuite/gcc.dg/struct/struct-reorg.exp | 2 + - gcc/testsuite/gcc.dg/struct/struct_reorg-10.c | 29 ++++++ - gcc/testsuite/gcc.dg/struct/struct_reorg-11.c | 16 +++ - gcc/testsuite/gcc.dg/struct/struct_reorg-12.c | 26 +++++ - 7 files changed, 243 insertions(+), 46 deletions(-) - create mode 100644 gcc/testsuite/gcc.dg/struct/dfe_escape.c - create mode 100644 gcc/testsuite/gcc.dg/struct/dfe_func_ptr.c - create mode 100644 gcc/testsuite/gcc.dg/struct/struct_reorg-10.c - create mode 100644 gcc/testsuite/gcc.dg/struct/struct_reorg-11.c - create mode 100644 gcc/testsuite/gcc.dg/struct/struct_reorg-12.c - -diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc -index 6a202b4bd..f03d1d875 100644 ---- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc -+++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc -@@ -466,10 +466,19 @@ srtype::has_dead_field (void) - unsigned i; - FOR_EACH_VEC_ELT (fields, i, this_field) - { -- if (!(this_field->field_access & READ_FIELD)) -- { -- may_dfe = true; -- break; -+ /* Function pointer members are not processed, because DFE -+ does not currently 
support accurate analysis of function -+ pointers, and we have not identified specific use cases. */ -+ if (!(this_field->field_access & READ_FIELD) -+ && !FUNCTION_POINTER_TYPE_P (this_field->fieldtype)) -+ { -+ /* Fields with escape risks should not be processed. */ -+ if (this_field->type == NULL -+ || (this_field->type->escapes == does_not_escape)) -+ { -+ may_dfe = true; -+ break; -+ } - } - } - return may_dfe; -@@ -1032,8 +1041,13 @@ srtype::create_new_type (void) - { - srfield *f = fieldsi; - if (current_layout_opt_level & DEAD_FIELD_ELIMINATION -- && !(f->field_access & READ_FIELD)) -- continue; -+ && !(f->field_access & READ_FIELD) -+ && !FUNCTION_POINTER_TYPE_P (f->fieldtype)) -+ { -+ /* Fields with escape risks should not be processed. */ -+ if (f->type == NULL || (f->type->escapes == does_not_escape)) -+ continue; -+ } - f->create_new_fields (newtype, newfields, newlast); - } - -@@ -3815,9 +3829,17 @@ ipa_struct_reorg::maybe_mark_or_record_other_side (tree side, tree other, - if (VOID_POINTER_P (TREE_TYPE (side)) - && TREE_CODE (side) == SSA_NAME) - { -- /* The type is other, the declaration is side. */ -- current_function->record_decl (type, side, -1, -- isptrptr (TREE_TYPE (other)) ? TREE_TYPE (other) : NULL); -+ tree inner = SSA_NAME_VAR (side); -+ if (inner) -+ { -+ srdecl *in = find_decl (inner); -+ if (in && !in->type->has_escaped ()) -+ { -+ /* The type is other, the declaration is side. */ -+ current_function->record_decl (type, side, -1, -+ isptrptr (TREE_TYPE (other)) ? TREE_TYPE (other) : NULL); -+ } -+ } - } - else - /* *_1 = &MEM(void *)&x + 8B. */ -@@ -3910,6 +3932,12 @@ ipa_struct_reorg::maybe_record_assign (cgraph_node *node, gassign *stmt) - maybe_mark_or_record_other_side (rhs, lhs, stmt); - if (TREE_CODE (lhs) == SSA_NAME) - maybe_mark_or_record_other_side (lhs, rhs, stmt); -+ -+ /* Handle missing ARRAY_REF cases. 
*/ -+ if (TREE_CODE (lhs) == ARRAY_REF) -+ mark_type_as_escape (TREE_TYPE (lhs), escape_array, stmt); -+ if (TREE_CODE (rhs) == ARRAY_REF) -+ mark_type_as_escape (TREE_TYPE (rhs), escape_array, stmt); - } - } - -@@ -5272,8 +5300,11 @@ ipa_struct_reorg::record_accesses (void) - record_function (cnode); - else - { -- tree return_type = TREE_TYPE (TREE_TYPE (cnode->decl)); -- mark_type_as_escape (return_type, escape_return, NULL); -+ if (cnode->externally_visible) -+ { -+ tree return_type = TREE_TYPE (TREE_TYPE (cnode->decl)); -+ mark_type_as_escape (return_type, escape_return, NULL); -+ } - } - - } -@@ -5889,6 +5920,7 @@ ipa_struct_reorg::rewrite_expr (tree expr, - bool escape_from_base = false; - - tree newbasemax_split; -+ memset (newbase, 0, sizeof (treemax_split)); - memset (newexpr, 0, sizeof (treemax_split)); - - if (TREE_CODE (expr) == CONSTRUCTOR) -@@ -6912,7 +6944,7 @@ create_bb_for_group_diff_ne_0 (basic_block new_bb, tree &phi, tree ptr, - } - - tree --ipa_struct_reorg::rewrite_pointer_plus_integer (gimple *stmt, -+ipa_struct_reorg::rewrite_pointer_plus_integer (gimple *stmt ATTRIBUTE_UNUSED, - gimple_stmt_iterator *gsi, - tree ptr, tree offset, - srtype *type) -@@ -7889,41 +7921,14 @@ ipa_struct_reorg::rewrite_cond (gcond *stmt, - should be removed. */ - - bool --ipa_struct_reorg::rewrite_debug (gimple *stmt, gimple_stmt_iterator *) -+ipa_struct_reorg::rewrite_debug (gimple *, gimple_stmt_iterator *) - { -- if (current_layout_opt_level >= STRUCT_REORDER_FIELDS) -- /* Delete debug gimple now. 
*/ -- return true; -- bool remove = false; -- if (gimple_debug_bind_p (stmt)) -- { -- tree var = gimple_debug_bind_get_var (stmt); -- tree newvarmax_split; -- if (rewrite_expr (var, newvar, true)) -- remove = true; -- if (gimple_debug_bind_has_value_p (stmt)) -- { -- var = gimple_debug_bind_get_value (stmt); -- if (TREE_CODE (var) == POINTER_PLUS_EXPR) -- var = TREE_OPERAND (var, 0); -- if (rewrite_expr (var, newvar, true)) -- remove = true; -- } -- } -- else if (gimple_debug_source_bind_p (stmt)) -- { -- tree var = gimple_debug_source_bind_get_var (stmt); -- tree newvarmax_split; -- if (rewrite_expr (var, newvar, true)) -- remove = true; -- var = gimple_debug_source_bind_get_value (stmt); -- if (TREE_CODE (var) == POINTER_PLUS_EXPR) -- var = TREE_OPERAND (var, 0); -- if (rewrite_expr (var, newvar, true)) -- remove = true; -- } -- -- return remove; -+ /* In debug statements, there might be some statements that have -+ been optimized out in gimple but left in debug gimple. Sometimes -+ these statements need to be analyzed to escape, but in rewrite -+ stage it shouldn't happen. It needs to care a lot to handle these -+ cases but seems useless. So now we just delete debug gimple. */ -+ return true; - } - - /* Rewrite PHI nodes, return true if the PHI was replaced. */ -diff --git a/gcc/testsuite/gcc.dg/struct/dfe_escape.c b/gcc/testsuite/gcc.dg/struct/dfe_escape.c -new file mode 100644 -index 000000000..09efe8027 ---- /dev/null -+++ b/gcc/testsuite/gcc.dg/struct/dfe_escape.c -@@ -0,0 +1,50 @@ -+/* { dg-do compile } */ -+ -+#include <stdio.h> -+#include <stdlib.h> -+ -+typedef struct arc arc_t; -+typedef struct arc *arc_p; -+ -+typedef struct network
View file
_service:tar_scm:0056-Fix-bug-that-verifying-gimple-failed-when-reorg-leve.patch
Deleted
@@ -1,27 +0,0 @@ -From fa6f80044dcebd28506e871e6e5d25e2dfd7e105 Mon Sep 17 00:00:00 2001 -From: tiancheng-bao <baotiancheng1@huawei.com> -Date: Fri, 12 Apr 2024 15:09:28 +0800 -Subject: PATCH 01/32 Fix bug that verifying gimple failed when reorg-level > - 5 - ---- - gcc/ipa-struct-reorg/ipa-struct-reorg.cc | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc -index f03d1d875..e08577c0c 100644 ---- a/gcc/ipa-struct-reorg/ipa-struct-reorg.cc -+++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.cc -@@ -7461,6 +7461,9 @@ ipa_struct_reorg::rewrite_assign (gassign *stmt, gimple_stmt_iterator *gsi) - continue; - tree lhs_expr = newlhsi ? newlhsi : lhs; - tree rhs_expr = newrhsi ? newrhsi : rhs; -+ if (!useless_type_conversion_p (TREE_TYPE (lhs_expr), -+ TREE_TYPE (rhs_expr))) -+ rhs_expr = gimplify_build1 (gsi, NOP_EXPR, TREE_TYPE (lhs_expr), rhs_expr); - gimple *newstmt = gimple_build_assign (lhs_expr, rhs_expr); - if (dump_file && (dump_flags & TDF_DETAILS)) - { --- -2.28.0.windows.1 -
View file
_service:tar_scm:0057-AutoFdo-Fix-memory-leaks-in-autofdo.patch
Deleted
@@ -1,90 +0,0 @@ -From 13e82fccba781b29e55a6e1934986514019b728d Mon Sep 17 00:00:00 2001 -From: zhenyu--zhao <zhaozhenyu17@huawei.com> -Date: Sun, 24 Mar 2024 20:42:27 +0800 -Subject: PATCH 02/32 AutoFdo Fix memory leaks in autofdo - ---- - gcc/final.cc | 22 ++++++++++++++-------- - 1 file changed, 14 insertions(+), 8 deletions(-) - -diff --git a/gcc/final.cc b/gcc/final.cc -index d4c4fa08f..af4e529bb 100644 ---- a/gcc/final.cc -+++ b/gcc/final.cc -@@ -4402,12 +4402,15 @@ get_fdo_count_quality (profile_count count) - return profile_qualitycount.quality (); - } - --static const char * -+/* If the function is not public, return the function_name/file_name for -+ disambiguation of local symbols since there could be identical function -+ names coming from identical file names. The caller needs to free memory. */ -+static char * - alias_local_functions (const char *fnname) - { - if (TREE_PUBLIC (cfun->decl)) - { -- return fnname; -+ return concat (fnname, NULL); - } - return concat (fnname, "/", lbasename (dump_base_name), NULL); - } -@@ -4457,12 +4460,13 @@ dump_direct_callee_info_to_asm (basic_block bb, gcov_type call_count) - - if (callee) - { -+ char *func_name = -+ alias_local_functions (get_fnname_from_decl (callee)); - fprintf (asm_out_file, "\t.string \"%x\"\n", - INSN_ADDRESSES (INSN_UID (insn))); - - fprintf (asm_out_file, "\t.string \"%s%s\"\n", -- ASM_FDO_CALLEE_FLAG, -- alias_local_functions (get_fnname_from_decl (callee))); -+ ASM_FDO_CALLEE_FLAG, func_name); - - fprintf (asm_out_file, - "\t.string \"" HOST_WIDE_INT_PRINT_DEC "\"\n", -@@ -4472,9 +4476,9 @@ dump_direct_callee_info_to_asm (basic_block bb, gcov_type call_count) - { - fprintf (dump_file, "call: %x --> %s \n", - INSN_ADDRESSES (INSN_UID (insn)), -- alias_local_functions -- (get_fnname_from_decl (callee))); -+ func_name); - } -+ free (func_name); - } - } - } -@@ -4547,8 +4551,9 @@ dump_bb_info_to_asm (basic_block bb, gcov_type bb_count) - static void - dump_function_info_to_asm (const char 
*fnname) - { -+ char *func_name = alias_local_functions (fnname); - fprintf (asm_out_file, "\t.string \"%s%s\"\n", -- ASM_FDO_CALLER_FLAG, alias_local_functions (fnname)); -+ ASM_FDO_CALLER_FLAG, func_name); - fprintf (asm_out_file, "\t.string \"%s%d\"\n", - ASM_FDO_CALLER_SIZE_FLAG, get_function_end_addr ()); - fprintf (asm_out_file, "\t.string \"%s%s\"\n", -@@ -4557,7 +4562,7 @@ dump_function_info_to_asm (const char *fnname) - if (dump_file) - { - fprintf (dump_file, "\n FUNC_NAME: %s\n", -- alias_local_functions (fnname)); -+ func_name); - fprintf (dump_file, " file: %s\n", - dump_base_name); - fprintf (dump_file, "profile_status: %s\n", -@@ -4567,6 +4572,7 @@ dump_function_info_to_asm (const char *fnname) - fprintf (dump_file, " function_bind: %s\n", - simple_get_function_bind ()); - } -+ free (func_name); - } - - /* Dump function profile into form AutoFDO or PGO to asm. */ --- -2.28.0.windows.1 -
View file
_service:tar_scm:0058-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ-.patch
Deleted
@@ -1,135 +0,0 @@ -From 1649f9fbbc5267de2a675336d3ac665528a03db8 Mon Sep 17 00:00:00 2001 -From: liuhongt <hongtao.liu@intel.com> -Date: Wed, 10 May 2023 15:16:58 +0800 -Subject: PATCH 03/32 x86: Add a new option -mdaz-ftz to enable FTZ and DAZ - flags in MXCSR. - - if (mdaz-ftz) - link crtfastmath.o - else if ((Ofast || ffast-math || funsafe-math-optimizations) - && !mno-daz-ftz) - link crtfastmath.o - else - Don't link crtfastmath.o - -gcc/ChangeLog: - - * config/i386/cygwin.h (ENDFILE_SPEC): Link crtfastmath.o - whenever -mdaz-ftz is specified. Don't link crtfastmath.o - when -mno-daz-ftz is specified. - * config/i386/darwin.h (ENDFILE_SPEC): Ditto. - * config/i386/gnu-user-common.h - (GNU_USER_TARGET_MATHFILE_SPEC): Ditto. - * config/i386/mingw32.h (ENDFILE_SPEC): Ditto. - * config/i386/i386.opt (mdaz-ftz): New option. - * doc/invoke.texi (x86 options): Document mftz-daz. ---- - gcc/config/i386/cygwin.h | 2 +- - gcc/config/i386/darwin.h | 4 ++-- - gcc/config/i386/gnu-user-common.h | 2 +- - gcc/config/i386/i386.opt | 4 ++++ - gcc/config/i386/mingw32.h | 2 +- - gcc/doc/invoke.texi | 11 ++++++++++- - 6 files changed, 19 insertions(+), 6 deletions(-) - -diff --git a/gcc/config/i386/cygwin.h b/gcc/config/i386/cygwin.h -index d06eda369..5412c5d44 100644 ---- a/gcc/config/i386/cygwin.h -+++ b/gcc/config/i386/cygwin.h -@@ -57,7 +57,7 @@ along with GCC; see the file COPYING3. If not see - - #undef ENDFILE_SPEC - #define ENDFILE_SPEC \ -- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}\ -+ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \ - %{!shared:%:if-exists(default-manifest.o%s)}\ - %{fvtable-verify=none:%s; \ - fvtable-verify=preinit:vtv_end.o%s; \ -diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h -index a55f6b2b8..2f773924d 100644 ---- a/gcc/config/i386/darwin.h -+++ b/gcc/config/i386/darwin.h -@@ -109,8 +109,8 @@ along with GCC; see the file COPYING3. 
If not see - "%{!force_cpusubtype_ALL:-force_cpusubtype_ALL} " - - #undef ENDFILE_SPEC --#define ENDFILE_SPEC \ -- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \ -+#define ENDFILE_SPEC -+\ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \ - %{mpc32:crtprec32.o%s} \ - %{mpc64:crtprec64.o%s} \ - %{mpc80:crtprec80.o%s}" TM_DESTRUCTOR -diff --git a/gcc/config/i386/gnu-user-common.h b/gcc/config/i386/gnu-user-common.h -index 23b54c5be..3d2a33f17 100644 ---- a/gcc/config/i386/gnu-user-common.h -+++ b/gcc/config/i386/gnu-user-common.h -@@ -47,7 +47,7 @@ along with GCC; see the file COPYING3. If not see - - /* Similar to standard GNU userspace, but adding -ffast-math support. */ - #define GNU_USER_TARGET_MATHFILE_SPEC \ -- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \ -+ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \ - %{mpc32:crtprec32.o%s} \ - %{mpc64:crtprec64.o%s} \ - %{mpc80:crtprec80.o%s}" -diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt -index fc1b944ac..498fb454d 100644 ---- a/gcc/config/i386/i386.opt -+++ b/gcc/config/i386/i386.opt -@@ -420,6 +420,10 @@ mpc80 - Target RejectNegative - Set 80387 floating-point precision to 80-bit. - -+mdaz-ftz -+Target -+Set the FTZ and DAZ Flags. -+ - mpreferred-stack-boundary= - Target RejectNegative Joined UInteger Var(ix86_preferred_stack_boundary_arg) - Attempt to keep stack aligned to this power of 2. -diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h -index d3ca0cd02..ddbe6a405 100644 ---- a/gcc/config/i386/mingw32.h -+++ b/gcc/config/i386/mingw32.h -@@ -197,7 +197,7 @@ along with GCC; see the file COPYING3. 
If not see - - #undef ENDFILE_SPEC - #define ENDFILE_SPEC \ -- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \ -+ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \ - %{!shared:%:if-exists(default-manifest.o%s)}\ - %{fvtable-verify=none:%s; \ - fvtable-verify=preinit:vtv_end.o%s; \ -diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi -index 2b376e0e9..3a48655e5 100644 ---- a/gcc/doc/invoke.texi -+++ b/gcc/doc/invoke.texi -@@ -1437,7 +1437,7 @@ See RS/6000 and PowerPC Options. - -m96bit-long-double -mlong-double-64 -mlong-double-80 -mlong-double-128 @gol - -mregparm=@var{num} -msseregparm @gol - -mveclibabi=@var{type} -mvect8-ret-in-mem @gol ---mpc32 -mpc64 -mpc80 -mstackrealign @gol -+-mpc32 -mpc64 -mpc80 -mdaz-ftz -mstackrealign @gol - -momit-leaf-frame-pointer -mno-red-zone -mno-tls-direct-seg-refs @gol - -mcmodel=@var{code-model} -mabi=@var{name} -maddress-mode=@var{mode} @gol - -m32 -m64 -mx32 -m16 -miamcu -mlarge-data-threshold=@var{num} @gol -@@ -32122,6 +32122,15 @@ are enabled by default; routines in such libraries could suffer significant - loss of accuracy, typically through so-called ``catastrophic cancellation'', - when this option is used to set the precision to less than extended precision. - -+@item -mdaz-ftz -+@opindex mdaz-ftz -+ -+The flush-to-zero (FTZ) and denormals-are-zero (DAZ) flags in the MXCSR register -+are used to control floating-point calculations.SSE and AVX instructions -+including scalar and vector instructions could benefit from enabling the FTZ -+and DAZ flags when @option{-mdaz-ftz} is specified. Don't set FTZ/DAZ flags -+when @option{-mno-daz-ftz} is specified. -+ - @item -mstackrealign - @opindex mstackrealign - Realign the stack at entry. On the x86, the @option{-mstackrealign} --- -2.28.0.windows.1 -
View file
_service:tar_scm:0059-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch
Deleted
@@ -1,65 +0,0 @@ -From e70fa730dcfcb3a7b1d56a2e166752d4299f0504 Mon Sep 17 00:00:00 2001 -From: liuhongt <hongtao.liu@intel.com> -Date: Mon, 5 Jun 2023 12:38:41 +0800 -Subject: PATCH 04/32 Explicitly view_convert_expr mask to signed type when - folding pblendvb builtins. - -Since mask < 0 will be always false for vector char when --funsigned-char, but vpblendvb needs to check the most significant -bit. The patch explicitly VCE to vector signed char. - -gcc/ChangeLog: - - PR target/110108 - * config/i386/i386.cc (ix86_gimple_fold_builtin): Explicitly - view_convert_expr mask to signed type when folding pblendvb - builtins. - -gcc/testsuite/ChangeLog: - - * gcc.target/i386/pr110108-2.c: New test. ---- - gcc/config/i386/i386.cc | 4 +++- - gcc/testsuite/gcc.target/i386/pr110108-2.c | 14 ++++++++++++++ - 2 files changed, 17 insertions(+), 1 deletion(-) - create mode 100644 gcc/testsuite/gcc.target/i386/pr110108-2.c - -diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc -index 462dce10e..479fc6010 100644 ---- a/gcc/config/i386/i386.cc -+++ b/gcc/config/i386/i386.cc -@@ -18396,8 +18396,10 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) - tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode - ? 
intSI_type_node : intDI_type_node; - type = get_same_sized_vectype (itype, type); -- arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2); - } -+ else -+ type = signed_type_for (type); -+ arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2); - tree zero_vec = build_zero_cst (type); - tree cmp_type = truth_type_for (type); - tree cmp = gimple_build (&stmts, LT_EXPR, cmp_type, arg2, zero_vec); -diff --git a/gcc/testsuite/gcc.target/i386/pr110108-2.c b/gcc/testsuite/gcc.target/i386/pr110108-2.c -new file mode 100644 -index 000000000..2d1d2fd49 ---- /dev/null -+++ b/gcc/testsuite/gcc.target/i386/pr110108-2.c -@@ -0,0 +1,14 @@ -+/* { dg-do compile } */ -+/* { dg-options "-mavx2 -O2 -funsigned-char" } */ -+/* { dg-final { scan-assembler-times "vpblendvb" 2 } } */ -+ -+#include <immintrin.h> -+__m128i do_stuff_128(__m128i X0, __m128i X1, __m128i X2) { -+ __m128i Result = _mm_blendv_epi8(X0, X1, X2); -+ return Result; -+} -+ -+__m256i do_stuff_256(__m256i X0, __m256i X1, __m256i X2) { -+ __m256i Result = _mm256_blendv_epi8(X0, X1, X2); -+ return Result; -+} --- -2.28.0.windows.1 -
View file
_service:tar_scm:0060-Make-option-mvzeroupper-independent-of-optimization-.patch
Deleted
@@ -1,138 +0,0 @@ -From 48715f03ad08f185153bfb0ff4c0802ab2d9579c Mon Sep 17 00:00:00 2001 -From: liuhongt <hongtao.liu@intel.com> -Date: Mon, 26 Jun 2023 09:50:25 +0800 -Subject: PATCH 05/32 Make option mvzeroupper independent of optimization - level. - -pass_insert_vzeroupper is under condition - -TARGET_AVX && TARGET_VZEROUPPER -&& flag_expensive_optimizations && !optimize_size - -But the document of mvzeroupper doesn't mention the insertion -required -O2 and above, it may confuse users when they explicitly -use -Os -mvzeroupper. - ------------- -mvzeroupper -Target Mask(VZEROUPPER) Save -Generate vzeroupper instruction before a transfer of control flow out of -the function. ------------- - -The patch moves flag_expensive_optimizations && !optimize_size to -ix86_option_override_internal. It makes -mvzeroupper independent of -optimization level, but still keeps the behavior of architecture -tuning(emit_vzeroupper) unchanged. - -gcc/ChangeLog: - - * config/i386/i386-features.cc (pass_insert_vzeroupper:gate): - Move flag_expensive_optimizations && !optimize_size to .. - * config/i386/i386-options.cc (ix86_option_override_internal): - .. this, it makes -mvzeroupper independent of optimization - level, but still keeps the behavior of architecture - tuning(emit_vzeroupper) unchanged. - -gcc/testsuite/ChangeLog: - - * gcc.target/i386/avx-vzeroupper-29.c: New testcase. - * gcc.target/i386/avx-vzeroupper-12.c: Adjust testcase. - * gcc.target/i386/avx-vzeroupper-7.c: Ditto. - * gcc.target/i386/avx-vzeroupper-9.c: Ditto. 
---- - gcc/config/i386/i386-features.cc | 3 +-- - gcc/config/i386/i386-options.cc | 4 +++- - gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c | 3 ++- - gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c | 14 ++++++++++++++ - gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c | 3 ++- - gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c | 3 ++- - 6 files changed, 24 insertions(+), 6 deletions(-) - create mode 100644 gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c - -diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc -index 6fe41c3c2..6a2444eb6 100644 ---- a/gcc/config/i386/i386-features.cc -+++ b/gcc/config/i386/i386-features.cc -@@ -1875,8 +1875,7 @@ public: - /* opt_pass methods: */ - virtual bool gate (function *) - { -- return TARGET_AVX && TARGET_VZEROUPPER -- && flag_expensive_optimizations && !optimize_size; -+ return TARGET_AVX && TARGET_VZEROUPPER; - } - - virtual unsigned int execute (function *) -diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc -index ff44ad4e0..74e969b68 100644 ---- a/gcc/config/i386/i386-options.cc -+++ b/gcc/config/i386/i386-options.cc -@@ -2702,7 +2702,9 @@ ix86_option_override_internal (bool main_args_p, - sorry ("%<-mcall-ms2sysv-xlogues%> isn%'t currently supported with SEH"); - - if (!(opts_set->x_target_flags & MASK_VZEROUPPER) -- && TARGET_EMIT_VZEROUPPER) -+ && TARGET_EMIT_VZEROUPPER -+ && flag_expensive_optimizations -+ && !optimize_size) - opts->x_target_flags |= MASK_VZEROUPPER; - if (!(opts_set->x_target_flags & MASK_STV)) - opts->x_target_flags |= MASK_STV; -diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c -index e694d4048..5a40e8783 100644 ---- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c -+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c -@@ -16,5 +16,6 @@ foo () - _mm256_zeroupper (); - } - --/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 } } */ -+/* { dg-final { 
scan-assembler-times "avx_vzeroupper" 4 { target ia32 } } } */ -+/* { dg-final { scan-assembler-times "avx_vzeroupper" 5 { target { ! ia32 } } } } */ - /* { dg-final { scan-assembler-times "\\*avx_vzeroall" 1 } } */ -diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c -new file mode 100644 -index 000000000..4af637757 ---- /dev/null -+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c -@@ -0,0 +1,14 @@ -+/* { dg-do compile } */ -+/* { dg-options "-O0 -mavx -mtune=generic -mvzeroupper -dp" } */ -+ -+#include <immintrin.h> -+ -+extern __m256 x, y; -+ -+void -+foo () -+{ -+ x = y; -+} -+ -+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */ -diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c -index ab6d68779..75fe58897 100644 ---- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c -+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c -@@ -12,4 +12,5 @@ foo () - _mm256_zeroupper (); - } - --/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */ -+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 { target ia32 } } } */ -+/* { dg-final { scan-assembler-times "avx_vzeroupper" 2 { target { ! ia32 } } } } */ -diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c -index 974e1626a..fa0a6dfca 100644 ---- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c -+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c -@@ -15,4 +15,5 @@ foo () - _mm256_zeroupper (); - } - --/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 } } */ -+/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 { target ia32 } } } */ -+/* { dg-final { scan-assembler-times "avx_vzeroupper" 5 { target { ! ia32 } } } } */ --- -2.28.0.windows.1 -
View file
_service:tar_scm:0061-i386-Sync-tune_string-with-arch_string-for-target-at.patch
Deleted
@@ -1,68 +0,0 @@ -From 8039d773354360ed8ff2f25c63843fc637eacc67 Mon Sep 17 00:00:00 2001 -From: Hongyu Wang <hongyu.wang@intel.com> -Date: Sun, 25 Jun 2023 09:50:21 +0800 -Subject: PATCH 06/32 i386: Sync tune_string with arch_string for target - attribute - -arch=* - -For function with target attribute arch=*, current logic will set its -tune to -mtune from command line so all target_clones will get same -tuning flags which would affect the performance for each clone. Override -tune with arch if tune was not explicitly specified to get proper tuning -flags for target_clones. - -gcc/ChangeLog: - - * config/i386/i386-options.cc (ix86_valid_target_attribute_tree): - Override tune_string with arch_string if tune_string is not - explicitly specified. - -gcc/testsuite/ChangeLog: - - * gcc.target/i386/mvc17.c: New test. - -(cherry picked from commit 2916278d14e9ac28c361c396a67256acbebda6e8) ---- - gcc/config/i386/i386-options.cc | 6 +++++- - gcc/testsuite/gcc.target/i386/mvc17.c | 11 +++++++++++ - 2 files changed, 16 insertions(+), 1 deletion(-) - create mode 100644 gcc/testsuite/gcc.target/i386/mvc17.c - -diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc -index 74e969b68..fb2ed942f 100644 ---- a/gcc/config/i386/i386-options.cc -+++ b/gcc/config/i386/i386-options.cc -@@ -1378,7 +1378,11 @@ ix86_valid_target_attribute_tree (tree fndecl, tree args, - if (option_stringsIX86_FUNCTION_SPECIFIC_TUNE) - opts->x_ix86_tune_string - = ggc_strdup (option_stringsIX86_FUNCTION_SPECIFIC_TUNE); -- else if (orig_tune_defaulted) -+ /* If we have explicit arch string and no tune string specified, set -+ tune_string to NULL and later it will be overriden by arch_string -+ so target clones can get proper optimization. */ -+ else if (option_stringsIX86_FUNCTION_SPECIFIC_ARCH -+ || orig_tune_defaulted) - opts->x_ix86_tune_string = NULL; - - /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. 
*/ -diff --git a/gcc/testsuite/gcc.target/i386/mvc17.c b/gcc/testsuite/gcc.target/i386/mvc17.c -new file mode 100644 -index 000000000..8b83c1aec ---- /dev/null -+++ b/gcc/testsuite/gcc.target/i386/mvc17.c -@@ -0,0 +1,11 @@ -+/* { dg-do compile } */ -+/* { dg-require-ifunc "" } */ -+/* { dg-options "-O2 -march=x86-64" } */ -+/* { dg-final { scan-assembler-times "rep mov" 1 } } */ -+ -+__attribute__((target_clones("default","arch=icelake-server"))) -+void -+foo (char *a, char *b, int size) -+{ -+ __builtin_memcpy (a, b, size & 0x7F); -+} --- -2.28.0.windows.1 -
View file
_service:tar_scm:0062-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch
Deleted
@@ -1,111 +0,0 @@ -From fbcb1a5899b1bd3964aed78ed74041121e618d36 Mon Sep 17 00:00:00 2001 -From: liuhongt <hongtao.liu@intel.com> -Date: Tue, 20 Jun 2023 15:41:00 +0800 -Subject: PATCH 07/32 Refine maskloadmn pattern with UNSPEC_MASKLOAD. - -If mem_addr points to a memory region with less than whole vector size -bytes of accessible memory and k is a mask that would prevent reading -the inaccessible bytes from mem_addr, add UNSPEC_MASKLOAD to prevent -it to be transformed to vpblendd. - -gcc/ChangeLog: - - PR target/110309 - * config/i386/sse.md (maskload<mode><avx512fmaskmodelower>): - Refine pattern with UNSPEC_MASKLOAD. - (maskload<mode><avx512fmaskmodelower>): Ditto. - (*<avx512>_load<mode>_mask): Extend mode iterator to - VI12HF_AVX512VL. - (*<avx512>_load<mode>): Ditto. - -gcc/testsuite/ChangeLog: - - * gcc.target/i386/pr110309.c: New test. ---- - gcc/config/i386/sse.md | 32 +++++++++++++----------- - gcc/testsuite/gcc.target/i386/pr110309.c | 10 ++++++++ - 2 files changed, 28 insertions(+), 14 deletions(-) - create mode 100644 gcc/testsuite/gcc.target/i386/pr110309.c - -diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md -index eb767e56c..b30e96cb1 100644 ---- a/gcc/config/i386/sse.md -+++ b/gcc/config/i386/sse.md -@@ -1411,12 +1411,12 @@ - }) - - (define_insn "*<avx512>_load<mode>_mask" -- (set (match_operand:VI12_AVX512VL 0 "register_operand" "=v") -- (vec_merge:VI12_AVX512VL -- (unspec:VI12_AVX512VL -- (match_operand:VI12_AVX512VL 1 "memory_operand" "m") -+ (set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v") -+ (vec_merge:VI12HF_AVX512VL -+ (unspec:VI12HF_AVX512VL -+ (match_operand:VI12HF_AVX512VL 1 "memory_operand" "m") - UNSPEC_MASKLOAD) -- (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "0C") -+ (match_operand:VI12HF_AVX512VL 2 "nonimm_or_0_operand" "0C") - (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk"))) - "TARGET_AVX512BW" - "vmovdqu<ssescalarsize>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" -@@ -1425,9 +1425,9 @@ - 
(set_attr "mode" "<sseinsnmode>")) - - (define_insn_and_split "*<avx512>_load<mode>" -- (set (match_operand:VI12_AVX512VL 0 "register_operand" "=v") -- (unspec:VI12_AVX512VL -- (match_operand:VI12_AVX512VL 1 "memory_operand" "m") -+ (set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v") -+ (unspec:VI12HF_AVX512VL -+ (match_operand:VI12HF_AVX512VL 1 "memory_operand" "m") - UNSPEC_MASKLOAD)) - "TARGET_AVX512BW" - "#" -@@ -25973,17 +25973,21 @@ - "TARGET_AVX") - - (define_expand "maskload<mode><avx512fmaskmodelower>" -- (set (match_operand:V48H_AVX512VL 0 "register_operand") -- (vec_merge:V48H_AVX512VL -- (match_operand:V48H_AVX512VL 1 "memory_operand") -+ (set (match_operand:V48_AVX512VL 0 "register_operand") -+ (vec_merge:V48_AVX512VL -+ (unspec:V48_AVX512VL -+ (match_operand:V48_AVX512VL 1 "memory_operand") -+ UNSPEC_MASKLOAD) - (match_dup 0) - (match_operand:<avx512fmaskmode> 2 "register_operand"))) - "TARGET_AVX512F") - - (define_expand "maskload<mode><avx512fmaskmodelower>" -- (set (match_operand:VI12_AVX512VL 0 "register_operand") -- (vec_merge:VI12_AVX512VL -- (match_operand:VI12_AVX512VL 1 "memory_operand") -+ (set (match_operand:VI12HF_AVX512VL 0 "register_operand") -+ (vec_merge:VI12HF_AVX512VL -+ (unspec:VI12HF_AVX512VL -+ (match_operand:VI12HF_AVX512VL 1 "memory_operand") -+ UNSPEC_MASKLOAD) - (match_dup 0) - (match_operand:<avx512fmaskmode> 2 "register_operand"))) - "TARGET_AVX512BW") -diff --git a/gcc/testsuite/gcc.target/i386/pr110309.c b/gcc/testsuite/gcc.target/i386/pr110309.c -new file mode 100644 -index 000000000..f6e9e9c3c ---- /dev/null -+++ b/gcc/testsuite/gcc.target/i386/pr110309.c -@@ -0,0 +1,10 @@ -+/* { dg-do compile } */ -+/* { dg-options "-O3 --param vect-partial-vector-usage=1 -march=znver4 -mprefer-vector-width=256" } */ -+/* { dg-final { scan-assembler-not {(?n)vpblendd.*ymm} } } */ -+ -+ -+void foo (int * __restrict a, int *b) -+{ -+ for (int i = 0; i < 6; ++i) -+ ai = bi + 42; -+} --- -2.28.0.windows.1 -
View file
_service:tar_scm:0063-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch
Deleted
@@ -1,126 +0,0 @@ -From 5ad28ef4010c1248b4d94396d03f863705f7b0db Mon Sep 17 00:00:00 2001 -From: liuhongt <hongtao.liu@intel.com> -Date: Mon, 26 Jun 2023 21:07:09 +0800 -Subject: PATCH 08/32 Refine maskstore patterns with UNSPEC_MASKMOV. - -Similar like r14-2070-gc79476da46728e - -If mem_addr points to a memory region with less than whole vector size -bytes of accessible memory and k is a mask that would prevent reading -the inaccessible bytes from mem_addr, add UNSPEC_MASKMOV to prevent -it to be transformed to any other whole memory access instructions. - -gcc/ChangeLog: - - PR rtl-optimization/110237 - * config/i386/sse.md (<avx512>_store<mode>_mask): Refine with - UNSPEC_MASKMOV. - (maskstore<mode><avx512fmaskmodelower): Ditto. - (*<avx512>_store<mode>_mask): New define_insn, it's renamed - from original <avx512>_store<mode>_mask. ---- - gcc/config/i386/sse.md | 69 ++++++++++++++++++++++++++++++++++-------- - 1 file changed, 57 insertions(+), 12 deletions(-) - -diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md -index b30e96cb1..3af159896 100644 ---- a/gcc/config/i386/sse.md -+++ b/gcc/config/i386/sse.md -@@ -1554,7 +1554,7 @@ - (set_attr "prefix" "evex") - (set_attr "mode" "<sseinsnmode>")) - --(define_insn "<avx512>_store<mode>_mask" -+(define_insn "*<avx512>_store<mode>_mask" - (set (match_operand:V48_AVX512VL 0 "memory_operand" "=m") - (vec_merge:V48_AVX512VL - (match_operand:V48_AVX512VL 1 "register_operand" "v") -@@ -1582,7 +1582,7 @@ - (set_attr "memory" "store") - (set_attr "mode" "<sseinsnmode>")) - --(define_insn "<avx512>_store<mode>_mask" -+(define_insn "*<avx512>_store<mode>_mask" - (set (match_operand:VI12HF_AVX512VL 0 "memory_operand" "=m") - (vec_merge:VI12HF_AVX512VL - (match_operand:VI12HF_AVX512VL 1 "register_operand" "v") -@@ -26002,21 +26002,66 @@ - "TARGET_AVX") - - (define_expand "maskstore<mode><avx512fmaskmodelower>" -- (set (match_operand:V48H_AVX512VL 0 "memory_operand") -- (vec_merge:V48H_AVX512VL -- 
(match_operand:V48H_AVX512VL 1 "register_operand") -- (match_dup 0) -- (match_operand:<avx512fmaskmode> 2 "register_operand"))) -+ (set (match_operand:V48_AVX512VL 0 "memory_operand") -+ (unspec:V48_AVX512VL -+ (match_operand:V48_AVX512VL 1 "register_operand") -+ (match_dup 0) -+ (match_operand:<avx512fmaskmode> 2 "register_operand") -+ UNSPEC_MASKMOV)) - "TARGET_AVX512F") - - (define_expand "maskstore<mode><avx512fmaskmodelower>" -- (set (match_operand:VI12_AVX512VL 0 "memory_operand") -- (vec_merge:VI12_AVX512VL -- (match_operand:VI12_AVX512VL 1 "register_operand") -- (match_dup 0) -- (match_operand:<avx512fmaskmode> 2 "register_operand"))) -+ (set (match_operand:VI12HF_AVX512VL 0 "memory_operand") -+ (unspec:VI12HF_AVX512VL -+ (match_operand:VI12HF_AVX512VL 1 "register_operand") -+ (match_dup 0) -+ (match_operand:<avx512fmaskmode> 2 "register_operand") -+ UNSPEC_MASKMOV)) - "TARGET_AVX512BW") - -+(define_insn "<avx512>_store<mode>_mask" -+ (set (match_operand:V48_AVX512VL 0 "memory_operand" "=m") -+ (unspec:V48_AVX512VL -+ (match_operand:V48_AVX512VL 1 "register_operand" "v") -+ (match_dup 0) -+ (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk") -+ UNSPEC_MASKMOV)) -+ "TARGET_AVX512F" -+{ -+ if (FLOAT_MODE_P (GET_MODE_INNER (<MODE>mode))) -+ { -+ if (misaligned_operand (operands0, <MODE>mode)) -+ return "vmovu<ssemodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}"; -+ else -+ return "vmova<ssemodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}"; -+ } -+ else -+ { -+ if (misaligned_operand (operands0, <MODE>mode)) -+ return "vmovdqu<ssescalarsize>\t{%1, %0%{%2%}|%0%{%2%}, %1}"; -+ else -+ return "vmovdqa<ssescalarsize>\t{%1, %0%{%2%}|%0%{%2%}, %1}"; -+ } -+} -+ (set_attr "type" "ssemov") -+ (set_attr "prefix" "evex") -+ (set_attr "memory" "store") -+ (set_attr "mode" "<sseinsnmode>")) -+ -+(define_insn "<avx512>_store<mode>_mask" -+ (set (match_operand:VI12HF_AVX512VL 0 "memory_operand" "=m") -+ (unspec:VI12HF_AVX512VL -+ (match_operand:VI12HF_AVX512VL 1 "register_operand" 
"v") -+ (match_dup 0) -+ (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk") -+ UNSPEC_MASKMOV)) -+ "TARGET_AVX512BW" -+ "vmovdqu<ssescalarsize>\t{%1, %0%{%2%}|%0%{%2%}, %1}" -+ (set_attr "type" "ssemov") -+ (set_attr "prefix" "evex") -+ (set_attr "memory" "store") -+ (set_attr "mode" "<sseinsnmode>")) -+ - (define_expand "cbranch<mode>4" - (set (reg:CC FLAGS_REG) - (compare:CC (match_operand:VI48_AVX 1 "register_operand") --- -2.28.0.windows.1 -
View file
_service:tar_scm:0064-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch
Deleted
@@ -1,38 +0,0 @@ -From 50757adc93ef32a97a8a1083f5d53a9c00da6ac8 Mon Sep 17 00:00:00 2001 -From: "Cui, Lili" <lili.cui@intel.com> -Date: Thu, 29 Jun 2023 03:10:35 +0000 -Subject: PATCH 09/32 x86: Update model values for Alderlake and Rocketlake. - -Update model values for Alderlake and Rocketlake according to SDM. - -gcc/ChangeLog - - * common/config/i386/cpuinfo.h (get_intel_cpu): Remove model value 0xa8 - from Rocketlake, remove model value 0xbf from Alderlake. ---- - gcc/common/config/i386/cpuinfo.h | 2 -- - 1 file changed, 2 deletions(-) - -diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h -index 0333da56b..28b2ff0b0 100644 ---- a/gcc/common/config/i386/cpuinfo.h -+++ b/gcc/common/config/i386/cpuinfo.h -@@ -435,7 +435,6 @@ get_intel_cpu (struct __processor_model *cpu_model, - cpu_model->__cpu_subtype = INTEL_COREI7_SKYLAKE; - break; - case 0xa7: -- case 0xa8: - /* Rocket Lake. */ - cpu = "rocketlake"; - CHECK___builtin_cpu_is ("corei7"); -@@ -508,7 +507,6 @@ get_intel_cpu (struct __processor_model *cpu_model, - break; - case 0x97: - case 0x9a: -- case 0xbf: - /* Alder Lake. */ - cpu = "alderlake"; - CHECK___builtin_cpu_is ("corei7"); --- -2.28.0.windows.1 -
View file
_service:tar_scm:0065-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch
Deleted
@@ -1,78 +0,0 @@ -From 60364b439a80c217174e1830e0b7507d6f4538c4 Mon Sep 17 00:00:00 2001 -From: liuhongt <hongtao.liu@intel.com> -Date: Fri, 4 Aug 2023 09:27:39 +0800 -Subject: PATCH 10/32 Workaround possible CPUID bug in Sandy Bridge. - -Don't access leaf 7 subleaf 1 unless subleaf 0 says it is -supported via EAX. - -Intel documentation says invalid subleaves return 0. We had been -relying on that behavior instead of checking the max sublef number. - -It appears that some Sandy Bridge CPUs return at least the subleaf 0 -EDX value for subleaf 1. Best guess is that this is a bug in a -microcode patch since all of the bits we're seeing set in EDX were -introduced after Sandy Bridge was originally released. - -This is causing avxvnniint16 to be incorrectly enabled with --march=native on these CPUs. - -gcc/ChangeLog: - - * common/config/i386/cpuinfo.h (get_available_features): Check - max_subleaf_level for valid subleaf before use CPUID. ---- - gcc/common/config/i386/cpuinfo.h | 29 +++++++++++++++++------------ - 1 file changed, 17 insertions(+), 12 deletions(-) - -diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h -index 28b2ff0b0..316ad3cb3 100644 ---- a/gcc/common/config/i386/cpuinfo.h -+++ b/gcc/common/config/i386/cpuinfo.h -@@ -647,7 +647,9 @@ get_available_features (struct __processor_model *cpu_model, - /* Get Advanced Features at level 7 (eax = 7, ecx = 0/1). 
*/ - if (max_cpuid_level >= 7) - { -- __cpuid_count (7, 0, eax, ebx, ecx, edx); -+ unsigned int max_subleaf_level; -+ -+ __cpuid_count (7, 0, max_subleaf_level, ebx, ecx, edx); - if (ebx & bit_BMI) - set_feature (FEATURE_BMI); - if (ebx & bit_SGX) -@@ -759,18 +761,21 @@ get_available_features (struct __processor_model *cpu_model, - set_feature (FEATURE_AVX512FP16); - } - -- __cpuid_count (7, 1, eax, ebx, ecx, edx); -- if (eax & bit_HRESET) -- set_feature (FEATURE_HRESET); -- if (avx_usable) -- { -- if (eax & bit_AVXVNNI) -- set_feature (FEATURE_AVXVNNI); -- } -- if (avx512_usable) -+ if (max_subleaf_level >= 1) - { -- if (eax & bit_AVX512BF16) -- set_feature (FEATURE_AVX512BF16); -+ __cpuid_count (7, 1, eax, ebx, ecx, edx); -+ if (eax & bit_HRESET) -+ set_feature (FEATURE_HRESET); -+ if (avx_usable) -+ { -+ if (eax & bit_AVXVNNI) -+ set_feature (FEATURE_AVXVNNI); -+ } -+ if (avx512_usable) -+ { -+ if (eax & bit_AVX512BF16) -+ set_feature (FEATURE_AVX512BF16); -+ } - } - } - --- -2.28.0.windows.1 -
View file
_service:tar_scm:0066-Software-mitigation-Disable-gather-generation-in-vec.patch
Deleted
@@ -1,220 +0,0 @@ -From cfffbec938afdc45c31db5ec282ce21ad1ba2dc7 Mon Sep 17 00:00:00 2001 -From: liuhongt <hongtao.liu@intel.com> -Date: Thu, 10 Aug 2023 11:41:39 +0800 -Subject: PATCH 11/32 Software mitigation: Disable gather generation in - vectorization for GDS affected Intel Processors. - -For more details of GDS (Gather Data Sampling), refer to -https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/gather-data-sampling.html - -After microcode update, there's performance regression. To avoid that, -the patch disables gather generation in autovectorization but uses -gather scalar emulation instead. - -gcc/ChangeLog: - - * config/i386/i386-options.cc (m_GDS): New macro. - * config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Don't - enable for m_GDS. - (X86_TUNE_USE_GATHER_4PARTS): Ditto. - (X86_TUNE_USE_GATHER): Ditto. - -gcc/testsuite/ChangeLog: - - * gcc.target/i386/avx2-gather-2.c: Adjust options to keep - gather vectorization. - * gcc.target/i386/avx2-gather-6.c: Ditto. - * gcc.target/i386/avx512f-pr88464-1.c: Ditto. - * gcc.target/i386/avx512f-pr88464-5.c: Ditto. - * gcc.target/i386/avx512vl-pr88464-1.c: Ditto. - * gcc.target/i386/avx512vl-pr88464-11.c: Ditto. - * gcc.target/i386/avx512vl-pr88464-3.c: Ditto. - * gcc.target/i386/avx512vl-pr88464-9.c: Ditto. - * gcc.target/i386/pr88531-1b.c: Ditto. - * gcc.target/i386/pr88531-1c.c: Ditto. 
- -(cherry picked from commit 3064d1f5c48cb6ce1b4133570dd08ecca8abb52d) ---- - gcc/config/i386/i386-options.cc | 5 +++++ - gcc/config/i386/x86-tune.def | 9 ++++++--- - gcc/testsuite/gcc.target/i386/avx2-gather-2.c | 2 +- - gcc/testsuite/gcc.target/i386/avx2-gather-6.c | 2 +- - gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c | 2 +- - gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c | 2 +- - gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c | 2 +- - gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c | 2 +- - gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c | 2 +- - gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c | 2 +- - gcc/testsuite/gcc.target/i386/pr88531-1b.c | 2 +- - gcc/testsuite/gcc.target/i386/pr88531-1c.c | 2 +- - 12 files changed, 21 insertions(+), 13 deletions(-) - -diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc -index fb2ed942f..9617fc162 100644 ---- a/gcc/config/i386/i386-options.cc -+++ b/gcc/config/i386/i386-options.cc -@@ -137,6 +137,11 @@ along with GCC; see the file COPYING3. If not see - #define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT_PLUS) - #define m_TREMONT (HOST_WIDE_INT_1U<<PROCESSOR_TREMONT) - #define m_INTEL (HOST_WIDE_INT_1U<<PROCESSOR_INTEL) -+/* Gather Data Sampling / CVE-2022-40982 / INTEL-SA-00828. -+ Software mitigation. */ -+#define m_GDS (m_SKYLAKE | m_SKYLAKE_AVX512 | m_CANNONLAKE \ -+ | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_CASCADELAKE \ -+ | m_TIGERLAKE | m_COOPERLAKE | m_ROCKETLAKE) - - #define m_GEODE (HOST_WIDE_INT_1U<<PROCESSOR_GEODE) - #define m_K6 (HOST_WIDE_INT_1U<<PROCESSOR_K6) -diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def -index e6b9e2125..4392709fc 100644 ---- a/gcc/config/i386/x86-tune.def -+++ b/gcc/config/i386/x86-tune.def -@@ -467,7 +467,8 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes", - /* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2 - elements. 
*/ - DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts", -- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE | m_GENERIC)) -+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE -+ | m_GENERIC | m_GDS)) - - /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2 - elements. */ -@@ -477,7 +478,8 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts", - /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4 - elements. */ - DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts", -- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE | m_GENERIC)) -+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE -+ | m_GENERIC | m_GDS)) - - /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4 - elements. */ -@@ -487,7 +489,8 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts", - /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more - elements. */ - DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather", -- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE | m_GENERIC)) -+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE -+ | m_GENERIC | m_GDS)) - - /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more - elements. 
*/ -diff --git a/gcc/testsuite/gcc.target/i386/avx2-gather-2.c b/gcc/testsuite/gcc.target/i386/avx2-gather-2.c -index ad5ef7310..978924b0f 100644 ---- a/gcc/testsuite/gcc.target/i386/avx2-gather-2.c -+++ b/gcc/testsuite/gcc.target/i386/avx2-gather-2.c -@@ -1,5 +1,5 @@ - /* { dg-do compile } */ --/* { dg-options "-O3 -fdump-tree-vect-details -march=skylake" } */ -+/* { dg-options "-O3 -fdump-tree-vect-details -march=skylake -mtune=haswell" } */ - - #include "avx2-gather-1.c" - -diff --git a/gcc/testsuite/gcc.target/i386/avx2-gather-6.c b/gcc/testsuite/gcc.target/i386/avx2-gather-6.c -index b9119581a..067b251e3 100644 ---- a/gcc/testsuite/gcc.target/i386/avx2-gather-6.c -+++ b/gcc/testsuite/gcc.target/i386/avx2-gather-6.c -@@ -1,5 +1,5 @@ - /* { dg-do compile } */ --/* { dg-options "-O3 -mavx2 -fno-common -fdump-tree-vect-details -mtune=skylake" } */ -+/* { dg-options "-O3 -mavx2 -fno-common -fdump-tree-vect-details -mtune=haswell" } */ - - #include "avx2-gather-5.c" - -diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c -index 06d21bb01..d1a229861 100644 ---- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c -+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c -@@ -1,6 +1,6 @@ - /* PR tree-optimization/88464 */ - /* { dg-do compile } */ --/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */ -+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=haswell -fdump-tree-vect-details" } */ - /* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */ - /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ - -diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c -index 462e951fd..d7b0b2b28 100644 ---- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c -+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c -@@ 
-1,6 +1,6 @@ - /* PR tree-optimization/88464 */ - /* { dg-do compile } */ --/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */ -+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=haswell -fdump-tree-vect-details" } */ - /* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */ - /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ - -diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c -index 55a28dddb..07439185e 100644 ---- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c -+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c -@@ -1,6 +1,6 @@ - /* PR tree-optimization/88464 */ - /* { dg-do compile } */ --/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=skylake-avx512 -fdump-tree-vect-details" } */ -+/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=haswell -fdump-tree-vect-details" } */ - /* { dg-final { scan-tree-dump-times "loop vectorized using 32 byte vectors" 4 "vect" } } */ - /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ - -diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c -index 969600885..3a9810827 100644 ---- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c -+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c -@@ -1,6 +1,6 @@ - /* PR tree-optimization/88464 */ - /* { dg-do compile } */ --/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=skylake-avx512 -fdump-tree-vect-details" } */ -+/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=haswell -fdump-tree-vect-details" } */ - /* { dg-final { scan-tree-dump-times "loop vectorized using 16 byte vectors" 4 "vect" } } */ - /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ - -diff --git 
a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c -index 6b0c8a859..ac669e048 100644 ---- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c -+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c -@@ -1,6 +1,6 @@ - /* PR tree-optimization/88464 */ - /* { dg-do compile } */ --/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=skylake-avx512 -fdump-tree-vect-details" } */ -+/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=haswell -fdump-tree-vect-details" } */ - /* { dg-final { scan-tree-dump-times "loop vectorized using 16 byte vectors" 4 "vect" } } */ - /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ - -diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c -index 3af568ab3..14a1083b6 100644 ---- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c -+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c -@@ -1,6 +1,6 @@ - /* PR tree-optimization/88464 */ - /* { dg-do compile } */ --/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=skylake-avx512 -fdump-tree-vect-details" } */ -+/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=haswell -fdump-tree-vect-details" } */ - /* { dg-final { scan-tree-dump-times "loop vectorized using 32 byte vectors" 4 "vect" } } */ - /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ - -diff --git a/gcc/testsuite/gcc.target/i386/pr88531-1b.c b/gcc/testsuite/gcc.target/i386/pr88531-1b.c -index 812c8a10f..e6df789de 100644 ---- a/gcc/testsuite/gcc.target/i386/pr88531-1b.c -+++ b/gcc/testsuite/gcc.target/i386/pr88531-1b.c
View file
_service:tar_scm:0067-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch
Deleted
@@ -1,187 +0,0 @@ -From c269629130cb23252da2db026ce9ed13f57f69f4 Mon Sep 17 00:00:00 2001 -From: liuhongt <hongtao.liu@intel.com> -Date: Thu, 10 Aug 2023 16:26:13 +0800 -Subject: PATCH 12/32 Support -mno-gather -mno-scatter to enable/disable - vectorization for all gather/scatter instructions - -Rename original use_gather to use_gather_8parts, Support --mtune-ctrl={,^}use_gather to set/clear tune features -use_gather_{2parts, 4parts, 8parts}. Support the new option -mgather -as alias of -mtune-ctrl=, use_gather, ^use_gather. - -Similar for use_scatter. - -gcc/ChangeLog: - - * config/i386/i386-builtins.cc - (ix86_vectorize_builtin_gather): Adjust for use_gather_8parts. - * config/i386/i386-options.cc (parse_mtune_ctrl_str): - Set/Clear tune features use_{gather,scatter}_{2parts, 4parts, - 8parts} for -mtune-crtl={,^}{use_gather,use_scatter}. - * config/i386/i386.cc (ix86_vectorize_builtin_scatter): Adjust - for use_scatter_8parts - * config/i386/i386.h (TARGET_USE_GATHER): Rename to .. - (TARGET_USE_GATHER_8PARTS): .. this. - (TARGET_USE_SCATTER): Rename to .. - (TARGET_USE_SCATTER_8PARTS): .. this. - * config/i386/x86-tune.def (X86_TUNE_USE_GATHER): Rename to - (X86_TUNE_USE_GATHER_8PARTS): .. this. - (X86_TUNE_USE_SCATTER): Rename to - (X86_TUNE_USE_SCATTER_8PARTS): .. this. - * config/i386/i386.opt: Add new options mgather, mscatter. 
- -(cherry picked from commit b2a927fb5343db363ea4361da0d6bcee227b6737) ---- - gcc/config/i386/i386-builtins.cc | 2 +- - gcc/config/i386/i386-options.cc | 54 +++++++++++++++++++++++--------- - gcc/config/i386/i386.cc | 2 +- - gcc/config/i386/i386.h | 8 ++--- - gcc/config/i386/i386.opt | 4 +++ - gcc/config/i386/x86-tune.def | 4 +-- - 6 files changed, 52 insertions(+), 22 deletions(-) - -diff --git a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc -index 050c6228a..8ed32e14f 100644 ---- a/gcc/config/i386/i386-builtins.cc -+++ b/gcc/config/i386/i386-builtins.cc -@@ -1790,7 +1790,7 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype, - ? !TARGET_USE_GATHER_2PARTS - : (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), 4u) - ? !TARGET_USE_GATHER_4PARTS -- : !TARGET_USE_GATHER))) -+ : !TARGET_USE_GATHER_8PARTS))) - return NULL_TREE; - - if ((TREE_CODE (index_type) != INTEGER_TYPE -diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc -index 9617fc162..3df1f0c41 100644 ---- a/gcc/config/i386/i386-options.cc -+++ b/gcc/config/i386/i386-options.cc -@@ -1705,20 +1705,46 @@ parse_mtune_ctrl_str (struct gcc_options *opts, bool dump) - curr_feature_string++; - clear = true; - } -- for (i = 0; i < X86_TUNE_LAST; i++) -- { -- if (!strcmp (curr_feature_string, ix86_tune_feature_namesi)) -- { -- ix86_tune_featuresi = !clear; -- if (dump) -- fprintf (stderr, "Explicitly %s feature %s\n", -- clear ? "clear" : "set", ix86_tune_feature_namesi); -- break; -- } -- } -- if (i == X86_TUNE_LAST) -- error ("unknown parameter to option %<-mtune-ctrl%>: %s", -- clear ? 
curr_feature_string - 1 : curr_feature_string); -+ -+ if (!strcmp (curr_feature_string, "use_gather")) -+ { -+ ix86_tune_featuresX86_TUNE_USE_GATHER_2PARTS = !clear; -+ ix86_tune_featuresX86_TUNE_USE_GATHER_4PARTS = !clear; -+ ix86_tune_featuresX86_TUNE_USE_GATHER_8PARTS = !clear; -+ if (dump) -+ fprintf (stderr, "Explicitly %s features use_gather_2parts," -+ " use_gather_4parts, use_gather_8parts\n", -+ clear ? "clear" : "set"); -+ -+ } -+ else if (!strcmp (curr_feature_string, "use_scatter")) -+ { -+ ix86_tune_featuresX86_TUNE_USE_SCATTER_2PARTS = !clear; -+ ix86_tune_featuresX86_TUNE_USE_SCATTER_4PARTS = !clear; -+ ix86_tune_featuresX86_TUNE_USE_SCATTER_8PARTS = !clear; -+ if (dump) -+ fprintf (stderr, "Explicitly %s features use_scatter_2parts," -+ " use_scatter_4parts, use_scatter_8parts\n", -+ clear ? "clear" : "set"); -+ } -+ else -+ { -+ for (i = 0; i < X86_TUNE_LAST; i++) -+ { -+ if (!strcmp (curr_feature_string, ix86_tune_feature_namesi)) -+ { -+ ix86_tune_featuresi = !clear; -+ if (dump) -+ fprintf (stderr, "Explicitly %s feature %s\n", -+ clear ? "clear" : "set", ix86_tune_feature_namesi); -+ break; -+ } -+ } -+ -+ if (i == X86_TUNE_LAST) -+ error ("unknown parameter to option %<-mtune-ctrl%>: %s", -+ clear ? curr_feature_string - 1 : curr_feature_string); -+ } - curr_feature_string = next_feature_string; - } - while (curr_feature_string); -diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc -index 479fc6010..e75d37023 100644 ---- a/gcc/config/i386/i386.cc -+++ b/gcc/config/i386/i386.cc -@@ -18937,7 +18937,7 @@ ix86_vectorize_builtin_scatter (const_tree vectype, - ? !TARGET_USE_SCATTER_2PARTS - : (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u) - ? 
!TARGET_USE_SCATTER_4PARTS -- : !TARGET_USE_SCATTER)) -+ : !TARGET_USE_SCATTER_8PARTS)) - return NULL_TREE; - - if ((TREE_CODE (index_type) != INTEGER_TYPE -diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h -index 688aaabd3..aaa136ba0 100644 ---- a/gcc/config/i386/i386.h -+++ b/gcc/config/i386/i386.h -@@ -403,10 +403,10 @@ extern unsigned char ix86_tune_featuresX86_TUNE_LAST; - ix86_tune_featuresX86_TUNE_USE_GATHER_4PARTS - #define TARGET_USE_SCATTER_4PARTS \ - ix86_tune_featuresX86_TUNE_USE_SCATTER_4PARTS --#define TARGET_USE_GATHER \ -- ix86_tune_featuresX86_TUNE_USE_GATHER --#define TARGET_USE_SCATTER \ -- ix86_tune_featuresX86_TUNE_USE_SCATTER -+#define TARGET_USE_GATHER_8PARTS \ -+ ix86_tune_featuresX86_TUNE_USE_GATHER_8PARTS -+#define TARGET_USE_SCATTER_8PARTS \ -+ ix86_tune_featuresX86_TUNE_USE_SCATTER_8PARTS - #define TARGET_FUSE_CMP_AND_BRANCH_32 \ - ix86_tune_featuresX86_TUNE_FUSE_CMP_AND_BRANCH_32 - #define TARGET_FUSE_CMP_AND_BRANCH_64 \ -diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt -index 498fb454d..b154110d8 100644 ---- a/gcc/config/i386/i386.opt -+++ b/gcc/config/i386/i386.opt -@@ -1222,3 +1222,7 @@ Instructions number above which STFL stall penalty can be compensated. - munroll-only-small-loops - Target Var(ix86_unroll_only_small_loops) Init(0) Save - Enable conservative small loop unrolling. -+ -+mscatter -+Target Alias(mtune-ctrl=, use_scatter, ^use_scatter) -+Enable vectorization for scatter instruction. -diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def -index 4392709fc..bdb455d20 100644 ---- a/gcc/config/i386/x86-tune.def -+++ b/gcc/config/i386/x86-tune.def -@@ -488,13 +488,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts", - - /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more - elements. 
*/ --DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather", -+DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts", - ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE - | m_GENERIC | m_GDS)) - - /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more - elements. */ --DEF_TUNE (X86_TUNE_USE_SCATTER, "use_scatter", -+DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts", - ~(m_ZNVER4)) - - /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or --- -2.28.0.windows.1 -
View file
_service:tar_scm:0068-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch
Deleted
@@ -1,129 +0,0 @@ -From 764518a35e90a3e13c469275da9c3c7002fe1982 Mon Sep 17 00:00:00 2001 -From: liuhongt <hongtao.liu@intel.com> -Date: Fri, 8 Sep 2023 09:22:43 +0800 -Subject: PATCH 13/32 Remove constraint modifier % for - fcmaddcph/fmaddcph/fcmulcph since there're not commutative. - -gcc/ChangeLog: - - PR target/111306 - PR target/111335 - * config/i386/sse.md (int_comm): New int_attr. - (fma_<complexopname>_<mode><sdc_maskz_name><round_name>): - Remove % for Complex conjugate operations since they're not - commutative. - (fma_<complexpairopname>_<mode>_pair): Ditto. - (<avx512>_<complexopname>_<mode>_mask<round_name>): Ditto. - (cmul<conj_op><mode>3): Ditto. - -gcc/testsuite/ChangeLog: - - * gcc.target/i386/pr111306.c: New test. - -(cherry picked from commit f197392a16ffb1327f1d12ff8ff05f9295e015cb) ---- - gcc/config/i386/sse.md | 16 ++++++++--- - gcc/testsuite/gcc.target/i386/pr111306.c | 36 ++++++++++++++++++++++++ - 2 files changed, 48 insertions(+), 4 deletions(-) - create mode 100644 gcc/testsuite/gcc.target/i386/pr111306.c - -diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md -index 3af159896..f25dd5f2b 100644 ---- a/gcc/config/i386/sse.md -+++ b/gcc/config/i386/sse.md -@@ -6318,6 +6318,14 @@ - (UNSPEC_COMPLEX_FMA_PAIR "fmaddc") - (UNSPEC_COMPLEX_FCMA_PAIR "fcmaddc")) - -+(define_int_attr int_comm -+ (UNSPEC_COMPLEX_FMA "") -+ (UNSPEC_COMPLEX_FMA_PAIR "") -+ (UNSPEC_COMPLEX_FCMA "") -+ (UNSPEC_COMPLEX_FCMA_PAIR "") -+ (UNSPEC_COMPLEX_FMUL "%") -+ (UNSPEC_COMPLEX_FCMUL "")) -+ - (define_int_attr conj_op - (UNSPEC_COMPLEX_FMA "") - (UNSPEC_COMPLEX_FCMA "_conj") -@@ -6431,7 +6439,7 @@ - (define_insn "fma_<complexopname>_<mode><sdc_maskz_name><round_name>" - (set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v") - (unspec:VF_AVX512FP16VL -- (match_operand:VF_AVX512FP16VL 1 "<round_nimm_predicate>" "%v") -+ (match_operand:VF_AVX512FP16VL 1 "<round_nimm_predicate>" "<int_comm>v") - (match_operand:VF_AVX512FP16VL 2 "<round_nimm_predicate>" 
"<round_constraint>") - (match_operand:VF_AVX512FP16VL 3 "<round_nimm_predicate>" "0") - UNSPEC_COMPLEX_F_C_MA)) -@@ -6495,7 +6503,7 @@ - (define_insn "fma_<complexpairopname>_<mode>_pair" - (set (match_operand:VF1_AVX512VL 0 "register_operand" "=&v") - (unspec:VF1_AVX512VL -- (match_operand:VF1_AVX512VL 1 "vector_operand" "%v") -+ (match_operand:VF1_AVX512VL 1 "vector_operand" "<int_comm>v") - (match_operand:VF1_AVX512VL 2 "bcst_vector_operand" "vmBr") - (match_operand:VF1_AVX512VL 3 "vector_operand" "0") - UNSPEC_COMPLEX_F_C_MA_PAIR)) -@@ -6562,7 +6570,7 @@ - (set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v") - (vec_merge:VF_AVX512FP16VL - (unspec:VF_AVX512FP16VL -- (match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v") -+ (match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "<int_comm>v") - (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>") - (match_operand:VF_AVX512FP16VL 3 "register_operand" "0") - UNSPEC_COMPLEX_F_C_MA) -@@ -6586,7 +6594,7 @@ - (define_insn "<avx512>_<complexopname>_<mode><maskc_name><round_name>" - (set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v") - (unspec:VF_AVX512FP16VL -- (match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v") -+ (match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "<int_comm>v") - (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>") - UNSPEC_COMPLEX_F_C_MUL)) - "TARGET_AVX512FP16 && <round_mode512bit_condition>" -diff --git a/gcc/testsuite/gcc.target/i386/pr111306.c b/gcc/testsuite/gcc.target/i386/pr111306.c -new file mode 100644 -index 000000000..541725ebd ---- /dev/null -+++ b/gcc/testsuite/gcc.target/i386/pr111306.c -@@ -0,0 +1,36 @@ -+/* { dg-do run } */ -+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */ -+/* { dg-require-effective-target avx512fp16 } */ -+ -+#define AVX512FP16 -+#include "avx512f-helper.h" -+ -+__attribute__((optimize("O2"),noipa)) -+void func1(_Float16 *a, _Float16 *b, int n, _Float16 *c) { -+ 
__m512h rA = _mm512_loadu_ph(a); -+ for (int i = 0; i < n; i += 32) { -+ __m512h rB = _mm512_loadu_ph(b + i); -+ _mm512_storeu_ph(c + i, _mm512_fcmul_pch(rB, rA)); -+ } -+} -+ -+void -+test_512 (void) -+{ -+ int n = 32; -+ _Float16 an, bn, cn; -+ _Float16 expn; -+ for (int i = 1; i <= n; i++) { -+ ai - 1 = i & 1 ? -i : i; -+ bi - 1 = i; -+ } -+ -+ func1(a, b, n, c); -+ for (int i = 0; i < n / 32; i += 2) { -+ if (ci != ai * bi + ai+1 * bi+1 -+ || ci+1 != ai * bi+1 - ai+1*bi) -+ __builtin_abort (); -+ } -+} -+ -+ --- -2.28.0.windows.1 -
View file
_service:tar_scm:0069-Disparage-slightly-for-the-alternative-which-move-DF.patch
Deleted
@@ -1,106 +0,0 @@ -From afd539adfe762adb57863299a11987b7e20e7987 Mon Sep 17 00:00:00 2001 -From: liuhongt <hongtao.liu@intel.com> -Date: Wed, 5 Jul 2023 13:45:11 +0800 -Subject: PATCH 14/32 Disparage slightly for the alternative which move - DFmode between SSE_REGS and GENERAL_REGS. - -For testcase - -void __cond_swap(double* __x, double* __y) { - bool __r = (*__x < *__y); - auto __tmp = __r ? *__x : *__y; - *__y = __r ? *__y : *__x; - *__x = __tmp; -} - -GCC-14 with -O2 and -march=x86-64 options generates the following code: - -__cond_swap(double*, double*): - movsd xmm1, QWORD PTR rdi - movsd xmm0, QWORD PTR rsi - comisd xmm0, xmm1 - jbe .L2 - movq rax, xmm1 - movapd xmm1, xmm0 - movq xmm0, rax -.L2: - movsd QWORD PTR rsi, xmm1 - movsd QWORD PTR rdi, xmm0 - ret - -rax is used to save and restore DFmode value. In RA both GENERAL_REGS -and SSE_REGS cost zero since we didn't disparage the -alternative in movdf_internal pattern, according to register -allocation order, GENERAL_REGS is allocated. The patch add ? for -alternative (r,v) and (v,r) just like we did for movsf/hf/bf_internal -pattern, after that we get optimal RA. - -__cond_swap: -.LFB0: - .cfi_startproc - movsd (%rdi), %xmm1 - movsd (%rsi), %xmm0 - comisd %xmm1, %xmm0 - jbe .L2 - movapd %xmm1, %xmm2 - movapd %xmm0, %xmm1 - movapd %xmm2, %xmm0 -.L2: - movsd %xmm1, (%rsi) - movsd %xmm0, (%rdi) - ret - -gcc/ChangeLog: - - PR target/110170 - * config/i386/i386.md (movdf_internal): Disparage slightly for - 2 alternatives (r,v) and (v,r) by adding constraint modifier - '?'. - -gcc/testsuite/ChangeLog: - - * gcc.target/i386/pr110170-3.c: New test. 
- -(cherry picked from commit 37a231cc7594d12ba0822077018aad751a6fb94e) ---- - gcc/config/i386/i386.md | 4 ++-- - gcc/testsuite/gcc.target/i386/pr110170-3.c | 11 +++++++++++ - 2 files changed, 13 insertions(+), 2 deletions(-) - create mode 100644 gcc/testsuite/gcc.target/i386/pr110170-3.c - -diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md -index be07be10d..71691f598 100644 ---- a/gcc/config/i386/i386.md -+++ b/gcc/config/i386/i386.md -@@ -3582,9 +3582,9 @@ - ;; Possible store forwarding (partial memory) stall in alternatives 4, 6 and 7. - (define_insn "*movdf_internal" - (set (match_operand:DF 0 "nonimmediate_operand" -- "=Yf*f,m ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,r ,v,r ,o ,r ,m") -+ "=Yf*f,m ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,?r,?v,r ,o ,r ,m") - (match_operand:DF 1 "general_operand" -- "Yf*fm,Yf*f,G ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,v,r ,roF,rF,rmF,rC")) -+ "Yf*fm,Yf*f,G ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x, v, r,roF,rF,rmF,rC")) - "!(MEM_P (operands0) && MEM_P (operands1)) - && (lra_in_progress || reload_completed - || !CONST_DOUBLE_P (operands1) -diff --git a/gcc/testsuite/gcc.target/i386/pr110170-3.c b/gcc/testsuite/gcc.target/i386/pr110170-3.c -new file mode 100644 -index 000000000..70daa89e9 ---- /dev/null -+++ b/gcc/testsuite/gcc.target/i386/pr110170-3.c -@@ -0,0 +1,11 @@ -+/* { dg-do compile { target { ! ia32 } } } */ -+/* { dg-options "-O2 -fno-if-conversion -fno-if-conversion2" } */ -+/* { dg-final { scan-assembler-not {(?n)movq.*r} } } */ -+ -+void __cond_swap(double* __x, double* __y) { -+ _Bool __r = (*__x < *__y); -+ double __tmp = __r ? *__x : *__y; -+ *__y = __r ? *__y : *__x; -+ *__x = __tmp; -+} -+ --- -2.28.0.windows.1 -
View file
_service:tar_scm:0070-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch
Deleted
@@ -1,163 +0,0 @@ -From 88516507757932c1e67ce99d240596935971d2d0 Mon Sep 17 00:00:00 2001 -From: liuhongt <hongtao.liu@intel.com> -Date: Thu, 9 Nov 2023 13:20:05 +0800 -Subject: PATCH 15/32 Fix wrong code due to vec_merge + pcmp to blendvb - splitter. - -gcc/ChangeLog: - - PR target/112443 - * config/i386/sse.md (*avx2_pcmp<mode>3_4): Fix swap condition - from LT to GT since there's not in the pattern. - (*avx2_pcmp<mode>3_5): Ditto. - -gcc/testsuite/ChangeLog: - - * g++.target/i386/pr112443.C: New test. - -(cherry picked from commit 9a0cc04b9c9b02426762892b88efc5c44ba546bd) ---- - gcc/config/i386/sse.md | 4 +- - gcc/testsuite/g++.target/i386/pr112443.C | 108 +++++++++++++++++++++++ - 2 files changed, 110 insertions(+), 2 deletions(-) - create mode 100644 gcc/testsuite/g++.target/i386/pr112443.C - -diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md -index f25dd5f2b..23b858ab2 100644 ---- a/gcc/config/i386/sse.md -+++ b/gcc/config/i386/sse.md -@@ -16358,7 +16358,7 @@ - (match_dup 4)) - UNSPEC_BLENDV)) - { -- if (INTVAL (operands5) == 1) -+ if (INTVAL (operands5) == 5) - std::swap (operands1, operands2); - operands3 = gen_lowpart (<MODE>mode, operands3); - }) -@@ -16388,7 +16388,7 @@ - (match_dup 4)) - UNSPEC_BLENDV)) - { -- if (INTVAL (operands5) == 1) -+ if (INTVAL (operands5) == 5) - std::swap (operands1, operands2); - }) - -diff --git a/gcc/testsuite/g++.target/i386/pr112443.C b/gcc/testsuite/g++.target/i386/pr112443.C -new file mode 100644 -index 000000000..ebfa9b4a7 ---- /dev/null -+++ b/gcc/testsuite/g++.target/i386/pr112443.C -@@ -0,0 +1,108 @@ -+/* { dg-do run } */ -+/* { dg-require-effective-target avx512bw } */ -+/* { dg-require-effective-target avx512vl } */ -+/* { dg-options "-O2 -std=c++17 -mavx512bw -mavx512vl" } */ -+ -+#include <cstdint> -+#include <x86intrin.h> -+#include <functional> -+#include <ostream> -+ -+#define AVX512BW -+#define AVX512VL -+ -+#include "avx512f-helper.h" -+ -+struct TensorIteratorBase{ -+ char* in; -+ char* out; -+ 
-+ void for_each(std::function<void(char*, char*, int64_t size)> loop){ -+ loop(out, in, 32); -+ } -+}; -+ -+class Vectorized { -+protected: -+ __m256i values; -+ -+ static inline __m256i invert(const __m256i& v) { -+ const auto ones = _mm256_set1_epi64x(-1); -+ return _mm256_xor_si256(ones, v); -+ } -+public: -+ operator __m256i() const { -+ return values; -+ } -+ -+ static constexpr int size() { -+ return 32; -+ } -+ -+ Vectorized() {} -+ Vectorized(__m256i v) : values(v) {} -+ Vectorized(uint8_t v) { values = _mm256_set1_epi8(v); } -+ static Vectorized blendv(const Vectorized& a, const Vectorized& b, -+ const Vectorized& mask) { -+ return _mm256_blendv_epi8(a, b, mask); -+ } -+ static Vectorized loadu(const void* ptr) { -+ return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr)); -+ } -+ void store(void* ptr) const { -+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); -+ } -+ -+ Vectorized operator<(const Vectorized& other) const { -+ __m256i max = _mm256_max_epu8(values, other); -+ return invert(_mm256_cmpeq_epi8(max, values)); -+ } -+ Vectorized operator-(const Vectorized& b) { -+ return _mm256_sub_epi8(values, b); -+ } -+}; -+ -+std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { -+ uint8_t bufVectorized::size(); -+ vec.store(buf); -+ stream << "vec"; -+ for (int i = 0; i != Vectorized::size(); i++) { -+ if (i != 0) -+ stream << ", "; -+ stream << bufi*1; -+ } -+ stream << ""; -+ return stream; -+} -+ -+void run(TensorIteratorBase iter){ -+ Vectorized zero_vec(0); -+ Vectorized one_vec(1); -+ -+ iter.for_each(=(char* out, char* in, int64_t size) { -+ for (int64_t i = 0; i <= size - Vectorized::size(); i += Vectorized::size()) { -+ auto self_vec = Vectorized::loadu(in + i); -+ auto left = Vectorized::blendv(zero_vec, one_vec, zero_vec < self_vec); -+ auto right = Vectorized::blendv(zero_vec, one_vec, self_vec < zero_vec); -+ auto outv = left - right; -+ outv.store(out + i); -+ } -+ }); -+} -+ -+void -+test_256 (){ -+ 
char in32; -+ char out32; -+ for(auto& x: in) x = 1; -+ run(TensorIteratorBase{in, out}); -+ Vectorized::loadu (out); -+ for (int i = 0; i != 32; i++) -+ if (outi != 1) -+ __builtin_abort (); -+} -+ -+void -+test_128 () -+{ -+} --- -2.28.0.windows.1 -
View file
_service:tar_scm:0071-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch
Deleted
@@ -1,151 +0,0 @@ -From 204ffa7f503411ccac0161c951726274648b6374 Mon Sep 17 00:00:00 2001 -From: liuhongt <hongtao.liu@intel.com> -Date: Thu, 7 Dec 2023 09:17:27 +0800 -Subject: PATCH 16/32 Don't assume it's AVX_U128_CLEAN after call_insn whose - abi.mode_clobber(V4DImode) deosn't contains all SSE_REGS. - -If the function desn't clobber any sse registers or only clobber -128-bit part, then vzeroupper isn't issued before the function exit. -the status not CLEAN but ANY after the function. - -Also for sibling_call, it's safe to issue an vzeroupper. Also there -could be missing vzeroupper since there's no mode_exit for -sibling_call_p. - -gcc/ChangeLog: - - PR target/112891 - * config/i386/i386.cc (ix86_avx_u128_mode_after): Return - AVX_U128_ANY if callee_abi doesn't clobber all_sse_regs to - align with ix86_avx_u128_mode_needed. - (ix86_avx_u128_mode_needed): Return AVX_U128_ClEAN for - sibling_call. - -gcc/testsuite/ChangeLog: - - * gcc.target/i386/pr112891.c: New test. - * gcc.target/i386/pr112891-2.c: New test. - -(cherry picked from commit fc189a08f5b7ad5889bd4c6b320c1dd99dd5d642) ---- - gcc/config/i386/i386.cc | 22 +++++++++++++--- - gcc/testsuite/gcc.target/i386/pr112891-2.c | 30 ++++++++++++++++++++++ - gcc/testsuite/gcc.target/i386/pr112891.c | 29 +++++++++++++++++++++ - 3 files changed, 78 insertions(+), 3 deletions(-) - create mode 100644 gcc/testsuite/gcc.target/i386/pr112891-2.c - create mode 100644 gcc/testsuite/gcc.target/i386/pr112891.c - -diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc -index e75d37023..60f3296b0 100644 ---- a/gcc/config/i386/i386.cc -+++ b/gcc/config/i386/i386.cc -@@ -14416,8 +14416,12 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) - modes wider than 256 bits. It's only safe to issue a - vzeroupper if all SSE registers are clobbered. 
*/ - const function_abi &abi = insn_callee_abi (insn); -- if (!hard_reg_set_subset_p (reg_class_contentsSSE_REGS, -- abi.mode_clobbers (V4DImode))) -+ /* Should be safe to issue an vzeroupper before sibling_call_p. -+ Also there not mode_exit for sibling_call, so there could be -+ missing vzeroupper for that. */ -+ if (!(SIBLING_CALL_P (insn) -+ || hard_reg_set_subset_p (reg_class_contentsSSE_REGS, -+ abi.mode_clobbers (V4DImode)))) - return AVX_U128_ANY; - - return AVX_U128_CLEAN; -@@ -14555,7 +14559,19 @@ ix86_avx_u128_mode_after (int mode, rtx_insn *insn) - bool avx_upper_reg_found = false; - note_stores (insn, ix86_check_avx_upper_stores, &avx_upper_reg_found); - -- return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN; -+ if (avx_upper_reg_found) -+ return AVX_U128_DIRTY; -+ -+ /* If the function desn't clobber any sse registers or only clobber -+ 128-bit part, Then vzeroupper isn't issued before the function exit. -+ the status not CLEAN but ANY after the function. */ -+ const function_abi &abi = insn_callee_abi (insn); -+ if (!(SIBLING_CALL_P (insn) -+ || hard_reg_set_subset_p (reg_class_contentsSSE_REGS, -+ abi.mode_clobbers (V4DImode)))) -+ return AVX_U128_ANY; -+ -+ return AVX_U128_CLEAN; - } - - /* Otherwise, return current mode. 
Remember that if insn -diff --git a/gcc/testsuite/gcc.target/i386/pr112891-2.c b/gcc/testsuite/gcc.target/i386/pr112891-2.c -new file mode 100644 -index 000000000..164c3985d ---- /dev/null -+++ b/gcc/testsuite/gcc.target/i386/pr112891-2.c -@@ -0,0 +1,30 @@ -+/* { dg-do compile } */ -+/* { dg-options "-mavx2 -O3" } */ -+/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ -+ -+void -+__attribute__((noinline)) -+bar (double* a) -+{ -+ a0 = 1.0; -+ a1 = 2.0; -+} -+ -+double -+__attribute__((noinline)) -+foo (double* __restrict a, double* b) -+{ -+ a0 += b0; -+ a1 += b1; -+ a2 += b2; -+ a3 += b3; -+ bar (b); -+ return a5 + b5; -+} -+ -+double -+foo1 (double* __restrict a, double* b) -+{ -+ double c = foo (a, b); -+ return __builtin_exp (c); -+} -diff --git a/gcc/testsuite/gcc.target/i386/pr112891.c b/gcc/testsuite/gcc.target/i386/pr112891.c -new file mode 100644 -index 000000000..dbf6c6794 ---- /dev/null -+++ b/gcc/testsuite/gcc.target/i386/pr112891.c -@@ -0,0 +1,29 @@ -+/* { dg-do compile } */ -+/* { dg-options "-mavx2 -O3" } */ -+/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ -+ -+void -+__attribute__((noinline)) -+bar (double* a) -+{ -+ a0 = 1.0; -+ a1 = 2.0; -+} -+ -+void -+__attribute__((noinline)) -+foo (double* __restrict a, double* b) -+{ -+ a0 += b0; -+ a1 += b1; -+ a2 += b2; -+ a3 += b3; -+ bar (b); -+} -+ -+double -+foo1 (double* __restrict a, double* b) -+{ -+ foo (a, b); -+ return __builtin_exp (b1); -+} --- -2.28.0.windows.1 -
View file
_service:tar_scm:0072-Disable-FMADD-in-chains-for-Zen4-and-generic.patch
Deleted
@@ -1,142 +0,0 @@ -From 19ee37b11702c86d7ed271e9e1d00e23cc4ab93c Mon Sep 17 00:00:00 2001 -From: Jan Hubicka <jh@suse.cz> -Date: Fri, 29 Dec 2023 23:51:03 +0100 -Subject: PATCH 17/32 Disable FMADD in chains for Zen4 and generic - -this patch disables use of FMA in matrix multiplication loop for generic (for -x86-64-v3) and zen4. I tested this on zen4 and Xenon Gold Gold 6212U. - -For Intel this is neutral both on the matrix multiplication microbenchmark -(attached) and spec2k17 where the difference was within noise for Core. - -On core the micro-benchmark runs as follows: - -With FMA: - - 578,500,241 cycles:u # 3.645 GHz - ( +- 0.12% ) - 753,318,477 instructions:u # 1.30 insn per -cycle ( +- 0.00% ) - 125,417,701 branches:u # 790.227 M/sec - ( +- 0.00% ) - 0.159146 +- 0.000363 seconds time elapsed ( +- 0.23% ) - -No FMA: - - 577,573,960 cycles:u # 3.514 GHz - ( +- 0.15% ) - 878,318,479 instructions:u # 1.52 insn per -cycle ( +- 0.00% ) - 125,417,702 branches:u # 763.035 M/sec - ( +- 0.00% ) - 0.164734 +- 0.000321 seconds time elapsed ( +- 0.19% ) - -So the cycle count is unchanged and discrete multiply+add takes same time as -FMA. - -While on zen: - -With FMA: - 484875179 cycles:u # 3.599 GHz - ( +- 0.05% ) (82.11%) - 752031517 instructions:u # 1.55 insn per -cycle - 125106525 branches:u # 928.712 M/sec - ( +- 0.03% ) (85.09%) - 128356 branch-misses:u # 0.10% of all -branches ( +- 0.06% ) (83.58%) - -No FMA: - 375875209 cycles:u # 3.592 GHz - ( +- 0.08% ) (80.74%) - 875725341 instructions:u # 2.33 insn per -cycle - 124903825 branches:u # 1.194 G/sec - ( +- 0.04% ) (84.59%) - 0.105203 +- 0.000188 seconds time elapsed ( +- 0.18% ) - -The diffrerence is that Cores understand the fact that fmadd does not need -all three parameters to start computation, while Zen cores doesn't. - -Since this seems noticeable win on zen and not loss on Core it seems like good -default for generic. 
- -float aSIZESIZE; -float bSIZESIZE; -float cSIZESIZE; - -void init(void) -{ - int i, j, k; - for(i=0; i<SIZE; ++i) - { - for(j=0; j<SIZE; ++j) - { - aij = (float)i + j; - bij = (float)i - j; - cij = 0.0f; - } - } -} - -void mult(void) -{ - int i, j, k; - - for(i=0; i<SIZE; ++i) - { - for(j=0; j<SIZE; ++j) - { - for(k=0; k<SIZE; ++k) - { - cij += aik * bkj; - } - } - } -} - -int main(void) -{ - clock_t s, e; - - init(); - s=clock(); - mult(); - e=clock(); - printf(" mult took %10d clocks\n", (int)(e-s)); - - return 0; - -} - -gcc/ChangeLog: - - * config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS, - X86_TUNE_AVOID_256FMA_CHAINS): Enable for znver4 and Core. ---- - gcc/config/i386/x86-tune.def | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def -index bdb455d20..fd095f3ec 100644 ---- a/gcc/config/i386/x86-tune.def -+++ b/gcc/config/i386/x86-tune.def -@@ -499,12 +499,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts", - - /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or - smaller FMA chain. */ --DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3) -+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 -+ | m_ZNVER3 | m_ZNVER4 | m_GENERIC) - - /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or - smaller FMA chain. */ - DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3 -- | m_ALDERLAKE | m_SAPPHIRERAPIDS) -+ | m_ZNVER4 | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_GENERIC) - - /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or - smaller FMA chain. */ --- -2.28.0.windows.1 -
View file
_service:tar_scm:0073-Initial-Raptorlake-Support.patch
Deleted
@@ -1,47 +0,0 @@ -From 411d1f0bcc0d1c8018fdf5fe84ad2404929556ec Mon Sep 17 00:00:00 2001 -From: Haochen Jiang <haochen.jiang@intel.com> -Date: Fri, 16 Sep 2022 13:59:01 +0800 -Subject: PATCH 18/32 Initial Raptorlake Support - -gcc/ChangeLog: - - * common/config/i386/cpuinfo.h: - (get_intel_cpu): Handle Raptorlake. - * common/config/i386/i386-common.cc: - (processor_alias_table): Add Raptorlake. - -(cherry picked from commit 470a0659b508d684148f362c4dc0eccf5a83a23e) ---- - gcc/common/config/i386/cpuinfo.h | 2 ++ - gcc/common/config/i386/i386-common.cc | 2 ++ - 2 files changed, 4 insertions(+) - -diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h -index 316ad3cb3..13d0f4cd8 100644 ---- a/gcc/common/config/i386/cpuinfo.h -+++ b/gcc/common/config/i386/cpuinfo.h -@@ -508,6 +508,8 @@ get_intel_cpu (struct __processor_model *cpu_model, - case 0x97: - case 0x9a: - /* Alder Lake. */ -+ case 0xb7: -+ /* Raptor Lake. */ - cpu = "alderlake"; - CHECK___builtin_cpu_is ("corei7"); - CHECK___builtin_cpu_is ("alderlake"); -diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc -index f650e255f..c1d700f89 100644 ---- a/gcc/common/config/i386/i386-common.cc -+++ b/gcc/common/config/i386/i386-common.cc -@@ -1939,6 +1939,8 @@ const pta processor_alias_table = - M_CPU_SUBTYPE (INTEL_COREI7_SAPPHIRERAPIDS), P_PROC_AVX512F}, - {"alderlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE, - M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2}, -+ {"raptorlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE, -+ M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2}, - {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL, - M_CPU_TYPE (INTEL_BONNELL), P_PROC_SSSE3}, - {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL, --- -2.28.0.windows.1 -
View file
_service:tar_scm:0074-Initial-Meteorlake-Support.patch
Deleted
@@ -1,49 +0,0 @@ -From 87cea29ede520f4a5af01dff7071ab1d23bd47b5 Mon Sep 17 00:00:00 2001 -From: "Hu, Lin1" <lin1.hu@intel.com> -Date: Fri, 16 Sep 2022 11:25:13 +0800 -Subject: PATCH 19/32 Initial Meteorlake Support - -gcc/ChangeLog: - - * common/config/i386/cpuinfo.h: - (get_intel_cpu): Handle Meteorlake. - * common/config/i386/i386-common.cc: - (processor_alias_table): Add Meteorlake. - -(cherry picked from commit fd206f0e95fb6f41b96eaaaab1dc0c30378e5e08) ---- - gcc/common/config/i386/cpuinfo.h | 4 ++++ - gcc/common/config/i386/i386-common.cc | 2 ++ - 2 files changed, 6 insertions(+) - -diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h -index 13d0f4cd8..37af92d6b 100644 ---- a/gcc/common/config/i386/cpuinfo.h -+++ b/gcc/common/config/i386/cpuinfo.h -@@ -510,6 +510,10 @@ get_intel_cpu (struct __processor_model *cpu_model, - /* Alder Lake. */ - case 0xb7: - /* Raptor Lake. */ -+ case 0xb5: -+ case 0xaa: -+ case 0xac: -+ /* Meteor Lake. */ - cpu = "alderlake"; - CHECK___builtin_cpu_is ("corei7"); - CHECK___builtin_cpu_is ("alderlake"); -diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc -index c1d700f89..cfee672fb 100644 ---- a/gcc/common/config/i386/i386-common.cc -+++ b/gcc/common/config/i386/i386-common.cc -@@ -1941,6 +1941,8 @@ const pta processor_alias_table = - M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2}, - {"raptorlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE, - M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2}, -+ {"meteorlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE, -+ M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2}, - {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL, - M_CPU_TYPE (INTEL_BONNELL), P_PROC_SSSE3}, - {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL, --- -2.28.0.windows.1 -
View file
_service:tar_scm:0075-Support-Intel-AMX-FP16-ISA.patch
Deleted
@@ -1,691 +0,0 @@ -From c11301c7780213ddf46a0bcdb06079af485f431c Mon Sep 17 00:00:00 2001 -From: Hongyu Wang <hongyu.wang@intel.com> -Date: Fri, 4 Nov 2022 15:50:55 +0800 -Subject: PATCH 20/32 Support Intel AMX-FP16 ISA - -gcc/ChangeLog: - - * common/config/i386/cpuinfo.h (get_available_features): Detect - amx-fp16. - * common/config/i386/i386-common.cc (OPTION_MASK_ISA2_AMX_FP16_SET, - OPTION_MASK_ISA2_AMX_FP16_UNSET): New macros. - (ix86_handle_option): Handle -mamx-fp16. - * common/config/i386/i386-cpuinfo.h (enum processor_features): - Add FEATURE_AMX_FP16. - * common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY for - amx-fp16. - * config.gcc: Add amxfp16intrin.h. - * config/i386/cpuid.h (bit_AMX_FP16): New. - * config/i386/i386-c.cc (ix86_target_macros_internal): Define - __AMX_FP16__. - * config/i386/i386-isa.def: Add DEF_PTA for AMX_FP16. - * config/i386/i386-options.cc (isa2_opts): Add -mamx-fp16. - (ix86_valid_target_attribute_inner_p): Add new ATTR. - (ix86_option_override_internal): Handle AMX-FP16. - * config/i386/i386.opt: Add -mamx-fp16. - * config/i386/immintrin.h: Include amxfp16intrin.h. - * doc/extend.texi: Document -mamx-fp16. - * doc/invoke.texi: Document amx-fp16. - * doc/sourcebuild.texi: Document amx_fp16. - * config/i386/amxfp16intrin.h: New file. - -gcc/testsuite/ChangeLog: - - * g++.dg/other/i386-2.C: Add -mamx-fp16. - * g++.dg/other/i386-3.C: Ditto. - * gcc.target/i386/sse-12.c: Ditto. - * gcc.target/i386/sse-13.c: Ditto. - * gcc.target/i386/sse-14.c: Ditto. - * gcc.target/i386/sse-22.c: Ditto. - * gcc.target/i386/sse-23.c: Ditto. - * lib/target-supports.exp: (check_effective_target_amx_fp16): - New proc. - * gcc.target/i386/funcspec-56.inc: Add new target attribute. - * gcc.target/i386/amx-check.h: Add AMX_FP16. - * gcc.target/i386/amx-helper.h: New file to support amx-fp16. - * gcc.target/i386/amxfp16-asmatt-1.c: New test. - * gcc.target/i386/amxfp16-asmintel-1.c: Ditto. - * gcc.target/i386/amxfp16-dpfp16ps-2.c: Ditto. 
- -Co-authored-by: Haochen Jiang <haochen.jiang@intel.com> - -(cherry picked from commit 2b4a03962a0fe18cadc944d90f1fb85a40004226) ---- - gcc/common/config/i386/cpuinfo.h | 5 ++ - gcc/common/config/i386/i386-common.cc | 15 +++++ - gcc/common/config/i386/i386-cpuinfo.h | 1 + - gcc/common/config/i386/i386-isas.h | 1 + - gcc/config.gcc | 3 +- - gcc/config/i386/amxfp16intrin.h | 46 ++++++++++++++ - gcc/config/i386/cpuid.h | 1 + - gcc/config/i386/i386-c.cc | 2 + - gcc/config/i386/i386-isa.def | 1 + - gcc/config/i386/i386-options.cc | 4 +- - gcc/config/i386/i386.opt | 4 ++ - gcc/config/i386/immintrin.h | 2 + - gcc/doc/extend.texi | 5 ++ - gcc/doc/invoke.texi | 9 ++- - gcc/doc/sourcebuild.texi | 3 + - gcc/testsuite/g++.dg/other/i386-2.C | 2 +- - gcc/testsuite/g++.dg/other/i386-3.C | 2 +- - gcc/testsuite/gcc.target/i386/amx-check.h | 3 + - gcc/testsuite/gcc.target/i386/amx-helper.h | 61 +++++++++++++++++++ - .../gcc.target/i386/amxfp16-asmatt-1.c | 13 ++++ - .../gcc.target/i386/amxfp16-asmintel-1.c | 10 +++ - .../gcc.target/i386/amxfp16-dpfp16ps-2.c | 57 +++++++++++++++++ - gcc/testsuite/gcc.target/i386/funcspec-56.inc | 2 + - gcc/testsuite/gcc.target/i386/sse-12.c | 2 +- - gcc/testsuite/gcc.target/i386/sse-13.c | 2 +- - gcc/testsuite/gcc.target/i386/sse-14.c | 2 +- - gcc/testsuite/gcc.target/i386/sse-22.c | 4 +- - gcc/testsuite/gcc.target/i386/sse-23.c | 2 +- - gcc/testsuite/lib/target-supports.exp | 11 ++++ - 29 files changed, 262 insertions(+), 13 deletions(-) - create mode 100644 gcc/config/i386/amxfp16intrin.h - create mode 100644 gcc/testsuite/gcc.target/i386/amx-helper.h - create mode 100644 gcc/testsuite/gcc.target/i386/amxfp16-asmatt-1.c - create mode 100644 gcc/testsuite/gcc.target/i386/amxfp16-asmintel-1.c - create mode 100644 gcc/testsuite/gcc.target/i386/amxfp16-dpfp16ps-2.c - -diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h -index 37af92d6b..5951a30aa 100644 ---- a/gcc/common/config/i386/cpuinfo.h -+++ 
b/gcc/common/config/i386/cpuinfo.h -@@ -783,6 +783,11 @@ get_available_features (struct __processor_model *cpu_model, - set_feature (FEATURE_AVX512BF16); - } - } -+ if (amx_usable) -+ { -+ if (eax & bit_AMX_FP16) -+ set_feature (FEATURE_AMX_FP16); -+ } - } - - /* Get Advanced Features at level 0xd (eax = 0xd, ecx = 1). */ -diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc -index cfee672fb..922db33ee 100644 ---- a/gcc/common/config/i386/i386-common.cc -+++ b/gcc/common/config/i386/i386-common.cc -@@ -107,6 +107,7 @@ along with GCC; see the file COPYING3. If not see - #define OPTION_MASK_ISA2_AMX_TILE_SET OPTION_MASK_ISA2_AMX_TILE - #define OPTION_MASK_ISA2_AMX_INT8_SET OPTION_MASK_ISA2_AMX_INT8 - #define OPTION_MASK_ISA2_AMX_BF16_SET OPTION_MASK_ISA2_AMX_BF16 -+#define OPTION_MASK_ISA2_AMX_FP16_SET OPTION_MASK_ISA2_AMX_FP16 - - /* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same - as -msse4.2. */ -@@ -275,6 +276,7 @@ along with GCC; see the file COPYING3. If not see - #define OPTION_MASK_ISA2_KL_UNSET \ - (OPTION_MASK_ISA2_KL | OPTION_MASK_ISA2_WIDEKL_UNSET) - #define OPTION_MASK_ISA2_WIDEKL_UNSET OPTION_MASK_ISA2_WIDEKL -+#define OPTION_MASK_ISA2_AMX_FP16_UNSET OPTION_MASK_ISA2_AMX_FP16 - - /* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same - as -mno-sse4.1. 
*/ -@@ -1125,6 +1127,19 @@ ix86_handle_option (struct gcc_options *opts, - } - return true; - -+ case OPT_mamx_fp16: -+ if (value) -+ { -+ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_FP16_SET; -+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_FP16_SET; -+ } -+ else -+ { -+ opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_FP16_UNSET; -+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_FP16_UNSET; -+ } -+ return true; -+ - case OPT_mfma: - if (value) - { -diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h -index 82996ebb3..8f22897de 100644 ---- a/gcc/common/config/i386/i386-cpuinfo.h -+++ b/gcc/common/config/i386/i386-cpuinfo.h -@@ -240,6 +240,7 @@ enum processor_features - FEATURE_X86_64_V2, - FEATURE_X86_64_V3, - FEATURE_X86_64_V4, -+ FEATURE_AMX_FP16, - CPU_FEATURE_MAX - }; - -diff --git a/gcc/common/config/i386/i386-isas.h b/gcc/common/config/i386/i386-isas.h -index 2d0646a68..95bab6da2 100644 ---- a/gcc/common/config/i386/i386-isas.h -+++ b/gcc/common/config/i386/i386-isas.h -@@ -175,4 +175,5 @@ ISA_NAMES_TABLE_START - ISA_NAMES_TABLE_ENTRY("x86-64-v2", FEATURE_X86_64_V2, P_X86_64_V2, NULL) - ISA_NAMES_TABLE_ENTRY("x86-64-v3", FEATURE_X86_64_V3, P_X86_64_V3, NULL) - ISA_NAMES_TABLE_ENTRY("x86-64-v4", FEATURE_X86_64_V4, P_X86_64_V4, NULL) -+ ISA_NAMES_TABLE_ENTRY("amx-fp16", FEATURE_AMX_FP16, P_NONE, "-mamx-fp16") - ISA_NAMES_TABLE_END -diff --git a/gcc/config.gcc b/gcc/config.gcc -index 4a0ae9328..e2b4a23dc 100644 ---- a/gcc/config.gcc -+++ b/gcc/config.gcc -@@ -423,7 +423,8 @@ i3456786-*-* | x86_64-*-*) - tsxldtrkintrin.h amxtileintrin.h amxint8intrin.h - amxbf16intrin.h x86gprintrin.h uintrintrin.h - hresetintrin.h keylockerintrin.h avxvnniintrin.h -- mwaitintrin.h avx512fp16intrin.h avx512fp16vlintrin.h" -+ mwaitintrin.h avx512fp16intrin.h avx512fp16vlintrin.h -+ amxfp16intrin.h" - ;; - ia64-*-*) - extra_headers=ia64intrin.h -diff --git a/gcc/config/i386/amxfp16intrin.h 
b/gcc/config/i386/amxfp16intrin.h -new file mode 100644 -index 000000000..6a114741a ---- /dev/null -+++ b/gcc/config/i386/amxfp16intrin.h -@@ -0,0 +1,46 @@ -+/* Copyright (C) 2020 Free Software Foundation, Inc. -+ -+ This file is part of GCC. -+ -+ GCC is free software; you can redistribute it and/or modify -+ it under the terms of the GNU General Public License as published by -+ the Free Software Foundation; either version 3, or (at your option) -+ any later version. -+ -+ GCC is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
View file
_service:tar_scm:0076-Support-Intel-prefetchit0-t1.patch
Deleted
@@ -1,902 +0,0 @@ -From 42a38c8abaa28f67e26b9af3f434fe0107894e7d Mon Sep 17 00:00:00 2001 -From: Haochen Jiang <haochen.jiang@intel.com> -Date: Fri, 4 Nov 2022 15:01:05 +0800 -Subject: PATCH 21/32 Support Intel prefetchit0/t1 - -gcc/ChangeLog: - - * common/config/i386/cpuinfo.h (get_available_features): - Detect PREFETCHI. - * common/config/i386/i386-common.cc - (OPTION_MASK_ISA2_PREFETCHI_SET, - OPTION_MASK_ISA2_PREFETCHI_UNSET): New. - (ix86_handle_option): Handle -mprefetchi. - * common/config/i386/i386-cpuinfo.h - (enum processor_features): Add FEATURE_PREFETCHI. - * common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY - for prefetchi. - * config.gcc: Add prfchiintrin.h. - * config/i386/cpuid.h (bit_PREFETCHI): New. - * config/i386/i386-builtin-types.def: - Add DEF_FUNCTION_TYPE (VOID, PCVOID, INT) - and DEF_FUNCTION_TYPE (VOID, PCVOID, INT, INT, INT). - * config/i386/i386-builtin.def (BDESC): Add new builtins. - * config/i386/i386-c.cc (ix86_target_macros_internal): - Define __PREFETCHI__. - * config/i386/i386-expand.cc: Handle new builtins. - * config/i386/i386-isa.def (PREFETCHI): - Add DEF_PTA(PREFETCHI). - * config/i386/i386-options.cc - (ix86_valid_target_attribute_inner_p): Handle prefetchi. - * config/i386/i386.md (prefetchi): New define_insn. - * config/i386/i386.opt: Add option -mprefetchi. - * config/i386/predicates.md (local_func_symbolic_operand): - New predicates. - * config/i386/x86gprintrin.h: Include prfchiintrin.h. - * config/i386/xmmintrin.h (enum _mm_hint): New enum for - prefetchi. - (_mm_prefetch): Handle the highest bit of enum. - * doc/extend.texi: Document prefetchi. - * doc/invoke.texi: Document -mprefetchi. - * doc/sourcebuild.texi: Document target prefetchi. - * config/i386/prfchiintrin.h: New file. - -gcc/testsuite/ChangeLog: - - * g++.dg/other/i386-2.C: Add -mprefetchi. - * g++.dg/other/i386-3.C: Ditto. - * gcc.target/i386/avx-1.c: Ditto. - * gcc.target/i386/funcspec-56.inc: Add new target attribute. 
- * gcc.target/i386/sse-13.c: Add -mprefetchi. - * gcc.target/i386/sse-23.c: Ditto. - * gcc.target/i386/x86gprintrin-1.c: Ditto. - * gcc.target/i386/x86gprintrin-2.c: Ditto. - * gcc.target/i386/x86gprintrin-3.c: Ditto. - * gcc.target/i386/x86gprintrin-4.c: Ditto. - * gcc.target/i386/x86gprintrin-5.c: Ditto. - * gcc.target/i386/prefetchi-1.c: New test. - * gcc.target/i386/prefetchi-2.c: Ditto. - * gcc.target/i386/prefetchi-3.c: Ditto. - * gcc.target/i386/prefetchi-4.c: Ditto. - -Co-authored-by: Hongtao Liu <hongtao.liu@intel.com> ---- - gcc/common/config/i386/cpuinfo.h | 2 + - gcc/common/config/i386/i386-common.cc | 15 ++++ - gcc/common/config/i386/i386-cpuinfo.h | 1 + - gcc/common/config/i386/i386-isas.h | 1 + - gcc/config.gcc | 2 +- - gcc/config/i386/cpuid.h | 1 + - gcc/config/i386/i386-builtin-types.def | 4 + - gcc/config/i386/i386-builtin.def | 4 + - gcc/config/i386/i386-c.cc | 2 + - gcc/config/i386/i386-expand.cc | 77 +++++++++++++++++++ - gcc/config/i386/i386-isa.def | 1 + - gcc/config/i386/i386-options.cc | 4 +- - gcc/config/i386/i386.md | 23 ++++++ - gcc/config/i386/i386.opt | 4 + - gcc/config/i386/predicates.md | 15 ++++ - gcc/config/i386/prfchiintrin.h | 49 ++++++++++++ - gcc/config/i386/x86gprintrin.h | 2 + - gcc/config/i386/xmmintrin.h | 7 +- - gcc/doc/extend.texi | 5 ++ - gcc/doc/invoke.texi | 7 +- - gcc/doc/sourcebuild.texi | 3 + - gcc/testsuite/g++.dg/other/i386-2.C | 2 +- - gcc/testsuite/g++.dg/other/i386-3.C | 2 +- - gcc/testsuite/gcc.target/i386/avx-1.c | 4 +- - gcc/testsuite/gcc.target/i386/funcspec-56.inc | 2 + - gcc/testsuite/gcc.target/i386/prefetchi-1.c | 40 ++++++++++ - gcc/testsuite/gcc.target/i386/prefetchi-2.c | 26 +++++++ - gcc/testsuite/gcc.target/i386/prefetchi-3.c | 20 +++++ - gcc/testsuite/gcc.target/i386/prefetchi-4.c | 19 +++++ - gcc/testsuite/gcc.target/i386/sse-13.c | 4 +- - gcc/testsuite/gcc.target/i386/sse-23.c | 4 +- - .../gcc.target/i386/x86gprintrin-1.c | 2 +- - .../gcc.target/i386/x86gprintrin-2.c | 2 +- - 
.../gcc.target/i386/x86gprintrin-3.c | 2 +- - .../gcc.target/i386/x86gprintrin-4.c | 2 +- - .../gcc.target/i386/x86gprintrin-5.c | 2 +- - 36 files changed, 343 insertions(+), 19 deletions(-) - create mode 100644 gcc/config/i386/prfchiintrin.h - create mode 100644 gcc/testsuite/gcc.target/i386/prefetchi-1.c - create mode 100644 gcc/testsuite/gcc.target/i386/prefetchi-2.c - create mode 100644 gcc/testsuite/gcc.target/i386/prefetchi-3.c - create mode 100644 gcc/testsuite/gcc.target/i386/prefetchi-4.c - -diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h -index 5951a30aa..f17e88144 100644 ---- a/gcc/common/config/i386/cpuinfo.h -+++ b/gcc/common/config/i386/cpuinfo.h -@@ -772,6 +772,8 @@ get_available_features (struct __processor_model *cpu_model, - __cpuid_count (7, 1, eax, ebx, ecx, edx); - if (eax & bit_HRESET) - set_feature (FEATURE_HRESET); -+ if (edx & bit_PREFETCHI) -+ set_feature (FEATURE_PREFETCHI); - if (avx_usable) - { - if (eax & bit_AVXVNNI) -diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc -index 922db33ee..c8cf532cf 100644 ---- a/gcc/common/config/i386/i386-common.cc -+++ b/gcc/common/config/i386/i386-common.cc -@@ -108,6 +108,7 @@ along with GCC; see the file COPYING3. If not see - #define OPTION_MASK_ISA2_AMX_INT8_SET OPTION_MASK_ISA2_AMX_INT8 - #define OPTION_MASK_ISA2_AMX_BF16_SET OPTION_MASK_ISA2_AMX_BF16 - #define OPTION_MASK_ISA2_AMX_FP16_SET OPTION_MASK_ISA2_AMX_FP16 -+#define OPTION_MASK_ISA2_PREFETCHI_SET OPTION_MASK_ISA2_PREFETCHI - - /* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same - as -msse4.2. */ -@@ -277,6 +278,7 @@ along with GCC; see the file COPYING3. 
If not see - (OPTION_MASK_ISA2_KL | OPTION_MASK_ISA2_WIDEKL_UNSET) - #define OPTION_MASK_ISA2_WIDEKL_UNSET OPTION_MASK_ISA2_WIDEKL - #define OPTION_MASK_ISA2_AMX_FP16_UNSET OPTION_MASK_ISA2_AMX_FP16 -+#define OPTION_MASK_ISA2_PREFETCHI_UNSET OPTION_MASK_ISA2_PREFETCHI - - /* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same - as -mno-sse4.1. */ -@@ -1140,6 +1142,19 @@ ix86_handle_option (struct gcc_options *opts, - } - return true; - -+ case OPT_mprefetchi: -+ if (value) -+ { -+ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_PREFETCHI_SET; -+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_PREFETCHI_SET; -+ } -+ else -+ { -+ opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_PREFETCHI_UNSET; -+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_PREFETCHI_UNSET; -+ } -+ return true; -+ - case OPT_mfma: - if (value) - { -diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h -index 8f22897de..95b078acf 100644 ---- a/gcc/common/config/i386/i386-cpuinfo.h -+++ b/gcc/common/config/i386/i386-cpuinfo.h -@@ -241,6 +241,7 @@ enum processor_features - FEATURE_X86_64_V3, - FEATURE_X86_64_V4, - FEATURE_AMX_FP16, -+ FEATURE_PREFETCHI, - CPU_FEATURE_MAX - }; - -diff --git a/gcc/common/config/i386/i386-isas.h b/gcc/common/config/i386/i386-isas.h -index 95bab6da2..6caf06249 100644 ---- a/gcc/common/config/i386/i386-isas.h -+++ b/gcc/common/config/i386/i386-isas.h -@@ -176,4 +176,5 @@ ISA_NAMES_TABLE_START - ISA_NAMES_TABLE_ENTRY("x86-64-v3", FEATURE_X86_64_V3, P_X86_64_V3, NULL) - ISA_NAMES_TABLE_ENTRY("x86-64-v4", FEATURE_X86_64_V4, P_X86_64_V4, NULL) - ISA_NAMES_TABLE_ENTRY("amx-fp16", FEATURE_AMX_FP16, P_NONE, "-mamx-fp16") -+ ISA_NAMES_TABLE_ENTRY("prefetchi", FEATURE_PREFETCHI, P_NONE, "-mprefetchi") - ISA_NAMES_TABLE_END -diff --git a/gcc/config.gcc b/gcc/config.gcc -index e2b4a23dc..81012c651 100644 ---- a/gcc/config.gcc -+++ b/gcc/config.gcc -@@ -424,7 +424,7 @@ i3456786-*-* | x86_64-*-*) - amxbf16intrin.h x86gprintrin.h 
uintrintrin.h - hresetintrin.h keylockerintrin.h avxvnniintrin.h - mwaitintrin.h avx512fp16intrin.h avx512fp16vlintrin.h -- amxfp16intrin.h" -+ amxfp16intrin.h prfchiintrin.h" - ;; - ia64-*-*) - extra_headers=ia64intrin.h -diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h -index d6cd8d1bf..21100149a 100644 ---- a/gcc/config/i386/cpuid.h -+++ b/gcc/config/i386/cpuid.h -@@ -50,6 +50,7 @@
View file
_service:tar_scm:0077-Initial-Granite-Rapids-Support.patch
Deleted
@@ -1,277 +0,0 @@ -From 7f0f8b585cf60b4c09bca42b5339995c2cc74633 Mon Sep 17 00:00:00 2001 -From: Haochen Jiang <haochen.jiang@intel.com> -Date: Mon, 7 Nov 2022 11:04:57 +0800 -Subject: PATCH 22/32 Initial Granite Rapids Support - -gcc/ChangeLog: - - * common/config/i386/cpuinfo.h - (get_intel_cpu): Handle Granite Rapids. - * common/config/i386/i386-common.cc: - (processor_names): Add graniterapids. - (processor_alias_table): Ditto. - * common/config/i386/i386-cpuinfo.h - (enum processor_subtypes): Add INTEL_COREI7_GRANITERAPIDS. - * config.gcc: Add -march=graniterapids. - * config/i386/driver-i386.cc (host_detect_local_cpu): - Handle graniterapids. - * config/i386/i386-c.cc (ix86_target_macros_internal): - Ditto. - * config/i386/i386-options.cc (m_GRANITERAPIDS): New. - (processor_cost_table): Add graniterapids. - * config/i386/i386.h (enum processor_type): - Add PROCESSOR_GRANITERAPIDS. - (PTA_GRANITERAPIDS): Ditto. - * doc/extend.texi: Add graniterapids. - * doc/invoke.texi: Ditto. - -gcc/testsuite/ChangeLog: - - * g++.target/i386/mv16.C: Add graniterapids. - * gcc.target/i386/funcspec-56.inc: Handle new march. 
- -(cherry picked from commit 339ffc5a792dd66647392a235f2f7f6344c5359e) ---- - gcc/common/config/i386/cpuinfo.h | 9 +++++++++ - gcc/common/config/i386/i386-common.cc | 3 +++ - gcc/common/config/i386/i386-cpuinfo.h | 1 + - gcc/config.gcc | 2 +- - gcc/config/i386/driver-i386.cc | 5 ++++- - gcc/config/i386/i386-c.cc | 7 +++++++ - gcc/config/i386/i386-options.cc | 4 +++- - gcc/config/i386/i386.h | 3 +++ - gcc/doc/extend.texi | 3 +++ - gcc/doc/invoke.texi | 11 +++++++++++ - gcc/testsuite/g++.target/i386/mv16.C | 6 ++++++ - gcc/testsuite/gcc.target/i386/funcspec-56.inc | 1 + - 12 files changed, 52 insertions(+), 3 deletions(-) - -diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h -index f17e88144..1f75ff1ca 100644 ---- a/gcc/common/config/i386/cpuinfo.h -+++ b/gcc/common/config/i386/cpuinfo.h -@@ -528,6 +528,15 @@ get_intel_cpu (struct __processor_model *cpu_model, - cpu_model->__cpu_type = INTEL_COREI7; - cpu_model->__cpu_subtype = INTEL_COREI7_SAPPHIRERAPIDS; - break; -+ case 0xad: -+ case 0xae: -+ /* Granite Rapids. */ -+ cpu = "graniterapids"; -+ CHECK___builtin_cpu_is ("corei7"); -+ CHECK___builtin_cpu_is ("graniterapids"); -+ cpu_model->__cpu_type = INTEL_COREI7; -+ cpu_model->__cpu_subtype = INTEL_COREI7_GRANITERAPIDS; -+ break; - case 0x17: - case 0x1d: - /* Penryn. 
*/ -diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc -index c8cf532cf..1aa163463 100644 ---- a/gcc/common/config/i386/i386-common.cc -+++ b/gcc/common/config/i386/i386-common.cc -@@ -1855,6 +1855,7 @@ const char *const processor_names = - "sapphirerapids", - "alderlake", - "rocketlake", -+ "graniterapids", - "intel", - "geode", - "k6", -@@ -1973,6 +1974,8 @@ const pta processor_alias_table = - M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2}, - {"meteorlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE, - M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2}, -+ {"graniterapids", PROCESSOR_GRANITERAPIDS, CPU_HASWELL, PTA_GRANITERAPIDS, -+ M_CPU_SUBTYPE (INTEL_COREI7_GRANITERAPIDS), P_PROC_AVX512F}, - {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL, - M_CPU_TYPE (INTEL_BONNELL), P_PROC_SSSE3}, - {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL, -diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h -index 95b078acf..7b2d4d242 100644 ---- a/gcc/common/config/i386/i386-cpuinfo.h -+++ b/gcc/common/config/i386/i386-cpuinfo.h -@@ -92,6 +92,7 @@ enum processor_subtypes - AMDFAM19H_ZNVER3, - INTEL_COREI7_ROCKETLAKE, - AMDFAM19H_ZNVER4, -+ INTEL_COREI7_GRANITERAPIDS, - CPU_SUBTYPE_MAX - }; - -diff --git a/gcc/config.gcc b/gcc/config.gcc -index 81012c651..9bad238e3 100644 ---- a/gcc/config.gcc -+++ b/gcc/config.gcc -@@ -670,7 +670,7 @@ slm nehalem westmere sandybridge ivybridge haswell broadwell bonnell \ - silvermont knl knm skylake-avx512 cannonlake icelake-client icelake-server \ - skylake goldmont goldmont-plus tremont cascadelake tigerlake cooperlake \ - sapphirerapids alderlake rocketlake eden-x2 nano nano-1000 nano-2000 nano-3000 \ --nano-x2 eden-x4 nano-x4 x86-64 x86-64-v2 x86-64-v3 x86-64-v4 native" -+nano-x2 eden-x4 nano-x4 x86-64 x86-64-v2 x86-64-v3 x86-64-v4 graniterapids native" - - # Additional x86 processors supported by --with-cpu=. 
Each processor - # MUST be separated by exactly one space. -diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc -index 3b5161aed..ea8c3d8d1 100644 ---- a/gcc/config/i386/driver-i386.cc -+++ b/gcc/config/i386/driver-i386.cc -@@ -576,8 +576,11 @@ const char *host_detect_local_cpu (int argc, const char **argv) - /* This is unknown family 0x6 CPU. */ - if (has_feature (FEATURE_AVX)) - { -+ /* Assume Granite Rapids. */ -+ if (has_feature (FEATURE_AMX_FP16)) -+ cpu = "graniterapids"; - /* Assume Tiger Lake */ -- if (has_feature (FEATURE_AVX512VP2INTERSECT)) -+ else if (has_feature (FEATURE_AVX512VP2INTERSECT)) - cpu = "tigerlake"; - /* Assume Sapphire Rapids. */ - else if (has_feature (FEATURE_TSXLDTRK)) -diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc -index 00880bd17..04f1dd682 100644 ---- a/gcc/config/i386/i386-c.cc -+++ b/gcc/config/i386/i386-c.cc -@@ -242,6 +242,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, - def_or_undef (parse_in, "__sapphirerapids"); - def_or_undef (parse_in, "__sapphirerapids__"); - break; -+ case PROCESSOR_GRANITERAPIDS: -+ def_or_undef (parse_in, "__graniterapids"); -+ def_or_undef (parse_in, "__graniterapids__"); -+ break; - case PROCESSOR_ALDERLAKE: - def_or_undef (parse_in, "__alderlake"); - def_or_undef (parse_in, "__alderlake__"); -@@ -419,6 +423,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, - case PROCESSOR_ROCKETLAKE: - def_or_undef (parse_in, "__tune_rocketlake__"); - break; -+ case PROCESSOR_GRANITERAPIDS: -+ def_or_undef (parse_in, "__tune_graniterapids__"); -+ break; - case PROCESSOR_INTEL: - case PROCESSOR_GENERIC: - break; -diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc -index 724375f02..6645e3259 100644 ---- a/gcc/config/i386/i386-options.cc -+++ b/gcc/config/i386/i386-options.cc -@@ -127,10 +127,11 @@ along with GCC; see the file COPYING3. 
If not see - #define m_SAPPHIRERAPIDS (HOST_WIDE_INT_1U<<PROCESSOR_SAPPHIRERAPIDS) - #define m_ALDERLAKE (HOST_WIDE_INT_1U<<PROCESSOR_ALDERLAKE) - #define m_ROCKETLAKE (HOST_WIDE_INT_1U<<PROCESSOR_ROCKETLAKE) -+#define m_GRANITERAPIDS (HOST_WIDE_INT_1U<<PROCESSOR_GRANITERAPIDS) - #define m_CORE_AVX512 (m_SKYLAKE_AVX512 | m_CANNONLAKE \ - | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_CASCADELAKE \ - | m_TIGERLAKE | m_COOPERLAKE | m_SAPPHIRERAPIDS \ -- | m_ROCKETLAKE) -+ | m_ROCKETLAKE | m_GRANITERAPIDS) - #define m_CORE_AVX2 (m_HASWELL | m_SKYLAKE | m_CORE_AVX512) - #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2) - #define m_GOLDMONT (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT) -@@ -761,6 +762,7 @@ static const struct processor_costs *processor_cost_table = - &icelake_cost, - &alderlake_cost, - &icelake_cost, -+ &icelake_cost, - &intel_cost, - &geode_cost, - &k6_cost, -diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h -index aaa136ba0..75953defc 100644 ---- a/gcc/config/i386/i386.h -+++ b/gcc/config/i386/i386.h -@@ -2250,6 +2250,7 @@ enum processor_type - PROCESSOR_SAPPHIRERAPIDS, - PROCESSOR_ALDERLAKE, - PROCESSOR_ROCKETLAKE, -+ PROCESSOR_GRANITERAPIDS, - PROCESSOR_INTEL, - PROCESSOR_GEODE, - PROCESSOR_K6, -@@ -2356,6 +2357,8 @@ constexpr wide_int_bitmask PTA_ALDERLAKE = PTA_TREMONT | PTA_ADX | PTA_AVX - | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_LZCNT - | PTA_PCONFIG | PTA_PKU | PTA_VAES | PTA_VPCLMULQDQ | PTA_SERIALIZE - | PTA_HRESET | PTA_KL | PTA_WIDEKL | PTA_AVXVNNI; -+constexpr wide_int_bitmask PTA_GRANITERAPIDS = PTA_SAPPHIRERAPIDS | PTA_AMX_FP16 -+ | PTA_PREFETCHI;
View file
_service:tar_scm:0078-Support-Intel-AMX-COMPLEX.patch
Deleted
@@ -1,722 +0,0 @@ -From 4f1aff10d93cabe8dfbaf076b6d826a142efb6e1 Mon Sep 17 00:00:00 2001 -From: Haochen Jiang <haochen.jiang@intel.com> -Date: Wed, 31 May 2023 10:45:00 +0800 -Subject: PATCH 23/32 Support Intel AMX-COMPLEX - -gcc/ChangeLog: - - * common/config/i386/cpuinfo.h (get_available_features): - Detect AMX-COMPLEX. - * common/config/i386/i386-common.cc - (OPTION_MASK_ISA2_AMX_COMPLEX_SET, - OPTION_MASK_ISA2_AMX_COMPLEX_UNSET): New. - (ix86_handle_option): Handle -mamx-complex. - * common/config/i386/i386-cpuinfo.h (enum processor_features): - Add FEATURE_AMX_COMPLEX. - * common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY for - amx-complex. - * config.gcc: Add amxcomplexintrin.h. - * config/i386/cpuid.h (bit_AMX_COMPLEX): New. - * config/i386/i386-c.cc (ix86_target_macros_internal): Define - __AMX_COMPLEX__. - * config/i386/i386-isa.def (AMX_COMPLEX): Add DEF_PTA(AMX_COMPLEX). - * config/i386/i386-options.cc (ix86_valid_target_attribute_inner_p): - Handle amx-complex. - * config/i386/i386.opt: Add option -mamx-complex. - * config/i386/immintrin.h: Include amxcomplexintrin.h. - * doc/extend.texi: Document amx-complex. - * doc/invoke.texi: Document -mamx-complex. - * doc/sourcebuild.texi: Document target amx-complex. - * config/i386/amxcomplexintrin.h: New file. - -gcc/testsuite/ChangeLog: - - * g++.dg/other/i386-2.C: Add -mamx-complex. - * g++.dg/other/i386-3.C: Ditto. - * gcc.target/i386/amx-check.h: Add cpu check for AMX-COMPLEX. - * gcc.target/i386/amx-helper.h: Add amx-complex support. - * gcc.target/i386/funcspec-56.inc: Add new target attribute. - * gcc.target/i386/sse-12.c: Add -mamx-complex. - * gcc.target/i386/sse-13.c: Ditto. - * gcc.target/i386/sse-14.c: Ditto. - * gcc.target/i386/sse-22.c: Add amx-complex. - * gcc.target/i386/sse-23.c: Ditto. - * lib/target-supports.exp (check_effective_target_amx_complex): New. - * gcc.target/i386/amxcomplex-asmatt-1.c: New test. - * gcc.target/i386/amxcomplex-asmintel-1.c: Ditto. 
- * gcc.target/i386/amxcomplex-cmmimfp16ps-2.c: Ditto. - * gcc.target/i386/amxcomplex-cmmrlfp16ps-2.c: Ditto. ---- - gcc/common/config/i386/cpuinfo.h | 2 + - gcc/common/config/i386/i386-common.cc | 19 +++++- - gcc/common/config/i386/i386-cpuinfo.h | 1 + - gcc/common/config/i386/i386-isas.h | 2 + - gcc/config.gcc | 2 +- - gcc/config/i386/amxcomplexintrin.h | 59 +++++++++++++++++++ - gcc/config/i386/cpuid.h | 1 + - gcc/config/i386/i386-c.cc | 2 + - gcc/config/i386/i386-isa.def | 1 + - gcc/config/i386/i386-options.cc | 4 +- - gcc/config/i386/i386.opt | 4 ++ - gcc/config/i386/immintrin.h | 2 + - gcc/doc/extend.texi | 5 ++ - gcc/doc/invoke.texi | 7 ++- - gcc/doc/sourcebuild.texi | 3 + - gcc/testsuite/g++.dg/other/i386-2.C | 2 +- - gcc/testsuite/g++.dg/other/i386-3.C | 2 +- - gcc/testsuite/gcc.target/i386/amx-check.h | 3 + - gcc/testsuite/gcc.target/i386/amx-helper.h | 4 +- - .../gcc.target/i386/amxcomplex-asmatt-1.c | 15 +++++ - .../gcc.target/i386/amxcomplex-asmintel-1.c | 12 ++++ - .../i386/amxcomplex-cmmimfp16ps-2.c | 53 +++++++++++++++++ - .../i386/amxcomplex-cmmrlfp16ps-2.c | 53 +++++++++++++++++ - gcc/testsuite/gcc.target/i386/funcspec-56.inc | 2 + - gcc/testsuite/gcc.target/i386/sse-12.c | 2 +- - gcc/testsuite/gcc.target/i386/sse-13.c | 2 +- - gcc/testsuite/gcc.target/i386/sse-14.c | 2 +- - gcc/testsuite/gcc.target/i386/sse-22.c | 4 +- - gcc/testsuite/gcc.target/i386/sse-23.c | 2 +- - gcc/testsuite/lib/target-supports.exp | 11 ++++ - 30 files changed, 268 insertions(+), 15 deletions(-) - create mode 100644 gcc/config/i386/amxcomplexintrin.h - create mode 100644 gcc/testsuite/gcc.target/i386/amxcomplex-asmatt-1.c - create mode 100644 gcc/testsuite/gcc.target/i386/amxcomplex-asmintel-1.c - create mode 100644 gcc/testsuite/gcc.target/i386/amxcomplex-cmmimfp16ps-2.c - create mode 100644 gcc/testsuite/gcc.target/i386/amxcomplex-cmmrlfp16ps-2.c - -diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h -index 1f75ff1ca..39d3351db 100644 ---- 
a/gcc/common/config/i386/cpuinfo.h -+++ b/gcc/common/config/i386/cpuinfo.h -@@ -798,6 +798,8 @@ get_available_features (struct __processor_model *cpu_model, - { - if (eax & bit_AMX_FP16) - set_feature (FEATURE_AMX_FP16); -+ if (edx & bit_AMX_COMPLEX) -+ set_feature (FEATURE_AMX_COMPLEX); - } - } - -diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc -index 1aa163463..87e8afe9b 100644 ---- a/gcc/common/config/i386/i386-common.cc -+++ b/gcc/common/config/i386/i386-common.cc -@@ -109,6 +109,8 @@ along with GCC; see the file COPYING3. If not see - #define OPTION_MASK_ISA2_AMX_BF16_SET OPTION_MASK_ISA2_AMX_BF16 - #define OPTION_MASK_ISA2_AMX_FP16_SET OPTION_MASK_ISA2_AMX_FP16 - #define OPTION_MASK_ISA2_PREFETCHI_SET OPTION_MASK_ISA2_PREFETCHI -+#define OPTION_MASK_ISA2_AMX_COMPLEX_SET \ -+ (OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_COMPLEX) - - /* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same - as -msse4.2. */ -@@ -269,7 +271,8 @@ along with GCC; see the file COPYING3. If not see - #define OPTION_MASK_ISA2_SERIALIZE_UNSET OPTION_MASK_ISA2_SERIALIZE - #define OPTION_MASK_ISA2_AVX512VP2INTERSECT_UNSET OPTION_MASK_ISA2_AVX512VP2INTERSECT - #define OPTION_MASK_ISA2_TSXLDTRK_UNSET OPTION_MASK_ISA2_TSXLDTRK --#define OPTION_MASK_ISA2_AMX_TILE_UNSET OPTION_MASK_ISA2_AMX_TILE -+#define OPTION_MASK_ISA2_AMX_TILE_UNSET \ -+ (OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_COMPLEX_UNSET) - #define OPTION_MASK_ISA2_AMX_INT8_UNSET OPTION_MASK_ISA2_AMX_INT8 - #define OPTION_MASK_ISA2_AMX_BF16_UNSET OPTION_MASK_ISA2_AMX_BF16 - #define OPTION_MASK_ISA2_UINTR_UNSET OPTION_MASK_ISA2_UINTR -@@ -279,6 +282,7 @@ along with GCC; see the file COPYING3. 
If not see - #define OPTION_MASK_ISA2_WIDEKL_UNSET OPTION_MASK_ISA2_WIDEKL - #define OPTION_MASK_ISA2_AMX_FP16_UNSET OPTION_MASK_ISA2_AMX_FP16 - #define OPTION_MASK_ISA2_PREFETCHI_UNSET OPTION_MASK_ISA2_PREFETCHI -+#define OPTION_MASK_ISA2_AMX_COMPLEX_UNSET OPTION_MASK_ISA2_AMX_COMPLEX - - /* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same - as -mno-sse4.1. */ -@@ -1155,6 +1159,19 @@ ix86_handle_option (struct gcc_options *opts, - } - return true; - -+ case OPT_mamx_complex: -+ if (value) -+ { -+ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_AMX_COMPLEX_SET; -+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_COMPLEX_SET; -+ } -+ else -+ { -+ opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_COMPLEX_UNSET; -+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AMX_COMPLEX_UNSET; -+ } -+ return true; -+ - case OPT_mfma: - if (value) - { -diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h -index 7b2d4d242..56020faac 100644 ---- a/gcc/common/config/i386/i386-cpuinfo.h -+++ b/gcc/common/config/i386/i386-cpuinfo.h -@@ -243,6 +243,7 @@ enum processor_features - FEATURE_X86_64_V4, - FEATURE_AMX_FP16, - FEATURE_PREFETCHI, -+ FEATURE_AMX_COMPLEX, - CPU_FEATURE_MAX - }; - -diff --git a/gcc/common/config/i386/i386-isas.h b/gcc/common/config/i386/i386-isas.h -index 6caf06249..cbef68479 100644 ---- a/gcc/common/config/i386/i386-isas.h -+++ b/gcc/common/config/i386/i386-isas.h -@@ -177,4 +177,6 @@ ISA_NAMES_TABLE_START - ISA_NAMES_TABLE_ENTRY("x86-64-v4", FEATURE_X86_64_V4, P_X86_64_V4, NULL) - ISA_NAMES_TABLE_ENTRY("amx-fp16", FEATURE_AMX_FP16, P_NONE, "-mamx-fp16") - ISA_NAMES_TABLE_ENTRY("prefetchi", FEATURE_PREFETCHI, P_NONE, "-mprefetchi") -+ ISA_NAMES_TABLE_ENTRY("amx-complex", FEATURE_AMX_COMPLEX, -+ P_NONE, "-mamx-complex") - ISA_NAMES_TABLE_END -diff --git a/gcc/config.gcc b/gcc/config.gcc -index 9bad238e3..ca5c8f8a0 100644 ---- a/gcc/config.gcc -+++ b/gcc/config.gcc -@@ -424,7 +424,7 @@ i3456786-*-* | 
x86_64-*-*) - amxbf16intrin.h x86gprintrin.h uintrintrin.h - hresetintrin.h keylockerintrin.h avxvnniintrin.h - mwaitintrin.h avx512fp16intrin.h avx512fp16vlintrin.h -- amxfp16intrin.h prfchiintrin.h" -+ amxfp16intrin.h prfchiintrin.h amxcomplexintrin.h" - ;; - ia64-*-*) - extra_headers=ia64intrin.h -diff --git a/gcc/config/i386/amxcomplexintrin.h b/gcc/config/i386/amxcomplexintrin.h -new file mode 100644 -index 000000000..6ea1eca04 ---- /dev/null -+++ b/gcc/config/i386/amxcomplexintrin.h -@@ -0,0 +1,59 @@ -+/* Copyright (C) 2023 Free Software Foundation, Inc. -+ -+ This file is part of GCC. -+ -+ GCC is free software; you can redistribute it and/or modify -+ it under the terms of the GNU General Public License as published by -+ the Free Software Foundation; either version 3, or (at your option)
View file
_service:tar_scm:0079-i386-Add-AMX-COMPLEX-to-Granite-Rapids.patch
Deleted
@@ -1,30 +0,0 @@ -From 40469a6119085e4c4741bcaeb9418606d28b40c4 Mon Sep 17 00:00:00 2001 -From: Haochen Jiang <haochen.jiang@intel.com> -Date: Fri, 31 Mar 2023 10:49:14 +0800 -Subject: PATCH 24/32 i386: Add AMX-COMPLEX to Granite Rapids - -gcc/Changelog: - - * config/i386/i386.h (PTA_GRANITERAPIDS): Add PTA_AMX_COMPLEX. - -(cherry picked from commit afa87bd5f7b126e20268aa959441cde2e02bba0e) ---- - gcc/config/i386/i386.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h -index 75953defc..56d7794dc 100644 ---- a/gcc/config/i386/i386.h -+++ b/gcc/config/i386/i386.h -@@ -2358,7 +2358,7 @@ constexpr wide_int_bitmask PTA_ALDERLAKE = PTA_TREMONT | PTA_ADX | PTA_AVX - | PTA_PCONFIG | PTA_PKU | PTA_VAES | PTA_VPCLMULQDQ | PTA_SERIALIZE - | PTA_HRESET | PTA_KL | PTA_WIDEKL | PTA_AVXVNNI; - constexpr wide_int_bitmask PTA_GRANITERAPIDS = PTA_SAPPHIRERAPIDS | PTA_AMX_FP16 -- | PTA_PREFETCHI; -+ | PTA_PREFETCHI | PTA_AMX_COMPLEX; - constexpr wide_int_bitmask PTA_KNM = PTA_KNL | PTA_AVX5124VNNIW - | PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ; - constexpr wide_int_bitmask PTA_ZNVER1 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 --- -2.28.0.windows.1 -
View file
_service:tar_scm:0080-Initial-Granite-Rapids-D-Support.patch
Deleted
@@ -1,212 +0,0 @@ -From 125e5d448538f7534e0fe3df9b7947cf41605b51 Mon Sep 17 00:00:00 2001 -From: "Mo, Zewei" <zewei.mo@intel.com> -Date: Mon, 3 Jul 2023 11:00:26 +0800 -Subject: PATCH 25/32 Initial Granite Rapids D Support - -gcc/ChangeLog: - - * common/config/i386/cpuinfo.h - (get_intel_cpu): Handle Granite Rapids D. - * common/config/i386/i386-common.cc: - (processor_alias_table): Add graniterapids-d. - * common/config/i386/i386-cpuinfo.h - (enum processor_subtypes): Add INTEL_COREI7_GRANITERAPIDS_D. - * config.gcc: Add -march=graniterapids-d. - * config/i386/driver-i386.cc (host_detect_local_cpu): - Handle graniterapids-d. - * config/i386/i386.h: (PTA_GRANITERAPIDS_D): New. - * doc/extend.texi: Add graniterapids-d. - * doc/invoke.texi: Ditto. - -gcc/testsuite/ChangeLog: - - * g++.target/i386/mv16.C: Add graniterapids-d. - * gcc.target/i386/funcspec-56.inc: Handle new march. - -(cherry picked from commit a0cb65d34cc141571e870fb3b53b3ff47ae3338d) ---- - gcc/common/config/i386/cpuinfo.h | 9 ++++++++- - gcc/common/config/i386/i386-common.cc | 2 ++ - gcc/common/config/i386/i386-cpuinfo.h | 1 + - gcc/config.gcc | 3 ++- - gcc/config/i386/driver-i386.cc | 5 ++++- - gcc/config/i386/i386.h | 4 +++- - gcc/doc/extend.texi | 3 +++ - gcc/doc/invoke.texi | 11 +++++++++++ - gcc/testsuite/g++.target/i386/mv16.C | 6 ++++++ - gcc/testsuite/gcc.target/i386/funcspec-56.inc | 1 + - 10 files changed, 41 insertions(+), 4 deletions(-) - -diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h -index 39d3351db..1e53248ef 100644 ---- a/gcc/common/config/i386/cpuinfo.h -+++ b/gcc/common/config/i386/cpuinfo.h -@@ -529,7 +529,6 @@ get_intel_cpu (struct __processor_model *cpu_model, - cpu_model->__cpu_subtype = INTEL_COREI7_SAPPHIRERAPIDS; - break; - case 0xad: -- case 0xae: - /* Granite Rapids. 
*/ - cpu = "graniterapids"; - CHECK___builtin_cpu_is ("corei7"); -@@ -537,6 +536,14 @@ get_intel_cpu (struct __processor_model *cpu_model, - cpu_model->__cpu_type = INTEL_COREI7; - cpu_model->__cpu_subtype = INTEL_COREI7_GRANITERAPIDS; - break; -+ case 0xae: -+ /* Granite Rapids D. */ -+ cpu = "graniterapids-d"; -+ CHECK___builtin_cpu_is ("corei7"); -+ CHECK___builtin_cpu_is ("graniterapids-d"); -+ cpu_model->__cpu_type = INTEL_COREI7; -+ cpu_model->__cpu_subtype = INTEL_COREI7_GRANITERAPIDS_D; -+ break; - case 0x17: - case 0x1d: - /* Penryn. */ -diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc -index 87e8afe9b..28f468f48 100644 ---- a/gcc/common/config/i386/i386-common.cc -+++ b/gcc/common/config/i386/i386-common.cc -@@ -1993,6 +1993,8 @@ const pta processor_alias_table = - M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2}, - {"graniterapids", PROCESSOR_GRANITERAPIDS, CPU_HASWELL, PTA_GRANITERAPIDS, - M_CPU_SUBTYPE (INTEL_COREI7_GRANITERAPIDS), P_PROC_AVX512F}, -+ {"graniterapids-d", PROCESSOR_GRANITERAPIDS, CPU_HASWELL, PTA_GRANITERAPIDS_D, -+ M_CPU_SUBTYPE (INTEL_COREI7_GRANITERAPIDS_D), P_PROC_AVX512F}, - {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL, - M_CPU_TYPE (INTEL_BONNELL), P_PROC_SSSE3}, - {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL, -diff --git a/gcc/common/config/i386/i386-cpuinfo.h b/gcc/common/config/i386/i386-cpuinfo.h -index 56020faac..a32f32c97 100644 ---- a/gcc/common/config/i386/i386-cpuinfo.h -+++ b/gcc/common/config/i386/i386-cpuinfo.h -@@ -93,6 +93,7 @@ enum processor_subtypes - INTEL_COREI7_ROCKETLAKE, - AMDFAM19H_ZNVER4, - INTEL_COREI7_GRANITERAPIDS, -+ INTEL_COREI7_GRANITERAPIDS_D, - CPU_SUBTYPE_MAX - }; - -diff --git a/gcc/config.gcc b/gcc/config.gcc -index ca5c8f8a0..3108ac4eb 100644 ---- a/gcc/config.gcc -+++ b/gcc/config.gcc -@@ -670,7 +670,8 @@ slm nehalem westmere sandybridge ivybridge haswell broadwell bonnell \ - silvermont knl knm skylake-avx512 cannonlake icelake-client 
icelake-server \ - skylake goldmont goldmont-plus tremont cascadelake tigerlake cooperlake \ - sapphirerapids alderlake rocketlake eden-x2 nano nano-1000 nano-2000 nano-3000 \ --nano-x2 eden-x4 nano-x4 x86-64 x86-64-v2 x86-64-v3 x86-64-v4 graniterapids native" -+nano-x2 eden-x4 nano-x4 x86-64 x86-64-v2 x86-64-v3 x86-64-v4 graniterapids \ -+graniterapids-d native" - - # Additional x86 processors supported by --with-cpu=. Each processor - # MUST be separated by exactly one space. -diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc -index ea8c3d8d1..e3bca4b49 100644 ---- a/gcc/config/i386/driver-i386.cc -+++ b/gcc/config/i386/driver-i386.cc -@@ -576,8 +576,11 @@ const char *host_detect_local_cpu (int argc, const char **argv) - /* This is unknown family 0x6 CPU. */ - if (has_feature (FEATURE_AVX)) - { -+ /* Assume Granite Rapids D. */ -+ if (has_feature (FEATURE_AMX_COMPLEX)) -+ cpu = "graniterapids-d"; - /* Assume Granite Rapids. */ -- if (has_feature (FEATURE_AMX_FP16)) -+ else if (has_feature (FEATURE_AMX_FP16)) - cpu = "graniterapids"; - /* Assume Tiger Lake */ - else if (has_feature (FEATURE_AVX512VP2INTERSECT)) -diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h -index 56d7794dc..eda3e5e5b 100644 ---- a/gcc/config/i386/i386.h -+++ b/gcc/config/i386/i386.h -@@ -2358,7 +2358,9 @@ constexpr wide_int_bitmask PTA_ALDERLAKE = PTA_TREMONT | PTA_ADX | PTA_AVX - | PTA_PCONFIG | PTA_PKU | PTA_VAES | PTA_VPCLMULQDQ | PTA_SERIALIZE - | PTA_HRESET | PTA_KL | PTA_WIDEKL | PTA_AVXVNNI; - constexpr wide_int_bitmask PTA_GRANITERAPIDS = PTA_SAPPHIRERAPIDS | PTA_AMX_FP16 -- | PTA_PREFETCHI | PTA_AMX_COMPLEX; -+ | PTA_PREFETCHI; -+constexpr wide_int_bitmask PTA_GRANITERAPIDS_D = PTA_GRANITERAPIDS -+ | PTA_AMX_COMPLEX; - constexpr wide_int_bitmask PTA_KNM = PTA_KNL | PTA_AVX5124VNNIW - | PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ; - constexpr wide_int_bitmask PTA_ZNVER1 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 -diff --git a/gcc/doc/extend.texi 
b/gcc/doc/extend.texi -index d7b0bc802..674db2f1a 100644 ---- a/gcc/doc/extend.texi -+++ b/gcc/doc/extend.texi -@@ -21837,6 +21837,9 @@ Intel Core i7 Rocketlake CPU. - @item graniterapids - Intel Core i7 graniterapids CPU. - -+@item graniterapids-d -+Intel Core i7 graniterapids D CPU. -+ - @item bonnell - Intel Atom Bonnell CPU. - -diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi -index 186b33481..a2ec060fd 100644 ---- a/gcc/doc/invoke.texi -+++ b/gcc/doc/invoke.texi -@@ -31626,6 +31626,17 @@ MOVDIRI, MOVDIR64B, AVX512VP2INTERSECT, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG, - SERIALIZE, TSXLDTRK, UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512FP16, - AVX512BF16, AMX-FP16 and PREFETCHI instruction set support. - -+@item graniterapids-d -+Intel graniterapids D CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, -+SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, -+RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, -+AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ, -+AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2, -+VPCLMULQDQ, AVX512BITALG, RDPID, AVX512VPOPCNTDQ, PCONFIG, WBNOINVD, CLWB, -+MOVDIRI, MOVDIR64B, AVX512VP2INTERSECT, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG, -+SERIALIZE, TSXLDTRK, UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512FP16, -+AVX512BF16, AMX-FP16, PREFETCHI and AMX-COMPLEX instruction set support. -+ - @item k6 - AMD K6 CPU with MMX instruction set support. 
- -diff --git a/gcc/testsuite/g++.target/i386/mv16.C b/gcc/testsuite/g++.target/i386/mv16.C -index 65cc24f32..17b1fc722 100644 ---- a/gcc/testsuite/g++.target/i386/mv16.C -+++ b/gcc/testsuite/g++.target/i386/mv16.C -@@ -96,6 +96,10 @@ int __attribute__ ((target("arch=graniterapids"))) foo () { - return 26; - } - -+int __attribute__ ((target("arch=graniterapids-d"))) foo () { -+ return 28; -+} -+ - int main () - { - int val = foo (); -@@ -136,6 +140,8 @@ int main () - assert (val == 24); - else if (__builtin_cpu_is ("graniterapids")) - assert (val == 25); -+ else if (__builtin_cpu_is ("graniterapids-d")) -+ assert (val == 26); - else - assert (val == 0); - -diff --git a/gcc/testsuite/gcc.target/i386/funcspec-56.inc b/gcc/testsuite/gcc.target/i386/funcspec-56.inc -index 1a2f3b83d..f0f3397a7 100644
View file
_service:tar_scm:0081-Correct-Granite-Rapids-D-documentation.patch
Deleted
@@ -1,48 +0,0 @@ -From a809a6a416af4d08f7feeadfdd5d1f5a76a830b5 Mon Sep 17 00:00:00 2001 -From: Haochen Jiang <haochen.jiang@intel.com> -Date: Thu, 20 Jul 2023 10:47:18 +0800 -Subject: PATCH 26/32 Correct Granite Rapids{, D} documentation - -gcc/Changelog: - - * doc/invoke.texi: Remove AVX512VP2INTERSECT in - Granite Rapids{, D} from documentation. - -(cherry picked from commit 38daaaa91438d3f635a10bf5d5181c3b29f07df9) ---- - gcc/doc/invoke.texi | 12 ++++++------ - 1 file changed, 6 insertions(+), 6 deletions(-) - -diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi -index a2ec060fd..4d3eccdb2 100644 ---- a/gcc/doc/invoke.texi -+++ b/gcc/doc/invoke.texi -@@ -31622,9 +31622,9 @@ RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, - AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ, - AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2, - VPCLMULQDQ, AVX512BITALG, RDPID, AVX512VPOPCNTDQ, PCONFIG, WBNOINVD, CLWB, --MOVDIRI, MOVDIR64B, AVX512VP2INTERSECT, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG, --SERIALIZE, TSXLDTRK, UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512FP16, --AVX512BF16, AMX-FP16 and PREFETCHI instruction set support. -+MOVDIRI, MOVDIR64B, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG, SERIALIZE, TSXLDTRK, -+UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512-FP16, AVX512BF16, AMX-FP16 -+and PREFETCHI instruction set support. 
- - @item graniterapids-d - Intel graniterapids D CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, -@@ -31633,9 +31633,9 @@ RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, - AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ, - AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2, - VPCLMULQDQ, AVX512BITALG, RDPID, AVX512VPOPCNTDQ, PCONFIG, WBNOINVD, CLWB, --MOVDIRI, MOVDIR64B, AVX512VP2INTERSECT, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG, --SERIALIZE, TSXLDTRK, UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512FP16, --AVX512BF16, AMX-FP16, PREFETCHI and AMX-COMPLEX instruction set support. -+MOVDIRI, MOVDIR64B, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG, SERIALIZE, TSXLDTRK, -+UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512FP16, AVX512BF16, AMX-FP16, -+PREFETCHI and AMX-COMPLEX instruction set support. - - @item k6 - AMD K6 CPU with MMX instruction set support. --- -2.28.0.windows.1 -
View file
_service:tar_scm:0082-i386-Remove-Meteorlake-s-family_model.patch
Deleted
@@ -1,30 +0,0 @@ -From 62852213bc6d3e56804ca05826bb95a3a2fe4eba Mon Sep 17 00:00:00 2001 -From: "Hu, Lin1" <lin1.hu@intel.com> -Date: Thu, 15 Dec 2022 15:51:18 +0800 -Subject: PATCH 27/32 i386: Remove Meteorlake's family_model - -gcc/ChangeLog: - - * common/config/i386/cpuinfo.h (get_intel_cpu): Remove case 0xb5 - for meteorlake. - -(cherry picked from commit 9e74b7ec0b218364905e3e7de5c41e8148ffc61b) ---- - gcc/common/config/i386/cpuinfo.h | 1 - - 1 file changed, 1 deletion(-) - -diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h -index 1e53248ef..348bc0c12 100644 ---- a/gcc/common/config/i386/cpuinfo.h -+++ b/gcc/common/config/i386/cpuinfo.h -@@ -510,7 +510,6 @@ get_intel_cpu (struct __processor_model *cpu_model, - /* Alder Lake. */ - case 0xb7: - /* Raptor Lake. */ -- case 0xb5: - case 0xaa: - case 0xac: - /* Meteor Lake. */ --- -2.28.0.windows.1 -
View file
_service:tar_scm:0083-x86-Update-model-values-for-Alderlake-Rocketlake-and.patch
Deleted
@@ -1,33 +0,0 @@ -From 73042aa18fe70aa30a9c7c760b08e642560ecccd Mon Sep 17 00:00:00 2001 -From: "Cui, Lili" <lili.cui@intel.com> -Date: Thu, 29 Jun 2023 03:10:35 +0000 -Subject: PATCH 28/32 x86: Update model values for Alderlake, Rocketlake and - Raptorlake. - -Update model values for Alderlake, Rocketlake and Raptorlake according to SDM. - -gcc/ChangeLog - - * common/config/i386/cpuinfo.h (get_intel_cpu): Remove model value 0xa8 - from Rocketlake, move model value 0xbf from Alderlake to Raptorlake. - -(cherry picked from commit e510c3be13a8ccdf1fc1b27c2501c126d493f335) ---- - gcc/common/config/i386/cpuinfo.h | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h -index 348bc0c12..f9bcb6fad 100644 ---- a/gcc/common/config/i386/cpuinfo.h -+++ b/gcc/common/config/i386/cpuinfo.h -@@ -509,6 +509,7 @@ get_intel_cpu (struct __processor_model *cpu_model, - case 0x9a: - /* Alder Lake. */ - case 0xb7: -+ case 0xbf: - /* Raptor Lake. */ - case 0xaa: - case 0xac: --- -2.28.0.windows.1 -
View file
_service:tar_scm:0084-x86-Update-model-values-for-Raptorlake.patch
Deleted
@@ -1,32 +0,0 @@ -From 3dbe28984e0f9c24d6670cfba42983bc32c08b0a Mon Sep 17 00:00:00 2001 -From: "Cui, Lili" <lili.cui@intel.com> -Date: Mon, 14 Aug 2023 02:06:00 +0000 -Subject: PATCH 29/32 x86: Update model values for Raptorlake. - -Update model values for Raptorlake according to SDM. - -gcc/ChangeLog - - * common/config/i386/cpuinfo.h (get_intel_cpu): Add model value 0xba - to Raptorlake. - -(cherry picked from commit 614052dd4ea083e086712809c754ffebd9361316) ---- - gcc/common/config/i386/cpuinfo.h | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h -index f9bcb6fad..da1568fd1 100644 ---- a/gcc/common/config/i386/cpuinfo.h -+++ b/gcc/common/config/i386/cpuinfo.h -@@ -509,6 +509,7 @@ get_intel_cpu (struct __processor_model *cpu_model, - case 0x9a: - /* Alder Lake. */ - case 0xb7: -+ case 0xba: - case 0xbf: - /* Raptor Lake. */ - case 0xaa: --- -2.28.0.windows.1 -
View file
_service:tar_scm:0085-Fix-target_clone-arch-graniterapids-d.patch
Deleted
@@ -1,159 +0,0 @@ -From 8db0f3cd29bd7f937ffa01dd1100360fbbf5b6f4 Mon Sep 17 00:00:00 2001 -From: liuhongt <hongtao.liu@intel.com> -Date: Tue, 22 Aug 2023 18:18:31 +0800 -Subject: PATCH 30/32 Fix target_clone ("arch=graniterapids-d") - -Both "graniterapid-d" and "graniterapids" are attached with -PROCESSOR_GRANITERAPID in processor_alias_table but mapped to -different __cpu_subtype in get_intel_cpu. - -And get_builtin_code_for_version will try to match the first -PROCESSOR_GRANITERAPIDS in processor_alias_table which maps to -"granitepraids" here. - -861 else if (new_target->arch_specified && new_target->arch > 0) -1862 for (i = 0; i < pta_size; i++) -1863 if (processor_alias_tablei.processor == new_target->arch) -1864 { -1865 const pta *arch_info = &processor_alias_tablei; -1866 switch (arch_info->priority) -1867 { -1868 default: -1869 arg_str = arch_info->name; - -This mismatch makes dispatch_function_versions check the preidcate -of__builtin_cpu_is ("graniterapids") for "graniterapids-d" and causes -the issue. -The patch explicitly adds PROCESSOR_GRANITERAPIDS_D to make a distinction. - -For "alderlake","raptorlake", "meteorlake" they share same isa, cost, -tuning, and mapped to the same __cpu_type/__cpu_subtype in -get_intel_cpu, so no need to add PROCESSOR_RAPTORLAKE and others. - -gcc/ChangeLog: - - * common/config/i386/i386-common.cc (processor_names): Add new - member graniterapids-s. - * config/i386/i386-options.cc (processor_alias_table): Update - table with and PROCESSOR_GRANITERAPIDS_D. - (m_GRANITERAPID_D): New macro. - (m_CORE_AVX512): Add m_GRANITERAPIDS_D. - (processor_cost_table): Add icelake_cost for - PROCESSOR_GRANITERAPIDS_D. - * config/i386/i386.h (enum processor_type): Add new member - PROCESSOR_GRANITERAPIDS_D. 
- * config/i386/i386-c.cc (ix86_target_macros_internal): Handle - PROCESSOR_GRANITERAPIDS_D ---- - gcc/common/config/i386/i386-common.cc | 6 ++++-- - gcc/config/i386/i386-c.cc | 8 ++++++++ - gcc/config/i386/i386-options.cc | 4 +++- - gcc/config/i386/i386.h | 3 ++- - 4 files changed, 17 insertions(+), 4 deletions(-) - -diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc -index 28f468f48..bec6801ce 100644 ---- a/gcc/common/config/i386/i386-common.cc -+++ b/gcc/common/config/i386/i386-common.cc -@@ -1873,6 +1873,7 @@ const char *const processor_names = - "alderlake", - "rocketlake", - "graniterapids", -+ "graniterapids-d", - "intel", - "geode", - "k6", -@@ -1993,8 +1994,9 @@ const pta processor_alias_table = - M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2}, - {"graniterapids", PROCESSOR_GRANITERAPIDS, CPU_HASWELL, PTA_GRANITERAPIDS, - M_CPU_SUBTYPE (INTEL_COREI7_GRANITERAPIDS), P_PROC_AVX512F}, -- {"graniterapids-d", PROCESSOR_GRANITERAPIDS, CPU_HASWELL, PTA_GRANITERAPIDS_D, -- M_CPU_SUBTYPE (INTEL_COREI7_GRANITERAPIDS_D), P_PROC_AVX512F}, -+ {"graniterapids-d", PROCESSOR_GRANITERAPIDS_D, CPU_HASWELL, -+ PTA_GRANITERAPIDS_D, M_CPU_SUBTYPE (INTEL_COREI7_GRANITERAPIDS_D), -+ P_PROC_AVX512F}, - {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL, - M_CPU_TYPE (INTEL_BONNELL), P_PROC_SSSE3}, - {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL, -diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc -index 5e0ac278c..49f0db2b8 100644 ---- a/gcc/config/i386/i386-c.cc -+++ b/gcc/config/i386/i386-c.cc -@@ -246,6 +246,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, - def_or_undef (parse_in, "__graniterapids"); - def_or_undef (parse_in, "__graniterapids__"); - break; -+ case PROCESSOR_GRANITERAPIDS_D: -+ def_or_undef (parse_in, "__graniterapids_d"); -+ def_or_undef (parse_in, "__graniterapids_d__"); -+ break; - case PROCESSOR_ALDERLAKE: - def_or_undef (parse_in, "__alderlake"); - def_or_undef (parse_in, 
"__alderlake__"); -@@ -254,6 +258,7 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, - def_or_undef (parse_in, "__rocketlake"); - def_or_undef (parse_in, "__rocketlake__"); - break; -+ - /* use PROCESSOR_max to not set/unset the arch macro. */ - case PROCESSOR_max: - break; -@@ -426,6 +431,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, - case PROCESSOR_GRANITERAPIDS: - def_or_undef (parse_in, "__tune_graniterapids__"); - break; -+ case PROCESSOR_GRANITERAPIDS_D: -+ def_or_undef (parse_in, "__tune_graniterapids_d__"); -+ break; - case PROCESSOR_INTEL: - case PROCESSOR_GENERIC: - break; -diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc -index 7efd25084..86932d719 100644 ---- a/gcc/config/i386/i386-options.cc -+++ b/gcc/config/i386/i386-options.cc -@@ -128,10 +128,11 @@ along with GCC; see the file COPYING3. If not see - #define m_ALDERLAKE (HOST_WIDE_INT_1U<<PROCESSOR_ALDERLAKE) - #define m_ROCKETLAKE (HOST_WIDE_INT_1U<<PROCESSOR_ROCKETLAKE) - #define m_GRANITERAPIDS (HOST_WIDE_INT_1U<<PROCESSOR_GRANITERAPIDS) -+#define m_GRANITERAPIDS_D (HOST_WIDE_INT_1U<<PROCESSOR_GRANITERAPIDS_D) - #define m_CORE_AVX512 (m_SKYLAKE_AVX512 | m_CANNONLAKE \ - | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_CASCADELAKE \ - | m_TIGERLAKE | m_COOPERLAKE | m_SAPPHIRERAPIDS \ -- | m_ROCKETLAKE | m_GRANITERAPIDS) -+ | m_ROCKETLAKE | m_GRANITERAPIDS | m_GRANITERAPIDS_D) - #define m_CORE_AVX2 (m_HASWELL | m_SKYLAKE | m_CORE_AVX512) - #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2) - #define m_GOLDMONT (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT) -@@ -764,6 +765,7 @@ static const struct processor_costs *processor_cost_table = - &alderlake_cost, - &icelake_cost, - &icelake_cost, -+ &icelake_cost, - &intel_cost, - &geode_cost, - &k6_cost, -diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h -index eda3e5e5b..5052f878d 100644 ---- a/gcc/config/i386/i386.h -+++ b/gcc/config/i386/i386.h -@@ -2216,7 +2216,7 @@ extern int const 
svr4_dbx_register_mapFIRST_PSEUDO_REGISTER; - #define DEFAULT_LARGE_SECTION_THRESHOLD 65536 -  - /* Which processor to tune code generation for. These must be in sync -- with processor_target_table in i386.cc. */ -+ with processor_cost_table in i386-options.cc. */ - - enum processor_type - { -@@ -2251,6 +2251,7 @@ enum processor_type - PROCESSOR_ALDERLAKE, - PROCESSOR_ROCKETLAKE, - PROCESSOR_GRANITERAPIDS, -+ PROCESSOR_GRANITERAPIDS_D, - PROCESSOR_INTEL, - PROCESSOR_GEODE, - PROCESSOR_K6, --- -2.28.0.windows.1 -
View file
_service:tar_scm:0086-Modfify-cost-calculation-for-dealing-with-equivalenc.patch
Deleted
@@ -1,321 +0,0 @@ -From c546aad5d38165e2962456525a0f6a427e03583b Mon Sep 17 00:00:00 2001 -From: "Vladimir N. Makarov" <vmakarov@redhat.com> -Date: Thu, 26 Oct 2023 09:50:40 -0400 -Subject: PATCH 31/32 Modfify cost calculation for dealing with equivalences - -RISCV target developers reported that pseudos with equivalence used in -a loop can be spilled. Simple changes of heuristics of cost -calculation of pseudos with equivalence or even ignoring equivalences -resulted in numerous testsuite failures on different targets or worse -spec2017 performance. This patch implements more sophisticated cost -calculations of pseudos with equivalences. The patch does not change -RA behaviour for targets still using the old reload pass instead of -LRA. The patch solves the reported problem and improves x86-64 -specint2017 a bit (specfp2017 performance stays the same). The patch -takes into account how the equivalence will be used: will it be -integrated into the user insns or require an input reload insn. It -requires additional pass over insns. To compensate RA slow down, the -patch removes a pass over insns in the reload pass used by IRA before. -This also decouples IRA from reload more and will help to remove the -reload pass in the future if it ever happens. - -gcc/ChangeLog: - - * dwarf2out.cc (reg_loc_descriptor): Use lra_eliminate_regs when - LRA is used. - * ira-costs.cc: Include regset.h. - (equiv_can_be_consumed_p, get_equiv_regno, calculate_equiv_gains): - New functions. - (find_costs_and_classes): Call calculate_equiv_gains and redefine - mem_cost of pseudos with equivs when LRA is used. - * var-tracking.cc: Include ira.h and lra.h. - (vt_initialize): Use lra_eliminate_regs when LRA is used. 
---- - gcc/dwarf2out.cc | 4 +- - gcc/ira-costs.cc | 169 ++++++++++++++++++++++++++++++++++++++++++-- - gcc/var-tracking.cc | 14 +++- - 3 files changed, 179 insertions(+), 8 deletions(-) - -diff --git a/gcc/dwarf2out.cc b/gcc/dwarf2out.cc -index 0a5c081d8..f0f6f4fd4 100644 ---- a/gcc/dwarf2out.cc -+++ b/gcc/dwarf2out.cc -@@ -14263,7 +14263,9 @@ reg_loc_descriptor (rtx rtl, enum var_init_status initialized) - argument pointer and soft frame pointer rtx's. - Use DW_OP_fbreg offset DW_OP_stack_value in this case. */ - if ((rtl == arg_pointer_rtx || rtl == frame_pointer_rtx) -- && eliminate_regs (rtl, VOIDmode, NULL_RTX) != rtl) -+ && (ira_use_lra_p -+ ? lra_eliminate_regs (rtl, VOIDmode, NULL_RTX) -+ : eliminate_regs (rtl, VOIDmode, NULL_RTX)) != rtl) - { - dw_loc_descr_ref result = NULL; - -diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc -index 642fda529..c79311783 100644 ---- a/gcc/ira-costs.cc -+++ b/gcc/ira-costs.cc -@@ -30,6 +30,7 @@ along with GCC; see the file COPYING3. If not see - #include "tm_p.h" - #include "insn-config.h" - #include "regs.h" -+#include "regset.h" - #include "ira.h" - #include "ira-int.h" - #include "addresses.h" -@@ -1750,6 +1751,145 @@ process_bb_node_for_costs (ira_loop_tree_node_t loop_tree_node) - process_bb_for_costs (bb); - } - -+/* Check that reg REGNO can be changed by TO in INSN. Return true in case the -+ result insn would be valid one. */ -+static bool -+equiv_can_be_consumed_p (int regno, rtx to, rtx_insn *insn) -+{ -+ validate_replace_src_group (regno_reg_rtxregno, to, insn); -+ bool res = verify_changes (0); -+ cancel_changes (0); -+ return res; -+} -+ -+/* Return true if X contains a pseudo with equivalence. In this case also -+ return the pseudo through parameter REG. If the pseudo is a part of subreg, -+ return the subreg through parameter SUBREG. 
*/ -+ -+static bool -+get_equiv_regno (rtx x, int &regno, rtx &subreg) -+{ -+ subreg = NULL_RTX; -+ if (GET_CODE (x) == SUBREG) -+ { -+ subreg = x; -+ x = SUBREG_REG (x); -+ } -+ if (REG_P (x) -+ && (ira_reg_equiv[REGNO (x)].memory != NULL -+ || ira_reg_equiv[REGNO (x)].constant != NULL)) -+ { -+ regno = REGNO (x); -+ return true; -+ } -+ RTX_CODE code = GET_CODE (x); -+ const char *fmt = GET_RTX_FORMAT (code); -+ -+ for (int i = GET_RTX_LENGTH (code) - 1; i >= 0; i--) -+ if (fmt[i] == 'e') -+ { -+ if (get_equiv_regno (XEXP (x, i), regno, subreg)) -+ return true; -+ } -+ else if (fmt[i] == 'E') -+ { -+ for (int j = 0; j < XVECLEN (x, i); j++) -+ if (get_equiv_regno (XVECEXP (x, i, j), regno, subreg)) -+ return true; -+ } -+ return false; -+} -+ -+/* A pass through the current function insns. Calculate costs of using -+ equivalences for pseudos and store them in regno_equiv_gains. */ -+ -+static void -+calculate_equiv_gains (void) -+{ -+ basic_block bb; -+ int regno, freq, cost; -+ rtx subreg; -+ rtx_insn *insn; -+ machine_mode mode; -+ enum reg_class rclass; -+ bitmap_head equiv_pseudos; -+ -+ ira_assert (allocno_p); -+ bitmap_initialize (&equiv_pseudos, &reg_obstack); -+ for (regno = max_reg_num () - 1; regno >= FIRST_PSEUDO_REGISTER; regno--) -+ if (ira_reg_equiv[regno].init_insns != NULL -+ && (ira_reg_equiv[regno].memory != NULL -+ || (ira_reg_equiv[regno].constant != NULL -+ /* Ignore complicated constants which probably will be placed -+ in memory: */ -+ && GET_CODE (ira_reg_equiv[regno].constant) != CONST_DOUBLE -+ && GET_CODE (ira_reg_equiv[regno].constant) != CONST_VECTOR -+ && GET_CODE (ira_reg_equiv[regno].constant) != LABEL_REF))) -+ { -+ rtx_insn_list *x; -+ for (x = ira_reg_equiv[regno].init_insns; x != NULL; x = x->next ()) -+ { -+ insn = x->insn (); -+ rtx set = single_set (insn); -+ -+ if (set == NULL_RTX || SET_DEST (set) != regno_reg_rtx[regno]) -+ break; -+ bb = BLOCK_FOR_INSN (insn); -+ ira_curr_regno_allocno_map -+ = ira_bb_nodes[bb->index].parent->regno_allocno_map; -+ mode 
= PSEUDO_REGNO_MODE (regno); -+ rclass = prefCOST_INDEX (regno); -+ ira_init_register_move_cost_if_necessary (mode); -+ if (ira_reg_equivregno.memory != NULL) -+ cost = ira_memory_move_costmoderclass1; -+ else -+ cost = ira_register_move_costmoderclassrclass; -+ freq = REG_FREQ_FROM_BB (bb); -+ regno_equiv_gainsregno += cost * freq; -+ } -+ if (x != NULL) -+ /* We found complicated equiv or reverse equiv mem=reg. Ignore -+ them. */ -+ regno_equiv_gainsregno = 0; -+ else -+ bitmap_set_bit (&equiv_pseudos, regno); -+ } -+ -+ FOR_EACH_BB_FN (bb, cfun) -+ { -+ freq = REG_FREQ_FROM_BB (bb); -+ ira_curr_regno_allocno_map -+ = ira_bb_nodesbb->index.parent->regno_allocno_map; -+ FOR_BB_INSNS (bb, insn) -+ { -+ if (!INSN_P (insn) || !get_equiv_regno (PATTERN (insn), regno, subreg) -+ || !bitmap_bit_p (&equiv_pseudos, regno)) -+ continue; -+ rtx subst = ira_reg_equivregno.memory; -+ -+ if (subst == NULL) -+ subst = ira_reg_equivregno.constant; -+ ira_assert (subst != NULL); -+ mode = PSEUDO_REGNO_MODE (regno); -+ ira_init_register_move_cost_if_necessary (mode); -+ bool consumed_p = equiv_can_be_consumed_p (regno, subst, insn); -+ -+ rclass = prefCOST_INDEX (regno); -+ if (MEM_P (subst) -+ /* If it is a change of constant into double for example, the -+ result constant probably will be placed in memory. */ -+ || (subreg != NULL_RTX && !INTEGRAL_MODE_P (GET_MODE (subreg)))) -+ cost = ira_memory_move_costmoderclass1 + (consumed_p ? 0 : 1); -+ else if (consumed_p)
View file
_service:tar_scm:0087-Add-cost-calculation-for-reg-equivalence-invariants.patch
Deleted
@@ -1,49 +0,0 @@ -From 4965473a4211a9feb46a0d168180ab450cb18bcc Mon Sep 17 00:00:00 2001 -From: "Vladimir N. Makarov" <vmakarov@redhat.com> -Date: Fri, 27 Oct 2023 08:28:24 -0400 -Subject: [PATCH 32/32] Add cost calculation for reg equivalence invariants - -My recent patch improving cost calculation for pseudos with equivalence -resulted in failure of gcc.target/arm/eliminate.c on aarch64. This patch -fixes this failure. - -gcc/ChangeLog: - - * ira-costs.cc: (get_equiv_regno, calculate_equiv_gains): - Process reg equivalence invariants. ---- - gcc/ira-costs.cc | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc -index c79311783..d33104a30 100644 ---- a/gcc/ira-costs.cc -+++ b/gcc/ira-costs.cc -@@ -1777,6 +1777,7 @@ get_equiv_regno (rtx x, int &regno, rtx &subreg) - } - if (REG_P (x) - && (ira_reg_equiv[REGNO (x)].memory != NULL -+ || ira_reg_equiv[REGNO (x)].invariant != NULL - || ira_reg_equiv[REGNO (x)].constant != NULL)) - { - regno = REGNO (x); -@@ -1819,6 +1820,7 @@ calculate_equiv_gains (void) - for (regno = max_reg_num () - 1; regno >= FIRST_PSEUDO_REGISTER; regno--) - if (ira_reg_equiv[regno].init_insns != NULL - && (ira_reg_equiv[regno].memory != NULL -+ || ira_reg_equiv[regno].invariant != NULL - || (ira_reg_equiv[regno].constant != NULL - /* Ignore complicated constants which probably will be placed - in memory: */ -@@ -1869,6 +1871,8 @@ calculate_equiv_gains (void) - - if (subst == NULL) - subst = ira_reg_equiv[regno].constant; -+ if (subst == NULL) -+ subst = ira_reg_equiv[regno].invariant; - ira_assert (subst != NULL); - mode = PSEUDO_REGNO_MODE (regno); - ira_init_register_move_cost_if_necessary (mode); --- -2.28.0.windows.1 -
View file
_service:tar_scm:LoongArch-Add-LA664-support.patch
Deleted
@@ -1,332 +0,0 @@ -From c68463abbab98aa7f5a9b91e71ed6f6834c723df Mon Sep 17 00:00:00 2001 -From: Lulu Cheng <chenglulu@loongson.cn> -Date: Thu, 16 Nov 2023 20:43:53 +0800 -Subject: PATCH LoongArch: Add LA664 support. - -Define ISA_BASE_LA64V110, which represents the base instruction set defined in LoongArch1.1. -Support the configure setting --with-arch =la664, and support -march=la664,-mtune=la664. - -gcc/ChangeLog: - - * config.gcc: Support LA664. - * config/loongarch/genopts/loongarch-strings: Likewise. - * config/loongarch/genopts/loongarch.opt.in: Likewise. - * config/loongarch/loongarch-cpu.cc (fill_native_cpu_config): Likewise. - * config/loongarch/loongarch-def.c: Likewise. - * config/loongarch/loongarch-def.h (N_ISA_BASE_TYPES): Likewise. - (ISA_BASE_LA64V110): Define macro. - (N_ARCH_TYPES): Update value. - (N_TUNE_TYPES): Update value. - (CPU_LA664): New macro. - * config/loongarch/loongarch-opts.cc (isa_default_abi): Likewise. - (isa_base_compat_p): Likewise. - * config/loongarch/loongarch-opts.h (TARGET_64BIT): This parameter is enabled - when la_target.isa.base is equal to ISA_BASE_LA64V100 or ISA_BASE_LA64V110. - (TARGET_uARCH_LA664): Define macro. - * config/loongarch/loongarch-str.h (STR_CPU_LA664): Likewise. - * config/loongarch/loongarch.cc (loongarch_cpu_sched_reassociation_width): - Add LA664 support. - * config/loongarch/loongarch.opt: Regenerate. 
- -Signed-off-by: ticat_fp <fanpeng@loongson.cn> ---- - gcc/config.gcc | 10 ++++----- - .../loongarch/genopts/loongarch-strings | 1 + - gcc/config/loongarch/genopts/loongarch.opt.in | 3 +++ - gcc/config/loongarch/loongarch-cpu.cc | 4 ++++ - gcc/config/loongarch/loongarch-def.c | 21 +++++++++++++++++++ - gcc/config/loongarch/loongarch-def.h | 8 ++++--- - gcc/config/loongarch/loongarch-opts.cc | 8 +++---- - gcc/config/loongarch/loongarch-opts.h | 4 +++- - gcc/config/loongarch/loongarch-str.h | 1 + - gcc/config/loongarch/loongarch.cc | 1 + - gcc/config/loongarch/loongarch.opt | 3 +++ - 11 files changed, 51 insertions(+), 13 deletions(-) - -diff --git a/gcc/config.gcc b/gcc/config.gcc -index 6d51bd93f3f..b88591b6fd8 100644 ---- a/gcc/config.gcc -+++ b/gcc/config.gcc -@@ -5039,7 +5039,7 @@ case "${target}" in - - # Perform initial sanity checks on --with-* options. - case ${with_arch} in -- "" | abi-default | loongarch64 | la464) ;; # OK, append here. -+ "" | abi-default | loongarch64 | la4664) ;; # OK, append here. - native) - if test x${host} != x${target}; then - echo "--with-arch=native is illegal for cross-compiler." 1>&2 -@@ -5088,7 +5088,7 @@ case "${target}" in - case ${abi_base}/${abi_ext} in - lp64*/base) - # architectures that support lp64* ABI -- arch_pattern="native|abi-default|loongarch64|la464" -+ arch_pattern="native|abi-default|loongarch64|la4664" - # default architecture for lp64* ABI - arch_default="abi-default" - ;; -@@ -5163,7 +5163,7 @@ case "${target}" in - # Check default with_tune configuration using with_arch. - case ${with_arch} in - loongarch64) -- tune_pattern="native|abi-default|loongarch64|la464" -+ tune_pattern="native|abi-default|loongarch64|la4664" - ;; - *) - # By default, $with_tune == $with_arch -@@ -5219,7 +5219,7 @@ case "${target}" in - # Fixed: use the default gcc configuration for all multilib - # builds by default. - with_multilib_default="" ;; -- arch,native|arch,loongarch64|arch,la464) # OK, append here. 
-+ arch,native|arch,loongarch64|arch,la4664) # OK, append here. - with_multilib_default="/march=${component}" ;; - arch,*) - with_multilib_default="/march=abi-default" -@@ -5307,7 +5307,7 @@ case "${target}" in - if test x${parse_state} = x"arch"; then - # -march option - case ${component} in -- native | abi-default | loongarch64 | la464) # OK, append here. -+ native | abi-default | loongarch64 | la4664) # OK, append here. - # Append -march spec for each multilib variant. - loongarch_multilib_list_make="${loongarch_multilib_list_make}/march=${component}" - parse_state="opts" -diff --git a/gcc/config/loongarch/genopts/loongarch-strings b/gcc/config/loongarch/genopts/loongarch-strings -index 8e412f7536e..7bc4824007e 100644 ---- a/gcc/config/loongarch/genopts/loongarch-strings -+++ b/gcc/config/loongarch/genopts/loongarch-strings -@@ -26,6 +26,7 @@ STR_CPU_NATIVE native - STR_CPU_ABI_DEFAULT abi-default - STR_CPU_LOONGARCH64 loongarch64 - STR_CPU_LA464 la464 -+STR_CPU_LA664 la664 - - # Base architecture - STR_ISA_BASE_LA64V100 la64 -diff --git a/gcc/config/loongarch/genopts/loongarch.opt.in b/gcc/config/loongarch/genopts/loongarch.opt.in -index 158701d327a..00b4733d75b 100644 ---- a/gcc/config/loongarch/genopts/loongarch.opt.in -+++ b/gcc/config/loongarch/genopts/loongarch.opt.in -@@ -107,6 +107,9 @@ Enum(cpu_type) String(@@STR_CPU_LOONGARCH64@@) Value(CPU_LOONGARCH64) - EnumValue - Enum(cpu_type) String(@@STR_CPU_LA464@@) Value(CPU_LA464) - -+EnumValue -+Enum(cpu_type) String(@@STR_CPU_LA664@@) Value(CPU_LA664) -+ - m@@OPTSTR_ARCH@@= - Target RejectNegative Joined Enum(cpu_type) Var(la_opt_cpu_arch) Init(M_OPT_UNSET) - -m@@OPTSTR_ARCH@@=PROCESSOR Generate code for the given PROCESSOR ISA. 
-diff --git a/gcc/config/loongarch/loongarch-cpu.cc b/gcc/config/loongarch/loongarch-cpu.cc -index 7a2866f60f9..f3a13414143 100644 ---- a/gcc/config/loongarch/loongarch-cpu.cc -+++ b/gcc/config/loongarch/loongarch-cpu.cc -@@ -106,6 +106,10 @@ fill_native_cpu_config (struct loongarch_target *tgt) - native_cpu_type = CPU_LA464; - break; - -+ case 0x0014d000: /* LA664 */ -+ native_cpu_type = CPU_LA664; -+ break; -+ - default: - /* Unknown PRID. */ - if (tune_native_p) -diff --git a/gcc/config/loongarch/loongarch-def.c b/gcc/config/loongarch/loongarch-def.c -index 430ef8b2d95..067629141b6 100644 ---- a/gcc/config/loongarch/loongarch-def.c -+++ b/gcc/config/loongarch/loongarch-def.c -@@ -28,6 +28,7 @@ loongarch_cpu_stringsN_TUNE_TYPES = { - CPU_ABI_DEFAULT = STR_CPU_ABI_DEFAULT, - CPU_LOONGARCH64 = STR_CPU_LOONGARCH64, - CPU_LA464 = STR_CPU_LA464, -+ CPU_LA664 = STR_CPU_LA664, - }; - - struct loongarch_isa -@@ -42,6 +43,11 @@ loongarch_cpu_default_isaN_ARCH_TYPES = { - .fpu = ISA_EXT_FPU64, - .simd = ISA_EXT_SIMD_LASX, - }, -+ CPU_LA664 = { -+ .base = ISA_BASE_LA64V110, -+ .fpu = ISA_EXT_FPU64, -+ .simd = ISA_EXT_SIMD_LASX, -+ }, - }; - - struct loongarch_cache -@@ -58,6 +64,12 @@ loongarch_cpu_cacheN_TUNE_TYPES = { - .l2d_size = 256, - .simultaneous_prefetches = 4, - }, -+ CPU_LA664 = { -+ .l1d_line_size = 64, -+ .l1d_size = 64, -+ .l2d_size = 256, -+ .simultaneous_prefetches = 4, -+ }, - }; - - struct loongarch_align -@@ -70,6 +82,10 @@ loongarch_cpu_alignN_TUNE_TYPES = { - .function = "32", - .label = "16", - }, -+ CPU_LA664 = { -+ .function = "32", -+ .label = "16", -+ }, - }; - - -@@ -104,6 +120,9 @@ loongarch_cpu_rtx_cost_dataN_TUNE_TYPES = { - CPU_LA464 = { - DEFAULT_COSTS - }, -+ CPU_LA664 = { -+ DEFAULT_COSTS -+ }, - }; - - /* RTX costs to use when optimizing for size. */ -@@ -127,6 +146,7 @@ loongarch_cpu_issue_rateN_TUNE_TYPES = { - CPU_NATIVE = 4, - CPU_LOONGARCH64 = 4, - CPU_LA464 = 4, -+ CPU_LA664 = 6, - };
View file
_service:tar_scm:LoongArch-Fix-internal-error-running-gcc-march-nativ.patch
Deleted
@@ -1,106 +0,0 @@ -From 56752a6bbfb3d3501d0899b23020c3e2eb58882c Mon Sep 17 00:00:00 2001 -From: Xi Ruoyao <xry111@xry111.site> -Date: Fri, 17 Nov 2023 20:44:17 +0800 -Subject: PATCH LoongArch: Fix internal error running "gcc -march=native" on - LA664 - -On LA664, the PRID preset is ISA_BASE_LA64V110 but the base architecture -is guessed ISA_BASE_LA64V100. This causes a warning to be outputed: - - cc1: warning: base architecture 'la64' differs from PRID preset '?' - -But we've not set the "?" above in loongarch_isa_base_strings, thus it's -a nullptr and then an ICE is triggered. - -Add ISA_BASE_LA64V110 to genopts and initialize -loongarch_isa_base_stringsISA_BASE_LA64V110 correctly to fix the ICE. -The warning itself will be fixed later. - -gcc/ChangeLog: - - * config/loongarch/genopts/loongarch-strings: - (STR_ISA_BASE_LA64V110): Add. - * config/loongarch/genopts/loongarch.opt.in: - (ISA_BASE_LA64V110): Add. - * config/loongarch/loongarch-def.c - (loongarch_isa_base_strings): Initialize ISA_BASE_LA64V110 - to STR_ISA_BASE_LA64V110. - * config/loongarch/loongarch.opt: Regenerate. - * config/loongarch/loongarch-str.h: Regenerate. 
- -Signed-off-by: ticat_fp <fanpeng@loongson.cn> ---- - gcc/config/loongarch/genopts/loongarch-strings | 1 + - gcc/config/loongarch/genopts/loongarch.opt.in | 3 +++ - gcc/config/loongarch/loongarch-def.c | 1 + - gcc/config/loongarch/loongarch-str.h | 1 + - gcc/config/loongarch/loongarch.opt | 3 +++ - 5 files changed, 9 insertions(+) - -diff --git a/gcc/config/loongarch/genopts/loongarch-strings b/gcc/config/loongarch/genopts/loongarch-strings -index 7bc4824007e..b2070c83ed0 100644 ---- a/gcc/config/loongarch/genopts/loongarch-strings -+++ b/gcc/config/loongarch/genopts/loongarch-strings -@@ -30,6 +30,7 @@ STR_CPU_LA664 la664 - - # Base architecture - STR_ISA_BASE_LA64V100 la64 -+STR_ISA_BASE_LA64V110 la64v1.1 - - # -mfpu - OPTSTR_ISA_EXT_FPU fpu -diff --git a/gcc/config/loongarch/genopts/loongarch.opt.in b/gcc/config/loongarch/genopts/loongarch.opt.in -index 00b4733d75b..b274b3fb21e 100644 ---- a/gcc/config/loongarch/genopts/loongarch.opt.in -+++ b/gcc/config/loongarch/genopts/loongarch.opt.in -@@ -32,6 +32,9 @@ Basic ISAs of LoongArch: - EnumValue - Enum(isa_base) String(@@STR_ISA_BASE_LA64V100@@) Value(ISA_BASE_LA64V100) - -+EnumValue -+Enum(isa_base) String(@@STR_ISA_BASE_LA64V110@@) Value(ISA_BASE_LA64V110) -+ - ;; ISA extensions / adjustments - Enum - Name(isa_ext_fpu) Type(int) -diff --git a/gcc/config/loongarch/loongarch-def.c b/gcc/config/loongarch/loongarch-def.c -index 067629141b6..f22d488acb2 100644 ---- a/gcc/config/loongarch/loongarch-def.c -+++ b/gcc/config/loongarch/loongarch-def.c -@@ -165,6 +165,7 @@ loongarch_cpu_multipass_dfa_lookaheadN_TUNE_TYPES = { - const char* - loongarch_isa_base_stringsN_ISA_BASE_TYPES = { - ISA_BASE_LA64V100 = STR_ISA_BASE_LA64V100, -+ ISA_BASE_LA64V110 = STR_ISA_BASE_LA64V110, - }; - - const char* -diff --git a/gcc/config/loongarch/loongarch-str.h b/gcc/config/loongarch/loongarch-str.h -index fc4f41bfc1e..114dbc692d7 100644 ---- a/gcc/config/loongarch/loongarch-str.h -+++ b/gcc/config/loongarch/loongarch-str.h -@@ -33,6 
+33,7 @@ along with GCC; see the file COPYING3. If not see - #define STR_CPU_LA664 "la664" - - #define STR_ISA_BASE_LA64V100 "la64" -+#define STR_ISA_BASE_LA64V110 "la64v1.1" - - #define OPTSTR_ISA_EXT_FPU "fpu" - #define STR_NONE "none" -diff --git a/gcc/config/loongarch/loongarch.opt b/gcc/config/loongarch/loongarch.opt -index 7f129e53ba5..350ca30d232 100644 ---- a/gcc/config/loongarch/loongarch.opt -+++ b/gcc/config/loongarch/loongarch.opt -@@ -39,6 +39,9 @@ Basic ISAs of LoongArch: - EnumValue - Enum(isa_base) String(la64) Value(ISA_BASE_LA64V100) - -+EnumValue -+Enum(isa_base) String(la64v1.1) Value(ISA_BASE_LA64V110) -+ - ;; ISA extensions / adjustments - Enum - Name(isa_ext_fpu) Type(int) --- -2.33.0 -
View file
_service:tar_scm:LoongArch-Fix-lsx-vshuf.c-and-lasx-xvshuf_b.c-tests-.patch
Deleted
@@ -1,907 +0,0 @@ -From 40366b89e9c8e727af70ecf7007cba6c51e4b7d2 Mon Sep 17 00:00:00 2001 -From: Jiahao Xu <xujiahao@loongson.cn> -Date: Wed, 29 Nov 2023 11:16:59 +0800 -Subject: PATCH LoongArch: Fix lsx-vshuf.c and lasx-xvshuf_b.c tests fail on - LA664 PR112611 - -For xvshuf instructions, if the index value in the selector exceeds 63, it triggers -undefined behavior on LA464, but not on LA664. To ensure compatibility of these two -tests on both LA464 and LA664, we have modified both tests to ensure that the index -value in the selector does not exceed 63. - -gcc/testsuite/ChangeLog: - - PR target/112611 - * gcc.target/loongarch/vector/lasx/lasx-xvshuf_b.c: Sure index less than 64. - * gcc.target/loongarch/vector/lsx/lsx-vshuf.c: Ditto. - -Signed-off-by: ticat_fp <fanpeng@loongson.cn> ---- - .../loongarch/vector/lasx/lasx-xvshuf_b.c | 343 ++++++------------ - .../loongarch/vector/lsx/lsx-vshuf.c | 162 +++------ - 2 files changed, 164 insertions(+), 341 deletions(-) - -diff --git a/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-xvshuf_b.c b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-xvshuf_b.c -index d8a29dbd225..b8ab387118a 100644 ---- a/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-xvshuf_b.c -+++ b/gcc/testsuite/gcc.target/loongarch/vector/lasx/lasx-xvshuf_b.c -@@ -43,9 +43,9 @@ main () - *((unsigned long *)&__m256i_op11) = 0xfffffefefffffefe; - *((unsigned long *)&__m256i_op10) = 0xfffffefefffffefe; - *((unsigned long *)&__m256i_op23) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op22) = 0xfffffff8fffffff8; -+ *((unsigned long *)&__m256i_op22) = 0x3f3f3f383f3f3f38; - *((unsigned long *)&__m256i_op21) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op20) = 0xfffffff8fc000000; -+ *((unsigned long *)&__m256i_op20) = 0x3f3f3f383c000000; - *((unsigned long *)&__m256i_result3) = 0xfafafafafafafafa; - *((unsigned long *)&__m256i_result2) = 0x0000000000000000; - *((unsigned long *)&__m256i_result1) = 0xfefefefefefefefe; -@@ -137,33 
+137,14 @@ main () - *((unsigned long *)&__m256i_op12) = 0xffffffffffffffff; - *((unsigned long *)&__m256i_op11) = 0x0000000000000000; - *((unsigned long *)&__m256i_op10) = 0xffffffffffffffff; -- *((unsigned long *)&__m256i_op23) = 0x0000ffffffffffff; -- *((unsigned long *)&__m256i_op22) = 0x0000ffff0000ffff; -- *((unsigned long *)&__m256i_op21) = 0x0000ffffffffffff; -- *((unsigned long *)&__m256i_op20) = 0x0000ffff0000ffff; -+ *((unsigned long *)&__m256i_op23) = 0x0000111111111111; -+ *((unsigned long *)&__m256i_op22) = 0x0000222200002222; -+ *((unsigned long *)&__m256i_op21) = 0x0000111111111111; -+ *((unsigned long *)&__m256i_op20) = 0x0000222200002222; - *((unsigned long *)&__m256i_result3) = 0xffff000000000000; -- *((unsigned long *)&__m256i_result2) = 0xffff0000ffff0000; -+ *((unsigned long *)&__m256i_result2) = 0xffffffffffffffff; - *((unsigned long *)&__m256i_result1) = 0xffff000000000000; -- *((unsigned long *)&__m256i_result0) = 0xffff0000ffff0000; -- __m256i_out = __lasx_xvshuf_b (__m256i_op0, __m256i_op1, __m256i_op2); -- ASSERTEQ_64 (__LINE__, __m256i_result, __m256i_out); -- -- *((unsigned long *)&__m256i_op03) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op02) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op01) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op00) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op13) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op12) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op11) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op10) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op23) = 0x000000000000ffff; -- *((unsigned long *)&__m256i_op22) = 0x000000000000ffff; -- *((unsigned long *)&__m256i_op21) = 0x000000000000ffff; -- *((unsigned long *)&__m256i_op20) = 0x000000000000ffff; -- *((unsigned long *)&__m256i_result3) = 0x0000000000000000; -- *((unsigned long *)&__m256i_result2) = 0x0000000000000000; -- *((unsigned long *)&__m256i_result1) = 0x0000000000000000; 
-- *((unsigned long *)&__m256i_result0) = 0x0000000000000000; -+ *((unsigned long *)&__m256i_result0) = 0xffffffffffffffff; - __m256i_out = __lasx_xvshuf_b (__m256i_op0, __m256i_op1, __m256i_op2); - ASSERTEQ_64 (__LINE__, __m256i_result, __m256i_out); - -@@ -176,7 +157,7 @@ main () - *((unsigned long *)&__m256i_op11) = 0x0000000000000000; - *((unsigned long *)&__m256i_op10) = 0x0000000000000000; - *((unsigned long *)&__m256i_op23) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op22) = 0x0000000000077fff; -+ *((unsigned long *)&__m256i_op22) = 0x0000000000032f1f; - *((unsigned long *)&__m256i_op21) = 0x0000000000000000; - *((unsigned long *)&__m256i_op20) = 0x0000000000000000; - *((unsigned long *)&__m256i_result3) = 0xffffffffffffffff; -@@ -186,9 +167,9 @@ main () - __m256i_out = __lasx_xvshuf_b (__m256i_op0, __m256i_op1, __m256i_op2); - ASSERTEQ_64 (__LINE__, __m256i_result, __m256i_out); - -- *((unsigned long *)&__m256i_op03) = 0xfffffffffffffefe; -- *((unsigned long *)&__m256i_op02) = 0x0000000000000101; -- *((unsigned long *)&__m256i_op01) = 0xfffffffffffffefe; -+ *((unsigned long *)&__m256i_op03) = 0x0011001100110011; -+ *((unsigned long *)&__m256i_op02) = 0x0000000000000001; -+ *((unsigned long *)&__m256i_op01) = 0x0011001100110011; - *((unsigned long *)&__m256i_op00) = 0x0000000000000101; - *((unsigned long *)&__m256i_op13) = 0xffffffffffffffff; - *((unsigned long *)&__m256i_op12) = 0x67eee33567eee435; -@@ -198,35 +179,16 @@ main () - *((unsigned long *)&__m256i_op22) = 0xffffffffffffffff; - *((unsigned long *)&__m256i_op21) = 0x00000000ffffffff; - *((unsigned long *)&__m256i_op20) = 0xffffffffffffffff; -- *((unsigned long *)&__m256i_result3) = 0x0000000000000000; -+ *((unsigned long *)&__m256i_result3) = 0xffffffffffffffff; - *((unsigned long *)&__m256i_result2) = 0xffffffffffffffff; -- *((unsigned long *)&__m256i_result1) = 0x0000000000000000; -+ *((unsigned long *)&__m256i_result1) = 0xffffffffffffffff; - *((unsigned long *)&__m256i_result0) = 
0xffffffffffffffff; - __m256i_out = __lasx_xvshuf_h (__m256i_op0, __m256i_op1, __m256i_op2); - ASSERTEQ_64 (__LINE__, __m256i_result, __m256i_out); - -- *((unsigned long *)&__m256i_op03) = 0x0000000000000000; -+ *((unsigned long *)&__m256i_op03) = 0x0022002200000000; - *((unsigned long *)&__m256i_op02) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op01) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op00) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op13) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op12) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op11) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op10) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op23) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op22) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op21) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op20) = 0x0000000000000000; -- *((unsigned long *)&__m256i_result3) = 0x0000000000000000; -- *((unsigned long *)&__m256i_result2) = 0x0000000000000000; -- *((unsigned long *)&__m256i_result1) = 0x0000000000000000; -- *((unsigned long *)&__m256i_result0) = 0x0000000000000000; -- __m256i_out = __lasx_xvshuf_h (__m256i_op0, __m256i_op1, __m256i_op2); -- ASSERTEQ_64 (__LINE__, __m256i_result, __m256i_out); -- -- *((unsigned long *)&__m256i_op03) = 0xffffffff80000000; -- *((unsigned long *)&__m256i_op02) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op01) = 0xffffffff80000000; -+ *((unsigned long *)&__m256i_op01) = 0x001f001f00000000; - *((unsigned long *)&__m256i_op00) = 0x0000000000000000; - *((unsigned long *)&__m256i_op13) = 0xffffffffffffffff; - *((unsigned long *)&__m256i_op12) = 0xffffffffffffffff; -@@ -243,10 +205,10 @@ main () - __m256i_out = __lasx_xvshuf_h (__m256i_op0, __m256i_op1, __m256i_op2); - ASSERTEQ_64 (__LINE__, __m256i_result, __m256i_out); - -- *((unsigned long *)&__m256i_op03) = 0xffffffffffffffff; -- *((unsigned long *)&__m256i_op02) = 0xffffffffffffffff; -- 
*((unsigned long *)&__m256i_op01) = 0xffffffffffffffff; -- *((unsigned long *)&__m256i_op00) = 0xffffffffffffffff; -+ *((unsigned long *)&__m256i_op03) = 0x0011001100110011; -+ *((unsigned long *)&__m256i_op02) = 0x0011001100110011; -+ *((unsigned long *)&__m256i_op01) = 0x0011001100110011; -+ *((unsigned long *)&__m256i_op00) = 0x0011001100110011; - *((unsigned long *)&__m256i_op13) = 0xffffffffffffffff; - *((unsigned long *)&__m256i_op12) = 0xffffffffffffffff; - *((unsigned long *)&__m256i_op11) = 0xffffffffffffffff; -@@ -255,17 +217,17 @@ main () - *((unsigned long *)&__m256i_op22) = 0xffffffffffffffff; - *((unsigned long *)&__m256i_op21) = 0xffffffffffffffff; - *((unsigned long *)&__m256i_op20) = 0xffffffffffffffff; -- *((unsigned long *)&__m256i_result3) = 0x0000000000000000; -- *((unsigned long *)&__m256i_result2) = 0x0000000000000000; -- *((unsigned long *)&__m256i_result1) = 0x0000000000000000; -- *((unsigned long *)&__m256i_result0) = 0x0000000000000000; -+ *((unsigned long *)&__m256i_result3) = 0xffffffffffffffff; -+ *((unsigned long *)&__m256i_result2) = 0xffffffffffffffff; -+ *((unsigned long *)&__m256i_result1) = 0xffffffffffffffff; -+ *((unsigned long *)&__m256i_result0) = 0xffffffffffffffff; - __m256i_out = __lasx_xvshuf_h (__m256i_op0, __m256i_op1, __m256i_op2); - ASSERTEQ_64 (__LINE__, __m256i_result, __m256i_out); - -- *((unsigned long *)&__m256i_op03) = 0xffffffffffffffff; -- *((unsigned long *)&__m256i_op02) = 0xffffffffffffffff; -- *((unsigned long *)&__m256i_op01) = 0xffffffffffffffff; -- *((unsigned long *)&__m256i_op00) = 0xffffffffffffffff; -+ *((unsigned long *)&__m256i_op03) = 0x003f003f003f003f; -+ *((unsigned long *)&__m256i_op02) = 0x003f003f003f003f; -+ *((unsigned long *)&__m256i_op01) = 0x003f003f003f003f; -+ *((unsigned long *)&__m256i_op00) = 0x003f003f003f003f; - *((unsigned long *)&__m256i_op13) = 0xefdfefdf00000000; - *((unsigned long *)&__m256i_op12) = 0xefdfefdfefdfefdf; - *((unsigned long *)&__m256i_op11) = 
0xefdfefdf00000000; -@@ -274,36 +236,17 @@ main () - *((unsigned long *)&__m256i_op22) = 0xffffffffffffffff; - *((unsigned long *)&__m256i_op21) = 0xffffffffffffffff; - *((unsigned long *)&__m256i_op20) = 0xffffffffffffffff; -- *((unsigned long *)&__m256i_result3) = 0x0000000000000000; -- *((unsigned long *)&__m256i_result2) = 0x0000000000000000; -- *((unsigned long *)&__m256i_result1) = 0x0000000000000000; -- *((unsigned long *)&__m256i_result0) = 0x0000000000000000; -- __m256i_out = __lasx_xvshuf_h (__m256i_op0, __m256i_op1, __m256i_op2); -- ASSERTEQ_64 (__LINE__, __m256i_result, __m256i_out); -- -- *((unsigned long *)&__m256i_op03) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op02) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op01) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op00) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op13) = 0x0000000000000000; -- *((unsigned long *)&__m256i_op12) = 0x0000000000000000;
View file
_service:tar_scm:LoongArch-Use-finer-grained-DBAR-hints.patch
Deleted
@@ -1,137 +0,0 @@ -From 4a70bfbf686c2b6a1ecd83fe851de826c612c3e0 Mon Sep 17 00:00:00 2001 -From: Xi Ruoyao <xry111@xry111.site> -Date: Tue, 14 Nov 2023 05:32:38 +0800 -Subject: PATCH LoongArch: Use finer-grained DBAR hints - -LA664 defines DBAR hints 0x1 - 0x1f (except 0xf and 0x1f) as follows 1-2: - -- Bit 4: kind of constraint (0: completion, 1: ordering) -- Bit 3: barrier for previous read (0: true, 1: false) -- Bit 2: barrier for previous write (0: true, 1: false) -- Bit 1: barrier for succeeding read (0: true, 1: false) -- Bit 0: barrier for succeeding write (0: true, 1: false) - -LLVM has already utilized them for different memory orders 3: - -- Bit 4 is always set to one because it's only intended to be zero for - things like MMIO devices, which are out of the scope of memory orders. -- An acquire barrier is used to implement acquire loads like - - ld.d $a1, $t0, 0 - dbar acquire_hint - - where the load operation (ld.d) should not be reordered with any load - or store operation after the acquire load. To accomplish this - constraint, we need to prevent the load operation from being reordered - after the barrier, and also prevent any following load/store operation - from being reordered before the barrier. Thus bits 0, 1, and 3 must - be zero, and bit 2 can be one, so acquire_hint should be 0b10100. -- An release barrier is used to implement release stores like - - dbar release_hint - st.d $a1, $t0, 0 - - where the store operation (st.d) should not be reordered with any load - or store operation before the release store. So we need to prevent - the store operation from being reordered before the barrier, and also - prevent any preceding load/store operation from being reordered after - the barrier. So bits 0, 2, 3 must be zero, and bit 1 can be one. So - release_hint should be 0b10010. - -A similar mapping has been utilized for RISC-V GCC 4, LoongArch Linux -kernel 1, and LoongArch LLVM 3. So the mapping should be correct. 
-And I've also bootstrapped & regtested GCC on a LA664 with this patch. - -The LoongArch CPUs should treat "unknown" hints as dbar 0, so we can -unconditionally emit the new hints without a compiler switch. - -1: https://git.kernel.org/torvalds/c/e031a5f3f1ed -2: https://github.com/loongson-community/docs/pull/12 -3: https://github.com/llvm/llvm-project/pull/68787 -4: https://gcc.gnu.org/r14-406 - -gcc/ChangeLog: - - * config/loongarch/sync.md (mem_thread_fence): Remove redundant - check. - (mem_thread_fence_1): Emit finer-grained DBAR hints for - different memory models, instead of 0. - -Signed-off-by: ticat_fp <fanpeng@loongson.cn> ---- - gcc/config/loongarch/sync.md | 51 +++++++++++++++++++++++++++++------- - 1 file changed, 42 insertions(+), 9 deletions(-) - -diff --git a/gcc/config/loongarch/sync.md b/gcc/config/loongarch/sync.md -index 9924d522bcd..1ad0c63e0d9 100644 ---- a/gcc/config/loongarch/sync.md -+++ b/gcc/config/loongarch/sync.md -@@ -50,23 +50,56 @@ - (match_operand:SI 0 "const_int_operand" "") ;; model - "" - { -- if (INTVAL (operands0) != MEMMODEL_RELAXED) -- { -- rtx mem = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)); -- MEM_VOLATILE_P (mem) = 1; -- emit_insn (gen_mem_thread_fence_1 (mem, operands0)); -- } -+ rtx mem = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)); -+ MEM_VOLATILE_P (mem) = 1; -+ emit_insn (gen_mem_thread_fence_1 (mem, operands0)); -+ - DONE; - }) - --;; Until the LoongArch memory model (hence its mapping from C++) is finalized, --;; conservatively emit a full FENCE. 
-+;; DBAR hint encoding for LA664 and later micro-architectures, paraphrased from -+;; the Linux patch revealing it 1: -+;; -+;; - Bit 4: kind of constraint (0: completion, 1: ordering) -+;; - Bit 3: barrier for previous read (0: true, 1: false) -+;; - Bit 2: barrier for previous write (0: true, 1: false) -+;; - Bit 1: barrier for succeeding read (0: true, 1: false) -+;; - Bit 0: barrier for succeeding write (0: true, 1: false) -+;; -+;; 1: https://git.kernel.org/torvalds/c/e031a5f3f1ed -+;; -+;; Implementations without support for the finer-granularity hints simply treat -+;; all as the full barrier (DBAR 0), so we can unconditionally start emiting the -+;; more precise hints right away. - (define_insn "mem_thread_fence_1" - (set (match_operand:BLK 0 "" "") - (unspec:BLK (match_dup 0) UNSPEC_MEMORY_BARRIER)) - (match_operand:SI 1 "const_int_operand" "") ;; model - "" -- "dbar\t0") -+ { -+ enum memmodel model = memmodel_base (INTVAL (operands1)); -+ -+ switch (model) -+ { -+ case MEMMODEL_ACQUIRE: -+ return "dbar\t0b10100"; -+ case MEMMODEL_RELEASE: -+ return "dbar\t0b10010"; -+ case MEMMODEL_ACQ_REL: -+ case MEMMODEL_SEQ_CST: -+ return "dbar\t0b10000"; -+ default: -+ /* GCC internal: "For the '__ATOMIC_RELAXED' model no instructions -+ need to be issued and this expansion is not invoked." -+ -+ __atomic builtins doc: "Consume is implemented using the -+ stronger acquire memory order because of a deficiency in C++11's -+ semantics." See PR 59448 and get_memmodel in builtins.cc. -+ -+ Other values should not be returned by memmodel_base. */ -+ gcc_unreachable (); -+ } -+ }) - - ;; Atomic memory operations. - --- -2.33.0 -
Locations
Projects
Search
Status Monitor
Help
Open Build Service
OBS Manuals
API Documentation
OBS Portal
Reporting a Bug
Contact
Mailing List
Forums
Chat (IRC)
Twitter
Open Build Service (OBS)
is an
openSUSE project
.
浙ICP备2022010568号-2