openEuler:24.03:SP1:Everything:64G
llvm
Changes of Revision 2
_service:tar_scm:llvm.spec
Changed
@@ -1,6 +1,12 @@
 %bcond_without sys_llvm
 %bcond_without check
 %bcond_with classic_flang
+%bcond_with toolchain_clang
+%bcond_without bisheng_autotuner
+
+%if %{with toolchain_clang}
+%global toolchain clang
+%endif
 
 %global maj_ver 17
 %global min_ver 0
@@ -38,7 +44,7 @@
 Name: %{pkg_name}
 Version: %{maj_ver}.%{min_ver}.%{patch_ver}
-Release: 11
+Release: 19
 Summary: The Low Level Virtual Machine
 
 License: NCSA
 
@@ -69,13 +75,10 @@
 Patch18: 0018-Fix-declaration-definition-mismatch-for-classic-flang.patch
 Patch19: 0019-Backport-LoongArch-Improve-the-support-for-atomic-and-clear_cache.patch
 Patch20: 0020-Update-llvm-lit-config-to-support-build_for_openeule.patch
-
-Patch21: 0021-Backport-GlobalISel-Don-t-expand-stacksave-stackrestore-in-IRTranslator.patch
-Patch22: 0022-Backport-AArch64-Refactor-allocation-of-locals-and-stack-realignment.patch
-Patch23: 0023-Backport-AArch64-Stack-probing-for-function-prologues.patch
-Patch24: 0024-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-SelectionDAG.patch
-Patch25: 0025-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-GlobalISel.patch
-Patch26: 0026-Update-testcase-for-stack-clash-protection-backport.patch
+Patch21: 0021-Add-BiSheng-Autotuner-support-for-LLVM-compiler.patch
+Patch22: 0022-Prevent-environment-variables-from-exceeding-NAME_MA.patch
+Patch23: 0023-AArch64-Support-HiSilicon-s-HIP09-Processor.patch
+Patch24: 0024-Backport-LoongArch-fix-and-add-some-new-support.patch
 
 BuildRequires: binutils-devel
 BuildRequires: cmake
@@ -92,6 +95,9 @@
 BuildRequires: python3-sphinx
 BuildRequires: python3-setuptools
 BuildRequires: zlib-devel
+%if %{with toolchain_clang}
+BuildRequires: clang
+%endif
 
 Requires: %{name}-libs%{?_isa} = %{version}-%{release}
 
@@ -128,6 +134,8 @@
 Summary: Documentation for LLVM
 BuildArch: noarch
 Requires: %{name} = %{version}-%{release}
+Provides: %{name}-help = %{version}-%{release}
+Obsoletes: %{name}-help < %{version}-%{release}
 
 %description doc
 Documentation for the LLVM compiler infrastructure.
@@ -238,6 +246,13 @@
 %if %{with classic_flang}
     -DLLVM_ENABLE_CLASSIC_FLANG=ON \
 %endif
+%if "%{toolchain}" == "clang"
+    -DCMAKE_C_COMPILER=clang \
+    -DCMAKE_CXX_COMPILER=clang++ \
+%endif
+%if %{with bisheng_autotuner}
+    -DLLVM_ENABLE_AUTOTUNER=ON \
+%endif
     -DLLVM_INCLUDE_BENCHMARKS=OFF
 %ninja_build LLVM
 %ninja_build
@@ -299,7 +314,6 @@
 %files
 %license LICENSE.TXT
-%{install_prefix}/share/man/man1/*
 %{install_bindir}/*
 %exclude %{install_bindir}/not
 %exclude %{install_bindir}/count
@@ -329,6 +343,7 @@
 %files doc
 %license LICENSE.TXT
 %doc %{install_docdir}/html
+%{install_prefix}/share/man/man1/*
 
 %files static
 %license LICENSE.TXT
@@ -360,8 +375,32 @@
 %{install_includedir}/llvm-gmock
 
 %changelog
-* Fri May 10 2024 rickyleung <leung.wing.chung@huawei.com> - 17.0.6-11
-- Backport the patches to support stack clash protection
+* Mon Sep 23 2024 zhanglimin <zhanglimin@loongson.cn> - 17.0.6-19
+- LoongArch Backport some new support
+
+* Thu Sep 12 2024 xiajingze <xiajingze1@huawei.com> - 17.0.6-18
+- AArch64 Support HiSilicon's HIP09 Processor
+
+* Wed Sep 11 2024 hongjinghao <hongjinghao@huawei.com> - 17.0.6-17
+- doc add Provides llvm-help
+
+* Tue Sep 10 2024 hongjinghao <hongjinghao@huawei.com> - 17.0.6-16
+- doc add Obsoletes llvm-help
+
+* Tue Sep 3 2024 hongjinghao <hongjinghao@huawei.com> - 17.0.6-15
+- mv man to doc subpackage
+
+* Mon Jul 22 2024 liyunfei <liyunfei33@huawei.com> - 17.0.6-14
+- Prevent environment variables from exceeding NAME_MAX.
+
+* Mon Jul 22 2024 liyunfei <liyunfei33@huawei.com> - 17.0.6-13
+- Disable toolchain_clang build for BiSheng Autotuner support temporary.
+
+* Tue Jul 16 2024 liyunfei <liyunfei33@huawei.com> - 17.0.6-12
+- Add BiSheng Autotuner support.
+
+* Fri Jul 5 2024 liyunfei <liyunfei33@huawei.com> - 17.0.6-11
+- Add toolchain_clang build support
 
 * Mon Apr 29 2024 wangqiang <wangqiang1@kylinos.cn> - 17.0.6-10
 - Update llvm-lit config to support macro `build_for_openeuler`
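Note on the new conditionals: under standard RPM bcond semantics, `%bcond_with toolchain_clang` leaves the clang-built toolchain off by default (opt in with `rpmbuild --with toolchain_clang`), while `%bcond_without bisheng_autotuner` enables the BiSheng autotuner by default (opt out with `--without bisheng_autotuner`). The 17.0.6-13 changelog entry above records that the clang-toolchain build was deliberately left disabled for the time being.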
_service:tar_scm:0021-Add-BiSheng-Autotuner-support-for-LLVM-compiler.patch
Added
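For orientation before the ~9,900-line patch below: it threads an autotuning::Container interface through core LLVM classes (Loop and MachineBasicBlock directly, and Function, CallBase, and SwitchInst via wrapper objects), so each object can describe itself as a CodeRegion that the tuner enumerates, hashes, and reconfigures. The following minimal, self-contained C++ sketch illustrates that pattern as a reading aid; the type shapes mirror the headers added below, but ToyLoop and all function bodies are illustrative stand-ins, not the patch's actual implementation.

    // Stand-alone miniature of the Container/CodeRegion pattern; the real
    // types live in llvm/include/llvm/AutoTuner/AutoTuning.h (added below).
    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <string>

    namespace autotuning {

    // Subset of the patch's CodeRegionType enum.
    enum CodeRegionType { CallSite, Function, Loop, MachineBasicBlock, Other };

    // Identifies a tunable region: name, enclosing function, region kind.
    struct CodeRegion {
      std::string Name;
      std::string FuncName;
      CodeRegionType Type = Other;
    };

    // Interface that IR classes inherit so the autotuner can enumerate them.
    class Container {
    public:
      virtual ~Container() = default;
      virtual void initCodeRegion() = 0;            // fill in CR for this object
      virtual uint64_t computeStructuralHash() = 0; // stable ID across rebuilds
      const CodeRegion &getCodeRegion() const { return CR; }

    protected:
      CodeRegion CR;
    };

    } // namespace autotuning

    // A toy "loop" registering itself as a code region, the way the patch
    // makes llvm::Loop inherit autotuning::Container. Purely illustrative.
    class ToyLoop : public autotuning::Container {
    public:
      ToyLoop(std::string Header, std::string Func)
          : Header(std::move(Header)), Func(std::move(Func)) {}

      void initCodeRegion() override {
        CR = {Header, Func, autotuning::Loop};
      }
      uint64_t computeStructuralHash() override {
        // Placeholder: the real patch hashes the region's instructions.
        return std::hash<std::string>{}(Header + "::" + Func);
      }

    private:
      std::string Header;
      std::string Func;
    };

    int main() {
      ToyLoop L("for.body", "main");
      L.initCodeRegion();
      std::cout << L.getCodeRegion().Name << " hash=" << L.computeStructuralHash()
                << "\n";
    }

Here initCodeRegion() plays the role the patch gives it on llvm::Loop (populating the region descriptor), and computeStructuralHash() stands in for the patch's IR-based structural hashing, which lets the autotuner re-identify the same region across compilations.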
@@ -0,0 +1,9915 @@ +From a9863e2b6e6783aa9be0b9d1d187084fd4b32a3a Mon Sep 17 00:00:00 2001 +From: Muhammad Asif Manzoor <muhammad.asif.manzoor1@huawei.com> +Date: Thu, 21 Mar 2024 12:50:38 -0400 +Subject: Add BiSheng Autotuner support for LLVM compiler + +Automatic tuning is an automatic iterative process that optimizes a given +program by manipulating compilation options for optimal performance. +BiSheng Autotuner provides a resumable interface for tuning process. BiSheng +Autotuner can tune 1) individual code segments/blocks (fine grain turning) like +loops, callsites, instructions, etc. and 2) entire modules/programs (coarse +grain tuning) for compiler flags, pass ordering, etc. +This patch enables LLVM compiler to extract tuneable code regions and then apply +suggested configuration (by Autotuner) to find out the optimal configurations. +--- + llvm/cmake/modules/CrossCompile.cmake | 1 + + llvm/cmake/modules/HandleLLVMOptions.cmake | 8 + + llvm/include/llvm/Analysis/AutotuningDump.h | 75 ++ + llvm/include/llvm/Analysis/LoopInfo.h | 13 + + llvm/include/llvm/Analysis/Passes.h | 10 + + llvm/include/llvm/AutoTuner/AutoTuning.h | 486 ++++++++++++ + .../llvm/AutoTuner/AutoTuningRemarkManager.h | 43 ++ + .../llvm/AutoTuner/AutoTuningRemarkStreamer.h | 47 ++ + llvm/include/llvm/CodeGen/MachineBasicBlock.h | 13 + + llvm/include/llvm/IR/Function.h | 37 + + llvm/include/llvm/IR/InstrTypes.h | 24 + + llvm/include/llvm/IR/Instructions.h | 24 + + llvm/include/llvm/IR/Module.h | 3 + + llvm/include/llvm/IR/StructuralHash.h | 14 + + llvm/include/llvm/InitializePasses.h | 5 + + llvm/include/llvm/LinkAllPasses.h | 8 + + llvm/include/llvm/Remarks/Remark.h | 32 + + llvm/include/llvm/Support/CommandLine.h | 17 + + llvm/include/llvm/Transforms/Scalar.h | 17 + + .../Transforms/Scalar/AutoTuningCompile.h | 170 +++++ + .../llvm/Transforms/Utils/UnrollLoop.h | 4 + + llvm/lib/Analysis/AutotuningDump.cpp | 265 +++++++ + llvm/lib/Analysis/CMakeLists.txt | 2 + + llvm/lib/Analysis/InlineAdvisor.cpp | 18 + + llvm/lib/Analysis/InlineCost.cpp | 29 + + llvm/lib/Analysis/LoopInfo.cpp | 52 ++ + llvm/lib/AutoTuner/AutoTuning.cpp | 705 ++++++++++++++++++ + .../lib/AutoTuner/AutoTuningRemarkManager.cpp | 299 ++++++++ + .../AutoTuner/AutoTuningRemarkStreamer.cpp | 55 ++ + llvm/lib/AutoTuner/CMakeLists.txt | 11 + + llvm/lib/CMakeLists.txt | 1 + + llvm/lib/CodeGen/CMakeLists.txt | 1 + + llvm/lib/CodeGen/CalcSpillWeights.cpp | 30 + + llvm/lib/CodeGen/MachineBasicBlock.cpp | 36 + + llvm/lib/CodeGen/MachineScheduler.cpp | 44 ++ + llvm/lib/CodeGen/SwitchLoweringUtils.cpp | 19 + + llvm/lib/IR/AsmWriter.cpp | 151 ++++ + llvm/lib/IR/CMakeLists.txt | 1 + + llvm/lib/IR/Function.cpp | 34 + + llvm/lib/IR/Instructions.cpp | 86 +++ + llvm/lib/IR/StructuralHash.cpp | 114 +++ + llvm/lib/Passes/PassBuilder.cpp | 5 + + llvm/lib/Passes/PassBuilderPipelines.cpp | 46 ++ + llvm/lib/Passes/PassRegistry.def | 13 + + llvm/lib/Passes/StandardInstrumentations.cpp | 23 + + .../lib/Remarks/BitstreamRemarkSerializer.cpp | 8 + + llvm/lib/Remarks/RemarkStreamer.cpp | 4 + + llvm/lib/Remarks/YAMLRemarkParser.cpp | 122 +++ + llvm/lib/Remarks/YAMLRemarkParser.h | 6 + + llvm/lib/Remarks/YAMLRemarkSerializer.cpp | 84 +++ + llvm/lib/Support/CommandLine.cpp | 41 + + llvm/lib/Transforms/IPO/CMakeLists.txt | 1 + + llvm/lib/Transforms/IPO/Inliner.cpp | 36 + + llvm/lib/Transforms/IPO/SampleProfile.cpp | 14 + + .../Transforms/Instrumentation/CMakeLists.txt | 1 + + .../Instrumentation/PGOInstrumentation.cpp | 8 + + .../Transforms/Scalar/AutoTuningCompile.cpp | 334 +++++++++ 
+ llvm/lib/Transforms/Scalar/CMakeLists.txt | 2 + + llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 187 +++++ + llvm/lib/Transforms/Scalar/Scalar.cpp | 4 + + llvm/lib/Transforms/Scalar/Sink.cpp | 5 + + llvm/lib/Transforms/Utils/CMakeLists.txt | 1 + + llvm/lib/Transforms/Utils/LCSSA.cpp | 5 + + llvm/lib/Transforms/Utils/LoopSimplify.cpp | 8 + + llvm/lib/Transforms/Utils/LoopUnroll.cpp | 3 + + llvm/lib/Transforms/Vectorize/CMakeLists.txt | 1 + + .../Vectorize/LoopVectorizationLegality.cpp | 12 + + .../Transforms/Vectorize/LoopVectorize.cpp | 34 + + .../Inputs/unroll_template.yaml | 8 + + .../AutotuningDump/create-data-dir.ll | 65 ++ + llvm/test/AutoTuning/AutotuningDump/unroll.ll | 35 + + .../autotune_datadir/baseline_config.yaml | 9 + + .../autotune_datadir/random_config.yaml | 9 + + .../AutoTuning/BaselineConfig/Inputs/test.ll | 117 +++ + .../BaselineConfig/apply_baseline_config.ll | 11 + + llvm/test/AutoTuning/BaselineConfig/opp.ll | 67 ++ + .../CodeRegionFilter/function-filtering.ll | 62 ++ + .../Error/Inputs/invalid-format.yaml | 3 + + .../AutoTuning/Error/Inputs/template.yaml | 10 + + .../AutoTuning/Error/file-not-found-error.ll | 29 + + .../AutoTuning/Error/invalid-yaml-error.ll | 27 + + .../AutoTuning/Error/malformed-input-error.ll | 136 ++++ + llvm/test/AutoTuning/Error/output-error.ll | 28 + + llvm/test/AutoTuning/Error/valid-input.ll | 27 + + .../Inputs/template.yaml | 9 + + .../inc-compile-parse-input.ll | 103 +++ + .../AutoTuning/Inline/Inputs/template.yaml | 9 + + .../Inline/Inputs/template_no_metadata.yaml | 7 + + .../test/AutoTuning/Inline/duplicate-calls.ll | 96 +++ + llvm/test/AutoTuning/Inline/force-inline.ll | 84 +++ + .../AutoTuning/Inline/inline-attribute.ll | 85 +++ + llvm/test/AutoTuning/Inline/opp.ll | 64 ++ + .../LoopUnroll/Inputs/debug_loc_template.yaml | 10 + + .../LoopUnroll/Inputs/loop_nest.yaml | 10 + + .../LoopUnroll/Inputs/loop_peel.yaml | 9 + + .../Inputs/unroll_raw_template.yaml | 10 + + .../LoopUnroll/Inputs/unroll_template.yaml | 10 + + .../Inputs/unroll_template_no_metadata.yaml | 8 + + llvm/test/AutoTuning/LoopUnroll/debug_loc.ll | 161 ++++ + .../AutoTuning/LoopUnroll/dynamic_config.ll | 56 ++ + llvm/test/AutoTuning/LoopUnroll/loop_nest.ll | 136 ++++ + llvm/test/AutoTuning/LoopUnroll/loop_peel.ll | 53 ++ + .../AutoTuning/LoopUnroll/unroll-pragma.ll | 129 ++++ + llvm/test/AutoTuning/LoopUnroll/unroll.ll | 101 +++ + llvm/test/AutoTuning/LoopUnroll/unroll_raw.ll | 113 +++ + .../Inputs/vectorize_template.yaml | 9 + + .../vectorize_template_no_metadata.yaml | 7 + + .../LoopVectorize/force-vector-interleave.ll | 88 +++ + .../Inputs/misched_x86_template.yaml | 10 + + .../misched_x86_bidirectional.ll | 73 ++ + .../MachineScheduler/misched_x86_bottomup.ll | 72 ++ + .../MachineScheduler/misched_x86_topdown.ll | 72 ++ + .../AutoTuning/MetaData/structural_hash.ll | 234 ++++++ + .../AutoTuning/MetaData/write_no_metadata.ll | 191 +++++ + .../MetaData/write_with_metadata.ll | 204 +++++ + .../AutoTuning/PGO/Inputs/pgo-instr.proftext | 17 + + .../PGO/Inputs/pgo-sample-cold.prof | 7 + + .../AutoTuning/PGO/Inputs/pgo-sample-hot.prof | 7 + + llvm/test/AutoTuning/PGO/pgo-instr-filters.ll | 61 ++ + .../test/AutoTuning/PGO/pgo-sample-filters.ll | 138 ++++ + .../Inputs/pass_invocation.yaml | 10 + + .../PassInvocation/pass_invocation_read.ll | 64 ++ + .../PassInvocation/pass_invocation_write.ll | 67 ++ + .../PhaseOrdering/Inputs/template.yaml | 8 + + .../AutoTuning/PhaseOrdering/pass-order.ll | 65 ++ + .../AutoTuning/SwitchLowering/switch-opp.ll | 47 ++ + 
llvm/test/AutoTuning/lit.local.cfg | 2 + + llvm/test/AutoTuning/opt-opp.ll | 315 ++++++++ + llvm/test/lit.site.cfg.py.in | 1 + + llvm/tools/llc/llc.cpp | 19 + + llvm/tools/opt/NewPMDriver.cpp | 42 ++ + llvm/tools/opt/opt.cpp | 53 ++ + 132 files changed, 7801 insertions(+) + create mode 100644 llvm/include/llvm/Analysis/AutotuningDump.h + create mode 100644 llvm/include/llvm/AutoTuner/AutoTuning.h + create mode 100644 llvm/include/llvm/AutoTuner/AutoTuningRemarkManager.h + create mode 100644 llvm/include/llvm/AutoTuner/AutoTuningRemarkStreamer.h + create mode 100644 llvm/include/llvm/Transforms/Scalar/AutoTuningCompile.h + create mode 100644 llvm/lib/Analysis/AutotuningDump.cpp + create mode 100644 llvm/lib/AutoTuner/AutoTuning.cpp + create mode 100644 llvm/lib/AutoTuner/AutoTuningRemarkManager.cpp + create mode 100644 llvm/lib/AutoTuner/AutoTuningRemarkStreamer.cpp + create mode 100644 llvm/lib/AutoTuner/CMakeLists.txt + create mode 100644 llvm/lib/Transforms/Scalar/AutoTuningCompile.cpp + create mode 100644 llvm/test/AutoTuning/AutotuningDump/Inputs/unroll_template.yaml + create mode 100644 llvm/test/AutoTuning/AutotuningDump/create-data-dir.ll + create mode 100644 llvm/test/AutoTuning/AutotuningDump/unroll.ll + create mode 100644 llvm/test/AutoTuning/BaselineConfig/Inputs/autotune_datadir/baseline_config.yaml + create mode 100644 llvm/test/AutoTuning/BaselineConfig/Inputs/autotune_datadir/random_config.yaml + create mode 100644 llvm/test/AutoTuning/BaselineConfig/Inputs/test.ll + create mode 100644 llvm/test/AutoTuning/BaselineConfig/apply_baseline_config.ll + create mode 100644 llvm/test/AutoTuning/BaselineConfig/opp.ll + create mode 100644 llvm/test/AutoTuning/CodeRegionFilter/function-filtering.ll + create mode 100644 llvm/test/AutoTuning/Error/Inputs/invalid-format.yaml + create mode 100644 llvm/test/AutoTuning/Error/Inputs/template.yaml + create mode 100644 llvm/test/AutoTuning/Error/file-not-found-error.ll + create mode 100644 llvm/test/AutoTuning/Error/invalid-yaml-error.ll + create mode 100644 llvm/test/AutoTuning/Error/malformed-input-error.ll + create mode 100644 llvm/test/AutoTuning/Error/output-error.ll + create mode 100644 llvm/test/AutoTuning/Error/valid-input.ll + create mode 100644 llvm/test/AutoTuning/IncrementalCompilation/Inputs/template.yaml + create mode 100644 llvm/test/AutoTuning/IncrementalCompilation/inc-compile-parse-input.ll + create mode 100644 llvm/test/AutoTuning/Inline/Inputs/template.yaml + create mode 100644 llvm/test/AutoTuning/Inline/Inputs/template_no_metadata.yaml + create mode 100644 llvm/test/AutoTuning/Inline/duplicate-calls.ll + create mode 100644 llvm/test/AutoTuning/Inline/force-inline.ll + create mode 100644 llvm/test/AutoTuning/Inline/inline-attribute.ll + create mode 100644 llvm/test/AutoTuning/Inline/opp.ll + create mode 100644 llvm/test/AutoTuning/LoopUnroll/Inputs/debug_loc_template.yaml + create mode 100644 llvm/test/AutoTuning/LoopUnroll/Inputs/loop_nest.yaml + create mode 100644 llvm/test/AutoTuning/LoopUnroll/Inputs/loop_peel.yaml + create mode 100644 llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_raw_template.yaml + create mode 100644 llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_template.yaml + create mode 100644 llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_template_no_metadata.yaml + create mode 100644 llvm/test/AutoTuning/LoopUnroll/debug_loc.ll + create mode 100644 llvm/test/AutoTuning/LoopUnroll/dynamic_config.ll + create mode 100644 llvm/test/AutoTuning/LoopUnroll/loop_nest.ll + create mode 100644 
llvm/test/AutoTuning/LoopUnroll/loop_peel.ll + create mode 100644 llvm/test/AutoTuning/LoopUnroll/unroll-pragma.ll + create mode 100644 llvm/test/AutoTuning/LoopUnroll/unroll.ll + create mode 100644 llvm/test/AutoTuning/LoopUnroll/unroll_raw.ll + create mode 100644 llvm/test/AutoTuning/LoopVectorize/Inputs/vectorize_template.yaml + create mode 100644 llvm/test/AutoTuning/LoopVectorize/Inputs/vectorize_template_no_metadata.yaml + create mode 100644 llvm/test/AutoTuning/LoopVectorize/force-vector-interleave.ll + create mode 100644 llvm/test/AutoTuning/MachineScheduler/Inputs/misched_x86_template.yaml + create mode 100644 llvm/test/AutoTuning/MachineScheduler/misched_x86_bidirectional.ll + create mode 100644 llvm/test/AutoTuning/MachineScheduler/misched_x86_bottomup.ll + create mode 100644 llvm/test/AutoTuning/MachineScheduler/misched_x86_topdown.ll + create mode 100644 llvm/test/AutoTuning/MetaData/structural_hash.ll + create mode 100644 llvm/test/AutoTuning/MetaData/write_no_metadata.ll + create mode 100644 llvm/test/AutoTuning/MetaData/write_with_metadata.ll + create mode 100644 llvm/test/AutoTuning/PGO/Inputs/pgo-instr.proftext + create mode 100644 llvm/test/AutoTuning/PGO/Inputs/pgo-sample-cold.prof + create mode 100644 llvm/test/AutoTuning/PGO/Inputs/pgo-sample-hot.prof + create mode 100644 llvm/test/AutoTuning/PGO/pgo-instr-filters.ll + create mode 100644 llvm/test/AutoTuning/PGO/pgo-sample-filters.ll + create mode 100644 llvm/test/AutoTuning/PassInvocation/Inputs/pass_invocation.yaml + create mode 100644 llvm/test/AutoTuning/PassInvocation/pass_invocation_read.ll + create mode 100644 llvm/test/AutoTuning/PassInvocation/pass_invocation_write.ll + create mode 100644 llvm/test/AutoTuning/PhaseOrdering/Inputs/template.yaml + create mode 100644 llvm/test/AutoTuning/PhaseOrdering/pass-order.ll + create mode 100644 llvm/test/AutoTuning/SwitchLowering/switch-opp.ll + create mode 100644 llvm/test/AutoTuning/lit.local.cfg + create mode 100644 llvm/test/AutoTuning/opt-opp.ll + +diff --git a/llvm/cmake/modules/CrossCompile.cmake b/llvm/cmake/modules/CrossCompile.cmake +index 6af47b51d4c6..1a9fb4b2dddc 100644 +--- a/llvm/cmake/modules/CrossCompile.cmake ++++ b/llvm/cmake/modules/CrossCompile.cmake +@@ -82,6 +82,7 @@ function(llvm_create_cross_target project_name target_name toolchain buildtype) + -DLLVM_ENABLE_PROJECTS="${llvm_enable_projects_arg}" + -DLLVM_EXTERNAL_PROJECTS="${llvm_external_projects_arg}" + -DLLVM_ENABLE_RUNTIMES="${llvm_enable_runtimes_arg}" ++ -DLLVM_ENABLE_AUTOTUNER="${LLVM_ENABLE_AUTOTUNER}" + ${external_project_source_dirs} + -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN="${LLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN}" + -DLLVM_INCLUDE_BENCHMARKS=OFF +diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake +index 62a1a64d37d4..b8e9dbe29d88 100644 +--- a/llvm/cmake/modules/HandleLLVMOptions.cmake ++++ b/llvm/cmake/modules/HandleLLVMOptions.cmake +@@ -112,6 +112,14 @@ else() + set(BUILD_FOR_OPENEULER 0) + endif() + ++option(LLVM_ENABLE_AUTOTUNER "Enable BiSheng Auto-Tuning features" OFF) ++if (LLVM_ENABLE_AUTOTUNER) ++ set(LLVM_ENABLE_AUTOTUNER 1) ++ add_definitions( -DENABLE_AUTOTUNER ) ++else() ++ set(LLVM_ENABLE_AUTOTUNER 0) ++endif() ++ + if(LLVM_ENABLE_EXPENSIVE_CHECKS) + add_compile_definitions(EXPENSIVE_CHECKS) + +diff --git a/llvm/include/llvm/Analysis/AutotuningDump.h b/llvm/include/llvm/Analysis/AutotuningDump.h +new file mode 100644 +index 000000000000..fb973f05323e +--- /dev/null ++++ b/llvm/include/llvm/Analysis/AutotuningDump.h +@@ 
-0,0 +1,75 @@ ++#if defined(ENABLE_AUTOTUNER) ++// ===-- AutotuningDump.h - Auto-Tuning-----------------------------------===// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++// ===--------------------------------------------------------------------===// ++// ++// This file contains pass collecting IR of tuned regions and storing them into ++// predetrmined locations, to be used later by autotuning ML guidance ++// ++// ===--------------------------------------------------------------------===// ++ ++#include "llvm/Analysis/LoopInfo.h" ++#include "llvm/Analysis/LoopPass.h" ++#include "llvm/IR/PassManager.h" ++#include "llvm/Transforms/Scalar/LoopPassManager.h" ++#include <string> ++ ++namespace llvm { ++class AutotuningDump { ++public: ++ AutotuningDump(bool IncrementalCompilation = false); ++ bool run(Module &F, function_ref<LoopInfo &(Function &)> GetLI); ++ ++private: ++ std::string AutoTuneDirPath; ++ std::unique_ptr<raw_ostream> createFile(const Twine &File); ++ int getConfigNumber(); ++ void dumpToStream(llvm::raw_ostream &os, const Loop &L) const; ++ void dumpToStream(llvm::raw_ostream &os, const Function &F) const; ++ void dumpFunctions(llvm::Module &M); ++ void dumpLoops(llvm::Module &M, function_ref<LoopInfo &(Function &)> GetLI); ++ void dumpModule(llvm::Module &M); ++ std::string getDirectoryName(const std::string File) const; ++ std::string getFileName(std::string FilePath); ++ ++ bool IsIncrementalCompilation; ++}; ++ ++class AutotuningDumpLegacy : public ModulePass { ++public: ++ static char ID; ++ AutotuningDumpLegacy(bool IncrementalCompilation = false); ++ StringRef getPassName() const override; ++ bool runOnModule(Module &M) override; ++ void getAnalysisUsage(AnalysisUsage &AU) const override; ++ ++private: ++ bool IsIncrementalCompilation; ++}; ++ ++class AutotuningDumpAnalysis ++ : public AnalysisInfoMixin<AutotuningDumpAnalysis> { ++ friend AnalysisInfoMixin<AutotuningDumpAnalysis>; ++ static AnalysisKey Key; ++ ++public: ++ AutotuningDumpAnalysis(bool IncrementalCompilation = false) { ++ IsIncrementalCompilation = IncrementalCompilation; ++ } ++ ++ // This pass only prints IRs of selected function or loops without doing any ++ // real analyses, thus the return value is meaningless. To avoid leaking data ++ // or memory, we typedef Result to Optional<bool> to avoid having to return an ++ // AutotuningDump object. ++ using Result = std::optional<bool>; ++ Result run(Module &M, ModuleAnalysisManager &AM); ++ ++private: ++ bool IsIncrementalCompilation; ++}; ++} // namespace llvm ++#endif +\ No newline at end of file +diff --git a/llvm/include/llvm/Analysis/LoopInfo.h b/llvm/include/llvm/Analysis/LoopInfo.h +index 3434630c27cf..9be3e056cf76 100644 +--- a/llvm/include/llvm/Analysis/LoopInfo.h ++++ b/llvm/include/llvm/Analysis/LoopInfo.h +@@ -26,6 +26,9 @@ + #include <algorithm> + #include <optional> + #include <utility> ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#endif + + namespace llvm { + +@@ -44,7 +47,12 @@ extern template class LoopBase<BasicBlock, Loop>; + + /// Represents a single loop in the control flow graph. Note that not all SCCs + /// in the CFG are necessarily loops. 
++#if defined(ENABLE_AUTOTUNER) ++class LLVM_EXTERNAL_VISIBILITY Loop : public LoopBase<BasicBlock, Loop>, ++ public autotuning::Container { ++#else + class LLVM_EXTERNAL_VISIBILITY Loop : public LoopBase<BasicBlock, Loop> { ++#endif + public: + /// A range representing the start and end location of a loop. + class LocRange { +@@ -395,6 +403,11 @@ public: + return "<unnamed loop>"; + } + ++#if defined(ENABLE_AUTOTUNER) ++ void initCodeRegion() override; ++ uint64_t computeStructuralHash() override; ++#endif ++ + private: + Loop() = default; + +diff --git a/llvm/include/llvm/Analysis/Passes.h b/llvm/include/llvm/Analysis/Passes.h +index ac1bc3549910..65f566cc75de 100644 +--- a/llvm/include/llvm/Analysis/Passes.h ++++ b/llvm/include/llvm/Analysis/Passes.h +@@ -58,6 +58,16 @@ namespace llvm { + // in a function and builds the region hierarchy. + // + FunctionPass *createRegionInfoPass(); ++ ++#if defined(ENABLE_AUTOTUNER) ++ //===--------------------------------------------------------------------===// ++ // ++ // createAutotuningDumpPass - This pass collects IR of tuned regions ++ // and stores them into predetrmined locations. ++ // for the purpose of autotuning ML guidance ++ // ++ ModulePass *createAutotuningDumpPass(); ++#endif + } + + #endif +diff --git a/llvm/include/llvm/AutoTuner/AutoTuning.h b/llvm/include/llvm/AutoTuner/AutoTuning.h +new file mode 100644 +index 000000000000..0f1f276306ec +--- /dev/null ++++ b/llvm/include/llvm/AutoTuner/AutoTuning.h +@@ -0,0 +1,486 @@ ++#if defined(ENABLE_AUTOTUNER) ++//===-- AutoTuning.h - Auto-Tuning-----------------------------------------===// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// This file defines Auto Tuning related functions, models and interfaces. ++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef LLVM_AUTOTUNER_AUTOTUNING_H_ ++#define LLVM_AUTOTUNER_AUTOTUNING_H_ ++ ++#include "llvm/ADT/DenseMapInfo.h" ++#include "llvm/ADT/Hashing.h" ++#include "llvm/ADT/SetVector.h" ++#include "llvm/ADT/SmallVector.h" ++#include "llvm/IR/DebugInfoMetadata.h" ++#include "llvm/IR/DebugLoc.h" ++#include "llvm/Support/Casting.h" ++#include <map> ++#include <memory> ++#include <string> ++#include <unordered_map> ++#include <unordered_set> ++ ++// Options for AutoTuner incremental compilation. ++enum AutoTuningCompileOpt { ++ Inactive, // Disabled incremental compilation. ++ CoarseGrain, // For tuning LLVMParam. ++ FineGrain, // For tuning default code regions (Loop, CallSite, Function). ++ Basic // Same as CoarseGrain but can be applied for any code region. ++ // Can be used with ImpactRanker. ++}; ++ ++namespace autotuning { ++// Constant defintion for AutoTuner incremental compilation. 
++const std::string CompileOptionStart = "start"; ++const std::string CompileOptionEnd = "end"; ++const std::string CompileOptionUnknow = "unknown"; ++const std::string CompileOptionUnroll = "loop-unroll"; ++const std::string CompileOptionVectorize = "loop-vectorize"; ++const std::string CompileOptionInline = "inline"; ++ ++class ParameterBase { ++public: ++ virtual ~ParameterBase() = default; ++ enum ParameterKind { ++ PK_PARAMETER, ++ }; ++ ParameterKind getKind() const { return Kind; } ++ ++ explicit ParameterBase(ParameterKind K) : Kind(K) {} ++ ++private: ++ const ParameterKind Kind; ++}; ++ ++template <typename T> class Parameter : public ParameterBase { ++public: ++ Parameter(const T &RHS) : ParameterBase(PK_PARAMETER), Value(RHS) {} ++ const T &getValue() const { return Value; } ++ void setValue(const T &RHS) { Value = RHS; } ++ ++ static bool classof(const ParameterBase *P) { ++ return P->getKind() == PK_PARAMETER; ++ } ++ ++private: ++ T Value; ++}; ++ ++/// This class manages parameters of one codeRegion. ++class ParameterManager { ++ ++public: ++ // add a param into this ParameterManager ++ template <typename T> ++ void add(const std::string &ParamName, const T ParamValue) { ++ std::shared_ptr<ParameterBase> Param = ++ std::make_shared<Parameter<T>>(ParamValue); ++ this->ParametersParamName = Param; ++ } ++ ++ // Look up the value of a parameter by name in this ParameterManager. ++ // The found value will be assigned to the reference variable "Value". ++ // Return true if the parameter exits in this ParameterManager, ++ // and false otherwise. ++ template <typename T> ++ bool findByName(const std::string &ParamName, T &Value) const { ++ auto Iterator = Parameters.find(ParamName); ++ if (Iterator == Parameters.end()) { ++ return false; ++ } ++ ++ auto ParamPtr = llvm::dyn_cast<Parameter<T>>(Iterator->second.get()); ++ if (ParamPtr != nullptr) { ++ Value = ParamPtr->getValue(); ++ return true; ++ } else { ++ return false; ++ } ++ } ++ ++private: ++ std::unordered_map<std::string, std::shared_ptr<ParameterBase>> Parameters; ++}; ++ ++/// The debug location used to track a CodeRegion back to the source file. ++struct SourceLocation { ++ /// The source file corresponding to this CodeRegion. ++ std::string SourceFilePath; ++ unsigned SourceLine = 0; ++ unsigned SourceColumn = 0; ++ ++ bool operator==(const SourceLocation &CR) const { ++ return (this->SourceFilePath == CR.SourceFilePath) && ++ (this->SourceLine == CR.SourceLine) && ++ (this->SourceColumn == CR.SourceColumn); ++ }; ++ ++ explicit operator bool() const { ++ return !(SourceFilePath.empty() && SourceLine == 0 && SourceColumn == 0); ++ } ++}; ++ ++enum CodeRegionType { ++ CallSite, // Code region for function inlining. ++ Function, // Used in AutoTuningDump pass for IR writing. ++ LLVMParam, // Compilation flags. Tuned individually for each module. ++ Loop, // Code region for loops. ++ MachineBasicBlock, // Instruction scheduling code region. ++ Other, // Pass ordering code region. ++ ProgramParam, // Compilation flags. Tuned collectively for program. ++ Switch, // Tuning MinJumpTableEntries parameter for switch inst. ++ Empty, // Empty CodeRegion. ++ Invalid // Invalid CodeRegion. ++}; ++ ++enum HotnessType { ++ Unknown, ++ Cold, ++ Hot, ++}; ++ ++/// DynamicOptions represent a map: Arg -> DynamicConfigs. ++/// Where Arg is a tuning parameter on the associated CodeRegion. ++/// And DynamicConfigs is the possible tuning values associated with Arg. 
++typedef std::map<std::string, std::vector<unsigned int>> DynamicOptions; ++ ++/// This class represents a region in source code including ++/// its name, function name, type, debug location, and associated pass name. ++class CodeRegion { ++ ++public: ++ // Default constructor ++ CodeRegion(const CodeRegionType Type = CodeRegionType::Other); ++ ~CodeRegion() = default; ++ // Concrete constructors ++ CodeRegion(const std::string &Name, const std::string &FuncName, ++ const CodeRegionType &Type, const llvm::DebugLoc &DL, ++ const DynamicOptions DO = {}); ++ CodeRegion(const std::string &Name, const std::string &FuncName, ++ const CodeRegionType &Type, ++ const SourceLocation &Location = SourceLocation(), ++ const DynamicOptions DO = {}); ++ CodeRegion(const std::string &Name, const std::string &FuncName, ++ const std::string &PassName, const CodeRegionType &Type, ++ const SourceLocation &Location = SourceLocation(), ++ const unsigned int Invocation = 0); ++ ++ bool operator==(const CodeRegion &CR) const; ++ inline bool operator!=(const CodeRegion &CR) const { return !(*this == CR); }; ++ ++ explicit operator bool() const { ++ return !(Name.empty() && FuncName.empty() && PassName.empty()); ++ } ++ ++ static std::string getTypeAsString(CodeRegionType CRType); ++ static std::string getHotnessAsString(HotnessType Hotness); ++ const std::string &getName() const { return Name; } ++ const std::string &getFuncName() const { return FuncName; } ++ const CodeRegionType &getType() const { return Type; } ++ const std::string &getFileName() const { return Location.SourceFilePath; } ++ const std::string &getTypeAsString() const { return StringType; } ++ const SourceLocation &getSourceLoc() const { return Location; } ++ const std::string &getPassName() const { return PassName; } ++ unsigned getSize() const { return Size; }; ++ void setPassName(const std::string &NewPassName); ++ void setSize(unsigned Size) { this->Size = Size; }; ++ void setHotness(HotnessType NewHotness) const { this->Hotness = NewHotness; } ++ HotnessType getHotness() const { return this->Hotness; } ++ std::string getHotnessAsString() const { return getHotnessAsString(Hotness); } ++ bool isCold() const { return this->Hotness == Cold; } ++ bool isHot() const { return this->Hotness == Hot; } ++ std::uint64_t getHash() const { return this->Hash; } ++ void setHash(std::uint64_t Hash) { this->Hash = Hash; } ++ DynamicOptions getAutoTunerOptions() const { return this->AutoTunerOptions; } ++ void setInvocation(unsigned int Invocation) { this->Invocation = Invocation; } ++ unsigned int getInvocation() const { return this->Invocation; } ++ ++ /// Add dynamic config options with Code Region for AutoTuner to tune instead ++ /// of using static config options. ++ void addAutoTunerOptions(const std::string ParamName, ++ std::vector<unsigned int> Options) const { ++ this->AutoTunerOptions.insert( ++ std::pair<std::string, std::vector<unsigned int>>(ParamName, Options)); ++ } ++ static CodeRegion getInvalidInstance(); ++ static CodeRegion getEmptyInstance(); ++ void setBaselineConfig(std::map<std::string, std::string> Value) const { ++ this->BaselineConfig = Value; ++ }; ++ std::map<std::string, std::string> getBaselineConfig() const { ++ return this->BaselineConfig; ++ } ++ ++private: ++ /// Name of the code region. ++ /// For most of cases it's set to the name of a header basic block. ++ std::string Name; ++ /// Function name of this code region if any. ++ std::string FuncName; ++ /// Name of the pass which this code region is associated. 
++ std::string PassName; ++ /// Type of this code region. Options are other, function, loop, ++ /// and machine basic block. ++ CodeRegionType Type; ++ /// Source Location. ++ SourceLocation Location; ++ std::string StringType; ++ /// Structural hash for the CodeRegion. ++ std::uint64_t Hash = 0; ++ /// Configs values passed to AutoTuner for dynamic setting of search space ++ /// for code regions. ++ mutable DynamicOptions AutoTunerOptions; ++ /// Configuration values passed to AutoTuner for generating the same binary ++ /// as the baseline. ++ mutable std::map<std::string, std::string> BaselineConfig; ++ ++ /// Record the order of invocation of an optimization pass during the whole ++ /// compilation pipeline. It is used to differentiate multiple invocations of ++ /// a same optimization pass. ++ /// Currently, Loop Unroll pass is invoked twice during the compilation ++ /// pipeline. 'Invocation' helps to relate a code region with the invocation ++ /// of Loop Unroll pass where the code region is generated. ++ mutable unsigned int Invocation; ++ ++ /// Size of this code region. Usually it refers to the number of instructions ++ /// but could be different based on implementations. ++ unsigned Size = 0; ++ mutable HotnessType Hotness = Unknown; ++ ++ /// A boolean flag to record if a CR is initialized or not. ++ /// It should only be set to true by initContainer(). ++ /// We only add initialized CR to TuningOpps. ++ bool Initialized = false; ++ ++ friend class AutoTuningEngine; ++}; ++ ++/// This class is an interface for classes representing code regions in LLVM ++/// (eg. Loop, Function and MachineBasicBlock) to inherit ++/// so that auto-tuning can be enabled on them. ++/// A Container must contain a CodeRegion. ++class Container { ++ ++public: ++ Container() {} ++ virtual ~Container(){}; ++ ++ /// Abstract method for derived classes to overwrite ++ virtual void initCodeRegion() = 0; ++ virtual uint64_t computeStructuralHash() = 0; ++ ++ /// Get the Container's CodeRegion. ++ const CodeRegion &getCodeRegion() const; ++ /// Set the Container's CodeRegion. ++ void setCodeRegion(const CodeRegion &NewCR); ++ /// This method is to look up the value of a parameter that corresponds to an ++ /// Container. The parameter being looked up is stored in a ParameterManager. ++ template <typename T> ++ bool lookUpParams(const std::string &ParamsName, T &Value) const; ++ ++ /// Check if the code region is being tuned by config file. ++ bool requiresIRDump(bool IsFunctionIR = false) const; ++ ++private: ++ CodeRegion CR; ++ friend class AutoTuningEngine; ++}; ++} // end namespace autotuning ++ ++namespace std { ++template <> ++// Implement hash for CodeRegion data type in std namespace. Only using common ++// attributes (with and without using 'OmitAutotuningMetadata' flag) of ++// CodeRegion. Remaining attributes are compared in overloaded == function. ++struct hash<autotuning::CodeRegion> { ++ std::size_t operator()(const autotuning::CodeRegion &CR) const { ++ return llvm::hash_combine(CR.getPassName(), CR.getType()); ++ } ++}; ++} // namespace std ++ ++namespace llvm { ++// Forward Decleration. 
++class CallBase; ++ ++typedef autotuning::CodeRegion CodeRegion; ++template <> struct DenseMapInfo<CodeRegion> { ++ static bool isEqual(const CodeRegion &LHS, const CodeRegion &RHS) { ++ return LHS == RHS; ++ } ++ static inline CodeRegion getEmptyKey() { ++ return autotuning::CodeRegion::getEmptyInstance(); ++ } ++ static inline CodeRegion getTombstoneKey() { ++ return autotuning::CodeRegion::getInvalidInstance(); ++ } ++ // Implement hash for CodeRegion data type in llvm namespace. Only using ++ // common attributes (with and without using 'OmitAutotuningMetadata' flag) ++ // of CodeRegion. Remaining attributes are compared in overloaded == ++ // function. ++ static unsigned getHashValue(const CodeRegion &CR) { ++ return llvm::hash_combine(CR.getPassName(), CR.getType()); ++ } ++}; ++} // namespace llvm ++ ++namespace autotuning { ++using namespace llvm; ++typedef std::unordered_map<CodeRegion, ParameterManager> LookUpTable; ++typedef llvm::SetVector<CodeRegion> CodeRegions; ++ ++/// Structure to store information of CallSite code regions which is used to ++/// get a different SourceLocation for multiple callsites (same callee) in a ++/// function when these callsites have same SourceLocation due to inlining. ++struct CallSiteLocation { ++ llvm::CallBase *CB; ++ llvm::Function *Caller; ++ llvm::Function *Callee; ++ SourceLocation SrcLoc; ++}; ++ ++class AutoTuningEngine { ++public: ++ AutoTuningEngine() { Enabled = false; } ++ ~AutoTuningEngine() {} ++ ++ /// Initialize the Container for auto-tuning. ++ void initContainer(Container *Container, const std::string &PassName, ++ const StringRef FuncName = "", bool AddOpportunity = true, ++ unsigned int Invocation = 0); ++ ++ /// Initialize auto-tuning. This method should only be called in the main ++ /// function. ++ /// \return Error::success() on success or the related Error otherwise. ++ llvm::Error init(const std::string &ModuleID); ++ ++ /// Finalize auto-tuning. This method should only be called in the main ++ /// function. ++ /// \return Error::success() on success or the related Error otherwise. ++ llvm::Error finalize(); ++ ++ /// Return the number of tuning configuration used for this compilation. ++ llvm::Expected<int> getConfigNumber(); ++ ++ void enable() { Enabled = true; } ++ void disable() { Enabled = false; } ++ bool isEnabled() const { return Enabled; } ++ bool isMLEnabled() const { return MLEnabled; } ++ bool isDumpEnabled() const { return DumpEnabled; } ++ bool isGenerateOutput() const { return GenerateOutput; } ++ bool isParseInput() const { return ParseInput; } ++ bool isTuningAllowedForType(CodeRegionType CRType) const { ++ return (CodeRegionFilterTypes.count(CRType) > 0); ++ } ++ bool isThinLTOTuning() const; ++ ++ /// Convert a pass-name to CodeRegionType. ++ CodeRegionType convertPassToType(std::string Pass); ++ ++ /// First sets BaselineConfig value for the CR then ++ /// add a tuning opportunity into the TuningOpps list. ++ void addOpportunity(const CodeRegion &OppCR, ++ std::map<std::string, std::string> BaselineConfig = {}); ++ bool hasOpportunities() const { return TuningOpps.empty(); } ++ ++ bool shouldRunOptPass(std::string FileName, std::string Pass); ++ ++ /// Insert all of the callsites of a function in CallSiteLocs vector. ++ void insertCallSiteLoc(CallSiteLocation Loc); ++ ++ /// Update CallSiteLocs vector with new callsites (if any) which get available ++ /// due to inlining. 
++ void updateCallSiteLocs(llvm::CallBase *CB, llvm::CallBase *Ptr, ++ llvm::Function *F, unsigned int Line); ++ ++ /// Clean up the CallSiteLocs vector by keeping the callsite if there are ++ /// multiple calls to same callee. This cleaning will be perform before ++ /// inlining any callsite. ++ void cleanCallSiteLoc(); ++ ++ /// clear the CallSiteLocs vector. ++ void clearCallSiteLocs(); ++ ++ /// Return the SourceLocation::SourceLine (if available). ++ std::optional<unsigned int> getCallSiteLoc(llvm::CallBase *CB); ++ ++ template <typename T> ++ bool lookUpGlobalParams(const std::string &ParamsName, T &Value) const; ++ /// A map storing llvm parameters. ++ std::unordered_map<std::string, std::string> LLVMParams; ++ /// A map storing program parameters. ++ std::unordered_map<std::string, std::string> ProgramParams; ++ ++private: ++ std::string ModuleID; ++ /// This boolean indicates if the auto-tuning mode is enabled. ++ /// It will be set to true if the any of the following command line options ++ /// (auto-tuning-input, auto-tuning-result and auto-tuning-opp) is specified. ++ bool Enabled; ++ /// This boolean indicates if the ML guidance feature is enabled in ++ /// Autotuner. It will be set to true if -fautotune-rank is specified. ++ bool MLEnabled; ++ /// This boolean indicates if the IR dumping is enabled or not. IR dumping ++ /// is enabled for ML guidance feature. It can also be enabled with command ++ /// line compiler flag 'enable-autotuning-dump'. ++ bool DumpEnabled = false; ++ /// This boolean indicates if compiler is parsing/using 'config.yaml' file ++ /// generated by AutoTuner and use the configuration values instead of ++ /// determining with compiler heuristic. ++ bool ParseInput; ++ /// This boolean indicates if compiler is creating/generating opportunity ++ /// file(s) which will be consumed by AutoTuner to create the search space. ++ bool GenerateOutput; ++ /// A map of filename and set of optimization passes; an optimization pass ++ /// will be added to this set if a CodeRegion belongs to the optimization ++ /// pass. ++ std::unordered_map<std::string, std::unordered_set<std::string>> OppPassList; ++ ++ /// Vector to store all of the duplicate calls in a function and the calls ++ /// which get available due to inlining. ++ SmallVector<CallSiteLocation, 10> CallSiteLocs; ++ ++ /// A set to store the code region types that will be tuned in current ++ /// autotuning flow. This will be populated with code region types based on ++ /// 'auto-tuning-type-filter' for -fautotune-generate and the types will be ++ /// extracted from config.yaml in case of -fautotune. ++ /// This set is used to apply type-based filtering prior to creating/ ++ /// initializing a code region. ++ std::unordered_set<CodeRegionType> CodeRegionFilterTypes; ++ ++ // A statically initialized map used to convert 'pass-name' to ++ // 'CodeRegionType'. ++ std::unordered_map<std::string, CodeRegionType> PTTMap; ++ ++ /// A map of CodeRegion and ParameterManager to keep track of all the ++ /// parameters of code regions loaded from input config file. ++ LookUpTable ParamTable; ++ /// A list of CodeRegions as tuning opportunities ++ CodeRegions TuningOpps; ++ /// A ParameterManager for global parameters. ++ ParameterManager GlobalParams; ++ ++ /// Apply filters for CodeRegions. ++ void applyOppFilters(CodeRegions &CRs); ++ ++ /// Apply function name filter for CodeRegions. 
++ bool applyFunctionFilter(std::string FuncName); ++ ++ friend class Container; ++ friend class CodeRegion; ++ friend class AutoTuningRemarkManager; ++}; ++ ++extern class AutoTuningEngine Engine; // AutoTuning Engine ++ ++} // end namespace autotuning ++ ++#endif /* LLVM_AUTOTUNER_AUTOTUNING_H_ */ ++#endif +diff --git a/llvm/include/llvm/AutoTuner/AutoTuningRemarkManager.h b/llvm/include/llvm/AutoTuner/AutoTuningRemarkManager.h +new file mode 100644 +index 000000000000..153a2c6246ad +--- /dev/null ++++ b/llvm/include/llvm/AutoTuner/AutoTuningRemarkManager.h +@@ -0,0 +1,43 @@ ++#if defined(ENABLE_AUTOTUNER) ++//===- llvm/AutoTuner/AutoTuningRemarkManager.h - Remark Manager ----------===// ++// ++// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. ++// See https://llvm.org/LICENSE.txt for license information. ++// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception ++// ++//===----------------------------------------------------------------------===// ++// ++// This file declares the main interface for inputting and outputting ++// remarks for AutoTuning. ++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef LLVM_AUTOTUNINGREMARKMANAGER_H ++#define LLVM_AUTOTUNINGREMARKMANAGER_H ++ ++#include "llvm/AutoTuner/AutoTuning.h" ++#include "llvm/Remarks/RemarkStreamer.h" ++#include "llvm/Support/Error.h" ++#include <string> ++#include <unordered_map> ++#include <vector> ++ ++namespace autotuning { ++class AutoTuningRemarkManager { ++public: ++ /// Read a list of parameters from input file. ++ /// Return true on success and false on failure. ++ static llvm::Error read(autotuning::AutoTuningEngine &E, ++ const std::string &InputName, ++ const std::string &RemarksFormat); ++ ++ /// Dump a list of CodeRegions as tuning opportunities into a file. ++ /// Return true on success and false on failure. ++ static llvm::Error dump(const autotuning::AutoTuningEngine &E, ++ const std::string &DirPath, ++ const std::string &RemarksFormat, ++ const std::string &RemarksPasses); ++}; ++} // namespace autotuning ++#endif // LLVM_AUTOTUNINGREMARKMANAGER_H ++#endif +diff --git a/llvm/include/llvm/AutoTuner/AutoTuningRemarkStreamer.h b/llvm/include/llvm/AutoTuner/AutoTuningRemarkStreamer.h +new file mode 100644 +index 000000000000..0096139b12e9 +--- /dev/null ++++ b/llvm/include/llvm/AutoTuner/AutoTuningRemarkStreamer.h +@@ -0,0 +1,47 @@ ++#if defined(ENABLE_AUTOTUNER) ++// ===------------ llvm/AutoTuner/AutoTuningRemarkStreamer.h --------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++// Copyright (C) 2017-2022, Huawei Technologies Co., Ltd. All rights reserved. ++// ++// ===---------------------------------------------------------------------===// ++// ++// This file contains the implementation of the conversion between AutoTuner ++// CodeRegions and serializable remarks::Remark objects. 
++// ++// ===---------------------------------------------------------------------===// ++ ++#ifndef LLVM_AUTOTUNER_AUTOTUNINGREMARKSTREAMER_H ++#define LLVM_AUTOTUNER_AUTOTUNINGREMARKSTREAMER_H ++ ++#include "llvm/AutoTuner/AutoTuning.h" ++#include "llvm/Remarks/Remark.h" ++#include "llvm/Remarks/RemarkStreamer.h" ++#include "llvm/Support/Error.h" ++#include "llvm/Support/ToolOutputFile.h" ++#include <memory> ++#include <string> ++ ++namespace llvm { ++/// Streamer for AutoTuner remarks which has logic for dealing with CodeRegions. ++class AutoTuningRemarkStreamer { ++ remarks::RemarkStreamer &RS; ++ /// Convert CodeRegion into remark objects. ++ remarks::Remark toRemark(const autotuning::CodeRegion &CR); ++ ++public: ++ AutoTuningRemarkStreamer(remarks::RemarkStreamer &RS) : RS(RS) {} ++ /// Emit a CodeRegion through the streamer. ++ void emit(const autotuning::CodeRegion &CR); ++ /// Set a pass filter based on a regex \p Filter. ++ /// Returns an error if the regex is invalid. ++ Error setFilter(StringRef Filter); ++}; ++} // end namespace llvm ++ ++#endif // LLVM_AUTOTUNER_AUTOTUNINGREMARKSTREAMER_H ++#endif +diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h +index 52388692c196..95ac9acf4e5e 100644 +--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h ++++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h +@@ -27,6 +27,9 @@ + #include <iterator> + #include <string> + #include <vector> ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#endif + + namespace llvm { + +@@ -91,9 +94,19 @@ public: + void deleteNode(MachineInstr *MI); + }; + ++#if defined(ENABLE_AUTOTUNER) ++class MachineBasicBlock ++ : public ilist_node_with_parent<MachineBasicBlock, MachineFunction>, ++ public autotuning::Container { ++#else + class MachineBasicBlock + : public ilist_node_with_parent<MachineBasicBlock, MachineFunction> { ++#endif + public: ++#if defined(ENABLE_AUTOTUNER) ++ void initCodeRegion() override; ++ uint64_t computeStructuralHash() override; ++#endif + /// Pair of physical register and lane mask. + /// This is not simply a std::pair typedef because the members should be named + /// clearly as they both have an integer type. 
+diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h +index 93cf0d27e9a7..c0db48ae1789 100644 +--- a/llvm/include/llvm/IR/Function.h ++++ b/llvm/include/llvm/IR/Function.h +@@ -37,6 +37,9 @@ + #include <cstdint> + #include <memory> + #include <string> ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#endif + + namespace llvm { + +@@ -56,6 +59,24 @@ class User; + class BranchProbabilityInfo; + class BlockFrequencyInfo; + ++#if defined(ENABLE_AUTOTUNER) ++class AutoTuningEnabledFunction : public autotuning::Container { ++public: ++ AutoTuningEnabledFunction() = delete; ++ void initCodeRegion() override; ++ void setHot() { this->Hotness = autotuning::Hot; } ++ void setCold() { this->Hotness = autotuning::Cold; } ++ autotuning::HotnessType getHotness() const { return this->Hotness; } ++ uint64_t computeStructuralHash() override; ++ ++private: ++ AutoTuningEnabledFunction(Function *F) { Func = F; }; ++ Function *Func; ++ autotuning::HotnessType Hotness = autotuning::Unknown; ++ friend class Function; ++}; ++#endif ++ + class LLVM_EXTERNAL_VISIBILITY Function : public GlobalObject, + public ilist_node<Function> { + public: +@@ -68,6 +89,13 @@ public: + using arg_iterator = Argument *; + using const_arg_iterator = const Argument *; + ++#if defined(ENABLE_AUTOTUNER) ++ // There is one-to-one correspondence between ATEFunction and the current ++ // Function object to avoid messing up the LLVM User and owned Use classes' ++ // memory layout. ++ AutoTuningEnabledFunction ATEFunction = AutoTuningEnabledFunction(this); ++#endif ++ + private: + // Important things that make up a function! + BasicBlockListType BasicBlocks; ///< The basic blocks +@@ -128,6 +156,11 @@ public: + void operator=(const Function&) = delete; + ~Function(); + ++#if defined(ENABLE_AUTOTUNER) ++ // Return the auto-tuning enabled version of this Function object. ++ AutoTuningEnabledFunction &getATEFunction() { return ATEFunction; } ++#endif ++ + // This is here to help easily convert from FunctionT * (Function * or + // MachineFunction *) in BlockFrequencyInfoImpl to Function * by calling + // FunctionT->getFunction(). +@@ -840,7 +873,11 @@ public: + /// AssemblyAnnotationWriter. + void print(raw_ostream &OS, AssemblyAnnotationWriter *AAW = nullptr, + bool ShouldPreserveUseListOrder = false, ++#if defined(ENABLE_AUTOTUNER) ++ bool IsForDebug = false, bool PrintCompleteIR = false) const; ++#else + bool IsForDebug = false) const; ++#endif + + /// viewCFG - This function is meant for use from the debugger. 
You can just + /// say 'call F->viewCFG()' and a ghostview window should pop up from the +diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h +index 6095b0a1be69..dcc9bbee30fa 100644 +--- a/llvm/include/llvm/IR/InstrTypes.h ++++ b/llvm/include/llvm/IR/InstrTypes.h +@@ -1169,6 +1169,23 @@ public: + using OperandBundleDef = OperandBundleDefT<Value *>; + using ConstOperandBundleDef = OperandBundleDefT<const Value *>; + ++#if defined(ENABLE_AUTOTUNER) ++//===----------------------------------------------------------------------===// ++// AutoTuningEnabledCallSite Class ++//===----------------------------------------------------------------------===// ++class CallBase; ++class AutoTuningEnabledCallSite : public autotuning::Container { ++public: ++ AutoTuningEnabledCallSite() = delete; ++ void initCodeRegion() override; ++ uint64_t computeStructuralHash() override; ++ AutoTuningEnabledCallSite(CallBase *CallBase) { CB = CallBase; } ++ ++private: ++ CallBase *CB; ++}; ++#endif ++ + //===----------------------------------------------------------------------===// + // CallBase Class + //===----------------------------------------------------------------------===// +@@ -1229,6 +1246,13 @@ protected: + unsigned getNumSubclassExtraOperandsDynamic() const; + + public: ++#if defined(ENABLE_AUTOTUNER) ++ // There is one-to-one correspondence between ATECallSite and CallBase class ++ // to enable auto-tuning. ++ std::unique_ptr<AutoTuningEnabledCallSite> ATECallSite = ++ std::make_unique<AutoTuningEnabledCallSite>(this); ++#endif ++ + using Instruction::getContext; + + /// Create a clone of \p CB with a different set of operand bundles and +diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h +index 8d60384e1a32..9d638af6eeef 100644 +--- a/llvm/include/llvm/IR/Instructions.h ++++ b/llvm/include/llvm/IR/Instructions.h +@@ -3287,6 +3287,23 @@ struct OperandTraits<BranchInst> : public VariadicOperandTraits<BranchInst, 1> { + + DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BranchInst, Value) + ++#if defined(ENABLE_AUTOTUNER) ++//===----------------------------------------------------------------------===// ++// AutoTuningEnabledSwitchInst Class ++//===----------------------------------------------------------------------===// ++class SwitchInst; ++ ++class AutoTuningEnabledSwitchInst : public autotuning::Container { ++public: ++ AutoTuningEnabledSwitchInst() = delete; ++ void initCodeRegion() override; ++ uint64_t computeStructuralHash() override; ++ AutoTuningEnabledSwitchInst(SwitchInst *SwitchInst) { SI = SwitchInst; } ++ ++private: ++ SwitchInst *SI; ++}; ++#endif + //===----------------------------------------------------------------------===// + // SwitchInst Class + //===----------------------------------------------------------------------===// +@@ -3332,6 +3349,13 @@ protected: + public: + void operator delete(void *Ptr) { User::operator delete(Ptr); } + ++#if defined(ENABLE_AUTOTUNER) ++ // There is one-to-one correspondence between ATESwitchInst and ++ // SwitchInst class to enable AutoTuner. 
++ std::unique_ptr<AutoTuningEnabledSwitchInst> ATESwitchInst = ++ std::make_unique<AutoTuningEnabledSwitchInst>(this); ++#endif ++ + // -2 + static const unsigned DefaultPseudoIndex = static_cast<unsigned>(~0L-1); + +diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h +index 670a40b28eab..904a450a1888 100644 +--- a/llvm/include/llvm/IR/Module.h ++++ b/llvm/include/llvm/IR/Module.h +@@ -38,6 +38,9 @@ + #include <optional> + #include <string> + #include <vector> ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#endif + + namespace llvm { + +diff --git a/llvm/include/llvm/IR/StructuralHash.h b/llvm/include/llvm/IR/StructuralHash.h +index 1bdeb85afa3c..c0bcc8153eb8 100644 +--- a/llvm/include/llvm/IR/StructuralHash.h ++++ b/llvm/include/llvm/IR/StructuralHash.h +@@ -15,6 +15,9 @@ + #define LLVM_IR_STRUCTURALHASH_H + + #include <cstdint> ++#if defined(ENABLE_AUTOTUNER) ++#include <vector> ++#endif + + namespace llvm { + +@@ -24,6 +27,17 @@ class Module; + uint64_t StructuralHash(const Function &F); + uint64_t StructuralHash(const Module &M); + ++#if defined(ENABLE_AUTOTUNER) ++class MachineBasicBlock; ++class BasicBlock; ++class CallBase; ++class SwitchInst; ++ ++uint64_t StructuralHash(const std::vector<BasicBlock *> BBs); ++uint64_t StructuralHash(const MachineBasicBlock &MBB); ++uint64_t StructuralHash(const CallBase &CB); ++uint64_t StructuralHash(const SwitchInst &SI); ++#endif + } // end namespace llvm + + #endif +diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h +index c6fee47b464b..80bec2d82e24 100644 +--- a/llvm/include/llvm/InitializePasses.h ++++ b/llvm/include/llvm/InitializePasses.h +@@ -340,6 +340,11 @@ void initializeWasmEHPreparePass(PassRegistry&); + void initializeWinEHPreparePass(PassRegistry&); + void initializeWriteBitcodePassPass(PassRegistry&); + void initializeXRayInstrumentationPass(PassRegistry&); ++#if defined(ENABLE_AUTOTUNER) ++void initializeAutotuningDumpLegacyPass(PassRegistry &); ++void initializeAutoTuningCompileFunctionLegacyPass(PassRegistry &); ++void initializeAutoTuningCompileModuleLegacyPass(PassRegistry &); ++#endif + + } // end namespace llvm + +diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h +index 7420ea64e954..3a8ecb1399f1 100644 +--- a/llvm/include/llvm/LinkAllPasses.h ++++ b/llvm/include/llvm/LinkAllPasses.h +@@ -54,6 +54,9 @@ + #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" + #include "llvm/Transforms/Vectorize.h" + #include <cstdlib> ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/Transforms/Scalar/AutoTuningCompile.h" ++#endif + + namespace { + struct ForcePassLinking { +@@ -93,6 +96,11 @@ namespace { + (void) llvm::createInstSimplifyLegacyPass(); + (void) llvm::createInstructionCombiningPass(); + (void) llvm::createJMCInstrumenterPass(); ++#if defined(ENABLE_AUTOTUNER) ++ (void) llvm::createAutotuningDumpPass(); ++ (void) llvm::createAutoTuningCompileFunctionLegacyPass(); ++ (void) llvm::createAutoTuningCompileModuleLegacyPass(); ++#endif + (void) llvm::createKCFIPass(); + (void) llvm::createLCSSAPass(); + (void) llvm::createLICMPass(); +diff --git a/llvm/include/llvm/Remarks/Remark.h b/llvm/include/llvm/Remarks/Remark.h +index a66f7ed73f2f..3bcc0c710498 100644 +--- a/llvm/include/llvm/Remarks/Remark.h ++++ b/llvm/include/llvm/Remarks/Remark.h +@@ -20,6 +20,10 @@ + #include "llvm/Support/raw_ostream.h" + #include <optional> + #include <string> ++#if defined(ENABLE_AUTOTUNER) ++#include <map> ++#include <vector> 
++#endif + + namespace llvm { + namespace remarks { +@@ -47,6 +51,9 @@ struct Argument { + StringRef Key; + // FIXME: We might want to be able to store other types than strings here. + StringRef Val; ++#if defined(ENABLE_AUTOTUNER) ++ std::optional<std::vector<StringRef>> VectorVal; ++#endif + // If set, the debug location corresponding to the value. + std::optional<RemarkLocation> Loc; + +@@ -65,6 +72,9 @@ enum class Type { + Analysis, + AnalysisFPCommute, + AnalysisAliasing, ++#if defined(ENABLE_AUTOTUNER) ++ AutoTuning, ++#endif + Failure, + First = Unknown, + Last = Failure +@@ -105,6 +115,28 @@ struct Remark { + /// Mangled name of the function that triggers the emssion of this remark. + StringRef FunctionName; + ++#if defined(ENABLE_AUTOTUNER) ++ /// Type of the code region that the remark is associated with. ++ std::optional<StringRef> CodeRegionType; ++ ++ /// Configuration value for generating the same baseline binary associated ++ /// with this remark. ++ std::optional<std::map<std::string, std::string>> BaselineConfig; ++ ++ /// Hash of the code region that the remark is associated with. ++ std::optional<uint64_t> CodeRegionHash; ++ ++ /// Configs values passed to AutoTuner for dynamic setting of search space ++ /// for code regions. ++ std::optional<std::map<std::string, std::vector<unsigned int>>> ++ AutoTunerOptions; ++ ++ /// Invocation/Registering of Optimization Pass in the compilation pipeline. ++ /// It is used to differentiate between different invocations of same ++ /// optimization pass. ++ std::optional<unsigned int> Invocation; ++#endif ++ + /// The location in the source file of the remark. + std::optional<RemarkLocation> Loc; + +diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h +index d2079fead668..c59dba2749f0 100644 +--- a/llvm/include/llvm/Support/CommandLine.h ++++ b/llvm/include/llvm/Support/CommandLine.h +@@ -40,6 +40,9 @@ + #include <type_traits> + #include <vector> + ++#if defined(ENABLE_AUTOTUNER) ++#include <unordered_map> ++#endif + namespace llvm { + + namespace vfs { +@@ -72,6 +75,20 @@ bool ParseCommandLineOptions(int argc, const char *const *argv, + const char *EnvVar = nullptr, + bool LongOptionsUseDoubleDash = false); + ++#if defined(ENABLE_AUTOTUNER) ++// It will parse AutoTuner options (LLVMParams & ProgramParams) and add them as ++// command line flags for the compilation process. These options are suggested ++// by AutoTuner during tuning flow. This function will always be called after ++// AutoTuner initialization. ++// Returns true on success. Otherwise, this will print the error message to ++// stderr and exit. ++bool ParseAutoTunerOptions( ++ std::unordered_map<std::string, std::string> LLVMParams, ++ std::unordered_map<std::string, std::string> ProgramParams, ++ StringRef Overview = "", raw_ostream *Errs = nullptr, ++ const char *EnvVar = nullptr, bool LongOptionsUseDoubleDash = false); ++#endif ++ + // Function pointer type for printing version information. 
+ using VersionPrinterTy = std::function<void(raw_ostream &)>;
+ 
+diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h
+index aaba710cfde6..e69beeade947 100644
+--- a/llvm/include/llvm/Transforms/Scalar.h
++++ b/llvm/include/llvm/Transforms/Scalar.h
+@@ -16,6 +16,10 @@
+ 
+ #include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
+ #include <functional>
++#if defined(ENABLE_AUTOTUNER)
++#include "llvm/Pass.h"
++#include <string>
++#endif
+ 
+ namespace llvm {
+ 
+@@ -299,6 +303,19 @@ Pass *createLoopSimplifyCFGPass();
+ //
+ FunctionPass *createInstSimplifyLegacyPass();
+ 
++#if defined(ENABLE_AUTOTUNER)
++//===--------------------------------------------------------------------===//
++//
++// createAutoTuningCompile{Function,Module}LegacyPass - These passes write IR
++// files with -fautotune-generate for the autotuning flow. They also
++// enable/disable the execution of optimization passes in subsequent
++// compilations (with -fautotune) based on the autotuning methodology and
++// available opportunities.
++//
++FunctionPass *
++createAutoTuningCompileFunctionLegacyPass(std::string Pass = "unknown");
++ModulePass *
++createAutoTuningCompileModuleLegacyPass(std::string Pass = "unknown");
++#endif
+ 
+ //===----------------------------------------------------------------------===//
+ //
+diff --git a/llvm/include/llvm/Transforms/Scalar/AutoTuningCompile.h b/llvm/include/llvm/Transforms/Scalar/AutoTuningCompile.h
+new file mode 100644
+index 000000000000..2cbb48f336ef
+--- /dev/null
++++ b/llvm/include/llvm/Transforms/Scalar/AutoTuningCompile.h
+@@ -0,0 +1,170 @@
++#if defined(ENABLE_AUTOTUNER)
++//===---------------- AutoTuningCompile.h - Auto-Tuning -------------------===//
++//
++// The LLVM Compiler Infrastructure
++//
++// This file is distributed under the University of Illinois Open Source
++// License. See LICENSE.TXT for details.
++//
++// Copyright (C) 2017-2022, Huawei Technologies Co., Ltd. All rights reserved.
++//
++//===----------------------------------------------------------------------===//
++//
++/// \file
++/// This file declares the interface for AutoTuning Incremental Compilation.
++/// Incremental compilation requires two passes for the legacy pass manager:
++/// 1) a module pass and 2) a function pass. An additional loop pass is
++/// required for the new pass manager.
++/// The AutoTuningOptPassGate class is also defined here; it is used to
++/// enable/disable the execution of optimization passes in the compilation
++/// pipeline.
++//
++//===----------------------------------------------------------------------===//
++
++#ifndef LLVM_AUTOTUNER_AUTOTUNING_COMPILE_H_
++#define LLVM_AUTOTUNER_AUTOTUNING_COMPILE_H_
++
++#include "llvm/Analysis/LoopAnalysisManager.h"
++#include "llvm/Analysis/LoopInfo.h"
++#include "llvm/Analysis/LoopPass.h"
++#include "llvm/IR/OptBisect.h"
++#include "llvm/IR/PassManager.h"
++#include "llvm/Pass.h"
++#include "llvm/Transforms/Scalar/LoopPassManager.h"
++
++namespace llvm {
++
++class Pass;
++
++// Skips or runs optimization passes.
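++// Implements the OptPassGate interface so that pass managers consult the
++// autotuner's decision before running each pass; the 'Skip' flag below is
++// toggled through setSkip() as incremental compilation proceeds.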
++class AutoTuningOptPassGate : public OptPassGate {
++public:
++  explicit AutoTuningOptPassGate(bool Skip = false) : Skip(Skip) {}
++
++  bool shouldRunPass(const StringRef PassName,
++                     StringRef IRDescription) override;
++  bool isEnabled() const override { return true; }
++  bool checkPass(const StringRef PassName, const StringRef TargetDesc);
++  void setSkip(bool Skip) { this->Skip = Skip; }
++  bool getSkip() const { return Skip; }
++
++private:
++  bool Skip;
++};
++
++// Returns a static AutoTuningOptPassGate object which will be used to register
++// a callback for OptBisect instrumentation.
++// It will also be used by the AutoTuningCompile passes to enable/disable
++// optimization passes.
++AutoTuningOptPassGate &getAutoTuningOptPassGate();
++
++class AutoTuningCompileModule {
++public:
++  explicit AutoTuningCompileModule(std::string Pass = "unknown");
++  bool run(Module &M);
++  // Write IR files for each module to be re-used in subsequent compilations
++  // for autotuning cycles. It only works with -fautotune-generate.
++  void writeIRFiles(Module &M) const;
++  // Enable/disable execution of optimization passes in subsequent compilations
++  // based on the autotuning methodology and available opportunities. It only
++  // works with -fautotune.
++  bool modifyCompilationPipeline(Module &M) const;
++
++  static void setSkipCompilation(bool Option) { SkipCompilation = Option; }
++  static bool getSkipCompilation() { return SkipCompilation; }
++
++private:
++  static bool SkipCompilation;
++  std::string Pass = "";
++};
++
++class AutoTuningCompileModuleLegacy : public ModulePass {
++public:
++  static char ID;
++  explicit AutoTuningCompileModuleLegacy(std::string Pass = "unknown");
++  bool runOnModule(Module &M) override;
++  StringRef getPassName() const override;
++  void getAnalysisUsage(AnalysisUsage &AU) const override {
++    AU.setPreservesAll();
++  }
++
++private:
++  std::string Pass = "";
++};
++
++class AutoTuningCompileModulePass
++    : public PassInfoMixin<AutoTuningCompileModulePass> {
++public:
++  explicit AutoTuningCompileModulePass(std::string Pass = "unknown")
++      : Pass(Pass){};
++  PreservedAnalyses run(Module &M, ModuleAnalysisManager &);
++
++private:
++  std::string Pass = "";
++};
++
++class AutoTuningCompileFunction {
++public:
++  explicit AutoTuningCompileFunction(std::string Pass = "unknown");
++  bool run(Function &F);
++  // Write IR files for each module to be re-used in subsequent compilations
++  // for autotuning cycles. It only works with -fautotune-generate.
++  void writeIRFiles(Module &M);
++  // Enable/disable execution of optimization passes in subsequent compilations
++  // based on the autotuning methodology and available opportunities. It only
++  // works with -fautotune.
++  bool modifyCompilationPipeline(Function &F);
++
++private:
++  // A module may have multiple functions; the decision to enable/disable
++  // execution of an optimization pass will be made for the first function and
++  // will be used for all of the functions in the module.
++  // 'SkipDecision' will be set once the decision is made for a specific 'Pass'.
++  bool SkipDecision = false;
++
++  // A module may have multiple functions; the IR file will be written once for
++  // the entire module for a specific 'Pass'.
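++  // 'IsModuleWritten' (below) records that the dump has already happened, so
++  // the module IR is not rewritten for every function.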
++ bool IsModuleWritten = false; ++ std::string Pass = ""; ++}; ++ ++class AutoTuningCompileFunctionLegacy : public FunctionPass { ++public: ++ static char ID; ++ explicit AutoTuningCompileFunctionLegacy(std::string Pass = "unknown"); ++ bool runOnFunction(Function &F) override; ++ StringRef getPassName() const override; ++ void getAnalysisUsage(AnalysisUsage &AU) const override { ++ AU.setPreservesAll(); ++ } ++ ++private: ++ std::string Pass = ""; ++}; ++ ++class AutoTuningCompileFunctionPass ++ : public PassInfoMixin<AutoTuningCompileFunctionPass> { ++public: ++ explicit AutoTuningCompileFunctionPass(std::string Pass = "unknown") ++ : Pass(Pass){}; ++ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); ++ ++private: ++ std::string Pass = ""; ++}; ++ ++class AutoTuningCompileLoopPass ++ : public PassInfoMixin<AutoTuningCompileLoopPass> { ++public: ++ explicit AutoTuningCompileLoopPass(std::string Pass = "unknown") ++ : Pass(Pass){}; ++ PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, ++ LoopStandardAnalysisResults &AR, LPMUpdater &U); ++ ++private: ++ std::string Pass = ""; ++}; ++ ++} // end namespace llvm ++ ++#endif /* LLVM_AUTOTUNER_AUTOTUNING_COMPILE_H_ */ ++#endif +diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h +index 4f3010965b59..e1cccf417898 100644 +--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h ++++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h +@@ -108,7 +108,11 @@ bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, + unsigned TripMultiple, unsigned LoopSize, + TargetTransformInfo::UnrollingPreferences &UP, + TargetTransformInfo::PeelingPreferences &PP, ++#if defined(ENABLE_AUTOTUNER) ++ bool &UseUpperBound, unsigned int Invocation = 0); ++#else + bool &UseUpperBound); ++#endif + + void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, + ScalarEvolution *SE, DominatorTree *DT, +diff --git a/llvm/lib/Analysis/AutotuningDump.cpp b/llvm/lib/Analysis/AutotuningDump.cpp +new file mode 100644 +index 000000000000..81b2bbead70e +--- /dev/null ++++ b/llvm/lib/Analysis/AutotuningDump.cpp +@@ -0,0 +1,265 @@ ++#if defined(ENABLE_AUTOTUNER) ++// ===-- AutotuningDump.cpp - Auto-Tuning---------------------------------===// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++//
++// ===--------------------------------------------------------------------===//
++//
++// This file contains a pass that collects the IR of tuned regions and stores
++// it into predetermined locations, to be used later by autotuning ML guidance.
++//
++// ===--------------------------------------------------------------------===//
++#include "llvm/Analysis/AutotuningDump.h"
++#include "llvm/Analysis/Passes.h"
++#include "llvm/AutoTuner/AutoTuning.h"
++#include "llvm/IR/LegacyPassManager.h"
++#include "llvm/InitializePasses.h"
++#include "llvm/Pass.h"
++#include "llvm/Support/CommandLine.h"
++#include "llvm/Support/Path.h"
++#include "llvm/Support/Process.h"
++#include "llvm/Support/raw_ostream.h"
++#include <sys/stat.h>
++
++using namespace llvm;
++
++#define DEBUG_TYPE "autotuning-dump"
++
++enum AutotuningDumpOpt { whole_modules, functions, loops };
++
++// Enable Debug Options to be specified on the command line
++cl::opt<AutotuningDumpOpt> AutotuningDumpMode(
++    "autotuning-dump-mode", cl::desc("Choose autotuning dump mode:"),
++    cl::init(whole_modules),
++    cl::values(clEnumVal(whole_modules, "dump each module in its own file"),
++               clEnumVal(functions, "dump each function in its own file"),
++               clEnumVal(loops, "dump each loop in its own file")));
++
++AutotuningDump::AutotuningDump(bool IncrementalCompilation) {
++  // Check if the environment variable AUTOTUNE_DATADIR is set.
++  IsIncrementalCompilation = IncrementalCompilation;
++  AutoTuneDirPath = "autotune_datadir";
++  if (std::optional<std::string> MaybePath =
++          llvm::sys::Process::GetEnv("AUTOTUNE_DATADIR"))
++    AutoTuneDirPath = *MaybePath;
++}
++
++int AutotuningDump::getConfigNumber() {
++  auto ConfigNumOrErr = autotuning::Engine.getConfigNumber();
++  if (ConfigNumOrErr)
++    return *ConfigNumOrErr;
++  else {
++    report_fatal_error("Invalid/missing Autotuner configuration ID");
++    return -1;
++  }
++}
++
++void AutotuningDump::dumpToStream(llvm::raw_ostream &os, const Loop &L) const {
++  L.print(os);
++}
++
++void AutotuningDump::dumpToStream(llvm::raw_ostream &os,
++                                  const Function &F) const {
++  F.print(os, /*AAW*/ nullptr, /*ShouldPreserveUseListOrder*/ false,
++          /*IsForDebug*/ false, /*PrintCompleteIR*/ true);
++}
++
++// Create the appropriate file. Its name will contain AbsolutePath/FileName.
++std::unique_ptr<raw_ostream> AutotuningDump::createFile(const Twine &File) {
++  std::error_code EC;
++  return std::make_unique<raw_fd_ostream>((File).str(), EC,
++                                          sys::fs::CD_CreateAlways,
++                                          sys::fs::FA_Write, sys::fs::OF_None);
++}
++
++std::string AutotuningDump::getDirectoryName(const std::string File) const {
++  std::string DirectoryName = AutoTuneDirPath;
++  if (!autotuning::Engine.isMLEnabled())
++    DirectoryName += "/IR_files";
++
++  DirectoryName = DirectoryName + "/" + File + "/";
++
++  // Create directory if not already present.
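++  // A failure here is reported to stderr but is not fatal; the computed path
++  // is still returned to the caller.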
++  if (std::error_code EC = sys::fs::create_directories(DirectoryName))
++    errs() << "could not create directory: " << DirectoryName << ": "
++           << EC.message();
++
++  return DirectoryName;
++}
++
++std::string AutotuningDump::getFileName(std::string FilePath) {
++  if (autotuning::Engine.isMLEnabled())
++    return std::to_string(this->getConfigNumber()) + ".ll";
++  std::replace(FilePath.begin(), FilePath.end(), '/', '_');
++  return FilePath + ".ll";
++}
++
++void AutotuningDump::dumpModule(Module &M) {
++  std::unique_ptr<raw_ostream> fptr;
++  LLVM_DEBUG(dbgs() << "AutotuningDump: Dump module IR files.\n");
++  if (IsIncrementalCompilation) {
++    std::string Filename = M.getSourceFileName();
++    llvm::SmallString<128> FilenameVec = StringRef(Filename);
++    llvm::sys::fs::make_absolute(FilenameVec);
++    size_t Pos = FilenameVec.rfind(".");
++    if (Pos != std::string::npos) {
++      FilenameVec.pop_back_n(FilenameVec.size() - Pos);
++      FilenameVec.append(".ll");
++    }
++    fptr = createFile(FilenameVec);
++  } else {
++    std::string File = llvm::sys::path::filename(M.getName()).str();
++    std::string DirectoryName = getDirectoryName(File);
++    std::string FileName = getFileName(M.getName().str());
++    fptr = createFile(DirectoryName + FileName);
++  }
++
++  M.print(*fptr, nullptr, true, false);
++}
++
++void AutotuningDump::dumpFunctions(Module &M) {
++  std::string FilePath = M.getName().str();
++  std::replace(FilePath.begin(), FilePath.end(), '/', '_');
++  std::string DirectoryName = getDirectoryName(FilePath);
++  for (Function &F : M.getFunctionList()) { // go through all functions
++    if (F.isDeclaration() || F.empty())
++      continue;
++
++    AutoTuningEnabledFunction *AutotuneFunc = &F.getATEFunction();
++    assert(AutotuneFunc);
++    autotuning::Engine.initContainer(AutotuneFunc, "autotuning-dump",
++                                     F.getName(), false);
++    std::string FuncName = F.getName().str();
++    // check the whole function
++    if (AutotuneFunc->requiresIRDump(true)) {
++      auto fptr = createFile(DirectoryName + Twine(FuncName) + ".ll");
++      this->dumpToStream(*fptr, F);
++    }
++  }
++}
++
++void AutotuningDump::dumpLoops(Module &M,
++                               function_ref<LoopInfo &(Function &)> GetLI) {
++  for (Function &F : M) {
++    // Nothing to do for declarations.
++    if (F.isDeclaration() || F.empty())
++      continue;
++
++    LoopInfo &LI = GetLI(F);
++    for (auto &L : LI.getLoopsInPreorder()) {
++      Function *Func = nullptr;
++      StringRef FuncName = "";
++      if (!L->isInvalid())
++        Func = L->getHeader()->getParent();
++      if (Func)
++        FuncName = Func->getName();
++
++      autotuning::Engine.initContainer(L, "autotuning-dump", FuncName, false);
++      if (L->requiresIRDump()) {
++        std::string FuncName = L->getCodeRegion().getFuncName();
++        unsigned SourceLine = L->getCodeRegion().getSourceLoc().SourceLine;
++        std::string DirectoryName = AutoTuneDirPath + "/" +
++                                    llvm::sys::path::filename(FuncName).str() +
++                                    "_loop_" + std::to_string(SourceLine);
++        std::string FileName = std::to_string(this->getConfigNumber()) + ".ll";
++        auto fptr = createFile(DirectoryName + "/" + FileName);
++        this->dumpToStream(*fptr, *L);
++      }
++    }
++  }
++}
++
++bool AutotuningDump::run(Module &M,
++                         function_ref<LoopInfo &(Function &)> GetLI) {
++  // Change to absolute path.
++  SmallString<256> OutputPath = StringRef(AutoTuneDirPath);
++  sys::fs::make_absolute(OutputPath);
++
++  // Create the output directory if it does not already exist.
++  if (std::error_code EC = sys::fs::create_directories(OutputPath)) {
++    llvm::errs() << (make_error<StringError>(
++        "could not create directory: " + Twine(OutputPath) + ": " +
++            EC.message(),
++        EC));
++    return false;
++  }
++
++  if (IsIncrementalCompilation) {
++    LLVM_DEBUG(
++        dbgs()
++        << "AutotuningDump: writing IR files for incremental compilation.\n");
++    dumpModule(M);
++    return false;
++  }
++
++  switch (AutotuningDumpMode) {
++  case whole_modules:
++    dumpModule(M);
++    break;
++  case functions:
++    dumpFunctions(M);
++    break;
++  case loops:
++    dumpLoops(M, GetLI);
++  }
++
++  return false;
++}
++
++AutotuningDumpLegacy::AutotuningDumpLegacy(bool IncrementalCompilation)
++    : ModulePass(AutotuningDumpLegacy::ID) {
++  IsIncrementalCompilation = IncrementalCompilation;
++  initializeAutotuningDumpLegacyPass(*PassRegistry::getPassRegistry());
++}
++
++bool AutotuningDumpLegacy::runOnModule(Module &M) {
++  if (!autotuning::Engine.isDumpEnabled())
++    return false;
++
++  auto GetLI = [this](Function &F) -> LoopInfo & {
++    return getAnalysis<LoopInfoWrapperPass>(F).getLoopInfo();
++  };
++
++  AutotuningDump Impl(IsIncrementalCompilation);
++  return Impl.run(M, GetLI);
++}
++
++StringRef AutotuningDumpLegacy::getPassName() const {
++  return "Autotuning Dump";
++}
++
++void AutotuningDumpLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
++  AU.setPreservesAll();
++  AU.addRequired<LoopInfoWrapperPass>();
++}
++
++char AutotuningDumpLegacy::ID = 0;
++INITIALIZE_PASS_BEGIN(AutotuningDumpLegacy, "autotuning-dump",
++                      "Dump IR for Autotuned Code Regions", false, false)
++INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
++INITIALIZE_PASS_END(AutotuningDumpLegacy, "autotuning-dump",
++                    "Dump IR for Autotuned Code Regions", false, false)
++
++ModulePass *llvm::createAutotuningDumpPass() {
++  return new AutotuningDumpLegacy();
++}
++
++AnalysisKey AutotuningDumpAnalysis::Key;
++
++AutotuningDumpAnalysis::Result
++AutotuningDumpAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
++  if (!autotuning::Engine.isDumpEnabled())
++    return false;
++
++  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
++  auto GetLI = [&FAM](Function &F) -> LoopInfo & {
++    return FAM.getResult<LoopAnalysis>(F);
++  };
++
++  AutotuningDump Impl(IsIncrementalCompilation);
++  Impl.run(M, GetLI);
++  return false;
++}
++#endif
+diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
+index 4a1797c42789..9c6a70f0221f 100644
+--- a/llvm/lib/Analysis/CMakeLists.txt
++++ b/llvm/lib/Analysis/CMakeLists.txt
+@@ -30,6 +30,7 @@ add_llvm_component_library(LLVMAnalysis
+   Analysis.cpp
+   AssumeBundleQueries.cpp
+   AssumptionCache.cpp
++  AutotuningDump.cpp
+   BasicAliasAnalysis.cpp
+   BlockFrequencyInfo.cpp
+   BlockFrequencyInfoImpl.cpp
+@@ -153,6 +154,7 @@ add_llvm_component_library(LLVMAnalysis
+   ${MLLinkDeps}
+ 
+   LINK_COMPONENTS
++  AutoTuner
+   BinaryFormat
+   Core
+   Object
+diff --git a/llvm/lib/Analysis/InlineAdvisor.cpp b/llvm/lib/Analysis/InlineAdvisor.cpp
+index e2480d51d372..f6b3c14a0345 100644
+--- a/llvm/lib/Analysis/InlineAdvisor.cpp
++++ b/llvm/lib/Analysis/InlineAdvisor.cpp
+@@ -383,15 +383,27 @@ llvm::shouldInline(CallBase &CB,
+   Function *Callee = CB.getCalledFunction();
+   Function *Caller = CB.getCaller();
+ 
++#if defined(ENABLE_AUTOTUNER)
++  // Get the code region to add BaselineConfig values for inlining.
++  const autotuning::CodeRegion &CR = CB.ATECallSite.get()->getCodeRegion();
++  static const std::string ForceInlineParamStr = "ForceInline";
++#endif
++
+   if (IC.isAlways()) {
+     LLVM_DEBUG(dbgs() << " Inlining " << inlineCostStr(IC)
+                       << ", Call: " << CB << "\n");
++#if defined(ENABLE_AUTOTUNER)
++    autotuning::Engine.addOpportunity(CR, {{ForceInlineParamStr, "1"}});
++#endif
+     return IC;
+   }
+ 
+   if (!IC) {
+     LLVM_DEBUG(dbgs() << " NOT Inlining " << inlineCostStr(IC)
+                       << ", Call: " << CB << "\n");
++#if defined(ENABLE_AUTOTUNER)
++    autotuning::Engine.addOpportunity(CR, {{ForceInlineParamStr, "0"}});
++#endif
+     if (IC.isNever()) {
+       ORE.emit([&]() {
+         return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
+@@ -417,6 +429,9 @@ llvm::shouldInline(CallBase &CB,
+     LLVM_DEBUG(dbgs() << " NOT Inlining: " << CB
+                       << " Cost = " << IC.getCost()
+                       << ", outer Cost = " << TotalSecondaryCost << '\n');
++#if defined(ENABLE_AUTOTUNER)
++    autotuning::Engine.addOpportunity(CR, {{ForceInlineParamStr, "0"}});
++#endif
+     ORE.emit([&]() {
+       return OptimizationRemarkMissed(DEBUG_TYPE, "IncreaseCostInOtherContexts",
+                                       Call)
+@@ -430,6 +445,9 @@ llvm::shouldInline(CallBase &CB,
+ 
+   LLVM_DEBUG(dbgs() << " Inlining " << inlineCostStr(IC) << ", Call: " << CB
+                     << '\n');
++#if defined(ENABLE_AUTOTUNER)
++  autotuning::Engine.addOpportunity(CR, {{ForceInlineParamStr, "1"}});
++#endif
+   return IC;
+ }
+ 
+diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
+index a2f46edcf5ef..9f8f57865de2 100644
+--- a/llvm/lib/Analysis/InlineCost.cpp
++++ b/llvm/lib/Analysis/InlineCost.cpp
+@@ -162,6 +162,14 @@ static cl::opt<bool> DisableGEPConstOperand(
+     "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
+     cl::desc("Disables evaluation of GetElementPtr with constant operands"));
+ 
++#if defined(ENABLE_AUTOTUNER)
++static cl::opt<bool>
++    EnableLocalCallSiteTuning("auto-tuning-enable-local-callsite-tuning",
++                              cl::init(false), cl::Hidden,
++                              cl::desc("Enable AutoTuning for local callsites "
++                                       "as well."));
++#endif
++
+ namespace llvm {
+ std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
+   if (Attr.isValid()) {
+@@ -2990,6 +2998,27 @@ InlineCost llvm::getInlineCost(
+     return llvm::InlineCost::getNever(UserDecision->getFailureReason());
+   }
+ 
++#if defined(ENABLE_AUTOTUNER)
++  if (autotuning::Engine.isEnabled() && Call.getCaller() &&
++      (!Callee->hasLocalLinkage() || EnableLocalCallSiteTuning)) {
++    bool ForceInline = false;
++    bool Found = false;
++
++    autotuning::Engine.initContainer(Call.ATECallSite.get(), "inline",
++                                     Call.getCaller()->getName(),
++                                     /* addOpportunity */ false);
++
++    Found = Call.ATECallSite->lookUpParams<bool>("ForceInline", ForceInline);
++
++    if (Found) {
++      if (ForceInline)
++        return llvm::InlineCost::getAlways("Force inlined by auto-tuning");
++      else
++        return llvm::InlineCost::getNever("Force non-inlined by auto-tuning");
++    }
++  }
++#endif
++
+   LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
+                           << "... (caller:" << Call.getCaller()->getName()
+                           << ")\n");
+diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp
+index 60a72079e864..36aca73ee675 100644
+--- a/llvm/lib/Analysis/LoopInfo.cpp
++++ b/llvm/lib/Analysis/LoopInfo.cpp
+@@ -37,6 +37,10 @@
+ #include "llvm/Support/CommandLine.h"
+ #include "llvm/Support/GenericLoopInfoImpl.h"
+ #include "llvm/Support/raw_ostream.h"
++#if defined(ENABLE_AUTOTUNER)
++#include "llvm/AutoTuner/AutoTuning.h"
++#include "llvm/IR/StructuralHash.h"
++#endif
+ using namespace llvm;
+ 
+ // Explicitly instantiate methods in LoopInfoImpl.h for IR-level Loops.
+@@ -663,6 +667,54 @@ Loop::LocRange Loop::getLocRange() const { + return LocRange(); + } + ++#if defined(ENABLE_AUTOTUNER) ++uint64_t Loop::computeStructuralHash() { ++ std::vector<BasicBlock *> BBs = getBlocks(); ++ return StructuralHash(BBs); ++} ++ ++void Loop::initCodeRegion() { ++ std::string LoopName; ++ // use the header's name as the loop name ++ if (BasicBlock *Header = getHeader()) { ++ if (Header->hasName()) { ++ LoopName = Header->getName().str(); ++ } ++ // if the header doesn't have a name, ++ // use the label of this header from AsmWriter ++ else { ++ std::string Str; ++ llvm::raw_string_ostream RSO(Str); ++ Header->printAsOperand(RSO); ++ LoopName = RSO.str(); ++ } ++ } else { ++ LoopName = "<unnamed loop>"; ++ } ++ ++ Function *F = this->getHeader()->getParent(); ++ StringRef FuncName = F->getName(); ++ ++ // init the CodeRegion ++ autotuning::CodeRegion CR = autotuning::CodeRegion( ++ LoopName, FuncName.data(), autotuning::CodeRegionType::Loop, ++ this->getStartLoc()); ++ // Compute the number of non-debug IR instructions in this loop. ++ unsigned TotalNumInstrs = 0; ++ for (const BasicBlock *BB : this->getBlocks()) { ++ unsigned NumInstrs = std::distance(BB->instructionsWithoutDebug().begin(), ++ BB->instructionsWithoutDebug().end()); ++ TotalNumInstrs += NumInstrs; ++ } ++ CR.setSize(TotalNumInstrs); ++ // Compute hotness. ++ autotuning::HotnessType Hotness = F->ATEFunction.getHotness(); ++ CR.setHotness(Hotness); ++ ++ this->setCodeRegion(CR); ++} ++#endif ++ + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void Loop::dump() const { print(dbgs()); } + +diff --git a/llvm/lib/AutoTuner/AutoTuning.cpp b/llvm/lib/AutoTuner/AutoTuning.cpp +new file mode 100644 +index 000000000000..1f09f06d84a2 +--- /dev/null ++++ b/llvm/lib/AutoTuner/AutoTuning.cpp +@@ -0,0 +1,705 @@ ++#if defined(ENABLE_AUTOTUNER) ++//===-- AutoTuning.cpp - Auto-Tuning --------------------------------------===// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// This file defines Auto Tuning related functions, models and interfaces. ++// ++//===----------------------------------------------------------------------===// ++ ++#include "llvm/AutoTuner/AutoTuning.h" ++#include "llvm/ADT/STLExtras.h" ++#include "llvm/ADT/StringRef.h" ++#include "llvm/AutoTuner/AutoTuningRemarkManager.h" ++#include "llvm/Support/CommandLine.h" ++#include "llvm/Support/Error.h" ++#include "llvm/Support/Process.h" ++ ++// Enable debug messages for AutoTuning. ++#define DEBUG_TYPE "autotuning" ++ ++using namespace llvm; ++ ++// defined in 'lib/Remarks/YAMLRemarkParser.cpp'. ++extern cl::opt<bool> OmitAutotuningMetadata; ++ ++// -auto-tuning-input - Command line option to specify the input file. ++static cl::opt<std::string> InputFile("auto-tuning-input", cl::Hidden, ++ cl::desc("Specify the input file")); ++ ++// -auto-tuning-opp - Command line option to specify the output directory of ++// tuning opportunities. 
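++// Supplying this option switches the engine into opportunity-generation mode
++// (GenerateOutput) when AutoTuningEngine::init() runs.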
++static cl::opt<std::string> OutputOppDir( ++ "auto-tuning-opp", cl::Hidden, ++ cl::desc("Specify the output directory of tuning opportunities")); ++ ++static cl::opt<std::string> ++ RemarksPasses("auto-tuning-pass-filter", cl::Hidden, ++ cl::desc("Only dump auto-tuning remarks from passes whose " ++ "names match the given regular expression"), ++ cl::value_desc("regex")); ++ ++static cl::opt<std::string> ++ ProjectDir("autotuning-project-dir", cl::Hidden, cl::init(""), ++ cl::desc("Specify project base dir to make code region name " ++ "relative to base dir. This operation will only be " ++ "applied for coarse-grain code regions.")); ++ ++// -auto-tuning-config-id - Command line option to specify the config number ++// being used for compilation. Required only for ML guidance feature. ++static cl::opt<int> CFGNumber( ++ "auto-tuning-config-id", cl::Hidden, ++ cl::desc( ++ "Specify the auto-tuning configuration ID used in this compilation.")); ++ ++static cl::opt<std::string> OutputFormat( ++ "auto-tuning-remark-format", cl::Hidden, ++ cl::desc("The format used for auto-tuning remarks (default: YAML)"), ++ cl::value_desc("format"), cl::init("yaml")); ++ ++// AutoTuner incremental compilation options. ++cl::opt<AutoTuningCompileOpt> AutoTuningCompileMode( ++ "auto-tuning-compile-mode", cl::Hidden, cl::init(Inactive), ++ cl::desc("AutoTuner: Choose incremental compilation mode."), ++ cl::values(clEnumVal(Inactive, ++ "AutoTuner: Disable incremental compilation."), ++ clEnumVal(CoarseGrain, "AutoTuner: Enable incremental " ++ "compilation for coarse grain tuning."), ++ clEnumVal(FineGrain, "AutoTuner: Enable incremental compilation " ++ "for fine grain tuning."), ++ clEnumVal(Basic, "AutoTuner: Enable incremental compilation for " ++ "any kind of code region."))); ++ ++static cl::opt<bool> ++ EnableAutoTuningDump("enable-autotuning-dump", cl::Hidden, cl::init(false), ++ cl::desc("Enable AutoTuningDump Pass")); ++ ++static cl::opt<bool> ++ ThinLTOTuning("autotuning-thin-lto", cl::Hidden, cl::init(false), ++ cl::desc("AutoTuner enabled in ThinLTO mode.")); ++ ++namespace autotuning { ++ ++static cl::list<CodeRegionType> AutotuningOutputFilter( ++ "auto-tuning-type-filter", cl::Hidden, cl::CommaSeparated, ++ cl::desc( ++ "Select types of code regions to dump auto-tuning opportunities for:"), ++ cl::values(clEnumVal(LLVMParam, "LLVMParam code regions only"), ++ clEnumVal(ProgramParam, "ProgramParam code regions only"), ++ clEnumVal(CallSite, "CallSite code regions only"), ++ clEnumVal(Function, "Function code regions only"), ++ clEnumVal(Loop, "Loop code regions only"), ++ clEnumVal(MachineBasicBlock, ++ "Machine basic block code regions only"), ++ clEnumVal(Switch, "Switch code regions only"), ++ clEnumVal(Other, "All other types of code regions"))); ++ ++static cl::list<std::string> AutotuningFunctionFilter( ++ "auto-tuning-function-filter", cl::Hidden, cl::CommaSeparated, ++ cl::desc("Apply code region filtering based on function names")); ++ ++static const cl::opt<bool> ExcludeColdCodeRegion( ++ "auto-tuning-exclude-cold", cl::Hidden, cl::init(true), ++ cl::desc("Use profile data to prune cold code regions from auto-tuning")); ++ ++static const cl::opt<bool> CodeRegionMatchingWithHash( ++ "auto-tuning-code-region-matching-hash", cl::Hidden, cl::init(true), ++ cl::desc("Use IR hashing to match the Code Regions")); ++ ++static const cl::opt<bool> HotCodeRegionOnly( ++ "auto-tuning-hot-only", cl::Hidden, cl::init(false), ++ cl::desc( ++ "Use profile data to include hot code regions only from 
auto-tuning")); ++ ++static const cl::opt<unsigned> ++ SizeThreshold("auto-tuning-size-threshold", cl::Hidden, cl::init(0), ++ cl::desc("Prune small code regions from auto-tuning with a " ++ "size smaller than the threshold")); ++ ++static inline const std::string generateName(const std::string &Name) { ++ if (Name.empty()) ++ return "unnamed"; ++ else ++ return Name; ++} ++ ++//===----------------------------------------------------------------------===// ++// CodeRegion implementation ++CodeRegion::CodeRegion(const CodeRegionType Type) : Type(Type) {} ++ ++CodeRegion::CodeRegion(const std::string &Name, const std::string &FuncName, ++ const CodeRegionType &Type, const DebugLoc &DL, ++ const DynamicOptions DO) { ++ this->Name = generateName(Name); ++ this->FuncName = generateName(FuncName); ++ this->Type = Type; ++ this->StringType = getTypeAsString(Type); ++ if (DL) { ++ StringRef File = DL->getFilename(); ++ unsigned Line = DL->getLine(); ++ unsigned Col = DL->getColumn(); ++ this->Location = SourceLocation{File.str(), Line, Col}; ++ } ++ this->AutoTunerOptions = DO; ++} ++ ++CodeRegion::CodeRegion(const std::string &Name, const std::string &FuncName, ++ const CodeRegionType &Type, ++ const SourceLocation &Location, ++ const DynamicOptions DO) { ++ this->Name = generateName(Name); ++ this->FuncName = generateName(FuncName); ++ this->Type = Type; ++ this->StringType = getTypeAsString(Type); ++ this->Location = Location; ++ this->AutoTunerOptions = DO; ++} ++ ++CodeRegion::CodeRegion(const std::string &Name, const std::string &FuncName, ++ const std::string &PassName, const CodeRegionType &Type, ++ const SourceLocation &Location, ++ const unsigned int Invocation) ++ : CodeRegion(Name, FuncName, Type, Location) { ++ this->PassName = generateName(PassName); ++ this->Invocation = Invocation; ++} ++ ++bool CodeRegion::operator==(const CodeRegion &CodeRegion) const { ++ bool IsEqual = false; ++ if (OmitAutotuningMetadata) ++ IsEqual = (this->getHash() == CodeRegion.getHash()) && ++ (this->Type == CodeRegion.getType()) && ++ (this->PassName == CodeRegion.getPassName()); ++ else { ++ IsEqual = (this->Type == CodeRegion.getType()) && ++ (this->Name == CodeRegion.getName()) && ++ (this->PassName == CodeRegion.getPassName()) && ++ (this->FuncName == CodeRegion.getFuncName()) && ++ (this->Location == CodeRegion.getSourceLoc()); ++ if (CodeRegionMatchingWithHash) ++ IsEqual = IsEqual && (this->getHash() == CodeRegion.getHash()); ++ } ++ ++ if (autotuning::Engine.ParseInput) ++ IsEqual = IsEqual && this->getInvocation() == CodeRegion.getInvocation(); ++ ++ if (autotuning::Engine.GenerateOutput) ++ IsEqual = ++ IsEqual && this->getBaselineConfig() == CodeRegion.getBaselineConfig(); ++ ++ return IsEqual; ++} ++ ++std::string CodeRegion::getTypeAsString(CodeRegionType CRType) { ++ switch (CRType) { ++ case autotuning::CodeRegionType::MachineBasicBlock: ++ return "machine_basic_block"; ++ case autotuning::CodeRegionType::Loop: ++ return "loop"; ++ case autotuning::CodeRegionType::Function: ++ return "function"; ++ case autotuning::CodeRegionType::CallSite: ++ return "callsite"; ++ case autotuning::CodeRegionType::LLVMParam: ++ return "llvm-param"; ++ case autotuning::CodeRegionType::ProgramParam: ++ return "program-param"; ++ case autotuning::CodeRegionType::Switch: ++ return "switch"; ++ default: ++ return "other"; ++ } ++} ++ ++std::string CodeRegion::getHotnessAsString(HotnessType Hotness) { ++ switch (Hotness) { ++ case autotuning::HotnessType::Cold: ++ return "cold"; ++ case 
autotuning::HotnessType::Hot:
++    return "hot";
++  default:
++    return "unknown";
++  }
++}
++
++void CodeRegion::setPassName(const std::string &NewPassName) {
++  this->PassName = generateName(NewPassName);
++}
++
++/* static */
++autotuning::CodeRegion CodeRegion::getInvalidInstance() {
++  static autotuning::CodeRegion Invalid =
++      CodeRegion(autotuning::CodeRegionType::Invalid);
++  return Invalid;
++}
++
++/* static */
++autotuning::CodeRegion CodeRegion::getEmptyInstance() {
++  static autotuning::CodeRegion Empty =
++      CodeRegion(autotuning::CodeRegionType::Empty);
++  return Empty;
++}
++
++//===----------------------------------------------------------------------===//
++// Container implementation
++//
++
++const CodeRegion &Container::getCodeRegion() const { return CR; }
++
++void Container::setCodeRegion(const CodeRegion &NewCR) { this->CR = NewCR; }
++
++template <typename T>
++bool Container::lookUpParams(const std::string &ParamsName, T &Value) const {
++  bool Found = false;
++  auto ConfigMapIterator = Engine.ParamTable.find(CR);
++  if (ConfigMapIterator != Engine.ParamTable.end()) {
++    ParameterManager InputParams = ConfigMapIterator->second;
++    Found = InputParams.findByName(ParamsName, Value);
++    if (Found) {
++      LLVM_DEBUG(dbgs() << ParamsName << " is set for the CodeRegion: \n"
++                        << " Name: " << CR.getName() << "\n"
++                        << " FuncName: " << CR.getFuncName() << "\n"
++                        << " PassName: " << CR.getPassName() << "\n"
++                        << " Type: " << CR.getTypeAsString() << "\n"
++                        << " Hash: " << CR.getHash() << "\n"
++                        << "\n");
++    }
++  }
++  return Found;
++}
++
++bool Container::requiresIRDump(bool IsFunctionIR) const {
++  auto findBaselineRegion = [&]() -> bool {
++    for (auto &entry : Engine.TuningOpps)
++      if (!IsFunctionIR) {
++        if (CR.getSourceLoc() == entry.getSourceLoc())
++          return true;
++      } else {
++        if (CR.getFileName() == entry.getFileName() &&
++            CR.getFuncName() == entry.getFuncName())
++          return true;
++      }
++    return false;
++  };
++  auto findNonBaselineRegion = [&]() {
++    for (auto &entry : Engine.ParamTable)
++      if (!IsFunctionIR) {
++        if (CR.getSourceLoc() == entry.first.getSourceLoc())
++          return true;
++      } else {
++        if (CR.getFileName() == entry.first.getFileName() &&
++            CR.getFuncName() == entry.first.getFuncName())
++          return true;
++      }
++    return false;
++  };
++
++  if (CFGNumber == -1)
++    return findBaselineRegion();
++  else
++    return findNonBaselineRegion();
++}
++
++template bool Container::lookUpParams<int>(const std::string &ParamsName,
++                                           int &Value) const;
++template bool Container::lookUpParams<bool>(const std::string &ParamsName,
++                                            bool &Value) const;
++template bool
++Container::lookUpParams<std::string>(const std::string &ParamsName,
++                                     std::string &Value) const;
++template bool Container::lookUpParams<std::vector<std::string>>(
++    const std::string &ParamsName, std::vector<std::string> &Value) const;
++
++static unsigned int count(SmallVector<CallSiteLocation, 10> CallSiteLocs,
++                          CallSiteLocation Loc) {
++  unsigned int Count = 0;
++  for (unsigned int Idx = 0; Idx < CallSiteLocs.size(); ++Idx) {
++    if (Loc.Caller == CallSiteLocs[Idx].Caller &&
++        Loc.Callee == CallSiteLocs[Idx].Callee)
++      Count++;
++  }
++  return Count;
++}
++
++bool AutoTuningEngine::isThinLTOTuning() const { return ThinLTOTuning; }
++
++CodeRegionType AutoTuningEngine::convertPassToType(std::string PassName) {
++  auto Search = PTTMap.find(PassName);
++  if (Search == PTTMap.end())
++    llvm_unreachable(
++        "AutoTuningEngine: Invalid/unsupported optimization pass provided.\n");
++  return Search->second;
++}
++
++void
AutoTuningEngine::insertCallSiteLoc(CallSiteLocation Loc) {
++  CallSiteLocs.emplace_back(Loc);
++}
++
++// If a function has multiple calls to the same callee, insert all the calls
++// that become available due to inlining of such calls into the CallSiteLocs
++// vector. The source line is recorded as "original call line no + new call
++// line no" instead of the DebugLoc line number.
++void AutoTuningEngine::updateCallSiteLocs(llvm::CallBase *OldCB,
++                                          llvm::CallBase *NewCB,
++                                          llvm::Function *Callee,
++                                          unsigned int Line) {
++  for (unsigned int Idx = 0; Idx < CallSiteLocs.size(); ++Idx) {
++    if (OldCB == CallSiteLocs[Idx].CB) {
++      CallSiteLocation Loc = CallSiteLocs[Idx];
++      Loc.CB = NewCB;
++      Loc.Callee = Callee;
++      Loc.SrcLoc.SourceLine = Loc.SrcLoc.SourceLine + Line;
++      CallSiteLocs.emplace_back(Loc);
++      break;
++    }
++  }
++}
++
++void AutoTuningEngine::cleanCallSiteLoc() {
++  unsigned int Size = CallSiteLocs.size();
++  unsigned int Idx = 0;
++  for (unsigned int I = 0; I < Size; ++I) {
++    CallSiteLocation Loc = CallSiteLocs[Idx];
++    unsigned int Count = count(CallSiteLocs, Loc);
++    if (Count == 1) {
++      CallSiteLocs.erase(CallSiteLocs.begin() + Idx);
++      continue;
++    }
++    Idx++;
++  }
++}
++
++void AutoTuningEngine::clearCallSiteLocs() { CallSiteLocs.clear(); }
++
++std::optional<unsigned int>
++AutoTuningEngine::getCallSiteLoc(llvm::CallBase *CB) {
++  for (unsigned int Idx = 0; Idx < CallSiteLocs.size(); ++Idx) {
++    if (CB == CallSiteLocs[Idx].CB)
++      return CallSiteLocs[Idx].SrcLoc.SourceLine;
++  }
++  return std::nullopt;
++}
++
++void AutoTuningEngine::addOpportunity(
++    const CodeRegion &OppCR,
++    std::map<std::string, std::string> BaselineConfig) {
++  if (!OppCR.Initialized)
++    return;
++
++  OppCR.setBaselineConfig(BaselineConfig);
++  if (!TuningOpps.contains(OppCR))
++    TuningOpps.insert(OppCR);
++  else if (OppCR.getHotness() != Unknown) {
++    // If OppCR already exists in TuningOpps with unknown hotness,
++    // then update it if the current hotness is hot/cold.
++    auto OppI = find(TuningOpps, OppCR);
++    if (OppI->getHotness() == Unknown)
++      OppI->setHotness(OppCR.getHotness());
++  }
++}
++
++void AutoTuningEngine::applyOppFilters(CodeRegions &CRs) {
++  CodeRegions NewCRs;
++  for (CodeRegion CR : CRs) {
++    if (AutotuningOutputFilter.getNumOccurrences() > 0) {
++      bool IsMatched = false;
++      for (auto CRType : AutotuningOutputFilter) {
++        if (CRType == CR.getType()) {
++          IsMatched = true;
++          break;
++        }
++      }
++      // Filter out the CodeRegion if its type fails to match any types
++      // specified from the command line.
++      if (!IsMatched)
++        continue;
++    }
++    if (SizeThreshold.getNumOccurrences() > 0 && CR.getSize() < SizeThreshold)
++      continue;
++    if (ExcludeColdCodeRegion && CR.isCold()) {
++      LLVM_DEBUG(dbgs() << "Skip CodeRegion with cold function "
++                        << CR.getFuncName() << "\n");
++      continue;
++    }
++    if (HotCodeRegionOnly && !CR.isHot()) {
++      LLVM_DEBUG(dbgs() << "Skip CodeRegion with " << CR.getHotnessAsString()
++                        << " function " << CR.getFuncName() << "\n");
++      continue;
++    }
++    NewCRs.insert(CR);
++    LLVM_DEBUG(dbgs() << "CodeRegion added as a tuning opportunity: \n"
++                      << " Name: " << CR.getName() << "\n"
++                      << " FuncName: " << CR.getFuncName() << "\n"
++                      << " PassName: " << CR.getPassName() << "\n"
++                      << " Type: " << CR.getTypeAsString() << "\n"
++                      << " Size: " << CR.getSize() << "\n"
++                      << " Hotness: " << CR.getHotnessAsString() << "\n"
++                      << " Hash: " << CR.getHash() << "\n"
++                      << " Location: " << CR.getSourceLoc().SourceFilePath
++                      << "; " << CR.getSourceLoc().SourceLine << "; "
++                      << CR.getSourceLoc().SourceColumn << "\n\n");
++  }
++  if (AutotuningOutputFilter.getNumOccurrences() == 0 ||
++      std::find(AutotuningOutputFilter.begin(), AutotuningOutputFilter.end(),
++                Other) != AutotuningOutputFilter.end()) {
++    // Add an empty CodeRegion with the ModuleID as a tuning opportunity.
++    // It can be used to represent a module-level code region.
++    autotuning::CodeRegion GlobalCR =
++        CodeRegion(ModuleID, "none", "all", Other);
++    GlobalCR.setHash(llvm::hash_combine(ModuleID, Other));
++    NewCRs.insert(GlobalCR);
++    LLVM_DEBUG(dbgs() << "Module added as a tuning opportunity: \n"
++                      << " Name: " << GlobalCR.getName() << "\n"
++                      << " Hash: " << GlobalCR.getHash() << "\n"
++                      << "\n");
++  }
++
++  // Include LLVMParam as a tuning opportunity only if it is specified with
++  // -auto-tuning-type-filter.
++  if (std::find(AutotuningOutputFilter.begin(), AutotuningOutputFilter.end(),
++                LLVMParam) != AutotuningOutputFilter.end())
++    NewCRs.insert(CodeRegion(ModuleID, "none", "none", LLVMParam));
++
++  if (std::find(AutotuningOutputFilter.begin(), AutotuningOutputFilter.end(),
++                ProgramParam) != AutotuningOutputFilter.end())
++    NewCRs.insert(CodeRegion(ModuleID, "none", "none", ProgramParam));
++
++  CRs = NewCRs;
++}
++
++bool AutoTuningEngine::applyFunctionFilter(std::string FuncName) {
++  if (AutotuningFunctionFilter.getNumOccurrences() == 0)
++    return true;
++
++  for (std::string FunctionFilter : AutotuningFunctionFilter)
++    if (FuncName == FunctionFilter)
++      return true;
++
++  return false;
++}
++
++void AutoTuningEngine::initContainer(Container *Container,
++                                     const std::string &PassName,
++                                     const StringRef FuncName,
++                                     bool AddOpportunity,
++                                     unsigned int Invocation) {
++  if (Enabled) {
++    if (!isTuningAllowedForType(convertPassToType(PassName)) &&
++        !(isGenerateOutput() &&
++          AutotuningOutputFilter.getNumOccurrences() == 0))
++      return;
++
++    if (!applyFunctionFilter(FuncName.str()))
++      return;
++
++    // The attributes of a Container could potentially change over time, even
++    // with the same pass, if the associated pass is invoked multiple times at
++    // different places in the pipeline. Therefore, we need to initCodeRegion
++    // every time this function is called to ensure that the CodeRegion with
++    // the latest information is added as a tuning opportunity.
++    Container->initCodeRegion();
++    if (Container->CR.getType() == autotuning::CodeRegionType::Invalid)
++      return;
++
++    uint64_t hash = Container->computeStructuralHash();
++    CodeRegion &OppCR = Container->CR;
++    if (GenerateOutput) {
++      if (OppCR.getSize() < SizeThreshold)
++        return;
++      if (ExcludeColdCodeRegion && OppCR.isCold()) {
++        LLVM_DEBUG(dbgs() << "Skip CodeRegion with cold function "
++                          << OppCR.getFuncName() << "\n");
++        return;
++      }
++      if (HotCodeRegionOnly && !OppCR.isHot()) {
++        LLVM_DEBUG(dbgs() << "Skip CodeRegion with "
++                          << OppCR.getHotnessAsString() << " function "
++                          << OppCR.getFuncName() << "\n");
++        return;
++      }
++    }
++    OppCR.setPassName(PassName);
++    OppCR.setHash(hash);
++    OppCR.setInvocation(Invocation);
++    OppCR.Initialized = true;
++    if (AddOpportunity)
++      addOpportunity(OppCR);
++  }
++}
++
++bool AutoTuningEngine::shouldRunOptPass(std::string Filename,
++                                        std::string Pass) {
++  return OppPassList.count(Filename) ? OppPassList[Filename].count(Pass)
++                                     : false;
++}
++
++Error AutoTuningEngine::init(const std::string &Module) {
++  ParseInput = false;
++  if (std::optional<std::string> MaybePath =
++          llvm::sys::Process::GetEnv("AUTOTUNE_INPUT")) {
++    InputFile = *MaybePath;
++    ParseInput = true;
++  } else if (InputFile.getNumOccurrences() > 0) {
++    ParseInput = true;
++  }
++
++  GenerateOutput = false;
++  if (OutputOppDir.getNumOccurrences() > 0)
++    GenerateOutput = true;
++
++  // Invocation of either of the following command line options
++  // (auto-tuning-input and auto-tuning-opp) or the env variable
++  // AUTOTUNE_INPUT can enable auto-tuning mode.
++  if (ParseInput || GenerateOutput) {
++    Enabled = true;
++    // Generate absolute path and remove the base directory (if available).
++    // A relative path will be used as the (coarse-grain) code region name.
++    llvm::SmallString<128> ModuleVec = StringRef(Module);
++    llvm::sys::fs::make_absolute(ModuleVec);
++    if (ProjectDir.size() && ModuleVec.startswith(ProjectDir))
++      ModuleID = ModuleVec.substr(ProjectDir.size()).str();
++    else
++      ModuleID = std::string(ModuleVec);
++  }
++
++  // Initialization of the map used for pass-name to CodeRegionType
++  // conversion.
++  PTTMap = {{"loop-unroll", Loop},
++            {"loop-vectorize", Loop},
++            {"inline", CallSite},
++            {"machine-scheduler", MachineBasicBlock},
++            {"switch-lowering", Switch},
++            {"autotuning-dump", Function}};
++
++  if (ParseInput) {
++    // Currently we only support the yaml format for input.
++    if (Error E = AutoTuningRemarkManager::read(*this, InputFile, "yaml")) {
++      errs() << "Error parsing auto-tuning input.\n";
++      return E;
++    } else {
++      LLVM_DEBUG(dbgs() << "AutoTuningEngine is initialized.\n"
++                        << " Size of ParamTable: " << this->ParamTable.size()
++                        << "\n");
++      if (LLVMParams.size())
++        LLVM_DEBUG(dbgs() << "AutoTuner: LLVMParams applied.\n");
++      if (ProgramParams.size())
++        LLVM_DEBUG(dbgs() << "AutoTuner: ProgramParams applied.\n");
++    }
++  }
++
++  for (auto CRType : AutotuningOutputFilter)
++    CodeRegionFilterTypes.insert(CRType);
++
++  if (GenerateOutput) {
++    switch (AutoTuningCompileMode) {
++    case CoarseGrain: {
++      bool Valid = false;
++      if (AutotuningOutputFilter.getNumOccurrences() > 0) {
++        Valid = true;
++        for (auto CRType : AutotuningOutputFilter)
++          if (CRType != LLVMParam) {
++            Valid = false;
++            break;
++          }
++      }
++      if (!Valid) {
++        AutoTuningCompileMode = Inactive;
++        errs() << "AutoTunerCompile: Code region type filtering does not match"
++                  " with incremental compilation option.\n"
++                  "Disabling incremental compilation.\n";
++      }
++      break;
++    }
++    case FineGrain: {
++      bool Valid = false;
++      if (AutotuningOutputFilter.getNumOccurrences() > 0) {
++        Valid = true;
++        for (auto CRType : AutotuningOutputFilter) {
++          if (CRType != Loop && CRType != CallSite && CRType != Function) {
++            Valid = false;
++            break;
++          }
++        }
++      }
++      if (!Valid) {
++        AutoTuningCompileMode = Inactive;
++        errs() << "AutoTunerCompile: Code region type filtering does not match"
++                  " with incremental compilation option.\n"
++                  "Disabling incremental compilation.\n";
++      }
++      break;
++    }
++    case Basic:
++    case Inactive:
++      break;
++    default:
++      llvm_unreachable("AutoTuningCompile: Unknown AutoTuner Incremental "
++                       "Compilation mode.\n");
++    }
++  }
++
++  MLEnabled = (CFGNumber.getNumOccurrences() > 0);
++  if (EnableAutoTuningDump || MLEnabled)
++    DumpEnabled = true;
++  return Error::success();
++}
++
++llvm::Expected<int> AutoTuningEngine::getConfigNumber() {
++  if (!isMLEnabled()) {
++    std::string errorMsg =
++        "No Autotuner configuration specified; ML guidance is unavailable.";
++    return createStringError(inconvertibleErrorCode(), errorMsg);
++  } else
++    return CFGNumber;
++}
++
++Error AutoTuningEngine::finalize() {
++  if (OutputOppDir.getNumOccurrences() > 0) {
++    // Apply filters.
++    applyOppFilters(TuningOpps);
++    if (!TuningOpps.empty()) {
++      if (Error E = AutoTuningRemarkManager::dump(
++              *this, OutputOppDir, OutputFormat, RemarksPasses)) {
++        errs() << "Error generating auto-tuning opportunities.\n";
++        return E;
++      }
++    }
++
++    // Clear the global list of tuning opportunities when auto-tuning ends,
++    // to avoid carrying over redundant information.
++    TuningOpps.clear();
++  }
++  return Error::success();
++}
++
++template <typename T>
++bool AutoTuningEngine::lookUpGlobalParams(const std::string &ParamsName,
++                                          T &Value) const {
++  bool Found = GlobalParams.findByName(ParamsName, Value);
++  if (Found) {
++    LLVM_DEBUG(dbgs() << "Global Variable " << ParamsName << " is set.\n");
++  }
++  return Found;
++}
++
++template bool
++AutoTuningEngine::lookUpGlobalParams<int>(const std::string &ParamsName,
++                                          int &Value) const;
++template bool
++AutoTuningEngine::lookUpGlobalParams<bool>(const std::string &ParamsName,
++                                           bool &Value) const;
++template bool
++AutoTuningEngine::lookUpGlobalParams<std::string>(const std::string &ParamsName,
++                                                  std::string &Value) const;
++template bool AutoTuningEngine::lookUpGlobalParams<std::vector<std::string>>(
++    const std::string &ParamsName, std::vector<std::string> &Value) const;
++
++class AutoTuningEngine Engine;
++
++} // namespace autotuning
++
++#endif
+diff --git a/llvm/lib/AutoTuner/AutoTuningRemarkManager.cpp b/llvm/lib/AutoTuner/AutoTuningRemarkManager.cpp
+new file mode 100644
+index 000000000000..3e0506e534c4
+--- /dev/null
++++ b/llvm/lib/AutoTuner/AutoTuningRemarkManager.cpp
+@@ -0,0 +1,299 @@
++#if defined(ENABLE_AUTOTUNER)
++//===- llvm/AutoTuner/AutoTuningRemarkManager.cpp - Remark Manager --------===//
++//
++// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
++// See https://llvm.org/LICENSE.txt for license information.
++// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
++//
++//===----------------------------------------------------------------------===//
++//
++// This file contains the implementation for inputting and outputting remarks
++// for AutoTuning.
++//
++//===----------------------------------------------------------------------===//
++
++#include "llvm/AutoTuner/AutoTuningRemarkManager.h"
++#include "llvm/ADT/StringRef.h"
++#include "llvm/AutoTuner/AutoTuning.h"
++#include "llvm/AutoTuner/AutoTuningRemarkStreamer.h"
++#include "llvm/IR/DebugInfoMetadata.h"
++#include "llvm/IR/LLVMRemarkStreamer.h"
++#include "llvm/Remarks/Remark.h"
++#include "llvm/Remarks/RemarkFormat.h"
++#include "llvm/Remarks/RemarkParser.h"
++#include "llvm/Remarks/RemarkSerializer.h"
++#include "llvm/Remarks/RemarkStreamer.h"
++#include "llvm/Support/CommandLine.h"
++#include "llvm/Support/Debug.h"
++#include "llvm/Support/FileSystem.h"
++#include "llvm/Support/MemoryBuffer.h"
++#include "llvm/Support/Path.h"
++#include "llvm/Support/ToolOutputFile.h"
++
++// Enable debug messages for AutoTuner.
++#define DEBUG_TYPE "autotuning"
++
++using namespace llvm;
++using namespace autotuning;
++
++// Helper functions.
++namespace {
++// Convert string into CodeRegionType.
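++// Unrecognized strings yield an Error so callers can propagate parse failures.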
++Expected<CodeRegionType> StringToCodeRegionType(const std::string &CRType) {
++  if (CRType == "machine_basic_block")
++    return autotuning::CodeRegionType::MachineBasicBlock;
++  else if (CRType == "loop")
++    return autotuning::CodeRegionType::Loop;
++  else if (CRType == "function")
++    return autotuning::CodeRegionType::Function;
++  else if (CRType == "callsite")
++    return autotuning::CodeRegionType::CallSite;
++  else if (CRType == "llvm-param")
++    return autotuning::CodeRegionType::LLVMParam;
++  else if (CRType == "program-param")
++    return autotuning::CodeRegionType::ProgramParam;
++  else if (CRType == "switch")
++    return autotuning::CodeRegionType::Switch;
++  else if (CRType == "other")
++    return autotuning::CodeRegionType::Other;
++  else
++    return make_error<StringError>("Unsupported CodeRegionType:" + CRType,
++                                   inconvertibleErrorCode());
++}
++
++// Remark -> autotuning::ParameterManager
++ParameterManager RemarkToParameterManager(const remarks::Remark &Remark) {
++  // Create Parameters from a remark.
++  ParameterManager ParamManager;
++  for (const remarks::Argument &Arg : Remark.Args) {
++    int Value = 0;
++    if (!Arg.Val.getAsInteger(10, Value))
++      // If no errors
++      ParamManager.add(Arg.Key.str(), Value);
++    else if (Arg.Val == "true")
++      ParamManager.add(Arg.Key.str(), true);
++    else if (Arg.Val == "false")
++      ParamManager.add(Arg.Key.str(), false);
++    // If there is a value of vector type
++    else if (Arg.VectorVal) {
++      std::vector<std::string> Strings;
++      for (const StringRef &Val : *Arg.VectorVal) {
++        Strings.push_back(Val.str());
++      }
++      ParamManager.add(Arg.Key.str(), Strings);
++    } else
++      // Add as String Value
++      ParamManager.add(Arg.Key.str(), Arg.Val);
++  }
++
++  return ParamManager;
++}
++
++// Remark -> std::unordered_map<std::string, std::string>
++std::unordered_map<std::string, std::string>
++RemarkToStringMap(const remarks::Remark &Remark) {
++  std::unordered_map<std::string, std::string> LLVMParams;
++  for (const remarks::Argument &Arg : Remark.Args) {
++    // Add as String Value
++    LLVMParams[Arg.Key.str()] = Arg.Val.str();
++  }
++  return LLVMParams;
++}
++
++// Remark -> autotuning::SourceLocation
++SourceLocation RemarkToSourceLocation(const remarks::Remark &Remark) {
++  SourceLocation Location;
++  if (Remark.Loc) {
++    StringRef File = Remark.Loc->SourceFilePath;
++    unsigned Line = Remark.Loc->SourceLine;
++    unsigned Column = Remark.Loc->SourceColumn;
++    Location = {File.str(), Line, Column};
++  }
++  return Location;
++}
++
++// Remark -> autotuning::CodeRegion
++CodeRegion RemarkToCodeRegion(const remarks::Remark &Remark,
++                              Expected<CodeRegionType> &Type) {
++  // Create a SourceLocation from a remark.
++  SourceLocation Location = RemarkToSourceLocation(Remark);
++  // Create a CodeRegion from a remark.
++  CodeRegion CR = CodeRegion(Remark.RemarkName.str(), Remark.FunctionName.str(),
++                             Remark.PassName.str(), Type.get(), Location);
++  if (Remark.CodeRegionHash)
++    CR.setHash(Remark.CodeRegionHash.value_or(0));
++  if (Remark.Invocation)
++    CR.setInvocation(Remark.Invocation.value_or(0));
++
++  return CR;
++}
++
++Expected<std::unique_ptr<ToolOutputFile>> emitAutoTuningRemarks(
++    const StringRef RemarksFilename, const StringRef RemarksFormat,
++    const StringRef RemarksPasses, const CodeRegions &CRList) {
++  if (RemarksFilename.empty())
++    return nullptr;
++  // Parse remark format. Options are yaml, yaml-strtab and bitstream.
++  Expected<remarks::Format> Format = remarks::parseFormat(RemarksFormat);
++  if (Error E = Format.takeError())
++    return make_error<LLVMRemarkSetupFormatError>(std::move(E));
++
++  std::error_code EC;
++  auto Flags =
++      *Format == remarks::Format::YAML ? sys::fs::OF_Text : sys::fs::OF_None;
++  auto RemarksFile =
++      std::make_unique<ToolOutputFile>(RemarksFilename, EC, Flags);
++  if (EC)
++    return make_error<LLVMRemarkSetupFormatError>(errorCodeToError(EC));
++  // Create a remark serializer to emit code regions.
++  Expected<std::unique_ptr<remarks::RemarkSerializer>> RemarkSerializer =
++      remarks::createRemarkSerializer(
++          *Format, remarks::SerializerMode::Separate, RemarksFile->os());
++
++  if (Error E = RemarkSerializer.takeError())
++    return make_error<LLVMRemarkSetupFormatError>(std::move(E));
++  // Create remark streamer based on the serializer.
++  remarks::RemarkStreamer RStreamer =
++      remarks::RemarkStreamer(std::move(*RemarkSerializer), RemarksFilename);
++  AutoTuningRemarkStreamer Streamer(RStreamer);
++
++  if (!RemarksPasses.empty())
++    if (Error E = Streamer.setFilter(RemarksPasses))
++      return make_error<LLVMRemarkSetupFormatError>(std::move(E));
++  // Emit CodeRegions in Remark format.
++  for (const CodeRegion &CR : CRList) {
++    Streamer.emit(CR);
++  }
++  return std::move(RemarksFile);
++}
++} // namespace
++
++llvm::Error AutoTuningRemarkManager::read(AutoTuningEngine &E,
++                                          const std::string &InputFileName,
++                                          const std::string &RemarksFormat) {
++  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
++      MemoryBuffer::getFile(InputFileName.c_str());
++  if (std::error_code EC = Buf.getError())
++    return make_error<StringError>(
++        "Can't open file " + InputFileName + ": " + EC.message(), EC);
++  // Parse remark format. Options are yaml, yaml-strtab and bitstream.
++  Expected<remarks::Format> Format = remarks::parseFormat(RemarksFormat);
++  if (!Format)
++    return Format.takeError();
++
++  Expected<std::unique_ptr<remarks::RemarkParser>> MaybeParser =
++      remarks::createRemarkParserFromMeta(*Format, (*Buf)->getBuffer());
++  if (!MaybeParser) {
++    return MaybeParser.takeError();
++  }
++  remarks::RemarkParser &Parser = **MaybeParser;
++
++  while (true) {
++    Expected<std::unique_ptr<remarks::Remark>> MaybeRemark = Parser.next();
++    if (!MaybeRemark) {
++      Error E = MaybeRemark.takeError();
++      if (E.isA<remarks::EndOfFileError>()) {
++        // EOF.
++        consumeError(std::move(E));
++        break;
++      }
++      return E;
++    }
++    const remarks::Remark &Remark = **MaybeRemark;
++
++    if (Remark.RemarkType != remarks::Type::AutoTuning)
++      continue;
++
++    if (!Remark.CodeRegionType)
++      return make_error<StringError>("CodeRegionType field is missing.",
++                                     inconvertibleErrorCode());
++    Expected<CodeRegionType> Type =
++        StringToCodeRegionType((*Remark.CodeRegionType).str());
++    if (!Type)
++      return Type.takeError();
++    CodeRegionType CRType = Type.get();
++    // If CodeRegionType is Other, this remark corresponds to global
++    // parameters, so there is no need to create a CodeRegion object. Check
++    // whether the remark of global parameters is for the current module.
++ if (CRType == autotuning::Other && Remark.RemarkName == Engine.ModuleID) { ++ Engine.GlobalParams = RemarkToParameterManager(Remark); ++ continue; ++ } ++ if (CRType == autotuning::LLVMParam && ++ Remark.RemarkName == Engine.ModuleID) { ++ Engine.LLVMParams = RemarkToStringMap(Remark); ++ continue; ++ } ++ if (CRType == autotuning::ProgramParam && ++ Remark.RemarkName == Engine.ModuleID) { ++ Engine.ProgramParams = RemarkToStringMap(Remark); ++ continue; ++ } ++ if (Engine.isThinLTOTuning() && ++ (CRType == autotuning::CallSite || CRType == autotuning::Loop || ++ CRType == autotuning::MachineBasicBlock || ++ CRType == autotuning::Function)) { ++ LLVM_DEBUG(dbgs() << "AutoTuner does not support tuning of " ++ << CodeRegion::getTypeAsString(CRType) ++ << " for thinLTO durning link-time optimization. " ++ "Ignoring current code region.\n"); ++ continue; ++ } ++ // Create a SourceLocation from a remark. ++ CodeRegion CR = RemarkToCodeRegion(Remark, Type); ++ ParameterManager ParamManager = RemarkToParameterManager(Remark); ++ // Add the CodeRegion-ParameterManager entry into LoopUpTable. ++ Engine.ParamTableCR = ParamManager; ++ ++ std::string Filename = CR.getSourceLoc().SourceFilePath; ++ size_t Pos = Filename.rfind("."); ++ if (Pos != std::string::npos) ++ Filename.erase(Pos, Filename.size()); ++ Engine.OppPassListFilename.insert(CR.getPassName()); ++ Engine.CodeRegionFilterTypes.insert(CR.getType()); ++ } ++ return Error::success(); ++} ++ ++Error AutoTuningRemarkManager::dump(const autotuning::AutoTuningEngine &E, ++ const std::string &DirName, ++ const std::string &RemarksFormat, ++ const std::string &RemarksPasses) { ++ // Change to absolute path. ++ SmallString<256> OutputPath = StringRef(DirName); ++ sys::fs::make_absolute(OutputPath); ++ ++ // Make sure the new output directory exists, creating it if necessary. ++ if (std::error_code EC = sys::fs::create_directories(OutputPath)) { ++ return make_error<StringError>("could not create directory: " + ++ Twine(OutputPath) + ": " + EC.message(), ++ EC); ++ } ++ if (!Engine.TuningOpps.empty()) { ++ StringRef ModelFileName = sys::path::filename(Engine.ModuleID); ++ sys::path::append(OutputPath, ModelFileName + "." + RemarksFormat); ++ ++ int i = 1; // Output file suffix starts from 1. ++ // Check all exiting xml files xml.1...i and create a new file ++ // suffix.(i+1). ++ while (sys::fs::exists(OutputPath)) { ++ sys::path::remove_filename(OutputPath); ++ sys::path::append(OutputPath, ++ ModelFileName + "." + RemarksFormat + "." + Twine(i)); ++ i += 1; ++ } ++ Expected<std::unique_ptr<ToolOutputFile>> RemarksFileOrErr = ++ emitAutoTuningRemarks(OutputPath, RemarksFormat, RemarksPasses, ++ Engine.TuningOpps); ++ if (Error E = RemarksFileOrErr.takeError()) { ++ return E; ++ } ++ ++ std::unique_ptr<ToolOutputFile> RemarksFile = std::move(*RemarksFileOrErr); ++ if (RemarksFile) ++ RemarksFile->keep(); ++ } ++ return Error::success(); ++} ++ ++#endif +diff --git a/llvm/lib/AutoTuner/AutoTuningRemarkStreamer.cpp b/llvm/lib/AutoTuner/AutoTuningRemarkStreamer.cpp +new file mode 100644 +index 000000000000..0516c055a139 +--- /dev/null ++++ b/llvm/lib/AutoTuner/AutoTuningRemarkStreamer.cpp +@@ -0,0 +1,55 @@ ++#if defined(ENABLE_AUTOTUNER) ++// ===---------- llvm/AutoTuner/AutoTuningRemarkStreamer.cpp --------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++// Copyright (C) 2017-2022, Huawei Technologies Co., Ltd. 
All rights reserved.
++//
++// ===---------------------------------------------------------------------===//
++//
++// This file contains the implementation of the conversion between AutoTuner
++// CodeRegions and serializable remarks::Remark objects.
++//
++// ===---------------------------------------------------------------------===//
++
++#include "llvm/AutoTuner/AutoTuningRemarkStreamer.h"
++
++using namespace llvm;
++
++// autotuning::CodeRegion -> Remark
++remarks::Remark
++AutoTuningRemarkStreamer::toRemark(const autotuning::CodeRegion &CR) {
++ remarks::Remark R; // The result.
++ R.RemarkType = remarks::Type::AutoTuning;
++ R.PassName = CR.getPassName();
++ R.RemarkName = CR.getName();
++ R.FunctionName = CR.getFuncName();
++ const autotuning::SourceLocation &Location = CR.getSourceLoc();
++ if (Location)
++ R.Loc = remarks::RemarkLocation{Location.SourceFilePath,
++ Location.SourceLine, Location.SourceColumn};
++ R.CodeRegionType = CR.getTypeAsString();
++ R.CodeRegionHash = CR.getHash();
++ R.AutoTunerOptions = CR.getAutoTunerOptions();
++ R.Invocation = CR.getInvocation();
++ R.BaselineConfig = CR.getBaselineConfig();
++ return R;
++}
++
++void AutoTuningRemarkStreamer::emit(const autotuning::CodeRegion &CR) {
++ if (!RS.matchesFilter(CR.getPassName()))
++ return;
++
++ // First, convert the code region to a remark.
++ remarks::Remark R = toRemark(CR);
++ // Then, emit the remark through the serializer.
++ RS.getSerializer().emit(R);
++}
++
++Error AutoTuningRemarkStreamer::setFilter(StringRef Filter) {
++ return RS.setFilter(Filter);
++}
++#endif
+diff --git a/llvm/lib/AutoTuner/CMakeLists.txt b/llvm/lib/AutoTuner/CMakeLists.txt
+new file mode 100644
+index 000000000000..c618474fe5ae
+--- /dev/null
++++ b/llvm/lib/AutoTuner/CMakeLists.txt
+@@ -0,0 +1,11 @@
++add_llvm_component_library(LLVMAutoTuner
++ AutoTuning.cpp
++ AutoTuningRemarkManager.cpp
++ AutoTuningRemarkStreamer.cpp
++
++ ADDITIONAL_HEADER_DIRS
++ ${LLVM_MAIN_INCLUDE_DIR}/llvm/AutoTuner
++
++ LINK_COMPONENTS
++ Remarks
++)
+\ No newline at end of file
+diff --git a/llvm/lib/CMakeLists.txt b/llvm/lib/CMakeLists.txt
+index 283baa6090eb..966137c0f71f 100644
+--- a/llvm/lib/CMakeLists.txt
++++ b/llvm/lib/CMakeLists.txt
+@@ -28,6 +28,7 @@ add_subdirectory(Object)
+ add_subdirectory(ObjectYAML)
+ add_subdirectory(Option)
+ add_subdirectory(Remarks)
++add_subdirectory(AutoTuner)
+ add_subdirectory(Debuginfod)
+ add_subdirectory(DebugInfo)
+ add_subdirectory(DWP)
+diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
+index 106571b9061b..9029dc7bb3d9 100644
+--- a/llvm/lib/CodeGen/CMakeLists.txt
++++ b/llvm/lib/CodeGen/CMakeLists.txt
+@@ -273,6 +273,7 @@ add_llvm_component_library(LLVMCodeGen
+
+ LINK_COMPONENTS
+ Analysis
++ AutoTuner
+ BitReader
+ BitWriter
+ CodeGenTypes
+diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp
+index 5a005ba7b414..9dcb3833ab91 100644
+--- a/llvm/lib/CodeGen/CalcSpillWeights.cpp
++++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp
+@@ -29,6 +29,24 @@ using namespace llvm;
+
+ #define DEBUG_TYPE "calcspillweights"
+
++#if defined(ENABLE_AUTOTUNER)
++static cl::opt<float> LoopWeight(
++ "reg-spill-loop-weight", cl::Hidden,
++ cl::desc(
++ "Tunable extra weight to what looks like a loop induction variable"),
++ cl::init(3));
++
++static cl::opt<float> RemaWeight(
++ "reg-spill-rematerialize-weight", cl::Hidden,
++ cl::desc("Tunable reduced weight giving re-materialize opportunities"),
++ cl::init(0.5f));
++
++static cl::opt<float>
++ 
HintWeight("reg-spill-hint-weight", cl::Hidden,
++ cl::desc("Tunable weight to weakly boost the weight of hinted registers"),
++ cl::init(1.01f));
++#endif
++
+ void VirtRegAuxInfo::calculateSpillWeightsAndHints() {
+ LLVM_DEBUG(dbgs() << "********** Compute Spill Weights **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+@@ -252,7 +270,11 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
+
+ // Give extra weight to what looks like a loop induction variable update.
+ if (Writes && IsExiting && LIS.isLiveOutOfMBB(LI, MBB))
++#if defined(ENABLE_AUTOTUNER)
++ Weight *= LoopWeight;
++#else
+ Weight *= 3;
++#endif
+
+ TotalWeight += Weight;
+ }
+@@ -288,7 +310,11 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
+ }
+
+ // Weakly boost the spill weight of hinted registers.
++#if defined(ENABLE_AUTOTUNER)
++ TotalWeight *= HintWeight;
++#else
+ TotalWeight *= 1.01F;
++#endif
+ }
+
+ // If the live interval was already unspillable, leave it that way.
+@@ -315,7 +341,11 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
+ // FIXME: this gets much more complicated once we support non-trivial
+ // re-materialization.
+ if (isRematerializable(LI, LIS, VRM, *MF.getSubtarget().getInstrInfo()))
++#if defined(ENABLE_AUTOTUNER)
++ TotalWeight *= RemaWeight;
++#else
+ TotalWeight *= 0.5F;
++#endif
+
+ if (IsLocalSplitArtifact)
+ return normalize(TotalWeight, Start->distance(*End), NumInstr);
+diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
+index 231544494c32..327cd40f86a4 100644
+--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
++++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
+@@ -37,6 +37,9 @@
+ #include "llvm/Support/raw_ostream.h"
+ #include "llvm/Target/TargetMachine.h"
+ #include <algorithm>
++#if defined(ENABLE_AUTOTUNER)
++#include "llvm/IR/StructuralHash.h"
++#endif
+ #include <cmath>
+ using namespace llvm;
+
+@@ -1703,6 +1706,39 @@ MachineBasicBlock::livein_iterator MachineBasicBlock::livein_begin() const {
+ return LiveIns.begin();
+ }
+
++#if defined(ENABLE_AUTOTUNER)
++uint64_t MachineBasicBlock::computeStructuralHash() {
++ return StructuralHash(*this);
++}
++
++void MachineBasicBlock::initCodeRegion() {
++ std::string BasicBlockName =
++ ("%bb." + Twine(this->getNumber()) + ":" + this->getName()).str();
++ MachineFunction *MF = this->getParent();
++ StringRef FuncName = MF->getName();
++
++ autotuning::CodeRegion CR;
++ if (!this->empty()) {
++ const DebugLoc &StartLoc = this->front().getDebugLoc();
++ CR = autotuning::CodeRegion(BasicBlockName, FuncName.data(),
++ autotuning::CodeRegionType::MachineBasicBlock,
++ StartLoc);
++ } else {
++ CR = autotuning::CodeRegion(BasicBlockName, FuncName.data(),
++ autotuning::CodeRegionType::MachineBasicBlock);
++ }
++ // Compute the number of non-debug instructions in this MBB.
++ unsigned NumInstrs = std::distance(this->getFirstNonDebugInstr(),
++ this->getLastNonDebugInstr());
++ CR.setSize(NumInstrs);
++ // Compute hotness.
++ autotuning::HotnessType Hotness = MF->getFunction().ATEFunction.getHotness();
++ CR.setHotness(Hotness);
++
++ this->setCodeRegion(CR);
++}
++#endif
++
+ MachineBasicBlock::liveout_iterator MachineBasicBlock::liveout_begin() const {
+ const MachineFunction &MF = *getParent();
+ assert(MF.getProperties().hasProperty(
+diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
+index ba5432459d12..caccc9e5fad4 100644
+--- a/llvm/lib/CodeGen/MachineScheduler.cpp
++++ b/llvm/lib/CodeGen/MachineScheduler.cpp
+@@ -569,6 +569,12 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
+ for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end();
+ MBB != MBBEnd; ++MBB) {
+
++#if defined(ENABLE_AUTOTUNER)
++ // Before visiting this MBB, initialize it for auto-tuning if AutoTuning
++ // is enabled.
++ autotuning::Engine.initContainer(&*MBB, DEBUG_TYPE);
++#endif
++
+ Scheduler.startBlock(&*MBB);
+
+ #ifndef NDEBUG
+@@ -3244,6 +3250,44 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
+ RegionPolicy.ShouldTrackLaneMasks = false;
+ }
+
++#if defined(ENABLE_AUTOTUNER)
++ // AUTO-TUNING - Look up the MBB-level scheduling direction if AutoTuning is
++ // enabled.
++ if (autotuning::Engine.isEnabled()) {
++ MachineBasicBlock &MBB = *Begin->getParent();
++
++ bool NewForceBottomUp = false;
++ // Look up the values from the xml file and overwrite them.
++ bool IsForceBottomUpSet =
++ MBB.lookUpParams<bool>("ForceBottomUp", NewForceBottomUp);
++
++ bool NewForceTopDown = false;
++ bool IsForceTopDownSet =
++ MBB.lookUpParams<bool>("ForceTopDown", NewForceTopDown);
++
++ assert((!NewForceBottomUp || !NewForceTopDown) &&
++ "BottomUp and TopDown cannot both be set to true");
++
++ if (IsForceBottomUpSet) {
++ RegionPolicy.OnlyBottomUp = NewForceBottomUp;
++ if (RegionPolicy.OnlyBottomUp) {
++ RegionPolicy.OnlyTopDown = false;
++ }
++ }
++
++ if (IsForceTopDownSet) {
++ RegionPolicy.OnlyTopDown = NewForceTopDown;
++ if (RegionPolicy.OnlyTopDown) {
++ RegionPolicy.OnlyBottomUp = false;
++ }
++ }
++
++ if (IsForceBottomUpSet || IsForceTopDownSet) {
++ return;
++ }
++ }
++#endif
++
+ // Check -misched-topdown/bottomup can force or unforce scheduling direction.
+ // e.g. -misched-bottomup=false allows scheduling in both directions.
+ assert((!ForceTopDown || !ForceBottomUp) && +diff --git a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp +index 36a02d5beb4b..d4ac95d534ed 100644 +--- a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp ++++ b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp +@@ -16,6 +16,9 @@ + #include "llvm/CodeGen/MachineJumpTableInfo.h" + #include "llvm/CodeGen/TargetLowering.h" + #include "llvm/Target/TargetMachine.h" ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#endif + + using namespace llvm; + using namespace SwitchCG; +@@ -61,7 +64,23 @@ void SwitchCG::SwitchLowering::findJumpTables(CaseClusterVector &Clusters, + if (!TLI->areJTsAllowed(SI->getParent()->getParent())) + return; + ++#if defined(ENABLE_AUTOTUNER) ++ unsigned MinJumpTableEntries = TLI->getMinimumJumpTableEntries(); ++ // Overwrite MinJumpTableEntries when it is set by Autotuner ++ if (autotuning::Engine.isEnabled()) { ++ autotuning::Engine.initContainer(SI->ATESwitchInst.get(), ++ "switch-lowering"); ++ ++ int NewValue = 0; // the int value is set by lookUpParams() ++ bool Changed = ++ SI->ATESwitchInst->lookUpParams<int>("MinJumpTableEntries", NewValue); ++ if (Changed) ++ MinJumpTableEntries = NewValue; ++ } ++#else + const unsigned MinJumpTableEntries = TLI->getMinimumJumpTableEntries(); ++#endif ++ + const unsigned SmallNumberOfEntries = MinJumpTableEntries / 2; + + // Bail if not enough cases. +diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp +index df753b91ff90..af77e6c2dc4d 100644 +--- a/llvm/lib/IR/AsmWriter.cpp ++++ b/llvm/lib/IR/AsmWriter.cpp +@@ -2602,11 +2602,21 @@ public: + void writeAllAttributeGroups(); + + void printTypeIdentities(); ++#if defined(ENABLE_AUTOTUNER) ++ void printGlobal(const GlobalVariable *GV, bool PrintDeclarationOnly = false); ++ void printAlias(const GlobalAlias *GA); ++ void printIFunc(const GlobalIFunc *GI); ++ void printComdat(const Comdat *C); ++ void printRequisiteDeclarations(const Function *F); ++ void printFunction(const Function *F, bool PrintCompleteIR = false, ++ bool PrintDeclarationOnly = false); ++#else + void printGlobal(const GlobalVariable *GV); + void printAlias(const GlobalAlias *GA); + void printIFunc(const GlobalIFunc *GI); + void printComdat(const Comdat *C); + void printFunction(const Function *F); ++#endif + void printArgument(const Argument *FA, AttributeSet Attrs); + void printBasicBlock(const BasicBlock *BB); + void printInstructionLine(const Instruction &I); +@@ -3593,15 +3603,26 @@ static void maybePrintComdat(formatted_raw_ostream &Out, + Out << ')'; + } + ++#if defined(ENABLE_AUTOTUNER) ++void AssemblyWriter::printGlobal(const GlobalVariable *GV, ++ bool PrintDeclarationOnly) { ++ if (GV->isMaterializable() && !PrintDeclarationOnly) ++#else + void AssemblyWriter::printGlobal(const GlobalVariable *GV) { + if (GV->isMaterializable()) ++#endif + Out << "; Materializable\n"; + + AsmWriterContext WriterCtx(&TypePrinter, &Machine, GV->getParent()); + WriteAsOperandInternal(Out, GV, WriterCtx); + Out << " = "; + ++#if defined(ENABLE_AUTOTUNER) ++ if ((!GV->hasInitializer() || PrintDeclarationOnly) && ++ GV->hasExternalLinkage()) ++#else + if (!GV->hasInitializer() && GV->hasExternalLinkage()) ++#endif + Out << "external "; + + Out << getLinkageNameWithSpace(GV->getLinkage()); +@@ -3619,7 +3640,11 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) { + Out << (GV->isConstant() ? 
"constant " : "global "); + TypePrinter.print(GV->getValueType(), Out); + ++#if defined(ENABLE_AUTOTUNER) ++ if (GV->hasInitializer() && !PrintDeclarationOnly) { ++#else + if (GV->hasInitializer()) { ++#endif + Out << ' '; + writeOperand(GV->getInitializer(), false); + } +@@ -3769,12 +3794,102 @@ void AssemblyWriter::printTypeIdentities() { + } + } + ++#if defined(ENABLE_AUTOTUNER) ++/// printRequisiteDeclarations - Print the declarations of type identities, ++/// global variables, functions, and function attribute groups of a function. ++void AssemblyWriter::printRequisiteDeclarations(const Function *F) { ++ // walk through instructions and collect global variables & functions ++ SmallPtrSet<GlobalVariable *, 8> GVs; ++ SmallPtrSet<Function *, 8> Functions; ++ for (const BasicBlock &BB : *F) { ++ for (const Instruction &I : BB) { ++ // Check for function ++ if (const auto *CI = dyn_cast<CallInst>(&I)) { ++ Function *func = CI->getCalledFunction(); ++ if (func) ++ Functions.insert(func); ++ } ++ // Check for global variables ++ for (const Use &U : I.operands()) { ++ if (GlobalVariable *gv = dyn_cast<GlobalVariable>(U)) ++ GVs.insert(gv); ++ if (GEPOperator *gepo = dyn_cast<GEPOperator>(&U)) { ++ if (GlobalVariable *gv = ++ dyn_cast<GlobalVariable>(gepo->getPointerOperand())) ++ GVs.insert(gv); ++ for (auto it = gepo->idx_begin(), et = gepo->idx_end(); it != et; ++ ++it) { ++ if (GlobalVariable *gv = dyn_cast<GlobalVariable>(*it)) ++ GVs.insert(gv); ++ } ++ } ++ } ++ } ++ } ++ ++ // print type identities ++ printTypeIdentities(); ++ ++ // print global variables ++ if (!GVs.empty()) { ++ Out << '\n'; ++ for (auto GVit = GVs.begin(), et = GVs.end(); GVit != et; ++GVit) { ++ // Make backups of some properties. They may be modified for printing. ++ GlobalValue::LinkageTypes SavedLinkage = (*GVit)->getLinkage(); ++ GlobalVariable::VisibilityTypes SavedVisibility = ++ (*GVit)->getVisibility(); ++ ++ // modify property if needed ++ if (!(*GVit)->hasAvailableExternallyLinkage() && ++ !((*GVit)->getName() == "llvm.global_ctors") && ++ (*GVit)->hasLocalLinkage()) { ++ (*GVit)->setLinkage(GlobalValue::ExternalLinkage); ++ (*GVit)->setVisibility(GlobalValue::HiddenVisibility); ++ } ++ ++ printGlobal(*GVit, true); ++ Out << '\n'; ++ ++ // restore backups ++ (*GVit)->setLinkage(SavedLinkage); ++ (*GVit)->setVisibility(SavedVisibility); ++ } ++ Out << '\n'; ++ } ++ ++ // print functions ++ for (auto FuncIt = Functions.begin(), et = Functions.end(); FuncIt != et; ++ ++FuncIt) { ++ Out << '\n'; ++ printFunction(*FuncIt, false, true); ++ } ++ ++ // Write attribute groups. ++ if (!Machine.as_empty()) { ++ Out << '\n'; ++ writeAllAttributeGroups(); ++ } ++ Out << '\n'; ++} ++ + /// printFunction - Print all aspects of a function. 
++void AssemblyWriter::printFunction(const Function *F, bool PrintCompleteIR, ++ bool PrintDeclarationOnly) { ++ if (PrintCompleteIR && !PrintDeclarationOnly) { ++ printRequisiteDeclarations(F); ++ } ++ if (AnnotationWriter && !PrintDeclarationOnly) ++ AnnotationWriter->emitFunctionAnnot(F, Out); ++ ++ if (F->isMaterializable() && !PrintDeclarationOnly) ++ Out << "; Materializable\n"; ++#else + void AssemblyWriter::printFunction(const Function *F) { + if (AnnotationWriter) AnnotationWriter->emitFunctionAnnot(F, Out); + + if (F->isMaterializable()) + Out << "; Materializable\n"; ++#endif + + const AttributeList &Attrs = F->getAttributes(); + if (Attrs.hasFnAttrs()) { +@@ -3792,6 +3907,18 @@ void AssemblyWriter::printFunction(const Function *F) { + Out << "; Function Attrs: " << AttrStr << '\n'; + } + ++#if defined(ENABLE_AUTOTUNER) ++ if (!PrintDeclarationOnly) ++ Machine.incorporateFunction(F); ++ ++ if (F->isDeclaration() || PrintDeclarationOnly) { ++ Out << "declare"; ++ if (!PrintDeclarationOnly) { ++ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs; ++ F->getAllMetadata(MDs); ++ printMetadataAttachments(MDs, " "); ++ } ++#else + Machine.incorporateFunction(F); + + if (F->isDeclaration()) { +@@ -3799,6 +3926,7 @@ void AssemblyWriter::printFunction(const Function *F) { + SmallVector<std::pair<unsigned, MDNode *>, 4> MDs; + F->getAllMetadata(MDs); + printMetadataAttachments(MDs, " "); ++#endif + Out << ' '; + } else + Out << "define "; +@@ -3824,7 +3952,11 @@ void AssemblyWriter::printFunction(const Function *F) { + Out << '('; + + // Loop over the arguments, printing them... ++#if defined(ENABLE_AUTOTUNER) ++ if ((F->isDeclaration() && !IsForDebug) || PrintDeclarationOnly) { ++#else + if (F->isDeclaration() && !IsForDebug) { ++#endif + // We're only interested in the type here - don't print argument names. + for (unsigned I = 0, E = FT->getNumParams(); I != E; ++I) { + // Insert commas as we go... 
the first arg doesn't get a comma +@@ -3895,7 +4027,11 @@ void AssemblyWriter::printFunction(const Function *F) { + writeOperand(F->getPersonalityFn(), /*PrintType=*/true); + } + ++#if defined(ENABLE_AUTOTUNER) ++ if (F->isDeclaration() || PrintDeclarationOnly) { ++#else + if (F->isDeclaration()) { ++#endif + Out << '\n'; + } else { + SmallVector<std::pair<unsigned, MDNode *>, 4> MDs; +@@ -3913,6 +4049,13 @@ void AssemblyWriter::printFunction(const Function *F) { + Out << "}\n"; + } + ++#if defined(ENABLE_AUTOTUNER) ++ // Output metadata ++ if (!Machine.mdn_empty() && PrintCompleteIR && !PrintDeclarationOnly) { ++ Out << '\n'; ++ writeAllMDNodes(); ++ } ++#endif + Machine.purgeFunction(); + } + +@@ -4591,13 +4734,21 @@ void AssemblyWriter::printUseLists(const Function *F) { + + void Function::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW, + bool ShouldPreserveUseListOrder, ++#if defined(ENABLE_AUTOTUNER) ++ bool IsForDebug, bool PrintCompleteIR) const { ++#else + bool IsForDebug) const { ++#endif + SlotTracker SlotTable(this->getParent()); + formatted_raw_ostream OS(ROS); + AssemblyWriter W(OS, SlotTable, this->getParent(), AAW, + IsForDebug, + ShouldPreserveUseListOrder); ++#if defined(ENABLE_AUTOTUNER) ++ W.printFunction(this, PrintCompleteIR); ++#else + W.printFunction(this); ++#endif + } + + void BasicBlock::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW, +diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt +index 217fe703dd4e..d44d1eea9f3e 100644 +--- a/llvm/lib/IR/CMakeLists.txt ++++ b/llvm/lib/IR/CMakeLists.txt +@@ -78,6 +78,7 @@ add_llvm_component_library(LLVMCore + intrinsics_gen + + LINK_COMPONENTS ++ AutoTuner + BinaryFormat + Demangle + Remarks +diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp +index 435800d9e5f9..ec2620efac38 100644 +--- a/llvm/lib/IR/Function.cpp ++++ b/llvm/lib/IR/Function.cpp +@@ -70,6 +70,10 @@ + #include <cstring> + #include <string> + ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/IR/StructuralHash.h" ++#endif ++ + using namespace llvm; + using ProfileCount = Function::ProfileCount; + +@@ -1977,6 +1981,36 @@ std::optional<StringRef> Function::getSectionPrefix() const { + return std::nullopt; + } + ++#if defined(ENABLE_AUTOTUNER) ++uint64_t AutoTuningEnabledFunction::computeStructuralHash() { ++ return StructuralHash(*(this->Func)); ++} ++ ++void AutoTuningEnabledFunction::initCodeRegion() { ++ StringRef FuncName = Func->getName(); ++ StringRef EntryBBName; ++ autotuning::SourceLocation Loc; ++ ++ if (!Func->empty()) ++ EntryBBName = Func->front().getName(); ++ else ++ EntryBBName = StringRef("None"); ++ ++ DISubprogram *SubProgram = Func->getSubprogram(); ++ if (SubProgram) ++ // Set the column number to 0 because there is no information about ++ // column number for functions. 
++ Loc = {SubProgram->getFilename().str(), SubProgram->getLine(), 0};
++
++ autotuning::CodeRegion CR =
++ autotuning::CodeRegion(EntryBBName.data(), FuncName.data(),
++ autotuning::CodeRegionType::Function, Loc);
++ CR.setSize(Func->getInstructionCount());
++ CR.setHotness(this->getHotness());
++ this->setCodeRegion(CR);
++}
++#endif
++
+ bool Function::nullPointerIsDefined() const {
+ return hasFnAttribute(Attribute::NullPointerIsValid);
+ }
+diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
+index cb0ac0f8eae6..e614285df07a 100644
+--- a/llvm/lib/IR/Instructions.cpp
++++ b/llvm/lib/IR/Instructions.cpp
+@@ -45,6 +45,9 @@
+ #include <cstdint>
+ #include <optional>
+ #include <vector>
++#if defined(ENABLE_AUTOTUNER)
++#include "llvm/IR/StructuralHash.h"
++#endif
+
+ using namespace llvm;
+
+@@ -259,6 +262,89 @@ void LandingPadInst::addClause(Constant *Val) {
+ getOperandList()[OpNo] = Val;
+ }
+
++#if defined(ENABLE_AUTOTUNER)
++uint64_t AutoTuningEnabledSwitchInst::computeStructuralHash() {
++ return StructuralHash(*(this->SI));
++}
++
++void AutoTuningEnabledSwitchInst::initCodeRegion() {
++ std::string SwitchName;
++ if (this->SI->hasName()) {
++ SwitchName = this->SI->getName().str();
++ } else {
++ std::string Str;
++ llvm::raw_string_ostream RSO(Str);
++ this->SI->getCondition()->printAsOperand(RSO);
++ SwitchName = RSO.str();
++ }
++
++ autotuning::CodeRegion CR = autotuning::CodeRegion(
++ SwitchName, this->SI->getFunction()->getName().str(),
++ autotuning::CodeRegionType::Switch, this->SI->getDebugLoc());
++
++ unsigned TotalNumInsts = 0;
++ for (auto Case : SI->cases()) {
++ const BasicBlock *BB = Case.getCaseSuccessor();
++ unsigned NumInsts = std::distance(BB->instructionsWithoutDebug().begin(),
++ BB->instructionsWithoutDebug().end());
++ TotalNumInsts += NumInsts;
++ }
++
++ CR.setSize(TotalNumInsts);
++ // Compute hotness.
++ autotuning::HotnessType Hotness =
++ this->SI->getFunction()->ATEFunction.getHotness();
++ CR.setHotness(Hotness);
++
++ this->setCodeRegion(CR);
++}
++
++uint64_t AutoTuningEnabledCallSite::computeStructuralHash() {
++ return StructuralHash(*(this->CB));
++}
++
++void AutoTuningEnabledCallSite::initCodeRegion() {
++ // Use Caller's name as FuncName and Callee's name as Name of a CodeRegion.
++ Function *Caller = this->CB->getCaller();
++ Function *Callee = this->CB->getCalledFunction();
++ if (Caller == nullptr || Callee == nullptr) {
++ this->setCodeRegion(autotuning::CodeRegion::getInvalidInstance());
++ return;
++ }
++
++ autotuning::SourceLocation SrcLoc;
++ if (this->CB->getDebugLoc()) {
++ unsigned int SourceLine = this->CB->getDebugLoc()->getLine();
++ // Get the modified source line number for the current callsite if there
++ // is another call instruction (to the same callee) with the same source
++ // line number due to inlining.
++ std::optional<unsigned int> LineNum = autotuning::Engine.getCallSiteLoc(CB);
++ if (LineNum)
++ SourceLine = *LineNum;
++ SrcLoc = autotuning::SourceLocation{
++ this->CB->getDebugLoc()->getFilename().str(), SourceLine,
++ this->CB->getDebugLoc()->getColumn()};
++ }
++
++ // We are using DebugLoc to distinguish between multiple calls to the same
++ // callee in a function. It may be possible that these multiple calls have
++ // same DebugLoc either 1) due to inlining of multiple calls (same callee)
++ // and callee having more calls, or 2) cloned calls added by previous
++ // optimizations. We are using 'callee name + its parent (basic block) name'
++ // to solve these problems.
Additionally, we are using the modified line number
++ // for issue 1); this will handle the cases where the multiple calls are
++ // in the same basic block.
++ autotuning::CodeRegion CR = autotuning::CodeRegion(
++ Callee->getName().str() + "-" + this->CB->getParent()->getName().str(),
++ Caller->getName().data(), autotuning::CodeRegionType::CallSite, SrcLoc,
++ autotuning::DynamicOptions{{"ForceInline", {0, 1}}});
++
++ CR.setSize(Callee->getInstructionCount());
++ CR.setHotness(Caller->ATEFunction.getHotness());
++ this->setCodeRegion(CR);
++}
++#endif
++
+ //===----------------------------------------------------------------------===//
+ // CallBase Implementation
+ //===----------------------------------------------------------------------===//
+diff --git a/llvm/lib/IR/StructuralHash.cpp b/llvm/lib/IR/StructuralHash.cpp
+index 6ea108d831a1..1583e1c82b3e 100644
+--- a/llvm/lib/IR/StructuralHash.cpp
++++ b/llvm/lib/IR/StructuralHash.cpp
+@@ -10,9 +10,23 @@
+ #include "llvm/IR/Function.h"
+ #include "llvm/IR/GlobalVariable.h"
+ #include "llvm/IR/Module.h"
++#if defined(ENABLE_AUTOTUNER)
++#include "llvm/CodeGen/MachineBasicBlock.h"
++#include "llvm/IR/InstrTypes.h"
++#include "llvm/IR/Instructions.h"
++#include "llvm/Support/CommandLine.h"
++#endif
+
+ using namespace llvm;
+
++#if defined(ENABLE_AUTOTUNER)
++// AutoTuner flag to use the callsite debug location for hash calculation.
++static cl::opt<bool> HashCallSite(
++ "hash-prior-to-callsite", cl::init(true), cl::Hidden,
++ cl::desc("Use function IR prior to a call site to compute the hashcode for"
++ " the call site"));
++#endif
++
+ namespace {
+
+ // Basic hashing mechanism to detect structural change to the IR, used to verify
+@@ -21,16 +35,81 @@ namespace {
+
+ class StructuralHashImpl {
+ hash_code Hash;
++#if defined(ENABLE_AUTOTUNER)
++ const uint64_t BLOCK_HEADER_HASH = 45798;
++#endif
+
+ template <typename T> void hash(const T &V) { Hash = hash_combine(Hash, V); }
+
+ public:
+ StructuralHashImpl() : Hash(4) {}
+
++#if defined(ENABLE_AUTOTUNER)
++ void update(const MachineBasicBlock &MBB) {
++ // Update the structural hash when we encounter a new basic block.
++ // Prevents CodeRegions that have different structures but many empty
++ // BasicBlocks from having the same structural hash.
++ if (const BasicBlock *Block = MBB.getBasicBlock()) {
++ hash(BLOCK_HEADER_HASH); // Block header
++ for (auto &Inst : *Block)
++ hash(Inst.getOpcode());
++ }
++ }
++
++ void update(const std::vector<BasicBlock *> BBs) {
++ // Update the structural hash when we encounter a new basic block.
++ // Prevents CodeRegions that have different structures but many empty
++ // BasicBlocks from having the same structural hash.
++ for (BasicBlock *BB : BBs) {
++ if (BB == nullptr)
++ continue;
++
++ hash(BLOCK_HEADER_HASH); // Block header
++ for (auto &Inst : *BB)
++ hash(Inst.getOpcode());
++ }
++ }
++
++ void update(const llvm::CallBase &CB) {
++ StringRef Name = "";
++ if (HashCallSite) {
++ update(*CB.getCaller(), std::addressof(CB));
++ } else {
++ const Function &F = *CB.getCaller();
++ Name = F.getName();
++ std::string FileName = Name.str();
++ for (uint64_t Idx = 0; Idx < Name.size(); Idx = Idx + sizeof(uint64_t)) {
++ uint64_t Value = 0;
++ FileName.copy((char *)&Value, sizeof(uint64_t), Idx);
++ hash(Value);
++ }
++ }
++
++ update(*CB.getCalledFunction());
++ }
++
++ void update(const SwitchInst &SI) {
++ hash(SI.getNumCases());
++ for (auto Case : SI.cases()) {
++ hash(BLOCK_HEADER_HASH);
++ const BasicBlock *BB = Case.getCaseSuccessor();
++ for (auto &Inst : *BB)
++ hash(Inst.getOpcode());
++ }
++ }
++
++ void update(const Function &F, const CallBase *TargetCB = nullptr) {
++ if (F.isDeclaration())
++ return;
++
++ const Instruction *I =
++ TargetCB ? (dyn_cast<Instruction>(TargetCB)) : nullptr;
++#else
+ void update(const Function &F) {
+ // Declarations don't affect analyses.
+ if (F.isDeclaration())
+ return;
++#endif
+
+ hash(12345); // Function header
+
+@@ -44,9 +123,18 @@ public:
+ VisitedBBs.insert(BBs[0]);
+ while (!BBs.empty()) {
+ const BasicBlock *BB = BBs.pop_back_val();
++#if defined(ENABLE_AUTOTUNER)
++ hash(BLOCK_HEADER_HASH); // Block header
++ for (auto &Inst : *BB) {
++ hash(Inst.getOpcode());
++ if (I && Inst.isIdenticalTo(I))
++ return;
++ }
++#else
+ hash(45798); // Block header
+ for (auto &Inst : *BB)
+ hash(Inst.getOpcode());
++#endif
+
+ const Instruction *Term = BB->getTerminator();
+ for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
+@@ -79,6 +167,32 @@ public:
+
+ } // namespace
+
++#if defined(ENABLE_AUTOTUNER)
++uint64_t llvm::StructuralHash(const MachineBasicBlock &MBB) {
++ StructuralHashImpl H;
++ H.update(MBB);
++ return H.getHash();
++}
++
++uint64_t llvm::StructuralHash(const std::vector<BasicBlock *> BBs) {
++ StructuralHashImpl H;
++ H.update(BBs);
++ return H.getHash();
++}
++
++uint64_t llvm::StructuralHash(const CallBase &CB) {
++ StructuralHashImpl H;
++ H.update(CB);
++ return H.getHash();
++}
++
++uint64_t llvm::StructuralHash(const SwitchInst &SI) {
++ StructuralHashImpl H;
++ H.update(SI);
++ return H.getHash();
++}
++#endif
++
+ uint64_t llvm::StructuralHash(const Function &F) {
+ StructuralHashImpl H;
+ H.update(F);
+diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
+index d0cbbcc0e310..a3ccbc6d258f 100644
+--- a/llvm/lib/Passes/PassBuilder.cpp
++++ b/llvm/lib/Passes/PassBuilder.cpp
+@@ -262,6 +262,11 @@
+ #include "llvm/Transforms/Vectorize/VectorCombine.h"
+ #include <optional>
+
++#if defined(ENABLE_AUTOTUNER)
++#include "llvm/Analysis/AutotuningDump.h"
++#include "llvm/Transforms/Scalar/AutoTuningCompile.h"
++#endif
++
+ using namespace llvm;
+
+ static const Regex DefaultAliasRegex(
+diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
+index 660cb2e974d7..8009e011833c 100644
+--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
++++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
+@@ -133,6 +133,11 @@
+ #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
+ #include "llvm/Transforms/Vectorize/VectorCombine.h"
+
++#if defined(ENABLE_AUTOTUNER)
++#include "llvm/AutoTuner/AutoTuning.h"
++#include "llvm/Transforms/Scalar/AutoTuningCompile.h"
++#endif
++
+ using namespace 
llvm; + + static cl::opt<InliningAdvisorMode> UseInlineAdvisor( +@@ -289,6 +294,10 @@ PipelineTuningOptions::PipelineTuningOptions() { + EagerlyInvalidateAnalyses = EnableEagerlyInvalidateAnalyses; + } + ++#if defined(ENABLE_AUTOTUNER) ++extern cl::opt<AutoTuningCompileOpt> AutoTuningCompileMode; ++#endif ++ + namespace llvm { + extern cl::opt<unsigned> MaxDevirtIterations; + extern cl::opt<bool> EnableKnowledgeRetention; +@@ -452,9 +461,17 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, + // attention to it. + if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || + PGOOpt->Action != PGOOptions::SampleUse) ++#if defined(ENABLE_AUTOTUNER) ++ { ++ if (AutoTuningCompileMode) ++ LPM2.addPass(AutoTuningCompileLoopPass(autotuning::CompileOptionUnroll)); ++#endif + LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), + /* OnlyWhenForced= */ !PTO.LoopUnrolling, + PTO.ForgetAllSCEVInLoopUnroll)); ++#if defined(ENABLE_AUTOTUNER) ++ } ++#endif + + invokeLoopOptimizerEndEPCallbacks(LPM2, Level); + +@@ -631,9 +648,17 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, + // attention to it. + if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || + PGOOpt->Action != PGOOptions::SampleUse) ++#if defined(ENABLE_AUTOTUNER) ++ { ++ if (AutoTuningCompileMode) ++ LPM2.addPass(AutoTuningCompileLoopPass(autotuning::CompileOptionUnroll)); ++#endif + LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), + /* OnlyWhenForced= */ !PTO.LoopUnrolling, + PTO.ForgetAllSCEVInLoopUnroll)); ++#if defined(ENABLE_AUTOTUNER) ++ } ++#endif + + invokeLoopOptimizerEndEPCallbacks(LPM2, Level); + +@@ -1110,6 +1135,11 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, + if (EnableSyntheticCounts && !PGOOpt) + MPM.addPass(SyntheticCountsPropagation()); + ++#if defined(ENABLE_AUTOTUNER) ++ if (AutoTuningCompileMode) ++ MPM.addPass(AutoTuningCompileModulePass(autotuning::CompileOptionInline)); ++#endif ++ + if (EnableModuleInliner) + MPM.addPass(buildModuleInlinerPipeline(Level, Phase)); + else +@@ -1131,6 +1161,12 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, + /// TODO: Should LTO cause any differences to this set of passes? + void PassBuilder::addVectorPasses(OptimizationLevel Level, + FunctionPassManager &FPM, bool IsFullLTO) { ++#if defined(ENABLE_AUTOTUNER) ++ if (AutoTuningCompileMode && !IsFullLTO) ++ FPM.addPass( ++ AutoTuningCompileFunctionPass(autotuning::CompileOptionVectorize)); ++#endif ++ + FPM.addPass(LoopVectorizePass( + LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); + +@@ -1444,6 +1480,10 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, + return buildO0DefaultPipeline(Level, LTOPreLink); + + ModulePassManager MPM; ++#if defined(ENABLE_AUTOTUNER) ++ if (AutoTuningCompileMode) ++ MPM.addPass(AutoTuningCompileModulePass(autotuning::CompileOptionStart)); ++#endif + + // Convert @llvm.global.annotations to !annotation metadata. 
+ MPM.addPass(Annotation2MetadataPass()); +@@ -1475,6 +1515,12 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, + + if (LTOPreLink) + addRequiredLTOPreLinkPasses(MPM); ++ ++#if defined(ENABLE_AUTOTUNER) ++ if (AutoTuningCompileMode) ++ MPM.addPass(AutoTuningCompileModulePass(autotuning::CompileOptionEnd)); ++#endif ++ + return MPM; + } + +diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def +index e10dc995c493..45a539f14b93 100644 +--- a/llvm/lib/Passes/PassRegistry.def ++++ b/llvm/lib/Passes/PassRegistry.def +@@ -29,6 +29,10 @@ MODULE_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC)) + MODULE_ANALYSIS("inline-advisor", InlineAdvisorAnalysis()) + MODULE_ANALYSIS("ir-similarity", IRSimilarityAnalysis()) + ++#if defined(ENABLE_AUTOTUNER) ++MODULE_ANALYSIS("autotuning-dump", AutotuningDumpAnalysis()) ++#endif ++ + #ifndef MODULE_ALIAS_ANALYSIS + #define MODULE_ALIAS_ANALYSIS(NAME, CREATE_PASS) \ + MODULE_ANALYSIS(NAME, CREATE_PASS) +@@ -127,6 +131,9 @@ MODULE_PASS("sanmd-module", SanitizerBinaryMetadataPass()) + MODULE_PASS("memprof-module", ModuleMemProfilerPass()) + MODULE_PASS("poison-checking", PoisonCheckingPass()) + MODULE_PASS("pseudo-probe-update", PseudoProbeUpdatePass()) ++#if defined(ENABLE_AUTOTUNER) ++MODULE_PASS("autotuning-compile-module", AutoTuningCompileModulePass()) ++#endif + #undef MODULE_PASS + + #ifndef MODULE_PASS_WITH_PARAMS +@@ -430,6 +437,9 @@ FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass()) + FUNCTION_PASS("tsan", ThreadSanitizerPass()) + FUNCTION_PASS("memprof", MemProfilerPass()) + FUNCTION_PASS("declare-to-assign", llvm::AssignmentTrackingPass()) ++#if defined(ENABLE_AUTOTUNER) ++FUNCTION_PASS("autotuning-compile-function", AutoTuningCompileFunctionPass()) ++#endif + #undef FUNCTION_PASS + + #ifndef FUNCTION_PASS_WITH_PARAMS +@@ -614,6 +624,9 @@ LOOP_PASS("guard-widening", GuardWideningPass()) + LOOP_PASS("loop-bound-split", LoopBoundSplitPass()) + LOOP_PASS("loop-reroll", LoopRerollPass()) + LOOP_PASS("loop-versioning-licm", LoopVersioningLICMPass()) ++#if defined(ENABLE_AUTOTUNER) ++LOOP_PASS("autotuning-compile-loop", AutoTuningCompileLoopPass()) ++#endif + #undef LOOP_PASS + + #ifndef LOOP_PASS_WITH_PARAMS +diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp +index 7eef511928ec..8653027ceed2 100644 +--- a/llvm/lib/Passes/StandardInstrumentations.cpp ++++ b/llvm/lib/Passes/StandardInstrumentations.cpp +@@ -41,6 +41,10 @@ + #include <unordered_set> + #include <utility> + #include <vector> ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#include "llvm/Transforms/Scalar/AutoTuningCompile.h" ++#endif + + using namespace llvm; + +@@ -107,6 +111,10 @@ static cl::opt<bool> PrintOnCrash( + cl::desc("Print the last form of the IR before crash (use -print-on-crash-path to dump to a file)"), + cl::Hidden); + ++#if defined(ENABLE_AUTOTUNER) ++extern cl::opt<AutoTuningCompileOpt> AutoTuningCompileMode; ++#endif ++ + static cl::opt<std::string> OptBisectPrintIRPath( + "opt-bisect-print-ir-path", + cl::desc("Print IR to path when opt-bisect-limit is reached"), cl::Hidden); +@@ -874,6 +882,21 @@ bool OptPassGateInstrumentation::shouldRun(StringRef PassName, Any IR) { + + void OptPassGateInstrumentation::registerCallbacks( + PassInstrumentationCallbacks &PIC) { ++#if defined(ENABLE_AUTOTUNER) ++ // Using AutoTuner OptBisect to change the behavior of compilation pipeline. 
++ // Flag 'opt-bisect-limit' will be preferred if both 'opt-bisect-limit' and
++ // incremental compilation flags are used.
++ if (autotuning::Engine.isParseInput() && AutoTuningCompileMode) {
++ if (!getAutoTuningOptPassGate().isEnabled())
++ return;
++
++ PIC.registerShouldRunOptionalPassCallback([](StringRef PassID, Any IR) {
++ return isIgnored(PassID) ||
++ getAutoTuningOptPassGate().checkPass(PassID, getIRName(IR));
++ });
++ return;
++ }
++#endif
+ OptPassGate &PassGate = Context.getOptPassGate();
+ if (!PassGate.isEnabled())
+ return;
+diff --git a/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp b/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp
+index b2627196bce6..b1dfa9d0f2cf 100644
+--- a/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp
++++ b/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp
+@@ -277,6 +277,14 @@ void BitstreamRemarkSerializerHelper::emitRemarkBlock(const Remark &Remark,
+ R.push_back(StrTab.add(Remark.RemarkName).first);
+ R.push_back(StrTab.add(Remark.PassName).first);
+ R.push_back(StrTab.add(Remark.FunctionName).first);
++#if defined(ENABLE_AUTOTUNER)
++ if (Remark.CodeRegionType)
++ R.push_back(StrTab.add(*Remark.CodeRegionType).first);
++ if (std::optional<uint64_t> hash = Remark.CodeRegionHash)
++ R.push_back(*hash);
++ if (std::optional<unsigned int> Invocation = Remark.Invocation)
++ R.push_back(*Invocation);
++#endif
+ Bitstream.EmitRecordWithAbbrev(RecordRemarkHeaderAbbrevID, R);
+
+ if (const std::optional<RemarkLocation> &Loc = Remark.Loc) {
+diff --git a/llvm/lib/Remarks/RemarkStreamer.cpp b/llvm/lib/Remarks/RemarkStreamer.cpp
+index 9f4676ce37ab..d1faf4f1553a 100644
+--- a/llvm/lib/Remarks/RemarkStreamer.cpp
++++ b/llvm/lib/Remarks/RemarkStreamer.cpp
+@@ -14,6 +14,10 @@
+ #include "llvm/Support/CommandLine.h"
+ #include <optional>
+
++#if defined(ENABLE_AUTOTUNER)
++#include "llvm/IR/DebugInfoMetadata.h"
++#endif
++
+ using namespace llvm;
+ using namespace llvm::remarks;
+
+diff --git a/llvm/lib/Remarks/YAMLRemarkParser.cpp b/llvm/lib/Remarks/YAMLRemarkParser.cpp
+index f5123b0f64ce..baa393c6a619 100644
+--- a/llvm/lib/Remarks/YAMLRemarkParser.cpp
++++ b/llvm/lib/Remarks/YAMLRemarkParser.cpp
+@@ -17,10 +17,23 @@
+ #include "llvm/Support/Endian.h"
+ #include "llvm/Support/Path.h"
+ #include <optional>
++#if defined(ENABLE_AUTOTUNER)
++#include "llvm/Support/CommandLine.h"
++#endif
+
+ using namespace llvm;
+ using namespace llvm::remarks;
+
++#if defined(ENABLE_AUTOTUNER)
++// Creating code regions without metadata (e.g. debug location, function name,
++// etc.).
++// This flag is added here instead of 'lib/AutoTuner/AutoTuning.cpp' to avoid
++// making LLVMRemarks dependent on LLVMCore.
++cl::opt<bool> OmitAutotuningMetadata( ++ "auto-tuning-omit-metadata", cl::Hidden, cl::init(false), ++ cl::desc("Include only code region hashes and types in opportunity files")); ++#endif ++ + char YAMLParseError::ID = 0; + + static void handleDiagnostic(const SMDiagnostic &Diag, void *Ctx) { +@@ -235,6 +248,23 @@ YAMLRemarkParser::parseRemark(yaml::Document &RemarkEntry) { + TheRemark.FunctionName = *MaybeStr; + else + return MaybeStr.takeError(); ++#if defined(ENABLE_AUTOTUNER) ++ } else if (KeyName == "CodeRegionType") { ++ if (Expected<StringRef> MaybeStr = parseStr(RemarkField)) ++ TheRemark.CodeRegionType = *MaybeStr; ++ else ++ return MaybeStr.takeError(); ++ } else if (KeyName == "CodeRegionHash") { ++ if (Expected<uint64_t> MaybeULL = parseUnsignedLL(RemarkField)) ++ TheRemark.CodeRegionHash = *MaybeULL; ++ else ++ return MaybeULL.takeError(); ++ } else if (KeyName == "Invocation") { ++ if (Expected<unsigned int> MaybeULL = parseUnsignedLL(RemarkField)) ++ TheRemark.Invocation = *MaybeULL; ++ else ++ return MaybeULL.takeError(); ++#endif + } else if (KeyName == "Hotness") { + if (Expected<unsigned> MaybeU = parseUnsigned(RemarkField)) + TheRemark.Hotness = *MaybeU; +@@ -261,11 +291,35 @@ YAMLRemarkParser::parseRemark(yaml::Document &RemarkEntry) { + } + } + ++#if defined(ENABLE_AUTOTUNER) ++ // Check if any of the mandatory fields are missing. ++ if (TheRemark.RemarkType == Type::AutoTuning) { ++ // We expect type, and pass to be present at least. ++ if (!TheRemark.CodeRegionType || TheRemark.PassName.empty()) ++ return error("CodeRegionHash, CodeRegionType, or Pass missing.", ++ *RemarkEntry.getRoot()); ++ ++ // Sanity check for the correct command line option. ++ if (!OmitAutotuningMetadata && TheRemark.RemarkName.empty()) ++ return error("Remark Name expected; enable -autotuning-omit-metadata.", ++ *RemarkEntry.getRoot()); ++ ++ if (!OmitAutotuningMetadata && TheRemark.FunctionName.empty()) ++ return error( ++ "Remark Function Name expected; enable -autotuning-omit-metadata.", ++ *RemarkEntry.getRoot()); ++ } else if (TheRemark.RemarkType == Type::Unknown || ++ TheRemark.PassName.empty() || TheRemark.RemarkName.empty() || ++ TheRemark.FunctionName.empty()) ++ return error("Type, Pass, Name or Function missing.", ++ *RemarkEntry.getRoot()); ++#else + // Check if any of the mandatory fields are missing. 
+ if (TheRemark.RemarkType == Type::Unknown || TheRemark.PassName.empty() || + TheRemark.RemarkName.empty() || TheRemark.FunctionName.empty()) + return error("Type, Pass, Name or Function missing.", + *RemarkEntry.getRoot()); ++#endif + + return std::move(Result); + } +@@ -277,6 +331,9 @@ Expected<Type> YAMLRemarkParser::parseType(yaml::MappingNode &Node) { + .Case("!Analysis", remarks::Type::Analysis) + .Case("!AnalysisFPCommute", remarks::Type::AnalysisFPCommute) + .Case("!AnalysisAliasing", remarks::Type::AnalysisAliasing) ++#if defined(ENABLE_AUTOTUNER) ++ .Case("!AutoTuning", remarks::Type::AutoTuning) ++#endif + .Case("!Failure", remarks::Type::Failure) + .Default(remarks::Type::Unknown); + if (Type == remarks::Type::Unknown) +@@ -313,6 +370,31 @@ Expected<StringRef> YAMLRemarkParser::parseStr(yaml::KeyValueNode &Node) { + return Result; + } + ++#if defined(ENABLE_AUTOTUNER) ++Expected<std::vector<StringRef>> ++YAMLRemarkParser::parseStrVector(yaml::KeyValueNode &Node) { ++ std::vector<StringRef> Result; ++ auto *SequenceNode = dyn_cast<yaml::SequenceNode>(Node.getValue()); ++ if (!SequenceNode) ++ return error("expected a value of sequence type.", Node); ++ ++ for (yaml::Node &Element : *SequenceNode) { ++ auto *ScalarNode = dyn_cast<yaml::ScalarNode>(&Element); ++ if (!ScalarNode) ++ return error("expected a value of scalar type.", Element); ++ else { ++ StringRef Str = ScalarNode->getRawValue(); ++ if (Str.front() == '\'') ++ Str = Str.drop_front(); ++ if (Str.back() == '\'') ++ Str = Str.drop_back(); ++ Result.push_back(Str); ++ } ++ } ++ return Result; ++} ++#endif ++ + Expected<unsigned> YAMLRemarkParser::parseUnsigned(yaml::KeyValueNode &Node) { + SmallVector<char, 4> Tmp; + auto *Value = dyn_cast<yaml::ScalarNode>(Node.getValue()); +@@ -324,6 +406,19 @@ Expected<unsigned> YAMLRemarkParser::parseUnsigned(yaml::KeyValueNode &Node) { + return UnsignedValue; + } + ++#if defined(ENABLE_AUTOTUNER) ++Expected<uint64_t> YAMLRemarkParser::parseUnsignedLL(yaml::KeyValueNode &Node) { ++ SmallVector<char, 4> Tmp; ++ if (auto *Value = dyn_cast<yaml::ScalarNode>(Node.getValue())) { ++ uint64_t UnsignedValue = 0; ++ if (Value->getValue(Tmp).getAsInteger(10, UnsignedValue)) ++ return error("expected a value of integer type.", *Value); ++ return UnsignedValue; ++ } ++ return error("expected a value of scalar type.", Node); ++} ++#endif ++ + Expected<RemarkLocation> + YAMLRemarkParser::parseDebugLoc(yaml::KeyValueNode &Node) { + auto *DebugLoc = dyn_cast<yaml::MappingNode>(Node.getValue()); +@@ -374,6 +469,9 @@ Expected<Argument> YAMLRemarkParser::parseArg(yaml::Node &Node) { + + std::optional<StringRef> KeyStr; + std::optional<StringRef> ValueStr; ++#if defined(ENABLE_AUTOTUNER) ++ std::optional<std::vector<StringRef>> ValueStrVector; ++#endif + std::optional<RemarkLocation> Loc; + + for (yaml::KeyValueNode &ArgEntry : *ArgMap) { +@@ -400,11 +498,27 @@ Expected<Argument> YAMLRemarkParser::parseArg(yaml::Node &Node) { + if (ValueStr) + return error("only one string entry is allowed per argument.", ArgEntry); + ++#if defined(ENABLE_AUTOTUNER) ++ // Try to parse the value to a string vector. ++ if (Expected<std::vector<StringRef>> MaybeStrVector = ++ parseStrVector(ArgEntry)) { ++ ValueStrVector = *MaybeStrVector; ++ ValueStr = ""; ++ } else { ++ consumeError(MaybeStrVector.takeError()); ++ // Try to parse the value. ++ if (Expected<StringRef> MaybeStr = parseStr(ArgEntry)) ++ ValueStr = *MaybeStr; ++ else ++ return MaybeStr.takeError(); ++ } ++#else + // Try to parse the value. 
+ if (Expected<StringRef> MaybeStr = parseStr(ArgEntry)) + ValueStr = *MaybeStr; + else + return MaybeStr.takeError(); ++#endif + + // Keep the key from the string. + KeyStr = KeyName; +@@ -412,10 +526,18 @@ Expected<Argument> YAMLRemarkParser::parseArg(yaml::Node &Node) { + + if (!KeyStr) + return error("argument key is missing.", *ArgMap); ++#if defined(ENABLE_AUTOTUNER) ++ if (!ValueStr && !ValueStrVector) ++#else + if (!ValueStr) ++#endif + return error("argument value is missing.", *ArgMap); + ++#if defined(ENABLE_AUTOTUNER) ++ return Argument{*KeyStr, *ValueStr, ValueStrVector, Loc}; ++#else + return Argument{*KeyStr, *ValueStr, Loc}; ++#endif + } + + Expected<std::unique_ptr<Remark>> YAMLRemarkParser::next() { +diff --git a/llvm/lib/Remarks/YAMLRemarkParser.h b/llvm/lib/Remarks/YAMLRemarkParser.h +index 8ef72e16be74..141f10dd3900 100644 +--- a/llvm/lib/Remarks/YAMLRemarkParser.h ++++ b/llvm/lib/Remarks/YAMLRemarkParser.h +@@ -91,6 +91,12 @@ protected: + Expected<RemarkLocation> parseDebugLoc(yaml::KeyValueNode &Node); + /// Parse an argument. + Expected<Argument> parseArg(yaml::Node &Node); ++#if defined(ENABLE_AUTOTUNER) ++ /// parse a vector of strings. ++ Expected<std::vector<StringRef>> parseStrVector(yaml::KeyValueNode &Node); ++ /// Parse one value to an unsigned long long. ++ Expected<uint64_t> parseUnsignedLL(yaml::KeyValueNode &Node); ++#endif + }; + + /// YAML with a string table to Remark parser. +diff --git a/llvm/lib/Remarks/YAMLRemarkSerializer.cpp b/llvm/lib/Remarks/YAMLRemarkSerializer.cpp +index 68285c3dde1b..1bc0f23f9221 100644 +--- a/llvm/lib/Remarks/YAMLRemarkSerializer.cpp ++++ b/llvm/lib/Remarks/YAMLRemarkSerializer.cpp +@@ -15,10 +15,45 @@ + #include "llvm/Remarks/Remark.h" + #include "llvm/Support/FileSystem.h" + #include <optional> ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/Support/CommandLine.h" ++#endif + + using namespace llvm; + using namespace llvm::remarks; + ++#if defined(ENABLE_AUTOTUNER) ++extern cl::opt<bool> OmitAutotuningMetadata; ++ ++// Use the same keys whether we use a string table or not (respectively, T is an ++// unsigned or a StringRef). ++template <typename T> ++static void mapRemarkHeader( ++ yaml::IO &io, T PassName, T RemarkName, std::optional<RemarkLocation> RL, ++ T FunctionName, std::optional<StringRef> CodeRegionType, ++ std::optional<uint64_t> CodeRegionHash, ++ std::optional<unsigned int> Invocation, ++ std::optional<std::map<std::string, std::string>> BaselineConfig, ++ std::optional<std::map<std::string, std::vector<unsigned int>>> ++ AutoTunerOptions, ++ std::optional<uint64_t> Hotness, ArrayRef<Argument> Args) { ++ io.mapRequired("Pass", PassName); ++ if (!OmitAutotuningMetadata) { ++ io.mapRequired("Name", RemarkName); ++ io.mapOptional("DebugLoc", RL); ++ io.mapRequired("Function", FunctionName); ++ } ++ io.mapOptional("CodeRegionType", CodeRegionType); ++ io.mapOptional("CodeRegionHash", CodeRegionHash); ++ io.mapOptional("DynamicConfigs", AutoTunerOptions); ++ io.mapOptional("BaselineConfig", BaselineConfig); ++ io.mapOptional("Invocation", Invocation); ++ if (!OmitAutotuningMetadata) { ++ io.mapOptional("Hotness", Hotness); ++ io.mapOptional("Args", Args); ++ } ++} ++#else + // Use the same keys whether we use a string table or not (respectively, T is an + // unsigned or a StringRef). 
+ template <typename T> +@@ -33,6 +68,7 @@ static void mapRemarkHeader(yaml::IO &io, T PassName, T RemarkName, + io.mapOptional("Hotness", Hotness); + io.mapOptional("Args", Args); + } ++#endif + + namespace llvm { + namespace yaml { +@@ -53,6 +89,10 @@ template <> struct MappingTraits<remarks::Remark *> { + else if (io.mapTag("!AnalysisAliasing", + (Remark->RemarkType == Type::AnalysisAliasing))) + ; ++#if defined(ENABLE_AUTOTUNER) ++ else if (io.mapTag("!AutoTuning", (Remark->RemarkType == Type::AutoTuning))) ++ ; ++#endif + else if (io.mapTag("!Failure", (Remark->RemarkType == Type::Failure))) + ; + else +@@ -66,14 +106,58 @@ template <> struct MappingTraits<remarks::Remark *> { + unsigned NameID = StrTab.add(Remark->RemarkName).first; + unsigned FunctionID = StrTab.add(Remark->FunctionName).first; + mapRemarkHeader(io, PassID, NameID, Remark->Loc, FunctionID, ++#if defined(ENABLE_AUTOTUNER) ++ Remark->CodeRegionType, Remark->CodeRegionHash, ++ Remark->Invocation, Remark->BaselineConfig, ++ Remark->AutoTunerOptions, Remark->Hotness, Remark->Args); ++ ++#else + Remark->Hotness, Remark->Args); ++#endif + } else { + mapRemarkHeader(io, Remark->PassName, Remark->RemarkName, Remark->Loc, ++#if defined(ENABLE_AUTOTUNER) ++ Remark->FunctionName, Remark->CodeRegionType, ++ Remark->CodeRegionHash, Remark->Invocation, ++ Remark->BaselineConfig, Remark->AutoTunerOptions, ++ Remark->Hotness, Remark->Args); ++#else + Remark->FunctionName, Remark->Hotness, Remark->Args); ++#endif + } + } + }; + ++#if defined(ENABLE_AUTOTUNER) ++// YAML I/O to support dumping 'Values: { key: ..., ... }' in opportunity ++// files. ++template <> ++struct MappingTraits<std::map<std::string, std::vector<unsigned int>>> { ++ static void mapping(IO &io, ++ std::map<std::string, std::vector<unsigned int>> &OM) { ++ assert(io.outputting() && "input not yet implemented"); ++ ++ // Print as an abbreviated dictionary ++ llvm::yaml::StdMapStringCustomMappingTraitsImpl< ++ std::vector<unsigned int>>::output(io, OM); ++ } ++ // This sets the beginFlowMapping and endFlowMapping ++ static const bool flow = true; ++}; ++ ++template <> struct MappingTraits<std::map<std::string, std::string>> { ++ static void mapping(IO &io, std::map<std::string, std::string> &OM) { ++ assert(io.outputting() && "input not yet implemented"); ++ ++ // Print as an abbreviated dictionary ++ llvm::yaml::StdMapStringCustomMappingTraitsImpl<std::string>::output(io, ++ OM); ++ } ++ // This sets the beginFlowMapping and endFlowMapping ++ static const bool flow = true; ++}; ++#endif ++ + template <> struct MappingTraits<RemarkLocation> { + static void mapping(IO &io, RemarkLocation &RL) { + assert(io.outputting() && "input not yet implemented"); +diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp +index d3efb8b67be5..b66415c0e9a9 100644 +--- a/llvm/lib/Support/CommandLine.cpp ++++ b/llvm/lib/Support/CommandLine.cpp +@@ -127,6 +127,9 @@ static inline bool isPrefixedOrGrouping(const Option *O) { + O->getFormattingFlag() == cl::AlwaysPrefix; + } + ++#if defined(ENABLE_AUTOTUNER) ++#include <map> ++#endif + + namespace { + +@@ -1470,6 +1473,44 @@ bool cl::ParseCommandLineOptions(int argc, const char *const *argv, + Errs, LongOptionsUseDoubleDash); + } + ++#if defined(ENABLE_AUTOTUNER) ++bool cl::ParseAutoTunerOptions( ++ std::unordered_map<std::string, std::string> LLVMParams, ++ std::unordered_map<std::string, std::string> ProgramParams, ++ StringRef Overview, raw_ostream *Errs, const char *EnvVar, ++ bool LongOptionsUseDoubleDash) { ++ 
SmallVector<const char *, 20> NewArgv;
++ BumpPtrAllocator A;
++ StringSaver Saver(A);
++ // GlobalParser requires arguments similar to C style command line options
++ // (int argc, char *argv[]) where argv[0] refers to the program name.
++ // We are using a fake program name here which is consistent with LLVM.
++ NewArgv.push_back("AutoTuner (LLVM option parsing)");
++
++ for (const auto &I : LLVMParams) {
++ std::string NewOption = I.first + "=" + I.second;
++ NewArgv.push_back(Saver.save(NewOption).data());
++ }
++
++ for (const auto &I : ProgramParams) {
++ std::string NewOption = I.first + "=" + I.second;
++ NewArgv.push_back(Saver.save(NewOption).data());
++ }
++
++ // Parse options from environment variable.
++ if (EnvVar) {
++ if (std::optional<std::string> EnvValue =
++ sys::Process::GetEnv(StringRef(EnvVar)))
++ TokenizeGNUCommandLine(*EnvValue, Saver, NewArgv);
++ }
++
++ int NewArgc = static_cast<int>(NewArgv.size());
++ // Parse all options.
++ return GlobalParser->ParseCommandLineOptions(NewArgc, &NewArgv[0], Overview,
++ Errs, LongOptionsUseDoubleDash);
++}
++#endif
++
+ /// Reset all options at least once, so that we can parse different options.
+ void CommandLineParser::ResetAllOptionOccurrences() {
+ // Reset all option values to look like they have never been seen before.
+diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt
+index 034f1587ae8d..3507d357a4c6 100644
+--- a/llvm/lib/Transforms/IPO/CMakeLists.txt
++++ b/llvm/lib/Transforms/IPO/CMakeLists.txt
+@@ -57,6 +57,7 @@ add_llvm_component_library(LLVMipo
+ LINK_COMPONENTS
+ AggressiveInstCombine
+ Analysis
++ AutoTuner
+ BitReader
+ BitWriter
+ Core
+diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp
+index 3e00aebce372..802667819c44 100644
+--- a/llvm/lib/Transforms/IPO/Inliner.cpp
++++ b/llvm/lib/Transforms/IPO/Inliner.cpp
+@@ -64,6 +64,9 @@
+ #include <functional>
+ #include <utility>
+ #include <vector>
++#if defined(ENABLE_AUTOTUNER)
++#include "llvm/AutoTuner/AutoTuning.h"
++#endif
+
+ using namespace llvm;
+
+@@ -298,6 +301,27 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
+ // be deleted as a batch after inlining.
+ SmallVector<Function *, 4> DeadFunctionsInComdats;
+
++#if defined(ENABLE_AUTOTUNER)
++ bool IsAutoTunerEnabled =
++ autotuning::Engine.isEnabled() &&
++ autotuning::Engine.isTuningAllowedForType(autotuning::CallSite);
++ if (IsAutoTunerEnabled) {
++ SmallVector<std::pair<CallBase *, int>, 16> CallsCopy = Calls;
++ for (int I = 0; I < (int)CallsCopy.size(); ++I) {
++ CallBase &CB = *CallsCopy[I].first;
++ DebugLoc DLoc = CB.getDebugLoc();
++ if (!CB.getCaller() || !CB.getCalledFunction() || !DLoc)
++ continue;
++ autotuning::CallSiteLocation Loc = autotuning::CallSiteLocation{
++ &CB, CB.getCaller(), CB.getCalledFunction(),
++ autotuning::SourceLocation{DLoc->getFilename().str(), DLoc->getLine(),
++ DLoc->getColumn()}};
++ autotuning::Engine.insertCallSiteLoc(Loc);
++ }
++ autotuning::Engine.cleanCallSiteLoc();
++ }
++#endif
++
+ // Loop forward over all of the calls. Note that we cannot cache the size as
+ // inlining can introduce new calls that need to be processed.
+ for (int I = 0; I < (int)Calls.size(); ++I) { +@@ -412,6 +436,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, + if (NewCallee) { + if (!NewCallee->isDeclaration()) { + Calls.push_back({ICB, NewHistoryID}); ++#if defined(ENABLE_AUTOTUNER) ++ if (IsAutoTunerEnabled) ++ if (ICB->getDebugLoc()) ++ autotuning::Engine.updateCallSiteLocs( ++ CB, ICB, ICB->getCalledFunction(), ++ ICB->getDebugLoc()->getLine()); ++#endif + // Continually inlining through an SCC can result in huge compile + // times and bloated code since we arbitrarily stop at some point + // when the inliner decides it's not profitable to inline anymore. +@@ -527,6 +558,11 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, + FAM.invalidate(F, PreservedAnalyses::none()); + } + ++#if defined(ENABLE_AUTOTUNER) ++ if (IsAutoTunerEnabled) ++ autotuning::Engine.clearCallSiteLocs(); ++#endif ++ + // We must ensure that we only delete functions with comdats if every function + // in the comdat is going to be deleted. + if (!DeadFunctionsInComdats.empty()) { +diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp +index a53baecd4776..9590cf625c64 100644 +--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp ++++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp +@@ -1212,6 +1212,20 @@ bool SampleProfileLoader::inlineHotFunctions( + } + } + } ++#if defined(ENABLE_AUTOTUNER) ++ if (autotuning::Engine.isEnabled()) { ++ // If a callsite is hot/cold, mark its corresponding callee as ++ // hot/cold respectively so that auto-tuning engine will be able to ++ // selectively dump code regions as tuning opportunities. ++ if (const CallInst *CI = dyn_cast<CallInst>(&I)) ++ if (Function *Callee = CI->getCalledFunction()) { ++ if (callsiteIsHot(FS, PSI, ProfAccForSymsInList)) ++ Callee->getATEFunction().setHot(); ++ else ++ Callee->getATEFunction().setCold(); ++ } ++ } ++#endif + } + if (Hot || ExternalInlineAdvisor) { + CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end()); +diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt +index 424f1d433606..955353944b14 100644 +--- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt ++++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt +@@ -30,6 +30,7 @@ add_llvm_component_library(LLVMInstrumentation + + LINK_COMPONENTS + Analysis ++ AutoTuner + Core + Demangle + MC +diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +index 3c8f25d73c62..b9459b59e704 100644 +--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp ++++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +@@ -2132,6 +2132,10 @@ static bool annotateAllFunctions( + F->addFnAttr(Attribute::InlineHint); + LLVM_DEBUG(dbgs() << "Set inline attribute to function: " << F->getName() + << "\n"); ++#if defined(ENABLE_AUTOTUNER) ++ if (autotuning::Engine.isEnabled()) ++ F->getATEFunction().setHot(); ++#endif + } + for (auto &F : ColdFunctions) { + // Only set when there is no Attribute::Hot set by the user. 
For Hot +@@ -2148,6 +2152,10 @@ static bool annotateAllFunctions( + F->addFnAttr(Attribute::Cold); + LLVM_DEBUG(dbgs() << "Set cold attribute to function: " << F->getName() + << "\n"); ++#if defined(ENABLE_AUTOTUNER) ++ if (autotuning::Engine.isEnabled()) ++ F->getATEFunction().setCold(); ++#endif + } + return true; + } +diff --git a/llvm/lib/Transforms/Scalar/AutoTuningCompile.cpp b/llvm/lib/Transforms/Scalar/AutoTuningCompile.cpp +new file mode 100644 +index 000000000000..c33cb7cfc256 +--- /dev/null ++++ b/llvm/lib/Transforms/Scalar/AutoTuningCompile.cpp +@@ -0,0 +1,334 @@ ++#if defined(ENABLE_AUTOTUNER) ++//===--------------- AutoTuningCompile.cpp - Auto-Tuning ------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++// Copyright (C) 2017-2022, Huawei Technologies Co., Ltd. All rights reserved. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// This pass implements incremental compilation for AutoTuner to reduce the ++/// compilation time of the tuning process. ++/// This pass performs two operations. ++/// 1. Writing module level IR files which can be used in subsequent ++/// compilations in the AutoTuner flow, so the clang frontend does not have ++/// to process the source code from scratch. ++/// 2. Adding/removing attributes for modules and functions to enable/disable ++/// execution of optimization pass(es). This further reduces the compilation ++/// time by skipping optimization pass(es) (if feasible). ++// ++//===----------------------------------------------------------------------===// ++ ++#include "llvm/Transforms/Scalar/AutoTuningCompile.h" ++#include "llvm/Analysis/AutotuningDump.h" ++#include "llvm/AutoTuner/AutoTuning.h" ++#include "llvm/InitializePasses.h" ++#include "llvm/Support/CommandLine.h" ++#include "llvm/Transforms/Scalar.h" ++#include <string> ++ ++// Enable debug messages for AutoTuning Compilation. ++#define DEBUG_TYPE "autotuning-compile" ++ ++using namespace llvm; ++ ++extern cl::opt<AutoTuningCompileOpt> AutoTuningCompileMode; ++ ++AutoTuningOptPassGate SkipPasses = AutoTuningOptPassGate(true); ++AutoTuningOptPassGate RunPasses = AutoTuningOptPassGate(false); ++bool AutoTuningCompileModule::SkipCompilation = false; ++ ++static void writeFiles(Module &M, std::string Pass) { ++ if (autotuning::Engine.isGenerateOutput()) { ++ switch (AutoTuningCompileMode) { ++ case Basic: ++ case CoarseGrain: ++ if (Pass == autotuning::CompileOptionStart) { ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: IR files writing before Pass: " ++ << Pass << ".\n"); ++ auto ATD = new AutotuningDumpLegacy(/* Incremental Compilation */ true); ++ ATD->runOnModule(M); ++ } ++ break; ++ case FineGrain: ++ if (autotuning::Engine.hasOpportunities()) { ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: IR files writing before Pass: " ++ << Pass << ".\n"); ++ auto ATD = new AutotuningDumpLegacy(/* Incremental Compilation */ true); ++ ATD->runOnModule(M); ++ } ++ break; ++ default: ++ llvm_unreachable("AutoTuningCompile: Unknown AutoTuner Incremental " ++ "Compilation mode.\n"); ++ } ++ } ++} ++ ++bool AutoTuningOptPassGate::shouldRunPass(const StringRef PassName, ++ StringRef IRDescription) { ++ LLVM_DEBUG(dbgs() << "Skip pass '" << PassName ++ << "': " << (Skip ? 
"True" : "False") << '\n'); ++ return !Skip; ++} ++ ++bool AutoTuningOptPassGate::checkPass(const StringRef PassName, ++ const StringRef TargetDesc) { ++ if (PassName.startswith("AutoTuningCompile")) { ++ LLVM_DEBUG(dbgs() << "Running '" << PassName << "'pass.\n"); ++ return true; ++ } ++ ++ LLVM_DEBUG(dbgs() << "Skip pass '" << PassName ++ << "': " << (Skip ? "True" : "False") << '\n'); ++ return !Skip; ++} ++ ++AutoTuningCompileModule::AutoTuningCompileModule(std::string Pass) { ++ this->Pass = Pass; ++} ++ ++void AutoTuningCompileModule::writeIRFiles(Module &M) const { ++ writeFiles(M, Pass); ++} ++ ++bool AutoTuningCompileModule::modifyCompilationPipeline(Module &M) const { ++ bool Changed = false; ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: Deciding to enable/disable " ++ "optimization of module/functions. Pass: " ++ << Pass << '\n'); ++ ++ StringRef Filename = M.getName(); ++ size_t Pos = Filename.rfind(".ll"); ++ if (Pos == StringRef::npos) { ++ errs() << "AutoTuningCompile: Source file is not IR (.ll) file. " ++ "Disabling incremental compilation.\n"; ++ AutoTuningCompileMode = Inactive; ++ return Changed; ++ } ++ Filename = Filename.substr(0, Pos); ++ ++ switch (AutoTuningCompileMode) { ++ case Basic: ++ case CoarseGrain: ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: No change in opt pipeline for " ++ "Basic/CoarseGrain incremental compilation mode.\n"); ++ break; ++ case FineGrain: { ++ if (Pass == autotuning::CompileOptionStart) { ++ M.getContext().setOptPassGate(SkipPasses); ++ getAutoTuningOptPassGate().setSkip(true); ++ setSkipCompilation(true); ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: SkipPasses enabled.\n"); ++ } else if (getSkipCompilation() && ++ (autotuning::Engine.shouldRunOptPass(Filename.str(), Pass) || ++ Pass == "end")) { ++ M.getContext().setOptPassGate(RunPasses); ++ getAutoTuningOptPassGate().setSkip(false); ++ setSkipCompilation(false); ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: SkipPasses disabled.\n"); ++ } else ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: Old decision (SkipPasses = " ++ << (getSkipCompilation() ? 
"True" : "False") ++ << " ) continued.\n"); ++ ++ Changed = true; ++ break; ++ } ++ default: ++ llvm_unreachable( ++ "AutoTuningCompile: Unknown AutoTuner Incremental Compilation mode.\n"); ++ } ++ ++ return Changed; ++} ++ ++bool AutoTuningCompileModule::run(Module &M) { ++ bool Changed = false; ++ if (AutoTuningCompileMode == Inactive) ++ return Changed; ++ ++ if (!autotuning::Engine.isEnabled()) { ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: AutoTuner is not enabled.\n"); ++ return Changed; ++ } ++ ++ writeIRFiles(M); ++ ++ if (autotuning::Engine.isParseInput()) ++ Changed |= modifyCompilationPipeline(M); ++ ++ return Changed; ++} ++ ++AutoTuningCompileModuleLegacy::AutoTuningCompileModuleLegacy(std::string Pass) ++ : ModulePass(AutoTuningCompileModuleLegacy::ID) { ++ this->Pass = Pass; ++} ++ ++bool AutoTuningCompileModuleLegacy::runOnModule(Module &M) { ++ AutoTuningCompileModule Impl(Pass); ++ return Impl.run(M); ++} ++ ++char AutoTuningCompileModuleLegacy::ID = 0; ++ ++StringRef AutoTuningCompileModuleLegacy::getPassName() const { ++ return "AutoTuner Incremental Compilation"; ++} ++ ++INITIALIZE_PASS(AutoTuningCompileModuleLegacy, "autotuning-compile-module", ++ "AutoTuner Incremental Compilation", false, false) ++ ++// Public interface to the AutoTuningCompile pass ++ModulePass *llvm::createAutoTuningCompileModuleLegacyPass(std::string Pass) { ++ return new AutoTuningCompileModuleLegacy(Pass); ++} ++ ++PreservedAnalyses AutoTuningCompileModulePass::run(Module &M, ++ ModuleAnalysisManager &) { ++ AutoTuningCompileModule Impl(Pass); ++ Impl.run(M); ++ return PreservedAnalyses::all(); ++} ++ ++AutoTuningCompileFunction::AutoTuningCompileFunction(std::string Pass) { ++ this->Pass = Pass; ++} ++ ++void AutoTuningCompileFunction::writeIRFiles(Module &M) { ++ if (IsModuleWritten) ++ return; ++ IsModuleWritten = true; ++ writeFiles(M, Pass); ++} ++ ++bool AutoTuningCompileFunction::modifyCompilationPipeline(Function &F) { ++ bool Changed = false; ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: Deciding to enable/disable " ++ "optimization of module/functions. Pass: " ++ << Pass << '\n'); ++ Module *M = F.getParent(); ++ StringRef Filename = M->getName(); ++ size_t Pos = Filename.rfind(".ll"); ++ if (Pos == StringRef::npos) { ++ errs() << "AutoTuningCompile: Source file is not IR (.ll) file. 
" ++ "Disabling incremental compilation.\n"; ++ AutoTuningCompileMode = Inactive; ++ return Changed; ++ } ++ Filename = Filename.substr(0, Pos); ++ ++ switch (AutoTuningCompileMode) { ++ case Basic: ++ case CoarseGrain: ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: No change in opt pipeline for " ++ "Basic/CoarseGrain incremental compilation mode.\n"); ++ break; ++ case FineGrain: { ++ if (!AutoTuningCompileModule::getSkipCompilation() && ++ Pass == autotuning::CompileOptionStart) { ++ if (!SkipDecision) { ++ M->getContext().setOptPassGate(SkipPasses); ++ getAutoTuningOptPassGate().setSkip(true); ++ SkipDecision = true; ++ } ++ AutoTuningCompileModule::setSkipCompilation(true); ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: SkipPasses enabled.\n"); ++ } else if (AutoTuningCompileModule::getSkipCompilation() && ++ Pass != autotuning::CompileOptionStart && ++ (autotuning::Engine.shouldRunOptPass(Filename.str(), Pass) || ++ Pass == autotuning::CompileOptionEnd)) { ++ M->getContext().setOptPassGate(RunPasses); ++ getAutoTuningOptPassGate().setSkip(false); ++ SkipDecision = false; ++ AutoTuningCompileModule::setSkipCompilation(false); ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: SkipPasses disabled.\n"); ++ } else ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: Old decision (SkipPasses = " ++ << (AutoTuningCompileModule::getSkipCompilation() ++ ? "True" ++ : "False") ++ << " ) continued.\n"); ++ ++ Changed = true; ++ break; ++ } ++ default: ++ llvm_unreachable( ++ "AutoTuningCompile: Unknown AutoTuner Incremental Compilation mode.\n"); ++ } ++ ++ return Changed; ++} ++ ++bool AutoTuningCompileFunction::run(Function &F) { ++ bool Changed = false; ++ if (AutoTuningCompileMode == Inactive) ++ return Changed; ++ ++ if (!autotuning::Engine.isEnabled()) { ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: AutoTuner is not enabled.\n"); ++ return Changed; ++ } ++ ++ writeIRFiles(*F.getParent()); ++ ++ if (autotuning::Engine.isParseInput()) ++ Changed |= modifyCompilationPipeline(F); ++ ++ return Changed; ++} ++ ++AutoTuningCompileFunctionLegacy::AutoTuningCompileFunctionLegacy( ++ std::string Pass) ++ : FunctionPass(AutoTuningCompileFunctionLegacy::ID) { ++ this->Pass = Pass; ++} ++ ++bool AutoTuningCompileFunctionLegacy::runOnFunction(Function &F) { ++ AutoTuningCompileFunction Impl(Pass); ++ return Impl.run(F); ++} ++ ++char AutoTuningCompileFunctionLegacy::ID = 0; ++ ++StringRef AutoTuningCompileFunctionLegacy::getPassName() const { ++ return "AutoTuner Incremental Compilation"; ++} ++ ++INITIALIZE_PASS(AutoTuningCompileFunctionLegacy, "autotuning-compile-function", ++ "AutoTuner Incremental Compilation", false, false) ++ ++// Public interface to the AutoTuningCompile pass ++FunctionPass * ++llvm::createAutoTuningCompileFunctionLegacyPass(std::string Pass) { ++ return new AutoTuningCompileFunctionLegacy(Pass); ++} ++ ++PreservedAnalyses ++AutoTuningCompileFunctionPass::run(Function &F, FunctionAnalysisManager &AM) { ++ AutoTuningCompileFunction Impl(Pass); ++ Impl.run(F); ++ return PreservedAnalyses::all(); ++} ++ ++PreservedAnalyses ++AutoTuningCompileLoopPass::run(Loop &L, LoopAnalysisManager &AM, ++ LoopStandardAnalysisResults &AR, LPMUpdater &U) { ++ AutoTuningCompileFunction Impl(Pass); ++ Function *F = L.getHeader()->getParent(); ++ Impl.run(*F); ++ return PreservedAnalyses::all(); ++} ++ ++AutoTuningOptPassGate &llvm::getAutoTuningOptPassGate() { ++ static AutoTuningOptPassGate AutoTuningGate; ++ return AutoTuningGate; ++} ++ ++#endif +diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt 
b/llvm/lib/Transforms/Scalar/CMakeLists.txt +index eb008c15903a..e5a82ea8f923 100644 +--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt ++++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt +@@ -2,6 +2,7 @@ add_llvm_component_library(LLVMScalarOpts + ADCE.cpp + AlignmentFromAssumptions.cpp + AnnotationRemarks.cpp ++ AutoTuningCompile.cpp + BDCE.cpp + CallSiteSplitting.cpp + ConstantHoisting.cpp +@@ -92,6 +93,7 @@ add_llvm_component_library(LLVMScalarOpts + LINK_COMPONENTS + AggressiveInstCombine + Analysis ++ AutoTuner + Core + InstCombine + Support +diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +index 335b489d3cb2..feb8932eaae7 100644 +--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp ++++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +@@ -66,6 +66,9 @@ + #include <string> + #include <tuple> + #include <utility> ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#endif + + using namespace llvm; + +@@ -173,6 +176,10 @@ static cl::opt<unsigned> + cl::desc("Default threshold (max size of unrolled " + "loop), used in all but O3 optimizations")); + ++#if defined(ENABLE_AUTOTUNER) ++static const std::string UnrollCountParamStr = "UnrollCount"; ++#endif ++ + /// A magic value for use with the Threshold parameter to indicate + /// that the loop unroll should be performed regardless of how much + /// code expansion would result. +@@ -893,7 +900,12 @@ bool llvm::computeUnrollCount( + OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, + bool MaxOrZero, unsigned TripMultiple, unsigned LoopSize, + TargetTransformInfo::UnrollingPreferences &UP, ++#if defined(ENABLE_AUTOTUNER) ++ TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound, ++ unsigned int Invocation) { ++#else + TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) { ++#endif + + UnrollCostEstimator UCE(*L, LoopSize); + +@@ -942,6 +954,43 @@ bool llvm::computeUnrollCount( + } + } + ++#if defined(ENABLE_AUTOTUNER) ++ // Priority 2.5 is using Unroll Count set by AutoTuner (if enabled). ++ if (autotuning::Engine.isEnabled()) { ++ // Create a code region for current loop. This code region will be added to ++ // opportunity list once all the relevant information is gathered. ++ autotuning::Engine.initContainer(L, DEBUG_TYPE, ++ L->getHeader()->getParent()->getName(), ++ /* addOpportunity */ false, Invocation); ++ ++ int NewValue = 0; // the int value is set by lookUpParams() ++ bool UnrollCountChanged = L->lookUpParams<int>("UnrollCount", NewValue); ++ ++ if (UnrollCountChanged) { ++ // Setting the UP.Count with the value suggested by AutoTuner. ++ // AutoTuner will use UnrollCount = 0, 1, X, Y, Z in case of dynamic ++ // configuration and UnrollCount = 0, 1, 2, 4, 8 otherwise to find ++ // optimal configuration. Compiler will unroll the loop with suggested ++ // UnrollCount except when UnrollCount = 1 where AutoTuner is suggesting ++ // to try loop peeling. ++ UP.Count = NewValue; ++ UP.AllowExpensiveTripCount = true; ++ UP.Force = true; ++ UP.Runtime = true; ++ if (!UP.AllowRemainder && UP.Count != 1) ++ UP.Count = 0; ++ ++ // Check for Loop Peeling ++ if (UP.Count == 1) { ++ computePeelCount(L, LoopSize, PP, TripCount, DT, SE, AC, UP.Threshold); ++ UP.Runtime = (PP.PeelCount) ? false : UP.Runtime; ++ } ++ ++ return true; ++ } ++ } ++#endif ++ + // 3rd priority is exact full unrolling. This will eliminate all copies + // of some exit test. 
+ UP.Count = 0; +@@ -1119,6 +1168,59 @@ bool llvm::computeUnrollCount( + return ExplicitUnroll; + } + ++#if defined(ENABLE_AUTOTUNER) ++// Given UnrollingPreferences count (UPCount) and TripCount for CodeRegion ++// CR, compute the dynamic Unroll values for tuning and add them to CR. ++static void ++computeAutoTunerDynamicUnrollOptions(unsigned UPCount, unsigned TripCount, ++ const autotuning::CodeRegion &CR) { ++ std::vector<unsigned int> DynamicTuningOptions; ++ unsigned int PotentialTuningOptions[2]; ++ unsigned int Idx = 0; ++ int Count = -1; ++ unsigned int CurrentOption = 2; ++ unsigned int MaxTuningCount = 64; ++ DynamicTuningOptions.push_back(0); ++ // Add LoopPeeling as an additional option. ++ DynamicTuningOptions.push_back(1); ++ if (!UPCount) { ++ TripCount = (TripCount > MaxTuningCount) ? MaxTuningCount : TripCount; ++ unsigned int Limit = (TripCount == 0) ? 8 : TripCount; ++ DynamicTuningOptions.push_back(TripCount ? TripCount : 8); ++ while (CurrentOption < Limit) { ++ PotentialTuningOptions[Idx] = CurrentOption; ++ CurrentOption *= 2; ++ Idx = (Idx + 1) % 2; ++ ++Count; ++ } ++ } else { ++ while (CurrentOption < UPCount) { ++ PotentialTuningOptions[Idx] = CurrentOption; ++ CurrentOption *= 2; ++ Idx = (Idx + 1) % 2; ++ ++Count; ++ } ++ if (TripCount != UPCount) { ++ if (CurrentOption == UPCount) { ++ CurrentOption *= 2; ++ } ++ if (!TripCount || CurrentOption < TripCount) { ++ PotentialTuningOptions[Idx] = CurrentOption; ++ ++Count; ++ } ++ } ++ if (UPCount != 1) ++ DynamicTuningOptions.push_back(UPCount); ++ } ++ ++ Count = std::min(1, Count); ++ while (Count >= 0) ++ DynamicTuningOptions.push_back(PotentialTuningOptions[Count--]); ++ ++ CR.addAutoTunerOptions("UnrollCount", DynamicTuningOptions); ++} ++#endif ++ + static LoopUnrollResult + tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, + const TargetTransformInfo &TTI, AssumptionCache &AC, +@@ -1132,7 +1234,12 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, + std::optional<bool> ProvidedUpperBound, + std::optional<bool> ProvidedAllowPeeling, + std::optional<bool> ProvidedAllowProfileBasedPeeling, ++#if defined(ENABLE_AUTOTUNER) ++ std::optional<unsigned> ProvidedFullUnrollMaxCount, ++ unsigned int Invocation = 0) { ++#else + std::optional<unsigned> ProvidedFullUnrollMaxCount) { ++#endif + + LLVM_DEBUG(dbgs() << "Loop Unroll: F[" + << L->getHeader()->getParent()->getName() << "] Loop %" +@@ -1276,11 +1383,28 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, + // computeUnrollCount() decides whether it is beneficial to use upper bound to + // fully unroll the loop. + bool UseUpperBound = false; ++ ++#if defined(ENABLE_AUTOTUNER) ++ bool IsCountSetExplicitly = computeUnrollCount( ++ L, TTI, DT, LI, &AC, SE, EphValues, &ORE, TripCount, MaxTripCount, ++ MaxOrZero, TripMultiple, LoopSize, UP, PP, UseUpperBound, Invocation); ++ const autotuning::CodeRegion CR = L->getCodeRegion(); ++ // computeAutoTunerDynamicUnrollOptions() adds the dynamic Unroll values to ++ // the CodeRegion. 
++ computeAutoTunerDynamicUnrollOptions(UP.Count, TripCount, CR); ++ ++ if (!UP.Count) { ++ autotuning::Engine.addOpportunity( ++ CR, {{UnrollCountParamStr, std::to_string(UP.Count)}}); ++ return LoopUnrollResult::Unmodified; ++ } ++#else + bool IsCountSetExplicitly = computeUnrollCount( + L, TTI, DT, LI, &AC, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero, + TripMultiple, LoopSize, UP, PP, UseUpperBound); + if (!UP.Count) + return LoopUnrollResult::Unmodified; ++#endif + + if (PP.PeelCount) { + assert(UP.Count == 1 && "Cannot perform peel and unroll in the same step"); +@@ -1300,8 +1424,16 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, + // we had, so we don't want to unroll or peel again. + if (PP.PeelProfiledIterations) + L->setLoopAlreadyUnrolled(); ++#if defined(ENABLE_AUTOTUNER) ++ autotuning::Engine.addOpportunity( ++ CR, {{UnrollCountParamStr, std::to_string(UP.Count)}}); ++ return LoopUnrollResult::PartiallyUnrolled; ++ } ++ autotuning::Engine.addOpportunity(CR, {{UnrollCountParamStr, "0"}}); ++#else + return LoopUnrollResult::PartiallyUnrolled; + } ++#endif + return LoopUnrollResult::Unmodified; + } + +@@ -1329,8 +1461,18 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, + {UP.Count, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount, + UP.UnrollRemainder, ForgetAllSCEV}, + LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop); ++ ++#if defined(ENABLE_AUTOTUNER) ++ if (UnrollResult == LoopUnrollResult::Unmodified) { ++ autotuning::Engine.addOpportunity(CR, {{UnrollCountParamStr, "0"}}); ++ return LoopUnrollResult::Unmodified; ++ } ++ autotuning::Engine.addOpportunity( ++ CR, {{UnrollCountParamStr, std::to_string(UP.Count)}}); ++#else + if (UnrollResult == LoopUnrollResult::Unmodified) + return LoopUnrollResult::Unmodified; ++#endif + + if (RemainderLoop) { + std::optional<MDNode *> RemainderLoopID = +@@ -1379,6 +1521,20 @@ public: + /// Otherwise, forgetAllLoops and rebuild when needed next. + bool ForgetAllSCEV; + ++#if defined(ENABLE_AUTOTUNER) ++private: ++ // 'InvocationCounter' keeps track of Invocation of Loop Unroll Pass and ++ // assign it to 'Invocation'. So each LoopUnroll Object knows when it is ++ // being invoked during optimization pipeline. It is used to identify the ++ // Invocation of a pass if it is invoked multiple times. AutoTuner will use ++ // this information to generate the Code Regions and apply the suggested ++ // configuration during the correct invocation of the Loop Unroll Pass. 
++ static unsigned int InvocationCounter; ++ unsigned int Invocation; ++ ++public: ++#endif ++ + std::optional<unsigned> ProvidedCount; + std::optional<unsigned> ProvidedThreshold; + std::optional<bool> ProvidedAllowPartial; +@@ -1405,6 +1561,9 @@ public: + ProvidedAllowPeeling(AllowPeeling), + ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling), + ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) { ++#if defined(ENABLE_AUTOTUNER) ++ Invocation = InvocationCounter++; ++#endif + initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); + } + +@@ -1431,7 +1590,12 @@ public: + /*OnlyFullUnroll*/ false, OnlyWhenForced, ForgetAllSCEV, ProvidedCount, + ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime, + ProvidedUpperBound, ProvidedAllowPeeling, ++#if defined(ENABLE_AUTOTUNER) ++ ProvidedAllowProfileBasedPeeling, ProvidedFullUnrollMaxCount, ++ Invocation); ++#else + ProvidedAllowProfileBasedPeeling, ProvidedFullUnrollMaxCount); ++#endif + + if (Result == LoopUnrollResult::FullyUnrolled) + LPM.markLoopAsDeleted(*L); +@@ -1449,6 +1613,9 @@ public: + getLoopAnalysisUsage(AU); + } + }; ++#if defined(ENABLE_AUTOTUNER) ++unsigned int LoopUnroll::InvocationCounter = 0; ++#endif + + } // end anonymous namespace + +@@ -1496,6 +1663,11 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM, + + std::string LoopName = std::string(L.getName()); + ++#if defined(ENABLE_AUTOTUNER) ++ // LoopFullUnrollPass will be invoked first during optimization pipeline. ++ unsigned int Invocation = 0; ++#endif ++ + bool Changed = + tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, ORE, + /*BFI*/ nullptr, /*PSI*/ nullptr, +@@ -1505,7 +1677,12 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM, + /*Runtime*/ false, /*UpperBound*/ false, + /*AllowPeeling*/ true, + /*AllowProfileBasedPeeling*/ false, ++#if defined(ENABLE_AUTOTUNER) ++ /*FullUnrollMaxCount*/ std::nullopt, ++ /*Invocation*/ Invocation) != ++#else + /*FullUnrollMaxCount*/ std::nullopt) != ++#endif + LoopUnrollResult::Unmodified; + if (!Changed) + return PreservedAnalyses::all(); +@@ -1588,6 +1765,11 @@ PreservedAnalyses LoopUnrollPass::run(Function &F, + + bool Changed = false; + ++#if defined(ENABLE_AUTOTUNER) ++ // LoopUnrollPass will be invoked second during optimization pipeline. ++ unsigned int Invocation = 1; ++#endif ++ + // The unroller requires loops to be in simplified form, and also needs LCSSA. + // Since simplification may add new inner loops, it has to run before the + // legality and profitability checks. This means running the loop unroller +@@ -1630,7 +1812,12 @@ PreservedAnalyses LoopUnrollPass::run(Function &F, + /*Count*/ std::nullopt, + /*Threshold*/ std::nullopt, UnrollOpts.AllowPartial, + UnrollOpts.AllowRuntime, UnrollOpts.AllowUpperBound, LocalAllowPeeling, ++#if defined(ENABLE_AUTOTUNER) ++ UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount, ++ Invocation); ++#else + UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount); ++#endif + Changed |= Result != LoopUnrollResult::Unmodified; + + // The parent must not be damaged by unrolling! 
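The candidate-generation loop in computeAutoTunerDynamicUnrollOptions() above is easy to misread because of its two-slot ring buffer. Below is a minimal standalone sketch of just the UPCount == 0 path (the names unrollCandidates and Ring are ours for illustration, not the patch's): it emits 0 (default heuristics), 1 (try loop peeling), the capped trip count (or 8 when the trip count is unknown), plus the last one or two powers of two recorded below that limit.

#include <algorithm>
#include <cstdio>
#include <vector>

// Illustrative re-implementation of the UPCount == 0 path of
// computeAutoTunerDynamicUnrollOptions(); not the patch's code.
static std::vector<unsigned> unrollCandidates(unsigned TripCount) {
  std::vector<unsigned> Opts{0, 1};   // 0: default heuristics, 1: loop peeling.
  unsigned Ring[2];                   // keeps the two most recent powers of two.
  unsigned Idx = 0, Current = 2;
  int Count = -1;
  const unsigned MaxTuningCount = 64; // same cap as the patch.
  TripCount = std::min(TripCount, MaxTuningCount);
  unsigned Limit = TripCount ? TripCount : 8;
  Opts.push_back(Limit);              // full trip count, or 8 when unknown.
  while (Current < Limit) {
    Ring[Idx] = Current;              // remember this power of two,
    Current *= 2;                     // then advance to the next one.
    Idx = (Idx + 1) % 2;
    ++Count;
  }
  Count = std::min(1, Count);         // append at most the last two entries.
  while (Count >= 0)
    Opts.push_back(Ring[Count--]);
  return Opts;
}

int main() {
  for (unsigned O : unrollCandidates(20)) // prints: 0 1 20 16 8
    std::printf("%u ", O);
  std::printf("\n");
}

For a trip count of 20, the loop records 2, 4, 8, 16 into the two-slot buffer and the final list keeps only 16 and 8, so the tuner searches {0, 1, 20, 16, 8} rather than every power of two.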
+diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp +index 37b032e4d7c7..4b140e8d600b 100644 +--- a/llvm/lib/Transforms/Scalar/Scalar.cpp ++++ b/llvm/lib/Transforms/Scalar/Scalar.cpp +@@ -64,4 +64,8 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { + initializeStraightLineStrengthReduceLegacyPassPass(Registry); + initializePlaceBackedgeSafepointsLegacyPassPass(Registry); + initializeLoopSimplifyCFGLegacyPassPass(Registry); ++#if defined(ENABLE_AUTOTUNER) ++ initializeAutoTuningCompileFunctionLegacyPass(Registry); ++ initializeAutoTuningCompileModuleLegacyPass(Registry); ++#endif + } +diff --git a/llvm/lib/Transforms/Scalar/Sink.cpp b/llvm/lib/Transforms/Scalar/Sink.cpp +index 8b99f73b850b..b3c60686e252 100644 +--- a/llvm/lib/Transforms/Scalar/Sink.cpp ++++ b/llvm/lib/Transforms/Scalar/Sink.cpp +@@ -248,6 +248,11 @@ namespace { + } + + bool runOnFunction(Function &F) override { ++#if defined(ENABLE_AUTOTUNER) ++ if (skipFunction(F)) ++ return false; ++#endif ++ + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); +diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt +index a870071f3f64..8616e7b923c0 100644 +--- a/llvm/lib/Transforms/Utils/CMakeLists.txt ++++ b/llvm/lib/Transforms/Utils/CMakeLists.txt +@@ -93,6 +93,7 @@ add_llvm_component_library(LLVMTransformUtils + + LINK_COMPONENTS + Analysis ++ AutoTuner + Core + Support + TargetParser +diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp +index c36b0533580b..20a4edcb29db 100644 +--- a/llvm/lib/Transforms/Utils/LCSSA.cpp ++++ b/llvm/lib/Transforms/Utils/LCSSA.cpp +@@ -491,6 +491,11 @@ char &llvm::LCSSAID = LCSSAWrapperPass::ID; + + /// Transform \p F into loop-closed SSA form. + bool LCSSAWrapperPass::runOnFunction(Function &F) { ++#if defined(ENABLE_AUTOTUNER) ++ if (skipFunction(F)) ++ return false; ++#endif ++ + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); +diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp +index 3e604fdf2e11..2e42e7f1397f 100644 +--- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp ++++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp +@@ -69,6 +69,9 @@ + #include "llvm/Transforms/Utils/BasicBlockUtils.h" + #include "llvm/Transforms/Utils/Local.h" + #include "llvm/Transforms/Utils/LoopUtils.h" ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#endif + using namespace llvm; + + #define DEBUG_TYPE "loop-simplify" +@@ -793,6 +796,11 @@ Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); } + /// it in any convenient order) inserting preheaders... 
+ /// + bool LoopSimplify::runOnFunction(Function &F) { ++#if defined(ENABLE_AUTOTUNER) ++ if (autotuning::Engine.isEnabled() && skipFunction(F)) ++ return false; ++#endif ++ + bool Changed = false; + LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); +diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp +index 511dd61308f9..2d2c3e50514b 100644 +--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp ++++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp +@@ -69,6 +69,9 @@ + #include <numeric> + #include <type_traits> + #include <vector> ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#endif + + namespace llvm { + class DataLayout; +diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt +index 998dfd956575..f2c5c04abb13 100644 +--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt ++++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt +@@ -21,6 +21,7 @@ add_llvm_component_library(LLVMVectorize + + LINK_COMPONENTS + Analysis ++ AutoTuner + Core + Support + TransformUtils +diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +index f923f0be6621..f13ce6853666 100644 +--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp ++++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +@@ -113,6 +113,18 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, + // Populate values with existing loop metadata. + getHintsFromMetadata(); + ++#if defined(ENABLE_AUTOTUNER) ++ if (autotuning::Engine.isEnabled()) { ++ int NewValue = 0; ++ bool VectorizationInterleaveChanged = ++ L->lookUpParams<int>("VectorizationInterleave", NewValue); ++ ++ if (VectorizationInterleaveChanged) { ++ Interleave.Value = NewValue; ++ } ++ } ++#endif ++ + // force-vector-interleave overrides DisableInterleaving. + if (VectorizerParams::isInterleaveForced()) + Interleave.Value = VectorizerParams::VectorizationInterleave; +diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +index b603bbe55dc9..46fab860f5a3 100644 +--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp ++++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +@@ -10178,6 +10178,22 @@ LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) + VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || + !EnableLoopVectorization) {} + ++#if defined(ENABLE_AUTOTUNER) ++// Given the interleave count (IC) and CR, compute the dynamic values for the ++// interleave count. Then add them to CR. ++static void ++computeAutoTunerDynamicInterleaveOptions(unsigned IC, ++ const autotuning::CodeRegion &CR) { ++ ++ std::vector<unsigned int> AutoTunerOptions{1, 2, 4}; ++ if (std::find(AutoTunerOptions.begin(), AutoTunerOptions.end(), IC) == ++ AutoTunerOptions.end()) ++ AutoTunerOptions[2] = IC; ++ ++ CR.addAutoTunerOptions("VectorizationInterleave", AutoTunerOptions); ++} ++#endif ++ + bool LoopVectorizePass::processLoop(Loop *L) { + assert((EnableVPlanNativePath || L->isInnermost()) && + "VPlan-native path is not enabled. Only process inner loops."); +@@ -10190,6 +10206,12 @@ bool LoopVectorizePass::processLoop(Loop *L) { + << L->getHeader()->getParent()->getName() << "' from " + << DebugLocStr << "\n"); + ++#if defined(ENABLE_AUTOTUNER) ++ // Initialize the loop for auto-tuning but do not add it ++ // as a tuning opportunity yet. 
++ autotuning::Engine.initContainer( ++ L, LV_NAME, L->getHeader()->getParent()->getName(), false); ++#endif + LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); + + LLVM_DEBUG( +@@ -10422,6 +10444,18 @@ bool LoopVectorizePass::processLoop(Loop *L) { + InterleaveLoop = false; + } + ++#if defined(ENABLE_AUTOTUNER) ++ if (!VectorizerParams::isInterleaveForced()) { ++ // Compute the dynamic values for VectorizationInterleave and add it to the ++ // CodeRegion. ++ computeAutoTunerDynamicInterleaveOptions(IC, L->getCodeRegion()); ++ ++ // Add the current loop as a tuning opportunity explicitly. ++ autotuning::Engine.addOpportunity( ++ L->getCodeRegion(), {{"VectorizationInterleave", std::to_string(IC)}}); ++ } ++#endif ++ + // Override IC if user provided an interleave count. + IC = UserIC > 0 ? UserIC : IC; + +diff --git a/llvm/test/AutoTuning/AutotuningDump/Inputs/unroll_template.yaml b/llvm/test/AutoTuning/AutotuningDump/Inputs/unroll_template.yaml +new file mode 100644 +index 000000000000..f483a269906a +--- /dev/null ++++ b/llvm/test/AutoTuning/AutotuningDump/Inputs/unroll_template.yaml +@@ -0,0 +1,8 @@ ++--- !AutoTuning ++Pass: loop-unroll ++Name: [name] ++Function: foo ++CodeRegionType: loop ++Args: ++ - UnrollCount: [number] ++... +diff --git a/llvm/test/AutoTuning/AutotuningDump/create-data-dir.ll b/llvm/test/AutoTuning/AutotuningDump/create-data-dir.ll +new file mode 100644 +index 000000000000..ceb9b4fb2ca6 +--- /dev/null ++++ b/llvm/test/AutoTuning/AutotuningDump/create-data-dir.ll +@@ -0,0 +1,65 @@ ++; UNSUPPORTED: windows ++; RUN: sed 's#\[number\]#0#g; s#\[name\]#for.body#g' \ ++; RUN: %S/Inputs/unroll_template.yaml > %t.DEFAULT.yaml ++; RUN: opt --disable-output %s -S -passes='require<autotuning-dump>' \ ++; RUN: -auto-tuning-input=%t.DEFAULT.yaml -auto-tuning-config-id=1 ++; RUN: cat %T/../autotune_datadir/create-data-dir.ll/1.ll | FileCheck %s ++; RUN: rm -rf %T/../autotune_datadir/* ++ ++; RUN: cp %t.DEFAULT.yaml %T/../autotune_datadir/config.yaml ++; RUN: opt %s -S -passes='require<autotuning-dump>' -auto-tuning-config-id=1 ++; RUN: cat %T/../autotune_datadir/create-data-dir.ll/1.ll | FileCheck %s ++; RUN: rm -rf %T/../autotune_datadir/* ++ ++; RUN: cp %t.DEFAULT.yaml %T/../autotune_datadir/config.yaml ++; RUN: opt %s -S -passes='require<autotuning-dump>' -enable-autotuning-dump ++; RUN: echo -n %T/../autotune_datadir/IR_files/ > %t.filename ++; RUN: echo -n "create-data-dir.ll/" >> %t.filename ++; RUN: echo -n %s | sed 's#/#_#g' >> %t.filename ++; RUN: echo -n ".ll" >> %t.filename ++; RUN: cat %t.filename | xargs cat | FileCheck %s ++; RUN: rm -rf %T/../autotune_datadir ++ ++; ModuleID = 'search.c' ++source_filename = "search.c" ++target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ++target triple = "aarch64-unknown-linux-gnu" ++ ++; Function Attrs: argmemonly nofree norecurse nosync nounwind readonly uwtable ++define dso_local i32 @search(ptr nocapture noundef readonly %Arr, i32 noundef %Value, i32 noundef %Size) { ++entry: ++ %cmp5 = icmp sgt i32 %Size, 0 ++ br i1 %cmp5, label %for.body.preheader, label %for.end ++ ++for.body.preheader: ; preds = %entry ++ %wide.trip.count = zext i32 %Size to i64 ++ br label %for.body ++ ++for.body: ; preds = %for.body.preheader, %for.inc ++ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] ++ %arrayidx = getelementptr inbounds i32, ptr %Arr, i64 %indvars.iv ++ %0 = load i32, ptr %arrayidx, align 4 ++ %cmp1 = icmp eq i32 %0, %Value ++ br i1 %cmp1, label %for.end.loopexit.split.loop.exit, 
label %for.inc ++ ++for.inc: ; preds = %for.body ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count ++ br i1 %exitcond.not, label %for.end, label %for.body ++ ++for.end.loopexit.split.loop.exit: ; preds = %for.body ++ %1 = trunc i64 %indvars.iv to i32 ++ br label %for.end ++ ++for.end: ; preds = %for.inc, %for.end.loopexit.split.loop.exit, %entry ++ %Idx.0.lcssa = phi i32 [ 0, %entry ], [ %1, %for.end.loopexit.split.loop.exit ], [ %Size, %for.inc ] ++ ret i32 %Idx.0.lcssa ++} ++ ++; Check that only loop body is inside the IR File. ++; CHECK-LABEL: for.body: ; preds = ++; CHECK-NEXT: %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] ++; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %Arr, i64 %indvars.iv ++; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ++; CHECK-NEXT: %cmp1 = icmp eq i32 %0, %Value ++; CHECK-NEXT: br i1 %cmp1, label %for.end.loopexit.split.loop.exit, label %for.inc +diff --git a/llvm/test/AutoTuning/AutotuningDump/unroll.ll b/llvm/test/AutoTuning/AutotuningDump/unroll.ll +new file mode 100644 +index 000000000000..e8243da55fff +--- /dev/null ++++ b/llvm/test/AutoTuning/AutotuningDump/unroll.ll +@@ -0,0 +1,35 @@ ++; RUN: rm -rf %T.tmp/Output ++; RUN: mkdir -p %T.tmp/Output ++; RUN: rm %t.DEFAULT.yaml -rf ++; RUN: sed 's#\[number\]#0#g; s#\[name\]#for.body#g' %S/Inputs/unroll_template.yaml > %t.DEFAULT.yaml ++; RUN: env AUTOTUNE_DATADIR=%T.tmp/Output opt %s -S -passes='require<autotuning-dump>' \ ++; RUN: -auto-tuning-input=%t.DEFAULT.yaml -auto-tuning-config-id=1 ++; RUN: env AUTOTUNE_DATADIR=%T.tmp/Output opt %s -S -passes='require<autotuning-dump>' \ ++; RUN: -auto-tuning-input=%t.DEFAULT.yaml -auto-tuning-config-id=2 ++; RUN: cat %T.tmp/Output/unroll.ll/1.ll | FileCheck %s -check-prefix=DEFAULT ++; RUN: cat %T.tmp/Output/unroll.ll/2.ll | FileCheck %s -check-prefix=DEFAULT ++; UNSUPPORTED: windows ++ ++define void @foo(i32* nocapture %a) { ++entry: ++ br label %for.body ++for.body: ; preds = %for.body, %entry ++ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ++ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv ++ %0 = load i32, i32* %arrayidx, align 4 ++ %inc = add nsw i32 %0, 1 ++ store i32 %inc, i32* %arrayidx, align 4 ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %exitcond = icmp eq i64 %indvars.iv.next, 64 ++ br i1 %exitcond, label %for.end, label %for.body ++for.end: ; preds = %for.body ++ ret void ++} ++; Check that only loop body is inside the IR File. 
++; DEFAULT-LABEL: for.body: ; preds = %for.body, %entry ++; DEFAULT-NEXT: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ++; DEFAULT-NEXT: %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv ++; DEFAULT: %exitcond = icmp eq i64 %indvars.iv.next, 64 ++; DEFAULT: br i1 %exitcond, label %for.end, label %for.body ++ ++; RUN: rm -rf %T.tmp/Output +diff --git a/llvm/test/AutoTuning/BaselineConfig/Inputs/autotune_datadir/baseline_config.yaml b/llvm/test/AutoTuning/BaselineConfig/Inputs/autotune_datadir/baseline_config.yaml +new file mode 100644 +index 000000000000..a5e669c17a71 +--- /dev/null ++++ b/llvm/test/AutoTuning/BaselineConfig/Inputs/autotune_datadir/baseline_config.yaml +@@ -0,0 +1,9 @@ ++!AutoTuning {Args: {UnrollCount: 0}, CodeRegionHash: 12835463591102937421, ++ CodeRegionType: loop, Function: test, Invocation: 0, Name: for.body, ++ Pass: loop-unroll} ++--- !AutoTuning {Args: {VectorizationInterleave: 2}, ++ CodeRegionHash: 12835463591102937421, CodeRegionType: loop, Function: test, ++ Invocation: 0, Name: for.body, Pass: loop-vectorize} ++--- !AutoTuning {Args: {UnrollCount: 0}, CodeRegionHash: 8430337282115614432, ++ CodeRegionType: loop, Function: test, Invocation: 1, Name: vector.body, ++ Pass: loop-unroll} +diff --git a/llvm/test/AutoTuning/BaselineConfig/Inputs/autotune_datadir/random_config.yaml b/llvm/test/AutoTuning/BaselineConfig/Inputs/autotune_datadir/random_config.yaml +new file mode 100644 +index 000000000000..738cf55ffe9a +--- /dev/null ++++ b/llvm/test/AutoTuning/BaselineConfig/Inputs/autotune_datadir/random_config.yaml +@@ -0,0 +1,9 @@ ++!AutoTuning {Args: {UnrollCount: 2}, CodeRegionHash: 12835463591102937421, ++ CodeRegionType: loop, Function: test, Invocation: 0, Name: for.body, ++ Pass: loop-unroll} ++--- !AutoTuning {Args: {VectorizationInterleave: 2}, ++ CodeRegionHash: 12835463591102937421, CodeRegionType: loop, Function: test, ++ Invocation: 0, Name: for.body, Pass: loop-vectorize} ++--- !AutoTuning {Args: {UnrollCount: 0}, CodeRegionHash: 8430337282115614432, ++ CodeRegionType: loop, Function: test, Invocation: 1, Name: vector.body, ++ Pass: loop-unroll} +diff --git a/llvm/test/AutoTuning/BaselineConfig/Inputs/test.ll b/llvm/test/AutoTuning/BaselineConfig/Inputs/test.ll +new file mode 100644 +index 000000000000..667a076b2d23 +--- /dev/null ++++ b/llvm/test/AutoTuning/BaselineConfig/Inputs/test.ll +@@ -0,0 +1,117 @@ ++; ModuleID = 'test.c' ++source_filename = "test.c" ++target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ++target triple = "aarch64-unknown-linux-gnu" ++ ++@.str = private unnamed_addr constant [12 x i8] c"tmp <= 10.0\00", align 1 ++@.str.1 = private unnamed_addr constant [7 x i8] c"test.c\00", align 1 ++@__PRETTY_FUNCTION__.test = private unnamed_addr constant [12 x i8] c"void test()\00", align 1 ++ ++; Function Attrs: nounwind uwtable ++define dso_local void @test() #0 { ++entry: ++ %cs = alloca i32, align 4 ++ %flush = alloca ptr, align 8 ++ %i = alloca i32, align 4 ++ %tmp = alloca double, align 8 ++ call void @llvm.lifetime.start.p0(i64 4, ptr %cs) #5 ++ store i32 16431360, ptr %cs, align 4, !tbaa !6 ++ call void @llvm.lifetime.start.p0(i64 8, ptr %flush) #5 ++ %0 = load i32, ptr %cs, align 4, !tbaa !6 ++ %conv = sext i32 %0 to i64 ++ %call = call noalias ptr @calloc(i64 noundef %conv, i64 noundef 8) #6 ++ store ptr %call, ptr %flush, align 8, !tbaa !10 ++ call void @llvm.lifetime.start.p0(i64 4, ptr %i) #5 ++ call void @llvm.lifetime.start.p0(i64 8, ptr %tmp) #5 ++ store double 0.000000e+00, ptr 
%tmp, align 8, !tbaa !12 ++ store i32 0, ptr %i, align 4, !tbaa !6 ++ br label %for.cond ++ ++for.cond: ; preds = %for.inc, %entry ++ %1 = load i32, ptr %i, align 4, !tbaa !6 ++ %2 = load i32, ptr %cs, align 4, !tbaa !6 ++ %cmp = icmp slt i32 %1, %2 ++ br i1 %cmp, label %for.body, label %for.end ++ ++for.body: ; preds = %for.cond ++ %3 = load ptr, ptr %flush, align 8, !tbaa !10 ++ %4 = load i32, ptr %i, align 4, !tbaa !6 ++ %idxprom = sext i32 %4 to i64 ++ %arrayidx = getelementptr inbounds double, ptr %3, i64 %idxprom ++ %5 = load double, ptr %arrayidx, align 8, !tbaa !12 ++ %6 = load double, ptr %tmp, align 8, !tbaa !12 ++ %add = fadd double %6, %5 ++ store double %add, ptr %tmp, align 8, !tbaa !12 ++ br label %for.inc ++ ++for.inc: ; preds = %for.body ++ %7 = load i32, ptr %i, align 4, !tbaa !6 ++ %inc = add nsw i32 %7, 1 ++ store i32 %inc, ptr %i, align 4, !tbaa !6 ++ br label %for.cond, !llvm.loop !14 ++ ++for.end: ; preds = %for.cond ++ %8 = load double, ptr %tmp, align 8, !tbaa !12 ++ %cmp2 = fcmp ole double %8, 1.000000e+01 ++ br i1 %cmp2, label %if.then, label %if.else ++ ++if.then: ; preds = %for.end ++ br label %if.end ++ ++if.else: ; preds = %for.end ++ call void @__assert_fail(ptr noundef @.str, ptr noundef @.str.1, i32 noundef 11, ptr noundef @__PRETTY_FUNCTION__.test) #7 ++ unreachable ++ ++if.end: ; preds = %if.then ++ %9 = load ptr, ptr %flush, align 8, !tbaa !10 ++ call void @free(ptr noundef %9) #5 ++ call void @llvm.lifetime.end.p0(i64 8, ptr %tmp) #5 ++ call void @llvm.lifetime.end.p0(i64 4, ptr %i) #5 ++ call void @llvm.lifetime.end.p0(i64 8, ptr %flush) #5 ++ call void @llvm.lifetime.end.p0(i64 4, ptr %cs) #5 ++ ret void ++} ++ ++; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) ++declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 ++ ++; Function Attrs: nounwind allocsize(0,1) ++declare noalias ptr @calloc(i64 noundef, i64 noundef) #2 ++ ++; Function Attrs: noreturn nounwind ++declare void @__assert_fail(ptr noundef, ptr noundef, i32 noundef, ptr noundef) #3 ++ ++; Function Attrs: nounwind ++declare void @free(ptr noundef) #4 ++ ++; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) ++declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 ++ ++attributes #0 = { nounwind uwtable "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,-fmv" } ++attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } ++attributes #2 = { nounwind allocsize(0,1) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,-fmv" } ++attributes #3 = { noreturn nounwind "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,-fmv" } ++attributes #4 = { nounwind "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,-fmv" } ++attributes #5 = { nounwind } ++attributes #6 = { nounwind allocsize(0,1) } ++attributes #7 = { noreturn nounwind } ++ ++!llvm.module.flags = !{!0, !1, !2, !3, !4} ++!llvm.ident = !{!5} ++ ++!0 = !{i32 1, !"wchar_size", i32 4} ++!1 = !{i32 8, !"PIC Level", i32 2} ++!2 = !{i32 7, !"PIE Level", i32 2} ++!3 = 
!{i32 7, !"uwtable", i32 2} ++!4 = !{i32 7, !"frame-pointer", i32 1} ++!5 = !{!"Huawei BiSheng Compiler clang version 18.0.0 (ssh://git@codehub-dg-y.huawei.com:2222/CompilerKernel/BiShengKernel/BiSheng.git 026024071a7fb66b26b65fb81da702cc5f0cf405)"} ++!6 = !{!7, !7, i64 0} ++!7 = !{!"int", !8, i64 0} ++!8 = !{!"omnipotent char", !9, i64 0} ++!9 = !{!"Simple C/C++ TBAA"} ++!10 = !{!11, !11, i64 0} ++!11 = !{!"any pointer", !8, i64 0} ++!12 = !{!13, !13, i64 0} ++!13 = !{!"double", !8, i64 0} ++!14 = distinct !{!14, !15} ++!15 = !{!"llvm.loop.mustprogress"} +diff --git a/llvm/test/AutoTuning/BaselineConfig/apply_baseline_config.ll b/llvm/test/AutoTuning/BaselineConfig/apply_baseline_config.ll +new file mode 100644 +index 000000000000..f905208a2f3b +--- /dev/null ++++ b/llvm/test/AutoTuning/BaselineConfig/apply_baseline_config.ll +@@ -0,0 +1,11 @@ ++; The purpose is to test the baseline IR is the same as the 1st iteration of ++; autotuning process with --use-baseline-config enabled. ++; RUN: rm %t.baseline %t.firstIt -f ++; RUN: opt -O3 %S/Inputs/test.ll -o %t.baseline ++; RUN: opt -O3 %S/Inputs/test.ll -o %t.firstIt_baseline \ ++; RUN: -auto-tuning-input=%S/Inputs/autotune_datadir/baseline_config.yaml ++; RUN: cmp %t.firstIt_baseline %t.baseline ++ ++; RUN: opt -O3 %S/Inputs/test.ll -o %t.firstIt_random \ ++; RUN: -auto-tuning-input=%S/Inputs/autotune_datadir/random_config.yaml ++; RUN: not cmp %t.firstIt_random %t.baseline +diff --git a/llvm/test/AutoTuning/BaselineConfig/opp.ll b/llvm/test/AutoTuning/BaselineConfig/opp.ll +new file mode 100644 +index 000000000000..b2897316fc22 +--- /dev/null ++++ b/llvm/test/AutoTuning/BaselineConfig/opp.ll +@@ -0,0 +1,67 @@ ++; REQUIRES: asserts ++; RUN: rm %t.callsite_opp -rf ++; RUN: opt %s -O3 -debug-only=inline -disable-output -S 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=DEFAULT ++; RUN: opt %s -O3 -auto-tuning-opp=%t.callsite_opp -disable-output -S 2>&1 ++; RUN: FileCheck %s --input-file %t.callsite_opp/opp.ll.yaml -check-prefix=AUTOTUNE ++ ++@a = global i32 4 ++ ++; Function Attrs: nounwind readnone uwtable ++define i32 @simpleFunction(i32 %a) #0 { ++entry: ++ call void @extern() ++ %a1 = load volatile i32, i32* @a ++ %x1 = add i32 %a1, %a1 ++ %a2 = load volatile i32, i32* @a ++ %x2 = add i32 %x1, %a2 ++ %a3 = load volatile i32, i32* @a ++ %x3 = add i32 %x2, %a3 ++ %a4 = load volatile i32, i32* @a ++ %x4 = add i32 %x3, %a4 ++ %a5 = load volatile i32, i32* @a ++ %x5 = add i32 %x4, %a5 ++ %a6 = load volatile i32, i32* @a ++ %x6 = add i32 %x5, %a6 ++ %a7 = load volatile i32, i32* @a ++ %x7 = add i32 %x6, %a6 ++ %a8 = load volatile i32, i32* @a ++ %x8 = add i32 %x7, %a8 ++ %a9 = load volatile i32, i32* @a ++ %x9 = add i32 %x8, %a9 ++ %a10 = load volatile i32, i32* @a ++ %x10 = add i32 %x9, %a10 ++ %a11 = load volatile i32, i32* @a ++ %x11 = add i32 %x10, %a11 ++ %a12 = load volatile i32, i32* @a ++ %x12 = add i32 %x11, %a12 ++ %add = add i32 %x12, %a ++ ret i32 %add ++} ++ ++; Function Attrs: nounwind readnone uwtable ++define i32 @bar(i32 %a) #0 { ++entry: ++ %0 = tail call i32 @simpleFunction(i32 6) ++ ret i32 %0 ++} ++ ++declare void @extern() ++ ++attributes #0 = { nounwind readnone uwtable } ++attributes #1 = { nounwind cold readnone uwtable } ++ ++ ++; NOTE: Need to make sure the function inling have the same behaviour as O3 and ++; 'BaselineConfig' ++; DEFAULT: Inlining calls in: bar ++; DEFAULT: Inlining (cost=115, threshold=375), Call: %0 = tail call i32 @simpleFunction(i32 6) ++ ++; AUTOTUNE: Pass: inline ++; AUTOTUNE-NEXT: Name: 
simpleFunction ++; AUTOTUNE-NEXT: Function: bar ++; AUTOTUNE-NEXT: CodeRegionType: callsite ++; AUTOTUNE-NEXT: CodeRegionHash: {{[0-9]+}} ++; AUTOTUNE-NEXT: DynamicConfigs: { ForceInline: [ 0, 1 ] } ++; AUTOTUNE-NEXT: BaselineConfig: { ForceInline: '1' } ++; AUTOTUNE-NEXT: Invocation: 0 +diff --git a/llvm/test/AutoTuning/CodeRegionFilter/function-filtering.ll b/llvm/test/AutoTuning/CodeRegionFilter/function-filtering.ll +new file mode 100644 +index 000000000000..13acafae6fc4 +--- /dev/null ++++ b/llvm/test/AutoTuning/CodeRegionFilter/function-filtering.ll +@@ -0,0 +1,62 @@ ++; REQUIRES: asserts ++ ++; RUN: rm -rf %t.filter ++; RUN: opt %s -S -passes='function(require<opt-remark-emit>,loop-unroll),cgscc(inline)' \ ++; RUN: -auto-tuning-opp=%t.filter -auto-tuning-type-filter=CallSite,Loop --disable-output ++; RUN: FileCheck %s --input-file %t.filter/function-filtering.ll.yaml -check-prefix=DEFAULT ++ ++; RUN: rm -rf %t.filter ++; RUN: opt %s -S -passes='function(require<opt-remark-emit>,loop-unroll),cgscc(inline)' \ ++; RUN: -auto-tuning-opp=%t.filter -auto-tuning-type-filter=CallSite,Loop \ ++; RUN: -auto-tuning-function-filter=foo --disable-output ++; RUN: FileCheck %s --input-file %t.filter/function-filtering.ll.yaml -check-prefix=FILTER_FOO ++ ++; RUN: rm -rf %t.filter ++; RUN: opt %s -S -passes='function(require<opt-remark-emit>,loop-unroll),cgscc(inline)' \ ++; RUN: -auto-tuning-opp=%t.filter -auto-tuning-type-filter=CallSite,Loop \ ++; RUN: -auto-tuning-function-filter=bar --disable-output ++; RUN: FileCheck %s --input-file %t.filter/function-filtering.ll.yaml -check-prefix=FILTER_BAR ++ ++; RUN: rm -rf %t.filter ++; RUN: opt %s -S -passes='function(require<opt-remark-emit>,loop-unroll),cgscc(inline)' \ ++; RUN: -auto-tuning-opp=%t.filter -auto-tuning-type-filter=CallSite,Loop \ ++; RUN: -auto-tuning-function-filter=dummy -debug-only=autotuning | \ ++; RUN: FileCheck %s -check-prefix=FILTER_DUMMY ++ ++define void @foo(i32* nocapture %a) { ++entry: ++ br label %for.body ++ ++for.body: ; preds = %for.body, %entry ++ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ++ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv ++ %0 = load i32, i32* %arrayidx, align 4 ++ %inc = add nsw i32 %0, 1 ++ store i32 %inc, i32* %arrayidx, align 4 ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %exitcond = icmp eq i64 %indvars.iv.next, 64 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ; preds = %for.body ++ ret void ++} ++ ++define void @bar(i32* nocapture %a) { ++entry: ++ call void @foo(i32* %a) ++ ret void ++} ++ ++; DEFAULT: --- !AutoTuning ++; DEFAULT: --- !AutoTuning ++ ++; FILTER_FOO: --- !AutoTuning ++; FILTER_FOO: Function: foo ++; FILTER_FOO-NOT: --- !AutoTuning ++ ++; FILTER_BAR: --- !AutoTuning ++; FILTER_BAR: Function: bar ++; FILTER_BAR-NOT: --- !AutoTuning ++ ++; FILTER_DUMMY-NOT: --- !AutoTuning ++; FILTER_DUMMY-NOT: --- !AutoTuning +diff --git a/llvm/test/AutoTuning/Error/Inputs/invalid-format.yaml b/llvm/test/AutoTuning/Error/Inputs/invalid-format.yaml +new file mode 100644 +index 000000000000..9c203e58f0ab +--- /dev/null ++++ b/llvm/test/AutoTuning/Error/Inputs/invalid-format.yaml +@@ -0,0 +1,3 @@ ++<inpus> ++ <input>this is a xml file</input> ++</input> +diff --git a/llvm/test/AutoTuning/Error/Inputs/template.yaml b/llvm/test/AutoTuning/Error/Inputs/template.yaml +new file mode 100644 +index 000000000000..1f02b52ffb38 +--- /dev/null ++++ b/llvm/test/AutoTuning/Error/Inputs/template.yaml +@@ -0,0 +1,10 @@ ++--- !AutoTuning ++Pass: pass 
++Name: for.body ++Function: foo ++CodeRegionType: loop ++CodeRegionHash: 0 ++Args: ++ - UnrollCount: 2 ++ - PassOrder: [test, test2] ++... +diff --git a/llvm/test/AutoTuning/Error/file-not-found-error.ll b/llvm/test/AutoTuning/Error/file-not-found-error.ll +new file mode 100644 +index 000000000000..6a364239a271 +--- /dev/null ++++ b/llvm/test/AutoTuning/Error/file-not-found-error.ll +@@ -0,0 +1,29 @@ ++; RUN: rm %t.non-existing.yaml -rf ++; RUN: not opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.non-existing.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR ++ ++; UNSUPPORTED: windows ++ ++define void @foo(i32* nocapture %a) { ++entry: ++ br label %for.body ++ ++for.body: ; preds = %for.body, %entry ++ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ++ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv ++ %0 = load i32, i32* %arrayidx, align 4 ++ %inc = add nsw i32 %0, 1 ++ store i32 %inc, i32* %arrayidx, align 4 ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %exitcond = icmp eq i64 %indvars.iv.next, 64 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ; preds = %for.body ++ ret void ++} ++ ++; check if error message is shown properly when input yaml is not found ++; ++; ERROR: Error parsing auto-tuning input. ++; ERROR: No such file or directory +diff --git a/llvm/test/AutoTuning/Error/invalid-yaml-error.ll b/llvm/test/AutoTuning/Error/invalid-yaml-error.ll +new file mode 100644 +index 000000000000..bfc8784c4ea4 +--- /dev/null ++++ b/llvm/test/AutoTuning/Error/invalid-yaml-error.ll +@@ -0,0 +1,27 @@ ++; RUN: not opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%S/Inputs/invalid-format.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR ++ ++; UNSUPPORTED: windows ++ ++define void @foo(i32* nocapture %a) { ++entry: ++ br label %for.body ++ ++for.body: ; preds = %for.body, %entry ++ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ++ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv ++ %0 = load i32, i32* %arrayidx, align 4 ++ %inc = add nsw i32 %0, 1 ++ store i32 %inc, i32* %arrayidx, align 4 ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %exitcond = icmp eq i64 %indvars.iv.next, 64 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ; preds = %for.body ++ ret void ++} ++ ++; check if error message is shown properly when input yaml is in invalid format ++; ++; ERROR: error: YAML:1:1: error: document root is not of mapping type. +diff --git a/llvm/test/AutoTuning/Error/malformed-input-error.ll b/llvm/test/AutoTuning/Error/malformed-input-error.ll +new file mode 100644 +index 000000000000..0b73c3195503 +--- /dev/null ++++ b/llvm/test/AutoTuning/Error/malformed-input-error.ll +@@ -0,0 +1,136 @@ ++; Check if error messages are shown properly for malformed YAML files. 
++ ++; Missing Pass Field ++; RUN: rm %t.missing-pass.yaml -rf ++; RUN: sed 's#Pass: pass##g' %S/Inputs/template.yaml > %t.missing-pass.yaml ++; RUN: not opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.missing-pass.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-FIELD ++ ++; Missing Pass Value ++; RUN: rm %t.missing-value-pass.yaml -rf ++; RUN: sed 's#pass##g' %S/Inputs/template.yaml > %t.missing-value-pass.yaml ++; RUN: not opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.missing-value-pass.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-PASS-VALUE ++ ++; Missing Name Field ++; RUN: rm %t.missing-name.yaml -rf ++; RUN: sed 's#Name: for.body##g' %S/Inputs/template.yaml > %t.missing-name.yaml ++; RUN: not opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.missing-name.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-NAME-FIELD ++ ++; Missing Name Value ++; RUN: rm %t.missing-value-name.yaml -rf ++; RUN: sed 's#for.body##g' %S/Inputs/template.yaml > %t.missing-value-name.yaml ++; RUN: not opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.missing-value-name.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-NAME-VALUE ++ ++; Missing Function Field ++; RUN: rm %t.missing-function.yaml -rf ++; RUN: sed 's#Function: foo##g' %S/Inputs/template.yaml > %t.missing-function.yaml ++; RUN: not opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' -auto-tuning-input=%t.missing-function.yaml 2>&1 | FileCheck %s -check-prefix=ERROR-FUNCTION-FIELD ++ ++; Missing Function Value ++; RUN: rm %t.missing-value-func.yaml -rf ++; RUN: sed 's#foo##g' %S/Inputs/template.yaml > %t.missing-value-func.yaml ++; RUN: not opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.missing-value-func.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-FUNC-VALUE ++ ++; Missing CodeRegionType Field ++; RUN: rm %t.missing-type.yaml -rf ++; RUN: sed 's#CodeRegionType: loop##g' %S/Inputs/template.yaml > %t.missing-type.yaml ++; RUN: not opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.missing-type.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-CODE-REGION-TYPE-FIELD ++ ++; Missing CodeRegionType Value ++; RUN: rm %t.missing-value-type.yaml -rf ++; RUN: sed 's#loop##g' %S/Inputs/template.yaml > %t.missing-value-type.yaml ++; RUN: not opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.missing-value-type.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-CODE-REGION-TYPE-VALUE ++ ++; Invalid CodeRegionType Value ++; RUN: rm %t.invalid-value-type.yaml -rf ++; RUN: sed 's#loop#error-type#g' %S/Inputs/template.yaml > %t.invalid-value-type.yaml ++; RUN: not opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.invalid-value-type.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-CODE-REGION-TYPE-INVALID ++ ++; Missing Param Name ++; RUN: rm %t.missing-param-name.yaml -rf ++; RUN: sed 's#UnrollCount##g' %S/Inputs/template.yaml > %t.missing-param-name.yaml ++; RUN: not opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.missing-param-name.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-PARAM-NAME ++ ++; Missing Param Value ++; 
RUN: rm %t.missing-value-param.yaml -rf ++; RUN: sed 's#2##g' %S/Inputs/template.yaml > %t.missing-value-param.yaml ++; RUN: not opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.missing-value-param.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-PARAM-VALUE ++ ++; Empty Param List ++; RUN: rm %t.empty-value-param-list.yaml -rf ++; RUN: sed 's#\[test, test2\]#\[\]#g' %S/Inputs/template.yaml > %t.empty-value-param-list.yaml ++; RUN: opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.empty-value-param-list.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=VALID ++ ++; UNSUPPORTED: windows ++ ++define void @foo(i32* nocapture %a) { ++entry: ++ br label %for.body ++ ++for.body: ; preds = %for.body, %entry ++ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ++ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv ++ %0 = load i32, i32* %arrayidx, align 4 ++ %inc = add nsw i32 %0, 1 ++ store i32 %inc, i32* %arrayidx, align 4 ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %exitcond = icmp eq i64 %indvars.iv.next, 64 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ; preds = %for.body ++ ret void ++} ++ ++; check if error messages are shown properly for malformed YAML input files. ++; ++ ++; ERROR-FIELD: error: CodeRegionHash, CodeRegionType, or Pass missing. ++ ++; ERROR-NAME-FIELD: error: Remark Name expected; enable -autotuning-omit-metadata. ++ ++; ERROR-FUNCTION-FIELD: error: Remark Function Name expected; enable -autotuning-omit-metadata. ++ ++; ERROR-PASS-VALUE: error: YAML:2:1: error: expected a value of scalar type. ++; ERROR-PASS-VALUE: Pass: ++ ++; ERROR-NAME-VALUE: error: YAML:3:1: error: expected a value of scalar type. ++; ERROR-NAME-VALUE: Name: ++ ++; ERROR-FUNC-VALUE: error: YAML:4:1: error: expected a value of scalar type. ++; ERROR-FUNC-VALUE: Function: ++ ++; ERROR-CODE-REGION-TYPE-FIELD: CodeRegionHash, CodeRegionType, or Pass missing. ++ ++; ERROR-CODE-REGION-TYPE-VALUE: error: YAML:5:1: error: expected a value of scalar type. ++; ERROR-CODE-REGION-TYPE-VALUE: CodeRegionType: ++ ++; ERROR-CODE-REGION-TYPE-INVALID: Unsupported CodeRegionType:error-type ++ ++; ERROR-PARAM-NAME: error: YAML:8:5: error: argument key is missing. ++; ERROR-PARAM-NAME: - : 2 ++ ++; ERROR-PARAM-VALUE: error: YAML:8:5: error: expected a value of scalar type. ++; ERROR-PARAM-VALUE: - UnrollCount: ++ ++; VALID-NOT: -auto-tuning-input=(input file) option failed. 
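The FileCheck prefixes above pin down both the wording and the priority of the required-field checks. As a rough sketch of that contract (purely illustrative; the real validation lives inside the AutoTuner YAML parser, and firstMissingFieldError is a name we made up), the checks amount to:

#include <map>
#include <optional>
#include <string>

// Hypothetical helper mirroring the error strings the tests expect;
// the keys correspond to the fields of Inputs/template.yaml.
static std::optional<std::string>
firstMissingFieldError(const std::map<std::string, std::string> &Doc) {
  for (const char *Key : {"Pass", "CodeRegionType", "CodeRegionHash"})
    if (!Doc.count(Key))
      return std::string("CodeRegionHash, CodeRegionType, or Pass missing.");
  if (!Doc.count("Name"))
    return std::string("Remark Name expected; enable -autotuning-omit-metadata.");
  if (!Doc.count("Function"))
    return std::string(
        "Remark Function Name expected; enable -autotuning-omit-metadata.");
  return std::nullopt; // all required fields present.
}

Note that a missing field and a field with a missing value are reported differently: the former produces one of the messages above, the latter surfaces as a YAML scalar-type error with a line/column location, as the ERROR-*-VALUE prefixes show.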
+diff --git a/llvm/test/AutoTuning/Error/output-error.ll b/llvm/test/AutoTuning/Error/output-error.ll
+new file mode 100644
+index 000000000000..61ffba50924b
+--- /dev/null
++++ b/llvm/test/AutoTuning/Error/output-error.ll
+@@ -0,0 +1,28 @@
++; RUN: rm %t.opp -rf; touch %t.opp
++; RUN: not opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \
++; RUN: -auto-tuning-opp=%t.opp 2>&1 | FileCheck %s -check-prefix=ERROR-OPP
++
++; UNSUPPORTED: windows
++
++define void @foo(i32* nocapture %a) {
++entry:
++ br label %for.body
++
++for.body: ; preds = %for.body, %entry
++ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
++ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
++ %0 = load i32, i32* %arrayidx, align 4
++ %inc = add nsw i32 %0, 1
++ store i32 %inc, i32* %arrayidx, align 4
++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
++ %exitcond = icmp eq i64 %indvars.iv.next, 64
++ br i1 %exitcond, label %for.end, label %for.body
++
++for.end: ; preds = %for.body
++ ret void
++}
++
++; Check if error messages are shown properly when output files cannot be created.
++;
++; ERROR-OPP: Error generating auto-tuning opportunities.
++; ERROR-OPP: error: Not a directory
+diff --git a/llvm/test/AutoTuning/Error/valid-input.ll b/llvm/test/AutoTuning/Error/valid-input.ll
+new file mode 100644
+index 000000000000..dae90cdbe408
+--- /dev/null
++++ b/llvm/test/AutoTuning/Error/valid-input.ll
+@@ -0,0 +1,27 @@
++; RUN: opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \
++; RUN: -auto-tuning-input=%S/Inputs/template.yaml 2>&1 | \
++; RUN: FileCheck %s -check-prefix=VALID
++; UNSUPPORTED: windows
++
++define void @foo(i32* nocapture %a) {
++entry:
++ br label %for.body
++
++for.body: ; preds = %for.body, %entry
++ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
++ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
++ %0 = load i32, i32* %arrayidx, align 4
++ %inc = add nsw i32 %0, 1
++ store i32 %inc, i32* %arrayidx, align 4
++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
++ %exitcond = icmp eq i64 %indvars.iv.next, 64
++ br i1 %exitcond, label %for.end, label %for.body
++
++for.end: ; preds = %for.body
++ ret void
++}
++
++; Check that no error message is shown when the input is valid.
++;
++
++; VALID-NOT: -auto-tuning-input=(input file) option failed.
+diff --git a/llvm/test/AutoTuning/IncrementalCompilation/Inputs/template.yaml b/llvm/test/AutoTuning/IncrementalCompilation/Inputs/template.yaml
+new file mode 100644
+index 000000000000..a7d390be63e7
+--- /dev/null
++++ b/llvm/test/AutoTuning/IncrementalCompilation/Inputs/template.yaml
+@@ -0,0 +1,9 @@
++--- !AutoTuning
++Pass: [dummy-pass]
++CodeRegionType: [dummy-type]
++Name: foo
++DebugLoc: { File: [dummy-file], Line: 0, Column: 0 }
++Function: foo
++CodeRegionHash: 0
++Invocation: 0
++...
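The incremental-compilation test that follows fills this template in by plain textual substitution. A sketch of what its RUN lines do (lit expands %S, %s and %t to the source directory, the test file and a temp path; file names here are shortened for illustration):

  sed 's#\[dummy-pass\]#inline#g' template.yaml > temp.yaml
  sed 's#\[dummy-type\]#callsite#g' temp.yaml > temp2.yaml
  sed 's#\[dummy-file\]#test.ll#g' temp2.yaml > inc_compile.yaml

after which the document opt parses reads Pass: inline, CodeRegionType: callsite, and DebugLoc: { File: test.ll, Line: 0, Column: 0 }.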
+diff --git a/llvm/test/AutoTuning/IncrementalCompilation/inc-compile-parse-input.ll b/llvm/test/AutoTuning/IncrementalCompilation/inc-compile-parse-input.ll
+new file mode 100644
+index 000000000000..b9dc81089d40
+--- /dev/null
++++ b/llvm/test/AutoTuning/IncrementalCompilation/inc-compile-parse-input.ll
+@@ -0,0 +1,103 @@
++; REQUIRES: asserts
++; RUN: rm %t.output -rf
++; RUN: rm %t.inc_compile.yaml -rf
++; RUN: sed 's#\[dummy-pass\]#inline#g' %S/Inputs/template.yaml > %t.temp.yaml
++; RUN: sed 's#\[dummy-type\]#callsite#g' %t.temp.yaml > %t.temp2.yaml
++; RUN: sed 's#\[dummy-file\]#%s#g' %t.temp2.yaml > %t.inc_compile.yaml
++; RUN: opt -O3 %s -auto-tuning-input=%t.inc_compile.yaml \
++; RUN: -auto-tuning-compile-mode=CoarseGrain -print-after-all \
++; RUN: -debug-only=autotuning-compile \
++; RUN: -o %t.output 2>&1 | \
++; RUN: FileCheck %s -check-prefix=COARSEGRAIN
++
++; RUN: rm %t.output -rf
++; RUN: rm %t.inc_compile.yaml -rf
++; RUN: sed 's#\[dummy-pass\]#inline#g' %S/Inputs/template.yaml > %t.temp.yaml
++; RUN: sed 's#\[dummy-type\]#callsite#g' %t.temp.yaml > %t.temp2.yaml
++; RUN: sed 's#\[dummy-file\]#%s#g' %t.temp2.yaml > %t.inc_compile.yaml
++; RUN: opt -O3 %s -auto-tuning-input=%t.inc_compile.yaml \
++; RUN: -auto-tuning-compile-mode=FineGrain -print-after-all \
++; RUN: -debug-only=autotuning-compile \
++; RUN: -o %t.output 2>&1 | \
++; RUN: FileCheck %s -check-prefixes=FINEGRAIN-1,FINEGRAIN-INLINE
++
++; RUN: rm %t.output -rf
++; RUN: rm %t.inc_compile.yaml -rf
++; RUN: sed 's#\[dummy-pass\]#loop-unroll#g' %S/Inputs/template.yaml > %t.temp.yaml
++; RUN: sed 's#\[dummy-type\]#loop#g' %t.temp.yaml > %t.temp2.yaml
++; RUN: sed 's#\[dummy-file\]#%s#g' %t.temp2.yaml > %t.inc_compile.yaml
++; RUN: opt -O3 %s -auto-tuning-input=%t.inc_compile.yaml \
++; RUN: -auto-tuning-compile-mode=FineGrain -print-after-all \
++; RUN: -debug-only=autotuning-compile \
++; RUN: -o %t.output 2>&1 | \
++; RUN: FileCheck %s -check-prefixes=FINEGRAIN-1,FINEGRAIN-2,FINEGRAIN-UNROLL
++
++; ModuleID = 'test.c'
++source_filename = "test.c"
++target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
++target triple = "aarch64-unknown-linux-gnu"
++
++; Function Attrs: argmemonly nofree norecurse nosync nounwind uwtable
++define dso_local i32 @test(i32* nocapture noundef %a, i32* nocapture noundef readonly %b, i32 noundef %size) local_unnamed_addr #0 {
++entry:
++ %cmp11 = icmp sgt i32 %size, 0
++ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
++
++for.body.preheader: ; preds = %entry
++ %wide.trip.count = zext i32 %size to i64
++ br label %for.body
++
++for.cond.cleanup: ; preds = %for.body, %entry
++ ret i32 undef
++
++for.body: ; preds = %for.body.preheader, %for.body
++ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
++ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
++ %0 = load i32, i32* %arrayidx, align 4
++ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
++ %1 = load i32, i32* %arrayidx2, align 4
++ %add = add nsw i32 %1, %0
++ store i32 %add, i32* %arrayidx2, align 4
++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
++ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
++ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
++}
++
++attributes #0 = { argmemonly nofree norecurse nosync nounwind uwtable "frame-pointer"="non-leaf" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon,+v8a" }
++
++!llvm.dbg.cu = !{!0}
++!llvm.module.flags = !{!3, !4, !5, !6, !7, !8}
++!llvm.ident = !{!9}
++
++!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Huawei BiSheng Compiler clang version 12.0.0 (1c7b819ced36)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None)
++!1 = !DIFile(filename: "test.c", directory: "/home/m00629332/code/autoTuner")
++!2 = !{}
++!3 = !{i32 2, !"Debug Info Version", i32 3}
++!4 = !{i32 1, !"wchar_size", i32 4}
++!5 = !{i32 1, !"branch-target-enforcement", i32 0}
++!6 = !{i32 1, !"sign-return-address", i32 0}
++!7 = !{i32 1, !"sign-return-address-all", i32 0}
++!8 = !{i32 1, !"sign-return-address-with-bkey", i32 0}
++!9 = !{!"Huawei BiSheng Compiler clang version 12.0.0 (1c7b819ced36)"}
++!10 = distinct !DISubprogram(name: "dummy", scope: !1, file: !1, line: 1, type: !11, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
++!11 = !DISubroutineType(types: !2)
++!12 = !DILocation(line: 2, column: 5, scope: !10)
++
++; COARSEGRAIN: AutoTuningCompile: Deciding to enable/disable optimization of module/functions. Pass: start
++; COARSEGRAIN-NEXT: AutoTuningCompile: No change in opt pipeline for Basic/CoarseGrain incremental compilation mode.
++; COARSEGRAIN-NOT: Skip pass {{.*}}: True
++
++; FINEGRAIN-1: AutoTuningCompile: Deciding to enable/disable optimization of module/functions. Pass: start
++; FINEGRAIN-1-NEXT: AutoTuningCompile: SkipPasses enabled.
++; FINEGRAIN-1-NOT: Skip pass {{.*}}: False
++; FINEGRAIN-1: AutoTuningCompile: Deciding to enable/disable optimization of module/functions. Pass: inline
++; FINEGRAIN-INLINE: AutoTuningCompile: SkipPasses disabled.
++; FINEGRAIN-INLINE: Skip pass 'InlinerPass': False
++; FINEGRAIN-INLINE-NEXT: *** IR Dump After InlinerPass
++; FINEGRAIN-INLINE-NOT: Skip pass {{.*}}: True
++
++; FINEGRAIN-2: AutoTuningCompile: Old decision (SkipPasses = True ) continued.
++; FINEGRAIN-2-NOT: Skip pass {{.*}}: False
++; FINEGRAIN-2: AutoTuningCompile: Deciding to enable/disable optimization of module/functions. Pass: loop-unroll
++; FINEGRAIN-UNROLL: AutoTuningCompile: SkipPasses disabled.
++; FINEGRAIN-UNROLL-NOT: Skip pass {{.*}}: True
+diff --git a/llvm/test/AutoTuning/Inline/Inputs/template.yaml b/llvm/test/AutoTuning/Inline/Inputs/template.yaml
+new file mode 100644
+index 000000000000..e04612183d1f
+--- /dev/null
++++ b/llvm/test/AutoTuning/Inline/Inputs/template.yaml
+@@ -0,0 +1,9 @@
++--- !AutoTuning
++Pass: inline
++Name: simpleFunction-entry
++Function: bar
++CodeRegionType: callsite
++CodeRegionHash: 5550568187071847048
++Args:
++ - ForceInline: [force-inline]
++...
+diff --git a/llvm/test/AutoTuning/Inline/Inputs/template_no_metadata.yaml b/llvm/test/AutoTuning/Inline/Inputs/template_no_metadata.yaml
+new file mode 100644
+index 000000000000..9fc88f56d6bc
+--- /dev/null
++++ b/llvm/test/AutoTuning/Inline/Inputs/template_no_metadata.yaml
+@@ -0,0 +1,7 @@
++--- !AutoTuning
++Pass: inline
++CodeRegionType: callsite
++CodeRegionHash: 5550568187071847048
++Args:
++ - ForceInline: [force-inline]
++...
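For orientation: after the substitution used by the force-inline test further below (sed 's#\[force-inline\]#true#g'), the tuning input that opt actually consumes would read as follows. This is derived from the metadata-carrying template above for illustration, not verbatim patch content:

  --- !AutoTuning
  Pass: inline
  Name: simpleFunction-entry
  Function: bar
  CodeRegionType: callsite
  CodeRegionHash: 5550568187071847048
  Args:
   - ForceInline: true
  ...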
+diff --git a/llvm/test/AutoTuning/Inline/duplicate-calls.ll b/llvm/test/AutoTuning/Inline/duplicate-calls.ll +new file mode 100644 +index 000000000000..ad32262ad044 +--- /dev/null ++++ b/llvm/test/AutoTuning/Inline/duplicate-calls.ll +@@ -0,0 +1,96 @@ ++; RUN: rm %t.duplicate_calls -rf ++; RUN: opt %s -S -passes='cgscc(inline)' -auto-tuning-opp=%t.duplicate_calls \ ++; RUN: -auto-tuning-type-filter=CallSite --disable-output ++; RUN: FileCheck %s --input-file %t.duplicate_calls/duplicate-calls.ll.yaml ++ ++; ModuleID = 'duplicate-calls.c' ++source_filename = "duplicate-calls.c" ++target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ++target triple = "aarch64-unknown-linux-gnu" ++ ++; Function Attrs: nounwind uwtable ++define dso_local void @bar(i32* nocapture %result, i32* %cfb, i32 %bytes) local_unnamed_addr #0 !dbg !10 { ++entry: ++ %call = tail call i32 @test(i32* %cfb, i32 %bytes) #1, !dbg !12 ++ store i32 %call, i32* %result, align 4, !dbg !13, !tbaa !14 ++ ret void, !dbg !18 ++} ++ ++declare dso_local i32 @test(i32*, i32) local_unnamed_addr #0 ++ ++; Function Attrs: nounwind uwtable ++define dso_local void @foo(i32* %cfb, i32* readnone %saved, i32* nocapture %result, i32 %bytes) local_unnamed_addr #0 !dbg !19 { ++entry: ++ %tobool.not = icmp eq i32* %cfb, null, !dbg !20 ++ br i1 %tobool.not, label %if.else, label %if.then.split, !dbg !20 ++ ++if.then.split: ; preds = %entry ++ tail call void @bar(i32* %result, i32* nonnull %cfb, i32 %bytes), !dbg !21 ++ br label %return, !dbg !22 ++ ++if.else: ; preds = %entry ++ %tobool1.not = icmp eq i32* %saved, null, !dbg !23 ++ br i1 %tobool1.not, label %if.else.split, label %return, !dbg !23 ++ ++if.else.split: ; preds = %if.else ++ tail call void @bar(i32* %result, i32* null, i32 %bytes), !dbg !21 ++ br label %return, !dbg !23 ++ ++return: ; preds = %if.then.split, %if.else.split, %if.else ++ ret void, !dbg !24 ++} ++ ++attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } ++attributes #1 = { nounwind } ++ ++!llvm.dbg.cu = !{!0} ++!llvm.module.flags = !{!3, !4, !5, !6, !7, !8} ++!llvm.ident = !{!9} ++ ++!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Huawei BiSheng Compiler clang version 12.0.0 (clang-0d5d71fe6c22 flang-8b17fc131076)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None) ++!1 = !DIFile(filename: "duplicate-calls.c", directory: "/home/m00629332/benchmarks/cBench/source/security_pgp_d/src") ++!2 = !{} ++!3 = !{i32 2, !"Debug Info Version", i32 3} ++!4 = !{i32 1, !"wchar_size", i32 4} ++!5 = !{i32 1, !"branch-target-enforcement", i32 0} ++!6 = !{i32 1, !"sign-return-address", i32 0} ++!7 = !{i32 1, !"sign-return-address-all", i32 0} ++!8 = !{i32 1, !"sign-return-address-with-bkey", i32 0} ++!9 = !{!"Huawei BiSheng Compiler clang version 12.0.0 (clang-0d5d71fe6c22 flang-8b17fc131076)"} ++!10 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !11, scopeLine: 8, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) ++!11 = !DISubroutineType(types: !2) ++!12 = !DILocation(line: 10, 
column: 16, scope: !10)
++!13 = !DILocation(line: 10, column: 14, scope: !10)
++!14 = !{!15, !15, i64 0}
++!15 = !{!"int", !16, i64 0}
++!16 = !{!"omnipotent char", !17, i64 0}
++!17 = !{!"Simple C/C++ TBAA"}
++!18 = !DILocation(line: 14, column: 1, scope: !10)
++!19 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 17, type: !11, scopeLine: 18, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
++!20 = !DILocation(line: 22, column: 6, scope: !19)
++!21 = !DILocation(line: 27, column: 2, scope: !19)
++!22 = !DILocation(line: 23, column: 3, scope: !19)
++!23 = !DILocation(line: 24, column: 11, scope: !19)
++!24 = !DILocation(line: 28, column: 1, scope: !19)
++
++; CHECK: --- !AutoTuning
++; CHECK-NEXT: Pass: inline
++; CHECK-NEXT: Name: bar-if.then.split
++; CHECK-NEXT: DebugLoc: { File: duplicate-calls.c, Line: 27, Column: 2 }
++; CHECK-NEXT: Function: foo
++; CHECK-NEXT: CodeRegionType: callsite
++; CHECK-NEXT: CodeRegionHash:
++; CHECK-NEXT: DynamicConfigs: { ForceInline: [0, 1] }
++; CHECK-NEXT: BaselineConfig: { ForceInline: '1' }
++; CHECK-NEXT: Invocation: 0
++; CHECK-NEXT: ...
++; CHECK-NEXT: --- !AutoTuning
++; CHECK-NEXT: Pass: inline
++; CHECK-NEXT: Name: bar-if.else.split
++; CHECK-NEXT: DebugLoc: { File: duplicate-calls.c, Line: 27, Column: 2 }
++; CHECK-NEXT: Function: foo
++; CHECK-NEXT: CodeRegionType: callsite
++; CHECK-NEXT: CodeRegionHash:
++; CHECK-NEXT: DynamicConfigs: { ForceInline: [0, 1] }
++; CHECK-NEXT: BaselineConfig: { ForceInline: '1' }
++; CHECK-NEXT: Invocation: 0
+diff --git a/llvm/test/AutoTuning/Inline/force-inline.ll b/llvm/test/AutoTuning/Inline/force-inline.ll
+new file mode 100644
+index 000000000000..cedfc8df3483
+--- /dev/null
++++ b/llvm/test/AutoTuning/Inline/force-inline.ll
+@@ -0,0 +1,84 @@
++; REQUIRES: asserts
++; RUN: opt < %s -passes=inline -debug-only=inline -disable-output -S 2>&1 | FileCheck %s -check-prefix=DEFAULT
++; simpleFunction will be inlined with the default behavior.
++
++; RUN: rm %t.force-inline.yaml -rf
++; RUN: sed 's#\[force-inline\]#true#g' %S/Inputs/template.yaml > %t.force-inline.yaml
++; RUN: opt %s -passes=inline -debug-only=inline -disable-output -S \
++; RUN: -auto-tuning-input=%t.force-inline.yaml 2>&1 | \
++; RUN: FileCheck %s -check-prefix=FORCE-INLINE
++; Test with ForceInline=true;
++
++; RUN: rm %t.force-inline.yaml -rf
++; RUN: sed 's#\[force-inline\]#true#g' %S/Inputs/template_no_metadata.yaml > %t.force-inline.yaml
++; RUN: opt %s -passes=inline -S -auto-tuning-input=%t.force-inline.yaml \
++; RUN: -debug-only=inline -disable-output -auto-tuning-omit-metadata 2>&1 | \
++; RUN: FileCheck %s -check-prefix=FORCE-INLINE
++; Test with ForceInline=true;
++
++; RUN: rm %t.no-inline.yaml -rf
++; RUN: sed 's#\[force-inline\]#false#g' %S/Inputs/template.yaml > %t.no-inline.yaml
++; RUN: opt %s -passes=inline -debug-only=inline -disable-output -S \
++; RUN: -auto-tuning-input=%t.no-inline.yaml 2>&1 | \
++; RUN: FileCheck %s -check-prefix=NO-INLINE
++; Test with ForceInline=false;
++
++; RUN: rm %t.no-inline.yaml -rf
++; RUN: sed 's#\[force-inline\]#false#g' %S/Inputs/template_no_metadata.yaml > %t.no-inline.yaml
++; RUN: opt %s -passes='cgscc(inline)' -debug-only=inline -disable-output -S \
++; RUN: -auto-tuning-input=%t.no-inline.yaml -auto-tuning-omit-metadata 2>&1 | \
++; RUN: FileCheck %s -check-prefix=NO-INLINE
++; Test with ForceInline=false;
++
++@a = global i32 4
++
++; Function Attrs: nounwind readnone uwtable
++define i32 @simpleFunction(i32 %a) #0 {
++entry:
++ call void @extern()
++ %a1 = load volatile i32, i32* @a
++ %x1 = add i32 %a1, %a1
++ %a2 = load volatile i32, i32* @a
++ %x2 = add i32 %x1, %a2
++ %a3 = load volatile i32, i32* @a
++ %x3 = add i32 %x2, %a3
++ %a4 = load volatile i32, i32* @a
++ %x4 = add i32 %x3, %a4
++ %a5 = load volatile i32, i32* @a
++ %x5 = add i32 %x4, %a5
++ %a6 = load volatile i32, i32* @a
++ %x6 = add i32 %x5, %a6
++ %a7 = load volatile i32, i32* @a
++ %x7 = add i32 %x6, %a6
++ %a8 = load volatile i32, i32* @a
++ %x8 = add i32 %x7, %a8
++ %a9 = load volatile i32, i32* @a
++ %x9 = add i32 %x8, %a9
++ %a10 = load volatile i32, i32* @a
++ %x10 = add i32 %x9, %a10
++ %a11 = load volatile i32, i32* @a
++ %x11 = add i32 %x10, %a11
++ %a12 = load volatile i32, i32* @a
++ %x12 = add i32 %x11, %a12
++ %add = add i32 %x12, %a
++ ret i32 %add
++}
++
++; Function Attrs: nounwind readnone uwtable
++define i32 @bar(i32 %a) #0 {
++entry:
++ %0 = tail call i32 @simpleFunction(i32 6)
++ ret i32 %0
++}
++
++declare void @extern()
++
++attributes #0 = { nounwind readnone uwtable }
++attributes #1 = { nounwind cold readnone uwtable }
++
++; DEFAULT: Inlining (cost=120, threshold=337)
++; DEFAULT-SAME: simpleFunction
++; FORCE-INLINE: Inlining (cost=always): Force inlined by auto-tuning
++; FORCE-INLINE-SAME: simpleFunction
++; NO-INLINE: NOT Inlining (cost=never): Force non-inlined by auto-tuning
++; NO-INLINE-SAME: simpleFunction
+diff --git a/llvm/test/AutoTuning/Inline/inline-attribute.ll b/llvm/test/AutoTuning/Inline/inline-attribute.ll
+new file mode 100644
+index 000000000000..50f583d0a51e
+--- /dev/null
++++ b/llvm/test/AutoTuning/Inline/inline-attribute.ll
+@@ -0,0 +1,85 @@
++; RUN: rm %t.inline_opp -rf
++; RUN: opt %s -S -passes='cgscc(inline)' -auto-tuning-opp=%t.inline_opp -auto-tuning-type-filter=CallSite --disable-output
++; RUN: FileCheck %s --input-file %t.inline_opp/inline-attribute.ll.yaml -check-prefix=TEST-1
++; RUN: FileCheck %s --input-file %t.inline_opp/inline-attribute.ll.yaml -check-prefix=TEST-2
++
++;
ModuleID = 'inline.c' ++source_filename = "inline.c" ++target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ++target triple = "aarch64-unknown-linux-gnu" ++ ++; Function Attrs: noinline norecurse nounwind readnone uwtable willreturn ++define dso_local i32 @mul(i32 %a) local_unnamed_addr #0 !dbg !10 { ++entry: ++ %mul = mul nsw i32 %a, %a, !dbg !12 ++ ret i32 %mul, !dbg !13 ++} ++ ++; Function Attrs: alwaysinline nounwind uwtable ++define dso_local i32 @add(i32 %a) local_unnamed_addr #1 !dbg !14 { ++entry: ++ %add = shl nsw i32 %a, 1, !dbg !15 ++ ret i32 %add, !dbg !16 ++} ++ ++; Function Attrs: nounwind uwtable ++define dso_local i32 @inc(i32 %a) local_unnamed_addr #2 !dbg !17 { ++entry: ++ %inc = add nsw i32 %a, 1, !dbg !18 ++ ret i32 %inc, !dbg !19 ++} ++ ++; Function Attrs: nounwind uwtable ++define dso_local i32 @func(i32 %a) local_unnamed_addr #2 !dbg !20 { ++entry: ++ %call = call i32 @add(i32 %a), !dbg !21 ++ %call1 = call i32 @mul(i32 %a), !dbg !22 ++ %add = add nsw i32 %call, %call1, !dbg !23 ++ %call2 = call i32 @inc(i32 %a), !dbg !24 ++ %add3 = add nsw i32 %add, %call2, !dbg !25 ++ ret i32 %add3, !dbg !26 ++} ++ ++attributes #0 = { noinline norecurse nounwind readnone uwtable willreturn "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } ++attributes #1 = { alwaysinline nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } ++attributes #2 = { nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } ++ ++!llvm.dbg.cu = !{!0} ++!llvm.module.flags = !{!3, !4, !5, !6, !7, !8} ++!llvm.ident = !{!9} ++ ++!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Huawei Bisheng Compiler clang version 12.0.0 (729941c4adfa)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None) ++!1 = !DIFile(filename: "test.c", directory: "/home/m00629332/code/autoTuner/ir-hashing") ++!2 = !{} ++!3 = !{i32 2, !"Debug Info Version", i32 3} ++!4 = !{i32 1, !"wchar_size", i32 4} ++!5 = !{i32 1, !"branch-target-enforcement", i32 0} ++!6 = !{i32 1, !"sign-return-address", i32 0} ++!7 = !{i32 1, !"sign-return-address-all", i32 0} ++!8 = !{i32 1, !"sign-return-address-with-bkey", i32 0} ++!9 = !{!"Huawei Bisheng Compiler clang version 12.0.0 (729941c4adfa)"} ++!10 = distinct !DISubprogram(name: "mul", scope: !1, file: !1, line: 2, type: !11, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) ++!11 = !DISubroutineType(types: !2) 
++!12 = !DILocation(line: 3, column: 13, scope: !10)
++!13 = !DILocation(line: 3, column: 5, scope: !10)
++!14 = distinct !DISubprogram(name: "add", scope: !1, file: !1, line: 7, type: !11, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
++!15 = !DILocation(line: 8, column: 13, scope: !14)
++!16 = !DILocation(line: 8, column: 5, scope: !14)
++!17 = distinct !DISubprogram(name: "inc", scope: !1, file: !1, line: 11, type: !11, scopeLine: 11, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
++!18 = !DILocation(line: 12, column: 12, scope: !17)
++!19 = !DILocation(line: 12, column: 5, scope: !17)
++!20 = distinct !DISubprogram(name: "func", scope: !1, file: !1, line: 15, type: !11, scopeLine: 15, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
++!21 = !DILocation(line: 16, column: 12, scope: !20)
++!22 = !DILocation(line: 16, column: 19, scope: !20)
++!23 = !DILocation(line: 16, column: 18, scope: !20)
++!24 = !DILocation(line: 16, column: 26, scope: !20)
++!25 = !DILocation(line: 16, column: 25, scope: !20)
++!26 = !DILocation(line: 16, column: 5, scope: !20)
++
++; TEST-1: Pass: inline
++; TEST-1-NOT: Pass: inline
++
++; TEST-2: Name: inc
++; TEST-2-NEXT: DebugLoc: { File: test.c, Line: 16, Column: 26 }
++; TEST-2-NEXT: Function: func
++; TEST-2-NEXT: CodeRegionType: callsite
+diff --git a/llvm/test/AutoTuning/Inline/opp.ll b/llvm/test/AutoTuning/Inline/opp.ll
+new file mode 100644
+index 000000000000..dfe1dac29476
+--- /dev/null
++++ b/llvm/test/AutoTuning/Inline/opp.ll
+@@ -0,0 +1,64 @@
++; RUN: rm %t.callsite_opp -rf
++; RUN: sed 's#\[number\]#25#g; s#\[func_name\]#ColdFunction#g' %S/Inputs/template.yaml > %t.template25.yaml
++; RUN: opt %s -passes=inline -S -auto-tuning-opp=%t.callsite_opp -auto-tuning-type-filter=CallSite
++
++; RUN: FileCheck %s --input-file %t.callsite_opp/opp.ll.yaml -check-prefix=CALLSITE
++
++@a = global i32 4
++
++declare void @extern()
++; Function Attrs: nounwind readnone uwtable
++define i32 @simpleFunction(i32 %a) #1 {
++entry:
++ call void @extern()
++ %a1 = load volatile i32, i32* @a
++ %x1 = add i32 %a1, %a1
++ %a2 = load volatile i32, i32* @a
++ %x2 = add i32 %x1, %a2
++ %a3 = load volatile i32, i32* @a
++ %x3 = add i32 %x2, %a3
++ %a4 = load volatile i32, i32* @a
++ %x4 = add i32 %x3, %a4
++ %a5 = load volatile i32, i32* @a
++ %x5 = add i32 %x4, %a5
++ %a6 = load volatile i32, i32* @a
++ %x6 = add i32 %x5, %a6
++ %a7 = load volatile i32, i32* @a
++ %x7 = add i32 %x6, %a6
++ %a8 = load volatile i32, i32* @a
++ %x8 = add i32 %x7, %a8
++ %a9 = load volatile i32, i32* @a
++ %x9 = add i32 %x8, %a9
++ %a10 = load volatile i32, i32* @a
++ %x10 = add i32 %x9, %a10
++ %a11 = load volatile i32, i32* @a
++ %x11 = add i32 %x10, %a11
++ %a12 = load volatile i32, i32* @a
++ %x12 = add i32 %x11, %a12
++ %add = add i32 %x12, %a
++ ret i32 %add
++}
++
++define i32 @bar(i32 %a) #0 {
++entry:
++ %0 = tail call i32 @simpleFunction(i32 6)
++ ret i32 %0
++}
++
++attributes #0 = { nounwind readnone uwtable }
++attributes #1 = { nounwind cold readnone uwtable }
++
++; Check if code regions are properly generated as tuning opportunities.
++; CALLSITE: --- !AutoTuning
++; CALLSITE-NEXT: Pass: inline
++; CALLSITE-NEXT: Name: simpleFunction
++; CALLSITE-NEXT: Function: bar
++; CALLSITE-NEXT: CodeRegionType: callsite
++; CALLSITE-NEXT: CodeRegionHash: {{[0-9]+}}
++; CALLSITE-NEXT: DynamicConfigs: { ForceInline: [0, 1] }
++; CALLSITE-NEXT: BaselineConfig: { ForceInline: '1' }
++; CALLSITE-NEXT: Invocation: 0
++; CALLSITE-NEXT: ...
++
++; Check if external functions are filtered out.
++; EXTERNAL-NOT: Name: extern
+diff --git a/llvm/test/AutoTuning/LoopUnroll/Inputs/debug_loc_template.yaml b/llvm/test/AutoTuning/LoopUnroll/Inputs/debug_loc_template.yaml
+new file mode 100644
+index 000000000000..6dc49a1f7dc2
+--- /dev/null
++++ b/llvm/test/AutoTuning/LoopUnroll/Inputs/debug_loc_template.yaml
+@@ -0,0 +1,10 @@
++--- !AutoTuning
++Pass: loop-unroll
++Name: for.cond
++DebugLoc: { File: loop-opp.c, Line: 4, Column: 5 }
++Function: foo
++CodeRegionType: loop
++Args:
++ - UnrollCount: [number]
++Invocation: 0
++...
+diff --git a/llvm/test/AutoTuning/LoopUnroll/Inputs/loop_nest.yaml b/llvm/test/AutoTuning/LoopUnroll/Inputs/loop_nest.yaml
+new file mode 100644
+index 000000000000..4920329dbd4b
+--- /dev/null
++++ b/llvm/test/AutoTuning/LoopUnroll/Inputs/loop_nest.yaml
+@@ -0,0 +1,10 @@
++# CodeRegionHash is correct for the first code region only.
++!AutoTuning {Args: {UnrollCount: 2}, CodeRegionHash: 8456922293277663707, CodeRegionType: loop,
++ DebugLoc: {Column: 8, File: loop-nest.c, Line: 10}, Function: loop_nest, Invocation: 0,
++ Name: for.body6.us, Pass: loop-unroll}
++--- !AutoTuning {Args: {UnrollCount: 4}, CodeRegionHash: 8456922293277663707, CodeRegionType: loop,
++ DebugLoc: {Column: 5, File: loop-nest.c, Line: 9}, Function: loop_nest, Invocation: 0,
++ Name: for.cond4.preheader.us, Pass: loop-unroll}
++--- !AutoTuning {Args: {UnrollCount: 4}, CodeRegionHash: 8456922293277663707, CodeRegionType: loop,
++ DebugLoc: {Column: 3, File: loop-nest.c, Line: 8}, Function: loop_nest, Invocation: 0,
++ Name: for.cond1.preheader, Pass: loop-unroll}
+diff --git a/llvm/test/AutoTuning/LoopUnroll/Inputs/loop_peel.yaml b/llvm/test/AutoTuning/LoopUnroll/Inputs/loop_peel.yaml
+new file mode 100644
+index 000000000000..a90cebbce88f
+--- /dev/null
++++ b/llvm/test/AutoTuning/LoopUnroll/Inputs/loop_peel.yaml
+@@ -0,0 +1,9 @@
++--- !AutoTuning
++Pass: loop-unroll
++Name: loop
++Function: invariant_backedge_1
++CodeRegionType: loop
++Args:
++ - UnrollCount: [number]
++Invocation: 0
++...
+diff --git a/llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_raw_template.yaml b/llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_raw_template.yaml
+new file mode 100644
+index 000000000000..18681a0e2efe
+--- /dev/null
++++ b/llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_raw_template.yaml
+@@ -0,0 +1,10 @@
++--- !AutoTuning
++Pass: loop-unroll
++Name: label %5
++Function: main
++CodeRegionType: loop
++CodeRegionHash: [hash]
++Args:
++- UnrollCount: [number]
++Invocation: 1
++...
+diff --git a/llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_template.yaml b/llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_template.yaml
+new file mode 100644
+index 000000000000..166f877a232e
+--- /dev/null
++++ b/llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_template.yaml
+@@ -0,0 +1,10 @@
++--- !AutoTuning
++Pass: loop-unroll
++Name: [name]
++Function: foo
++CodeRegionType: loop
++CodeRegionHash: [hash]
++Args:
++ - UnrollCount: [number]
++Invocation: 1
++...
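As a concrete instantiation of this template: the unroll.ll test further below fills in the placeholders with, e.g., sed 's#\[number\]#4#g; s#\[name\]#for.body#g; s#\[hash\]#14791762861362113823#g', which would produce the following document (derived here for illustration, not verbatim patch content):

  --- !AutoTuning
  Pass: loop-unroll
  Name: for.body
  Function: foo
  CodeRegionType: loop
  CodeRegionHash: 14791762861362113823
  Args:
   - UnrollCount: 4
  Invocation: 1
  ...

This instructs the loop-unroll pass to unroll the matching loop in @foo by a factor of 4.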
+diff --git a/llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_template_no_metadata.yaml b/llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_template_no_metadata.yaml
+new file mode 100644
+index 000000000000..b626473cf782
+--- /dev/null
++++ b/llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_template_no_metadata.yaml
+@@ -0,0 +1,8 @@
++--- !AutoTuning
++Pass: loop-unroll
++CodeRegionType: loop
++CodeRegionHash: [hash]
++Args:
++ - UnrollCount: [number]
++Invocation: 1
++...
+diff --git a/llvm/test/AutoTuning/LoopUnroll/debug_loc.ll b/llvm/test/AutoTuning/LoopUnroll/debug_loc.ll
+new file mode 100644
+index 000000000000..85dd690d01c5
+--- /dev/null
++++ b/llvm/test/AutoTuning/LoopUnroll/debug_loc.ll
+@@ -0,0 +1,161 @@
++; RUN: opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' | \
++; RUN: FileCheck %s -check-prefix=DISABLE
++
++; RUN: rm %t.unroll_debug_loc0.yaml -rf
++; RUN: sed 's#\[number\]#0#g' %S/Inputs/debug_loc_template.yaml > %t.unroll_debug_loc0.yaml
++; RUN: opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \
++; RUN: -auto-tuning-input=%t.unroll_debug_loc0.yaml | \
++; RUN: FileCheck %s -check-prefix=UNROLL0
++
++; RUN: rm %t.unroll_debug_loc4.yaml -rf
++; RUN: sed 's#\[number\]#4#g' %S/Inputs/debug_loc_template.yaml > %t.unroll_debug_loc4.yaml
++; RUN: opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \
++; RUN: -auto-tuning-code-region-matching-hash=false \
++; RUN: -auto-tuning-input=%t.unroll_debug_loc4.yaml | \
++; RUN: FileCheck %s -check-prefix=UNROLL4
++
++; RUN: rm %t.unroll4.yaml -rf
++; RUN: sed 's#\[number\]#4#g; s#\[name\]#for.cond#g; s#\[hash\]#11552168367013316892#g;'\
++; RUN: %S/Inputs/unroll_template.yaml > %t.unroll4.yaml
++; RUN: opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \
++; RUN: -auto-tuning-code-region-matching-hash=false \
++; RUN: -auto-tuning-input=%t.unroll4.yaml | \
++; RUN: FileCheck %s -check-prefix=UNROLL4-MISMATCH
++
++; UNSUPPORTED: windows
++
++; ModuleID = 'loop-opp.c'
++source_filename = "loop-opp.c"
++target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
++target triple = "x86_64-unknown-linux-gnu"
++
++; Function Attrs: noinline nounwind uwtable
++define i32 @foo(i32* %n) #0 !dbg !6 {
++entry:
++ %n.addr = alloca i32*, align 8
++ %b = alloca i32, align 4
++ %i = alloca i32, align 4
++ store i32* %n, i32** %n.addr, align 8
++ call void @llvm.dbg.declare(metadata i32** %n.addr, metadata !11, metadata !12), !dbg !13
++ call void @llvm.dbg.declare(metadata i32* %b, metadata !14, metadata !12), !dbg !15
++ store i32 0, i32* %b, align 4, !dbg !15
++ call void @llvm.dbg.declare(metadata i32* %i, metadata !16, metadata !12), !dbg !18
++ store i32 0, i32* %i, align 4, !dbg !18
++ br label %for.cond, !dbg !19
++
++for.cond: ; preds = %for.inc, %entry
++ %0 = load i32, i32* %i, align 4, !dbg !20
++ %1 = load i32*, i32** %n.addr, align 8, !dbg !23
++ %2 = load i32, i32* %1, align 4, !dbg !24
++ %cmp = icmp slt i32 %0, %2, !dbg !25
++ br i1 %cmp, label %for.body, label %for.end, !dbg !26
++
++for.body: ; preds = %for.cond
++ %3 = load i32, i32* %b, align 4, !dbg !28
++ %add = add nsw i32 %3, 1, !dbg !30
++ store i32 %add, i32* %b, align 4, !dbg !31
++ br label %for.inc, !dbg !32
++
++for.inc: ; preds = %for.body
++ %4 = load i32, i32* %i, align 4, !dbg !33
++ %inc = add nsw i32 %4, 1, !dbg !33
++ store i32 %inc, i32* %i, align 4, !dbg !33
++ br label %for.cond, !dbg !35, !llvm.loop !36
++
++for.end: ; preds = %for.cond
++ %5 = load i32, i32* %b, align 4, !dbg !39
++ ret i32 %5, !dbg !40
++}
++
++; Function Attrs: nounwind readnone
++declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
++
++attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #1 = { nounwind readnone }
++
++!llvm.dbg.cu = !{!0}
++!llvm.module.flags = !{!3, !4}
++!llvm.ident = !{!5}
++
++!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
++!1 = !DIFile(filename: "loop-opp.c", directory: "")
++!2 = !{}
++!3 = !{i32 2, !"Dwarf Version", i32 4}
++!4 = !{i32 2, !"Debug Info Version", i32 3}
++!5 = !{!""}
++!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
++!7 = !DISubroutineType(types: !8)
++!8 = !{!9, !10}
++!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
++!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64)
++!11 = !DILocalVariable(name: "n", arg: 1, scope: !6, file: !1, line: 1, type: !10)
++!12 = !DIExpression()
++!13 = !DILocation(line: 1, column: 20, scope: !6)
++!14 = !DILocalVariable(name: "b", scope: !6, file: !1, line: 3, type: !9)
++!15 = !DILocation(line: 3, column: 9, scope: !6)
++!16 = !DILocalVariable(name: "i", scope: !17, file: !1, line: 4, type: !9)
++!17 = distinct !DILexicalBlock(scope: !6, file: !1, line: 4, column: 5)
++!18 = !DILocation(line: 4, column: 14, scope: !17)
++!19 = !DILocation(line: 4, column: 10, scope: !17)
++!20 = !DILocation(line: 4, column: 20, scope: !21)
++!21 = !DILexicalBlockFile(scope: !22, file: !1, discriminator: 1)
++!22 = distinct !DILexicalBlock(scope: !17, file: !1, line: 4, column: 5)
++!23 = !DILocation(line: 4, column: 25, scope: !21)
++!24 = !DILocation(line: 4, column: 24, scope: !21)
++!25 = !DILocation(line: 4, column: 22, scope: !21)
++!26 = !DILocation(line: 4, column: 5, scope: !27)
++!27 = !DILexicalBlockFile(scope: !17, file: !1, discriminator: 1)
++!28 = !DILocation(line: 6, column: 11, scope: !29)
++!29 = distinct !DILexicalBlock(scope: !22, file: !1, line: 5, column: 5)
++!30 = !DILocation(line: 6, column: 12, scope: !29)
++!31 = !DILocation(line: 6, column: 9, scope: !29)
++!32 = !DILocation(line: 7, column: 5, scope: !29)
++!33 = !DILocation(line: 4, column: 28, scope: !34)
++!34 = !DILexicalBlockFile(scope: !22, file: !1, discriminator: 2)
++!35 = !DILocation(line: 4, column: 5, scope: !34)
++!36 = distinct !{!36, !37, !38}
++!37 = !DILocation(line: 4, column: 5, scope: !17)
++!38 = !DILocation(line: 7, column: 5, scope: !17)
++!39 = !DILocation(line: 8, column: 12, scope: !6)
++!40 = !DILocation(line: 8, column: 5, scope: !6)
++
++; Auto-tuning-enabled loop unrolling - check that the loop is not unrolled when
++; the auto-tuning feature is disabled, even though the input remark contains
++; DebugLoc info.
++;
++; DISABLE-LABEL: @foo(
++; DISABLE: for.cond
++; DISABLE: for.body
++; DISABLE-NOT: for.body.1
++; DISABLE: for.inc
++; DISABLE-NOT: llvm.loop.unroll.disable
++
++; Auto-tuning-enabled loop unrolling - check that the loop is not unrolled
++; when the unroll count is explicitly set to 0.
++;
++; UNROLL0-LABEL: @foo(
++; UNROLL0: for.cond
++; UNROLL0: for.body
++; UNROLL0-NOT: for.body.1
++; UNROLL0: for.inc
++; UNROLL0-NOT: llvm.loop.unroll.disable
++
++; Auto-tuning-enabled loop unrolling - check that we can unroll the loop by 4
++; when explicitly requested.
++;
++; UNROLL4-LABEL: @foo(
++; UNROLL4: for.cond
++; UNROLL4: for.body
++; UNROLL4: for.body.1
++; UNROLL4: for.body.2
++; UNROLL4: for.body.3
++; UNROLL4: llvm.loop.unroll.disable
++
++; Auto-tuning-enabled loop unrolling - check that the loop is not unrolled
++; when DebugLoc is missing in the input remark.
++;
++; UNROLL4-MISMATCH-LABEL: @foo(
++; UNROLL4-MISMATCH: for.cond
++; UNROLL4-MISMATCH: for.body
++; UNROLL4-MISMATCH-NOT: for.body.1
++; UNROLL4-MISMATCH: for.inc
++; UNROLL4-MISMATCH-NOT: llvm.loop.unroll.disable
+diff --git a/llvm/test/AutoTuning/LoopUnroll/dynamic_config.ll b/llvm/test/AutoTuning/LoopUnroll/dynamic_config.ll
+new file mode 100644
+index 000000000000..414c6ff2d1b0
+--- /dev/null
++++ b/llvm/test/AutoTuning/LoopUnroll/dynamic_config.ll
+@@ -0,0 +1,56 @@
++; RUN: rm %t.default_opp -rf
++; RUN: opt %s -S -auto-tuning-opp=%t.default_opp -auto-tuning-type-filter=Loop \
++; RUN: -passes='require<opt-remark-emit>,loop(loop-unroll-full)' --disable-output
++; RUN: FileCheck %s --input-file %t.default_opp/dynamic_config.ll.yaml
++
++; Function Attrs: nofree norecurse nounwind uwtable
++define dso_local void @transform(i64* nocapture %W) local_unnamed_addr{
++entry:
++ br label %for.body
++
++for.body: ; preds = %entry, %for.body
++ %i.037 = phi i32 [ 16, %entry ], [ %inc, %for.body ]
++ %sub = add nsw i32 %i.037, -3
++ %idxprom = sext i32 %sub to i64
++ %arrayidx = getelementptr inbounds i64, i64* %W, i64 %idxprom
++ %0 = load i64, i64* %arrayidx, align 8
++ %sub1 = add nsw i32 %i.037, -6
++ %idxprom2 = sext i32 %sub1 to i64
++ %arrayidx3 = getelementptr inbounds i64, i64* %W, i64 %idxprom2
++ %1 = load i64, i64* %arrayidx3, align 8
++ %xor = xor i64 %1, %0
++ %idxprom4 = zext i32 %i.037 to i64
++ %arrayidx5 = getelementptr inbounds i64, i64* %W, i64 %idxprom4
++ store i64 %xor, i64* %arrayidx5, align 8
++ %inc = add nuw nsw i32 %i.037, 1
++ %cmp = icmp ult i32 %i.037, 79
++ br i1 %cmp, label %for.body, label %for.body8.preheader
++
++for.body8.preheader: ; preds = %for.body
++ br label %for.body8
++
++for.body8: ; preds = %for.body8.preheader, %for.body8
++ %indvars.iv = phi i64 [ 80, %for.body8.preheader ], [ %indvars.iv.next, %for.body8 ]
++ %2 = add nsw i64 %indvars.iv, -4
++ %arrayidx11 = getelementptr inbounds i64, i64* %W, i64 %2
++ %3 = load i64, i64* %arrayidx11, align 8
++ %4 = add nsw i64 %indvars.iv, -5
++ %arrayidx14 = getelementptr inbounds i64, i64* %W, i64 %4
++ %5 = load i64, i64* %arrayidx14, align 8
++ %xor15 = xor i64 %5, %3
++ %arrayidx17 = getelementptr inbounds i64, i64* %W, i64 %indvars.iv
++ store i64 %xor15, i64* %arrayidx17, align 8
++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
++ %exitcond = icmp ne i64 %indvars.iv.next, 256
++ br i1 %exitcond, label %for.body8, label %for.end20
++
++for.end20: ; preds = %for.body8
++ ret void
++}
++
++; CHECK: --- !AutoTuning
++; CHECK: DynamicConfigs: { UnrollCount: [0, 1, 64, 16, 32]
++; CHECK: ...
++; CHECK-NEXT: --- !AutoTuning
++; CHECK: DynamicConfigs: { UnrollCount: [0, 1, 64, 16, 32]
++; CHECK: ...
+diff --git a/llvm/test/AutoTuning/LoopUnroll/loop_nest.ll b/llvm/test/AutoTuning/LoopUnroll/loop_nest.ll
+new file mode 100644
+index 000000000000..7f3e27ca057a
+--- /dev/null
++++ b/llvm/test/AutoTuning/LoopUnroll/loop_nest.ll
+@@ -0,0 +1,136 @@
++; REQUIRES: asserts
++; CodeRegionHash matches for the first code region only. AutoTuner will find
++; a match for one code region when hash matching is enabled, and a match for
++; all three code regions when hash matching is disabled.
++
++; RUN: rm -rf %t.loop_nest.txt
++; RUN: opt %s -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \
++; RUN: -debug-only=autotuning -auto-tuning-input=%S/Inputs/loop_nest.yaml \
++; RUN: --disable-output &> %t.loop_nest.txt
++; RUN: grep 'UnrollCount is set' %t.loop_nest.txt | wc -l | \
++; RUN: FileCheck %s -check-prefix=HASH_MATCHING_ENABLED
++
++; RUN: rm -rf %t.loop_nest.txt
++; RUN: opt %s -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \
++; RUN: -auto-tuning-input=%S/Inputs/loop_nest.yaml -debug-only=autotuning \
++; RUN: -auto-tuning-code-region-matching-hash=false --disable-output &> %t.loop_nest.txt
++; RUN: grep 'UnrollCount is set' %t.loop_nest.txt | wc -l | \
++; RUN: FileCheck %s -check-prefix=HASH_MATCHING_DISABLED
++
++; ModuleID = 'loop-nest.c'
++source_filename = "loop-nest.c"
++target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
++target triple = "aarch64-unknown-linux-gnu"
++
++; Function Attrs: nofree norecurse nounwind uwtable
++define dso_local void @loop_nest(i32 %ni, i32 %nj, i32 %nk, i32 %alpha, i32 %beta, i32** nocapture readonly %A, i32** nocapture readonly %B, i32** nocapture readonly %C) local_unnamed_addr #0 !dbg !10 {
++entry:
++ %cmp41 = icmp sgt i32 %ni, 0, !dbg !12
++ br i1 %cmp41, label %for.cond1.preheader.lr.ph, label %for.end23, !dbg !13
++
++for.cond1.preheader.lr.ph: ; preds = %entry
++ %cmp238 = icmp slt i32 %nk, 1
++ %cmp536 = icmp slt i32 %nj, 1
++ %wide.trip.count51 = zext i32 %ni to i64, !dbg !12
++ %wide.trip.count47 = zext i32 %nk to i64
++ %wide.trip.count = zext i32 %nj to i64
++ %brmerge = or i1 %cmp238, %cmp536
++ br label %for.cond1.preheader, !dbg !13
++
++for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.inc21
++ %indvars.iv49 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next50, %for.inc21 ]
++ br i1 %brmerge, label %for.inc21, label %for.cond4.preheader.us.preheader, !dbg !14
++
++for.cond4.preheader.us.preheader: ; preds = %for.cond1.preheader
++ %arrayidx15 = getelementptr inbounds i32*, i32** %C, i64 %indvars.iv49
++ %arrayidx = getelementptr inbounds i32*, i32** %A, i64 %indvars.iv49
++ %.pre = load i32*, i32** %arrayidx, align 8, !tbaa !15
++ %.pre53 = load i32*, i32** %arrayidx15, align 8, !tbaa !15
++ br label %for.cond4.preheader.us, !dbg !14
++
++for.cond4.preheader.us: ; preds = %for.cond4.preheader.us.preheader, %for.cond4.for.inc18_crit_edge.us
++ %indvars.iv45 = phi i64 [ 0, %for.cond4.preheader.us.preheader ], [ %indvars.iv.next46, %for.cond4.for.inc18_crit_edge.us ]
++ %arrayidx8.us = getelementptr inbounds i32, i32* %.pre, i64 %indvars.iv45
++ %arrayidx10.us = getelementptr inbounds i32*, i32** %B, i64 %indvars.iv45
++ %0 = load i32*, i32** %arrayidx10.us, align 8, !tbaa !15
++ br label %for.body6.us, !dbg !19
++
++for.body6.us: ; preds = %for.cond4.preheader.us, %for.body6.us
++ %indvars.iv = phi i64 [ 0, %for.cond4.preheader.us ], [ %indvars.iv.next, %for.body6.us ]
++ %1
= load i32, i32* %arrayidx8.us, align 4, !dbg !20, !tbaa !21 ++ %mul.us = mul nsw i32 %1, %alpha, !dbg !23 ++ %arrayidx12.us = getelementptr inbounds i32, i32* %0, i64 %indvars.iv, !dbg !24 ++ %2 = load i32, i32* %arrayidx12.us, align 4, !dbg !24, !tbaa !21 ++ %mul13.us = mul nsw i32 %mul.us, %2, !dbg !25 ++ %arrayidx17.us = getelementptr inbounds i32, i32* %.pre53, i64 %indvars.iv, !dbg !26 ++ %3 = load i32, i32* %arrayidx17.us, align 4, !dbg !27, !tbaa !21 ++ %add.us = add nsw i32 %3, %mul13.us, !dbg !27 ++ store i32 %add.us, i32* %arrayidx17.us, align 4, !dbg !27, !tbaa !21 ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !28 ++ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count, !dbg !29 ++ br i1 %exitcond.not, label %for.cond4.for.inc18_crit_edge.us, label %for.body6.us, !dbg !19, !llvm.loop !30 ++ ++for.cond4.for.inc18_crit_edge.us: ; preds = %for.body6.us ++ %indvars.iv.next46 = add nuw nsw i64 %indvars.iv45, 1, !dbg !33 ++ %exitcond48.not = icmp eq i64 %indvars.iv.next46, %wide.trip.count47, !dbg !34 ++ br i1 %exitcond48.not, label %for.inc21, label %for.cond4.preheader.us, !dbg !14, !llvm.loop !35 ++ ++for.inc21: ; preds = %for.cond4.for.inc18_crit_edge.us, %for.cond1.preheader ++ %indvars.iv.next50 = add nuw nsw i64 %indvars.iv49, 1, !dbg !37 ++ %exitcond52.not = icmp eq i64 %indvars.iv.next50, %wide.trip.count51, !dbg !12 ++ br i1 %exitcond52.not, label %for.end23, label %for.cond1.preheader, !dbg !13, !llvm.loop !38 ++ ++for.end23: ; preds = %for.inc21, %entry ++ ret void, !dbg !40 ++} ++ ++attributes #0 = { nofree norecurse nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } ++ ++!llvm.dbg.cu = !{!0} ++!llvm.module.flags = !{!3, !4, !5, !6, !7, !8} ++!llvm.ident = !{!9} ++ ++!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Huawei BiSheng Compiler clang version 12.0.0 (clang-a279e099a09a flang-9a86b70390a7)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None) ++!1 = !DIFile(filename: "loop-nest.c", directory: "/home/m00629332/code/autoTuner") ++!2 = !{} ++!3 = !{i32 2, !"Debug Info Version", i32 3} ++!4 = !{i32 1, !"wchar_size", i32 4} ++!5 = !{i32 1, !"branch-target-enforcement", i32 0} ++!6 = !{i32 1, !"sign-return-address", i32 0} ++!7 = !{i32 1, !"sign-return-address-all", i32 0} ++!8 = !{i32 1, !"sign-return-address-with-bkey", i32 0} ++!9 = !{!"Huawei BiSheng Compiler clang version 12.0.0 (clang-a279e099a09a flang-9a86b70390a7)"} ++!10 = distinct !DISubprogram(name: "loop_nest", scope: !1, file: !1, line: 1, type: !11, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) ++!11 = !DISubroutineType(types: !2) ++!12 = !DILocation(line: 8, column: 17, scope: !10) ++!13 = !DILocation(line: 8, column: 3, scope: !10) ++!14 = !DILocation(line: 9, column: 5, scope: !10) ++!15 = !{!16, !16, i64 0} ++!16 = !{!"any pointer", !17, i64 0} ++!17 = !{!"omnipotent char", !18, i64 0} ++!18 = !{!"Simple C/C++ TBAA"} ++!19 = !DILocation(line: 10, column: 8, scope: !10) ++!20 = !DILocation(line: 11, column: 23, scope: !10) ++!21 = !{!22, !22, i64 0} ++!22 = 
!{!"int", !17, i64 0} ++!23 = !DILocation(line: 11, column: 21, scope: !10) ++!24 = !DILocation(line: 11, column: 33, scope: !10) ++!25 = !DILocation(line: 11, column: 31, scope: !10) ++!26 = !DILocation(line: 11, column: 4, scope: !10) ++!27 = !DILocation(line: 11, column: 12, scope: !10) ++!28 = !DILocation(line: 10, column: 29, scope: !10) ++!29 = !DILocation(line: 10, column: 22, scope: !10) ++!30 = distinct !{!30, !19, !31, !32} ++!31 = !DILocation(line: 11, column: 39, scope: !10) ++!32 = !{!"llvm.loop.mustprogress"} ++!33 = !DILocation(line: 9, column: 26, scope: !10) ++!34 = !DILocation(line: 9, column: 19, scope: !10) ++!35 = distinct !{!35, !14, !36, !32} ++!36 = !DILocation(line: 12, column: 5, scope: !10) ++!37 = !DILocation(line: 8, column: 24, scope: !10) ++!38 = distinct !{!38, !13, !39, !32} ++!39 = !DILocation(line: 13, column: 3, scope: !10) ++!40 = !DILocation(line: 15, column: 1, scope: !10) ++ ++; HASH_MATCHING_ENABLED: 1 ++; HASH_MATCHING_DISABLED: 3 +diff --git a/llvm/test/AutoTuning/LoopUnroll/loop_peel.ll b/llvm/test/AutoTuning/LoopUnroll/loop_peel.ll +new file mode 100644 +index 000000000000..f3839a49b20e +--- /dev/null ++++ b/llvm/test/AutoTuning/LoopUnroll/loop_peel.ll +@@ -0,0 +1,53 @@ ++; NOTE: This file is used to test when UnrollCount = 1 and when the compiler ++; sees that Loop Peeling is beneficial and possible, then we do Loop Peeling. ++; RUN: rm %t.unroll1.yaml -rf ++; RUN: sed 's#\number\#1#g;' %S/Inputs/loop_peel.yaml > %t.unroll1.yaml ++; RUN: opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-code-region-matching-hash=false \ ++; RUN: -auto-tuning-input=%t.unroll1.yaml | FileCheck %s ++ ++; RUN: rm %t.unroll0.yaml -rf ++; RUN: sed 's#\number\#0#g;' %S/Inputs/loop_peel.yaml > %t.unroll0.yaml ++; RUN: opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-code-region-matching-hash=false \ ++; RUN: -auto-tuning-input=%t.unroll0.yaml | FileCheck %s --check-prefix=DISABLE ++ ++; RUN: opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-code-region-matching-hash=false \ ++; RUN: -auto-tuning-opp=%t.unroll_opp -auto-tuning-type-filter=Loop --disable-output ++; RUN: FileCheck %s --input-file %t.unroll_opp/loop_peel.ll.yaml -check-prefix=TEST-1 ++ ++define i32 @invariant_backedge_1(i32 %a, i32 %b) { ++; CHECK-LABEL: @invariant_backedge_1 ++; CHECK-NOT: %plus = phi ++; CHECK: loop.peel: ++; CHECK: loop: ++; CHECK: %i = phi ++; CHECK: %sum = phi ++; DISABLE-LABEL: @invariant_backedge_1 ++; DISABLE-NOT: loop.peel: ++entry: ++ br label %loop ++ ++loop: ++ %i = phi i32 0, %entry , %inc, %loop ++ %sum = phi i32 0, %entry , %incsum, %loop ++ %plus = phi i32 %a, %entry , %b, %loop ++ ++ %incsum = add i32 %sum, %plus ++ %inc = add i32 %i, 1 ++ %cmp = icmp slt i32 %i, 1000 ++ ++ br i1 %cmp, label %loop, label %exit ++ ++exit: ++ ret i32 %sum ++} ++ ++; Check for dynamic values when UnrollCount is set to 1: ++; TEST-1: Pass: loop-unroll ++; TEST-1-NEXT: Name: loop ++; TEST-1-NEXT: Function: invariant_backedge_1 ++; TEST-1-NEXT: CodeRegionType: loop ++; TEST-1-NEXT: CodeRegionHash: {{0-9+}} ++; TEST-1-NEXT: DynamicConfigs: { UnrollCount: 0, 1, 2 } +diff --git a/llvm/test/AutoTuning/LoopUnroll/unroll-pragma.ll b/llvm/test/AutoTuning/LoopUnroll/unroll-pragma.ll +new file mode 100644 +index 000000000000..843b8e28f3d8 +--- /dev/null ++++ b/llvm/test/AutoTuning/LoopUnroll/unroll-pragma.ll +@@ -0,0 +1,129 @@ ++; RUN: rm %t.unroll_opp -rf ++; RUN: opt %s -S 
-auto-tuning-opp=%t.unroll_opp -auto-tuning-type-filter=Loop \ ++; RUN: -passes='require<opt-remark-emit>,loop(loop-unroll-full)' --disable-output ++; RUN: FileCheck %s --input-file %t.unroll_opp/unroll-pragma.ll.yaml -check-prefix=TEST-1 ++; RUN: FileCheck %s --input-file %t.unroll_opp/unroll-pragma.ll.yaml -check-prefix=TEST-2 ++ ++; RUN: rm %t.unroll_opp -rf ++; RUN: opt %s -S -auto-tuning-opp=%t.unroll_opp -auto-tuning-type-filter=Loop \ ++; RUN: -passes='require<opt-remark-emit>,function(loop-unroll)' --disable-output ++; RUN: FileCheck %s --input-file %t.unroll_opp/unroll-pragma.ll.yaml -check-prefix=TEST-1 ++; RUN: FileCheck %s --input-file %t.unroll_opp/unroll-pragma.ll.yaml -check-prefix=TEST-2 ++ ++; This function contains two loops. loop for.body is defined with a pragma ++; unroll_count(4) and loop for.body9 is without a pragama. AutoTuner will only ++; consider for.body9 as a tuning opportunity. ++ ++; ModuleID = 'loop-unroll.c' ++source_filename = "loop-unroll.c" ++target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ++target triple = "aarch64-unknown-linux-gnu" ++ ++; Function Attrs: nofree norecurse nounwind uwtable ++define dso_local void @loop(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32* noalias nocapture %d, i32 %len) local_unnamed_addr #0 !dbg !10 { ++entry: ++ %cmp34 = icmp slt i32 0, %len, !dbg !12 ++ br i1 %cmp34, label %for.body.lr.ph, label %for.cond6.preheader, !dbg !13 ++ ++for.body.lr.ph: ; preds = %entry ++ br label %for.body, !dbg !13 ++ ++for.cond.for.cond6.preheader_crit_edge: ; preds = %for.body ++ br label %for.cond6.preheader, !dbg !13 ++ ++for.cond6.preheader: ; preds = %for.cond.for.cond6.preheader_crit_edge, %entry ++ %cmp732 = icmp slt i32 0, %len, !dbg !14 ++ br i1 %cmp732, label %for.body9.lr.ph, label %for.cond.cleanup8, !dbg !15 ++ ++for.body9.lr.ph: ; preds = %for.cond6.preheader ++ br label %for.body9, !dbg !15 ++ ++for.body: ; preds = %for.body.lr.ph, %for.body ++ %i.035 = phi i32 0, %for.body.lr.ph , %inc, %for.body ++ %idxprom = zext i32 %i.035 to i64, !dbg !16 ++ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom, !dbg !16 ++ %0 = load i32, i32* %arrayidx, align 4, !dbg !16, !tbaa !17 ++ %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %idxprom, !dbg !21 ++ %1 = load i32, i32* %arrayidx2, align 4, !dbg !21, !tbaa !17 ++ %add = add nsw i32 %1, %0, !dbg !22 ++ %arrayidx4 = getelementptr inbounds i32, i32* %c, i64 %idxprom, !dbg !23 ++ store i32 %add, i32* %arrayidx4, align 4, !dbg !24, !tbaa !17 ++ %inc = add nuw nsw i32 %i.035, 1, !dbg !25 ++ %cmp = icmp slt i32 %inc, %len, !dbg !12 ++ br i1 %cmp, label %for.body, label %for.cond.for.cond6.preheader_crit_edge, !dbg !13, !llvm.loop !26 ++ ++for.cond6.for.cond.cleanup8_crit_edge: ; preds = %for.body9 ++ br label %for.cond.cleanup8, !dbg !15 ++ ++for.cond.cleanup8: ; preds = %for.cond6.for.cond.cleanup8_crit_edge, %for.cond6.preheader ++ ret void, !dbg !30 ++ ++for.body9: ; preds = %for.body9.lr.ph, %for.body9 ++ %i5.033 = phi i32 0, %for.body9.lr.ph , %inc17, %for.body9 ++ %idxprom10 = zext i32 %i5.033 to i64, !dbg !31 ++ %arrayidx11 = getelementptr inbounds i32, i32* %a, i64 %idxprom10, !dbg !31 ++ %2 = load i32, i32* %arrayidx11, align 4, !dbg !31, !tbaa !17 ++ %arrayidx13 = getelementptr inbounds i32, i32* %b, i64 %idxprom10, !dbg !32 ++ %3 = load i32, i32* %arrayidx13, align 4, !dbg !32, !tbaa !17 ++ %mul = mul nsw i32 %3, %2, !dbg !33 ++ %arrayidx15 = getelementptr inbounds i32, i32* %d, i64 
%idxprom10, !dbg !34 ++ store i32 %mul, i32* %arrayidx15, align 4, !dbg !35, !tbaa !17 ++ %inc17 = add nuw nsw i32 %i5.033, 1, !dbg !36 ++ %cmp7 = icmp slt i32 %inc17, %len, !dbg !14 ++ br i1 %cmp7, label %for.body9, label %for.cond6.for.cond.cleanup8_crit_edge, !dbg !15, !llvm.loop !37 ++} ++ ++attributes #0 = { nofree norecurse nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } ++ ++!llvm.dbg.cu = !{!0} ++!llvm.module.flags = !{!3, !4, !5, !6, !7, !8} ++!llvm.ident = !{!9} ++ ++!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Huawei Bisheng Compiler clang version 12.0.0 (0261bbf0b2fd)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None) ++!1 = !DIFile(filename: "loop-unroll.c", directory: "/home/AutoTuner/") ++!2 = !{} ++!3 = !{i32 2, !"Debug Info Version", i32 3} ++!4 = !{i32 1, !"wchar_size", i32 4} ++!5 = !{i32 1, !"branch-target-enforcement", i32 0} ++!6 = !{i32 1, !"sign-return-address", i32 0} ++!7 = !{i32 1, !"sign-return-address-all", i32 0} ++!8 = !{i32 1, !"sign-return-address-with-bkey", i32 0} ++!9 = !{!"Huawei Bisheng Compiler clang version 12.0.0 (0261bbf0b2fd)"} ++!10 = distinct !DISubprogram(name: "a", scope: !1, file: !1, line: 1, type: !11, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) ++!11 = !DISubroutineType(types: !2) ++!12 = !DILocation(line: 3, column: 20, scope: !10) ++!13 = !DILocation(line: 3, column: 5, scope: !10) ++!14 = !DILocation(line: 7, column: 20, scope: !10) ++!15 = !DILocation(line: 7, column: 5, scope: !10) ++!16 = !DILocation(line: 4, column: 16, scope: !10) ++!17 = !{!18, !18, i64 0} ++!18 = !{!"int", !19, i64 0} ++!19 = !{!"omnipotent char", !20, i64 0} ++!20 = !{!"Simple C/C++ TBAA"} ++!21 = !DILocation(line: 4, column: 23, scope: !10) ++!22 = !DILocation(line: 4, column: 21, scope: !10) ++!23 = !DILocation(line: 4, column: 9, scope: !10) ++!24 = !DILocation(line: 4, column: 14, scope: !10) ++!25 = !DILocation(line: 3, column: 28, scope: !10) ++!26 = distinct !{!26, !13, !27, !28, !29} ++!27 = !DILocation(line: 5, column: 5, scope: !10) ++!28 = !{!"llvm.loop.mustprogress"} ++!29 = !{!"llvm.loop.unroll.count", i32 4} ++!30 = !DILocation(line: 10, column: 1, scope: !10) ++!31 = !DILocation(line: 8, column: 16, scope: !10) ++!32 = !DILocation(line: 8, column: 23, scope: !10) ++!33 = !DILocation(line: 8, column: 21, scope: !10) ++!34 = !DILocation(line: 8, column: 9, scope: !10) ++!35 = !DILocation(line: 8, column: 14, scope: !10) ++!36 = !DILocation(line: 7, column: 28, scope: !10) ++!37 = distinct !{!37, !15, !38, !28} ++!38 = !DILocation(line: 9, column: 5, scope: !10) ++ ++ ++; TEST-1: Pass: loop-unroll ++; TEST-1-NOT: Pass: loop-unroll ++ ++; TEST-2: Name: for.body9 ++; TEST-2-NEXT: DebugLoc: { File: loop-unroll.c, Line: 7, Column: 5 } ++; TEST-2-NEXT: Function: loop ++; TEST-2-NEXT: CodeRegionType: loop +diff --git a/llvm/test/AutoTuning/LoopUnroll/unroll.ll b/llvm/test/AutoTuning/LoopUnroll/unroll.ll +new file mode 100644 +index 000000000000..ba5c89fffaff +--- /dev/null ++++ b/llvm/test/AutoTuning/LoopUnroll/unroll.ll +@@ -0,0 
+1,101 @@ ++; RUN: opt %s -S -passes=loop-unroll | FileCheck %s -check-prefix=DISABLE ++ ++; RUN: rm %t.unroll0.yaml -rf ++; RUN: sed 's#\number\#0#g; s#\name\#for.body#g; s#\hash\#14791762861362113823#g' \ ++; RUN: %S/Inputs/unroll_template.yaml > %t.unroll0.yaml ++; RUN: opt %s -S -passes=loop-unroll -auto-tuning-input=%t.unroll0.yaml \ ++; RUN: -auto-tuning-code-region-matching-hash=false | \ ++; RUN: FileCheck %s -check-prefix=UNROLL0 ++ ++; RUN: rm %t.unroll0.yaml -rf ++; RUN: sed 's#\number\#0#g; s#\hash\#14791762861362113823#g' \ ++; RUN: %S/Inputs/unroll_template_no_metadata.yaml > %t.unroll0.yaml ++; RUN: opt %s -S -passes=loop-unroll -auto-tuning-input=%t.unroll0.yaml \ ++; RUN: -auto-tuning-omit-metadata | \ ++; RUN: FileCheck %s -check-prefix=UNROLL0 ++ ++; RUN: rm %t.result1 %t.unroll1.yaml -rf ++; RUN: sed 's#\number\#1#g; s#\name\#for.body#g; s#\hash\#14791762861362113823#g' \ ++; RUN: %S/Inputs/unroll_template.yaml > %t.unroll1.yaml ++; RUN: opt %s -S -passes=loop-unroll -auto-tuning-input=%t.unroll1.yaml | \ ++; RUN: FileCheck %s -check-prefix=UNROLL1 ++ ++; RUN: rm %t.result1 %t.unroll1.yaml -rf ++; RUN: sed 's#\number\#1#g; s#\hash\#14791762861362113823#g' \ ++; RUN: %S/Inputs/unroll_template_no_metadata.yaml > %t.unroll1.yaml ++; RUN: opt %s -S -passes=loop-unroll -auto-tuning-input=%t.unroll1.yaml \ ++; RUN: -auto-tuning-omit-metadata | \ ++; RUN: FileCheck %s -check-prefix=UNROLL1 ++ ++; RUN: rm %t.result4 %t.unroll4.yaml -rf ++; RUN: sed 's#\number\#4#g; s#\name\#for.body#g; s#\hash\#14791762861362113823#g' \ ++; RUN: %S/Inputs/unroll_template.yaml > %t.unroll4.yaml ++; RUN: opt %s -S -passes=loop-unroll -auto-tuning-input=%t.unroll4.yaml | \ ++; RUN: FileCheck %s -check-prefix=UNROLL4 ++ ++; RUN: rm %t.result4 %t.unroll4.yaml -rf ++; RUN: sed 's#\number\#4#g; s#\hash\#14791762861362113823#g' \ ++; RUN: %S/Inputs/unroll_template_no_metadata.yaml > %t.unroll4.yaml ++; RUN: opt %s -S -passes=loop-unroll -auto-tuning-input=%t.unroll4.yaml \ ++; RUN: -auto-tuning-omit-metadata | \ ++; RUN: FileCheck %s -check-prefix=UNROLL4 ++ ++; UNSUPPORTED: windows ++ ++define void @foo(i32* nocapture %a) { ++entry: ++ br label %for.body ++ ++for.body: ; preds = %for.body, %entry ++ %indvars.iv = phi i64 0, %entry , %indvars.iv.next, %for.body ++ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv ++ %0 = load i32, i32* %arrayidx, align 4 ++ %inc = add nsw i32 %0, 1 ++ store i32 %inc, i32* %arrayidx, align 4 ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %exitcond = icmp eq i64 %indvars.iv.next, 64 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ; preds = %for.body ++ ret void ++} ++ ++; Auto-tuning-enabled loop unrolling - check that the loop is not unrolled when the auto-tuning feature is disabled ++; ++; DISABLE-LABEL: @foo( ++; DISABLE: store i32 ++; DISABLE-NOT: store i32 ++; DISABLE: br i1 ++; DISABLE-NOT: llvm.loop.unroll.disable ++ ++ ++; Auto-tuning-enabled loop unrolling - check that the loop is not unrolled ++; when unroll count explicitly set to be 0. ++; ++; UNROLL0-LABEL: @foo( ++; UNROLL0: store i32 ++; UNROLL0-NOT: store i32 ++; UNROLL0: br i1 ++; UNROLL0-NOT: llvm.loop.unroll.disable ++ ++ ++; Auto-tuning-enabled loop unrolling - Requesting UnrollCount = 1 will perform ++; Loop Peeling, and if Loop Peeling isn't possible/beneficial then Unroll Count ++; is unchanged. 
++; ++; UNROLL1-LABEL: @foo( ++; UNROLL1: store i32 ++; UNROLL1-NOT: store i32 ++; UNROLL1: br i1 ++; UNROLL1: llvm.loop.unroll.disable ++ ++; Auto-tuning-enabled loop unrolling - check that we can unroll the loop by 4 ++; when explicitly requested. ++; ++; UNROLL4-LABEL: @foo( ++; UNROLL4: store i32 ++; UNROLL4: store i32 ++; UNROLL4: store i32 ++; UNROLL4: store i32 ++; UNROLL4: br i1 ++; UNROLL4: llvm.loop.unroll.disable +diff --git a/llvm/test/AutoTuning/LoopUnroll/unroll_raw.ll b/llvm/test/AutoTuning/LoopUnroll/unroll_raw.ll +new file mode 100644 +index 000000000000..480ccad640ae +--- /dev/null ++++ b/llvm/test/AutoTuning/LoopUnroll/unroll_raw.ll +@@ -0,0 +1,113 @@ ++; Test loop unrolling using auto-tuning YAML api with IRs generated when ASSERTION=OFF ++; The IRs generated when ASSERTION=OFF usually only use slot numbers as variable names. ++ ++; RUN: opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' | \ ++; RUN: FileCheck %s -check-prefix=DISABLE ++ ++; RUN: rm %t.result1_raw %t.unroll1_raw.yaml -rf ++; RUN: sed 's#\number\#1#g; s#\hash\#18159364858606519094#g' \ ++; RUN: %S/Inputs/unroll_raw_template.yaml > %t.unroll1_raw.yaml ++; RUN: opt %s -S -passes='require<opt-remark-emit>,function(loop-unroll)' \ ++; RUN: -auto-tuning-input=%t.unroll1_raw.yaml | FileCheck %s -check-prefix=UNROLL1 ++ ++; RUN: rm %t.result2_raw %t.unroll2_raw.yaml -rf ++; RUN: sed 's#\number\#2#g; s#\hash\#18159364858606519094#g' \ ++; RUN: %S/Inputs/unroll_raw_template.yaml > %t.unroll2_raw.yaml ++; RUN: opt %s -S -passes='require<opt-remark-emit>,function(loop-unroll)' \ ++; RUN: -auto-tuning-input=%t.unroll2_raw.yaml | FileCheck %s -check-prefix=UNROLL2 ++ ++; RUN: rm %t.result4_raw %t.unroll4_raw.yaml -rf ++; RUN: sed 's#\number\#4#g; s#\hash\#18159364858606519094#g' \ ++; RUN: %S/Inputs/unroll_raw_template.yaml > %t.unroll4_raw.yaml ++; RUN: opt %s -S -passes='require<opt-remark-emit>,function(loop-unroll)' \ ++; RUN: -auto-tuning-input=%t.unroll4_raw.yaml | FileCheck %s -check-prefix=UNROLL4 ++ ++; UNSUPPORTED: windows ++ ++; ModuleID = 't.ll' ++source_filename = "t.ll" ++ ++@.str = private unnamed_addr constant 4 x i8 c"%d\0A\00", align 1 ++ ++define void @test(i32*) { ++ %2 = alloca i32*, align 8 ++ store i32* %0, i32** %2, align 8 ++ %3 = load i32*, i32** %2, align 8 ++ %4 = load i32, i32* %3, align 4 ++ %5 = add nsw i32 %4, 2 ++ %6 = load i32*, i32** %2, align 8 ++ store i32 %5, i32* %6, align 4 ++ ret void ++} ++ ++define i32 @main() { ++ %1 = alloca i32, align 4 ++ %2 = alloca i32, align 4 ++ store i32 0, i32* %1, align 4 ++ store i32 8, i32* %2, align 4 ++ %3 = load i32, i32* %2, align 4 ++ %4 = icmp sle i32 %3, 88 ++ br i1 %4, label %.lr.ph, label %13 ++ ++.lr.ph: ; preds = %0 ++ br label %5 ++ ++; <label>:5: ; preds = %.lr.ph, %8 ++ call void @test(i32* %2) ++ %6 = load i32, i32* %2, align 4 ++ %7 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds (4 x i8, 4 x i8* @.str, i32 0, i32 0), i32 %6) ++ br label %8 ++ ++; <label>:8: ; preds = %5 ++ %9 = load i32, i32* %2, align 4 ++ %10 = add nsw i32 %9, 8 ++ store i32 %10, i32* %2, align 4 ++ %11 = load i32, i32* %2, align 4 ++ %12 = icmp sle i32 %11, 88 ++ br i1 %12, label %5, label %._crit_edge ++ ++._crit_edge: ; preds = %8 ++ br label %13 ++ ++; <label>:13: ; preds = %._crit_edge, %0 ++ %14 = load i32, i32* %1, align 4 ++ ret i32 %14 ++} ++ ++declare i32 @printf(i8*, ...) 
++ ++ ++; Auto-tuning-enabled loop unrolling - check that the loop is not unrolled when the auto-tuning feature is disabled ++; ++; DISABLE-LABEL: @main( ++; DISABLE: call void @test(ptr %2) ++; DISABLE-NOT: call void @test(ptr %2) ++; DISABLE-NOT: llvm.loop.unroll.disable ++ ++ ++; Auto-tuning-enabled loop unrolling - check that we can unroll the loop by 1 ++; when explicitly requested. ++; ++; UNROLL1-LABEL: @main( ++; UNROLL1: call void @test(ptr %2) ++; UNROLL1-NOT: call void @test(ptr %2) ++ ++; Auto-tuning-enabled loop unrolling - check that we can unroll the loop by 2 ++; when explicitly requested. ++; ++; UNROLL2-LABEL: @main( ++; UNROLL2: call void @test(ptr %2) ++; UNROLL2: call void @test(ptr %2) ++; UNROLL2-NOT: call void @test(ptr %2) ++; UNROLL2: llvm.loop.unroll.disable ++ ++ ++; Auto-tuning-enabled loop unrolling - check that we can unroll the loop by 4 ++; when explicitly requested. ++; ++; UNROLL4-LABEL: @main( ++; UNROLL4: call void @test(ptr %2) ++; UNROLL4: call void @test(ptr %2) ++; UNROLL4: call void @test(ptr %2) ++; UNROLL4: call void @test(ptr %2) ++; UNROLL4: llvm.loop.unroll.disable +diff --git a/llvm/test/AutoTuning/LoopVectorize/Inputs/vectorize_template.yaml b/llvm/test/AutoTuning/LoopVectorize/Inputs/vectorize_template.yaml +new file mode 100644 +index 000000000000..b65fddf4e23f +--- /dev/null ++++ b/llvm/test/AutoTuning/LoopVectorize/Inputs/vectorize_template.yaml +@@ -0,0 +1,9 @@ ++--- !AutoTuning ++Pass: loop-vectorize ++Name: bb4 ++Function: TestFoo ++CodeRegionType: loop ++CodeRegionHash: 14229620333597121971 ++Args: ++- VectorizationInterleave: number ++... +diff --git a/llvm/test/AutoTuning/LoopVectorize/Inputs/vectorize_template_no_metadata.yaml b/llvm/test/AutoTuning/LoopVectorize/Inputs/vectorize_template_no_metadata.yaml +new file mode 100644 +index 000000000000..87d2fc2587cb +--- /dev/null ++++ b/llvm/test/AutoTuning/LoopVectorize/Inputs/vectorize_template_no_metadata.yaml +@@ -0,0 +1,7 @@ ++--- !AutoTuning ++Pass: loop-vectorize ++CodeRegionType: loop ++CodeRegionHash: 14229620333597121971 ++Args: ++- VectorizationInterleave: number ++... 
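For orientation: the [number] token in the two templates above is a placeholder, not YAML syntax; the RUN lines substitute a concrete value with sed before handing the file to opt via -auto-tuning-input. Under that reading, the instantiated configuration produced by sed 's#\[number\]#2#g' would look roughly like this (all field values taken from the template itself):

--- !AutoTuning
Pass: loop-vectorize
Name: bb4
Function: TestFoo
CodeRegionType: loop
CodeRegionHash: 14229620333597121971
Args:
- VectorizationInterleave: 2
...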
+diff --git a/llvm/test/AutoTuning/LoopVectorize/force-vector-interleave.ll b/llvm/test/AutoTuning/LoopVectorize/force-vector-interleave.ll
+new file mode 100644
+index 000000000000..a1652babd8f4
+--- /dev/null
++++ b/llvm/test/AutoTuning/LoopVectorize/force-vector-interleave.ll
+@@ -0,0 +1,88 @@
++; RUN: rm %t.1 %t.2 %t.1.yaml -rf
++; RUN: opt %s -passes=loop-vectorize -force-vector-interleave=1 -S -o %t.1
++; RUN: sed 's#\[number\]#1#g' %S/Inputs/vectorize_template.yaml > %t.1.yaml
++; RUN: opt %s -passes=loop-vectorize -auto-tuning-input=%t.1.yaml \
++; RUN: -S -o %t.2 -debug-only=autotuning 2>&1 | \
++; RUN: FileCheck %s -check-prefix=NUMBER1
++; RUN: diff %t.1 %t.2
++
++; RUN: rm %t.1 %t.2 %t.1.yaml -rf
++; RUN: opt %s -passes=loop-vectorize -force-vector-interleave=1 -S -o %t.1
++; RUN: sed 's#\[number\]#1#g' %S/Inputs/vectorize_template_no_metadata.yaml > %t.1.yaml
++; RUN: opt %s -passes=loop-vectorize -auto-tuning-input=%t.1.yaml \
++; RUN: -auto-tuning-omit-metadata -S -o %t.2 -debug-only=autotuning 2>&1 | \
++; RUN: FileCheck %s -check-prefix=NUMBER1
++; RUN: diff %t.1 %t.2
++
++; RUN: rm %t.3 %t.4 %t.2.yaml -rf
++; RUN: opt %s -passes=loop-vectorize -force-vector-interleave=2 -S -o %t.3
++; RUN: sed 's#\[number\]#2#g' %S/Inputs/vectorize_template.yaml > %t.2.yaml
++; RUN: opt %s -passes=loop-vectorize -auto-tuning-input=%t.2.yaml \
++; RUN: -S -o %t.4 -debug-only=autotuning 2>&1 | \
++; RUN: FileCheck %s -check-prefix=NUMBER2
++; RUN: diff %t.3 %t.4
++
++; RUN: rm %t.3 %t.4 %t.2.yaml -rf
++; RUN: opt %s -passes=loop-vectorize -force-vector-interleave=2 -S -o %t.3
++; RUN: sed 's#\[number\]#2#g' %S/Inputs/vectorize_template_no_metadata.yaml > %t.2.yaml
++; RUN: opt %s -passes=loop-vectorize -auto-tuning-input=%t.2.yaml \
++; RUN: -auto-tuning-omit-metadata -S -o %t.4 -debug-only=autotuning 2>&1 | \
++; RUN: FileCheck %s -check-prefix=NUMBER2
++; RUN: diff %t.3 %t.4
++
++; Compiler should not generate tuning opportunities for AutoTuner if -force-vector-interleave is specified.
++; RUN: rm %t.interleave_opp -rf
++; RUN: opt %s -S -passes=loop-vectorize -auto-tuning-opp=%t.interleave_opp \
++; RUN: -force-vector-interleave=2 --disable-output
++; RUN: FileCheck %s --input-file %t.interleave_opp/force-vector-interleave.ll.yaml \
++; RUN: -check-prefix=FORCE-INTERLEAVE
++
++; RUN: rm %t.interleave_opp -rf
++; RUN: opt %s -S -passes=loop-vectorize -auto-tuning-opp=%t.interleave_opp \
++; RUN: -force-vector-interleave=0 --disable-output
++; RUN: FileCheck %s --input-file %t.interleave_opp/force-vector-interleave.ll.yaml \
++; RUN: -check-prefix=FORCE-INTERLEAVE
++
++; RUN: rm %t.interleave_opp -rf
++; RUN: opt %s -S -passes=loop-vectorize -auto-tuning-opp=%t.interleave_opp --disable-output
++; RUN: FileCheck %s --input-file %t.interleave_opp/force-vector-interleave.ll.yaml \
++; RUN: -check-prefix=NO-FORCE-INTERLEAVE
++
++; REQUIRES: asserts
++; UNSUPPORTED: windows
++target datalayout = "e-m:e-i64:64-n32:64"
++target triple = "powerpc64le-unknown-linux-gnu"
++
++define void @TestFoo(i1 %X, i1 %Y) {
++bb:
++  br label %.loopexit5.outer
++
++.loopexit5.outer:
++  br label %.lr.ph12
++
++.loopexit:
++  br i1 %X, label %.loopexit5.outer, label %.lr.ph12
++
++.lr.ph12:
++  %f.110 = phi i32* [ %tmp1, %.loopexit ], [ null, %.loopexit5.outer ]
++  %tmp1 = getelementptr inbounds i32, i32* %f.110, i64 -2
++  br i1 %Y, label %bb4, label %.loopexit
++
++bb4:
++  %j.27 = phi i32 [ 0, %.lr.ph12 ], [ %tmp7, %bb4 ]
++  %tmp5 = load i32, i32* %f.110, align 4
++  %tmp7 = add nsw i32 %j.27, 1
++  %exitcond = icmp eq i32 %tmp7, 0
++  br i1 %exitcond, label %.loopexit, label %bb4
++}
++
++; NUMBER1: VectorizationInterleave is set for the CodeRegion:
++; NUMBER1: Name: bb4
++; NUMBER1: FuncName: TestFoo
++; NUMBER2: VectorizationInterleave is set for the CodeRegion:
++; NUMBER2: Name: bb4
++; NUMBER2: FuncName: TestFoo
++
++; FORCE-INTERLEAVE-NOT: Pass: loop-vectorize
++; NO-FORCE-INTERLEAVE: Pass: loop-vectorize
++; NO-FORCE-INTERLEAVE: BaselineConfig: { VectorizationInterleave:
+diff --git a/llvm/test/AutoTuning/MachineScheduler/Inputs/misched_x86_template.yaml b/llvm/test/AutoTuning/MachineScheduler/Inputs/misched_x86_template.yaml
+new file mode 100644
+index 000000000000..34ea66e45a0a
+--- /dev/null
++++ b/llvm/test/AutoTuning/MachineScheduler/Inputs/misched_x86_template.yaml
+@@ -0,0 +1,10 @@
++--- !AutoTuning
++Pass: machine-scheduler
++Name: '%bb.1:for.cond.preheader'
++Function: _preextrapolate_helper
++CodeRegionType: machine_basic_block
++CodeRegionHash: 17389215691512956355
++Args:
++- ForceBottomUp: [bool1]
++- ForceTopDown: [bool2]
++...
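The [bool1] and [bool2] tokens in this template are placeholders filled in the same sed-substitution fashion by the three MachineScheduler tests that follow. As a sketch, the bottom-up instantiation used below (sed 's#\[bool1\]#true#g; s#\[bool2\]#false#g') would yield roughly:

--- !AutoTuning
Pass: machine-scheduler
Name: '%bb.1:for.cond.preheader'
Function: _preextrapolate_helper
CodeRegionType: machine_basic_block
CodeRegionHash: 17389215691512956355
Args:
- ForceBottomUp: true
- ForceTopDown: false
...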
+diff --git a/llvm/test/AutoTuning/MachineScheduler/misched_x86_bidirectional.ll b/llvm/test/AutoTuning/MachineScheduler/misched_x86_bidirectional.ll
+new file mode 100644
+index 000000000000..aa4781dad204
+--- /dev/null
++++ b/llvm/test/AutoTuning/MachineScheduler/misched_x86_bidirectional.ll
+@@ -0,0 +1,73 @@
++; RUN: rm %t.bidirectional_result %t.misched_x86_bidirectional.yaml -rf
++; RUN: sed 's#\[bool1\]#false#g; s#\[bool2\]#false#g' %S/Inputs/misched_x86_template.yaml > %t.misched_x86_bidirectional.yaml
++; RUN: llc -o - %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \
++; RUN: -auto-tuning-input=%t.misched_x86_bidirectional.yaml \
++; RUN: -verify-machineinstrs -debug-only=machine-scheduler 2>&1 \
++; RUN: | FileCheck %s
++
++; RUN: llc -o - %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \
++; RUN: -auto-tuning-input=%t.misched_x86_bidirectional.yaml \
++; RUN: -verify-machineinstrs -misched-topdown -debug-only=machine-scheduler 2>&1 \
++; RUN: | FileCheck %s -check-prefix=MIX-WITH-FLAG-TOPDOWN
++
++; RUN: llc -o - %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \
++; RUN: -auto-tuning-input=%t.misched_x86_bidirectional.yaml \
++; RUN: -verify-machineinstrs -misched-bottomup -debug-only=machine-scheduler 2>&1 \
++; RUN: | FileCheck %s -check-prefix=MIX-WITH-FLAG-BOTTOMUP
++
++; RUN: llc -o - %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \
++; RUN: -auto-tuning-input=%t.misched_x86_bidirectional.yaml \
++; RUN: -verify-machineinstrs -misched-bottomup=false -misched-topdown=false -debug-only=machine-scheduler 2>&1 \
++; RUN: | FileCheck %s -check-prefix=MIX-WITH-FLAG-BIDIRECTIONAL
++
++; REQUIRES: asserts
++; UNSUPPORTED: windows
++;
++; Interesting MachineScheduler cases.
++
++declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
++
++define fastcc void @_preextrapolate_helper() nounwind uwtable ssp {
++entry:
++  br i1 undef, label %for.cond.preheader, label %if.end
++
++for.cond.preheader: ; preds = %entry
++  call void @llvm.memcpy.p0i8.p0i8.i64(i8* undef, i8* null, i64 128, i32 4, i1 false) nounwind
++  unreachable
++
++if.end: ; preds = %entry
++  ret void
++}
++
++; check if the scheduling policy defined with YAML is applied
++;
++; CHECK: _preextrapolate_helper:%bb.1 for.cond.preheader
++; CHECK: ScheduleDAGMILive::schedule starting
++; CHECK-NEXT: OnlyTopDown=0 OnlyBottomUp=0
++
++
++
++; check if the scheduling policies defined with YAML and '-misched-topdown' are applied
++; MIX-WITH-FLAG-TOPDOWN: _preextrapolate_helper:%bb.0 entry
++; MIX-WITH-FLAG-TOPDOWN: ScheduleDAGMILive::schedule starting
++; MIX-WITH-FLAG-TOPDOWN-NEXT: OnlyTopDown=1 OnlyBottomUp=0
++; MIX-WITH-FLAG-TOPDOWN: _preextrapolate_helper:%bb.1 for.cond.preheader
++; MIX-WITH-FLAG-TOPDOWN: ScheduleDAGMILive::schedule starting
++; MIX-WITH-FLAG-TOPDOWN-NEXT: OnlyTopDown=0 OnlyBottomUp=0
++
++; check if the scheduling policies defined with YAML and '-misched-bottomup' are applied
++; MIX-WITH-FLAG-BOTTOMUP: _preextrapolate_helper:%bb.0 entry
++; MIX-WITH-FLAG-BOTTOMUP: ScheduleDAGMILive::schedule starting
++; MIX-WITH-FLAG-BOTTOMUP-NEXT: OnlyTopDown=0 OnlyBottomUp=1
++; MIX-WITH-FLAG-BOTTOMUP: _preextrapolate_helper:%bb.1 for.cond.preheader
++; MIX-WITH-FLAG-BOTTOMUP: ScheduleDAGMILive::schedule starting
++; MIX-WITH-FLAG-BOTTOMUP-NEXT: OnlyTopDown=0 OnlyBottomUp=0
++
++; check if the scheduling policies defined with YAML and '-misched-topdown=false' and '-misched-bottomup=false'
++; are applied
++; MIX-WITH-FLAG-BIDIRECTIONAL: _preextrapolate_helper:%bb.0 entry
++; MIX-WITH-FLAG-BIDIRECTIONAL: ScheduleDAGMILive::schedule starting
++; MIX-WITH-FLAG-BIDIRECTIONAL-NEXT: OnlyTopDown=0 OnlyBottomUp=0
++; MIX-WITH-FLAG-BIDIRECTIONAL: _preextrapolate_helper:%bb.1 for.cond.preheader
++; MIX-WITH-FLAG-BIDIRECTIONAL: ScheduleDAGMILive::schedule starting
++; MIX-WITH-FLAG-BIDIRECTIONAL-NEXT: OnlyTopDown=0 OnlyBottomUp=0
+diff --git a/llvm/test/AutoTuning/MachineScheduler/misched_x86_bottomup.ll b/llvm/test/AutoTuning/MachineScheduler/misched_x86_bottomup.ll
+new file mode 100644
+index 000000000000..c1d6894c3fe2
+--- /dev/null
++++ b/llvm/test/AutoTuning/MachineScheduler/misched_x86_bottomup.ll
+@@ -0,0 +1,72 @@
++; RUN: rm %t.bottomup_result %t.misched_x86_bottomup.yaml -rf
++; RUN: sed 's#\[bool1\]#true#g; s#\[bool2\]#false#g' %S/Inputs/misched_x86_template.yaml > %t.misched_x86_bottomup.yaml
++; RUN: llc -o - %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \
++; RUN: -auto-tuning-input=%t.misched_x86_bottomup.yaml \
++; RUN: -verify-machineinstrs -debug-only=machine-scheduler 2>&1 \
++; RUN: | FileCheck %s
++
++; RUN: llc -o - %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \
++; RUN: -auto-tuning-input=%t.misched_x86_bottomup.yaml \
++; RUN: -verify-machineinstrs -misched-topdown -debug-only=machine-scheduler 2>&1 \
++; RUN: | FileCheck %s -check-prefix=MIX-WITH-FLAG-TOPDOWN
++
++; RUN: llc -o - %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \
++; RUN: -auto-tuning-input=%t.misched_x86_bottomup.yaml \
++; RUN: -verify-machineinstrs -misched-bottomup -debug-only=machine-scheduler 2>&1 \
++; RUN: | FileCheck %s -check-prefix=MIX-WITH-FLAG-BOTTOMUP
++
++; RUN: llc -o - %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \
++; RUN: -auto-tuning-input=%t.misched_x86_bottomup.yaml \
++; RUN: -verify-machineinstrs -misched-bottomup=false -misched-topdown=false -debug-only=machine-scheduler 2>&1 \
++; RUN: | FileCheck %s -check-prefix=MIX-WITH-FLAG-BIDIRECTIONAL
++
++; REQUIRES: asserts
++; UNSUPPORTED: windows
++;
++; Interesting MachineScheduler cases.
++
++declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
++
++define fastcc void @_preextrapolate_helper() nounwind uwtable ssp {
++entry:
++  br i1 undef, label %for.cond.preheader, label %if.end
++
++for.cond.preheader: ; preds = %entry
++  call void @llvm.memcpy.p0i8.p0i8.i64(i8* undef, i8* null, i64 128, i32 4, i1 false) nounwind
++  unreachable
++
++if.end: ; preds = %entry
++  ret void
++}
++
++; check if the scheduling policy defined with YAML is applied
++;
++; CHECK: _preextrapolate_helper:%bb.1 for.cond.preheader
++; CHECK: ScheduleDAGMILive::schedule starting
++; CHECK-NEXT: RegionPolicy: ShouldTrackPressure=0 OnlyTopDown=0 OnlyBottomUp=1
++
++
++; check if the scheduling policies defined with YAML and '-misched-topdown' are applied
++; MIX-WITH-FLAG-TOPDOWN: _preextrapolate_helper:%bb.0 entry
++; MIX-WITH-FLAG-TOPDOWN: ScheduleDAGMILive::schedule starting
++; MIX-WITH-FLAG-TOPDOWN-NEXT: OnlyTopDown=1 OnlyBottomUp=0
++; MIX-WITH-FLAG-TOPDOWN: _preextrapolate_helper:%bb.1 for.cond.preheader
++; MIX-WITH-FLAG-TOPDOWN: ScheduleDAGMILive::schedule starting
++; MIX-WITH-FLAG-TOPDOWN-NEXT: OnlyTopDown=0 OnlyBottomUp=1
++
++; check if the scheduling policies defined with YAML and '-misched-bottomup' are applied
++; MIX-WITH-FLAG-BOTTOMUP: _preextrapolate_helper:%bb.0 entry
++; MIX-WITH-FLAG-BOTTOMUP: ScheduleDAGMILive::schedule starting
++; MIX-WITH-FLAG-BOTTOMUP-NEXT: OnlyTopDown=0 OnlyBottomUp=1
++; MIX-WITH-FLAG-BOTTOMUP: _preextrapolate_helper:%bb.1 for.cond.preheader
++; MIX-WITH-FLAG-BOTTOMUP: ScheduleDAGMILive::schedule starting
++; MIX-WITH-FLAG-BOTTOMUP-NEXT: OnlyTopDown=0 OnlyBottomUp=1
++
++; check if the scheduling policies defined with YAML and '-misched-topdown=false' and '-misched-bottomup=false'
++; are applied
++; MIX-WITH-FLAG-BIDIRECTIONAL: _preextrapolate_helper:%bb.0 entry
++; MIX-WITH-FLAG-BIDIRECTIONAL: ScheduleDAGMILive::schedule starting
++; MIX-WITH-FLAG-BIDIRECTIONAL-NEXT: RegionPolicy: ShouldTrackPressure=0 OnlyTopDown=0 OnlyBottomUp=0
++; MIX-WITH-FLAG-BIDIRECTIONAL: _preextrapolate_helper:%bb.1 for.cond.preheader
++; MIX-WITH-FLAG-BIDIRECTIONAL: ScheduleDAGMILive::schedule starting
++; MIX-WITH-FLAG-BIDIRECTIONAL-NEXT: OnlyTopDown=0 OnlyBottomUp=1
+diff --git a/llvm/test/AutoTuning/MachineScheduler/misched_x86_topdown.ll b/llvm/test/AutoTuning/MachineScheduler/misched_x86_topdown.ll
+new file mode 100644
+index 000000000000..53c527e87e41
+--- /dev/null
++++ b/llvm/test/AutoTuning/MachineScheduler/misched_x86_topdown.ll
+@@ -0,0 +1,72 @@
++; RUN: rm %t.topdown_result %t.misched_x86_topdown.yaml -rf
++; RUN: sed 's#\[bool1\]#false#g; s#\[bool2\]#true#g' %S/Inputs/misched_x86_template.yaml > %t.misched_x86_topdown.yaml
++; RUN: llc -o - %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \
++; RUN: -auto-tuning-input=%t.misched_x86_topdown.yaml \
++; RUN: -verify-machineinstrs -debug-only=machine-scheduler 2>&1 \
++; RUN: | FileCheck %s
++
++; RUN: llc -o - %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \
++; RUN: -auto-tuning-input=%t.misched_x86_topdown.yaml \
++; RUN: -verify-machineinstrs -misched-topdown -debug-only=machine-scheduler 2>&1 \
++; RUN: | FileCheck %s -check-prefix=MIX-WITH-FLAG-TOPDOWN
++
++; RUN: llc -o - %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \
++; RUN: -auto-tuning-input=%t.misched_x86_topdown.yaml \
++; RUN: -verify-machineinstrs -misched-bottomup -debug-only=machine-scheduler 2>&1 \
++; RUN: | FileCheck %s -check-prefix=MIX-WITH-FLAG-BOTTOMUP
++
++; RUN: llc -o - %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \
++; RUN: -auto-tuning-input=%t.misched_x86_topdown.yaml \
++; RUN: -verify-machineinstrs -misched-bottomup=false -misched-topdown=false -debug-only=machine-scheduler 2>&1 \
++; RUN: | FileCheck %s -check-prefix=MIX-WITH-FLAG-BIDIRECTIONAL
++
++; REQUIRES: asserts
++; UNSUPPORTED: windows
++;
++; Interesting MachineScheduler cases.
++
++declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
++
++define fastcc void @_preextrapolate_helper() nounwind uwtable ssp {
++entry:
++  br i1 undef, label %for.cond.preheader, label %if.end
++
++for.cond.preheader: ; preds = %entry
++  call void @llvm.memcpy.p0i8.p0i8.i64(i8* undef, i8* null, i64 128, i32 4, i1 false) nounwind
++  unreachable
++
++if.end: ; preds = %entry
++  ret void
++}
++
++; check if the scheduling policy defined with YAML is applied
++;
++; CHECK: _preextrapolate_helper:%bb.1 for.cond.preheader
++; CHECK: ScheduleDAGMILive::schedule starting
++; CHECK-NEXT: OnlyTopDown=1 OnlyBottomUp=0
++
++
++; check if the scheduling policies defined with YAML and '-misched-topdown' are applied
++; MIX-WITH-FLAG-TOPDOWN: _preextrapolate_helper:%bb.0 entry
++; MIX-WITH-FLAG-TOPDOWN: ScheduleDAGMILive::schedule starting
++; MIX-WITH-FLAG-TOPDOWN-NEXT: OnlyTopDown=1 OnlyBottomUp=0
++; MIX-WITH-FLAG-TOPDOWN: _preextrapolate_helper:%bb.1 for.cond.preheader
++; MIX-WITH-FLAG-TOPDOWN: ScheduleDAGMILive::schedule starting
++; MIX-WITH-FLAG-TOPDOWN-NEXT: OnlyTopDown=1 OnlyBottomUp=0
++
++; check if the scheduling policies defined with YAML and '-misched-bottomup' are applied
++; MIX-WITH-FLAG-BOTTOMUP: _preextrapolate_helper:%bb.0 entry
++; MIX-WITH-FLAG-BOTTOMUP: ScheduleDAGMILive::schedule starting
++; MIX-WITH-FLAG-BOTTOMUP-NEXT: OnlyTopDown=0 OnlyBottomUp=1
++; MIX-WITH-FLAG-BOTTOMUP: _preextrapolate_helper:%bb.1 for.cond.preheader
++; MIX-WITH-FLAG-BOTTOMUP: ScheduleDAGMILive::schedule starting
++; MIX-WITH-FLAG-BOTTOMUP-NEXT: OnlyTopDown=1 OnlyBottomUp=0
++
++; check if the scheduling policies defined with YAML and '-misched-topdown=false' and '-misched-bottomup=false'
++; are applied
++; MIX-WITH-FLAG-BIDIRECTIONAL: _preextrapolate_helper:%bb.0 entry
++; MIX-WITH-FLAG-BIDIRECTIONAL: ScheduleDAGMILive::schedule starting
++; MIX-WITH-FLAG-BIDIRECTIONAL-NEXT: OnlyTopDown=0 OnlyBottomUp=0
++; MIX-WITH-FLAG-BIDIRECTIONAL: _preextrapolate_helper:%bb.1 for.cond.preheader
++; MIX-WITH-FLAG-BIDIRECTIONAL: ScheduleDAGMILive::schedule starting
++; MIX-WITH-FLAG-BIDIRECTIONAL-NEXT: OnlyTopDown=1 OnlyBottomUp=0
+diff --git a/llvm/test/AutoTuning/MetaData/structural_hash.ll b/llvm/test/AutoTuning/MetaData/structural_hash.ll
+new file mode 100644
+index 000000000000..2d8adca910bc
+--- /dev/null
++++ b/llvm/test/AutoTuning/MetaData/structural_hash.ll
+@@ -0,0 +1,234 @@
++; RUN: rm %t.hash_opp -rf
++; RUN: opt %s -S -passes='function(require<opt-remark-emit>,loop-unroll),cgscc(inline)' \
++; RUN: -auto-tuning-opp=%t.hash_opp -auto-tuning-type-filter=CallSite --disable-output
++; RUN: FileCheck %s --input-file %t.hash_opp/structural_hash.ll.yaml -check-prefix=META-CALL1
++; RUN: FileCheck %s --input-file %t.hash_opp/structural_hash.ll.yaml -check-prefix=META-CALL2
++; RUN: FileCheck %s --input-file %t.hash_opp/structural_hash.ll.yaml -check-prefix=META-CALL3
++
++; RUN: rm %t.hash_opp -rf
++; RUN: opt %s -S -passes='function(require<opt-remark-emit>,loop-unroll),cgscc(inline)' \
++; RUN: -auto-tuning-type-filter=CallSite -auto-tuning-opp=%t.hash_opp \
++; RUN: -auto-tuning-omit-metadata --disable-output
++; RUN: FileCheck %s --input-file %t.hash_opp/structural_hash.ll.yaml -check-prefix=NO-META-CALL
++
++; UNSUPPORTED: windows
++
++; ModuleID = 'loop_small.cpp'
++source_filename = "loop_small.cpp"
++target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
++target triple = "aarch64-unknown-linux-gnu"
++
++@arr = dso_local global [1000000 x i32] zeroinitializer, align 4, !dbg !0
++
++; Function Attrs: nounwind uwtable mustprogress
++define dso_local void @_Z1fv() #0 !dbg !18 {
++entry:
++  %i = alloca i32, align 4
++  call void @llvm.dbg.declare(metadata i32* %i, metadata !21, metadata !DIExpression()), !dbg !23
++  store i32 0, i32* %i, align 4, !dbg !23
++  br label %for.cond, !dbg !24
++
++for.cond: ; preds = %for.inc, %entry
++  %0 = load i32, i32* %i, align 4, !dbg !25
++  %cmp = icmp slt i32 %0, 2000, !dbg !27
++  br i1 %cmp, label %for.body, label %for.end, !dbg !28
++
++for.body: ; preds = %for.cond
++  %1 = load i32, i32* %i, align 4, !dbg !29
++  %idxprom = sext i32 %1 to i64, !dbg !31
++  %arrayidx = getelementptr inbounds [1000000 x i32], [1000000 x i32]* @arr, i64 0, i64 %idxprom, !dbg !31
++  %2 = load i32, i32* %arrayidx, align 4, !dbg !32
++  %add = add nsw i32 %2, 2, !dbg !32
++  store i32 %add, i32* %arrayidx, align 4, !dbg !32
++  br label %for.inc, !dbg !33
++
++for.inc: ; preds = %for.body
++  %3 = load i32, i32* %i, align 4, !dbg !34
++  %inc = add nsw i32 %3, 1, !dbg !34
++  store i32 %inc, i32* %i, align 4, !dbg !34
++  br label %for.cond, !dbg !35, !llvm.loop !36
++
++for.end: ; preds = %for.cond
++  ret void, !dbg !39
++}
++
++; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
++declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
++
++; Function Attrs: nounwind uwtable mustprogress
++define dso_local void @_Z1gv() #0 !dbg !40 {
++entry:
++  %0 = load i32, i32* getelementptr inbounds ([1000000 x i32], [1000000 x i32]* @arr, i64 0, i64 0), align 4, !dbg !41
++  %inc = add nsw i32 %0, 1, !dbg !41
++  store i32 %inc, i32* getelementptr inbounds ([1000000 x i32], [1000000 x i32]* @arr, i64 0, i64 0), align 4, !dbg !41
++  ret void, !dbg !42
++}
++
++; Function Attrs: norecurse nounwind uwtable mustprogress
++define dso_local i32 @main() #2 !dbg !43 {
++entry:
++  %retval = alloca i32, align 4
++  %i = alloca i32, align 4
++  store i32 0, i32* %retval, align 4
++  call void @llvm.dbg.declare(metadata i32* %i, metadata !46, metadata !DIExpression()), !dbg !48
++  store i32 0, i32* %i, align 4, !dbg !48
++  br label %for.cond, !dbg !49
++
++for.cond: ; preds = %for.inc, %entry
++  %0 = load i32, i32* %i, align 4, !dbg !50
++  %cmp = icmp slt i32 %0, 1000000, !dbg !52
++  br i1 %cmp, label %for.body, label %for.end, !dbg !53
++
++for.body: ; preds = %for.cond
++  %1 = load i32, i32* %i, align 4, !dbg !54
++  %idxprom = sext i32 %1 to i64, !dbg !55
++  %arrayidx = getelementptr inbounds [1000000 x i32], [1000000 x i32]* @arr, i64 0, i64 %idxprom, !dbg !55
++  store i32 0, i32* %arrayidx, align 4, !dbg !56
++  br label %for.inc, !dbg !55
++
++for.inc: ; preds = %for.body
++  %2 = load i32, i32* %i, align 4, !dbg !57
++  %inc = add nsw i32 %2, 1, !dbg !57
++  store i32 %inc, i32* %i, align 4, !dbg !57
++  br label %for.cond, !dbg !58, !llvm.loop !59
++
++for.end: ; preds = %for.cond
++  call void @_Z1fv(), !dbg !61
++  call void @_Z1gv(), !dbg !62
++  call void @_Z1fv(), !dbg !63
++  %3 = load i32, i32* %retval, align 4, !dbg !64
++  ret i32 %3, !dbg !64
++}
++
++attributes #0 = { nounwind uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
++attributes #2 = { norecurse nounwind uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
++
++!llvm.dbg.cu = !{!2}
++!llvm.module.flags = !{!10, !11, !12, !13, !14, !15, !16}
++!llvm.ident = !{!17}
++
++!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
++!1 = distinct !DIGlobalVariable(name: "arr", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true)
++!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "Huawei Bisheng Compiler clang version 12.0.0 (clang-6d7704116510 flang-6d7704116510)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, splitDebugInlining: false, nameTableKind: None)
++!3 = !DIFile(filename: "loop_small.cpp", directory: "/home/g84189222/boole3/llvm-project/tuneTest")
++!4 = !{}
++!5 = !{!0}
++!6 = !DICompositeType(tag: DW_TAG_array_type, baseType: !7, size: 32000000, elements: !8)
++!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
++!8 = !{!9}
++!9 = !DISubrange(count: 1000000)
++!10 = !{i32 7, !"Dwarf Version", i32 4}
++!11 = !{i32 2, !"Debug Info Version", i32 3}
++!12 = !{i32 1, !"wchar_size", i32 4}
++!13 = !{i32 1, !"branch-target-enforcement", i32 0}
++!14 = !{i32 1, !"sign-return-address", i32 0}
++!15 = !{i32 1, !"sign-return-address-all", i32 0}
++!16 = !{i32 1, !"sign-return-address-with-bkey", i32 0}
++!17 = !{!"Huawei Bisheng Compiler clang version 12.0.0 (clang-6d7704116510 flang-6d7704116510)"}
++!18 = distinct !DISubprogram(name: "f", linkageName: "_Z1fv", scope: !3, file: !3, line: 3, type: !19, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !2, retainedNodes: !4)
++!19 = !DISubroutineType(types: !20)
++!20 = !{null}
++!21 = !DILocalVariable(name: "i", scope: !22, file: !3, line: 4, type: !7)
++!22 = distinct !DILexicalBlock(scope: !18, file: !3, line: 4, column: 2)
++!23 = !DILocation(line: 4, column: 10, scope: !22)
++!24 = !DILocation(line: 4, column: 6, scope: !22)
++!25 = !DILocation(line: 4, column: 15, scope: !26)
++!26 = distinct !DILexicalBlock(scope: !22, file: !3, line: 4, column: 2)
++!27 = !DILocation(line: 4, column: 16, scope: !26)
++!28 = !DILocation(line: 4, column: 2, scope: !22)
++!29 = !DILocation(line: 5, column: 7, scope: !30)
++!30 = distinct !DILexicalBlock(scope: !26, file: !3, line: 4, column: 27)
++!31 = !DILocation(line: 5, column: 3, scope: !30)
++!32 = !DILocation(line: 5, column: 10, scope: !30)
++!33 = !DILocation(line: 6, column: 2, scope: !30)
++!34 = !DILocation(line: 4, column: 24, scope: !26)
++!35 = !DILocation(line: 4, column: 2, scope: !26)
++!36 = distinct !{!36, !28, !37, !38}
++!37 = !DILocation(line: 6, column: 2, scope: !22)
++!38 = !{!"llvm.loop.mustprogress"}
++!39 = !DILocation(line: 7, column: 1, scope: !18)
++!40 = distinct !DISubprogram(name: "g", linkageName: "_Z1gv", scope: !3, file: !3, line: 8, type: !19, scopeLine: 8, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !2, retainedNodes: !4)
++!41 = !DILocation(line: 9, column: 8, scope: !40)
++!42 = !DILocation(line: 10, column: 1, scope: !40)
++!43 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 12, type: !44, scopeLine: 12, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !2, retainedNodes: !4)
++!44 = !DISubroutineType(types: !45)
++!45 = !{!7}
++!46 = !DILocalVariable(name: "i", scope: !47, file: !3, line: 13, type: !7)
++!47 = distinct !DILexicalBlock(scope: !43, file: !3, line: 13, column: 2)
++!48 = !DILocation(line: 13, column: 10, scope: !47)
++!49 = !DILocation(line: 13, column: 6, scope: !47)
++!50 = !DILocation(line: 13, column: 15, scope: !51)
++!51 = distinct !DILexicalBlock(scope: !47, file: !3, line: 13, column: 2)
++!52 = !DILocation(line: 13, column: 16, scope: !51)
++!53 = !DILocation(line: 13, column: 2, scope: !47)
++!54 = !DILocation(line: 13, column: 35, scope: !51)
++!55 = !DILocation(line: 13, column: 31, scope: !51)
++!56 = !DILocation(line: 13, column: 38, scope: !51)
++!57 = !DILocation(line: 13, column: 27, scope: !51)
++!58 = !DILocation(line: 13, column: 2, scope: !51)
++!59 = distinct !{!59, !53, !60, !38}
++!60 = !DILocation(line: 13, column: 40, scope: !47)
++!61 = !DILocation(line: 14, column: 2, scope: !43)
++!62 = !DILocation(line: 15, column: 2, scope: !43)
++!63 = !DILocation(line: 16, column: 2, scope: !43)
++!64 = !DILocation(line: 17, column: 1, scope: !43)
++
++; META-CALL1: --- !AutoTuning
++; META-CALL1: Pass: inline
++; META-CALL1: Name: _Z1fv
++; META-CALL1: DebugLoc: { File: loop_small.cpp, Line: 14, Column: 2 }
++; META-CALL1-NEXT: Function: main
++; META-CALL1-NEXT: CodeRegionType: callsite
++; META-CALL1-NEXT: CodeRegionHash: {{[0-9]+}}
++; META-CALL1-NEXT: DynamicConfigs: { ForceInline: [0, 1] }
++; META-CALL1-NEXT: BaselineConfig: { ForceInline: '1' }
++; META-CALL1-NEXT: Invocation: 0
++; META-CALL1-NEXT: ...
++; META-CALL2: --- !AutoTuning
++; META-CALL2: Pass: inline
++; META-CALL2: Name: _Z1fv
++; META-CALL2: DebugLoc: { File: loop_small.cpp, Line: 16, Column: 2 }
++; META-CALL2-NEXT: Function: main
++; META-CALL2-NEXT: CodeRegionType: callsite
++; META-CALL2-NEXT: CodeRegionHash: {{[0-9]+}}
++; META-CALL2-NEXT: DynamicConfigs: { ForceInline: [0, 1] }
++; META-CALL2-NEXT: BaselineConfig: { ForceInline: '1' }
++; META-CALL2-NEXT: Invocation: 0
++; META-CALL2-NEXT: ...
++; META-CALL3: --- !AutoTuning
++; META-CALL3: Pass: inline
++; META-CALL3: Name: _Z1gv
++; META-CALL3: DebugLoc: { File: loop_small.cpp, Line: 15, Column: 2 }
++; META-CALL3-NEXT: Function: main
++; META-CALL3-NEXT: CodeRegionType: callsite
++; META-CALL3-NEXT: CodeRegionHash: {{[0-9]+}}
++; META-CALL3-NEXT: DynamicConfigs: { ForceInline: [0, 1] }
++; META-CALL3-NEXT: BaselineConfig: { ForceInline: '1' }
++; META-CALL3-NEXT: Invocation: 0
++; META-CALL3-NEXT: ...
++
++; NO-META-CALL: --- !AutoTuning
++; NO-META-CALL-NEXT: Pass: inline
++; NO-META-CALL-NEXT: CodeRegionType: callsite
++; NO-META-CALL-NEXT: CodeRegionHash: {{[0-9]+}}
++; NO-META-CALL-NEXT: DynamicConfigs: { ForceInline: [0, 1] }
++; NO-META-CALL-NEXT: BaselineConfig: { ForceInline: '1' }
++; NO-META-CALL-NEXT: Invocation: 0
++; NO-META-CALL-NEXT: ...
++; NO-META-CALL-NEXT: --- !AutoTuning
++; NO-META-CALL-NEXT: Pass: inline
++; NO-META-CALL-NEXT: CodeRegionType: callsite
++; NO-META-CALL-NEXT: CodeRegionHash: {{[0-9]+}}
++; NO-META-CALL-NEXT: DynamicConfigs: { ForceInline: [0, 1] }
++; NO-META-CALL-NEXT: BaselineConfig: { ForceInline: '1' }
++; NO-META-CALL-NEXT: Invocation: 0
++; NO-META-CALL-NEXT: ...
++; NO-META-CALL-NEXT: --- !AutoTuning
++; NO-META-CALL-NEXT: Pass: inline
++; NO-META-CALL-NEXT: CodeRegionType: callsite
++; NO-META-CALL-NEXT: CodeRegionHash: {{[0-9]+}}
++; NO-META-CALL-NEXT: DynamicConfigs: { ForceInline: [0, 1] }
++; NO-META-CALL-NEXT: BaselineConfig: { ForceInline: '1' }
++; NO-META-CALL-NEXT: Invocation: 0
++; NO-META-CALL-NEXT: ...
+diff --git a/llvm/test/AutoTuning/MetaData/write_no_metadata.ll b/llvm/test/AutoTuning/MetaData/write_no_metadata.ll
+new file mode 100644
+index 000000000000..344a3548a74f
+--- /dev/null
++++ b/llvm/test/AutoTuning/MetaData/write_no_metadata.ll
+@@ -0,0 +1,191 @@
++; REQUIRES: x86-registered-target
++; RUN: rm %t.default_opp -rf
++; RUN: opt %s -S -auto-tuning-opp=%t.default_opp -auto-tuning-omit-metadata=1 \
++; RUN: -passes='require<opt-remark-emit>,loop(loop-unroll-full)' --disable-output
++; RUN: FileCheck %s --input-file %t.default_opp/write_no_metadata.ll.yaml -check-prefix=DEFAULT
++
++; RUN: rm %t.module_opp -rf
++; RUN: opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \
++; RUN: -auto-tuning-opp=%t.module_opp -auto-tuning-type-filter=Other \
++; RUN: -auto-tuning-omit-metadata=1 --disable-output
++; RUN: FileCheck %s --input-file %t.module_opp/write_no_metadata.ll.yaml -check-prefix=OTHER
++
++; RUN: rm %t.loop_opp -rf
++; RUN: opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \
++; RUN: -auto-tuning-opp=%t.loop_opp -auto-tuning-type-filter=Loop \
++; RUN: -auto-tuning-omit-metadata=1 --disable-output
++; RUN: FileCheck %s --input-file %t.loop_opp/write_no_metadata.ll.yaml -check-prefix=LOOP
++
++; RUN: rm %t.function_opp -rf
++; RUN: opt %s -S -passes='function(require<opt-remark-emit>,loop-unroll),cgscc(inline)' \
++; RUN: -auto-tuning-opp=%t.function_opp -auto-tuning-type-filter=CallSite \
++; RUN: -auto-tuning-omit-metadata=1 --disable-output
++; RUN: FileCheck %s --input-file %t.function_opp/write_no_metadata.ll.yaml -check-prefix=CALLSITE
++
++; RUN: rm %t.function_loop_opp -rf
++; RUN: opt %s -S -passes='function(require<opt-remark-emit>,loop-unroll),cgscc(inline)' \
++; RUN: -auto-tuning-opp=%t.function_loop_opp -auto-tuning-omit-metadata=1 \
++; RUN: -auto-tuning-type-filter=CallSite,Loop --disable-output
++; RUN: FileCheck %s --input-file %t.function_loop_opp/write_no_metadata.ll.yaml -check-prefix=CALLSITE-LOOP1
++; RUN: FileCheck %s --input-file %t.function_loop_opp/write_no_metadata.ll.yaml -check-prefix=CALLSITE-LOOP2
++
++; UNSUPPORTED: windows
++
++; ModuleID = 'loop-opp.c'
++source_filename = "loop-opp.c"
++target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
++target triple = "x86_64-unknown-linux-gnu"
++
++; Function Attrs: noinline nounwind uwtable
++define i32 @test(i32* %n) #0 !dbg !6 {
++entry:
++  call void @callee(i32 6), !dbg !18
++  %n.addr = alloca i32*, align 8
++  %b = alloca i32, align 4
++  %i = alloca i32, align 4
++  store i32* %n, i32** %n.addr, align 8
++  call void @llvm.dbg.declare(metadata i32** %n.addr, metadata !11, metadata !12), !dbg !13
++  call void @llvm.dbg.declare(metadata i32* %b, metadata !14, metadata !12), !dbg !15
++  store i32 0, i32* %b, align 4, !dbg !15
++  call void @llvm.dbg.declare(metadata i32* %i, metadata !16, metadata !12), !dbg !18
++  store i32 0, i32* %i, align 4, !dbg !18
++  br label %for.cond, !dbg !19
++
++for.cond: ; preds = %for.inc, %entry
++  %0 = load i32, i32* %i, align 4, !dbg !20
++  %1 = load i32*, i32** %n.addr, align 8, !dbg !23
++  %2 = load i32, i32* %1, align 4, !dbg !24
++  %cmp = icmp slt i32 %0, %2, !dbg !25
++  br i1 %cmp, label %for.body, label %for.end, !dbg !26
++
++for.body: ; preds = %for.cond
++  %3 = load i32, i32* %b, align 4, !dbg !28
++  %add = add nsw i32 %3, 1, !dbg !30
++  store i32 %add, i32* %b, align 4, !dbg !31
++  br label %for.inc, !dbg !32
++
++for.inc: ; preds = %for.body
++  %4 = load i32, i32* %i, align 4, !dbg !33
++  %inc = add nsw i32 %4, 1, !dbg !33
++  store i32 %inc, i32* %i, align 4, !dbg !33
++  br label %for.cond, !dbg !35, !llvm.loop !36
++
++for.end: ; preds = %for.cond
++  %5 = load i32, i32* %b, align 4, !dbg !39
++  ret i32 %5, !dbg !40
++}
++
++@a = global i32 4
++define void @callee(i32 %a) #2 {
++entry:
++  %a1 = load volatile i32, i32* @a
++  %x1 = add i32 %a1, %a1
++  %add = add i32 %x1, %a
++  ret void
++}
++
++; Function Attrs: nounwind readnone
++declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
++
++attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #1 = { nounwind readnone }
++attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
++
++!llvm.dbg.cu = !{!0}
++!llvm.module.flags = !{!3, !4}
++!llvm.ident = !{!5}
++
++!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "" ,isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
++!1 = !DIFile(filename: "loop-opp.c", directory: "")
++!2 = !{}
++!3 = !{i32 2, !"Dwarf Version", i32 4}
++!4 = !{i32 2, !"Debug Info Version", i32 3}
++!5 = !{!""}
++!6 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
++!7 = !DISubroutineType(types: !8)
++!8 = !{!9, !10}
++!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
++!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64)
++!11 = !DILocalVariable(name: "n", arg: 1, scope: !6, file: !1, line: 1, type: !10)
++!12 = !DIExpression()
++!13 = !DILocation(line: 1, column: 20, scope: !6)
++!14 = !DILocalVariable(name: "b", scope: !6, file: !1, line: 3, type: !9)
++!15 = !DILocation(line: 3, column: 9, scope: !6)
++!16 = !DILocalVariable(name: "i", scope: !17, file: !1, line: 4, type: !9)
++!17 = distinct !DILexicalBlock(scope: !6, file: !1, line: 4, column: 5)
++!18 = !DILocation(line: 4, column: 14, scope: !17)
++!19 = !DILocation(line: 4, column: 10, scope: !17)
++!20 = !DILocation(line: 4, column: 20, scope: !21)
++!21 = !DILexicalBlockFile(scope: !22, file: !1, discriminator: 1)
++!22 = distinct !DILexicalBlock(scope: !17, file: !1, line: 4, column: 5)
++!23 = !DILocation(line: 4, column: 25, scope: !21)
++!24 = !DILocation(line: 4, column: 24, scope: !21)
++!25 = !DILocation(line: 4, column: 22, scope: !21)
++!26 = !DILocation(line: 4, column: 5, scope: !27)
++!27 = !DILexicalBlockFile(scope: !17, file: !1, discriminator: 1)
++!28 = !DILocation(line: 6, column: 11, scope: !29)
++!29 = distinct !DILexicalBlock(scope: !22, file: !1, line: 5, column: 5)
++!30 = !DILocation(line: 6, column: 12, scope: !29)
++!31 = !DILocation(line: 6, column: 9, scope: !29)
++!32 = !DILocation(line: 7, column: 5, scope: !29)
++!33 = !DILocation(line: 4, column: 28, scope: !34)
++!34 = !DILexicalBlockFile(scope: !22, file: !1, discriminator: 2)
++!35 = !DILocation(line: 4, column: 5, scope: !34)
++!36 = distinct !{!36, !37, !38}
++!37 = !DILocation(line: 4, column: 5, scope: !17)
++!38 = !DILocation(line: 7, column: 5, scope: !17)
++!39 = !DILocation(line: 8, column: 12, scope: !6)
++!40 = !DILocation(line: 8, column: 5, scope: !6)
++
++; DEFAULT: --- !AutoTuning
++; DEFAULT-NEXT: Pass: loop-unroll
++; DEFAULT-NEXT: CodeRegionType: loop
++; DEFAULT-NEXT: CodeRegionHash: {{[0-9]+}}
++; COM: Clang generates dynamic values for UnrollCount, so we use a regex
++; DEFAULT-NEXT: DynamicConfigs: { UnrollCount: {{\[[0-9]+(, [0-9]+)*\]}} }
++; DEFAULT-NEXT: BaselineConfig: { UnrollCount: '{{[0-9]+}}' }
++; DEFAULT-NEXT: Invocation: 0
++; DEFAULT-NEXT: ...
++; DEFAULT-NEXT: --- !AutoTuning
++; DEFAULT-NEXT: Pass: all
++; DEFAULT-NEXT: CodeRegionType: other
++; COM: Module-level hashes can differ based on the file path, so we check a regex
++; DEFAULT-NEXT: CodeRegionHash: {{[0-9]+}}
++; DEFAULT-NEXT: DynamicConfigs: { }
++; DEFAULT-NEXT: BaselineConfig: { }
++; DEFAULT-NEXT: Invocation: 0
++; DEFAULT-NEXT: ...
++
++; LOOP: --- !AutoTuning
++; LOOP-NEXT: Pass: loop-unroll
++; LOOP-NEXT: CodeRegionType: loop
++; LOOP-NEXT: CodeRegionHash: {{[0-9]+}}
++; COM: Clang generates dynamic values for UnrollCount, so we use a regex
++; LOOP-NEXT: DynamicConfigs: { UnrollCount: {{\[[0-9]+(, [0-9]+)*\]}} }
++; LOOP-NEXT: BaselineConfig: { UnrollCount: '{{[0-9]+}}' }
++; LOOP-NEXT: Invocation: 0
++; LOOP-NEXT: ...
++
++; CALLSITE: --- !AutoTuning
++; CALLSITE-NEXT: Pass: inline
++; CALLSITE-NEXT: CodeRegionType: callsite
++; CALLSITE-NEXT: CodeRegionHash: {{[0-9]+}}
++; CALLSITE-NEXT: DynamicConfigs: { ForceInline: [0, 1] }
++; CALLSITE-NEXT: BaselineConfig: { ForceInline: '1' }
++; CALLSITE-NEXT: Invocation: 0
++; CALLSITE-NEXT: ...
++
++; CALLSITE-LOOP1: CodeRegionType: loop
++; CALLSITE-LOOP1-NOT: CodeRegionType: other
++; CALLSITE-LOOP2: CodeRegionType: callsite
++; CALLSITE-LOOP2-NOT: CodeRegionType: other
++
++; OTHER: --- !AutoTuning
++; OTHER-NEXT: Pass: all
++; OTHER-NEXT: CodeRegionType: other
++; COM: Module-level hashes can differ based on the file path, so we check a regex
++; OTHER-NEXT: CodeRegionHash: {{[0-9]+}}
++; OTHER-NEXT: DynamicConfigs: { }
++; OTHER-NEXT: BaselineConfig: { }
++; OTHER-NEXT: Invocation: 0
++; OTHER-NEXT: ...
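To make the CHECK blocks above concrete: with -auto-tuning-omit-metadata=1 each emitted record omits the Name, DebugLoc, and Function fields, so a loop opportunity reduces to the fields the DEFAULT prefix verifies. A sketch of one such record follows; the hash and the candidate unroll counts are illustrative values, not the output of any particular run:

--- !AutoTuning
Pass: loop-unroll
CodeRegionType: loop
CodeRegionHash: 12345678901234567890
DynamicConfigs: { UnrollCount: [0, 4, 8] }
BaselineConfig: { UnrollCount: '0' }
Invocation: 0
...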
+diff --git a/llvm/test/AutoTuning/MetaData/write_with_metadata.ll b/llvm/test/AutoTuning/MetaData/write_with_metadata.ll
+new file mode 100644
+index 000000000000..8b7ee9dcce37
+--- /dev/null
++++ b/llvm/test/AutoTuning/MetaData/write_with_metadata.ll
+@@ -0,0 +1,204 @@
++; REQUIRES: x86-registered-target
++; RUN: rm %t.default_opp -rf
++; RUN: opt %s -S -auto-tuning-opp=%t.default_opp -auto-tuning-omit-metadata=0 \
++; RUN: -passes='require<opt-remark-emit>,loop(loop-unroll-full)' --disable-output
++; RUN: FileCheck %s --input-file %t.default_opp/write_with_metadata.ll.yaml -check-prefix=DEFAULT
++
++; RUN: rm %t.module_opp -rf
++; RUN: opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \
++; RUN: -auto-tuning-opp=%t.module_opp -auto-tuning-type-filter=Other \
++; RUN: -auto-tuning-omit-metadata=0 --disable-output
++; RUN: FileCheck %s --input-file %t.module_opp/write_with_metadata.ll.yaml -check-prefix=OTHER
++
++; RUN: rm %t.loop_opp -rf
++; RUN: opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \
++; RUN: -auto-tuning-opp=%t.loop_opp -auto-tuning-type-filter=Loop \
++; RUN: -auto-tuning-omit-metadata=0 --disable-output
++; RUN: FileCheck %s --input-file %t.loop_opp/write_with_metadata.ll.yaml -check-prefix=LOOP
++
++; RUN: rm %t.function_opp -rf
++; RUN: opt %s -S -passes='function(require<opt-remark-emit>,loop-unroll),cgscc(inline)' \
++; RUN: -auto-tuning-opp=%t.function_opp -auto-tuning-type-filter=CallSite \
++; RUN: -auto-tuning-omit-metadata=0 --disable-output
++; RUN: FileCheck %s --input-file %t.function_opp/write_with_metadata.ll.yaml -check-prefix=CALLSITE
++
++; RUN: rm %t.function_loop_opp -rf
++; RUN: opt %s -S -passes='function(require<opt-remark-emit>,loop-unroll),cgscc(inline)' \
++; RUN: -auto-tuning-opp=%t.function_loop_opp -auto-tuning-type-filter=CallSite,Loop \
++; RUN: -auto-tuning-omit-metadata=0 --disable-output
++; RUN: FileCheck %s --input-file %t.function_loop_opp/write_with_metadata.ll.yaml -check-prefix=CALLSITE-LOOP1
++; RUN: FileCheck %s --input-file %t.function_loop_opp/write_with_metadata.ll.yaml -check-prefix=CALLSITE-LOOP2
++
++; UNSUPPORTED: windows
++
++; ModuleID = 'loop-opp.c'
++source_filename = "loop-opp.c"
++target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
++target triple = "x86_64-unknown-linux-gnu"
++
++; Function Attrs: noinline nounwind uwtable
++define i32 @test(i32* %n) #0 !dbg !6 {
++entry:
++  call void @callee(i32 6), !dbg !18
++  %n.addr = alloca i32*, align 8
++  %b = alloca i32, align 4
++  %i = alloca i32, align 4
++  store i32* %n, i32** %n.addr, align 8
++  call void @llvm.dbg.declare(metadata i32** %n.addr, metadata !11, metadata !12), !dbg !13
++  call void @llvm.dbg.declare(metadata i32* %b, metadata !14, metadata !12), !dbg !15
++  store i32 0, i32* %b, align 4, !dbg !15
++  call void @llvm.dbg.declare(metadata i32* %i, metadata !16, metadata !12), !dbg !18
++  store i32 0, i32* %i, align 4, !dbg !18
++  br label %for.cond, !dbg !19
++
++for.cond: ; preds = %for.inc, %entry
++  %0 = load i32, i32* %i, align 4, !dbg !20
++  %1 = load i32*, i32** %n.addr, align 8, !dbg !23
++  %2 = load i32, i32* %1, align 4, !dbg !24
++  %cmp = icmp slt i32 %0, %2, !dbg !25
++  br i1 %cmp, label %for.body, label %for.end, !dbg !26
++
++for.body: ; preds = %for.cond
++  %3 = load i32, i32* %b, align 4, !dbg !28
++  %add = add nsw i32 %3, 1, !dbg !30
++  store i32 %add, i32* %b, align 4, !dbg !31
++  br label %for.inc, !dbg !32
++
++for.inc: ; preds = %for.body
++  %4 = load i32, i32* %i, align 4, !dbg !33
++  %inc = add nsw i32 %4, 1, !dbg !33
++  store i32 %inc, i32* %i, align 4, !dbg !33
++  br label %for.cond, !dbg !35, !llvm.loop !36
++
++for.end: ; preds = %for.cond
++  %5 = load i32, i32* %b, align 4, !dbg !39
++  ret i32 %5, !dbg !40
++}
++
++@a = global i32 4
++define void @callee(i32 %a) #2 {
++entry:
++  %a1 = load volatile i32, i32* @a
++  %x1 = add i32 %a1, %a1
++  %add = add i32 %x1, %a
++  ret void
++}
++
++; Function Attrs: nounwind readnone
++declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
++
++attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #1 = { nounwind readnone }
++attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
++
++!llvm.dbg.cu = !{!0}
++!llvm.module.flags = !{!3, !4}
++!llvm.ident = !{!5}
++
++!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "" ,isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
++!1 = !DIFile(filename: "loop-opp.c", directory: "")
++!2 = !{}
++!3 = !{i32 2, !"Dwarf Version", i32 4}
++!4 = !{i32 2, !"Debug Info Version", i32 3}
++!5 = !{!""}
++!6 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
++!7 = !DISubroutineType(types: !8)
++!8 = !{!9, !10}
++!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
++!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64)
++!11 = !DILocalVariable(name: "n", arg: 1, scope: !6, file: !1, line: 1, type: !10)
++!12 = !DIExpression()
++!13 = !DILocation(line: 1, column: 20, scope: !6)
++!14 = !DILocalVariable(name: "b", scope: !6, file: !1, line: 3, type: !9)
++!15 = !DILocation(line: 3, column: 9, scope: !6)
++!16 = !DILocalVariable(name: "i", scope: !17, file: !1, line: 4, type: !9)
++!17 = distinct !DILexicalBlock(scope: !6, file: !1, line: 4, column: 5)
++!18 = !DILocation(line: 4, column: 14, scope: !17)
++!19 = !DILocation(line: 4, column: 10, scope: !17)
++!20 = !DILocation(line: 4, column: 20, scope: !21)
++!21 = !DILexicalBlockFile(scope: !22, file: !1, discriminator: 1)
++!22 = distinct !DILexicalBlock(scope: !17, file: !1, line: 4, column: 5)
++!23 = !DILocation(line: 4, column: 25, scope: !21)
++!24 = !DILocation(line: 4, column: 24, scope: !21)
++!25 = !DILocation(line: 4, column: 22, scope: !21)
++!26 = !DILocation(line: 4, column: 5, scope: !27)
++!27 = !DILexicalBlockFile(scope: !17, file: !1, discriminator: 1)
++!28 = !DILocation(line: 6, column: 11, scope: !29)
++!29 = distinct !DILexicalBlock(scope: !22, file: !1, line: 5, column: 5)
++!30 = !DILocation(line: 6, column: 12, scope: !29)
++!31 = !DILocation(line: 6, column: 9, scope: !29)
++!32 = !DILocation(line: 7, column: 5, scope: !29)
++!33 = !DILocation(line: 4, column: 28, scope: !34)
++!34 = !DILexicalBlockFile(scope: !22, file: !1, discriminator: 2)
++!35 = !DILocation(line: 4, column: 5, scope: !34)
++!36 = distinct !{!36, !37, !38}
++!37 = !DILocation(line: 4, column: 5, scope: !17)
++!38 = !DILocation(line: 7, column: 5, scope: !17)
++!39 = !DILocation(line: 8, column: 12, scope: !6)
++!40 = !DILocation(line: 8, column: 5, scope: !6)
++
++; DEFAULT: --- !AutoTuning
++; DEFAULT-NEXT: Pass: loop-unroll
++; DEFAULT-NEXT: Name: for.cond
++; DEFAULT-NEXT: DebugLoc: { File: loop-opp.c, Line: 4, Column: 5 }
++; DEFAULT-NEXT: Function: test
++; DEFAULT-NEXT: CodeRegionType: loop
++; DEFAULT-NEXT: CodeRegionHash: {{[0-9]+}}
++; DEFAULT-NEXT: DynamicConfigs: { UnrollCount: {{\[[0-9]+(, [0-9]+)*\]}} }
++; DEFAULT-NEXT: BaselineConfig: { UnrollCount: '{{[0-9]+}}' }
++; DEFAULT-NEXT: Invocation: 0
++; DEFAULT-NEXT: ...
++; DEFAULT-NEXT: --- !AutoTuning
++; DEFAULT-NEXT: Pass: all
++; DEFAULT-NEXT: Name:
++; DEFAULT-SAME: write_with_metadata.ll
++; DEFAULT-NEXT: Function: none
++; DEFAULT-NEXT: CodeRegionType: other
++; COM: Module-level hashes can differ based on the file path, so we check a regex
++; DEFAULT-NEXT: CodeRegionHash: {{[0-9]+}}
++; DEFAULT-NEXT: DynamicConfigs: { }
++; DEFAULT-NEXT: BaselineConfig: { }
++; DEFAULT-NEXT: Invocation: 0
++; DEFAULT-NEXT: ...
++
++; LOOP: --- !AutoTuning
++; LOOP-NEXT: Pass: loop-unroll
++; LOOP-NEXT: Name: for.cond
++; LOOP-NEXT: DebugLoc: { File: loop-opp.c, Line: 4, Column: 5 }
++; LOOP-NEXT: Function: test
++; LOOP-NEXT: CodeRegionType: loop
++; LOOP-NEXT: CodeRegionHash: {{[0-9]+}}
++; LOOP-NEXT: DynamicConfigs: { UnrollCount: {{\[[0-9]+(, [0-9]+)*\]}} }
++; LOOP-NEXT: BaselineConfig: { UnrollCount: '{{[0-9]+}}' }
++; LOOP-NEXT: Invocation: 0
++; LOOP-NEXT: ...
++
++; CALLSITE: --- !AutoTuning
++; CALLSITE-NEXT: Pass: inline
++; CALLSITE-NEXT: Name: callee
++; CALLSITE-NEXT: DebugLoc: { File: loop-opp.c, Line: 4, Column: 14 }
++; CALLSITE-NEXT: Function: test
++; CALLSITE-NEXT: CodeRegionType: callsite
++; CALLSITE-NEXT: CodeRegionHash: {{[0-9]+}}
++; CALLSITE-NEXT: DynamicConfigs: { ForceInline: [0, 1] }
++; CALLSITE-NEXT: BaselineConfig: { ForceInline: '1' }
++; CALLSITE-NEXT: Invocation: 0
++; CALLSITE-NEXT: ...
++
++; CALLSITE-LOOP1: CodeRegionType: loop
++; CALLSITE-LOOP1-NOT: CodeRegionType: other
++; CALLSITE-LOOP2: CodeRegionType: callsite
++; CALLSITE-LOOP2-NOT: CodeRegionType: other
++
++; OTHER: --- !AutoTuning
++; OTHER-NEXT: Pass: all
++; OTHER-NEXT: Name:
++; OTHER-SAME: write_with_metadata
++; OTHER-NEXT: Function: none
++; OTHER-NEXT: CodeRegionType: other
++; COM: Module-level hashes can differ based on the file path, so we check a regex
++; OTHER-NEXT: CodeRegionHash: {{[0-9]+}}
++; OTHER-NEXT: DynamicConfigs: { }
++; OTHER-NEXT: BaselineConfig: { }
++; OTHER-NEXT: Invocation: 0
++; OTHER-NEXT: ...
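For comparison with the no-metadata variant, a record emitted with -auto-tuning-omit-metadata=0 also carries the region's name, debug location, and enclosing function, matching the DEFAULT block above. Again a sketch, with an illustrative hash and unroll counts (the Name, DebugLoc, and Function values are the ones the test itself checks):

--- !AutoTuning
Pass: loop-unroll
Name: for.cond
DebugLoc: { File: loop-opp.c, Line: 4, Column: 5 }
Function: test
CodeRegionType: loop
CodeRegionHash: 12345678901234567890
DynamicConfigs: { UnrollCount: [0, 4, 8] }
BaselineConfig: { UnrollCount: '0' }
Invocation: 0
...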
+diff --git a/llvm/test/AutoTuning/PGO/Inputs/pgo-instr.proftext b/llvm/test/AutoTuning/PGO/Inputs/pgo-instr.proftext +new file mode 100644 +index 000000000000..6ed79897d78c +--- /dev/null ++++ b/llvm/test/AutoTuning/PGO/Inputs/pgo-instr.proftext +@@ -0,0 +1,17 @@ ++# IR level Instrumentation Flag ++:ir ++hot ++# Func Hash: ++12884901887 ++# Num Counters: ++1 ++# Counter Values: ++9000 ++ ++cold ++# Func Hash: ++12884901887 ++# Num Counters: ++1 ++# Counter Values: ++10 +diff --git a/llvm/test/AutoTuning/PGO/Inputs/pgo-sample-cold.prof b/llvm/test/AutoTuning/PGO/Inputs/pgo-sample-cold.prof +new file mode 100644 +index 000000000000..a1cb2231992e +--- /dev/null ++++ b/llvm/test/AutoTuning/PGO/Inputs/pgo-sample-cold.prof +@@ -0,0 +1,7 @@ ++main:225715:0 ++ 2.1: 5553 ++ 3: 5391 ++ 3.1: _Z3sumii:0 ++ 0: 0 ++ 1: 0 ++ 2: 0 +diff --git a/llvm/test/AutoTuning/PGO/Inputs/pgo-sample-hot.prof b/llvm/test/AutoTuning/PGO/Inputs/pgo-sample-hot.prof +new file mode 100644 +index 000000000000..386cdf8a7b5e +--- /dev/null ++++ b/llvm/test/AutoTuning/PGO/Inputs/pgo-sample-hot.prof +@@ -0,0 +1,7 @@ ++main:225715:0 ++ 2.1: 5553 ++ 3: 5391 ++ 3.1: _Z3sumii:5860 ++ 0: 5279 ++ 1: 5279 ++ 2: 5279 +diff --git a/llvm/test/AutoTuning/PGO/pgo-instr-filters.ll b/llvm/test/AutoTuning/PGO/pgo-instr-filters.ll +new file mode 100644 +index 000000000000..6b279df18343 +--- /dev/null ++++ b/llvm/test/AutoTuning/PGO/pgo-instr-filters.ll +@@ -0,0 +1,61 @@ ++; RUN: rm %t.default-opp -rf ++; RUN: llvm-profdata merge %S/Inputs/pgo-instr.proftext -o %t.profdata ++; RUN: opt %s -passes='pgo-instr-use,inline' -pgo-test-profile-file=%t.profdata -S -auto-tuning-opp=%t.default-opp -auto-tuning-exclude-cold=false --disable-output ++; RUN: FileCheck %s --input-file %t.default-opp/pgo-instr-filters.ll.yaml -check-prefix=NON-FILTER ++ ++; RUN: rm %t.filtered-opp -rf ++; RUN: llvm-profdata merge %S/Inputs/pgo-instr.proftext -o %t.profdata ++; RUN: opt %s -passes='pgo-instr-use,inline' -pgo-test-profile-file=%t.profdata -S -auto-tuning-opp=%t.filtered-opp -auto-tuning-exclude-cold --disable-output -pgo-instr-old-cfg-hashing=true ++; RUN: FileCheck %s --input-file %t.filtered-opp/pgo-instr-filters.ll.yaml -check-prefix=EXCLUDE-COLD ++ ++; RUN: rm %t.filtered-opp -rf ++; RUN: llvm-profdata merge %S/Inputs/pgo-instr.proftext -o %t.profdata ++; RUN: opt %s -passes='pgo-instr-use,inline' -pgo-test-profile-file=%t.profdata -S -auto-tuning-opp=%t.filtered-opp -auto-tuning-hot-only --disable-output -pgo-instr-old-cfg-hashing=true ++; RUN: FileCheck %s --input-file %t.filtered-opp/pgo-instr-filters.ll.yaml -check-prefix=HOT-ONLY ++ ++target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" ++target triple = "x86_64-unknown-linux-gnu" ++ ++@s = common dso_local local_unnamed_addr global i32 0, align 4 ++ ++define void @cold() { ++ ++entry: ++ %0 = tail call i32 @callee(i32 5) ++ store i32 1, i32* @s, align 4 ++ ret void ++} ++ ++define void @hot() { ++entry: ++ %0 = load i32, i32* @s, align 4 ++ %1 = tail call i32 @callee(i32 5) ++ %add = add nsw i32 %0, 4 ++ store i32 %add, i32* @s, align 4 ++ ret void ++} ++ ++define void @unknown() { ++entry: ++ %0 = tail call i32 @callee(i32 5) ++ store i32 1, i32* @s, align 4 ++ ret void ++} ++ ++define i32 @callee(i32 %a) { ++entry: ++ %add = add nsw i32 %a, 4 ++ ret i32 %add ++} ++ ++; NON-FILTER-DAG: Function: cold ++; NON-FILTER-DAG: Function: hot ++; NON-FILTER-DAG: Function: unknown ++ ++; EXCLUDE-COLD-NOT: Function: cold ++; EXCLUDE-COLD-DAG: Function: hot ++; 
EXCLUDE-COLD-DAG: Function: unknown ++ ++; HOT-ONLY-NOT: Function: unknown ++; HOT-ONLY-NOT: Function: cold ++; HOT-ONLY-DAG: Function: hot +diff --git a/llvm/test/AutoTuning/PGO/pgo-sample-filters.ll b/llvm/test/AutoTuning/PGO/pgo-sample-filters.ll +new file mode 100644 +index 000000000000..aa93299a7079 +--- /dev/null ++++ b/llvm/test/AutoTuning/PGO/pgo-sample-filters.ll +@@ -0,0 +1,138 @@ ++; RUN: rm %t.default-opp -rf ++; RUN: opt %s -passes='sample-profile,inline' -sample-profile-file=%S/Inputs/pgo-sample-cold.prof -auto-tuning-opp=%t.default-opp -auto-tuning-exclude-cold=false --disable-output -S ++; RUN: FileCheck %s -check-prefix=NON-FILTER < %t.default-opp/pgo-sample-filters.ll.yaml ++ ++; Test -auto-tuning-exclude-cold with a cold caller in sample profile. ++; RUN: rm %t.filtered-opp -rf ++; RUN: opt %s -passes='sample-profile,inline' -sample-profile-file=%S/Inputs/pgo-sample-cold.prof -auto-tuning-opp=%t.filtered-opp -auto-tuning-exclude-cold --disable-output -S ++; RUN: FileCheck %s -check-prefix=COLD-PROFILE-EXCLUDE-COLD < %t.filtered-opp/pgo-sample-filters.ll.yaml ++ ++; Test -auto-tuning-hot-only with a cold caller in sample profile. ++; RUN: rm %t.filtered-opp -rf ++; RUN: opt %s -passes='sample-profile,inline' -sample-profile-file=%S/Inputs/pgo-sample-cold.prof -auto-tuning-opp=%t.filtered-opp -auto-tuning-hot-only --disable-output -S ++; RUN: FileCheck %s -check-prefix=COLD-PROFILE-HOT-ONLY < %t.filtered-opp/pgo-sample-filters.ll.yaml ++ ++; Test -auto-tuning-exclude-cold with a hot caller in sample profile. ++; RUN: rm %t.filtered-opp -rf ++; RUN: opt %s -passes='sample-profile,inline' -sample-profile-file=%S/Inputs/pgo-sample-hot.prof -auto-tuning-opp=%t.filtered-opp -auto-tuning-exclude-cold --disable-output -S ++; RUN: FileCheck %s -check-prefix=HOT-PROFILE-EXCLUDE-COLD < %t.filtered-opp/pgo-sample-filters.ll.yaml ++ ++; Test -auto-tuning-hot-only with a hot caller in sample profile. 
++; RUN: rm %t.filtered-opp -rf
++; RUN: opt %s -passes='sample-profile,inline' -sample-profile-file=%S/Inputs/pgo-sample-hot.prof -auto-tuning-opp=%t.filtered-opp -auto-tuning-hot-only --disable-output -S
++; RUN: FileCheck %s -check-prefix=HOT-PROFILE-HOT-ONLY < %t.filtered-opp/pgo-sample-filters.ll.yaml
++
++
++@.str = private unnamed_addr constant [11 x i8] c"sum is %d\0A\00", align 1
++
++define i32 @_Z3sumii(i32 %x, i32 %y) #0 !dbg !6 {
++entry:
++  %0 = tail call i32 @callee(i32 5)
++  %x.addr = alloca i32, align 4
++  %y.addr = alloca i32, align 4
++  store i32 %x, i32* %x.addr, align 4
++  store i32 %y, i32* %y.addr, align 4
++  %tmp = load i32, i32* %x.addr, align 4, !dbg !8
++  %tmp1 = load i32, i32* %y.addr, align 4, !dbg !8
++  %add = add nsw i32 %tmp, %tmp1, !dbg !8
++  ret i32 %add, !dbg !8
++}
++
++define i32 @main() #0 !dbg !9 {
++entry:
++  %0 = tail call i32 @callee(i32 5)
++  %retval = alloca i32, align 4
++  %s = alloca i32, align 4
++  %i = alloca i32, align 4
++  store i32 0, i32* %retval
++  store i32 0, i32* %i, align 4, !dbg !10
++  br label %while.cond, !dbg !11
++
++while.cond:                                       ; preds = %if.end, %entry
++  %tmp = load i32, i32* %i, align 4, !dbg !12
++  %inc = add nsw i32 %tmp, 1, !dbg !12
++  store i32 %inc, i32* %i, align 4, !dbg !12
++  %cmp = icmp slt i32 %tmp, 400000000, !dbg !12
++  br i1 %cmp, label %while.body, label %while.end, !dbg !12
++
++while.body:                                       ; preds = %while.cond
++  %tmp1 = load i32, i32* %i, align 4, !dbg !14
++  %cmp1 = icmp ne i32 %tmp1, 100, !dbg !14
++  br i1 %cmp1, label %if.then, label %if.else, !dbg !14
++
++if.then:                                          ; preds = %while.body
++  %tmp2 = load i32, i32* %i, align 4, !dbg !16
++  %tmp3 = load i32, i32* %s, align 4, !dbg !16
++  %call = call i32 @_Z3sumii(i32 %tmp2, i32 %tmp3), !dbg !16
++; INLINE-NOT: call i32 @_Z3sumii
++; NOTINLINE: call i32 @_Z3sumii
++  store i32 %call, i32* %s, align 4, !dbg !16
++  br label %if.end, !dbg !16
++
++if.else:                                          ; preds = %while.body
++  store i32 30, i32* %s, align 4, !dbg !18
++  br label %if.end
++
++if.end:                                           ; preds = %if.else, %if.then
++  br label %while.cond, !dbg !20
++
++while.end:                                        ; preds = %while.cond
++  %tmp4 = load i32, i32* %s, align 4, !dbg !22
++  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i32 %tmp4), !dbg !22
++  ret i32 0, !dbg !23
++}
++
++define i32 @callee(i32 %a) #0 {
++entry:
++  %add = add nsw i32 %a, 4
++  ret i32 %add
++}
++
++declare i32 @printf(i8*, ...)
++
++!llvm.dbg.cu = !{!0}
++!llvm.module.flags = !{!3, !4}
++!llvm.ident = !{!5}
++
++attributes #0 = {"use-sample-profile"}
++
++!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.5 ", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
++!1 = !DIFile(filename: "calls.cc", directory: ".")
++!2 = !{}
++!3 = !{i32 2, !"Dwarf Version", i32 4}
++!4 = !{i32 1, !"Debug Info Version", i32 3}
++!5 = !{!"clang version 3.5 "}
++!6 = distinct !DISubprogram(name: "sum", scope: !1, file: !1, line: 3, type: !7, scopeLine: 3, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
++!7 = !DISubroutineType(types: !2)
++!8 = !DILocation(line: 4, scope: !6)
++!9 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 7, type: !7, scopeLine: 7, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
++!10 = !DILocation(line: 8, scope: !9)
++!11 = !DILocation(line: 9, scope: !9)
++!12 = !DILocation(line: 9, scope: !13)
++!13 = !DILexicalBlockFile(scope: !9, file: !1, discriminator: 2)
++!14 = !DILocation(line: 10, scope: !15)
++!15 = distinct !DILexicalBlock(scope: !9, file: !1, line: 10)
++!16 = !DILocation(line: 10, scope: !17)
++!17 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 2)
++!18 = !DILocation(line: 10, scope: !19)
++!19 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 4)
++!20 = !DILocation(line: 10, scope: !21)
++!21 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 6)
++!22 = !DILocation(line: 11, scope: !9)
++!23 = !DILocation(line: 12, scope: !9)
++
++; Note that hotness of main is unknown.
++; NON-FILTER-DAG: Function: _Z3sumii
++; NON-FILTER-DAG: Function: main
++
++; COLD-PROFILE-EXCLUDE-COLD-NOT: Function: _Z3sumii
++; COLD-PROFILE-EXCLUDE-COLD-DAG: Function: main
++
++; COLD-PROFILE-HOT-ONLY-NOT: Function: _Z3sumii
++; COLD-PROFILE-HOT-ONLY-NOT: Function: main
++
++; HOT-PROFILE-EXCLUDE-COLD-DAG: Function: _Z3sumii
++; HOT-PROFILE-EXCLUDE-COLD-DAG: Function: main
++
++; HOT-PROFILE-HOT-ONLY-NOT: Function: main
++; HOT-PROFILE-HOT-ONLY-DAG: Function: _Z3sumii
+diff --git a/llvm/test/AutoTuning/PassInvocation/Inputs/pass_invocation.yaml b/llvm/test/AutoTuning/PassInvocation/Inputs/pass_invocation.yaml
+new file mode 100644
+index 000000000000..00459fe9e23c
+--- /dev/null
++++ b/llvm/test/AutoTuning/PassInvocation/Inputs/pass_invocation.yaml
+@@ -0,0 +1,10 @@
++--- !AutoTuning
++Pass: loop-unroll
++Name: for.body
++Function: find
++CodeRegionType: loop
++CodeRegionHash: 145363925920731080
++Invocation: [number]
++Args:
++  - UnrollCount: 2
++...
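For reference, the bracketed [number] field in the input above is a placeholder: the pass_invocation_read.ll test that follows rewrites it with sed to pick which invocation of loop-unroll the tuned UnrollCount applies to. After running sed 's#\[number\]#0#g' over the template, the record the engine actually consumes would plausibly read (only the Invocation value differs from the input shown above):

--- !AutoTuning
Pass: loop-unroll
Name: for.body
Function: find
CodeRegionType: loop
CodeRegionHash: 145363925920731080
Invocation: 0
Args:
  - UnrollCount: 2
...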
+diff --git a/llvm/test/AutoTuning/PassInvocation/pass_invocation_read.ll b/llvm/test/AutoTuning/PassInvocation/pass_invocation_read.ll
+new file mode 100644
+index 000000000000..6e41507af8b8
+--- /dev/null
++++ b/llvm/test/AutoTuning/PassInvocation/pass_invocation_read.ll
+@@ -0,0 +1,64 @@
++; RUN: rm %t.config.yaml -rf
++; RUN: sed 's#\[number\]#0#g;' %S/Inputs/pass_invocation.yaml > %t.config.yaml
++; RUN: opt %s -S -O3 -print-after=loop-unroll-full -print-after=loop-unroll \
++; RUN:     -auto-tuning-code-region-matching-hash=false \
++; RUN:     -auto-tuning-input=%t.config.yaml --disable-output 2>&1 | \
++; RUN:     FileCheck %s --check-prefix=INVOCATION-0
++
++; RUN: rm %t.config.yaml -rf
++; RUN: sed 's#\[number\]#1#g;' %S/Inputs/pass_invocation.yaml > %t.config.yaml
++; RUN: opt %s -S -O3 -print-after=loop-unroll-full -print-after=loop-unroll \
++; RUN:     -auto-tuning-code-region-matching-hash=false \
++; RUN:     -auto-tuning-input=%t.config.yaml --disable-output 2>&1 | \
++; RUN:     FileCheck %s --check-prefix=INVOCATION-1
++
++; Function Attrs: norecurse nounwind readonly uwtable
++define dso_local i64 @find(i64* nocapture readonly %a, i64 %n, i64 %Value) {
++entry:
++  %cmp6.not = icmp eq i64 %n, 0
++  br i1 %cmp6.not, label %for.end, label %for.body
++
++for.body:                                         ; preds = %entry, %for.inc
++  %i.07 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
++  %arrayidx = getelementptr inbounds i64, i64* %a, i64 %i.07
++  %0 = load i64, i64* %arrayidx, align 8
++  %cmp1 = icmp eq i64 %0, %Value
++  br i1 %cmp1, label %for.end, label %for.inc
++
++for.inc:                                          ; preds = %for.body
++  %inc = add nuw i64 %i.07, 1
++  %cmp = icmp ult i64 %inc, %n
++  br i1 %cmp, label %for.body, label %for.end
++
++for.end:                                          ; preds = %for.inc, %for.body, %entry
++  %i.0.lcssa = phi i64 [ 0, %entry ], [ %i.07, %for.body ], [ %inc, %for.inc ]
++  ret i64 %i.0.lcssa
++}
++
++; INVOCATION-0: *** IR Dump After {{.*}}Unroll
++; INVOCATION-0: for.body.preheader: ; preds = %entry
++; INVOCATION-0: for.body: ; preds = %for.inc.1, %for.body.preheader
++; INVOCATION-0: for.inc: ; preds = %for.body
++; INVOCATION-0: for.body.1: ; preds = %for.inc
++; INVOCATION-0: for.inc.1: ; preds = %for.body.1
++; INVOCATION-0: for.end.loopexit: ; preds = %for.inc.1, %for.body.1, %for.body, %for.inc
++; INVOCATION-0: *** IR Dump After {{.*}}Unroll
++; INVOCATION-0: for.body.preheader: ; preds = %entry
++; INVOCATION-0: for.body: ; preds = %for.body.preheader, %for.inc.1
++; INVOCATION-0: for.inc: ; preds = %for.body
++; INVOCATION-0: for.body.1: ; preds = %for.inc
++; INVOCATION-0: for.inc.1: ; preds = %for.body.1
++; INVOCATION-0: for.end.loopexit: ; preds = %for.inc.1, %for.body.1, %for.body, %for.inc
++
++; INVOCATION-1: *** IR Dump After {{.*}}Unroll
++; INVOCATION-1: for.body.preheader: ; preds = %entry
++; INVOCATION-1: for.body: ; preds = %for.body.preheader, %for.inc
++; INVOCATION-1: for.inc: ; preds = %for.body
++; INVOCATION-1: for.end.loopexit: ; preds = %for.body, %for.inc
++; INVOCATION-1: *** IR Dump After {{.*}}Unroll
++; INVOCATION-1: for.body.preheader: ; preds = %entry
++; INVOCATION-1: for.body: ; preds = %for.inc.1, %for.body.preheader
++; INVOCATION-1: for.inc: ; preds = %for.body
++; INVOCATION-1: for.body.1: ; preds = %for.inc
++; INVOCATION-1: for.inc.1: ; preds = %for.body.1
++; INVOCATION-1: for.end.loopexit: ; preds = %for.inc.1, %for.body.1, %for.body, %for.inc
+diff --git a/llvm/test/AutoTuning/PassInvocation/pass_invocation_write.ll b/llvm/test/AutoTuning/PassInvocation/pass_invocation_write.ll
+new file mode 100644
+index 000000000000..81097fdd5afa
+--- /dev/null
++++ b/llvm/test/AutoTuning/PassInvocation/pass_invocation_write.ll
+@@ -0,0 +1,67 @@
++; REQUIRES: aarch64-registered-target
++; RUN: rm %t.pass_invocation -rf
++; RUN: opt %s -S -mtriple=aarch64-- -mcpu=tsv110 -auto-tuning-type-filter=Loop \
++; RUN:     -O3 -auto-tuning-opp=%t.pass_invocation --disable-output
++; RUN: FileCheck %s --input-file %t.pass_invocation/pass_invocation_write.ll.yaml
++
++; Function Attrs: nounwind uwtable
++define dso_local void @sum(i32* noalias %a, i32* noalias %b, i32* noalias %c, i32 %n) {
++entry:
++  br label %for.cond
++
++for.cond:                                         ; preds = %for.body, %entry
++  %sum.0 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
++  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
++  %cmp = icmp slt i32 %i.0, %n
++  br i1 %cmp, label %for.body, label %for.end
++
++for.body:                                         ; preds = %for.cond
++  %idxprom = sext i32 %i.0 to i64
++  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
++  %0 = load i32, i32* %arrayidx, align 4
++  %idxprom1 = sext i32 %i.0 to i64
++  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %idxprom1
++  %1 = load i32, i32* %arrayidx2, align 4
++  %mul = mul nsw i32 %0, %1
++  %conv = sitofp i32 %mul to float
++  %add = fadd contract float %sum.0, %conv
++  %inc = add nsw i32 %i.0, 1
++  br label %for.cond
++
++for.end:                                          ; preds = %for.cond
++  %conv3 = fptosi float %sum.0 to i32
++  %arrayidx4 = getelementptr inbounds i32, i32* %c, i64 0
++  store i32 %conv3, i32* %arrayidx4, align 4
++  ret void
++}
++
++; CHECK: --- !AutoTuning
++; CHECK-NEXT: Pass:            loop-unroll
++; CHECK-NEXT: Name:            for.body
++; CHECK-NEXT: Function:        sum
++; CHECK-NEXT: CodeRegionType:  loop
++; CHECK-NEXT: CodeRegionHash:  {{[0-9]+}}
++; CHECK-NEXT: DynamicConfigs:  { UnrollCount: [0, 1, 8, 4, 2] }
++; CHECK-NEXT: BaselineConfig:  { UnrollCount: '0' }
++; CHECK-NEXT: Invocation:      0
++; CHECK-NEXT: ...
++; CHECK-NEXT: --- !AutoTuning
++; CHECK-NEXT: Pass:            loop-vectorize
++; CHECK-NEXT: Name:            for.body
++; CHECK-NEXT: Function:        sum
++; CHECK-NEXT: CodeRegionType:  loop
++; CHECK-NEXT: CodeRegionHash:  {{[0-9]+}}
++; CHECK-NEXT: DynamicConfigs:  { VectorizationInterleave: [1, 2, 4] }
++; CHECK-NEXT: BaselineConfig:  { VectorizationInterleave: '2' }
++; CHECK-NEXT: Invocation:      0
++; CHECK-NEXT: ...
++; CHECK-NEXT: --- !AutoTuning
++; CHECK-NEXT: Pass:            loop-unroll
++; CHECK-NEXT: Name:            vector.body
++; CHECK-NEXT: Function:        sum
++; CHECK-NEXT: CodeRegionType:  loop
++; CHECK-NEXT: CodeRegionHash:  {{[0-9]+}}
++; CHECK-NEXT: DynamicConfigs:  { UnrollCount: [0, 1, 8, 4, 2] }
++; CHECK-NEXT: BaselineConfig:  { UnrollCount: '0' }
++; CHECK-NEXT: Invocation:      1
++; CHECK-NEXT: ...
+diff --git a/llvm/test/AutoTuning/PhaseOrdering/Inputs/template.yaml b/llvm/test/AutoTuning/PhaseOrdering/Inputs/template.yaml
+new file mode 100644
+index 000000000000..065d3cb85b72
+--- /dev/null
++++ b/llvm/test/AutoTuning/PhaseOrdering/Inputs/template.yaml
+@@ -0,0 +1,8 @@
++--- !AutoTuning
++Pass: all
++Name: [filename]
++Function: none
++CodeRegionType: other
++Args:
++  - OptPass: [pass]
++...
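The bracketed [filename] and [pass] fields in template.yaml are likewise placeholders that the pass-order test below fills in with sed. A generated two-pass configuration would plausibly look like the following, where the path merely stands in for lit's %s expansion and is not a real file from the patch:

--- !AutoTuning
Pass: all
Name: /path/to/pass-order.ll
Function: none
CodeRegionType: other
Args:
  - OptPass: [loop-extract,strip]
...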
+diff --git a/llvm/test/AutoTuning/PhaseOrdering/pass-order.ll b/llvm/test/AutoTuning/PhaseOrdering/pass-order.ll
+new file mode 100644
+index 000000000000..9d0210b3fdde
+--- /dev/null
++++ b/llvm/test/AutoTuning/PhaseOrdering/pass-order.ll
+@@ -0,0 +1,65 @@
++; Run different orders of opt passes and verify that the order is respected
++; -------------------------------------------------------------------------
++; Check to see if the order is correct, trivial case (autotuning disabled)
++; RUN: opt %s -debug-pass-manager -S 2>&1 | FileCheck %s -check-prefix=DISABLE
++
++; One pass:
++; RUN: rm %t.onepass_order.yaml -rf
++; RUN: sed 's#\[filename\]#%s#g; s#\[pass\]#\[loop-extract\]#g' \
++; RUN:     %S/Inputs/template.yaml > %t.onepass_order.yaml
++; RUN: opt %s -debug-pass-manager -S -auto-tuning-input=%t.onepass_order.yaml \
++; RUN:     2>&1 | FileCheck %s -check-prefix=ONEPASS
++
++; Two passes (A->B):
++; RUN: rm %t.twopass_order.yaml -rf
++; RUN: sed 's#\[filename\]#%s#g; s#\[pass\]#\[loop-extract,strip\]#g' \
++; RUN:     %S/Inputs/template.yaml > %t.twopass_order.yaml
++; RUN: opt %s -debug-pass-manager -S -auto-tuning-input=%t.twopass_order.yaml \
++; RUN:     2>&1 | FileCheck %s -check-prefix=TWOPASS_AB
++
++; Two passes (B->A):
++; RUN: rm %t.twopass_ba_order.yaml -rf
++; RUN: sed 's#\[filename\]#%s#g; s#\[pass\]#\[strip, loop-extract\]#g' \
++; RUN:     %S/Inputs/template.yaml > %t.twopass_ba_order.yaml
++; RUN: opt %s -debug-pass-manager -S -auto-tuning-input=%t.twopass_ba_order.yaml \
++; RUN:     2>&1 | FileCheck %s -check-prefix=TWOPASS_BA
++
++; candidate IR that can change based on many optimizations
++; for now just use the IR in the LoopUnroll test file
++define void @foo(i32* nocapture %a) {
++entry:
++  br label %for.body
++
++for.body:                                         ; preds = %for.body, %entry
++  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
++  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
++  %0 = load i32, i32* %arrayidx, align 4
++  %inc = add nsw i32 %0, 1
++  store i32 %inc, i32* %arrayidx, align 4
++  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
++  %exitcond = icmp eq i64 %indvars.iv.next, 64
++  br i1 %exitcond, label %for.end, label %for.body
++
++for.end:                                          ; preds = %for.body
++  ret void
++}
++
++; DISABLE-NOT: Running pass: LoopExtractorPass on module
++; DISABLE-NOT: Running pass: StripSymbolsPass on module
++; DISABLE: Running pass: VerifierPass on module
++; DISABLE: Running pass: PrintModulePass on module
++
++; ONEPASS-NOT: Running pass: StripSymbolsPass on module
++; ONEPASS: Running pass: LoopExtractorPass on module
++; ONEPASS: Running pass: VerifierPass on module
++; ONEPASS: Running pass: PrintModulePass on module
++
++; TWOPASS_AB: Running pass: LoopExtractorPass on module
++; TWOPASS_AB: Running pass: StripSymbolsPass on module
++; TWOPASS_AB: Running pass: VerifierPass on module
++; TWOPASS_AB: Running pass: PrintModulePass on module
++
++; TWOPASS_BA: Running pass: StripSymbolsPass on module
++; TWOPASS_BA: Running pass: LoopExtractorPass on module
++; TWOPASS_BA: Running pass: VerifierPass on module
++; TWOPASS_BA: Running pass: PrintModulePass on module
+diff --git a/llvm/test/AutoTuning/SwitchLowering/switch-opp.ll b/llvm/test/AutoTuning/SwitchLowering/switch-opp.ll
+new file mode 100644
+index 000000000000..679549180bf4
+--- /dev/null
++++ b/llvm/test/AutoTuning/SwitchLowering/switch-opp.ll
+@@ -0,0 +1,47 @@
++; RUN: rm %t.switch_opp -rf
++; RUN: llc %s -auto-tuning-opp=%t.switch_opp -auto-tuning-type-filter=Switch -o /dev/null
++; RUN: FileCheck %s --input-file %t.switch_opp/switch-opp.ll.yaml
++
++; UNSUPPORTED: windows
++
++define i32 @test(i32 %arg) #0 {
++entry:
++  switch i32 %arg, label %bb5 [
++    i32 1, label %bb1
++    i32 2, label %bb2
++    i32 3, label %bb3
++    i32 4, label %bb4
++  ]
++
++bb1:                                              ; pred = %entry
++  br label %bb2
++
++bb2:                                              ; pred = %entry, %bb1
++  %res.0 = phi i32 [ 1, %entry ], [ 2, %bb1 ]
++  br label %bb3
++
++bb3:                                              ; pred = %entry, %bb2
++  %res.1 = phi i32 [ 0, %entry ], [ %res.0, %bb2 ]
++  %phitmp = add nsw i32 %res.1, 2
++  br label %bb4
++
++bb4:                                              ; pred = %entry, %bb3
++  %res.2 = phi i32 [ 1, %entry ], [ %phitmp, %bb3 ]
++  br label %bb5
++
++bb5:                                              ; pred = %entry, %bb4
++  %res.3 = phi i32 [ 0, %entry ], [ %res.2, %bb4 ]
++  %0 = add nsw i32 %res.3, 1
++  ret i32 %0
++}
++
++; CHECK: --- !AutoTuning
++; CHECK-NEXT: Pass:            switch-lowering
++; CHECK-NEXT: Name:            'i32 %arg'
++; CHECK-NEXT: Function:        test
++; CHECK-NEXT: CodeRegionType:  switch
++; CHECK-NEXT: CodeRegionHash:  {{[0-9]+}}
++; CHECK-NEXT: DynamicConfigs:  { }
++; CHECK-NEXT: BaselineConfig:  { }
++; CHECK-NEXT: Invocation:      0
++; CHECK-NEXT: ...
+diff --git a/llvm/test/AutoTuning/lit.local.cfg b/llvm/test/AutoTuning/lit.local.cfg
+new file mode 100644
+index 000000000000..13b4927257ab
+--- /dev/null
++++ b/llvm/test/AutoTuning/lit.local.cfg
+@@ -0,0 +1,2 @@
++if not config.enable_enable_autotuner:
++    config.unsupported = True
+diff --git a/llvm/test/AutoTuning/opt-opp.ll b/llvm/test/AutoTuning/opt-opp.ll
+new file mode 100644
+index 000000000000..97f7b1d121cc
+--- /dev/null
++++ b/llvm/test/AutoTuning/opt-opp.ll
+@@ -0,0 +1,315 @@
++; REQUIRES: asserts
++; REQUIRES: x86-registered-target
++
++; RUN: rm %t.default_opp -rf
++; RUN: opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \
++; RUN:     -auto-tuning-opp=%t.default_opp --disable-output
++; RUN: FileCheck %s --input-file %t.default_opp/opt-opp.ll.yaml -check-prefix=DEFAULT
++
++; RUN: rm %t.module_opp -rf
++; RUN: opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \
++; RUN:     -auto-tuning-opp=%t.module_opp -auto-tuning-type-filter=Other --disable-output
++; RUN: FileCheck %s --input-file %t.module_opp/opt-opp.ll.yaml -check-prefix=OTHER
++
++; RUN: rm %t.loop_opp -rf
++; RUN: opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \
++; RUN:     -auto-tuning-opp=%t.loop_opp -auto-tuning-type-filter=Loop --disable-output
++; RUN: FileCheck %s --input-file %t.loop_opp/opt-opp.ll.yaml -check-prefix=LOOP
++
++; RUN: rm %t.callsite_opp -rf
++; RUN: opt %s -S -passes='function(require<opt-remark-emit>,loop-unroll),cgscc(inline)' \
++; RUN:     -auto-tuning-opp=%t.callsite_opp -auto-tuning-type-filter=CallSite --disable-output
++; RUN: FileCheck %s --input-file %t.callsite_opp/opt-opp.ll.yaml -check-prefix=CALLSITE
++
++; RUN: rm %t.callsite_loop_opp -rf
++; RUN: opt %s -S -passes='function(require<opt-remark-emit>,loop-unroll),cgscc(inline)' \
++; RUN:     -auto-tuning-opp=%t.callsite_loop_opp -auto-tuning-type-filter=CallSite,Loop --disable-output
++; RUN: FileCheck %s --input-file %t.callsite_loop_opp/opt-opp.ll.yaml -check-prefix=CALLSITE-LOOP1
++; RUN: FileCheck %s --input-file %t.callsite_loop_opp/opt-opp.ll.yaml -check-prefix=CALLSITE-LOOP2
++
++; RUN: rm %t.llvm_param_opp -rf
++; RUN: opt %s -S -auto-tuning-opp=%t.llvm_param_opp \
++; RUN:     -passes='function(require<opt-remark-emit>,loop-unroll),cgscc(inline)' \
++; RUN:     -auto-tuning-type-filter=LLVMParam --disable-output
++; RUN: FileCheck %s --input-file %t.llvm_param_opp/opt-opp.ll.yaml -check-prefix=LLVMPARAM
++
++; RUN: rm %t.program_param_opp -rf
++; RUN: opt %s -S
-passes='function(require<opt-remark-emit>,loop-unroll),cgscc(inline)' \ ++; RUN: -auto-tuning-opp=%t.program_param_opp -auto-tuning-type-filter=ProgramParam --disable-output ++; RUN: FileCheck %s --input-file %t.program_param_opp/opt-opp.ll.yaml -check-prefix=ProgramPARAM ++ ++; Test if opp file with the same name exists already ++; RUN: rm %t.default_opp -rf ++; RUN: mkdir %t.default_opp && touch %t.default_opp/opt-opp.ll.yaml ++; RUN: opt %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-opp=%t.default_opp --disable-output ++; RUN: FileCheck %s --input-file %t.default_opp/opt-opp.ll.yaml.1 -check-prefix=DEFAULT ++ ++; Test that the loop code region is included if its size >= the threshold. ++; RUN: rm %t.loop.opp -rf ++; RUN: opt %s -S -auto-tuning-opp=%t.loop.opp -auto-tuning-size-threshold=13 \ ++; RUN: -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \ ++; RUN: -debug-only=autotuning --disable-output 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=SIZE-LOOP ++; RUN: FileCheck %s --input-file %t.loop.opp/opt-opp.ll.yaml -check-prefix=SIZE-LOOP-OPP ++ ++; Test that the loop code region is excluded if its size < the threshold. ++; RUN: rm %t.loop.opp -rf ++; RUN: opt %s -S -auto-tuning-opp=%t.loop.opp -auto-tuning-size-threshold=14 \ ++; RUN: -passes='require<opt-remark-emit>,loop(loop-unroll-full)' \ ++; RUN: -debug-only=autotuning --disable-output 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=SIZE-LOOP-FILTERED ++; RUN: FileCheck %s --input-file %t.loop.opp/opt-opp.ll.yaml -check-prefix=SIZE-LOOP-OPP-FILTERED ++ ++; Test that the callsite code region is included if its size >= the threshold. ++; RUN: rm %t.callsite.opp -rf ++; RUN: opt %s -S -passes=inline -auto-tuning-opp=%t.callsite.opp --disable-output \ ++; RUN: -auto-tuning-size-threshold=2 -debug-only=autotuning 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=SIZE-CALLSITE ++; RUN: FileCheck %s --input-file %t.callsite.opp/opt-opp.ll.yaml -check-prefix=SIZE-CALLSITE-OPP ++ ++; Test that the callsite code region is excluded if its size < the threshold. 
++; RUN: rm %t.callsite.opp -rf ++; RUN: opt %s -S -passes=inline -auto-tuning-opp=%t.callsite.opp \ ++; RUN: -auto-tuning-size-threshold=24 --disable-output -debug-only=autotuning \ ++; RUN: 2>&1 | FileCheck %s -check-prefix=SIZE-CALLSITE-FILTERED ++; RUN: FileCheck %s --input-file %t.callsite.opp/opt-opp.ll.yaml -check-prefix=SIZE-CALLSITE-OPP-FILTERED ++ ++; RUN: rm -rf %t.other ++; RUN: opt %s -S -O3 -auto-tuning-opp=%t.other -auto-tuning-type-filter=Other ++; RUN: grep "Name: \+'%S/opt-opp.ll'" %t.other/opt-opp.ll.yaml ++; RUN: not grep "Name: \+opt-opp.ll" %t.other/opt-opp.ll.yaml ++ ++; RUN: rm -rf %t.other ++; RUN: opt %s -S -O3 -auto-tuning-opp=%t.other -auto-tuning-type-filter=Other \ ++; RUN: -autotuning-project-dir=%S/ ++; RUN: not grep "Name: \+'%S/opt-opp.ll'" %t.other/opt-opp.ll.yaml ++; RUN: grep "Name: \+opt-opp.ll" %t.other/opt-opp.ll.yaml ++ ++; UNSUPPORTED: windows ++ ++; ModuleID = 'loop-opp.c' ++source_filename = "loop-opp.c" ++target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ++target triple = "x86_64-unknown-linux-gnu" ++ ++; Function Attrs: noinline nounwind uwtable ++define i32 @test(i32* %n) #0 !dbg !6 { ++entry: ++ call void @callee(i32 6), !dbg !18 ++ %n.addr = alloca i32*, align 8 ++ %b = alloca i32, align 4 ++ %i = alloca i32, align 4 ++ store i32* %n, i32** %n.addr, align 8 ++ call void @llvm.dbg.declare(metadata i32** %n.addr, metadata !11, metadata !12), !dbg !13 ++ call void @llvm.dbg.declare(metadata i32* %b, metadata !14, metadata !12), !dbg !15 ++ store i32 0, i32* %b, align 4, !dbg !15 ++ call void @llvm.dbg.declare(metadata i32* %i, metadata !16, metadata !12), !dbg !18 ++ store i32 0, i32* %i, align 4, !dbg !18 ++ br label %for.cond, !dbg !19 ++ ++for.cond: ; preds = %for.inc, %entry ++ %0 = load i32, i32* %i, align 4, !dbg !20 ++ %1 = load i32*, i32** %n.addr, align 8, !dbg !23 ++ %2 = load i32, i32* %1, align 4, !dbg !24 ++ %cmp = icmp slt i32 %0, %2, !dbg !25 ++ br i1 %cmp, label %for.body, label %for.end, !dbg !26 ++ ++for.body: ; preds = %for.cond ++ %3 = load i32, i32* %b, align 4, !dbg !28 ++ %add = add nsw i32 %3, 1, !dbg !30 ++ store i32 %add, i32* %b, align 4, !dbg !31 ++ br label %for.inc, !dbg !32 ++ ++for.inc: ; preds = %for.body ++ %4 = load i32, i32* %i, align 4, !dbg !33 ++ %inc = add nsw i32 %4, 1, !dbg !33 ++ store i32 %inc, i32* %i, align 4, !dbg !33 ++ br label %for.cond, !dbg !35, !llvm.loop !36 ++ ++for.end: ; preds = %for.cond ++ %5 = load i32, i32* %b, align 4, !dbg !39 ++ ret i32 %5, !dbg !40 ++} ++ ++@a = global i32 4 ++define void @callee(i32 %a) #2 { ++entry: ++ %a1 = load volatile i32, i32* @a ++ %x1 = add i32 %a1, %a1 ++ %add = add i32 %x1, %a ++ ret void ++} ++ ++; Function Attrs: nounwind readnone ++declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 ++ ++attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } ++attributes #1 = { nounwind readnone } ++attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" 
"no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } ++ ++!llvm.dbg.cu = !{!0} ++!llvm.module.flags = !{!3, !4} ++!llvm.ident = !{!5} ++ ++!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "" ,isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) ++!1 = !DIFile(filename: "loop-opp.c", directory: "") ++!2 = !{} ++!3 = !{i32 2, !"Dwarf Version", i32 4} ++!4 = !{i32 2, !"Debug Info Version", i32 3} ++!5 = !{!""} ++!6 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0) ++!7 = !DISubroutineType(types: !8) ++!8 = !{!9, !10} ++!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) ++!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64) ++!11 = !DILocalVariable(name: "n", arg: 1, scope: !6, file: !1, line: 1, type: !10) ++!12 = !DIExpression() ++!13 = !DILocation(line: 1, column: 20, scope: !6) ++!14 = !DILocalVariable(name: "b", scope: !6, file: !1, line: 3, type: !9) ++!15 = !DILocation(line: 3, column: 9, scope: !6) ++!16 = !DILocalVariable(name: "i", scope: !17, file: !1, line: 4, type: !9) ++!17 = distinct !DILexicalBlock(scope: !6, file: !1, line: 4, column: 5) ++!18 = !DILocation(line: 4, column: 14, scope: !17) ++!19 = !DILocation(line: 4, column: 10, scope: !17) ++!20 = !DILocation(line: 4, column: 20, scope: !21) ++!21 = !DILexicalBlockFile(scope: !22, file: !1, discriminator: 1) ++!22 = distinct !DILexicalBlock(scope: !17, file: !1, line: 4, column: 5) ++!23 = !DILocation(line: 4, column: 25, scope: !21) ++!24 = !DILocation(line: 4, column: 24, scope: !21) ++!25 = !DILocation(line: 4, column: 22, scope: !21) ++!26 = !DILocation(line: 4, column: 5, scope: !27) ++!27 = !DILexicalBlockFile(scope: !17, file: !1, discriminator: 1) ++!28 = !DILocation(line: 6, column: 11, scope: !29) ++!29 = distinct !DILexicalBlock(scope: !22, file: !1, line: 5, column: 5) ++!30 = !DILocation(line: 6, column: 12, scope: !29) ++!31 = !DILocation(line: 6, column: 9, scope: !29) ++!32 = !DILocation(line: 7, column: 5, scope: !29) ++!33 = !DILocation(line: 4, column: 28, scope: !34) ++!34 = !DILexicalBlockFile(scope: !22, file: !1, discriminator: 2) ++!35 = !DILocation(line: 4, column: 5, scope: !34) ++!36 = distinct !{!36, !37, !38} ++!37 = !DILocation(line: 4, column: 5, scope: !17) ++!38 = !DILocation(line: 7, column: 5, scope: !17) ++!39 = !DILocation(line: 8, column: 12, scope: !6) ++!40 = !DILocation(line: 8, column: 5, scope: !6) ++ ++; DEFAULT: --- !AutoTuning ++; DEFAULT-NEXT: Pass: loop-unroll ++; DEFAULT-NEXT: Name: for.cond ++; DEFAULT-NEXT: DebugLoc: { File: loop-opp.c, Line: 4, Column: 5 } ++; DEFAULT-NEXT: Function: test ++; DEFAULT-NEXT: CodeRegionType: loop ++; DEFAULT-NEXT: CodeRegionHash: {{0-9+}} ++; DEFAULT-NEXT: DynamicConfigs: { UnrollCount: {{0-9+(, 0-9+)*}} } ++; DEFAULT-NEXT: BaselineConfig: { UnrollCount: '{{0-9+}}' } ++; DEFAULT-NEXT: Invocation: 0 ++; DEFAULT-NEXT: ... 
++; DEFAULT-NEXT: --- !AutoTuning ++; DEFAULT-NEXT: Pass: all ++; DEFAULT-NEXT: Name: ++; DEFAULT-SAME: opt-opp.ll ++; DEFAULT-NEXT: Function: none ++; DEFAULT-NEXT: CodeRegionType: other ++; COM: Module level hashes can differ based on the filepath so we check a regex ++; DEFAULT-NEXT: CodeRegionHash: {{0-9+}} ++; DEFAULT-NEXT: DynamicConfigs: { } ++; DEFAULT-NEXT: BaselineConfig: { } ++; DEFAULT-NEXT: Invocation: 0 ++; DEFAULT-NEXT: ... ++ ++; LOOP: --- !AutoTuning ++; LOOP-NEXT: Pass: loop-unroll ++; LOOP-NEXT: Name: for.cond ++; LOOP-NEXT: DebugLoc: { File: loop-opp.c, Line: 4, Column: 5 } ++; LOOP-NEXT: Function: test ++; LOOP-NEXT: CodeRegionType: loop ++; LOOP-NEXT: CodeRegionHash: {{0-9+}} ++; LOOP-NEXT: DynamicConfigs: { UnrollCount: {{0-9+(, 0-9+)*}} } ++; LOOP-NEXT: BaselineConfig: { UnrollCount: '{{0-9+}}' } ++; LOOP-NEXT: Invocation: 0 ++; LOOP-NEXT: ... ++ ++; CALLSITE: --- !AutoTuning ++; CALLSITE-NEXT: Pass: inline ++; CALLSITE-NEXT: Name: callee ++; CALLSITE-NEXT: DebugLoc: { File: loop-opp.c, Line: 4, Column: 14 } ++; CALLSITE-NEXT: Function: test ++; CALLSITE-NEXT: CodeRegionType: callsite ++; CALLSITE-NEXT: CodeRegionHash: {{0-9+}} ++; CALLSITE-NEXT: DynamicConfigs: { ForceInline: 0, 1 } ++; CALLSITE-NEXT: BaselineConfig: { ForceInline: '1' } ++; CALLSITE-NEXT: Invocation: 0 ++; CALLSITE-NEXT: ... ++ ++; CALLSITE-LOOP1: CodeRegionType: loop ++; CALLSITE-LOOP1-NOT: CodeRegionType: other ++; CALLSITE-LOOP2: CodeRegionType: callsite ++; CALLSITE-LOOP2-NOT: CodeRegionType: other ++ ++; OTHER: --- !AutoTuning ++; OTHER-NEXT: Pass: all ++; OTHER-NEXT: Name: ++; OTHER-SAME: opt-opp.ll ++; OTHER-NEXT: Function: none ++; OTHER-NEXT: CodeRegionType: other ++; COM: Module level hashes can differ based on the filepath so we check a regex ++; OTHER-NEXT: CodeRegionHash: {{0-9+}} ++; OTHER-NEXT: DynamicConfigs: { } ++; OTHER-NEXT: BaselineConfig: { } ++; OTHER-NEXT: Invocation: 0 ++; OTHER-NEXT: ... ++ ++; LLVMPARAM: --- !AutoTuning ++; LLVMPARAM-NEXT: Pass: none ++; LLVMPARAM-NEXT: Name: ++; LLVMPARAM-SAME: opt-opp.ll ++; LLVMPARAM-NEXT: Function: none ++; LLVMPARAM-NEXT: CodeRegionType: llvm-param ++; LLVMPARAM-NEXT: CodeRegionHash: {{0-9+}} ++; LLVMPARAM-NEXT: DynamicConfigs: { } ++; LLVMPARAM-NEXT: BaselineConfig: { } ++; LLVMPARAM-NEXT: Invocation: 0 ++; LLVMPARAM-NEXT: ... ++ ++; ProgramPARAM: --- !AutoTuning ++; ProgramPARAM-NEXT: Pass: none ++; ProgramPARAM-NEXT: Name: ++; ProgramPARAM-SAME: opt-opp.ll ++; ProgramPARAM-NEXT: Function: none ++; ProgramPARAM-NEXT: CodeRegionType: program-param ++; ProgramPARAM-NEXT: CodeRegionHash: {{0-9+}} ++; ProgramPARAM-NEXT: DynamicConfigs: { } ++; ProgramPARAM-NEXT: BaselineConfig: { } ++; ProgramPARAM-NEXT: Invocation: 0 ++; ProgramPARAM-NEXT: ... ++ ++; SIZE-LOOP: PassName: loop-unroll ++; SIZE-LOOP-NEXT: Type: loop ++; SIZE-LOOP-NEXT: Size: 13 ++; SIZE-LOOP: Module added as an tuning opportunity ++ ++; SIZE-LOOP-OPP-DAG: Pass: loop-unroll ++; SIZE-LOOP-OPP-DAG: Pass: all ++ ++; SIZE-LOOP-FILTERED-NOT: PassName: loop-unroll ++; SIZE-LOOP-FILTERED: Module added as an tuning opportunity ++ ++; SIZE-LOOP-OPP-FILTERED-NOT: Pass: loop-unroll ++; Ths "other" code regions should remain as-is. 
++; SIZE-LOOP-OPP-FILTERED: CodeRegionType: other ++ ++; SIZE-CALLSITE: PassName: inline ++; SIZE-CALLSITE-NEXT: Type: callsite ++; SIZE-CALLSITE-NEXT: Size: 4 ++; SIZE-CALLSITE: Module added as an tuning opportunity ++ ++; SIZE-CALLSITE-OPP-DAG: Pass: inline ++; SIZE-CALLSITE-OPP-DAG: Pass: all ++ ++; SIZE-CALLSITE-FILTERED-NOT: PassName: inline ++; SIZE-CALLSITE-FILTERED: Module added as an tuning opportunity ++ ++; SIZE-CALLSITE-OPP-FILTERED-NOT: Pass: inline ++; Ths "other" code regions should remain as-is. ++; SIZE-CALLSITE-OPP-FILTERED: CodeRegionType: other +diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in +index fc7ab6536309..0e9396e3b014 100644 +--- a/llvm/test/lit.site.cfg.py.in ++++ b/llvm/test/lit.site.cfg.py.in +@@ -62,6 +62,7 @@ config.reverse_iteration = @LLVM_ENABLE_REVERSE_ITERATION@ + config.dxil_tests = @LLVM_INCLUDE_DXIL_TESTS@ + config.have_llvm_driver = @LLVM_TOOL_LLVM_DRIVER_BUILD@ + config.use_classic_flang = @LLVM_ENABLE_CLASSIC_FLANG@ ++config.enable_enable_autotuner = @LLVM_ENABLE_AUTOTUNER@ + + import lit.llvm + lit.llvm.initialize(lit_config, config) +diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp +index 8934130f9913..94b2028b25bc 100644 +--- a/llvm/tools/llc/llc.cpp ++++ b/llvm/tools/llc/llc.cpp +@@ -645,6 +645,18 @@ static int compileModule(char **argv, LLVMContext &Context) { + reportError(EC.message(), SplitDwarfOutputFile); + } + ++#if defined(ENABLE_AUTOTUNER) ++ if (llvm::Error E = autotuning::Engine.init(M->getModuleIdentifier())) { ++ errs() << "error: " << toString(std::move(E)) << '\n'; ++ return 1; ++ } ++ if (autotuning::Engine.isEnabled() && autotuning::Engine.isParseInput() && ++ (autotuning::Engine.LLVMParams.size() || ++ autotuning::Engine.ProgramParams.size())) ++ llvm::cl::ParseAutoTunerOptions(autotuning::Engine.LLVMParams, ++ autotuning::Engine.ProgramParams); ++#endif ++ + // Build up all of the passes that we want to do to the module. + legacy::PassManager PM; + +@@ -776,6 +788,13 @@ static int compileModule(char **argv, LLVMContext &Context) { + } + } + ++#if defined(ENABLE_AUTOTUNER) ++ if (llvm::Error E = autotuning::Engine.finalize()) { ++ errs() << "error: " << toString(std::move(E)) << '\n'; ++ return 1; ++ } ++#endif ++ + // Declare success. + Out->keep(); + if (DwoOut) +diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp +index 6ae3f87099af..5ce9e4fee81f 100644 +--- a/llvm/tools/opt/NewPMDriver.cpp ++++ b/llvm/tools/opt/NewPMDriver.cpp +@@ -39,6 +39,10 @@ + #include "llvm/Transforms/Scalar/LoopPassManager.h" + #include "llvm/Transforms/Utils/Debugify.h" + ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#endif ++ + using namespace llvm; + using namespace opt_tool; + +@@ -459,6 +463,35 @@ bool llvm::runPassPipeline( + MPM.addPass(NewPMDebugifyPass(DebugifyMode::OriginalDebugInfo, "", + &DebugInfoBeforePass)); + ++#if defined(ENABLE_AUTOTUNER) ++ bool Changed = false; ++ // If autotuning is enabled (for applying configuration), use AutoTuner ++ // generated pass ordering instead of using passes specified with -passes=... ++ // with opt tool. 
++  if (autotuning::Engine.isEnabled()) {
++    std::vector<std::string> PassesList;
++    Changed = autotuning::Engine.lookUpGlobalParams("OptPass", PassesList);
++    if (Changed && PassesList.size()) {
++      std::string PassPipeline = "";
++      for (auto PassName : PassesList)
++        PassPipeline.append(PassName + ",");
++      PassPipeline.pop_back();
++
++      if (auto Err = PB.parsePassPipeline(MPM, PassPipeline))
++        errs() << "AutoTuner: cannot add pass:" << toString(std::move(Err))
++               << "\n";
++    }
++  }
++  if (!Changed) {
++    // Add passes according to the -passes options.
++    if (!PassPipeline.empty()) {
++      if (auto Err = PB.parsePassPipeline(MPM, PassPipeline)) {
++        errs() << Arg0 << ": " << toString(std::move(Err)) << "\n";
++        return false;
++      }
++    }
++  }
++#else
+   // Add passes according to the -passes options.
+   if (!PassPipeline.empty()) {
+     if (auto Err = PB.parsePassPipeline(MPM, PassPipeline)) {
+       errs() << Arg0 << ": " << toString(std::move(Err)) << "\n";
+       return false;
+     }
+   }
++#endif
+ 
+   if (VK > VK_NoVerifier)
+     MPM.addPass(VerifierPass());
+@@ -539,6 +573,14 @@ bool llvm::runPassPipeline(
+   if (DebugifyEach && !DebugifyExport.empty())
+     exportDebugifyStats(DebugifyExport, Debugify.getDebugifyStatsMap());
+ 
++#if defined(ENABLE_AUTOTUNER)
++  // AUTO-TUNING - auto-tuning finalization for this module
++  if (Error E = autotuning::Engine.finalize()) {
++    errs() << "error: " << toString(std::move(E)) << '\n';
++    return false;
++  }
++#endif
++
+   return true;
+ }
+ 
+diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp
+index 9c20e7784223..1401352647cd 100644
+--- a/llvm/tools/opt/opt.cpp
++++ b/llvm/tools/opt/opt.cpp
+@@ -456,6 +456,9 @@ int main(int argc, char **argv) {
+   initializeWriteBitcodePassPass(Registry);
+   initializeReplaceWithVeclibLegacyPass(Registry);
+   initializeJMCInstrumenterPass(Registry);
++#if defined(ENABLE_AUTOTUNER)
++  initializeAutotuningDumpLegacyPass(Registry);
++#endif
+ 
+   SmallVector<PassPlugin, 1> PluginList;
+   PassPlugins.setCallback([&](const std::string &PluginPath) {
+@@ -516,7 +519,11 @@ int main(int argc, char **argv) {
+                               RemarksFormat, RemarksWithHotness,
+                               RemarksHotnessThreshold);
+   if (Error E = RemarksFileOrErr.takeError()) {
++#if defined(ENABLE_AUTOTUNER)
++    errs() << "error: " << toString(std::move(E)) << '\n';
++#else
+     errs() << toString(std::move(E)) << '\n';
++#endif
+     return 1;
+   }
+   std::unique_ptr<ToolOutputFile> RemarksFile = std::move(*RemarksFileOrErr);
+@@ -641,6 +648,20 @@ int main(int argc, char **argv) {
+     M->addModuleFlag(Module::Error, "UnifiedLTO", 1);
+   }
+ 
++#if defined(ENABLE_AUTOTUNER)
++  // AUTO-TUNING - auto-tuning initialization for this module
++  // if the auto-tuning flag is on
++  if (Error E = autotuning::Engine.init(M->getModuleIdentifier())) {
++    errs() << "error: " << toString(std::move(E)) << '\n';
++    return 1;
++  }
++  if (autotuning::Engine.isEnabled() && autotuning::Engine.isParseInput() &&
++      (autotuning::Engine.LLVMParams.size() ||
++       autotuning::Engine.ProgramParams.size()))
++    llvm::cl::ParseAutoTunerOptions(autotuning::Engine.LLVMParams,
++                                    autotuning::Engine.ProgramParams);
++#endif
++
+   // Add an appropriate TargetLibraryInfo pass for the module's triple.
+   TargetLibraryInfoImpl TLII(ModuleTriple);
+ 
+@@ -778,6 +799,30 @@ int main(int argc, char **argv) {
+     Passes.add(TPC);
+   }
+ 
++#if defined(ENABLE_AUTOTUNER)
++  // AUTO-TUNING - If auto-tuning is enabled, try to generate passes
++  // from auto-tuning interface and disable all optimization passes.
++  if (autotuning::Engine.isEnabled()) {
++    std::vector<std::string> PassesList;
++    bool Changed = autotuning::Engine.lookUpGlobalParams("OptPass", PassesList);
++    if (Changed) {
++      // disable all optimization passes of all optimization levels
++      OptLevelO0 = false;
++      OptLevelO1 = false;
++      OptLevelO2 = false;
++      OptLevelOs = false;
++      OptLevelOz = false;
++      OptLevelO3 = false;
++      for (auto const &Value : PassesList) {
++        const PassInfo *PassInf = (Registry.getPassInfo(StringRef(Value)));
++        if (PassInf) {
++          PassList.push_back(PassInf);
++        }
++      }
++    }
++  }
++#endif
++
+   // Create a new optimization pass for each one specified on the command line
+   for (unsigned i = 0; i < PassList.size(); ++i) {
+     const PassInfo *PassInf = PassList[i];
+@@ -878,6 +923,14 @@
+   if (DebugifyEach && !DebugifyExport.empty())
+     exportDebugifyStats(DebugifyExport, Passes.getDebugifyStatsMap());
+ 
++#if defined(ENABLE_AUTOTUNER)
++  // AUTO-TUNING - auto-tuning finalization for this module
++  if (Error E = autotuning::Engine.finalize()) {
++    errs() << "error: " << toString(std::move(E)) << '\n';
++    return 1;
++  }
++#endif
++
+   // Declare success.
+   if (!NoOutput)
+     Out->keep();
+-- 
+2.33.0
+
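The llc.cpp and opt.cpp hunks above wire the same engine lifecycle into both tools. Below is a condensed sketch of that flow for reference; only the autotuning::Engine and llvm::cl::ParseAutoTunerOptions calls are taken from the patch, while processModule and its surrounding scaffolding are invented for illustration and are not part of the patch:

#include "llvm/AutoTuner/AutoTuning.h" // header added by this patch
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"

// Hypothetical driver showing where the patch hooks in the engine.
static int processModule(llvm::Module &M) {
  // 1) Initialize the engine for this module; both tools bail out on error.
  if (llvm::Error E = autotuning::Engine.init(M.getModuleIdentifier())) {
    llvm::errs() << "error: " << llvm::toString(std::move(E)) << '\n';
    return 1;
  }
  // 2) When replaying a tuned configuration, feed LLVMParams/ProgramParams
  //    back through the command-line parser before the pipeline is built.
  if (autotuning::Engine.isEnabled() && autotuning::Engine.isParseInput() &&
      (autotuning::Engine.LLVMParams.size() ||
       autotuning::Engine.ProgramParams.size()))
    llvm::cl::ParseAutoTunerOptions(autotuning::Engine.LLVMParams,
                                    autotuning::Engine.ProgramParams);

  // ... build and run the (possibly AutoTuner-reordered) pass pipeline ...

  // 3) Finalize to flush this module's tuning records.
  if (llvm::Error E = autotuning::Engine.finalize()) {
    llvm::errs() << "error: " << llvm::toString(std::move(E)) << '\n';
    return 1;
  }
  return 0;
}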
View file
_service:tar_scm:0021-Backport-GlobalISel-Don-t-expand-stacksave-stackrestore-in-IRTranslator.patch
Deleted
@@ -1,315 +0,0 @@ -From 7aeecae6393d5c3333beec64ad343ed1cabe75e4 Mon Sep 17 00:00:00 2001 -From: Matt Arsenault <Matthew.Arsenault@amd.com> -Date: Sat, 29 Jul 2023 19:12:24 -0400 -Subject: PATCH 1/7 GlobalISel: Don't expand stacksave/stackrestore in - IRTranslator - -In some (likely invalid edge cases anyway), it's not correct to -directly copy the stack pointer register. ---- - .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 2 + - llvm/include/llvm/Support/TargetOpcodes.def | 6 +++ - llvm/include/llvm/Target/GenericOpcodes.td | 12 ++++++ - llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 25 ++---------- - .../CodeGen/GlobalISel/LegalizerHelper.cpp | 26 +++++++++++++ - .../AArch64/GISel/AArch64LegalizerInfo.cpp | 4 +- - llvm/lib/Target/X86/X86LegalizerInfo.cpp | 4 ++ - .../AArch64/GlobalISel/arm64-irtranslator.ll | 4 +- - .../GlobalISel/legalizer-info-validation.mir | 10 ++++- - .../GlobalISel/stacksave-stackrestore.ll | 35 +++++++++++++++++ - .../X86/GlobalISel/stacksave-stackrestore.ll | 39 +++++++++++++++++++ - 11 files changed, 141 insertions(+), 26 deletions(-) - create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll - create mode 100644 llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll - -diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h -index a568edd0e640..9288091874cf 100644 ---- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h -+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h -@@ -401,6 +401,8 @@ public: - LegalizeResult lowerExtractInsertVectorElt(MachineInstr &MI); - LegalizeResult lowerShuffleVector(MachineInstr &MI); - LegalizeResult lowerDynStackAlloc(MachineInstr &MI); -+ LegalizeResult lowerStackSave(MachineInstr &MI); -+ LegalizeResult lowerStackRestore(MachineInstr &MI); - LegalizeResult lowerExtract(MachineInstr &MI); - LegalizeResult lowerInsert(MachineInstr &MI); - LegalizeResult lowerSADDO_SSUBO(MachineInstr &MI); -diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def -index 186bea75ae96..c92ce6dc701c 100644 ---- a/llvm/include/llvm/Support/TargetOpcodes.def -+++ b/llvm/include/llvm/Support/TargetOpcodes.def -@@ -763,6 +763,12 @@ HANDLE_TARGET_OPCODE(G_JUMP_TABLE) - /// Generic dynamic stack allocation. - HANDLE_TARGET_OPCODE(G_DYN_STACKALLOC) - -+/// Generic stack pointer save. -+HANDLE_TARGET_OPCODE(G_STACKSAVE) -+ -+/// Generic stack pointer restore. -+HANDLE_TARGET_OPCODE(G_STACKRESTORE) -+ - /// Strict floating point instructions. 
- HANDLE_TARGET_OPCODE(G_STRICT_FADD) - HANDLE_TARGET_OPCODE(G_STRICT_FSUB) -diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td -index 00d56d1c4bd5..e8cfaeab3cd8 100644 ---- a/llvm/include/llvm/Target/GenericOpcodes.td -+++ b/llvm/include/llvm/Target/GenericOpcodes.td -@@ -225,6 +225,18 @@ def G_DYN_STACKALLOC : GenericInstruction { - let hasSideEffects = true; - } - -+def G_STACKSAVE : GenericInstruction { -+ let OutOperandList = (outs ptype0:$dst); -+ let InOperandList = (ins); -+ let hasSideEffects = true; -+} -+ -+def G_STACKRESTORE : GenericInstruction { -+ let OutOperandList = (outs); -+ let InOperandList = (ins ptype0:$src); -+ let hasSideEffects = true; -+} -+ - def G_FREEZE : GenericInstruction { - let OutOperandList = (outs type0:$dst); - let InOperandList = (ins type0:$src); -diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp -index 9a67a8d05a4d..e4b837c6b8ce 100644 ---- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp -+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp -@@ -2229,31 +2229,12 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, - return true; - } - case Intrinsic::stacksave: { -- // Save the stack pointer to the location provided by the intrinsic. -- Register Reg = getOrCreateVReg(CI); -- Register StackPtr = MF->getSubtarget() -- .getTargetLowering() -- ->getStackPointerRegisterToSaveRestore(); -- -- // If the target doesn't specify a stack pointer, then fall back. -- if (!StackPtr) -- return false; -- -- MIRBuilder.buildCopy(Reg, StackPtr); -+ MIRBuilder.buildInstr(TargetOpcode::G_STACKSAVE, {getOrCreateVReg(CI)}, {}); - return true; - } - case Intrinsic::stackrestore: { -- // Restore the stack pointer from the location provided by the intrinsic. -- Register Reg = getOrCreateVReg(*CI.getArgOperand(0)); -- Register StackPtr = MF->getSubtarget() -- .getTargetLowering() -- ->getStackPointerRegisterToSaveRestore(); -- -- // If the target doesn't specify a stack pointer, then fall back. 
-- if (!StackPtr) -- return false; -- -- MIRBuilder.buildCopy(StackPtr, Reg); -+ MIRBuilder.buildInstr(TargetOpcode::G_STACKRESTORE, {}, -+ {getOrCreateVReg(*CI.getArgOperand(0))}); - return true; - } - case Intrinsic::cttz: -diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp -index f0da0d88140f..75d9789be4d0 100644 ---- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp -+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp -@@ -3503,6 +3503,10 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { - return lowerShuffleVector(MI); - case G_DYN_STACKALLOC: - return lowerDynStackAlloc(MI); -+ case G_STACKSAVE: -+ return lowerStackSave(MI); -+ case G_STACKRESTORE: -+ return lowerStackRestore(MI); - case G_EXTRACT: - return lowerExtract(MI); - case G_INSERT: -@@ -6810,6 +6814,28 @@ LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) { - return Legalized; - } - -+LegalizerHelper::LegalizeResult -+LegalizerHelper::lowerStackSave(MachineInstr &MI) { -+ Register StackPtr = TLI.getStackPointerRegisterToSaveRestore(); -+ if (!StackPtr) -+ return UnableToLegalize; -+ -+ MIRBuilder.buildCopy(MI.getOperand(0), StackPtr); -+ MI.eraseFromParent(); -+ return Legalized; -+} -+ -+LegalizerHelper::LegalizeResult -+LegalizerHelper::lowerStackRestore(MachineInstr &MI) { -+ Register StackPtr = TLI.getStackPointerRegisterToSaveRestore(); -+ if (!StackPtr) -+ return UnableToLegalize; -+ -+ MIRBuilder.buildCopy(StackPtr, MI.getOperand(0)); -+ MI.eraseFromParent(); -+ return Legalized; -+} -+ - LegalizerHelper::LegalizeResult - LegalizerHelper::lowerExtract(MachineInstr &MI) { - auto DstReg, DstTy, SrcReg, SrcTy = MI.getFirst2RegLLTs(); -diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp -index d905da4eaec3..f0130a0be29d 100644 ---- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp -+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp -@@ -797,7 +797,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) - return Query.Types0 == p0 && Query.Types1 == s64; - }); - -- getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower(); -+ getActionDefinitionsBuilder({G_DYN_STACKALLOC, -+ G_STACKSAVE, -+ G_STACKRESTORE}).lower(); - - if (ST.hasMOPS()) { - // G_BZERO is not supported. 
-diff --git a/llvm/lib/Target/X86/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/X86LegalizerInfo.cpp
-index a4a247f85f3d..104461cff0a9 100644
---- a/llvm/lib/Target/X86/X86LegalizerInfo.cpp
-+++ b/llvm/lib/Target/X86/X86LegalizerInfo.cpp
-@@ -528,6 +528,10 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
-   // memory intrinsics
-   getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall();
- 
-+  getActionDefinitionsBuilder({G_DYN_STACKALLOC,
-+                               G_STACKSAVE,
-+                               G_STACKRESTORE}).lower();
-+
-   // fp intrinsics
-   getActionDefinitionsBuilder(G_INTRINSIC_ROUNDEVEN)
-       .scalarize(0)
-diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
-index 5f3544add398..575cd6b874e3 100644
---- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
-+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
-@@ -2392,8 +2392,8 @@ declare ptr @llvm.stacksave()
- declare void @llvm.stackrestore(ptr)
- define void @test_stacksaverestore() {
-   ; CHECK-LABEL: name: test_stacksaverestore
--  ; CHECK: [[SAVE:%[0-9]+]]:_(p0) = COPY $sp
--  ; CHECK-NEXT: $sp = COPY [[SAVE]](p0)
-+  ; CHECK: [[SAVE:%[0-9]+]]:_(p0) = G_STACKSAVE
-+  ; CHECK-NEXT: G_STACKRESTORE [[SAVE]]
-   ; CHECK-NEXT: RET_ReallyLR
-   %sp = call ptr @llvm.stacksave()
-   call void @llvm.stackrestore(ptr %sp)
-diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
-index b4fe73d29fa6..461161f5b338 100644
---- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
-+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
-@@ -641,7 +641,15 @@
- # DEBUG-NEXT: G_JUMP_TABLE (opcode {{[0-9]+}}): 1 type index, 0 imm indices
- # DEBUG-NEXT: .. the first uncovered type index: 1, OK
- # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
--# DEBUG-NEXT: G_DYN_STACKALLOC (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
-+# DEBUG-NEXT: G_DYN_STACKALLOC (opcode [[DYN_STACKALLOC:[0-9]+]]): 2 type indices, 0 imm indices
-+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
-+# DEBUG-NEXT: G_STACKSAVE (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to [[DYN_STACKALLOC]]
-+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
-+# DEBUG-NEXT: G_STACKRESTORE (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to [[DYN_STACKALLOC]]
- # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
- # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
- # DEBUG-NEXT: G_STRICT_FADD (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll b/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll
-new file mode 100644
-index 000000000000..16bf85af9c17
---- /dev/null
-+++ b/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll
-@@ -0,0 +1,35 @@
-+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-+; RUN: llc -global-isel=1 -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
-+
-+declare void @use_addr(ptr)
-+declare ptr @llvm.stacksave.p0()
-+declare void @llvm.stackrestore.p0(ptr)
-+
-+define void @test_scoped_alloca(i64 %n) {
-+; CHECK-LABEL: test_scoped_alloca:
-+; CHECK:       // %bb.0:
-+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
-+; CHECK-NEXT:    mov x29, sp
-+; CHECK-NEXT:    .cfi_def_cfa w29, 32
-+; CHECK-NEXT:    .cfi_offset w19, -16
-+; CHECK-NEXT:    .cfi_offset w30, -24
-+; CHECK-NEXT:    .cfi_offset w29, -32
-+; CHECK-NEXT:    add x9, x0, #15
-+; CHECK-NEXT:    mov x8, sp
-+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
-+; CHECK-NEXT:    mov x19, sp
-+; CHECK-NEXT:    sub x0, x8, x9
-+; CHECK-NEXT:    mov sp, x0
-+; CHECK-NEXT:    bl use_addr
-+; CHECK-NEXT:    mov sp, x19
-+; CHECK-NEXT:    mov sp, x29
-+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
-+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-+; CHECK-NEXT:    ret
-+  %sp = call ptr @llvm.stacksave.p0()
-+  %addr = alloca i8, i64 %n
-+  call void @use_addr(ptr %addr)
-+  call void @llvm.stackrestore.p0(ptr %sp)
-+  ret void
-+}
-diff --git a/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll b/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll
-new file mode 100644
-index 000000000000..e86c04ee22db
---- /dev/null
-+++ b/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll
-@@ -0,0 +1,39 @@
-+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-+; RUN: llc -global-isel=1 -mtriple=x86_64-linux-gnu -o - %s | FileCheck %s
-+
-+declare void @use_addr(ptr)
-+declare ptr @llvm.stacksave.p0()
-+declare void @llvm.stackrestore.p0(ptr)
-+
-+define void @test_scoped_alloca(i64 %n) {
-+; CHECK-LABEL: test_scoped_alloca:
-+; CHECK:       # %bb.0:
-+; CHECK-NEXT:    pushq %rbp
-+; CHECK-NEXT:    .cfi_def_cfa_offset 16
-+; CHECK-NEXT:    .cfi_offset %rbp, -16
-+; CHECK-NEXT:    movq %rsp, %rbp
-+; CHECK-NEXT:    .cfi_def_cfa_register %rbp
-+; CHECK-NEXT:    pushq %rbx
-+; CHECK-NEXT:    pushq %rax
-+; CHECK-NEXT:    .cfi_offset %rbx, -24
-+; CHECK-NEXT:    movq %rsp, %rbx
-+; CHECK-NEXT:    movq %rsp, %rax
-+; CHECK-NEXT:    imulq $1, %rdi, %rcx
-+; CHECK-NEXT:    addq $15, %rcx
-+; CHECK-NEXT:    andq $-16, %rcx
-+; CHECK-NEXT:    subq %rcx, %rax
-+; CHECK-NEXT:    movq %rax, %rsp
-+; CHECK-NEXT:    movq %rax, %rdi
-+; CHECK-NEXT:    callq use_addr
-+; CHECK-NEXT:    movq %rbx, %rsp
-+; CHECK-NEXT:    leaq -8(%rbp), %rsp
-+; CHECK-NEXT:    popq %rbx
-+; CHECK-NEXT:    popq %rbp
-+; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
-+; CHECK-NEXT:    retq
-+  %sp = call ptr @llvm.stacksave.p0()
-+  %addr = alloca i8, i64 %n
-+  call void @use_addr(ptr %addr)
-+  call void @llvm.stackrestore.p0(ptr %sp)
-+  ret void
-+}
--- 
-2.42.0.windows.2
-
_service:tar_scm:0022-Backport-AArch64-Refactor-allocation-of-locals-and-stack-realignment.patch
Deleted
@@ -1,546 +0,0 @@ -From 8db377e2a22d83637171008b6c8723f1869a2926 Mon Sep 17 00:00:00 2001 -From: rickyleung <leung.wing.chung@huawei.com> -Date: Tue, 7 May 2024 21:24:49 +0800 -Subject: PATCH 3/7 backportAArch64 Refactor allocation of locals and - stack realignment - -Reference: https://github.com/wc00862805aj/llvm-project/commit/dedf2c6bb5193652f6ad7d9ff9e676624c2485b7? - -Factor out some stack allocation in a separate function. This patch -splits out the generic portion of a larger refactoring done as a part of -stack clash protection support. - -The patch is almost, but not quite NFC. The only difference should -be that where we have adjacent allocation of stack space -for local SVE objects and non-local SVE objects the order -of `sub sp, ...` and `addvl sp, ...` instructions is reversed, because now -it's done with a single call to `emitFrameOffset` and it happens -add/subtract the fixed part before the scalable part, e.g. - - addvl sp, sp, #-2 - sub sp, sp, llvm#16, lsl llvm#12 - sub sp, sp, llvm#16 - -becomes - - sub sp, sp, llvm#16, lsl llvm#12 - sub sp, sp, llvm#16 - addvl sp, sp, #-2 ---- - .../Target/AArch64/AArch64FrameLowering.cpp | 114 +++++++++--------- - .../lib/Target/AArch64/AArch64FrameLowering.h | 5 + - .../AArch64/framelayout-sve-basepointer.mir | 4 +- - .../framelayout-sve-fixed-width-access.mir | 2 +- - .../framelayout-sve-scavengingslot.mir | 4 +- - llvm/test/CodeGen/AArch64/framelayout-sve.mir | 54 ++++----- - .../AArch64/spill-stack-realignment.mir | 2 +- - llvm/test/CodeGen/AArch64/stack-guard-sve.ll | 4 +- - .../AArch64/sve-calling-convention-mixed.ll | 4 +- - .../CodeGen/AArch64/sve-fixed-length-fp128.ll | 4 +- - 10 files changed, 103 insertions(+), 94 deletions(-) - -diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp -index 4d5676f34101..eeb6185fa36d 100644 ---- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp -+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp -@@ -300,6 +300,7 @@ static int64_t getArgumentStackToRestore(MachineFunction &MF, - static bool produceCompactUnwindFrame(MachineFunction &MF); - static bool needsWinCFI(const MachineFunction &MF); - static StackOffset getSVEStackSize(const MachineFunction &MF); -+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB); - static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF); - - /// Returns true if a homogeneous prolog or epilog code can be emitted -@@ -671,6 +672,44 @@ void AArch64FrameLowering::emitCalleeSavedSVERestores( - emitCalleeSavedRestores(MBB, MBBI, true); - } - -+void AArch64FrameLowering::allocateStackSpace( -+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, -+ bool NeedsRealignment, StackOffset AllocSize, bool NeedsWinCFI, -+ bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset) const { -+ -+ if (!AllocSize) -+ return; -+ -+ DebugLoc DL; -+ MachineFunction &MF = *MBB.getParent(); -+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); -+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); -+ AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>(); -+ const MachineFrameInfo &MFI = MF.getFrameInfo(); -+ -+ Register TargetReg = -+ NeedsRealignment ? 
findScratchNonCalleeSaveRegister(&MBB) : AArch64::SP; -+ // SUB Xd/SP, SP, AllocSize -+ emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII, -+ MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI, -+ EmitCFI, InitialOffset); -+ -+ if (NeedsRealignment) { -+ const int64_t MaxAlign = MFI.getMaxAlign().value(); -+ const uint64_t AndMask = ~(MaxAlign - 1); -+ // AND SP, Xd, 0b11111...0000 -+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP) -+ .addReg(TargetReg, RegState::Kill) -+ .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) -+ .setMIFlags(MachineInstr::FrameSetup); -+ AFI.setStackRealigned(true); -+ -+ // No need for SEH instructions here; if we're realigning the stack, -+ // we've set a frame pointer and already finished the SEH prologue. -+ assert(!NeedsWinCFI); -+ } -+} -+ - static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) { - switch (Reg.id()) { - default: -@@ -1769,7 +1808,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, - } - } - -- StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {}; -+ StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize; - MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI; - - // Process the SVE callee-saves to determine what space needs to be -@@ -1782,67 +1821,32 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, - ++MBBI; - CalleeSavesEnd = MBBI; - -- AllocateBefore = StackOffset::getScalable(CalleeSavedSize); -- AllocateAfter = SVEStackSize - AllocateBefore; -+ SVECalleeSavesSize = StackOffset::getScalable(CalleeSavedSize); -+ SVELocalsSize = SVEStackSize - SVECalleeSavesSize; - } - - // Allocate space for the callee saves (if any). -- emitFrameOffset( -- MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, -AllocateBefore, TII, -- MachineInstr::FrameSetup, false, false, nullptr, -- EmitAsyncCFI && !HasFP && AllocateBefore, -- StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); -+ StackOffset CFAOffset = -+ StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); -+ allocateStackSpace(MBB, CalleeSavesBegin, false, SVECalleeSavesSize, false, -+ nullptr, EmitAsyncCFI && !HasFP, CFAOffset); -+ CFAOffset += SVECalleeSavesSize; - - if (EmitAsyncCFI) - emitCalleeSavedSVELocations(MBB, CalleeSavesEnd); - -- // Finally allocate remaining SVE stack space. -- emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP, -- -AllocateAfter, TII, MachineInstr::FrameSetup, false, false, -- nullptr, EmitAsyncCFI && !HasFP && AllocateAfter, -- AllocateBefore + StackOffset::getFixed( -- (int64_t)MFI.getStackSize() - NumBytes)); -- -- // Allocate space for the rest of the frame. -- if (NumBytes) { -- unsigned scratchSPReg = AArch64::SP; -- -- if (NeedsRealignment) { -- scratchSPReg = findScratchNonCalleeSaveRegister(&MBB); -- assert(scratchSPReg != AArch64::NoRegister); -- } -- -- // If we're a leaf function, try using the red zone. -- if (!canUseRedZone(MF)) { -- // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have -- // the correct value here, as NumBytes also includes padding bytes, -- // which shouldn't be counted here. 
-- emitFrameOffset( -- MBB, MBBI, DL, scratchSPReg, AArch64::SP, -- StackOffset::getFixed(-NumBytes), TII, MachineInstr::FrameSetup, -- false, NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP, -- SVEStackSize + -- StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); -- } -- if (NeedsRealignment) { -- assert(MFI.getMaxAlign() > Align(1)); -- assert(scratchSPReg != AArch64::SP); -- -- // SUB X9, SP, NumBytes -- // -- X9 is temporary register, so shouldn't contain any live data here, -- // -- free to use. This is already produced by emitFrameOffset above. -- // AND SP, X9, 0b11111...0000 -- uint64_t AndMask = ~(MFI.getMaxAlign().value() - 1); -- -- BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) -- .addReg(scratchSPReg, RegState::Kill) -- .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)); -- AFI->setStackRealigned(true); -- -- // No need for SEH instructions here; if we're realigning the stack, -- // we've set a frame pointer and already finished the SEH prologue. -- assert(!NeedsWinCFI); -- } -+ // Allocate space for the rest of the frame including SVE locals. Align the -+ // stack as necessary. -+ assert(!(canUseRedZone(MF) && NeedsRealignment) && -+ "Cannot use redzone with stack realignment"); -+ if (!canUseRedZone(MF)) { -+ // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have -+ // the correct value here, as NumBytes also includes padding bytes, -+ // which shouldn't be counted here. -+ allocateStackSpace(MBB, CalleeSavesEnd, NeedsRealignment, -+ SVELocalsSize + StackOffset::getFixed(NumBytes), -+ NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP, -+ CFAOffset); - } - - // If we need a base pointer, set it up here. It's whatever the value of the -diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h -index 147b5c181be5..f3313f3b53ff 100644 ---- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h -+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h -@@ -150,6 +150,11 @@ private: - MachineBasicBlock::iterator MBBI) const; - void emitCalleeSavedSVERestores(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) const; -+ void allocateStackSpace(MachineBasicBlock &MBB, -+ MachineBasicBlock::iterator MBBI, -+ bool NeedsRealignment, StackOffset AllocSize, -+ bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI, -+ StackOffset InitialOffset) const; - - /// Emit target zero call-used regs. 
- void emitZeroCallUsedRegs(BitVector RegsToZero, -diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir -index 623c0f240be4..265c474fbc5d 100644 ---- a/llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir -+++ b/llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir -@@ -4,8 +4,8 @@ - name: hasBasepointer - # CHECK-LABEL: name: hasBasepointer - # CHECK: bb.0: --# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 --# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 -+# CHECK: $sp = frame-setup SUBXri $sp, 16, 0 -+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 - # CHECK-NEXT: $x19 = ADDXri $sp, 0, 0 - # CHECK: STRXui $x0, $x19, 0 - tracksRegLiveness: true -diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-fixed-width-access.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-fixed-width-access.mir -index e367a380f8ba..35fd7ca77d5c 100644 ---- a/llvm/test/CodeGen/AArch64/framelayout-sve-fixed-width-access.mir -+++ b/llvm/test/CodeGen/AArch64/framelayout-sve-fixed-width-access.mir -@@ -7,9 +7,9 @@ - ; CHECK: // %bb.0: // %entry - ; CHECK-NEXT: stp x29, x30, sp, #-16! // 16-byte Folded Spill - ; CHECK-NEXT: mov x29, sp -+ ; CHECK-NEXT: sub sp, sp, #2064 - ; CHECK-NEXT: addvl sp, sp, #-32 - ; CHECK-NEXT: addvl sp, sp, #-28 -- ; CHECK-NEXT: sub sp, sp, #2064 - ; CHECK-NEXT: ldr x8, sp, #2048 - ; CHECK-NEXT: addvl sp, sp, #31 - ; CHECK-NEXT: addvl sp, sp, #29 -diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir -index d54f67634d02..680f9c335c25 100644 ---- a/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir -+++ b/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir -@@ -4,9 +4,9 @@ - name: LateScavengingSlot - # CHECK-LABEL: name: LateScavengingSlot - # CHECK: bb.0: --# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 --# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 8, 12 -+# CHECK: $sp = frame-setup SUBXri $sp, 8, 12 - # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 -+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 - # CHECK: STRXui killed $SCRATCH:x0-9+, $sp, 0 - # CHECK-NEXT: $SCRATCH = ADDVL_XXI $fp, -1 - # CHECK-NEXT: STRXui $x0, killed $SCRATCH, 0 -diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir -index 7c87587c6dc4..8b657c95bfc7 100644 ---- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir -+++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir -@@ -60,10 +60,10 @@ - # CHECK-NEXT: $sp = frame-setup STRXpre killed $SCRATCH:a-z0-9+, $sp, -16 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 --# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 --# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 --# CHECK-NEXT: CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 -+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 32 -+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 -+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 - - # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2 - # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 32 -@@ -77,7 +77,7 @@ - # ASM-LABEL: test_allocate_sve: - # ASM: .cfi_def_cfa_offset 16 - # 
ASM-NEXT: .cfi_offset w29, -16 --# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG -+# ASM: .cfi_def_cfa_offset 32 - # ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 16 * VG - # ASM: .cfi_def_cfa wsp, 32 - # ASM: .cfi_def_cfa_offset 16 -@@ -87,7 +87,7 @@ - # - # UNWINDINFO: DW_CFA_def_cfa_offset: +16 - # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 --# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus -+# UNWINDINFO: DW_CFA_def_cfa_offset: +32 - # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus - # UNWINDINFO: DW_CFA_def_cfa: reg31 +32 - # UNWINDINFO: DW_CFA_def_cfa_offset: +16 -@@ -125,9 +125,9 @@ body: | - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w20, -8 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w21, -16 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -32 --# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 --# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 -+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 48 -+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # - # CHECK-NEXT: $x20 = IMPLICIT_DEF -@@ -149,7 +149,7 @@ body: | - # ASM: .cfi_offset w20, -8 - # ASM-NEXT: .cfi_offset w21, -16 - # ASM-NEXT: .cfi_offset w29, -32 --# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 16 * VG -+# ASM: .cfi_def_cfa_offset 48 - # ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 16 * VG - # - # ASM: .cfi_def_cfa wsp, 48 -@@ -164,7 +164,7 @@ body: | - # UNWINDINFO: DW_CFA_offset: reg20 -8 - # UNWINDINFO-NEXT: DW_CFA_offset: reg21 -16 - # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -32 --# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus -+# UNWINDINFO: DW_CFA_def_cfa_offset: +48 - # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +48, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus - # - # UNWINDINFO: DW_CFA_def_cfa: reg31 +48 -@@ -205,9 +205,9 @@ body: | - # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 --# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 - # CHECK-NEXT: $TMP:x0-9+ = frame-setup SUBXri $sp, 16, 0 --# CHECK-NEXT: $sp = ANDXri killed $TMP -+# CHECK-NEXT: $TMP = frame-setup ADDVL_XXI $TMP, -2 -+# CHECK-NEXT: $sp = frame-setup ANDXri killed $TMP - # CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 - # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 - # CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 -@@ -267,9 +267,9 @@ body: | - # CHECK-NEXT: $sp = frame-setup STRXpre killed $SCRATCH:a-z0-9+, $sp, -16 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 --# CHECK-NEXT: $sp = 
frame-setup ADDVL_XXI $sp, -3 --# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 -+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 32 -+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 - - # CHECK-NEXT: $TMP:x0-9+ = ADDXri $sp, 16 -@@ -292,7 +292,7 @@ body: | - # ASM-LABEL: test_address_sve: - # ASM: .cfi_def_cfa_offset 16 - # ASM-NEXT: .cfi_offset w29, -16 --# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG -+# ASM: .cfi_def_cfa_offset 32 - # ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 24 * VG - # - # ASM: .cfi_def_cfa wsp, 32 -@@ -302,7 +302,7 @@ body: | - # - # UNWINDINFO: DW_CFA_def_cfa_offset: +16 - # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 --# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus -+# UNWINDINFO: DW_CFA_def_cfa_offset: +32 - # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus - # - # UNWINDINFO: DW_CFA_def_cfa: reg31 +32 -@@ -353,8 +353,8 @@ body: | - # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 --# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 - # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 -+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 - - # CHECK-NEXT: STR_ZXI $z0, $fp, -1 - # CHECK-NEXT: STR_ZXI $z1, $fp, -2 -@@ -429,9 +429,9 @@ body: | - # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 - --# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 --# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 -+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 32 -+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK: $TMP:x0-9+ = ADDVL_XXI $sp, 1 - # CHECK-NEXT: $x0 = LDRXui killed $TMP, 4 -@@ -448,7 +448,7 @@ body: | - # ASM-LABEL: test_stack_arg_sve: - # ASM: .cfi_def_cfa_offset 16 - # ASM-NEXT: .cfi_offset w29, -16 --# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -+# ASM: .cfi_def_cfa_offset 32 - # ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG - # - # ASM: .cfi_def_cfa wsp, 32 -@@ -458,7 +458,7 @@ body: | - - # UNWINDINFO: DW_CFA_def_cfa_offset: +16 - # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 --# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus -+# UNWINDINFO: DW_CFA_def_cfa_offset: +32 - # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus - # - # 
UNWINDINFO: DW_CFA_def_cfa: reg31 +32 -@@ -640,8 +640,8 @@ body: | - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w19, -16 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -24 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -32 --# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 - # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 -+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 - # CHECK-NEXT: $x19 = ADDXri $sp, 0, 0 - # CHECK-NEXT: STRXui $xzr, $x19, 0 - # CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 -@@ -863,9 +863,9 @@ body: | - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4d, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4e, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4f, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 --# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 --# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK: $sp = frame-setup SUBXri $sp, 32, 0 -+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 -+# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 - - # CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 -@@ -916,7 +916,7 @@ body: | - # ASM-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 32 - 48 * VG - # ASM-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 32 - 56 * VG - # ASM-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 32 - 64 * VG --# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 152 * VG -+# ASM: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 64 + 144 * VG - # ASM: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 64 + 152 * VG - # - # ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 152 * VG -@@ -950,7 +950,7 @@ body: | - # UNWINDINFO-NEXT: DW_CFA_expression: reg77 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -48, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus - # UNWINDINFO-NEXT: DW_CFA_expression: reg78 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -56, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus - # UNWINDINFO-NEXT: DW_CFA_expression: reg79 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -64, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus --# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +152, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus -+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +64, DW_OP_plus, DW_OP_consts +144, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus - # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +64, DW_OP_plus, DW_OP_consts +152, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus - # - # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +152, 
DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus -@@ -1031,9 +1031,9 @@ body: | - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 - # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 --# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 - # CHECK-NEXT: $TMP:x0-9+ = frame-setup SUBXri $sp, 16, 0 --# CHECK-NEXT: $sp = ANDXri killed $TMP -+# CHECK-NEXT: $TMP = frame-setup ADDVL_XXI $TMP, -1 -+# CHECK-NEXT: $sp = frame-setup ANDXri killed $TMP - - # CHECK: $sp = frame-destroy ADDVL_XXI $fp, -18 - # CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4 -diff --git a/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir b/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir -index 1b9411d07f43..f6fc627ac2d3 100644 ---- a/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir -+++ b/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir -@@ -21,7 +21,7 @@ stack: - - { id: 1, size: 4, alignment: 4, local-offset: -68 } - - # CHECK: body: --# CHECK: $sp = ANDXri killed ${{x0-9+}}, 7865 -+# CHECK: $sp = frame-setup ANDXri killed ${{x0-9+}}, 7865 - # CHECK: STRSui $s0, $sp, 0 - # CHECK: STRSui $s0, $fp, 7 - body: | -diff --git a/llvm/test/CodeGen/AArch64/stack-guard-sve.ll b/llvm/test/CodeGen/AArch64/stack-guard-sve.ll -index 1672a7eb8739..5acbb22bf1ab 100644 ---- a/llvm/test/CodeGen/AArch64/stack-guard-sve.ll -+++ b/llvm/test/CodeGen/AArch64/stack-guard-sve.ll -@@ -148,9 +148,9 @@ entry: - - ; CHECK-LABEL: local_stack_alloc: - ; CHECK: mov x29, sp --; CHECK: addvl sp, sp, #-2 - ; CHECK: sub sp, sp, #16, lsl #12 - ; CHECK: sub sp, sp, #16 -+; CHECK: addvl sp, sp, #-2 - - ; Stack guard is placed below the SVE stack area (and above all fixed-width objects) - ; CHECK-DAG: add STACK_GUARD_SPILL_PART_LOC:x0-9+, sp, #8, lsl #12 -@@ -198,9 +198,9 @@ entry: - - ; CHECK-LABEL: local_stack_alloc_strong: - ; CHECK: mov x29, sp --; CHECK: addvl sp, sp, #-3 - ; CHECK: sub sp, sp, #16, lsl #12 - ; CHECK: sub sp, sp, #16 -+; CHECK: addvl sp, sp, #-3 - - ; Stack guard is placed at the top of the SVE stack area - ; CHECK-DAG: ldr STACK_GUARD:x0-9+, {{x0-9+}}, :lo12:__stack_chk_guard -diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll -index a97649523565..235364ac2321 100644 ---- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll -+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll -@@ -56,8 +56,8 @@ define float @foo2(ptr %x0, ptr %x1) nounwind { - ; CHECK-LABEL: foo2: - ; CHECK: // %bb.0: // %entry - ; CHECK-NEXT: stp x29, x30, sp, #-16! // 16-byte Folded Spill --; CHECK-NEXT: addvl sp, sp, #-4 - ; CHECK-NEXT: sub sp, sp, #16 -+; CHECK-NEXT: addvl sp, sp, #-4 - ; CHECK-NEXT: ptrue p0.b - ; CHECK-NEXT: add x8, sp, #16 - ; CHECK-NEXT: ld4d { z1.d - z4.d }, p0/z, x0 -@@ -699,8 +699,8 @@ define void @verify_all_operands_are_initialised() { - ; CHECK-LABEL: verify_all_operands_are_initialised: - ; CHECK: // %bb.0: - ; CHECK-NEXT: stp x29, x30, sp, #-16! 
// 16-byte Folded Spill --; CHECK-NEXT: addvl sp, sp, #-1 - ; CHECK-NEXT: sub sp, sp, #16 -+; CHECK-NEXT: addvl sp, sp, #-1 - ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG - ; CHECK-NEXT: .cfi_offset w30, -8 - ; CHECK-NEXT: .cfi_offset w29, -16 -diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll -index 31ff9287046c..b3529549c22b 100644 ---- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll -+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll -@@ -9,8 +9,8 @@ define void @fcvt_v4f64_v4f128(ptr %a, ptr %b) vscale_range(2,0) #0 { - ; CHECK: // %bb.0: - ; CHECK-NEXT: str x29, sp, #-32! // 8-byte Folded Spill - ; CHECK-NEXT: stp x30, x19, sp, #16 // 16-byte Folded Spill --; CHECK-NEXT: addvl sp, sp, #-2 - ; CHECK-NEXT: sub sp, sp, #48 -+; CHECK-NEXT: addvl sp, sp, #-2 - ; CHECK-NEXT: ptrue p0.d, vl4 - ; CHECK-NEXT: add x8, sp, #48 - ; CHECK-NEXT: ld1d { z0.d }, p0/z, x0 -@@ -59,8 +59,8 @@ define void @fcvt_v4f128_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { - ; CHECK: // %bb.0: - ; CHECK-NEXT: str x29, sp, #-32! // 8-byte Folded Spill - ; CHECK-NEXT: stp x30, x19, sp, #16 // 16-byte Folded Spill --; CHECK-NEXT: addvl sp, sp, #-2 - ; CHECK-NEXT: sub sp, sp, #128 -+; CHECK-NEXT: addvl sp, sp, #-2 - ; CHECK-NEXT: ldr q1, x0, #64 - ; CHECK-NEXT: mov x19, x1 - ; CHECK-NEXT: ldr q0, x0, #80 --- -2.42.0.windows.2 -
_service:tar_scm:0022-Prevent-environment-variables-from-exceeding-NAME_MA.patch
Added
@@ -0,0 +1,28 @@
+From 4673c2eaba443678c4dc6ae74ea16a489b415fed Mon Sep 17 00:00:00 2001
+From: liyunfei <liyunfei33@huawei.com>
+Date: Tue, 19 Sep 2023 09:31:43 +0800
+Subject: [PATCH] Prevent environment variables from exceeding NAME_MAX
+
+---
+ llvm/lib/Support/Unix/Path.inc | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc
+index 2ae7c6dc..f13f3165 100644
+--- a/llvm/lib/Support/Unix/Path.inc
++++ b/llvm/lib/Support/Unix/Path.inc
+@@ -1427,8 +1427,12 @@ static const char *getEnvTempDir() {
+   // variable.
+   const char *EnvironmentVariables[] = {"TMPDIR", "TMP", "TEMP", "TEMPDIR"};
+   for (const char *Env : EnvironmentVariables) {
+-    if (const char *Dir = std::getenv(Env))
++    if (const char *Dir = std::getenv(Env)) {
++      if (std::strlen(Dir) > NAME_MAX) {
++        continue;
++      }
+       return Dir;
++    }
+   }
+
+   return nullptr;
+--
\ No newline at end of file
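In isolation, the guard this hunk adds behaves like the following standalone sketch (illustrative only; the function name is hypothetical, NAME_MAX comes from <limits.h>): an oversized temp-dir override is skipped rather than returned, so later path construction is never fed a component longer than the filesystem allows.

    #include <limits.h>  /* NAME_MAX */
    #include <stdlib.h>
    #include <string.h>

    /* Sketch of the patched lookup: values longer than NAME_MAX are
       skipped instead of being returned as a temp-dir candidate. */
    static const char *env_temp_dir(void) {
      const char *vars[] = {"TMPDIR", "TMP", "TEMP", "TEMPDIR"};
      for (size_t i = 0; i < sizeof(vars) / sizeof(vars[0]); ++i) {
        const char *dir = getenv(vars[i]);
        if (dir && strlen(dir) <= NAME_MAX)
          return dir;
      }
      return NULL;
    }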
_service:tar_scm:0023-AArch64-Support-HiSilicon-s-HIP09-Processor.patch
Added
@@ -0,0 +1,517 @@ +From cac43828d26b178807d194b4bd7c5df69603df29 Mon Sep 17 00:00:00 2001 +From: xiajingze <xiajingze1@huawei.com> +Date: Wed, 31 Jul 2024 18:37:29 +0800 +Subject: PATCH AArch64 Support HiSilicon's HIP09 Processor + +Signed-off-by: xiajingze <xiajingze1@huawei.com> +--- + llvm/cmake/modules/HandleLLVMOptions.cmake | 8 ++ + .../llvm/TargetParser/AArch64TargetParser.h | 7 ++ + llvm/lib/Target/AArch64/AArch64.td | 36 +++++++ + .../lib/Target/AArch64/AArch64MacroFusion.cpp | 55 +++++++++++ + llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 9 ++ + llvm/lib/Target/AArch64/AArch64Subtarget.h | 9 +- + llvm/lib/Target/CMakeLists.txt | 4 + + llvm/lib/TargetParser/Host.cpp | 3 + + llvm/test/CodeGen/AArch64/cpus-hip09.ll | 11 +++ + .../CodeGen/AArch64/macro-fusion-mvnclz.mir | 20 ++++ + .../AArch64/misched-fusion-lit-hip09.ll | 73 ++++++++++++++ + llvm/test/CodeGen/AArch64/remat-hip09.ll | 18 ++++ + llvm/test/lit.site.cfg.py.in | 4 + + llvm/unittests/TargetParser/Host.cpp | 5 + + .../TargetParser/TargetParserTest.cpp | 16 +++ + 15 files changed, 277 insertions(+), 1 deletion(-) + create mode 100644 llvm/test/CodeGen/AArch64/cpus-hip09.ll + create mode 100644 llvm/test/CodeGen/AArch64/macro-fusion-mvnclz.mir + create mode 100644 llvm/test/CodeGen/AArch64/misched-fusion-lit-hip09.ll + create mode 100644 llvm/test/CodeGen/AArch64/remat-hip09.ll + +diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake +index 8be5d4ba5..74e68e25d 100644 +--- a/llvm/cmake/modules/HandleLLVMOptions.cmake ++++ b/llvm/cmake/modules/HandleLLVMOptions.cmake +@@ -112,6 +112,14 @@ else() + set(LLVM_ENABLE_AUTOTUNER 0) + endif() + ++option(LLVM_ENABLE_AARCH64_HIP09 "Enable HIP09 Processor" ON) ++if(LLVM_ENABLE_AARCH64_HIP09) ++ set(LLVM_ENABLE_AARCH64_HIP09 1) ++ add_definitions( -DENABLE_AARCH64_HIP09 ) ++else() ++ set(LLVM_ENABLE_AARCH64_HIP09 0) ++endif() ++ + if(LLVM_ENABLE_EXPENSIVE_CHECKS) + add_compile_definitions(EXPENSIVE_CHECKS) + +diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h +index dc4cdfa8e..07cd2fcbb 100644 +--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h ++++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h +@@ -542,6 +542,13 @@ inline constexpr CpuInfo CpuInfos = { + (AArch64::AEK_FP16 | AArch64::AEK_RAND | AArch64::AEK_SM4 | + AArch64::AEK_SHA3 | AArch64::AEK_SHA2 | AArch64::AEK_AES | + AArch64::AEK_MTE | AArch64::AEK_SB | AArch64::AEK_SSBS)}, ++#if defined(ENABLE_AARCH64_HIP09) ++ {"hip09", ARMV8_5A, ++ (AArch64::AEK_AES | AArch64::AEK_SM4 | AArch64::AEK_SHA2 | ++ AArch64::AEK_SHA3 | AArch64::AEK_FP16 | AArch64::AEK_PROFILE | ++ AArch64::AEK_FP16FML | AArch64::AEK_SVE | AArch64::AEK_I8MM | ++ AArch64::AEK_F32MM | AArch64::AEK_F64MM | AArch64::AEK_BF16)}, ++#endif + }; + + // An alias for a CPU. 
+diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td +index 8f50af4b7..c8bfd770f 100644 +--- a/llvm/lib/Target/AArch64/AArch64.td ++++ b/llvm/lib/Target/AArch64/AArch64.td +@@ -296,6 +296,12 @@ def FeatureFuseAddSub2RegAndConstOne : SubtargetFeature< + "fuse-addsub-2reg-const1", "HasFuseAddSub2RegAndConstOne", "true", + "CPU fuses (a + b + 1) and (a - b - 1)">; + ++#ifdef ENABLE_AARCH64_HIP09 ++def FeatureFuseMvnClz : SubtargetFeature< ++ "fuse-mvn-clz", "HasFuseMvnClz", "true", ++ "CPU fuses mvn+clz operations">; ++#endif ++ + def FeatureDisableLatencySchedHeuristic : SubtargetFeature< + "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true", + "Disable latency scheduling heuristic">; +@@ -1205,6 +1211,21 @@ def TuneTSV110 : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110", + FeatureFuseAES, + FeaturePostRAScheduler>; + ++#ifdef ENABLE_AARCH64_HIP09 ++def TuneHIP09 : SubtargetFeature<"hip09", "ARMProcFamily", "HIP09", ++ "HiSilicon HIP-09 processors", ++ FeatureCustomCheapAsMoveHandling, ++ FeatureExperimentalZeroingPseudos, ++ FeatureFuseAES, ++ FeatureLSLFast, ++ FeatureAscendStoreAddress, ++ FeatureCmpBccFusion, ++ FeatureArithmeticBccFusion, ++ FeatureFuseLiterals, ++ FeatureFuseMvnClz, ++ FeaturePostRAScheduler>; ++#endif ++ + def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1", + "Ampere Computing Ampere-1 processors", + FeaturePostRAScheduler, +@@ -1359,6 +1380,14 @@ def ProcessorFeatures { + list<SubtargetFeature> TSV110 = HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + FeatureNEON, FeaturePerfMon, FeatureSPE, + FeatureFullFP16, FeatureFP16FML, FeatureDotProd; ++#ifdef ENABLE_AARCH64_HIP09 ++ list<SubtargetFeature> HIP09 = HasV8_5aOps, FeatureBF16, FeatureCrypto, FeatureFPARMv8, ++ FeatureMatMulInt8, FeatureMatMulFP32, FeatureMatMulFP64, ++ FeatureNEON, FeaturePerfMon, FeatureRandGen, FeatureSPE, ++ FeatureFullFP16, FeatureFP16FML, FeatureDotProd, ++ FeatureJS, FeatureComplxNum, FeatureSHA3, FeatureSM4, ++ FeatureSVE; ++#endif + list<SubtargetFeature> Ampere1 = HasV8_6aOps, FeatureNEON, FeaturePerfMon, + FeatureSSBS, FeatureRandGen, FeatureSB, + FeatureSHA2, FeatureSHA3, FeatureAES; +@@ -1464,8 +1493,15 @@ def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, + // Marvell ThunderX3T110 Processors. + def : ProcessorModel<"thunderx3t110", ThunderX3T110Model, + ProcessorFeatures.ThunderX3T110, TuneThunderX3T110>; ++ ++// HiSilicon Processors. + def : ProcessorModel<"tsv110", TSV110Model, ProcessorFeatures.TSV110, + TuneTSV110>; ++#ifdef ENABLE_AARCH64_HIP09 ++// FIXME: HiSilicon HIP09 is currently modeled as a Cortex-A57. ++def : ProcessorModel<"hip09", CortexA57Model, ProcessorFeatures.HIP09, ++ TuneHIP09>; ++#endif + + // Support cyclone as an alias for apple-a7 so we can still LTO old bitcode. 
+ def : ProcessorModel<"cyclone", CycloneModel, ProcessorFeatures.AppleA7, +diff --git a/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp b/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp +index 05d60872b..4963ec350 100644 +--- a/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp ++++ b/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp +@@ -51,6 +51,12 @@ static bool isArithmeticBccPair(const MachineInstr *FirstMI, + case AArch64::SUBSXrr: + case AArch64::BICSWrr: + case AArch64::BICSXrr: ++#if defined(ENABLE_AARCH64_HIP09) ++ case AArch64::ADCSWr: ++ case AArch64::ADCSXr: ++ case AArch64::SBCSWr: ++ case AArch64::SBCSXr: ++#endif + return true; + case AArch64::ADDSWrs: + case AArch64::ADDSXrs: +@@ -183,6 +189,20 @@ static bool isLiteralsPair(const MachineInstr *FirstMI, + SecondMI.getOperand(3).getImm() == 16)) + return true; + ++#if defined(ENABLE_AARCH64_HIP09) ++ // 32 bit immediate. ++ if ((FirstMI == nullptr || FirstMI->getOpcode() == AArch64::MOVNWi) && ++ (SecondMI.getOpcode() == AArch64::MOVKWi && ++ SecondMI.getOperand(3).getImm() == 16)) ++ return true; ++ ++ // Lower half of 64 bit immediate. ++ if ((FirstMI == nullptr || FirstMI->getOpcode() == AArch64::MOVNXi) && ++ (SecondMI.getOpcode() == AArch64::MOVKWi && ++ SecondMI.getOperand(3).getImm() == 16)) ++ return true; ++#endif ++ + // Upper half of 64 bit immediate. + if ((FirstMI == nullptr || + (FirstMI->getOpcode() == AArch64::MOVKXi && +@@ -437,6 +457,37 @@ static bool isAddSub2RegAndConstOnePair(const MachineInstr *FirstMI, + return false; + } + ++#if defined(ENABLE_AARCH64_HIP09) ++static bool isMvnClzPair(const MachineInstr *FirstMI, ++ const MachineInstr &SecondMI) { ++ // HIP09 supports fusion of MVN + CLZ. ++ // The CLZ can be fused with MVN and make execution faster. ++ // And the fusion is not allowed for shifted forms. ++ // ++ // Instruction alias info: ++ // 1. MVN <Wd>, <Wm>{, <shift> #<amount>} is equivalent to ++ // ORN <Wd>, WZR, <Wm>{, <shift> #<amount>} ++ // 2. MVN <Xd>, <Xm>{, <shift> #<amount>} is equivalent to ++ // ORN <Xd>, XZR, <Xm>{, <shift> #<amount>} ++ // Assume the 1st instr to be a wildcard if it is unspecified. ++ if ((FirstMI == nullptr || ++ ((FirstMI->getOpcode() == AArch64::ORNWrs) && ++ (FirstMI->getOperand(1).getReg() == AArch64::WZR) && ++ (!AArch64InstrInfo::hasShiftedReg(*FirstMI)))) && ++ (SecondMI.getOpcode() == AArch64::CLZWr)) ++ return true; ++ ++ if ((FirstMI == nullptr || ++ ((FirstMI->getOpcode() == AArch64::ORNXrs) && ++ (FirstMI->getOperand(1).getReg() == AArch64::XZR) && ++ (!AArch64InstrInfo::hasShiftedReg(*FirstMI)))) && ++ (SecondMI.getOpcode() == AArch64::CLZXr)) ++ return true; ++ ++ return false; ++} ++#endif ++ + /// \brief Check if the instr pair, FirstMI and SecondMI, should be fused + /// together. Given SecondMI, when FirstMI is unspecified, then check if + /// SecondMI may be part of a fused pair at all. 
+@@ -472,6 +523,10 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, + if (ST.hasFuseAddSub2RegAndConstOne() && + isAddSub2RegAndConstOnePair(FirstMI, SecondMI)) + return true; ++#if defined(ENABLE_AARCH64_HIP09) ++ if (ST.hasFuseMvnClz() && isMvnClzPair(FirstMI, SecondMI)) ++ return true; ++#endif + + return false; + } +diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +index 450e27b8a..ddf22364c 100644 +--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp ++++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +@@ -266,6 +266,15 @@ void AArch64Subtarget::initializeProperties() { + PrefFunctionAlignment = Align(16); + PrefLoopAlignment = Align(4); + break; ++#if defined(ENABLE_AARCH64_HIP09) ++ case HIP09: ++ CacheLineSize = 64; ++ PrefFunctionAlignment = Align(16); ++ PrefLoopAlignment = Align(4); ++ VScaleForTuning = 2; ++ DefaultSVETFOpts = TailFoldingOpts::Simple; ++ break; ++#endif + case ThunderX3T110: + CacheLineSize = 64; + PrefFunctionAlignment = Align(16); +diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h +index 5e20d1646..5f481f4f9 100644 +--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h ++++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h +@@ -87,7 +87,10 @@ public: + ThunderXT83, + ThunderXT88, + ThunderX3T110, +- TSV110 ++ TSV110, ++#if defined(ENABLE_AARCH64_HIP09) ++ HIP09 ++#endif + }; + + protected: +@@ -239,7 +242,11 @@ public: + bool hasFusion() const { + return hasArithmeticBccFusion() || hasArithmeticCbzFusion() || + hasFuseAES() || hasFuseArithmeticLogic() || hasFuseCCSelect() || ++#if defined(ENABLE_AARCH64_HIP09) ++ hasFuseAdrpAdd() || hasFuseLiterals() || hasFuseMvnClz(); ++#else + hasFuseAdrpAdd() || hasFuseLiterals(); ++#endif + } + + unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } +diff --git a/llvm/lib/Target/CMakeLists.txt b/llvm/lib/Target/CMakeLists.txt +index 2739233f9..501ce1f2f 100644 +--- a/llvm/lib/Target/CMakeLists.txt ++++ b/llvm/lib/Target/CMakeLists.txt +@@ -2,6 +2,10 @@ list(APPEND LLVM_COMMON_DEPENDS intrinsics_gen) + + list(APPEND LLVM_TABLEGEN_FLAGS -I ${LLVM_MAIN_SRC_DIR}/lib/Target) + ++if(LLVM_ENABLE_AARCH64_HIP09) ++ list(APPEND LLVM_TABLEGEN_FLAGS "-DENABLE_AARCH64_HIP09") ++endif() ++ + add_llvm_component_library(LLVMTarget + Target.cpp + TargetIntrinsicInfo.cpp +diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp +index d11dc605e..8b23be02e 100644 +--- a/llvm/lib/TargetParser/Host.cpp ++++ b/llvm/lib/TargetParser/Host.cpp +@@ -257,6 +257,9 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { + // contents are specified in the various processor manuals. + return StringSwitch<const char *>(Part) + .Case("0xd01", "tsv110") ++#if defined(ENABLE_AARCH64_HIP09) ++ .Case("0xd02", "hip09") ++#endif + .Default("generic"); + + if (Implementer == "0x51") // Qualcomm Technologies, Inc. 
+diff --git a/llvm/test/CodeGen/AArch64/cpus-hip09.ll b/llvm/test/CodeGen/AArch64/cpus-hip09.ll +new file mode 100644 +index 000000000..dcf32e4dc +--- /dev/null ++++ b/llvm/test/CodeGen/AArch64/cpus-hip09.ll +@@ -0,0 +1,11 @@ ++; REQUIRES: enable_enable_aarch64_hip09 ++; This tests that llc accepts all valid AArch64 CPUs ++ ++; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=hip09 2>&1 | FileCheck %s ++ ++; CHECK-NOT: {{.*}} is not a recognized processor for this target ++; INVALID: {{.*}} is not a recognized processor for this target ++ ++define i32 @f(i64 %z) { ++ ret i32 0 ++} +diff --git a/llvm/test/CodeGen/AArch64/macro-fusion-mvnclz.mir b/llvm/test/CodeGen/AArch64/macro-fusion-mvnclz.mir +new file mode 100644 +index 000000000..64bf15937 +--- /dev/null ++++ b/llvm/test/CodeGen/AArch64/macro-fusion-mvnclz.mir +@@ -0,0 +1,20 @@ ++# REQUIRES: enable_enable_aarch64_hip09 ++# RUN: llc -o - %s -mtriple=aarch64-- -mattr=+fuse-mvn-clz -run-pass postmisched | FileCheck %s --check-prefixes=CHECK,FUSION ++# RUN: llc -o - %s -mtriple=aarch64-- -mattr=-fuse-mvn-clz -run-pass postmisched | FileCheck %s --check-prefixes=CHECK,NOFUSION ++--- ++# CHECK-LABEL: name: fuse-mvn-clz ++# CHECK: $w2 = ORNWrs $wzr, $w1, 0 ++# FUSION: $w0 = CLZWr killed renamable $w2 ++# CHECK: $w3 = ADDWri killed renamable $w1, 1, 0 ++# NOFUSION: $w0 = CLZWr killed renamable $w2 ++name: fuse-mvn-clz ++tracksRegLiveness: true ++body: | ++ bb.0: ++ liveins: $w0, $w1, $w2, $w3 ++ ++ $w2 = ORNWrs $wzr, $w1, 0 ++ $w3 = ADDWri killed renamable $w1, 1, 0 ++ $w0 = CLZWr killed renamable $w2 ++ RET undef $lr, implicit $w0 ++... +diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-lit-hip09.ll b/llvm/test/CodeGen/AArch64/misched-fusion-lit-hip09.ll +new file mode 100644 +index 000000000..d67fa5b43 +--- /dev/null ++++ b/llvm/test/CodeGen/AArch64/misched-fusion-lit-hip09.ll +@@ -0,0 +1,73 @@ ++; REQUIRES: enable_enable_aarch64_hip09 ++; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=hip09 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKFUSE-HIP09 ++ ++@g = common local_unnamed_addr global ptr null, align 8 ++ ++define dso_local ptr @litp(i32 %a, i32 %b) { ++entry: ++ %add = add nsw i32 %b, %a ++ %idx.ext = sext i32 %add to i64 ++ %add.ptr = getelementptr i8, ptr @litp, i64 %idx.ext ++ store ptr %add.ptr, ptr @g, align 8 ++ ret ptr %add.ptr ++ ++; CHECK-LABEL: litp: ++; CHECK: adrp R:x0-9+, litp ++; CHECKFUSE-NEXT: add {{x0-9+}}, R, :lo12:litp ++} ++ ++define dso_local ptr @litp_tune_generic(i32 %a, i32 %b) "tune-cpu"="generic" { ++entry: ++ %add = add nsw i32 %b, %a ++ %idx.ext = sext i32 %add to i64 ++ %add.ptr = getelementptr i8, ptr @litp_tune_generic, i64 %idx.ext ++ store ptr %add.ptr, ptr @g, align 8 ++ ret ptr %add.ptr ++ ++; CHECK-LABEL: litp_tune_generic: ++; CHECK: adrp R:x0-9+, litp_tune_generic ++; CHECK-NEXT: add {{x0-9+}}, R, :lo12:litp_tune_generic ++} ++ ++define dso_local i32 @liti(i32 %a, i32 %b) { ++entry: ++ %add = add i32 %a, -262095121 ++ %add1 = add i32 %add, %b ++ ret i32 %add1 ++ ++; CHECK-LABEL: liti: ++; CHECK: mov R:w0-9+, {{#0-9+}} ++; CHECKDONT-NEXT: add {{w0-9+}}, {{w0-9+}}, {{w0-9+}} ++; CHECKFUSE-NEXT: movk R, {{#0-9+}}, lsl #16 ++; CHECKFUSE-HIP09: movk R, {{#0-9+}}, lsl #16 ++} ++ ++; Function Attrs: norecurse nounwind readnone ++define dso_local i64 @litl(i64 %a, i64 %b) { ++entry: ++ %add = add i64 %a, 2208998440489107183 ++ %add1 = add i64 %add, %b ++ ret i64 %add1 ++ ++; CHECK-LABEL: litl: ++; CHECK: mov R:x0-9+, {{#0-9+}} ++; CHECKDONT-NEXT: add {{x0-9+}}, {{x0-9+}}, {{x0-9+}} ++; 
CHECK-NEXT: movk R, {{#0-9+}}, lsl #16 ++; CHECK: movk R, {{#0-9+}}, lsl #32 ++; CHECK-NEXT: movk R, {{#0-9+}}, lsl #48 ++} ++ ++; Function Attrs: norecurse nounwind readnone ++define dso_local double @litf() { ++entry: ++ ret double 0x400921FB54442D18 ++ ++; CHECK-LABEL: litf: ++; CHECK-DONT: adrp ADDR:x0-9+, CSTLABEL:.LCP.* ++; CHECK-DONT-NEXT: ldr {{d0-9+}}, {{}}ADDR, :lo12:CSTLABEL{{}} ++; CHECKFUSE-HIP09: mov R:x0-9+, #11544 ++; CHECKFUSE-HIP09: movk R, #21572, lsl #16 ++; CHECKFUSE-HIP09: movk R, #8699, lsl #32 ++; CHECKFUSE-HIP09: movk R, #16393, lsl #48 ++; CHECKFUSE-HIP09: fmov {{d0-9+}}, R ++} +diff --git a/llvm/test/CodeGen/AArch64/remat-hip09.ll b/llvm/test/CodeGen/AArch64/remat-hip09.ll +new file mode 100644 +index 000000000..aec0d18ae +--- /dev/null ++++ b/llvm/test/CodeGen/AArch64/remat-hip09.ll +@@ -0,0 +1,18 @@ ++; REQUIRES: enable_enable_aarch64_hip09 ++; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=hip09 -o - %s | FileCheck %s ++ ++%X = type { i64, i64, i64 } ++declare void @f(ptr) ++define void @t() { ++entry: ++ %tmp = alloca %X ++ call void @f(ptr %tmp) ++; CHECK: add x0, sp, #8 ++; CHECK-NOT: mov ++; CHECK-NEXT: bl f ++ call void @f(ptr %tmp) ++; CHECK: add x0, sp, #8 ++; CHECK-NOT: mov ++; CHECK-NEXT: bl f ++ ret void ++} +diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in +index 20c1ecca1..6145a514f 100644 +--- a/llvm/test/lit.site.cfg.py.in ++++ b/llvm/test/lit.site.cfg.py.in +@@ -64,9 +64,13 @@ config.have_llvm_driver = @LLVM_TOOL_LLVM_DRIVER_BUILD@ + config.use_classic_flang = @LLVM_ENABLE_CLASSIC_FLANG@ + config.enable_enable_autotuner = @LLVM_ENABLE_AUTOTUNER@ ++config.enable_enable_aarch64_hip09 = @LLVM_ENABLE_AARCH64_HIP09@ + + import lit.llvm + lit.llvm.initialize(lit_config, config) + ++if config.enable_enable_aarch64_hip09: ++ config.available_features.add("enable_enable_aarch64_hip09") ++ + # Let the main config do the real work. + lit_config.load_config( + config, os.path.join(config.llvm_src_root, "test/lit.cfg.py")) +diff --git a/llvm/unittests/TargetParser/Host.cpp b/llvm/unittests/TargetParser/Host.cpp +index 452d0326c..4b4c81514 100644 +--- a/llvm/unittests/TargetParser/Host.cpp ++++ b/llvm/unittests/TargetParser/Host.cpp +@@ -250,6 +250,11 @@ CPU part : 0x0a1 + EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x48\n" + "CPU part : 0xd01"), + "tsv110"); ++#if defined(ENABLE_AARCH64_HIP09) ++ EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x48\n" ++ "CPU part : 0xd02"), ++ "hip09"); ++#endif + + // Verify A64FX. 
+ const std::string A64FXProcCpuInfo = R"( +diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp +index 741d5a2d4..94e0047e5 100644 +--- a/llvm/unittests/TargetParser/TargetParserTest.cpp ++++ b/llvm/unittests/TargetParser/TargetParserTest.cpp +@@ -1421,6 +1421,18 @@ INSTANTIATE_TEST_SUITE_P( + AArch64::AEK_PROFILE | AArch64::AEK_FP16 | + AArch64::AEK_FP16FML | AArch64::AEK_DOTPROD, + "8.2-A"), ++#if defined(ENABLE_AARCH64_HIP09) ++ ARMCPUTestParams( ++ "hip09", "armv8.5-a", "crypto-neon-fp-armv8", ++ AArch64::AEK_CRC | AArch64::AEK_FP | AArch64::AEK_SIMD | ++ AArch64::AEK_RAS | AArch64::AEK_LSE | AArch64::AEK_RDM | ++ AArch64::AEK_RCPC | AArch64::AEK_DOTPROD | AArch64::AEK_AES | ++ AArch64::AEK_SM4 | AArch64::AEK_SHA2 | AArch64::AEK_SHA3 | ++ AArch64::AEK_FP16 | AArch64::AEK_PROFILE | ++ AArch64::AEK_FP16FML | AArch64::AEK_SVE | AArch64::AEK_I8MM | ++ AArch64::AEK_F32MM | AArch64::AEK_F64MM | AArch64::AEK_BF16, ++ "8.5-A"), ++#endif + ARMCPUTestParams("a64fx", "armv8.2-a", "crypto-neon-fp-armv8", + AArch64::AEK_CRC | AArch64::AEK_AES | + AArch64::AEK_SHA2 | AArch64::AEK_FP | +@@ -1437,7 +1449,11 @@ INSTANTIATE_TEST_SUITE_P( + "8.2-A"))); + + // Note: number of CPUs includes aliases. ++#if defined(ENABLE_AARCH64_HIP09) ++static constexpr unsigned NumAArch64CPUArchs = 63; ++#else + static constexpr unsigned NumAArch64CPUArchs = 62; ++#endif + + TEST(TargetParserTest, testAArch64CPUArchList) { + SmallVector<StringRef, NumAArch64CPUArchs> List; +-- +2.19.1 +
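One code pattern the new isMvnClzPair() hook targets is the MVN+CLZ sequence produced when counting leading one-bits. A small illustration in C (not from the patch):

    /* Illustration only: on AArch64, ~x lowers to MVN (an alias of ORN with
       WZR) and __builtin_clz to CLZ -- exactly the unshifted pair the HIP09
       fusion matches. The builtin is undefined when ~x == 0, i.e. x == ~0u. */
    unsigned count_leading_ones(unsigned x) {
      return (unsigned)__builtin_clz(~x);
    }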
_service:tar_scm:0023-Backport-AArch64-Stack-probing-for-function-prologues.patch
Deleted
@@ -1,2652 +0,0 @@
-From 3a9ddc2f95926a75a9b436ad4dfd4070f535a113 Mon Sep 17 00:00:00 2001
-From: rickyleung <leung.wing.chung@huawei.com>
-Date: Tue, 7 May 2024 21:25:52 +0800
-Subject: [PATCH 4/7] [backport][AArch64] Stack probing for function prologues
-
-Reference: https://github.com/llvm/llvm-project/commit/cc944f502f1ee20d73ff88c2c86cc909f12caadb
-
-This adds code to AArch64 function prologues to protect against stack
-clash attacks by probing (writing to) the stack at regular enough
-intervals to ensure that the guard page cannot be skipped over.
-
-The patch depends on and maintains the following invariants:
-
-* Upon function entry the caller guarantees that it has probed the stack
-  (e.g. performed a store) at some address [sp, #N], where `0 <= N <= 1024`.
-  This invariant comes from a requirement for compatibility with GCC.
-* Any address range in the allocated stack, no smaller than
-  stack-probe-size bytes, contains at least one probe.
-* At any time the stack pointer is above or in the guard page.
-* Probes are performed in decreasing address order.
-
-The stack-probe-size is a function attribute that can be set by a
-platform to correspond to the guard page size.
-
-By default, the stack probe size is 4KiB, which is a safe default as
-this is the smallest possible page size for AArch64. Linux uses a 64KiB
-guard for AArch64, so this can be overridden by the stack-probe-size
-function attribute.
-
-For small frames without a frame pointer (<= 240 bytes), no probes are
-needed.
-
-For larger frame sizes, LLVM always stores x29 to the stack. This serves
-as an implicit stack probe. Thus, while allocating stack objects the
-compiler assumes that the stack has been probed at [sp].
-
-There are multiple probing sequences that can be emitted, depending on
-the size of the stack allocation:
-
-* A straight-line sequence of subtracts and stores, used when the
-  allocation size is smaller than 5 guard pages.
-* A loop allocating and probing one page size per iteration, plus at
-  most a single probe to deal with the remainder, used when the
-  allocation size is larger but still known at compile time.
-* A loop which moves the SP down to the target value held in a register
-  (or a loop, moving a scratch register to the target value held in SP),
-  used when the allocation size is not known at compile time, such as
-  when allocating space for SVE values, or when over-aligning the stack.
-  This is emitted in AArch64InstrInfo because it will also be used for
-  dynamic allocas in a future patch.
-* A single probe where the amount of stack adjustment is unknown, but is
-  known to be less than or equal to a page size.
- ---------- - -Co-authored-by: Oliver Stannard <oliver.stannard@linaro.org> ---- - .../Target/AArch64/AArch64FrameLowering.cpp | 335 +++++++- - .../lib/Target/AArch64/AArch64FrameLowering.h | 17 +- - .../Target/AArch64/AArch64ISelLowering.cpp | 6 + - llvm/lib/Target/AArch64/AArch64ISelLowering.h | 10 + - llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 90 +++ - llvm/lib/Target/AArch64/AArch64InstrInfo.h | 6 + - llvm/lib/Target/AArch64/AArch64InstrInfo.td | 24 +- - .../AArch64/AArch64MachineFunctionInfo.cpp | 43 +- - .../AArch64/AArch64MachineFunctionInfo.h | 6 + - .../test/CodeGen/AArch64/stack-probing-64k.ll | 392 ++++++++++ - .../AArch64/stack-probing-last-in-block.mir | 146 ++++ - .../test/CodeGen/AArch64/stack-probing-sve.ll | 724 ++++++++++++++++++ - llvm/test/CodeGen/AArch64/stack-probing.ll | 539 +++++++++++++ - 13 files changed, 2300 insertions(+), 38 deletions(-) - create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-64k.ll - create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir - create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-sve.ll - create mode 100644 llvm/test/CodeGen/AArch64/stack-probing.ll - -diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp -index eeb6185fa36d..af019ab23770 100644 ---- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp -+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp -@@ -672,10 +672,18 @@ void AArch64FrameLowering::emitCalleeSavedSVERestores( - emitCalleeSavedRestores(MBB, MBBI, true); - } - -+// Return the maximum possible number of bytes for `Size` due to the -+// architectural limit on the size of a SVE register. -+static int64_t upperBound(StackOffset Size) { -+ static const int64_t MAX_BYTES_PER_SCALABLE_BYTE = 16; -+ return Size.getScalable() * MAX_BYTES_PER_SCALABLE_BYTE + Size.getFixed(); -+} -+ - void AArch64FrameLowering::allocateStackSpace( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, -- bool NeedsRealignment, StackOffset AllocSize, bool NeedsWinCFI, -- bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset) const { -+ int64_t RealignmentPadding, StackOffset AllocSize, bool NeedsWinCFI, -+ bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset, -+ bool FollowupAllocs) const { - - if (!AllocSize) - return; -@@ -687,27 +695,129 @@ void AArch64FrameLowering::allocateStackSpace( - AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); - -- Register TargetReg = -- NeedsRealignment ? findScratchNonCalleeSaveRegister(&MBB) : AArch64::SP; -- // SUB Xd/SP, SP, AllocSize -+ const int64_t MaxAlign = MFI.getMaxAlign().value(); -+ const uint64_t AndMask = ~(MaxAlign - 1); -+ -+ if (!Subtarget.getTargetLowering()->hasInlineStackProbe(MF)) { -+ Register TargetReg = RealignmentPadding -+ ? findScratchNonCalleeSaveRegister(&MBB) -+ : AArch64::SP; -+ // SUB Xd/SP, SP, AllocSize -+ emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII, -+ MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI, -+ EmitCFI, InitialOffset); -+ -+ if (RealignmentPadding) { -+ // AND SP, X9, 0b11111...0000 -+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP) -+ .addReg(TargetReg, RegState::Kill) -+ .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) -+ .setMIFlags(MachineInstr::FrameSetup); -+ AFI.setStackRealigned(true); -+ -+ // No need for SEH instructions here; if we're realigning the stack, -+ // we've set a frame pointer and already finished the SEH prologue. 
-+ assert(!NeedsWinCFI); -+ } -+ return; -+ } -+ -+ // -+ // Stack probing allocation. -+ // -+ -+ // Fixed length allocation. If we don't need to re-align the stack and don't -+ // have SVE objects, we can use a more efficient sequence for stack probing. -+ if (AllocSize.getScalable() == 0 && RealignmentPadding == 0) { -+ Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB); -+ assert(ScratchReg != AArch64::NoRegister); -+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC)) -+ .addDef(ScratchReg) -+ .addImm(AllocSize.getFixed()) -+ .addImm(InitialOffset.getFixed()) -+ .addImm(InitialOffset.getScalable()); -+ // The fixed allocation may leave unprobed bytes at the top of the -+ // stack. If we have subsequent alocation (e.g. if we have variable-sized -+ // objects), we need to issue an extra probe, so these allocations start in -+ // a known state. -+ if (FollowupAllocs) { -+ // STR XZR, SP -+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui)) -+ .addReg(AArch64::XZR) -+ .addReg(AArch64::SP) -+ .addImm(0) -+ .setMIFlags(MachineInstr::FrameSetup); -+ } -+ -+ return; -+ } -+ -+ // Variable length allocation. -+ -+ // If the (unknown) allocation size cannot exceed the probe size, decrement -+ // the stack pointer right away. -+ int64_t ProbeSize = AFI.getStackProbeSize(); -+ if (upperBound(AllocSize) + RealignmentPadding <= ProbeSize) { -+ Register ScratchReg = RealignmentPadding -+ ? findScratchNonCalleeSaveRegister(&MBB) -+ : AArch64::SP; -+ assert(ScratchReg != AArch64::NoRegister); -+ // SUB Xd, SP, AllocSize -+ emitFrameOffset(MBB, MBBI, DL, ScratchReg, AArch64::SP, -AllocSize, &TII, -+ MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI, -+ EmitCFI, InitialOffset); -+ if (RealignmentPadding) { -+ // AND SP, Xn, 0b11111...0000 -+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP) -+ .addReg(ScratchReg, RegState::Kill) -+ .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) -+ .setMIFlags(MachineInstr::FrameSetup); -+ AFI.setStackRealigned(true); -+ } -+ if (FollowupAllocs || upperBound(AllocSize) + RealignmentPadding > -+ AArch64::StackProbeMaxUnprobedStack) { -+ // STR XZR, SP -+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui)) -+ .addReg(AArch64::XZR) -+ .addReg(AArch64::SP) -+ .addImm(0) -+ .setMIFlags(MachineInstr::FrameSetup); -+ } -+ return; -+ } -+ -+ // Emit a variable-length allocation probing loop. -+ // TODO: As an optimisation, the loop can be "unrolled" into a few parts, -+ // each of them guaranteed to adjust the stack by less than the probe size. -+ Register TargetReg = findScratchNonCalleeSaveRegister(&MBB); -+ assert(TargetReg != AArch64::NoRegister); -+ // SUB Xd, SP, AllocSize - emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI, - EmitCFI, InitialOffset); - -- if (NeedsRealignment) { -- const int64_t MaxAlign = MFI.getMaxAlign().value(); -- const uint64_t AndMask = ~(MaxAlign - 1); -- // AND SP, Xd, 0b11111...0000 -- BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP) -+ if (RealignmentPadding) { -+ // AND Xn, Xn, 0b11111...0000 -+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), TargetReg) - .addReg(TargetReg, RegState::Kill) - .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) - .setMIFlags(MachineInstr::FrameSetup); -- AFI.setStackRealigned(true); -+ } - -- // No need for SEH instructions here; if we're realigning the stack, -- // we've set a frame pointer and already finished the SEH prologue. 
-- assert(!NeedsWinCFI);
-+ BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC_VAR))
-+ .addReg(TargetReg);
-+ if (EmitCFI) {
-+ // Set the CFA register back to SP.
-+ unsigned Reg =
-+ Subtarget.getRegisterInfo()->getDwarfRegNum(AArch64::SP, true);
-+ unsigned CFIIndex =
-+ MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
-+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
-+ .addCFIIndex(CFIIndex)
-+ .setMIFlags(MachineInstr::FrameSetup);
- }
-+ if (RealignmentPadding)
-+ AFI.setStackRealigned(true);
- }
-
- static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) {
-@@ -893,9 +1003,11 @@ bool AArch64FrameLowering::canUseAsPrologue(
- MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
- const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
- const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
-+ const AArch64TargetLowering *TLI = Subtarget.getTargetLowering();
-
-- // Don't need a scratch register if we're not going to re-align the stack.
-- if (!RegInfo->hasStackRealignment(*MF))
-+ // Don't need a scratch register if we're not going to re-align the stack or
-+ // emit stack probes.
-+ if (!RegInfo->hasStackRealignment(*MF) && !TLI->hasInlineStackProbe(*MF))
- return true;
- // Otherwise, we can use any block as long as it has a scratch register
- // available.
-@@ -905,15 +1017,11 @@
- static bool windowsRequiresStackProbe(MachineFunction &MF,
- uint64_t StackSizeInBytes) {
- const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
-- if (!Subtarget.isTargetWindows())
-- return false;
-- const Function &F = MF.getFunction();
-+ const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
- // TODO: When implementing stack protectors, take that into account
- // for the probe threshold.
-- unsigned StackProbeSize =
-- F.getFnAttributeAsParsedInteger("stack-probe-size", 4096);
-- return (StackSizeInBytes >= StackProbeSize) &&
-- !F.hasFnAttribute("no-stack-arg-probe");
-+ return Subtarget.isTargetWindows() && MFI.hasStackProbing() &&
-+ StackSizeInBytes >= uint64_t(MFI.getStackProbeSize());
- }
-
- static bool needsWinCFI(const MachineFunction &MF) {
-@@ -1678,7 +1786,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
- // Alignment is required for the parent frame, not the funclet
- const bool NeedsRealignment =
- NumBytes && !IsFunclet && RegInfo->hasStackRealignment(MF);
-- int64_t RealignmentPadding =
-+ const int64_t RealignmentPadding =
- (NeedsRealignment && MFI.getMaxAlign() > Align(16))
- ? MFI.getMaxAlign().value() - 16
- : 0;
-@@ -1814,6 +1922,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
- // Process the SVE callee-saves to determine what space needs to be
- // allocated.
- if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
-+ LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize
-+ << "\n");
- // Find callee save instructions in frame.
- CalleeSavesBegin = MBBI;
- assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
-@@ -1828,8 +1938,10 @@
- // Allocate space for the callee saves (if any). 
- StackOffset CFAOffset = - StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); -- allocateStackSpace(MBB, CalleeSavesBegin, false, SVECalleeSavesSize, false, -- nullptr, EmitAsyncCFI && !HasFP, CFAOffset); -+ StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes); -+ allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize, false, -+ nullptr, EmitAsyncCFI && !HasFP, CFAOffset, -+ MFI.hasVarSizedObjects() || LocalsSize); - CFAOffset += SVECalleeSavesSize; - - if (EmitAsyncCFI) -@@ -1843,10 +1955,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, - // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have - // the correct value here, as NumBytes also includes padding bytes, - // which shouldn't be counted here. -- allocateStackSpace(MBB, CalleeSavesEnd, NeedsRealignment, -+ allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding, - SVELocalsSize + StackOffset::getFixed(NumBytes), - NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP, -- CFAOffset); -+ CFAOffset, MFI.hasVarSizedObjects()); - } - - // If we need a base pointer, set it up here. It's whatever the value of the -@@ -4028,3 +4140,170 @@ void AArch64FrameLowering::orderFrameObjects( - dbgs() << "\n"; - }); - } -+ -+/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at -+/// least every ProbeSize bytes. Returns an iterator of the first instruction -+/// after the loop. The difference between SP and TargetReg must be an exact -+/// multiple of ProbeSize. -+MachineBasicBlock::iterator -+AArch64FrameLowering::inlineStackProbeLoopExactMultiple( -+ MachineBasicBlock::iterator MBBI, int64_t ProbeSize, -+ Register TargetReg) const { -+ MachineBasicBlock &MBB = *MBBI->getParent(); -+ MachineFunction &MF = *MBB.getParent(); -+ const AArch64InstrInfo *TII = -+ MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); -+ DebugLoc DL = MBB.findDebugLoc(MBBI); -+ -+ MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); -+ MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); -+ MF.insert(MBBInsertPoint, LoopMBB); -+ MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); -+ MF.insert(MBBInsertPoint, ExitMBB); -+ -+ // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not encodable -+ // in SUB). -+ emitFrameOffset(*LoopMBB, LoopMBB->end(), DL, AArch64::SP, AArch64::SP, -+ StackOffset::getFixed(-ProbeSize), TII, -+ MachineInstr::FrameSetup); -+ // STR XZR, SP -+ BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::STRXui)) -+ .addReg(AArch64::XZR) -+ .addReg(AArch64::SP) -+ .addImm(0) -+ .setMIFlags(MachineInstr::FrameSetup); -+ // CMP SP, TargetReg -+ BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::SUBSXrx64), -+ AArch64::XZR) -+ .addReg(AArch64::SP) -+ .addReg(TargetReg) -+ .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) -+ .setMIFlags(MachineInstr::FrameSetup); -+ // B.CC Loop -+ BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::Bcc)) -+ .addImm(AArch64CC::NE) -+ .addMBB(LoopMBB) -+ .setMIFlags(MachineInstr::FrameSetup); -+ -+ LoopMBB->addSuccessor(ExitMBB); -+ LoopMBB->addSuccessor(LoopMBB); -+ // Synthesize the exit MBB. -+ ExitMBB->splice(ExitMBB->end(), &MBB, MBBI, MBB.end()); -+ ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); -+ MBB.addSuccessor(LoopMBB); -+ // Update liveins. 
-+ recomputeLiveIns(*LoopMBB);
-+ recomputeLiveIns(*ExitMBB);
-+
-+ return ExitMBB->begin();
-+}
-+
-+void AArch64FrameLowering::inlineStackProbeFixed(
-+ MachineBasicBlock::iterator MBBI, Register ScratchReg, int64_t FrameSize,
-+ StackOffset CFAOffset) const {
-+ MachineBasicBlock *MBB = MBBI->getParent();
-+ MachineFunction &MF = *MBB->getParent();
-+ const AArch64InstrInfo *TII =
-+ MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
-+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-+ bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
-+ bool HasFP = hasFP(MF);
-+
-+ DebugLoc DL;
-+ int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
-+ int64_t NumBlocks = FrameSize / ProbeSize;
-+ int64_t ResidualSize = FrameSize % ProbeSize;
-+
-+ LLVM_DEBUG(dbgs() << "Stack probing: total " << FrameSize << " bytes, "
-+ << NumBlocks << " blocks of " << ProbeSize
-+ << " bytes, plus " << ResidualSize << " bytes\n");
-+
-+ // Decrement SP by NumBlocks * ProbeSize bytes, with either an unrolled or
-+ // an ordinary loop.
-+ if (NumBlocks <= AArch64::StackProbeMaxLoopUnroll) {
-+ for (int i = 0; i < NumBlocks; ++i) {
-+ // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not
-+ // encodable in a SUB).
-+ emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP,
-+ StackOffset::getFixed(-ProbeSize), TII,
-+ MachineInstr::FrameSetup, false, false, nullptr,
-+ EmitAsyncCFI && !HasFP, CFAOffset);
-+ CFAOffset += StackOffset::getFixed(ProbeSize);
-+ // STR XZR, [SP]
-+ BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui))
-+ .addReg(AArch64::XZR)
-+ .addReg(AArch64::SP)
-+ .addImm(0)
-+ .setMIFlags(MachineInstr::FrameSetup);
-+ }
-+ } else if (NumBlocks != 0) {
-+ // SUB ScratchReg, SP, #FrameSize (or equivalent if FrameSize is not
-+ // encodable in ADD). ScratchReg may temporarily become the CFA register.
-+ emitFrameOffset(*MBB, MBBI, DL, ScratchReg, AArch64::SP,
-+ StackOffset::getFixed(-ProbeSize * NumBlocks), TII,
-+ MachineInstr::FrameSetup, false, false, nullptr,
-+ EmitAsyncCFI && !HasFP, CFAOffset);
-+ CFAOffset += StackOffset::getFixed(ProbeSize * NumBlocks);
-+ MBBI = inlineStackProbeLoopExactMultiple(MBBI, ProbeSize, ScratchReg);
-+ MBB = MBBI->getParent();
-+ if (EmitAsyncCFI && !HasFP) {
-+ // Set the CFA register back to SP.
-+ const AArch64RegisterInfo &RegInfo =
-+ *MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
-+ unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
-+ unsigned CFIIndex =
-+ MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
-+ BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-+ .addCFIIndex(CFIIndex)
-+ .setMIFlags(MachineInstr::FrameSetup);
-+ }
-+ }
-+
-+ if (ResidualSize != 0) {
-+ // SUB SP, SP, #ResidualSize (or equivalent if ResidualSize is not encodable
-+ // in SUB).
-+ emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP,
-+ StackOffset::getFixed(-ResidualSize), TII,
-+ MachineInstr::FrameSetup, false, false, nullptr,
-+ EmitAsyncCFI && !HasFP, CFAOffset);
-+ if (ResidualSize > AArch64::StackProbeMaxUnprobedStack) {
-+ // STR XZR, [SP]
-+ BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui))
-+ .addReg(AArch64::XZR)
-+ .addReg(AArch64::SP)
-+ .addImm(0)
-+ .setMIFlags(MachineInstr::FrameSetup);
-+ }
-+ }
-+}
-+
-+void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF,
-+ MachineBasicBlock &MBB) const {
-+ // Get the instructions that need to be replaced. We emit at most two of
-+ // these. 
Remember them in order to avoid complications coming from the need -+ // to traverse the block while potentially creating more blocks. -+ SmallVector<MachineInstr *, 4> ToReplace; -+ for (MachineInstr &MI : MBB) -+ if (MI.getOpcode() == AArch64::PROBED_STACKALLOC || -+ MI.getOpcode() == AArch64::PROBED_STACKALLOC_VAR) -+ ToReplace.push_back(&MI); -+ -+ for (MachineInstr *MI : ToReplace) { -+ if (MI->getOpcode() == AArch64::PROBED_STACKALLOC) { -+ Register ScratchReg = MI->getOperand(0).getReg(); -+ int64_t FrameSize = MI->getOperand(1).getImm(); -+ StackOffset CFAOffset = StackOffset::get(MI->getOperand(2).getImm(), -+ MI->getOperand(3).getImm()); -+ inlineStackProbeFixed(MI->getIterator(), ScratchReg, FrameSize, -+ CFAOffset); -+ } else { -+ assert(MI->getOpcode() == AArch64::PROBED_STACKALLOC_VAR && -+ "Stack probe pseudo-instruction expected"); -+ const AArch64InstrInfo *TII = -+ MI->getMF()->getSubtarget<AArch64Subtarget>().getInstrInfo(); -+ Register TargetReg = MI->getOperand(0).getReg(); -+ (void)TII->probedStackAlloc(MI->getIterator(), TargetReg, true); -+ } -+ MI->eraseFromParent(); -+ } -+} -\ No newline at end of file -diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h -index f3313f3b53ff..941af03a78b7 100644 ---- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h -+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h -@@ -152,13 +152,26 @@ private: - MachineBasicBlock::iterator MBBI) const; - void allocateStackSpace(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, -- bool NeedsRealignment, StackOffset AllocSize, -+ int64_t RealignmentPadding, StackOffset AllocSize, - bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI, -- StackOffset InitialOffset) const; -+ StackOffset InitialOffset, bool FollowupAllocs) const; - - /// Emit target zero call-used regs. - void emitZeroCallUsedRegs(BitVector RegsToZero, - MachineBasicBlock &MBB) const override; -+ -+ /// Replace a StackProbe stub (if any) with the actual probe code inline -+ void inlineStackProbe(MachineFunction &MF, -+ MachineBasicBlock &PrologueMBB) const override; -+ -+ void inlineStackProbeFixed(MachineBasicBlock::iterator MBBI, -+ Register ScratchReg, int64_t FrameSize, -+ StackOffset CFAOffset) const; -+ -+ MachineBasicBlock::iterator -+ inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI, -+ int64_t NegProbeSize, -+ Register TargetReg) const; - }; - - } // End llvm namespace -diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp -index 6e721b937846..082043420fb9 100644 ---- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp -+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp -@@ -26051,3 +26051,9 @@ bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const { - } - return true; - } -+ -+bool AArch64TargetLowering::hasInlineStackProbe( -+ const MachineFunction &MF) const { -+ return !Subtarget->isTargetWindows() && -+ MF.getInfo<AArch64FunctionInfo>()->hasStackProbing(); -+} -\ No newline at end of file -diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h -index aca45f113e73..643d363e234a 100644 ---- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h -+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h -@@ -508,6 +508,13 @@ const unsigned RoundingBitsPos = 22; - const ArrayRef<MCPhysReg> getGPRArgRegs(); - const ArrayRef<MCPhysReg> getFPRArgRegs(); - -+/// Maximum allowed number of unprobed bytes above SP at an ABI -+/// boundary. 
-+const unsigned StackProbeMaxUnprobedStack = 1024;
-+
-+/// Maximum number of iterations to unroll for a constant size probing loop.
-+const unsigned StackProbeMaxLoopUnroll = 4;
-+
- } // namespace AArch64
-
- class AArch64Subtarget;
-@@ -942,6 +949,9 @@ public:
- // used for 64bit and 128bit vectors as well.
- bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON = false) const;
-
-+ /// True if stack clash protection is enabled for this function.
-+ bool hasInlineStackProbe(const MachineFunction &MF) const override;
-+
- private:
- /// Keep a pointer to the AArch64Subtarget around so that we can
- /// make the right decision when generating code for different targets.
-diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
-index 0691e07a639b..b3b42a97e8c9 100644
---- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
-+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
-@@ -11,6 +11,7 @@
- //===----------------------------------------------------------------------===//
-
- #include "AArch64InstrInfo.h"
-+#include "AArch64ExpandImm.h"
- #include "AArch64MachineFunctionInfo.h"
- #include "AArch64Subtarget.h"
- #include "MCTargetDesc/AArch64AddressingModes.h"
-@@ -18,6 +19,7 @@
- #include "llvm/ADT/ArrayRef.h"
- #include "llvm/ADT/STLExtras.h"
- #include "llvm/ADT/SmallVector.h"
-+#include "llvm/CodeGen/LivePhysRegs.h"
- #include "llvm/CodeGen/MachineBasicBlock.h"
- #include "llvm/CodeGen/MachineCombinerPattern.h"
- #include "llvm/CodeGen/MachineFrameInfo.h"
-@@ -8428,6 +8430,94 @@ unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
- return AArch64::BLR;
- }
-
-+MachineBasicBlock::iterator
-+AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
-+ Register TargetReg, bool FrameSetup) const {
-+ assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
-+
-+ MachineBasicBlock &MBB = *MBBI->getParent();
-+ MachineFunction &MF = *MBB.getParent();
-+ const AArch64InstrInfo *TII =
-+ MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
-+ int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
-+ DebugLoc DL = MBB.findDebugLoc(MBBI);
-+
-+ MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
-+ MachineBasicBlock *LoopTestMBB =
-+ MF.CreateMachineBasicBlock(MBB.getBasicBlock());
-+ MF.insert(MBBInsertPoint, LoopTestMBB);
-+ MachineBasicBlock *LoopBodyMBB =
-+ MF.CreateMachineBasicBlock(MBB.getBasicBlock());
-+ MF.insert(MBBInsertPoint, LoopBodyMBB);
-+ MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
-+ MF.insert(MBBInsertPoint, ExitMBB);
-+ MachineInstr::MIFlag Flags =
-+ FrameSetup ? 
MachineInstr::FrameSetup : MachineInstr::NoFlags;
-+
-+ // LoopTest:
-+ // SUB SP, SP, #ProbeSize
-+ emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
-+ AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
-+
-+ // CMP SP, TargetReg
-+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
-+ AArch64::XZR)
-+ .addReg(AArch64::SP)
-+ .addReg(TargetReg)
-+ .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
-+ .setMIFlags(Flags);
-+
-+ // B.<Cond> LoopExit
-+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
-+ .addImm(AArch64CC::LE)
-+ .addMBB(ExitMBB)
-+ .setMIFlags(Flags);
-+
-+ // STR XZR, [SP]
-+ BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
-+ .addReg(AArch64::XZR)
-+ .addReg(AArch64::SP)
-+ .addImm(0)
-+ .setMIFlags(Flags);
-+
-+ // B loop
-+ BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
-+ .addMBB(LoopTestMBB)
-+ .setMIFlags(Flags);
-+
-+ // LoopExit:
-+ // MOV SP, TargetReg
-+ BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
-+ .addReg(TargetReg)
-+ .addImm(0)
-+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
-+ .setMIFlags(Flags);
-+
-+ // STR XZR, [SP]
-+ BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::STRXui))
-+ .addReg(AArch64::XZR)
-+ .addReg(AArch64::SP)
-+ .addImm(0)
-+ .setMIFlags(Flags);
-+
-+ ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
-+ ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
-+
-+ LoopTestMBB->addSuccessor(ExitMBB);
-+ LoopTestMBB->addSuccessor(LoopBodyMBB);
-+ LoopBodyMBB->addSuccessor(LoopTestMBB);
-+ MBB.addSuccessor(LoopTestMBB);
-+
-+ // Update liveins.
-+ if (MF.getRegInfo().reservedRegsFrozen()) {
-+ recomputeLiveIns(*LoopTestMBB);
-+ recomputeLiveIns(*LoopBodyMBB);
-+ recomputeLiveIns(*ExitMBB);
-+ }
-+
-+ return ExitMBB->begin();
-+}
-+
- #define GET_INSTRINFO_HELPERS
- #define GET_INSTRMAP_INFO
- #include "AArch64GenInstrInfo.inc"
-diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
-index 20210a96d67a..7e84b86fc52c 100644
---- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
-+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
-@@ -340,6 +340,12 @@ public:
- static void decomposeStackOffsetForDwarfOffsets(const StackOffset &Offset,
- int64_t &ByteSized,
- int64_t &VGSized);
-+ // Decrement the SP, issuing probes along the way. `TargetReg` is the new top
-+ // of the stack. `FrameSetup` is passed as true if the allocation is part
-+ // of constructing the activation frame of a function.
-+ MachineBasicBlock::iterator probedStackAlloc(MachineBasicBlock::iterator MBBI,
-+ Register TargetReg,
-+ bool FrameSetup) const;
- #define GET_INSTRINFO_HELPER_DECLS
- #include "AArch64GenInstrInfo.inc"
-
-diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
-index 9e72d37880c5..09980c2f45e6 100644
---- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
-+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
-@@ -880,7 +880,8 @@ include "SMEInstrFormats.td"
- // Miscellaneous instructions.
- //===----------------------------------------------------------------------===//
-
--let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
-+let hasSideEffects = 1, isCodeGenOnly = 1 in {
-+let Defs = [SP], Uses = [SP] in {
- // We set Sched to empty list because we expect these instructions to simply get
- // removed in most cases. 
- def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
-@@ -889,7 +890,26 @@ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
- def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
- (AArch64callseq_end timm:$amt1, timm:$amt2)>,
- Sched<[]>;
--} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1
-+}
-+
-+let Defs = [SP, NZCV], Uses = [SP] in {
-+// Probed stack allocation of a constant size, used in function prologues when
-+// stack-clash protection is enabled.
-+def PROBED_STACKALLOC : Pseudo<(outs GPR64:$scratch),
-+ (ins i64imm:$stacksize, i64imm:$fixed_offset,
-+ i64imm:$scalable_offset),
-+ []>,
-+ Sched<[]>;
-+
-+// Probed stack allocation of a variable size, used in function prologues when
-+// stack-clash protection is enabled.
-+def PROBED_STACKALLOC_VAR : Pseudo<(outs),
-+ (ins GPR64sp:$target),
-+ []>,
-+ Sched<[]>;
-+
-+} // Defs = [SP, NZCV], Uses = [SP] in
-+} // hasSideEffects = 1, isCodeGenOnly = 1
-
- let isReMaterializable = 1, isCodeGenOnly = 1 in {
- // FIXME: The following pseudo instructions are only needed because remat
-diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
-index 961a19317d66..0bef3c2d2483 100644
---- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
-+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
-@@ -97,14 +97,45 @@ AArch64FunctionInfo::AArch64FunctionInfo(const Function &F,
- if (const auto *BTE = mdconst::extract_or_null<ConstantInt>(
- F.getParent()->getModuleFlag("branch-target-enforcement")))
- BranchTargetEnforcement = BTE->getZExtValue();
-- return;
-+ } else {
-+ const StringRef BTIEnable =
-+ F.getFnAttribute("branch-target-enforcement").getValueAsString();
-+ assert(BTIEnable.equals_insensitive("true") ||
-+ BTIEnable.equals_insensitive("false"));
-+ BranchTargetEnforcement = BTIEnable.equals_insensitive("true");
- }
-
-- const StringRef BTIEnable =
-- F.getFnAttribute("branch-target-enforcement").getValueAsString();
-- assert(BTIEnable.equals_insensitive("true") ||
-- BTIEnable.equals_insensitive("false"));
-- BranchTargetEnforcement = BTIEnable.equals_insensitive("true");
-+ // The default stack probe size is 4096 if the function has no
-+ // stack-probe-size attribute. This is a safe default because it is the
-+ // smallest possible guard page size.
-+ uint64_t ProbeSize = 4096;
-+ if (F.hasFnAttribute("stack-probe-size"))
-+ ProbeSize = F.getFnAttributeAsParsedInteger("stack-probe-size");
-+ else if (const auto *PS = mdconst::extract_or_null<ConstantInt>(
-+ F.getParent()->getModuleFlag("stack-probe-size")))
-+ ProbeSize = PS->getZExtValue();
-+ assert(int64_t(ProbeSize) > 0 && "Invalid stack probe size");
-+
-+ if (STI->isTargetWindows()) {
-+ if (!F.hasFnAttribute("no-stack-arg-probe"))
-+ StackProbeSize = ProbeSize;
-+ } else {
-+ // Round down to the stack alignment. 
-+ uint64_t StackAlign = -+ STI->getFrameLowering()->getTransientStackAlign().value(); -+ ProbeSize = std::max(StackAlign, ProbeSize & ~(StackAlign - 1U)); -+ StringRef ProbeKind; -+ if (F.hasFnAttribute("probe-stack")) -+ ProbeKind = F.getFnAttribute("probe-stack").getValueAsString(); -+ else if (const auto *PS = dyn_cast_or_null<MDString>( -+ F.getParent()->getModuleFlag("probe-stack"))) -+ ProbeKind = PS->getString(); -+ if (ProbeKind.size()) { -+ if (ProbeKind != "inline-asm") -+ report_fatal_error("Unsupported stack probing method"); -+ StackProbeSize = ProbeSize; -+ } -+ } - } - - MachineFunctionInfo *AArch64FunctionInfo::clone( -diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h -index d82fb436925e..d50011594eb1 100644 ---- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h -+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h -@@ -192,6 +192,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { - /// True if the function need asynchronous unwind information. - mutable std::optional<bool> NeedsAsyncDwarfUnwindInfo; - -+ int64_t StackProbeSize = 0; -+ - public: - AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI); - -@@ -447,6 +449,10 @@ public: - bool needsDwarfUnwindInfo(const MachineFunction &MF) const; - bool needsAsyncDwarfUnwindInfo(const MachineFunction &MF) const; - -+ bool hasStackProbing() const { return StackProbeSize != 0; } -+ -+ int64_t getStackProbeSize() const { return StackProbeSize; } -+ - private: - // Hold the lists of LOHs. - MILOHContainer LOHContainerSet; -diff --git a/llvm/test/CodeGen/AArch64/stack-probing-64k.ll b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll -new file mode 100644 -index 000000000000..0a3198fc520e ---- /dev/null -+++ b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll -@@ -0,0 +1,392 @@ -+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s -+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s -+ -+; Tests for prolog sequences for stack probing, when using a 64KiB stack guard. -+ -+; 64k bytes is the largest frame we can probe in one go. -+define void @static_65536(ptr %out) #0 { -+; CHECK-LABEL: static_65536: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 65552 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 65536, align 1 -+ store i8* %v, ptr %out, align 8 -+ ret void -+} -+ -+; 64k+16 bytes, still needs just one probe. -+define void @static_65552(ptr %out) #0 { -+; CHECK-LABEL: static_65552: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! 
// 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 65552 -+; CHECK-NEXT: str xzr, sp, #-16 -+; CHECK-NEXT: .cfi_def_cfa_offset 65568 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 32 -+; CHECK-NEXT: add sp, sp, #16 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 65552, align 1 -+ store i8* %v, ptr %out, align 8 -+ ret void -+} -+ -+; 64k+1024 bytes, the largest frame which needs just one probe. -+define void @static_66560(ptr %out) #0 { -+; CHECK-LABEL: static_66560: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 65552 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: sub sp, sp, #1024 -+; CHECK-NEXT: .cfi_def_cfa_offset 66576 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 1040 -+; CHECK-NEXT: add sp, sp, #1024 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 66560, align 1 -+ store i8* %v, ptr %out, align 8 -+ ret void -+} -+ -+; 64k+1024+16 bytes, the smallest frame which needs two probes. -+define void @static_66576(ptr %out) #0 { -+; CHECK-LABEL: static_66576: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 65552 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: sub sp, sp, #1040 -+; CHECK-NEXT: .cfi_def_cfa_offset 66592 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 1056 -+; CHECK-NEXT: add sp, sp, #1040 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 66576, align 1 -+ store i8* %v, ptr %out, align 8 -+ ret void -+} -+ -+; 2*64k+1024, the largest frame needing two probes. -+define void @static_132096(ptr %out) #0 { -+; CHECK-LABEL: static_132096: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! 
// 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 65552 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 131088 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: sub sp, sp, #1024 -+; CHECK-NEXT: .cfi_def_cfa_offset 132112 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: add sp, sp, #32, lsl #12 // =131072 -+; CHECK-NEXT: .cfi_def_cfa_offset 1040 -+; CHECK-NEXT: add sp, sp, #1024 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 132096, align 1 -+ store i8* %v, ptr %out, align 8 -+ ret void -+} -+ -+; 5*64k-16, the largest frame probed without a loop. -+define void @static_327664(ptr %out) #0 { -+; CHECK-LABEL: static_327664: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 65552 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 131088 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 196624 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: .cfi_def_cfa_offset 262160 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: sub sp, sp, #15, lsl #12 // =61440 -+; CHECK-NEXT: .cfi_def_cfa_offset 323600 -+; CHECK-NEXT: sub sp, sp, #4080 -+; CHECK-NEXT: .cfi_def_cfa_offset 327680 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: add sp, sp, #79, lsl #12 // =323584 -+; CHECK-NEXT: .cfi_def_cfa_offset 4096 -+; CHECK-NEXT: add sp, sp, #4080 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 327664, align 1 -+ store i8* %v, ptr %out, align 8 -+ ret void -+} -+ -+; 5*64k, smallest frame probed with a loop. -+define void @static_327680(ptr %out) #0 { -+; CHECK-LABEL: static_327680: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680 -+; CHECK-NEXT: .cfi_def_cfa w9, 327696 -+; CHECK-NEXT: .LBB6_1: // %entry -+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: cmp sp, x9 -+; CHECK-NEXT: b.ne .LBB6_1 -+; CHECK-NEXT: // %bb.2: // %entry -+; CHECK-NEXT: .cfi_def_cfa_register wsp -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 327680, align 1 -+ store i8* %v, ptr %out, align 8 -+ ret void -+} -+ -+; 5*64k+1024, large enough to use a loop, but not a multiple of 64KiB -+; so has a reminder, but no extra probe. 
-+define void @static_328704(ptr %out) #0 { -+; CHECK-LABEL: static_328704: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680 -+; CHECK-NEXT: .cfi_def_cfa w9, 327696 -+; CHECK-NEXT: .LBB7_1: // %entry -+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: cmp sp, x9 -+; CHECK-NEXT: b.ne .LBB7_1 -+; CHECK-NEXT: // %bb.2: // %entry -+; CHECK-NEXT: .cfi_def_cfa_register wsp -+; CHECK-NEXT: sub sp, sp, #1024 -+; CHECK-NEXT: .cfi_def_cfa_offset 328720 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680 -+; CHECK-NEXT: .cfi_def_cfa_offset 1040 -+; CHECK-NEXT: add sp, sp, #1024 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 328704, align 1 -+ store i8* %v, ptr %out, align 8 -+ ret void -+} -+ -+; 5*64k+1040, large enough to use a loop, has a reminder and -+; an extra probe. -+define void @static_328720(ptr %out) #0 { -+; CHECK-LABEL: static_328720: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680 -+; CHECK-NEXT: .cfi_def_cfa w9, 327696 -+; CHECK-NEXT: .LBB8_1: // %entry -+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: cmp sp, x9 -+; CHECK-NEXT: b.ne .LBB8_1 -+; CHECK-NEXT: // %bb.2: // %entry -+; CHECK-NEXT: .cfi_def_cfa_register wsp -+; CHECK-NEXT: sub sp, sp, #1040 -+; CHECK-NEXT: .cfi_def_cfa_offset 328736 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680 -+; CHECK-NEXT: .cfi_def_cfa_offset 1056 -+; CHECK-NEXT: add sp, sp, #1040 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 328720, align 1 -+ store i8* %v, ptr %out, align 8 -+ ret void -+} -+ -+; A small allocation, but with a very large alignment requirement. We do this -+; by moving SP far enough that a sufficiently-aligned block will exist -+; somewhere in the stack frame, so must probe the whole of that larger SP move. -+define void @static_16_align_131072(ptr %out) #0 { -+; CHECK-LABEL: static_16_align_131072: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: stp x29, x30, sp, #-16! 
// 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #31, lsl #12 // =126976 -+; CHECK-NEXT: sub x9, x9, #4080 -+; CHECK-NEXT: and x9, x9, #0xfffffffffffe0000 -+; CHECK-NEXT: .LBB9_1: // %entry -+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: cmp sp, x9 -+; CHECK-NEXT: b.le .LBB9_3 -+; CHECK-NEXT: // %bb.2: // %entry -+; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: b .LBB9_1 -+; CHECK-NEXT: .LBB9_3: // %entry -+; CHECK-NEXT: mov sp, x9 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, sp, #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 16, align 131072 -+ store i8* %v, ptr %out, align 8 -+ ret void -+} -+ -+; A small allocation, but with a very large alignment requirement which -+; is nevertheless small enough as to not need a loop. -+define void @static_16_align_8192(ptr %out) #0 { -+; CHECK-LABEL: static_16_align_8192: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: stp x29, x30, sp, #-16! // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: sub x9, x9, #4080 -+; CHECK-NEXT: and sp, x9, #0xffffffffffffe000 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, sp, #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 16, align 8192 -+ store i8* %v, ptr %out, align 8 -+ ret void -+} -+ -+; A large allocation with a very large alignment requirement which -+; is nevertheless small enough as to not need a loop. -+define void @static_32752_align_32k(ptr %out) #0 { -+; CHECK-LABEL: static_32752_align_32k: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: stp x29, x30, sp, #-16! 
// 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #7, lsl #12 // =28672 -+; CHECK-NEXT: sub x9, x9, #4080 -+; CHECK-NEXT: and sp, x9, #0xffffffffffff8000 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, sp, #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 32752, align 32768 -+ store i8* %v, ptr %out, align 8 -+ ret void -+} -+ -+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" } -\ No newline at end of file -diff --git a/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir -new file mode 100644 -index 000000000000..a8a21ab330ba ---- /dev/null -+++ b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir -@@ -0,0 +1,146 @@ -+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -+# RUN: llc -run-pass=prologepilog %s -o - | FileCheck %s -+# Regression test for a crash when the probing instruction -+# to replace is last in the block. -+--- | -+ source_filename = "tt.ll" -+ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -+ target triple = "aarch64-linux" -+ -+ declare i1 @g(ptr) -+ -+ define void @f(ptr %out) #0 { -+ entry: -+ %p = alloca i32, i32 50000, align 4 -+ br label %loop -+ -+ loop: ; preds = %loop, %entry -+ %c = call i1 @g(ptr %p) -+ br i1 %c, label %loop, label %exit -+ -+ exit: ; preds = %loop -+ ret void -+ } -+ -+ attributes #0 = { uwtable "frame-pointer"="none" "probe-stack"="inline-asm" "target-features"="+sve" } -+ -+... 
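For reference, nothing in this backport probes by default: both the .ll tests and the MIR reproducer above opt in purely through IR attributes, which the AArch64MachineFunctionInfo.cpp hunk turns into a non-zero StackProbeSize. A minimal sketch of such an opt-in (the function name is hypothetical; the 4096-byte default probe size from the code above is spelled out explicitly):

define void @probed_frame() "probe-stack"="inline-asm" "stack-probe-size"="4096" {
entry:
  ; 32 KiB of locals: several probe-size steps, so the prologue must
  ; touch each intervening 4 KiB page, as in the tests above.
  %buf = alloca i8, i64 32768, align 1
  ret void
}

A function without the "probe-stack" attribute (or the matching module flag) keeps the ordinary single-SUB prologue.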
-+--- -+name: f -+alignment: 4 -+exposesReturnsTwice: false -+legalized: false -+regBankSelected: false -+selected: false -+failedISel: false -+tracksRegLiveness: true -+hasWinCFI: false -+callsEHReturn: false -+callsUnwindInit: false -+hasEHCatchret: false -+hasEHScopes: false -+hasEHFunclets: false -+isOutlined: false -+debugInstrRef: false -+failsVerification: false -+tracksDebugUserValues: true -+registers: -+liveins: -+frameInfo: -+ isFrameAddressTaken: false -+ isReturnAddressTaken: false -+ hasStackMap: false -+ hasPatchPoint: false -+ stackSize: 0 -+ offsetAdjustment: 0 -+ maxAlignment: 4 -+ adjustsStack: true -+ hasCalls: true -+ stackProtector: '' -+ functionContext: '' -+ maxCallFrameSize: 0 -+ cvBytesOfCalleeSavedRegisters: 0 -+ hasOpaqueSPAdjustment: false -+ hasVAStart: false -+ hasMustTailInVarArgFunc: false -+ hasTailCall: false -+ localFrameSize: 200000 -+ savePoint: '' -+ restorePoint: '' -+fixedStack: -+stack: -+ - { id: 0, name: p, type: default, offset: 0, size: 200000, alignment: 4, -+ stack-id: default, callee-saved-register: '', callee-saved-restored: true, -+ local-offset: -200000, debug-info-variable: '', debug-info-expression: '', -+ debug-info-location: '' } -+entry_values: -+callSites: -+debugValueSubstitutions: -+constants: -+machineFunctionInfo: {} -+body: | -+ ; CHECK-LABEL: name: f -+ ; CHECK: bb.0.entry: -+ ; CHECK-NEXT: successors: %bb.3(0x80000000) -+ ; CHECK-NEXT: liveins: $lr, $fp -+ ; CHECK-NEXT: {{ $}} -+ ; CHECK-NEXT: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.2), (store (s64) into %stack.1) -+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 -+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 -+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -+ ; CHECK-NEXT: $x9 = frame-setup SUBXri $sp, 48, 12 -+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w9, 196624 -+ ; CHECK-NEXT: {{ $}} -+ ; CHECK-NEXT: bb.3.entry: -+ ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000) -+ ; CHECK-NEXT: liveins: $x9 -+ ; CHECK-NEXT: {{ $}} -+ ; CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1, 12 -+ ; CHECK-NEXT: frame-setup STRXui $xzr, $sp, 0 -+ ; CHECK-NEXT: $xzr = frame-setup SUBSXrx64 $sp, $x9, 24, implicit-def $nzcv -+ ; CHECK-NEXT: frame-setup Bcc 1, %bb.3, implicit $nzcv -+ ; CHECK-NEXT: {{ $}} -+ ; CHECK-NEXT: bb.4.entry: -+ ; CHECK-NEXT: successors: %bb.1(0x80000000) -+ ; CHECK-NEXT: {{ $}} -+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $wsp -+ ; CHECK-NEXT: $sp = frame-setup SUBXri $sp, 3392, 0 -+ ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 200016 -+ ; CHECK-NEXT: frame-setup STRXui $xzr, $sp, 0 -+ ; CHECK-NEXT: {{ $}} -+ ; CHECK-NEXT: bb.1.loop: -+ ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) -+ ; CHECK-NEXT: {{ $}} -+ ; CHECK-NEXT: $x0 = ADDXri $sp, 0, 0 -+ ; CHECK-NEXT: BL @g, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0 -+ ; CHECK-NEXT: TBNZW killed renamable $w0, 0, %bb.1 -+ ; CHECK-NEXT: B %bb.2 -+ ; CHECK-NEXT: {{ $}} -+ ; CHECK-NEXT: bb.2.exit: -+ ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 48, 12 -+ ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 3408 -+ ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 3392, 0 -+ ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16 -+ ; CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.1) -+ ; CHECK-NEXT: frame-destroy 
CFI_INSTRUCTION def_cfa_offset 0 -+ ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30 -+ ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 -+ ; CHECK-NEXT: RET_ReallyLR -+ bb.0.entry: -+ successors: %bb.1(0x80000000) -+ -+ -+ bb.1.loop: -+ successors: %bb.1(0x7c000000), %bb.2(0x04000000) -+ -+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp -+ $x0 = ADDXri %stack.0.p, 0, 0 -+ BL @g, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0 -+ ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp -+ TBNZW killed renamable $w0, 0, %bb.1 -+ B %bb.2 -+ -+ bb.2.exit: -+ RET_ReallyLR -+ -+... -\ No newline at end of file -diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll -new file mode 100644 -index 000000000000..e765d071e722 ---- /dev/null -+++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll -@@ -0,0 +1,724 @@ -+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s -+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s -+ -+; Test prolog sequences for stack probing when SVE objects are involved. -+ -+; The space for SVE objects needs probing in the general case, because -+; the stack adjustment may happen to be too big (i.e. greater than the -+; probe size) to allocate with a single `addvl`. -+; When we do know that the stack adjustment cannot exceed the probe size -+; we can avoid emitting a probe loop and emit a simple `addvl; str` -+; sequence instead. -+ -+define void @sve_1_vector(ptr %out) #0 { -+; CHECK-LABEL: sve_1_vector: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: addvl sp, sp, #-1 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -+; CHECK-NEXT: addvl sp, sp, #1 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %vec = alloca <vscale x 4 x float>, align 16 -+ ret void -+} -+ -+; As above, but with 4 SVE vectors of stack space. -+define void @sve_4_vector(ptr %out) #0 { -+; CHECK-LABEL: sve_4_vector: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: addvl sp, sp, #-4 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG -+; CHECK-NEXT: addvl sp, sp, #4 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %vec1 = alloca <vscale x 4 x float>, align 16 -+ %vec2 = alloca <vscale x 4 x float>, align 16 -+ %vec3 = alloca <vscale x 4 x float>, align 16 -+ %vec4 = alloca <vscale x 4 x float>, align 16 -+ ret void -+} -+ -+; As above, but with 16 SVE vectors of stack space. -+; The stack adjustment is less than or equal to 16 x 256 = 4096, so -+; we can allocate the locals at once. 
-+define void @sve_16_vector(ptr %out) #0 {
-+; CHECK-LABEL: sve_16_vector:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: addvl sp, sp, #-16
-+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: addvl sp, sp, #16
-+; CHECK-NEXT: .cfi_def_cfa wsp, 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ %vec1 = alloca <vscale x 4 x float>, align 16
-+ %vec2 = alloca <vscale x 4 x float>, align 16
-+ %vec3 = alloca <vscale x 4 x float>, align 16
-+ %vec4 = alloca <vscale x 4 x float>, align 16
-+ %vec5 = alloca <vscale x 4 x float>, align 16
-+ %vec6 = alloca <vscale x 4 x float>, align 16
-+ %vec7 = alloca <vscale x 4 x float>, align 16
-+ %vec8 = alloca <vscale x 4 x float>, align 16
-+ %vec9 = alloca <vscale x 4 x float>, align 16
-+ %vec10 = alloca <vscale x 4 x float>, align 16
-+ %vec11 = alloca <vscale x 4 x float>, align 16
-+ %vec12 = alloca <vscale x 4 x float>, align 16
-+ %vec13 = alloca <vscale x 4 x float>, align 16
-+ %vec14 = alloca <vscale x 4 x float>, align 16
-+ %vec15 = alloca <vscale x 4 x float>, align 16
-+ %vec16 = alloca <vscale x 4 x float>, align 16
-+ ret void
-+}
-+
-+; As above, but with 17 SVE vectors of stack space. Now we need
-+; a probing loop since stack adjustment may be greater than
-+; the probe size (17 x 256 = 4352 bytes)
-+; TODO: Allocating `k*16+r` SVE vectors can be unrolled into
-+; emitting the `k + r` sequences of `addvl sp, sp, #-N; str xzr, [sp]`
-+define void @sve_17_vector(ptr %out) #0 {
-+; CHECK-LABEL: sve_17_vector:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: addvl x9, sp, #-17
-+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG
-+; CHECK-NEXT: .LBB3_1: // %entry
-+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
-+; CHECK-NEXT: cmp sp, x9
-+; CHECK-NEXT: b.le .LBB3_3
-+; CHECK-NEXT: // %bb.2: // %entry
-+; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: b .LBB3_1
-+; CHECK-NEXT: .LBB3_3: // %entry
-+; CHECK-NEXT: mov sp, x9
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: .cfi_def_cfa_register wsp
-+; CHECK-NEXT: addvl sp, sp, #17
-+; CHECK-NEXT: .cfi_def_cfa wsp, 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ %vec1 = alloca <vscale x 4 x float>, align 16
-+ %vec2 = alloca <vscale x 4 x float>, align 16
-+ %vec3 = alloca <vscale x 4 x float>, align 16
-+ %vec4 = alloca <vscale x 4 x float>, align 16
-+ %vec5 = alloca <vscale x 4 x float>, align 16
-+ %vec6 = alloca <vscale x 4 x float>, align 16
-+ %vec7 = alloca <vscale x 4 x float>, align 16
-+ %vec8 = alloca <vscale x 4 x float>, align 16
-+ %vec9 = alloca <vscale x 4 x float>, align 16
-+ %vec10 = alloca <vscale x 4 x float>, align 16
-+ %vec11 = alloca <vscale x 4 x float>, align 16
-+ %vec12 = alloca <vscale x 4 x float>, align 16
-+ %vec13 = alloca <vscale x 4 x float>, align 16
-+ %vec14 = alloca <vscale x 4 x float>, align 16
-+ %vec15 = alloca <vscale x 4 x float>, align 16
-+ %vec16 = alloca <vscale x 4 x float>, align 16
-+ %vec17 = alloca <vscale x 4 x float>, align 16
-+ ret void
-+}
-+
-+; Space for callee-saved SVE registers is allocated similarly to allocating
-+; space for SVE locals. When we know the stack adjustment cannot exceed the
-+; probe size we can skip the explicit probe, since saving SVE registers serves
-+; as an implicit probe.
-+define void @sve_1v_csr(<vscale x 4 x float> %a) #0 {
-+; CHECK-LABEL: sve_1v_csr:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: addvl sp, sp, #-1
-+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-+; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill
-+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-+; CHECK-NEXT: //APP
-+; CHECK-NEXT: //NO_APP
-+; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
-+; CHECK-NEXT: addvl sp, sp, #1
-+; CHECK-NEXT: .cfi_def_cfa wsp, 16
-+; CHECK-NEXT: .cfi_restore z8
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ call void asm sideeffect "", "~{z8}" ()
-+ ret void
-+}
-+
-+define void @sve_4v_csr(<vscale x 4 x float> %a) #0 {
-+; CHECK-LABEL: sve_4v_csr:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: addvl sp, sp, #-4 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG -+; CHECK-NEXT: str z11, sp // 16-byte Folded Spill -+; CHECK-NEXT: str z10, sp, #1, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z9, sp, #2, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z8, sp, #3, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG -+; CHECK-NEXT: //APP -+; CHECK-NEXT: //NO_APP -+; CHECK-NEXT: ldr z11, sp // 16-byte Folded Reload -+; CHECK-NEXT: ldr z10, sp, #1, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z9, sp, #2, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z8, sp, #3, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: addvl sp, sp, #4 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: .cfi_restore z8 -+; CHECK-NEXT: .cfi_restore z9 -+; CHECK-NEXT: .cfi_restore z10 -+; CHECK-NEXT: .cfi_restore z11 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11}" () -+ ret void -+} -+ -+define void @sve_16v_csr(<vscale x 4 x float> %a) #0 { -+; CHECK-LABEL: sve_16v_csr: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! 
// 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: addvl sp, sp, #-16 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: str z23, sp // 16-byte Folded Spill -+; CHECK-NEXT: str z22, sp, #1, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z21, sp, #2, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z20, sp, #3, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z19, sp, #4, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z18, sp, #5, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z17, sp, #6, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z16, sp, #7, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z15, sp, #8, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z14, sp, #9, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z13, sp, #10, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z12, sp, #11, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z11, sp, #12, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z10, sp, #13, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z9, sp, #14, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z8, sp, #15, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG -+; CHECK-NEXT: //APP -+; CHECK-NEXT: //NO_APP -+; CHECK-NEXT: ldr z23, sp // 16-byte Folded Reload -+; CHECK-NEXT: ldr z22, sp, #1, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z21, sp, #2, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z20, sp, #3, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z19, sp, #4, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z18, sp, #5, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z17, sp, #6, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z16, sp, #7, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z15, sp, #8, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z14, sp, #9, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z13, sp, #10, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z12, sp, #11, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z11, sp, #12, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z10, sp, #13, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z9, sp, #14, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z8, sp, #15, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: addvl sp, sp, #16 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: 
.cfi_restore z8 -+; CHECK-NEXT: .cfi_restore z9 -+; CHECK-NEXT: .cfi_restore z10 -+; CHECK-NEXT: .cfi_restore z11 -+; CHECK-NEXT: .cfi_restore z12 -+; CHECK-NEXT: .cfi_restore z13 -+; CHECK-NEXT: .cfi_restore z14 -+; CHECK-NEXT: .cfi_restore z15 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" () -+ ret void -+} -+ -+define void @sve_1p_csr(<vscale x 4 x float> %a) #0 { -+; CHECK-LABEL: sve_1p_csr: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: addvl sp, sp, #-1 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -+; CHECK-NEXT: str p8, sp, #7, mul vl // 2-byte Folded Spill -+; CHECK-NEXT: //APP -+; CHECK-NEXT: //NO_APP -+; CHECK-NEXT: ldr p8, sp, #7, mul vl // 2-byte Folded Reload -+; CHECK-NEXT: addvl sp, sp, #1 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ call void asm sideeffect "", "~{p8}" () -+ ret void -+} -+ -+define void @sve_4p_csr(<vscale x 4 x float> %a) #0 { -+; CHECK-LABEL: sve_4p_csr: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: addvl sp, sp, #-1 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -+; CHECK-NEXT: str p11, sp, #4, mul vl // 2-byte Folded Spill -+; CHECK-NEXT: str p10, sp, #5, mul vl // 2-byte Folded Spill -+; CHECK-NEXT: str p9, sp, #6, mul vl // 2-byte Folded Spill -+; CHECK-NEXT: str p8, sp, #7, mul vl // 2-byte Folded Spill -+; CHECK-NEXT: //APP -+; CHECK-NEXT: //NO_APP -+; CHECK-NEXT: ldr p11, sp, #4, mul vl // 2-byte Folded Reload -+; CHECK-NEXT: ldr p10, sp, #5, mul vl // 2-byte Folded Reload -+; CHECK-NEXT: ldr p9, sp, #6, mul vl // 2-byte Folded Reload -+; CHECK-NEXT: ldr p8, sp, #7, mul vl // 2-byte Folded Reload -+; CHECK-NEXT: addvl sp, sp, #1 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ call void asm sideeffect "", "~{p8},~{p9},~{p10},~{p11}" () -+ ret void -+} -+ -+define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 { -+; CHECK-LABEL: sve_16v_1p_csr: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! 
// 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: addvl x9, sp, #-17 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG -+; CHECK-NEXT: .LBB9_1: // %entry -+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: cmp sp, x9 -+; CHECK-NEXT: b.le .LBB9_3 -+; CHECK-NEXT: // %bb.2: // %entry -+; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: b .LBB9_1 -+; CHECK-NEXT: .LBB9_3: // %entry -+; CHECK-NEXT: mov sp, x9 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: .cfi_def_cfa_register wsp -+; CHECK-NEXT: str p8, sp, #7, mul vl // 2-byte Folded Spill -+; CHECK-NEXT: str z23, sp, #1, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z22, sp, #2, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z21, sp, #3, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z20, sp, #4, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z19, sp, #5, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z18, sp, #6, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z17, sp, #7, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z16, sp, #8, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z15, sp, #9, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z14, sp, #10, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z13, sp, #11, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z12, sp, #12, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z11, sp, #13, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z10, sp, #14, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z9, sp, #15, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: str z8, sp, #16, mul vl // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG -+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG -+; CHECK-NEXT: //APP -+; CHECK-NEXT: //NO_APP -+; CHECK-NEXT: ldr p8, sp, #7, mul vl // 2-byte Folded Reload -+; CHECK-NEXT: ldr z23, sp, #1, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z22, sp, #2, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z21, sp, #3, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z20, sp, #4, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z19, sp, #5, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z18, sp, #6, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z17, sp, #7, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z16, sp, #8, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z15, 
sp, #9, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z14, sp, #10, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z13, sp, #11, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z12, sp, #12, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z11, sp, #13, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z10, sp, #14, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z9, sp, #15, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: ldr z8, sp, #16, mul vl // 16-byte Folded Reload -+; CHECK-NEXT: addvl sp, sp, #17 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: .cfi_restore z8 -+; CHECK-NEXT: .cfi_restore z9 -+; CHECK-NEXT: .cfi_restore z10 -+; CHECK-NEXT: .cfi_restore z11 -+; CHECK-NEXT: .cfi_restore z12 -+; CHECK-NEXT: .cfi_restore z13 -+; CHECK-NEXT: .cfi_restore z14 -+; CHECK-NEXT: .cfi_restore z15 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ call void asm sideeffect "", "~{p8},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" () -+ ret void -+} -+ -+; A SVE vector and a 16-byte fixed size object. -+define void @sve_1_vector_16_arr(ptr %out) #0 { -+; CHECK-LABEL: sve_1_vector_16_arr: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #16 -+; CHECK-NEXT: .cfi_def_cfa_offset 32 -+; CHECK-NEXT: addvl sp, sp, #-1 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG -+; CHECK-NEXT: addvl sp, sp, #1 -+; CHECK-NEXT: .cfi_def_cfa wsp, 32 -+; CHECK-NEXT: add sp, sp, #16 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %vec = alloca <vscale x 4 x float>, align 16 -+ %arr = alloca i8, i64 16, align 1 -+ ret void -+} -+ -+; A large SVE stack object and a large stack slot, both of which need probing. -+; TODO: This could be optimised by combining the fixed-size offset into the -+; loop. -+define void @sve_1_vector_4096_arr(ptr %out) #0 { -+; CHECK-LABEL: sve_1_vector_4096_arr: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! 
// 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #3, lsl #12 // =12288 -+; CHECK-NEXT: .cfi_def_cfa w9, 12304 -+; CHECK-NEXT: addvl x9, x9, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 256 * VG -+; CHECK-NEXT: addvl x9, x9, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 512 * VG -+; CHECK-NEXT: .LBB11_1: // %entry -+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: cmp sp, x9 -+; CHECK-NEXT: b.le .LBB11_3 -+; CHECK-NEXT: // %bb.2: // %entry -+; CHECK-NEXT: // in Loop: Header=BB11_1 Depth=1 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: b .LBB11_1 -+; CHECK-NEXT: .LBB11_3: // %entry -+; CHECK-NEXT: mov sp, x9 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: .cfi_def_cfa_register wsp -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x88, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 264 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 16 * VG -+; CHECK-NEXT: addvl sp, sp, #2 -+; CHECK-NEXT: .cfi_def_cfa wsp, 12304 -+; CHECK-NEXT: add sp, sp, #3, lsl #12 // =12288 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %vec = alloca <vscale x 256 x float>, align 16 -+ %arr = alloca i8, i64 12288, align 1 -+ ret void -+} -+ -+; Not tested: SVE stack objects with alignment >16 bytes, which isn't currently -+; supported even without stack-probing. -+ -+; An SVE vector, and a 16-byte fixed size object, which -+; has a large alignment requirement. -+define void @sve_1_vector_16_arr_align_8192(ptr %out) #0 { -+; CHECK-LABEL: sve_1_vector_16_arr_align_8192: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: stp x29, x30, sp, #-16! // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: sub x9, x9, #4080 -+; CHECK-NEXT: addvl x9, x9, #-1 -+; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 -+; CHECK-NEXT: .LBB12_1: // %entry -+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: cmp sp, x9 -+; CHECK-NEXT: b.le .LBB12_3 -+; CHECK-NEXT: // %bb.2: // %entry -+; CHECK-NEXT: // in Loop: Header=BB12_1 Depth=1 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: b .LBB12_1 -+; CHECK-NEXT: .LBB12_3: // %entry -+; CHECK-NEXT: mov sp, x9 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, sp, #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %vec = alloca <vscale x 4 x float>, align 16 -+ %arr = alloca i8, i64 16, align 8192 -+ ret void -+} -+ -+; With 64k guard pages, we can allocate bigger SVE space without a probing loop. 
-+define void @sve_1024_64k_guard(ptr %out) #0 "stack-probe-size"="65536" { -+; CHECK-LABEL: sve_1024_64k_guard: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: addvl sp, sp, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 256 * VG -+; CHECK-NEXT: addvl sp, sp, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 512 * VG -+; CHECK-NEXT: addvl sp, sp, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 768 * VG -+; CHECK-NEXT: addvl sp, sp, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1024 * VG -+; CHECK-NEXT: addvl sp, sp, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1280 * VG -+; CHECK-NEXT: addvl sp, sp, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1536 * VG -+; CHECK-NEXT: addvl sp, sp, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1792 * VG -+; CHECK-NEXT: addvl sp, sp, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 2048 * VG -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1800 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1552 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1304 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1056 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 808 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 560 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 312 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG -+; CHECK-NEXT: addvl sp, sp, #8 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %vec = alloca <vscale x 1024 x float>, align 16 -+ ret void -+} -+ -+define void @sve_1028_64k_guard(ptr %out) #0 "stack-probe-size"="65536" { -+; CHECK-LABEL: sve_1028_64k_guard: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str 
x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: addvl x9, sp, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 256 * VG -+; CHECK-NEXT: addvl x9, x9, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 512 * VG -+; CHECK-NEXT: addvl x9, x9, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 768 * VG -+; CHECK-NEXT: addvl x9, x9, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1024 * VG -+; CHECK-NEXT: addvl x9, x9, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1280 * VG -+; CHECK-NEXT: addvl x9, x9, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1536 * VG -+; CHECK-NEXT: addvl x9, x9, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1792 * VG -+; CHECK-NEXT: addvl x9, x9, #-32 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2048 * VG -+; CHECK-NEXT: addvl x9, x9, #-1 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2056 * VG -+; CHECK-NEXT: .LBB14_1: // %entry -+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: cmp sp, x9 -+; CHECK-NEXT: b.le .LBB14_3 -+; CHECK-NEXT: // %bb.2: // %entry -+; CHECK-NEXT: // in Loop: Header=BB14_1 Depth=1 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: b .LBB14_1 -+; CHECK-NEXT: .LBB14_3: // %entry -+; CHECK-NEXT: mov sp, x9 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: .cfi_def_cfa_register wsp -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1808 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1560 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1312 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1064 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 816 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 568 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 320 * VG -+; CHECK-NEXT: addvl sp, sp, #31 -+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG 
-+; CHECK-NEXT: addvl sp, sp, #9
-+; CHECK-NEXT: .cfi_def_cfa wsp, 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ %vec = alloca <vscale x 1024 x float>, align 16
-+ %vec1 = alloca <vscale x 4 x float>, align 16
-+ ret void
-+}
-+
-+; With 5 SVE vectors of stack space the unprobed area
-+; at the top of the stack can exceed 1024 bytes (5 x 256 == 1280),
-+; hence we need to issue a probe.
-+define void @sve_5_vector(ptr %out) #0 {
-+; CHECK-LABEL: sve_5_vector:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: addvl sp, sp, #-5
-+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 40 * VG
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: addvl sp, sp, #5
-+; CHECK-NEXT: .cfi_def_cfa wsp, 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ %vec1 = alloca <vscale x 4 x float>, align 16
-+ %vec2 = alloca <vscale x 4 x float>, align 16
-+ %vec3 = alloca <vscale x 4 x float>, align 16
-+ %vec4 = alloca <vscale x 4 x float>, align 16
-+ %vec5 = alloca <vscale x 4 x float>, align 16
-+ ret void
-+}
-+
-+; Test with 14 scalable bytes (so up to 14 * 16 = 224) of unprobed
-+; area below the save location of `p9`.
-+define void @sve_unprobed_area(<vscale x 4 x float> %a, i32 %n) #0 {
-+; CHECK-LABEL: sve_unprobed_area:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: addvl sp, sp, #-4
-+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: str p9, [sp, #7, mul vl] // 2-byte Folded Spill
-+; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill
-+; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill
-+; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill
-+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
-+; CHECK-NEXT: addvl sp, sp, #-4
-+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
-+; CHECK-NEXT: //APP
-+; CHECK-NEXT: //NO_APP
-+; CHECK-NEXT: addvl sp, sp, #4
-+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
-+; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
-+; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
-+; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
-+; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
-+; CHECK-NEXT: addvl sp, sp, #4
-+; CHECK-NEXT: .cfi_def_cfa wsp, 16
-+; CHECK-NEXT: .cfi_restore z8
-+; CHECK-NEXT: .cfi_restore z9
-+; CHECK-NEXT: .cfi_restore z10
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: 
.cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ call void asm sideeffect "", "~{z8},~{z9},~{z10},~{p9}" ()
-+
-+ %v0 = alloca <vscale x 4 x float>, align 16
-+ %v1 = alloca <vscale x 4 x float>, align 16
-+ %v2 = alloca <vscale x 4 x float>, align 16
-+ %v3 = alloca <vscale x 4 x float>, align 16
-+
-+ ret void
-+}
-+
-+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" }
-\ No newline at end of file
-diff --git a/llvm/test/CodeGen/AArch64/stack-probing.ll b/llvm/test/CodeGen/AArch64/stack-probing.ll
-new file mode 100644
-index 000000000000..95001450622f
---- /dev/null
-+++ b/llvm/test/CodeGen/AArch64/stack-probing.ll
-@@ -0,0 +1,539 @@
-+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s
-+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s
-+
-+; Tests for prolog sequences for stack probing, when using a 4KiB stack guard.
-+
-+; The stack probing parameters in function attributes take precedence over
-+; ones in the module flags.
-+
-+; Small stack frame, no probing required.
-+define void @static_64(ptr %out) #0 {
-+; CHECK-LABEL: static_64:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: sub sp, sp, #64
-+; CHECK-NEXT: .cfi_def_cfa_offset 64
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str x8, [x0]
-+; CHECK-NEXT: add sp, sp, #64
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: ret
-+entry:
-+ %v = alloca i8, i64 64, align 1
-+ store ptr %v, ptr %out, align 8
-+ ret void
-+}
-+
-+; At 256 bytes we start to always create a frame pointer. No frame smaller than
-+; this needs a probe, so we can use the saving of at least one CSR as a probe
-+; at the top of our frame.
-+define void @static_256(ptr %out) #0 {
-+; CHECK-LABEL: static_256:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: sub sp, sp, #272
-+; CHECK-NEXT: .cfi_def_cfa_offset 272
-+; CHECK-NEXT: str x29, [sp, #256] // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str x8, [x0]
-+; CHECK-NEXT: add sp, sp, #272
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ %v = alloca i8, i64 256, align 1
-+ store ptr %v, ptr %out, align 8
-+ ret void
-+}
-+
-+; At 1024 bytes, this is the largest frame which doesn't need probing.
-+define void @static_1024(ptr %out) #0 {
-+; CHECK-LABEL: static_1024:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: sub sp, sp, #1024
-+; CHECK-NEXT: .cfi_def_cfa_offset 1040
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str x8, [x0]
-+; CHECK-NEXT: add sp, sp, #1024
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ %v = alloca i8, i64 1024, align 1
-+ store ptr %v, ptr %out, align 8
-+ ret void
-+}
-+
-+; At 1024+16 bytes, this is the smallest frame which needs probing.
-+define void @static_1040(ptr %out) #0 {
-+; CHECK-LABEL: static_1040:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]!
// 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #1040 -+; CHECK-NEXT: .cfi_def_cfa_offset 1056 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: add sp, sp, #1040 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 1040, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; 4k bytes is the largest frame we can probe in one go. -+define void @static_4096(ptr %out) #0 { -+; CHECK-LABEL: static_4096: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 4112 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 4096, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; 4k+16 bytes, still needs just one probe. -+define void @static_4112(ptr %out) #0 { -+; CHECK-LABEL: static_4112: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 4112 -+; CHECK-NEXT: str xzr, sp, #-16 -+; CHECK-NEXT: .cfi_def_cfa_offset 4128 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 32 -+; CHECK-NEXT: add sp, sp, #16 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 4112, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; 4k+1024 bytes, the largest frame which needs just one probe. -+define void @static_5120(ptr %out) #0 { -+; CHECK-LABEL: static_5120: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 4112 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: sub sp, sp, #1024 -+; CHECK-NEXT: .cfi_def_cfa_offset 5136 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 1040 -+; CHECK-NEXT: add sp, sp, #1024 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 5120, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; 4k+1024+16, the smallest frame which needs two probes. -+define void @static_5136(ptr %out) #0 { -+; CHECK-LABEL: static_5136: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! 
// 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 4112 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: sub sp, sp, #1040 -+; CHECK-NEXT: .cfi_def_cfa_offset 5152 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 1056 -+; CHECK-NEXT: add sp, sp, #1040 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 5136, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; 2*4k+1024, the largest frame needing two probes -+define void @static_9216(ptr %out) #0 { -+; CHECK-LABEL: static_9216: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 4112 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 8208 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: sub sp, sp, #1024 -+; CHECK-NEXT: .cfi_def_cfa_offset 9232 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: add sp, sp, #2, lsl #12 // =8192 -+; CHECK-NEXT: .cfi_def_cfa_offset 1040 -+; CHECK-NEXT: add sp, sp, #1024 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 9216, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; 5*4k-16, the largest frame probed without a loop -+define void @static_20464(ptr %out) #0 { -+; CHECK-LABEL: static_20464: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 4112 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 8208 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 12304 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: .cfi_def_cfa_offset 16400 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: sub sp, sp, #4080 -+; CHECK-NEXT: .cfi_def_cfa_offset 20480 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: add sp, sp, #4, lsl #12 // =16384 -+; CHECK-NEXT: .cfi_def_cfa_offset 4096 -+; CHECK-NEXT: add sp, sp, #4080 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 20464, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; 5*4k, the smallest frame probed with a loop -+define void @static_20480(ptr %out) #0 { -+; CHECK-LABEL: static_20480: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! 
// 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480
-+; CHECK-NEXT: .cfi_def_cfa w9, 20496
-+; CHECK-NEXT: .LBB10_1: // %entry
-+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: cmp sp, x9
-+; CHECK-NEXT: b.ne .LBB10_1
-+; CHECK-NEXT: // %bb.2: // %entry
-+; CHECK-NEXT: .cfi_def_cfa_register wsp
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str x8, [x0]
-+; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ %v = alloca i8, i64 20480, align 1
-+ store ptr %v, ptr %out, align 8
-+ ret void
-+}
-+
-+; 5*4k + 1024, large enough to use a loop, but not a multiple of 4KiB,
-+; so it has a remainder, but no extra probe.
-+define void @static_21504(ptr %out) #0 {
-+; CHECK-LABEL: static_21504:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480
-+; CHECK-NEXT: .cfi_def_cfa w9, 20496
-+; CHECK-NEXT: .LBB11_1: // %entry
-+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: cmp sp, x9
-+; CHECK-NEXT: b.ne .LBB11_1
-+; CHECK-NEXT: // %bb.2: // %entry
-+; CHECK-NEXT: .cfi_def_cfa_register wsp
-+; CHECK-NEXT: sub sp, sp, #1024
-+; CHECK-NEXT: .cfi_def_cfa_offset 21520
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str x8, [x0]
-+; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480
-+; CHECK-NEXT: .cfi_def_cfa_offset 1040
-+; CHECK-NEXT: add sp, sp, #1024
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ %v = alloca i8, i64 21504, align 1
-+ store ptr %v, ptr %out, align 8
-+ ret void
-+}
-+
-+; 5*4k+1040, large enough to use a loop, has a remainder and
-+; an extra probe.
-+define void @static_21520(ptr %out) #0 {
-+; CHECK-LABEL: static_21520:
-+; CHECK: // %bb.0: // %entry
-+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: .cfi_offset w29, -16
-+; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480
-+; CHECK-NEXT: .cfi_def_cfa w9, 20496
-+; CHECK-NEXT: .LBB12_1: // %entry
-+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: cmp sp, x9
-+; CHECK-NEXT: b.ne .LBB12_1
-+; CHECK-NEXT: // %bb.2: // %entry
-+; CHECK-NEXT: .cfi_def_cfa_register wsp
-+; CHECK-NEXT: sub sp, sp, #1040
-+; CHECK-NEXT: .cfi_def_cfa_offset 21536
-+; CHECK-NEXT: str xzr, [sp]
-+; CHECK-NEXT: mov x8, sp
-+; CHECK-NEXT: str x8, [x0]
-+; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480
-+; CHECK-NEXT: .cfi_def_cfa_offset 1056
-+; CHECK-NEXT: add sp, sp, #1040
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-+; CHECK-NEXT: .cfi_def_cfa_offset 0
-+; CHECK-NEXT: .cfi_restore w29
-+; CHECK-NEXT: ret
-+entry:
-+ %v = alloca i8, i64 21520, align 1
-+ store ptr %v, ptr %out, align 8
-+ ret void
-+}
-+
-+; A small allocation, but with a very large alignment requirement.
We do this -+; by moving SP far enough that a sufficiently-aligned block will exist -+; somewhere in the stack frame, so must probe the whole of that larger SP move. -+define void @static_16_align_8192(ptr %out) #0 { -+; CHECK-LABEL: static_16_align_8192: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: stp x29, x30, sp, #-16! // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: sub x9, x9, #4080 -+; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 -+; CHECK-NEXT: .LBB13_1: // %entry -+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: cmp sp, x9 -+; CHECK-NEXT: b.le .LBB13_3 -+; CHECK-NEXT: // %bb.2: // %entry -+; CHECK-NEXT: // in Loop: Header=BB13_1 Depth=1 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: b .LBB13_1 -+; CHECK-NEXT: .LBB13_3: // %entry -+; CHECK-NEXT: mov sp, x9 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, sp, #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 16, align 8192 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; A small allocation with a very large alignment requirement, but -+; nevertheless small enough as to not need a loop. -+define void @static_16_align_2048(ptr %out) #0 { -+; CHECK-LABEL: static_16_align_2048: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: stp x29, x30, sp, #-16! // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #2032 -+; CHECK-NEXT: and sp, x9, #0xfffffffffffff800 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, sp, #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 16, align 2048 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; A large(-ish) allocation with a very large alignment requirement, but -+; nevertheless small enough as to not need a loop. -+define void @static_2032_align_2048(ptr %out) #0 { -+; CHECK-LABEL: static_2032_align_2048: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: stp x29, x30, sp, #-16! 
// 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #2032 -+; CHECK-NEXT: and sp, x9, #0xfffffffffffff800 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, sp, #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 2032, align 2048 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; Test stack probing is enabled by module flags -+define void @static_9232(ptr %out) uwtable(async) { -+; CHECK-LABEL: static_9232: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: str x29, sp, #-16! // 8-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub sp, sp, #2, lsl #12 // =8192 -+; CHECK-NEXT: .cfi_def_cfa_offset 8208 -+; CHECK-NEXT: sub sp, sp, #800 -+; CHECK-NEXT: .cfi_def_cfa_offset 9008 -+; CHECK-NEXT: str xzr, sp, #-240 -+; CHECK-NEXT: .cfi_def_cfa_offset 9248 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: add sp, sp, #2, lsl #12 // =8192 -+; CHECK-NEXT: .cfi_def_cfa_offset 1056 -+; CHECK-NEXT: add sp, sp, #1040 -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: ldr x29, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i64 9232, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; Test for a tight upper bound on the amount of stack adjustment -+; due to stack realignment. No probes should appear. -+define void @static_1008(ptr %out) #0 { -+; CHECK-LABEL: static_1008: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: stp x29, x30, sp, #-16! // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: sub x9, sp, #1008 -+; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str x8, x0 -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, sp, #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i8, i32 1008, align 32 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="4096" "frame-pointer"="none" } -+ -+!llvm.module.flags = !{!0, !1} -+ -+!0 = !{i32 4, !"probe-stack", !"inline-asm"} -+!1 = !{i32 8, !"stack-probe-size", i32 9000} -\ No newline at end of file --- -2.42.0.windows.2 -
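For reference while reading the deleted tests above: the probing behaviour they check is switched on entirely from the IR, not from backend flags. A minimal sketch distilled from the RUN lines, function attributes, and module-flag trailer shown above; the function name @probed_frame and the 8 KiB probe size are illustrative choices, not taken from the patch:

; Invoke as in the tests: llc -mtriple aarch64-none-eabi -verify-machineinstrs < probe.ll
; The 20 KiB static frame spans more than two 8 KiB guard pages, so the
; prologue must probe (str xzr, [sp]) each page as it is allocated.
define void @probed_frame(ptr %out) uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="8192" {
entry:
  %v = alloca i8, i64 20480, align 1
  store ptr %v, ptr %out, align 8
  ret void
}

; Module-wide defaults, mirroring the tests' trailer; per-function
; attributes take precedence over these flags.
!llvm.module.flags = !{!0, !1}
!0 = !{i32 4, !"probe-stack", !"inline-asm"}
!1 = !{i32 8, !"stack-probe-size", i32 4096}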
_service:tar_scm:0024-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-SelectionDAG.patch
Deleted
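Before the hunks themselves: this deleted patch taught SelectionDAG to lower variable-sized allocas with a probing loop (the PROBED_STACKALLOC_DYN pseudo expanded by a custom inserter, per the diff below). A minimal IR input exercising that path, adapted from the patch's own stack-probing-dynamic.ll test:

; llc -mtriple aarch64-none-eabi -verify-machineinstrs < dynamic.ll
; %size is unknown at compile time, so the backend emits a loop that
; moves SP down one page at a time, storing xzr to probe each page,
; until SP reaches the aligned target address.
define void @dynamic(i64 %size, ptr %out) uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" {
entry:
  %v = alloca i8, i64 %size, align 1
  store ptr %v, ptr %out, align 8
  ret void
}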
@@ -1,744 +0,0 @@ -From e433199a7dbe87324a671299f6509f19d295382f Mon Sep 17 00:00:00 2001 -From: rickyleung <leung.wing.chung@huawei.com> -Date: Fri, 26 Apr 2024 16:59:48 +0800 -Subject: PATCH 5/7 backportAArch64 Stack probing for dynamic allocas in - SelectionDAG - -Reference: https://github.com/llvm/llvm-project/commit/b1806e6a1f0589acc88499419531c4eb82488f1a - -Add support for probing for dynamic allocas (variable-size objects and -outgoing stack arguments). - -Co-authored-by: Oliver Stannard <oliver.stannard@linaro.org> ---- - .../Target/AArch64/AArch64FrameLowering.cpp | 26 ++ - .../Target/AArch64/AArch64ISelLowering.cpp | 152 +++++--- - llvm/lib/Target/AArch64/AArch64ISelLowering.h | 13 +- - llvm/lib/Target/AArch64/AArch64InstrInfo.td | 14 + - .../stack-probing-dynamic-no-frame-setup.ll | 14 + - .../CodeGen/AArch64/stack-probing-dynamic.ll | 362 ++++++++++++++++++ - 6 files changed, 526 insertions(+), 55 deletions(-) - create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll - create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll - -diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp -index af019ab23770..fe21173f531f 100644 ---- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp -+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp -@@ -462,6 +462,11 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { - /// included as part of the stack frame. - bool - AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { -+ // The stack probing code for the dynamically allocated outgoing arguments -+ // area assumes that the stack is probed at the top - either by the prologue -+ // code, which issues a probe if `hasVarSizedObjects` return true, or by the -+ // most recent variable-sized object allocation. Changing the condition here -+ // may need to be followed up by changes to the probe issuing logic. - return !MF.getFrameInfo().hasVarSizedObjects(); - } - -@@ -470,6 +475,9 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( - MachineBasicBlock::iterator I) const { - const AArch64InstrInfo *TII = - static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); -+ const AArch64TargetLowering *TLI = -+ MF.getSubtarget<AArch64Subtarget>().getTargetLowering(); -+ MachineFrameInfo &MFI = MF.getFrameInfo(); - DebugLoc DL = I->getDebugLoc(); - unsigned Opc = I->getOpcode(); - bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); -@@ -496,6 +504,24 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( - // Most call frames will be allocated at the start of a function so - // this is OK, but it is a limitation that needs dealing with. - assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); -+ -+ if (TLI->hasInlineStackProbe(MF) && -+ -Amount >= AArch64::StackProbeMaxUnprobedStack) { -+ // When stack probing is enabled, the decrement of SP may need to be -+ // probed. We only need to do this if the call site needs 1024 bytes of -+ // space or more, because a region smaller than that is allowed to be -+ // unprobed at an ABI boundary. We rely on the fact that SP has been -+ // probed exactly at this point, either by the prologue or most recent -+ // dynamic allocation. 
-+ assert(MFI.hasVarSizedObjects() && -+ "non-reserved call frame without var sized objects?"); -+ Register ScratchReg = -+ MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); -+ inlineStackProbeFixed(I, ScratchReg, -Amount, StackOffset::get(0, 0)); -+ } else { -+ emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -+ StackOffset::getFixed(Amount), TII); -+ } - emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(Amount), TII); - } -diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp -index 082043420fb9..eff0722e1c77 100644 ---- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp -+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp -@@ -556,10 +556,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, - setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); - setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - -- if (Subtarget->isTargetWindows()) -- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); -- else -- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); -+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); - - // Constant pool entries - setOperationAction(ISD::ConstantPool, MVT::i64, Custom); -@@ -2288,6 +2285,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { - MAKE_CASE(AArch64ISD::CSINC) - MAKE_CASE(AArch64ISD::THREAD_POINTER) - MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) -+ MAKE_CASE(AArch64ISD::PROBED_ALLOCA) - MAKE_CASE(AArch64ISD::ABDS_PRED) - MAKE_CASE(AArch64ISD::ABDU_PRED) - MAKE_CASE(AArch64ISD::HADDS_PRED) -@@ -2646,6 +2644,22 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( - return BB; - } - -+MachineBasicBlock * -+AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI, -+ MachineBasicBlock *MBB) const { -+ MachineFunction &MF = *MBB->getParent(); -+ MachineBasicBlock::iterator MBBI = MI.getIterator(); -+ DebugLoc DL = MBB->findDebugLoc(MBBI); -+ const AArch64InstrInfo &TII = -+ *MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); -+ Register TargetReg = MI.getOperand(0).getReg(); -+ MachineBasicBlock::iterator NextInst = -+ TII.probedStackAlloc(MBBI, TargetReg, false); -+ -+ MI.eraseFromParent(); -+ return NextInst->getParent(); -+} -+ - MachineBasicBlock * - AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg, - MachineInstr &MI, -@@ -2774,6 +2788,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( - - case AArch64::CATCHRET: - return EmitLoweredCatchRet(MI, BB); -+ case AArch64::PROBED_STACKALLOC_DYN: -+ return EmitDynamicProbedAlloc(MI, BB); - case AArch64::LD1_MXIPXX_H_PSEUDO_B: - return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB); - case AArch64::LD1_MXIPXX_H_PSEUDO_H: -@@ -13666,9 +13682,34 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, - AN->getMemOperand()); - } - --SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( -- SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const { -+SDValue -+AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, -+ SelectionDAG &DAG) const { -+ - SDLoc dl(Op); -+ // Get the inputs. 
-+ SDNode *Node = Op.getNode(); -+ SDValue Chain = Op.getOperand(0); -+ SDValue Size = Op.getOperand(1); -+ MaybeAlign Align = -+ cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); -+ EVT VT = Node->getValueType(0); -+ -+ if (DAG.getMachineFunction().getFunction().hasFnAttribute( -+ "no-stack-arg-probe")) { -+ SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); -+ Chain = SP.getValue(1); -+ SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); -+ if (Align) -+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), -+ DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); -+ Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); -+ SDValue Ops2 = {SP, Chain}; -+ return DAG.getMergeValues(Ops, dl); -+ } -+ -+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); -+ - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(), - PtrVT, 0); -@@ -13692,7 +13733,59 @@ SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( - - Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size, - DAG.getConstant(4, dl, MVT::i64)); -- return Chain; -+ -+ SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); -+ Chain = SP.getValue(1); -+ SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); -+ if (Align) -+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), -+ DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); -+ Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); -+ -+ Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); -+ -+ SDValue Ops2 = {SP, Chain}; -+ return DAG.getMergeValues(Ops, dl); -+} -+ -+SDValue -+AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op, -+ SelectionDAG &DAG) const { -+ // Get the inputs. -+ SDNode *Node = Op.getNode(); -+ SDValue Chain = Op.getOperand(0); -+ SDValue Size = Op.getOperand(1); -+ -+ MaybeAlign Align = -+ cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); -+ SDLoc dl(Op); -+ EVT VT = Node->getValueType(0); -+ -+ // Construct the new SP value in a GPR. -+ SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); -+ Chain = SP.getValue(1); -+ SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); -+ if (Align) -+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), -+ DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); -+ -+ // Set the real SP to the new value with a probing loop. -+ Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP); -+ SDValue Ops2 = {SP, Chain}; -+ return DAG.getMergeValues(Ops, dl); -+} -+ -+SDValue -+AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, -+ SelectionDAG &DAG) const { -+ MachineFunction &MF = DAG.getMachineFunction(); -+ -+ if (Subtarget->isTargetWindows()) -+ return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG); -+ else if (hasInlineStackProbe(MF)) -+ return LowerInlineDYNAMIC_STACKALLOC(Op, DAG); -+ else -+ return SDValue(); - } - - // When x and y are extended, lower: -@@ -13746,51 +13839,6 @@ SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG, - return DAG.getNode(ISD::ADD, dl, VT, Add, tmp); - } - --SDValue --AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, -- SelectionDAG &DAG) const { -- assert(Subtarget->isTargetWindows() && -- "Only Windows alloca probing supported"); -- SDLoc dl(Op); -- // Get the inputs. 
-- SDNode *Node = Op.getNode(); -- SDValue Chain = Op.getOperand(0); -- SDValue Size = Op.getOperand(1); -- MaybeAlign Align = -- cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); -- EVT VT = Node->getValueType(0); -- -- if (DAG.getMachineFunction().getFunction().hasFnAttribute( -- "no-stack-arg-probe")) { -- SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); -- Chain = SP.getValue(1); -- SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); -- if (Align) -- SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), -- DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); -- Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); -- SDValue Ops2 = {SP, Chain}; -- return DAG.getMergeValues(Ops, dl); -- } -- -- Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); -- -- Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG); -- -- SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); -- Chain = SP.getValue(1); -- SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); -- if (Align) -- SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), -- DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); -- Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); -- -- Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); -- -- SDValue Ops2 = {SP, Chain}; -- return DAG.getMergeValues(Ops, dl); --} -- - SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); -diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h -index 643d363e234a..9b388c7f8668 100644 ---- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h -+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h -@@ -90,6 +90,10 @@ enum NodeType : unsigned { - ADC, - SBC, // adc, sbc instructions - -+ // To avoid stack clash, allocation is performed by block and each block is -+ // probed. -+ PROBED_ALLOCA, -+ - // Predicated instructions where inactive lanes produce undefined results. 
- ABDS_PRED, - ABDU_PRED, -@@ -610,6 +614,9 @@ public: - MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI, - MachineBasicBlock *BB) const; - -+ MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI, -+ MachineBasicBlock *MBB) const; -+ - MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg, - MachineInstr &MI, - MachineBasicBlock *BB) const; -@@ -1113,10 +1120,10 @@ private: - SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const; -+ SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; -+ SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; -- SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain, -- SDValue &Size, -- SelectionDAG &DAG) const; -+ - SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const; - - SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op, -diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td -index 09980c2f45e6..9b9103e01d67 100644 ---- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td -+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td -@@ -818,6 +818,12 @@ def AArch64stilp : SDNode<"AArch64ISD::STILP", SDT_AArch64stilp, SDNPHasChain, - def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, SDNPHasChain, SDNPMayStore, SDNPMemOperand>; - - def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>; -+ -+def AArch64probedalloca -+ : SDNode<"AArch64ISD::PROBED_ALLOCA", -+ SDTypeProfile<0, 1, SDTCisPtrTy<0>>, -+ SDNPHasChain, SDNPMayStore>; -+ - def AArch64mrs : SDNode<"AArch64ISD::MRS", - SDTypeProfile<1, 1, SDTCisVT<0, i64>, SDTCisVT<1, i32>>, - SDNPHasChain, SDNPOutGlue>; -@@ -908,6 +914,14 @@ def PROBED_STACKALLOC_VAR : Pseudo<(outs), - >, - Sched<>; - -+// Probed stack allocations of a variable size, used for allocas of unknown size -+// when stack-clash protection is enabled. -+let usesCustomInserter = 1 in -+def PROBED_STACKALLOC_DYN : Pseudo<(outs), -+ (ins GPR64common:$target), -+ (AArch64probedalloca GPR64common:$target)>, -+ Sched<>; -+ - } // Defs = SP, NZCV, Uses = SP in - } // hasSideEffects = 1, isCodeGenOnly = 1 - -diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll -new file mode 100644 -index 000000000000..673f9038a35f ---- /dev/null -+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll -@@ -0,0 +1,14 @@ -+; RUN: llc --stop-after=finalize-isel -o - | FileCheck %s -+target triple = "aarch64-linux" -+ -+; Check dynamic stack allocation and probing instructions do not have -+; the FrameSetup flag. 
-+ -+; CHECK-NOT: frame-setup -+define void @no_frame_setup(i64 %size, ptr %out) #0 { -+ %v = alloca i8, i64 %size, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" } -\ No newline at end of file -diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll -new file mode 100644 -index 000000000000..4d9ef77f7a0d ---- /dev/null -+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll -@@ -0,0 +1,362 @@ -+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s -+ -+; Dynamically-sized allocation, needs a loop which can handle any size at -+; runtime. The final iteration of the loop will temporarily put SP below the -+; target address, but this doesn't break any of the ABI constraints on the -+; stack, and also doesn't probe below the target SP value. -+define void @dynamic(i64 %size, ptr %out) #0 { -+; CHECK-LABEL: dynamic: -+; CHECK: // %bb.0: -+; CHECK-NEXT: stp x29, x30, sp, #-16! // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: add x9, x0, #15 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 -+; CHECK-NEXT: sub x8, x8, x9 -+; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: cmp sp, x8 -+; CHECK-NEXT: b.le .LBB0_3 -+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB0_1 Depth=1 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: b .LBB0_1 -+; CHECK-NEXT: .LBB0_3: -+; CHECK-NEXT: mov sp, x8 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: str x8, x1 -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, sp, #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+ %v = alloca i8, i64 %size, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; This function has a fixed-size stack slot and a dynamic one. The fixed size -+; slot isn't large enough that we would normally probe it, but we need to do so -+; here otherwise the gap between the CSR save and the first probe of the -+; dynamic allocation could be too far apart when the size of the dynamic -+; allocation is close to the guard size. -+define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 { -+; CHECK-LABEL: dynamic_fixed: -+; CHECK: // %bb.0: -+; CHECK-NEXT: stp x29, x30, sp, #-16! // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: str xzr, sp, #-64! 
-+; CHECK-NEXT: add x9, x0, #15 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: sub x10, x29, #64 -+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 -+; CHECK-NEXT: str x10, x1 -+; CHECK-NEXT: sub x8, x8, x9 -+; CHECK-NEXT: .LBB1_1: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: cmp sp, x8 -+; CHECK-NEXT: b.le .LBB1_3 -+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB1_1 Depth=1 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: b .LBB1_1 -+; CHECK-NEXT: .LBB1_3: -+; CHECK-NEXT: mov sp, x8 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: str x8, x2 -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, sp, #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+ %v1 = alloca i8, i64 64, align 1 -+ store ptr %v1, ptr %out1, align 8 -+ %v2 = alloca i8, i64 %size, align 1 -+ store ptr %v2, ptr %out2, align 8 -+ ret void -+} -+ -+; Dynamic allocation, with an alignment requirement greater than the alignment -+; of SP. Done by ANDing the target SP with a constant to align it down, then -+; doing the loop as normal. Note that we also re-align the stack in the prolog, -+; which isn't actually needed because the only aligned allocations are dynamic, -+; this is done even without stack probing. -+define void @dynamic_align_64(i64 %size, ptr %out) #0 { -+; CHECK-LABEL: dynamic_align_64: -+; CHECK: // %bb.0: -+; CHECK-NEXT: stp x29, x30, sp, #-32! // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 32 -+; CHECK-NEXT: str x19, sp, #16 // 8-byte Folded Spill -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 32 -+; CHECK-NEXT: .cfi_offset w19, -16 -+; CHECK-NEXT: .cfi_offset w30, -24 -+; CHECK-NEXT: .cfi_offset w29, -32 -+; CHECK-NEXT: sub x9, sp, #32 -+; CHECK-NEXT: and sp, x9, #0xffffffffffffffc0 -+; CHECK-NEXT: add x9, x0, #15 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 -+; CHECK-NEXT: mov x19, sp -+; CHECK-NEXT: sub x8, x8, x9 -+; CHECK-NEXT: and x8, x8, #0xffffffffffffffc0 -+; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: cmp sp, x8 -+; CHECK-NEXT: b.le .LBB2_3 -+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: b .LBB2_1 -+; CHECK-NEXT: .LBB2_3: -+; CHECK-NEXT: mov sp, x8 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: str x8, x1 -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 32 -+; CHECK-NEXT: ldr x19, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: ldp x29, x30, sp, #32 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w19 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+ %v = alloca i8, i64 %size, align 64 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; Dynamic allocation, with an alignment greater than the stack guard size. The -+; only difference to the dynamic allocation is the constant used for aligning -+; the target SP, the loop will probe the whole allocation without needing to -+; know about the alignment padding. -+define void @dynamic_align_8192(i64 %size, ptr %out) #0 { -+; CHECK-LABEL: dynamic_align_8192: -+; CHECK: // %bb.0: -+; CHECK-NEXT: stp x29, x30, sp, #-32! 
// 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 32 -+; CHECK-NEXT: str x19, sp, #16 // 8-byte Folded Spill -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 32 -+; CHECK-NEXT: .cfi_offset w19, -16 -+; CHECK-NEXT: .cfi_offset w30, -24 -+; CHECK-NEXT: .cfi_offset w29, -32 -+; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: sub x9, x9, #4064 -+; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 -+; CHECK-NEXT: .LBB3_1: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: cmp sp, x9 -+; CHECK-NEXT: b.le .LBB3_3 -+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB3_1 Depth=1 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: b .LBB3_1 -+; CHECK-NEXT: .LBB3_3: -+; CHECK-NEXT: mov sp, x9 -+; CHECK-NEXT: add x9, x0, #15 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 -+; CHECK-NEXT: mov x19, sp -+; CHECK-NEXT: sub x8, x8, x9 -+; CHECK-NEXT: and x8, x8, #0xffffffffffffe000 -+; CHECK-NEXT: .LBB3_4: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: cmp sp, x8 -+; CHECK-NEXT: b.le .LBB3_6 -+; CHECK-NEXT: // %bb.5: // in Loop: Header=BB3_4 Depth=1 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: b .LBB3_4 -+; CHECK-NEXT: .LBB3_6: -+; CHECK-NEXT: mov sp, x8 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: str x8, x1 -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 32 -+; CHECK-NEXT: ldr x19, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: ldp x29, x30, sp, #32 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w19 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+ %v = alloca i8, i64 %size, align 8192 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; For 64k guard pages, the only difference is the constant subtracted from SP -+; in the loop. -+define void @dynamic_64k_guard(i64 %size, ptr %out) #0 "stack-probe-size"="65536" { -+; CHECK-LABEL: dynamic_64k_guard: -+; CHECK: // %bb.0: -+; CHECK-NEXT: stp x29, x30, sp, #-16! // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: add x9, x0, #15 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 -+; CHECK-NEXT: sub x8, x8, x9 -+; CHECK-NEXT: .LBB4_1: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -+; CHECK-NEXT: cmp sp, x8 -+; CHECK-NEXT: b.le .LBB4_3 -+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB4_1 Depth=1 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: b .LBB4_1 -+; CHECK-NEXT: .LBB4_3: -+; CHECK-NEXT: mov sp, x8 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: str x8, x1 -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, sp, #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+ %v = alloca i8, i64 %size, align 1 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+; If a function has variable-sized stack objects, then any function calls which -+; need to pass arguments on the stack must allocate the stack space for them -+; dynamically, to ensure they are at the bottom of the frame. 
We need to probe -+; that space when it is larger than the unprobed space allowed by the ABI (1024 -+; bytes), so this needs a very large number of arguments. -+define void @no_reserved_call_frame(i64 %n) #0 { -+; CHECK-LABEL: no_reserved_call_frame: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: stp x29, x30, sp, #-16! // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 16 -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 16 -+; CHECK-NEXT: .cfi_offset w30, -8 -+; CHECK-NEXT: .cfi_offset w29, -16 -+; CHECK-NEXT: lsl x9, x0, #2 -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: add x9, x9, #15 -+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 -+; CHECK-NEXT: sub x0, x8, x9 -+; CHECK-NEXT: .LBB5_1: // %entry -+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: cmp sp, x0 -+; CHECK-NEXT: b.le .LBB5_3 -+; CHECK-NEXT: // %bb.2: // %entry -+; CHECK-NEXT: // in Loop: Header=BB5_1 Depth=1 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: b .LBB5_1 -+; CHECK-NEXT: .LBB5_3: // %entry -+; CHECK-NEXT: mov sp, x0 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: sub sp, sp, #1104 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: bl callee_stack_args -+; CHECK-NEXT: add sp, sp, #1104 -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 16 -+; CHECK-NEXT: ldp x29, x30, sp, #16 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i32, i64 %n -+ call void @callee_stack_args(ptr %v, 138 x i64 undef) -+ ret void -+} -+ -+; Same as above but without a variable-sized allocation, so the reserved call -+; frame can be folded into the fixed-size allocation in the prologue. -+define void @reserved_call_frame(i64 %n) #0 { -+; CHECK-LABEL: reserved_call_frame: -+; CHECK: // %bb.0: // %entry -+; CHECK-NEXT: stp x29, x30, sp, #-32! // 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 32 -+; CHECK-NEXT: str x28, sp, #16 // 8-byte Folded Spill -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 32 -+; CHECK-NEXT: .cfi_offset w28, -16 -+; CHECK-NEXT: .cfi_offset w30, -24 -+; CHECK-NEXT: .cfi_offset w29, -32 -+; CHECK-NEXT: sub sp, sp, #1504 -+; CHECK-NEXT: add x0, sp, #1104 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: bl callee_stack_args -+; CHECK-NEXT: add sp, sp, #1504 -+; CHECK-NEXT: .cfi_def_cfa wsp, 32 -+; CHECK-NEXT: ldr x28, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: ldp x29, x30, sp, #32 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w28 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+entry: -+ %v = alloca i32, i64 100 -+ call void @callee_stack_args(ptr %v, 138 x i64 undef) -+ ret void -+} -+ -+declare void @callee_stack_args(ptr, 138 x i64) -+ -+; Dynamic allocation of SVE vectors -+define void @dynamic_sve(i64 %size, ptr %out) #0 "target-features"="+sve" { -+; CHECK-LABEL: dynamic_sve: -+; CHECK: // %bb.0: -+; CHECK-NEXT: stp x29, x30, sp, #-32! 
// 16-byte Folded Spill -+; CHECK-NEXT: .cfi_def_cfa_offset 32 -+; CHECK-NEXT: str x19, sp, #16 // 8-byte Folded Spill -+; CHECK-NEXT: mov x29, sp -+; CHECK-NEXT: .cfi_def_cfa w29, 32 -+; CHECK-NEXT: .cfi_offset w19, -16 -+; CHECK-NEXT: .cfi_offset w30, -24 -+; CHECK-NEXT: .cfi_offset w29, -32 -+; CHECK-NEXT: rdvl x9, #1 -+; CHECK-NEXT: mov x10, #15 // =0xf -+; CHECK-NEXT: mov x8, sp -+; CHECK-NEXT: madd x9, x0, x9, x10 -+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 -+; CHECK-NEXT: sub x8, x8, x9 -+; CHECK-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1 -+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 -+; CHECK-NEXT: cmp sp, x8 -+; CHECK-NEXT: b.le .LBB7_3 -+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: b .LBB7_1 -+; CHECK-NEXT: .LBB7_3: -+; CHECK-NEXT: mov sp, x8 -+; CHECK-NEXT: str xzr, sp -+; CHECK-NEXT: str x8, x1 -+; CHECK-NEXT: mov sp, x29 -+; CHECK-NEXT: .cfi_def_cfa wsp, 32 -+; CHECK-NEXT: ldr x19, sp, #16 // 8-byte Folded Reload -+; CHECK-NEXT: ldp x29, x30, sp, #32 // 16-byte Folded Reload -+; CHECK-NEXT: .cfi_def_cfa_offset 0 -+; CHECK-NEXT: .cfi_restore w19 -+; CHECK-NEXT: .cfi_restore w30 -+; CHECK-NEXT: .cfi_restore w29 -+; CHECK-NEXT: ret -+ %v = alloca <vscale x 4 x float>, i64 %size, align 16 -+ store ptr %v, ptr %out, align 8 -+ ret void -+} -+ -+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" } -\ No newline at end of file --- -2.42.0.windows.2 -
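For reference, the core loop these (now-dropped) tests checked, annotated. This is a sketch distilled from the @dynamic CHECK lines above, with memory operands written out in full; it assumes x8 already holds the computed target SP and the default 4 KiB probe interval:

    .LBB0_1:                        // allocate and probe one page per iteration
        sub  sp, sp, #1, lsl #12    // sp -= 4096
        cmp  sp, x8                 // reached the requested allocation yet?
        b.le .LBB0_3
        str  xzr, [sp]              // probe the freshly allocated page
        b    .LBB0_1
    .LBB0_3:
        mov  sp, x8                 // land exactly on the target SP
        str  xzr, [sp]              // probe the final page

The last iteration may briefly move SP below the target address, but as the test comments note, this neither violates the ABI's stack constraints nor probes below the final SP value.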
_service:tar_scm:0024-Backport-LoongArch-fix-and-add-some-new-support.patch
Added
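The patch series below defines the R_LARCH_CALL36 relocation and switches LoongArch medium-code-model calls to the adjacent pcaddu18i+jirl pair. As a usage sketch (assuming an llvm-mc built with this series; callee is a placeholder symbol), the new call36/tail36 macros assemble as follows:

    # assemble with: llvm-mc --triple=loongarch64 sketch.s
        .text
        call36 callee           # emits: pcaddu18i $ra, %call36(callee)
                                #        jirl      $ra, $ra, 0
        tail36 $t0, callee      # emits: pcaddu18i $t0, %call36(callee)
                                #        jirl      $zero, $t0, 0  (i.e. jr $t0)

Both instructions of each pair carry a single R_LARCH_CALL36 relocation on the pcaddu18i, which is why the series also moves pseudo expansion after scheduling so the pair stays adjacent.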
@@ -0,0 +1,5463 @@ +From 53a624f1fbb2d1f837070b400812e8bddf66fd3d Mon Sep 17 00:00:00 2001 +From: Lu Weining <luweining@loongson.cn> +Date: Tue, 5 Dec 2023 09:20:48 +0800 +Subject: PATCH 01/12 BinaryFormatLoongArch Define psABI v2.20 relocs for + R_LARCH_CALL36(#73345) + +R_LARCH_CALL36 was designed for function call on medium code model where +the 2 instructions (pcaddu18i + jirl) must be adjacent. + +(cherry picked from commit c3a9c905fbc486add75e16218fe58a04b7b6c282) +--- + llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def | 6 ++++++ + .../tools/llvm-readobj/ELF/reloc-types-loongarch64.test | 2 ++ + llvm/unittests/Object/ELFTest.cpp | 2 ++ + 3 files changed, 10 insertions(+) + +diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def +index 02bce3c71712..c4393432677b 100644 +--- a/llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def ++++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def +@@ -118,3 +118,9 @@ ELF_RELOC(R_LARCH_SUB6, 106) + ELF_RELOC(R_LARCH_ADD_ULEB128, 107) + ELF_RELOC(R_LARCH_SUB_ULEB128, 108) + ELF_RELOC(R_LARCH_64_PCREL, 109) ++ ++// Relocs added in ELF for the LoongArch™ Architecture v20231102, part of the ++// v2.20 LoongArch ABI specs. ++// ++// Spec addition: https://github.com/loongson/la-abi-specs/pull/4 ++ELF_RELOC(R_LARCH_CALL36, 110) +diff --git a/llvm/test/tools/llvm-readobj/ELF/reloc-types-loongarch64.test b/llvm/test/tools/llvm-readobj/ELF/reloc-types-loongarch64.test +index e32dc893fa79..88ff7fa405ed 100644 +--- a/llvm/test/tools/llvm-readobj/ELF/reloc-types-loongarch64.test ++++ b/llvm/test/tools/llvm-readobj/ELF/reloc-types-loongarch64.test +@@ -102,6 +102,7 @@ + # CHECK: Type: R_LARCH_ADD_ULEB128 (107) + # CHECK: Type: R_LARCH_SUB_ULEB128 (108) + # CHECK: Type: R_LARCH_64_PCREL (109) ++# CHECK: Type: R_LARCH_CALL36 (110) + + --- !ELF + FileHeader: +@@ -211,3 +212,4 @@ Sections: + - Type: R_LARCH_ADD_ULEB128 + - Type: R_LARCH_SUB_ULEB128 + - Type: R_LARCH_64_PCREL ++ - Type: R_LARCH_CALL36 +diff --git a/llvm/unittests/Object/ELFTest.cpp b/llvm/unittests/Object/ELFTest.cpp +index 50b1df124a4a..ed851dde4c00 100644 +--- a/llvm/unittests/Object/ELFTest.cpp ++++ b/llvm/unittests/Object/ELFTest.cpp +@@ -251,6 +251,8 @@ TEST(ELFTest, getELFRelocationTypeNameForLoongArch) { + getELFRelocationTypeName(EM_LOONGARCH, R_LARCH_SUB_ULEB128)); + EXPECT_EQ("R_LARCH_64_PCREL", + getELFRelocationTypeName(EM_LOONGARCH, R_LARCH_64_PCREL)); ++ EXPECT_EQ("R_LARCH_CALL36", ++ getELFRelocationTypeName(EM_LOONGARCH, R_LARCH_CALL36)); + } + + TEST(ELFTest, getELFRelativeRelocationType) { +-- +2.20.1 + + +From a8ed0f26220bbacb2c485a392f79ac4b271d73af Mon Sep 17 00:00:00 2001 +From: wanglei <wanglei@loongson.cn> +Date: Tue, 2 Jan 2024 10:55:02 +0800 +Subject: PATCH 02/12 LoongArch Emit function call code sequence as + `PCADDU18I+JIRL` in medium code model + +According to the description of the psABI v2.20: +https://github.com/loongson/la-abi-specs/releases/tag/v2.20, adjustments +are made to the function call instructions under the medium code model. + +At the same time, AsmParser has already supported parsing the call36 and +tail36 macro instructions. 
+ +(cherry picked from commit 2cf420d5b846a4733ef0ef7c8ed0ae0bfd1c6772) +--- + .../AsmParser/LoongArchAsmParser.cpp | 61 +++++++++++++++++++ + .../LoongArch/LoongArchExpandPseudoInsts.cpp | 29 ++++----- + .../Target/LoongArch/LoongArchInstrInfo.td | 23 ++++++- + .../Target/LoongArch/LoongArchMCInstLower.cpp | 3 + + .../LoongArch/LoongArchTargetMachine.cpp | 4 +- + .../MCTargetDesc/LoongArchBaseInfo.h | 1 + + .../MCTargetDesc/LoongArchELFObjectWriter.cpp | 2 + + .../MCTargetDesc/LoongArchFixupKinds.h | 3 + + .../MCTargetDesc/LoongArchMCCodeEmitter.cpp | 3 + + .../MCTargetDesc/LoongArchMCExpr.cpp | 3 + + .../LoongArch/MCTargetDesc/LoongArchMCExpr.h | 1 + + llvm/test/CodeGen/LoongArch/code-models.ll | 12 ++-- + .../MC/LoongArch/Basic/Integer/invalid64.s | 2 +- + llvm/test/MC/LoongArch/Macros/macros-call.s | 9 +++ + .../MC/LoongArch/Relocations/relocations.s | 5 ++ + 15 files changed, 133 insertions(+), 28 deletions(-) + create mode 100644 llvm/test/MC/LoongArch/Macros/macros-call.s + +diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +index a132e645c864..f908e5bc63d3 100644 +--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp ++++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +@@ -122,6 +122,10 @@ class LoongArchAsmParser : public MCTargetAsmParser { + // Helper to emit pseudo instruction "li.w/d $rd, $imm". + void emitLoadImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out); + ++ // Helper to emit pseudo instruction "call36 sym" or "tail36 $rj, sym". ++ void emitFuncCall36(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, ++ bool IsTailCall); ++ + public: + enum LoongArchMatchResultTy { + Match_Dummy = FIRST_TARGET_MATCH_RESULT_TY, +@@ -401,6 +405,22 @@ public: + IsValidKind; + } + ++ bool isSImm20pcaddu18i() const { ++ if (!isImm()) ++ return false; ++ ++ int64_t Imm; ++ LoongArchMCExpr::VariantKind VK = LoongArchMCExpr::VK_LoongArch_None; ++ bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); ++ bool IsValidKind = VK == LoongArchMCExpr::VK_LoongArch_None || ++ VK == LoongArchMCExpr::VK_LoongArch_CALL36; ++ ++ return IsConstantImm ++ ? isInt<20>(Imm) && IsValidKind ++ : LoongArchAsmParser::classifySymbolRef(getImm(), VK) && ++ IsValidKind; ++ } ++ + bool isSImm21lsl2() const { + if (!isImm()) + return false; +@@ -1111,6 +1131,35 @@ void LoongArchAsmParser::emitLoadImm(MCInst &Inst, SMLoc IDLoc, + } + } + ++void LoongArchAsmParser::emitFuncCall36(MCInst &Inst, SMLoc IDLoc, ++ MCStreamer &Out, bool IsTailCall) { ++ // call36 sym ++ // expands to: ++ // pcaddu18i $ra, %call36(sym) ++ // jirl $ra, $ra, 0 ++ // ++ // tail36 $rj, sym ++ // expands to: ++ // pcaddu18i $rj, %call36(sym) ++ // jirl $r0, $rj, 0 ++ unsigned ScratchReg = ++ IsTailCall ? Inst.getOperand(0).getReg() : (unsigned)LoongArch::R1; ++ const MCExpr *Sym = ++ IsTailCall ? Inst.getOperand(1).getExpr() : Inst.getOperand(0).getExpr(); ++ const LoongArchMCExpr *LE = LoongArchMCExpr::create( ++ Sym, llvm::LoongArchMCExpr::VK_LoongArch_CALL36, getContext()); ++ ++ Out.emitInstruction( ++ MCInstBuilder(LoongArch::PCADDU18I).addReg(ScratchReg).addExpr(LE), ++ getSTI()); ++ Out.emitInstruction( ++ MCInstBuilder(LoongArch::JIRL) ++ .addReg(IsTailCall ? 
(unsigned)LoongArch::R0 : ScratchReg) ++ .addReg(ScratchReg) ++ .addImm(0), ++ getSTI()); ++} ++ + bool LoongArchAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, + OperandVector &Operands, + MCStreamer &Out) { +@@ -1159,6 +1208,12 @@ bool LoongArchAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, + case LoongArch::PseudoLI_D: + emitLoadImm(Inst, IDLoc, Out); + return false; ++ case LoongArch::PseudoCALL36: ++ emitFuncCall36(Inst, IDLoc, Out, /*IsTailCall=*/false); ++ return false; ++ case LoongArch::PseudoTAIL36: ++ emitFuncCall36(Inst, IDLoc, Out, /*IsTailCall=*/true); ++ return false; + } + Out.emitInstruction(Inst, getSTI()); + return false; +@@ -1440,6 +1495,12 @@ bool LoongArchAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + /*Upper=*/(1 << 19) - 1, + "operand must be a symbol with modifier (e.g. %pc_hi20) or an integer " + "in the range"); ++ case Match_InvalidSImm20pcaddu18i: ++ return generateImmOutOfRangeError( ++ Operands, ErrorInfo, /*Lower=*/-(1 << 19), ++ /*Upper=*/(1 << 19) - 1, ++ "operand must be a symbol with modifier (e.g. %call36) or an integer " ++ "in the range"); + case Match_InvalidSImm21lsl2: + return generateImmOutOfRangeError( + Operands, ErrorInfo, /*Lower=*/-(1 << 22), /*Upper=*/(1 << 22) - 4, +diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp +index 72c1f1cec198..8eda2dcc1633 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp +@@ -458,11 +458,11 @@ bool LoongArchPreRAExpandPseudo::expandFunctionCALL( + } + case CodeModel::Medium: { + // CALL: +- // pcalau12i $ra, %pc_hi20(func) +- // jirl $ra, $ra, %pc_lo12(func) ++ // pcaddu18i $ra, %call36(func) ++ // jirl $ra, $ra, 0 + // TAIL: +- // pcalau12i $scratch, %pc_hi20(func) +- // jirl $r0, $scratch, %pc_lo12(func) ++ // pcaddu18i $scratch, %call36(func) ++ // jirl $r0, $scratch, 0 + Opcode = + IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; + Register ScratchReg = +@@ -470,18 +470,15 @@ bool LoongArchPreRAExpandPseudo::expandFunctionCALL( + ? 
MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) + : LoongArch::R1; + MachineInstrBuilder MIB = +- BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), ScratchReg); +- CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg); +- if (Func.isSymbol()) { +- const char *FnName = Func.getSymbolName(); +- MIB.addExternalSymbol(FnName, LoongArchII::MO_PCREL_HI); +- CALL.addExternalSymbol(FnName, LoongArchII::MO_PCREL_LO); +- break; +- } +- assert(Func.isGlobal() && "Expected a GlobalValue at this time"); +- const GlobalValue *GV = Func.getGlobal(); +- MIB.addGlobalAddress(GV, 0, LoongArchII::MO_PCREL_HI); +- CALL.addGlobalAddress(GV, 0, LoongArchII::MO_PCREL_LO); ++ BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg); ++ ++ CALL = ++ BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0); ++ ++ if (Func.isSymbol()) ++ MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36); ++ else ++ MIB.addDisp(Func, 0, LoongArchII::MO_CALL36); + break; + } + case CodeModel::Large: { +diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +index ab1890556814..67de5f7afd78 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +@@ -351,6 +351,10 @@ def simm20_lu32id : SImm20Operand { + let ParserMatchClass = SImmAsmOperand<20, "lu32id">; + } + ++def simm20_pcaddu18i : SImm20Operand { ++ let ParserMatchClass = SImmAsmOperand<20, "pcaddu18i">; ++} ++ + def simm21_lsl2 : Operand<OtherVT> { + let ParserMatchClass = SImmAsmOperand<21, "lsl2">; + let EncoderMethod = "getImmOpValueAsr<2>"; +@@ -772,7 +776,7 @@ def LU32I_D : Fmt1RI20<0x16000000, (outs GPR:$dst), + "$rd, $imm20">; + } + def LU52I_D : ALU_2RI12<0x03000000, simm12_lu52id>; +-def PCADDU18I : ALU_1RI20<0x1e000000, simm20>; ++def PCADDU18I : ALU_1RI20<0x1e000000, simm20_pcaddu18i>; + def MUL_D : ALU_3R<0x001d8000>; + def MULH_D : ALU_3R<0x001e0000>; + def MULH_DU : ALU_3R<0x001e8000>; +@@ -1324,7 +1328,7 @@ def : Pat<(brind (add GPR:$rj, simm16_lsl2:$imm16)), + (PseudoBRIND GPR:$rj, simm16_lsl2:$imm16)>; + + let isCall = 1, Defs = R1 in +-def PseudoCALL : Pseudo<(outs), (ins simm26_symbol:$func)>; ++def PseudoCALL : Pseudo<(outs), (ins bare_symbol:$func)>; + + def : Pat<(loongarch_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>; + def : Pat<(loongarch_call texternalsym:$func), (PseudoCALL texternalsym:$func)>; +@@ -1344,7 +1348,7 @@ def PseudoRET : Pseudo<(outs), (ins), (loongarch_ret)>, + PseudoInstExpansion<(JIRL R0, R1, 0)>; + + let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = R3 in +-def PseudoTAIL : Pseudo<(outs), (ins simm26_symbol:$dst)>; ++def PseudoTAIL : Pseudo<(outs), (ins bare_symbol:$dst)>; + + def : Pat<(loongarch_tail (iPTR tglobaladdr:$dst)), + (PseudoTAIL tglobaladdr:$dst)>; +@@ -1367,6 +1371,19 @@ def PseudoJIRL_TAIL : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>, + PseudoInstExpansion<(JIRL R0, GPR:$rj, + simm16_lsl2:$imm16)>; + ++/// call36/taill36 macro instructions ++let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, isAsmParserOnly = 1, ++ Defs = R1, Size = 8, hasSideEffects = 0, mayStore = 0, mayLoad = 0 in ++def PseudoCALL36 : Pseudo<(outs), (ins bare_symbol:$dst), , ++ "call36", "$dst">, ++ Requires<IsLA64>; ++let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = R3, ++ isCodeGenOnly = 0, isAsmParserOnly = 1, Size = 8, hasSideEffects = 0, ++ mayStore = 0, mayLoad = 0 in ++def PseudoTAIL36 : 
Pseudo<(outs), (ins GPR:$tmp, bare_symbol:$dst), , ++ "tail36", "$tmp, $dst">, ++ Requires<IsLA64>; ++ + /// Load address (la*) macro instructions. + + // Define isCodeGenOnly = 0 to expose them to tablegened assembly parser. +diff --git a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp +index 5daa9481c907..98ad49f25e3f 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp +@@ -95,6 +95,9 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym, + case LoongArchII::MO_GD_PC_HI: + Kind = LoongArchMCExpr::VK_LoongArch_TLS_GD_PC_HI20; + break; ++ case LoongArchII::MO_CALL36: ++ Kind = LoongArchMCExpr::VK_LoongArch_CALL36; ++ break; + // TODO: Handle more target-flags. + } + +diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +index d0a4e9375048..0efc5e6ebb99 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +@@ -63,11 +63,11 @@ getEffectiveLoongArchCodeModel(const Triple &TT, + + switch (*CM) { + case CodeModel::Small: +- case CodeModel::Medium: + return *CM; ++ case CodeModel::Medium: + case CodeModel::Large: + if (!TT.isArch64Bit()) +- report_fatal_error("Large code model requires LA64"); ++ report_fatal_error("Medium/Large code model requires LA64"); + return *CM; + default: + report_fatal_error( +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h +index cee6dad1f095..0692cb92b694 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h +@@ -47,6 +47,7 @@ enum { + MO_IE_PC64_HI, + MO_LD_PC_HI, + MO_GD_PC_HI, ++ MO_CALL36 + // TODO: Add more flags. + }; + } // end namespace LoongArchII +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +index e60b9c2cfd97..0a52380dd2cd 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +@@ -90,6 +90,8 @@ unsigned LoongArchELFObjectWriter::getRelocType(MCContext &Ctx, + return ELF::R_LARCH_TLS_LE64_LO20; + case LoongArch::fixup_loongarch_tls_le64_hi12: + return ELF::R_LARCH_TLS_LE64_HI12; ++ case LoongArch::fixup_loongarch_call36: ++ return ELF::R_LARCH_CALL36; + // TODO: Handle more fixup-kinds. + } + } +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h +index 78414408f21f..0d19d2b0fb1f 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h +@@ -111,6 +111,9 @@ enum Fixups { + fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX, + // Generate an R_LARCH_ALIGN which indicates the linker may fixup align here. + fixup_loongarch_align = FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN, ++ // 36-bit fixup corresponding to %call36(foo) for a pair instructions: ++ // pcaddu18i+jirl. 
++ fixup_loongarch_call36 = FirstLiteralRelocationKind + ELF::R_LARCH_CALL36, + }; + } // end namespace LoongArch + } // end namespace llvm +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +index 09d92ac9aa3a..7c4fe9674d4e 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +@@ -241,6 +241,9 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO, + case LoongArchMCExpr::VK_LoongArch_TLS_GD_HI20: + FixupKind = LoongArch::fixup_loongarch_tls_gd_hi20; + break; ++ case LoongArchMCExpr::VK_LoongArch_CALL36: ++ FixupKind = LoongArch::fixup_loongarch_call36; ++ break; + } + } else if (Kind == MCExpr::SymbolRef && + cast<MCSymbolRefExpr>(Expr)->getKind() == +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp +index 82c992b1cc8c..8ca8876a19b9 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp +@@ -138,6 +138,8 @@ StringRef LoongArchMCExpr::getVariantKindName(VariantKind Kind) { + return "gd_pc_hi20"; + case VK_LoongArch_TLS_GD_HI20: + return "gd_hi20"; ++ case VK_LoongArch_CALL36: ++ return "call36"; + } + } + +@@ -180,6 +182,7 @@ LoongArchMCExpr::getVariantKindForName(StringRef name) { + .Case("ld_hi20", VK_LoongArch_TLS_LD_HI20) + .Case("gd_pc_hi20", VK_LoongArch_TLS_GD_PC_HI20) + .Case("gd_hi20", VK_LoongArch_TLS_GD_HI20) ++ .Case("call36", VK_LoongArch_CALL36) + .Default(VK_LoongArch_Invalid); + } + +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h +index 93251f824103..bd828116d7fa 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h +@@ -61,6 +61,7 @@ public: + VK_LoongArch_TLS_LD_HI20, + VK_LoongArch_TLS_GD_PC_HI20, + VK_LoongArch_TLS_GD_HI20, ++ VK_LoongArch_CALL36, + VK_LoongArch_Invalid // Must be the last item. 
+ }; + +diff --git a/llvm/test/CodeGen/LoongArch/code-models.ll b/llvm/test/CodeGen/LoongArch/code-models.ll +index c610f645a06a..7c6f46d5e926 100644 +--- a/llvm/test/CodeGen/LoongArch/code-models.ll ++++ b/llvm/test/CodeGen/LoongArch/code-models.ll +@@ -23,8 +23,8 @@ define i32 @call_globaladdress(i32 %a) nounwind { + ; MEDIUM: # %bb.0: + ; MEDIUM-NEXT: addi.d $sp, $sp, -16 + ; MEDIUM-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +-; MEDIUM-NEXT: pcalau12i $ra, %pc_hi20(callee) +-; MEDIUM-NEXT: jirl $ra, $ra, %pc_lo12(callee) ++; MEDIUM-NEXT: pcaddu18i $ra, %call36(callee) ++; MEDIUM-NEXT: jirl $ra, $ra, 0 + ; MEDIUM-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload + ; MEDIUM-NEXT: addi.d $sp, $sp, 16 + ; MEDIUM-NEXT: ret +@@ -68,8 +68,8 @@ define void @call_external_sym(ptr %dst) { + ; MEDIUM-NEXT: .cfi_offset 1, -8 + ; MEDIUM-NEXT: ori $a2, $zero, 1000 + ; MEDIUM-NEXT: move $a1, $zero +-; MEDIUM-NEXT: pcalau12i $ra, %pc_hi20(memset) +-; MEDIUM-NEXT: jirl $ra, $ra, %pc_lo12(memset) ++; MEDIUM-NEXT: pcaddu18i $ra, %call36(memset) ++; MEDIUM-NEXT: jirl $ra, $ra, 0 + ; MEDIUM-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload + ; MEDIUM-NEXT: addi.d $sp, $sp, 16 + ; MEDIUM-NEXT: ret +@@ -105,8 +105,8 @@ define i32 @caller_tail(i32 %i) nounwind { + ; + ; MEDIUM-LABEL: caller_tail: + ; MEDIUM: # %bb.0: # %entry +-; MEDIUM-NEXT: pcalau12i $a1, %pc_hi20(callee_tail) +-; MEDIUM-NEXT: jirl $zero, $a1, %pc_lo12(callee_tail) ++; MEDIUM-NEXT: pcaddu18i $a1, %call36(callee_tail) ++; MEDIUM-NEXT: jr $a1 + ; + ; LARGE-LABEL: caller_tail: + ; LARGE: # %bb.0: # %entry +diff --git a/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s b/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s +index acddca9432a6..1c1c658ad440 100644 +--- a/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s ++++ b/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s +@@ -65,7 +65,7 @@ addu16i.d $a0, $a0, 32768 + + ## simm20 + pcaddu18i $a0, 0x80000 +-# CHECK: :#@LINE-1:16: error: immediate must be an integer in the range -524288, 524287 ++# CHECK: :#@LINE-1:16: error: operand must be a symbol with modifier (e.g. %call36) or an integer in the range -524288, 524287 + + ## simm20_lu32id + lu32i.d $a0, 0x80000 +diff --git a/llvm/test/MC/LoongArch/Macros/macros-call.s b/llvm/test/MC/LoongArch/Macros/macros-call.s +new file mode 100644 +index 000000000000..a648a3978038 +--- /dev/null ++++ b/llvm/test/MC/LoongArch/Macros/macros-call.s +@@ -0,0 +1,9 @@ ++# RUN: llvm-mc --triple=loongarch64 %s | FileCheck %s ++ ++call36 sym_call ++# CHECK: pcaddu18i $ra, %call36(sym_call) ++# CHECK-NEXT: jirl $ra, $ra, 0 ++ ++tail36 $t0, sym_tail ++# CHECK: pcaddu18i $t0, %call36(sym_tail) ++# CHECK-NEXT: jr $t0 +diff --git a/llvm/test/MC/LoongArch/Relocations/relocations.s b/llvm/test/MC/LoongArch/Relocations/relocations.s +index 042cc93470a1..bec71e103893 100644 +--- a/llvm/test/MC/LoongArch/Relocations/relocations.s ++++ b/llvm/test/MC/LoongArch/Relocations/relocations.s +@@ -218,3 +218,8 @@ lu12i.w $t1, %gd_hi20(foo) + # RELOC: R_LARCH_TLS_GD_HI20 foo 0x0 + # INSTR: lu12i.w $t1, %gd_hi20(foo) + # FIXUP: fixup A - offset: 0, value: %gd_hi20(foo), kind: FK_NONE ++ ++pcaddu18i $t1, %call36(foo) ++# RELOC: R_LARCH_CALL36 foo 0x0 ++# INSTR: pcaddu18i $t1, %call36(foo) ++# FIXUP: fixup A - offset: 0, value: %call36(foo), kind: FK_NONE +-- +2.20.1 + + +From d59688f326d8f915ffc5db80b40c9b99d9f95470 Mon Sep 17 00:00:00 2001 +From: wanglei <wanglei@loongson.cn> +Date: Tue, 2 Jan 2024 10:57:40 +0800 +Subject: PATCH 03/12 LoongArch Pre-commit test for #76555. 
NFC + +(cherry picked from commit 3d6fc35b9071009c5ef37f879a12982c6a54db60) +--- + .../LoongArch/psabi-restricted-scheduling.ll | 172 ++++++++++++++++++ + 1 file changed, 172 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll + +diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll +new file mode 100644 +index 000000000000..150a935d7bf8 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll +@@ -0,0 +1,172 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --code-model=medium --post-RA-scheduler=0 < %s \ ++; RUN: | FileCheck %s --check-prefix=MEDIUM_NO_SCH ++; RUN: llc --mtriple=loongarch64 --code-model=medium --post-RA-scheduler=1 < %s \ ++; RUN: | FileCheck %s --check-prefix=MEDIUM_SCH ++; RUN: llc --mtriple=loongarch64 --code-model=large --post-RA-scheduler=0 < %s \ ++; RUN: | FileCheck %s --check-prefix=LARGE_NO_SCH ++; RUN: llc --mtriple=loongarch64 --code-model=large --post-RA-scheduler=1 < %s \ ++; RUN: | FileCheck %s --check-prefix=LARGE_SCH ++ ++;; FIXME: According to the description of the psABI v2.30, the code sequences ++;; of `PseudoLA*_LARGE` instruction and Medium code model's function call must ++;; be adjacent. ++ ++@g = dso_local global i64 zeroinitializer, align 4 ++@G = global i64 zeroinitializer, align 4 ++@gd = external thread_local global i64 ++@ld = external thread_local(localdynamic) global i64 ++@ie = external thread_local(initialexec) global i64 ++ ++declare ptr @bar(i64) ++ ++define void @foo() nounwind { ++; MEDIUM_NO_SCH-LABEL: foo: ++; MEDIUM_NO_SCH: # %bb.0: ++; MEDIUM_NO_SCH-NEXT: addi.d $sp, $sp, -16 ++; MEDIUM_NO_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ++; MEDIUM_NO_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) ++; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, %got_pc_lo12(G) ++; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, 0 ++; MEDIUM_NO_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) ++; MEDIUM_NO_SCH-NEXT: addi.d $a0, $a0, %pc_lo12(g) ++; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, 0 ++; MEDIUM_NO_SCH-NEXT: ori $a0, $zero, 1 ++; MEDIUM_NO_SCH-NEXT: pcaddu18i $ra, %call36(bar) ++; MEDIUM_NO_SCH-NEXT: jirl $ra, $ra, 0 ++; MEDIUM_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) ++; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(gd) ++; MEDIUM_NO_SCH-NEXT: ldx.d $a0, $a0, $tp ++; MEDIUM_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) ++; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(ld) ++; MEDIUM_NO_SCH-NEXT: ldx.d $a0, $a0, $tp ++; MEDIUM_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) ++; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(ie) ++; MEDIUM_NO_SCH-NEXT: ldx.d $a0, $a0, $tp ++; MEDIUM_NO_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ++; MEDIUM_NO_SCH-NEXT: addi.d $sp, $sp, 16 ++; MEDIUM_NO_SCH-NEXT: ret ++; ++; MEDIUM_SCH-LABEL: foo: ++; MEDIUM_SCH: # %bb.0: ++; MEDIUM_SCH-NEXT: addi.d $sp, $sp, -16 ++; MEDIUM_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ++; MEDIUM_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) ++; MEDIUM_SCH-NEXT: pcaddu18i $ra, %call36(bar) ++; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %got_pc_lo12(G) ++; MEDIUM_SCH-NEXT: ld.d $a0, $a0, 0 ++; MEDIUM_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) ++; MEDIUM_SCH-NEXT: addi.d $a0, $a0, %pc_lo12(g) ++; MEDIUM_SCH-NEXT: ld.d $a0, $a0, 0 ++; MEDIUM_SCH-NEXT: ori $a0, $zero, 1 ++; MEDIUM_SCH-NEXT: jirl $ra, $ra, 0 ++; MEDIUM_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) ++; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(gd) 
++; MEDIUM_SCH-NEXT: ldx.d $a0, $a0, $tp ++; MEDIUM_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) ++; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(ld) ++; MEDIUM_SCH-NEXT: ldx.d $a0, $a0, $tp ++; MEDIUM_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) ++; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(ie) ++; MEDIUM_SCH-NEXT: ldx.d $a0, $a0, $tp ++; MEDIUM_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ++; MEDIUM_SCH-NEXT: addi.d $sp, $sp, 16 ++; MEDIUM_SCH-NEXT: ret ++; ++; LARGE_NO_SCH-LABEL: foo: ++; LARGE_NO_SCH: # %bb.0: ++; LARGE_NO_SCH-NEXT: addi.d $sp, $sp, -16 ++; LARGE_NO_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ++; LARGE_NO_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) ++; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %got_pc_lo12(G) ++; LARGE_NO_SCH-NEXT: lu32i.d $a1, %got64_pc_lo20(G) ++; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G) ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: ld.d $a0, $a0, 0 ++; LARGE_NO_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) ++; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %pc_lo12(g) ++; LARGE_NO_SCH-NEXT: lu32i.d $a1, %pc64_lo20(g) ++; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g) ++; LARGE_NO_SCH-NEXT: add.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: ld.d $a0, $a0, 0 ++; LARGE_NO_SCH-NEXT: ori $a0, $zero, 1 ++; LARGE_NO_SCH-NEXT: pcalau12i $a1, %got_pc_hi20(bar) ++; LARGE_NO_SCH-NEXT: addi.d $ra, $zero, %got_pc_lo12(bar) ++; LARGE_NO_SCH-NEXT: lu32i.d $ra, %got64_pc_lo20(bar) ++; LARGE_NO_SCH-NEXT: lu52i.d $ra, $ra, %got64_pc_hi12(bar) ++; LARGE_NO_SCH-NEXT: ldx.d $ra, $ra, $a1 ++; LARGE_NO_SCH-NEXT: jirl $ra, $ra, 0 ++; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) ++; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(gd) ++; LARGE_NO_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(gd) ++; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(gd) ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $a0, $tp ++; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) ++; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ld) ++; LARGE_NO_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ld) ++; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ld) ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $a0, $tp ++; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) ++; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie) ++; LARGE_NO_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie) ++; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie) ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $a0, $tp ++; LARGE_NO_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ++; LARGE_NO_SCH-NEXT: addi.d $sp, $sp, 16 ++; LARGE_NO_SCH-NEXT: ret ++; ++; LARGE_SCH-LABEL: foo: ++; LARGE_SCH: # %bb.0: ++; LARGE_SCH-NEXT: addi.d $sp, $sp, -16 ++; LARGE_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ++; LARGE_SCH-NEXT: addi.d $a1, $zero, %got_pc_lo12(G) ++; LARGE_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) ++; LARGE_SCH-NEXT: addi.d $ra, $zero, %got_pc_lo12(bar) ++; LARGE_SCH-NEXT: lu32i.d $a1, %got64_pc_lo20(G) ++; LARGE_SCH-NEXT: lu32i.d $ra, %got64_pc_lo20(bar) ++; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G) ++; LARGE_SCH-NEXT: lu52i.d $ra, $ra, %got64_pc_hi12(bar) ++; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_SCH-NEXT: addi.d $a1, $zero, %pc_lo12(g) ++; LARGE_SCH-NEXT: lu32i.d $a1, %pc64_lo20(g) ++; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g) ++; LARGE_SCH-NEXT: ld.d $a0, $a0, 0 ++; LARGE_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) ++; LARGE_SCH-NEXT: add.d $a0, $a1, $a0 ++; LARGE_SCH-NEXT: pcalau12i $a1, %got_pc_hi20(bar) ++; 
LARGE_SCH-NEXT: ld.d $a0, $a0, 0 ++; LARGE_SCH-NEXT: ldx.d $ra, $ra, $a1 ++; LARGE_SCH-NEXT: ori $a0, $zero, 1 ++; LARGE_SCH-NEXT: jirl $ra, $ra, 0 ++; LARGE_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(gd) ++; LARGE_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) ++; LARGE_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(gd) ++; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(gd) ++; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ld) ++; LARGE_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ld) ++; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ld) ++; LARGE_SCH-NEXT: ldx.d $a0, $a0, $tp ++; LARGE_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) ++; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie) ++; LARGE_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie) ++; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie) ++; LARGE_SCH-NEXT: ldx.d $a0, $a0, $tp ++; LARGE_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) ++; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_SCH-NEXT: ldx.d $a0, $a0, $tp ++; LARGE_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ++; LARGE_SCH-NEXT: addi.d $sp, $sp, 16 ++; LARGE_SCH-NEXT: ret ++ %V = load volatile i64, ptr @G ++ %v = load volatile i64, ptr @g ++ call void @bar(i64 1) ++ %v_gd = load volatile i64, ptr @gd ++ %v_ld = load volatile i64, ptr @ld ++ %v_ie = load volatile i64, ptr @ie ++ ret void ++} +-- +2.20.1 + + +From 1248440ab618fcffada7fa29eed71bc04945c3ec Mon Sep 17 00:00:00 2001 +From: Weining Lu <luweining@loongson.cn> +Date: Tue, 25 Jun 2024 09:52:17 +0800 +Subject: PATCH 04/12 LoongArchtest Remove the FIXME in + psabi-restricted-scheduling.ll which has been addressed by #76555 + +(cherry picked from commit 7ea63b9db4198688873036f3b0b81f9124076f7a) +--- + llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll | 4 ---- + 1 file changed, 4 deletions(-) + +diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll +index 150a935d7bf8..a515939b9c2b 100644 +--- a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll ++++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll +@@ -8,10 +8,6 @@ + ; RUN: llc --mtriple=loongarch64 --code-model=large --post-RA-scheduler=1 < %s \ + ; RUN: | FileCheck %s --check-prefix=LARGE_SCH + +-;; FIXME: According to the description of the psABI v2.30, the code sequences +-;; of `PseudoLA*_LARGE` instruction and Medium code model's function call must +-;; be adjacent. +- + @g = dso_local global i64 zeroinitializer, align 4 + @G = global i64 zeroinitializer, align 4 + @gd = external thread_local global i64 +-- +2.20.1 + + +From 0e86ae628414dac6d7ef2eaccc8655d790595f9f Mon Sep 17 00:00:00 2001 +From: wanglei <wanglei@loongson.cn> +Date: Tue, 2 Jan 2024 10:57:15 +0800 +Subject: PATCH 05/12 LoongArch Reimplement the expansion of + PseudoLA*_LARGE instructions (#76555) + +According to the description of the psABI v2.30: +https://github.com/loongson/la-abi-specs/releases/tag/v2.30, moved the +expansion of relevant pseudo-instructions from +`LoongArchPreRAExpandPseudo` pass to `LoongArchExpandPseudo` pass, to +ensure that the code sequences of `PseudoLA*_LARGE` instructions and +Medium code model's function call are not scheduled. 
+ +(cherry picked from commit c56a5e895a96fec4292e9333d998cfa88770432a) +--- + .../LoongArch/LoongArchExpandPseudoInsts.cpp | 519 +++++++++--------- + .../LoongArch/LoongArchISelLowering.cpp | 24 +- + .../Target/LoongArch/LoongArchISelLowering.h | 4 + + .../Target/LoongArch/LoongArchInstrInfo.td | 83 ++- + llvm/test/CodeGen/LoongArch/code-models.ll | 36 +- + llvm/test/CodeGen/LoongArch/expand-call.ll | 2 +- + llvm/test/CodeGen/LoongArch/global-address.ll | 32 +- + .../LoongArch/psabi-restricted-scheduling.ll | 102 ++-- + llvm/test/CodeGen/LoongArch/tls-models.ll | 68 +-- + 9 files changed, 487 insertions(+), 383 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp +index 8eda2dcc1633..f977f176066a 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp +@@ -62,43 +62,24 @@ private: + MachineBasicBlock::iterator &NextMBBI, + unsigned FlagsHi, unsigned SecondOpcode, + unsigned FlagsLo); +- bool expandLargeAddressLoad(MachineBasicBlock &MBB, +- MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, +- unsigned LastOpcode, unsigned IdentifyingMO); +- bool expandLargeAddressLoad(MachineBasicBlock &MBB, +- MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, +- unsigned LastOpcode, unsigned IdentifyingMO, +- const MachineOperand &Symbol, Register DestReg, +- bool EraseFromParent); + bool expandLoadAddressPcrel(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, +- bool Large = false); ++ MachineBasicBlock::iterator &NextMBBI); + bool expandLoadAddressGot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, +- bool Large = false); ++ MachineBasicBlock::iterator &NextMBBI); + bool expandLoadAddressTLSLE(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + bool expandLoadAddressTLSIE(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, +- bool Large = false); ++ MachineBasicBlock::iterator &NextMBBI); + bool expandLoadAddressTLSLD(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, +- bool Large = false); ++ MachineBasicBlock::iterator &NextMBBI); + bool expandLoadAddressTLSGD(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, +- bool Large = false); +- bool expandFunctionCALL(MachineBasicBlock &MBB, +- MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, +- bool IsTailCall); ++ MachineBasicBlock::iterator &NextMBBI); + }; + + char LoongArchPreRAExpandPseudo::ID = 0; +@@ -131,30 +112,16 @@ bool LoongArchPreRAExpandPseudo::expandMI( + switch (MBBI->getOpcode()) { + case LoongArch::PseudoLA_PCREL: + return expandLoadAddressPcrel(MBB, MBBI, NextMBBI); +- case LoongArch::PseudoLA_PCREL_LARGE: +- return expandLoadAddressPcrel(MBB, MBBI, NextMBBI, /*Large=*/true); + case LoongArch::PseudoLA_GOT: + return expandLoadAddressGot(MBB, MBBI, NextMBBI); +- case LoongArch::PseudoLA_GOT_LARGE: +- return expandLoadAddressGot(MBB, MBBI, NextMBBI, /*Large=*/true); + case LoongArch::PseudoLA_TLS_LE: + return expandLoadAddressTLSLE(MBB, MBBI, NextMBBI); + case LoongArch::PseudoLA_TLS_IE: + return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI); +- case LoongArch::PseudoLA_TLS_IE_LARGE: +- 
return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI, /*Large=*/true); + case LoongArch::PseudoLA_TLS_LD: + return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI); +- case LoongArch::PseudoLA_TLS_LD_LARGE: +- return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI, /*Large=*/true); + case LoongArch::PseudoLA_TLS_GD: + return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI); +- case LoongArch::PseudoLA_TLS_GD_LARGE: +- return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI, /*Large=*/true); +- case LoongArch::PseudoCALL: +- return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false); +- case LoongArch::PseudoTAIL: +- return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true); + } + return false; + } +@@ -187,118 +154,9 @@ bool LoongArchPreRAExpandPseudo::expandPcalau12iInstPair( + return true; + } + +-bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad( +- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, +- unsigned IdentifyingMO) { +- MachineInstr &MI = *MBBI; +- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO, +- MI.getOperand(2), MI.getOperand(0).getReg(), +- true); +-} +- +-bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad( +- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, +- unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg, +- bool EraseFromParent) { +- // Code Sequence: +- // +- // Part1: pcalau12i $scratch, %MO1(sym) +- // Part0: addi.d $dest, $zero, %MO0(sym) +- // Part2: lu32i.d $dest, %MO2(sym) +- // Part3: lu52i.d $dest, $dest, %MO3(sym) +- // Fin: LastOpcode $dest, $dest, $scratch +- +- unsigned MO0, MO1, MO2, MO3; +- switch (IdentifyingMO) { +- default: +- llvm_unreachable("unsupported identifying MO"); +- case LoongArchII::MO_PCREL_LO: +- MO0 = IdentifyingMO; +- MO1 = LoongArchII::MO_PCREL_HI; +- MO2 = LoongArchII::MO_PCREL64_LO; +- MO3 = LoongArchII::MO_PCREL64_HI; +- break; +- case LoongArchII::MO_GOT_PC_HI: +- case LoongArchII::MO_LD_PC_HI: +- case LoongArchII::MO_GD_PC_HI: +- // These cases relocate just like the GOT case, except for Part1. +- MO0 = LoongArchII::MO_GOT_PC_LO; +- MO1 = IdentifyingMO; +- MO2 = LoongArchII::MO_GOT_PC64_LO; +- MO3 = LoongArchII::MO_GOT_PC64_HI; +- break; +- case LoongArchII::MO_IE_PC_LO: +- MO0 = IdentifyingMO; +- MO1 = LoongArchII::MO_IE_PC_HI; +- MO2 = LoongArchII::MO_IE_PC64_LO; +- MO3 = LoongArchII::MO_IE_PC64_HI; +- break; +- } +- +- MachineFunction *MF = MBB.getParent(); +- MachineInstr &MI = *MBBI; +- DebugLoc DL = MI.getDebugLoc(); +- +- assert(MF->getSubtarget<LoongArchSubtarget>().is64Bit() && +- "Large code model requires LA64"); +- +- Register TmpPart1 = +- MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass); +- Register TmpPart0 = +- DestReg.isVirtual() +- ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) +- : DestReg; +- Register TmpParts02 = +- DestReg.isVirtual() +- ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) +- : DestReg; +- Register TmpParts023 = +- DestReg.isVirtual() +- ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) +- : DestReg; +- +- auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), TmpPart1); +- auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), TmpPart0) +- .addReg(LoongArch::R0); +- auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), TmpParts02) +- // "rj" is needed due to InstrInfo pattern requirement. 
+- .addReg(TmpPart0, RegState::Kill); +- auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), TmpParts023) +- .addReg(TmpParts02, RegState::Kill); +- BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg) +- .addReg(TmpParts023) +- .addReg(TmpPart1, RegState::Kill); +- +- if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) { +- const char *SymName = Symbol.getSymbolName(); +- Part0.addExternalSymbol(SymName, MO0); +- Part1.addExternalSymbol(SymName, MO1); +- Part2.addExternalSymbol(SymName, MO2); +- Part3.addExternalSymbol(SymName, MO3); +- } else { +- Part0.addDisp(Symbol, 0, MO0); +- Part1.addDisp(Symbol, 0, MO1); +- Part2.addDisp(Symbol, 0, MO2); +- Part3.addDisp(Symbol, 0, MO3); +- } +- +- if (EraseFromParent) +- MI.eraseFromParent(); +- +- return true; +-} +- + bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, bool Large) { +- if (Large) +- // Emit the 5-insn large address load sequence with the `%pc` family of +- // relocs. +- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, +- LoongArchII::MO_PCREL_LO); +- ++ MachineBasicBlock::iterator &NextMBBI) { + // Code Sequence: + // pcalau12i $rd, %pc_hi20(sym) + // addi.w/d $rd, $rd, %pc_lo12(sym) +@@ -311,13 +169,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel( + + bool LoongArchPreRAExpandPseudo::expandLoadAddressGot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, bool Large) { +- if (Large) +- // Emit the 5-insn large address load sequence with the `%got_pc` family +- // of relocs, loading the result from GOT with `ldx.d` in the end. +- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, +- LoongArchII::MO_GOT_PC_HI); +- ++ MachineBasicBlock::iterator &NextMBBI) { + // Code Sequence: + // pcalau12i $rd, %got_pc_hi20(sym) + // ld.w/d $rd, $rd, %got_pc_lo12(sym) +@@ -378,13 +230,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLE( + + bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, bool Large) { +- if (Large) +- // Emit the 5-insn large address load sequence with the `%ie_pc` family +- // of relocs, loading the result with `ldx.d` in the end. +- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, +- LoongArchII::MO_IE_PC_LO); +- ++ MachineBasicBlock::iterator &NextMBBI) { + // Code Sequence: + // pcalau12i $rd, %ie_pc_hi20(sym) + // ld.w/d $rd, $rd, %ie_pc_lo12(sym) +@@ -397,13 +243,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE( + + bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, bool Large) { +- if (Large) +- // Emit the 5-insn large address load sequence with the `%got_pc` family +- // of relocs, with the `pcalau12i` insn relocated with `%ld_pc_hi20`. 
+- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, +- LoongArchII::MO_LD_PC_HI); +- ++ MachineBasicBlock::iterator &NextMBBI) { + // Code Sequence: + // pcalau12i $rd, %ld_pc_hi20(sym) + // addi.w/d $rd, $rd, %got_pc_lo12(sym) +@@ -416,13 +256,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD( + + bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, bool Large) { +- if (Large) +- // Emit the 5-insn large address load sequence with the `%got_pc` family +- // of relocs, with the `pcalau12i` insn relocated with `%gd_pc_hi20`. +- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, +- LoongArchII::MO_GD_PC_HI); +- ++ MachineBasicBlock::iterator &NextMBBI) { + // Code Sequence: + // pcalau12i $rd, %gd_pc_hi20(sym) + // addi.w/d $rd, $rd, %got_pc_lo12(sym) +@@ -433,85 +267,6 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD( + SecondOpcode, LoongArchII::MO_GOT_PC_LO); + } + +-bool LoongArchPreRAExpandPseudo::expandFunctionCALL( +- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) { +- MachineFunction *MF = MBB.getParent(); +- MachineInstr &MI = *MBBI; +- DebugLoc DL = MI.getDebugLoc(); +- const MachineOperand &Func = MI.getOperand(0); +- MachineInstrBuilder CALL; +- unsigned Opcode; +- +- switch (MF->getTarget().getCodeModel()) { +- default: +- report_fatal_error("Unsupported code model"); +- break; +- case CodeModel::Small: { +- // CALL: +- // bl func +- // TAIL: +- // b func +- Opcode = IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL; +- CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func); +- break; +- } +- case CodeModel::Medium: { +- // CALL: +- // pcaddu18i $ra, %call36(func) +- // jirl $ra, $ra, 0 +- // TAIL: +- // pcaddu18i $scratch, %call36(func) +- // jirl $r0, $scratch, 0 +- Opcode = +- IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; +- Register ScratchReg = +- IsTailCall +- ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) +- : LoongArch::R1; +- MachineInstrBuilder MIB = +- BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg); +- +- CALL = +- BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0); +- +- if (Func.isSymbol()) +- MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36); +- else +- MIB.addDisp(Func, 0, LoongArchII::MO_CALL36); +- break; +- } +- case CodeModel::Large: { +- // Emit the 5-insn large address load sequence, either directly or +- // indirectly in case of going through the GOT, then JIRL_TAIL or +- // JIRL_CALL to $addr. +- Opcode = +- IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; +- Register AddrReg = +- IsTailCall +- ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) +- : LoongArch::R1; +- +- bool UseGOT = Func.isGlobal() && !Func.getGlobal()->isDSOLocal(); +- unsigned MO = UseGOT ? LoongArchII::MO_GOT_PC_HI : LoongArchII::MO_PCREL_LO; +- unsigned LAOpcode = UseGOT ? LoongArch::LDX_D : LoongArch::ADD_D; +- expandLargeAddressLoad(MBB, MBBI, NextMBBI, LAOpcode, MO, Func, AddrReg, +- false); +- CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(AddrReg).addImm(0); +- break; +- } +- } +- +- // Transfer implicit operands. +- CALL.copyImplicitOps(MI); +- +- // Transfer MI flags. 
+- CALL.setMIFlags(MI.getFlags()); +- +- MI.eraseFromParent(); +- return true; +-} +- + class LoongArchExpandPseudo : public MachineFunctionPass { + public: + const LoongArchInstrInfo *TII; +@@ -533,6 +288,35 @@ private: + MachineBasicBlock::iterator &NextMBBI); + bool expandCopyCFR(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); ++ bool expandLargeAddressLoad(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI, ++ unsigned LastOpcode, unsigned IdentifyingMO); ++ bool expandLargeAddressLoad(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI, ++ unsigned LastOpcode, unsigned IdentifyingMO, ++ const MachineOperand &Symbol, Register DestReg, ++ bool EraseFromParent); ++ bool expandLoadAddressPcrelLarge(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI); ++ bool expandLoadAddressGotLarge(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI); ++ bool expandLoadAddressTLSIELarge(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI); ++ bool expandLoadAddressTLSLDLarge(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI); ++ bool expandLoadAddressTLSGDLarge(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI); ++ bool expandFunctionCALL(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI, ++ bool IsTailCall); + }; + + char LoongArchExpandPseudo::ID = 0; +@@ -567,6 +351,24 @@ bool LoongArchExpandPseudo::expandMI(MachineBasicBlock &MBB, + switch (MBBI->getOpcode()) { + case LoongArch::PseudoCopyCFR: + return expandCopyCFR(MBB, MBBI, NextMBBI); ++ case LoongArch::PseudoLA_PCREL_LARGE: ++ return expandLoadAddressPcrelLarge(MBB, MBBI, NextMBBI); ++ case LoongArch::PseudoLA_GOT_LARGE: ++ return expandLoadAddressGotLarge(MBB, MBBI, NextMBBI); ++ case LoongArch::PseudoLA_TLS_IE_LARGE: ++ return expandLoadAddressTLSIELarge(MBB, MBBI, NextMBBI); ++ case LoongArch::PseudoLA_TLS_LD_LARGE: ++ return expandLoadAddressTLSLDLarge(MBB, MBBI, NextMBBI); ++ case LoongArch::PseudoLA_TLS_GD_LARGE: ++ return expandLoadAddressTLSGDLarge(MBB, MBBI, NextMBBI); ++ case LoongArch::PseudoCALL: ++ case LoongArch::PseudoCALL_MEDIUM: ++ case LoongArch::PseudoCALL_LARGE: ++ return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false); ++ case LoongArch::PseudoTAIL: ++ case LoongArch::PseudoTAIL_MEDIUM: ++ case LoongArch::PseudoTAIL_LARGE: ++ return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true); + } + + return false; +@@ -625,6 +427,213 @@ bool LoongArchExpandPseudo::expandCopyCFR( + return true; + } + ++bool LoongArchExpandPseudo::expandLargeAddressLoad( ++ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, ++ unsigned IdentifyingMO) { ++ MachineInstr &MI = *MBBI; ++ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO, ++ MI.getOperand(2), MI.getOperand(0).getReg(), ++ true); ++} ++ ++bool LoongArchExpandPseudo::expandLargeAddressLoad( ++ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, ++ unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg, ++ bool EraseFromParent) { ++ // Code 
Sequence: ++ // ++ // Part1: pcalau12i $dst, %MO1(sym) ++ // Part0: addi.d $t8, $zero, %MO0(sym) ++ // Part2: lu32i.d $t8, %MO2(sym) ++ // Part3: lu52i.d $t8, $t8, %MO3(sym) ++ // Fin: LastOpcode $dst, $t8, $dst ++ ++ unsigned MO0, MO1, MO2, MO3; ++ switch (IdentifyingMO) { ++ default: ++ llvm_unreachable("unsupported identifying MO"); ++ case LoongArchII::MO_PCREL_LO: ++ MO0 = IdentifyingMO; ++ MO1 = LoongArchII::MO_PCREL_HI; ++ MO2 = LoongArchII::MO_PCREL64_LO; ++ MO3 = LoongArchII::MO_PCREL64_HI; ++ break; ++ case LoongArchII::MO_GOT_PC_HI: ++ case LoongArchII::MO_LD_PC_HI: ++ case LoongArchII::MO_GD_PC_HI: ++ // These cases relocate just like the GOT case, except for Part1. ++ MO0 = LoongArchII::MO_GOT_PC_LO; ++ MO1 = IdentifyingMO; ++ MO2 = LoongArchII::MO_GOT_PC64_LO; ++ MO3 = LoongArchII::MO_GOT_PC64_HI; ++ break; ++ case LoongArchII::MO_IE_PC_LO: ++ MO0 = IdentifyingMO; ++ MO1 = LoongArchII::MO_IE_PC_HI; ++ MO2 = LoongArchII::MO_IE_PC64_LO; ++ MO3 = LoongArchII::MO_IE_PC64_HI; ++ break; ++ } ++ ++ MachineFunction *MF = MBB.getParent(); ++ MachineInstr &MI = *MBBI; ++ DebugLoc DL = MI.getDebugLoc(); ++ Register ScratchReg = LoongArch::R20; // $t8 ++ ++ assert(MF->getSubtarget<LoongArchSubtarget>().is64Bit() && ++ "Large code model requires LA64"); ++ ++ auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), DestReg); ++ auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), ScratchReg) ++ .addReg(LoongArch::R0); ++ auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), ScratchReg) ++ // "rj" is needed due to InstrInfo pattern requirement. ++ .addReg(ScratchReg); ++ auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), ScratchReg) ++ .addReg(ScratchReg); ++ BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg) ++ .addReg(ScratchReg) ++ .addReg(DestReg); ++ ++ if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) { ++ const char *SymName = Symbol.getSymbolName(); ++ Part0.addExternalSymbol(SymName, MO0); ++ Part1.addExternalSymbol(SymName, MO1); ++ Part2.addExternalSymbol(SymName, MO2); ++ Part3.addExternalSymbol(SymName, MO3); ++ } else { ++ Part0.addDisp(Symbol, 0, MO0); ++ Part1.addDisp(Symbol, 0, MO1); ++ Part2.addDisp(Symbol, 0, MO2); ++ Part3.addDisp(Symbol, 0, MO3); ++ } ++ ++ if (EraseFromParent) ++ MI.eraseFromParent(); ++ ++ return true; ++} ++ ++bool LoongArchExpandPseudo::expandLoadAddressPcrelLarge( ++ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI) { ++ // Emit the 5-insn large address load sequence with the `%pc` family of ++ // relocs. ++ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, ++ LoongArchII::MO_PCREL_LO); ++} ++ ++bool LoongArchExpandPseudo::expandLoadAddressGotLarge( ++ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI) { ++ // Emit the 5-insn large address load sequence with the `%got_pc` family ++ // of relocs, loading the result from GOT with `ldx.d` in the end. ++ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, ++ LoongArchII::MO_GOT_PC_HI); ++} ++ ++bool LoongArchExpandPseudo::expandLoadAddressTLSIELarge( ++ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI) { ++ // Emit the 5-insn large address load sequence with the `%ie_pc` family ++ // of relocs, loading the result with `ldx.d` in the end. 
++ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, ++ LoongArchII::MO_IE_PC_LO); ++} ++ ++bool LoongArchExpandPseudo::expandLoadAddressTLSLDLarge( ++ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI) { ++ // Emit the 5-insn large address load sequence with the `%got_pc` family ++ // of relocs, with the `pcalau12i` insn relocated with `%ld_pc_hi20`. ++ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, ++ LoongArchII::MO_LD_PC_HI); ++} ++ ++bool LoongArchExpandPseudo::expandLoadAddressTLSGDLarge( ++ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI) { ++ // Emit the 5-insn large address load sequence with the `%got_pc` family ++ // of relocs, with the `pcalau12i` insn relocated with `%gd_pc_hi20`. ++ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, ++ LoongArchII::MO_GD_PC_HI); ++} ++ ++bool LoongArchExpandPseudo::expandFunctionCALL( ++ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) { ++ MachineFunction *MF = MBB.getParent(); ++ MachineInstr &MI = *MBBI; ++ DebugLoc DL = MI.getDebugLoc(); ++ const MachineOperand &Func = MI.getOperand(0); ++ MachineInstrBuilder CALL; ++ unsigned Opcode; ++ ++ switch (MF->getTarget().getCodeModel()) { ++ default: ++ report_fatal_error("Unsupported code model"); ++ break; ++ case CodeModel::Small: { ++ // CALL: ++ // bl func ++ // TAIL: ++ // b func ++ Opcode = IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL; ++ CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func); ++ break; ++ } ++ case CodeModel::Medium: { ++ // CALL: ++ // pcaddu18i $ra, %call36(func) ++ // jirl $ra, $ra, 0 ++ // TAIL: ++ // pcaddu18i $t8, %call36(func) ++ // jr $t8 ++ Opcode = ++ IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; ++ Register ScratchReg = IsTailCall ? LoongArch::R20 : LoongArch::R1; ++ MachineInstrBuilder MIB = ++ BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg); ++ ++ CALL = ++ BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0); ++ ++ if (Func.isSymbol()) ++ MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36); ++ else ++ MIB.addDisp(Func, 0, LoongArchII::MO_CALL36); ++ break; ++ } ++ case CodeModel::Large: { ++ // Emit the 5-insn large address load sequence, either directly or ++ // indirectly in case of going through the GOT, then JIRL_TAIL or ++ // JIRL_CALL to $addr. ++ Opcode = ++ IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; ++ Register AddrReg = IsTailCall ? LoongArch::R19 : LoongArch::R1; ++ ++ bool UseGOT = Func.isGlobal() && !Func.getGlobal()->isDSOLocal(); ++ unsigned MO = UseGOT ? LoongArchII::MO_GOT_PC_HI : LoongArchII::MO_PCREL_LO; ++ unsigned LAOpcode = UseGOT ? LoongArch::LDX_D : LoongArch::ADD_D; ++ expandLargeAddressLoad(MBB, MBBI, NextMBBI, LAOpcode, MO, Func, AddrReg, ++ false); ++ CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(AddrReg).addImm(0); ++ break; ++ } ++ } ++ ++ // Transfer implicit operands. ++ CALL.copyImplicitOps(MI); ++ ++ // Transfer MI flags. 
++ CALL.setMIFlags(MI.getFlags()); ++ ++ MI.eraseFromParent(); ++ return true; ++} ++ + } // end namespace + + INITIALIZE_PASS(LoongArchPreRAExpandPseudo, "loongarch-prera-expand-pseudo", +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index 4fc2b4709840..df1b17649b7d 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -3389,8 +3389,12 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { + + // TODO: Add more target-dependent nodes later. + NODE_NAME_CASE(CALL) ++ NODE_NAME_CASE(CALL_MEDIUM) ++ NODE_NAME_CASE(CALL_LARGE) + NODE_NAME_CASE(RET) + NODE_NAME_CASE(TAIL) ++ NODE_NAME_CASE(TAIL_MEDIUM) ++ NODE_NAME_CASE(TAIL_LARGE) + NODE_NAME_CASE(SLL_W) + NODE_NAME_CASE(SRA_W) + NODE_NAME_CASE(SRL_W) +@@ -4248,15 +4252,31 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI, + + // Emit the call. + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); ++ unsigned Op; ++ switch (DAG.getTarget().getCodeModel()) { ++ default: ++ report_fatal_error("Unsupported code model"); ++ case CodeModel::Small: ++ Op = IsTailCall ? LoongArchISD::TAIL : LoongArchISD::CALL; ++ break; ++ case CodeModel::Medium: ++ assert(Subtarget.is64Bit() && "Medium code model requires LA64"); ++ Op = IsTailCall ? LoongArchISD::TAIL_MEDIUM : LoongArchISD::CALL_MEDIUM; ++ break; ++ case CodeModel::Large: ++ assert(Subtarget.is64Bit() && "Large code model requires LA64"); ++ Op = IsTailCall ? LoongArchISD::TAIL_LARGE : LoongArchISD::CALL_LARGE; ++ break; ++ } + + if (IsTailCall) { + MF.getFrameInfo().setHasTailCall(); +- SDValue Ret = DAG.getNode(LoongArchISD::TAIL, DL, NodeTys, Ops); ++ SDValue Ret = DAG.getNode(Op, DL, NodeTys, Ops); + DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge); + return Ret; + } + +- Chain = DAG.getNode(LoongArchISD::CALL, DL, NodeTys, Ops); ++ Chain = DAG.getNode(Op, DL, NodeTys, Ops); + DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); + Glue = Chain.getValue(1); + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +index 2c9826a13237..a2ed149f4bb7 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +@@ -28,8 +28,12 @@ enum NodeType : unsigned { + + // TODO: add more LoongArchISDs + CALL, ++ CALL_MEDIUM, ++ CALL_LARGE, + RET, + TAIL, ++ TAIL_MEDIUM, ++ TAIL_LARGE, + + // 32-bit shifts, directly matching the semantics of the named LoongArch + // instructions. 
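A note on the TableGen changes below: each new pseudo records the byte size of its eventual expansion (Size = 8 for the two-instruction medium-model sequence, Size = 24 for the large-model five-instruction address load plus the indirect jump, Size = 20 for the large la.* loads) and declares its scratch-register clobbers (R20, i.e. $t8, plus R19 for large tail calls). The Size field travels into the instruction's MCInstrDesc, so size queries made before the expansion pass still return a usable byte count. A minimal sketch of such a query; the helper name is hypothetical, the MCInstrDesc lookup is the standard LLVM mechanism:

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/MC/MCInstrDesc.h"

// Passes that measure code distance before pseudo expansion (branch
// relaxation, for instance) can still size the unexpanded pseudos, because
// the TableGen 'Size' field is carried in the MCInstrDesc.
static unsigned declaredSize(const llvm::MachineInstr &MI) {
  return MI.getDesc().getSize(); // e.g. 24 for PseudoCALL_LARGE
}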
+diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+index 67de5f7afd78..ecd0c2b71b85 100644
+--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
++++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+@@ -69,6 +69,18 @@ def loongarch_ret : SDNode<"LoongArchISD::RET", SDTNone,
+ def loongarch_tail : SDNode<"LoongArchISD::TAIL", SDT_LoongArchCall,
+                             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                              SDNPVariadic]>;
++def loongarch_call_medium : SDNode<"LoongArchISD::CALL_MEDIUM", SDT_LoongArchCall,
++                                   [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
++                                    SDNPVariadic]>;
++def loongarch_tail_medium : SDNode<"LoongArchISD::TAIL_MEDIUM", SDT_LoongArchCall,
++                                   [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
++                                    SDNPVariadic]>;
++def loongarch_call_large : SDNode<"LoongArchISD::CALL_LARGE", SDT_LoongArchCall,
++                                  [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
++                                   SDNPVariadic]>;
++def loongarch_tail_large : SDNode<"LoongArchISD::TAIL_LARGE", SDT_LoongArchCall,
++                                  [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
++                                   SDNPVariadic]>;
+ def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>;
+ def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>;
+ def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>;
+@@ -1327,16 +1339,43 @@ def : Pat<(brind GPR:$rj), (PseudoBRIND GPR:$rj, 0)>;
+ def : Pat<(brind (add GPR:$rj, simm16_lsl2:$imm16)),
+           (PseudoBRIND GPR:$rj, simm16_lsl2:$imm16)>;
+ 
++// Function call with 'Small' code model.
+ let isCall = 1, Defs = [R1] in
+ def PseudoCALL : Pseudo<(outs), (ins bare_symbol:$func)>;
+ 
+ def : Pat<(loongarch_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>;
+ def : Pat<(loongarch_call texternalsym:$func), (PseudoCALL texternalsym:$func)>;
+ 
++// Function call with 'Medium' code model.
++let isCall = 1, Defs = [R1, R20], Size = 8 in
++def PseudoCALL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$func)>;
++
++let Predicates = [IsLA64] in {
++def : Pat<(loongarch_call_medium tglobaladdr:$func),
++          (PseudoCALL_MEDIUM tglobaladdr:$func)>;
++def : Pat<(loongarch_call_medium texternalsym:$func),
++          (PseudoCALL_MEDIUM texternalsym:$func)>;
++} // Predicates = [IsLA64]
++
++// Function call with 'Large' code model.
++let isCall = 1, Defs = [R1, R20], Size = 24 in
++def PseudoCALL_LARGE: Pseudo<(outs), (ins bare_symbol:$func)>;
++
++let Predicates = [IsLA64] in {
++def : Pat<(loongarch_call_large tglobaladdr:$func),
++          (PseudoCALL_LARGE tglobaladdr:$func)>;
++def : Pat<(loongarch_call_large texternalsym:$func),
++          (PseudoCALL_LARGE texternalsym:$func)>;
++} // Predicates = [IsLA64]
++
+ let isCall = 1, Defs = [R1] in
+ def PseudoCALLIndirect : Pseudo<(outs), (ins GPR:$rj),
+                                 [(loongarch_call GPR:$rj)]>,
+                          PseudoInstExpansion<(JIRL R1, GPR:$rj, 0)>;
++let Predicates = [IsLA64] in {
++def : Pat<(loongarch_call_medium GPR:$rj), (PseudoCALLIndirect GPR:$rj)>;
++def : Pat<(loongarch_call_large GPR:$rj), (PseudoCALLIndirect GPR:$rj)>;
++}
+ 
+ let isCall = 1, hasSideEffects = 0, mayStore = 0, mayLoad = 0, Defs = [R1] in
+ def PseudoJIRL_CALL : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>,
+@@ -1347,6 +1386,7 @@ let isBarrier = 1, isReturn = 1, isTerminator = 1 in
+ def PseudoRET : Pseudo<(outs), (ins), [(loongarch_ret)]>,
+                 PseudoInstExpansion<(JIRL R0, R1, 0)>;
+ 
++// Tail call with 'Small' code model.
+ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in
+ def PseudoTAIL : Pseudo<(outs), (ins bare_symbol:$dst)>;
+ 
+@@ -1355,10 +1395,38 @@ def : Pat<(loongarch_tail (iPTR tglobaladdr:$dst)),
+           (PseudoTAIL tglobaladdr:$dst)>;
+ def : Pat<(loongarch_tail (iPTR texternalsym:$dst)),
+           (PseudoTAIL texternalsym:$dst)>;
+ 
++// Tail call with 'Medium' code model.
++let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
++    Uses = [R3], Defs = [R20], Size = 8 in
++def PseudoTAIL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$dst)>;
++
++let Predicates = [IsLA64] in {
++def : Pat<(loongarch_tail_medium (iPTR tglobaladdr:$dst)),
++          (PseudoTAIL_MEDIUM tglobaladdr:$dst)>;
++def : Pat<(loongarch_tail_medium (iPTR texternalsym:$dst)),
++          (PseudoTAIL_MEDIUM texternalsym:$dst)>;
++} // Predicates = [IsLA64]
++
++// Tail call with 'Large' code model.
++let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
++    Uses = [R3], Defs = [R19, R20], Size = 24 in
++def PseudoTAIL_LARGE : Pseudo<(outs), (ins bare_symbol:$dst)>;
++
++let Predicates = [IsLA64] in {
++def : Pat<(loongarch_tail_large (iPTR tglobaladdr:$dst)),
++          (PseudoTAIL_LARGE tglobaladdr:$dst)>;
++def : Pat<(loongarch_tail_large (iPTR texternalsym:$dst)),
++          (PseudoTAIL_LARGE texternalsym:$dst)>;
++} // Predicates = [IsLA64]
++
+ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in
+ def PseudoTAILIndirect : Pseudo<(outs), (ins GPRT:$rj),
+                                 [(loongarch_tail GPRT:$rj)]>,
+                          PseudoInstExpansion<(JIRL R0, GPR:$rj, 0)>;
++let Predicates = [IsLA64] in {
++def : Pat<(loongarch_tail_medium GPR:$rj), (PseudoTAILIndirect GPR:$rj)>;
++def : Pat<(loongarch_tail_large GPR:$rj), (PseudoTAILIndirect GPR:$rj)>;
++}
+ 
+ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+   hasSideEffects = 0, mayStore = 0, mayLoad = 0, Uses = [R3] in
+@@ -1396,6 +1464,7 @@ def PseudoLA_ABS_LARGE : Pseudo<(outs GPR:$dst),
+                                 "la.abs", "$dst, $src">;
+ def PseudoLA_PCREL : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+                             "la.pcrel", "$dst, $src">;
++let Defs = [R20], Size = 20 in
+ def PseudoLA_PCREL_LARGE : Pseudo<(outs GPR:$dst),
+                                   (ins GPR:$tmp, bare_symbol:$src), [],
+                                   "la.pcrel", "$dst, $tmp, $src">,
+@@ -1407,28 +1476,30 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0,
+     isAsmParserOnly = 1 in {
+ def PseudoLA_GOT : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+                           "la.got", "$dst, $src">;
++def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
++                             "la.tls.ie", "$dst, $src">;
++def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
++                             "la.tls.ld", "$dst, $src">;
++def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
++                             "la.tls.gd", "$dst, $src">;
++let Defs = [R20], Size = 20 in {
+ def PseudoLA_GOT_LARGE : Pseudo<(outs GPR:$dst),
+                                 (ins GPR:$tmp, bare_symbol:$src), [],
+                                 "la.got", "$dst, $tmp, $src">,
+                          Requires<[IsLA64]>;
+-def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+-                             "la.tls.ie", "$dst, $src">;
+ def PseudoLA_TLS_IE_LARGE : Pseudo<(outs GPR:$dst),
+                                    (ins GPR:$tmp, bare_symbol:$src), [],
+                                    "la.tls.ie", "$dst, $tmp, $src">,
+                             Requires<[IsLA64]>;
+-def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+-                             "la.tls.ld", "$dst, $src">;
+ def PseudoLA_TLS_LD_LARGE : Pseudo<(outs GPR:$dst),
+                                    (ins GPR:$tmp, bare_symbol:$src), [],
+                                    "la.tls.ld", "$dst, $tmp, $src">,
+                             Requires<[IsLA64]>;
+-def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+-                             "la.tls.gd", "$dst, $src">;
+ def PseudoLA_TLS_GD_LARGE : Pseudo<(outs GPR:$dst),
+                                    (ins GPR:$tmp, bare_symbol:$src), [],
+                                    "la.tls.gd", "$dst, $tmp, $src">,
+                             Requires<[IsLA64]>;
++} // Defs = [R20], Size = 20
+ }
+ 
+ // Load address inst alias: "la", "la.global" and "la.local".
+diff --git a/llvm/test/CodeGen/LoongArch/code-models.ll b/llvm/test/CodeGen/LoongArch/code-models.ll
+index 7c6f46d5e926..f93c31670928 100644
+--- a/llvm/test/CodeGen/LoongArch/code-models.ll
++++ b/llvm/test/CodeGen/LoongArch/code-models.ll
+@@ -33,11 +33,11 @@ define i32 @call_globaladdress(i32 %a) nounwind {
+ ; LARGE:       # %bb.0:
+ ; LARGE-NEXT:    addi.d $sp, $sp, -16
+ ; LARGE-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+-; LARGE-NEXT:    pcalau12i $a1, %got_pc_hi20(callee)
+-; LARGE-NEXT:    addi.d $ra, $zero, %got_pc_lo12(callee)
+-; LARGE-NEXT:    lu32i.d $ra, %got64_pc_lo20(callee)
+-; LARGE-NEXT:    lu52i.d $ra, $ra, %got64_pc_hi12(callee)
+-; LARGE-NEXT:    ldx.d $ra, $ra, $a1
++; LARGE-NEXT:    pcalau12i $ra, %got_pc_hi20(callee)
++; LARGE-NEXT:    addi.d $t8, $zero, %got_pc_lo12(callee)
++; LARGE-NEXT:    lu32i.d $t8, %got64_pc_lo20(callee)
++; LARGE-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(callee)
++; LARGE-NEXT:    ldx.d $ra, $t8, $ra
+ ; LARGE-NEXT:    jirl $ra, $ra, 0
+ ; LARGE-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+ ; LARGE-NEXT:    addi.d $sp, $sp, 16
+@@ -82,11 +82,11 @@ define void @call_external_sym(ptr %dst) {
+ ; LARGE-NEXT:    .cfi_offset 1, -8
+ ; LARGE-NEXT:    ori $a2, $zero, 1000
+ ; LARGE-NEXT:    move $a1, $zero
+-; LARGE-NEXT:    pcalau12i $a3, %pc_hi20(memset)
+-; LARGE-NEXT:    addi.d $ra, $zero, %pc_lo12(memset)
+-; LARGE-NEXT:    lu32i.d $ra, %pc64_lo20(memset)
+-; LARGE-NEXT:    lu52i.d $ra, $ra, %pc64_hi12(memset)
+-; LARGE-NEXT:    add.d $ra, $ra, $a3
++; LARGE-NEXT:    pcalau12i $ra, %pc_hi20(memset)
++; LARGE-NEXT:    addi.d $t8, $zero, %pc_lo12(memset)
++; LARGE-NEXT:    lu32i.d $t8, %pc64_lo20(memset)
++; LARGE-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(memset)
++; LARGE-NEXT:    add.d $ra, $t8, $ra
+ ; LARGE-NEXT:    jirl $ra, $ra, 0
+ ; LARGE-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+ ; LARGE-NEXT:    addi.d $sp, $sp, 16
+@@ -105,17 +105,17 @@ define i32 @caller_tail(i32 %i) nounwind {
+ ;
+ ; MEDIUM-LABEL: caller_tail:
+ ; MEDIUM:       # %bb.0: # %entry
+-; MEDIUM-NEXT:    pcaddu18i $a1, %call36(callee_tail)
+-; MEDIUM-NEXT:    jr $a1
++; MEDIUM-NEXT:    pcaddu18i $t8, %call36(callee_tail)
++; MEDIUM-NEXT:    jr $t8
+ ;
+ ; LARGE-LABEL: caller_tail:
+ ; LARGE:       # %bb.0: # %entry
+-; LARGE-NEXT:    pcalau12i $a1, %got_pc_hi20(callee_tail)
+-; LARGE-NEXT:    addi.d $a2, $zero, %got_pc_lo12(callee_tail)
+-; LARGE-NEXT:    lu32i.d $a2, %got64_pc_lo20(callee_tail)
+-; LARGE-NEXT:    lu52i.d $a2, $a2, %got64_pc_hi12(callee_tail)
+-; LARGE-NEXT:    ldx.d $a1, $a2, $a1
+-; LARGE-NEXT:    jr $a1
++; LARGE-NEXT:    pcalau12i $t7, %got_pc_hi20(callee_tail)
++; LARGE-NEXT:    addi.d $t8, $zero, %got_pc_lo12(callee_tail)
++; LARGE-NEXT:    lu32i.d $t8, %got64_pc_lo20(callee_tail)
++; LARGE-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(callee_tail)
++; LARGE-NEXT:    ldx.d $t7, $t8, $t7
++; LARGE-NEXT:    jr $t7
+ entry:
+   %r = tail call i32 @callee_tail(i32 %i)
+   ret i32 %r
+diff --git a/llvm/test/CodeGen/LoongArch/expand-call.ll b/llvm/test/CodeGen/LoongArch/expand-call.ll
+index 86bf4292665b..e0d179f92de6 100644
+--- a/llvm/test/CodeGen/LoongArch/expand-call.ll
++++ b/llvm/test/CodeGen/LoongArch/expand-call.ll
+@@ -1,6 +1,6 @@
+ ; RUN: llc --mtriple=loongarch64 --stop-before loongarch-prera-expand-pseudo \
+ ; RUN:   --verify-machineinstrs < %s | FileCheck %s --check-prefix=NOEXPAND
+-; RUN: llc --mtriple=loongarch64 --stop-after loongarch-prera-expand-pseudo \
++; RUN: llc --mtriple=loongarch64 --stop-before 
machine-opt-remark-emitter \ + ; RUN: --verify-machineinstrs < %s | FileCheck %s --check-prefix=EXPAND + + declare void @callee() +diff --git a/llvm/test/CodeGen/LoongArch/global-address.ll b/llvm/test/CodeGen/LoongArch/global-address.ll +index a8f0ef648aa7..d32a17f488b1 100644 +--- a/llvm/test/CodeGen/LoongArch/global-address.ll ++++ b/llvm/test/CodeGen/LoongArch/global-address.ll +@@ -53,32 +53,32 @@ define void @foo() nounwind { + ; LA64LARGENOPIC-LABEL: foo: + ; LA64LARGENOPIC: # %bb.0: + ; LA64LARGENOPIC-NEXT: pcalau12i $a0, %got_pc_hi20(G) +-; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %got_pc_lo12(G) +-; LA64LARGENOPIC-NEXT: lu32i.d $a1, %got64_pc_lo20(G) +-; LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G) +-; LA64LARGENOPIC-NEXT: ldx.d $a0, $a1, $a0 ++; LA64LARGENOPIC-NEXT: addi.d $t8, $zero, %got_pc_lo12(G) ++; LA64LARGENOPIC-NEXT: lu32i.d $t8, %got64_pc_lo20(G) ++; LA64LARGENOPIC-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(G) ++; LA64LARGENOPIC-NEXT: ldx.d $a0, $t8, $a0 + ; LA64LARGENOPIC-NEXT: ld.w $a0, $a0, 0 + ; LA64LARGENOPIC-NEXT: pcalau12i $a0, %pc_hi20(g) +-; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %pc_lo12(g) +-; LA64LARGENOPIC-NEXT: lu32i.d $a1, %pc64_lo20(g) +-; LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g) +-; LA64LARGENOPIC-NEXT: add.d $a0, $a1, $a0 ++; LA64LARGENOPIC-NEXT: addi.d $t8, $zero, %pc_lo12(g) ++; LA64LARGENOPIC-NEXT: lu32i.d $t8, %pc64_lo20(g) ++; LA64LARGENOPIC-NEXT: lu52i.d $t8, $t8, %pc64_hi12(g) ++; LA64LARGENOPIC-NEXT: add.d $a0, $t8, $a0 + ; LA64LARGENOPIC-NEXT: ld.w $a0, $a0, 0 + ; LA64LARGENOPIC-NEXT: ret + ; + ; LA64LARGEPIC-LABEL: foo: + ; LA64LARGEPIC: # %bb.0: + ; LA64LARGEPIC-NEXT: pcalau12i $a0, %got_pc_hi20(G) +-; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %got_pc_lo12(G) +-; LA64LARGEPIC-NEXT: lu32i.d $a1, %got64_pc_lo20(G) +-; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G) +-; LA64LARGEPIC-NEXT: ldx.d $a0, $a1, $a0 ++; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %got_pc_lo12(G) ++; LA64LARGEPIC-NEXT: lu32i.d $t8, %got64_pc_lo20(G) ++; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(G) ++; LA64LARGEPIC-NEXT: ldx.d $a0, $t8, $a0 + ; LA64LARGEPIC-NEXT: ld.w $a0, $a0, 0 + ; LA64LARGEPIC-NEXT: pcalau12i $a0, %pc_hi20(.Lg$local) +-; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %pc_lo12(.Lg$local) +-; LA64LARGEPIC-NEXT: lu32i.d $a1, %pc64_lo20(.Lg$local) +-; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Lg$local) +-; LA64LARGEPIC-NEXT: add.d $a0, $a1, $a0 ++; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %pc_lo12(.Lg$local) ++; LA64LARGEPIC-NEXT: lu32i.d $t8, %pc64_lo20(.Lg$local) ++; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %pc64_hi12(.Lg$local) ++; LA64LARGEPIC-NEXT: add.d $a0, $t8, $a0 + ; LA64LARGEPIC-NEXT: ld.w $a0, $a0, 0 + ; LA64LARGEPIC-NEXT: ret + %V = load volatile i32, ptr @G +diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll +index a515939b9c2b..474436a0126b 100644 +--- a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll ++++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll +@@ -48,13 +48,13 @@ define void @foo() nounwind { + ; MEDIUM_SCH-NEXT: addi.d $sp, $sp, -16 + ; MEDIUM_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill + ; MEDIUM_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) +-; MEDIUM_SCH-NEXT: pcaddu18i $ra, %call36(bar) + ; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %got_pc_lo12(G) + ; MEDIUM_SCH-NEXT: ld.d $a0, $a0, 0 + ; MEDIUM_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) + ; MEDIUM_SCH-NEXT: addi.d $a0, $a0, %pc_lo12(g) + ; MEDIUM_SCH-NEXT: ld.d $a0, 
$a0, 0 + ; MEDIUM_SCH-NEXT: ori $a0, $zero, 1 ++; MEDIUM_SCH-NEXT: pcaddu18i $ra, %call36(bar) + ; MEDIUM_SCH-NEXT: jirl $ra, $ra, 0 + ; MEDIUM_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) + ; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(gd) +@@ -74,41 +74,41 @@ define void @foo() nounwind { + ; LARGE_NO_SCH-NEXT: addi.d $sp, $sp, -16 + ; LARGE_NO_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill + ; LARGE_NO_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) +-; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %got_pc_lo12(G) +-; LARGE_NO_SCH-NEXT: lu32i.d $a1, %got64_pc_lo20(G) +-; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G) +-; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(G) ++; LARGE_NO_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(G) ++; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(G) ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $t8, $a0 + ; LARGE_NO_SCH-NEXT: ld.d $a0, $a0, 0 + ; LARGE_NO_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) +-; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %pc_lo12(g) +-; LARGE_NO_SCH-NEXT: lu32i.d $a1, %pc64_lo20(g) +-; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g) +-; LARGE_NO_SCH-NEXT: add.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %pc_lo12(g) ++; LARGE_NO_SCH-NEXT: lu32i.d $t8, %pc64_lo20(g) ++; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %pc64_hi12(g) ++; LARGE_NO_SCH-NEXT: add.d $a0, $t8, $a0 + ; LARGE_NO_SCH-NEXT: ld.d $a0, $a0, 0 + ; LARGE_NO_SCH-NEXT: ori $a0, $zero, 1 +-; LARGE_NO_SCH-NEXT: pcalau12i $a1, %got_pc_hi20(bar) +-; LARGE_NO_SCH-NEXT: addi.d $ra, $zero, %got_pc_lo12(bar) +-; LARGE_NO_SCH-NEXT: lu32i.d $ra, %got64_pc_lo20(bar) +-; LARGE_NO_SCH-NEXT: lu52i.d $ra, $ra, %got64_pc_hi12(bar) +-; LARGE_NO_SCH-NEXT: ldx.d $ra, $ra, $a1 ++; LARGE_NO_SCH-NEXT: pcalau12i $ra, %got_pc_hi20(bar) ++; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(bar) ++; LARGE_NO_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(bar) ++; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(bar) ++; LARGE_NO_SCH-NEXT: ldx.d $ra, $t8, $ra + ; LARGE_NO_SCH-NEXT: jirl $ra, $ra, 0 + ; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) +-; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(gd) +-; LARGE_NO_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(gd) +-; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(gd) +-; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(gd) ++; LARGE_NO_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(gd) ++; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(gd) ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $t8, $a0 + ; LARGE_NO_SCH-NEXT: ldx.d $a0, $a0, $tp + ; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) +-; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ld) +-; LARGE_NO_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ld) +-; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ld) +-; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ld) ++; LARGE_NO_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(ld) ++; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ld) ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $t8, $a0 + ; LARGE_NO_SCH-NEXT: ldx.d $a0, $a0, $tp + ; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) +-; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie) +-; LARGE_NO_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie) +-; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie) +-; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ie) ++; LARGE_NO_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(ie) ++; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ie) ++; LARGE_NO_SCH-NEXT: 
ldx.d $a0, $t8, $a0 + ; LARGE_NO_SCH-NEXT: ldx.d $a0, $a0, $tp + ; LARGE_NO_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload + ; LARGE_NO_SCH-NEXT: addi.d $sp, $sp, 16 +@@ -118,42 +118,42 @@ define void @foo() nounwind { + ; LARGE_SCH: # %bb.0: + ; LARGE_SCH-NEXT: addi.d $sp, $sp, -16 + ; LARGE_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +-; LARGE_SCH-NEXT: addi.d $a1, $zero, %got_pc_lo12(G) + ; LARGE_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) +-; LARGE_SCH-NEXT: addi.d $ra, $zero, %got_pc_lo12(bar) +-; LARGE_SCH-NEXT: lu32i.d $a1, %got64_pc_lo20(G) +-; LARGE_SCH-NEXT: lu32i.d $ra, %got64_pc_lo20(bar) +-; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G) +-; LARGE_SCH-NEXT: lu52i.d $ra, $ra, %got64_pc_hi12(bar) +-; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 +-; LARGE_SCH-NEXT: addi.d $a1, $zero, %pc_lo12(g) +-; LARGE_SCH-NEXT: lu32i.d $a1, %pc64_lo20(g) +-; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g) ++; LARGE_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(G) ++; LARGE_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(G) ++; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(G) ++; LARGE_SCH-NEXT: ldx.d $a0, $t8, $a0 + ; LARGE_SCH-NEXT: ld.d $a0, $a0, 0 + ; LARGE_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) +-; LARGE_SCH-NEXT: add.d $a0, $a1, $a0 +-; LARGE_SCH-NEXT: pcalau12i $a1, %got_pc_hi20(bar) ++; LARGE_SCH-NEXT: addi.d $t8, $zero, %pc_lo12(g) ++; LARGE_SCH-NEXT: lu32i.d $t8, %pc64_lo20(g) ++; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %pc64_hi12(g) ++; LARGE_SCH-NEXT: add.d $a0, $t8, $a0 + ; LARGE_SCH-NEXT: ld.d $a0, $a0, 0 +-; LARGE_SCH-NEXT: ldx.d $ra, $ra, $a1 + ; LARGE_SCH-NEXT: ori $a0, $zero, 1 ++; LARGE_SCH-NEXT: pcalau12i $ra, %got_pc_hi20(bar) ++; LARGE_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(bar) ++; LARGE_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(bar) ++; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(bar) ++; LARGE_SCH-NEXT: ldx.d $ra, $t8, $ra + ; LARGE_SCH-NEXT: jirl $ra, $ra, 0 +-; LARGE_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(gd) + ; LARGE_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) +-; LARGE_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(gd) +-; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(gd) +-; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 +-; LARGE_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ld) +-; LARGE_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ld) +-; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ld) ++; LARGE_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(gd) ++; LARGE_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(gd) ++; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(gd) ++; LARGE_SCH-NEXT: ldx.d $a0, $t8, $a0 + ; LARGE_SCH-NEXT: ldx.d $a0, $a0, $tp + ; LARGE_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) +-; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 +-; LARGE_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie) +-; LARGE_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie) +-; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie) ++; LARGE_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ld) ++; LARGE_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(ld) ++; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ld) ++; LARGE_SCH-NEXT: ldx.d $a0, $t8, $a0 + ; LARGE_SCH-NEXT: ldx.d $a0, $a0, $tp + ; LARGE_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) +-; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ie) ++; LARGE_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(ie) ++; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ie) ++; LARGE_SCH-NEXT: ldx.d $a0, $t8, $a0 + ; LARGE_SCH-NEXT: ldx.d $a0, $a0, $tp + ; LARGE_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload + ; LARGE_SCH-NEXT: addi.d $sp, $sp, 16 +diff --git a/llvm/test/CodeGen/LoongArch/tls-models.ll 
b/llvm/test/CodeGen/LoongArch/tls-models.ll +index a2a3792a6a54..3994df1da716 100644 +--- a/llvm/test/CodeGen/LoongArch/tls-models.ll ++++ b/llvm/test/CodeGen/LoongArch/tls-models.ll +@@ -45,15 +45,15 @@ define ptr @f1() nounwind { + ; LA64LARGEPIC-NEXT: addi.d $sp, $sp, -16 + ; LA64LARGEPIC-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill + ; LA64LARGEPIC-NEXT: pcalau12i $a0, %gd_pc_hi20(unspecified) +-; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %got_pc_lo12(unspecified) +-; LA64LARGEPIC-NEXT: lu32i.d $a1, %got64_pc_lo20(unspecified) +-; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(unspecified) +-; LA64LARGEPIC-NEXT: add.d $a0, $a1, $a0 +-; LA64LARGEPIC-NEXT: pcalau12i $a1, %pc_hi20(__tls_get_addr) +-; LA64LARGEPIC-NEXT: addi.d $ra, $zero, %pc_lo12(__tls_get_addr) +-; LA64LARGEPIC-NEXT: lu32i.d $ra, %pc64_lo20(__tls_get_addr) +-; LA64LARGEPIC-NEXT: lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr) +-; LA64LARGEPIC-NEXT: add.d $ra, $ra, $a1 ++; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %got_pc_lo12(unspecified) ++; LA64LARGEPIC-NEXT: lu32i.d $t8, %got64_pc_lo20(unspecified) ++; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(unspecified) ++; LA64LARGEPIC-NEXT: add.d $a0, $t8, $a0 ++; LA64LARGEPIC-NEXT: pcalau12i $ra, %pc_hi20(__tls_get_addr) ++; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %pc_lo12(__tls_get_addr) ++; LA64LARGEPIC-NEXT: lu32i.d $t8, %pc64_lo20(__tls_get_addr) ++; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr) ++; LA64LARGEPIC-NEXT: add.d $ra, $t8, $ra + ; LA64LARGEPIC-NEXT: jirl $ra, $ra, 0 + ; LA64LARGEPIC-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload + ; LA64LARGEPIC-NEXT: addi.d $sp, $sp, 16 +@@ -76,10 +76,10 @@ define ptr @f1() nounwind { + ; LA64LARGENOPIC-LABEL: f1: + ; LA64LARGENOPIC: # %bb.0: # %entry + ; LA64LARGENOPIC-NEXT: pcalau12i $a0, %ie_pc_hi20(unspecified) +-; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %ie_pc_lo12(unspecified) +-; LA64LARGENOPIC-NEXT: lu32i.d $a1, %ie64_pc_lo20(unspecified) +-; LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(unspecified) +-; LA64LARGENOPIC-NEXT: ldx.d $a0, $a1, $a0 ++; LA64LARGENOPIC-NEXT: addi.d $t8, $zero, %ie_pc_lo12(unspecified) ++; LA64LARGENOPIC-NEXT: lu32i.d $t8, %ie64_pc_lo20(unspecified) ++; LA64LARGENOPIC-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(unspecified) ++; LA64LARGENOPIC-NEXT: ldx.d $a0, $t8, $a0 + ; LA64LARGENOPIC-NEXT: add.d $a0, $a0, $tp + ; LA64LARGENOPIC-NEXT: ret + entry: +@@ -116,15 +116,15 @@ define ptr @f2() nounwind { + ; LA64LARGEPIC-NEXT: addi.d $sp, $sp, -16 + ; LA64LARGEPIC-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill + ; LA64LARGEPIC-NEXT: pcalau12i $a0, %ld_pc_hi20(ld) +-; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %got_pc_lo12(ld) +-; LA64LARGEPIC-NEXT: lu32i.d $a1, %got64_pc_lo20(ld) +-; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(ld) +-; LA64LARGEPIC-NEXT: add.d $a0, $a1, $a0 +-; LA64LARGEPIC-NEXT: pcalau12i $a1, %pc_hi20(__tls_get_addr) +-; LA64LARGEPIC-NEXT: addi.d $ra, $zero, %pc_lo12(__tls_get_addr) +-; LA64LARGEPIC-NEXT: lu32i.d $ra, %pc64_lo20(__tls_get_addr) +-; LA64LARGEPIC-NEXT: lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr) +-; LA64LARGEPIC-NEXT: add.d $ra, $ra, $a1 ++; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %got_pc_lo12(ld) ++; LA64LARGEPIC-NEXT: lu32i.d $t8, %got64_pc_lo20(ld) ++; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(ld) ++; LA64LARGEPIC-NEXT: add.d $a0, $t8, $a0 ++; LA64LARGEPIC-NEXT: pcalau12i $ra, %pc_hi20(__tls_get_addr) ++; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %pc_lo12(__tls_get_addr) ++; LA64LARGEPIC-NEXT: lu32i.d $t8, %pc64_lo20(__tls_get_addr) ++; 
LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
++; LA64LARGEPIC-NEXT:    add.d $ra, $t8, $ra
+ ; LA64LARGEPIC-NEXT:    jirl $ra, $ra, 0
+ ; LA64LARGEPIC-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+ ; LA64LARGEPIC-NEXT:    addi.d $sp, $sp, 16
+@@ -147,10 +147,10 @@ define ptr @f2() nounwind {
+ ; LA64LARGENOPIC-LABEL: f2:
+ ; LA64LARGENOPIC:       # %bb.0: # %entry
+ ; LA64LARGENOPIC-NEXT:    pcalau12i $a0, %ie_pc_hi20(ld)
+-; LA64LARGENOPIC-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ld)
+-; LA64LARGENOPIC-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ld)
+-; LA64LARGENOPIC-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ld)
+-; LA64LARGENOPIC-NEXT:    ldx.d $a0, $a1, $a0
++; LA64LARGENOPIC-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ld)
++; LA64LARGENOPIC-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ld)
++; LA64LARGENOPIC-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ld)
++; LA64LARGENOPIC-NEXT:    ldx.d $a0, $t8, $a0
+ ; LA64LARGENOPIC-NEXT:    add.d $a0, $a0, $tp
+ ; LA64LARGENOPIC-NEXT:    ret
+ entry:
+@@ -177,10 +177,10 @@ define ptr @f3() nounwind {
+ ; LA64LARGEPIC-LABEL: f3:
+ ; LA64LARGEPIC:       # %bb.0: # %entry
+ ; LA64LARGEPIC-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
+-; LA64LARGEPIC-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ie)
+-; LA64LARGEPIC-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ie)
+-; LA64LARGEPIC-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
+-; LA64LARGEPIC-NEXT:    ldx.d $a0, $a1, $a0
++; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ie)
++; LA64LARGEPIC-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ie)
++; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
++; LA64LARGEPIC-NEXT:    ldx.d $a0, $t8, $a0
+ ; LA64LARGEPIC-NEXT:    add.d $a0, $a0, $tp
+ ; LA64LARGEPIC-NEXT:    ret
+ ;
+@@ -201,10 +201,10 @@ define ptr @f3() nounwind {
+ ; LA64LARGENOPIC-LABEL: f3:
+ ; LA64LARGENOPIC:       # %bb.0: # %entry
+ ; LA64LARGENOPIC-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
+-; LA64LARGENOPIC-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ie)
+-; LA64LARGENOPIC-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ie)
+-; LA64LARGENOPIC-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
+-; LA64LARGENOPIC-NEXT:    ldx.d $a0, $a1, $a0
++; LA64LARGENOPIC-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ie)
++; LA64LARGENOPIC-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ie)
++; LA64LARGENOPIC-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
++; LA64LARGENOPIC-NEXT:    ldx.d $a0, $t8, $a0
+ ; LA64LARGENOPIC-NEXT:    add.d $a0, $a0, $tp
+ ; LA64LARGENOPIC-NEXT:    ret
+ entry:
+--
+2.20.1
+
+
+From 34e8c30579faf4a8ef69fa686bd9b2d9e832d299 Mon Sep 17 00:00:00 2001
+From: Jie Fu <jiefu@tencent.com>
+Date: Fri, 5 Jan 2024 12:05:23 +0800
+Subject: [PATCH 06/12] [LoongArch] Fix -Wunused-variable in
+ LoongArchExpandPseudoInsts.cpp (NFC)
+
+llvm-project/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp:480:20:
+  error: unused variable 'MF' [-Werror,-Wunused-variable]
+  MachineFunction *MF = MBB.getParent();
+                   ^
+1 error generated.
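The diagnostic fires in builds where assertions are compiled out: the `MF` variable introduced by the previous patch is referenced only inside an `assert`, and `assert` expands to nothing under -DNDEBUG, leaving the variable dead. A minimal reproduction of the pattern, with hypothetical names:

#include <cassert>

void clampWidth(int Width) {
  int Limit = 64;         // referenced only by the assert below
  assert(Width <= Limit); // expands to nothing under -DNDEBUG, so 'Limit'
                          // becomes unused and -Werror,-Wunused-variable
                          // rejects the build
}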
+
+(cherry picked from commit 52d1397e38ee88b170585c9c824d08e6975890ca)
+---
+ llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
+index f977f176066a..ad39658f698e 100644
+--- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
++++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
+@@ -477,12 +477,11 @@ bool LoongArchExpandPseudo::expandLargeAddressLoad(
+     break;
+   }
+ 
+-  MachineFunction *MF = MBB.getParent();
+   MachineInstr &MI = *MBBI;
+   DebugLoc DL = MI.getDebugLoc();
+   Register ScratchReg = LoongArch::R20; // $t8
+ 
+-  assert(MF->getSubtarget<LoongArchSubtarget>().is64Bit() &&
++  assert(MBB.getParent()->getSubtarget<LoongArchSubtarget>().is64Bit() &&
+          "Large code model requires LA64");
+ 
+   auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), DestReg);
+--
+2.20.1
+
+
+From b5d3aa3ac0dcf98fbb5f8d2d9de295be991c9e8f Mon Sep 17 00:00:00 2001
+From: Zhaoxin Yang <yangzhaoxin@loongson.cn>
+Date: Tue, 23 Jul 2024 12:06:59 +0800
+Subject: [PATCH 07/12] [LoongArch][CodeGen] Implement 128-bit and 256-bit
+ vector shuffle. (#100054)
+
+[LoongArch][CodeGen] Implement 128-bit and 256-bit vector shuffle
+operations.
+
+In LoongArch, shuffle operations can be divided into two types:
+- Single-vector shuffle: Shuffle using only one vector, with the other
+vector being `undef` or not selected by mask. This can be expanded to
+instructions such as `vreplvei` and `vshuf4i`.
+- Two-vector shuffle: Shuffle using two vectors. This can be expanded to
+instructions like `vilvl/h`, `vpackev/od`, `vpickev/od` and the
+basic `vshuf`.
+
+In the future, more optimizations may be added, such as handling 1-bit
+vectors and processing single element patterns, etc.
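The two categories map directly onto the dispatch implemented below: the single-vector matchers (vreplvei, vshuf4i) are only tried when the second operand is undef, and everything else falls through to the two-vector matchers and ultimately the generic vshuf. A small standalone sketch of the distinction, using the shufflevector convention that indices below the element count select from the first vector, higher indices select from the second, and -1 means undef (the helper name is illustrative):

#include <vector>

// True if the mask never selects a lane from the second source vector, i.e.
// the shuffle is "single-vector" in the sense of the commit message above.
// For v4i32, <2, 2, 2, 2> is single-vector (a vreplvei splat), while
// <0, 4, 1, 5> pulls lanes from both sources and needs a two-vector form.
static bool isSingleVectorMask(const std::vector<int> &Mask, int NumElts) {
  for (int M : Mask)
    if (M >= NumElts) // undef (-1) and first-vector lanes are both < NumElts
      return false;
  return true;
}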
+ +(cherry picked from commit 464ea880cf7710cc8675c83001d7ae020406cf42) +--- + .../LoongArch/LoongArchISelLowering.cpp | 933 +++++++++++++++++- + .../Target/LoongArch/LoongArchISelLowering.h | 10 + + .../LoongArch/LoongArchLASXInstrInfo.td | 130 +++ + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 148 +++ + .../lasx/ir-instruction/shuffle-as-xvilv.ll | 74 ++ + .../lasx/ir-instruction/shuffle-as-xvpack.ll | 124 +++ + .../lasx/ir-instruction/shuffle-as-xvpick.ll | 84 ++ + .../ir-instruction/shuffle-as-xvrepl128vei.ll | 65 ++ + .../lasx/ir-instruction/shuffle-as-xvshuf.ll | 76 ++ + .../ir-instruction/shuffle-as-xvshuf4i.ll | 43 + + .../lsx/ir-instruction/shuffle-as-vilv.ll | 82 ++ + .../lsx/ir-instruction/shuffle-as-vpack.ll | 122 +++ + .../lsx/ir-instruction/shuffle-as-vpick.ll | 82 ++ + .../lsx/ir-instruction/shuffle-as-vreplvei.ll | 62 ++ + .../lsx/ir-instruction/shuffle-as-vshuf.ll | 84 ++ + .../lsx/ir-instruction/shuffle-as-vshuf4i.ll | 42 + + 16 files changed, 2158 insertions(+), 3 deletions(-) + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index df1b17649b7d..618ae7056425 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -247,9 +247,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + + setOperationAction(ISD::SETCC, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Legal); ++ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + } + for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { +- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal); + setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, VT, + Legal); +@@ -293,9 +293,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + + setOperationAction(ISD::SETCC, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Legal); ++ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + } + for (MVT VT : {MVT::v4i64, MVT::v8i32, MVT::v16i16, MVT::v32i8}) { +- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal); + setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, VT, + Legal); +@@ -422,9 +422,926 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op, + return SDValue(); + } + ++/// Determine whether a range fits a regular 
pattern of values.
++/// This function accounts for the possibility of jumping over the End iterator.
++template <typename ValType>
++static bool
++fitsRegularPattern(typename SmallVectorImpl<ValType>::const_iterator Begin,
++                   unsigned CheckStride,
++                   typename SmallVectorImpl<ValType>::const_iterator End,
++                   ValType ExpectedIndex, unsigned ExpectedIndexStride) {
++  auto &I = Begin;
++
++  while (I != End) {
++    if (*I != -1 && *I != ExpectedIndex)
++      return false;
++    ExpectedIndex += ExpectedIndexStride;
++
++    // Incrementing past End is undefined behaviour so we must increment one
++    // step at a time and check for End at each step.
++    for (unsigned n = 0; n < CheckStride && I != End; ++n, ++I)
++      ; // Empty loop body.
++  }
++  return true;
++}
++
++/// Lower VECTOR_SHUFFLE into VREPLVEI (if possible).
++///
++/// VREPLVEI performs vector broadcast based on an element specified by an
++/// integer immediate, with its mask being similar to:
++///   <x, x, x, ...>
++/// where x is any valid index.
++///
++/// When undef's appear in the mask they are treated as if they were whatever
++/// value is necessary in order to fit the above form.
++static SDValue lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask,
++                                            MVT VT, SDValue V1, SDValue V2,
++                                            SelectionDAG &DAG) {
++  int SplatIndex = -1;
++  for (const auto &M : Mask) {
++    if (M != -1) {
++      SplatIndex = M;
++      break;
++    }
++  }
++
++  if (SplatIndex == -1)
++    return DAG.getUNDEF(VT);
++
++  assert(SplatIndex < (int)Mask.size() && "Out of bounds mask index");
++  if (fitsRegularPattern<int>(Mask.begin(), 1, Mask.end(), SplatIndex, 0)) {
++    APInt Imm(64, SplatIndex);
++    return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1,
++                       DAG.getConstant(Imm, DL, MVT::i64));
++  }
++
++  return SDValue();
++}
++
++/// Lower VECTOR_SHUFFLE into VSHUF4I (if possible).
++///
++/// VSHUF4I splits the vector into blocks of four elements, then shuffles these
++/// elements according to a <4 x i2> constant (encoded as an integer immediate).
++///
++/// It is therefore possible to lower into VSHUF4I when the mask takes the form:
++///   <a, b, c, d, a+4, b+4, c+4, d+4, a+8, b+8, c+8, d+8, ...>
++/// When undef's appear they are treated as if they were whatever value is
++/// necessary in order to fit the above forms.
++///
++/// For example:
++///   %2 = shufflevector <8 x i16> %0, <8 x i16> undef,
++///                      <8 x i32> <i32 3, i32 2, i32 1, i32 0,
++///                                 i32 7, i32 6, i32 5, i32 4>
++/// is lowered to:
++///   (VSHUF4I_H $v0, $v1, 27)
++/// where the 27 comes from:
++///   3 + (2 << 2) + (1 << 4) + (0 << 6)
++static SDValue lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
++                                           MVT VT, SDValue V1, SDValue V2,
++                                           SelectionDAG &DAG) {
++
++  // When the size is less than 4, lower cost instructions may be used.
++  if (Mask.size() < 4)
++    return SDValue();
++
++  int SubMask[4] = {-1, -1, -1, -1};
++  for (unsigned i = 0; i < 4; ++i) {
++    for (unsigned j = i; j < Mask.size(); j += 4) {
++      int Idx = Mask[j];
++
++      // Convert from vector index to 4-element subvector index
++      // If an index refers to an element outside of the subvector then give up
++      if (Idx != -1) {
++        Idx -= 4 * (j / 4);
++        if (Idx < 0 || Idx >= 4)
++          return SDValue();
++      }
++
++      // If the mask has an undef, replace it with the current index.
++      // Note that it might still be undef if the current index is also undef
++      if (SubMask[i] == -1)
++        SubMask[i] = Idx;
++      // Check that non-undef values are the same as in the mask. If they
++      // aren't then give up
++      else if (Idx != -1 && Idx != SubMask[i])
++        return SDValue();
++    }
++  }
++
++  // Calculate the immediate. Replace any remaining undefs with zero
++  APInt Imm(64, 0);
++  for (int i = 3; i >= 0; --i) {
++    int Idx = SubMask[i];
++
++    if (Idx == -1)
++      Idx = 0;
++
++    Imm <<= 2;
++    Imm |= Idx & 0x3;
++  }
++
++  return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT, V1,
++                     DAG.getConstant(Imm, DL, MVT::i64));
++}
++
++/// Lower VECTOR_SHUFFLE into VPACKEV (if possible).
++///
++/// VPACKEV interleaves the even elements from each vector.
++///
++/// It is possible to lower into VPACKEV when the mask consists of two of the
++/// following forms interleaved:
++///   <0, 2, 4, ...>
++///   <n, n+2, n+4, ...>
++/// where n is the number of elements in the vector.
++/// For example:
++///   <0, 0, 2, 2, 4, 4, ...>
++///   <0, n, 2, n+2, 4, n+4, ...>
++///
++/// When undef's appear in the mask they are treated as if they were whatever
++/// value is necessary in order to fit the above forms.
++static SDValue lowerVECTOR_SHUFFLE_VPACKEV(const SDLoc &DL, ArrayRef<int> Mask,
++                                           MVT VT, SDValue V1, SDValue V2,
++                                           SelectionDAG &DAG) {
++
++  const auto &Begin = Mask.begin();
++  const auto &End = Mask.end();
++  SDValue OriV1 = V1, OriV2 = V2;
++
++  if (fitsRegularPattern<int>(Begin, 2, End, 0, 2))
++    V1 = OriV1;
++  else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size(), 2))
++    V1 = OriV2;
++  else
++    return SDValue();
++
++  if (fitsRegularPattern<int>(Begin + 1, 2, End, 0, 2))
++    V2 = OriV1;
++  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size(), 2))
++    V2 = OriV2;
++  else
++    return SDValue();
++
++  return DAG.getNode(LoongArchISD::VPACKEV, DL, VT, V2, V1);
++}
++
++/// Lower VECTOR_SHUFFLE into VPACKOD (if possible).
++///
++/// VPACKOD interleaves the odd elements from each vector.
++///
++/// It is possible to lower into VPACKOD when the mask consists of two of the
++/// following forms interleaved:
++///   <1, 3, 5, ...>
++///   <n+1, n+3, n+5, ...>
++/// where n is the number of elements in the vector.
++/// For example:
++///   <1, 1, 3, 3, 5, 5, ...>
++///   <1, n+1, 3, n+3, 5, n+5, ...>
++///
++/// When undef's appear in the mask they are treated as if they were whatever
++/// value is necessary in order to fit the above forms.
++static SDValue lowerVECTOR_SHUFFLE_VPACKOD(const SDLoc &DL, ArrayRef<int> Mask,
++                                           MVT VT, SDValue V1, SDValue V2,
++                                           SelectionDAG &DAG) {
++
++  const auto &Begin = Mask.begin();
++  const auto &End = Mask.end();
++  SDValue OriV1 = V1, OriV2 = V2;
++
++  if (fitsRegularPattern<int>(Begin, 2, End, 1, 2))
++    V1 = OriV1;
++  else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size() + 1, 2))
++    V1 = OriV2;
++  else
++    return SDValue();
++
++  if (fitsRegularPattern<int>(Begin + 1, 2, End, 1, 2))
++    V2 = OriV1;
++  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size() + 1, 2))
++    V2 = OriV2;
++  else
++    return SDValue();
++
++  return DAG.getNode(LoongArchISD::VPACKOD, DL, VT, V2, V1);
++}
++
++/// Lower VECTOR_SHUFFLE into VILVH (if possible).
++///
++/// VILVH interleaves consecutive elements from the left (highest-indexed) half
++/// of each vector.
++///
++/// It is possible to lower into VILVH when the mask consists of two of the
++/// following forms interleaved:
++///   <x, x+1, x+2, ...>
++///   <n+x, n+x+1, n+x+2, ...>
++/// where n is the number of elements in the vector and x is half n.
++/// For example: ++/// <x, x, x+1, x+1, x+2, x+2, ...> ++/// <x, n+x, x+1, n+x+1, x+2, n+x+2, ...> ++/// ++/// When undef's appear in the mask they are treated as if they were whatever ++/// value is necessary in order to fit the above forms. ++static SDValue lowerVECTOR_SHUFFLE_VILVH(const SDLoc &DL, ArrayRef<int> Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ const auto &Begin = Mask.begin(); ++ const auto &End = Mask.end(); ++ unsigned HalfSize = Mask.size() / 2; ++ SDValue OriV1 = V1, OriV2 = V2; ++ ++ if (fitsRegularPattern<int>(Begin, 2, End, HalfSize, 1)) ++ V1 = OriV1; ++ else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size() + HalfSize, 1)) ++ V1 = OriV2; ++ else ++ return SDValue(); ++ ++ if (fitsRegularPattern<int>(Begin + 1, 2, End, HalfSize, 1)) ++ V2 = OriV1; ++ else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size() + HalfSize, ++ 1)) ++ V2 = OriV2; ++ else ++ return SDValue(); ++ ++ return DAG.getNode(LoongArchISD::VILVH, DL, VT, V2, V1); ++} ++ ++/// Lower VECTOR_SHUFFLE into VILVL (if possible). ++/// ++/// VILVL interleaves consecutive elements from the right (lowest-indexed) half ++/// of each vector. ++/// ++/// It is possible to lower into VILVL when the mask consists of two of the ++/// following forms interleaved: ++/// <0, 1, 2, ...> ++/// <n, n+1, n+2, ...> ++/// where n is the number of elements in the vector. ++/// For example: ++/// <0, 0, 1, 1, 2, 2, ...> ++/// <0, n, 1, n+1, 2, n+2, ...> ++/// ++/// When undef's appear in the mask they are treated as if they were whatever ++/// value is necessary in order to fit the above forms. ++static SDValue lowerVECTOR_SHUFFLE_VILVL(const SDLoc &DL, ArrayRef<int> Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ const auto &Begin = Mask.begin(); ++ const auto &End = Mask.end(); ++ SDValue OriV1 = V1, OriV2 = V2; ++ ++ if (fitsRegularPattern<int>(Begin, 2, End, 0, 1)) ++ V1 = OriV1; ++ else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size(), 1)) ++ V1 = OriV2; ++ else ++ return SDValue(); ++ ++ if (fitsRegularPattern<int>(Begin + 1, 2, End, 0, 1)) ++ V2 = OriV1; ++ else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size(), 1)) ++ V2 = OriV2; ++ else ++ return SDValue(); ++ ++ return DAG.getNode(LoongArchISD::VILVL, DL, VT, V2, V1); ++} ++ ++/// Lower VECTOR_SHUFFLE into VPICKEV (if possible). ++/// ++/// VPICKEV copies the even elements of each vector into the result vector. ++/// ++/// It is possible to lower into VPICKEV when the mask consists of two of the ++/// following forms concatenated: ++/// <0, 2, 4, ...> ++/// <n, n+2, n+4, ...> ++/// where n is the number of elements in the vector. ++/// For example: ++/// <0, 2, 4, ..., 0, 2, 4, ...> ++/// <0, 2, 4, ..., n, n+2, n+4, ...> ++/// ++/// When undef's appear in the mask they are treated as if they were whatever ++/// value is necessary in order to fit the above forms. 
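++///
++/// As a concrete instance (illustrative): for v4i32, the mask <0, 2, 4, 6>
++/// takes the even elements of the first vector followed by the even elements
++/// of the second, fitting the two concatenated forms above.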
++static SDValue lowerVECTOR_SHUFFLE_VPICKEV(const SDLoc &DL, ArrayRef<int> Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ const auto &Begin = Mask.begin(); ++ const auto &Mid = Mask.begin() + Mask.size() / 2; ++ const auto &End = Mask.end(); ++ SDValue OriV1 = V1, OriV2 = V2; ++ ++ if (fitsRegularPattern<int>(Begin, 1, Mid, 0, 2)) ++ V1 = OriV1; ++ else if (fitsRegularPattern<int>(Begin, 1, Mid, Mask.size(), 2)) ++ V1 = OriV2; ++ else ++ return SDValue(); ++ ++ if (fitsRegularPattern<int>(Mid, 1, End, 0, 2)) ++ V2 = OriV1; ++ else if (fitsRegularPattern<int>(Mid, 1, End, Mask.size(), 2)) ++ V2 = OriV2; ++ ++ else ++ return SDValue(); ++ ++ return DAG.getNode(LoongArchISD::VPICKEV, DL, VT, V2, V1); ++} ++ ++/// Lower VECTOR_SHUFFLE into VPICKOD (if possible). ++/// ++/// VPICKOD copies the odd elements of each vector into the result vector. ++/// ++/// It is possible to lower into VPICKOD when the mask consists of two of the ++/// following forms concatenated: ++/// <1, 3, 5, ...> ++/// <n+1, n+3, n+5, ...> ++/// where n is the number of elements in the vector. ++/// For example: ++/// <1, 3, 5, ..., 1, 3, 5, ...> ++/// <1, 3, 5, ..., n+1, n+3, n+5, ...> ++/// ++/// When undef's appear in the mask they are treated as if they were whatever ++/// value is necessary in order to fit the above forms. ++static SDValue lowerVECTOR_SHUFFLE_VPICKOD(const SDLoc &DL, ArrayRef<int> Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ const auto &Begin = Mask.begin(); ++ const auto &Mid = Mask.begin() + Mask.size() / 2; ++ const auto &End = Mask.end(); ++ SDValue OriV1 = V1, OriV2 = V2; ++ ++ if (fitsRegularPattern<int>(Begin, 1, Mid, 1, 2)) ++ V1 = OriV1; ++ else if (fitsRegularPattern<int>(Begin, 1, Mid, Mask.size() + 1, 2)) ++ V1 = OriV2; ++ else ++ return SDValue(); ++ ++ if (fitsRegularPattern<int>(Mid, 1, End, 1, 2)) ++ V2 = OriV1; ++ else if (fitsRegularPattern<int>(Mid, 1, End, Mask.size() + 1, 2)) ++ V2 = OriV2; ++ else ++ return SDValue(); ++ ++ return DAG.getNode(LoongArchISD::VPICKOD, DL, VT, V2, V1); ++} ++ ++/// Lower VECTOR_SHUFFLE into VSHUF. ++/// ++/// This mostly consists of converting the shuffle mask into a BUILD_VECTOR and ++/// adding it as an operand to the resulting VSHUF. ++static SDValue lowerVECTOR_SHUFFLE_VSHUF(const SDLoc &DL, ArrayRef<int> Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ SmallVector<SDValue, 16> Ops; ++ for (auto M : Mask) ++ Ops.push_back(DAG.getConstant(M, DL, MVT::i64)); ++ ++ EVT MaskVecTy = VT.changeVectorElementTypeToInteger(); ++ SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, Ops); ++ ++ // VECTOR_SHUFFLE concatenates the vectors in an vectorwise fashion. ++ // <0b00, 0b01> + <0b10, 0b11> -> <0b00, 0b01, 0b10, 0b11> ++ // VSHF concatenates the vectors in a bitwise fashion: ++ // <0b00, 0b01> + <0b10, 0b11> -> ++ // 0b0100 + 0b1110 -> 0b01001110 ++ // <0b10, 0b11, 0b00, 0b01> ++ // We must therefore swap the operands to get the correct result. ++ return DAG.getNode(LoongArchISD::VSHUF, DL, VT, MaskVec, V2, V1); ++} ++ ++/// Dispatching routine to lower various 128-bit LoongArch vector shuffles. ++/// ++/// This routine breaks down the specific type of 128-bit shuffle and ++/// dispatches to the lowering routines accordingly. 
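++///
++/// For instance, the v4i32 mask <0, 4, 1, 5> fits the VILVL forms (<0, 1,
++/// ...> interleaved with <n, n+1, ...>) and is matched before reaching the
++/// generic VSHUF fallback, while an irregular mask such as <1, 5, 2, 6>
++/// fits none of the fixed patterns and is lowered via
++/// lowerVECTOR_SHUFFLE_VSHUF.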
++static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, ++ SDValue V1, SDValue V2, SelectionDAG &DAG) { ++ assert((VT.SimpleTy == MVT::v16i8 || VT.SimpleTy == MVT::v8i16 || ++ VT.SimpleTy == MVT::v4i32 || VT.SimpleTy == MVT::v2i64 || ++ VT.SimpleTy == MVT::v4f32 || VT.SimpleTy == MVT::v2f64) && ++ "Vector type is unsupported for lsx!"); ++ assert(V1.getSimpleValueType() == V2.getSimpleValueType() && ++ "Two operands have different types!"); ++ assert(VT.getVectorNumElements() == Mask.size() && ++ "Unexpected mask size for shuffle!"); ++ assert(Mask.size() % 2 == 0 && "Expected even mask size."); ++ ++ SDValue Result; ++ // TODO: Add more comparison patterns. ++ if (V2.isUndef()) { ++ if ((Result = lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG))) ++ return Result; ++ ++ // TODO: The commented-out code below may be enabled in the future to ++ // better match the pattern for instruction selection. ++ /* V2 = V1; */ ++ } ++ ++ // It is recommended not to change the pattern comparison order for better ++ // performance. ++ if ((Result = lowerVECTOR_SHUFFLE_VPACKEV(DL, Mask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_VPACKOD(DL, Mask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_VILVH(DL, Mask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_VILVL(DL, Mask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_VPICKEV(DL, Mask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_VPICKOD(DL, Mask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_VSHUF(DL, Mask, VT, V1, V2, DAG))) ++ return Result; ++ ++ return SDValue(); ++} ++ ++/// Lower VECTOR_SHUFFLE into XVREPLVEI (if possible). ++/// ++/// It is an XVREPLVEI when the mask is: ++/// <x, x, x, ..., x+n, x+n, x+n, ...> ++/// where the number of x is equal to n and n is half the length of the vector. ++/// ++/// When undefs appear in the mask they are treated as if they were whatever ++/// value is necessary in order to fit the above form. ++static SDValue lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL, ++ ArrayRef<int> Mask, MVT VT, ++ SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ int SplatIndex = -1; ++ for (const auto &M : Mask) { ++ if (M != -1) { ++ SplatIndex = M; ++ break; ++ } ++ } ++ ++ if (SplatIndex == -1) ++ return DAG.getUNDEF(VT); ++ ++ const auto &Begin = Mask.begin(); ++ const auto &End = Mask.end(); ++ unsigned HalfSize = Mask.size() / 2; ++ ++ assert(SplatIndex < (int)Mask.size() && "Out of bounds mask index"); ++ if (fitsRegularPattern<int>(Begin, 1, End - HalfSize, SplatIndex, 0) && ++ fitsRegularPattern<int>(Begin + HalfSize, 1, End, SplatIndex + HalfSize, ++ 0)) { ++ APInt Imm(64, SplatIndex); ++ return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1, ++ DAG.getConstant(Imm, DL, MVT::i64)); ++ } ++ ++ return SDValue(); ++} ++ ++/// Lower VECTOR_SHUFFLE into XVSHUF4I (if possible). ++static SDValue lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ // When the mask size is less than or equal to 4, lower-cost instructions ++ // may be used. ++ if (Mask.size() <= 4) ++ return SDValue(); ++ return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG); ++} ++ ++/// Lower VECTOR_SHUFFLE into XVPACKEV (if possible).
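A quick numeric check of the XVREPLVEI shape described above, before the 256-bit pattern routines that follow: the same index repeated across the low half of the mask, and that index plus HalfSize repeated across the high half, with undefs allowed anywhere. The helper name is illustrative, not from the patch.

#include <cstddef>
#include <vector>

static bool isRepeated(const std::vector<int> &Mask, std::size_t First,
                       std::size_t Last, int Value) {
  for (std::size_t I = First; I < Last; ++I)
    if (Mask[I] != -1 && Mask[I] != Value)
      return false;
  return true;
}

int main() {
  // A v8i32 mask equivalent to <1, 1, 1, 1, 5, 5, 5, 5> up to undefs:
  // a splat of element 1 within each 128-bit half, SplatIndex = 1.
  std::vector<int> Mask = {1, 1, -1, 1, 5, 5, 5, -1};
  std::size_t Half = Mask.size() / 2;
  int SplatIndex = 1;
  bool Fits = isRepeated(Mask, 0, Half, SplatIndex) &&
              isRepeated(Mask, Half, Mask.size(), SplatIndex + int(Half));
  return Fits ? 0 : 1;
}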
++static SDValue lowerVECTOR_SHUFFLE_XVPACKEV(const SDLoc &DL, ArrayRef<int> Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ return lowerVECTOR_SHUFFLE_VPACKEV(DL, Mask, VT, V1, V2, DAG); ++} ++ ++/// Lower VECTOR_SHUFFLE into XVPACKOD (if possible). ++static SDValue lowerVECTOR_SHUFFLE_XVPACKOD(const SDLoc &DL, ArrayRef<int> Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ return lowerVECTOR_SHUFFLE_VPACKOD(DL, Mask, VT, V1, V2, DAG); ++} ++ ++/// Lower VECTOR_SHUFFLE into XVILVH (if possible). ++static SDValue lowerVECTOR_SHUFFLE_XVILVH(const SDLoc &DL, ArrayRef<int> Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ const auto &Begin = Mask.begin(); ++ const auto &End = Mask.end(); ++ unsigned HalfSize = Mask.size() / 2; ++ unsigned LeftSize = HalfSize / 2; ++ SDValue OriV1 = V1, OriV2 = V2; ++ ++ if (fitsRegularPattern<int>(Begin, 2, End - HalfSize, HalfSize - LeftSize, ++ 1) && ++ fitsRegularPattern<int>(Begin + HalfSize, 2, End, HalfSize + LeftSize, 1)) ++ V1 = OriV1; ++ else if (fitsRegularPattern<int>(Begin, 2, End - HalfSize, ++ Mask.size() + HalfSize - LeftSize, 1) && ++ fitsRegularPattern<int>(Begin + HalfSize, 2, End, ++ Mask.size() + HalfSize + LeftSize, 1)) ++ V1 = OriV2; ++ else ++ return SDValue(); ++ ++ if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize, HalfSize - LeftSize, ++ 1) && ++ fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End, HalfSize + LeftSize, ++ 1)) ++ V2 = OriV1; ++ else if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize, ++ Mask.size() + HalfSize - LeftSize, 1) && ++ fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End, ++ Mask.size() + HalfSize + LeftSize, 1)) ++ V2 = OriV2; ++ else ++ return SDValue(); ++ ++ return DAG.getNode(LoongArchISD::VILVH, DL, VT, V2, V1); ++} ++ ++/// Lower VECTOR_SHUFFLE into XVILVL (if possible). ++static SDValue lowerVECTOR_SHUFFLE_XVILVL(const SDLoc &DL, ArrayRef<int> Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ const auto &Begin = Mask.begin(); ++ const auto &End = Mask.end(); ++ unsigned HalfSize = Mask.size() / 2; ++ SDValue OriV1 = V1, OriV2 = V2; ++ ++ if (fitsRegularPattern<int>(Begin, 2, End - HalfSize, 0, 1) && ++ fitsRegularPattern<int>(Begin + HalfSize, 2, End, HalfSize, 1)) ++ V1 = OriV1; ++ else if (fitsRegularPattern<int>(Begin, 2, End - HalfSize, Mask.size(), 1) && ++ fitsRegularPattern<int>(Begin + HalfSize, 2, End, ++ Mask.size() + HalfSize, 1)) ++ V1 = OriV2; ++ else ++ return SDValue(); ++ ++ if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize, 0, 1) && ++ fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End, HalfSize, 1)) ++ V2 = OriV1; ++ else if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize, Mask.size(), ++ 1) && ++ fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End, ++ Mask.size() + HalfSize, 1)) ++ V2 = OriV2; ++ else ++ return SDValue(); ++ ++ return DAG.getNode(LoongArchISD::VILVL, DL, VT, V2, V1); ++} ++ ++/// Lower VECTOR_SHUFFLE into XVPICKEV (if possible). 
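Cross-checking the 256-bit interleave routines above: for XVILVL the even mask slots must walk V1's low half lane by lane and the odd slots must walk V2's low half (n = 8, HalfSize = 4 for v8i32). The sketch below, an illustrative restatement of fitsRegularPattern rather than the real helper, validates the XVILVL-shaped v8i32 mask used in the tests added later in this patch; the XVPICKEV routine that follows applies the same half-by-half discipline with quarter boundaries.

#include <cstddef>
#include <vector>

// True if the entries at First, First+Step, ... (below Last) count up from
// Start, ignoring undef (-1) entries.
static bool walks(const std::vector<int> &Mask, std::size_t First,
                  std::size_t Step, std::size_t Last, int Start) {
  int Expected = Start;
  for (std::size_t I = First; I < Last; I += Step, ++Expected)
    if (Mask[I] != -1 && Mask[I] != Expected)
      return false;
  return true;
}

int main() {
  // XVILVL-shaped v8i32 mask: <0, 8, 1, 9, 4, 12, 5, 13>.
  std::vector<int> M = {0, 8, 1, 9, 4, 12, 5, 13};
  bool V1Ok = walks(M, 0, 2, 4, 0) && walks(M, 4, 2, 8, 4);  // 0,1 then 4,5
  bool V2Ok = walks(M, 1, 2, 4, 8) && walks(M, 5, 2, 8, 12); // 8,9 then 12,13
  return (V1Ok && V2Ok) ? 0 : 1;
}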
++static SDValue lowerVECTOR_SHUFFLE_XVPICKEV(const SDLoc &DL, ArrayRef<int> Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ const auto &Begin = Mask.begin(); ++ const auto &LeftMid = Mask.begin() + Mask.size() / 4; ++ const auto &Mid = Mask.begin() + Mask.size() / 2; ++ const auto &RightMid = Mask.end() - Mask.size() / 4; ++ const auto &End = Mask.end(); ++ unsigned HalfSize = Mask.size() / 2; ++ SDValue OriV1 = V1, OriV2 = V2; ++ ++ if (fitsRegularPattern<int>(Begin, 1, LeftMid, 0, 2) && ++ fitsRegularPattern<int>(Mid, 1, RightMid, HalfSize, 2)) ++ V1 = OriV1; ++ else if (fitsRegularPattern<int>(Begin, 1, LeftMid, Mask.size(), 2) && ++ fitsRegularPattern<int>(Mid, 1, RightMid, Mask.size() + HalfSize, 2)) ++ V1 = OriV2; ++ else ++ return SDValue(); ++ ++ if (fitsRegularPattern<int>(LeftMid, 1, Mid, 0, 2) && ++ fitsRegularPattern<int>(RightMid, 1, End, HalfSize, 2)) ++ V2 = OriV1; ++ else if (fitsRegularPattern<int>(LeftMid, 1, Mid, Mask.size(), 2) && ++ fitsRegularPattern<int>(RightMid, 1, End, Mask.size() + HalfSize, 2)) ++ V2 = OriV2; ++ ++ else ++ return SDValue(); ++ ++ return DAG.getNode(LoongArchISD::VPICKEV, DL, VT, V2, V1); ++} ++ ++/// Lower VECTOR_SHUFFLE into XVPICKOD (if possible). ++static SDValue lowerVECTOR_SHUFFLE_XVPICKOD(const SDLoc &DL, ArrayRef<int> Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ const auto &Begin = Mask.begin(); ++ const auto &LeftMid = Mask.begin() + Mask.size() / 4; ++ const auto &Mid = Mask.begin() + Mask.size() / 2; ++ const auto &RightMid = Mask.end() - Mask.size() / 4; ++ const auto &End = Mask.end(); ++ unsigned HalfSize = Mask.size() / 2; ++ SDValue OriV1 = V1, OriV2 = V2; ++ ++ if (fitsRegularPattern<int>(Begin, 1, LeftMid, 1, 2) && ++ fitsRegularPattern<int>(Mid, 1, RightMid, HalfSize + 1, 2)) ++ V1 = OriV1; ++ else if (fitsRegularPattern<int>(Begin, 1, LeftMid, Mask.size() + 1, 2) && ++ fitsRegularPattern<int>(Mid, 1, RightMid, Mask.size() + HalfSize + 1, ++ 2)) ++ V1 = OriV2; ++ else ++ return SDValue(); ++ ++ if (fitsRegularPattern<int>(LeftMid, 1, Mid, 1, 2) && ++ fitsRegularPattern<int>(RightMid, 1, End, HalfSize + 1, 2)) ++ V2 = OriV1; ++ else if (fitsRegularPattern<int>(LeftMid, 1, Mid, Mask.size() + 1, 2) && ++ fitsRegularPattern<int>(RightMid, 1, End, Mask.size() + HalfSize + 1, ++ 2)) ++ V2 = OriV2; ++ else ++ return SDValue(); ++ ++ return DAG.getNode(LoongArchISD::VPICKOD, DL, VT, V2, V1); ++} ++ ++/// Lower VECTOR_SHUFFLE into XVSHUF (if possible). ++static SDValue lowerVECTOR_SHUFFLE_XVSHUF(const SDLoc &DL, ArrayRef<int> Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ int MaskSize = Mask.size(); ++ int HalfSize = Mask.size() / 2; ++ const auto &Begin = Mask.begin(); ++ const auto &Mid = Mask.begin() + HalfSize; ++ const auto &End = Mask.end(); ++ ++ // VECTOR_SHUFFLE concatenates the vectors: ++ // <0, 1, 2, 3, 4, 5, 6, 7> + <8, 9, 10, 11, 12, 13, 14, 15> ++ // shuffling -> ++ // <0, 1, 2, 3, 8, 9, 10, 11> <4, 5, 6, 7, 12, 13, 14, 15> ++ // ++ // XVSHUF concatenates the vectors: ++ // <a0, a1, a2, a3, b0, b1, b2, b3> + <a4, a5, a6, a7, b4, b5, b6, b7> ++ // shuffling -> ++ // <a0, a1, a2, a3, a4, a5, a6, a7> + <b0, b1, b2, b3, b4, b5, b6, b7> ++ SmallVector<SDValue, 8> MaskAlloc; ++ for (auto it = Begin; it < Mid; it++) { ++ if (*it < 0) // UNDEF ++ MaskAlloc.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); ++ else if ((*it >= 0 && *it < HalfSize) || ++ (*it >= MaskSize && *it <= MaskSize + HalfSize)) { ++ int M = *it < HalfSize ? 
*it : *it - HalfSize; ++ MaskAlloc.push_back(DAG.getTargetConstant(M, DL, MVT::i64)); ++ } else ++ return SDValue(); ++ } ++ assert((int)MaskAlloc.size() == HalfSize && "xvshuf convert failed!"); ++ ++ for (auto it = Mid; it < End; it++) { ++ if (*it < 0) // UNDEF ++ MaskAlloc.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); ++ else if ((*it >= HalfSize && *it < MaskSize) || ++ (*it >= MaskSize + HalfSize && *it < MaskSize * 2)) { ++ int M = *it < MaskSize ? *it - HalfSize : *it - MaskSize; ++ MaskAlloc.push_back(DAG.getTargetConstant(M, DL, MVT::i64)); ++ } else ++ return SDValue(); ++ } ++ assert((int)MaskAlloc.size() == MaskSize && "xvshuf convert failed!"); ++ ++ EVT MaskVecTy = VT.changeVectorElementTypeToInteger(); ++ SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, MaskAlloc); ++ return DAG.getNode(LoongArchISD::VSHUF, DL, VT, MaskVec, V2, V1); ++} ++ ++/// Shuffle vectors by lane to generate more optimized instructions. ++/// 256-bit shuffles are always considered as 2-lane 128-bit shuffles. ++/// ++/// Therefore, except for the following four cases, all other cases are ++/// regarded as cross-lane shuffles, where optimization is relatively limited. ++/// ++/// - Shuffle high, low lanes of two input vectors ++/// <0, 1, 2, 3> + <4, 5, 6, 7> --- <0, 5, 3, 6> ++/// - Shuffle low, high lanes of two input vectors ++/// <0, 1, 2, 3> + <4, 5, 6, 7> --- <3, 6, 0, 5> ++/// - Shuffle low, low lanes of two input vectors ++/// <0, 1, 2, 3> + <4, 5, 6, 7> --- <3, 6, 3, 6> ++/// - Shuffle high, high lanes of two input vectors ++/// <0, 1, 2, 3> + <4, 5, 6, 7> --- <0, 5, 0, 5> ++/// ++/// The first case is the closest to LoongArch instructions and the other ++/// cases need to be converted to it for processing. ++/// ++/// This function may modify V1, V2 and Mask. ++static void canonicalizeShuffleVectorByLane(const SDLoc &DL, ++ MutableArrayRef<int> Mask, MVT VT, ++ SDValue &V1, SDValue &V2, ++ SelectionDAG &DAG) { ++ ++ enum HalfMaskType { HighLaneTy, LowLaneTy, None }; ++ ++ int MaskSize = Mask.size(); ++ int HalfSize = Mask.size() / 2; ++ ++ HalfMaskType preMask = None, postMask = None; ++ ++ if (std::all_of(Mask.begin(), Mask.begin() + HalfSize, [&](int M) { ++ return M < 0 || (M >= 0 && M < HalfSize) || ++ (M >= MaskSize && M < MaskSize + HalfSize); ++ })) ++ preMask = HighLaneTy; ++ else if (std::all_of(Mask.begin(), Mask.begin() + HalfSize, [&](int M) { ++ return M < 0 || (M >= HalfSize && M < MaskSize) || ++ (M >= MaskSize + HalfSize && M < MaskSize * 2); ++ })) ++ preMask = LowLaneTy; ++ ++ if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) { ++ return M < 0 || (M >= 0 && M < HalfSize) || ++ (M >= MaskSize && M < MaskSize + HalfSize); ++ })) ++ postMask = HighLaneTy; ++ else if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) { ++ return M < 0 || (M >= HalfSize && M < MaskSize) || ++ (M >= MaskSize + HalfSize && M < MaskSize * 2); ++ })) ++ postMask = LowLaneTy; ++ ++ // The pre-half of mask is high lane type, and the post-half of mask ++ // is low lane type, which is closest to the LoongArch instructions. ++ // ++ // Note: In the LoongArch architecture, the high lane of mask corresponds ++ // to the lower 128-bit of vector register, and the low lane of mask ++ // corresponds to the higher 128-bit of vector register.
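The branches that follow rewrite V1 and V2 with XVPERMI before re-basing the mask. Assuming xvpermi.d treats its 8-bit immediate as four 2-bit element selectors, least-significant field first (each field picking one 64-bit source element), the three immediates used there decode as shown in this sanity-check sketch; it is not the patch's code.

#include <array>

static std::array<int, 4> decodeXvpermiD(unsigned Imm) {
  std::array<int, 4> Sel;
  for (int I = 0; I < 4; ++I)
    Sel[I] = (Imm >> (2 * I)) & 0x3; // 2-bit selector per result element
  return Sel;
}

int main() {
  // 0b01001110 -> <2, 3, 0, 1>: swap the two 128-bit halves.
  // 0b11101110 -> <2, 3, 2, 3>: broadcast the high 128-bit half.
  // 0b01000100 -> <0, 1, 0, 1>: broadcast the low 128-bit half.
  bool Ok = decodeXvpermiD(0b01001110) == std::array<int, 4>{2, 3, 0, 1} &&
            decodeXvpermiD(0b11101110) == std::array<int, 4>{2, 3, 2, 3} &&
            decodeXvpermiD(0b01000100) == std::array<int, 4>{0, 1, 0, 1};
  return Ok ? 0 : 1;
}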
++ if (preMask == HighLaneTy && postMask == LowLaneTy) { ++ return; ++ } ++ if (preMask == LowLaneTy && postMask == HighLaneTy) { ++ V1 = DAG.getBitcast(MVT::v4i64, V1); ++ V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1, ++ DAG.getConstant(0b01001110, DL, MVT::i64)); ++ V1 = DAG.getBitcast(VT, V1); ++ ++ if (!V2.isUndef()) { ++ V2 = DAG.getBitcast(MVT::v4i64, V2); ++ V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2, ++ DAG.getConstant(0b01001110, DL, MVT::i64)); ++ V2 = DAG.getBitcast(VT, V2); ++ } ++ ++ for (auto it = Mask.begin(); it < Mask.begin() + HalfSize; it++) { ++ *it = *it < 0 ? *it : *it - HalfSize; ++ } ++ for (auto it = Mask.begin() + HalfSize; it < Mask.end(); it++) { ++ *it = *it < 0 ? *it : *it + HalfSize; ++ } ++ } else if (preMask == LowLaneTy && postMask == LowLaneTy) { ++ V1 = DAG.getBitcast(MVT::v4i64, V1); ++ V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1, ++ DAG.getConstant(0b11101110, DL, MVT::i64)); ++ V1 = DAG.getBitcast(VT, V1); ++ ++ if (!V2.isUndef()) { ++ V2 = DAG.getBitcast(MVT::v4i64, V2); ++ V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2, ++ DAG.getConstant(0b11101110, DL, MVT::i64)); ++ V2 = DAG.getBitcast(VT, V2); ++ } ++ ++ for (auto it = Mask.begin(); it < Mask.begin() + HalfSize; it++) { ++ *it = *it < 0 ? *it : *it - HalfSize; ++ } ++ } else if (preMask == HighLaneTy && postMask == HighLaneTy) { ++ V1 = DAG.getBitcast(MVT::v4i64, V1); ++ V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1, ++ DAG.getConstant(0b01000100, DL, MVT::i64)); ++ V1 = DAG.getBitcast(VT, V1); ++ ++ if (!V2.isUndef()) { ++ V2 = DAG.getBitcast(MVT::v4i64, V2); ++ V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2, ++ DAG.getConstant(0b01000100, DL, MVT::i64)); ++ V2 = DAG.getBitcast(VT, V2); ++ } ++ ++ for (auto it = Mask.begin() + HalfSize; it < Mask.end(); it++) { ++ *it = *it < 0 ? *it : *it + HalfSize; ++ } ++ } else { // cross-lane ++ return; ++ } ++} ++ ++/// Dispatching routine to lower various 256-bit LoongArch vector shuffles. ++/// ++/// This routine breaks down the specific type of 256-bit shuffle and ++/// dispatches to the lowering routines accordingly. ++static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, ++ SDValue V1, SDValue V2, SelectionDAG &DAG) { ++ assert((VT.SimpleTy == MVT::v32i8 || VT.SimpleTy == MVT::v16i16 || ++ VT.SimpleTy == MVT::v8i32 || VT.SimpleTy == MVT::v4i64 || ++ VT.SimpleTy == MVT::v8f32 || VT.SimpleTy == MVT::v4f64) && ++ "Vector type is unsupported for lasx!"); ++ assert(V1.getSimpleValueType() == V2.getSimpleValueType() && ++ "Two operands have different types!"); ++ assert(VT.getVectorNumElements() == Mask.size() && ++ "Unexpected mask size for shuffle!"); ++ assert(Mask.size() % 2 == 0 && "Expected even mask size."); ++ assert(Mask.size() >= 4 && "Mask size is less than 4."); ++ ++ // canonicalize non cross-lane shuffle vector ++ SmallVector<int> NewMask(Mask); ++ canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG); ++ ++ SDValue Result; ++ // TODO: Add more comparison patterns. ++ if (V2.isUndef()) { ++ if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, NewMask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG))) ++ return Result; ++ ++ // TODO: This comment may be enabled in the future to better match the ++ // pattern for instruction selection. ++ /* V2 = V1; */ ++ } ++ ++ // It is recommended not to change the pattern comparison order for better ++ // performance. 
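To see the net effect of the "low, high" branch above, take the doc comment's example mask <3, 6, 0, 5> (MaskSize = 4, HalfSize = 2): after both sources have their 128-bit halves swapped, re-basing the two mask halves yields a "high, low" mask that selects the same elements. A standalone sketch of just the mask rewrite; the pattern chain below is then tried in the recommended order.

#include <vector>

int main() {
  std::vector<int> Mask = {3, 6, 0, 5}; // pre-half low lane, post-half high
  int Half = 2;
  for (int I = 0; I < Half; ++I)          // pre-half now reads the
    if (Mask[I] >= 0) Mask[I] -= Half;    // swapped-in low 128 bits
  for (int I = Half; I < int(Mask.size()); ++I)
    if (Mask[I] >= 0) Mask[I] += Half;    // post-half reads the high bits
  // <1, 4, 2, 7>: pre-half is now high-lane and post-half low-lane, the
  // form the per-pattern lowerings match directly.
  return Mask == std::vector<int>{1, 4, 2, 7} ? 0 : 1;
}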
++ if ((Result = lowerVECTOR_SHUFFLE_XVPACKEV(DL, NewMask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_XVPACKOD(DL, NewMask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_XVILVH(DL, NewMask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_XVILVL(DL, NewMask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_XVPICKEV(DL, NewMask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_XVPICKOD(DL, NewMask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_XVSHUF(DL, NewMask, VT, V1, V2, DAG))) ++ return Result; ++ ++ return SDValue(); ++} ++ + SDValue LoongArchTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, + SelectionDAG &DAG) const { +- // TODO: custom shuffle. ++ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); ++ ArrayRef<int> OrigMask = SVOp->getMask(); ++ SDValue V1 = Op.getOperand(0); ++ SDValue V2 = Op.getOperand(1); ++ MVT VT = Op.getSimpleValueType(); ++ int NumElements = VT.getVectorNumElements(); ++ SDLoc DL(Op); ++ ++ bool V1IsUndef = V1.isUndef(); ++ bool V2IsUndef = V2.isUndef(); ++ if (V1IsUndef && V2IsUndef) ++ return DAG.getUNDEF(VT); ++ ++ // When we create a shuffle node we put the UNDEF node as the second ++ // operand, but in some cases the first operand may be transformed to UNDEF. ++ // In this case we should just commute the node. ++ if (V1IsUndef) ++ return DAG.getCommutedVectorShuffle(*SVOp); ++ ++ // Check for non-undef masks pointing at an undef vector and make the masks ++ // undef as well. This makes it easier to match the shuffle based solely on ++ // the mask. ++ if (V2IsUndef && ++ any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) { ++ SmallVector<int, 8> NewMask(OrigMask); ++ for (int &M : NewMask) ++ if (M >= NumElements) ++ M = -1; ++ return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); ++ } ++ ++ // Check for illegal shuffle mask element index values. ++ int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2); ++ (void)MaskUpperLimit; ++ assert(llvm::all_of(OrigMask, ++ [&](int M) { return -1 <= M && M < MaskUpperLimit; }) && ++ "Out of bounds shuffle index"); ++ ++ // For each vector width, delegate to a specialized lowering routine.
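The undef-operand normalization above, in isolation: with an undef second source, any mask index pointing into V2 (>= NumElements) selects undefined data and can be canonicalized to -1, so the per-pattern checks only ever see V1 indices. A standalone sketch; the width dispatch below then picks the 128-bit or 256-bit routine.

#include <vector>

int main() {
  const int NumElements = 4;
  std::vector<int> Mask = {0, 5, 2, 7}; // V2 is undef; 5 and 7 point into V2
  for (int &M : Mask)
    if (M >= NumElements)
      M = -1; // undefined data: mark the lane itself undef
  return Mask == std::vector<int>{0, -1, 2, -1} ? 0 : 1;
}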
++ if (VT.is128BitVector()) ++ return lower128BitShuffle(DL, OrigMask, VT, V1, V2, DAG); ++ ++ if (VT.is256BitVector()) ++ return lower256BitShuffle(DL, OrigMask, VT, V1, V2, DAG); ++ + return SDValue(); + } + +@@ -3439,6 +4356,16 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { + NODE_NAME_CASE(MOVFCSR2GR) + NODE_NAME_CASE(CACOP_D) + NODE_NAME_CASE(CACOP_W) ++ NODE_NAME_CASE(VSHUF) ++ NODE_NAME_CASE(VPICKEV) ++ NODE_NAME_CASE(VPICKOD) ++ NODE_NAME_CASE(VPACKEV) ++ NODE_NAME_CASE(VPACKOD) ++ NODE_NAME_CASE(VILVL) ++ NODE_NAME_CASE(VILVH) ++ NODE_NAME_CASE(VSHUF4I) ++ NODE_NAME_CASE(VREPLVEI) ++ NODE_NAME_CASE(XVPERMI) + NODE_NAME_CASE(VPICK_SEXT_ELT) + NODE_NAME_CASE(VPICK_ZEXT_ELT) + NODE_NAME_CASE(VREPLVE) +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +index a2ed149f4bb7..a5ee740c1261 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +@@ -117,6 +117,16 @@ enum NodeType : unsigned { + + // Vector Shuffle + VREPLVE, ++ VSHUF, ++ VPICKEV, ++ VPICKOD, ++ VPACKEV, ++ VPACKOD, ++ VILVL, ++ VILVH, ++ VSHUF4I, ++ VREPLVEI, ++ XVPERMI, + + // Extended vector element extraction + VPICK_SEXT_ELT, +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index 492b62da6ce7..5b6721cdf1b4 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -10,6 +10,8 @@ + // + //===----------------------------------------------------------------------===// + ++def loongarch_xvpermi: SDNode<"LoongArchISD::XVPERMI", SDT_loongArchV1RUimm>; ++ + def lasxsplati8 + : PatFrag<(ops node:$e0), + (v32i8 (build_vector node:$e0, node:$e0, node:$e0, node:$e0, +@@ -1571,6 +1573,134 @@ def : Pat<(loongarch_vreplve v8i32:$xj, GRLenVT:$rk), + def : Pat<(loongarch_vreplve v4i64:$xj, GRLenVT:$rk), + (XVREPLVE_D v4i64:$xj, GRLenVT:$rk)>; + ++// XVSHUF_{B/H/W/D} ++def : Pat<(loongarch_vshuf v32i8:$xa, v32i8:$xj, v32i8:$xk), ++ (XVSHUF_B v32i8:$xj, v32i8:$xk, v32i8:$xa)>; ++def : Pat<(loongarch_vshuf v16i16:$xd, v16i16:$xj, v16i16:$xk), ++ (XVSHUF_H v16i16:$xd, v16i16:$xj, v16i16:$xk)>; ++def : Pat<(loongarch_vshuf v8i32:$xd, v8i32:$xj, v8i32:$xk), ++ (XVSHUF_W v8i32:$xd, v8i32:$xj, v8i32:$xk)>; ++def : Pat<(loongarch_vshuf v4i64:$xd, v4i64:$xj, v4i64:$xk), ++ (XVSHUF_D v4i64:$xd, v4i64:$xj, v4i64:$xk)>; ++def : Pat<(loongarch_vshuf v8i32:$xd, v8f32:$xj, v8f32:$xk), ++ (XVSHUF_W v8i32:$xd, v8f32:$xj, v8f32:$xk)>; ++def : Pat<(loongarch_vshuf v4i64:$xd, v4f64:$xj, v4f64:$xk), ++ (XVSHUF_D v4i64:$xd, v4f64:$xj, v4f64:$xk)>; ++ ++// XVPICKEV_{B/H/W/D} ++def : Pat<(loongarch_vpickev v32i8:$xj, v32i8:$xk), ++ (XVPICKEV_B v32i8:$xj, v32i8:$xk)>; ++def : Pat<(loongarch_vpickev v16i16:$xj, v16i16:$xk), ++ (XVPICKEV_H v16i16:$xj, v16i16:$xk)>; ++def : Pat<(loongarch_vpickev v8i32:$xj, v8i32:$xk), ++ (XVPICKEV_W v8i32:$xj, v8i32:$xk)>; ++def : Pat<(loongarch_vpickev v4i64:$xj, v4i64:$xk), ++ (XVPICKEV_D v4i64:$xj, v4i64:$xk)>; ++def : Pat<(loongarch_vpickev v8f32:$xj, v8f32:$xk), ++ (XVPICKEV_W v8f32:$xj, v8f32:$xk)>; ++def : Pat<(loongarch_vpickev v4f64:$xj, v4f64:$xk), ++ (XVPICKEV_D v4f64:$xj, v4f64:$xk)>; ++ ++// XVPICKOD_{B/H/W/D} ++def : Pat<(loongarch_vpickod v32i8:$xj, v32i8:$xk), ++ (XVPICKOD_B v32i8:$xj, v32i8:$xk)>; ++def : Pat<(loongarch_vpickod v16i16:$xj, v16i16:$xk), ++ (XVPICKOD_H v16i16:$xj, v16i16:$xk)>; ++def : 
Pat<(loongarch_vpickod v8i32:$xj, v8i32:$xk), ++ (XVPICKOD_W v8i32:$xj, v8i32:$xk)>; ++def : Pat<(loongarch_vpickod v4i64:$xj, v4i64:$xk), ++ (XVPICKOD_D v4i64:$xj, v4i64:$xk)>; ++def : Pat<(loongarch_vpickod v8f32:$xj, v8f32:$xk), ++ (XVPICKOD_W v8f32:$xj, v8f32:$xk)>; ++def : Pat<(loongarch_vpickod v4f64:$xj, v4f64:$xk), ++ (XVPICKOD_D v4f64:$xj, v4f64:$xk)>; ++ ++// XVPACKEV_{B/H/W/D} ++def : Pat<(loongarch_vpackev v32i8:$xj, v32i8:$xk), ++ (XVPACKEV_B v32i8:$xj, v32i8:$xk)>; ++def : Pat<(loongarch_vpackev v16i16:$xj, v16i16:$xk), ++ (XVPACKEV_H v16i16:$xj, v16i16:$xk)>; ++def : Pat<(loongarch_vpackev v8i32:$xj, v8i32:$xk), ++ (XVPACKEV_W v8i32:$xj, v8i32:$xk)>; ++def : Pat<(loongarch_vpackev v4i64:$xj, v4i64:$xk), ++ (XVPACKEV_D v4i64:$xj, v4i64:$xk)>; ++def : Pat<(loongarch_vpackev v8f32:$xj, v8f32:$xk), ++ (XVPACKEV_W v8f32:$xj, v8f32:$xk)>; ++def : Pat<(loongarch_vpackev v4f64:$xj, v4f64:$xk), ++ (XVPACKEV_D v4f64:$xj, v4f64:$xk)>; ++ ++// XVPACKOD_{B/H/W/D} ++def : Pat<(loongarch_vpackod v32i8:$xj, v32i8:$xk), ++ (XVPACKOD_B v32i8:$xj, v32i8:$xk)>; ++def : Pat<(loongarch_vpackod v16i16:$xj, v16i16:$xk), ++ (XVPACKOD_H v16i16:$xj, v16i16:$xk)>; ++def : Pat<(loongarch_vpackod v8i32:$xj, v8i32:$xk), ++ (XVPACKOD_W v8i32:$xj, v8i32:$xk)>; ++def : Pat<(loongarch_vpackod v4i64:$xj, v4i64:$xk), ++ (XVPACKOD_D v4i64:$xj, v4i64:$xk)>; ++def : Pat<(loongarch_vpackod v8f32:$xj, v8f32:$xk), ++ (XVPACKOD_W v8f32:$xj, v8f32:$xk)>; ++def : Pat<(loongarch_vpackod v4f64:$xj, v4f64:$xk), ++ (XVPACKOD_D v4f64:$xj, v4f64:$xk)>; ++ ++// XVILVL_{B/H/W/D} ++def : Pat<(loongarch_vilvl v32i8:$xj, v32i8:$xk), ++ (XVILVL_B v32i8:$xj, v32i8:$xk)>; ++def : Pat<(loongarch_vilvl v16i16:$xj, v16i16:$xk), ++ (XVILVL_H v16i16:$xj, v16i16:$xk)>; ++def : Pat<(loongarch_vilvl v8i32:$xj, v8i32:$xk), ++ (XVILVL_W v8i32:$xj, v8i32:$xk)>; ++def : Pat<(loongarch_vilvl v4i64:$xj, v4i64:$xk), ++ (XVILVL_D v4i64:$xj, v4i64:$xk)>; ++def : Pat<(loongarch_vilvl v8f32:$xj, v8f32:$xk), ++ (XVILVL_W v8f32:$xj, v8f32:$xk)>; ++def : Pat<(loongarch_vilvl v4f64:$xj, v4f64:$xk), ++ (XVILVL_D v4f64:$xj, v4f64:$xk)>; ++ ++// XVILVH_{B/H/W/D} ++def : Pat<(loongarch_vilvh v32i8:$xj, v32i8:$xk), ++ (XVILVH_B v32i8:$xj, v32i8:$xk)>; ++def : Pat<(loongarch_vilvh v16i16:$xj, v16i16:$xk), ++ (XVILVH_H v16i16:$xj, v16i16:$xk)>; ++def : Pat<(loongarch_vilvh v8i32:$xj, v8i32:$xk), ++ (XVILVH_W v8i32:$xj, v8i32:$xk)>; ++def : Pat<(loongarch_vilvh v4i64:$xj, v4i64:$xk), ++ (XVILVH_D v4i64:$xj, v4i64:$xk)>; ++def : Pat<(loongarch_vilvh v8f32:$xj, v8f32:$xk), ++ (XVILVH_W v8f32:$xj, v8f32:$xk)>; ++def : Pat<(loongarch_vilvh v4f64:$xj, v4f64:$xk), ++ (XVILVH_D v4f64:$xj, v4f64:$xk)>; ++ ++// XVSHUF4I_{B/H/W} ++def : Pat<(loongarch_vshuf4i v32i8:$xj, immZExt8:$ui8), ++ (XVSHUF4I_B v32i8:$xj, immZExt8:$ui8)>; ++def : Pat<(loongarch_vshuf4i v16i16:$xj, immZExt8:$ui8), ++ (XVSHUF4I_H v16i16:$xj, immZExt8:$ui8)>; ++def : Pat<(loongarch_vshuf4i v8i32:$xj, immZExt8:$ui8), ++ (XVSHUF4I_W v8i32:$xj, immZExt8:$ui8)>; ++def : Pat<(loongarch_vshuf4i v8f32:$xj, immZExt8:$ui8), ++ (XVSHUF4I_W v8f32:$xj, immZExt8:$ui8)>; ++ ++// XVREPL128VEI_{B/H/W/D} ++def : Pat<(loongarch_vreplvei v32i8:$xj, immZExt4:$ui4), ++ (XVREPL128VEI_B v32i8:$xj, immZExt4:$ui4)>; ++def : Pat<(loongarch_vreplvei v16i16:$xj, immZExt3:$ui3), ++ (XVREPL128VEI_H v16i16:$xj, immZExt3:$ui3)>; ++def : Pat<(loongarch_vreplvei v8i32:$xj, immZExt2:$ui2), ++ (XVREPL128VEI_W v8i32:$xj, immZExt2:$ui2)>; ++def : Pat<(loongarch_vreplvei v4i64:$xj, immZExt1:$ui1), ++ (XVREPL128VEI_D v4i64:$xj, 
immZExt1:$ui1)>; ++def : Pat<(loongarch_vreplvei v8f32:$xj, immZExt2:$ui2), ++ (XVREPL128VEI_W v8f32:$xj, immZExt2:$ui2)>; ++def : Pat<(loongarch_vreplvei v4f64:$xj, immZExt1:$ui1), ++ (XVREPL128VEI_D v4f64:$xj, immZExt1:$ui1)>; ++ ++// XVPERMI_D ++def : Pat<(loongarch_xvpermi v4i64:$xj, immZExt8: $ui8), ++ (XVPERMI_D v4i64:$xj, immZExt8: $ui8)>; ++def : Pat<(loongarch_xvpermi v4f64:$xj, immZExt8: $ui8), ++ (XVPERMI_D v4f64:$xj, immZExt8: $ui8)>; ++ + // XVREPLVE0_{W/D} + def : Pat<(lasxsplatf32 FPR32:$fj), + (XVREPLVE0_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32))>; +diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index 99ac2f3c162f..3519fa3142c3 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -15,6 +15,15 @@ def SDT_LoongArchVreplve : SDTypeProfile<1, 2, SDTCisInt<0>, SDTCisVec<0>, + SDTCisSameAs<0, 1>, SDTCisInt<2>>; + def SDT_LoongArchVecCond : SDTypeProfile<1, 1, SDTCisInt<0>, SDTCisVec<1>>; + ++def SDT_LoongArchVShuf : SDTypeProfile<1, 3, SDTCisVec<0>, ++ SDTCisInt<1>, SDTCisVec<1>, ++ SDTCisSameAs<0, 2>, ++ SDTCisSameAs<2, 3>>; ++def SDT_LoongArchV2R : SDTypeProfile<1, 2, SDTCisVec<0>, ++ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>>; ++def SDT_loongArchV1RUimm: SDTypeProfile<1, 2, SDTCisVec<0>, ++ SDTCisSameAs<0,1>, SDTCisVT<2, i64>>; ++ + // Target nodes. + def loongarch_vreplve : SDNode<"LoongArchISD::VREPLVE", SDT_LoongArchVreplve>; + def loongarch_vall_nonzero : SDNode<"LoongArchISD::VALL_NONZERO", +@@ -31,6 +40,23 @@ def loongarch_vpick_sext_elt : SDNode<"LoongArchISD::VPICK_SEXT_ELT", + def loongarch_vpick_zext_elt : SDNode<"LoongArchISD::VPICK_ZEXT_ELT", + SDTypeProfile<1, 3, SDTCisPtrTy<2>>>; + ++def loongarch_vshuf: SDNode<"LoongArchISD::VSHUF", SDT_LoongArchVShuf>; ++def loongarch_vpickev: SDNode<"LoongArchISD::VPICKEV", SDT_LoongArchV2R>; ++def loongarch_vpickod: SDNode<"LoongArchISD::VPICKOD", SDT_LoongArchV2R>; ++def loongarch_vpackev: SDNode<"LoongArchISD::VPACKEV", SDT_LoongArchV2R>; ++def loongarch_vpackod: SDNode<"LoongArchISD::VPACKOD", SDT_LoongArchV2R>; ++def loongarch_vilvl: SDNode<"LoongArchISD::VILVL", SDT_LoongArchV2R>; ++def loongarch_vilvh: SDNode<"LoongArchISD::VILVH", SDT_LoongArchV2R>; ++ ++def loongarch_vshuf4i: SDNode<"LoongArchISD::VSHUF4I", SDT_loongArchV1RUimm>; ++def loongarch_vreplvei: SDNode<"LoongArchISD::VREPLVEI", SDT_loongArchV1RUimm>; ++ ++def immZExt1 : ImmLeaf<i64, {return isUInt<1>(Imm);}>; ++def immZExt2 : ImmLeaf<i64, {return isUInt<2>(Imm);}>; ++def immZExt3 : ImmLeaf<i64, {return isUInt<3>(Imm);}>; ++def immZExt4 : ImmLeaf<i64, {return isUInt<4>(Imm);}>; ++def immZExt8 : ImmLeaf<i64, {return isUInt<8>(Imm);}>; ++ + class VecCond<SDPatternOperator OpNode, ValueType TyNode, + RegisterClass RC = LSX128> + : Pseudo<(outs GPR:$rd), (ins RC:$vj), +@@ -1678,6 +1704,128 @@ def : Pat<(loongarch_vreplve v4i32:$vj, GRLenVT:$rk), + def : Pat<(loongarch_vreplve v2i64:$vj, GRLenVT:$rk), + (VREPLVE_D v2i64:$vj, GRLenVT:$rk)>; + ++// VSHUF_{B/H/W/D} ++def : Pat<(loongarch_vshuf v16i8:$va, v16i8:$vj, v16i8:$vk), ++ (VSHUF_B v16i8:$vj, v16i8:$vk, v16i8:$va)>; ++def : Pat<(loongarch_vshuf v8i16:$vd, v8i16:$vj, v8i16:$vk), ++ (VSHUF_H v8i16:$vd, v8i16:$vj, v8i16:$vk)>; ++def : Pat<(loongarch_vshuf v4i32:$vd, v4i32:$vj, v4i32:$vk), ++ (VSHUF_W v4i32:$vd, v4i32:$vj, v4i32:$vk)>; ++def : Pat<(loongarch_vshuf v2i64:$vd, v2i64:$vj, v2i64:$vk), ++ (VSHUF_D v2i64:$vd, v2i64:$vj, v2i64:$vk)>; ++def : Pat<(loongarch_vshuf 
v4i32:$vd, v4f32:$vj, v4f32:$vk), ++ (VSHUF_W v4i32:$vd, v4f32:$vj, v4f32:$vk)>; ++def : Pat<(loongarch_vshuf v2i64:$vd, v2f64:$vj, v2f64:$vk), ++ (VSHUF_D v2i64:$vd, v2f64:$vj, v2f64:$vk)>; ++ ++// VPICKEV_{B/H/W/D} ++def : Pat<(loongarch_vpickev v16i8:$vj, v16i8:$vk), ++ (VPICKEV_B v16i8:$vj, v16i8:$vk)>; ++def : Pat<(loongarch_vpickev v8i16:$vj, v8i16:$vk), ++ (VPICKEV_H v8i16:$vj, v8i16:$vk)>; ++def : Pat<(loongarch_vpickev v4i32:$vj, v4i32:$vk), ++ (VPICKEV_W v4i32:$vj, v4i32:$vk)>; ++def : Pat<(loongarch_vpickev v2i64:$vj, v2i64:$vk), ++ (VPICKEV_D v2i64:$vj, v2i64:$vk)>; ++def : Pat<(loongarch_vpickev v4f32:$vj, v4f32:$vk), ++ (VPICKEV_W v4f32:$vj, v4f32:$vk)>; ++def : Pat<(loongarch_vpickev v2f64:$vj, v2f64:$vk), ++ (VPICKEV_D v2f64:$vj, v2f64:$vk)>; ++ ++// VPICKOD_{B/H/W/D} ++def : Pat<(loongarch_vpickod v16i8:$vj, v16i8:$vk), ++ (VPICKOD_B v16i8:$vj, v16i8:$vk)>; ++def : Pat<(loongarch_vpickod v8i16:$vj, v8i16:$vk), ++ (VPICKOD_H v8i16:$vj, v8i16:$vk)>; ++def : Pat<(loongarch_vpickod v4i32:$vj, v4i32:$vk), ++ (VPICKOD_W v4i32:$vj, v4i32:$vk)>; ++def : Pat<(loongarch_vpickod v2i64:$vj, v2i64:$vk), ++ (VPICKOD_D v2i64:$vj, v2i64:$vk)>; ++def : Pat<(loongarch_vpickod v4f32:$vj, v4f32:$vk), ++ (VPICKOD_W v4f32:$vj, v4f32:$vk)>; ++def : Pat<(loongarch_vpickod v2f64:$vj, v2f64:$vk), ++ (VPICKOD_D v2f64:$vj, v2f64:$vk)>; ++ ++// VPACKEV_{B/H/W/D} ++def : Pat<(loongarch_vpackev v16i8:$vj, v16i8:$vk), ++ (VPACKEV_B v16i8:$vj, v16i8:$vk)>; ++def : Pat<(loongarch_vpackev v8i16:$vj, v8i16:$vk), ++ (VPACKEV_H v8i16:$vj, v8i16:$vk)>; ++def : Pat<(loongarch_vpackev v4i32:$vj, v4i32:$vk), ++ (VPACKEV_W v4i32:$vj, v4i32:$vk)>; ++def : Pat<(loongarch_vpackev v2i64:$vj, v2i64:$vk), ++ (VPACKEV_D v2i64:$vj, v2i64:$vk)>; ++def : Pat<(loongarch_vpackev v4f32:$vj, v4f32:$vk), ++ (VPACKEV_W v4f32:$vj, v4f32:$vk)>; ++def : Pat<(loongarch_vpackev v2f64:$vj, v2f64:$vk), ++ (VPACKEV_D v2f64:$vj, v2f64:$vk)>; ++ ++// VPACKOD_{B/H/W/D} ++def : Pat<(loongarch_vpackod v16i8:$vj, v16i8:$vk), ++ (VPACKOD_B v16i8:$vj, v16i8:$vk)>; ++def : Pat<(loongarch_vpackod v8i16:$vj, v8i16:$vk), ++ (VPACKOD_H v8i16:$vj, v8i16:$vk)>; ++def : Pat<(loongarch_vpackod v4i32:$vj, v4i32:$vk), ++ (VPACKOD_W v4i32:$vj, v4i32:$vk)>; ++def : Pat<(loongarch_vpackod v2i64:$vj, v2i64:$vk), ++ (VPACKOD_D v2i64:$vj, v2i64:$vk)>; ++def : Pat<(loongarch_vpackod v4f32:$vj, v4f32:$vk), ++ (VPACKOD_W v4f32:$vj, v4f32:$vk)>; ++def : Pat<(loongarch_vpackod v2f64:$vj, v2f64:$vk), ++ (VPACKOD_D v2f64:$vj, v2f64:$vk)>; ++ ++// VILVL_{B/H/W/D} ++def : Pat<(loongarch_vilvl v16i8:$vj, v16i8:$vk), ++ (VILVL_B v16i8:$vj, v16i8:$vk)>; ++def : Pat<(loongarch_vilvl v8i16:$vj, v8i16:$vk), ++ (VILVL_H v8i16:$vj, v8i16:$vk)>; ++def : Pat<(loongarch_vilvl v4i32:$vj, v4i32:$vk), ++ (VILVL_W v4i32:$vj, v4i32:$vk)>; ++def : Pat<(loongarch_vilvl v2i64:$vj, v2i64:$vk), ++ (VILVL_D v2i64:$vj, v2i64:$vk)>; ++def : Pat<(loongarch_vilvl v4f32:$vj, v4f32:$vk), ++ (VILVL_W v4f32:$vj, v4f32:$vk)>; ++def : Pat<(loongarch_vilvl v2f64:$vj, v2f64:$vk), ++ (VILVL_D v2f64:$vj, v2f64:$vk)>; ++ ++// VILVH_{B/H/W/D} ++def : Pat<(loongarch_vilvh v16i8:$vj, v16i8:$vk), ++ (VILVH_B v16i8:$vj, v16i8:$vk)>; ++def : Pat<(loongarch_vilvh v8i16:$vj, v8i16:$vk), ++ (VILVH_H v8i16:$vj, v8i16:$vk)>; ++def : Pat<(loongarch_vilvh v4i32:$vj, v4i32:$vk), ++ (VILVH_W v4i32:$vj, v4i32:$vk)>; ++def : Pat<(loongarch_vilvh v2i64:$vj, v2i64:$vk), ++ (VILVH_D v2i64:$vj, v2i64:$vk)>; ++def : Pat<(loongarch_vilvh v4f32:$vj, v4f32:$vk), ++ (VILVH_W v4f32:$vj, v4f32:$vk)>; ++def : Pat<(loongarch_vilvh 
v2f64:$vj, v2f64:$vk), ++ (VILVH_D v2f64:$vj, v2f64:$vk)>; ++ ++// VSHUF4I_{B/H/W} ++def : Pat<(loongarch_vshuf4i v16i8:$vj, immZExt8:$ui8), ++ (VSHUF4I_B v16i8:$vj, immZExt8:$ui8)>; ++def : Pat<(loongarch_vshuf4i v8i16:$vj, immZExt8:$ui8), ++ (VSHUF4I_H v8i16:$vj, immZExt8:$ui8)>; ++def : Pat<(loongarch_vshuf4i v4i32:$vj, immZExt8:$ui8), ++ (VSHUF4I_W v4i32:$vj, immZExt8:$ui8)>; ++def : Pat<(loongarch_vshuf4i v4f32:$vj, immZExt8:$ui8), ++ (VSHUF4I_W v4f32:$vj, immZExt8:$ui8)>; ++ ++// VREPLVEI_{B/H/W/D} ++def : Pat<(loongarch_vreplvei v16i8:$vj, immZExt4:$ui4), ++ (VREPLVEI_B v16i8:$vj, immZExt4:$ui4)>; ++def : Pat<(loongarch_vreplvei v8i16:$vj, immZExt3:$ui3), ++ (VREPLVEI_H v8i16:$vj, immZExt3:$ui3)>; ++def : Pat<(loongarch_vreplvei v4i32:$vj, immZExt2:$ui2), ++ (VREPLVEI_W v4i32:$vj, immZExt2:$ui2)>; ++def : Pat<(loongarch_vreplvei v2i64:$vj, immZExt1:$ui1), ++ (VREPLVEI_D v2i64:$vj, immZExt1:$ui1)>; ++def : Pat<(loongarch_vreplvei v4f32:$vj, immZExt2:$ui2), ++ (VREPLVEI_W v4f32:$vj, immZExt2:$ui2)>; ++def : Pat<(loongarch_vreplvei v2f64:$vj, immZExt1:$ui1), ++ (VREPLVEI_D v2f64:$vj, immZExt1:$ui1)>; ++ + // VREPLVEI_{W/D} + def : Pat<(lsxsplatf32 FPR32:$fj), + (VREPLVEI_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), 0)>; +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll +new file mode 100644 +index 000000000000..22ab19b9fa44 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll +@@ -0,0 +1,74 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s ++ ++;; xvilvl.b ++define <32 x i8> @shufflevector_xvilvl_v32i8(<32 x i8> %a, <32 x i8> %b) { ++; CHECK-LABEL: shufflevector_xvilvl_v32i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvilvl.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, ++ i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55> ++ ret <32 x i8> %c ++} ++ ++;; xvilvl.h ++define <16 x i16> @shufflevector_xvilvl_v16i16(<16 x i16> %a, <16 x i16> %b) { ++; CHECK-LABEL: shufflevector_xvilvl_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvilvl.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27> ++ ret <16 x i16> %c ++} ++ ++;; xvilvl.w ++define <8 x i32> @shufflevector_xvilvl_v8i32(<8 x i32> %a, <8 x i32> %b) { ++; CHECK-LABEL: shufflevector_xvilvl_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvilvl.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> ++ ret <8 x i32> %c ++} ++ ++;; xvilvh.b ++define <32 x i8> @shufflevector_xvilvh_v32i8(<32 x i8> %a, <32 x i8> %b) { ++; CHECK-LABEL: shufflevector_xvilvh_v32i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvilvh.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, ++ i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, 
i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> ++ ret <32 x i8> %c ++} ++ ++;; xvilvh.h ++define <16 x i16> @shufflevector_xvilvh_v16i16(<16 x i16> %a, <16 x i16> %b) { ++; CHECK-LABEL: shufflevector_xvilvh_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvilvh.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> ++ ret <16 x i16> %c ++} ++ ++;; xvilvh.w ++define <8 x i32> @shufflevector_xvilvh_v8i32(<8 x i32> %a, <8 x i32> %b) { ++; CHECK-LABEL: shufflevector_xvilvh_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvilvh.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> ++ ret <8 x i32> %c ++} ++ ++;; xvilvh.w ++define <8 x float> @shufflevector_xvilvh_v8f32(<8 x float> %a, <8 x float> %b) { ++; CHECK-LABEL: shufflevector_xvilvh_v8f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvilvh.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> ++ ret <8 x float> %c ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll +new file mode 100644 +index 000000000000..2ff9af4069b9 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll +@@ -0,0 +1,124 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s ++ ++;; xvpackev.b ++define <32 x i8> @shufflevector_pack_ev_v32i8(<32 x i8> %a, <32 x i8> %b) { ++; CHECK-LABEL: shufflevector_pack_ev_v32i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackev.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, ++ i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62> ++ ret <32 x i8> %c ++} ++ ++;; xvpackev.h ++define <16 x i16> @shufflevector_pack_ev_v16i16(<16 x i16> %a, <16 x i16> %b) { ++; CHECK-LABEL: shufflevector_pack_ev_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackev.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30> ++ ret <16 x i16> %c ++} ++ ++;; xvpackev.w ++define <8 x i32> @shufflevector_pack_ev_v8i32(<8 x i32> %a, <8 x i32> %b) { ++; CHECK-LABEL: shufflevector_pack_ev_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackev.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> ++ ret <8 x i32> %c ++} ++ ++;; xvpickev.d/xvpackev.d/xvilvl.d ++define <4 x i64> @shufflevector_pack_ev_v4i64(<4 x i64> %a, <4 x i64> %b) { ++; CHECK-LABEL: shufflevector_pack_ev_v4i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackev.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> ++ ret <4 x i64> %c ++} ++ ++;; xvpackev.w ++define <8 
x float> @shufflevector_pack_ev_v8f32(<8 x float> %a, <8 x float> %b) { ++; CHECK-LABEL: shufflevector_pack_ev_v8f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackev.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> ++ ret <8 x float> %c ++} ++ ++;; xvpickev.d/xvpackev.d/xvilvl.d ++define <4 x double> @shufflevector_pack_ev_v4f64(<4 x double> %a, <4 x double> %b) { ++; CHECK-LABEL: shufflevector_pack_ev_v4f64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackev.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> ++ ret <4 x double> %c ++} ++ ++;; xvpackod.b ++define <32 x i8> @shufflevector_pack_od_v32i8(<32 x i8> %a, <32 x i8> %b) { ++; CHECK-LABEL: shufflevector_pack_od_v32i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackod.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 33, i32 3, i32 35, i32 5, i32 37, i32 7, i32 39, i32 9, i32 41, i32 11, i32 43, i32 13, i32 45, i32 15, i32 47, ++ i32 17, i32 49, i32 19, i32 51, i32 21, i32 53, i32 23, i32 55, i32 25, i32 57, i32 27, i32 59, i32 29, i32 61, i32 31, i32 63> ++ ret <32 x i8> %c ++} ++ ++;; xvpackod.h ++define <16 x i16> @shufflevector_pack_od_v16i16(<16 x i16> %a, <16 x i16> %b) { ++; CHECK-LABEL: shufflevector_pack_od_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackod.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31> ++ ret <16 x i16> %c ++} ++ ++;; xvpackod.w ++define <8 x i32> @shufflevector_pack_od_v8i32(<8 x i32> %a, <8 x i32> %b) { ++; CHECK-LABEL: shufflevector_pack_od_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackod.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ++ ret <8 x i32> %c ++} ++ ++;; xvpickod.d/xvpackod.d/xvilvh.d ++define <4 x i64> @shufflodector_pack_od_v4i64(<4 x i64> %a, <4 x i64> %b) { ++; CHECK-LABEL: shufflodector_pack_od_v4i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackod.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ++ ret <4 x i64> %c ++} ++ ++;; xvpackod.w ++define <8 x float> @shufflodector_pack_od_v8f32(<8 x float> %a, <8 x float> %b) { ++; CHECK-LABEL: shufflodector_pack_od_v8f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackod.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ++ ret <8 x float> %c ++} ++ ++;; xvpickod.d/xvpackod.d/xvilvh.d ++define <4 x double> @shufflodector_pack_od_v4f64(<4 x double> %a, <4 x double> %b) { ++; CHECK-LABEL: shufflodector_pack_od_v4f64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackod.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ++ ret <4 x double> %c ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll +new file mode 100644 +index 000000000000..294d292d1764 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll +@@ -0,0 +1,84 @@ 
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s ++ ++;; xvpickev.b ++define <32 x i8> @shufflevector_pick_ev_v32i8(<32 x i8> %a, <32 x i8> %b) { ++; CHECK-LABEL: shufflevector_pick_ev_v32i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpickev.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, ++ i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62> ++ ret <32 x i8> %c ++} ++ ++;; xvpickev.h ++define <16 x i16> @shufflevector_pick_ev_v16i16(<16 x i16> %a, <16 x i16> %b) { ++; CHECK-LABEL: shufflevector_pick_ev_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpickev.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30> ++ ret <16 x i16> %c ++} ++ ++;; xvpickev.w ++define <8 x i32> @shufflevector_pick_ev_v8i32(<8 x i32> %a, <8 x i32> %b) { ++; CHECK-LABEL: shufflevector_pick_ev_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpickev.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14> ++ ret <8 x i32> %c ++} ++ ++;; xvpickev.w ++define <8 x float> @shufflevector_pick_ev_v8f32(<8 x float> %a, <8 x float> %b) { ++; CHECK-LABEL: shufflevector_pick_ev_v8f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpickev.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14> ++ ret <8 x float> %c ++} ++ ++;; xvpickod.b ++define <32 x i8> @shufflevector_pick_od_v32i8(<32 x i8> %a, <32 x i8> %b) { ++; CHECK-LABEL: shufflevector_pick_od_v32i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpickod.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, ++ i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63> ++ ret <32 x i8> %c ++} ++ ++;; xvpickod.h ++define <16 x i16> @shufflevector_pick_od_v16i16(<16 x i16> %a, <16 x i16> %b) { ++; CHECK-LABEL: shufflevector_pick_od_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpickod.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31> ++ ret <16 x i16> %c ++} ++ ++;; xvpickod.w ++define <8 x i32> @shufflevector_pick_od_v8i32(<8 x i32> %a, <8 x i32> %b) { ++; CHECK-LABEL: shufflevector_pick_od_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpickod.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15> ++ ret <8 x i32> %c ++} ++ ++;; xvpickod.w ++define <8 x float> @shufflodector_pick_od_v8f32(<8 x float> %a, <8 x float> %b) { ++; CHECK-LABEL: shufflodector_pick_od_v8f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpickod.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret 
++ %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15> ++ ret <8 x float> %c ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll +new file mode 100644 +index 000000000000..dce1e4b777e2 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll +@@ -0,0 +1,65 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s ++ ++;; xvrepl128vei.b ++define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b) { ++; CHECK-LABEL: shufflevector_v32i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvrepl128vei.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++ %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, ++ i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17> ++ ret <32 x i8> %c ++} ++ ++;; xvrepl128vei.h ++define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b) { ++; CHECK-LABEL: shufflevector_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvrepl128vei.h $xr0, $xr0, 3 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, ++ i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11> ++ ret <16 x i16> %c ++} ++ ++;; xvrepl128vei.w ++define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) { ++; CHECK-LABEL: shufflevector_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpermi.d $xr0, $xr0, 78 ++; CHECK-NEXT: xvrepl128vei.w $xr0, $xr0, 3 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 3, i32 3, i32 3, i32 3> ++ ret <8 x i32> %c ++} ++ ++;; xvrepl128vei.d ++define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) { ++; CHECK-LABEL: shufflevector_v4i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvrepl128vei.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3> ++ ret <4 x i64> %c ++} ++ ++;; xvrepl128vei.w ++define <8 x float> @shufflevector_v8f32(<8 x float> %a, <8 x float> %b) { ++; CHECK-LABEL: shufflevector_v8f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvrepl128vei.w $xr0, $xr0, 3 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7> ++ ret <8 x float> %c ++} ++ ++;; xvrepl128vei.d ++define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) { ++; CHECK-LABEL: shufflevector_v4f64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvrepl128vei.d $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 5, i32 7, i32 7> ++ ret <4 x double> %c ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll +new file mode 100644 +index 000000000000..fce32647da3d +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll +@@ -0,0 +1,76 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s 
-o - | FileCheck %s ++ ++;; xvshuf.b ++define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b) { ++; CHECK-LABEL: shufflevector_v32i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI0_0) ++; CHECK-NEXT: xvld $xr2, $a0, 0 ++; CHECK-NEXT: xvshuf.b $xr0, $xr1, $xr0, $xr2 ++; CHECK-NEXT: ret ++ %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, ++ i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55> ++ ret <32 x i8> %c ++} ++ ++;; xvshuf.h ++define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b) { ++; CHECK-LABEL: shufflevector_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78 ++; CHECK-NEXT: xvpermi.d $xr1, $xr1, 78 ++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI1_0) ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvshuf.h $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 27, i32 26, i32 25, i32 24, ++ i32 16, i32 17, i32 18, i32 19, i32 0, i32 1, i32 2, i32 3> ++ ret <16 x i16> %c ++} ++ ++;; xvshuf.w ++define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) { ++; CHECK-LABEL: shufflevector_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpermi.d $xr2, $xr0, 68 ++; CHECK-NEXT: xvpermi.d $xr1, $xr1, 68 ++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_0) ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvshuf.w $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 9, i32 3, i32 2, i32 8, i32 9, i32 3, i32 2> ++ ret <8 x i32> %c ++} ++ ++;; xvshuf.d ++define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) { ++; CHECK-LABEL: shufflevector_v4i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpermi.d $xr2, $xr0, 238 ++; CHECK-NEXT: xvpermi.d $xr1, $xr1, 238 ++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_0) ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvshuf.d $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ++ ret <4 x i64> %c ++} ++ ++;; xvshuf.w ++define <8 x float> @shufflevector_v8f32(<8 x float> %a, <8 x float> %b) { ++; CHECK-LABEL: shufflevector_v8f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI4_0) ++; CHECK-NEXT: xvld $xr2, $a0, 0 ++; CHECK-NEXT: xvshuf.w $xr2, $xr1, $xr0 ++; CHECK-NEXT: xvori.b $xr0, $xr2, 0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 0, i32 10, i32 9, i32 4, i32 5, i32 12, i32 13> ++ ret <8 x float> %c ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll +new file mode 100644 +index 000000000000..dc4532a7292a +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll +@@ -0,0 +1,43 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s ++ ++;; xxvshuf4i.b ++define <32 x i8> 
@shufflevector_xvshuf4i_v32i8(<32 x i8> %a, <32 x i8> %b) {
++; CHECK-LABEL: shufflevector_xvshuf4i_v32i8:
++; CHECK: # %bb.0:
++; CHECK-NEXT: xvshuf4i.b $xr0, $xr0, 27
++; CHECK-NEXT: ret
++  %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12,
++                                                             i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 26, i32 25, i32 24, i32 31, i32 30, i32 29, i32 28>
++  ret <32 x i8> %c
++}
++
++;; xvshuf4i.h
++define <16 x i16> @shufflevector_xvshuf4i_v16i16(<16 x i16> %a, <16 x i16> %b) {
++; CHECK-LABEL: shufflevector_xvshuf4i_v16i16:
++; CHECK: # %bb.0:
++; CHECK-NEXT: xvshuf4i.h $xr0, $xr0, 27
++; CHECK-NEXT: ret
++  %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
++  ret <16 x i16> %c
++}
++
++;; xvshuf4i.w
++define <8 x i32> @shufflevector_xvshuf4i_v8i32(<8 x i32> %a, <8 x i32> %b) {
++; CHECK-LABEL: shufflevector_xvshuf4i_v8i32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: xvshuf4i.w $xr0, $xr0, 27
++; CHECK-NEXT: ret
++  %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
++  ret <8 x i32> %c
++}
++
++;; xvshuf4i.w
++define <8 x float> @shufflevector_xvshuf4i_v8f32(<8 x float> %a, <8 x float> %b) {
++; CHECK-LABEL: shufflevector_xvshuf4i_v8f32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: xvshuf4i.w $xr0, $xr0, 27
++; CHECK-NEXT: ret
++  %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
++  ret <8 x float> %c
++}
+diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll
+new file mode 100644
+index 000000000000..31398c6081c0
+--- /dev/null
++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll
+@@ -0,0 +1,82 @@
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
++; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
++
++;; vilvl.b
++define <16 x i8> @shufflevector_vilvl_v16i8(<16 x i8> %a, <16 x i8> %b) {
++; CHECK-LABEL: shufflevector_vilvl_v16i8:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vilvl.b $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
++  ret <16 x i8> %c
++}
++
++;; vilvl.h
++define <8 x i16> @shufflevector_vilvl_v8i16(<8 x i16> %a, <8 x i16> %b) {
++; CHECK-LABEL: shufflevector_vilvl_v8i16:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vilvl.h $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
++  ret <8 x i16> %c
++}
++
++;; vilvl.w
++define <4 x i32> @shufflevector_vilvl_v4i32(<4 x i32> %a, <4 x i32> %b) {
++; CHECK-LABEL: shufflevector_vilvl_v4i32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vilvl.w $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
++  ret <4 x i32> %c
++}
++
++;; vilvl.w
++define <4 x float> @shufflevector_vilvl_v4f32(<4 x float> %a, <4 x float> %b) {
++; CHECK-LABEL: shufflevector_vilvl_v4f32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vilvl.w $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
++  ret <4 x float> %c
++}
++
++;; vilvh.b
++define <16 x i8> @shufflevector_vilvh_v16i8(<16 x i8> %a, <16 x i8> %b) {
++; CHECK-LABEL: shufflevector_vilvh_v16i8:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vilvh.b $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
++  ret <16 x i8> %c
++}
++
++;; vilvh.h
++define <8 x i16> @shufflevector_vilvh_v8i16(<8 x i16> %a, <8 x i16> %b) {
++; CHECK-LABEL: shufflevector_vilvh_v8i16:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vilvh.h $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
++  ret <8 x i16> %c
++}
++
++;; vilvh.w
++define <4 x i32> @shufflevector_vilvh_v4i32(<4 x i32> %a, <4 x i32> %b) {
++; CHECK-LABEL: shufflevector_vilvh_v4i32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vilvh.w $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
++  ret <4 x i32> %c
++}
++
++;; vilvh.w
++define <4 x float> @shufflevector_vilvh_v4f32(<4 x float> %a, <4 x float> %b) {
++; CHECK-LABEL: shufflevector_vilvh_v4f32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vilvh.w $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
++  ret <4 x float> %c
++}
+diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll
+new file mode 100644
+index 000000000000..171e68306cd1
+--- /dev/null
++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll
+@@ -0,0 +1,122 @@
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
++; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
++
++;; vpackev.b
++define <16 x i8> @shufflevector_pack_ev_v16i8(<16 x i8> %a, <16 x i8> %b) {
++; CHECK-LABEL: shufflevector_pack_ev_v16i8:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpackev.b $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
++  ret <16 x i8> %c
++}
++
++;; vpackev.h
++define <8 x i16> @shufflevector_pack_ev_v8i16(<8 x i16> %a, <8 x i16> %b) {
++; CHECK-LABEL: shufflevector_pack_ev_v8i16:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpackev.h $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
++  ret <8 x i16> %c
++}
++
++;; vpackev.w
++define <4 x i32> @shufflevector_pack_ev_v4i32(<4 x i32> %a, <4 x i32> %b) {
++; CHECK-LABEL: shufflevector_pack_ev_v4i32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpackev.w $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
++  ret <4 x i32> %c
++}
++
++;; vpickev.d/vpackev.d/vilvl.d
++define <2 x i64> @shufflevector_pack_ev_v2i64(<2 x i64> %a, <2 x i64> %b) {
++; CHECK-LABEL: shufflevector_pack_ev_v2i64:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpackev.d $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
++  ret <2 x i64> %c
++}
++
++;; vpackev.w
++define <4 x float> @shufflevector_pack_ev_v4f32(<4 x float> %a, <4 x float> %b) {
++; CHECK-LABEL: shufflevector_pack_ev_v4f32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpackev.w $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
++  ret <4 x float> %c
++}
++
++;; vpickev.d/vpackev.d/vilvl.d
++define <2 x double> @shufflevector_pack_ev_v2f64(<2 x double> %a, <2 x double> %b) {
++; CHECK-LABEL: shufflevector_pack_ev_v2f64:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpackev.d $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
++  ret <2 x double> %c
++}
++
++;; vpackod.b
++define <16 x i8> @shufflevector_pack_od_v16i8(<16 x i8> %a, <16 x i8> %b) {
++; CHECK-LABEL: shufflevector_pack_od_v16i8:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpackod.b $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
++  ret <16 x i8> %c
++}
++
++;; vpackod.h
++define <8 x i16> @shufflevector_pack_od_v8i16(<8 x i16> %a, <8 x i16> %b) {
++; CHECK-LABEL: shufflevector_pack_od_v8i16:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpackod.h $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
++  ret <8 x i16> %c
++}
++
++;; vpackod.w
++define <4 x i32> @shufflevector_pack_od_v4i32(<4 x i32> %a, <4 x i32> %b) {
++; CHECK-LABEL: shufflevector_pack_od_v4i32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpackod.w $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
++  ret <4 x i32> %c
++}
++
++;; vpickod.d/vpackod.d/vilvh.d
++define <2 x i64> @shufflodector_pack_od_v2i64(<2 x i64> %a, <2 x i64> %b) {
++; CHECK-LABEL: shufflodector_pack_od_v2i64:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpackod.d $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
++  ret <2 x i64> %c
++}
++
++;; vpackod.w
++define <4 x float> @shufflodector_pack_od_v4f32(<4 x float> %a, <4 x float> %b) {
++; CHECK-LABEL: shufflodector_pack_od_v4f32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpackod.w $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
++  ret <4 x float> %c
++}
++
++;; vpickod.d/vpackod.d/vilvh.d
++define <2 x double> @shufflodector_pack_od_v2f64(<2 x double> %a, <2 x double> %b) {
++; CHECK-LABEL: shufflodector_pack_od_v2f64:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpackod.d $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
++  ret <2 x double> %c
++}
+diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll
+new file mode 100644
+index 000000000000..ca636d942b58
+--- /dev/null
++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll
+@@ -0,0 +1,82 @@
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
++; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
++
++;; vpickev.b
++define <16 x i8> @shufflevector_pick_ev_v16i8(<16 x i8> %a, <16 x i8> %b) {
++; CHECK-LABEL: shufflevector_pick_ev_v16i8:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpickev.b $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
++  ret <16 x i8> %c
++}
++
++;; vpickev.h
++define <8 x i16> @shufflevector_pick_ev_v8i16(<8 x i16> %a, <8 x i16> %b) {
++; CHECK-LABEL: shufflevector_pick_ev_v8i16:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpickev.h $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
++  ret <8 x i16> %c
++}
++
++;; vpickev.w
++define <4 x i32> @shufflevector_pick_ev_v4i32(<4 x i32> %a, <4 x i32> %b) {
++; CHECK-LABEL: shufflevector_pick_ev_v4i32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpickev.w $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
++  ret <4 x i32> %c
++}
++
++;; vpickev.w
++define <4 x float> @shufflevector_pick_ev_v4f32(<4 x float> %a, <4 x float> %b) {
++; CHECK-LABEL: shufflevector_pick_ev_v4f32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpickev.w $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
++  ret <4 x float> %c
++}
++
++;; vpickod.b
++define <16 x i8> @shufflevector_pick_od_v16i8(<16 x i8> %a, <16 x i8> %b) {
++; CHECK-LABEL: shufflevector_pick_od_v16i8:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpickod.b $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
++  ret <16 x i8> %c
++}
++
++;; vpickod.h
++define <8 x i16> @shufflevector_pick_od_v8i16(<8 x i16> %a, <8 x i16> %b) {
++; CHECK-LABEL: shufflevector_pick_od_v8i16:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpickod.h $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
++  ret <8 x i16> %c
++}
++
++;; vpickod.w
++define <4 x i32> @shufflevector_pick_od_v4i32(<4 x i32> %a, <4 x i32> %b) {
++; CHECK-LABEL: shufflevector_pick_od_v4i32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpickod.w $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
++  ret <4 x i32> %c
++}
++
++;; vpickod.w
++define <4 x float> @shufflodector_pick_od_v4f32(<4 x float> %a, <4 x float> %b) {
++; CHECK-LABEL: shufflodector_pick_od_v4f32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vpickod.w $vr0, $vr1, $vr0
++; CHECK-NEXT: ret
++  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
++  ret <4 x float> %c
++}
+diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll
+new file mode 100644
+index 000000000000..10510786f321
+--- /dev/null
++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll
+@@ -0,0 +1,62 @@
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
++; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
++
++;; vreplvei.b
++define <16 x i8> @shufflevector_v16i8(<16 x i8> %a, <16 x i8> %b) {
++; CHECK-LABEL: shufflevector_v16i8:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vreplvei.b $vr0, $vr0, 1
++; CHECK-NEXT: ret
++  %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
++  ret <16 x i8> %c
++}
++
++;; vreplvei.h
++define <8 x i16> @shufflevector_v8i16(<8 x i16> %a, <8 x i16> %b) {
++; CHECK-LABEL: shufflevector_v8i16:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vreplvei.h $vr0, $vr1, 2
++; CHECK-NEXT: ret
++  %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
++  ret <8 x i16> %c
++}
++
++;; vreplvei.w
++define <4 x i32> @shufflevector_v4i32(<4 x i32> %a, <4 x i32> %b) {
++; CHECK-LABEL: shufflevector_v4i32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
++; CHECK-NEXT: ret
++  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
++  ret <4 x i32> %c
++}
++
++;; vreplvei.d
++define <2 x i64> @shufflevector_v2i64(<2 x i64> %a, <2 x i64> %b) {
++; CHECK-LABEL: shufflevector_v2i64:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vreplvei.d $vr0, $vr0, 1
++; CHECK-NEXT: ret
++  %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 1>
++  ret <2 x i64> %c
++}
++
++;; vreplvei.w
++define <4 x float> @shufflevector_v4f32(<4 x float> %a, <4 x float> %b) {
++; CHECK-LABEL: shufflevector_v4f32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
++; CHECK-NEXT: ret
++  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
++  ret <4 x float> %c
++}
++
++;; vreplvei.d
++define <2 x double> @shufflevector_v2f64(<2 x double> %a, <2 x double> %b) {
++; CHECK-LABEL: shufflevector_v2f64:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vreplvei.d $vr0, $vr0, 1
++; CHECK-NEXT: ret
++  %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
++  ret <2 x double> %c
++}
+diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll
+new file mode 100644
+index 000000000000..55800b31446b
+--- /dev/null
++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll
+@@ -0,0 +1,84 @@
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
++; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
++
++define <16 x i8> @shufflevector_v16i8(<16 x i8> %a, <16 x i8> %b) {
++; CHECK-LABEL: shufflevector_v16i8:
++; CHECK: # %bb.0:
++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0)
++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI0_0)
++; CHECK-NEXT: vld $vr2, $a0, 0
++; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr0, $vr2
++; CHECK-NEXT: ret
++  %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15, i32 2, i32 4, i32 6, i32 8, i32 25, i32 30, i32 31, i32 31>
++  ret <16 x i8> %c
++}
++
++;; vshuf.h
++define <8 x i16> @shufflevector_v8i16(<8 x i16> %a, <8 x i16> %b) {
++; CHECK-LABEL: shufflevector_v8i16:
++; CHECK: # %bb.0:
++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0)
++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI1_0)
++; CHECK-NEXT: vld $vr2, $a0, 0
++; CHECK-NEXT: vshuf.h $vr2, $vr1, $vr0
++; CHECK-NEXT: vori.b $vr0, $vr2, 0
++; CHECK-NEXT: ret
++  %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15>
++  ret <8 x i16> %c
++}
++
++;; vshuf.w
++define <4 x i32> @shufflevector_v4i32(<4 x i32> %a, <4 x i32> %b) {
++; CHECK-LABEL: shufflevector_v4i32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0)
++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_0)
++; CHECK-NEXT: vld $vr2, $a0, 0
++; CHECK-NEXT: vshuf.w $vr2, $vr1, $vr0
++; CHECK-NEXT: vori.b $vr0, $vr2, 0
++; CHECK-NEXT: ret
++  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 5, i32 7>
++  ret <4 x i32> %c
++}
++
++;; vshuf.d
++define <2 x i64> @shufflevector_v2i64(<2 x i64> %a, <2 x i64> %b) {
++; CHECK-LABEL: shufflevector_v2i64:
++; CHECK: # %bb.0:
++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_0)
++; CHECK-NEXT: vld $vr2, $a0, 0
++; CHECK-NEXT: vshuf.d $vr2, $vr1, $vr0
++; CHECK-NEXT: vori.b $vr0, $vr2, 0
++; CHECK-NEXT: ret
++  %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
++  ret <2 x i64> %c
++}
++
++;; vshuf.w
++define <4 x float> @shufflevector_v4f32(<4 x float> %a, <4 x float> %b) {
++; CHECK-LABEL: shufflevector_v4f32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0)
++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI4_0)
++; CHECK-NEXT: vld $vr2, $a0, 0
++; CHECK-NEXT: vshuf.w $vr2, $vr1, $vr0
++; CHECK-NEXT: vori.b $vr0, $vr2, 0
++; CHECK-NEXT: ret
++  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 5, i32 7>
++  ret <4 x float> %c
++}
++
++;; vshuf.d
++define <2 x double> @shufflevector_v2f64(<2 x double> %a, <2 x double> %b) {
++; CHECK-LABEL: shufflevector_v2f64:
++; CHECK: # %bb.0:
++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0)
++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI5_0)
++; CHECK-NEXT: vld $vr2, $a0, 0
++; CHECK-NEXT: vshuf.d $vr2, $vr1, $vr0
++; CHECK-NEXT: vori.b $vr0, $vr2, 0
++; CHECK-NEXT: ret
++  %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
++  ret <2 x double> %c
++}
+diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll
+new file mode 100644
+index 000000000000..660b9581c3d1
+--- /dev/null
++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll
+@@ -0,0 +1,42 @@
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
++; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
++
++;; vilvh.b
++define <16 x i8> @shufflevector_vshuf4i_v16i8(<16 x i8> %a, <16 x i8> %b) {
++; CHECK-LABEL: shufflevector_vshuf4i_v16i8:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 27
++; CHECK-NEXT: ret
++  %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
++  ret <16 x i8> %c
++}
++
++;; vilvh.h
++define <8 x i16> @shufflevector_vshuf4i_v8i4(<8 x i16> %a, <8 x i16> %b) {
++; CHECK-LABEL: shufflevector_vshuf4i_v8i4:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vshuf4i.h $vr0, $vr0, 27
++; CHECK-NEXT: ret
++  %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
++  ret <8 x i16> %c
++}
++
++;; vilvh.w
++define <4 x i32> @shufflevector_vshuf4i_v4i32(<4 x i32> %a, <4 x i32> %b) {
++; CHECK-LABEL: shufflevector_vshuf4i_v4i32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 27
++; CHECK-NEXT: ret
++  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
++  ret <4 x i32> %c
++}
++
++;; vilvh.w
++define <4 x float> @shufflevector_vshuf4i_v4f32(<4 x float> %a, <4 x float> %b) {
++; CHECK-LABEL: shufflevector_vshuf4i_v4f32:
++; CHECK: # %bb.0:
++; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 27
++; CHECK-NEXT: ret
++  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
++  ret <4 x float> %c
++}
+--
+2.20.1
+
+
+From cac0cc4649362e0b80f61e45aec54341f40f7f77 Mon Sep 17 00:00:00 2001
+From: Ami-zhang <zhanglimin@loongson.cn>
+Date: Wed, 17 Jan 2024 11:15:05 +0800
+Subject: [PATCH 08/12] [LoongArch] Add LoongArch V1.1 instructions definitions
+ and MC tests (#78238)
+
+LoongArch V1.1 instrucions include floating-point approximate reciprocal
+instructions and atomic instrucions. And add testcases for these
+instrucions meanwhile.
+
+(cherry picked from commit 84bdee2875da364be7eb2144b1ae530f6a05f0e2)
+---
+ .../LoongArch/LoongArchFloat32InstrInfo.td    |  2 +
+ .../LoongArch/LoongArchFloat64InstrInfo.td    |  2 +
+ .../Target/LoongArch/LoongArchInstrInfo.td    | 34 ++++++-
+ .../LoongArch/LoongArchLASXInstrInfo.td       |  4 +
+ .../Target/LoongArch/LoongArchLSXInstrInfo.td |  4 +
+ llvm/test/MC/LoongArch/Basic/Float/d-arith.s  |  8 ++
+ llvm/test/MC/LoongArch/Basic/Float/f-arith.s  |  8 ++
+ llvm/test/MC/LoongArch/Basic/Integer/atomic.s | 92 +++++++++++++++++++
+ llvm/test/MC/LoongArch/lasx/frecip.s          |  8 ++
+ llvm/test/MC/LoongArch/lasx/frsqrt.s          |  8 ++
+ llvm/test/MC/LoongArch/lsx/frecip.s           |  8 ++
+ llvm/test/MC/LoongArch/lsx/frsqrt.s           |  8 ++
+ 12 files changed, 184 insertions(+), 2 deletions(-)
+
+diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
+index 65120c083f49..f30837912e75 100644
+--- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
++++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
+@@ -50,6 +50,8 @@ def FNEG_S : FP_ALU_2R<0x01141400>;
+ def FSQRT_S : FP_ALU_2R<0x01144400>;
+ def FRECIP_S : FP_ALU_2R<0x01145400>;
+ def FRSQRT_S : FP_ALU_2R<0x01146400>;
++def FRECIPE_S : FP_ALU_2R<0x01147400>;
++def FRSQRTE_S : FP_ALU_2R<0x01148400>;
+ def FSCALEB_S : FP_ALU_3R<0x01108000>;
+ def FLOGB_S : FP_ALU_2R<0x01142400>;
+ def FCOPYSIGN_S : FP_ALU_3R<0x01128000>;
+diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
+index 437c1e4d7be2..0ea4c564b045 100644
+--- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
++++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
+@@ -34,6 +34,8 @@ def FNEG_D : FP_ALU_2R<0x01141800, FPR64>;
+ def FSQRT_D : FP_ALU_2R<0x01144800, FPR64>;
+ def FRECIP_D : FP_ALU_2R<0x01145800, FPR64>;
+ def FRSQRT_D : FP_ALU_2R<0x01146800, FPR64>;
++def FRECIPE_D : FP_ALU_2R<0x01147800, FPR64>;
++def FRSQRTE_D : FP_ALU_2R<0x01148800, FPR64>;
+ def FSCALEB_D : FP_ALU_3R<0x01110000, FPR64>;
+ def FLOGB_D : FP_ALU_2R<0x01142800, FPR64>;
+ def FCOPYSIGN_D : FP_ALU_3R<0x01130000, FPR64>;
+diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+index ecd0c2b71b85..756c460f916b 100644
+--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
++++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+@@ -634,15 +634,24 @@ class AM_3R<bits<32> op>
+   : Fmt3R<op, (outs GPR:$rd), (ins GPR:$rk, GPRMemAtomic:$rj),
+           "$rd, $rk, $rj">;
+
+-let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
++let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+ class LLBase<bits<32> op>
+   : Fmt2RI14<op, (outs GPR:$rd), (ins GPR:$rj, simm14_lsl2:$imm14),
+              "$rd, $rj, $imm14">;
++class LLBase_ACQ<bits<32> op>
++  : Fmt2R<op, (outs GPR:$rd), (ins GPR:$rj), "$rd, $rj">;
++}
+
+-let hasSideEffects = 0, mayLoad = 0, mayStore = 1, Constraints = "$rd = $dst" in
++let hasSideEffects = 0, mayLoad = 0, mayStore = 1, Constraints = "$rd = $dst" in {
+ class SCBase<bits<32> op>
+     : Fmt2RI14<op, (outs GPR:$dst), (ins GPR:$rd, GPR:$rj, simm14_lsl2:$imm14),
+                "$rd, $rj, $imm14">;
++class SCBase_128<bits<32> op>
++    : Fmt3R<op, (outs GPR:$dst), (ins GPR:$rd, GPR:$rk, GPR:$rj),
++            "$rd, $rk, $rj">;
++class SCBase_REL<bits<32> op>
++    : Fmt2R<op, (outs GPR:$dst), (ins GPR:$rd, GPR:$rj), "$rd, $rj">;
++}
+
+ let hasSideEffects = 1 in
+ class IOCSRRD<bits<32> op>
+@@ -754,6 +763,8 @@ def PRELD : FmtPRELD<(outs), (ins uimm5:$imm5, GPR:$rj, simm12:$imm12),
+ // Atomic Memory Access Instructions
+ def LL_W : LLBase<0x20000000>;
+ def SC_W : SCBase<0x21000000>;
++def LLACQ_W : LLBase_ACQ<0x38578000>;
++def SCREL_W : SCBase_REL<0x38578400>;
+
+ // Barrier Instructions
+ def DBAR : MISC_I15<0x38720000>;
+@@ -875,8 +886,12 @@ def STLE_W : STORE_3R<0x387f0000>;
+ def STLE_D : STORE_3R<0x387f8000>;
+
+ // Atomic Memory Access Instructions for 64-bits
++def AMSWAP_B : AM_3R<0x385c0000>;
++def AMSWAP_H : AM_3R<0x385c8000>;
+ def AMSWAP_W : AM_3R<0x38600000>;
+ def AMSWAP_D : AM_3R<0x38608000>;
++def AMADD_B : AM_3R<0x385d0000>;
++def AMADD_H : AM_3R<0x385d8000>;
+ def AMADD_W : AM_3R<0x38610000>;
+ def AMADD_D : AM_3R<0x38618000>;
+ def AMAND_W : AM_3R<0x38620000>;
+@@ -893,8 +908,12 @@ def AMMAX_WU : AM_3R<0x38670000>;
+ def AMMAX_DU : AM_3R<0x38678000>;
+ def AMMIN_WU : AM_3R<0x38680000>;
+ def AMMIN_DU : AM_3R<0x38688000>;
++def AMSWAP__DB_B : AM_3R<0x385e0000>;
++def AMSWAP__DB_H : AM_3R<0x385e8000>;
+ def AMSWAP__DB_W : AM_3R<0x38690000>;
+ def AMSWAP__DB_D : AM_3R<0x38698000>;
++def AMADD__DB_B : AM_3R<0x385f0000>;
++def AMADD__DB_H : AM_3R<0x385f8000>;
+ def AMADD__DB_W : AM_3R<0x386a0000>;
+ def AMADD__DB_D : AM_3R<0x386a8000>;
+ def AMAND__DB_W : AM_3R<0x386b0000>;
+@@ -911,8 +930,19 @@ def AMMAX__DB_WU : AM_3R<0x38700000>;
+ def AMMAX__DB_DU : AM_3R<0x38708000>;
+ def AMMIN__DB_WU : AM_3R<0x38710000>;
+ def AMMIN__DB_DU : AM_3R<0x38718000>;
++def AMCAS_B : AM_3R<0x38580000>;
++def AMCAS_H : AM_3R<0x38588000>;
++def AMCAS_W : AM_3R<0x38590000>;
++def AMCAS_D : AM_3R<0x38598000>;
++def AMCAS__DB_B : AM_3R<0x385a0000>;
++def AMCAS__DB_H : AM_3R<0x385a8000>;
++def AMCAS__DB_W : AM_3R<0x385b0000>;
++def AMCAS__DB_D : AM_3R<0x385b8000>;
+ def LL_D : LLBase<0x22000000>;
+ def SC_D : SCBase<0x23000000>;
++def SC_Q : SCBase_128<0x38570000>;
++def LLACQ_D : LLBase_ACQ<0x38578800>;
++def SCREL_D : SCBase_REL<0x38578C00>;
+
+ // CRC Check Instructions
+ def CRC_W_B_W : ALU_3R<0x00240000>;
+diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+index 5b6721cdf1b4..454915ac8c0a 100644
+--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+@@ -773,6 +773,10 @@ def XVFRECIP_S : LASX2R_XX<0x769cf400>;
+ def XVFRECIP_D : LASX2R_XX<0x769cf800>;
+ def XVFRSQRT_S : LASX2R_XX<0x769d0400>;
+ def XVFRSQRT_D : LASX2R_XX<0x769d0800>;
++def XVFRECIPE_S : LASX2R_XX<0x769d1400>;
++def XVFRECIPE_D : LASX2R_XX<0x769d1800>;
++def XVFRSQRTE_S : LASX2R_XX<0x769d2400>;
++def XVFRSQRTE_D : LASX2R_XX<0x769d2800>;
+
+ def XVFCVTL_S_H : LASX2R_XX<0x769de800>;
+ def XVFCVTH_S_H : LASX2R_XX<0x769dec00>;
+diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+index 3519fa3142c3..6d60d7074ec3 100644
+--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+@@ -918,6 +918,10 @@ def VFRECIP_S : LSX2R_VV<0x729cf400>;
+ def VFRECIP_D : LSX2R_VV<0x729cf800>;
+ def VFRSQRT_S : LSX2R_VV<0x729d0400>;
+ def VFRSQRT_D : LSX2R_VV<0x729d0800>;
++def VFRECIPE_S : LSX2R_VV<0x729d1400>;
++def VFRECIPE_D : LSX2R_VV<0x729d1800>;
++def VFRSQRTE_S : LSX2R_VV<0x729d2400>;
++def VFRSQRTE_D : LSX2R_VV<0x729d2800>;
+
+ def VFCVTL_S_H : LSX2R_VV<0x729de800>;
+ def VFCVTH_S_H : LSX2R_VV<0x729dec00>;
+diff --git a/llvm/test/MC/LoongArch/Basic/Float/d-arith.s b/llvm/test/MC/LoongArch/Basic/Float/d-arith.s
+index 6b2c67e9a2cc..8e19d2e34f3c 100644
+--- a/llvm/test/MC/LoongArch/Basic/Float/d-arith.s
++++ b/llvm/test/MC/LoongArch/Basic/Float/d-arith.s
+@@ -78,10 +78,18 @@ fsqrt.d $fa2, $ft3
+ # ASM: encoding: [0x7b,0x5b,0x14,0x01]
+ frecip.d $fs3, $fs3
+
++# ASM-AND-OBJ: frecipe.d $fa0, $fa0
++# ASM: encoding: [0x00,0x78,0x14,0x01]
++frecipe.d $fa0, $fa0
++
+ # ASM-AND-OBJ: frsqrt.d $ft14, $fa3
+ # ASM: encoding: [0x76,0x68,0x14,0x01]
+ frsqrt.d $ft14, $fa3
+
++# ASM-AND-OBJ: frsqrte.d $fa1, $fa1
++# ASM: encoding: [0x21,0x88,0x14,0x01]
++frsqrte.d $fa1, $fa1
++
+ # ASM-AND-OBJ: fscaleb.d $ft4, $ft6, $fs2
+ # ASM: encoding: [0xcc,0x69,0x11,0x01]
+ fscaleb.d $ft4, $ft6, $fs2
+diff --git a/llvm/test/MC/LoongArch/Basic/Float/f-arith.s b/llvm/test/MC/LoongArch/Basic/Float/f-arith.s
+index 155e783cf435..c32151adbf3b 100644
+--- a/llvm/test/MC/LoongArch/Basic/Float/f-arith.s
++++ b/llvm/test/MC/LoongArch/Basic/Float/f-arith.s
+@@ -73,10 +73,18 @@ fsqrt.s $fs3, $ft10
+ # ASM: encoding: [0x71,0x57,0x14,0x01]
+ frecip.s $ft9, $fs3
+
++# ASM-AND-OBJ: frecipe.s $fa0, $fa0
++# ASM: encoding: [0x00,0x74,0x14,0x01]
++frecipe.s $fa0, $fa0
++
+ # ASM-AND-OBJ: frsqrt.s $fs1, $ft4
+ # ASM: encoding: [0x99,0x65,0x14,0x01]
+ frsqrt.s $fs1, $ft4
+
++# ASM-AND-OBJ: frsqrte.s $fa1, $fa1
++# ASM: encoding: [0x21,0x84,0x14,0x01]
++frsqrte.s $fa1, $fa1
++
+ # ASM-AND-OBJ: fscaleb.s $ft13, $ft15, $fa6
+ # ASM: encoding: [0xf5,0x9a,0x10,0x01]
+ fscaleb.s $ft13, $ft15, $fa6
+diff --git a/llvm/test/MC/LoongArch/Basic/Integer/atomic.s b/llvm/test/MC/LoongArch/Basic/Integer/atomic.s
+index a35211db8851..69acdeef935c 100644
+--- a/llvm/test/MC/LoongArch/Basic/Integer/atomic.s
++++ b/llvm/test/MC/LoongArch/Basic/Integer/atomic.s
+@@ -21,6 +21,14 @@ ll.w $tp, $s4, 220
+ # CHECK-ASM: encoding: [0xd3,0x39,0x00,0x21]
+ sc.w $t7, $t2, 56
+
++# CHECK-ASM-AND-OBJ: llacq.w $t1, $t2
++# CHECK-ASM: encoding: [0xcd,0x81,0x57,0x38]
++llacq.w $t1, $t2
++
++# CHECK-ASM-AND-OBJ: screl.w $t1, $t2
++# CHECK-ASM: encoding: [0xcd,0x85,0x57,0x38]
++screl.w $t1, $t2
++
+
+
+#############################################################
+@@ -29,6 +37,14 @@
+
+ .ifdef LA64
+
++# CHECK64-ASM-AND-OBJ: amswap.b $a2, $t0, $s1
++# CHECK64-ASM: encoding: [0x06,0x33,0x5c,0x38]
++amswap.b $a2, $t0, $s1, 0
++
++# CHECK64-ASM-AND-OBJ: amswap.h $a2, $t0, $s1
++# CHECK64-ASM: encoding: [0x06,0xb3,0x5c,0x38]
++amswap.h $a2, $t0, $s1, 0
++
+ # CHECK64-ASM-AND-OBJ: amswap.w $a2, $t0, $s1
+ # CHECK64-ASM: encoding: [0x06,0x33,0x60,0x38]
+ amswap.w $a2, $t0, $s1, 0
+@@ -41,6 +57,14 @@ amswap.w $zero, $t0, $zero
+ # CHECK64-ASM: encoding: [0xa0,0x00,0x6a,0x38]
+ amadd_db.w $zero, $zero, $a1
+
++# CHECK64-ASM-AND-OBJ: amswap.b $a2, $t0, $s1
++# CHECK64-ASM: encoding: [0x06,0x33,0x5c,0x38]
++amswap.b $a2, $t0, $s1
++
++# CHECK64-ASM-AND-OBJ: amswap.h $a2, $t0, $s1
++# CHECK64-ASM: encoding: [0x06,0xb3,0x5c,0x38]
++amswap.h $a2, $t0, $s1
++
+ # CHECK64-ASM-AND-OBJ: amswap.w $a2, $t0, $s1
+ # CHECK64-ASM: encoding: [0x06,0x33,0x60,0x38]
+ amswap.w $a2, $t0, $s1
+@@ -49,6 +73,14 @@ amswap.w $a2, $t0, $s1
+ # CHECK64-ASM: encoding: [0xc2,0xba,0x60,0x38]
+ amswap.d $tp, $t2, $fp
+
++# CHECK64-ASM-AND-OBJ: amadd.b $a4, $t0, $r21
++# CHECK64-ASM: encoding: [0xa8,0x32,0x5d,0x38]
++amadd.b $a4, $t0, $r21
++
++# CHECK64-ASM-AND-OBJ: amadd.h $a1, $t5, $s6
++# CHECK64-ASM: encoding: [0xa5,0xc7,0x5d,0x38]
++amadd.h $a1, $t5, $s6
++
+ # CHECK64-ASM-AND-OBJ: amadd.w $a4, $t0, $r21
+ # CHECK64-ASM: encoding: [0xa8,0x32,0x61,0x38]
+ amadd.w $a4, $t0, $r21
+@@ -113,6 +145,14 @@ ammin.wu $a4, $t6, $s7
+ # CHECK64-ASM: encoding: [0x27,0xc3,0x68,0x38]
+ ammin.du $a3, $t4, $s2
+
++# CHECK64-ASM-AND-OBJ: amswap_db.b $a2, $t0, $s1
++# CHECK64-ASM: encoding: [0x06,0x33,0x5e,0x38]
++amswap_db.b $a2, $t0, $s1
++
++# CHECK64-ASM-AND-OBJ: amswap_db.h $tp, $t2, $fp
++# CHECK64-ASM: encoding: [0xc2,0xba,0x5e,0x38]
++amswap_db.h $tp, $t2, $fp
++
+ # CHECK64-ASM-AND-OBJ: amswap_db.w $a2, $t0, $s1
+ # CHECK64-ASM: encoding: [0x06,0x33,0x69,0x38]
+ amswap_db.w $a2, $t0, $s1
+@@ -121,6 +161,14 @@ amswap_db.w $a2, $t0, $s1
+ # CHECK64-ASM: encoding: [0xc2,0xba,0x69,0x38]
+ amswap_db.d $tp, $t2, $fp
+
++# CHECK64-ASM-AND-OBJ: amadd_db.b $zero, $zero, $a1
++# CHECK64-ASM: encoding: [0xa0,0x00,0x5f,0x38]
++amadd_db.b $zero, $zero, $a1
++
++# CHECK64-ASM-AND-OBJ: amadd_db.h $a4, $t0, $r21
++# CHECK64-ASM: encoding: [0xa8,0xb2,0x5f,0x38]
++amadd_db.h $a4, $t0, $r21
++
+ # CHECK64-ASM-AND-OBJ: amadd_db.w $a4, $t0, $r21
+ # CHECK64-ASM: encoding: [0xa8,0x32,0x6a,0x38]
+ amadd_db.w $a4, $t0, $r21
+@@ -185,6 +233,38 @@ ammin_db.wu $a4, $t6, $s7
+ # CHECK64-ASM: encoding: [0x27,0xc3,0x71,0x38]
+ ammin_db.du $a3, $t4, $s2
+
++# CHECK64-ASM-AND-OBJ: amcas.b $t1, $t2, $t3
++# CHECK64-ASM: encoding: [0xed,0x39,0x58,0x38]
++amcas.b $t1, $t2, $t3
++
++# CHECK64-ASM-AND-OBJ: amcas.h $t1, $t2, $t3
++# CHECK64-ASM: encoding: [0xed,0xb9,0x58,0x38]
++amcas.h $t1, $t2, $t3
++
++# CHECK64-ASM-AND-OBJ: amcas.w $t1, $t2, $t3
++# CHECK64-ASM: encoding: [0xed,0x39,0x59,0x38]
++amcas.w $t1, $t2, $t3
++
++# CHECK64-ASM-AND-OBJ: amcas.d $t1, $t2, $t3
++# CHECK64-ASM: encoding: [0xed,0xb9,0x59,0x38]
++amcas.d $t1, $t2, $t3
++
++# CHECK64-ASM-AND-OBJ: amcas_db.b $t1, $t2, $t3
++# CHECK64-ASM: encoding: [0xed,0x39,0x5a,0x38]
++amcas_db.b $t1, $t2, $t3
++
++# CHECK64-ASM-AND-OBJ: amcas_db.h $t1, $t2, $t3
++# CHECK64-ASM: encoding: [0xed,0xb9,0x5a,0x38]
++amcas_db.h $t1, $t2, $t3
++
++# CHECK64-ASM-AND-OBJ: amcas_db.w $t1, $t2, $t3
++# CHECK64-ASM: encoding: [0xed,0x39,0x5b,0x38]
++amcas_db.w $t1, $t2, $t3
++
++# CHECK64-ASM-AND-OBJ: amcas_db.d $t1, $t2, $t3
++# CHECK64-ASM: encoding: [0xed,0xb9,0x5b,0x38]
++amcas_db.d $t1, $t2, $t3
++
+ # CHECK64-ASM-AND-OBJ: ll.d $s2, $s4, 16
+ # CHECK64-ASM: encoding: [0x79,0x13,0x00,0x22]
+ ll.d $s2, $s4, 16
+@@ -193,5 +273,17 @@ ll.d $s2, $s4, 16
+ # CHECK64-ASM: encoding: [0x31,0xf6,0x00,0x23]
+ sc.d $t5, $t5, 244
+
++# CHECK64-ASM-AND-OBJ: sc.q $t7, $t2, $t5
++# CHECK64-ASM: encoding: [0x33,0x3a,0x57,0x38]
++sc.q $t7, $t2, $t5
++
++# CHECK64-ASM-AND-OBJ: llacq.d $t1, $t2
++# CHECK64-ASM: encoding: [0xcd,0x89,0x57,0x38]
++llacq.d $t1, $t2
++
++# CHECK64-ASM-AND-OBJ: screl.d $t1, $t2
++# CHECK64-ASM: encoding: [0xcd,0x8d,0x57,0x38]
++screl.d $t1, $t2
++
+ .endif
+
+diff --git a/llvm/test/MC/LoongArch/lasx/frecip.s b/llvm/test/MC/LoongArch/lasx/frecip.s
+index 1bb3ce02fb9c..e95b03a96eba 100644
+--- a/llvm/test/MC/LoongArch/lasx/frecip.s
++++ b/llvm/test/MC/LoongArch/lasx/frecip.s
+@@ -10,3 +10,11 @@ xvfrecip.s $xr3, $xr16
+ xvfrecip.d $xr17, $xr24
+ # CHECK-INST: xvfrecip.d $xr17, $xr24
+ # CHECK-ENCODING: encoding: [0x11,0xfb,0x9c,0x76]
++
++xvfrecipe.s $xr3, $xr16
++# CHECK-INST: xvfrecipe.s $xr3, $xr16
++# CHECK-ENCODING: encoding: [0x03,0x16,0x9d,0x76]
++
++xvfrecipe.d $xr17, $xr24
++# CHECK-INST: xvfrecipe.d $xr17, $xr24
++# CHECK-ENCODING: encoding: [0x11,0x1b,0x9d,0x76]
+diff --git a/llvm/test/MC/LoongArch/lasx/frsqrt.s b/llvm/test/MC/LoongArch/lasx/frsqrt.s
+index af96e10832df..d1048f9ff8f0 100644
+--- a/llvm/test/MC/LoongArch/lasx/frsqrt.s
++++ b/llvm/test/MC/LoongArch/lasx/frsqrt.s
+@@ -10,3 +10,11 @@ xvfrsqrt.s $xr31, $xr25
+ xvfrsqrt.d $xr14, $xr22
+ # CHECK-INST: xvfrsqrt.d $xr14, $xr22
+ # CHECK-ENCODING: encoding: [0xce,0x0a,0x9d,0x76]
++
++xvfrsqrte.s $xr31, $xr25
++# CHECK-INST: xvfrsqrte.s $xr31, $xr25
++# CHECK-ENCODING: encoding: [0x3f,0x27,0x9d,0x76]
++
++xvfrsqrte.d $xr14, $xr22
++# CHECK-INST: xvfrsqrte.d $xr14, $xr22
++# CHECK-ENCODING: encoding: [0xce,0x2a,0x9d,0x76]
+diff --git a/llvm/test/MC/LoongArch/lsx/frecip.s b/llvm/test/MC/LoongArch/lsx/frecip.s
+index d8c8278d1667..cd6d925e1470 100644
+--- a/llvm/test/MC/LoongArch/lsx/frecip.s
++++ b/llvm/test/MC/LoongArch/lsx/frecip.s
+@@ -10,3 +10,11 @@ vfrecip.s $vr29, $vr14
+ vfrecip.d $vr24, $vr9
+ # CHECK-INST: vfrecip.d $vr24, $vr9
+ # CHECK-ENCODING: encoding: [0x38,0xf9,0x9c,0x72]
++
++vfrecipe.s $vr29, $vr14
++# CHECK-INST: vfrecipe.s $vr29, $vr14
++# CHECK-ENCODING: encoding: [0xdd,0x15,0x9d,0x72]
++
++vfrecipe.d $vr24, $vr9
++# CHECK-INST: vfrecipe.d $vr24, $vr9
++# CHECK-ENCODING: encoding: [0x38,0x19,0x9d,0x72]
+diff --git a/llvm/test/MC/LoongArch/lsx/frsqrt.s b/llvm/test/MC/LoongArch/lsx/frsqrt.s
+index 68b0cc091b8a..d8b9fc3d0684 100644
+--- a/llvm/test/MC/LoongArch/lsx/frsqrt.s
++++ b/llvm/test/MC/LoongArch/lsx/frsqrt.s
+@@ -10,3 +10,11 @@ vfrsqrt.s $vr19, $vr30
+ vfrsqrt.d $vr1, $vr0
+ # CHECK-INST: vfrsqrt.d $vr1, $vr0
+ # CHECK-ENCODING: encoding: [0x01,0x08,0x9d,0x72]
++
++vfrsqrte.s $vr19, $vr30
++# CHECK-INST: vfrsqrte.s $vr19, $vr30
++# CHECK-ENCODING: encoding: [0xd3,0x27,0x9d,0x72]
++
++vfrsqrte.d $vr1, $vr0
++# CHECK-INST: vfrsqrte.d $vr1, $vr0
++# CHECK-ENCODING: encoding: [0x01,0x28,0x9d,0x72]
+--
+2.20.1
+
+
+From 57eaecf7bdb7a7502580076b365b4f70dde1185d Mon Sep 17 00:00:00 2001
+From: Ami-zhang <zhanglimin@loongson.cn>
+Date: Tue, 23 Jan 2024 14:24:58 +0800
+Subject: [PATCH 09/12] [LoongArch] Add definitions and feature 'frecipe' for
+ FP approximation intrinsics/builtins (#78962)
+
+This PR adds definitions and 'frecipe' feature for FP approximation
+intrinsics/builtins. In additions, this adds and complements relative
+testcases.
+
+(cherry picked from commit fcb8342a219ada8ec641790a4c8a9f969d7d64ee)
+---
+ llvm/include/llvm/IR/IntrinsicsLoongArch.td   | 13 ++++++++++
+ llvm/lib/Target/LoongArch/LoongArch.td        |  7 +++++
+ .../LoongArch/LoongArchFloat32InstrInfo.td    |  6 +++++
+ .../LoongArch/LoongArchFloat64InstrInfo.td    |  6 +++++
+ .../LoongArch/LoongArchLASXInstrInfo.td       | 10 +++++++
+ .../Target/LoongArch/LoongArchLSXInstrInfo.td | 10 +++++++
+ .../lib/Target/LoongArch/LoongArchSubtarget.h |  2 ++
+ .../LoongArch/intrinsic-frecipe-dbl.ll        | 26 +++++++++++++++++++
+ .../LoongArch/intrinsic-frecipe-flt.ll        | 26 +++++++++++++++++++
+ .../LoongArch/lasx/intrinsic-frecipe.ll       | 26 +++++++++++++++++++
+ .../LoongArch/lasx/intrinsic-frsqrte.ll       | 26 +++++++++++++++++++
+ .../LoongArch/lsx/intrinsic-frecipe.ll        | 26 +++++++++++++++++++
+ .../LoongArch/lsx/intrinsic-frsqrte.ll        | 26 +++++++++++++++++++
+ 13 files changed, 210 insertions(+)
+ create mode 100644 llvm/test/CodeGen/LoongArch/intrinsic-frecipe-dbl.ll
+ create mode 100644 llvm/test/CodeGen/LoongArch/intrinsic-frecipe-flt.ll
+ create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll
+ create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll
+ create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll
+ create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll
+
+diff --git a/llvm/include/llvm/IR/IntrinsicsLoongArch.td b/llvm/include/llvm/IR/IntrinsicsLoongArch.td
+index 685deaec7709..9002076e7aec 100644
+--- a/llvm/include/llvm/IR/IntrinsicsLoongArch.td
++++ b/llvm/include/llvm/IR/IntrinsicsLoongArch.td
+@@ -122,6 +122,15 @@ def int_loongarch_lddir_d : BaseInt<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
+                                     [ImmArg<ArgIndex<1>>]>;
+ def int_loongarch_ldpte_d : BaseInt<[], [llvm_i64_ty, llvm_i64_ty],
+                                     [ImmArg<ArgIndex<1>>]>;
++
++def int_loongarch_frecipe_s : BaseInt<[llvm_float_ty], [llvm_float_ty],
++                                      [IntrNoMem]>;
++def int_loongarch_frecipe_d : BaseInt<[llvm_double_ty], [llvm_double_ty],
++                                      [IntrNoMem]>;
++def int_loongarch_frsqrte_s : BaseInt<[llvm_float_ty], [llvm_float_ty],
++                                      [IntrNoMem]>;
++def int_loongarch_frsqrte_d : BaseInt<[llvm_double_ty], [llvm_double_ty],
++                                      [IntrNoMem]>;
+ } // TargetPrefix = "loongarch"
+
+ /// Vector intrinsic
+@@ -527,10 +536,12 @@ foreach inst = ["vfmadd_d", "vfmsub_d", "vfnmadd_d", "vfnmsub_d"] in
+                                        [IntrNoMem]>;
+
+ foreach inst = ["vflogb_s", "vfsqrt_s", "vfrecip_s", "vfrsqrt_s", "vfrint_s",
++                "vfrecipe_s", "vfrsqrte_s",
+                 "vfrintrne_s", "vfrintrz_s", "vfrintrp_s", "vfrintrm_s"] in
+   def int_loongarch_lsx_#inst : VecInt<[llvm_v4f32_ty], [llvm_v4f32_ty],
+                                        [IntrNoMem]>;
+ foreach inst = ["vflogb_d", "vfsqrt_d", "vfrecip_d", "vfrsqrt_d", "vfrint_d",
++                "vfrecipe_d", "vfrsqrte_d",
+                 "vfrintrne_d", "vfrintrz_d", "vfrintrp_d", "vfrintrm_d"] in
+   def int_loongarch_lsx_#inst : VecInt<[llvm_v2f64_ty], [llvm_v2f64_ty],
+                                        [IntrNoMem]>;
+@@ -1044,10 +1055,12 @@ foreach inst = ["xvfmadd_d", "xvfmsub_d", "xvfnmadd_d", "xvfnmsub_d"] in
+                                         [IntrNoMem]>;
+
+ foreach inst = ["xvflogb_s", "xvfsqrt_s", "xvfrecip_s", "xvfrsqrt_s", "xvfrint_s",
++                "xvfrecipe_s", "xvfrsqrte_s",
+                 "xvfrintrne_s", "xvfrintrz_s", "xvfrintrp_s", "xvfrintrm_s"] in
+   def int_loongarch_lasx_#inst : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty],
+                                         [IntrNoMem]>;
+ foreach inst = ["xvflogb_d", "xvfsqrt_d", "xvfrecip_d", "xvfrsqrt_d", "xvfrint_d",
++                "xvfrecipe_d", "xvfrsqrte_d",
+                 "xvfrintrne_d", "xvfrintrz_d", "xvfrintrp_d", "xvfrintrm_d"] in
+   def int_loongarch_lasx_#inst : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty],
+                                         [IntrNoMem]>;
+diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td
+index 2a4c991a43b0..5573e5415d26 100644
+--- a/llvm/lib/Target/LoongArch/LoongArch.td
++++ b/llvm/lib/Target/LoongArch/LoongArch.td
+@@ -110,6 +110,13 @@ def FeatureAutoVec
+     : SubtargetFeature<"auto-vec", "HasExpAutoVec", "true",
+                        "Experimental auto vectorization">;
+
++// Floating point approximation operation
++def FeatureFrecipe
++    : SubtargetFeature<"frecipe", "HasFrecipe", "true",
++                       "Support frecipe.{s/d} and frsqrte.{s/d} instructions.">;
++def HasFrecipe : Predicate<"Subtarget->hasFrecipe()">;
++
++
+ //===----------------------------------------------------------------------===//
+ // Registers, instruction descriptions ...
+ //===----------------------------------------------------------------------===//
+diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
+index f30837912e75..e27896768818 100644
+--- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
++++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
+@@ -281,6 +281,12 @@ def : Pat<(loongarch_ftint FPR32:$src), (FTINTRZ_W_S FPR32:$src)>;
+ // FP reciprocal operation
+ def : Pat<(fdiv fpimm1, FPR32:$src), (FRECIP_S $src)>;
+
++let Predicates = [HasFrecipe] in {
++// FP approximate reciprocal operation
++def : Pat<(int_loongarch_frecipe_s FPR32:$src), (FRECIPE_S FPR32:$src)>;
++def : Pat<(int_loongarch_frsqrte_s FPR32:$src), (FRSQRTE_S FPR32:$src)>;
++}
++
+ // fmadd.s: fj * fk + fa
+ def : Pat<(fma FPR32:$fj, FPR32:$fk, FPR32:$fa), (FMADD_S $fj, $fk, $fa)>;
+
+diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
+index 0ea4c564b045..26bed67ac222 100644
+--- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
++++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
+@@ -242,6 +242,12 @@ def : Pat<(f64 (fpextend FPR32:$src)), (FCVT_D_S FPR32:$src)>;
+ // FP reciprocal operation
+ def : Pat<(fdiv fpimm1, FPR64:$src), (FRECIP_D $src)>;
+
++let Predicates = [HasFrecipe] in {
++// FP approximate reciprocal operation
++def : Pat<(int_loongarch_frecipe_d FPR64:$src), (FRECIPE_D FPR64:$src)>;
++def : Pat<(int_loongarch_frsqrte_d FPR64:$src), (FRSQRTE_D FPR64:$src)>;
++}
++
+ // fmadd.d: fj * fk + fa
+ def : Pat<(fma FPR64:$fj, FPR64:$fk, FPR64:$fa), (FMADD_D $fj, $fk, $fa)>;
+
+diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+index 454915ac8c0a..6f1969bf8cae 100644
+--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+@@ -2080,6 +2080,16 @@ foreach Inst = ["XVFLOGB_D", "XVFCLASS_D", "XVFSQRT_D", "XVFRECIP_D", "XVFRSQRT_
+   def : Pat<(deriveLASXIntrinsic<Inst>.ret (v4f64 LASX256:$xj)),
+             (!cast<LAInst>(Inst) LASX256:$xj)>;
+
++// 256-Bit vector FP approximate reciprocal operation
++let Predicates = [HasFrecipe] in {
++foreach Inst = ["XVFRECIPE_S", "XVFRSQRTE_S"] in
++  def : Pat<(deriveLASXIntrinsic<Inst>.ret (v8f32 LASX256:$xj)),
++            (!cast<LAInst>(Inst) LASX256:$xj)>;
++foreach Inst = ["XVFRECIPE_D", "XVFRSQRTE_D"] in
++  def : Pat<(deriveLASXIntrinsic<Inst>.ret (v4f64 LASX256:$xj)),
++            (!cast<LAInst>(Inst) LASX256:$xj)>;
++}
++
+ def : Pat<(int_loongarch_lasx_xvpickve_w_f v8f32:$xj, timm:$imm),
+           (XVPICKVE_W v8f32:$xj, (to_valid_timm timm:$imm))>;
+ def : Pat<(int_loongarch_lasx_xvpickve_d_f v4f64:$xj, timm:$imm),
+diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+index 6d60d7074ec3..0580683c3ce3 100644
+--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+@@ -2195,6 +2195,16 @@ foreach Inst = ["VFLOGB_D", "VFCLASS_D", "VFSQRT_D", "VFRECIP_D", "VFRSQRT_D",
+   def : Pat<(deriveLSXIntrinsic<Inst>.ret (v2f64 LSX128:$vj)),
+             (!cast<LAInst>(Inst) LSX128:$vj)>;
+
++// 128-Bit vector FP approximate reciprocal operation
++let Predicates = [HasFrecipe] in {
++foreach Inst = ["VFRECIPE_S", "VFRSQRTE_S"] in
++  def : Pat<(deriveLSXIntrinsic<Inst>.ret (v4f32 LSX128:$vj)),
++            (!cast<LAInst>(Inst) LSX128:$vj)>;
++foreach Inst = ["VFRECIPE_D", "VFRSQRTE_D"] in
++  def : Pat<(deriveLSXIntrinsic<Inst>.ret (v2f64 LSX128:$vj)),
++            (!cast<LAInst>(Inst) LSX128:$vj)>;
++}
++
+ // load
+ def : Pat<(int_loongarch_lsx_vld GPR:$rj, timm:$imm),
+           (VLD GPR:$rj, (to_valid_timm timm:$imm))>;
+diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
+index 174e4cba8326..11c0b39e176e 100644
+--- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
++++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
+@@ -45,6 +45,7 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo {
+   bool HasUAL = false;
+   bool HasLinkerRelax = false;
+   bool HasExpAutoVec = false;
++  bool HasFrecipe = false;
+   unsigned GRLen = 32;
+   MVT GRLenVT = MVT::i32;
+   LoongArchABI::ABI TargetABI = LoongArchABI::ABI_Unknown;
+@@ -104,6 +105,7 @@ public:
+   bool hasUAL() const { return HasUAL; }
+   bool hasLinkerRelax() const { return HasLinkerRelax; }
+   bool hasExpAutoVec() const { return HasExpAutoVec; }
++  bool hasFrecipe() const { return HasFrecipe; }
+   MVT getGRLenVT() const { return GRLenVT; }
+   unsigned getGRLen() const { return GRLen; }
+   LoongArchABI::ABI getTargetABI() const { return TargetABI; }
+diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-dbl.ll b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-dbl.ll
+new file mode 100644
+index 000000000000..9f572500caa0
+--- /dev/null
++++ b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-dbl.ll
+@@ -0,0 +1,26 @@
++; RUN: llc --mtriple=loongarch32 --mattr=+d,+frecipe < %s | FileCheck %s
++; RUN: llc --mtriple=loongarch64 --mattr=+d,+frecipe < %s | FileCheck %s
++
++declare double @llvm.loongarch.frecipe.d(double)
++
++define double @frecipe_d(double %a) {
++; CHECK-LABEL: frecipe_d:
++; CHECK: # %bb.0: # %entry
++; CHECK-NEXT: frecipe.d $fa0, $fa0
++; CHECK-NEXT: ret
++entry:
++  %res = call double @llvm.loongarch.frecipe.d(double %a)
++  ret double %res
++}
++
++declare double @llvm.loongarch.frsqrte.d(double)
++
++define double @frsqrte_d(double %a) {
++; CHECK-LABEL: frsqrte_d:
++; CHECK: # %bb.0: # %entry
++; CHECK-NEXT: frsqrte.d $fa0, $fa0
++; CHECK-NEXT: ret
++entry:
++  %res = call double @llvm.loongarch.frsqrte.d(double %a)
++  ret double %res
++}
+diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-flt.ll b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-flt.ll
+new file mode 100644
+index 000000000000..0b2029f2e44a
+--- /dev/null
++++ b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-flt.ll
+@@ -0,0 +1,26 @@
++; RUN: llc --mtriple=loongarch32 --mattr=+f,+frecipe < %s | FileCheck %s
++; RUN: llc --mtriple=loongarch64 --mattr=+f,+frecipe < %s | FileCheck %s
++
++declare float @llvm.loongarch.frecipe.s(float)
++
++define float @frecipe_s(float %a) {
++; CHECK-LABEL: frecipe_s:
++; CHECK: # %bb.0: # %entry
++; CHECK-NEXT: frecipe.s $fa0, $fa0
++; CHECK-NEXT: ret
++entry:
++  %res = call float @llvm.loongarch.frecipe.s(float %a)
++  ret float %res
++}
++
++declare float @llvm.loongarch.frsqrte.s(float)
++
++define float @frsqrte_s(float %a) {
++; CHECK-LABEL: frsqrte_s:
++; CHECK: # %bb.0: # %entry
++; CHECK-NEXT: frsqrte.s $fa0, $fa0
++; CHECK-NEXT: ret
++entry:
++  %res = call float @llvm.loongarch.frsqrte.s(float %a)
++  ret float %res
++}
+diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll
+new file mode 100644
+index 000000000000..215436823af8
+--- /dev/null
++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll
+@@ -0,0 +1,26 @@
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
++; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s
++
++declare <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float>)
++
++define <8 x float> @lasx_xvfrecipe_s(<8 x float> %va) nounwind {
++; CHECK-LABEL: lasx_xvfrecipe_s:
++; CHECK: # %bb.0: # %entry
++; CHECK-NEXT: xvfrecipe.s $xr0, $xr0
++; CHECK-NEXT: ret
++entry:
++  %res = call <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float> %va)
++  ret <8 x float> %res
++}
++
++declare <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double>)
++
++define <4 x double> @lasx_xvfrecipe_d(<4 x double> %va) nounwind {
++; CHECK-LABEL: lasx_xvfrecipe_d:
++; CHECK: # %bb.0: # %entry
++; CHECK-NEXT: xvfrecipe.d $xr0, $xr0
++; CHECK-NEXT: ret
++entry:
++  %res = call <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double> %va)
++  ret <4 x double> %res
++}
+diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll
+new file mode 100644
+index 000000000000..ad36c3aa5c29
+--- /dev/null
++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll
+@@ -0,0 +1,26 @@
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
++; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s
++
++declare <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float>)
++
++define <8 x float> @lasx_xvfrsqrte_s(<8 x float> %va) nounwind {
++; CHECK-LABEL: lasx_xvfrsqrte_s:
++; CHECK: # %bb.0: # %entry
++; CHECK-NEXT: xvfrsqrte.s $xr0, $xr0
++; CHECK-NEXT: ret
++entry:
++  %res = call <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float> %va)
++  ret <8 x float> %res
++}
++
++declare <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double>)
++
++define <4 x double> @lasx_xvfrsqrte_d(<4 x double> %va) nounwind {
++; CHECK-LABEL: lasx_xvfrsqrte_d:
++; CHECK: # %bb.0: # %entry
++; CHECK-NEXT: xvfrsqrte.d $xr0, $xr0
++; CHECK-NEXT: ret
++entry:
++  %res = call <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double> %va)
++  ret <4 x double> %res
++}
+diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll
+new file mode 100644
+index 000000000000..1b7a97d9f972
+--- /dev/null
++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll
+@@ -0,0 +1,26 @@
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
++; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s
++
++declare <4 x float> @llvm.loongarch.lsx.vfrecipe.s(<4 x float>)
++
++define <4 x float> @lsx_vfrecipe_s(<4 x float> %va) nounwind {
++; CHECK-LABEL: lsx_vfrecipe_s:
++; CHECK: # %bb.0: # %entry
++; CHECK-NEXT: vfrecipe.s $vr0, $vr0
++; CHECK-NEXT: ret
++entry:
++  %res = call <4 x float> @llvm.loongarch.lsx.vfrecipe.s(<4 x float> %va)
++  ret <4 x float> %res
++}
++
++declare <2 x double> @llvm.loongarch.lsx.vfrecipe.d(<2 x double>)
++
++define <2 x double> @lsx_vfrecipe_d(<2 x double> %va) nounwind {
++; CHECK-LABEL: lsx_vfrecipe_d:
++; CHECK: # %bb.0: # %entry
++; CHECK-NEXT: vfrecipe.d $vr0, $vr0
++; CHECK-NEXT: ret
++entry:
++  %res = call <2 x double> @llvm.loongarch.lsx.vfrecipe.d(<2 x double> %va)
++  ret <2 x double> %res
++}
+diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll
+new file mode 100644
+index 000000000000..3cd6c78e87d7
+--- /dev/null
++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll
+@@ -0,0 +1,26 @@
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
++; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s
++
++declare <4 x float> @llvm.loongarch.lsx.vfrsqrte.s(<4 x float>)
++
++define <4 x float> @lsx_vfrsqrte_s(<4 x float> %va) nounwind {
++; CHECK-LABEL: lsx_vfrsqrte_s:
++; CHECK: # %bb.0: # %entry
++; CHECK-NEXT: vfrsqrte.s $vr0, $vr0
++; CHECK-NEXT: ret
++entry:
++  %res = call <4 x float> @llvm.loongarch.lsx.vfrsqrte.s(<4 x float> %va)
++  ret <4 x float> %res
++}
++
++declare <2 x double> @llvm.loongarch.lsx.vfrsqrte.d(<2 x double>)
++
++define <2 x double> @lsx_vfrsqrte_d(<2 x double> %va) nounwind {
++; CHECK-LABEL: lsx_vfrsqrte_d:
++; CHECK: # %bb.0: # %entry
++; CHECK-NEXT: vfrsqrte.d $vr0, $vr0
++; CHECK-NEXT: ret
++entry:
++  %res = call <2 x double> @llvm.loongarch.lsx.vfrsqrte.d(<2 x double> %va)
++  ret <2 x double> %res
++}
+--
+2.20.1
+
+
+From 6f9531b069971dc0f5c6b28bd6a6754c1b5fde72 Mon Sep 17 00:00:00 2001
+From: Ami-zhang <zhanglimin@loongson.cn>
+Date: Tue, 23 Jul 2024 14:03:28 +0800
+Subject: [PATCH 10/12] [LoongArch] Support -march=la64v1.0 and -march=la64v1.1
+ (#100057)
+
+The newly added strings `la64v1.0` and `la64v1.1` in `-march` are as
+described in LoongArch toolchains conventions (see [1]).
+
+The target-cpu/feature attributes are forwarded to compiler when
+specifying particular `-march` parameter. The default cpu `loongarch64`
+is returned when archname is `la64v1.0` or `la64v1.1`.
+
+In addition, this commit adds `la64v1.0`/`la64v1.1` to
+"__loongarch_arch" and adds definition for macro "__loongarch_frecipe".
+
+[1]: https://github.com/loongson/la-toolchain-conventions
+
+(cherry picked from commit 5a1b9896ad5a7dcd25a1cc7a4d3fd44155e4b22d)
+---
+ llvm/lib/TargetParser/LoongArchTargetParser.cpp | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+diff --git a/llvm/lib/TargetParser/LoongArchTargetParser.cpp b/llvm/lib/TargetParser/LoongArchTargetParser.cpp
+index 772d24c5ce3d..8e86d18de2ad 100644
+--- a/llvm/lib/TargetParser/LoongArchTargetParser.cpp
++++ b/llvm/lib/TargetParser/LoongArchTargetParser.cpp
+@@ -44,6 +44,17 @@ bool LoongArch::getArchFeatures(StringRef Arch,
+       return true;
+     }
+   }
++
++  if (Arch == "la64v1.0" || Arch == "la64v1.1") {
++    Features.push_back("+64bit");
++    Features.push_back("+d");
++    Features.push_back("+lsx");
++    Features.push_back("+ual");
++    if (Arch == "la64v1.1")
++      Features.push_back("+frecipe");
++    return true;
++  }
++
+   return false;
+ }
+
+--
+2.20.1
+
+
+From 6094875aa6aab1e28a096294783cada0243e95d5 Mon Sep 17 00:00:00 2001
+From: Ami-zhang <zhanglimin@loongson.cn>
+Date: Tue, 23 Jul 2024 15:14:20 +0800
+Subject: [PATCH 11/12] [LoongArch] Support la664 (#100068)
+
+A new ProcessorModel called `la664` is defined in LoongArch.td to
+support `-march/-mtune=la664`.
+
+(cherry picked from commit fcec298087dba0c83f6d0bbafd6cd934c42cbf82)
+---
+ llvm/include/llvm/TargetParser/LoongArchTargetParser.def | 2 ++
+ llvm/include/llvm/TargetParser/LoongArchTargetParser.h   | 3 +++
+ llvm/lib/Target/LoongArch/LoongArch.td                   | 7 +++++++
+ llvm/lib/TargetParser/Host.cpp                           | 2 ++
+ llvm/test/CodeGen/LoongArch/cpus.ll                      | 5 +++++
+ 5 files changed, 19 insertions(+)
+
+diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.def b/llvm/include/llvm/TargetParser/LoongArchTargetParser.def
+index b20d124953f8..101a48cbd539 100644
+--- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.def
++++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.def
+@@ -10,6 +10,7 @@ LOONGARCH_FEATURE("+lasx", FK_LASX)
+ LOONGARCH_FEATURE("+lbt", FK_LBT)
+ LOONGARCH_FEATURE("+lvz", FK_LVZ)
+ LOONGARCH_FEATURE("+ual", FK_UAL)
++LOONGARCH_FEATURE("+frecipe", FK_FRECIPE)
+
+ #undef LOONGARCH_FEATURE
+
+@@ -19,5 +20,6 @@ LOONGARCH_FEATURE("+ual", FK_UAL)
+
+ LOONGARCH_ARCH("loongarch64", AK_LOONGARCH64, FK_64BIT | FK_FP32 | FK_FP64 | FK_UAL)
+ LOONGARCH_ARCH("la464", AK_LA464, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL)
++LOONGARCH_ARCH("la664", AK_LA664, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL | FK_FRECIPE)
+
+ #undef LOONGARCH_ARCH
+diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
+index 028844187584..c0bb15a5163b 100644
+--- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
++++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
+@@ -46,6 +46,9 @@ enum FeatureKind : uint32_t {
+
+   // Allow memory accesses to be unaligned.
+   FK_UAL = 1 << 8,
++
++  // Floating-point approximate reciprocal instructions are available.
++  FK_FRECIPE = 1 << 9,
+ };
+
+ struct FeatureInfo {
+diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td
+index 5573e5415d26..b5cd5bb0f8a4 100644
+--- a/llvm/lib/Target/LoongArch/LoongArch.td
++++ b/llvm/lib/Target/LoongArch/LoongArch.td
+@@ -147,6 +147,13 @@ def : ProcessorModel<"la464", NoSchedModel, [Feature64Bit,
+                                              FeatureExtLVZ,
+                                              FeatureExtLBT]>;
+
++def : ProcessorModel<"la664", NoSchedModel, [Feature64Bit,
++                                             FeatureUAL,
++                                             FeatureExtLASX,
++                                             FeatureExtLVZ,
++                                             FeatureExtLBT,
++                                             FeatureFrecipe]>;
++
+ //===----------------------------------------------------------------------===//
+ // Define the LoongArch target.
+ //===----------------------------------------------------------------------===//
+diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
+index 8b23be02edc0..87e3e0b434d5 100644
+--- a/llvm/lib/TargetParser/Host.cpp
++++ b/llvm/lib/TargetParser/Host.cpp
+@@ -1469,6 +1469,8 @@ StringRef sys::getHostCPUName() {
+   switch (processor_id & 0xf000) {
+   case 0xc000: // Loongson 64bit, 4-issue
+     return "la464";
++  case 0xd000: // Loongson 64bit, 6-issue
++    return "la664";
+   // TODO: Others.
+   default:
+     break;
+diff --git a/llvm/test/CodeGen/LoongArch/cpus.ll b/llvm/test/CodeGen/LoongArch/cpus.ll
+index 35945ae4de71..087cf887b813 100644
+--- a/llvm/test/CodeGen/LoongArch/cpus.ll
++++ b/llvm/test/CodeGen/LoongArch/cpus.ll
+@@ -3,6 +3,7 @@
+
+ ; RUN: llc < %s --mtriple=loongarch64 --mcpu=loongarch64 2>&1 | FileCheck %s
+ ; RUN: llc < %s --mtriple=loongarch64 --mcpu=la464 2>&1 | FileCheck %s
++; RUN: llc < %s --mtriple=loongarch64 --mcpu=la664 2>&1 | FileCheck %s
+ ; RUN: llc < %s --mtriple=loongarch64 2>&1 | FileCheck %s
+
+ ; CHECK-NOT: {{.*}} is not a recognized processor for this target
+@@ -18,3 +19,7 @@ define void @tune_cpu_loongarch64() "tune-cpu"="loongarch64" {
+ define void @tune_cpu_la464() "tune-cpu"="la464" {
+   ret void
+ }
++
++define void @tune_cpu_la664() "tune-cpu"="la664" {
++  ret void
++}
+--
+2.20.1
+
+
+From f06fec7597485a8d90aa81e3c65abea1bdeeb90b Mon Sep 17 00:00:00 2001
+From: Zhaoxin Yang <yangzhaoxin@loongson.cn>
+Date: Tue, 23 Jul 2024 15:19:00 +0800
+Subject: [PATCH 12/12] [LoongArch] Remove experimental `auto-vec` feature.
+ (#100070)
+
+Currently, automatic vectorization will be enabled with `-mlsx/-mlasx`
+enabled.
+
+(cherry picked from commit 89d1eb67342d75d1de8d210157fdeaeb6a4724b6)
+---
+ llvm/lib/Target/LoongArch/LoongArch.td                     | 4 ----
+ llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp | 2 --
+ llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll   | 2 +-
+ 3 files changed, 1 insertion(+), 7 deletions(-)
+
+diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td
+index b5cd5bb0f8a4..5f85cace71af 100644
+--- a/llvm/lib/Target/LoongArch/LoongArch.td
++++ b/llvm/lib/Target/LoongArch/LoongArch.td
+@@ -105,10 +105,6 @@ def FeatureUAL
+ def FeatureRelax
+     : SubtargetFeature<"relax", "HasLinkerRelax", "true",
+                        "Enable Linker relaxation">;
+-// Experimental auto vectorization
+-def FeatureAutoVec
+-    : SubtargetFeature<"auto-vec", "HasExpAutoVec", "true",
+-                       "Experimental auto vectorization">;
+
+ // Floating point approximation operation
+ def FeatureFrecipe
+diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
+index d47dded9ea6e..7961bb141e64 100644
+--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
++++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
+@@ -26,8 +26,6 @@ TypeSize LoongArchTTIImpl::getRegisterBitWidth(
+   case TargetTransformInfo::RGK_Scalar:
+     return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
+   case TargetTransformInfo::RGK_FixedWidthVector:
+-    if (!ST->hasExpAutoVec())
+-      return DefSize;
+     if (ST->hasExtLASX())
+       return TypeSize::getFixed(256);
+     if (ST->hasExtLSX())
+diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
+index a8ac2411dd82..6ab300859f9d 100644
+--- a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
++++ b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
+@@ -1,5 +1,5 @@
+ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+-; RUN: opt < %s -passes=loop-vectorize -mtriple loongarch64-linux-gnu -mattr=+lasx,+auto-vec -S | FileCheck %s
++; RUN: opt < %s -passes=loop-vectorize -mtriple loongarch64-linux-gnu -mattr=+lasx -S | FileCheck %s
+
+ ;; This is a collection of tests whose only purpose is to show changes in the
+ ;; default configuration. Please keep these tests minimal - if you're testing
+--
+2.20.1
+
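Taken together, the LoongArch patches above expose the new frecipe.{s/d}/frsqrte.{s/d} estimate instructions through the llvm.loongarch.frecipe/frsqrte intrinsics, gated by the +frecipe subtarget feature (implied by la64v1.1 and la664). As a minimal IR sketch of how the scalar estimate might be used, one refinement step can be layered on top of the hardware approximation; the function name refine_recip and the Newton-Raphson step are illustrative assumptions, while the intrinsic name and feature flag come from the patches themselves:

; Sketch only; lower with: llc --mtriple=loongarch64 --mattr=+f,+frecipe
declare float @llvm.loongarch.frecipe.s(float)

define float @refine_recip(float %a) {
entry:
  ; x0 is the hardware estimate of 1/a; one Newton-Raphson step
  ; x1 = x0 * (2 - a * x0) roughly doubles the number of accurate bits.
  %x0 = call float @llvm.loongarch.frecipe.s(float %a)
  %ax0 = fmul float %a, %x0
  %t = fsub float 2.0, %ax0
  %x1 = fmul float %x0, %t
  ret float %x1
}

Since frecipe.s only returns an approximation, callers would choose the number of refinement steps to match the accuracy they need.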
View file
_service:tar_scm:0025-Backport-AArch64-Stack-probing-for-dynamic-allocas-in-GlobalISel.patch
Deleted
@@ -1,496 +0,0 @@
-From dbca022577e0da1f411ee84143d59c6c9d941969 Mon Sep 17 00:00:00 2001
-From: rickyleung <leung.wing.chung@huawei.com>
-Date: Fri, 26 Apr 2024 17:29:18 +0800
-Subject: [PATCH 6/7] backport[AArch64] Stack probing for dynamic allocas in
- GlobalISel
-
-Reference: https://github.com/llvm/llvm-project/commit/c1140d49ec3363bf903e4c1dbf7a3f5e8c1b6523
-
-Co-authored-by: Oliver Stannard <oliver.stannard@linaro.org>
----
- .../llvm/CodeGen/GlobalISel/LegalizerHelper.h |   2 +
- .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  37 ++-
- .../AArch64/GISel/AArch64LegalizerInfo.cpp    |  47 +++-
- .../AArch64/GISel/AArch64LegalizerInfo.h      |   1 +
- .../GlobalISel/legalize-dyn-alloca.mir        | 255 ++++++++++++++----
- .../GlobalISel/legalizer-info-validation.mir  |   7 +
- .../CodeGen/AArch64/stack-probing-dynamic.ll  |   3 +-
- 7 files changed, 284 insertions(+), 68 deletions(-)
-
-diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
-index 9288091874cf..7abbd1f03f16 100644
---- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
-+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
-@@ -400,6 +400,8 @@ public:
-   LegalizeResult lowerUnmergeValues(MachineInstr &MI);
-   LegalizeResult lowerExtractInsertVectorElt(MachineInstr &MI);
-   LegalizeResult lowerShuffleVector(MachineInstr &MI);
-+  Register getDynStackAllocTargetPtr(Register SPReg, Register AllocSize,
-+                                     Align Alignment, LLT PtrTy);
-   LegalizeResult lowerDynStackAlloc(MachineInstr &MI);
-   LegalizeResult lowerStackSave(MachineInstr &MI);
-   LegalizeResult lowerStackRestore(MachineInstr &MI);
-diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
-index 75d9789be4d0..5557456e706d 100644
---- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
-+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
-@@ -6777,21 +6777,12 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
-   return Legalized;
- }
- 
--LegalizerHelper::LegalizeResult
--LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
--  const auto &MF = *MI.getMF();
--  const auto &TFI = *MF.getSubtarget().getFrameLowering();
--  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
--    return UnableToLegalize;
--
--  Register Dst = MI.getOperand(0).getReg();
--  Register AllocSize = MI.getOperand(1).getReg();
--  Align Alignment = assumeAligned(MI.getOperand(2).getImm());
--
--  LLT PtrTy = MRI.getType(Dst);
-+Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg,
-+                                                    Register AllocSize,
-+                                                    Align Alignment,
-+                                                    LLT PtrTy) {
-   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
- 
--  Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
-   auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
-   SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
- 
-@@ -6806,7 +6797,25 @@ LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
-     Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
-   }
- 
--  SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
-+  return MIRBuilder.buildCast(PtrTy, Alloc).getReg(0);
-+}
-+
-+LegalizerHelper::LegalizeResult
-+LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
-+  const auto &MF = *MI.getMF();
-+  const auto &TFI = *MF.getSubtarget().getFrameLowering();
-+  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
-+    return UnableToLegalize;
-+
-+  Register Dst = MI.getOperand(0).getReg();
-+  Register AllocSize = MI.getOperand(1).getReg();
-+  Align Alignment = assumeAligned(MI.getOperand(2).getImm());
-+
-+  LLT PtrTy = MRI.getType(Dst);
-+  Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
-+  Register SPTmp =
-+      getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
-+
-   MIRBuilder.buildCopy(SPReg, SPTmp);
-   MIRBuilder.buildCopy(Dst, SPTmp);
- 
-diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
-index f0130a0be29d..0dd2b4d48dd6 100644
---- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
-+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
-@@ -797,9 +797,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
-         return Query.Types[0] == p0 && Query.Types[1] == s64;
-       });
- 
--  getActionDefinitionsBuilder({G_DYN_STACKALLOC,
--                               G_STACKSAVE,
--                               G_STACKRESTORE}).lower();
-+  getActionDefinitionsBuilder(G_DYN_STACKALLOC).custom();
-+
-+  getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).lower();
- 
-   if (ST.hasMOPS()) {
-     // G_BZERO is not supported. Currently it is only emitted by
-@@ -993,6 +993,8 @@ bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
-     return legalizeMemOps(MI, Helper);
-   case TargetOpcode::G_FCOPYSIGN:
-     return legalizeFCopySign(MI, Helper);
-+  case TargetOpcode::G_DYN_STACKALLOC:
-+    return legalizeDynStackAlloc(MI, Helper);
-   }
- 
-   llvm_unreachable("expected switch to return");
-@@ -1689,3 +1691,42 @@ bool AArch64LegalizerInfo::legalizeFCopySign(MachineInstr &MI,
-   MI.eraseFromParent();
-   return true;
- }
-+
-+bool AArch64LegalizerInfo::legalizeDynStackAlloc(
-+    MachineInstr &MI, LegalizerHelper &Helper) const {
-+  MachineFunction &MF = *MI.getParent()->getParent();
-+  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
-+  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
-+
-+  // If stack probing is not enabled for this function, use the default
-+  // lowering.
-+  if (!MF.getFunction().hasFnAttribute("probe-stack") ||
-+      MF.getFunction().getFnAttribute("probe-stack").getValueAsString() !=
-+          "inline-asm") {
-+    Helper.lowerDynStackAlloc(MI);
-+    return true;
-+  }
-+
-+  Register Dst = MI.getOperand(0).getReg();
-+  Register AllocSize = MI.getOperand(1).getReg();
-+  Align Alignment = assumeAligned(MI.getOperand(2).getImm());
-+
-+  assert(MRI.getType(Dst) == LLT::pointer(0, 64) &&
-+         "Unexpected type for dynamic alloca");
-+  assert(MRI.getType(AllocSize) == LLT::scalar(64) &&
-+         "Unexpected type for dynamic alloca");
-+
-+  LLT PtrTy = MRI.getType(Dst);
-+  Register SPReg =
-+      Helper.getTargetLowering().getStackPointerRegisterToSaveRestore();
-+  Register SPTmp =
-+      Helper.getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
-+  auto NewMI =
-+      MIRBuilder.buildInstr(AArch64::PROBED_STACKALLOC_DYN, {}, {SPTmp});
-+  MRI.setRegClass(NewMI.getReg(0), &AArch64::GPR64commonRegClass);
-+  MIRBuilder.setInsertPt(*NewMI->getParent(), NewMI);
-+  MIRBuilder.buildCopy(Dst, SPTmp);
-+
-+  MI.eraseFromParent();
-+  return true;
-+}
-\ No newline at end of file
-diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
-index c10f6e071ed4..94484ea59d15 100644
---- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
-+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
-@@ -58,6 +58,7 @@ private:
-   bool legalizeCTTZ(MachineInstr &MI, LegalizerHelper &Helper) const;
-   bool legalizeMemOps(MachineInstr &MI, LegalizerHelper &Helper) const;
-   bool legalizeFCopySign(MachineInstr &MI, LegalizerHelper &Helper) const;
-+  bool legalizeDynStackAlloc(MachineInstr &MI, LegalizerHelper &Helper) const;
-   const AArch64Subtarget *ST;
- };
- } // End llvm namespace.
-diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir
-index e9188fb89f69..882c7468e70f 100644
---- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir
-+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir
-@@ -19,6 +19,21 @@
-     ret i128* %addr
-   }
- 
-+  define i8* @test_simple_alloca_stack_probing(i32 %numelts) "probe-stack"="inline-asm" {
-+    %addr = alloca i8, i32 %numelts
-+    ret i8* %addr
-+  }
-+
-+  define i8* @test_aligned_alloca_stack_probing(i32 %numelts) "probe-stack"="inline-asm" {
-+    %addr = alloca i8, i32 %numelts, align 32
-+    ret i8* %addr
-+  }
-+
-+  define i128* @test_natural_alloca_stack_probing(i32 %numelts) "probe-stack"="inline-asm" {
-+    %addr = alloca i128, i32 %numelts
-+    ret i128* %addr
-+  }
-+
- ...
- ---
- name: test_simple_alloca
-@@ -37,22 +52,23 @@ body: |
- 
-     ; CHECK-LABEL: name: test_simple_alloca
-     ; CHECK: liveins: $w0
--    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
--    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
--    ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
--    ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
--    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
--    ; CHECK: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
--    ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
--    ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
--    ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
--    ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
--    ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
--    ; CHECK: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64)
--    ; CHECK: $sp = COPY [[INTTOPTR]](p0)
--    ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
--    ; CHECK: $x0 = COPY [[COPY2]](p0)
--    ; CHECK: RET_ReallyLR implicit $x0
-+    ; CHECK-NEXT: {{  $}}
-+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
-+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
-+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
-+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
-+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
-+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
-+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
-+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
-+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64)
-+    ; CHECK-NEXT: $sp = COPY [[INTTOPTR]](p0)
-+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
-+    ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0)
-+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
-     %0:_(s32) = COPY $w0
-     %3:_(s64) = G_CONSTANT i64 1
-     %1:_(s64) = G_ZEXT %0(s32)
-@@ -83,24 +99,25 @@ body: |
- 
-     ; CHECK-LABEL: name: test_aligned_alloca
-     ; CHECK: liveins: $w0
--    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
--    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
--    ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
--    ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
--    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
--    ; CHECK: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
--    ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
--    ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
--    ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
--    ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
--    ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
--    ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -32
--    ; CHECK: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C3]]
--    ; CHECK: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[AND1]](s64)
--    ; CHECK: $sp = COPY [[INTTOPTR]](p0)
--    ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
--    ; CHECK: $x0 = COPY [[COPY2]](p0)
--    ; CHECK: RET_ReallyLR implicit $x0
-+    ; CHECK-NEXT: {{  $}}
-+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
-+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
-+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
-+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
-+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
-+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
-+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
-+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
-+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -32
-+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C3]]
-+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[AND1]](s64)
-+    ; CHECK-NEXT: $sp = COPY [[INTTOPTR]](p0)
-+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
-+    ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0)
-+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
-     %0:_(s32) = COPY $w0
-     %3:_(s64) = G_CONSTANT i64 1
-     %1:_(s64) = G_ZEXT %0(s32)
-@@ -131,22 +148,23 @@ body: |
- 
-     ; CHECK-LABEL: name: test_natural_alloca
-     ; CHECK: liveins: $w0
--    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
--    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
--    ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
--    ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
--    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
--    ; CHECK: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
--    ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
--    ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
--    ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
--    ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
--    ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
--    ; CHECK: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64)
--    ; CHECK: $sp = COPY [[INTTOPTR]](p0)
--    ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
--    ; CHECK: $x0 = COPY [[COPY2]](p0)
--    ; CHECK: RET_ReallyLR implicit $x0
-+    ; CHECK-NEXT: {{  $}}
-+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
-+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
-+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
-+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
-+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
-+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
-+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
-+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
-+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64)
-+    ; CHECK-NEXT: $sp = COPY [[INTTOPTR]](p0)
-+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
-+    ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0)
-+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
-     %0:_(s32) = COPY $w0
-     %3:_(s64) = G_CONSTANT i64 16
-     %1:_(s64) = G_ZEXT %0(s32)
-@@ -160,3 +178,140 @@ body: |
-     RET_ReallyLR implicit $x0
- 
- ...
-+---
-+name: test_simple_alloca_stack_probing
-+alignment: 4
-+tracksRegLiveness: true
-+liveins:
-+  - { reg: '$w0' }
-+frameInfo:
-+  maxAlignment: 1
-+stack:
-+  - { id: 0, name: addr, type: variable-sized, alignment: 1 }
-+machineFunctionInfo: {}
-+body: |
-+  bb.1 (%ir-block.0):
-+    liveins: $w0
-+    ; CHECK-LABEL: name: test_simple_alloca_stack_probing
-+    ; CHECK: liveins: $w0
-+    ; CHECK-NEXT: {{  $}}
-+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
-+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
-+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s64)
-+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[SHL]], [[C1]]
-+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
-+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
-+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
-+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
-+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
-+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:gpr64common(p0) = G_INTTOPTR [[SUB]](s64)
-+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
-+    ; CHECK-NEXT: PROBED_STACKALLOC_DYN [[INTTOPTR]](p0), implicit-def $sp, implicit-def $nzcv, implicit $sp
-+    ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0)
-+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
-+    %0:_(s32) = COPY $w0
-+    %1:_(s64) = G_ZEXT %0(s32)
-+    %9:_(s64) = G_CONSTANT i64 0
-+    %2:_(s64) = G_SHL %1, %9(s64)
-+    %4:_(s64) = G_CONSTANT i64 15
-+    %5:_(s64) = nuw G_ADD %2, %4
-+    %6:_(s64) = G_CONSTANT i64 -16
-+    %7:_(s64) = G_AND %5, %6
-+    %8:_(p0) = G_DYN_STACKALLOC %7(s64), 1
-+    $x0 = COPY %8(p0)
-+    RET_ReallyLR implicit $x0
-+...
-+---
-+name: test_aligned_alloca_stack_probing
-+alignment: 4
-+tracksRegLiveness: true
-+liveins:
-+  - { reg: '$w0' }
-+frameInfo:
-+  maxAlignment: 32
-+stack:
-+  - { id: 0, name: addr, type: variable-sized, alignment: 32 }
-+machineFunctionInfo: {}
-+body: |
-+  bb.1 (%ir-block.0):
-+    liveins: $w0
-+    ; CHECK-LABEL: name: test_aligned_alloca_stack_probing
-+    ; CHECK: liveins: $w0
-+    ; CHECK-NEXT: {{  $}}
-+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
-+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
-+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s64)
-+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[SHL]], [[C1]]
-+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
-+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
-+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
-+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
-+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
-+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -32
-+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C3]]
-+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:gpr64common(p0) = G_INTTOPTR [[AND1]](s64)
-+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
-+    ; CHECK-NEXT: PROBED_STACKALLOC_DYN [[INTTOPTR]](p0), implicit-def $sp, implicit-def $nzcv, implicit $sp
-+    ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0)
-+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
-+    %0:_(s32) = COPY $w0
-+    %1:_(s64) = G_ZEXT %0(s32)
-+    %9:_(s64) = G_CONSTANT i64 0
-+    %2:_(s64) = G_SHL %1, %9(s64)
-+    %4:_(s64) = G_CONSTANT i64 15
-+    %5:_(s64) = nuw G_ADD %2, %4
-+    %6:_(s64) = G_CONSTANT i64 -16
-+    %7:_(s64) = G_AND %5, %6
-+    %8:_(p0) = G_DYN_STACKALLOC %7(s64), 32
-+    $x0 = COPY %8(p0)
-+    RET_ReallyLR implicit $x0
-+...
-+---
-+name: test_natural_alloca_stack_probing
-+alignment: 4
-+tracksRegLiveness: true
-+liveins:
-+  - { reg: '$w0' }
-+frameInfo:
-+  maxAlignment: 1
-+stack:
-+  - { id: 0, name: addr, type: variable-sized, alignment: 1 }
-+machineFunctionInfo: {}
-+body: |
-+  bb.1 (%ir-block.0):
-+    liveins: $w0
-+    ; CHECK-LABEL: name: test_natural_alloca_stack_probing
-+    ; CHECK: liveins: $w0
-+    ; CHECK-NEXT: {{  $}}
-+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
-+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s64)
-+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[SHL]], [[C1]]
-+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
-+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
-+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
-+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
-+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
-+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:gpr64common(p0) = G_INTTOPTR [[SUB]](s64)
-+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
-+    ; CHECK-NEXT: PROBED_STACKALLOC_DYN [[INTTOPTR]](p0), implicit-def $sp, implicit-def $nzcv, implicit $sp
-+    ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0)
-+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
-+    %0:_(s32) = COPY $w0
-+    %1:_(s64) = G_ZEXT %0(s32)
-+    %9:_(s64) = G_CONSTANT i64 4
-+    %2:_(s64) = G_SHL %1, %9(s64)
-+    %4:_(s64) = G_CONSTANT i64 15
-+    %5:_(s64) = nuw G_ADD %2, %4
-+    %6:_(s64) = G_CONSTANT i64 -16
-+    %7:_(s64) = G_AND %5, %6
-+    %8:_(p0) = G_DYN_STACKALLOC %7(s64), 1
-+    $x0 = COPY %8(p0)
-+    RET_ReallyLR implicit $x0
-+...
-\ No newline at end of file
-diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
-index 461161f5b338..efae9b66b53d 100644
---- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
-+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
-@@ -652,6 +652,13 @@
- # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to [[DYN_STACKALLOC]]
- # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
- # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
-+# DEBUG-NEXT: G_STACKSAVE (opcode [[STACKSAVE:[0-9]+]]): 1 type index, 0 imm indices
-+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
-+# DEBUG-NEXT: G_STACKRESTORE (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to [[STACKSAVE]]
-+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
- # DEBUG-NEXT: G_STRICT_FADD (opcode {{[0-9]+}}): 1 type index, 0 imm indices
- # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
- # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
-diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
-index 4d9ef77f7a0d..ad9cdbe92b23 100644
---- a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
-+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
-@@ -1,5 +1,6 @@
- ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
--; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s
-+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s
-+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s
- 
- ; Dynamically-sized allocation, needs a loop which can handle any size at
- ; runtime. The final iteration of the loop will temporarily put SP below the
--- 
-2.42.0.windows.2
-
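For reference, the CHECK lines in the deleted test above pin down the pointer arithmetic the legalizer emits for a dynamic alloca: round the requested size up to a 16-byte multiple (G_ADD 15, G_AND -16), subtract it from SP, and for over-aligned allocas mask the result down to the alignment (for example G_AND -32); the probing variant then hands that target pointer to PROBED_STACKALLOC_DYN instead of copying it straight to $sp. A small standalone C++ sketch of the same arithmetic, with a hypothetical function name and sample values:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring the sequence in the CHECK lines above:
// the size is rounded up to a 16-byte multiple, subtracted from SP, and
// the result is masked down when the alloca is over-aligned.
uint64_t dynAllocaTargetSP(uint64_t sp, uint64_t sizeBytes, uint64_t align) {
  uint64_t rounded = (sizeBytes + 15) & ~uint64_t(15); // G_ADD 15, G_AND -16
  uint64_t target = sp - rounded;                      // G_SUB from old SP
  if (align > 16)
    target &= ~(align - 1);                            // e.g. G_AND -32
  return target;
}

int main() {
  // 100 bytes from sp = 0x1000 at 32-byte alignment: rounded to 112,
  // 0x1000 - 112 = 0xf90, masked down to 0xf80.
  assert(dynAllocaTargetSP(0x1000, 100, 32) == 0xf80);
  std::printf("ok\n");
}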
View file
_service:tar_scm:0026-Update-testcase-for-stack-clash-protection-backport.patch
Deleted
@@ -1,177 +0,0 @@
-From 9425ee5f8608ff8611628d83386f61950d7fff85 Mon Sep 17 00:00:00 2001
-From: rickyleung <leung.wing.chung@huawei.com>
-Date: Tue, 7 May 2024 21:37:03 +0800
-Subject: [PATCH 7/7] Update testcase for stack clash protection backport
-
----
- .../GlobalISel/legalize-dyn-alloca.mir        |  3 +-
- .../GlobalISel/stacksave-stackrestore.ll      | 14 ++++++----
- .../CodeGen/AArch64/stack-probing-dynamic.ll  | 16 ++++++-----
- .../AArch64/stack-probing-last-in-block.mir   |  4 +--
- .../X86/GlobalISel/stacksave-stackrestore.ll  | 28 +++++++++++--------
- 5 files changed, 36 insertions(+), 29 deletions(-)
-
-diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir
-index 882c7468e70f..82781cebc55a 100644
---- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir
-+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir
-@@ -313,5 +313,4 @@ body: |
-     %7:_(s64) = G_AND %5, %6
-     %8:_(p0) = G_DYN_STACKALLOC %7(s64), 1
-     $x0 = COPY %8(p0)
--    RET_ReallyLR implicit $x0
--...
-\ No newline at end of file
-+    RET_ReallyLR implicit $x0
-\ No newline at end of file
-diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll b/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll
-index 16bf85af9c17..97ecca0bd77b 100644
---- a/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll
-+++ b/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll
-@@ -15,14 +15,18 @@ define void @test_scoped_alloca(i64 %n) {
- ; CHECK-NEXT:    .cfi_offset w19, -16
- ; CHECK-NEXT:    .cfi_offset w30, -24
- ; CHECK-NEXT:    .cfi_offset w29, -32
--; CHECK-NEXT:    add x9, x0, #15
-+; CHECK-NEXT:    mov x19, x0
-+; CHECK-NEXT:    bl llvm.stacksave.p0
-+; CHECK-NEXT:    add x9, x19, #15
- ; CHECK-NEXT:    mov x8, sp
- ; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
--; CHECK-NEXT:    mov x19, sp
--; CHECK-NEXT:    sub x0, x8, x9
--; CHECK-NEXT:    mov sp, x0
-+; CHECK-NEXT:    mov x19, x0
-+; CHECK-NEXT:    sub x8, x8, x9
-+; CHECK-NEXT:    mov sp, x8
-+; CHECK-NEXT:    mov x0, x8
- ; CHECK-NEXT:    bl use_addr
--; CHECK-NEXT:    mov sp, x19
-+; CHECK-NEXT:    mov x0, x19
-+; CHECK-NEXT:    bl llvm.stackrestore.p0
- ; CHECK-NEXT:    mov sp, x29
- ; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
- ; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
-index ad9cdbe92b23..3cbcf7749b2a 100644
---- a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
-+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
-@@ -59,10 +59,10 @@ define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 {
- ; CHECK-NEXT:    str xzr, [sp, #-64]!
- ; CHECK-NEXT:    add x9, x0, #15
- ; CHECK-NEXT:    mov x8, sp
--; CHECK-NEXT:    sub x10, x29, #64
- ; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
--; CHECK-NEXT:    str x10, [x1]
-+; CHECK-NEXT:    sub x10, x29, #64
- ; CHECK-NEXT:    sub x8, x8, x9
-+; CHECK-NEXT:    str x10, [x1]
- ; CHECK-NEXT:  .LBB1_1: // =>This Inner Loop Header: Depth=1
- ; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
- ; CHECK-NEXT:    cmp sp, x8
-@@ -108,10 +108,10 @@ define void @dynamic_align_64(i64 %size, ptr %out) #0 {
- ; CHECK-NEXT:    and sp, x9, #0xffffffffffffffc0
- ; CHECK-NEXT:    add x9, x0, #15
- ; CHECK-NEXT:    mov x8, sp
--; CHECK-NEXT:    str xzr, [sp]
- ; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
--; CHECK-NEXT:    mov x19, sp
-+; CHECK-NEXT:    str xzr, [sp]
- ; CHECK-NEXT:    sub x8, x8, x9
-+; CHECK-NEXT:    mov x19, sp
- ; CHECK-NEXT:    and x8, x8, #0xffffffffffffffc0
- ; CHECK-NEXT:  .LBB2_1: // =>This Inner Loop Header: Depth=1
- ; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
-@@ -167,10 +167,10 @@ define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
- ; CHECK-NEXT:    mov sp, x9
- ; CHECK-NEXT:    add x9, x0, #15
- ; CHECK-NEXT:    mov x8, sp
--; CHECK-NEXT:    str xzr, [sp]
- ; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
--; CHECK-NEXT:    mov x19, sp
-+; CHECK-NEXT:    str xzr, [sp]
- ; CHECK-NEXT:    sub x8, x8, x9
-+; CHECK-NEXT:    mov x19, sp
- ; CHECK-NEXT:    and x8, x8, #0xffffffffffffe000
- ; CHECK-NEXT:  .LBB3_4: // =>This Inner Loop Header: Depth=1
- ; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
-@@ -268,8 +268,10 @@ define void @no_reserved_call_frame(i64 %n) #0 {
- ; CHECK-NEXT:    str xzr, [sp]
- ; CHECK-NEXT:    sub sp, sp, #1104
- ; CHECK-NEXT:    str xzr, [sp]
-+; CHECK-NEXT:    sub sp, sp, #1104
- ; CHECK-NEXT:    bl callee_stack_args
- ; CHECK-NEXT:    add sp, sp, #1104
-+; CHECK-NEXT:    add sp, sp, #1104
- ; CHECK-NEXT:    mov sp, x29
- ; CHECK-NEXT:    .cfi_def_cfa wsp, 16
- ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-@@ -331,8 +333,8 @@ define void @dynamic_sve(i64 %size, ptr %out) #0 "target-features"="+sve" {
- ; CHECK-NEXT:    .cfi_offset w29, -32
- ; CHECK-NEXT:    rdvl x9, #1
- ; CHECK-NEXT:    mov x10, #15 // =0xf
--; CHECK-NEXT:    mov x8, sp
- ; CHECK-NEXT:    madd x9, x0, x9, x10
-+; CHECK-NEXT:    mov x8, sp
- ; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
- ; CHECK-NEXT:    sub x8, x8, x9
- ; CHECK-NEXT:  .LBB7_1: // =>This Inner Loop Header: Depth=1
-diff --git a/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir
-index a8a21ab330ba..9a173be5857e 100644
---- a/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir
-+++ b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir
-@@ -141,6 +141,4 @@ body: |
-     B %bb.2
- 
-   bb.2.exit:
--    RET_ReallyLR
--
--...
-\ No newline at end of file
-+    RET_ReallyLR
-\ No newline at end of file
-diff --git a/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll b/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll
-index e86c04ee22db..8f665924577f 100644
---- a/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll
-+++ b/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll
-@@ -13,21 +13,25 @@ define void @test_scoped_alloca(i64 %n) {
- ; CHECK-NEXT:    .cfi_offset %rbp, -16
- ; CHECK-NEXT:    movq %rsp, %rbp
- ; CHECK-NEXT:    .cfi_def_cfa_register %rbp
-+; CHECK-NEXT:    pushq %r14
- ; CHECK-NEXT:    pushq %rbx
--; CHECK-NEXT:    pushq %rax
--; CHECK-NEXT:    .cfi_offset %rbx, -24
--; CHECK-NEXT:    movq %rsp, %rbx
--; CHECK-NEXT:    movq %rsp, %rax
--; CHECK-NEXT:    imulq $1, %rdi, %rcx
--; CHECK-NEXT:    addq $15, %rcx
--; CHECK-NEXT:    andq $-16, %rcx
--; CHECK-NEXT:    subq %rcx, %rax
--; CHECK-NEXT:    movq %rax, %rsp
--; CHECK-NEXT:    movq %rax, %rdi
-+; CHECK-NEXT:    .cfi_offset %rbx, -32
-+; CHECK-NEXT:    .cfi_offset %r14, -24
-+; CHECK-NEXT:    movq %rdi, %rbx
-+; CHECK-NEXT:    callq llvm.stacksave.p0
-+; CHECK-NEXT:    movq %rax, %r14
-+; CHECK-NEXT:    movq %rsp, %rdi
-+; CHECK-NEXT:    imulq $1, %rbx, %rax
-+; CHECK-NEXT:    addq $15, %rax
-+; CHECK-NEXT:    andq $-16, %rax
-+; CHECK-NEXT:    subq %rax, %rdi
-+; CHECK-NEXT:    movq %rdi, %rsp
- ; CHECK-NEXT:    callq use_addr
--; CHECK-NEXT:    movq %rbx, %rsp
--; CHECK-NEXT:    leaq -8(%rbp), %rsp
-+; CHECK-NEXT:    movq %r14, %rdi
-+; CHECK-NEXT:    callq llvm.stackrestore.p0
-+; CHECK-NEXT:    leaq -16(%rbp), %rsp
- ; CHECK-NEXT:    popq %rbx
-+; CHECK-NEXT:    popq %r14
- ; CHECK-NEXT:    popq %rbp
- ; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
- ; CHECK-NEXT:    retq
--- 
-2.42.0.windows.2
-
View file
_service
Changed
@@ -2,7 +2,7 @@
   <service name="tar_scm">
     <param name="scm">git</param>
     <param name="url">git@gitee.com:src-openeuler/llvm.git</param>
-    <param name="revision">openEuler-24.03-LTS-Next</param>
+    <param name="revision">openEuler-24.03-LTS-SP1</param>
     <param name="exclude">*</param>
     <param name="extract">*</param>
   </service>