aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--docs/CMake.rst5
-rw-r--r--docs/CoverageMappingFormat.rst40
-rw-r--r--docs/GettingStarted.rst2
-rw-r--r--docs/MCJITDesignAndImplementation.rst360
-rw-r--r--include/llvm/Analysis/MemoryBuiltins.h5
-rw-r--r--include/llvm/CodeGen/MachineInstr.h27
-rw-r--r--include/llvm/CodeGen/MachineInstrBuilder.h5
-rw-r--r--include/llvm/CodeGen/MachineInstrBundle.h2
-rw-r--r--include/llvm/CodeGen/WinEHFuncInfo.h4
-rw-r--r--include/llvm/IR/CallSite.h5
-rw-r--r--include/llvm/IR/IRBuilder.h60
-rw-r--r--include/llvm/IR/Instructions.h21
-rw-r--r--include/llvm/IR/IntrinsicsX86.td152
-rw-r--r--include/llvm/IR/Metadata.h17
-rw-r--r--include/llvm/IR/Statepoint.h56
-rw-r--r--include/llvm/MC/SubtargetFeature.h22
-rw-r--r--include/llvm/ProfileData/InstrProf.h48
-rw-r--r--include/llvm/ProfileData/InstrProfData.inc21
-rw-r--r--include/llvm/Support/ARMTargetParser.def1
-rw-r--r--include/llvm/Support/Program.h2
-rw-r--r--include/llvm/Support/YAMLParser.h23
-rw-r--r--include/llvm/TableGen/Record.h35
-rw-r--r--include/llvm/Target/Target.td4
-rw-r--r--include/llvm/Target/TargetLowering.h6
-rw-r--r--include/llvm/Transforms/Utils/BypassSlowDivision.h10
-rw-r--r--include/llvm/Transforms/Utils/LoopUtils.h3
-rw-r--r--lib/Analysis/BasicAliasAnalysis.cpp25
-rw-r--r--lib/Analysis/GlobalsModRef.cpp17
-rw-r--r--lib/Analysis/MemoryBuiltins.cpp7
-rw-r--r--lib/Analysis/MemoryDependenceAnalysis.cpp22
-rw-r--r--lib/Analysis/TargetLibraryInfo.cpp13
-rw-r--r--lib/Analysis/ValueTracking.cpp25
-rw-r--r--lib/Bitcode/Reader/BitcodeReader.cpp10
-rw-r--r--lib/CodeGen/AsmPrinter/WinException.cpp43
-rw-r--r--lib/CodeGen/CodeGenPrepare.cpp55
-rw-r--r--lib/CodeGen/MachineCSE.cpp38
-rw-r--r--lib/CodeGen/MachineInstr.cpp42
-rw-r--r--lib/CodeGen/MachineInstrBundle.cpp2
-rw-r--r--lib/CodeGen/RegisterPressure.cpp5
-rw-r--r--lib/CodeGen/SelectionDAG/DAGCombiner.cpp6
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAG.cpp94
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp57
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h2
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp3
-rw-r--r--lib/CodeGen/SelectionDAG/StatepointLowering.cpp41
-rw-r--r--lib/CodeGen/TargetSchedule.cpp2
-rw-r--r--lib/CodeGen/WinEHPrepare.cpp301
-rw-r--r--lib/Fuzzer/FuzzerDriver.cpp1
-rw-r--r--lib/Fuzzer/FuzzerFlags.def2
-rw-r--r--lib/Fuzzer/FuzzerInternal.h2
-rw-r--r--lib/Fuzzer/FuzzerLoop.cpp18
-rw-r--r--lib/Fuzzer/FuzzerMutate.cpp17
-rw-r--r--lib/Fuzzer/FuzzerTraceState.cpp21
-rw-r--r--lib/Fuzzer/test/CMakeLists.txt1
-rw-r--r--lib/Fuzzer/test/ThreadedTest.cpp23
-rw-r--r--lib/Fuzzer/test/fuzzer-threaded.test7
-rw-r--r--lib/Fuzzer/test/fuzzer.test6
-rw-r--r--lib/IR/AsmWriter.cpp15
-rw-r--r--lib/IR/Attributes.cpp17
-rw-r--r--lib/IR/Instruction.cpp13
-rw-r--r--lib/IR/Instructions.cpp25
-rw-r--r--lib/IR/Metadata.cpp6
-rw-r--r--lib/IR/Statepoint.cpp15
-rw-r--r--lib/IR/Verifier.cpp28
-rw-r--r--lib/Linker/IRMover.cpp23
-rw-r--r--lib/MC/MCDwarf.cpp10
-rw-r--r--lib/MC/MCObjectFileInfo.cpp2
-rw-r--r--lib/MC/MCSubtargetInfo.cpp17
-rw-r--r--lib/MC/SubtargetFeature.cpp30
-rw-r--r--lib/ProfileData/CoverageMappingReader.cpp15
-rw-r--r--lib/ProfileData/InstrProf.cpp99
-rw-r--r--lib/Support/Unix/Program.inc4
-rw-r--r--lib/Support/Windows/Program.inc9
-rw-r--r--lib/Support/Windows/WindowsSupport.h18
-rw-r--r--lib/Support/raw_ostream.cpp19
-rw-r--r--lib/TableGen/Record.cpp8
-rw-r--r--lib/TableGen/TGParser.cpp27
-rw-r--r--lib/TableGen/TGParser.h9
-rw-r--r--lib/Target/AArch64/AArch64.td10
-rw-r--r--lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp8
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.cpp104
-rw-r--r--lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp26
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.h10
-rw-r--r--lib/Target/AArch64/Utils/AArch64BaseInfo.cpp4
-rw-r--r--lib/Target/AArch64/Utils/AArch64BaseInfo.h12
-rw-r--r--lib/Target/AMDGPU/AMDGPU.td5
-rw-r--r--lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp13
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstructions.td35
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.cpp1
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.h5
-rw-r--r--lib/Target/AMDGPU/CIInstructions.td76
-rw-r--r--lib/Target/AMDGPU/SIFrameLowering.cpp86
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.td28
-rw-r--r--lib/Target/AMDGPU/SIInstructions.td4
-rw-r--r--lib/Target/AMDGPU/SIMachineFunctionInfo.cpp11
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.cpp43
-rw-r--r--lib/Target/AMDGPU/VIInstructions.td9
-rw-r--r--lib/Target/ARM/ARM.td8
-rw-r--r--lib/Target/ARM/ARMConstantIslandPass.cpp12
-rw-r--r--lib/Target/ARM/ARMLoadStoreOptimizer.cpp21
-rw-r--r--lib/Target/ARM/ARMSubtarget.h2
-rw-r--r--lib/Target/Hexagon/Hexagon.td5
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.td2
-rw-r--r--lib/Target/Hexagon/HexagonSystemInst.td113
-rw-r--r--lib/Target/WebAssembly/known_gcc_test_failures.txt17
-rw-r--r--lib/Target/X86/CMakeLists.txt1
-rw-r--r--lib/Target/X86/InstPrinter/X86InstComments.cpp1
-rw-r--r--lib/Target/X86/Utils/X86ShuffleDecode.cpp165
-rw-r--r--lib/Target/X86/Utils/X86ShuffleDecode.h16
-rw-r--r--lib/Target/X86/X86FastISel.cpp6
-rw-r--r--lib/Target/X86/X86FrameLowering.cpp34
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp10
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp186
-rw-r--r--lib/Target/X86/X86ISelLowering.h4
-rw-r--r--lib/Target/X86/X86InstrAVX512.td1
-rw-r--r--lib/Target/X86/X86InstrCompiler.td26
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp60
-rw-r--r--lib/Target/X86/X86InstrInfo.h4
-rw-r--r--lib/Target/X86/X86InstrInfo.td32
-rw-r--r--lib/Target/X86/X86InstrMMX.td2
-rw-r--r--lib/Target/X86/X86InstrMPX.td6
-rw-r--r--lib/Target/X86/X86InstrSSE.td54
-rw-r--r--lib/Target/X86/X86InstrSystem.td15
-rw-r--r--lib/Target/X86/X86IntrinsicsInfo.h43
-rw-r--r--lib/Target/X86/X86MCInstLower.cpp5
-rw-r--r--lib/Target/X86/X86ShuffleDecodeConstantPool.cpp190
-rw-r--r--lib/Target/X86/X86ShuffleDecodeConstantPool.h45
-rw-r--r--lib/Transforms/IPO/InferFunctionAttrs.cpp53
-rw-r--r--lib/Transforms/InstCombine/InstCombineCalls.cpp3
-rw-r--r--lib/Transforms/InstCombine/InstCombineCasts.cpp16
-rw-r--r--lib/Transforms/InstCombine/InstCombineInternal.h2
-rw-r--r--lib/Transforms/InstCombine/InstCombineVectorOps.cpp29
-rw-r--r--lib/Transforms/InstCombine/InstructionCombining.cpp5
-rw-r--r--lib/Transforms/Instrumentation/InstrProfiling.cpp10
-rw-r--r--lib/Transforms/Scalar/LICM.cpp115
-rw-r--r--lib/Transforms/Scalar/LoopIdiomRecognize.cpp182
-rw-r--r--lib/Transforms/Scalar/MemCpyOptimizer.cpp98
-rw-r--r--lib/Transforms/Scalar/Reassociate.cpp34
-rw-r--r--lib/Transforms/Scalar/RewriteStatepointsForGC.cpp52
-rw-r--r--lib/Transforms/Utils/BypassSlowDivision.cpp100
-rw-r--r--lib/Transforms/Utils/Local.cpp47
-rw-r--r--lib/Transforms/Utils/SimplifyCFG.cpp66
-rw-r--r--lib/Transforms/Utils/SimplifyLibCalls.cpp78
-rw-r--r--lib/Transforms/Utils/ValueMapper.cpp10
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp12
-rw-r--r--test/Analysis/BasicAA/memset_pattern.ll2
-rw-r--r--test/Analysis/GlobalsModRef/argmemonly-escape.ll47
-rw-r--r--test/Analysis/GlobalsModRef/inaccessiblememonly.ll21
-rw-r--r--test/Analysis/GlobalsModRef/modreftest.ll20
-rw-r--r--test/Analysis/ValueTracking/known-power-of-two.ll20
-rw-r--r--test/Bitcode/compatibility.ll12
-rw-r--r--test/CodeGen/AArch64/arm64-vector-ext.ll54
-rw-r--r--test/CodeGen/AArch64/cpus.ll1
-rw-r--r--test/CodeGen/AArch64/remat.ll1
-rw-r--r--test/CodeGen/AArch64/tbz-tbnz.ll103
-rw-r--r--test/CodeGen/AMDGPU/flat-scratch-reg.ll17
-rw-r--r--test/CodeGen/AMDGPU/large-alloca-compute.ll4
-rw-r--r--test/CodeGen/AMDGPU/large-alloca-graphics.ll2
-rw-r--r--test/CodeGen/AMDGPU/load.ll140
-rw-r--r--test/CodeGen/AMDGPU/salu-to-valu.ll269
-rw-r--r--test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll24
-rw-r--r--test/CodeGen/ARM/build-attributes.ll36
-rw-r--r--test/CodeGen/ARM/debugtrap.ll34
-rw-r--r--test/CodeGen/WebAssembly/offset.ll198
-rw-r--r--test/CodeGen/WinEH/wineh-cloning.ll45
-rw-r--r--test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll8
-rw-r--r--test/CodeGen/X86/2011-11-30-or.ll14
-rw-r--r--test/CodeGen/X86/avx-cast.ll84
-rw-r--r--test/CodeGen/X86/avx512-intrinsics.ll123
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics.ll60
-rw-r--r--test/CodeGen/X86/avx512bwvl-intrinsics.ll123
-rw-r--r--test/CodeGen/X86/avx512cd-intrinsics.ll36
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics.ll586
-rw-r--r--test/CodeGen/X86/cmpxchg-clobber-flags.ll23
-rw-r--r--test/CodeGen/X86/copy-eflags.ll54
-rw-r--r--test/CodeGen/X86/divrem8_ext.ll19
-rw-r--r--test/CodeGen/X86/fold-load-unops.ll94
-rw-r--r--test/CodeGen/X86/fpcmp-soft-fp.ll254
-rw-r--r--test/CodeGen/X86/inline-sse.ll4
-rw-r--r--test/CodeGen/X86/insertelement-zero.ll539
-rw-r--r--test/CodeGen/X86/insertps-combine.ll111
-rw-r--r--test/CodeGen/X86/materialize-one.ll100
-rw-r--r--test/CodeGen/X86/materialize.ll184
-rw-r--r--test/CodeGen/X86/peephole-na-phys-copy-folding.ll7
-rw-r--r--test/CodeGen/X86/pku.ll25
-rw-r--r--test/CodeGen/X86/powi.ll4
-rw-r--r--test/CodeGen/X86/pr11415.ll8
-rw-r--r--test/CodeGen/X86/pr21792.ll82
-rw-r--r--test/CodeGen/X86/pr24139.ll296
-rw-r--r--test/CodeGen/X86/sse3-avx-addsub.ll59
-rw-r--r--test/CodeGen/X86/statepoint-far-call.ll44
-rw-r--r--test/CodeGen/X86/system-intrinsics-64-xsave.ll82
-rw-r--r--test/CodeGen/X86/system-intrinsics-64-xsavec.ll42
-rw-r--r--test/CodeGen/X86/system-intrinsics-64-xsaveopt.ll42
-rw-r--r--test/CodeGen/X86/system-intrinsics-64-xsaves.ll82
-rw-r--r--test/CodeGen/X86/system-intrinsics-xsave.ll46
-rw-r--r--test/CodeGen/X86/system-intrinsics-xsavec.ll24
-rw-r--r--test/CodeGen/X86/system-intrinsics-xsaveopt.ll24
-rw-r--r--test/CodeGen/X86/system-intrinsics-xsaves.ll46
-rw-r--r--test/CodeGen/X86/vec_insert-7.ll34
-rw-r--r--test/CodeGen/X86/vec_partial.ll64
-rw-r--r--test/CodeGen/X86/vec_reassociate.ll238
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v4.ll139
-rw-r--r--test/CodeGen/X86/win64_frame.ll5
-rw-r--r--test/CodeGen/X86/wineh-coreclr.ll625
-rw-r--r--test/CodeGen/X86/x86-32-intrcc.ll158
-rw-r--r--test/CodeGen/X86/x86-64-flags-intrinsics.ll37
-rw-r--r--test/CodeGen/X86/x86-64-intrcc.ll170
-rw-r--r--test/CodeGen/X86/x86-flags-intrinsics.ll31
-rw-r--r--test/CodeGen/X86/x86-win64-shrink-wrapping.ll4
-rw-r--r--test/DebugInfo/COFF/asm.ll17
-rw-r--r--test/DebugInfo/debugmacinfo.test54
-rw-r--r--test/JitListener/multiple.ll74
-rw-r--r--test/JitListener/simple.ll10
-rw-r--r--test/MC/ARM/gas-compl-copr-reg.s14
-rw-r--r--test/Transforms/EarlyCSE/AArch64/ldstN.ll36
-rw-r--r--test/Transforms/InferFunctionAttrs/annotate.ll11
-rw-r--r--test/Transforms/InstCombine/double-float-shrink-1.ll253
-rw-r--r--test/Transforms/InstCombine/fast-math.ll33
-rw-r--r--test/Transforms/InstCombine/insert-extract-shuffle.ll53
-rw-r--r--test/Transforms/InstCombine/token.ll17
-rw-r--r--test/Transforms/InstSimplify/call.ll2
-rw-r--r--test/Transforms/LICM/funclet.ll107
-rw-r--r--test/Transforms/LICM/sinking.ll4
-rw-r--r--test/Transforms/MemCpyOpt/fca2memcpy.ll72
-rw-r--r--test/Transforms/PlaceSafepoints/leaf-function.ll35
-rw-r--r--test/Transforms/PlaceSafepoints/statepoint-coreclr.ll62
-rw-r--r--test/Transforms/Reassociate/factorize-again.ll34
-rw-r--r--test/Transforms/Reassociate/secondary.ll2
-rw-r--r--test/Transforms/SimplifyCFG/empty-catchpad.ll115
-rw-r--r--test/Transforms/SimplifyCFG/wineh-unreachable.ll84
-rw-r--r--test/Verifier/invalid-eh.ll60
-rw-r--r--test/tools/llvm-pdbdump/class-layout.test114
-rw-r--r--test/tools/llvm-pdbdump/enum-layout.test40
-rw-r--r--test/tools/llvm-pdbdump/load-address.test20
-rw-r--r--test/tools/llvm-symbolizer/pdb/lit.local.cfg2
-rw-r--r--unittests/IR/IRBuilderTest.cpp10
-rw-r--r--unittests/IR/MetadataTest.cpp14
-rw-r--r--unittests/IR/TypesTest.cpp14
-rw-r--r--unittests/ProfileData/InstrProfTest.cpp74
-rw-r--r--unittests/Support/YAMLParserTest.cpp72
-rw-r--r--utils/TableGen/AsmMatcherEmitter.cpp307
-rw-r--r--utils/TableGen/SubtargetEmitter.cpp28
-rw-r--r--utils/TableGen/TableGen.cpp3
244 files changed, 8773 insertions, 3699 deletions
diff --git a/docs/CMake.rst b/docs/CMake.rst
index 38199e5cc587..9ec6b0a2416e 100644
--- a/docs/CMake.rst
+++ b/docs/CMake.rst
@@ -26,7 +26,10 @@ Quick start
We use here the command-line, non-interactive CMake interface.
#. `Download <http://www.cmake.org/cmake/resources/software.html>`_ and install
- CMake. Version 2.8.8 is the minimum required.
+ CMake. Version 2.8.8 is the minimum required, but if you're using the Ninja
+ backend, CMake v3.2 or newer is required to `get interactive output
+ <http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20141117/244797.html>`_
+ when running :doc:`Lit <CommandGuide/lit>`.
#. Open a shell. Your development tools must be reachable from this shell
through the PATH environment variable.
diff --git a/docs/CoverageMappingFormat.rst b/docs/CoverageMappingFormat.rst
index 9ac476c88b34..84cddff5ed9e 100644
--- a/docs/CoverageMappingFormat.rst
+++ b/docs/CoverageMappingFormat.rst
@@ -241,15 +241,25 @@ For example, let’s consider a C file and how it gets compiled to LLVM:
return 13;
}
-The coverage mapping variable generated by Clang is:
+The coverage mapping variable generated by Clang has 3 fields:
+
+* Coverage mapping header.
+
+* An array of function records.
+
+* Coverage mapping data which is an array of bytes. Zero padding is added at the end to force 8-byte alignment.
.. code-block:: llvm
- @__llvm_coverage_mapping = internal constant { i32, i32, i32, i32, [2 x { i8*, i32, i32 }], [40 x i8] }
- { i32 2, ; The number of function records
- i32 20, ; The length of the string that contains the encoded translation unit filenames
- i32 20, ; The length of the string that contains the encoded coverage mapping data
- i32 0, ; Coverage mapping format version
+ @__llvm_coverage_mapping = internal constant { { i32, i32, i32, i32 }, [2 x { i8*, i32, i32 }], [40 x i8] }
+ {
+ { i32, i32, i32, i32 } ; Coverage map header
+ {
+ i32 2, ; The number of function records
+ i32 20, ; The length of the string that contains the encoded translation unit filenames
+ i32 20, ; The length of the string that contains the encoded coverage mapping data
+ i32 0, ; Coverage mapping format version
+ },
[2 x { i8*, i32, i32 }] [ ; Function records
{ i8*, i32, i32 } { i8* getelementptr inbounds ([3 x i8]* @__llvm_profile_name_foo, i32 0, i32 0), ; Function's name
i32 3, ; Function's name length
@@ -262,12 +272,18 @@ The coverage mapping variable generated by Clang is:
[40 x i8] c"..." ; Encoded data (dissected later)
}, section "__llvm_covmap", align 8
-Version:
---------
+Coverage Mapping Header:
+------------------------
+
+The coverage mapping header has the following fields:
+
+* The number of function records.
+
+* The length of the string in the third field of *__llvm_coverage_mapping* that contains the encoded translation unit filenames.
-The coverage mapping version number can have the following values:
+* The length of the string in the third field of *__llvm_coverage_mapping* that contains the encoded coverage mapping data.
-* 0 — The first (current) version of the coverage mapping format.
+* The format version. 0 is the first (current) version of the coverage mapping format.
.. _function records:
@@ -331,7 +347,7 @@ IR for the `coverage mapping sample`_ that was shown earlier:
* The length of the substring that contains the encoded coverage mapping data
for the first function is the value of the third field in the first
structure in an array of `function records`_ stored in the
- fifth field of the *__llvm_coverage_mapping* structure, which is the 9.
+ third field of the *__llvm_coverage_mapping* structure, which is the 9.
Therefore, the coverage mapping for the first function record is encoded
in this string:
@@ -351,7 +367,7 @@ IR for the `coverage mapping sample`_ that was shown earlier:
| ``0x01`` | The number of mapping regions that are stored in an array for the function's file id #0. |
+----------+-------------------------------------------------------------------------------------------------------------------------+
| ``0x01`` | The coverage mapping counter for the first region in this function. The value of 1 tells us that it's a coverage |
- | | mapping counter that is a reference ot the profile instrumentation counter with an index of 0. |
+ | | mapping counter that is a reference to the profile instrumentation counter with an index of 0. |
+----------+-------------------------------------------------------------------------------------------------------------------------+
| ``0x01`` | The starting line of the first mapping region in this function. |
+----------+-------------------------------------------------------------------------------------------------------------------------+
diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst
index 2585ce135ba6..6aba50036793 100644
--- a/docs/GettingStarted.rst
+++ b/docs/GettingStarted.rst
@@ -78,6 +78,8 @@ Here's the short story for getting up and running quickly with LLVM:
The usual build uses `CMake <CMake.html>`_. If you would rather use
autotools, see `Building LLVM with autotools <BuildingLLVMWithAutotools.html>`_.
+ Although the build is known to work with CMake >= 2.8.8, we recommend CMake
+ >= v3.2, especially if you're generating Ninja build files.
* ``cd where you want to build llvm``
* ``mkdir build``
diff --git a/docs/MCJITDesignAndImplementation.rst b/docs/MCJITDesignAndImplementation.rst
index 237a5be52fb8..63a9e40ec180 100644
--- a/docs/MCJITDesignAndImplementation.rst
+++ b/docs/MCJITDesignAndImplementation.rst
@@ -1,180 +1,180 @@
-===============================
-MCJIT Design and Implementation
-===============================
-
-Introduction
-============
-
-This document describes the internal workings of the MCJIT execution
-engine and the RuntimeDyld component. It is intended as a high level
-overview of the implementation, showing the flow and interactions of
-objects throughout the code generation and dynamic loading process.
-
-Engine Creation
-===============
-
-In most cases, an EngineBuilder object is used to create an instance of
-the MCJIT execution engine. The EngineBuilder takes an llvm::Module
-object as an argument to its constructor. The client may then set various
-options that we control the later be passed along to the MCJIT engine,
-including the selection of MCJIT as the engine type to be created.
-Of particular interest is the EngineBuilder::setMCJITMemoryManager
-function. If the client does not explicitly create a memory manager at
-this time, a default memory manager (specifically SectionMemoryManager)
-will be created when the MCJIT engine is instantiated.
-
-Once the options have been set, a client calls EngineBuilder::create to
-create an instance of the MCJIT engine. If the client does not use the
-form of this function that takes a TargetMachine as a parameter, a new
-TargetMachine will be created based on the target triple associated with
-the Module that was used to create the EngineBuilder.
-
-.. image:: MCJIT-engine-builder.png
-
-EngineBuilder::create will call the static MCJIT::createJIT function,
-passing in its pointers to the module, memory manager and target machine
-objects, all of which will subsequently be owned by the MCJIT object.
-
-The MCJIT class has a member variable, Dyld, which contains an instance of
-the RuntimeDyld wrapper class. This member will be used for
-communications between MCJIT and the actual RuntimeDyldImpl object that
-gets created when an object is loaded.
-
-.. image:: MCJIT-creation.png
-
-Upon creation, MCJIT holds a pointer to the Module object that it received
-from EngineBuilder but it does not immediately generate code for this
-module. Code generation is deferred until either the
-MCJIT::finalizeObject method is called explicitly or a function such as
-MCJIT::getPointerToFunction is called which requires the code to have been
-generated.
-
-Code Generation
-===============
-
-When code generation is triggered, as described above, MCJIT will first
-attempt to retrieve an object image from its ObjectCache member, if one
-has been set. If a cached object image cannot be retrieved, MCJIT will
-call its emitObject method. MCJIT::emitObject uses a local PassManager
-instance and creates a new ObjectBufferStream instance, both of which it
-passes to TargetMachine::addPassesToEmitMC before calling PassManager::run
-on the Module with which it was created.
-
-.. image:: MCJIT-load.png
-
-The PassManager::run call causes the MC code generation mechanisms to emit
-a complete relocatable binary object image (either in either ELF or MachO
-format, depending on the target) into the ObjectBufferStream object, which
-is flushed to complete the process. If an ObjectCache is being used, the
-image will be passed to the ObjectCache here.
-
-At this point, the ObjectBufferStream contains the raw object image.
-Before the code can be executed, the code and data sections from this
-image must be loaded into suitable memory, relocations must be applied and
-memory permission and code cache invalidation (if required) must be completed.
-
-Object Loading
-==============
-
-Once an object image has been obtained, either through code generation or
-having been retrieved from an ObjectCache, it is passed to RuntimeDyld to
-be loaded. The RuntimeDyld wrapper class examines the object to determine
-its file format and creates an instance of either RuntimeDyldELF or
-RuntimeDyldMachO (both of which derive from the RuntimeDyldImpl base
-class) and calls the RuntimeDyldImpl::loadObject method to perform that
-actual loading.
-
-.. image:: MCJIT-dyld-load.png
-
-RuntimeDyldImpl::loadObject begins by creating an ObjectImage instance
-from the ObjectBuffer it received. ObjectImage, which wraps the
-ObjectFile class, is a helper class which parses the binary object image
-and provides access to the information contained in the format-specific
-headers, including section, symbol and relocation information.
-
-RuntimeDyldImpl::loadObject then iterates through the symbols in the
-image. Information about common symbols is collected for later use. For
-each function or data symbol, the associated section is loaded into memory
-and the symbol is stored in a symbol table map data structure. When the
-iteration is complete, a section is emitted for the common symbols.
-
-Next, RuntimeDyldImpl::loadObject iterates through the sections in the
-object image and for each section iterates through the relocations for
-that sections. For each relocation, it calls the format-specific
-processRelocationRef method, which will examine the relocation and store
-it in one of two data structures, a section-based relocation list map and
-an external symbol relocation map.
-
-.. image:: MCJIT-load-object.png
-
-When RuntimeDyldImpl::loadObject returns, all of the code and data
-sections for the object will have been loaded into memory allocated by the
-memory manager and relocation information will have been prepared, but the
-relocations have not yet been applied and the generated code is still not
-ready to be executed.
-
-[Currently (as of August 2013) the MCJIT engine will immediately apply
-relocations when loadObject completes. However, this shouldn't be
-happening. Because the code may have been generated for a remote target,
-the client should be given a chance to re-map the section addresses before
-relocations are applied. It is possible to apply relocations multiple
-times, but in the case where addresses are to be re-mapped, this first
-application is wasted effort.]
-
-Address Remapping
-=================
-
-At any time after initial code has been generated and before
-finalizeObject is called, the client can remap the address of sections in
-the object. Typically this is done because the code was generated for an
-external process and is being mapped into that process' address space.
-The client remaps the section address by calling MCJIT::mapSectionAddress.
-This should happen before the section memory is copied to its new
-location.
-
-When MCJIT::mapSectionAddress is called, MCJIT passes the call on to
-RuntimeDyldImpl (via its Dyld member). RuntimeDyldImpl stores the new
-address in an internal data structure but does not update the code at this
-time, since other sections are likely to change.
-
-When the client is finished remapping section addresses, it will call
-MCJIT::finalizeObject to complete the remapping process.
-
-Final Preparations
-==================
-
-When MCJIT::finalizeObject is called, MCJIT calls
-RuntimeDyld::resolveRelocations. This function will attempt to locate any
-external symbols and then apply all relocations for the object.
-
-External symbols are resolved by calling the memory manager's
-getPointerToNamedFunction method. The memory manager will return the
-address of the requested symbol in the target address space. (Note, this
-may not be a valid pointer in the host process.) RuntimeDyld will then
-iterate through the list of relocations it has stored which are associated
-with this symbol and invoke the resolveRelocation method which, through an
-format-specific implementation, will apply the relocation to the loaded
-section memory.
-
-Next, RuntimeDyld::resolveRelocations iterates through the list of
-sections and for each section iterates through a list of relocations that
-have been saved which reference that symbol and call resolveRelocation for
-each entry in this list. The relocation list here is a list of
-relocations for which the symbol associated with the relocation is located
-in the section associated with the list. Each of these locations will
-have a target location at which the relocation will be applied that is
-likely located in a different section.
-
-.. image:: MCJIT-resolve-relocations.png
-
-Once relocations have been applied as described above, MCJIT calls
-RuntimeDyld::getEHFrameSection, and if a non-zero result is returned
-passes the section data to the memory manager's registerEHFrames method.
-This allows the memory manager to call any desired target-specific
-functions, such as registering the EH frame information with a debugger.
-
-Finally, MCJIT calls the memory manager's finalizeMemory method. In this
-method, the memory manager will invalidate the target code cache, if
-necessary, and apply final permissions to the memory pages it has
-allocated for code and data memory.
-
+===============================
+MCJIT Design and Implementation
+===============================
+
+Introduction
+============
+
+This document describes the internal workings of the MCJIT execution
+engine and the RuntimeDyld component. It is intended as a high level
+overview of the implementation, showing the flow and interactions of
+objects throughout the code generation and dynamic loading process.
+
+Engine Creation
+===============
+
+In most cases, an EngineBuilder object is used to create an instance of
+the MCJIT execution engine. The EngineBuilder takes an llvm::Module
+object as an argument to its constructor. The client may then set various
+options that will later be passed along to the MCJIT engine,
+including the selection of MCJIT as the engine type to be created.
+Of particular interest is the EngineBuilder::setMCJITMemoryManager
+function. If the client does not explicitly create a memory manager at
+this time, a default memory manager (specifically SectionMemoryManager)
+will be created when the MCJIT engine is instantiated.
+
+Once the options have been set, a client calls EngineBuilder::create to
+create an instance of the MCJIT engine. If the client does not use the
+form of this function that takes a TargetMachine as a parameter, a new
+TargetMachine will be created based on the target triple associated with
+the Module that was used to create the EngineBuilder.
+
+.. image:: MCJIT-engine-builder.png
+
+EngineBuilder::create will call the static MCJIT::createJIT function,
+passing in its pointers to the module, memory manager and target machine
+objects, all of which will subsequently be owned by the MCJIT object.
+
+The MCJIT class has a member variable, Dyld, which contains an instance of
+the RuntimeDyld wrapper class. This member will be used for
+communications between MCJIT and the actual RuntimeDyldImpl object that
+gets created when an object is loaded.
+
+.. image:: MCJIT-creation.png
+
+Upon creation, MCJIT holds a pointer to the Module object that it received
+from EngineBuilder but it does not immediately generate code for this
+module. Code generation is deferred until either the
+MCJIT::finalizeObject method is called explicitly or a function such as
+MCJIT::getPointerToFunction is called which requires the code to have been
+generated.
+
+Code Generation
+===============
+
+When code generation is triggered, as described above, MCJIT will first
+attempt to retrieve an object image from its ObjectCache member, if one
+has been set. If a cached object image cannot be retrieved, MCJIT will
+call its emitObject method. MCJIT::emitObject uses a local PassManager
+instance and creates a new ObjectBufferStream instance, both of which it
+passes to TargetMachine::addPassesToEmitMC before calling PassManager::run
+on the Module with which it was created.
+
+.. image:: MCJIT-load.png
+
+The PassManager::run call causes the MC code generation mechanisms to emit
+a complete relocatable binary object image (in either ELF or MachO
+format, depending on the target) into the ObjectBufferStream object, which
+is flushed to complete the process. If an ObjectCache is being used, the
+image will be passed to the ObjectCache here.
+
+At this point, the ObjectBufferStream contains the raw object image.
+Before the code can be executed, the code and data sections from this
+image must be loaded into suitable memory, relocations must be applied and
+memory permission and code cache invalidation (if required) must be completed.
+
+Object Loading
+==============
+
+Once an object image has been obtained, either through code generation or
+having been retrieved from an ObjectCache, it is passed to RuntimeDyld to
+be loaded. The RuntimeDyld wrapper class examines the object to determine
+its file format and creates an instance of either RuntimeDyldELF or
+RuntimeDyldMachO (both of which derive from the RuntimeDyldImpl base
+class) and calls the RuntimeDyldImpl::loadObject method to perform the
+actual loading.
+
+.. image:: MCJIT-dyld-load.png
+
+RuntimeDyldImpl::loadObject begins by creating an ObjectImage instance
+from the ObjectBuffer it received. ObjectImage, which wraps the
+ObjectFile class, is a helper class which parses the binary object image
+and provides access to the information contained in the format-specific
+headers, including section, symbol and relocation information.
+
+RuntimeDyldImpl::loadObject then iterates through the symbols in the
+image. Information about common symbols is collected for later use. For
+each function or data symbol, the associated section is loaded into memory
+and the symbol is stored in a symbol table map data structure. When the
+iteration is complete, a section is emitted for the common symbols.
+
+Next, RuntimeDyldImpl::loadObject iterates through the sections in the
+object image and for each section iterates through the relocations for
+that section. For each relocation, it calls the format-specific
+processRelocationRef method, which will examine the relocation and store
+it in one of two data structures, a section-based relocation list map and
+an external symbol relocation map.
+
+.. image:: MCJIT-load-object.png
+
+When RuntimeDyldImpl::loadObject returns, all of the code and data
+sections for the object will have been loaded into memory allocated by the
+memory manager and relocation information will have been prepared, but the
+relocations have not yet been applied and the generated code is still not
+ready to be executed.
+
+[Currently (as of August 2013) the MCJIT engine will immediately apply
+relocations when loadObject completes. However, this shouldn't be
+happening. Because the code may have been generated for a remote target,
+the client should be given a chance to re-map the section addresses before
+relocations are applied. It is possible to apply relocations multiple
+times, but in the case where addresses are to be re-mapped, this first
+application is wasted effort.]
+
+Address Remapping
+=================
+
+At any time after initial code has been generated and before
+finalizeObject is called, the client can remap the address of sections in
+the object. Typically this is done because the code was generated for an
+external process and is being mapped into that process' address space.
+The client remaps the section address by calling MCJIT::mapSectionAddress.
+This should happen before the section memory is copied to its new
+location.
+
+When MCJIT::mapSectionAddress is called, MCJIT passes the call on to
+RuntimeDyldImpl (via its Dyld member). RuntimeDyldImpl stores the new
+address in an internal data structure but does not update the code at this
+time, since other sections are likely to change.
+
+When the client is finished remapping section addresses, it will call
+MCJIT::finalizeObject to complete the remapping process.
+
+Final Preparations
+==================
+
+When MCJIT::finalizeObject is called, MCJIT calls
+RuntimeDyld::resolveRelocations. This function will attempt to locate any
+external symbols and then apply all relocations for the object.
+
+External symbols are resolved by calling the memory manager's
+getPointerToNamedFunction method. The memory manager will return the
+address of the requested symbol in the target address space. (Note, this
+may not be a valid pointer in the host process.) RuntimeDyld will then
+iterate through the list of relocations it has stored which are associated
+with this symbol and invoke the resolveRelocation method which, through a
+format-specific implementation, will apply the relocation to the loaded
+section memory.
+
+Next, RuntimeDyld::resolveRelocations iterates through the list of
+sections and for each section iterates through a list of relocations that
+have been saved which reference that symbol and call resolveRelocation for
+each entry in this list. The relocation list here is a list of
+relocations for which the symbol associated with the relocation is located
+in the section associated with the list. Each of these locations will
+have a target location at which the relocation will be applied that is
+likely located in a different section.
+
+.. image:: MCJIT-resolve-relocations.png
+
+Once relocations have been applied as described above, MCJIT calls
+RuntimeDyld::getEHFrameSection, and if a non-zero result is returned
+passes the section data to the memory manager's registerEHFrames method.
+This allows the memory manager to call any desired target-specific
+functions, such as registering the EH frame information with a debugger.
+
+Finally, MCJIT calls the memory manager's finalizeMemory method. In this
+method, the memory manager will invalidate the target code cache, if
+necessary, and apply final permissions to the memory pages it has
+allocated for code and data memory.
+
diff --git a/include/llvm/Analysis/MemoryBuiltins.h b/include/llvm/Analysis/MemoryBuiltins.h
index 87fb3efaf50e..493a99a4b11e 100644
--- a/include/llvm/Analysis/MemoryBuiltins.h
+++ b/include/llvm/Analysis/MemoryBuiltins.h
@@ -59,11 +59,6 @@ bool isCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
bool isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
bool LookThroughBitCast = false);
-/// \brief Tests if a value is a call or invoke to a library function that
-/// allocates memory and never returns null (such as operator new).
-bool isOperatorNewLikeFn(const Value *V, const TargetLibraryInfo *TLI,
- bool LookThroughBitCast = false);
-
//===----------------------------------------------------------------------===//
// malloc Call Utility Functions.
//
diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h
index 978864e96ca5..05c9a9e0b079 100644
--- a/include/llvm/CodeGen/MachineInstr.h
+++ b/include/llvm/CodeGen/MachineInstr.h
@@ -97,7 +97,7 @@ private:
// of memory operands required to be precise exceeds the maximum value of
// NumMemRefs - currently 256 - we remove the operands entirely. Note also
// that this is a non-owning reference to a shared copy on write buffer owned
- // by the MachineFunction and created via MF.allocateMemRefsArray.
+ // by the MachineFunction and created via MF.allocateMemRefsArray.
mmo_iterator MemRefs;
DebugLoc debugLoc; // Source line information.
@@ -354,7 +354,7 @@ public:
mmo_iterator memoperands_end() const { return MemRefs + NumMemRefs; }
/// Return true if we don't have any memory operands which described the the
/// memory access done by this instruction. If this is true, calling code
- /// must be conservative.
+ /// must be conservative.
bool memoperands_empty() const { return NumMemRefs == 0; }
iterator_range<mmo_iterator> memoperands() {
@@ -774,7 +774,7 @@ public:
bool isKill() const { return getOpcode() == TargetOpcode::KILL; }
bool isImplicitDef() const { return getOpcode()==TargetOpcode::IMPLICIT_DEF; }
bool isInlineAsm() const { return getOpcode() == TargetOpcode::INLINEASM; }
- bool isMSInlineAsm() const {
+ bool isMSInlineAsm() const {
return getOpcode() == TargetOpcode::INLINEASM && getInlineAsmDialect();
}
bool isStackAligningInlineAsm() const;
@@ -1180,11 +1180,26 @@ public:
/// Assign this MachineInstr's memory reference descriptor list.
/// This does not transfer ownership.
void setMemRefs(mmo_iterator NewMemRefs, mmo_iterator NewMemRefsEnd) {
- MemRefs = NewMemRefs;
- NumMemRefs = uint8_t(NewMemRefsEnd - NewMemRefs);
- assert(NumMemRefs == NewMemRefsEnd - NewMemRefs && "Too many memrefs");
+ setMemRefs(std::make_pair(NewMemRefs, NewMemRefsEnd-NewMemRefs));
}
+ /// Assign this MachineInstr's memory reference descriptor list. First
+ /// element in the pair is the begin iterator/pointer to the array; the
+ /// second is the number of MemoryOperands. This does not transfer ownership
+ /// of the underlying memory.
+ void setMemRefs(std::pair<mmo_iterator, unsigned> NewMemRefs) {
+ MemRefs = NewMemRefs.first;
+ NumMemRefs = uint8_t(NewMemRefs.second);
+ assert(NumMemRefs == NewMemRefs.second &&
+ "Too many memrefs - must drop memory operands");
+ }
+
+ /// Return a set of memrefs (begin iterator, size) which conservatively
+ /// describe the memory behavior of both MachineInstrs. This is appropriate
+ /// for use when merging two MachineInstrs into one. This routine does not
+  /// modify the memrefs of this MachineInstr.
+ std::pair<mmo_iterator, unsigned> mergeMemRefsWith(const MachineInstr& Other);
+
/// Clear this MachineInstr's memory reference descriptor list. This resets
/// the memrefs to their most conservative state. This should be used only
/// as a last resort since it greatly pessimizes our knowledge of the memory
diff --git a/include/llvm/CodeGen/MachineInstrBuilder.h b/include/llvm/CodeGen/MachineInstrBuilder.h
index aa5f4b24df61..8fe9b280d5d2 100644
--- a/include/llvm/CodeGen/MachineInstrBuilder.h
+++ b/include/llvm/CodeGen/MachineInstrBuilder.h
@@ -162,6 +162,11 @@ public:
return *this;
}
+ const MachineInstrBuilder &setMemRefs(std::pair<MachineInstr::mmo_iterator,
+ unsigned> MemOperandsRef) const {
+ MI->setMemRefs(MemOperandsRef);
+ return *this;
+ }
const MachineInstrBuilder &addOperand(const MachineOperand &MO) const {
MI->addOperand(*MF, MO);
diff --git a/include/llvm/CodeGen/MachineInstrBundle.h b/include/llvm/CodeGen/MachineInstrBundle.h
index 4fbe206fceb9..4e88606c05a7 100644
--- a/include/llvm/CodeGen/MachineInstrBundle.h
+++ b/include/llvm/CodeGen/MachineInstrBundle.h
@@ -178,7 +178,7 @@ public:
/// register.
bool FullyDefined;
- /// Reg or ont of its aliases is read. The register may only be read
+ /// Reg or one of its aliases is read. The register may only be read
/// partially.
bool Read;
/// Reg or a super-register is read. The full register is read.
diff --git a/include/llvm/CodeGen/WinEHFuncInfo.h b/include/llvm/CodeGen/WinEHFuncInfo.h
index 70d558f5cfbd..f6ad7a8572ab 100644
--- a/include/llvm/CodeGen/WinEHFuncInfo.h
+++ b/include/llvm/CodeGen/WinEHFuncInfo.h
@@ -83,7 +83,9 @@ enum class ClrHandlerType { Catch, Finally, Fault, Filter };
struct ClrEHUnwindMapEntry {
MBBOrBasicBlock Handler;
uint32_t TypeToken;
- int Parent;
+ int HandlerParentState; ///< Outer handler enclosing this entry's handler
+ int TryParentState; ///< Outer try region enclosing this entry's try region,
+ ///< treating later catches on same try as "outer"
ClrHandlerType HandlerType;
};
diff --git a/include/llvm/IR/CallSite.h b/include/llvm/IR/CallSite.h
index f4b8a8a5a1c9..f7bfb47a5b44 100644
--- a/include/llvm/IR/CallSite.h
+++ b/include/llvm/IR/CallSite.h
@@ -310,6 +310,11 @@ public:
CALLSITE_DELEGATE_GETTER(hasFnAttr(A));
}
+ /// \brief Return true if this function has the given attribute.
+ bool hasFnAttr(StringRef A) const {
+ CALLSITE_DELEGATE_GETTER(hasFnAttr(A));
+ }
+
/// \brief Return true if the call or the callee has the given attribute.
bool paramHasAttr(unsigned i, Attribute::AttrKind A) const {
CALLSITE_DELEGATE_GETTER(paramHasAttr(i, A));
diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h
index 7fe04f2a091a..a30505471aac 100644
--- a/include/llvm/IR/IRBuilder.h
+++ b/include/llvm/IR/IRBuilder.h
@@ -61,9 +61,13 @@ protected:
MDNode *DefaultFPMathTag;
FastMathFlags FMF;
+ ArrayRef<OperandBundleDef> DefaultOperandBundles;
+
public:
- IRBuilderBase(LLVMContext &context, MDNode *FPMathTag = nullptr)
- : Context(context), DefaultFPMathTag(FPMathTag), FMF() {
+ IRBuilderBase(LLVMContext &context, MDNode *FPMathTag = nullptr,
+ ArrayRef<OperandBundleDef> OpBundles = None)
+ : Context(context), DefaultFPMathTag(FPMathTag), FMF(),
+ DefaultOperandBundles(OpBundles) {
ClearInsertionPoint();
}
@@ -538,37 +542,44 @@ class IRBuilder : public IRBuilderBase, public Inserter {
public:
IRBuilder(LLVMContext &C, const T &F, Inserter I = Inserter(),
- MDNode *FPMathTag = nullptr)
- : IRBuilderBase(C, FPMathTag), Inserter(std::move(I)), Folder(F) {}
-
- explicit IRBuilder(LLVMContext &C, MDNode *FPMathTag = nullptr)
- : IRBuilderBase(C, FPMathTag), Folder() {
- }
-
- explicit IRBuilder(BasicBlock *TheBB, const T &F, MDNode *FPMathTag = nullptr)
- : IRBuilderBase(TheBB->getContext(), FPMathTag), Folder(F) {
+ MDNode *FPMathTag = nullptr,
+ ArrayRef<OperandBundleDef> OpBundles = None)
+ : IRBuilderBase(C, FPMathTag, OpBundles), Inserter(std::move(I)),
+ Folder(F) {}
+
+ explicit IRBuilder(LLVMContext &C, MDNode *FPMathTag = nullptr,
+ ArrayRef<OperandBundleDef> OpBundles = None)
+ : IRBuilderBase(C, FPMathTag, OpBundles), Folder() {}
+
+ explicit IRBuilder(BasicBlock *TheBB, const T &F, MDNode *FPMathTag = nullptr,
+ ArrayRef<OperandBundleDef> OpBundles = None)
+ : IRBuilderBase(TheBB->getContext(), FPMathTag, OpBundles), Folder(F) {
SetInsertPoint(TheBB);
}
- explicit IRBuilder(BasicBlock *TheBB, MDNode *FPMathTag = nullptr)
- : IRBuilderBase(TheBB->getContext(), FPMathTag), Folder() {
+ explicit IRBuilder(BasicBlock *TheBB, MDNode *FPMathTag = nullptr,
+ ArrayRef<OperandBundleDef> OpBundles = None)
+ : IRBuilderBase(TheBB->getContext(), FPMathTag, OpBundles), Folder() {
SetInsertPoint(TheBB);
}
- explicit IRBuilder(Instruction *IP, MDNode *FPMathTag = nullptr)
- : IRBuilderBase(IP->getContext(), FPMathTag), Folder() {
+ explicit IRBuilder(Instruction *IP, MDNode *FPMathTag = nullptr,
+ ArrayRef<OperandBundleDef> OpBundles = None)
+ : IRBuilderBase(IP->getContext(), FPMathTag, OpBundles), Folder() {
SetInsertPoint(IP);
}
- IRBuilder(BasicBlock *TheBB, BasicBlock::iterator IP, const T& F,
- MDNode *FPMathTag = nullptr)
- : IRBuilderBase(TheBB->getContext(), FPMathTag), Folder(F) {
+ IRBuilder(BasicBlock *TheBB, BasicBlock::iterator IP, const T &F,
+ MDNode *FPMathTag = nullptr,
+ ArrayRef<OperandBundleDef> OpBundles = None)
+ : IRBuilderBase(TheBB->getContext(), FPMathTag, OpBundles), Folder(F) {
SetInsertPoint(TheBB, IP);
}
IRBuilder(BasicBlock *TheBB, BasicBlock::iterator IP,
- MDNode *FPMathTag = nullptr)
- : IRBuilderBase(TheBB->getContext(), FPMathTag), Folder() {
+ MDNode *FPMathTag = nullptr,
+ ArrayRef<OperandBundleDef> OpBundles = None)
+ : IRBuilderBase(TheBB->getContext(), FPMathTag, OpBundles), Folder() {
SetInsertPoint(TheBB, IP);
}
@@ -1529,8 +1540,11 @@ public:
CallInst *CreateCall(Value *Callee, ArrayRef<Value *> Args = None,
ArrayRef<OperandBundleDef> OpBundles = None,
- const Twine &Name = "") {
- return Insert(CallInst::Create(Callee, Args, OpBundles), Name);
+ const Twine &Name = "", MDNode *FPMathTag = nullptr) {
+ CallInst *CI = CallInst::Create(Callee, Args, OpBundles);
+ if (isa<FPMathOperator>(CI))
+ CI = cast<CallInst>(AddFPMathAttributes(CI, FPMathTag, FMF));
+ return Insert(CI, Name);
}
CallInst *CreateCall(Value *Callee, ArrayRef<Value *> Args,
@@ -1543,7 +1557,7 @@ public:
CallInst *CreateCall(llvm::FunctionType *FTy, Value *Callee,
ArrayRef<Value *> Args, const Twine &Name = "",
MDNode *FPMathTag = nullptr) {
- CallInst *CI = CallInst::Create(FTy, Callee, Args);
+ CallInst *CI = CallInst::Create(FTy, Callee, Args, DefaultOperandBundles);
if (isa<FPMathOperator>(CI))
CI = cast<CallInst>(AddFPMathAttributes(CI, FPMathTag, FMF));
return Insert(CI, Name);
diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h
index d781c7af36d7..aba48ca6fa9e 100644
--- a/include/llvm/IR/Instructions.h
+++ b/include/llvm/IR/Instructions.h
@@ -3550,6 +3550,11 @@ public:
return hasFnAttrImpl(A);
}
+ /// \brief Determine whether this call has the given attribute.
+ bool hasFnAttr(StringRef A) const {
+ return hasFnAttrImpl(A);
+ }
+
/// \brief Determine whether the call or the callee has the given attributes.
bool paramHasAttr(unsigned i, Attribute::AttrKind A) const;
@@ -3734,7 +3739,19 @@ private:
unsigned getNumSuccessorsV() const override;
void setSuccessorV(unsigned idx, BasicBlock *B) override;
- bool hasFnAttrImpl(Attribute::AttrKind A) const;
+ template <typename AttrKind> bool hasFnAttrImpl(AttrKind A) const {
+ if (AttributeList.hasAttribute(AttributeSet::FunctionIndex, A))
+ return true;
+
+ // Operand bundles override attributes on the called function, but don't
+ // override attributes directly present on the invoke instruction.
+ if (isFnAttrDisallowedByOpBundle(A))
+ return false;
+
+ if (const Function *F = getCalledFunction())
+ return F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, A);
+ return false;
+ }
// Shadow Instruction::setInstructionSubclassData with a private forwarding
// method so that subclasses cannot accidentally use it.
@@ -3966,6 +3983,8 @@ public:
/// point to the added handler.
void addHandler(BasicBlock *Dest);
+ void removeHandler(handler_iterator HI);
+
unsigned getNumSuccessors() const { return getNumOperands() - 1; }
BasicBlock *getSuccessor(unsigned Idx) const {
assert(Idx < getNumSuccessors() &&
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 18390f853510..54bcbd8da509 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -33,6 +33,19 @@ let TargetPrefix = "x86" in {
}
//===----------------------------------------------------------------------===//
+// FLAGS.
+let TargetPrefix = "x86" in {
+ def int_x86_flags_read_u32 : GCCBuiltin<"__builtin_ia32_readeflags_u32">,
+ Intrinsic<[llvm_i32_ty], [], []>;
+ def int_x86_flags_read_u64 : GCCBuiltin<"__builtin_ia32_readeflags_u64">,
+ Intrinsic<[llvm_i64_ty], [], []>;
+ def int_x86_flags_write_u32 : GCCBuiltin<"__builtin_ia32_writeeflags_u32">,
+ Intrinsic<[], [llvm_i32_ty], []>;
+ def int_x86_flags_write_u64 : GCCBuiltin<"__builtin_ia32_writeeflags_u64">,
+ Intrinsic<[], [llvm_i64_ty], []>;
+}
+
+//===----------------------------------------------------------------------===//
// Read Time Stamp Counter.
let TargetPrefix = "x86" in {
def int_x86_rdtsc : GCCBuiltin<"__builtin_ia32_rdtsc">,
@@ -2211,6 +2224,25 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
llvm_i8_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psra_w_128 : GCCBuiltin<"__builtin_ia32_psraw128_mask">,
+ Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
+ llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psra_w_256 : GCCBuiltin<"__builtin_ia32_psraw256_mask">,
+ Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+ llvm_v8i16_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psra_w_512 : GCCBuiltin<"__builtin_ia32_psraw512_mask">,
+ Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
+ llvm_v8i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psra_wi_128 : GCCBuiltin<"__builtin_ia32_psrawi128_mask">,
+ Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
+ llvm_i8_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psra_wi_256 : GCCBuiltin<"__builtin_ia32_psrawi256_mask">,
+ Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+ llvm_i8_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psra_wi_512 : GCCBuiltin<"__builtin_ia32_psrawi512_mask">,
+ Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
+ llvm_i8_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+
def int_x86_avx512_mask_psll_d : GCCBuiltin<"__builtin_ia32_pslld512_mask">,
Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
llvm_v4i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
@@ -2229,6 +2261,69 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_mask_psra_q : GCCBuiltin<"__builtin_ia32_psraq512_mask">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
llvm_v2i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+
+ def int_x86_avx512_mask_psra_d_128 : GCCBuiltin<"__builtin_ia32_psrad128_mask">,
+ Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
+ llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psra_d_256 : GCCBuiltin<"__builtin_ia32_psrad256_mask">,
+ Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
+ llvm_v4i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psra_di_128 : GCCBuiltin<"__builtin_ia32_psradi128_mask">,
+ Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
+ llvm_i8_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psra_di_256 : GCCBuiltin<"__builtin_ia32_psradi256_mask">,
+ Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
+ llvm_i8_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psra_di_512 : GCCBuiltin<"__builtin_ia32_psradi512_mask">,
+ Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
+ llvm_i8_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psra_q_128 : GCCBuiltin<"__builtin_ia32_psraq128_mask">,
+ Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
+ llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psra_q_256 : GCCBuiltin<"__builtin_ia32_psraq256_mask">,
+ Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
+ llvm_v2i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psra_qi_128 : GCCBuiltin<"__builtin_ia32_psraqi128_mask">,
+ Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
+ llvm_i8_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psra_qi_256 : GCCBuiltin<"__builtin_ia32_psraqi256_mask">,
+ Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
+ llvm_i8_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psra_qi_512 : GCCBuiltin<"__builtin_ia32_psraqi512_mask">,
+ Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+ llvm_i8_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+
+ def int_x86_avx512_mask_psrl_d_128: GCCBuiltin<"__builtin_ia32_psrld128_mask">,
+ Intrinsic<[llvm_v4i32_ty], [ llvm_v4i32_ty,
+ llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty ], [IntrNoMem]>;
+ def int_x86_avx512_mask_psrl_d_256: GCCBuiltin<"__builtin_ia32_psrld256_mask">,
+ Intrinsic<[llvm_v8i32_ty], [ llvm_v8i32_ty,
+ llvm_v4i32_ty, llvm_v8i32_ty, llvm_i8_ty ], [IntrNoMem]>;
+ def int_x86_avx512_mask_psrl_di_128: GCCBuiltin<"__builtin_ia32_psrldi128_mask">,
+ Intrinsic<[llvm_v4i32_ty], [ llvm_v4i32_ty,
+ llvm_i8_ty, llvm_v4i32_ty, llvm_i8_ty ], [IntrNoMem]>;
+ def int_x86_avx512_mask_psrl_di_256: GCCBuiltin<"__builtin_ia32_psrldi256_mask">,
+ Intrinsic<[llvm_v8i32_ty], [ llvm_v8i32_ty,
+ llvm_i8_ty, llvm_v8i32_ty, llvm_i8_ty ], [IntrNoMem]>;
+ def int_x86_avx512_mask_psrl_di_512: GCCBuiltin<"__builtin_ia32_psrldi512_mask">,
+ Intrinsic<[llvm_v16i32_ty], [ llvm_v16i32_ty,
+ llvm_i8_ty, llvm_v16i32_ty, llvm_i16_ty ], [IntrNoMem]>;
+
+ def int_x86_avx512_mask_psrl_q_128: GCCBuiltin<"__builtin_ia32_psrlq128_mask">,
+ Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
+ llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psrl_q_256: GCCBuiltin<"__builtin_ia32_psrlq256_mask">,
+ Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
+ llvm_v2i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psrl_qi_128: GCCBuiltin<"__builtin_ia32_psrlqi128_mask">,
+ Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
+ llvm_i8_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psrl_qi_256: GCCBuiltin<"__builtin_ia32_psrlqi256_mask">,
+ Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
+ llvm_i8_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psrl_qi_512: GCCBuiltin<"__builtin_ia32_psrlqi512_mask">,
+ Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+ llvm_i8_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
}
// Pack ops.
@@ -2696,6 +2791,59 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_psrl_dq_512 : GCCBuiltin<"__builtin_ia32_psrldq512">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_i32_ty],
[IntrNoMem]>;
+
+ def int_x86_avx512_mask_psll_d_128 : GCCBuiltin<"__builtin_ia32_pslld128_mask">,
+ Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
+ llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psll_d_256 : GCCBuiltin<"__builtin_ia32_pslld256_mask">,
+ Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
+ llvm_v4i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psll_di_128 : GCCBuiltin<"__builtin_ia32_pslldi128_mask">,
+ Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
+ llvm_i8_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psll_di_256 : GCCBuiltin<"__builtin_ia32_pslldi256_mask">,
+ Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
+ llvm_i8_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psll_di_512 : GCCBuiltin<"__builtin_ia32_pslldi512_mask">,
+ Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
+ llvm_i8_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psll_q_128 : GCCBuiltin<"__builtin_ia32_psllq128_mask">,
+ Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
+ llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psll_q_256 : GCCBuiltin<"__builtin_ia32_psllq256_mask">,
+ Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
+ llvm_v2i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psll_qi_128 : GCCBuiltin<"__builtin_ia32_psllqi128_mask">,
+ Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
+ llvm_i8_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psll_qi_256 : GCCBuiltin<"__builtin_ia32_psllqi256_mask">,
+ Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
+ llvm_i8_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psll_qi_512 : GCCBuiltin<"__builtin_ia32_psllqi512_mask">,
+ Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+ llvm_i8_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+
+ def int_x86_avx512_mask_psrlv16_hi : GCCBuiltin<"__builtin_ia32_psrlv16hi_mask">,
+ Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+ llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psrlv2_di : GCCBuiltin<"__builtin_ia32_psrlv2di_mask">,
+ Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
+ llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psrlv32hi : GCCBuiltin<"__builtin_ia32_psrlv32hi_mask">,
+ Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
+ llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psrlv4_di : GCCBuiltin<"__builtin_ia32_psrlv4di_mask">,
+ Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
+ llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psrlv4_si : GCCBuiltin<"__builtin_ia32_psrlv4si_mask">,
+ Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
+ llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psrlv8_hi : GCCBuiltin<"__builtin_ia32_psrlv8hi_mask">,
+ Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
+ llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
+ def int_x86_avx512_mask_psrlv8_si : GCCBuiltin<"__builtin_ia32_psrlv8si_mask">,
+ Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
+ llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
}
// Gather ops
@@ -3919,9 +4067,9 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// Support protection key
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_rdpkru : GCCBuiltin <"__builtin_ia32_rdpkru">,
- Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
+ Intrinsic<[llvm_i32_ty], [], []>;
def int_x86_wrpkru : GCCBuiltin<"__builtin_ia32_wrpkru">,
- Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>;
+ Intrinsic<[], [llvm_i32_ty], []>;
}
//===----------------------------------------------------------------------===//
// Half float conversion
diff --git a/include/llvm/IR/Metadata.h b/include/llvm/IR/Metadata.h
index 2ea591383f82..4a8557d074f0 100644
--- a/include/llvm/IR/Metadata.h
+++ b/include/llvm/IR/Metadata.h
@@ -283,14 +283,20 @@ private:
LLVMContext &Context;
uint64_t NextIndex;
SmallDenseMap<void *, std::pair<OwnerTy, uint64_t>, 4> UseMap;
+ /// Flag that can be set to false if this metadata should not be
+ /// RAUW'ed, e.g. if it is used as the key of a map.
+ bool CanReplace;
public:
ReplaceableMetadataImpl(LLVMContext &Context)
- : Context(Context), NextIndex(0) {}
+ : Context(Context), NextIndex(0), CanReplace(true) {}
~ReplaceableMetadataImpl() {
assert(UseMap.empty() && "Cannot destroy in-use replaceable metadata");
}
+ /// Set the CanReplace flag to the given value.
+ void setCanReplace(bool Replaceable) { CanReplace = Replaceable; }
+
LLVMContext &getContext() const { return Context; }
/// \brief Replace all uses of this with MD.
@@ -901,14 +907,19 @@ public:
Context.getReplaceableUses()->replaceAllUsesWith(MD);
}
+ /// Set the CanReplace flag to the given value.
+ void setCanReplace(bool Replaceable) {
+ Context.getReplaceableUses()->setCanReplace(Replaceable);
+ }
+
/// \brief Resolve cycles.
///
/// Once all forward declarations have been resolved, force cycles to be
- /// resolved. If \p MDMaterialized is true, then any temporary metadata
+ /// resolved. If \p AllowTemps is true, then any temporary metadata
/// is ignored, otherwise it asserts when encountering temporary metadata.
///
/// \pre No operands (or operands' operands, etc.) have \a isTemporary().
- void resolveCycles(bool MDMaterialized = true);
+ void resolveCycles(bool AllowTemps = false);
/// \brief Replace a temporary node with a permanent one.
///
diff --git a/include/llvm/IR/Statepoint.h b/include/llvm/IR/Statepoint.h
index 7310c5697a7e..51a0951a9798 100644
--- a/include/llvm/IR/Statepoint.h
+++ b/include/llvm/IR/Statepoint.h
@@ -22,6 +22,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Compiler.h"
@@ -36,14 +37,13 @@ enum class StatepointFlags {
MaskAll = GCTransition ///< A bitmask that includes all valid flags.
};
-class GCRelocateOperands;
+class GCRelocateInst;
class ImmutableStatepoint;
bool isStatepoint(const ImmutableCallSite &CS);
bool isStatepoint(const Value *V);
bool isStatepoint(const Value &V);
-bool isGCRelocate(const Value *V);
bool isGCRelocate(const ImmutableCallSite &CS);
bool isGCResult(const Value *V);
@@ -247,7 +247,7 @@ public:
/// May contain several relocations for the same base/derived pair.
/// For example this could happen due to relocations on unwinding
/// path of invoke.
- std::vector<GCRelocateOperands> getRelocates() const;
+ std::vector<const GCRelocateInst *> getRelocates() const;
/// Get the experimental_gc_result call tied to this statepoint. Can be
/// nullptr if there isn't a gc_result tied to this statepoint. Guaranteed to
@@ -305,33 +305,27 @@ public:
explicit Statepoint(CallSite CS) : Base(CS) {}
};
-/// Wraps a call to a gc.relocate and provides access to it's operands.
-/// TODO: This should likely be refactored to resememble the wrappers in
-/// InstrinsicInst.h.
-class GCRelocateOperands {
- ImmutableCallSite RelocateCS;
-
+/// This represents the gc.relocate intrinsic.
+class GCRelocateInst : public IntrinsicInst {
public:
- GCRelocateOperands(const User *U) : RelocateCS(U) { assert(isGCRelocate(U)); }
- GCRelocateOperands(const Instruction *inst) : RelocateCS(inst) {
- assert(isGCRelocate(inst));
+ static inline bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::experimental_gc_relocate;
+ }
+ static inline bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
- GCRelocateOperands(CallSite CS) : RelocateCS(CS) { assert(isGCRelocate(CS)); }
/// Return true if this relocate is tied to the invoke statepoint.
/// This includes relocates which are on the unwinding path.
bool isTiedToInvoke() const {
- const Value *Token = RelocateCS.getArgument(0);
+ const Value *Token = getArgOperand(0);
return isa<LandingPadInst>(Token) || isa<InvokeInst>(Token);
}
- /// Get enclosed relocate intrinsic
- ImmutableCallSite getUnderlyingCallSite() { return RelocateCS; }
-
/// The statepoint with which this gc.relocate is associated.
- const Instruction *getStatepoint() {
- const Value *Token = RelocateCS.getArgument(0);
+ const Instruction *getStatepoint() const {
+ const Value *Token = getArgOperand(0);
// This takes care both of relocates for call statepoints and relocates
// on normal path of invoke statepoint.
@@ -354,22 +348,22 @@ public:
/// The index into the associate statepoint's argument list
/// which contains the base pointer of the pointer whose
/// relocation this gc.relocate describes.
- unsigned getBasePtrIndex() {
- return cast<ConstantInt>(RelocateCS.getArgument(1))->getZExtValue();
+ unsigned getBasePtrIndex() const {
+ return cast<ConstantInt>(getArgOperand(1))->getZExtValue();
}
/// The index into the associate statepoint's argument list which
/// contains the pointer whose relocation this gc.relocate describes.
- unsigned getDerivedPtrIndex() {
- return cast<ConstantInt>(RelocateCS.getArgument(2))->getZExtValue();
+ unsigned getDerivedPtrIndex() const {
+ return cast<ConstantInt>(getArgOperand(2))->getZExtValue();
}
- Value *getBasePtr() {
+ Value *getBasePtr() const {
ImmutableCallSite CS(getStatepoint());
return *(CS.arg_begin() + getBasePtrIndex());
}
- Value *getDerivedPtr() {
+ Value *getDerivedPtr() const {
ImmutableCallSite CS(getStatepoint());
return *(CS.arg_begin() + getDerivedPtrIndex());
}
@@ -377,11 +371,11 @@ public:
template <typename FunTy, typename InstructionTy, typename ValueTy,
typename CallSiteTy>
-std::vector<GCRelocateOperands>
+std::vector<const GCRelocateInst *>
StatepointBase<FunTy, InstructionTy, ValueTy, CallSiteTy>::getRelocates()
const {
- std::vector<GCRelocateOperands> Result;
+ std::vector<const GCRelocateInst *> Result;
CallSiteTy StatepointCS = getCallSite();
@@ -389,8 +383,8 @@ StatepointBase<FunTy, InstructionTy, ValueTy, CallSiteTy>::getRelocates()
// gc_relocates ensures that we only get pairs which are actually relocated
// and used after the statepoint.
for (const User *U : getInstruction()->users())
- if (isGCRelocate(U))
- Result.push_back(GCRelocateOperands(U));
+ if (auto *Relocate = dyn_cast<GCRelocateInst>(U))
+ Result.push_back(Relocate);
if (!StatepointCS.isInvoke())
return Result;
@@ -401,8 +395,8 @@ StatepointBase<FunTy, InstructionTy, ValueTy, CallSiteTy>::getRelocates()
// Search for gc relocates that are attached to this landingpad.
for (const User *LandingPadUser : LandingPad->users()) {
- if (isGCRelocate(LandingPadUser))
- Result.push_back(GCRelocateOperands(LandingPadUser));
+ if (auto *Relocate = dyn_cast<GCRelocateInst>(LandingPadUser))
+ Result.push_back(Relocate);
}
return Result;
}
diff --git a/include/llvm/MC/SubtargetFeature.h b/include/llvm/MC/SubtargetFeature.h
index 0d97b226d728..75d1e7997119 100644
--- a/include/llvm/MC/SubtargetFeature.h
+++ b/include/llvm/MC/SubtargetFeature.h
@@ -39,8 +39,8 @@ public:
FeatureBitset(const bitset<MAX_SUBTARGET_FEATURES>& B) : bitset(B) {}
FeatureBitset(std::initializer_list<unsigned> Init) : bitset() {
- for (auto I = Init.begin() , E = Init.end(); I != E; ++I)
- set(*I);
+ for (auto I : Init)
+ set(I);
}
};
@@ -59,6 +59,11 @@ struct SubtargetFeatureKV {
bool operator<(StringRef S) const {
return StringRef(Key) < S;
}
+
+ // Compare routine for std::is_sorted.
+ bool operator<(const SubtargetFeatureKV &Other) const {
+ return StringRef(Key) < StringRef(Other.Key);
+ }
};
//===----------------------------------------------------------------------===//
@@ -98,14 +103,13 @@ public:
/// Adding Features.
void AddFeature(StringRef String, bool Enable = true);
- /// ToggleFeature - Toggle a feature and returns the newly updated feature
- /// bits.
- FeatureBitset ToggleFeature(FeatureBitset Bits, StringRef String,
- ArrayRef<SubtargetFeatureKV> FeatureTable);
+ /// ToggleFeature - Toggle a feature and update the feature bits.
+ static void ToggleFeature(FeatureBitset &Bits, StringRef String,
+ ArrayRef<SubtargetFeatureKV> FeatureTable);
- /// Apply the feature flag and return the newly updated feature bits.
- FeatureBitset ApplyFeatureFlag(FeatureBitset Bits, StringRef Feature,
- ArrayRef<SubtargetFeatureKV> FeatureTable);
+ /// Apply the feature flag and update the feature bits.
+ static void ApplyFeatureFlag(FeatureBitset &Bits, StringRef Feature,
+ ArrayRef<SubtargetFeatureKV> FeatureTable);
/// Get feature bits of a CPU.
FeatureBitset getFeatureBits(StringRef CPU,
diff --git a/include/llvm/ProfileData/InstrProf.h b/include/llvm/ProfileData/InstrProf.h
index 4688759a3bd1..49569d89507b 100644
--- a/include/llvm/ProfileData/InstrProf.h
+++ b/include/llvm/ProfileData/InstrProf.h
@@ -155,11 +155,36 @@ GlobalVariable *createPGOFuncNameVar(Function &F, StringRef FuncName);
GlobalVariable *createPGOFuncNameVar(Module &M,
GlobalValue::LinkageTypes Linkage,
StringRef FuncName);
+/// Return the initializer in string of the PGO name var \c NameVar.
+StringRef getPGOFuncNameVarInitializer(GlobalVariable *NameVar);
/// Given a PGO function name, remove the filename prefix and return
/// the original (static) function name.
StringRef getFuncNameWithoutPrefix(StringRef PGOFuncName, StringRef FileName);
+/// Given a vector of strings (function PGO names) \c NameStrs, the
+/// method generates a combined string \c Result that is ready to be
+/// serialized. The \c Result string is comprised of three fields:
+/// The first field is the length of the uncompressed strings, and
+/// the second field is the length of the zlib-compressed string.
+/// Both fields are encoded in ULEB128. If \c doCompress is false, the
+/// third field is the uncompressed strings; otherwise it is the
+/// compressed string. When the string compression is off, the
+/// second field will have value zero.
+int collectPGOFuncNameStrings(const std::vector<std::string> &NameStrs,
+ bool doCompression, std::string &Result);
+/// Produce \c Result string with the same format described above. The input
+/// is vector of PGO function name variables that are referenced.
+int collectPGOFuncNameStrings(const std::vector<GlobalVariable *> &NameVars,
+ std::string &Result);
+class InstrProfSymtab;
+/// \c NameStrings is a string composed of one or more sub-strings encoded in
+/// the
+/// format described above. The substrings are separated by 0 or more zero
+/// bytes.
+/// This method decodes the string and populates the \c Symtab.
+int readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab);
+
const std::error_category &instrprof_category();
enum class instrprof_error {
@@ -235,6 +260,11 @@ public:
/// This interface is used by reader of CoverageMapping test
/// format.
inline std::error_code create(StringRef D, uint64_t BaseAddr);
+ /// \c NameStrings is a string composed of one or more sub-strings
+ /// encoded in the format described above. The substrings are
+ /// separated by 0 or more zero bytes. This method decodes the
+ /// string and populates the \c Symtab.
+ inline std::error_code create(StringRef NameStrings);
/// Create InstrProfSymtab from a set of names iteratable from
/// \p IterRange. This interface is used by IndexedProfReader.
template <typename NameIterRange> void create(const NameIterRange &IterRange);
@@ -255,8 +285,8 @@ public:
AddrToMD5Map.push_back(std::make_pair(Addr, MD5Val));
}
AddrHashMap &getAddrHashMap() { return AddrToMD5Map; }
- /// Return function's PGO name from the function name's symabol
- /// address in the object file. If an error occurs, Return
+ /// Return function's PGO name from the function name's symbol
+ /// address in the object file. If an error occurs, return
/// an empty string.
StringRef getFuncName(uint64_t FuncNameAddress, size_t NameSize);
/// Return function's PGO name from the name's md5 hash value.
@@ -270,6 +300,12 @@ std::error_code InstrProfSymtab::create(StringRef D, uint64_t BaseAddr) {
return std::error_code();
}
+std::error_code InstrProfSymtab::create(StringRef NameStrings) {
+ if (readPGOFuncNameStrings(NameStrings, *this))
+ return make_error_code(instrprof_error::malformed);
+ return std::error_code();
+}
+
template <typename NameIterRange>
void InstrProfSymtab::create(const NameIterRange &IterRange) {
for (auto Name : IterRange)
@@ -576,8 +612,14 @@ template <class IntPtrT> struct CovMapFunctionRecord {
#define COVMAP_FUNC_RECORD(Type, LLVMType, Name, Init) Type Name;
#include "llvm/ProfileData/InstrProfData.inc"
};
-LLVM_PACKED_END
+// Per module coverage mapping data header, i.e. CoverageMapFileHeader
+// documented above.
+struct CovMapHeader {
+#define COVMAP_HEADER(Type, LLVMType, Name, Init) Type Name;
+#include "llvm/ProfileData/InstrProfData.inc"
+};
+LLVM_PACKED_END
}
} // end namespace llvm
diff --git a/include/llvm/ProfileData/InstrProfData.inc b/include/llvm/ProfileData/InstrProfData.inc
index 48dae506cabb..3a7c0c5f2773 100644
--- a/include/llvm/ProfileData/InstrProfData.inc
+++ b/include/llvm/ProfileData/InstrProfData.inc
@@ -1,4 +1,4 @@
-/*===-- InstrProfData.inc - instr profiling runtime structures -----------=== *\
+/*===-- InstrProfData.inc - instr profiling runtime structures -*- C++ -*-=== *\
|*
|* The LLVM Compiler Infrastructure
|*
@@ -167,6 +167,25 @@ COVMAP_FUNC_RECORD(const uint64_t, llvm::Type::getInt64Ty(Ctx), FuncHash, \
#undef COVMAP_FUNC_RECORD
/* COVMAP_FUNC_RECORD end. */
+/* COVMAP_HEADER start */
+/* Definition of member fields of coverage map header.
+ */
+#ifndef COVMAP_HEADER
+#define COVMAP_HEADER(Type, LLVMType, Name, Initializer)
+#else
+#define INSTR_PROF_DATA_DEFINED
+#endif
+COVMAP_HEADER(uint32_t, Int32Ty, NRecords, \
+ llvm::ConstantInt::get(Int32Ty, FunctionRecords.size()))
+COVMAP_HEADER(uint32_t, Int32Ty, FilenamesSize, \
+ llvm::ConstantInt::get(Int32Ty, FilenamesSize))
+COVMAP_HEADER(uint32_t, Int32Ty, CoverageSize, \
+ llvm::ConstantInt::get(Int32Ty, CoverageMappingSize))
+COVMAP_HEADER(uint32_t, Int32Ty, Version, \
+ llvm::ConstantInt::get(Int32Ty, CoverageMappingVersion1))
+#undef COVMAP_HEADER
+/* COVMAP_HEADER end. */
+
#ifdef INSTR_PROF_VALUE_PROF_DATA
#define INSTR_PROF_DATA_DEFINED
diff --git a/include/llvm/Support/ARMTargetParser.def b/include/llvm/Support/ARMTargetParser.def
index 2f99b0717adf..c895b095bc5b 100644
--- a/include/llvm/Support/ARMTargetParser.def
+++ b/include/llvm/Support/ARMTargetParser.def
@@ -213,6 +213,7 @@ ARM_CPU_NAME("cortex-a53", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, true, AEK_CRC)
ARM_CPU_NAME("cortex-a57", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, AEK_CRC)
ARM_CPU_NAME("cortex-a72", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, AEK_CRC)
ARM_CPU_NAME("cyclone", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, AEK_CRC)
+ARM_CPU_NAME("exynos-m1", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, AEK_CRC)
// Non-standard Arch names.
ARM_CPU_NAME("iwmmxt", AK_IWMMXT, FK_NONE, true, AEK_NONE)
ARM_CPU_NAME("xscale", AK_XSCALE, FK_NONE, true, AEK_NONE)
diff --git a/include/llvm/Support/Program.h b/include/llvm/Support/Program.h
index 43302101e3e0..727864df2721 100644
--- a/include/llvm/Support/Program.h
+++ b/include/llvm/Support/Program.h
@@ -130,7 +130,7 @@ struct ProcessInfo {
/// Return true if the given arguments fit within system-specific
/// argument length limits.
- bool argumentsFitWithinSystemLimits(ArrayRef<const char*> Args);
+ bool commandLineFitsWithinSystemLimits(StringRef Program, ArrayRef<const char*> Args);
/// File encoding options when writing contents that a non-UTF8 tool will
/// read (on Windows systems). For UNIX, we always use UTF-8.
diff --git a/include/llvm/Support/YAMLParser.h b/include/llvm/Support/YAMLParser.h
index b056ab6c1ce2..a5addfa3c7ae 100644
--- a/include/llvm/Support/YAMLParser.h
+++ b/include/llvm/Support/YAMLParser.h
@@ -305,7 +305,7 @@ private:
/// increment() which must set CurrentEntry to 0 to create an end iterator.
template <class BaseT, class ValueT>
class basic_collection_iterator
- : public std::iterator<std::forward_iterator_tag, ValueT> {
+ : public std::iterator<std::input_iterator_tag, ValueT> {
public:
basic_collection_iterator() : Base(nullptr) {}
basic_collection_iterator(BaseT *B) : Base(B) {}
@@ -326,11 +326,24 @@ public:
return Base->CurrentEntry;
}
+ /// Note on EqualityComparable:
+ ///
+ /// The iterator is not re-entrant,
+ /// it is meant to be used for parsing YAML on-demand
+ /// Once iteration started - it can point only to one entry at a time
+ /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal
+ /// iff Base and Other.Base are equal.
+ bool operator==(const basic_collection_iterator &Other) const {
+ if (Base && (Base == Other.Base)) {
+ assert((Base->CurrentEntry == Other.Base->CurrentEntry)
+ && "Equal Bases expected to point to equal Entries");
+ }
+
+ return Base == Other.Base;
+ }
+
bool operator!=(const basic_collection_iterator &Other) const {
- if (Base != Other.Base)
- return true;
- return (Base && Other.Base) &&
- Base->CurrentEntry != Other.Base->CurrentEntry;
+ return !(Base == Other.Base);
}
basic_collection_iterator &operator++() {
diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h
index eb1c5c78b9c0..4c1ef4013dda 100644
--- a/include/llvm/TableGen/Record.h
+++ b/include/llvm/TableGen/Record.h
@@ -232,7 +232,7 @@ protected:
/// We could pack these a bit tighter by not having the IK_FirstXXXInit
/// and IK_LastXXXInit be their own values, but that would degrade
/// readability for really no benefit.
- enum InitKind {
+ enum InitKind : uint8_t {
IK_BitInit,
IK_FirstTypedInit,
IK_BitsInit,
@@ -256,6 +256,9 @@ protected:
private:
const InitKind Kind;
+protected:
+ uint8_t Opc; // Used by UnOpInit, BinOpInit, and TernOpInit
+private:
Init(const Init &) = delete;
Init &operator=(const Init &) = delete;
virtual void anchor();
@@ -264,7 +267,7 @@ public:
InitKind getKind() const { return Kind; }
protected:
- explicit Init(InitKind K) : Kind(K) {}
+ explicit Init(InitKind K, uint8_t Opc = 0) : Kind(K), Opc(Opc) {}
public:
virtual ~Init() {}
@@ -365,7 +368,8 @@ class TypedInit : public Init {
TypedInit &operator=(const TypedInit &Other) = delete;
protected:
- explicit TypedInit(InitKind K, RecTy *T) : Init(K), Ty(T) {}
+ explicit TypedInit(InitKind K, RecTy *T, uint8_t Opc = 0)
+ : Init(K, Opc), Ty(T) {}
~TypedInit() override {
// If this is a DefInit we need to delete the RecordRecTy.
if (getKind() == IK_DefInit)
@@ -650,7 +654,8 @@ class OpInit : public TypedInit {
OpInit &operator=(OpInit &Other) = delete;
protected:
- explicit OpInit(InitKind K, RecTy *Type) : TypedInit(K, Type) {}
+ explicit OpInit(InitKind K, RecTy *Type, uint8_t Opc)
+ : TypedInit(K, Type, Opc) {}
public:
static bool classof(const Init *I) {
@@ -677,14 +682,13 @@ public:
///
class UnOpInit : public OpInit {
public:
- enum UnaryOp { CAST, HEAD, TAIL, EMPTY };
+ enum UnaryOp : uint8_t { CAST, HEAD, TAIL, EMPTY };
private:
- UnaryOp Opc;
Init *LHS;
UnOpInit(UnaryOp opc, Init *lhs, RecTy *Type)
- : OpInit(IK_UnOpInit, Type), Opc(opc), LHS(lhs) {}
+ : OpInit(IK_UnOpInit, Type, opc), LHS(lhs) {}
UnOpInit(const UnOpInit &Other) = delete;
UnOpInit &operator=(const UnOpInit &Other) = delete;
@@ -708,7 +712,7 @@ public:
return getOperand();
}
- UnaryOp getOpcode() const { return Opc; }
+ UnaryOp getOpcode() const { return (UnaryOp)Opc; }
Init *getOperand() const { return LHS; }
// Fold - If possible, fold this to a simpler init. Return this if not
@@ -724,14 +728,14 @@ public:
///
class BinOpInit : public OpInit {
public:
- enum BinaryOp { ADD, AND, SHL, SRA, SRL, LISTCONCAT, STRCONCAT, CONCAT, EQ };
+ enum BinaryOp : uint8_t { ADD, AND, SHL, SRA, SRL, LISTCONCAT,
+ STRCONCAT, CONCAT, EQ };
private:
- BinaryOp Opc;
Init *LHS, *RHS;
BinOpInit(BinaryOp opc, Init *lhs, Init *rhs, RecTy *Type) :
- OpInit(IK_BinOpInit, Type), Opc(opc), LHS(lhs), RHS(rhs) {}
+ OpInit(IK_BinOpInit, Type, opc), LHS(lhs), RHS(rhs) {}
BinOpInit(const BinOpInit &Other) = delete;
BinOpInit &operator=(const BinOpInit &Other) = delete;
@@ -759,7 +763,7 @@ public:
}
}
- BinaryOp getOpcode() const { return Opc; }
+ BinaryOp getOpcode() const { return (BinaryOp)Opc; }
Init *getLHS() const { return LHS; }
Init *getRHS() const { return RHS; }
@@ -776,15 +780,14 @@ public:
///
class TernOpInit : public OpInit {
public:
- enum TernaryOp { SUBST, FOREACH, IF };
+ enum TernaryOp : uint8_t { SUBST, FOREACH, IF };
private:
- TernaryOp Opc;
Init *LHS, *MHS, *RHS;
TernOpInit(TernaryOp opc, Init *lhs, Init *mhs, Init *rhs,
RecTy *Type) :
- OpInit(IK_TernOpInit, Type), Opc(opc), LHS(lhs), MHS(mhs), RHS(rhs) {}
+ OpInit(IK_TernOpInit, Type, opc), LHS(lhs), MHS(mhs), RHS(rhs) {}
TernOpInit(const TernOpInit &Other) = delete;
TernOpInit &operator=(const TernOpInit &Other) = delete;
@@ -815,7 +818,7 @@ public:
}
}
- TernaryOp getOpcode() const { return Opc; }
+ TernaryOp getOpcode() const { return (TernaryOp)Opc; }
Init *getLHS() const { return LHS; }
Init *getMHS() const { return MHS; }
Init *getRHS() const { return RHS; }
diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index 79046b2b7352..c869341c384f 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -936,6 +936,10 @@ class AsmParser {
// ShouldEmitMatchRegisterName - Set to false if the target needs a hand
// written register name matcher
bit ShouldEmitMatchRegisterName = 1;
+
+ // HasMnemonicFirst - Set to false if target instructions don't always
+ // start with a mnemonic as the first token.
+ bit HasMnemonicFirst = 1;
}
def DefaultAsmParser : AsmParser;
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 140c36591acc..863b7cd044fb 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -2269,6 +2269,12 @@ public:
return false;
}
+ /// Return true if the MachineFunction contains a COPY which would imply
+ /// HasOpaqueSPAdjustment.
+ virtual bool hasCopyImplyingStackAdjustment(MachineFunction *MF) const {
+ return false;
+ }
+
/// Perform necessary initialization to handle a subset of CSRs explicitly
/// via copies. This function is called at the beginning of instruction
/// selection.
diff --git a/include/llvm/Transforms/Utils/BypassSlowDivision.h b/include/llvm/Transforms/Utils/BypassSlowDivision.h
index 0d081c0194bf..af0d60b2625f 100644
--- a/include/llvm/Transforms/Utils/BypassSlowDivision.h
+++ b/include/llvm/Transforms/Utils/BypassSlowDivision.h
@@ -23,11 +23,13 @@
namespace llvm {
-/// This optimization identifies DIV instructions that can be
+/// This optimization identifies DIV instructions in a BB that can be
/// profitably bypassed and carried out with a shorter, faster divide.
-bool bypassSlowDivision(Function &F,
- Function::iterator &I,
- const DenseMap<unsigned int, unsigned int> &BypassWidth);
+///
+/// This optimization may add basic blocks immediately after BB; for obvious
+/// reasons, you shouldn't pass those blocks to bypassSlowDivision.
+bool bypassSlowDivision(
+ BasicBlock *BB, const DenseMap<unsigned int, unsigned int> &BypassWidth);
} // End llvm namespace
diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h
index 17aaee03e4a8..2cfacb650ff5 100644
--- a/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/include/llvm/Transforms/Utils/LoopUtils.h
@@ -16,6 +16,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
@@ -39,6 +40,8 @@ struct LICMSafetyInfo {
bool MayThrow; // The current loop contains an instruction which
// may throw.
bool HeaderMayThrow; // Same as previous, but specific to loop header
+ // Used to update funclet bundle operands.
+ DenseMap<BasicBlock *, ColorVector> BlockColors;
LICMSafetyInfo() : MayThrow(false), HeaderMayThrow(false)
{}
};
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index 00f346ea115d..85404d87a611 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -543,7 +543,6 @@ static bool isMemsetPattern16(const Function *MS,
isa<IntegerType>(MemsetType->getParamType(2)))
return true;
}
-
return false;
}
@@ -583,9 +582,6 @@ FunctionModRefBehavior BasicAAResult::getModRefBehavior(const Function *F) {
if (F->onlyAccessesArgMemory())
Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesArgumentPointees);
- if (isMemsetPattern16(F, TLI))
- Min = FMRB_OnlyAccessesArgumentPointees;
-
// Otherwise be conservative.
return FunctionModRefBehavior(AAResultBase::getModRefBehavior(F) & Min);
}
@@ -599,22 +595,21 @@ ModRefInfo BasicAAResult::getArgModRefInfo(ImmutableCallSite CS,
case Intrinsic::memset:
case Intrinsic::memcpy:
case Intrinsic::memmove:
- assert((ArgIdx == 0 || ArgIdx == 1) &&
- "Invalid argument index for memory intrinsic");
- return ArgIdx ? MRI_Ref : MRI_Mod;
+ // We don't currently have a writeonly attribute. All other properties
+ // of these intrinsics are nicely described via attributes in
+ // Intrinsics.td and handled generically below.
+ if (ArgIdx == 0)
+ return MRI_Mod;
}
// We can bound the aliasing properties of memset_pattern16 just as we can
// for memcpy/memset. This is particularly important because the
// LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16
- // whenever possible.
- if (CS.getCalledFunction() &&
- isMemsetPattern16(CS.getCalledFunction(), TLI)) {
- assert((ArgIdx == 0 || ArgIdx == 1) &&
- "Invalid argument index for memset_pattern16");
- return ArgIdx ? MRI_Ref : MRI_Mod;
- }
- // FIXME: Handle memset_pattern4 and memset_pattern8 also.
+ // whenever possible. Note that all but the missing writeonly attribute are
+ // handled via InferFunctionAttr.
+ if (CS.getCalledFunction() && isMemsetPattern16(CS.getCalledFunction(), TLI))
+ if (ArgIdx == 0)
+ return MRI_Mod;
if (CS.paramHasAttr(ArgIdx + 1, Attribute::ReadOnly))
return MRI_Ref;
diff --git a/lib/Analysis/GlobalsModRef.cpp b/lib/Analysis/GlobalsModRef.cpp
index ab2263ae374e..249f3954d554 100644
--- a/lib/Analysis/GlobalsModRef.cpp
+++ b/lib/Analysis/GlobalsModRef.cpp
@@ -376,15 +376,6 @@ bool GlobalsAAResult::AnalyzeUsesOfPointer(Value *V,
} else {
return true; // Argument of an unknown call.
}
- // If the Callee is not ReadNone, it may read the global,
- // and if it is not ReadOnly, it may also write to it.
- Function *CalleeF = CS.getCalledFunction();
- if (!CalleeF->doesNotAccessMemory()) {
- if (Readers)
- Readers->insert(CalleeF);
- if (Writers && !CalleeF->onlyReadsMemory())
- Writers->insert(CalleeF);
- }
}
} else if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) {
if (!isa<ConstantPointerNull>(ICI->getOperand(1)))
@@ -516,7 +507,7 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) {
if (F->isDeclaration()) {
// Try to get mod/ref behaviour from function attributes.
- if (F->doesNotAccessMemory() || F->onlyAccessesInaccessibleMemory()) {
+ if (F->doesNotAccessMemory()) {
// Can't do better than that!
} else if (F->onlyReadsMemory()) {
FI.addModRefInfo(MRI_Ref);
@@ -524,12 +515,6 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) {
// This function might call back into the module and read a global -
// consider every global as possibly being read by this function.
FI.setMayReadAnyGlobal();
- } else if (F->onlyAccessesArgMemory() ||
- F->onlyAccessesInaccessibleMemOrArgMem()) {
- // This function may only access (read/write) memory pointed to by its
- // arguments. If this pointer is to a global, this escaping use of the
- // pointer is captured in AnalyzeUsesOfPointer().
- FI.addModRefInfo(MRI_ModRef);
} else {
FI.addModRefInfo(MRI_ModRef);
// Can't say anything useful unless it's an intrinsic - they don't
diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp
index b19ecadd3161..9e896aed0dce 100644
--- a/lib/Analysis/MemoryBuiltins.cpp
+++ b/lib/Analysis/MemoryBuiltins.cpp
@@ -187,13 +187,6 @@ bool llvm::isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
return getAllocationData(V, AllocLike, TLI, LookThroughBitCast);
}
-/// \brief Tests if a value is a call or invoke to a library function that
-/// allocates memory and never returns null (such as operator new).
-bool llvm::isOperatorNewLikeFn(const Value *V, const TargetLibraryInfo *TLI,
- bool LookThroughBitCast) {
- return getAllocationData(V, OpNewLike, TLI, LookThroughBitCast);
-}
-
/// extractMallocCall - Returns the corresponding CallInst if the instruction
/// is a malloc call. Since CallInst::CreateMalloc() only creates calls, we
/// ignore InvokeInst here.
diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
index 3e80bfe1fdfb..6918360536a3 100644
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -477,7 +477,7 @@ MemDepResult MemoryDependenceAnalysis::getSimplePointerDependencyFrom(
// being 42. A key property of this program however is that if either
// 1 or 4 were missing, there would be a race between the store of 42
// either the store of 0 or the load (making the whole progam racy).
- // The paper mentionned above shows that the same property is respected
+ // The paper mentioned above shows that the same property is respected
// by every program that can detect any optimisation of that kind: either
// it is racy (undefined) or there is a release followed by an acquire
// between the pair of accesses under consideration.
@@ -685,13 +685,13 @@ MemDepResult MemoryDependenceAnalysis::getSimplePointerDependencyFrom(
return MemDepResult::getDef(Inst);
if (isInvariantLoad)
continue;
- // Be conservative if the accessed pointer may alias the allocation.
- if (AA->alias(Inst, AccessPtr) != NoAlias)
- return MemDepResult::getClobber(Inst);
- // If the allocation is not aliased and does not read memory (like
- // strdup), it is safe to ignore.
- if (isa<AllocaInst>(Inst) ||
- isMallocLikeFn(Inst, TLI) || isCallocLikeFn(Inst, TLI))
+ // Be conservative if the accessed pointer may alias the allocation -
+ // fallback to the generic handling below.
+ if ((AA->alias(Inst, AccessPtr) == NoAlias) &&
+ // If the allocation is not aliased and does not read memory (like
+ // strdup), it is safe to ignore.
+ (isa<AllocaInst>(Inst) || isMallocLikeFn(Inst, TLI) ||
+ isCallocLikeFn(Inst, TLI)))
continue;
}
@@ -792,10 +792,8 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) {
static void AssertSorted(MemoryDependenceAnalysis::NonLocalDepInfo &Cache,
int Count = -1) {
if (Count == -1) Count = Cache.size();
- if (Count == 0) return;
-
- for (unsigned i = 1; i != unsigned(Count); ++i)
- assert(!(Cache[i] < Cache[i-1]) && "Cache isn't sorted!");
+ assert(std::is_sorted(Cache.begin(), Cache.begin() + Count) &&
+ "Cache isn't sorted!");
}
#endif
diff --git a/lib/Analysis/TargetLibraryInfo.cpp b/lib/Analysis/TargetLibraryInfo.cpp
index e00f4aed07fc..ce3881925627 100644
--- a/lib/Analysis/TargetLibraryInfo.cpp
+++ b/lib/Analysis/TargetLibraryInfo.cpp
@@ -52,14 +52,13 @@ static bool hasSinCosPiStret(const Triple &T) {
/// specified target triple. This should be carefully written so that a missing
/// target triple gets a sane set of defaults.
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
- const char *const *StandardNames) {
-#ifndef NDEBUG
+ ArrayRef<const char *> StandardNames) {
// Verify that the StandardNames array is in alphabetical order.
- for (unsigned F = 1; F < LibFunc::NumLibFuncs; ++F) {
- if (strcmp(StandardNames[F-1], StandardNames[F]) >= 0)
- llvm_unreachable("TargetLibraryInfoImpl function names must be sorted");
- }
-#endif // !NDEBUG
+ assert(std::is_sorted(StandardNames.begin(), StandardNames.end(),
+ [](const char *LHS, const char *RHS) {
+ return strcmp(LHS, RHS) < 0;
+ }) &&
+ "TargetLibraryInfoImpl function names must be sorted");
if (T.getArch() == Triple::r600 ||
T.getArch() == Triple::amdgcn) {
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 314ec9c1886e..abc57ed8bca0 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -1743,9 +1743,10 @@ bool isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth,
return false;
Value *X = nullptr, *Y = nullptr;
- // A shift of a power of two is a power of two or zero.
+ // A shift left or a logical shift right of a power of two is a power of two
+ // or zero.
if (OrZero && (match(V, m_Shl(m_Value(X), m_Value())) ||
- match(V, m_Shr(m_Value(X), m_Value()))))
+ match(V, m_LShr(m_Value(X), m_Value()))))
return isKnownToBeAPowerOfTwo(X, /*OrZero*/ true, Depth, Q, DL);
if (ZExtInst *ZI = dyn_cast<ZExtInst>(V))
@@ -2829,7 +2830,12 @@ Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
const DataLayout &DL) {
unsigned BitWidth = DL.getPointerTypeSizeInBits(Ptr->getType());
APInt ByteOffset(BitWidth, 0);
- while (1) {
+
+ // We walk up the defs but use a visited set to handle unreachable code. In
+ // that case, we stop after accumulating the cycle once (not that it
+ // matters).
+ SmallPtrSet<Value *, 16> Visited;
+ while (Visited.insert(Ptr).second) {
if (Ptr->getType()->isVectorTy())
break;
@@ -3268,12 +3274,9 @@ static bool isDereferenceableAndAlignedPointer(
}
// For gc.relocate, look through relocations
- if (const IntrinsicInst *I = dyn_cast<IntrinsicInst>(V))
- if (I->getIntrinsicID() == Intrinsic::experimental_gc_relocate) {
- GCRelocateOperands RelocateInst(I);
- return isDereferenceableAndAlignedPointer(
- RelocateInst.getDerivedPtr(), Align, DL, CtxI, DT, TLI, Visited);
- }
+ if (const GCRelocateInst *RelocateInst = dyn_cast<GCRelocateInst>(V))
+ return isDereferenceableAndAlignedPointer(
+ RelocateInst->getDerivedPtr(), Align, DL, CtxI, DT, TLI, Visited);
if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(V))
return isDereferenceableAndAlignedPointer(ASC->getOperand(0), Align, DL,
@@ -3474,10 +3477,6 @@ bool llvm::isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI) {
if (CS.isReturnNonNull())
return true;
- // operator new never returns null.
- if (isOperatorNewLikeFn(V, TLI, /*LookThroughBitCast=*/true))
- return true;
-
return false;
}
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 2e670d584ecc..c7606fd488a0 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -3071,7 +3071,12 @@ void BitcodeReader::saveMetadataList(
for (unsigned ID = 0; ID < MetadataList.size(); ++ID) {
Metadata *MD = MetadataList[ID];
auto *N = dyn_cast_or_null<MDNode>(MD);
+ assert((!N || (N->isResolved() || N->isTemporary())) &&
+ "Found non-resolved non-temp MDNode while saving metadata");
// Save all values if !OnlyTempMD, otherwise just the temporary metadata.
+ // Note that in the !OnlyTempMD case we need to save all Metadata, not
+ // just MDNode, as we may have references to other types of module-level
+ // metadata (e.g. ValueAsMetadata) from instructions.
if (!OnlyTempMD || (N && N->isTemporary())) {
// Will call this after materializing each function, in order to
// handle remapping of the function's instructions/metadata.
@@ -3080,6 +3085,11 @@ void BitcodeReader::saveMetadataList(
assert(MetadataToIDs[MD] == ID && "Inconsistent metadata value id");
continue;
}
+ if (N && N->isTemporary())
+ // Ensure that we assert if someone tries to RAUW this temporary
+ // metadata while it is the key of a map. The flag will be set back
+ // to true when the saved metadata list is destroyed.
+ N->setCanReplace(false);
MetadataToIDs[MD] = ID;
}
}
diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp
index 48b7104f24c3..4da5b580fcda 100644
--- a/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -976,32 +976,32 @@ void WinException::emitExceptHandlerTable(const MachineFunction *MF) {
}
}
-static int getRank(const WinEHFuncInfo &FuncInfo, int State) {
+static int getTryRank(const WinEHFuncInfo &FuncInfo, int State) {
int Rank = 0;
while (State != -1) {
++Rank;
- State = FuncInfo.ClrEHUnwindMap[State].Parent;
+ State = FuncInfo.ClrEHUnwindMap[State].TryParentState;
}
return Rank;
}
-static int getAncestor(const WinEHFuncInfo &FuncInfo, int Left, int Right) {
- int LeftRank = getRank(FuncInfo, Left);
- int RightRank = getRank(FuncInfo, Right);
+static int getTryAncestor(const WinEHFuncInfo &FuncInfo, int Left, int Right) {
+ int LeftRank = getTryRank(FuncInfo, Left);
+ int RightRank = getTryRank(FuncInfo, Right);
while (LeftRank < RightRank) {
- Right = FuncInfo.ClrEHUnwindMap[Right].Parent;
+ Right = FuncInfo.ClrEHUnwindMap[Right].TryParentState;
--RightRank;
}
while (RightRank < LeftRank) {
- Left = FuncInfo.ClrEHUnwindMap[Left].Parent;
+ Left = FuncInfo.ClrEHUnwindMap[Left].TryParentState;
--LeftRank;
}
while (Left != Right) {
- Left = FuncInfo.ClrEHUnwindMap[Left].Parent;
- Right = FuncInfo.ClrEHUnwindMap[Right].Parent;
+ Left = FuncInfo.ClrEHUnwindMap[Left].TryParentState;
+ Right = FuncInfo.ClrEHUnwindMap[Right].TryParentState;
}
return Left;
@@ -1035,9 +1035,9 @@ void WinException::emitCLRExceptionTable(const MachineFunction *MF) {
FuncInfo.ClrEHUnwindMap[State].Handler.get<MachineBasicBlock *>();
HandlerStates[HandlerBlock] = State;
// Use this loop through all handlers to verify our assumption (used in
- // the MinEnclosingState computation) that ancestors have lower state
- // numbers than their descendants.
- assert(FuncInfo.ClrEHUnwindMap[State].Parent < State &&
+ // the MinEnclosingState computation) that enclosing funclets have lower
+ // state numbers than their enclosed funclets.
+ assert(FuncInfo.ClrEHUnwindMap[State].HandlerParentState < State &&
"ill-formed state numbering");
}
// Map the main function to the NullState.
@@ -1070,7 +1070,6 @@ void WinException::emitCLRExceptionTable(const MachineFunction *MF) {
SmallVector<int, 4> MinClauseMap((size_t)NumStates, NumStates);
// Visit the root function and each funclet.
-
for (MachineFunction::const_iterator FuncletStart = MF->begin(),
FuncletEnd = MF->begin(),
End = MF->end();
@@ -1100,17 +1099,18 @@ void WinException::emitCLRExceptionTable(const MachineFunction *MF) {
for (const auto &StateChange :
InvokeStateChangeIterator::range(FuncInfo, FuncletStart, FuncletEnd)) {
// Close any try regions we're not still under
- int AncestorState =
- getAncestor(FuncInfo, CurrentState, StateChange.NewState);
- while (CurrentState != AncestorState) {
- assert(CurrentState != NullState && "Failed to find ancestor!");
+ int StillPendingState =
+ getTryAncestor(FuncInfo, CurrentState, StateChange.NewState);
+ while (CurrentState != StillPendingState) {
+ assert(CurrentState != NullState &&
+ "Failed to find still-pending state!");
// Close the pending clause
Clauses.push_back({CurrentStartLabel, StateChange.PreviousEndLabel,
CurrentState, FuncletState});
- // Now the parent handler is current
- CurrentState = FuncInfo.ClrEHUnwindMap[CurrentState].Parent;
+ // Now the next-outer try region is current
+ CurrentState = FuncInfo.ClrEHUnwindMap[CurrentState].TryParentState;
// Pop the new start label from the handler stack if we've exited all
- // descendants of the corresponding handler.
+ // inner try regions of the corresponding try region.
if (HandlerStack.back().second == CurrentState)
CurrentStartLabel = HandlerStack.pop_back_val().first;
}
@@ -1121,7 +1121,8 @@ void WinException::emitCLRExceptionTable(const MachineFunction *MF) {
// it.
for (int EnteredState = StateChange.NewState;
EnteredState != CurrentState;
- EnteredState = FuncInfo.ClrEHUnwindMap[EnteredState].Parent) {
+ EnteredState =
+ FuncInfo.ClrEHUnwindMap[EnteredState].TryParentState) {
int &MinEnclosingState = MinClauseMap[EnteredState];
if (FuncletState < MinEnclosingState)
MinEnclosingState = FuncletState;
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 5844124d8565..6fbdea84c10f 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -225,8 +225,14 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
if (!OptSize && TLI && TLI->isSlowDivBypassed()) {
const DenseMap<unsigned int, unsigned int> &BypassWidths =
TLI->getBypassSlowDivWidths();
- for (Function::iterator I = F.begin(); I != F.end(); I++)
- EverMadeChange |= bypassSlowDivision(F, I, BypassWidths);
+ BasicBlock* BB = &*F.begin();
+ while (BB != nullptr) {
+ // bypassSlowDivision may create new BBs, but we don't want to reapply the
+ // optimization to those blocks.
+ BasicBlock* Next = BB->getNextNode();
+ EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
+ BB = Next;
+ }
}
// Eliminate blocks that contain only PHI nodes and an
@@ -526,19 +532,17 @@ void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {
// Computes a map of base pointer relocation instructions to corresponding
// derived pointer relocation instructions given a vector of all relocate calls
static void computeBaseDerivedRelocateMap(
- const SmallVectorImpl<User *> &AllRelocateCalls,
- DenseMap<IntrinsicInst *, SmallVector<IntrinsicInst *, 2>> &
- RelocateInstMap) {
+ const SmallVectorImpl<GCRelocateInst *> &AllRelocateCalls,
+ DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>>
+ &RelocateInstMap) {
// Collect information in two maps: one primarily for locating the base object
// while filling the second map; the second map is the final structure holding
// a mapping between Base and corresponding Derived relocate calls
- DenseMap<std::pair<unsigned, unsigned>, IntrinsicInst *> RelocateIdxMap;
- for (auto &U : AllRelocateCalls) {
- GCRelocateOperands ThisRelocate(U);
- IntrinsicInst *I = cast<IntrinsicInst>(U);
- auto K = std::make_pair(ThisRelocate.getBasePtrIndex(),
- ThisRelocate.getDerivedPtrIndex());
- RelocateIdxMap.insert(std::make_pair(K, I));
+ DenseMap<std::pair<unsigned, unsigned>, GCRelocateInst *> RelocateIdxMap;
+ for (auto *ThisRelocate : AllRelocateCalls) {
+ auto K = std::make_pair(ThisRelocate->getBasePtrIndex(),
+ ThisRelocate->getDerivedPtrIndex());
+ RelocateIdxMap.insert(std::make_pair(K, ThisRelocate));
}
for (auto &Item : RelocateIdxMap) {
std::pair<unsigned, unsigned> Key = Item.first;
@@ -546,7 +550,7 @@ static void computeBaseDerivedRelocateMap(
// Base relocation: nothing to insert
continue;
- IntrinsicInst *I = Item.second;
+ GCRelocateInst *I = Item.second;
auto BaseKey = std::make_pair(Key.first, Key.first);
// We're iterating over RelocateIdxMap so we cannot modify it.
@@ -579,16 +583,13 @@ static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
// Takes a RelocatedBase (base pointer relocation instruction) and Targets to
// replace, computes a replacement, and affects it.
static bool
-simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase,
- const SmallVectorImpl<IntrinsicInst *> &Targets) {
+simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
+ const SmallVectorImpl<GCRelocateInst *> &Targets) {
bool MadeChange = false;
- for (auto &ToReplace : Targets) {
- GCRelocateOperands MasterRelocate(RelocatedBase);
- GCRelocateOperands ThisRelocate(ToReplace);
-
- assert(ThisRelocate.getBasePtrIndex() == MasterRelocate.getBasePtrIndex() &&
+ for (GCRelocateInst *ToReplace : Targets) {
+ assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() &&
"Not relocating a derived object of the original base object");
- if (ThisRelocate.getBasePtrIndex() == ThisRelocate.getDerivedPtrIndex()) {
+ if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) {
// A duplicate relocate call. TODO: coalesce duplicates.
continue;
}
@@ -601,8 +602,8 @@ simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase,
continue;
}
- Value *Base = ThisRelocate.getBasePtr();
- auto Derived = dyn_cast<GetElementPtrInst>(ThisRelocate.getDerivedPtr());
+ Value *Base = ToReplace->getBasePtr();
+ auto Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr());
if (!Derived || Derived->getPointerOperand() != Base)
continue;
@@ -680,12 +681,12 @@ simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase,
// %val = load %ptr'
bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) {
bool MadeChange = false;
- SmallVector<User *, 2> AllRelocateCalls;
+ SmallVector<GCRelocateInst *, 2> AllRelocateCalls;
for (auto *U : I.users())
- if (isGCRelocate(dyn_cast<Instruction>(U)))
+ if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U))
// Collect all the relocate calls associated with a statepoint
- AllRelocateCalls.push_back(U);
+ AllRelocateCalls.push_back(Relocate);
// We need atleast one base pointer relocation + one derived pointer
// relocation to mangle
@@ -694,7 +695,7 @@ bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) {
// RelocateInstMap is a mapping from the base relocate instruction to the
// corresponding derived relocate instructions
- DenseMap<IntrinsicInst *, SmallVector<IntrinsicInst *, 2>> RelocateInstMap;
+ DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>> RelocateInstMap;
computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap);
if (RelocateInstMap.empty())
return false;
diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp
index 021707b7c3c7..aad376c4702b 100644
--- a/lib/CodeGen/MachineCSE.cpp
+++ b/lib/CodeGen/MachineCSE.cpp
@@ -122,8 +122,7 @@ INITIALIZE_PASS_END(MachineCSE, "machine-cse",
bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI,
MachineBasicBlock *MBB) {
bool Changed = false;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
+ for (MachineOperand &MO : MI->operands()) {
if (!MO.isReg() || !MO.isUse())
continue;
unsigned Reg = MO.getReg();
@@ -186,8 +185,7 @@ MachineCSE::isPhysDefTriviallyDead(unsigned Reg,
return true;
bool SeenDef = false;
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = I->getOperand(i);
+ for (const MachineOperand &MO : I->operands()) {
if (MO.isRegMask() && MO.clobbersPhysReg(Reg))
SeenDef = true;
if (!MO.isReg() || !MO.getReg())
@@ -220,8 +218,7 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI,
SmallVectorImpl<unsigned> &PhysDefs,
bool &PhysUseDef) const{
// First, add all uses to PhysRefs.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (const MachineOperand &MO : MI->operands()) {
if (!MO.isReg() || MO.isDef())
continue;
unsigned Reg = MO.getReg();
@@ -239,8 +236,7 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI,
// (which currently contains only uses), set the PhysUseDef flag.
PhysUseDef = false;
MachineBasicBlock::const_iterator I = MI; I = std::next(I);
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (const MachineOperand &MO : MI->operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
unsigned Reg = MO.getReg();
@@ -311,8 +307,7 @@ bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI,
if (I == E)
return true;
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = I->getOperand(i);
+ for (const MachineOperand &MO : I->operands()) {
// RegMasks go on instructions like calls that clobber lots of physregs.
// Don't attempt to CSE across such an instruction.
if (MO.isRegMask())
@@ -398,8 +393,7 @@ bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg,
// Heuristics #2: If the expression doesn't not use a vr and the only use
// of the redundant computation are copies, do not cse.
bool HasVRegUse = false;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (const MachineOperand &MO : MI->operands()) {
if (MO.isReg() && MO.isUse() &&
TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
HasVRegUse = true;
@@ -580,9 +574,9 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
// Actually perform the elimination.
if (DoCSE) {
- for (unsigned i = 0, e = CSEPairs.size(); i != e; ++i) {
- unsigned OldReg = CSEPairs[i].first;
- unsigned NewReg = CSEPairs[i].second;
+ for (std::pair<unsigned, unsigned> &CSEPair : CSEPairs) {
+ unsigned OldReg = CSEPair.first;
+ unsigned NewReg = CSEPair.second;
// OldReg may have been unused but is used now, clear the Dead flag
MachineInstr *Def = MRI->getUniqueVRegDef(NewReg);
assert(Def != nullptr && "CSEd register has no unique definition?");
@@ -594,8 +588,8 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
// Go through implicit defs of CSMI and MI, if a def is not dead at MI,
// we should make sure it is not dead at CSMI.
- for (unsigned i = 0, e = ImplicitDefsToUpdate.size(); i != e; ++i)
- CSMI->getOperand(ImplicitDefsToUpdate[i]).setIsDead(false);
+ for (unsigned ImplicitDefToUpdate : ImplicitDefsToUpdate)
+ CSMI->getOperand(ImplicitDefToUpdate).setIsDead(false);
// Go through implicit defs of CSMI and MI, and clear the kill flags on
// their uses in all the instructions between CSMI and MI.
@@ -685,18 +679,14 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) {
Node = WorkList.pop_back_val();
Scopes.push_back(Node);
const std::vector<MachineDomTreeNode*> &Children = Node->getChildren();
- unsigned NumChildren = Children.size();
- OpenChildren[Node] = NumChildren;
- for (unsigned i = 0; i != NumChildren; ++i) {
- MachineDomTreeNode *Child = Children[i];
+ OpenChildren[Node] = Children.size();
+ for (MachineDomTreeNode *Child : Children)
WorkList.push_back(Child);
- }
} while (!WorkList.empty());
// Now perform CSE.
bool Changed = false;
- for (unsigned i = 0, e = Scopes.size(); i != e; ++i) {
- MachineDomTreeNode *Node = Scopes[i];
+ for (MachineDomTreeNode *Node : Scopes) {
MachineBasicBlock *MBB = Node->getBlock();
EnterScope(MBB);
Changed |= ProcessBlock(MBB);
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index 1eb2edcd7cec..6b8eeccd173d 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -866,6 +866,27 @@ void MachineInstr::addMemOperand(MachineFunction &MF,
setMemRefs(NewMemRefs, NewMemRefs + NewNum);
}
+std::pair<MachineInstr::mmo_iterator, unsigned>
+MachineInstr::mergeMemRefsWith(const MachineInstr& Other) {
+ // TODO: If we end up with too many memory operands, return the empty
+ // conservative set rather than failing asserts.
+ // TODO: consider uniquing elements within the operand lists to reduce
+ // space usage and fall back to conservative information less often.
+ size_t CombinedNumMemRefs = (memoperands_end() - memoperands_begin())
+ + (Other.memoperands_end() - Other.memoperands_begin());
+
+ MachineFunction *MF = getParent()->getParent();
+ mmo_iterator MemBegin = MF->allocateMemRefsArray(CombinedNumMemRefs);
+ mmo_iterator MemEnd = std::copy(memoperands_begin(), memoperands_end(),
+ MemBegin);
+ MemEnd = std::copy(Other.memoperands_begin(), Other.memoperands_end(),
+ MemEnd);
+ assert(MemEnd - MemBegin == (ptrdiff_t)CombinedNumMemRefs &&
+ "missing memrefs");
+
+ return std::make_pair(MemBegin, CombinedNumMemRefs);
+}
+
bool MachineInstr::hasPropertyInBundle(unsigned Mask, QueryType Type) const {
assert(!isBundledWithPred() && "Must be called on bundle header");
for (MachineBasicBlock::const_instr_iterator MII = getIterator();; ++MII) {
@@ -1738,7 +1759,10 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
bool HaveSemi = false;
const unsigned PrintableFlags = FrameSetup | FrameDestroy;
if (Flags & PrintableFlags) {
- if (!HaveSemi) OS << ";"; HaveSemi = true;
+ if (!HaveSemi) {
+ OS << ";";
+ HaveSemi = true;
+ }
OS << " flags: ";
if (Flags & FrameSetup)
@@ -1749,7 +1773,10 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
}
if (!memoperands_empty()) {
- if (!HaveSemi) OS << ";"; HaveSemi = true;
+ if (!HaveSemi) {
+ OS << ";";
+ HaveSemi = true;
+ }
OS << " mem:";
for (mmo_iterator i = memoperands_begin(), e = memoperands_end();
@@ -1762,7 +1789,10 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
// Print the regclass of any virtual registers encountered.
if (MRI && !VirtRegs.empty()) {
- if (!HaveSemi) OS << ";"; HaveSemi = true;
+ if (!HaveSemi) {
+ OS << ";";
+ HaveSemi = true;
+ }
for (unsigned i = 0; i != VirtRegs.size(); ++i) {
const TargetRegisterClass *RC = MRI->getRegClass(VirtRegs[i]);
OS << " " << TRI->getRegClassName(RC)
@@ -1781,7 +1811,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
// Print debug location information.
if (isDebugValue() && getOperand(e - 2).isMetadata()) {
- if (!HaveSemi) OS << ";";
+ if (!HaveSemi)
+ OS << ";";
auto *DV = cast<DILocalVariable>(getOperand(e - 2).getMetadata());
OS << " line no:" << DV->getLine();
if (auto *InlinedAt = debugLoc->getInlinedAt()) {
@@ -1795,7 +1826,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
if (isIndirectDebugValue())
OS << " indirect";
} else if (debugLoc && MF) {
- if (!HaveSemi) OS << ";";
+ if (!HaveSemi)
+ OS << ";";
OS << " dbg:";
debugLoc.print(OS);
}
diff --git a/lib/CodeGen/MachineInstrBundle.cpp b/lib/CodeGen/MachineInstrBundle.cpp
index 3eaf4c5dea0f..4619daf30141 100644
--- a/lib/CodeGen/MachineInstrBundle.cpp
+++ b/lib/CodeGen/MachineInstrBundle.cpp
@@ -315,7 +315,7 @@ MachineOperandIteratorBase::analyzePhysReg(unsigned Reg,
if (!TRI->regsOverlap(MOReg, Reg))
continue;
- bool Covered = TRI->isSuperRegisterEq(MOReg, Reg);
+ bool Covered = TRI->isSuperRegisterEq(Reg, MOReg);
if (MO.readsReg()) {
PRI.Read = true;
if (Covered) {
diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
index 8382b0912bde..3749b1dd217a 100644
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp
@@ -97,9 +97,8 @@ void RegPressureTracker::increaseRegPressure(ArrayRef<unsigned> RegUnits) {
unsigned Weight = PSetI.getWeight();
for (; PSetI.isValid(); ++PSetI) {
CurrSetPressure[*PSetI] += Weight;
- if (CurrSetPressure[*PSetI] > P.MaxSetPressure[*PSetI]) {
- P.MaxSetPressure[*PSetI] = CurrSetPressure[*PSetI];
- }
+ P.MaxSetPressure[*PSetI] =
+ std::max(P.MaxSetPressure[*PSetI], CurrSetPressure[*PSetI]);
}
}
}
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0872d7a9a228..bc2405b952a6 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6843,9 +6843,13 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
uint64_t PtrOff = ShAmt / 8;
unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff);
SDLoc DL(LN0);
+ // The original load itself didn't wrap, so an offset within it doesn't.
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(true);
SDValue NewPtr = DAG.getNode(ISD::ADD, DL,
PtrType, LN0->getBasePtr(),
- DAG.getConstant(PtrOff, DL, PtrType));
+ DAG.getConstant(PtrOff, DL, PtrType),
+ &Flags);
AddToWorklist(NewPtr.getNode());
SDValue Load;
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index abbc48e10e46..96bf914701c6 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2843,6 +2843,43 @@ bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const {
return (AZero | BZero).isAllOnesValue();
}
+static SDValue FoldCONCAT_VECTORS(SDLoc DL, EVT VT, ArrayRef<SDValue> Ops,
+ llvm::SelectionDAG &DAG) {
+ if (Ops.size() == 1)
+ return Ops[0];
+
+ // Concat of UNDEFs is UNDEF.
+ if (std::all_of(Ops.begin(), Ops.end(),
+ [](SDValue Op) { return Op.isUndef(); }))
+ return DAG.getUNDEF(VT);
+
+ // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified
+ // to one big BUILD_VECTOR.
+ // FIXME: Add support for UNDEF and SCALAR_TO_VECTOR as well.
+ if (!std::all_of(Ops.begin(), Ops.end(), [](SDValue Op) {
+ return Op.getOpcode() == ISD::BUILD_VECTOR;
+ }))
+ return SDValue();
+
+ EVT SVT = VT.getScalarType();
+ SmallVector<SDValue, 16> Elts;
+ for (SDValue Op : Ops)
+ Elts.append(Op->op_begin(), Op->op_end());
+
+ // BUILD_VECTOR requires all inputs to be of the same type, find the
+ // maximum type and extend them all.
+ for (SDValue Op : Elts)
+ SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
+
+ if (SVT.bitsGT(VT.getScalarType()))
+ for (SDValue &Op : Elts)
+ Op = DAG.getTargetLoweringInfo().isZExtFree(Op.getValueType(), SVT)
+ ? DAG.getZExtOrTrunc(Op, DL, SVT)
+ : DAG.getSExtOrTrunc(Op, DL, SVT);
+
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Elts);
+}
+
/// getNode - Gets or creates the specified node.
///
SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT) {
@@ -3426,34 +3463,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
if (N2.getOpcode() == ISD::EntryToken) return N1;
if (N1 == N2) return N1;
break;
- case ISD::CONCAT_VECTORS:
- // Concat of UNDEFs is UNDEF.
- if (N1.getOpcode() == ISD::UNDEF &&
- N2.getOpcode() == ISD::UNDEF)
- return getUNDEF(VT);
-
- // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified to
- // one big BUILD_VECTOR.
- if (N1.getOpcode() == ISD::BUILD_VECTOR &&
- N2.getOpcode() == ISD::BUILD_VECTOR) {
- SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(),
- N1.getNode()->op_end());
- Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end());
-
- // BUILD_VECTOR requires all inputs to be of the same type, find the
- // maximum type and extend them all.
- EVT SVT = VT.getScalarType();
- for (SDValue Op : Elts)
- SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
- if (SVT.bitsGT(VT.getScalarType()))
- for (SDValue &Op : Elts)
- Op = TLI->isZExtFree(Op.getValueType(), SVT)
- ? getZExtOrTrunc(Op, DL, SVT)
- : getSExtOrTrunc(Op, DL, SVT);
-
- return getNode(ISD::BUILD_VECTOR, DL, VT, Elts);
- }
+ case ISD::CONCAT_VECTORS: {
+ // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
+ SDValue Ops[] = {N1, N2};
+ if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
+ return V;
break;
+ }
case ISD::AND:
assert(VT.isInteger() && "This operator does not apply to FP types!");
assert(N1.getValueType() == N2.getValueType() &&
@@ -3911,19 +3927,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
}
break;
}
- case ISD::CONCAT_VECTORS:
- // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified to
- // one big BUILD_VECTOR.
- if (N1.getOpcode() == ISD::BUILD_VECTOR &&
- N2.getOpcode() == ISD::BUILD_VECTOR &&
- N3.getOpcode() == ISD::BUILD_VECTOR) {
- SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(),
- N1.getNode()->op_end());
- Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end());
- Elts.append(N3.getNode()->op_begin(), N3.getNode()->op_end());
- return getNode(ISD::BUILD_VECTOR, DL, VT, Elts);
- }
+ case ISD::CONCAT_VECTORS: {
+ // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
+ SDValue Ops[] = {N1, N2, N3};
+ if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
+ return V;
break;
+ }
case ISD::SETCC: {
// Use FoldSetCC to simplify SETCC's.
if (SDValue V = FoldSetCC(VT, N1, N2, cast<CondCodeSDNode>(N3)->get(), DL))
@@ -5462,6 +5472,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
switch (Opcode) {
default: break;
+ case ISD::CONCAT_VECTORS: {
+ // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
+ if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
+ return V;
+ break;
+ }
case ISD::SELECT_CC: {
assert(NumOps == 5 && "SELECT_CC takes 5 operands!");
assert(Ops[0].getValueType() == Ops[1].getValueType() &&
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index d2ea85ab4d22..e446a934554e 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1329,12 +1329,18 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
+ // An aggregate return value cannot wrap around the address space, so
+ // offsets to its parts don't wrap either.
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(true);
+
SmallVector<SDValue, 4> Chains(NumValues);
for (unsigned i = 0; i != NumValues; ++i) {
SDValue Add = DAG.getNode(ISD::ADD, getCurSDLoc(),
RetPtr.getValueType(), RetPtr,
DAG.getIntPtrConstant(Offsets[i],
- getCurSDLoc()));
+ getCurSDLoc()),
+ &Flags);
Chains[i] =
DAG.getStore(Chain, getCurSDLoc(),
SDValue(RetOp.getNode(), RetOp.getResNo() + i),
@@ -2994,8 +3000,15 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
if (Field) {
// N = N + Offset
uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field);
+
+      // In an inbounds GEP with an offset that is nonnegative even when
+ // interpreted as signed, assume there is no unsigned overflow.
+ SDNodeFlags Flags;
+ if (int64_t(Offset) >= 0 && cast<GEPOperator>(I).isInBounds())
+ Flags.setNoUnsignedWrap(true);
+
N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N,
- DAG.getConstant(Offset, dl, N.getValueType()));
+ DAG.getConstant(Offset, dl, N.getValueType()), &Flags);
}
Ty = StTy->getElementType(Field);
@@ -3020,7 +3033,14 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
SDValue OffsVal = VectorWidth ?
DAG.getConstant(Offs, dl, MVT::getVectorVT(PtrTy, VectorWidth)) :
DAG.getConstant(Offs, dl, PtrTy);
- N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal);
+
+      // In an inbounds GEP with an offset that is nonnegative even when
+ // interpreted as signed, assume there is no unsigned overflow.
+ SDNodeFlags Flags;
+ if (Offs.isNonNegative() && cast<GEPOperator>(I).isInBounds())
+ Flags.setNoUnsignedWrap(true);
+
+ N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal, &Flags);
continue;
}
@@ -3092,10 +3112,13 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) {
Align = 0;
// Round the size of the allocation up to the stack alignment size
- // by add SA-1 to the size.
+  // by adding SA-1 to the size. This doesn't overflow because we're computing
+ // an address inside an alloca.
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(true);
AllocSize = DAG.getNode(ISD::ADD, dl,
AllocSize.getValueType(), AllocSize,
- DAG.getIntPtrConstant(StackAlign - 1, dl));
+ DAG.getIntPtrConstant(StackAlign - 1, dl), &Flags);
// Mask out the low bits for alignment purposes.
AllocSize = DAG.getNode(ISD::AND, dl,
@@ -3168,6 +3191,11 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
if (isVolatile)
Root = TLI.prepareVolatileOrAtomicLoad(Root, dl, DAG);
+ // An aggregate load cannot wrap around the address space, so offsets to its
+ // parts don't wrap either.
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(true);
+
SmallVector<SDValue, 4> Values(NumValues);
SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues));
EVT PtrVT = Ptr.getValueType();
@@ -3188,7 +3216,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
}
SDValue A = DAG.getNode(ISD::ADD, dl,
PtrVT, Ptr,
- DAG.getConstant(Offsets[i], dl, PtrVT));
+ DAG.getConstant(Offsets[i], dl, PtrVT),
+ &Flags);
SDValue L = DAG.getLoad(ValueVTs[i], dl, Root,
A, MachinePointerInfo(SV, Offsets[i]), isVolatile,
isNonTemporal, isInvariant, Alignment, AAInfo,
@@ -3243,6 +3272,11 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
+  // An aggregate store cannot wrap around the address space, so offsets to its
+ // parts don't wrap either.
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(true);
+
unsigned ChainI = 0;
for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
// See visitLoad comments.
@@ -3253,7 +3287,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
ChainI = 0;
}
SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr,
- DAG.getConstant(Offsets[i], dl, PtrVT));
+ DAG.getConstant(Offsets[i], dl, PtrVT), &Flags);
SDValue St = DAG.getStore(Root, dl,
SDValue(Src.getNode(), Src.getResNo() + i),
Add, MachinePointerInfo(PtrV, Offsets[i]),
@@ -5189,7 +5223,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
return nullptr;
}
case Intrinsic::experimental_gc_relocate: {
- visitGCRelocate(I);
+ visitGCRelocate(cast<GCRelocateInst>(I));
return nullptr;
}
case Intrinsic::instrprof_increment:
@@ -7202,10 +7236,15 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
ReturnValues.resize(NumValues);
SmallVector<SDValue, 4> Chains(NumValues);
+ // An aggregate return value cannot wrap around the address space, so
+ // offsets to its parts don't wrap either.
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(true);
+
for (unsigned i = 0; i < NumValues; ++i) {
SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot,
CLI.DAG.getConstant(Offsets[i], CLI.DL,
- PtrVT));
+ PtrVT), &Flags);
SDValue L = CLI.DAG.getLoad(
RetTys[i], CLI.DL, CLI.Chain, Add,
MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(),
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 49a3872d20c8..8fb85ff6ecc7 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -855,7 +855,7 @@ private:
// These three are implemented in StatepointLowering.cpp
void visitStatepoint(const CallInst &I);
- void visitGCRelocate(const CallInst &I);
+ void visitGCRelocate(const GCRelocateInst &I);
void visitGCResult(const CallInst &I);
void visitUserOp1(const Instruction &I) {
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 853a21a15eb9..9f8759df0bab 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -633,6 +633,9 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
MRI.replaceRegWith(From, To);
}
+ if (TLI->hasCopyImplyingStackAdjustment(MF))
+ MFI->setHasOpaqueSPAdjustment(true);
+
// Freeze the set of reserved registers now that MachineFrameInfo has been
// set up. All the information required by getReservedRegs() should be
// available now.
diff --git a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index 050ec2116c5d..6547a62d0778 100644
--- a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -128,13 +128,11 @@ static Optional<int> findPreviousSpillSlot(const Value *Val,
return Optional<int>();
// Spill location is known for gc relocates
- if (isGCRelocate(Val)) {
- GCRelocateOperands RelocOps(cast<Instruction>(Val));
-
+ if (const auto *Relocate = dyn_cast<GCRelocateInst>(Val)) {
FunctionLoweringInfo::StatepointSpilledValueMapTy &SpillMap =
- Builder.FuncInfo.StatepointRelocatedValues[RelocOps.getStatepoint()];
+ Builder.FuncInfo.StatepointRelocatedValues[Relocate->getStatepoint()];
- auto It = SpillMap.find(RelocOps.getDerivedPtr());
+ auto It = SpillMap.find(Relocate->getDerivedPtr());
if (It == SpillMap.end())
return Optional<int>();
@@ -401,10 +399,10 @@ static void getIncomingStatepointGCValues(
SmallVectorImpl<const Value *> &Bases, SmallVectorImpl<const Value *> &Ptrs,
SmallVectorImpl<const Value *> &Relocs, ImmutableStatepoint StatepointSite,
SelectionDAGBuilder &Builder) {
- for (GCRelocateOperands relocateOpers : StatepointSite.getRelocates()) {
- Relocs.push_back(relocateOpers.getUnderlyingCallSite().getInstruction());
- Bases.push_back(relocateOpers.getBasePtr());
- Ptrs.push_back(relocateOpers.getDerivedPtr());
+ for (const GCRelocateInst *Relocate : StatepointSite.getRelocates()) {
+ Relocs.push_back(Relocate);
+ Bases.push_back(Relocate->getBasePtr());
+ Ptrs.push_back(Relocate->getDerivedPtr());
}
// Remove any redundant llvm::Values which map to the same SDValue as another
@@ -602,8 +600,8 @@ static void lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
FunctionLoweringInfo::StatepointSpilledValueMapTy &SpillMap =
Builder.FuncInfo.StatepointRelocatedValues[StatepointInstr];
- for (GCRelocateOperands RelocateOpers : StatepointSite.getRelocates()) {
- const Value *V = RelocateOpers.getDerivedPtr();
+ for (const GCRelocateInst *Relocate : StatepointSite.getRelocates()) {
+ const Value *V = Relocate->getDerivedPtr();
SDValue SDV = Builder.getValue(V);
SDValue Loc = Builder.StatepointLowering.getLocation(SDV);
@@ -624,8 +622,7 @@ static void lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
// uses of the corresponding values so that it would automatically
// export them. Relocates of the spilled values does not use original
// value.
- if (RelocateOpers.getUnderlyingCallSite().getParent() !=
- StatepointInstr->getParent())
+ if (Relocate->getParent() != StatepointInstr->getParent())
Builder.ExportFromCurrentBlock(V);
}
}
@@ -656,7 +653,7 @@ void SelectionDAGBuilder::LowerStatepoint(
// statepoint.
for (const User *U : CS->users()) {
const CallInst *Call = cast<CallInst>(U);
- if (isGCRelocate(Call) && Call->getParent() == CS.getParent())
+ if (isa<GCRelocateInst>(Call) && Call->getParent() == CS.getParent())
StatepointLowering.scheduleRelocCall(*Call);
}
#endif
@@ -859,24 +856,22 @@ void SelectionDAGBuilder::visitGCResult(const CallInst &CI) {
}
}
-void SelectionDAGBuilder::visitGCRelocate(const CallInst &CI) {
- GCRelocateOperands RelocateOpers(&CI);
-
+void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
#ifndef NDEBUG
// Consistency check
// We skip this check for relocates not in the same basic block as thier
// statepoint. It would be too expensive to preserve validation info through
// different basic blocks.
- if (RelocateOpers.getStatepoint()->getParent() == CI.getParent()) {
- StatepointLowering.relocCallVisited(CI);
+ if (Relocate.getStatepoint()->getParent() == Relocate.getParent()) {
+ StatepointLowering.relocCallVisited(Relocate);
}
#endif
- const Value *DerivedPtr = RelocateOpers.getDerivedPtr();
+ const Value *DerivedPtr = Relocate.getDerivedPtr();
SDValue SD = getValue(DerivedPtr);
FunctionLoweringInfo::StatepointSpilledValueMapTy &SpillMap =
- FuncInfo.StatepointRelocatedValues[RelocateOpers.getStatepoint()];
+ FuncInfo.StatepointRelocatedValues[Relocate.getStatepoint()];
// We should have recorded location for this pointer
assert(SpillMap.count(DerivedPtr) && "Relocating not lowered gc value");
@@ -885,7 +880,7 @@ void SelectionDAGBuilder::visitGCRelocate(const CallInst &CI) {
// We didn't need to spill these special cases (constants and allocas).
// See the handling in spillIncomingValueForStatepoint for detail.
if (!DerivedPtrLocation) {
- setValue(&CI, SD);
+ setValue(&Relocate, SD);
return;
}
@@ -907,5 +902,5 @@ void SelectionDAGBuilder::visitGCRelocate(const CallInst &CI) {
DAG.setRoot(SpillLoad.getValue(1));
assert(SpillLoad.getNode());
- setValue(&CI, SpillLoad);
+ setValue(&Relocate, SpillLoad);
}
diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp
index fc656396ade8..1c4558cea5f5 100644
--- a/lib/CodeGen/TargetSchedule.cpp
+++ b/lib/CodeGen/TargetSchedule.cpp
@@ -212,7 +212,7 @@ unsigned TargetSchedModel::computeOperandLatency(
&& !DefMI->getDesc().OpInfo[DefOperIdx].isOptionalDef()
&& SchedModel.isComplete()) {
errs() << "DefIdx " << DefIdx << " exceeds machine model writes for "
- << *DefMI;
+ << *DefMI << " (Try with MCSchedModel.CompleteModel set to false)";
llvm_unreachable("incomplete machine model");
}
#endif
diff --git a/lib/CodeGen/WinEHPrepare.cpp b/lib/CodeGen/WinEHPrepare.cpp
index 52fb922c935a..2426c27d43dc 100644
--- a/lib/CodeGen/WinEHPrepare.cpp
+++ b/lib/CodeGen/WinEHPrepare.cpp
@@ -17,11 +17,14 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/Verifier.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
@@ -435,11 +438,12 @@ void llvm::calculateWinCXXEHStateNumbers(const Function *Fn,
calculateStateNumbersForInvokes(Fn, FuncInfo);
}
-static int addClrEHHandler(WinEHFuncInfo &FuncInfo, int ParentState,
- ClrHandlerType HandlerType, uint32_t TypeToken,
- const BasicBlock *Handler) {
+static int addClrEHHandler(WinEHFuncInfo &FuncInfo, int HandlerParentState,
+ int TryParentState, ClrHandlerType HandlerType,
+ uint32_t TypeToken, const BasicBlock *Handler) {
ClrEHUnwindMapEntry Entry;
- Entry.Parent = ParentState;
+ Entry.HandlerParentState = HandlerParentState;
+ Entry.TryParentState = TryParentState;
Entry.Handler = Handler;
Entry.HandlerType = HandlerType;
Entry.TypeToken = TypeToken;
@@ -453,82 +457,199 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn,
if (!FuncInfo.EHPadStateMap.empty())
return;
+ // This numbering assigns one state number to each catchpad and cleanuppad.
+ // It also computes two tree-like relations over states:
+ // 1) Each state has a "HandlerParentState", which is the state of the next
+ // outer handler enclosing this state's handler (same as nearest ancestor
+ // per the ParentPad linkage on EH pads, but skipping over catchswitches).
+ // 2) Each state has a "TryParentState", which:
+ // a) for a catchpad that's not the last handler on its catchswitch, is
+ // the state of the next catchpad on that catchswitch
+ // b) for all other pads, is the state of the pad whose try region is the
+ //      next outer try region enclosing this state's try region. The try
+ //      regions are not present as such in the IR, but will be inferred
+ // based on the placement of invokes and pads which reach each other
+ // by exceptional exits
+ // Catchswitches do not get their own states, but each gets mapped to the
+ // state of its first catchpad.
+
+ // Step one: walk down from outermost to innermost funclets, assigning each
+ // catchpad and cleanuppad a state number. Add an entry to the
+ // ClrEHUnwindMap for each state, recording its HandlerParentState and
+ // handler attributes. Record the TryParentState as well for each catchpad
+ // that's not the last on its catchswitch, but initialize all other entries'
+ // TryParentStates to a sentinel -1 value that the next pass will update.
+
+ // Seed a worklist with pads that have no parent.
SmallVector<std::pair<const Instruction *, int>, 8> Worklist;
-
- // Each pad needs to be able to refer to its parent, so scan the function
- // looking for top-level handlers and seed the worklist with them.
for (const BasicBlock &BB : *Fn) {
- if (!BB.isEHPad())
- continue;
- if (BB.isLandingPad())
- report_fatal_error("CoreCLR EH cannot use landingpads");
const Instruction *FirstNonPHI = BB.getFirstNonPHI();
- if (!isTopLevelPadForMSVC(FirstNonPHI))
+ const Value *ParentPad;
+ if (const auto *CPI = dyn_cast<CleanupPadInst>(FirstNonPHI))
+ ParentPad = CPI->getParentPad();
+ else if (const auto *CSI = dyn_cast<CatchSwitchInst>(FirstNonPHI))
+ ParentPad = CSI->getParentPad();
+ else
continue;
- // queue this with sentinel parent state -1 to mean unwind to caller.
- Worklist.emplace_back(FirstNonPHI, -1);
+ if (isa<ConstantTokenNone>(ParentPad))
+ Worklist.emplace_back(FirstNonPHI, -1);
}
+ // Use the worklist to visit all pads, from outer to inner. Record
+ // HandlerParentState for all pads. Record TryParentState only for catchpads
+ // that aren't the last on their catchswitch (setting all other entries'
+ // TryParentStates to an initial value of -1). This loop is also responsible
+ // for setting the EHPadStateMap entry for all catchpads, cleanuppads, and
+ // catchswitches.
while (!Worklist.empty()) {
const Instruction *Pad;
- int ParentState;
- std::tie(Pad, ParentState) = Worklist.pop_back_val();
-
- Value *ParentPad;
- int PredState;
- if (const CleanupPadInst *Cleanup = dyn_cast<CleanupPadInst>(Pad)) {
- // A cleanup can have multiple exits; don't re-process after the first.
- if (FuncInfo.EHPadStateMap.count(Cleanup))
- continue;
- // CoreCLR personality uses arity to distinguish faults from finallies.
- const BasicBlock *PadBlock = Cleanup->getParent();
+ int HandlerParentState;
+ std::tie(Pad, HandlerParentState) = Worklist.pop_back_val();
+
+ if (const auto *Cleanup = dyn_cast<CleanupPadInst>(Pad)) {
+ // Create the entry for this cleanup with the appropriate handler
+      // properties. Finally and fault handlers are distinguished by arity.
ClrHandlerType HandlerType =
- (Cleanup->getNumOperands() ? ClrHandlerType::Fault
- : ClrHandlerType::Finally);
- int NewState =
- addClrEHHandler(FuncInfo, ParentState, HandlerType, 0, PadBlock);
- FuncInfo.EHPadStateMap[Cleanup] = NewState;
- // Propagate the new state to all preds of the cleanup
- ParentPad = Cleanup->getParentPad();
- PredState = NewState;
- } else if (const auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Pad)) {
- SmallVector<const CatchPadInst *, 1> Handlers;
- for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) {
- const auto *Catch = cast<CatchPadInst>(CatchPadBB->getFirstNonPHI());
- Handlers.push_back(Catch);
- }
- FuncInfo.EHPadStateMap[CatchSwitch] = ParentState;
- int NewState = ParentState;
- for (auto HandlerI = Handlers.rbegin(), HandlerE = Handlers.rend();
- HandlerI != HandlerE; ++HandlerI) {
- const CatchPadInst *Catch = *HandlerI;
- const BasicBlock *PadBlock = Catch->getParent();
+ (Cleanup->getNumArgOperands() ? ClrHandlerType::Fault
+ : ClrHandlerType::Finally);
+ int CleanupState = addClrEHHandler(FuncInfo, HandlerParentState, -1,
+ HandlerType, 0, Pad->getParent());
+ // Queue any child EH pads on the worklist.
+ for (const User *U : Cleanup->users())
+ if (const auto *I = dyn_cast<Instruction>(U))
+ if (I->isEHPad())
+ Worklist.emplace_back(I, CleanupState);
+ // Remember this pad's state.
+ FuncInfo.EHPadStateMap[Cleanup] = CleanupState;
+ } else {
+ // Walk the handlers of this catchswitch in reverse order since all but
+ // the last need to set the following one as its TryParentState.
+ const auto *CatchSwitch = cast<CatchSwitchInst>(Pad);
+ int CatchState = -1, FollowerState = -1;
+ SmallVector<const BasicBlock *, 4> CatchBlocks(CatchSwitch->handlers());
+ for (auto CBI = CatchBlocks.rbegin(), CBE = CatchBlocks.rend();
+ CBI != CBE; ++CBI, FollowerState = CatchState) {
+ const BasicBlock *CatchBlock = *CBI;
+ // Create the entry for this catch with the appropriate handler
+ // properties.
+ const auto *Catch = cast<CatchPadInst>(CatchBlock->getFirstNonPHI());
uint32_t TypeToken = static_cast<uint32_t>(
cast<ConstantInt>(Catch->getArgOperand(0))->getZExtValue());
- NewState = addClrEHHandler(FuncInfo, NewState, ClrHandlerType::Catch,
- TypeToken, PadBlock);
- FuncInfo.EHPadStateMap[Catch] = NewState;
+ CatchState =
+ addClrEHHandler(FuncInfo, HandlerParentState, FollowerState,
+ ClrHandlerType::Catch, TypeToken, CatchBlock);
+ // Queue any child EH pads on the worklist.
+ for (const User *U : Catch->users())
+ if (const auto *I = dyn_cast<Instruction>(U))
+ if (I->isEHPad())
+ Worklist.emplace_back(I, CatchState);
+ // Remember this catch's state.
+ FuncInfo.EHPadStateMap[Catch] = CatchState;
}
- for (const auto *CatchPad : Handlers) {
- for (const User *U : CatchPad->users()) {
- const auto *UserI = cast<Instruction>(U);
- if (UserI->isEHPad())
- Worklist.emplace_back(UserI, ParentState);
+ // Associate the catchswitch with the state of its first catch.
+ assert(CatchSwitch->getNumHandlers());
+ FuncInfo.EHPadStateMap[CatchSwitch] = CatchState;
+ }
+ }
+
+ // Step two: record the TryParentState of each state. For cleanuppads that
+ // don't have cleanuprets, we may need to infer this from their child pads,
+ // so visit pads in descendant-most to ancestor-most order.
+ for (auto Entry = FuncInfo.ClrEHUnwindMap.rbegin(),
+ End = FuncInfo.ClrEHUnwindMap.rend();
+ Entry != End; ++Entry) {
+ const Instruction *Pad =
+ Entry->Handler.get<const BasicBlock *>()->getFirstNonPHI();
+ // For most pads, the TryParentState is the state associated with the
+ // unwind dest of exceptional exits from it.
+ const BasicBlock *UnwindDest;
+ if (const auto *Catch = dyn_cast<CatchPadInst>(Pad)) {
+ // If a catch is not the last in its catchswitch, its TryParentState is
+ // the state associated with the next catch in the switch, even though
+ // that's not the unwind dest of exceptions escaping the catch. Those
+ // cases were already assigned a TryParentState in the first pass, so
+ // skip them.
+ if (Entry->TryParentState != -1)
+ continue;
+ // Otherwise, get the unwind dest from the catchswitch.
+ UnwindDest = Catch->getCatchSwitch()->getUnwindDest();
+ } else {
+ const auto *Cleanup = cast<CleanupPadInst>(Pad);
+ UnwindDest = nullptr;
+ for (const User *U : Cleanup->users()) {
+ if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(U)) {
+ // Common and unambiguous case -- cleanupret indicates cleanup's
+ // unwind dest.
+ UnwindDest = CleanupRet->getUnwindDest();
+ break;
+ }
+
+ // Get an unwind dest for the user
+ const BasicBlock *UserUnwindDest = nullptr;
+ if (auto *Invoke = dyn_cast<InvokeInst>(U)) {
+ UserUnwindDest = Invoke->getUnwindDest();
+ } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(U)) {
+ UserUnwindDest = CatchSwitch->getUnwindDest();
+ } else if (auto *ChildCleanup = dyn_cast<CleanupPadInst>(U)) {
+ int UserState = FuncInfo.EHPadStateMap[ChildCleanup];
+ int UserUnwindState =
+ FuncInfo.ClrEHUnwindMap[UserState].TryParentState;
+ if (UserUnwindState != -1)
+ UserUnwindDest = FuncInfo.ClrEHUnwindMap[UserUnwindState]
+ .Handler.get<const BasicBlock *>();
}
+
+ // Not having an unwind dest for this user might indicate that it
+ // doesn't unwind, so can't be taken as proof that the cleanup itself
+ // may unwind to caller (see e.g. SimplifyUnreachable and
+ // RemoveUnwindEdge).
+ if (!UserUnwindDest)
+ continue;
+
+ // Now we have an unwind dest for the user, but we need to see if it
+ // unwinds all the way out of the cleanup or if it stays within it.
+ const Instruction *UserUnwindPad = UserUnwindDest->getFirstNonPHI();
+ const Value *UserUnwindParent;
+ if (auto *CSI = dyn_cast<CatchSwitchInst>(UserUnwindPad))
+ UserUnwindParent = CSI->getParentPad();
+ else
+ UserUnwindParent =
+ cast<CleanupPadInst>(UserUnwindPad)->getParentPad();
+
+ // The unwind stays within the cleanup iff it targets a child of the
+ // cleanup.
+ if (UserUnwindParent == Cleanup)
+ continue;
+
+ // This unwind exits the cleanup, so its dest is the cleanup's dest.
+ UnwindDest = UserUnwindDest;
+ break;
}
- PredState = NewState;
- ParentPad = CatchSwitch->getParentPad();
- } else {
- llvm_unreachable("Unexpected EH pad");
}
- // Queue all predecessors with the given state
- for (const BasicBlock *Pred : predecessors(Pad->getParent())) {
- if ((Pred = getEHPadFromPredecessor(Pred, ParentPad)))
- Worklist.emplace_back(Pred->getFirstNonPHI(), PredState);
+ // Record the state of the unwind dest as the TryParentState.
+ int UnwindDestState;
+
+ // If UnwindDest is null at this point, either the pad in question can
+ // be exited by unwind to caller, or it cannot be exited by unwind. In
+ // either case, reporting such cases as unwinding to caller is correct.
+ // This can lead to EH tables that "look strange" -- if this pad's is in
+ // a parent funclet which has other children that do unwind to an enclosing
+ // pad, the try region for this pad will be missing the "duplicate" EH
+ // clause entries that you'd expect to see covering the whole parent. That
+ // should be benign, since the unwind never actually happens. If it were
+ // an issue, we could add a subsequent pass that pushes unwind dests down
+ // from parents that have them to children that appear to unwind to caller.
+ if (!UnwindDest) {
+ UnwindDestState = -1;
+ } else {
+ UnwindDestState = FuncInfo.EHPadStateMap[UnwindDest->getFirstNonPHI()];
}
+
+ Entry->TryParentState = UnwindDestState;
}
+ // Step three: transfer information from pads to invokes.
calculateStateNumbersForInvokes(Fn, FuncInfo);
}
@@ -597,6 +718,11 @@ void WinEHPrepare::cloneCommonBlocks(Function &F) {
for (auto &Funclets : FuncletBlocks) {
BasicBlock *FuncletPadBB = Funclets.first;
std::vector<BasicBlock *> &BlocksInFunclet = Funclets.second;
+ Value *FuncletToken;
+ if (FuncletPadBB == &F.getEntryBlock())
+ FuncletToken = ConstantTokenNone::get(F.getContext());
+ else
+ FuncletToken = FuncletPadBB->getFirstNonPHI();
std::vector<std::pair<BasicBlock *, BasicBlock *>> Orig2Clone;
ValueToValueMapTy VMap;
@@ -668,15 +794,44 @@ void WinEHPrepare::cloneCommonBlocks(Function &F) {
RemapInstruction(&I, VMap,
RF_IgnoreMissingEntries | RF_NoModuleLevelChanges);
+ // Catchrets targeting cloned blocks need to be updated separately from
+ // the loop above because they are not in the current funclet.
+ SmallVector<CatchReturnInst *, 2> FixupCatchrets;
+ for (auto &BBMapping : Orig2Clone) {
+ BasicBlock *OldBlock = BBMapping.first;
+ BasicBlock *NewBlock = BBMapping.second;
+
+ FixupCatchrets.clear();
+ for (BasicBlock *Pred : predecessors(OldBlock))
+ if (auto *CatchRet = dyn_cast<CatchReturnInst>(Pred->getTerminator()))
+ if (CatchRet->getParentPad() == FuncletToken)
+ FixupCatchrets.push_back(CatchRet);
+
+ for (CatchReturnInst *CatchRet : FixupCatchrets)
+ CatchRet->setSuccessor(NewBlock);
+ }
+
auto UpdatePHIOnClonedBlock = [&](PHINode *PN, bool IsForOldBlock) {
unsigned NumPreds = PN->getNumIncomingValues();
for (unsigned PredIdx = 0, PredEnd = NumPreds; PredIdx != PredEnd;
++PredIdx) {
BasicBlock *IncomingBlock = PN->getIncomingBlock(PredIdx);
- ColorVector &IncomingColors = BlockColors[IncomingBlock];
- bool BlockInFunclet = IncomingColors.size() == 1 &&
- IncomingColors.front() == FuncletPadBB;
- if (IsForOldBlock != BlockInFunclet)
+ bool EdgeTargetsFunclet;
+ if (auto *CRI =
+ dyn_cast<CatchReturnInst>(IncomingBlock->getTerminator())) {
+ EdgeTargetsFunclet = (CRI->getParentPad() == FuncletToken);
+ } else {
+ ColorVector &IncomingColors = BlockColors[IncomingBlock];
+ assert(!IncomingColors.empty() && "Block not colored!");
+ assert((IncomingColors.size() == 1 ||
+ llvm::all_of(IncomingColors,
+ [&](BasicBlock *Color) {
+ return Color != FuncletPadBB;
+ })) &&
+ "Cloning should leave this funclet's blocks monochromatic");
+ EdgeTargetsFunclet = (IncomingColors.front() == FuncletPadBB);
+ }
+ if (IsForOldBlock != EdgeTargetsFunclet)
continue;
PN->removeIncomingValue(IncomingBlock, /*DeletePHIIfEmpty=*/false);
// Revisit the next entry.
@@ -864,7 +1019,6 @@ void WinEHPrepare::cleanupPreparedFunclets(Function &F) {
}
void WinEHPrepare::verifyPreparedFunclets(Function &F) {
- // Recolor the CFG to verify that all is well.
for (BasicBlock &BB : F) {
size_t NumColors = BlockColors[&BB].size();
assert(NumColors == 1 && "Expected monochromatic BB!");
@@ -872,12 +1026,8 @@ void WinEHPrepare::verifyPreparedFunclets(Function &F) {
report_fatal_error("Uncolored BB!");
if (NumColors > 1)
report_fatal_error("Multicolor BB!");
- if (!DisableDemotion) {
- bool EHPadHasPHI = BB.isEHPad() && isa<PHINode>(BB.begin());
- assert(!EHPadHasPHI && "EH Pad still has a PHI!");
- if (EHPadHasPHI)
- report_fatal_error("EH Pad still has a PHI!");
- }
+ assert((DisableDemotion || !(BB.isEHPad() && isa<PHINode>(BB.begin()))) &&
+ "EH Pad still has a PHI!");
}
}
@@ -896,12 +1046,17 @@ bool WinEHPrepare::prepareExplicitEH(Function &F) {
demotePHIsOnFunclets(F);
if (!DisableCleanups) {
+ DEBUG(verifyFunction(F));
removeImplausibleInstructions(F);
+ DEBUG(verifyFunction(F));
cleanupPreparedFunclets(F);
}
- verifyPreparedFunclets(F);
+ DEBUG(verifyPreparedFunclets(F));
+ // Recolor the CFG to verify that all is well.
+ DEBUG(colorFunclets(F));
+ DEBUG(verifyPreparedFunclets(F));
BlockColors.clear();
FuncletBlocks.clear();
diff --git a/lib/Fuzzer/FuzzerDriver.cpp b/lib/Fuzzer/FuzzerDriver.cpp
index dc5f8babbfe6..e8c117ef6087 100644
--- a/lib/Fuzzer/FuzzerDriver.cpp
+++ b/lib/Fuzzer/FuzzerDriver.cpp
@@ -281,6 +281,7 @@ int FuzzerDriver(const std::vector<std::string> &Args,
if (Flags.verbosity > 0 && !Dictionary.empty())
Printf("Dictionary: %zd entries\n", Dictionary.size());
Options.SaveArtifacts = !Flags.test_single_input;
+ Options.PrintNewCovPcs = Flags.print_new_cov_pcs;
Fuzzer F(USF, Options);
diff --git a/lib/Fuzzer/FuzzerFlags.def b/lib/Fuzzer/FuzzerFlags.def
index c2b506c3c8aa..6d98f66ef9c1 100644
--- a/lib/Fuzzer/FuzzerFlags.def
+++ b/lib/Fuzzer/FuzzerFlags.def
@@ -72,3 +72,5 @@ FUZZER_FLAG_STRING(exact_artifact_path,
FUZZER_FLAG_INT(drill, 0, "Experimental: fuzz using a single unit as the seed "
"corpus, then merge with the initial corpus")
FUZZER_FLAG_INT(output_csv, 0, "Enable pulse output in CSV format.")
+FUZZER_FLAG_INT(print_new_cov_pcs, 0, "If 1, print out new covered pcs.")
+
diff --git a/lib/Fuzzer/FuzzerInternal.h b/lib/Fuzzer/FuzzerInternal.h
index e96a4bc35fe2..17a2cae94a58 100644
--- a/lib/Fuzzer/FuzzerInternal.h
+++ b/lib/Fuzzer/FuzzerInternal.h
@@ -97,6 +97,7 @@ class Fuzzer {
bool SaveArtifacts = true;
bool PrintNEW = true; // Print a status line when new units are found;
bool OutputCSV = false;
+ bool PrintNewCovPcs = false;
};
Fuzzer(UserSuppliedFuzzer &USF, FuzzingOptions Options);
void AddToCorpus(const Unit &U) { Corpus.push_back(U); }
@@ -188,6 +189,7 @@ class Fuzzer {
long EpochOfLastReadOfOutputCorpus = 0;
size_t LastRecordedBlockCoverage = 0;
size_t LastRecordedCallerCalleeCoverage = 0;
+ size_t LastCoveragePcBufferLen = 0;
};
class SimpleUserSuppliedFuzzer: public UserSuppliedFuzzer {
diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp
index 7ea82f4f15dd..0b1d9d9686a2 100644
--- a/lib/Fuzzer/FuzzerLoop.cpp
+++ b/lib/Fuzzer/FuzzerLoop.cpp
@@ -31,6 +31,8 @@ void __sanitizer_set_death_callback(void (*callback)(void));
__attribute__((weak)) size_t __sanitizer_get_number_of_counters();
__attribute__((weak))
uintptr_t __sanitizer_update_counter_bitset_and_clear_counters(uint8_t *bitset);
+__attribute__((weak)) uintptr_t
+__sanitizer_get_coverage_pc_buffer(uintptr_t **data);
}
namespace fuzzer {
@@ -249,7 +251,21 @@ void Fuzzer::ExecuteCallback(const Unit &U) {
size_t Fuzzer::RecordBlockCoverage() {
CHECK_WEAK_API_FUNCTION(__sanitizer_get_total_unique_coverage);
- return LastRecordedBlockCoverage = __sanitizer_get_total_unique_coverage();
+ uintptr_t PrevCoverage = LastRecordedBlockCoverage;
+ LastRecordedBlockCoverage = __sanitizer_get_total_unique_coverage();
+
+ if (PrevCoverage == LastRecordedBlockCoverage || !Options.PrintNewCovPcs)
+ return LastRecordedBlockCoverage;
+
+ uintptr_t PrevBufferLen = LastCoveragePcBufferLen;
+ uintptr_t *CoverageBuf;
+ LastCoveragePcBufferLen = __sanitizer_get_coverage_pc_buffer(&CoverageBuf);
+ assert(CoverageBuf);
+ for (size_t i = PrevBufferLen; i < LastCoveragePcBufferLen; ++i) {
+ Printf("0x%x\n", CoverageBuf[i]);
+ }
+
+ return LastRecordedBlockCoverage;
}
size_t Fuzzer::RecordCallerCalleeCoverage() {
diff --git a/lib/Fuzzer/FuzzerMutate.cpp b/lib/Fuzzer/FuzzerMutate.cpp
index c3fa37a435d6..84ee18e69fb0 100644
--- a/lib/Fuzzer/FuzzerMutate.cpp
+++ b/lib/Fuzzer/FuzzerMutate.cpp
@@ -117,11 +117,18 @@ size_t MutationDispatcher::Mutate_AddWordFromDictionary(uint8_t *Data,
assert(!D.empty());
if (D.empty()) return 0;
const Unit &Word = D[Rand(D.size())];
- if (Size + Word.size() > MaxSize) return 0;
- size_t Idx = Rand(Size + 1);
- memmove(Data + Idx + Word.size(), Data + Idx, Size - Idx);
- memcpy(Data + Idx, Word.data(), Word.size());
- return Size + Word.size();
+ if (Rand.RandBool()) { // Insert Word.
+ if (Size + Word.size() > MaxSize) return 0;
+ size_t Idx = Rand(Size + 1);
+ memmove(Data + Idx + Word.size(), Data + Idx, Size - Idx);
+ memcpy(Data + Idx, Word.data(), Word.size());
+ return Size + Word.size();
+ } else { // Overwrite some bytes with Word.
+ if (Word.size() > Size) return 0;
+ size_t Idx = Rand(Size - Word.size());
+ memcpy(Data + Idx, Word.data(), Word.size());
+ return Size;
+ }
}
size_t MutationDispatcher::Mutate_ChangeASCIIInteger(uint8_t *Data, size_t Size,
diff --git a/lib/Fuzzer/FuzzerTraceState.cpp b/lib/Fuzzer/FuzzerTraceState.cpp
index 8204a2ddc7c8..241c2f0ce590 100644
--- a/lib/Fuzzer/FuzzerTraceState.cpp
+++ b/lib/Fuzzer/FuzzerTraceState.cpp
@@ -77,6 +77,7 @@
#include <algorithm>
#include <cstring>
+#include <thread>
#include <unordered_map>
#if !LLVM_FUZZER_SUPPORTS_DFSAN
@@ -172,8 +173,13 @@ struct TraceBasedMutation {
class TraceState {
public:
- TraceState(const Fuzzer::FuzzingOptions &Options, const Unit &CurrentUnit)
- : Options(Options), CurrentUnit(CurrentUnit) {}
+ TraceState(const Fuzzer::FuzzingOptions &Options, const Unit &CurrentUnit)
+ : Options(Options), CurrentUnit(CurrentUnit) {
+ // Current trace collection is not thread-friendly and it probably
+ // does not have to be such, but at least we should not crash in presence
+ // of threads. So, just ignore all traces coming from all threads but one.
+ IsMyThread = true;
+ }
LabelRange GetLabelRange(dfsan_label L);
void DFSanCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType,
@@ -213,8 +219,11 @@ class TraceState {
LabelRange LabelRanges[1 << (sizeof(dfsan_label) * 8)];
const Fuzzer::FuzzingOptions &Options;
const Unit &CurrentUnit;
+ static thread_local bool IsMyThread;
};
+thread_local bool TraceState::IsMyThread;
+
LabelRange TraceState::GetLabelRange(dfsan_label L) {
LabelRange &LR = LabelRanges[L];
if (LR.Beg < LR.End || L == 0)
@@ -238,7 +247,7 @@ void TraceState::DFSanCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType,
uint64_t Arg1, uint64_t Arg2, dfsan_label L1,
dfsan_label L2) {
assert(ReallyHaveDFSan());
- if (!RecordingTraces) return;
+ if (!RecordingTraces || !IsMyThread) return;
if (L1 == 0 && L2 == 0)
return; // Not actionable.
if (L1 != 0 && L2 != 0)
@@ -267,7 +276,7 @@ void TraceState::DFSanSwitchCallback(uint64_t PC, size_t ValSizeInBits,
uint64_t Val, size_t NumCases,
uint64_t *Cases, dfsan_label L) {
assert(ReallyHaveDFSan());
- if (!RecordingTraces) return;
+ if (!RecordingTraces || !IsMyThread) return;
if (!L) return; // Not actionable.
LabelRange LR = GetLabelRange(L);
size_t ValSize = ValSizeInBits / 8;
@@ -312,7 +321,7 @@ int TraceState::TryToAddDesiredData(uint64_t PresentData, uint64_t DesiredData,
void TraceState::TraceCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType,
uint64_t Arg1, uint64_t Arg2) {
- if (!RecordingTraces) return;
+ if (!RecordingTraces || !IsMyThread) return;
int Added = 0;
if (Options.Verbosity >= 3)
Printf("TraceCmp %zd/%zd: %p %zd %zd\n", CmpSize, CmpType, PC, Arg1, Arg2);
@@ -327,7 +336,7 @@ void TraceState::TraceCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType,
void TraceState::TraceSwitchCallback(uintptr_t PC, size_t ValSizeInBits,
uint64_t Val, size_t NumCases,
uint64_t *Cases) {
- if (!RecordingTraces) return;
+ if (!RecordingTraces || !IsMyThread) return;
size_t ValSize = ValSizeInBits / 8;
bool TryShort = IsTwoByteData(Val);
for (size_t i = 0; i < NumCases; i++)
diff --git a/lib/Fuzzer/test/CMakeLists.txt b/lib/Fuzzer/test/CMakeLists.txt
index 674fcc3c9f8c..cd0b167eb388 100644
--- a/lib/Fuzzer/test/CMakeLists.txt
+++ b/lib/Fuzzer/test/CMakeLists.txt
@@ -26,6 +26,7 @@ set(Tests
StrcmpTest
StrncmpTest
SwitchTest
+ ThreadedTest
TimeoutTest
)
diff --git a/lib/Fuzzer/test/ThreadedTest.cpp b/lib/Fuzzer/test/ThreadedTest.cpp
new file mode 100644
index 000000000000..7aa114a41f36
--- /dev/null
+++ b/lib/Fuzzer/test/ThreadedTest.cpp
@@ -0,0 +1,23 @@
+// Threaded test for a fuzzer. The fuzzer should not crash.
+#include <assert.h>
+#include <cstdint>
+#include <cstddef>
+#include <cstring>
+#include <thread>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ if (Size < 8) return 0;
+ assert(Data);
+ auto C = [&] {
+ size_t Res = 0;
+ for (size_t i = 0; i < Size / 2; i++)
+ Res += memcmp(Data, Data + Size / 2, 4);
+ return Res;
+ };
+ std::thread T[] = {std::thread(C), std::thread(C), std::thread(C),
+ std::thread(C), std::thread(C), std::thread(C)};
+ for (auto &X : T)
+ X.join();
+ return 0;
+}
+
diff --git a/lib/Fuzzer/test/fuzzer-threaded.test b/lib/Fuzzer/test/fuzzer-threaded.test
new file mode 100644
index 000000000000..c58a33456ccb
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-threaded.test
@@ -0,0 +1,7 @@
+CHECK: Done 1000 runs in
+
+RUN: LLVMFuzzer-ThreadedTest -use_traces=1 -runs=1000 2>&1 | FileCheck %s
+RUN: LLVMFuzzer-ThreadedTest -use_traces=1 -runs=1000 2>&1 | FileCheck %s
+RUN: LLVMFuzzer-ThreadedTest -use_traces=1 -runs=1000 2>&1 | FileCheck %s
+RUN: LLVMFuzzer-ThreadedTest -use_traces=1 -runs=1000 2>&1 | FileCheck %s
+
diff --git a/lib/Fuzzer/test/fuzzer.test b/lib/Fuzzer/test/fuzzer.test
index 810410df6fc7..150fc7202b00 100644
--- a/lib/Fuzzer/test/fuzzer.test
+++ b/lib/Fuzzer/test/fuzzer.test
@@ -30,3 +30,9 @@ RUN: LLVMFuzzer-SimpleDictionaryTest -seed=1 -runs=100000
RUN: not LLVMFuzzer-UninstrumentedTest-Uninstrumented 2>&1 | FileCheck %s --check-prefix=UNINSTRUMENTED
UNINSTRUMENTED: ERROR: __sanitizer_set_death_callback is not defined. Exiting.
+
+RUN: LLVMFuzzer-SimpleTest -print_new_cov_pcs=1 2>&1 | FileCheck %s --check-prefix=PCS
+PCS:{{^0x[a-f0-9]+}}
+PCS:NEW
+PCS:BINGO
+
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index 185db47f07e5..1ebe9b7ee5bc 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -2060,7 +2060,7 @@ private:
// printGCRelocateComment - print comment after call to the gc.relocate
// intrinsic indicating base and derived pointer names.
- void printGCRelocateComment(const Value &V);
+ void printGCRelocateComment(const GCRelocateInst &Relocate);
};
} // namespace
@@ -2722,14 +2722,11 @@ void AssemblyWriter::printInstructionLine(const Instruction &I) {
/// printGCRelocateComment - print comment after call to the gc.relocate
/// intrinsic indicating base and derived pointer names.
-void AssemblyWriter::printGCRelocateComment(const Value &V) {
- assert(isGCRelocate(&V));
- GCRelocateOperands GCOps(cast<Instruction>(&V));
-
+void AssemblyWriter::printGCRelocateComment(const GCRelocateInst &Relocate) {
Out << " ; (";
- writeOperand(GCOps.getBasePtr(), false);
+ writeOperand(Relocate.getBasePtr(), false);
Out << ", ";
- writeOperand(GCOps.getDerivedPtr(), false);
+ writeOperand(Relocate.getDerivedPtr(), false);
Out << ")";
}
@@ -2737,8 +2734,8 @@ void AssemblyWriter::printGCRelocateComment(const Value &V) {
/// which slot it occupies.
///
void AssemblyWriter::printInfoComment(const Value &V) {
- if (isGCRelocate(&V))
- printGCRelocateComment(V);
+ if (const auto *Relocate = dyn_cast<GCRelocateInst>(&V))
+ printGCRelocateComment(*Relocate);
if (AnnotationWriter)
AnnotationWriter->printInfoComment(V, Out);
diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp
index bcf7dc365ce5..6c01bb645629 100644
--- a/lib/IR/Attributes.cpp
+++ b/lib/IR/Attributes.cpp
@@ -641,14 +641,15 @@ AttributeSet AttributeSet::get(LLVMContext &C,
if (Attrs.empty())
return AttributeSet();
-#ifndef NDEBUG
- for (unsigned i = 0, e = Attrs.size(); i != e; ++i) {
- assert((!i || Attrs[i-1].first <= Attrs[i].first) &&
- "Misordered Attributes list!");
- assert(!Attrs[i].second.hasAttribute(Attribute::None) &&
- "Pointless attribute!");
- }
-#endif
+ assert(std::is_sorted(Attrs.begin(), Attrs.end(),
+ [](const std::pair<unsigned, Attribute> &LHS,
+ const std::pair<unsigned, Attribute> &RHS) {
+ return LHS.first < RHS.first;
+ }) && "Misordered Attributes list!");
+ assert(std::none_of(Attrs.begin(), Attrs.end(),
+ [](const std::pair<unsigned, Attribute> &Pair) {
+ return Pair.second.hasAttribute(Attribute::None);
+ }) && "Pointless attribute!");
// Create a vector if (unsigned, AttributeSetNode*) pairs from the attributes
// list.
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index a0bd2c9698e8..4b33d2e66ea1 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -76,22 +76,21 @@ iplist<Instruction>::iterator Instruction::eraseFromParent() {
return getParent()->getInstList().erase(getIterator());
}
-/// insertBefore - Insert an unlinked instructions into a basic block
-/// immediately before the specified instruction.
+/// Insert an unlinked instruction into a basic block immediately before the
+/// specified instruction.
void Instruction::insertBefore(Instruction *InsertPos) {
InsertPos->getParent()->getInstList().insert(InsertPos->getIterator(), this);
}
-/// insertAfter - Insert an unlinked instructions into a basic block
-/// immediately after the specified instruction.
+/// Insert an unlinked instruction into a basic block immediately after the
+/// specified instruction.
void Instruction::insertAfter(Instruction *InsertPos) {
InsertPos->getParent()->getInstList().insertAfter(InsertPos->getIterator(),
this);
}
-/// moveBefore - Unlink this instruction from its current basic block and
-/// insert it into the basic block that MovePos lives in, right before
-/// MovePos.
+/// Unlink this instruction from its current basic block and insert it into the
+/// basic block that MovePos lives in, right before MovePos.
void Instruction::moveBefore(Instruction *MovePos) {
MovePos->getParent()->getInstList().splice(
MovePos->getIterator(), getParent()->getInstList(), getIterator());
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index 4ae2fd522b52..7c64ca7b7275 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -609,20 +609,6 @@ void InvokeInst::setSuccessorV(unsigned idx, BasicBlock *B) {
return setSuccessor(idx, B);
}
-bool InvokeInst::hasFnAttrImpl(Attribute::AttrKind A) const {
- if (AttributeList.hasAttribute(AttributeSet::FunctionIndex, A))
- return true;
-
- // Operand bundles override attributes on the called function, but don't
- // override attributes directly present on the invoke instruction.
- if (isFnAttrDisallowedByOpBundle(A))
- return false;
-
- if (const Function *F = getCalledFunction())
- return F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, A);
- return false;
-}
-
bool InvokeInst::paramHasAttr(unsigned i, Attribute::AttrKind A) const {
assert(i < (getNumArgOperands() + 1) && "Param index out of bounds!");
@@ -934,6 +920,17 @@ void CatchSwitchInst::addHandler(BasicBlock *Handler) {
getOperandList()[OpNo] = Handler;
}
+void CatchSwitchInst::removeHandler(handler_iterator HI) {
+ // Move all subsequent handlers up one.
+ Use *EndDst = op_end() - 1;
+ for (Use *CurDst = HI.getCurrent(); CurDst != EndDst; ++CurDst)
+ *CurDst = *(CurDst + 1);
+ // Null out the last handler use.
+ *EndDst = nullptr;
+
+ setNumHungOffUseOperands(getNumOperands() - 1);
+}
+
BasicBlock *CatchSwitchInst::getSuccessorV(unsigned idx) const {
return getSuccessor(idx);
}
diff --git a/lib/IR/Metadata.cpp b/lib/IR/Metadata.cpp
index ab1ba5e2035b..d8eaceb9ea2b 100644
--- a/lib/IR/Metadata.cpp
+++ b/lib/IR/Metadata.cpp
@@ -190,6 +190,8 @@ void ReplaceableMetadataImpl::moveRef(void *Ref, void *New,
void ReplaceableMetadataImpl::replaceAllUsesWith(Metadata *MD) {
assert(!(MD && isa<MDNode>(MD) && cast<MDNode>(MD)->isTemporary()) &&
"Expected non-temp node");
+ assert(CanReplace &&
+ "Attempted to replace Metadata marked for no replacement");
if (UseMap.empty())
return;
@@ -555,7 +557,7 @@ void MDNode::decrementUnresolvedOperandCount() {
resolve();
}
-void MDNode::resolveCycles(bool MDMaterialized) {
+void MDNode::resolveCycles(bool AllowTemps) {
if (isResolved())
return;
@@ -568,7 +570,7 @@ void MDNode::resolveCycles(bool MDMaterialized) {
if (!N)
continue;
- if (N->isTemporary() && !MDMaterialized)
+ if (N->isTemporary() && AllowTemps)
continue;
assert(!N->isTemporary() &&
"Expected all forward declarations to be resolved");
diff --git a/lib/IR/Statepoint.cpp b/lib/IR/Statepoint.cpp
index d45c1883ef9e..27a990eaff81 100644
--- a/lib/IR/Statepoint.cpp
+++ b/lib/IR/Statepoint.cpp
@@ -40,20 +40,7 @@ bool llvm::isStatepoint(const Value &inst) {
}
bool llvm::isGCRelocate(const ImmutableCallSite &CS) {
- if (!CS.getInstruction()) {
- // This is not a call site
- return false;
- }
-
- return isGCRelocate(CS.getInstruction());
-}
-bool llvm::isGCRelocate(const Value *inst) {
- if (const CallInst *call = dyn_cast<CallInst>(inst)) {
- if (const Function *F = call->getCalledFunction()) {
- return F->getIntrinsicID() == Intrinsic::experimental_gc_relocate;
- }
- }
- return false;
+ return CS.getInstruction() && isa<GCRelocateInst>(CS.getInstruction());
}
bool llvm::isGCResult(const ImmutableCallSite &CS) {
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 81c87e4759b7..6dfb05d94491 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -1657,14 +1657,14 @@ void Verifier::VerifyStatepoint(ImmutableCallSite CS) {
const CallInst *Call = dyn_cast<const CallInst>(U);
Assert(Call, "illegal use of statepoint token", &CI, U);
if (!Call) continue;
- Assert(isGCRelocate(Call) || isGCResult(Call),
+ Assert(isa<GCRelocateInst>(Call) || isGCResult(Call),
"gc.result or gc.relocate are the only value uses"
"of a gc.statepoint",
&CI, U);
if (isGCResult(Call)) {
Assert(Call->getArgOperand(0) == &CI,
"gc.result connected to wrong gc.statepoint", &CI, Call);
- } else if (isGCRelocate(Call)) {
+ } else if (isa<GCRelocateInst>(Call)) {
Assert(Call->getArgOperand(0) == &CI,
"gc.relocate connected to wrong gc.statepoint", &CI, Call);
}
@@ -3019,8 +3019,7 @@ void Verifier::visitCleanupPadInst(CleanupPadInst &CPI) {
&CPI);
auto *ParentPad = CPI.getParentPad();
- Assert(isa<CatchSwitchInst>(ParentPad) || isa<ConstantTokenNone>(ParentPad) ||
- isa<CleanupPadInst>(ParentPad) || isa<CatchPadInst>(ParentPad),
+ Assert(isa<ConstantTokenNone>(ParentPad) || isa<FuncletPadInst>(ParentPad),
"CleanupPadInst has an invalid parent.", &CPI);
User *FirstUser = nullptr;
@@ -3077,10 +3076,17 @@ void Verifier::visitCatchSwitchInst(CatchSwitchInst &CatchSwitch) {
}
auto *ParentPad = CatchSwitch.getParentPad();
- Assert(isa<CatchSwitchInst>(ParentPad) || isa<ConstantTokenNone>(ParentPad) ||
- isa<CleanupPadInst>(ParentPad) || isa<CatchPadInst>(ParentPad),
+ Assert(isa<ConstantTokenNone>(ParentPad) || isa<FuncletPadInst>(ParentPad),
"CatchSwitchInst has an invalid parent.", ParentPad);
+ Assert(CatchSwitch.getNumHandlers() != 0,
+ "CatchSwitchInst cannot have empty handler list", &CatchSwitch);
+
+ for (BasicBlock *Handler : CatchSwitch.handlers()) {
+ Assert(isa<CatchPadInst>(Handler->getFirstNonPHI()),
+ "CatchSwitchInst handlers must be catchpads", &CatchSwitch, Handler);
+ }
+
visitTerminatorInst(CatchSwitch);
}
@@ -3675,8 +3681,8 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
// Verify rest of the relocate arguments
- GCRelocateOperands Ops(CS);
- ImmutableCallSite StatepointCS(Ops.getStatepoint());
+ ImmutableCallSite StatepointCS(
+ cast<GCRelocateInst>(*CS.getInstruction()).getStatepoint());
// Both the base and derived must be piped through the safepoint
Value* Base = CS.getArgOperand(1);
@@ -3731,14 +3737,14 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
// Relocated value must be a pointer type, but gc_relocate does not need to return the
// same pointer type as the relocated pointer. It can be casted to the correct type later
// if it's desired. However, they must have the same address space.
- GCRelocateOperands Operands(CS);
- Assert(Operands.getDerivedPtr()->getType()->isPointerTy(),
+ GCRelocateInst &Relocate = cast<GCRelocateInst>(*CS.getInstruction());
+ Assert(Relocate.getDerivedPtr()->getType()->isPointerTy(),
"gc.relocate: relocated value must be a gc pointer", CS);
// gc_relocate return type must be a pointer type, and is verified earlier in
// VerifyIntrinsicType().
Assert(cast<PointerType>(CS.getType())->getAddressSpace() ==
- cast<PointerType>(Operands.getDerivedPtr()->getType())->getAddressSpace(),
+ cast<PointerType>(Relocate.getDerivedPtr()->getType())->getAddressSpace(),
"gc.relocate: relocating a pointer shouldn't change its address space", CS);
break;
}
diff --git a/lib/Linker/IRMover.cpp b/lib/Linker/IRMover.cpp
index fa6e37517fc4..309690f61d74 100644
--- a/lib/Linker/IRMover.cpp
+++ b/lib/Linker/IRMover.cpp
@@ -524,6 +524,23 @@ public:
ValueMapperFlags = ValueMapperFlags | RF_HaveUnmaterializedMetadata;
}
+ ~IRLinker() {
+ // In the case where we are not linking metadata, we unset the CanReplace
+ // flag on all temporary metadata in the MetadataToIDs map to ensure
+ // none was replaced while being a map key. Now that we are destructing
+ // the map, set the flag back to true, so that it is replaceable during
+ // metadata linking.
+ if (!shouldLinkMetadata()) {
+ for (auto MDI : MetadataToIDs) {
+ Metadata *MD = const_cast<Metadata *>(MDI.first);
+ MDNode *Node = dyn_cast<MDNode>(MD);
+ assert((Node && Node->isTemporary()) &&
+ "Found non-temp metadata in map when not linking metadata");
+ Node->setCanReplace(true);
+ }
+ }
+ }
+
bool run();
Value *materializeDeclFor(Value *V, bool ForAlias);
void materializeInitFor(GlobalValue *New, GlobalValue *Old, bool ForAlias);
@@ -1111,7 +1128,8 @@ bool IRLinker::linkFunctionBody(Function &Dst, Function &Src) {
// a function and before remapping metadata on instructions below
// in RemapInstruction, as the saved mapping is used to handle
// the temporary metadata hanging off instructions.
- SrcM.getMaterializer()->saveMetadataList(MetadataToIDs, true);
+ SrcM.getMaterializer()->saveMetadataList(MetadataToIDs,
+ /* OnlyTempMD = */ true);
// Link in the prefix data.
if (Src.hasPrefixData())
@@ -1514,7 +1532,8 @@ bool IRLinker::run() {
// Ensure metadata materialized
if (SrcM.getMaterializer()->materializeMetadata())
return true;
- SrcM.getMaterializer()->saveMetadataList(MetadataToIDs, false);
+ SrcM.getMaterializer()->saveMetadataList(MetadataToIDs,
+ /* OnlyTempMD = */ false);
}
linkNamedMDNodes();
diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
index a99ac4eca59e..dafa7683b1ab 100644
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp
@@ -514,13 +514,13 @@ static void EmitGenDwarfAbbrev(MCStreamer *MCOS) {
MCOS->EmitULEB128IntValue(1);
MCOS->EmitULEB128IntValue(dwarf::DW_TAG_compile_unit);
MCOS->EmitIntValue(dwarf::DW_CHILDREN_yes, 1);
- EmitAbbrev(MCOS, dwarf::DW_AT_stmt_list,
- context.getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
- : dwarf::DW_FORM_data4);
+ EmitAbbrev(MCOS, dwarf::DW_AT_stmt_list, context.getDwarfVersion() >= 4
+ ? dwarf::DW_FORM_sec_offset
+ : dwarf::DW_FORM_data4);
if (context.getGenDwarfSectionSyms().size() > 1 &&
context.getDwarfVersion() >= 3) {
- EmitAbbrev(MCOS, dwarf::DW_AT_ranges,
- context.getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
+ EmitAbbrev(MCOS, dwarf::DW_AT_ranges, context.getDwarfVersion() >= 4
+ ? dwarf::DW_FORM_sec_offset
: dwarf::DW_FORM_data4);
} else {
EmitAbbrev(MCOS, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr);
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index 028f2e955b21..34f49cac1628 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -1,4 +1,4 @@
-//===-- MObjectFileInfo.cpp - Object File Information ---------------------===//
+//===-- MCObjectFileInfo.cpp - Object File Information --------------------===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/MC/MCSubtargetInfo.cpp b/lib/MC/MCSubtargetInfo.cpp
index dc864d3a17f8..1b592504b1e4 100644
--- a/lib/MC/MCSubtargetInfo.cpp
+++ b/lib/MC/MCSubtargetInfo.cpp
@@ -63,31 +63,30 @@ FeatureBitset MCSubtargetInfo::ToggleFeature(const FeatureBitset &FB) {
/// ToggleFeature - Toggle a feature and returns the re-computed feature
/// bits. This version will also change all implied bits.
FeatureBitset MCSubtargetInfo::ToggleFeature(StringRef FS) {
- SubtargetFeatures Features;
- FeatureBits = Features.ToggleFeature(FeatureBits, FS, ProcFeatures);
+ SubtargetFeatures::ToggleFeature(FeatureBits, FS, ProcFeatures);
return FeatureBits;
}
FeatureBitset MCSubtargetInfo::ApplyFeatureFlag(StringRef FS) {
- SubtargetFeatures Features;
- FeatureBits = Features.ApplyFeatureFlag(FeatureBits, FS, ProcFeatures);
+ SubtargetFeatures::ApplyFeatureFlag(FeatureBits, FS, ProcFeatures);
return FeatureBits;
}
const MCSchedModel &MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const {
assert(ProcSchedModels && "Processor machine model not available!");
- size_t NumProcs = ProcDesc.size();
- assert(std::is_sorted(ProcSchedModels, ProcSchedModels+NumProcs,
+ ArrayRef<SubtargetInfoKV> SchedModels(ProcSchedModels, ProcDesc.size());
+
+ assert(std::is_sorted(SchedModels.begin(), SchedModels.end(),
[](const SubtargetInfoKV &LHS, const SubtargetInfoKV &RHS) {
return strcmp(LHS.Key, RHS.Key) < 0;
}) &&
"Processor machine model table is not sorted");
// Find entry
- const SubtargetInfoKV *Found =
- std::lower_bound(ProcSchedModels, ProcSchedModels+NumProcs, CPU);
- if (Found == ProcSchedModels+NumProcs || StringRef(Found->Key) != CPU) {
+ auto Found =
+ std::lower_bound(SchedModels.begin(), SchedModels.end(), CPU);
+ if (Found == SchedModels.end() || StringRef(Found->Key) != CPU) {
if (CPU != "help") // Don't error if the user asked for help.
errs() << "'" << CPU
<< "' is not a recognized processor for this target"
diff --git a/lib/MC/SubtargetFeature.cpp b/lib/MC/SubtargetFeature.cpp
index b642f17f0e79..7cce0fe756ef 100644
--- a/lib/MC/SubtargetFeature.cpp
+++ b/lib/MC/SubtargetFeature.cpp
@@ -160,10 +160,9 @@ void ClearImpliedBits(FeatureBitset &Bits,
}
}
-/// ToggleFeature - Toggle a feature and returns the newly updated feature
-/// bits.
-FeatureBitset
-SubtargetFeatures::ToggleFeature(FeatureBitset Bits, StringRef Feature,
+/// ToggleFeature - Toggle a feature and update the feature bits.
+void
+SubtargetFeatures::ToggleFeature(FeatureBitset &Bits, StringRef Feature,
ArrayRef<SubtargetFeatureKV> FeatureTable) {
// Find feature in table.
@@ -186,12 +185,9 @@ SubtargetFeatures::ToggleFeature(FeatureBitset Bits, StringRef Feature,
<< "' is not a recognized feature for this target"
<< " (ignoring feature)\n";
}
-
- return Bits;
}
-FeatureBitset
-SubtargetFeatures::ApplyFeatureFlag(FeatureBitset Bits, StringRef Feature,
+void SubtargetFeatures::ApplyFeatureFlag(FeatureBitset &Bits, StringRef Feature,
ArrayRef<SubtargetFeatureKV> FeatureTable) {
assert(hasFlag(Feature));
@@ -203,7 +199,7 @@ SubtargetFeatures::ApplyFeatureFlag(FeatureBitset Bits, StringRef Feature,
if (FeatureEntry) {
// Enable/disable feature in bits
if (isEnabled(Feature)) {
- Bits |= FeatureEntry->Value;
+ Bits |= FeatureEntry->Value;
// For each feature that this implies, set it.
SetImpliedBits(Bits, FeatureEntry, FeatureTable);
@@ -218,8 +214,6 @@ SubtargetFeatures::ApplyFeatureFlag(FeatureBitset Bits, StringRef Feature,
<< "' is not a recognized feature for this target"
<< " (ignoring feature)\n";
}
-
- return Bits;
}
@@ -234,14 +228,10 @@ SubtargetFeatures::getFeatureBits(StringRef CPU,
return FeatureBitset();
#ifndef NDEBUG
- for (size_t i = 1, e = CPUTable.size(); i != e; ++i) {
- assert(strcmp(CPUTable[i - 1].Key, CPUTable[i].Key) < 0 &&
- "CPU table is not sorted");
- }
- for (size_t i = 1, e = FeatureTable.size(); i != e; ++i) {
- assert(strcmp(FeatureTable[i - 1].Key, FeatureTable[i].Key) < 0 &&
- "CPU features table is not sorted");
- }
+ assert(std::is_sorted(std::begin(CPUTable), std::end(CPUTable)) &&
+ "CPU table is not sorted");
+ assert(std::is_sorted(std::begin(FeatureTable), std::end(FeatureTable)) &&
+ "CPU features table is not sorted");
#endif
// Resulting bits
FeatureBitset Bits;
@@ -277,7 +267,7 @@ SubtargetFeatures::getFeatureBits(StringRef CPU,
if (Feature == "+help")
Help(CPUTable, FeatureTable);
- Bits = ApplyFeatureFlag(Bits, Feature, FeatureTable);
+ ApplyFeatureFlag(Bits, Feature, FeatureTable);
}
return Bits;
diff --git a/lib/ProfileData/CoverageMappingReader.cpp b/lib/ProfileData/CoverageMappingReader.cpp
index a0f82a0d4ede..32c692d8073a 100644
--- a/lib/ProfileData/CoverageMappingReader.cpp
+++ b/lib/ProfileData/CoverageMappingReader.cpp
@@ -316,12 +316,17 @@ static std::error_code readCoverageMappingData(
// Read the records in the coverage data section.
for (const char *Buf = Data.data(), *End = Buf + Data.size(); Buf < End;) {
- if (Buf + 4 * sizeof(uint32_t) > End)
+ if (Buf + sizeof(CovMapHeader) > End)
return coveragemap_error::malformed;
- uint32_t NRecords = endian::readNext<uint32_t, Endian, unaligned>(Buf);
- uint32_t FilenamesSize = endian::readNext<uint32_t, Endian, unaligned>(Buf);
- uint32_t CoverageSize = endian::readNext<uint32_t, Endian, unaligned>(Buf);
- uint32_t Version = endian::readNext<uint32_t, Endian, unaligned>(Buf);
+ auto CovHeader = reinterpret_cast<const coverage::CovMapHeader *>(Buf);
+ uint32_t NRecords =
+ endian::byte_swap<uint32_t, Endian>(CovHeader->NRecords);
+ uint32_t FilenamesSize =
+ endian::byte_swap<uint32_t, Endian>(CovHeader->FilenamesSize);
+ uint32_t CoverageSize =
+ endian::byte_swap<uint32_t, Endian>(CovHeader->CoverageSize);
+ uint32_t Version = endian::byte_swap<uint32_t, Endian>(CovHeader->Version);
+ Buf = reinterpret_cast<const char *>(++CovHeader);
switch (Version) {
case CoverageMappingVersion1:
diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp
index f5acd23129dc..027f0f78c546 100644
--- a/lib/ProfileData/InstrProf.cpp
+++ b/lib/ProfileData/InstrProf.cpp
@@ -12,12 +12,15 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/Module.h"
#include "llvm/IR/GlobalVariable.h"
-#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Compression.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
#include "llvm/Support/ManagedStatic.h"
using namespace llvm;
@@ -162,6 +165,98 @@ GlobalVariable *createPGOFuncNameVar(Function &F, StringRef FuncName) {
return createPGOFuncNameVar(*F.getParent(), F.getLinkage(), FuncName);
}
+int collectPGOFuncNameStrings(const std::vector<std::string> &NameStrs,
+ bool doCompression, std::string &Result) {
+ uint8_t Header[16], *P = Header;
+ std::string UncompressedNameStrings =
+ join(NameStrs.begin(), NameStrs.end(), StringRef(" "));
+
+ unsigned EncLen = encodeULEB128(UncompressedNameStrings.length(), P);
+ P += EncLen;
+
+ auto WriteStringToResult = [&](size_t CompressedLen,
+ const std::string &InputStr) {
+ EncLen = encodeULEB128(CompressedLen, P);
+ P += EncLen;
+ char *HeaderStr = reinterpret_cast<char *>(&Header[0]);
+ unsigned HeaderLen = P - &Header[0];
+ Result.append(HeaderStr, HeaderLen);
+ Result += InputStr;
+ return 0;
+ };
+
+ if (!doCompression)
+ return WriteStringToResult(0, UncompressedNameStrings);
+
+ SmallVector<char, 128> CompressedNameStrings;
+ zlib::Status Success =
+ zlib::compress(StringRef(UncompressedNameStrings), CompressedNameStrings,
+ zlib::BestSizeCompression);
+
+ if (Success != zlib::StatusOK)
+ return 1;
+
+ return WriteStringToResult(
+ CompressedNameStrings.size(),
+ std::string(CompressedNameStrings.data(), CompressedNameStrings.size()));
+}
+
+StringRef getPGOFuncNameInitializer(GlobalVariable *NameVar) {
+ auto *Arr = cast<ConstantDataArray>(NameVar->getInitializer());
+ StringRef NameStr =
+ Arr->isCString() ? Arr->getAsCString() : Arr->getAsString();
+ return NameStr;
+}
+
+int collectPGOFuncNameStrings(const std::vector<GlobalVariable *> &NameVars,
+ std::string &Result) {
+ std::vector<std::string> NameStrs;
+ for (auto *NameVar : NameVars) {
+ NameStrs.push_back(getPGOFuncNameInitializer(NameVar));
+ }
+ return collectPGOFuncNameStrings(NameStrs, zlib::isAvailable(), Result);
+}
+
+int readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) {
+ const uint8_t *P = reinterpret_cast<const uint8_t *>(NameStrings.data());
+ const uint8_t *EndP = reinterpret_cast<const uint8_t *>(NameStrings.data() +
+ NameStrings.size());
+ while (P < EndP) {
+ uint32_t N;
+ uint64_t UncompressedSize = decodeULEB128(P, &N);
+ P += N;
+ uint64_t CompressedSize = decodeULEB128(P, &N);
+ P += N;
+ bool isCompressed = (CompressedSize != 0);
+ SmallString<128> UncompressedNameStrings;
+ StringRef NameStrings;
+ if (isCompressed) {
+ StringRef CompressedNameStrings(reinterpret_cast<const char *>(P),
+ CompressedSize);
+ if (zlib::uncompress(CompressedNameStrings, UncompressedNameStrings,
+ UncompressedSize) != zlib::StatusOK)
+ return 1;
+ P += CompressedSize;
+ NameStrings = StringRef(UncompressedNameStrings.data(),
+ UncompressedNameStrings.size());
+ } else {
+ NameStrings =
+ StringRef(reinterpret_cast<const char *>(P), UncompressedSize);
+ P += UncompressedSize;
+ }
+ // Now parse the name strings.
+ SmallVector<StringRef, 0> Names;
+ NameStrings.split(Names, ' ');
+ for (StringRef &Name : Names)
+ Symtab.addFuncName(Name);
+
+ while (P < EndP && *P == 0)
+ P++;
+ }
+ Symtab.finalizeSymtab();
+ return 0;
+}
+
instrprof_error
InstrProfValueSiteRecord::mergeValueData(InstrProfValueSiteRecord &Input,
uint64_t Weight) {
diff --git a/lib/Support/Unix/Program.inc b/lib/Support/Unix/Program.inc
index a8d1fe3c07d0..7d3537e20727 100644
--- a/lib/Support/Unix/Program.inc
+++ b/lib/Support/Unix/Program.inc
@@ -446,7 +446,7 @@ llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents,
return EC;
}
-bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) {
+bool llvm::sys::commandLineFitsWithinSystemLimits(StringRef Program, ArrayRef<const char*> Args) {
static long ArgMax = sysconf(_SC_ARG_MAX);
// System says no practical limit.
@@ -456,7 +456,7 @@ bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) {
// Conservatively account for space required by environment variables.
long HalfArgMax = ArgMax / 2;
- size_t ArgLength = 0;
+ size_t ArgLength = Program.size() + 1;
for (ArrayRef<const char*>::iterator I = Args.begin(), E = Args.end();
I != E; ++I) {
ArgLength += strlen(*I) + 1;
diff --git a/lib/Support/Windows/Program.inc b/lib/Support/Windows/Program.inc
index d4e14ddc6518..78fc538bd9bf 100644
--- a/lib/Support/Windows/Program.inc
+++ b/lib/Support/Windows/Program.inc
@@ -535,14 +535,15 @@ llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents,
return EC;
}
-bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) {
+bool llvm::sys::commandLineFitsWithinSystemLimits(StringRef Program, ArrayRef<const char*> Args) {
// The documented max length of the command line passed to CreateProcess.
static const size_t MaxCommandStringLength = 32768;
- size_t ArgLength = 0;
+ // Account for the trailing space for the program path and the
+ // trailing NULL of the last argument.
+ size_t ArgLength = ArgLenWithQuotes(Program.str().c_str()) + 2;
for (ArrayRef<const char*>::iterator I = Args.begin(), E = Args.end();
I != E; ++I) {
- // Account for the trailing space for every arg but the last one and the
- // trailing NULL of the last argument.
+ // Account for the trailing space after every argument.
ArgLength += ArgLenWithQuotes(*I) + 1;
if (ArgLength > MaxCommandStringLength) {
return false;
diff --git a/lib/Support/Windows/WindowsSupport.h b/lib/Support/Windows/WindowsSupport.h
index 34d961b148d1..c65e3148921e 100644
--- a/lib/Support/Windows/WindowsSupport.h
+++ b/lib/Support/Windows/WindowsSupport.h
@@ -30,6 +30,9 @@
#define _WIN32_WINNT 0x0601
#define _WIN32_IE 0x0800 // MinGW at it again. FIXME: verify if still needed.
#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
@@ -44,6 +47,21 @@
#include <string>
#include <vector>
+#if !defined(__CYGWIN__) && !defined(__MINGW32__)
+#include <VersionHelpers.h>
+#else
+// Cygwin does not have the IsWindows8OrGreater() API.
+// Some versions of MinGW do not have the API either.
+inline bool IsWindows8OrGreater() {
+ OSVERSIONINFO osvi = {};
+ osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+ if (!::GetVersionEx(&osvi))
+ return false;
+ return (osvi.dwMajorVersion > 6 ||
+ (osvi.dwMajorVersion == 6 && osvi.dwMinorVersion >= 2));
+}
+#endif // !defined(__CYGWIN__) && !defined(__MINGW32__)
+
inline bool MakeErrMsg(std::string* ErrMsg, const std::string& prefix) {
if (!ErrMsg)
return true;
diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
index 57c7ac32f559..57162dc6e95a 100644
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp
@@ -57,6 +57,10 @@
#endif
#endif
+#ifdef LLVM_ON_WIN32
+#include "Windows/WindowsSupport.h"
+#endif
+
using namespace llvm;
raw_ostream::~raw_ostream() {
@@ -567,8 +571,21 @@ void raw_fd_ostream::write_impl(const char *Ptr, size_t Size) {
assert(FD >= 0 && "File already closed.");
pos += Size;
+#ifndef LLVM_ON_WIN32
+ bool ShouldWriteInChunks = false;
+#else
+ // Writing a large amount of output to a Windows console returns ENOMEM.
+ // Prior to Windows 8, WriteFile() redirects to WriteConsole(), and the
+ // latter has a size limit (66000 bytes or less, depending on heap usage).
+ bool ShouldWriteInChunks = !!::_isatty(FD) && !IsWindows8OrGreater();
+#endif
+
do {
- ssize_t ret = ::write(FD, Ptr, Size);
+ size_t ChunkSize = Size;
+ if (ChunkSize > 32767 && ShouldWriteInChunks)
+ ChunkSize = 32767;
+
+ ssize_t ret = ::write(FD, Ptr, ChunkSize);
if (ret < 0) {
// If it's a recoverable error, swallow it and retry the write.
diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp
index 87a3422b32ab..11e35b75375e 100644
--- a/lib/TableGen/Record.cpp
+++ b/lib/TableGen/Record.cpp
@@ -722,7 +722,7 @@ Init *UnOpInit::resolveReferences(Record &R, const RecordVal *RV) const {
std::string UnOpInit::getAsString() const {
std::string Result;
- switch (Opc) {
+ switch (getOpcode()) {
case CAST: Result = "!cast<" + getType()->getAsString() + ">"; break;
case HEAD: Result = "!head"; break;
case TAIL: Result = "!tail"; break;
@@ -850,7 +850,7 @@ Init *BinOpInit::resolveReferences(Record &R, const RecordVal *RV) const {
std::string BinOpInit::getAsString() const {
std::string Result;
- switch (Opc) {
+ switch (getOpcode()) {
case CONCAT: Result = "!con"; break;
case ADD: Result = "!add"; break;
case AND: Result = "!and"; break;
@@ -1054,7 +1054,7 @@ Init *TernOpInit::resolveReferences(Record &R,
const RecordVal *RV) const {
Init *lhs = LHS->resolveReferences(R, RV);
- if (Opc == IF && lhs != LHS) {
+ if (getOpcode() == IF && lhs != LHS) {
IntInit *Value = dyn_cast<IntInit>(lhs);
if (Init *I = lhs->convertInitializerTo(IntRecTy::get()))
Value = dyn_cast<IntInit>(I);
@@ -1082,7 +1082,7 @@ Init *TernOpInit::resolveReferences(Record &R,
std::string TernOpInit::getAsString() const {
std::string Result;
- switch (Opc) {
+ switch (getOpcode()) {
case SUBST: Result = "!subst"; break;
case FOREACH: Result = "!foreach"; break;
case IF: Result = "!if"; break;
diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp
index e5f6f165d13f..1506a7171ac4 100644
--- a/lib/TableGen/TGParser.cpp
+++ b/lib/TableGen/TGParser.cpp
@@ -77,7 +77,8 @@ bool TGParser::AddValue(Record *CurRec, SMLoc Loc, const RecordVal &RV) {
/// SetValue -
/// Return true on error, false on success.
bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName,
- const std::vector<unsigned> &BitList, Init *V) {
+ ArrayRef<unsigned> BitList, Init *V,
+ bool AllowSelfAssignment) {
if (!V) return false;
if (!CurRec) CurRec = &CurMultiClass->Rec;
@@ -91,8 +92,8 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName,
// in the resolution machinery.
if (BitList.empty())
if (VarInit *VI = dyn_cast<VarInit>(V))
- if (VI->getNameInit() == ValName)
- return false;
+ if (VI->getNameInit() == ValName && !AllowSelfAssignment)
+ return true;
// If we are assigning to a subset of the bits in the value... then we must be
// assigning to a field of BitsRecTy, which must have a BitsInit
@@ -165,7 +166,7 @@ bool TGParser::AddSubClass(Record *CurRec, SubClassReference &SubClass) {
if (i < SubClass.TemplateArgs.size()) {
// If a value is specified for this template arg, set it now.
if (SetValue(CurRec, SubClass.RefRange.Start, TArgs[i],
- std::vector<unsigned>(), SubClass.TemplateArgs[i]))
+ None, SubClass.TemplateArgs[i]))
return true;
// Resolve it next.
@@ -243,8 +244,7 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC,
// If a value is specified for this template arg, set it in the
// superclass now.
if (SetValue(CurRec, SubMultiClass.RefRange.Start, SMCTArgs[i],
- std::vector<unsigned>(),
- SubMultiClass.TemplateArgs[i]))
+ None, SubMultiClass.TemplateArgs[i]))
return true;
// Resolve it next.
@@ -258,8 +258,7 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC,
for (const auto &Def :
makeArrayRef(CurMC->DefPrototypes).slice(newDefStart)) {
if (SetValue(Def.get(), SubMultiClass.RefRange.Start, SMCTArgs[i],
- std::vector<unsigned>(),
- SubMultiClass.TemplateArgs[i]))
+ None, SubMultiClass.TemplateArgs[i]))
return true;
// Resolve it next.
@@ -332,8 +331,7 @@ bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){
IterRec->addValue(RecordVal(IterVar->getName(), IVal->getType(), false));
- if (SetValue(IterRec.get(), Loc, IterVar->getName(),
- std::vector<unsigned>(), IVal))
+ if (SetValue(IterRec.get(), Loc, IterVar->getName(), None, IVal))
return Error(Loc, "when instantiating this def");
// Resolve it next.
@@ -1728,7 +1726,7 @@ Init *TGParser::ParseDeclaration(Record *CurRec,
SMLoc ValLoc = Lex.getLoc();
Init *Val = ParseValue(CurRec, Type);
if (!Val ||
- SetValue(CurRec, ValLoc, DeclName, std::vector<unsigned>(), Val))
+ SetValue(CurRec, ValLoc, DeclName, None, Val))
// Return the name, even if an error is thrown. This is so that we can
// continue to make some progress, even without the value having been
// initialized.
@@ -2358,8 +2356,8 @@ Record *TGParser::InstantiateMulticlassDef(MultiClass &MC, Record *DefProto,
// Set the value for NAME. We don't resolve references to it 'til later,
// though, so that uses in nested multiclass names don't get
// confused.
- if (SetValue(CurRec.get(), Ref.RefRange.Start, "NAME",
- std::vector<unsigned>(), DefmPrefix)) {
+ if (SetValue(CurRec.get(), Ref.RefRange.Start, "NAME", None, DefmPrefix,
+ /*AllowSelfAssignment*/true)) {
Error(DefmPrefixRange.Start, "Could not resolve " +
CurRec->getNameInitAsString() + ":NAME to '" +
DefmPrefix->getAsUnquotedString() + "'");
@@ -2446,8 +2444,7 @@ bool TGParser::ResolveMulticlassDefArgs(MultiClass &MC, Record *CurRec,
// Check if a value is specified for this temp-arg.
if (i < TemplateVals.size()) {
// Set it now.
- if (SetValue(CurRec, DefmPrefixLoc, TArgs[i], std::vector<unsigned>(),
- TemplateVals[i]))
+ if (SetValue(CurRec, DefmPrefixLoc, TArgs[i], None, TemplateVals[i]))
return true;
// Resolve it next.
diff --git a/lib/TableGen/TGParser.h b/lib/TableGen/TGParser.h
index 8b41134d4ff1..739d9a9c5f37 100644
--- a/lib/TableGen/TGParser.h
+++ b/lib/TableGen/TGParser.h
@@ -105,10 +105,13 @@ public:
private: // Semantic analysis methods.
bool AddValue(Record *TheRec, SMLoc Loc, const RecordVal &RV);
bool SetValue(Record *TheRec, SMLoc Loc, Init *ValName,
- const std::vector<unsigned> &BitList, Init *V);
+ ArrayRef<unsigned> BitList, Init *V,
+ bool AllowSelfAssignment = false);
bool SetValue(Record *TheRec, SMLoc Loc, const std::string &ValName,
- const std::vector<unsigned> &BitList, Init *V) {
- return SetValue(TheRec, Loc, StringInit::get(ValName), BitList, V);
+ ArrayRef<unsigned> BitList, Init *V,
+ bool AllowSelfAssignment = false) {
+ return SetValue(TheRec, Loc, StringInit::get(ValName), BitList, V,
+ AllowSelfAssignment);
}
bool AddSubClass(Record *Rec, SubClassReference &SubClass);
bool AddSubMultiClass(MultiClass *CurMC,
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 0bff9b592c15..46ef2c111bae 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -124,6 +124,14 @@ def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
FeaturePerfMon,
FeatureZCRegMove, FeatureZCZeroing]>;
+def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
+ "Samsung Exynos-M1 processors",
+ [FeatureFPARMv8,
+ FeatureNEON,
+ FeatureCrypto,
+ FeatureCRC,
+ FeaturePerfMon]>;
+
def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
FeatureNEON,
FeatureCRC,
@@ -136,6 +144,8 @@ def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
// FIXME: Cortex-A72 is currently modelled as an Cortex-A57.
def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>;
def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
+// FIXME: Exynos-M1 is currently modelled without a specific SchedModel.
+def : ProcessorModel<"exynos-m1", NoSchedModel, [ProcExynosM1]>;
//===----------------------------------------------------------------------===//
// Assembly parser
diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index 79a84ad8c6c5..3d1ab4e3fc2b 100644
--- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -158,7 +158,7 @@ INITIALIZE_PASS_END(AArch64A57FPLoadBalancing, DEBUG_TYPE,
"AArch64 A57 FP Load-Balancing", false, false)
namespace {
-/// A Chain is a sequence of instructions that are linked together by
+/// A Chain is a sequence of instructions that are linked together by
/// an accumulation operand. For example:
///
/// fmul d0<def>, ?
@@ -285,7 +285,7 @@ public:
std::string str() const {
std::string S;
raw_string_ostream OS(S);
-
+
OS << "{";
StartInst->print(OS, /* SkipOpers= */true);
OS << " -> ";
@@ -427,7 +427,7 @@ Chain *AArch64A57FPLoadBalancing::getAndEraseNext(Color PreferredColor,
return Ch;
}
}
-
+
// Bailout case - just return the first item.
Chain *Ch = L.front();
L.erase(L.begin());
@@ -495,7 +495,7 @@ int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
RS.enterBasicBlock(&MBB);
RS.forward(MachineBasicBlock::iterator(G->getStart()));
- // Can we find an appropriate register that is available throughout the life
+ // Can we find an appropriate register that is available throughout the life
// of the chain?
unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass;
BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID));
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9f5beff12100..4ecfbe9e2280 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2426,7 +2426,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
continue;
}
-
+
if (VA.isRegLoc()) {
// Arguments stored in registers.
EVT RegVT = VA.getLocVT();
@@ -5074,7 +5074,7 @@ static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
// The index of an EXT is the first element if it is not UNDEF.
// Watch out for the beginning UNDEFs. The EXT index should be the expected
- // value of the first element. E.g.
+ // value of the first element. E.g.
// <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
// <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
// ExpectedElt is the last mask index plus 1.
@@ -9491,6 +9491,103 @@ static SDValue performBRCONDCombine(SDNode *N,
return SDValue();
}
+// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
+// as well as whether the test should be inverted. This code is required to
+// catch these cases (as opposed to standard dag combines) because
+// AArch64ISD::TBZ is matched during legalization.
+static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
+ SelectionDAG &DAG) {
+
+ if (!Op->hasOneUse())
+ return Op;
+
+ // We don't handle undef/constant-fold cases below, as they should have
+ // already been taken care of (e.g. and of 0, test of undefined shifted bits,
+ // etc.)
+
+ // (tbz (trunc x), b) -> (tbz x, b)
+ // This case is just here to enable more of the below cases to be caught.
+ if (Op->getOpcode() == ISD::TRUNCATE &&
+ Bit < Op->getValueType(0).getSizeInBits()) {
+ return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+ }
+
+ if (Op->getNumOperands() != 2)
+ return Op;
+
+ auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!C)
+ return Op;
+
+ switch (Op->getOpcode()) {
+ default:
+ return Op;
+
+ // (tbz (and x, m), b) -> (tbz x, b)
+ case ISD::AND:
+ if ((C->getZExtValue() >> Bit) & 1)
+ return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+ return Op;
+
+ // (tbz (shl x, c), b) -> (tbz x, b-c)
+ case ISD::SHL:
+ if (C->getZExtValue() <= Bit &&
+ (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
+ Bit = Bit - C->getZExtValue();
+ return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+ }
+ return Op;
+
+ // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
+ case ISD::SRA:
+ Bit = Bit + C->getZExtValue();
+ if (Bit >= Op->getValueType(0).getSizeInBits())
+ Bit = Op->getValueType(0).getSizeInBits() - 1;
+ return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+
+ // (tbz (srl x, c), b) -> (tbz x, b+c)
+ case ISD::SRL:
+ if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
+ Bit = Bit + C->getZExtValue();
+ return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+ }
+ return Op;
+
+ // (tbz (xor x, -1), b) -> (tbnz x, b)
+ case ISD::XOR:
+ if ((C->getZExtValue() >> Bit) & 1)
+ Invert = !Invert;
+ return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+ }
+}
+
+// Optimize test single bit zero/non-zero and branch.
+static SDValue performTBZCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ bool Invert = false;
+ SDValue TestSrc = N->getOperand(1);
+ SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
+
+ if (TestSrc == NewTestSrc)
+ return SDValue();
+
+ unsigned NewOpc = N->getOpcode();
+ if (Invert) {
+ if (NewOpc == AArch64ISD::TBZ)
+ NewOpc = AArch64ISD::TBNZ;
+ else {
+ assert(NewOpc == AArch64ISD::TBNZ);
+ NewOpc = AArch64ISD::TBZ;
+ }
+ }
+
+ SDLoc DL(N);
+ return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
+ DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
+}
+
// vselect (v1i1 setcc) ->
// vselect (v1iXX setcc) (XX is the size of the compared operand type)
// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
@@ -9642,6 +9739,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performSTORECombine(N, DCI, DAG, Subtarget);
case AArch64ISD::BRCOND:
return performBRCONDCombine(N, DCI, DAG);
+ case AArch64ISD::TBNZ:
+ case AArch64ISD::TBZ:
+ return performTBZCombine(N, DCI, DAG);
case AArch64ISD::CSEL:
return performCONDCombine(N, DCI, DAG, 2, 3);
case AArch64ISD::DUP:
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 566aa2c9a9ba..43664df3b861 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -613,21 +613,6 @@ static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst,
(UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
}
-// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
-static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
- MachineInstr *Op1) {
- assert(MI->memoperands_empty() && "expected a new machineinstr");
- size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) +
- (Op1->memoperands_end() - Op1->memoperands_begin());
-
- MachineFunction *MF = MI->getParent()->getParent();
- MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
- MachineSDNode::mmo_iterator MemEnd =
- std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
- MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
- MI->setMemRefs(MemBegin, MemEnd);
-}
-
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
@@ -692,10 +677,8 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
TII->get(NewOpc))
.addOperand(getLdStRegOp(RtNewDest))
.addOperand(BaseRegOp)
- .addImm(OffsetImm);
-
- // Copy MachineMemOperands from the original loads.
- concatenateMemOperands(NewMemMI, I, Paired);
+ .addImm(OffsetImm)
+ .setMemRefs(I->mergeMemRefsWith(*Paired));
DEBUG(
dbgs()
@@ -786,9 +769,8 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
TII->get(NewOpc))
.addOperand(getLdStRegOp(I))
.addOperand(BaseRegOp)
- .addImm(OffsetImm);
- // Copy MachineMemOperands from the original stores.
- concatenateMemOperands(MIB, I, Paired);
+ .addImm(OffsetImm)
+ .setMemRefs(I->mergeMemRefsWith(*Paired));
} else {
// Handle Unscaled
if (IsUnscaled)
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 1b8b9b27719c..151133b2f32c 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -33,7 +33,14 @@ class Triple;
class AArch64Subtarget : public AArch64GenSubtargetInfo {
protected:
- enum ARMProcFamilyEnum {Others, CortexA35, CortexA53, CortexA57, Cyclone};
+ enum ARMProcFamilyEnum {
+ Others,
+ CortexA35,
+ CortexA53,
+ CortexA57,
+ Cyclone,
+ ExynosM1
+ };
/// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
ARMProcFamilyEnum ARMProcFamily;
@@ -143,6 +150,7 @@ public:
bool isCyclone() const { return CPUString == "cyclone"; }
bool isCortexA57() const { return CPUString == "cortex-a57"; }
bool isCortexA53() const { return CPUString == "cortex-a53"; }
+ bool isExynosM1() const { return CPUString == "exynos-m1"; }
bool useAA() const override { return isCortexA53(); }
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index 78f5289ec26d..cde1c6df2608 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -834,7 +834,7 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings
};
uint32_t
-AArch64SysReg::SysRegMapper::fromString(StringRef Name,
+AArch64SysReg::SysRegMapper::fromString(StringRef Name,
const FeatureBitset& FeatureBits, bool &Valid) const {
std::string NameLower = Name.lower();
@@ -878,7 +878,7 @@ AArch64SysReg::SysRegMapper::fromString(StringRef Name,
}
std::string
-AArch64SysReg::SysRegMapper::toString(uint32_t Bits,
+AArch64SysReg::SysRegMapper::toString(uint32_t Bits,
const FeatureBitset& FeatureBits) const {
// First search the registers shared by all
for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) {
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index f649cb9b8a8d..e63627eae123 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -285,17 +285,17 @@ struct AArch64NamedImmMapper {
// Zero value of FeatureBitSet means the mapping is always available
FeatureBitset FeatureBitSet;
- bool isNameEqual(std::string Other,
+ bool isNameEqual(std::string Other,
const FeatureBitset& FeatureBits) const {
- if (FeatureBitSet.any() &&
+ if (FeatureBitSet.any() &&
(FeatureBitSet & FeatureBits).none())
return false;
return Name == Other;
}
- bool isValueEqual(uint32_t Other,
+ bool isValueEqual(uint32_t Other,
const FeatureBitset& FeatureBits) const {
- if (FeatureBitSet.any() &&
+ if (FeatureBitSet.any() &&
(FeatureBitSet & FeatureBits).none())
return false;
return Value == Other;
@@ -310,7 +310,7 @@ struct AArch64NamedImmMapper {
StringRef toString(uint32_t Value, const FeatureBitset& FeatureBits,
bool &Valid) const;
// Maps string to value, depending on availability for FeatureBits given
- uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits,
+ uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits,
bool &Valid) const;
/// Many of the instructions allow an alternative assembly form consisting of
@@ -1322,7 +1322,7 @@ namespace AArch64TLBI {
return true;
}
}
-}
+}
namespace AArch64II {
/// Target Operand Flag enum.
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index d4af8d2e48d1..db869cf7dd8b 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -118,6 +118,11 @@ def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
"true",
"Support flat address space">;
+def FeatureXNACK : SubtargetFeature<"xnack",
+ "EnableXNACK",
+ "true",
+ "Enable XNACK support">;
+
def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
"EnableVGPRSpilling",
"true",
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index ba71dc05a8fc..9c3790264377 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -417,13 +417,13 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
}
}
- if (VCCUsed || FlatUsed)
+ if (VCCUsed || FlatUsed || STM.isXNACKEnabled()) {
MaxSGPR += 2;
- if (FlatUsed) {
- MaxSGPR += 2;
- // 2 additional for VI+.
- if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (FlatUsed)
+ MaxSGPR += 2;
+
+ if (STM.isXNACKEnabled())
MaxSGPR += 2;
}
@@ -620,6 +620,9 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
if (MFI->hasDispatchPtr())
header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+ if (STM.isXNACKEnabled())
+ header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
+
header.kernarg_segment_byte_size = MFI->ABIArgOffset;
header.wavefront_sgpr_count = KernelInfo.NumSGPR;
header.workitem_vgpr_count = KernelInfo.NumVGPR;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index 11f6139deddd..2a7ce6a47176 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -204,14 +204,6 @@ def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
return isGlobalLoad(dyn_cast<LoadSDNode>(N));
}]>;
-def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
- return isFlatLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
-def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
- return isFlatLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
}]>;
@@ -243,14 +235,6 @@ def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
return isGlobalLoad(dyn_cast<LoadSDNode>(N));
}]>;
-def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
- return isFlatLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
-def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
- return isFlatLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
}]>;
@@ -299,16 +283,6 @@ def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr),
return isGlobalStore(dyn_cast<StoreSDNode>(N));
}]>;
-def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr),
- (truncstorei8 node:$val, node:$ptr), [{
- return isFlatStore(dyn_cast<StoreSDNode>(N));
-}]>;
-
-def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr),
- (truncstorei16 node:$val, node:$ptr), [{
- return isFlatStore(dyn_cast<StoreSDNode>(N));
-}]>;
-
def local_store : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
return isLocalStore(dyn_cast<StoreSDNode>(N));
@@ -385,15 +359,6 @@ multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> {
defm atomic_cmp_swap : AtomicCmpSwapLocal <atomic_cmp_swap>;
-def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return isFlatLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
-def flat_store : PatFrag<(ops node:$val, node:$ptr),
- (store node:$val, node:$ptr), [{
- return isFlatStore(dyn_cast<StoreSDNode>(N));
-}]>;
-
def mskor_flat : PatFrag<(ops node:$val, node:$ptr),
(AMDGPUstore_mskor node:$val, node:$ptr), [{
return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 44e0c47877a9..c6af5b93d257 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -73,6 +73,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false),
EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true),
EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false),
+ EnableXNACK(false),
WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 9c7bb88f8f4a..d3712276d5e7 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -76,6 +76,7 @@ private:
bool EnableIfCvt;
bool EnableLoadStoreOpt;
bool EnableUnsafeDSOffsetFolding;
+ bool EnableXNACK;
unsigned WavefrontSize;
bool CFALUBug;
int LocalMemorySize;
@@ -290,6 +291,10 @@ public:
}
bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const;
+ bool isXNACKEnabled() const {
+ return EnableXNACK;
+ }
+
unsigned getMaxWavesPerCU() const {
if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
return 10;
diff --git a/lib/Target/AMDGPU/CIInstructions.td b/lib/Target/AMDGPU/CIInstructions.td
index 88a090d3df35..c543814cae0d 100644
--- a/lib/Target/AMDGPU/CIInstructions.td
+++ b/lib/Target/AMDGPU/CIInstructions.td
@@ -264,42 +264,6 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <
} // End let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst
-//===----------------------------------------------------------------------===//
-// Flat Patterns
-//===----------------------------------------------------------------------===//
-
-let Predicates = [HasFlatAddressSpace] in {
-
-class FLATLoad_Pattern <FLAT Instr_ADDR64, ValueType vt,
- PatFrag flat_ld> :
- Pat <(vt (flat_ld i64:$ptr)),
- (Instr_ADDR64 $ptr, 0, 0, 0)
->;
-
-def : FLATLoad_Pattern <FLAT_LOAD_SBYTE, i32, sextloadi8_flat>;
-def : FLATLoad_Pattern <FLAT_LOAD_UBYTE, i32, az_extloadi8_flat>;
-def : FLATLoad_Pattern <FLAT_LOAD_SSHORT, i32, sextloadi16_flat>;
-def : FLATLoad_Pattern <FLAT_LOAD_USHORT, i32, az_extloadi16_flat>;
-def : FLATLoad_Pattern <FLAT_LOAD_DWORD, i32, flat_load>;
-def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, flat_load>;
-def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, az_extloadi32_flat>;
-def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, v2i32, flat_load>;
-def : FLATLoad_Pattern <FLAT_LOAD_DWORDX4, v4i32, flat_load>;
-
-class FLATStore_Pattern <FLAT Instr, ValueType vt, PatFrag st> :
- Pat <(st vt:$value, i64:$ptr),
- (Instr $value, $ptr, 0, 0, 0)
- >;
-
-def : FLATStore_Pattern <FLAT_STORE_BYTE, i32, truncstorei8_flat>;
-def : FLATStore_Pattern <FLAT_STORE_SHORT, i32, truncstorei16_flat>;
-def : FLATStore_Pattern <FLAT_STORE_DWORD, i32, flat_store>;
-def : FLATStore_Pattern <FLAT_STORE_DWORDX2, i64, flat_store>;
-def : FLATStore_Pattern <FLAT_STORE_DWORDX2, v2i32, flat_store>;
-def : FLATStore_Pattern <FLAT_STORE_DWORDX4, v4i32, flat_store>;
-
-} // End HasFlatAddressSpace predicate
-
let Predicates = [isCI] in {
// Convert (x - floor(x)) to fract(x)
@@ -320,20 +284,10 @@ def : Pat <
//===----------------------------------------------------------------------===//
-// Patterns to generate flat for global
+// Flat Patterns
//===----------------------------------------------------------------------===//
-def useFlatForGlobal : Predicate <
- "Subtarget->useFlatForGlobal() || "
- "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">;
-
-let Predicates = [useFlatForGlobal] in {
-
-// 1. Offset as 20bit DWORD immediate
-def : Pat <
- (SIload_constant v4i32:$sbase, IMM20bit:$offset),
- (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
->;
+let Predicates = [isCIVI] in {
// Patterns for global loads with no offset
class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
@@ -341,24 +295,24 @@ class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
(inst $addr, 0, 0, 0)
>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>;
+def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>;
+def : FlatLoadPat <FLAT_LOAD_SSHORT, flat_sextloadi16, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX2, flat_load, v2i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX4, flat_load, v4i32>;
class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
(node vt:$data, i64:$addr),
(inst $data, $addr, 0, 0, 0)
>;
-def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>;
-def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>;
-def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>;
+def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i32>;
+def : FlatStorePat <FLAT_STORE_SHORT, flat_truncstorei16, i32>;
+def : FlatStorePat <FLAT_STORE_DWORD, flat_store, i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store, v2i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store, v4i32>;
class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
(vt (node i64:$addr, vt:$data)),
@@ -376,4 +330,4 @@ def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
-} // End Predicates = [useFlatForGlobal]
+} // End Predicates = [isCIVI]
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index 6b3c81c3af74..7d20509c464d 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -105,51 +105,53 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
MBB.addLiveIn(PreloadedPrivateBufferReg);
}
- // We reserved the last registers for this. Shift it down to the end of those
- // which were actually used.
- //
- // FIXME: It might be safer to use a pseudoregister before replacement.
-
- // FIXME: We should be able to eliminate unused input registers. We only
- // cannot do this for the resources required for scratch access. For now we
- // skip over user SGPRs and may leave unused holes.
-
- // We find the resource first because it has an alignment requirement.
- if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
- unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
- // Skip the last 2 elements because the last one is reserved for VCC, and
- // this is the 2nd to last element already.
- for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
- // Pick the first unallocated one. Make sure we don't clobber the other
- // reserved input we needed.
- if (!MRI.isPhysRegUsed(Reg)) {
- assert(MRI.isAllocatable(Reg));
- MRI.replaceRegWith(ScratchRsrcReg, Reg);
- ScratchRsrcReg = Reg;
- MFI->setScratchRSrcReg(ScratchRsrcReg);
- break;
+ if (!ST.hasSGPRInitBug()) {
+ // We reserved the last registers for this. Shift it down to the end of those
+ // which were actually used.
+ //
+ // FIXME: It might be safer to use a pseudoregister before replacement.
+
+ // FIXME: We should be able to eliminate unused input registers. We only
+ // cannot do this for the resources required for scratch access. For now we
+ // skip over user SGPRs and may leave unused holes.
+
+ // We find the resource first because it has an alignment requirement.
+ if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
+ // Skip the last 2 elements because the last one is reserved for VCC, and
+ // this is the 2nd to last element already.
+ for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
+ // Pick the first unallocated one. Make sure we don't clobber the other
+ // reserved input we needed.
+ if (!MRI.isPhysRegUsed(Reg)) {
+ assert(MRI.isAllocatable(Reg));
+ MRI.replaceRegWith(ScratchRsrcReg, Reg);
+ ScratchRsrcReg = Reg;
+ MFI->setScratchRSrcReg(ScratchRsrcReg);
+ break;
+ }
}
}
- }
- if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
- // Skip the last 2 elements because the last one is reserved for VCC, and
- // this is the 2nd to last element already.
- unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
- for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) {
- // Pick the first unallocated SGPR. Be careful not to pick an alias of the
- // scratch descriptor, since we haven’t added its uses yet.
- if (!MRI.isPhysRegUsed(Reg)) {
- assert(MRI.isAllocatable(Reg) &&
- !TRI->isSubRegisterEq(ScratchRsrcReg, Reg));
-
- MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
- ScratchWaveOffsetReg = Reg;
- MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
- break;
+ if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ // Skip the last 2 elements because the last one is reserved for VCC, and
+ // this is the 2nd to last element already.
+ unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+ for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) {
+ // Pick the first unallocated SGPR. Be careful not to pick an alias of the
+ // scratch descriptor, since we haven’t added its uses yet.
+ if (!MRI.isPhysRegUsed(Reg)) {
+ assert(MRI.isAllocatable(Reg) &&
+ !TRI->isSubRegisterEq(ScratchRsrcReg, Reg));
+
+ MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
+ ScratchWaveOffsetReg = Reg;
+ MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+ break;
+ }
}
}
}
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 10f2adde4867..8735277149a6 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -134,6 +134,34 @@ def SIconstdata_ptr : SDNode<
SDTCisVT<0, i64>]>
>;
+//===----------------------------------------------------------------------===//
+// PatFrags for FLAT instructions
+//===----------------------------------------------------------------------===//
+
+class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr),
+ (ld node:$ptr), [{
+ return isFlatLoad(dyn_cast<LoadSDNode>(N)) ||
+ isGlobalLoad(dyn_cast<LoadSDNode>(N)) ||
+ isConstantLoad(cast<LoadSDNode>(N), -1);
+}]>;
+
+def flat_load : flat_ld <load>;
+def flat_az_extloadi8 : flat_ld <az_extloadi8>;
+def flat_sextloadi8 : flat_ld <sextloadi8>;
+def flat_az_extloadi16 : flat_ld <az_extloadi16>;
+def flat_sextloadi16 : flat_ld <sextloadi16>;
+
+class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr),
+ (st node:$val, node:$ptr), [{
+ return isFlatStore(dyn_cast<StoreSDNode>(N)) ||
+ isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def flat_store: flat_st <store>;
+def flat_truncstorei8 : flat_st <truncstorei8>;
+def flat_truncstorei16 : flat_st <truncstorei16>;
+
+
def mubuf_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
return isGlobalLoad(cast<LoadSDNode>(N)) ||
isConstantLoad(cast<LoadSDNode>(N), -1);
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 6f653c70aca0..b7df058b7c0c 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -59,8 +59,6 @@ defm EXP : EXP_m;
// SMRD Instructions
//===----------------------------------------------------------------------===//
-let mayLoad = 1 in {
-
// We are using the SGPR_32 and not the SReg_32 register class for 32-bit
// SMRD instructions, because the SGPR_32 register class does not include M0
// and writing to M0 from an SMRD instruction will hang the GPU.
@@ -90,8 +88,6 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
smrd<0x0c>, "s_buffer_load_dwordx16", SReg_128, SReg_512
>;
-} // mayLoad = 1
-
//def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>;
defm S_DCACHE_INV : SMRD_Inval <smrd<0x1f, 0x20>, "s_dcache_inv",
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 935aad427198..bf15516bea7b 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -156,6 +156,17 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
if (!LaneVGPRs.count(LaneVGPRIdx)) {
unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
+
+ if (LaneVGPR == AMDGPU::NoRegister) {
+ LLVMContext &Ctx = MF->getFunction()->getContext();
+ Ctx.emitError("Ran out of VGPRs for spilling SGPR");
+
+ // When compiling from inside Mesa, the compilation continues.
+ // Select an arbitrary register to avoid triggering assertions
+ // during subsequent passes.
+ LaneVGPR = AMDGPU::VGPR0;
+ }
+
LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
// Add this register as live-in to all blocks to avoid machine verifer
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 3cdffef05583..2afa00996609 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -37,13 +37,17 @@ unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
if (ST.hasSGPRInitBug()) {
unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4;
+ if (ST.isXNACKEnabled())
+ BaseIdx -= 4;
+
unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- // 98/99 need to be reserved for flat_scr, and 100/101 for vcc. This is the
- // next sgpr128 down.
+ // 98/99 need to be reserved for flat_scr or 96/97 for flat_scr and
+ // 98/99 for xnack_mask, and 100/101 for vcc. This is the next sgpr128 down
+ // either way.
return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
}
@@ -54,13 +58,25 @@ unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const {
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
if (ST.hasSGPRInitBug()) {
- unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5;
+ unsigned Idx;
+
+ if (!ST.isXNACKEnabled())
+ Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5;
+ else
+ Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1;
+
return AMDGPU::SGPR_32RegClass.getRegister(Idx);
}
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- // Next register before reservations for flat_scr and vcc.
- return AMDGPU::SGPR97;
+ if (!ST.isXNACKEnabled()) {
+ // Next register before reservations for flat_scr and vcc.
+ return AMDGPU::SGPR97;
+ } else {
+ // Next register before reservations for flat_scr, xnack_mask, vcc,
+ // and scratch resource.
+ return AMDGPU::SGPR91;
+ }
}
return AMDGPU::SGPR95;
@@ -86,6 +102,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// for VCC/FLAT_SCR.
reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99);
reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101);
+
+ if (ST.isXNACKEnabled())
+ reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97);
}
// Tonga and Iceland can only allocate a fixed number of SGPRs due
@@ -93,9 +112,11 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
if (ST.hasSGPRInitBug()) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
// Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs).
- // Assume XNACK_MASK is unused.
unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4;
+ if (ST.isXNACKEnabled())
+ Limit -= 2;
+
for (unsigned i = Limit; i < NumSGPRs; ++i) {
unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
reserveRegisterTuples(Reserved, Reg);
@@ -282,11 +303,6 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
struct SIMachineFunctionInfo::SpilledReg Spill =
MFI->getSpilledReg(MF, Index, i);
- if (Spill.VGPR == AMDGPU::NoRegister) {
- LLVMContext &Ctx = MF->getFunction()->getContext();
- Ctx.emitError("Ran out of VGPRs for spilling SGPR");
- }
-
BuildMI(*MBB, MI, DL,
TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
Spill.VGPR)
@@ -315,11 +331,6 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
struct SIMachineFunctionInfo::SpilledReg Spill =
MFI->getSpilledReg(MF, Index, i);
- if (Spill.VGPR == AMDGPU::NoRegister) {
- LLVMContext &Ctx = MF->getFunction()->getContext();
- Ctx.emitError("Ran out of VGPRs for spilling SGPR");
- }
-
BuildMI(*MBB, MI, DL,
TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
SubReg)
diff --git a/lib/Target/AMDGPU/VIInstructions.td b/lib/Target/AMDGPU/VIInstructions.td
index 20a026a822e2..1a7801c92bd7 100644
--- a/lib/Target/AMDGPU/VIInstructions.td
+++ b/lib/Target/AMDGPU/VIInstructions.td
@@ -101,3 +101,12 @@ def S_DCACHE_WB_VOL : SMEM_Inval <0x23,
} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI
+let Predicates = [isVI] in {
+
+// 1. Offset as 20bit DWORD immediate
+def : Pat <
+ (SIload_constant v4i32:$sbase, IMM20bit:$offset),
+ (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
+>;
+
+} // End Predicates = [isVI]
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index a44dc830a673..c171656b48ab 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -252,6 +252,8 @@ def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait",
def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
"Swift ARM processors", []>;
+def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
+ "Samsung Exynos-M1 processors", []>;
def ProcR4 : SubtargetFeature<"r4", "ARMProcFamily", "CortexR4",
"Cortex-R4 ARM processors", []>;
@@ -649,6 +651,12 @@ def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift,
FeatureCrypto,
FeatureZCZeroing]>;
+def : ProcNoItin<"exynos-m1", [ARMv8a, ProcExynosM1,
+ FeatureHWDiv,
+ FeatureHWDivARM,
+ FeatureT2XtPk,
+ FeatureCrypto,
+ FeatureCRC]>;
//===----------------------------------------------------------------------===//
// Register File Description
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index e89757c19ecc..55c1684028c2 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -340,12 +340,12 @@ namespace {
/// verify - check BBOffsets, BBSizes, alignment of islands
void ARMConstantIslands::verify() {
#ifndef NDEBUG
- for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
- MBBI != E; ++MBBI) {
- MachineBasicBlock *MBB = &*MBBI;
- unsigned MBBId = MBB->getNumber();
- assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset);
- }
+ assert(std::is_sorted(MF->begin(), MF->end(),
+ [this](const MachineBasicBlock &LHS,
+ const MachineBasicBlock &RHS) {
+ return BBInfo[LHS.getNumber()].postOffset() <
+ BBInfo[RHS.getNumber()].postOffset();
+ }));
DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n");
for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) {
CPUser &U = CPUsers[i];
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 725b8383c961..6e7e47b8706a 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -1986,23 +1986,6 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
return AddedRegPressure.size() <= MemRegs.size() * 2;
}
-
-/// Copy \p Op0 and \p Op1 operands into a new array assigned to MI.
-static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
- MachineInstr *Op1) {
- assert(MI->memoperands_empty() && "expected a new machineinstr");
- size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin())
- + (Op1->memoperands_end() - Op1->memoperands_begin());
-
- MachineFunction *MF = MI->getParent()->getParent();
- MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
- MachineSDNode::mmo_iterator MemEnd =
- std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
- MemEnd =
- std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
- MI->setMemRefs(MemBegin, MemEnd);
-}
-
bool
ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
DebugLoc &dl, unsigned &NewOpc,
@@ -2196,7 +2179,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
if (!isT2)
MIB.addReg(0);
MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
- concatenateMemOperands(MIB, Op0, Op1);
+ MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
DEBUG(dbgs() << "Formed " << *MIB << "\n");
++NumLDRDFormed;
} else {
@@ -2210,7 +2193,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
if (!isT2)
MIB.addReg(0);
MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
- concatenateMemOperands(MIB, Op0, Op1);
+ MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
DEBUG(dbgs() << "Formed " << *MIB << "\n");
++NumSTRDFormed;
}
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index a8b28018f1b2..4d54e5751473 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -44,7 +44,7 @@ protected:
enum ARMProcFamilyEnum {
Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15,
CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexA35, CortexA53,
- CortexA57, CortexA72, Krait, Swift
+ CortexA57, CortexA72, Krait, Swift, ExynosM1
};
enum ARMProcClassEnum {
None, AClass, RClass, MClass
diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td
index 1189cfd488ee..5a7eb215de42 100644
--- a/lib/Target/Hexagon/Hexagon.td
+++ b/lib/Target/Hexagon/Hexagon.td
@@ -251,6 +251,10 @@ def : Proc<"hexagonv60", HexagonModelV60,
// Declare the target which we are implementing
//===----------------------------------------------------------------------===//
+def HexagonAsmParser : AsmParser {
+ bit HasMnemonicFirst = 0;
+}
+
def HexagonAsmParserVariant : AsmParserVariant {
int Variant = 0;
string TokenizingCharacters = "#()=:.<>!+*";
@@ -259,5 +263,6 @@ def HexagonAsmParserVariant : AsmParserVariant {
def Hexagon : Target {
// Pull in Instruction Info:
let InstructionSet = HexagonInstrInfo;
+ let AssemblyParsers = [HexagonAsmParser];
let AssemblyParserVariants = [HexagonAsmParserVariant];
}
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td
index 5cfeba720d90..421403f49724 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.td
+++ b/lib/Target/Hexagon/HexagonInstrInfo.td
@@ -5807,3 +5807,5 @@ include "HexagonInstrInfoV60.td"
include "HexagonInstrInfoVector.td"
include "HexagonInstrAlias.td"
+include "HexagonSystemInst.td"
+
diff --git a/lib/Target/Hexagon/HexagonSystemInst.td b/lib/Target/Hexagon/HexagonSystemInst.td
new file mode 100644
index 000000000000..784686a437ad
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonSystemInst.td
@@ -0,0 +1,113 @@
+//==- HexagonSystemInst.td - System Instructions for Hexagon -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Cache manipulation instructions.
+//===----------------------------------------------------------------------===//
+let mayStore = 1 in
+class ST_MISC_CACHEOP<dag outs, dag ins,
+ string asmstr, list<dag> pattern = [],
+ bits<3> amode, bits<3> type, bits<1> un>
+ : ST0Inst<outs, ins, asmstr, pattern, "", ST_tc_ld_SLOT0> {
+
+ bits<5> Rs;
+ bits<5> Rt;
+ bits<5> Rd;
+ let Inst{31-28} = 0b1010;
+ let Inst{27-25} = amode;
+ let Inst{24-22} = type;
+ let Inst{21} = un;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{4-0} = Rd;
+}
+
+let mayStore = 1 in
+class ST_MISC_CACHEOP_SYS<dag outs, dag ins,
+ string asmstr, list<dag> pattern = [],
+ bits<3> amode, bits<3> type, bits<1> un>
+ : SYSInst<outs, ins, asmstr, pattern, ""> {
+
+ bits<5> Rs;
+ bits<5> Rt;
+ bits<5> Rd;
+ let Inst{31-28} = 0b1010;
+ let Inst{27-25} = amode;
+ let Inst{24-22} = type;
+ let Inst{21} = un;
+ let Inst{20-16} = Rs;
+ let Inst{12-8} = Rt;
+ let Inst{4-0} = Rd;
+}
+
+
+let isSolo = 1, Rs = 0, Rt = 0, Rd = 0 in {
+def Y2_syncht: ST_MISC_CACHEOP <(outs), (ins),
+ "syncht" , [], 0b100, 0b001, 0b0>;
+}
+
+let Rt = 0, Rd = 0 in {
+let isSoloAin1 = 1 in {
+ def Y2_dccleana: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs),
+ "dccleana($Rs)", [], 0b000, 0b000, 0b0>;
+ def Y2_dcinva: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs),
+ "dcinva($Rs)", [], 0b000, 0b000, 0b1>;
+ def Y2_dccleaninva: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs),
+ "dccleaninva($Rs)", [], 0b000, 0b001, 0b0>;
+ }
+}
+
+let isSoloAX = 1, hasSideEffects = 1, Rd = 0 in {
+ def Y4_l2fetch: ST_MISC_CACHEOP_SYS<(outs), (ins IntRegs:$Rs, IntRegs:$Rt),
+ "l2fetch($Rs, $Rt)", [], 0b011, 0b000, 0b0>;
+ def Y5_l2fetch: ST_MISC_CACHEOP_SYS<(outs), (ins IntRegs:$Rs, DoubleRegs:$Rt),
+ "l2fetch($Rs, $Rt)", [], 0b011, 0b010, 0b0>;
+}
+
+let hasSideEffects = 0, isSolo = 1 in
+class Y2_INVALIDATE_CACHE<string mnemonic, bit MajOp>
+ : JRInst <
+ (outs), (ins IntRegs:$Rs),
+ #mnemonic#"($Rs)" > {
+ bits<5> Rs;
+
+ let IClass = 0b0101;
+ let Inst{27-21} = 0b0110110;
+ let Inst{20-16} = Rs;
+ let Inst{13-12} = 0b00;
+ let Inst{11} = MajOp;
+ }
+// Instruction cache invalidate
+def Y2_icinva : Y2_INVALIDATE_CACHE<"icinva", 0b0>;
+
+// Zero an aligned 32-byte cacheline.
+let isSoloAin1 = 1 in
+def Y2_dczeroa: ST0Inst <(outs), (ins IntRegs:$Rs),
+ "dczeroa($Rs)"> {
+ bits<5> Rs;
+ let IClass = 0b1010;
+ let Inst{27-21} = 0b0000110;
+ let Inst{13} = 0b0;
+ let Inst{20-16} = Rs;
+ }
+
+// Memory synchronization.
+let hasSideEffects = 0, isSolo = 1 in
+def Y2_isync: JRInst <(outs), (ins),
+ "isync"> {
+ let IClass = 0b0101;
+ let Inst{27-16} = 0b011111000000;
+ let Inst{13} = 0b0;
+ let Inst{9-0} = 0b0000000010;
+ }
+
diff --git a/lib/Target/WebAssembly/known_gcc_test_failures.txt b/lib/Target/WebAssembly/known_gcc_test_failures.txt
index ee9d060f339e..92ecde3f90d6 100644
--- a/lib/Target/WebAssembly/known_gcc_test_failures.txt
+++ b/lib/Target/WebAssembly/known_gcc_test_failures.txt
@@ -5,6 +5,23 @@
pr38151.c
va-arg-22.c
+# WebAssemblyRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator, int, unsigned int, llvm::RegScavenger *) const: Assertion `MI.getOperand(1).getImm() == 0 && "Can't eliminate FI yet if offset is already set"'
+20030313-1.c
+20030916-1.c
+20031012-1.c
+20041126-1.c
+20060420-1.c
+20071202-1.c
+20120808-1.c
+pr20527-1.c
+pr27073.c
+pr36339.c
+pr37573.c
+pr43236.c
+pr43835.c
+pr45070.c
+pr51933.c
+
# TargetRegisterInfo.h:315: static unsigned int llvm::TargetRegisterInfo::virtReg2Index(unsigned int): Assertion `isVirtualRegister(Reg) && "Not a virtual register"' failed.
struct-ret-1.c
va-arg-11.c
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index b23f5c353013..55949155da9e 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -27,6 +27,7 @@ set(sources
X86PadShortFunction.cpp
X86RegisterInfo.cpp
X86SelectionDAGInfo.cpp
+ X86ShuffleDecodeConstantPool.cpp
X86Subtarget.cpp
X86TargetMachine.cpp
X86TargetObjectFile.cpp
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index 82f0ee5a5ebc..73f654cba38c 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -32,7 +32,6 @@ static unsigned getVectorRegSize(unsigned RegNo) {
return 64;
llvm_unreachable("Unknown vector reg!");
- return 0;
}
static MVT getRegOperandVectorVT(const MCInst *MI, const MVT &ScalarVT,
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index 4fdd527d87c8..619f7c8d25df 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -13,7 +13,6 @@
//===----------------------------------------------------------------------===//
#include "X86ShuffleDecode.h"
-#include "llvm/IR/Constants.h"
#include "llvm/CodeGen/MachineValueType.h"
//===----------------------------------------------------------------------===//
@@ -296,54 +295,6 @@ void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
}
}
-void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
- Type *MaskTy = C->getType();
- // It is not an error for the PSHUFB mask to not be a vector of i8 because the
- // constant pool uniques constants by their bit representation.
- // e.g. the following take up the same space in the constant pool:
- // i128 -170141183420855150465331762880109871104
- //
- // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160>
- //
- // <4 x i32> <i32 -2147483648, i32 -2147483648,
- // i32 -2147483648, i32 -2147483648>
-
-#ifndef NDEBUG
- unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
- assert(MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512);
-#endif
-
- // This is a straightforward byte vector.
- if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) {
- int NumElements = MaskTy->getVectorNumElements();
- ShuffleMask.reserve(NumElements);
-
- for (int i = 0; i < NumElements; ++i) {
- // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte
- // lane of the vector we're inside.
- int Base = i & ~0xf;
- Constant *COp = C->getAggregateElement(i);
- if (!COp) {
- ShuffleMask.clear();
- return;
- } else if (isa<UndefValue>(COp)) {
- ShuffleMask.push_back(SM_SentinelUndef);
- continue;
- }
- uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
- // If the high bit (7) of the byte is set, the element is zeroed.
- if (Element & (1 << 7))
- ShuffleMask.push_back(SM_SentinelZero);
- else {
- // Only the least significant 4 bits of the byte are used.
- int Index = Base + (Element & 0xf);
- ShuffleMask.push_back(Index);
- }
- }
- }
- // TODO: Handle funny-looking vectors too.
-}
-
void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask) {
for (int i = 0, e = RawMask.size(); i < e; ++i) {
@@ -388,68 +339,6 @@ void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
}
}
-void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
- SmallVectorImpl<int> &ShuffleMask) {
- Type *MaskTy = C->getType();
- // It is not an error for the PSHUFB mask to not be a vector of i8 because the
- // constant pool uniques constants by their bit representation.
- // e.g. the following take up the same space in the constant pool:
- // i128 -170141183420855150465331762880109871104
- //
- // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160>
- //
- // <4 x i32> <i32 -2147483648, i32 -2147483648,
- // i32 -2147483648, i32 -2147483648>
-
- unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-
- if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512.
- return;
-
- // Only support vector types.
- if (!MaskTy->isVectorTy())
- return;
-
- // Make sure its an integer type.
- Type *VecEltTy = MaskTy->getVectorElementType();
- if (!VecEltTy->isIntegerTy())
- return;
-
- // Support any element type from byte up to element size.
- // This is necesary primarily because 64-bit elements get split to 32-bit
- // in the constant pool on 32-bit target.
- unsigned EltTySize = VecEltTy->getIntegerBitWidth();
- if (EltTySize < 8 || EltTySize > ElSize)
- return;
-
- unsigned NumElements = MaskTySize / ElSize;
- assert((NumElements == 2 || NumElements == 4 || NumElements == 8) &&
- "Unexpected number of vector elements.");
- ShuffleMask.reserve(NumElements);
- unsigned NumElementsPerLane = 128 / ElSize;
- unsigned Factor = ElSize / EltTySize;
-
- for (unsigned i = 0; i < NumElements; ++i) {
- Constant *COp = C->getAggregateElement(i * Factor);
- if (!COp) {
- ShuffleMask.clear();
- return;
- } else if (isa<UndefValue>(COp)) {
- ShuffleMask.push_back(SM_SentinelUndef);
- continue;
- }
- int Index = i & ~(NumElementsPerLane - 1);
- uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
- if (ElSize == 64)
- Index += (Element >> 1) & 0x1;
- else
- Index += Element & 0x3;
- ShuffleMask.push_back(Index);
- }
-
- // TODO: Handle funny-looking vectors too.
-}
-
void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, SmallVectorImpl<int> &Mask) {
unsigned NumDstElts = DstVT.getVectorNumElements();
unsigned SrcScalarBits = SrcVT.getScalarSizeInBits();
@@ -572,58 +461,4 @@ void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask,
}
}
-void DecodeVPERMVMask(const Constant *C, MVT VT,
- SmallVectorImpl<int> &ShuffleMask) {
- Type *MaskTy = C->getType();
- if (MaskTy->isVectorTy()) {
- unsigned NumElements = MaskTy->getVectorNumElements();
- if (NumElements == VT.getVectorNumElements()) {
- for (unsigned i = 0; i < NumElements; ++i) {
- Constant *COp = C->getAggregateElement(i);
- if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) {
- ShuffleMask.clear();
- return;
- }
- if (isa<UndefValue>(COp))
- ShuffleMask.push_back(SM_SentinelUndef);
- else {
- uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
- Element &= (1 << NumElements) - 1;
- ShuffleMask.push_back(Element);
- }
- }
- }
- return;
- }
- // Scalar value; just broadcast it
- if (!isa<ConstantInt>(C))
- return;
- uint64_t Element = cast<ConstantInt>(C)->getZExtValue();
- int NumElements = VT.getVectorNumElements();
- Element &= (1 << NumElements) - 1;
- for (int i = 0; i < NumElements; ++i)
- ShuffleMask.push_back(Element);
-}
-
-void DecodeVPERMV3Mask(const Constant *C, MVT VT,
- SmallVectorImpl<int> &ShuffleMask) {
- Type *MaskTy = C->getType();
- unsigned NumElements = MaskTy->getVectorNumElements();
- if (NumElements == VT.getVectorNumElements()) {
- for (unsigned i = 0; i < NumElements; ++i) {
- Constant *COp = C->getAggregateElement(i);
- if (!COp) {
- ShuffleMask.clear();
- return;
- }
- if (isa<UndefValue>(COp))
- ShuffleMask.push_back(SM_SentinelUndef);
- else {
- uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
- Element &= (1 << NumElements*2) - 1;
- ShuffleMask.push_back(Element);
- }
- }
- }
-}
} // llvm namespace
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index ab18e6438ec9..72db6a81912b 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -23,7 +23,6 @@
//===----------------------------------------------------------------------===//
namespace llvm {
-class Constant;
class MVT;
enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };
@@ -72,9 +71,6 @@ void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
/// different datatypes and vector widths.
void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
-/// \brief Decode a PSHUFB mask from an IR-level vector constant.
-void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
-
/// \brief Decode a PSHUFB mask from a raw array of constants such as from
/// BUILD_VECTOR.
void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
@@ -95,10 +91,6 @@ void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
/// No VT provided since it only works on 256-bit, 4 element vectors.
void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
-/// \brief Decode a VPERMILP variable mask from an IR-level vector constant.
-void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
- SmallVectorImpl<int> &ShuffleMask);
-
/// \brief Decode a zero extension instruction as a shuffle mask.
void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT,
SmallVectorImpl<int> &ShuffleMask);
@@ -118,18 +110,10 @@ void DecodeEXTRQIMask(int Len, int Idx,
void DecodeINSERTQIMask(int Len, int Idx,
SmallVectorImpl<int> &ShuffleMask);
-/// \brief Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant.
-void DecodeVPERMVMask(const Constant *C, MVT VT,
- SmallVectorImpl<int> &ShuffleMask);
-
/// \brief Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask);
-/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant.
-void DecodeVPERMV3Mask(const Constant *C, MVT VT,
- SmallVectorImpl<int> &ShuffleMask);
-
/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask,
SmallVectorImpl<int> &ShuffleMask);
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index de94a138d865..629d4d3565f2 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -1098,9 +1098,9 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
RetRegs.push_back(VA.getLocReg());
}
- // All x86 ABIs require that for returning structs by value we copy
- // the sret argument into %rax/%eax (depending on ABI) for the return.
- // We saved the argument into a virtual register in the entry block,
+ // All x86 ABIs require that for returning structs by value we copy
+ // the sret argument into %rax/%eax (depending on ABI) for the return.
+ // We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into %rax/%eax.
if (F.hasStructRetAttr()) {
unsigned Reg = X86MFInfo->getSRetReturnReg();
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 242d0333ef9a..8b5fd27b4775 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -78,27 +78,6 @@ X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
}
-/// usesTheStack - This function checks if any of the users of EFLAGS
-/// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has
-/// to use the stack, and if we don't adjust the stack we clobber the first
-/// frame index.
-/// See X86InstrInfo::copyPhysReg.
-static bool usesTheStack(const MachineFunction &MF) {
- const MachineRegisterInfo &MRI = MF.getRegInfo();
-
- // Conservativley assume that inline assembly might use the stack.
- if (MF.hasInlineAsm())
- return true;
-
- return any_of(MRI.reg_instructions(X86::EFLAGS),
- [](const MachineInstr &RI) { return RI.isCopy(); });
-}
-
-static bool doesStackUseImplyFP(const MachineFunction &MF) {
- bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
- return IsWin64Prologue && usesTheStack(MF);
-}
-
/// hasFP - Return true if the specified function should have a dedicated frame
/// pointer register. This is true if the function has variable sized allocas
/// or if frame pointer elimination is disabled.
@@ -112,8 +91,7 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
MFI->isFrameAddressTaken() || MFI->hasOpaqueSPAdjustment() ||
MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
MMI.callsUnwindInit() || MMI.hasEHFunclets() || MMI.callsEHReturn() ||
- MFI->hasStackMap() || MFI->hasPatchPoint() ||
- doesStackUseImplyFP(MF));
+ MFI->hasStackMap() || MFI->hasPatchPoint());
}
static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) {
@@ -965,11 +943,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// push and pop from the stack.
if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) &&
!TRI->needsStackRealignment(MF) &&
- !MFI->hasVarSizedObjects() && // No dynamic alloca.
- !MFI->adjustsStack() && // No calls.
- !IsWin64CC && // Win64 has no Red Zone
- !usesTheStack(MF) && // Don't push and pop.
- !MF.shouldSplitStack()) { // Regular stack
+ !MFI->hasVarSizedObjects() && // No dynamic alloca.
+ !MFI->adjustsStack() && // No calls.
+ !IsWin64CC && // Win64 has no Red Zone
+ !MFI->hasOpaqueSPAdjustment() && // Don't push and pop.
+ !MF.shouldSplitStack()) { // Regular stack
uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
if (HasFP) MinSize += SlotSize;
StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 4414e478b99b..868ae4e19e55 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -157,13 +157,9 @@ namespace {
/// performance.
bool OptForSize;
- /// If true, selector should try to optimize for minimum code size.
- bool OptForMinSize;
-
public:
explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
- : SelectionDAGISel(tm, OptLevel), OptForSize(false),
- OptForMinSize(false) {}
+ : SelectionDAGISel(tm, OptLevel), OptForSize(false) {}
const char *getPassName() const override {
return "X86 DAG->DAG Instruction Selection";
@@ -535,10 +531,8 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
}
void X86DAGToDAGISel::PreprocessISelDAG() {
- // OptFor[Min]Size are used in pattern predicates that isel is matching.
+ // OptForSize is used in pattern predicates that isel is matching.
OptForSize = MF->getFunction()->optForSize();
- OptForMinSize = MF->getFunction()->optForMinSize();
- assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize");
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end(); I != E; ) {
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0927c2f4fa50..d31aab0fa141 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -18,6 +18,7 @@
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
+#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
@@ -4556,6 +4557,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
+ Result = DAG.getBitcast(CastVT, Result);
Vec256 = DAG.getBitcast(CastVT, Vec256);
Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
return DAG.getBitcast(ResultVT, Vec256);
@@ -4851,8 +4853,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
DecodePSHUFBMask(C, Mask);
- if (Mask.empty())
- return false;
break;
}
@@ -4870,7 +4870,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
case X86ISD::VPERM2X128:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
- if (Mask.empty()) return false;
// Mask only contains negative index if an element is zero.
if (std::any_of(Mask.begin(), Mask.end(),
[](int M){ return M == SM_SentinelZero; }))
@@ -4948,8 +4947,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
DecodeVPERMVMask(C, VT, Mask);
- if (Mask.empty())
- return false;
break;
}
return false;
@@ -5000,8 +4997,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
DecodeVPERMV3Mask(C, VT, Mask);
- if (Mask.empty())
- return false;
break;
}
return false;
@@ -5009,6 +5004,10 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
default: llvm_unreachable("unknown target shuffle node");
}
+ // Empty mask indicates the decode failed.
+ if (Mask.empty())
+ return false;
+
// If we have a fake unary shuffle, the shuffle mask is spread across two
// inputs that are actually the same node. Re-map the mask to always point
// into the first input.
@@ -17372,6 +17371,18 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
if (!IntrData) {
if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
return MarkEHRegistrationNode(Op, DAG);
+ if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
+ IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
+ IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
+ IntNo == llvm::Intrinsic::x86_flags_write_u64) {
+ // We need a frame pointer because this will get lowered to a PUSH/POP
+ // sequence.
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setHasOpaqueSPAdjustment(true);
+ // Don't do anything here, we will expand these intrinsics out later
+ // during ExpandISelPseudos in EmitInstrWithCustomInserter.
+ return SDValue();
+ }
return SDValue();
}
@@ -21144,6 +21155,47 @@ static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
return BB;
}
+static MachineBasicBlock *EmitWRPKRU(MachineInstr *MI, MachineBasicBlock *BB,
+ const X86Subtarget *Subtarget) {
+ DebugLoc dl = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+ // insert input VAL into EAX
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
+ .addReg(MI->getOperand(0).getReg());
+ // insert zero to ECX
+ BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX)
+ .addReg(X86::ECX)
+ .addReg(X86::ECX);
+ // insert zero to EDX
+ BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::EDX)
+ .addReg(X86::EDX)
+ .addReg(X86::EDX);
+ // insert WRPKRU instruction
+ BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
+
+ MI->eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
+
+static MachineBasicBlock *EmitRDPKRU(MachineInstr *MI, MachineBasicBlock *BB,
+ const X86Subtarget *Subtarget) {
+ DebugLoc dl = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+
+ // insert zero to ECX
+ BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX)
+ .addReg(X86::ECX)
+ .addReg(X86::ECX);
+ // insert RDPKRU instruction
+ BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+ .addReg(X86::EAX);
+
+ MI->eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
+
static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
const X86Subtarget *Subtarget) {
DebugLoc dl = MI->getDebugLoc();
@@ -22495,6 +22547,36 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::CMOV_V64I1:
return EmitLoweredSelect(MI, BB);
+ case X86::RDFLAGS32:
+ case X86::RDFLAGS64: {
+ DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ unsigned PushF =
+ MI->getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
+ unsigned Pop =
+ MI->getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
+ BuildMI(*BB, MI, DL, TII->get(PushF));
+ BuildMI(*BB, MI, DL, TII->get(Pop), MI->getOperand(0).getReg());
+
+ MI->eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+
+ case X86::WRFLAGS32:
+ case X86::WRFLAGS64: {
+ DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ unsigned Push =
+ MI->getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
+ unsigned PopF =
+ MI->getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
+ BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI->getOperand(0).getReg());
+ BuildMI(*BB, MI, DL, TII->get(PopF));
+
+ MI->eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+
case X86::RELEASE_FADD32mr:
case X86::RELEASE_FADD64mr:
return EmitLoweredAtomicFP(MI, BB);
@@ -22611,7 +22693,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// Thread synchronization.
case X86::MONITOR:
return EmitMonitor(MI, BB, Subtarget);
-
+ // PKU feature
+ case X86::WRPKRU:
+ return EmitWRPKRU(MI, BB, Subtarget);
+ case X86::RDPKRU:
+ return EmitRDPKRU(MI, BB, Subtarget);
// xbegin
case X86::XBEGIN:
return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
@@ -23480,6 +23566,31 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
}
return SDValue();
}
+ case X86ISD::BLENDI: {
+ SDValue V0 = N->getOperand(0);
+ SDValue V1 = N->getOperand(1);
+ assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
+ "Unexpected input vector types");
+
+ // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
+ // operands and changing the mask to 1. This saves us a bunch of
+ // pattern-matching possibilities related to scalar math ops in SSE/AVX.
+ // x86InstrInfo knows how to commute this back after instruction selection
+ // if it would help register allocation.
+
+ // TODO: If optimizing for size or a processor that doesn't suffer from
+ // partial register update stalls, this should be transformed into a MOVSD
+ // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
+
+ if (VT == MVT::v2f64)
+ if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+ if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
+ SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
+ }
+
+ return SDValue();
+ }
default:
return SDValue();
}
@@ -23573,9 +23684,13 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
/// the operands which explicitly discard the lanes which are unused by this
/// operation to try to flow through the rest of the combiner the fact that
/// they're unused.
-static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
+ if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
+ (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
+ return SDValue();
// We only handle target-independent shuffles.
// FIXME: It would be easy and harmless to use the target shuffle mask
@@ -23617,12 +23732,6 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
return SDValue();
- // Only specific types are legal at this point, assert so we notice if and
- // when these change.
- assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
- VT == MVT::v4f64) &&
- "Unknown vector type encountered!");
-
return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
}
@@ -23642,8 +23751,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
// If we have legalized the vector types, look for blends of FADD and FSUB
// nodes that we can fuse into an ADDSUB node.
- if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
- if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
+ if (TLI.isTypeLegal(VT))
+ if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
return AddSub;
// Combine 256-bit vector shuffles. This is only profitable when in AVX mode
@@ -27310,7 +27419,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
// from AH (which we otherwise need to do contortions to access).
if (N0.getOpcode() == ISD::UDIVREM &&
N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
- (VT == MVT::i32 || VT == MVT::i64)) {
+ VT == MVT::i32) {
SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
N0.getOperand(0), N0.getOperand(1));
@@ -27382,32 +27491,6 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
- SDValue V0 = N->getOperand(0);
- SDValue V1 = N->getOperand(1);
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
-
- // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
- // operands and changing the mask to 1. This saves us a bunch of
- // pattern-matching possibilities related to scalar math ops in SSE/AVX.
- // x86InstrInfo knows how to commute this back after instruction selection
- // if it would help register allocation.
-
- // TODO: If optimizing for size or a processor that doesn't suffer from
- // partial register update stalls, this should be transformed into a MOVSD
- // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
-
- if (VT == MVT::v2f64)
- if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
- if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
- SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
- return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
- }
-
- return SDValue();
-}
-
static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
// Gather and Scatter instructions use k-registers for masks. The type of
@@ -27840,6 +27923,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget);
case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
+// TODO: refactor the [SU]DIVREM8_[SZ]EXT_HREG code so that it's not duplicated.
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
@@ -27851,6 +27935,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::PALIGNR:
+ case X86ISD::BLENDI:
case X86ISD::UNPCKH:
case X86ISD::UNPCKL:
case X86ISD::MOVHLPS:
@@ -27865,7 +27950,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::VPERM2X128:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
- case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG);
case ISD::MGATHER:
case ISD::MSCATTER: return PerformGatherScatterCombine(N, DAG);
}
@@ -27902,6 +27986,18 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
}
}
+/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
+/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
+/// we don't adjust the stack we clobber the first frame index.
+/// See X86InstrInfo::copyPhysReg.
+bool X86TargetLowering::hasCopyImplyingStackAdjustment(
+ MachineFunction *MF) const {
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ return any_of(MRI.reg_instructions(X86::EFLAGS),
+ [](const MachineInstr &RI) { return RI.isCopy(); });
+}
+
/// IsDesirableToPromoteOp - This method query the target whether it is
/// beneficial for dag combiner to promote the specified node. If true, it
/// should return the desired promotion type by reference.
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index a29dc9af54f6..8bb0e5f8bd36 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -697,6 +697,10 @@ namespace llvm {
/// and some i16 instructions are slow.
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
+ /// Return true if the MachineFunction contains a COPY which would imply
+ /// HasOpaqueSPAdjustment.
+ bool hasCopyImplyingStackAdjustment(MachineFunction *MF) const override;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *MBB) const override;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 8bf2925a75db..0a27c33f033e 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -2366,6 +2366,7 @@ def : Pat<(xor (xor VK1:$src1, VK1:$src2), (i1 1)),
multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT,
RegisterClass KRCSrc, Predicate prd> {
let Predicates = [prd] in {
+ let hasSideEffects = 0 in
def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst),
(ins KRC:$src1, KRC:$src2),
"kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 5d7283f7bd57..96a29ca8c370 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -250,7 +250,7 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
// Alias instruction mapping movr0 to xor.
// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
- isPseudo = 1, AddedComplexity = 20 in
+ isPseudo = 1 in
def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
[(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
@@ -263,7 +263,7 @@ def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
}
let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
- AddedComplexity = 15 in {
+ AddedComplexity = 1 in {
// Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC,
// which only require 3 bytes compared to MOV32ri which requires 5.
let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in {
@@ -278,24 +278,12 @@ let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>;
}
-let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 10 in {
-// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1.
-// FIXME: Add itinerary class and Schedule.
-def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "",
- [(set GR32:$dst, i32immSExt8:$src)]>,
- Requires<[OptForMinSize]>;
-def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "",
- [(set GR64:$dst, i64immSExt8:$src)]>,
- Requires<[OptForMinSize, NotWin64WithoutFP]>;
-}
-
// Materialize i64 constant where top 32-bits are zero. This could theoretically
// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
// that would make it more difficult to rematerialize.
-let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1,
- isCodeGenOnly = 1, hasSideEffects = 0 in
-def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src),
- "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>;
+let isReMaterializable = 1, isAsCheapAsAMove = 1,
+ isPseudo = 1, hasSideEffects = 0 in
+def MOV32ri64 : I<0, Pseudo, (outs GR32:$dst), (ins i64i32imm:$src), "", []>;
// This 64-bit pseudo-move can be used for both a 64-bit constant that is
// actually the zero-extension of a 32-bit constant and for labels in the
@@ -566,8 +554,8 @@ let usesCustomInserter = 1, Uses = [EFLAGS] in {
// TODO: Get this to fold the constant into the instruction.
let isCodeGenOnly = 1, Defs = [EFLAGS] in
def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
- "or{l}\t{$zero, $dst|$dst, $zero}",
- [], IIC_ALU_MEM>, Requires<[Not64BitMode]>, LOCK,
+ "or{l}\t{$zero, $dst|$dst, $zero}", [],
+ IIC_ALU_MEM>, Requires<[Not64BitMode]>, OpSize32, LOCK,
Sched<[WriteALULd, WriteRMW]>;
let hasSideEffects = 1 in
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 63e78de69bc9..246804e34289 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -23,7 +23,6 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DerivedTypes.h"
@@ -4453,7 +4452,8 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// such as TF/IF/DF, which LLVM doesn't model.
//
// Notice that we have to adjust the stack if we don't want to clobber the
- // first frame index. See X86FrameLowering.cpp - usesTheStack.
+ // first frame index.
+ // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment.
bool AXDead = (Reg == AX) ||
@@ -4465,6 +4465,10 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// (unnecessarily) saving+restoring a dead register. However the
// MachineVerifier expects operands that read from dead registers
// to be marked with the "undef" flag.
+ // An example of this can be found in
+ // test/CodeGen/X86/peephole-na-phys-copy-folding.ll and
+ // test/CodeGen/X86/cmpxchg-clobber-flags.ll when using
+ // -verify-machineinstrs.
BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true));
}
if (FromEFLAGS) {
@@ -5309,50 +5313,6 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
return true;
}
-bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const {
- MachineBasicBlock &MBB = *MIB->getParent();
- DebugLoc DL = MIB->getDebugLoc();
- int64_t Imm = MIB->getOperand(1).getImm();
- assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
- MachineBasicBlock::iterator I = MIB.getInstr();
-
- int StackAdjustment;
-
- if (Subtarget.is64Bit()) {
- assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
- MIB->getOpcode() == X86::MOV32ImmSExti8);
- // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
- // widen the register if necessary.
- StackAdjustment = 8;
- BuildMI(MBB, I, DL, get(X86::PUSH64i8)).addImm(Imm);
- MIB->setDesc(get(X86::POP64r));
- MIB->getOperand(0)
- .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64));
- } else {
- assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
- StackAdjustment = 4;
- BuildMI(MBB, I, DL, get(X86::PUSH32i8)).addImm(Imm);
- MIB->setDesc(get(X86::POP32r));
- }
-
- // Build CFI if necessary.
- MachineFunction &MF = *MBB.getParent();
- const X86FrameLowering *TFL = Subtarget.getFrameLowering();
- bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
- bool NeedsDwarfCFI =
- !IsWin64Prologue &&
- (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry());
- bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
- if (EmitCFI) {
- TFL->BuildCFI(MBB, I, DL,
- MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
- TFL->BuildCFI(MBB, std::next(I), DL,
- MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
- }
-
- return true;
-}
-
// LoadStackGuard has so far only been implemented for 64-bit MachO. Different
// code sequence is needed for other targets.
static void expandLoadStackGuard(MachineInstrBuilder &MIB,
@@ -5385,9 +5345,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
case X86::MOV32r_1:
return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
- case X86::MOV32ImmSExti8:
- case X86::MOV64ImmSExti8:
- return ExpandMOVImmSExti8(MIB);
case X86::SETB_C8r:
return Expand2AddrUndef(MIB, get(X86::SBB8rr));
case X86::SETB_C16r:
@@ -5412,7 +5369,10 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
case X86::TEST8ri_NOREX:
MI->setDesc(get(X86::TEST8ri));
return true;
-
+ case X86::MOV32ri64:
+ MI->setDesc(get(X86::MOV32ri));
+ return true;
+
// KNL does not recognize dependency-breaking idioms for mask registers,
// so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
// Using %k0 as the undef input register is a performance heuristic based
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 9d40334206b2..edd09d617595 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -23,7 +23,6 @@
#include "X86GenInstrInfo.inc"
namespace llvm {
- class MachineInstrBuilder;
class X86RegisterInfo;
class X86Subtarget;
@@ -565,9 +564,6 @@ private:
/// operand and follow operands form a reference to the stack frame.
bool isFrameOperand(const MachineInstr *MI, unsigned int Op,
int &FrameIndex) const;
-
- /// Expand the MOVImmSExti8 pseudo-instructions.
- bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const;
};
} // End llvm namespace
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index f4ca2b880bad..ea8e56206ce6 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -822,8 +822,6 @@ def In32BitMode : Predicate<"Subtarget->is32Bit()">,
AssemblerPredicate<"Mode32Bit", "32-bit mode">;
def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">;
-def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||"
- "Subtarget->getFrameLowering()->hasFP(*MF)">;
def IsPS4 : Predicate<"Subtarget->isTargetPS4()">;
def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">;
def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
@@ -837,7 +835,6 @@ def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">;
def IsNotPIC : Predicate<"TM.getRelocationModel() != Reloc::PIC_">;
def OptForSize : Predicate<"OptForSize">;
-def OptForMinSize : Predicate<"OptForMinSize">;
def OptForSpeed : Predicate<"!OptForSize">;
def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
@@ -1093,6 +1090,32 @@ def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[],
}
+let mayLoad = 1, mayStore = 1, usesCustomInserter = 1,
+ SchedRW = [WriteRMW], Defs = [ESP] in {
+ let Uses = [ESP, EFLAGS] in
+ def RDFLAGS32 : PseudoI<(outs GR32:$dst), (ins),
+ [(set GR32:$dst, (int_x86_flags_read_u32))]>,
+ Requires<[Not64BitMode]>;
+
+ let Uses = [RSP, EFLAGS] in
+ def RDFLAGS64 : PseudoI<(outs GR64:$dst), (ins),
+ [(set GR64:$dst, (int_x86_flags_read_u64))]>,
+ Requires<[In64BitMode]>;
+}
+
+let mayLoad = 1, mayStore = 1, usesCustomInserter = 1,
+ SchedRW = [WriteRMW] in {
+ let Defs = [ESP, EFLAGS], Uses = [ESP] in
+ def WRFLAGS32 : PseudoI<(outs), (ins GR32:$src),
+ [(int_x86_flags_write_u32 GR32:$src)]>,
+ Requires<[Not64BitMode]>;
+
+ let Defs = [RSP, EFLAGS], Uses = [RSP] in
+ def WRFLAGS64 : PseudoI<(outs), (ins GR64:$src),
+ [(int_x86_flags_write_u64 GR64:$src)]>,
+ Requires<[In64BitMode]>;
+}
+
let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0,
SchedRW = [WriteLoad] in {
def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>,
@@ -1133,7 +1156,8 @@ def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [],
let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1,
SchedRW = [WriteStore] in {
def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm),
- "push{q}\t$imm", [], IIC_PUSH_IMM>, Requires<[In64BitMode]>;
+ "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
+ Requires<[In64BitMode]>;
def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm),
"push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32,
Requires<[In64BitMode]>;
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 11dc1e7d466b..83f9b1409f61 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -651,7 +651,7 @@ def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))),
// Misc.
let SchedRW = [WriteShuffle] in {
-let Uses = [EDI], Predicates = [HasSSE1,In32BitMode] in
+let Uses = [EDI], Predicates = [HasSSE1,Not64BitMode] in
def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
"maskmovq\t{$mask, $src|$src, $mask}",
[(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)],
diff --git a/lib/Target/X86/X86InstrMPX.td b/lib/Target/X86/X86InstrMPX.td
index cf5e2e38fe58..31608cd4c128 100644
--- a/lib/Target/X86/X86InstrMPX.td
+++ b/lib/Target/X86/X86InstrMPX.td
@@ -63,8 +63,8 @@ def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs i128mem:$dst), (ins BNDR:$src),
Requires<[HasMPX, In64BitMode]>;
def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
- "bndstx \t{$src, $dst|$dst, $src}", []>, TB,
+ "bndstx \t{$src, $dst|$dst, $src}", []>, PS,
Requires<[HasMPX]>;
def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
- "bndldx \t{$src, $dst|$dst, $src}", []>, TB,
- Requires<[HasMPX]>; \ No newline at end of file
+ "bndldx \t{$src, $dst|$dst, $src}", []>, PS,
+ Requires<[HasMPX]>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 7a44212bd829..624b9316e6fd 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1466,6 +1466,8 @@ def SSE_CVT_SD2SI : OpndItins<
IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
>;
+// FIXME: We probably want to match the rm form only when optimizing for
+// size, to avoid false dependencies (see sse_fp_unop_s for details)
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
string asm, OpndItins itins> {
@@ -1489,6 +1491,8 @@ let hasSideEffects = 0 in {
}
}
+// FIXME: We probably want to match the rm form only when optimizing for
+// size, to avoid false dependencies (see sse_fp_unop_s for details)
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
X86MemOperand x86memop, string asm> {
let hasSideEffects = 0, Predicates = [UseAVX] in {
@@ -1626,6 +1630,8 @@ def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).
+// FIXME: We probably want to match the rm form only when optimizing for
+// size, to avoid false dependencies (see sse_fp_unop_s for details)
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
string asm, OpndItins itins> {
@@ -3387,9 +3393,18 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
def : Pat<(Intr (load addr:$src)),
(vt (COPY_TO_REGCLASS(!cast<Instruction>(NAME#Suffix##m)
addr:$src), VR128))>;
- def : Pat<(Intr mem_cpat:$src),
- (!cast<Instruction>(NAME#Suffix##m_Int)
- (vt (IMPLICIT_DEF)), mem_cpat:$src)>;
+ }
+ // We don't want to fold scalar loads into these instructions unless
+ // optimizing for size. This is because the folded instruction will have a
+ // partial register update, while the unfolded sequence will not, e.g.
+ // movss mem, %xmm0
+ // rcpss %xmm0, %xmm0
+ // which has a clobber before the rcp, vs.
+ // rcpss mem, %xmm0
+ let Predicates = [target, OptForSize] in {
+ def : Pat<(Intr mem_cpat:$src),
+ (!cast<Instruction>(NAME#Suffix##m_Int)
+ (vt (IMPLICIT_DEF)), mem_cpat:$src)>;
}
}
@@ -3420,28 +3435,37 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
}
}
+ // We don't want to fold scalar loads into these instructions unless
+ // optimizing for size. This is because the folded instruction will have a
+ // partial register update, while the unfolded sequence will not, e.g.
+ // vmovss mem, %xmm0
+ // vrcpss %xmm0, %xmm0, %xmm0
+ // which has a clobber before the rcp, vs.
+ // vrcpss mem, %xmm0, %xmm0
+ // TODO: In theory, we could fold the load, and avoid the stall caused by
+ // the partial register store, either in ExeDepFix or with smarter RA.
let Predicates = [UseAVX] in {
def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r)
(ScalarVT (IMPLICIT_DEF)), RC:$src)>;
-
- def : Pat<(vt (OpNode mem_cpat:$src)),
- (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)),
- mem_cpat:$src)>;
-
}
let Predicates = [HasAVX] in {
def : Pat<(Intr VR128:$src),
(!cast<Instruction>("V"#NAME#Suffix##r_Int) (vt (IMPLICIT_DEF)),
VR128:$src)>;
-
- def : Pat<(Intr mem_cpat:$src),
- (!cast<Instruction>("V"#NAME#Suffix##m_Int)
+ }
+ let Predicates = [HasAVX, OptForSize] in {
+ def : Pat<(Intr mem_cpat:$src),
+ (!cast<Instruction>("V"#NAME#Suffix##m_Int)
(vt (IMPLICIT_DEF)), mem_cpat:$src)>;
}
- let Predicates = [UseAVX, OptForSize] in
- def : Pat<(ScalarVT (OpNode (load addr:$src))),
- (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
- addr:$src)>;
+ let Predicates = [UseAVX, OptForSize] in {
+ def : Pat<(ScalarVT (OpNode (load addr:$src))),
+ (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
+ addr:$src)>;
+ def : Pat<(vt (OpNode mem_cpat:$src)),
+ (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)),
+ mem_cpat:$src)>;
+ }
}
/// sse1_fp_unop_p - SSE1 unops in packed form.
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 85e17f516f91..a97d1e5c86d0 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -498,10 +498,10 @@ let Predicates = [HasXSAVE] in {
let Predicates = [HasXSAVEOPT] in {
def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaque512mem:$dst),
"xsaveopt\t$dst",
- [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, TB;
+ [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, PS;
def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaque512mem:$dst),
"xsaveopt64\t$dst",
- [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
+ [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[In64BitMode]>;
}
let Predicates = [HasXSAVEC] in {
def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaque512mem:$dst),
@@ -551,10 +551,17 @@ let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB;
//==-----------------------------------------------------------------------===//
// PKU - enable protection key
+let usesCustomInserter = 1 in {
+ def WRPKRU : PseudoI<(outs), (ins GR32:$src),
+ [(int_x86_wrpkru GR32:$src)]>;
+ def RDPKRU : PseudoI<(outs GR32:$dst), (ins),
+ [(set GR32:$dst, (int_x86_rdpkru))]>;
+}
+
let Defs = [EAX, EDX], Uses = [ECX] in
- def RDPKRU : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB;
+ def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB;
let Uses = [EAX, ECX, EDX] in
- def WRPKRU : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB;
+ def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB;
//===----------------------------------------------------------------------===//
// FS/GS Base Instructions
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index dc6d85d582c8..646b556faa8f 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1208,19 +1208,55 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_pshuf_b_512, INTR_TYPE_2OP_MASK,
X86ISD::PSHUFB, 0),
X86_INTRINSIC_DATA(avx512_mask_psll_d, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
X86_INTRINSIC_DATA(avx512_mask_psll_q, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psll_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0),
X86_INTRINSIC_DATA(avx512_mask_pslli_d, VSHIFT_MASK, X86ISD::VSHLI, 0),
X86_INTRINSIC_DATA(avx512_mask_pslli_q, VSHIFT_MASK, X86ISD::VSHLI, 0),
X86_INTRINSIC_DATA(avx512_mask_psllv_d, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
X86_INTRINSIC_DATA(avx512_mask_psllv_q, INTR_TYPE_2OP_MASK, ISD::SHL, 0),
X86_INTRINSIC_DATA(avx512_mask_psra_d, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
X86_INTRINSIC_DATA(avx512_mask_psra_q, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psra_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0),
X86_INTRINSIC_DATA(avx512_mask_psrai_d, VSHIFT_MASK, X86ISD::VSRAI, 0),
X86_INTRINSIC_DATA(avx512_mask_psrai_q, VSHIFT_MASK, X86ISD::VSRAI, 0),
X86_INTRINSIC_DATA(avx512_mask_psrav_d, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, ISD::SRA, 0),
X86_INTRINSIC_DATA(avx512_mask_psrl_d, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
X86_INTRINSIC_DATA(avx512_mask_psrl_q, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrl_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
X86_INTRINSIC_DATA(avx512_mask_psrl_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
X86_INTRINSIC_DATA(avx512_mask_psrl_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
X86_INTRINSIC_DATA(avx512_mask_psrl_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0),
@@ -1229,6 +1265,13 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_psrl_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0),
X86_INTRINSIC_DATA(avx512_mask_psrli_d, VSHIFT_MASK, X86ISD::VSRLI, 0),
X86_INTRINSIC_DATA(avx512_mask_psrli_q, VSHIFT_MASK, X86ISD::VSRLI, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrlv16_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrlv2_di, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrlv32hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrlv4_di, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrlv4_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrlv8_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
+ X86_INTRINSIC_DATA(avx512_mask_psrlv8_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
X86_INTRINSIC_DATA(avx512_mask_psrlv_d, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
X86_INTRINSIC_DATA(avx512_mask_psrlv_q, INTR_TYPE_2OP_MASK, ISD::SRL, 0),
X86_INTRINSIC_DATA(avx512_mask_psub_b_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0),
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index e186f7039b43..e1ca558f0f2c 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -14,6 +14,7 @@
#include "X86AsmPrinter.h"
#include "X86RegisterInfo.h"
+#include "X86ShuffleDecodeConstantPool.h"
#include "InstPrinter/X86ATTInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "Utils/X86ShuffleDecode.h"
@@ -454,10 +455,6 @@ ReSimplify:
"LEA has segment specified!");
break;
- case X86::MOV32ri64:
- OutMI.setOpcode(X86::MOV32ri);
- break;
-
// Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
// if one of the registers is extended, but other isn't.
case X86::VMOVZPQILo2PQIrr:
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
new file mode 100644
index 000000000000..ef16c5bdbfd8
--- /dev/null
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -0,0 +1,190 @@
+//===-- X86ShuffleDecodeConstantPool.cpp - X86 shuffle decode -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics using
+// constants from the constant pool.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ShuffleDecodeConstantPool.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/IR/Constants.h"
+
+//===----------------------------------------------------------------------===//
+// Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+ Type *MaskTy = C->getType();
+ // It is not an error for the PSHUFB mask to not be a vector of i8 because the
+ // constant pool uniques constants by their bit representation.
+ // e.g. the following take up the same space in the constant pool:
+ // i128 -170141183420855150465331762880109871104
+ //
+ // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160>
+ //
+ // <4 x i32> <i32 -2147483648, i32 -2147483648,
+ // i32 -2147483648, i32 -2147483648>
+
+#ifndef NDEBUG
+ unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+ assert(MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512);
+#endif
+
+ // This is a straightforward byte vector.
+ if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) {
+ int NumElements = MaskTy->getVectorNumElements();
+ ShuffleMask.reserve(NumElements);
+
+ for (int i = 0; i < NumElements; ++i) {
+ // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte
+ // lane of the vector we're inside.
+ int Base = i & ~0xf;
+ Constant *COp = C->getAggregateElement(i);
+ if (!COp) {
+ ShuffleMask.clear();
+ return;
+ } else if (isa<UndefValue>(COp)) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+ uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+ // If the high bit (7) of the byte is set, the element is zeroed.
+ if (Element & (1 << 7))
+ ShuffleMask.push_back(SM_SentinelZero);
+ else {
+ // Only the least significant 4 bits of the byte are used.
+ int Index = Base + (Element & 0xf);
+ ShuffleMask.push_back(Index);
+ }
+ }
+ }
+ // TODO: Handle funny-looking vectors too.
+}
+
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
+ SmallVectorImpl<int> &ShuffleMask) {
+ Type *MaskTy = C->getType();
+ // It is not an error for the PSHUFB mask to not be a vector of i8 because the
+ // constant pool uniques constants by their bit representation.
+ // e.g. the following take up the same space in the constant pool:
+ // i128 -170141183420855150465331762880109871104
+ //
+ // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160>
+ //
+ // <4 x i32> <i32 -2147483648, i32 -2147483648,
+ // i32 -2147483648, i32 -2147483648>
+
+ unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+
+ if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512.
+ return;
+
+ // Only support vector types.
+ if (!MaskTy->isVectorTy())
+ return;
+
+ // Make sure its an integer type.
+ Type *VecEltTy = MaskTy->getVectorElementType();
+ if (!VecEltTy->isIntegerTy())
+ return;
+
+ // Support any element type from byte up to element size.
+  // This is necessary primarily because 64-bit elements get split to 32-bit
+ // in the constant pool on 32-bit target.
+ unsigned EltTySize = VecEltTy->getIntegerBitWidth();
+ if (EltTySize < 8 || EltTySize > ElSize)
+ return;
+
+ unsigned NumElements = MaskTySize / ElSize;
+ assert((NumElements == 2 || NumElements == 4 || NumElements == 8) &&
+ "Unexpected number of vector elements.");
+ ShuffleMask.reserve(NumElements);
+ unsigned NumElementsPerLane = 128 / ElSize;
+ unsigned Factor = ElSize / EltTySize;
+
+ for (unsigned i = 0; i < NumElements; ++i) {
+ Constant *COp = C->getAggregateElement(i * Factor);
+ if (!COp) {
+ ShuffleMask.clear();
+ return;
+ } else if (isa<UndefValue>(COp)) {
+ ShuffleMask.push_back(SM_SentinelUndef);
+ continue;
+ }
+ int Index = i & ~(NumElementsPerLane - 1);
+ uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+ if (ElSize == 64)
+ Index += (Element >> 1) & 0x1;
+ else
+ Index += Element & 0x3;
+ ShuffleMask.push_back(Index);
+ }
+
+ // TODO: Handle funny-looking vectors too.
+}
+
+void DecodeVPERMVMask(const Constant *C, MVT VT,
+ SmallVectorImpl<int> &ShuffleMask) {
+ Type *MaskTy = C->getType();
+ if (MaskTy->isVectorTy()) {
+ unsigned NumElements = MaskTy->getVectorNumElements();
+ if (NumElements == VT.getVectorNumElements()) {
+ for (unsigned i = 0; i < NumElements; ++i) {
+ Constant *COp = C->getAggregateElement(i);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) {
+ ShuffleMask.clear();
+ return;
+ }
+ if (isa<UndefValue>(COp))
+ ShuffleMask.push_back(SM_SentinelUndef);
+ else {
+ uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+ Element &= (1 << NumElements) - 1;
+ ShuffleMask.push_back(Element);
+ }
+ }
+ }
+ return;
+ }
+ // Scalar value; just broadcast it
+ if (!isa<ConstantInt>(C))
+ return;
+ uint64_t Element = cast<ConstantInt>(C)->getZExtValue();
+ int NumElements = VT.getVectorNumElements();
+ Element &= (1 << NumElements) - 1;
+ for (int i = 0; i < NumElements; ++i)
+ ShuffleMask.push_back(Element);
+}
+
+void DecodeVPERMV3Mask(const Constant *C, MVT VT,
+ SmallVectorImpl<int> &ShuffleMask) {
+ Type *MaskTy = C->getType();
+ unsigned NumElements = MaskTy->getVectorNumElements();
+ if (NumElements == VT.getVectorNumElements()) {
+ for (unsigned i = 0; i < NumElements; ++i) {
+ Constant *COp = C->getAggregateElement(i);
+ if (!COp) {
+ ShuffleMask.clear();
+ return;
+ }
+ if (isa<UndefValue>(COp))
+ ShuffleMask.push_back(SM_SentinelUndef);
+ else {
+ uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+ Element &= (1 << NumElements*2) - 1;
+ ShuffleMask.push_back(Element);
+ }
+ }
+ }
+}
+} // llvm namespace
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
new file mode 100644
index 000000000000..bcf46322c8cd
--- /dev/null
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
@@ -0,0 +1,45 @@
+//===-- X86ShuffleDecodeConstantPool.h - X86 shuffle decode -----*-C++-*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics using
+// constants from the constant pool.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H
+#define LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H
+
+#include "llvm/ADT/SmallVector.h"
+
+//===----------------------------------------------------------------------===//
+// Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+class Constant;
+class MVT;
+
+/// \brief Decode a PSHUFB mask from an IR-level vector constant.
+void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a VPERMILP variable mask from an IR-level vector constant.
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant.
+void DecodeVPERMVMask(const Constant *C, MVT VT,
+ SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant.
+void DecodeVPERMV3Mask(const Constant *C, MVT VT,
+ SmallVectorImpl<int> &ShuffleMask);
+
+} // llvm namespace
+
+#endif
diff --git a/lib/Transforms/IPO/InferFunctionAttrs.cpp b/lib/Transforms/IPO/InferFunctionAttrs.cpp
index d02c861a2948..4295a7595c29 100644
--- a/lib/Transforms/IPO/InferFunctionAttrs.cpp
+++ b/lib/Transforms/IPO/InferFunctionAttrs.cpp
@@ -10,6 +10,7 @@
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
@@ -21,10 +22,12 @@ using namespace llvm;
STATISTIC(NumReadNone, "Number of functions inferred as readnone");
STATISTIC(NumReadOnly, "Number of functions inferred as readonly");
+STATISTIC(NumArgMemOnly, "Number of functions inferred as argmemonly");
STATISTIC(NumNoUnwind, "Number of functions inferred as nounwind");
STATISTIC(NumNoCapture, "Number of arguments inferred as nocapture");
STATISTIC(NumReadOnlyArg, "Number of arguments inferred as readonly");
STATISTIC(NumNoAlias, "Number of function returns inferred as noalias");
+STATISTIC(NumNonNull, "Number of function returns inferred as nonnull returns");
static bool setDoesNotAccessMemory(Function &F) {
if (F.doesNotAccessMemory())
@@ -42,6 +45,15 @@ static bool setOnlyReadsMemory(Function &F) {
return true;
}
+static bool setOnlyAccessesArgMemory(Function &F) {
+ if (F.onlyAccessesArgMemory())
+ return false;
+ F.setOnlyAccessesArgMemory ();
+ ++NumArgMemOnly;
+ return true;
+}
+
+
static bool setDoesNotThrow(Function &F) {
if (F.doesNotThrow())
return false;
@@ -74,6 +86,17 @@ static bool setDoesNotAlias(Function &F, unsigned n) {
return true;
}
+static bool setNonNull(Function &F, unsigned n) {
+ assert((n != AttributeSet::ReturnIndex ||
+ F.getReturnType()->isPointerTy()) &&
+ "nonnull applies only to pointers");
+ if (F.getAttributes().hasAttribute(n, Attribute::NonNull))
+ return false;
+ F.addAttribute(n, Attribute::NonNull);
+ ++NumNonNull;
+ return true;
+}
+
/// Analyze the name and prototype of the given function and set any applicable
/// attributes.
///
@@ -89,7 +112,6 @@ static bool inferPrototypeAttributes(Function &F,
return false;
bool Changed = false;
-
switch (TheLibFunc) {
case LibFunc::strlen:
if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
@@ -873,6 +895,35 @@ static bool inferPrototypeAttributes(Function &F,
Changed |= setDoesNotCapture(F, 2);
return Changed;
+ case LibFunc::Znwj: // new(unsigned int)
+ case LibFunc::Znwm: // new(unsigned long)
+ case LibFunc::Znaj: // new[](unsigned int)
+ case LibFunc::Znam: // new[](unsigned long)
+ case LibFunc::msvc_new_int: // new(unsigned int)
+ case LibFunc::msvc_new_longlong: // new(unsigned long long)
+ case LibFunc::msvc_new_array_int: // new[](unsigned int)
+ case LibFunc::msvc_new_array_longlong: // new[](unsigned long long)
+ if (FTy->getNumParams() != 1)
+ return false;
+ // Operator new always returns a nonnull noalias pointer
+ Changed |= setNonNull(F, AttributeSet::ReturnIndex);
+ Changed |= setDoesNotAlias(F, AttributeSet::ReturnIndex);
+ return Changed;
+
+ //TODO: add LibFunc entries for:
+ //case LibFunc::memset_pattern4:
+ //case LibFunc::memset_pattern8:
+ case LibFunc::memset_pattern16:
+ if (FTy->isVarArg() || FTy->getNumParams() != 3 ||
+ !isa<PointerType>(FTy->getParamType(0)) ||
+ !isa<PointerType>(FTy->getParamType(1)) ||
+ !isa<IntegerType>(FTy->getParamType(2)))
+ return false;
+
+ Changed |= setOnlyAccessesArgMemory(F);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+
default:
// FIXME: It'd be really nice to cover all the library functions we're
// aware of here.
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index e3634f269cf5..090245d1b22c 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1747,8 +1747,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Translate facts known about a pointer before relocating into
// facts about the relocate value, while being careful to
// preserve relocation semantics.
- GCRelocateOperands Operands(II);
- Value *DerivedPtr = Operands.getDerivedPtr();
+ Value *DerivedPtr = cast<GCRelocateInst>(II)->getDerivedPtr();
auto *GCRelocateType = cast<PointerType>(II->getType());
// Remove the relocation if unused, note that this check is required
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index da835a192322..0f01d183b1ad 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -591,19 +591,19 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
// zext (x <s 0) to i32 --> x>>u31 true if signbit set.
// zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear.
if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV == 0) ||
- (ICI->getPredicate() == ICmpInst::ICMP_SGT &&Op1CV.isAllOnesValue())) {
+ (ICI->getPredicate() == ICmpInst::ICMP_SGT && Op1CV.isAllOnesValue())) {
if (!DoXform) return ICI;
Value *In = ICI->getOperand(0);
Value *Sh = ConstantInt::get(In->getType(),
- In->getType()->getScalarSizeInBits()-1);
- In = Builder->CreateLShr(In, Sh, In->getName()+".lobit");
+ In->getType()->getScalarSizeInBits() - 1);
+ In = Builder->CreateLShr(In, Sh, In->getName() + ".lobit");
if (In->getType() != CI.getType())
In = Builder->CreateIntCast(In, CI.getType(), false/*ZExt*/);
if (ICI->getPredicate() == ICmpInst::ICMP_SGT) {
Constant *One = ConstantInt::get(In->getType(), 1);
- In = Builder->CreateXor(In, One, In->getName()+".not");
+ In = Builder->CreateXor(In, One, In->getName() + ".not");
}
return ReplaceInstUsesWith(CI, In);
@@ -639,13 +639,13 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
return ReplaceInstUsesWith(CI, Res);
}
- uint32_t ShiftAmt = KnownZeroMask.logBase2();
+ uint32_t ShAmt = KnownZeroMask.logBase2();
Value *In = ICI->getOperand(0);
- if (ShiftAmt) {
+ if (ShAmt) {
// Perform a logical shr by shiftamt.
// Insert the shift to put the result in the low bit.
- In = Builder->CreateLShr(In, ConstantInt::get(In->getType(),ShiftAmt),
- In->getName()+".lobit");
+ In = Builder->CreateLShr(In, ConstantInt::get(In->getType(), ShAmt),
+ In->getName() + ".lobit");
}
if ((Op1CV != 0) == isNE) { // Toggle the low bit.
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index 534f67008150..e4e506509d39 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -414,7 +414,7 @@ public:
/// \brief A combiner-aware RAUW-like routine.
///
/// This method is to be used when an instruction is found to be dead,
- /// replacable with another preexisting expression. Here we add all uses of
+ /// replaceable with another preexisting expression. Here we add all uses of
/// I to the worklist, replace all uses of I with the new value, then return
/// I, so that the inst combiner will know that I was modified.
Instruction *ReplaceInstUsesWith(Instruction &I, Value *V) {
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index e25639ae943b..54a9fbdbe82e 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -383,15 +383,28 @@ static void replaceExtractElements(InsertElementInst *InsElt,
auto *WideVec = new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType),
ConstantVector::get(ExtendMask));
- // Replace all extracts from the original narrow vector with extracts from
- // the new wide vector.
- WideVec->insertBefore(ExtElt);
+ // Insert the new shuffle after the vector operand of the extract is defined
+ // or at the start of the basic block, so any subsequent extracts can use it.
+ bool ReplaceAllExtUsers;
+ if (auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp)) {
+ WideVec->insertAfter(ExtVecOpInst);
+ ReplaceAllExtUsers = true;
+ } else {
+ // TODO: Insert at start of function, so it's always safe to replace all?
+ IC.InsertNewInstWith(WideVec, *ExtElt->getParent()->getFirstInsertionPt());
+ ReplaceAllExtUsers = false;
+ }
+
+ // Replace extracts from the original narrow vector with extracts from the new
+ // wide vector.
for (User *U : ExtVecOp->users()) {
- if (ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U)) {
- auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1));
- NewExt->insertAfter(WideVec);
- IC.ReplaceInstUsesWith(*OldExt, NewExt);
- }
+ ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U);
+ if (!OldExt ||
+ (!ReplaceAllExtUsers && OldExt->getParent() != WideVec->getParent()))
+ continue;
+ auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1));
+ NewExt->insertAfter(WideVec);
+ IC.ReplaceInstUsesWith(*OldExt, NewExt);
}
}
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 7c46cfd28fc9..903a0b5f5400 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -3021,7 +3021,7 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
Instruction *Inst = &*--EndInst->getIterator();
if (!Inst->use_empty() && !Inst->getType()->isTokenTy())
Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
- if (Inst->isEHPad()) {
+ if (Inst->isEHPad() || Inst->getType()->isTokenTy()) {
EndInst = Inst;
continue;
}
@@ -3029,8 +3029,7 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
++NumDeadInst;
MadeIRChange = true;
}
- if (!Inst->getType()->isTokenTy())
- Inst->eraseFromParent();
+ Inst->eraseFromParent();
}
}
diff --git a/lib/Transforms/Instrumentation/InstrProfiling.cpp b/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 92e41ee27c09..51ff95d9a74c 100644
--- a/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -234,16 +234,14 @@ void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) {
}
void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageData) {
- CoverageData->setSection(getCoverageSection());
- CoverageData->setAlignment(8);
Constant *Init = CoverageData->getInitializer();
- // We're expecting { i32, i32, i32, i32, [n x { i8*, i32, i32 }], [m x i8] }
+ // We're expecting { [4 x i32], [n x { i8*, i32, i32 }], [m x i8] }
// for some C. If not, the frontend's given us something broken.
- assert(Init->getNumOperands() == 6 && "bad number of fields in coverage map");
- assert(isa<ConstantArray>(Init->getAggregateElement(4)) &&
+ assert(Init->getNumOperands() == 3 && "bad number of fields in coverage map");
+ assert(isa<ConstantArray>(Init->getAggregateElement(1)) &&
"invalid function list in coverage map");
- ConstantArray *Records = cast<ConstantArray>(Init->getAggregateElement(4));
+ ConstantArray *Records = cast<ConstantArray>(Init->getAggregateElement(1));
for (unsigned I = 0, E = Records->getNumOperands(); I < E; ++I) {
Constant *Record = Records->getOperand(I);
Value *V = const_cast<Value *>(Record->getOperand(0))->stripPointerCasts();
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 6d70cdc3ade2..e01e23f71732 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -75,10 +75,12 @@ DisablePromotion("disable-licm-promotion", cl::Hidden,
cl::desc("Disable memory promotion in LICM pass"));
static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
-static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop);
+static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
+ const LICMSafetyInfo *SafetyInfo);
static bool hoist(Instruction &I, BasicBlock *Preheader);
static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
- const Loop *CurLoop, AliasSetTracker *CurAST );
+ const Loop *CurLoop, AliasSetTracker *CurAST,
+ const LICMSafetyInfo *SafetyInfo);
static bool isGuaranteedToExecute(const Instruction &Inst,
const DominatorTree *DT,
const Loop *CurLoop,
@@ -92,10 +94,10 @@ static bool isSafeToExecuteUnconditionally(const Instruction &Inst,
static bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
const AAMDNodes &AAInfo,
AliasSetTracker *CurAST);
-static Instruction *CloneInstructionInExitBlock(const Instruction &I,
- BasicBlock &ExitBlock,
- PHINode &PN,
- const LoopInfo *LI);
+static Instruction *
+CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
+ const LoopInfo *LI,
+ const LICMSafetyInfo *SafetyInfo);
static bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA,
DominatorTree *DT, TargetLibraryInfo *TLI,
Loop *CurLoop, AliasSetTracker *CurAST,
@@ -348,10 +350,10 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
// outside of the loop. In this case, it doesn't even matter if the
// operands of the instruction are loop invariant.
//
- if (isNotUsedInLoop(I, CurLoop) &&
+ if (isNotUsedInLoop(I, CurLoop, SafetyInfo) &&
canSinkOrHoistInst(I, AA, DT, TLI, CurLoop, CurAST, SafetyInfo)) {
++II;
- Changed |= sink(I, LI, DT, CurLoop, CurAST);
+ Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo);
}
}
return Changed;
@@ -432,6 +434,14 @@ void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) {
for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
(I != E) && !SafetyInfo->MayThrow; ++I)
SafetyInfo->MayThrow |= I->mayThrow();
+
+ // Compute funclet colors if we might sink/hoist in a function with a funclet
+ // personality routine.
+ Function *Fn = CurLoop->getHeader()->getParent();
+ if (Fn->hasPersonalityFn())
+ if (Constant *PersonalityFn = Fn->getPersonalityFn())
+ if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
+ SafetyInfo->BlockColors = colorEHFunclets(*Fn);
}
/// canSinkOrHoistInst - Return true if the hoister and sinker can handle this
@@ -466,6 +476,10 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT,
if (isa<DbgInfoIntrinsic>(I))
return false;
+ // Don't sink calls which can throw.
+ if (CI->mayThrow())
+ return false;
+
// Handle simple cases by querying alias analysis.
FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI);
if (Behavior == FMRB_DoesNotAccessMemory)
@@ -534,10 +548,24 @@ static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) {
/// the loop. If this is true, we can sink the instruction to the exit
/// blocks of the loop.
///
-static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop) {
+static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
+ const LICMSafetyInfo *SafetyInfo) {
+ const auto &BlockColors = SafetyInfo->BlockColors;
for (const User *U : I.users()) {
const Instruction *UI = cast<Instruction>(U);
if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
+ const BasicBlock *BB = PN->getParent();
+ // We cannot sink uses in catchswitches.
+ if (isa<CatchSwitchInst>(BB->getTerminator()))
+ return false;
+
+ // We need to sink a callsite to a unique funclet. Avoid sinking if the
+ // phi use is too muddled.
+ if (isa<CallInst>(I))
+ if (!BlockColors.empty() &&
+ BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1)
+ return false;
+
// A PHI node where all of the incoming values are this instruction are
// special -- they can just be RAUW'ed with the instruction and thus
// don't require a use in the predecessor. This is a particular important
@@ -565,11 +593,41 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop) {
return true;
}
-static Instruction *CloneInstructionInExitBlock(const Instruction &I,
- BasicBlock &ExitBlock,
- PHINode &PN,
- const LoopInfo *LI) {
- Instruction *New = I.clone();
+static Instruction *
+CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
+ const LoopInfo *LI,
+ const LICMSafetyInfo *SafetyInfo) {
+ Instruction *New;
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ const auto &BlockColors = SafetyInfo->BlockColors;
+
+ // Sinking call-sites need to be handled differently from other
+ // instructions. The cloned call-site needs a funclet bundle operand
+ // appropriate for its location in the CFG.
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ for (unsigned BundleIdx = 0, BundleEnd = CI->getNumOperandBundles();
+ BundleIdx != BundleEnd; ++BundleIdx) {
+ OperandBundleUse Bundle = CI->getOperandBundleAt(BundleIdx);
+ if (Bundle.getTagID() == LLVMContext::OB_funclet)
+ continue;
+
+ OpBundles.emplace_back(Bundle);
+ }
+
+ if (!BlockColors.empty()) {
+ const ColorVector &CV = BlockColors.find(&ExitBlock)->second;
+ assert(CV.size() == 1 && "non-unique color for exit block!");
+ BasicBlock *BBColor = CV.front();
+ Instruction *EHPad = BBColor->getFirstNonPHI();
+ if (EHPad->isEHPad())
+ OpBundles.emplace_back("funclet", EHPad);
+ }
+
+ New = CallInst::Create(CI, OpBundles);
+ } else {
+ New = I.clone();
+ }
+
ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New);
if (!I.getName().empty()) New->setName(I.getName() + ".le");
@@ -601,7 +659,8 @@ static Instruction *CloneInstructionInExitBlock(const Instruction &I,
/// position, and may either delete it or move it to outside of the loop.
///
static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
- const Loop *CurLoop, AliasSetTracker *CurAST ) {
+ const Loop *CurLoop, AliasSetTracker *CurAST,
+ const LICMSafetyInfo *SafetyInfo) {
DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
bool Changed = false;
if (isa<LoadInst>(I)) ++NumMovedLoads;
@@ -652,7 +711,7 @@ static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
New = It->second;
else
New = SunkCopies[ExitBlock] =
- CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI);
+ CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI, SafetyInfo);
PN->replaceAllUsesWith(New);
PN->eraseFromParent();
@@ -950,6 +1009,21 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
if (!GuaranteedToExecute)
return Changed;
+ // Figure out the loop exits and their insertion points, if this is the
+ // first promotion.
+ if (ExitBlocks.empty()) {
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+ InsertPts.clear();
+ InsertPts.reserve(ExitBlocks.size());
+ for (BasicBlock *ExitBlock : ExitBlocks)
+ InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
+ }
+
+ // Can't insert into a catchswitch.
+ for (BasicBlock *ExitBlock : ExitBlocks)
+ if (isa<CatchSwitchInst>(ExitBlock->getTerminator()))
+ return Changed;
+
// Otherwise, this is safe to promote, lets do it!
DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " <<*SomePtr<<'\n');
Changed = true;
@@ -961,15 +1035,6 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
// location is better than none.
DebugLoc DL = LoopUses[0]->getDebugLoc();
- // Figure out the loop exits and their insertion points, if this is the
- // first promotion.
- if (ExitBlocks.empty()) {
- CurLoop->getUniqueExitBlocks(ExitBlocks);
- InsertPts.resize(ExitBlocks.size());
- for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
- InsertPts[i] = &*ExitBlocks[i]->getFirstInsertionPt();
- }
-
// We use the SSAUpdater interface to insert phi nodes as required.
SmallVector<PHINode*, 16> NewPHIs;
SSAUpdater SSA(&NewPHIs);
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 2d577de7c2b8..4521640e3947 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -108,7 +108,11 @@ public:
private:
typedef SmallVector<StoreInst *, 8> StoreList;
- StoreList StoreRefs;
+ StoreList StoreRefsForMemset;
+ StoreList StoreRefsForMemcpy;
+ bool HasMemset;
+ bool HasMemsetPattern;
+ bool HasMemcpy;
/// \name Countable Loop Idiom Handling
/// @{
@@ -118,17 +122,15 @@ private:
SmallVectorImpl<BasicBlock *> &ExitBlocks);
void collectStores(BasicBlock *BB);
- bool isLegalStore(StoreInst *SI);
+ bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemcpy);
bool processLoopStore(StoreInst *SI, const SCEV *BECount);
bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
- unsigned StoreAlignment, Value *SplatValue,
+ unsigned StoreAlignment, Value *StoredVal,
Instruction *TheStore, const SCEVAddRecExpr *Ev,
const SCEV *BECount, bool NegStride);
- bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
- const SCEVAddRecExpr *StoreEv,
- const SCEV *BECount, bool NegStride);
+ bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
/// @}
/// \name Noncountable Loop Idiom Handling
@@ -207,8 +209,13 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
*CurLoop->getHeader()->getParent());
DL = &CurLoop->getHeader()->getModule()->getDataLayout();
- if (SE->hasLoopInvariantBackedgeTakenCount(L))
- return runOnCountableLoop();
+ HasMemset = TLI->has(LibFunc::memset);
+ HasMemsetPattern = TLI->has(LibFunc::memset_pattern16);
+ HasMemcpy = TLI->has(LibFunc::memcpy);
+
+ if (HasMemset || HasMemsetPattern || HasMemcpy)
+ if (SE->hasLoopInvariantBackedgeTakenCount(L))
+ return runOnCountableLoop();
return runOnNoncountableLoop();
}
@@ -297,7 +304,8 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}
-bool LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
+bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
+ bool &ForMemcpy) {
// Don't touch volatile stores.
if (!SI->isSimple())
return false;
@@ -322,22 +330,86 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
return false;
- return true;
+ // See if the store can be turned into a memset.
+
+ // If the stored value is a byte-wise value (like i32 -1), then it may be
+ // turned into a memset of i8 -1, assuming that all the consecutive bytes
+ // are stored. A store of i32 0x01020304 can never be turned into a memset,
+ // but it can be turned into memset_pattern if the target supports it.
+ Value *SplatValue = isBytewiseValue(StoredVal);
+ Constant *PatternValue = nullptr;
+
+ // If we're allowed to form a memset, and the stored value would be
+ // acceptable for memset, use it.
+ if (HasMemset && SplatValue &&
+ // Verify that the stored value is loop invariant. If not, we can't
+ // promote the memset.
+ CurLoop->isLoopInvariant(SplatValue)) {
+ // It looks like we can use SplatValue.
+ ForMemset = true;
+ return true;
+ } else if (HasMemsetPattern &&
+ // Don't create memset_pattern16s with address spaces.
+ StorePtr->getType()->getPointerAddressSpace() == 0 &&
+ (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
+ // It looks like we can use PatternValue!
+ ForMemset = true;
+ return true;
+ }
+
+ // Otherwise, see if the store can be turned into a memcpy.
+ if (HasMemcpy) {
+ // Check to see if the stride matches the size of the store. If so, then we
+ // know that every byte is touched in the loop.
+ unsigned Stride = getStoreStride(StoreEv);
+ unsigned StoreSize = getStoreSizeInBytes(SI, DL);
+ if (StoreSize != Stride && StoreSize != -Stride)
+ return false;
+
+ // The store must be feeding a non-volatile load.
+ LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
+ if (!LI || !LI->isSimple())
+ return false;
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided load. If we have something else, it's a
+ // random load we can't handle.
+ const SCEVAddRecExpr *LoadEv =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+ if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
+ return false;
+
+ // The store and load must share the same stride.
+ if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
+ return false;
+
+ // Success. This store can be converted into a memcpy.
+ ForMemcpy = true;
+ return true;
+ }
+ // This store can't be transformed into a memset/memcpy.
+ return false;
}
void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
- StoreRefs.clear();
+ StoreRefsForMemset.clear();
+ StoreRefsForMemcpy.clear();
for (Instruction &I : *BB) {
StoreInst *SI = dyn_cast<StoreInst>(&I);
if (!SI)
continue;
+ bool ForMemset = false;
+ bool ForMemcpy = false;
// Make sure this is a strided store with a constant stride.
- if (!isLegalStore(SI))
+ if (!isLegalStore(SI, ForMemset, ForMemcpy))
continue;
// Save the store locations.
- StoreRefs.push_back(SI);
+ if (ForMemset)
+ StoreRefsForMemset.push_back(SI);
+ else if (ForMemcpy)
+ StoreRefsForMemcpy.push_back(SI);
}
}
@@ -357,9 +429,15 @@ bool LoopIdiomRecognize::runOnLoopBlock(
bool MadeChange = false;
// Look for store instructions, which may be optimized to memset/memcpy.
collectStores(BB);
- for (auto &SI : StoreRefs)
+
+ // Look for a single store which can be optimized into a memset.
+ for (auto &SI : StoreRefsForMemset)
MadeChange |= processLoopStore(SI, BECount);
+ // Optimize the store into a memcpy, if it feeds a similarly strided load.
+ for (auto &SI : StoreRefsForMemcpy)
+ MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
+
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
Instruction *Inst = &*I++;
// Look for memset instructions, which may be optimized to a larger memset.
@@ -380,7 +458,7 @@ bool LoopIdiomRecognize::runOnLoopBlock(
return MadeChange;
}
-/// processLoopStore - See if this store can be promoted to a memset or memcpy.
+/// processLoopStore - See if this store can be promoted to a memset.
bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) {
assert(SI->isSimple() && "Expected only non-volatile stores.");
@@ -398,12 +476,8 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) {
bool NegStride = StoreSize == -Stride;
// See if we can optimize just this store in isolation.
- if (processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(),
- StoredVal, SI, StoreEv, BECount, NegStride))
- return true;
-
- // Optimize the store into a memcpy, if it feeds an similarly strided load.
- return processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, BECount, NegStride);
+ return processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(),
+ StoredVal, SI, StoreEv, BECount, NegStride);
}
/// processLoopMemSet - See if this memset can be promoted to a large memset.
@@ -440,8 +514,14 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
if (!Stride || MSI->getLength() != Stride->getValue())
return false;
+ // Verify that the memset value is loop invariant. If not, we can't promote
+ // the memset.
+ Value *SplatValue = MSI->getValue();
+ if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
+ return false;
+
return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
- MSI->getAlignment(), MSI->getValue(), MSI, Ev,
+ MSI->getAlignment(), SplatValue, MSI, Ev,
BECount, /*NegStride=*/false);
}
@@ -496,37 +576,19 @@ bool LoopIdiomRecognize::processLoopStridedStore(
Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment,
Value *StoredVal, Instruction *TheStore, const SCEVAddRecExpr *Ev,
const SCEV *BECount, bool NegStride) {
-
- // If the stored value is a byte-wise value (like i32 -1), then it may be
- // turned into a memset of i8 -1, assuming that all the consecutive bytes
- // are stored. A store of i32 0x01020304 can never be turned into a memset,
- // but it can be turned into memset_pattern if the target supports it.
Value *SplatValue = isBytewiseValue(StoredVal);
Constant *PatternValue = nullptr;
- unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
- // If we're allowed to form a memset, and the stored value would be acceptable
- // for memset, use it.
- if (SplatValue && TLI->has(LibFunc::memset) &&
- // Verify that the stored value is loop invariant. If not, we can't
- // promote the memset.
- CurLoop->isLoopInvariant(SplatValue)) {
- // Keep and use SplatValue.
- PatternValue = nullptr;
- } else if (DestAS == 0 && TLI->has(LibFunc::memset_pattern16) &&
- (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
- // Don't create memset_pattern16s with address spaces.
- // It looks like we can use PatternValue!
- SplatValue = nullptr;
- } else {
- // Otherwise, this isn't an idiom we can transform. For example, we can't
- // do anything with a 3-byte store.
- return false;
- }
+ if (!SplatValue)
+ PatternValue = getMemSetPatternValue(StoredVal, DL);
+
+ assert((SplatValue || PatternValue) &&
+ "Expected either splat value or pattern value.");
// The trip count of the loop and the base pointer of the addrec SCEV is
// guaranteed to be loop invariant, which means that it should dominate the
// header. This allows us to insert code for it in the preheader.
+ unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
BasicBlock *Preheader = CurLoop->getLoopPreheader();
IRBuilder<> Builder(Preheader->getTerminator());
SCEVExpander Expander(*SE, *DL, "loop-idiom");
@@ -608,29 +670,25 @@ bool LoopIdiomRecognize::processLoopStridedStore(
/// If the stored value is a strided load in the same loop with the same stride
/// this may be transformable into a memcpy. This kicks in for stuff like
/// for (i) A[i] = B[i];
-bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
- StoreInst *SI, unsigned StoreSize, const SCEVAddRecExpr *StoreEv,
- const SCEV *BECount, bool NegStride) {
- // If we're not allowed to form memcpy, we fail.
- if (!TLI->has(LibFunc::memcpy))
- return false;
+bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
+ const SCEV *BECount) {
+ assert(SI->isSimple() && "Expected only non-volatile stores.");
+
+ Value *StorePtr = SI->getPointerOperand();
+ const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ unsigned Stride = getStoreStride(StoreEv);
+ unsigned StoreSize = getStoreSizeInBytes(SI, DL);
+ bool NegStride = StoreSize == -Stride;
// The store must be feeding a non-volatile load.
- LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
- if (!LI || !LI->isSimple())
- return false;
+ LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
+ assert(LI->isSimple() && "Expected only non-volatile stores.");
// See if the pointer expression is an AddRec like {base,+,1} on the current
// loop, which indicates a strided load. If we have something else, it's a
// random load we can't handle.
const SCEVAddRecExpr *LoadEv =
- dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
- if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
- return false;
-
- // The store and load must share the same stride.
- if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
- return false;
+ cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
// The trip count of the loop and the base pointer of the addrec SCEV is
// guaranteed to be loop invariant, which means that it should dominate the
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 0333bf2284e1..7354016c2122 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -481,6 +481,17 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
return AMemSet;
}
+static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
+ const LoadInst *LI) {
+ unsigned StoreAlign = SI->getAlignment();
+ if (!StoreAlign)
+ StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
+ unsigned LoadAlign = LI->getAlignment();
+ if (!LoadAlign)
+ LoadAlign = DL.getABITypeAlignment(LI->getType());
+
+ return std::min(StoreAlign, LoadAlign);
+}
bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
if (!SI->isSimple()) return false;
@@ -496,12 +507,84 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
const DataLayout &DL = SI->getModule()->getDataLayout();
- // Detect cases where we're performing call slot forwarding, but
- // happen to be using a load-store pair to implement it, rather than
- // a memcpy.
+ // Load to store forwarding can be interpreted as memcpy.
if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) {
if (LI->isSimple() && LI->hasOneUse() &&
LI->getParent() == SI->getParent()) {
+
+ auto *T = LI->getType();
+ if (T->isAggregateType()) {
+ AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ MemoryLocation LoadLoc = MemoryLocation::get(LI);
+
+ // We use alias analysis to check if an instruction may store to
+ // the memory we load from in between the load and the store. If
+ // such an instruction is found, we try to promote there instead
+ // of at the store position.
+ Instruction *P = SI;
+ for (BasicBlock::iterator I = ++LI->getIterator(), E = SI->getIterator();
+ I != E; ++I) {
+ if (!(AA.getModRefInfo(&*I, LoadLoc) & MRI_Mod))
+ continue;
+
+ // We found an instruction that may write to the loaded memory.
+ // We can try to promote at this position instead of the store
+ // position if nothing alias the store memory after this.
+ P = &*I;
+ for (; I != E; ++I) {
+ MemoryLocation StoreLoc = MemoryLocation::get(SI);
+ if (AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) {
+ DEBUG(dbgs() << "Alias " << *I << "\n");
+ P = nullptr;
+ break;
+ }
+ }
+
+ break;
+ }
+
+ // If a valid insertion position is found, then we can promote
+ // the load/store pair to a memcpy.
+ if (P) {
+ // If we load from memory that may alias the memory we store to,
+ // memmove must be used to preserve semantic. If not, memcpy can
+ // be used.
+ bool UseMemMove = false;
+ if (!AA.isNoAlias(MemoryLocation::get(SI), LoadLoc))
+ UseMemMove = true;
+
+ unsigned Align = findCommonAlignment(DL, SI, LI);
+ uint64_t Size = DL.getTypeStoreSize(T);
+
+ IRBuilder<> Builder(P);
+ Instruction *M;
+ if (UseMemMove)
+ M = Builder.CreateMemMove(SI->getPointerOperand(),
+ LI->getPointerOperand(), Size,
+ Align, SI->isVolatile());
+ else
+ M = Builder.CreateMemCpy(SI->getPointerOperand(),
+ LI->getPointerOperand(), Size,
+ Align, SI->isVolatile());
+
+ DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI
+ << " => " << *M << "\n");
+
+ MD->removeInstruction(SI);
+ SI->eraseFromParent();
+ MD->removeInstruction(LI);
+ LI->eraseFromParent();
+ ++NumMemCpyInstr;
+
+ // Make sure we do not invalidate the iterator.
+ BBI = M->getIterator();
+ return true;
+ }
+ }
+
+ // Detect cases where we're performing call slot forwarding, but
+ // happen to be using a load-store pair to implement it, rather than
+ // a memcpy.
MemDepResult ldep = MD->getDependency(LI);
CallInst *C = nullptr;
if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
@@ -522,18 +605,11 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
}
if (C) {
- unsigned storeAlign = SI->getAlignment();
- if (!storeAlign)
- storeAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
- unsigned loadAlign = LI->getAlignment();
- if (!loadAlign)
- loadAlign = DL.getABITypeAlignment(LI->getType());
-
bool changed = performCallSlotOptzn(
LI, SI->getPointerOperand()->stripPointerCasts(),
LI->getPointerOperand()->stripPointerCasts(),
DL.getTypeStoreSize(SI->getOperand(0)->getType()),
- std::min(storeAlign, loadAlign), C);
+ findCommonAlignment(DL, SI, LI), C);
if (changed) {
MD->removeInstruction(SI);
SI->eraseFromParent();
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index fb970c747ce1..401a740856e9 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -183,6 +183,8 @@ namespace {
Value *OptimizeMul(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops);
Value *RemoveFactorFromExpression(Value *V, Value *Factor);
void EraseInst(Instruction *I);
+ void RecursivelyEraseDeadInsts(Instruction *I,
+ SetVector<AssertingVH<Instruction>> &Insts);
void OptimizeInst(Instruction *I);
Instruction *canonicalizeNegConstExpr(Instruction *I);
};
@@ -1926,6 +1928,22 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I,
return nullptr;
}
+// Remove dead instructions and if any operands are trivially dead add them to
+// Insts so they will be removed as well.
+void Reassociate::RecursivelyEraseDeadInsts(
+ Instruction *I, SetVector<AssertingVH<Instruction>> &Insts) {
+ assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
+ SmallVector<Value *, 4> Ops(I->op_begin(), I->op_end());
+ ValueRankMap.erase(I);
+ Insts.remove(I);
+ RedoInsts.remove(I);
+ I->eraseFromParent();
+ for (auto Op : Ops)
+ if (Instruction *OpInst = dyn_cast<Instruction>(Op))
+ if (OpInst->use_empty())
+ Insts.insert(OpInst);
+}
+
/// Zap the given instruction, adding interesting operands to the work list.
void Reassociate::EraseInst(Instruction *I) {
assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
@@ -2255,7 +2273,21 @@ bool Reassociate::runOnFunction(Function &F) {
++II;
}
- // If this produced extra instructions to optimize, handle them now.
+ // Make a copy of all the instructions to be redone so we can remove dead
+ // instructions.
+ SetVector<AssertingVH<Instruction>> ToRedo(RedoInsts);
+ // Iterate over all instructions to be reevaluated and remove trivially dead
+ // instructions. If any operand of the trivially dead instruction becomes
+ // dead mark it for deletion as well. Continue this process until all
+ // trivially dead instructions have been removed.
+ while (!ToRedo.empty()) {
+ Instruction *I = ToRedo.pop_back_val();
+ if (isInstructionTriviallyDead(I))
+ RecursivelyEraseDeadInsts(I, ToRedo);
+ }
+
+ // Now that we have removed dead instructions, we can reoptimize the
+ // remaining instructions.
while (!RedoInsts.empty()) {
Instruction *I = RedoInsts.pop_back_val();
if (isInstructionTriviallyDead(I))
diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index db127c3f7b4e..5d253be1aa86 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -428,30 +428,15 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) {
// We should have never reached here if this argument isn't an gc value
return BaseDefiningValueResult(I, true);
- if (isa<GlobalVariable>(I))
- // base case
+ if (isa<Constant>(I))
+ // We assume that objects with a constant base (e.g. a global) can't move
+ // and don't need to be reported to the collector because they are always
+ // live. All constants have constant bases. Besides global references, all
+ // kinds of constants (e.g. undef, constant expressions, null pointers) can
+ // be introduced by the inliner or the optimizer, especially on dynamically
+ // dead paths. See e.g. test4 in constants.ll.
return BaseDefiningValueResult(I, true);
- // inlining could possibly introduce phi node that contains
- // undef if callee has multiple returns
- if (isa<UndefValue>(I))
- // utterly meaningless, but useful for dealing with
- // partially optimized code.
- return BaseDefiningValueResult(I, true);
-
- // Due to inheritance, this must be _after_ the global variable and undef
- // checks
- if (isa<Constant>(I)) {
- assert(!isa<GlobalVariable>(I) && !isa<UndefValue>(I) &&
- "order of checks wrong!");
- // Note: Even for frontends which don't have constant references, we can
- // see constants appearing after optimizations. A simple example is
- // specialization of an address computation on null feeding into a merge
- // point where the actual use of the now-constant input is protected by
- // another null check. (e.g. test4 in constants.ll)
- return BaseDefiningValueResult(I, true);
- }
-
if (CastInst *CI = dyn_cast<CastInst>(I)) {
Value *Def = CI->stripPointerCasts();
// If stripping pointer casts changes the address space there is an
@@ -1642,33 +1627,24 @@ insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,
DenseSet<Value *> &VisitedLiveValues) {
for (User *U : GCRelocs) {
- if (!isa<IntrinsicInst>(U))
+ GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U);
+ if (!Relocate)
continue;
- IntrinsicInst *RelocatedValue = cast<IntrinsicInst>(U);
-
- // We only care about relocates
- if (RelocatedValue->getIntrinsicID() !=
- Intrinsic::experimental_gc_relocate) {
- continue;
- }
-
- GCRelocateOperands RelocateOperands(RelocatedValue);
- Value *OriginalValue =
- const_cast<Value *>(RelocateOperands.getDerivedPtr());
+ Value *OriginalValue = const_cast<Value *>(Relocate->getDerivedPtr());
assert(AllocaMap.count(OriginalValue));
Value *Alloca = AllocaMap[OriginalValue];
// Emit store into the related alloca
// All gc_relocates are i8 addrspace(1)* typed, and it must be bitcasted to
// the correct type according to alloca.
- assert(RelocatedValue->getNextNode() &&
+ assert(Relocate->getNextNode() &&
"Should always have one since it's not a terminator");
- IRBuilder<> Builder(RelocatedValue->getNextNode());
+ IRBuilder<> Builder(Relocate->getNextNode());
Value *CastedRelocatedValue =
- Builder.CreateBitCast(RelocatedValue,
+ Builder.CreateBitCast(Relocate,
cast<AllocaInst>(Alloca)->getAllocatedType(),
- suffixed_name_or(RelocatedValue, ".casted", ""));
+ suffixed_name_or(Relocate, ".casted", ""));
StoreInst *Store = new StoreInst(CastedRelocatedValue, Alloca);
Store->insertAfter(cast<Instruction>(CastedRelocatedValue));
diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp
index 0914699a2e38..42287d3bb2e8 100644
--- a/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -74,17 +74,13 @@ namespace llvm {
// insertFastDiv - Substitutes the div/rem instruction with code that checks the
// value of the operands and uses a shorter-faster div/rem instruction when
// possible and the longer-slower div/rem instruction otherwise.
-static bool insertFastDiv(Function &F,
- Function::iterator &I,
- BasicBlock::iterator &J,
- IntegerType *BypassType,
- bool UseDivOp,
- bool UseSignedOp,
+static bool insertFastDiv(Instruction *I, IntegerType *BypassType,
+ bool UseDivOp, bool UseSignedOp,
DivCacheTy &PerBBDivCache) {
+ Function *F = I->getParent()->getParent();
// Get instruction operands
- Instruction *Instr = &*J;
- Value *Dividend = Instr->getOperand(0);
- Value *Divisor = Instr->getOperand(1);
+ Value *Dividend = I->getOperand(0);
+ Value *Divisor = I->getOperand(1);
if (isa<ConstantInt>(Divisor) ||
(isa<ConstantInt>(Dividend) && isa<ConstantInt>(Divisor))) {
@@ -94,13 +90,12 @@ static bool insertFastDiv(Function &F,
}
// Basic Block is split before divide
- BasicBlock *MainBB = &*I;
- BasicBlock *SuccessorBB = I->splitBasicBlock(J);
- ++I; //advance iterator I to successorBB
+ BasicBlock *MainBB = &*I->getParent();
+ BasicBlock *SuccessorBB = MainBB->splitBasicBlock(I);
// Add new basic block for slow divide operation
- BasicBlock *SlowBB = BasicBlock::Create(F.getContext(), "",
- MainBB->getParent(), SuccessorBB);
+ BasicBlock *SlowBB =
+ BasicBlock::Create(F->getContext(), "", MainBB->getParent(), SuccessorBB);
SlowBB->moveBefore(SuccessorBB);
IRBuilder<> SlowBuilder(SlowBB, SlowBB->begin());
Value *SlowQuotientV;
@@ -115,8 +110,8 @@ static bool insertFastDiv(Function &F,
SlowBuilder.CreateBr(SuccessorBB);
// Add new basic block for fast divide operation
- BasicBlock *FastBB = BasicBlock::Create(F.getContext(), "",
- MainBB->getParent(), SuccessorBB);
+ BasicBlock *FastBB =
+ BasicBlock::Create(F->getContext(), "", MainBB->getParent(), SuccessorBB);
FastBB->moveBefore(SlowBB);
IRBuilder<> FastBuilder(FastBB, FastBB->begin());
Value *ShortDivisorV = FastBuilder.CreateCast(Instruction::Trunc, Divisor,
@@ -139,19 +134,19 @@ static bool insertFastDiv(Function &F,
// Phi nodes for result of div and rem
IRBuilder<> SuccessorBuilder(SuccessorBB, SuccessorBB->begin());
- PHINode *QuoPhi = SuccessorBuilder.CreatePHI(Instr->getType(), 2);
+ PHINode *QuoPhi = SuccessorBuilder.CreatePHI(I->getType(), 2);
QuoPhi->addIncoming(SlowQuotientV, SlowBB);
QuoPhi->addIncoming(FastQuotientV, FastBB);
- PHINode *RemPhi = SuccessorBuilder.CreatePHI(Instr->getType(), 2);
+ PHINode *RemPhi = SuccessorBuilder.CreatePHI(I->getType(), 2);
RemPhi->addIncoming(SlowRemainderV, SlowBB);
RemPhi->addIncoming(FastRemainderV, FastBB);
- // Replace Instr with appropriate phi node
+ // Replace I with appropriate phi node
if (UseDivOp)
- Instr->replaceAllUsesWith(QuoPhi);
+ I->replaceAllUsesWith(QuoPhi);
else
- Instr->replaceAllUsesWith(RemPhi);
- Instr->eraseFromParent();
+ I->replaceAllUsesWith(RemPhi);
+ I->eraseFromParent();
// Combine operands into a single value with OR for value testing below
MainBB->getInstList().back().eraseFromParent();
@@ -168,9 +163,6 @@ static bool insertFastDiv(Function &F,
Value *CmpV = MainBuilder.CreateICmpEQ(AndV, ZeroV);
MainBuilder.CreateCondBr(CmpV, FastBB, SlowBB);
- // point iterator J at first instruction of successorBB
- J = I->begin();
-
// Cache phi nodes to be used later in place of other instances
// of div or rem with the same sign, dividend, and divisor
DivOpInfo Key(UseSignedOp, Dividend, Divisor);
@@ -179,57 +171,54 @@ static bool insertFastDiv(Function &F,
return true;
}
-// reuseOrInsertFastDiv - Reuses previously computed dividend or remainder if
-// operands and operation are identical. Otherwise call insertFastDiv to perform
-// the optimization and cache the resulting dividend and remainder.
-static bool reuseOrInsertFastDiv(Function &F,
- Function::iterator &I,
- BasicBlock::iterator &J,
- IntegerType *BypassType,
- bool UseDivOp,
- bool UseSignedOp,
+// reuseOrInsertFastDiv - Reuses previously computed dividend or remainder from
+// the current BB if operands and operation are identical. Otherwise calls
+// insertFastDiv to perform the optimization and caches the resulting dividend
+// and remainder.
+static bool reuseOrInsertFastDiv(Instruction *I, IntegerType *BypassType,
+ bool UseDivOp, bool UseSignedOp,
DivCacheTy &PerBBDivCache) {
// Get instruction operands
- Instruction *Instr = &*J;
- DivOpInfo Key(UseSignedOp, Instr->getOperand(0), Instr->getOperand(1));
+ DivOpInfo Key(UseSignedOp, I->getOperand(0), I->getOperand(1));
DivCacheTy::iterator CacheI = PerBBDivCache.find(Key);
if (CacheI == PerBBDivCache.end()) {
// If previous instance does not exist, insert fast div
- return insertFastDiv(F, I, J, BypassType, UseDivOp, UseSignedOp,
- PerBBDivCache);
+ return insertFastDiv(I, BypassType, UseDivOp, UseSignedOp, PerBBDivCache);
}
// Replace operation value with previously generated phi node
DivPhiNodes &Value = CacheI->second;
if (UseDivOp) {
// Replace all uses of div instruction with quotient phi node
- J->replaceAllUsesWith(Value.Quotient);
+ I->replaceAllUsesWith(Value.Quotient);
} else {
// Replace all uses of rem instruction with remainder phi node
- J->replaceAllUsesWith(Value.Remainder);
+ I->replaceAllUsesWith(Value.Remainder);
}
- // Advance to next operation
- ++J;
-
// Remove redundant operation
- Instr->eraseFromParent();
+ I->eraseFromParent();
return true;
}
-// bypassSlowDivision - This optimization identifies DIV instructions that can
-// be profitably bypassed and carried out with a shorter, faster divide.
-bool llvm::bypassSlowDivision(Function &F,
- Function::iterator &I,
- const DenseMap<unsigned int, unsigned int> &BypassWidths) {
+// bypassSlowDivision - This optimization identifies DIV instructions in a BB
+// that can be profitably bypassed and carried out with a shorter, faster
+// divide.
+bool llvm::bypassSlowDivision(
+ BasicBlock *BB, const DenseMap<unsigned int, unsigned int> &BypassWidths) {
DivCacheTy DivCache;
bool MadeChange = false;
- for (BasicBlock::iterator J = I->begin(); J != I->end(); J++) {
+ Instruction* Next = &*BB->begin();
+ while (Next != nullptr) {
+ // We may add instructions immediately after I, but we want to skip over
+ // them.
+ Instruction* I = Next;
+ Next = Next->getNextNode();
// Get instruction details
- unsigned Opcode = J->getOpcode();
+ unsigned Opcode = I->getOpcode();
bool UseDivOp = Opcode == Instruction::SDiv || Opcode == Instruction::UDiv;
bool UseRemOp = Opcode == Instruction::SRem || Opcode == Instruction::URem;
bool UseSignedOp = Opcode == Instruction::SDiv ||
@@ -240,11 +229,11 @@ bool llvm::bypassSlowDivision(Function &F,
continue;
// Skip division on vector types, only optimize integer instructions
- if (!J->getType()->isIntegerTy())
+ if (!I->getType()->isIntegerTy())
continue;
// Get bitwidth of div/rem instruction
- IntegerType *T = cast<IntegerType>(J->getType());
+ IntegerType *T = cast<IntegerType>(I->getType());
unsigned int bitwidth = T->getBitWidth();
// Continue if bitwidth is not bypassed
@@ -253,10 +242,9 @@ bool llvm::bypassSlowDivision(Function &F,
continue;
// Get type for div/rem instruction with bypass bitwidth
- IntegerType *BT = IntegerType::get(J->getContext(), BI->second);
+ IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
- MadeChange |= reuseOrInsertFastDiv(F, I, J, BT, UseDivOp,
- UseSignedOp, DivCache);
+ MadeChange |= reuseOrInsertFastDiv(I, BT, UseDivOp, UseSignedOp, DivCache);
}
return MadeChange;
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index e75163f323df..0e386ac83e9e 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -1305,8 +1305,9 @@ static bool markAliveBlocks(Function &F,
}
}
- // Turn invokes that call 'nounwind' functions into ordinary calls.
- if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
+ TerminatorInst *Terminator = BB->getTerminator();
+ if (auto *II = dyn_cast<InvokeInst>(Terminator)) {
+ // Turn invokes that call 'nounwind' functions into ordinary calls.
Value *Callee = II->getCalledValue();
if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
changeToUnreachable(II, true);
@@ -1321,6 +1322,44 @@ static bool markAliveBlocks(Function &F,
changeToCall(II);
Changed = true;
}
+ } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Terminator)) {
+ // Remove catchpads which cannot be reached.
+ struct CatchPadDenseMapInfo {
+ static CatchPadInst *getEmptyKey() {
+ return DenseMapInfo<CatchPadInst *>::getEmptyKey();
+ }
+ static CatchPadInst *getTombstoneKey() {
+ return DenseMapInfo<CatchPadInst *>::getTombstoneKey();
+ }
+ static unsigned getHashValue(CatchPadInst *CatchPad) {
+ return static_cast<unsigned>(hash_combine_range(
+ CatchPad->value_op_begin(), CatchPad->value_op_end()));
+ }
+ static bool isEqual(CatchPadInst *LHS, CatchPadInst *RHS) {
+ if (LHS == getEmptyKey() || LHS == getTombstoneKey() ||
+ RHS == getEmptyKey() || RHS == getTombstoneKey())
+ return LHS == RHS;
+ return LHS->isIdenticalTo(RHS);
+ }
+ };
+
+ // Set of unique CatchPads.
+ SmallDenseMap<CatchPadInst *, detail::DenseSetEmpty, 4,
+ CatchPadDenseMapInfo, detail::DenseSetPair<CatchPadInst *>>
+ HandlerSet;
+ detail::DenseSetEmpty Empty;
+ for (CatchSwitchInst::handler_iterator I = CatchSwitch->handler_begin(),
+ E = CatchSwitch->handler_end();
+ I != E; ++I) {
+ BasicBlock *HandlerBB = *I;
+ auto *CatchPad = cast<CatchPadInst>(HandlerBB->getFirstNonPHI());
+ if (!HandlerSet.insert({CatchPad, Empty}).second) {
+ CatchSwitch->removeHandler(I);
+ --I;
+ --E;
+ Changed = true;
+ }
+ }
}
Changed |= ConstantFoldTerminator(BB, true);
@@ -1514,8 +1553,8 @@ bool llvm::callsGCLeafFunction(ImmutableCallSite CS) {
return true;
// Check if the function is specifically marked as a gc leaf function.
- //
- // TODO: we should be checking the attributes on the call site as well.
+ if (CS.hasFnAttr("gc-leaf-function"))
+ return true;
if (const Function *F = CS.getCalledFunction())
return F->hasFnAttribute("gc-leaf-function");
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index d0932f834cf5..3bb3fa5a301f 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -20,6 +20,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -3448,18 +3449,26 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
if (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI)) break;
if (BBI->mayHaveSideEffects()) {
- if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
+ if (auto *SI = dyn_cast<StoreInst>(BBI)) {
if (SI->isVolatile())
break;
- } else if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
+ } else if (auto *LI = dyn_cast<LoadInst>(BBI)) {
if (LI->isVolatile())
break;
- } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(BBI)) {
+ } else if (auto *RMWI = dyn_cast<AtomicRMWInst>(BBI)) {
if (RMWI->isVolatile())
break;
- } else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(BBI)) {
+ } else if (auto *CXI = dyn_cast<AtomicCmpXchgInst>(BBI)) {
if (CXI->isVolatile())
break;
+ } else if (isa<CatchPadInst>(BBI)) {
+ // A catchpad may invoke exception object constructors and such, which
+ // in some languages can be arbitrary code, so be conservative by
+ // default.
+ // For CoreCLR, it just involves a type test, so can be removed.
+ if (classifyEHPersonality(BB->getParent()->getPersonalityFn()) !=
+ EHPersonality::CoreCLR)
+ break;
} else if (!isa<FenceInst>(BBI) && !isa<VAArgInst>(BBI) &&
!isa<LandingPadInst>(BBI)) {
break;
@@ -3485,7 +3494,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
TerminatorInst *TI = Preds[i]->getTerminator();
IRBuilder<> Builder(TI);
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (auto *BI = dyn_cast<BranchInst>(TI)) {
if (BI->isUnconditional()) {
if (BI->getSuccessor(0) == BB) {
new UnreachableInst(TI->getContext(), TI);
@@ -3502,7 +3511,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
Changed = true;
}
}
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ } else if (auto *SI = dyn_cast<SwitchInst>(TI)) {
for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
i != e; ++i)
if (i.getCaseSuccessor() == BB) {
@@ -3511,18 +3520,49 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
--i; --e;
Changed = true;
}
- } else if ((isa<InvokeInst>(TI) &&
- cast<InvokeInst>(TI)->getUnwindDest() == BB) ||
- isa<CatchSwitchInst>(TI)) {
- removeUnwindEdge(TI->getParent());
- Changed = true;
+ } else if (auto *II = dyn_cast<InvokeInst>(TI)) {
+ if (II->getUnwindDest() == BB) {
+ removeUnwindEdge(TI->getParent());
+ Changed = true;
+ }
+ } else if (auto *CSI = dyn_cast<CatchSwitchInst>(TI)) {
+ if (CSI->getUnwindDest() == BB) {
+ removeUnwindEdge(TI->getParent());
+ Changed = true;
+ continue;
+ }
+
+ for (CatchSwitchInst::handler_iterator I = CSI->handler_begin(),
+ E = CSI->handler_end();
+ I != E; ++I) {
+ if (*I == BB) {
+ CSI->removeHandler(I);
+ --I;
+ --E;
+ Changed = true;
+ }
+ }
+ if (CSI->getNumHandlers() == 0) {
+ BasicBlock *CatchSwitchBB = CSI->getParent();
+ if (CSI->hasUnwindDest()) {
+ // Redirect preds to the unwind dest
+ CatchSwitchBB->replaceAllUsesWith(CSI->getUnwindDest());
+ } else {
+ // Rewrite all preds to unwind to caller (or from invoke to call).
+ SmallVector<BasicBlock *, 8> EHPreds(predecessors(CatchSwitchBB));
+ for (BasicBlock *EHPred : EHPreds)
+ removeUnwindEdge(EHPred);
+ }
+ // The catchswitch is no longer reachable.
+ new UnreachableInst(CSI->getContext(), CSI);
+ CSI->eraseFromParent();
+ Changed = true;
+ }
} else if (isa<CleanupReturnInst>(TI)) {
new UnreachableInst(TI->getContext(), TI);
TI->eraseFromParent();
Changed = true;
}
- // TODO: We can remove a catchswitch if all it's catchpads end in
- // unreachable.
}
// If this block is now dead, remove it.
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 81dea6d1b9ae..dc5fee523d4c 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -57,8 +57,7 @@ static bool ignoreCallingConv(LibFunc::Func Func) {
Func == LibFunc::llabs || Func == LibFunc::strlen;
}
-/// isOnlyUsedInZeroEqualityComparison - Return true if it only matters that the
-/// value is equal or not-equal to zero.
+/// Return true if it only matters that the value is equal or not-equal to zero.
static bool isOnlyUsedInZeroEqualityComparison(Value *V) {
for (User *U : V->users()) {
if (ICmpInst *IC = dyn_cast<ICmpInst>(U))
@@ -72,8 +71,7 @@ static bool isOnlyUsedInZeroEqualityComparison(Value *V) {
return true;
}
-/// isOnlyUsedInEqualityComparison - Return true if it is only used in equality
-/// comparisons with With.
+/// Return true if it is only used in equality comparisons with With.
static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) {
for (User *U : V->users()) {
if (ICmpInst *IC = dyn_cast<ICmpInst>(U))
@@ -249,12 +247,12 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) {
!FT->getParamType(2)->isIntegerTy())
return nullptr;
- // Extract some information from the instruction
+ // Extract some information from the instruction.
Value *Dst = CI->getArgOperand(0);
Value *Src = CI->getArgOperand(1);
uint64_t Len;
- // We don't do anything if length is not constant
+ // We don't do anything if length is not constant.
if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
Len = LengthArg->getZExtValue();
else
@@ -272,12 +270,12 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) {
if (SrcLen == 0 || Len == 0)
return Dst;
- // We don't optimize this case
+ // We don't optimize this case.
if (Len < SrcLen)
return nullptr;
// strncat(x, s, c) -> strcat(x, s)
- // s is constant so the strcat can be optimized further
+ // s is constant so the strcat can be optimized further.
return emitStrLenMemCpy(Src, Dst, SrcLen, B);
}
@@ -310,7 +308,8 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) {
StringRef Str;
if (!getConstantStringInfo(SrcStr, Str)) {
if (CharC->isZero()) // strchr(p, 0) -> p + strlen(p)
- return B.CreateGEP(B.getInt8Ty(), SrcStr, EmitStrLen(SrcStr, B, DL, TLI), "strchr");
+ return B.CreateGEP(B.getInt8Ty(), SrcStr, EmitStrLen(SrcStr, B, DL, TLI),
+ "strchr");
return nullptr;
}
@@ -490,8 +489,8 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {
Type *PT = Callee->getFunctionType()->getParamType(0);
Value *LenV = ConstantInt::get(DL.getIntPtrType(PT), Len);
- Value *DstEnd =
- B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(DL.getIntPtrType(PT), Len - 1));
+ Value *DstEnd = B.CreateGEP(B.getInt8Ty(), Dst,
+ ConstantInt::get(DL.getIntPtrType(PT), Len - 1));
// We have enough information to now generate the memcpy call to do the
// copy for us. Make a memcpy to copy the nul byte with align = 1.
@@ -599,7 +598,8 @@ Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilder<> &B) {
if (I == StringRef::npos) // No match.
return Constant::getNullValue(CI->getType());
- return B.CreateGEP(B.getInt8Ty(), CI->getArgOperand(0), B.getInt64(I), "strpbrk");
+ return B.CreateGEP(B.getInt8Ty(), CI->getArgOperand(0), B.getInt64(I),
+ "strpbrk");
}
// strpbrk(s, "a") -> strchr(s, 'a')
@@ -878,8 +878,10 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) {
Type *RHSPtrTy =
IntType->getPointerTo(RHS->getType()->getPointerAddressSpace());
- Value *LHSV = B.CreateLoad(B.CreateBitCast(LHS, LHSPtrTy, "lhsc"), "lhsv");
- Value *RHSV = B.CreateLoad(B.CreateBitCast(RHS, RHSPtrTy, "rhsc"), "rhsv");
+ Value *LHSV =
+ B.CreateLoad(B.CreateBitCast(LHS, LHSPtrTy, "lhsc"), "lhsv");
+ Value *RHSV =
+ B.CreateLoad(B.CreateBitCast(RHS, RHSPtrTy, "rhsc"), "rhsv");
return B.CreateZExt(B.CreateICmpNE(LHSV, RHSV), CI->getType(), "memcmp");
}
@@ -992,6 +994,10 @@ Value *LibCallSimplifier::optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B,
Value *V = valueHasFloatPrecision(CI->getArgOperand(0));
if (V == nullptr)
return nullptr;
+
+ // Propagate fast-math flags from the existing call to the new call.
+ IRBuilder<>::FastMathFlagGuard Guard(B);
+ B.SetFastMathFlags(CI->getFastMathFlags());
// floor((double)floatval) -> (double)floorf(floatval)
if (Callee->isIntrinsic()) {
@@ -1027,6 +1033,10 @@ Value *LibCallSimplifier::optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) {
if (V2 == nullptr)
return nullptr;
+ // Propagate fast-math flags from the existing call to the new call.
+ IRBuilder<>::FastMathFlagGuard Guard(B);
+ B.SetFastMathFlags(CI->getFastMathFlags());
+
// fmin((double)floatval1, (double)floatval2)
// -> (double)fminf(floatval1, floatval2)
// TODO: Handle intrinsics in the same way as in optimizeUnaryDoubleFP().
@@ -1117,7 +1127,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
Callee->getAttributes());
}
- bool unsafeFPMath = canUseUnsafeFPMath(CI->getParent()->getParent());
+ bool UnsafeFPMath = canUseUnsafeFPMath(CI->getParent()->getParent());
// pow(exp(x), y) -> exp(x*y)
// pow(exp2(x), y) -> exp2(x * y)
@@ -1126,7 +1136,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
// underflow behavior quite dramatically.
// Example: x = 1000, y = 0.001.
// pow(exp(x), y) = pow(inf, 0.001) = inf, whereas exp(x*y) = exp(1).
- if (unsafeFPMath) {
+ if (UnsafeFPMath) {
if (auto *OpC = dyn_cast<CallInst>(Op1)) {
IRBuilder<>::FastMathFlagGuard Guard(B);
FastMathFlags FMF;
@@ -1157,7 +1167,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
LibFunc::fabsl)) {
// In -ffast-math, pow(x, 0.5) -> sqrt(x).
- if (unsafeFPMath)
+ if (UnsafeFPMath)
return EmitUnaryFloatFnCall(Op1, TLI->getName(LibFunc::sqrt), B,
Callee->getAttributes());
@@ -1183,7 +1193,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip");
// In -ffast-math, generate repeated fmul instead of generating pow(x, n).
- if (unsafeFPMath) {
+ if (UnsafeFPMath) {
APFloat V = abs(Op2C->getValueAPF());
// We limit to a max of 7 fmul(s). Thus max exponent is 32.
// This transformation applies to integer exponents only.
@@ -1291,12 +1301,9 @@ Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) {
// function, do that first.
Function *Callee = CI->getCalledFunction();
StringRef Name = Callee->getName();
- if ((Name == "fmin" && hasFloatVersion(Name)) ||
- (Name == "fmax" && hasFloatVersion(Name))) {
- Value *Ret = optimizeBinaryDoubleFP(CI, B);
- if (Ret)
+ if ((Name == "fmin" || Name == "fmax") && hasFloatVersion(Name))
+ if (Value *Ret = optimizeBinaryDoubleFP(CI, B))
return Ret;
- }
// Make sure this has 2 arguments of FP type which match the result type.
FunctionType *FT = Callee->getFunctionType();
@@ -1307,14 +1314,12 @@ Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) {
IRBuilder<>::FastMathFlagGuard Guard(B);
FastMathFlags FMF;
- Function *F = CI->getParent()->getParent();
- if (canUseUnsafeFPMath(F)) {
+ if (CI->hasUnsafeAlgebra()) {
// Unsafe algebra sets all fast-math-flags to true.
FMF.setUnsafeAlgebra();
} else {
// At a minimum, no-nans-fp-math must be true.
- Attribute Attr = F->getFnAttribute("no-nans-fp-math");
- if (Attr.getValueAsString() != "true")
+ if (!CI->hasNoNaNs())
return nullptr;
// No-signed-zeros is implied by the definitions of fmax/fmin themselves:
// "Ideally, fmax would be sensitive to the sign of zero, for example
@@ -2169,7 +2174,10 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
LibFunc::Func Func;
Function *Callee = CI->getCalledFunction();
StringRef FuncName = Callee->getName();
- IRBuilder<> Builder(CI);
+
+ SmallVector<OperandBundleDef, 2> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles);
bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C;
// Command-line parameter overrides function attribute.
@@ -2419,7 +2427,8 @@ bool FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI,
return false;
}
-Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B) {
+Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI,
+ IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy_chk))
@@ -2433,7 +2442,8 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> &
return nullptr;
}
-Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B) {
+Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI,
+ IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove_chk))
@@ -2447,7 +2457,8 @@ Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<>
return nullptr;
}
-Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, IRBuilder<> &B) {
+Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI,
+ IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset_chk))
@@ -2539,7 +2550,10 @@ Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) {
LibFunc::Func Func;
Function *Callee = CI->getCalledFunction();
StringRef FuncName = Callee->getName();
- IRBuilder<> Builder(CI);
+
+ SmallVector<OperandBundleDef, 2> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles);
bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C;
// First, check that this is a known library functions.
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index 1add78e01657..2e361d38ed0b 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -218,12 +218,12 @@ static Metadata *mapMetadataOp(Metadata *Op,
}
/// Resolve uniquing cycles involving the given metadata.
-static void resolveCycles(Metadata *MD, bool MDMaterialized) {
+static void resolveCycles(Metadata *MD, bool AllowTemps) {
if (auto *N = dyn_cast_or_null<MDNode>(MD)) {
- if (!MDMaterialized && N->isTemporary())
+ if (AllowTemps && N->isTemporary())
return;
if (!N->isResolved())
- N->resolveCycles(MDMaterialized);
+ N->resolveCycles(AllowTemps);
}
}
@@ -253,7 +253,7 @@ static bool remapOperands(MDNode &Node,
// Resolve uniquing cycles underneath distinct nodes on the fly so they
// don't infect later operands.
if (IsDistinct)
- resolveCycles(New, !(Flags & RF_HaveUnmaterializedMetadata));
+ resolveCycles(New, Flags & RF_HaveUnmaterializedMetadata);
}
}
@@ -401,7 +401,7 @@ Metadata *llvm::MapMetadata(const Metadata *MD, ValueToValueMapTy &VM,
return NewMD;
// Resolve cycles involving the entry metadata.
- resolveCycles(NewMD, !(Flags & RF_HaveUnmaterializedMetadata));
+ resolveCycles(NewMD, Flags & RF_HaveUnmaterializedMetadata);
// Remap the operands of distinct MDNodes.
while (!DistinctWorklist.empty())
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index a627dd665179..2c0d317d16bc 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4294,12 +4294,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
continue;
}
- if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop,
- Reductions[Phi])) {
- if (Reductions[Phi].hasUnsafeAlgebra())
- Requirements->addUnsafeAlgebraInst(
- Reductions[Phi].getUnsafeAlgebraInst());
- AllowedExit.insert(Reductions[Phi].getLoopExitInstr());
+ RecurrenceDescriptor RedDes;
+ if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) {
+ if (RedDes.hasUnsafeAlgebra())
+ Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
+ AllowedExit.insert(RedDes.getLoopExitInstr());
+ Reductions[Phi] = RedDes;
continue;
}
diff --git a/test/Analysis/BasicAA/memset_pattern.ll b/test/Analysis/BasicAA/memset_pattern.ll
index 25bdb2e202fb..821cbdf4bb06 100644
--- a/test/Analysis/BasicAA/memset_pattern.ll
+++ b/test/Analysis/BasicAA/memset_pattern.ll
@@ -18,4 +18,4 @@ entry:
ret i32 %l
}
-declare void @memset_pattern16(i8*, i8*, i64)
+declare void @memset_pattern16(i8*, i8* readonly, i64) argmemonly
diff --git a/test/Analysis/GlobalsModRef/argmemonly-escape.ll b/test/Analysis/GlobalsModRef/argmemonly-escape.ll
deleted file mode 100644
index 64c625810af9..000000000000
--- a/test/Analysis/GlobalsModRef/argmemonly-escape.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: opt < %s -O1 -S -enable-non-lto-gmr=true | FileCheck %s
-
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.10.0"
-
-@a = internal global [3 x i32] zeroinitializer, align 4
-
-; The important thing we're checking for here is the reload of (some element of)
-; @a after the memset.
-
-; CHECK-LABEL: @main
-; CHECK: load i32, i32* getelementptr {{.*}} @a
-; CHECK-NEXT: call void @memsetp0i8i64{{.*}} @a
-; CHECK-NEXT: load i32, i32* getelementptr {{.*}} @a
-; CHECK-NEXT: call void @memsetp0i8i64A{{.*}} @a
-; CHECK-NEXT: load i32, i32* getelementptr {{.*}} @a
-; CHECK: icmp eq
-; CHECK: br i1
-
-define i32 @main() {
-entry:
- %0 = bitcast [3 x i32]* @a to i8*
- %1 = load i32, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 2), align 4
- call void @memsetp0i8i64(i8* %0, i8 0, i64 4, i32 4, i1 false)
- %2 = load i32, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 2), align 4
- call void @memsetp0i8i64A(i8* %0, i8 0, i64 4, i32 4, i1 false)
- %3 = load i32, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 2), align 4
- %4 = add i32 %2, %3
- %cmp1 = icmp eq i32 %1, %4
- br i1 %cmp1, label %if.then, label %if.end
-
-if.then: ; preds = %entr
- call void @abort() #3
- unreachable
-
-if.end: ; preds = %entry
- ret i32 0
-}
-
-; Function Attrs: nounwind argmemonly
-declare void @memsetp0i8i64(i8* nocapture, i8, i64, i32, i1) nounwind argmemonly
-
-; Function Attrs: nounwind inaccessiblemem_or_argmemonly
-declare void @memsetp0i8i64A(i8* nocapture, i8, i64, i32, i1) nounwind inaccessiblemem_or_argmemonly
-
-; Function Attrs: noreturn nounwind
-declare void @abort() noreturn nounwind
diff --git a/test/Analysis/GlobalsModRef/inaccessiblememonly.ll b/test/Analysis/GlobalsModRef/inaccessiblememonly.ll
new file mode 100644
index 000000000000..d7a3cfc78a33
--- /dev/null
+++ b/test/Analysis/GlobalsModRef/inaccessiblememonly.ll
@@ -0,0 +1,21 @@
+; RUN: opt -O3 -S < %s | FileCheck %s
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @donteliminate() {
+; CHECK-LABEL: donteliminate
+; CHECK-NEXT: tail call noalias i8* @allocmemory()
+; CHECK-NEXT: tail call noalias i8* @allocmemory()
+; CHECK-NEXT: tail call noalias i8* @allocmemory()
+; CHECK-NEXT: ret void
+ %1 = tail call noalias i8* @allocmemory()
+ %2 = tail call noalias i8* @allocmemory()
+ %3 = tail call noalias i8* @allocmemory()
+ ret void
+}
+
+; Function Attrs: inaccessiblememonly
+declare noalias i8* @allocmemory() #0
+
+attributes #0 = { inaccessiblememonly }
diff --git a/test/Analysis/GlobalsModRef/modreftest.ll b/test/Analysis/GlobalsModRef/modreftest.ll
index 2018b149fc06..07497705e65a 100644
--- a/test/Analysis/GlobalsModRef/modreftest.ll
+++ b/test/Analysis/GlobalsModRef/modreftest.ll
@@ -16,23 +16,3 @@ define i32 @test(i32* %P) {
define void @doesnotmodX() {
ret void
}
-
-declare void @InaccessibleMemOnlyFunc( ) #0
-declare void @InaccessibleMemOrArgMemOnlyFunc( ) #1
-
-define i32 @test2(i32* %P) {
-; CHECK: @test2
-; CHECK-NEXT: store i32 12, i32* @X
-; CHECK-NEXT: call void @InaccessibleMemOnlyFunc()
-; CHECK-NEXT: call void @InaccessibleMemOrArgMemOnlyFunc()
-; CHECK-NOT: load i32
-; CHECK-NEXT: ret i32 12
- store i32 12, i32* @X
- call void @InaccessibleMemOnlyFunc( )
- call void @InaccessibleMemOrArgMemOnlyFunc( )
- %V = load i32, i32* @X ; <i32> [#uses=1]
- ret i32 %V
-}
-
-attributes #0 = { inaccessiblememonly }
-attributes #1 = { inaccessiblemem_or_argmemonly }
diff --git a/test/Analysis/ValueTracking/known-power-of-two.ll b/test/Analysis/ValueTracking/known-power-of-two.ll
new file mode 100644
index 000000000000..ed98a8f53616
--- /dev/null
+++ b/test/Analysis/ValueTracking/known-power-of-two.ll
@@ -0,0 +1,20 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; https://llvm.org/bugs/show_bug.cgi?id=25900
+; An arithmetic shift right of a power of two is not a power
+; of two if the original value is the sign bit. Therefore,
+; we can't transform the sdiv into a udiv.
+
+define i32 @pr25900(i32 %d) {
+ %and = and i32 %d, -2147483648
+; The next 3 lines prevent another fold from masking the bug.
+ %ext = zext i32 %and to i64
+ %or = or i64 %ext, 4294967296
+ %trunc = trunc i64 %or to i32
+ %ashr = ashr exact i32 %trunc, 31
+ %div = sdiv i32 4, %ashr
+ ret i32 %div
+
+; CHECK: sdiv
+}
+
diff --git a/test/Bitcode/compatibility.ll b/test/Bitcode/compatibility.ll
index 31e501de0a11..9363f503be5c 100644
--- a/test/Bitcode/compatibility.ll
+++ b/test/Bitcode/compatibility.ll
@@ -859,17 +859,23 @@ catchpad:
; CHECK-NEXT: br label %body
body:
- invoke void @f.ccc() to label %continue unwind label %terminate
+ invoke void @f.ccc() to label %continue unwind label %terminate.inner
catchret from %catch to label %return
; CHECK: catchret from %catch to label %return
return:
ret i32 0
+terminate.inner:
+ cleanuppad within %catch []
+ unreachable
+ ; CHECK: cleanuppad within %catch []
+ ; CHECK-NEXT: unreachable
+
terminate:
- cleanuppad within %cs []
+ cleanuppad within none []
unreachable
- ; CHECK: cleanuppad within %cs []
+ ; CHECK: cleanuppad within none []
; CHECK-NEXT: unreachable
continue:
diff --git a/test/CodeGen/AArch64/arm64-vector-ext.ll b/test/CodeGen/AArch64/arm64-vector-ext.ll
index 994a9956cf7f..921cf6a6f0d1 100644
--- a/test/CodeGen/AArch64/arm64-vector-ext.ll
+++ b/test/CodeGen/AArch64/arm64-vector-ext.ll
@@ -1,27 +1,27 @@
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
-
-;CHECK: @func30
-;CHECK: movi.4h v1, #0x1
-;CHECK: and.8b v0, v0, v1
-;CHECK: ushll.4s v0, v0, #0
-;CHECK: str q0, [x0]
-;CHECK: ret
-
-%T0_30 = type <4 x i1>
-%T1_30 = type <4 x i32>
-define void @func30(%T0_30 %v0, %T1_30* %p1) {
- %r = zext %T0_30 %v0 to %T1_30
- store %T1_30 %r, %T1_30* %p1
- ret void
-}
-
-; Extend from v1i1 was crashing things (PR20791). Make sure we do something
-; sensible instead.
-define <1 x i32> @autogen_SD7918() {
-; CHECK-LABEL: autogen_SD7918
-; CHECK: movi d0, #0000000000000000
-; CHECK-NEXT: ret
- %I29 = insertelement <1 x i1> zeroinitializer, i1 false, i32 0
- %ZE = zext <1 x i1> %I29 to <1 x i32>
- ret <1 x i32> %ZE
-}
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+;CHECK: @func30
+;CHECK: movi.4h v1, #0x1
+;CHECK: and.8b v0, v0, v1
+;CHECK: ushll.4s v0, v0, #0
+;CHECK: str q0, [x0]
+;CHECK: ret
+
+%T0_30 = type <4 x i1>
+%T1_30 = type <4 x i32>
+define void @func30(%T0_30 %v0, %T1_30* %p1) {
+ %r = zext %T0_30 %v0 to %T1_30
+ store %T1_30 %r, %T1_30* %p1
+ ret void
+}
+
+; Extend from v1i1 was crashing things (PR20791). Make sure we do something
+; sensible instead.
+define <1 x i32> @autogen_SD7918() {
+; CHECK-LABEL: autogen_SD7918
+; CHECK: movi d0, #0000000000000000
+; CHECK-NEXT: ret
+ %I29 = insertelement <1 x i1> zeroinitializer, i1 false, i32 0
+ %ZE = zext <1 x i1> %I29 to <1 x i32>
+ ret <1 x i32> %ZE
+}
diff --git a/test/CodeGen/AArch64/cpus.ll b/test/CodeGen/AArch64/cpus.ll
index a8399f92ebe4..9c2a4fd55d1b 100644
--- a/test/CodeGen/AArch64/cpus.ll
+++ b/test/CodeGen/AArch64/cpus.ll
@@ -6,6 +6,7 @@
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a72 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=exynos-m1 2>&1 | FileCheck %s
; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
; CHECK-NOT: {{.*}} is not a recognized processor for this target
diff --git a/test/CodeGen/AArch64/remat.ll b/test/CodeGen/AArch64/remat.ll
index a397c339a2d7..c2721e70190a 100644
--- a/test/CodeGen/AArch64/remat.ll
+++ b/test/CodeGen/AArch64/remat.ll
@@ -2,6 +2,7 @@
; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a57 -o - %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a53 -o - %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a72 -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=exynos-m1 -o - %s | FileCheck %s
%X = type { i64, i64, i64 }
declare void @f(%X*)
diff --git a/test/CodeGen/AArch64/tbz-tbnz.ll b/test/CodeGen/AArch64/tbz-tbnz.ll
index 8863f70444d1..2099333950ea 100644
--- a/test/CodeGen/AArch64/tbz-tbnz.ll
+++ b/test/CodeGen/AArch64/tbz-tbnz.ll
@@ -256,3 +256,106 @@ if.then:
if.end:
ret void
}
+
+define void @test14(i1 %cond) {
+; CHECK-LABEL: @test14
+ br i1 %cond, label %if.end, label %if.then
+
+; CHECK-NOT: and
+; CHECK: tbnz w0, #0
+
+if.then:
+ call void @t()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @test15(i1 %cond) {
+; CHECK-LABEL: @test15
+ %cond1 = xor i1 %cond, -1
+ br i1 %cond1, label %if.then, label %if.end
+
+; CHECK-NOT: movn
+; CHECK: tbnz w0, #0
+
+if.then:
+ call void @t()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @test16(i64 %in) {
+; CHECK-LABEL: @test16
+ %shl = shl i64 %in, 3
+ %and = and i64 %shl, 32
+ %cond = icmp eq i64 %and, 0
+ br i1 %cond, label %then, label %end
+
+; CHECK-NOT: lsl
+; CHECK: tbnz w0, #2
+
+then:
+ call void @t()
+ br label %end
+
+end:
+ ret void
+}
+
+define void @test17(i64 %in) {
+; CHECK-LABEL: @test17
+ %shr = ashr i64 %in, 3
+ %and = and i64 %shr, 1
+ %cond = icmp eq i64 %and, 0
+ br i1 %cond, label %then, label %end
+
+; CHECK-NOT: lsr
+; CHECK: tbnz w0, #3
+
+then:
+ call void @t()
+ br label %end
+
+end:
+ ret void
+}
+
+define void @test18(i32 %in) {
+; CHECK-LABEL: @test18
+ %shr = ashr i32 %in, 2
+ %cond = icmp sge i32 %shr, 0
+ br i1 %cond, label %then, label %end
+
+; CHECK-NOT: asr
+; CHECK: tbnz w0, #31
+
+then:
+ call void @t()
+ br label %end
+
+end:
+ ret void
+}
+
+define void @test19(i64 %in) {
+; CHECK-LABEL: @test19
+ %shl = lshr i64 %in, 3
+ %trunc = trunc i64 %shl to i32
+ %and = and i32 %trunc, 1
+ %cond = icmp eq i32 %and, 0
+ br i1 %cond, label %then, label %end
+
+; CHECK-NOT: ubfx
+; CHECK: tbnz w0, #3
+
+then:
+ call void @t()
+ br label %end
+
+end:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index e2ae3353ae1d..9aea7c773431 100644
--- a/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -1,8 +1,10 @@
-; RUN: llc < %s -march=amdgcn -mcpu=kaveri -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=CI
-; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=VI
+; RUN: llc < %s -march=amdgcn -mcpu=kaveri -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=CI --check-prefix=NO-XNACK
+; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=VI --check-prefix=NO-XNACK
+; RUN: llc < %s -march=amdgcn -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=XNACK
; GCN-LABEL: {{^}}no_vcc_no_flat:
-; GCN: ; NumSgprs: 8
+; NO-XNACK: ; NumSgprs: 8
+; XNACK: ; NumSgprs: 12
define void @no_vcc_no_flat() {
entry:
call void asm sideeffect "", "~{SGPR7}"()
@@ -10,7 +12,8 @@ entry:
}
; GCN-LABEL: {{^}}vcc_no_flat:
-; GCN: ; NumSgprs: 10
+; NO-XNACK: ; NumSgprs: 10
+; XNACK: ; NumSgprs: 12
define void @vcc_no_flat() {
entry:
call void asm sideeffect "", "~{SGPR7},~{VCC}"()
@@ -19,7 +22,8 @@ entry:
; GCN-LABEL: {{^}}no_vcc_flat:
; CI: ; NumSgprs: 12
-; VI: ; NumSgprs: 14
+; VI: ; NumSgprs: 12
+; XNACK: ; NumSgprs: 14
define void @no_vcc_flat() {
entry:
call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"()
@@ -28,7 +32,8 @@ entry:
; GCN-LABEL: {{^}}vcc_flat:
; CI: ; NumSgprs: 12
-; VI: ; NumSgprs: 14
+; VI: ; NumSgprs: 12
+; XNACK: ; NumSgprs: 14
define void @vcc_flat() {
entry:
call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"()
diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll
index 8347b8c96ec4..84380b421051 100644
--- a/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -1,7 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=carrizo < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
; FIXME: align on alloca seems to be ignored for private_segment_alignment
diff --git a/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
index 141ee2560152..b6f8093313cb 100644
--- a/test/CodeGen/AMDGPU/large-alloca-graphics.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
@@ -1,5 +1,5 @@
; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=carrizo < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
; ALL-LABEL: {{^}}large_alloca_pixel_shader:
; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
diff --git a/test/CodeGen/AMDGPU/load.ll b/test/CodeGen/AMDGPU/load.ll
index 6a04261fe47b..6486c6ab2ffc 100644
--- a/test/CodeGen/AMDGPU/load.ll
+++ b/test/CodeGen/AMDGPU/load.ll
@@ -1,7 +1,8 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-NOHSA --check-prefix=FUNC %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs | FileCheck --check-prefix=FUNC --check-prefix=CI-HSA --check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI-NOHSA --check-prefix=FUNC %s
;===------------------------------------------------------------------------===;
; GLOBAL ADDRESS SPACE
@@ -11,7 +12,8 @@
; FUNC-LABEL: {{^}}load_i8:
; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI: buffer_load_ubyte v{{[0-9]+}},
+; SI-NOHSA: buffer_load_ubyte v{{[0-9]+}},
+; CI-HSA: flat_load_ubyte
define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
%1 = load i8, i8 addrspace(1)* %in
%2 = zext i8 %1 to i32
@@ -23,7 +25,8 @@ define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; R600: 8
-; SI: buffer_load_sbyte
+; SI-NOHSA: buffer_load_sbyte
+; CI-HSA: flat_load_sbyte
define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%0 = load i8, i8 addrspace(1)* %in
@@ -35,8 +38,10 @@ entry:
; FUNC-LABEL: {{^}}load_v2i8:
; R600: VTX_READ_8
; R600: VTX_READ_8
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
+; SI-NOHSA: buffer_load_ubyte
+; SI-NOHSA: buffer_load_ubyte
+; CI-HSA: flat_load_ubyte
+; CI-HSA: flat_load_ubyte
define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
entry:
%0 = load <2 x i8>, <2 x i8> addrspace(1)* %in
@@ -53,8 +58,10 @@ entry:
; R600-DAG: 8
; R600-DAG: 8
-; SI: buffer_load_sbyte
-; SI: buffer_load_sbyte
+; SI-NOHSA: buffer_load_sbyte
+; SI-NOHSA: buffer_load_sbyte
+; CI-HSA: flat_load_sbyte
+; CI-HSA: flat_load_sbyte
define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
entry:
%0 = load <2 x i8>, <2 x i8> addrspace(1)* %in
@@ -68,10 +75,14 @@ entry:
; R600: VTX_READ_8
; R600: VTX_READ_8
; R600: VTX_READ_8
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
+; SI-NOHSA: buffer_load_ubyte
+; SI-NOHSA: buffer_load_ubyte
+; SI-NOHSA: buffer_load_ubyte
+; SI-NOHSA: buffer_load_ubyte
+; CI-HSA: flat_load_ubyte
+; CI-HSA: flat_load_ubyte
+; CI-HSA: flat_load_ubyte
+; CI-HSA: flat_load_ubyte
define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
entry:
%0 = load <4 x i8>, <4 x i8> addrspace(1)* %in
@@ -93,10 +104,14 @@ entry:
; R600-DAG: 8
; R600-DAG: 8
; R600-DAG: 8
-; SI: buffer_load_sbyte
-; SI: buffer_load_sbyte
-; SI: buffer_load_sbyte
-; SI: buffer_load_sbyte
+; SI-NOHSA: buffer_load_sbyte
+; SI-NOHSA: buffer_load_sbyte
+; SI-NOHSA: buffer_load_sbyte
+; SI-NOHSA: buffer_load_sbyte
+; CI-HSA: flat_load_sbyte
+; CI-HSA: flat_load_sbyte
+; CI-HSA: flat_load_sbyte
+; CI-HSA: flat_load_sbyte
define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
entry:
%0 = load <4 x i8>, <4 x i8> addrspace(1)* %in
@@ -108,7 +123,8 @@ entry:
; Load an i16 value from the global address space.
; FUNC-LABEL: {{^}}load_i16:
; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI: buffer_load_ushort
+; SI-NOHSA: buffer_load_ushort
+; CI-HSA: flat_load_ushort
define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
entry:
%0 = load i16 , i16 addrspace(1)* %in
@@ -121,7 +137,8 @@ entry:
; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; R600: 16
-; SI: buffer_load_sshort
+; SI-NOHSA: buffer_load_sshort
+; CI-HSA: flat_load_sshort
define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
entry:
%0 = load i16, i16 addrspace(1)* %in
@@ -133,8 +150,10 @@ entry:
; FUNC-LABEL: {{^}}load_v2i16:
; R600: VTX_READ_16
; R600: VTX_READ_16
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; SI-NOHSA: buffer_load_ushort
+; SI-NOHSA: buffer_load_ushort
+; CI-HSA: flat_load_ushort
+; CI-HSA: flat_load_ushort
define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
entry:
%0 = load <2 x i16>, <2 x i16> addrspace(1)* %in
@@ -150,8 +169,10 @@ entry:
; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
; R600-DAG: 16
; R600-DAG: 16
-; SI: buffer_load_sshort
-; SI: buffer_load_sshort
+; SI-NOHSA: buffer_load_sshort
+; SI-NOHSA: buffer_load_sshort
+; CI-HSA: flat_load_sshort
+; CI-HSA: flat_load_sshort
define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
entry:
%0 = load <2 x i16>, <2 x i16> addrspace(1)* %in
@@ -165,10 +186,14 @@ entry:
; R600: VTX_READ_16
; R600: VTX_READ_16
; R600: VTX_READ_16
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; SI-NOHSA: buffer_load_ushort
+; SI-NOHSA: buffer_load_ushort
+; SI-NOHSA: buffer_load_ushort
+; SI-NOHSA: buffer_load_ushort
+; CI-HSA: flat_load_ushort
+; CI-HSA: flat_load_ushort
+; CI-HSA: flat_load_ushort
+; CI-HSA: flat_load_ushort
define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
entry:
%0 = load <4 x i16>, <4 x i16> addrspace(1)* %in
@@ -190,10 +215,14 @@ entry:
; R600-DAG: 16
; R600-DAG: 16
; R600-DAG: 16
-; SI: buffer_load_sshort
-; SI: buffer_load_sshort
-; SI: buffer_load_sshort
-; SI: buffer_load_sshort
+; SI-NOHSA: buffer_load_sshort
+; SI-NOHSA: buffer_load_sshort
+; SI-NOHSA: buffer_load_sshort
+; SI-NOHSA: buffer_load_sshort
+; CI-HSA: flat_load_sshort
+; CI-HSA: flat_load_sshort
+; CI-HSA: flat_load_sshort
+; CI-HSA: flat_load_sshort
define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
entry:
%0 = load <4 x i16>, <4 x i16> addrspace(1)* %in
@@ -206,7 +235,8 @@ entry:
; FUNC-LABEL: {{^}}load_i32:
; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-; SI: buffer_load_dword v{{[0-9]+}}
+; SI-NOHSA: buffer_load_dword v{{[0-9]+}}
+; CI-HSA: flat_load_dword
define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%0 = load i32, i32 addrspace(1)* %in
@@ -218,7 +248,8 @@ entry:
; FUNC-LABEL: {{^}}load_f32:
; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-; SI: buffer_load_dword v{{[0-9]+}}
+; SI-NOHSA: buffer_load_dword v{{[0-9]+}}
+; CI-HSA: flat_load_dword
define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
entry:
%0 = load float, float addrspace(1)* %in
@@ -230,7 +261,8 @@ entry:
; FUNC-LABEL: {{^}}load_v2f32:
; R600: MEM_RAT
; R600: VTX_READ_64
-; SI: buffer_load_dwordx2
+; SI-NOHSA: buffer_load_dwordx2
+; CI-HSA: flat_load_dwordx2
define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
entry:
%0 = load <2 x float>, <2 x float> addrspace(1)* %in
@@ -240,7 +272,8 @@ entry:
; FUNC-LABEL: {{^}}load_i64:
; R600: VTX_READ_64
-; SI: buffer_load_dwordx2
+; SI-NOHSA: buffer_load_dwordx2
+; CI-HSA: flat_load_dwordx2
define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
entry:
%0 = load i64, i64 addrspace(1)* %in
@@ -253,7 +286,8 @@ entry:
; R600: MEM_RAT
; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x
; R600: 31
-; SI: buffer_load_dword
+; SI-NOHSA: buffer_load_dword
+; CI-HSA: flat_load_dword
define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
@@ -278,8 +312,10 @@ entry:
; R600: VTX_READ_128
; R600: VTX_READ_128
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; CI-HSA: flat_load_dwordx4
+; CI-HSA: flat_load_dwordx4
define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) {
entry:
%0 = load <8 x i32>, <8 x i32> addrspace(1)* %in
@@ -293,10 +329,14 @@ entry:
; R600: VTX_READ_128
; R600: VTX_READ_128
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; SI-NOHSA: buffer_load_dwordx4
+; CI-HSA: flat_load_dwordx4
+; CI-HSA: flat_load_dwordx4
+; CI-HSA: flat_load_dwordx4
+; CI-HSA: flat_load_dwordx4
define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) {
entry:
%0 = load <16 x i32>, <16 x i32> addrspace(1)* %in
@@ -313,7 +353,8 @@ entry:
; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; R600: 8
-; SI: buffer_load_sbyte v{{[0-9]+}},
+; SI-NOHSA: buffer_load_sbyte v{{[0-9]+}},
+; CI-HSA: flat_load_sbyte v{{[0-9]+}},
define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
entry:
%0 = load i8, i8 addrspace(2)* %in
@@ -325,7 +366,8 @@ entry:
; Load an aligned i8 value
; FUNC-LABEL: {{^}}load_const_i8_aligned:
; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI: buffer_load_ubyte v{{[0-9]+}},
+; SI-NOHSA: buffer_load_ubyte v{{[0-9]+}},
+; CI-HSA: flat_load_ubyte v{{[0-9]+}},
define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
entry:
%0 = load i8, i8 addrspace(2)* %in
@@ -337,7 +379,8 @@ entry:
; Load an un-aligned i8 value
; FUNC-LABEL: {{^}}load_const_i8_unaligned:
; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI: buffer_load_ubyte v{{[0-9]+}},
+; SI-NOHSA: buffer_load_ubyte v{{[0-9]+}},
+; CI-HSA: flat_load_ubyte v{{[0-9]+}},
define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
entry:
%0 = getelementptr i8, i8 addrspace(2)* %in, i32 1
@@ -352,7 +395,8 @@ entry:
; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; R600: 16
-; SI: buffer_load_sshort
+; SI-NOHSA: buffer_load_sshort
+; CI-HSA: flat_load_sshort
define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
entry:
%0 = load i16, i16 addrspace(2)* %in
@@ -364,7 +408,8 @@ entry:
; Load an aligned i16 value
; FUNC-LABEL: {{^}}load_const_i16_aligned:
; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI: buffer_load_ushort
+; SI-NOHSA: buffer_load_ushort
+; CI-HSA: flat_load_ushort
define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
entry:
%0 = load i16, i16 addrspace(2)* %in
@@ -376,7 +421,8 @@ entry:
; Load an un-aligned i16 value
; FUNC-LABEL: {{^}}load_const_i16_unaligned:
; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI: buffer_load_ushort
+; SI-NOHSA: buffer_load_ushort
+; CI-HSA: flat_load_ushort
define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
entry:
%0 = getelementptr i16, i16 addrspace(2)* %in, i32 1
diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll
index a30c25e700ab..551f34339a12 100644
--- a/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s
declare i32 @llvm.r600.read.tidig.x() #0
declare i32 @llvm.r600.read.tidig.y() #0
@@ -18,8 +19,10 @@ declare i32 @llvm.r600.read.tidig.y() #0
; Make sure we aren't using VGPR's for the srsrc operand of BUFFER_LOAD_*
; instructions
-; GCN: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
-; GCN: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
entry:
@@ -50,8 +53,10 @@ done: ; preds = %loop
; Test moving an SMRD instruction to the VALU
; GCN-LABEL: {{^}}smrd_valu:
+; FIXME: We should be using flat load for HSA.
; GCN: buffer_load_dword [[OUT:v[0-9]+]]
-; GCN: buffer_store_dword [[OUT]]
+; GCN-NOHSA: buffer_store_dword [[OUT]]
+; GCN-HSA: flat_store_dword [[OUT]]
define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
entry:
%tmp = icmp ne i32 %a, 0
@@ -77,8 +82,9 @@ endif: ; preds = %else, %if
; Test moving an SMRD with an immediate offset to the VALU
; GCN-LABEL: {{^}}smrd_valu2:
-; GCN-NOT: v_add
-; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}}
+; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -91,12 +97,14 @@ entry:
; Use a big offset that will use the SMRD literal offset on CI
; GCN-LABEL: {{^}}smrd_valu_ci_offset:
-; GCN-NOT: v_add
-; GCN: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4e20{{$}}
-; GCN-NOT: v_add
-; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
-; GCN: v_add_i32_e32
-; GCN: buffer_store_dword
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4e20{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: buffer_store_dword
+; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN-HSA: flat_store_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -109,13 +117,14 @@ entry:
}
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x2:
-; GCN-NOT: v_add
-; GCN: s_mov_b32 [[OFFSET:s[0-9]+]], 0x9c40{{$}}
-; GCN-NOT: v_add
-; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: buffer_store_dwordx2
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: s_mov_b32 [[OFFSET:s[0-9]+]], 0x9c40{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: buffer_store_dwordx2
+; GCN-HSA: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -128,15 +137,16 @@ entry:
}
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x4:
-; GCN-NOT: v_add
-; GCN: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4d20{{$}}
-; GCN-NOT: v_add
-; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: buffer_store_dwordx4
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4d20{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-HSA: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -152,25 +162,27 @@ entry:
; CI.
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8:
-; GCN-NOT: v_add
-; GCN: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
-; GCN-NOT: v_add
-; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
-; GCN-NOT: v_add
-; GCN: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
-; GCN-NOT: v_add
-; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
-
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: buffer_store_dwordx4
-; GCN: buffer_store_dwordx4
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
+
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
define void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -184,35 +196,40 @@ entry:
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16:
-; GCN-NOT: v_add
-; GCN: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}}
-; GCN-NOT: v_add
-; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
-; GCN-NOT: v_add
-; GCN: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}}
-; GCN-NOT: v_add
-; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
-; GCN-NOT: v_add
-; GCN: s_mov_b32 [[OFFSET2:s[0-9]+]], 0x134a0{{$}}
-; GCN-NOT: v_add
-; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET2]] addr64{{$}}
-; GCN-NOT: v_add
-; GCN: s_mov_b32 [[OFFSET3:s[0-9]+]], 0x134b0{{$}}
-; GCN-NOT: v_add
-; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET3]] addr64{{$}}
-
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: buffer_store_dwordx4
-; GCN: buffer_store_dwordx4
-; GCN: buffer_store_dwordx4
-; GCN: buffer_store_dwordx4
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: s_mov_b32 [[OFFSET2:s[0-9]+]], 0x134a0{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET2]] addr64{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: s_mov_b32 [[OFFSET3:s[0-9]+]], 0x134b0{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET3]] addr64{{$}}
+
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+; GCN-NOHSA: buffer_store_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
; GCN: s_endpgm
define void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 {
@@ -227,9 +244,11 @@ entry:
}
; GCN-LABEL: {{^}}smrd_valu2_salu_user:
-; GCN: buffer_load_dword [[MOVED:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; GCN-NOHSA: buffer_load_dword [[MOVED:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+; GCN-HSA: flat_load_dword [[MOVED:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]]
-; GCN: buffer_store_dword [[ADD]]
+; GCN-NOHSA: buffer_store_dword [[ADD]]
+; GCN-HSA: flat_store_dword [[ADD]]
define void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -242,7 +261,8 @@ entry:
}
; GCN-LABEL: {{^}}smrd_valu2_max_smrd_offset:
-; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}}
+; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}}
+; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -254,8 +274,9 @@ entry:
}
; GCN-LABEL: {{^}}smrd_valu2_mubuf_offset:
-; GCN-NOT: v_add
-; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}}
+; GCN-NOHSA-NOT: v_add
+; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}}
+; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -267,8 +288,10 @@ entry:
}
; GCN-LABEL: {{^}}s_load_imm_v8i32:
-; GCN: buffer_load_dwordx4
-; GCN: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
%tmp0 = tail call i32 @llvm.r600.read.tidig.x()
@@ -280,16 +303,18 @@ entry:
}
; GCN-LABEL: {{^}}s_load_imm_v8i32_salu_user:
-; GCN: buffer_load_dwordx4
-; GCN: buffer_load_dwordx4
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: buffer_store_dword
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: buffer_store_dword
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
%tmp0 = tail call i32 @llvm.r600.read.tidig.x()
@@ -319,10 +344,14 @@ entry:
}
; GCN-LABEL: {{^}}s_load_imm_v16i32:
-; GCN: buffer_load_dwordx4
-; GCN: buffer_load_dwordx4
-; GCN: buffer_load_dwordx4
-; GCN: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
%tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -334,26 +363,30 @@ entry:
}
; GCN-LABEL: {{^}}s_load_imm_v16i32_salu_user:
-; GCN: buffer_load_dwordx4
-; GCN: buffer_load_dwordx4
-; GCN: buffer_load_dwordx4
-; GCN: buffer_load_dwordx4
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: v_add_i32_e32
-; GCN: buffer_store_dword
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: v_add_i32_e32
+; GCN-NOHSA: buffer_store_dword
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
%tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
diff --git a/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll b/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
new file mode 100644
index 000000000000..c91a44cf60e5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
@@ -0,0 +1,24 @@
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=TONGA %s
+
+; On Tonga and Iceland, limited SGPR availability means care must be taken to
+; allocate scratch registers correctly. Check that this test compiles without
+; error.
+; TONGA-LABEL: test
+define void @test(<256 x i32> addrspace(1)* %out, <256 x i32> addrspace(1)* %in) {
+entry:
+ %tid = call i32 @llvm.SI.tid() nounwind readnone
+ %aptr = getelementptr <256 x i32>, <256 x i32> addrspace(1)* %in, i32 %tid
+ %a = load <256 x i32>, <256 x i32> addrspace(1)* %aptr
+ call void asm sideeffect "", "~{memory}" ()
+ %outptr = getelementptr <256 x i32>, <256 x i32> addrspace(1)* %in, i32 %tid
+ store <256 x i32> %a, <256 x i32> addrspace(1)* %outptr
+
+; mark 128-bit SGPR registers as used so they are unavailable for the
+; scratch resource descriptor
+ call void asm sideeffect "", "~{SGPR4},~{SGPR8},~{SGPR12},~{SGPR16},~{SGPR20},~{SGPR24},~{SGPR28}" ()
+ call void asm sideeffect "", "~{SGPR32},~{SGPR36},~{SGPR40},~{SGPR44},~{SGPR48},~{SGPR52},~{SGPR56}" ()
+ call void asm sideeffect "", "~{SGPR60},~{SGPR64},~{SGPR68}" ()
+ ret void
+}
+
+declare i32 @llvm.SI.tid() nounwind readnone
diff --git a/test/CodeGen/ARM/build-attributes.ll b/test/CodeGen/ARM/build-attributes.ll
index bf502b3ae077..a74b3e441a13 100644
--- a/test/CodeGen/ARM/build-attributes.ll
+++ b/test/CodeGen/ARM/build-attributes.ll
@@ -109,6 +109,9 @@
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A72-FAST
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv8.1a-linux-gnueabi | FileCheck %s --check-prefix=GENERIC-ARMV8_1-A
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m1 | FileCheck %s --check-prefix=EXYNOS-M1
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m1 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=EXYNOS-M1-FAST
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m1 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv8.1a-linux-gnueabi -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=GENERIC-ARMV8_1-A-FAST
; RUN: llc < %s -mtriple=armv8.1a-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 | FileCheck %s --check-prefix=CORTEX-A7-CHECK
@@ -138,6 +141,9 @@
; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a72 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a72 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=exynos-m1 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=exynos-m1 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+
; ARMv7a
; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
@@ -1238,6 +1244,36 @@
; CORTEX-A72-FAST-NOT: .eabi_attribute 22
; CORTEX-A72-FAST: .eabi_attribute 23, 1
+; EXYNOS-M1: .cpu exynos-m1
+; EXYNOS-M1: .eabi_attribute 6, 14
+; EXYNOS-M1: .eabi_attribute 7, 65
+; EXYNOS-M1: .eabi_attribute 8, 1
+; EXYNOS-M1: .eabi_attribute 9, 2
+; EXYNOS-M1: .fpu crypto-neon-fp-armv8
+; EXYNOS-M1: .eabi_attribute 12, 3
+; EXYNOS-M1-NOT: .eabi_attribute 19
+;; We default to IEEE 754 compliance
+; EXYNOS-M1: .eabi_attribute 20, 1
+; EXYNOS-M1: .eabi_attribute 21, 1
+; EXYNOS-M1-NOT: .eabi_attribute 22
+; EXYNOS-M1: .eabi_attribute 23, 3
+; EXYNOS-M1: .eabi_attribute 24, 1
+; EXYNOS-M1: .eabi_attribute 25, 1
+; EXYNOS-M1-NOT: .eabi_attribute 27
+; EXYNOS-M1-NOT: .eabi_attribute 28
+; EXYNOS-M1: .eabi_attribute 36, 1
+; EXYNOS-M1: .eabi_attribute 38, 1
+; EXYNOS-M1: .eabi_attribute 42, 1
+; EXYNOS-M1-NOT: .eabi_attribute 44
+; EXYNOS-M1: .eabi_attribute 68, 3
+
+; EXYNOS-M1-FAST-NOT: .eabi_attribute 19
+;; The exynos-m1 has the ARMv8 FP unit, which always flushes preserving sign.
+; EXYNOS-M1-FAST: .eabi_attribute 20, 2
+; EXYNOS-M1-FAST-NOT: .eabi_attribute 21
+; EXYNOS-M1-FAST-NOT: .eabi_attribute 22
+; EXYNOS-M1-FAST: .eabi_attribute 23, 1
+
; GENERIC-FPU-VFPV3-FP16: .fpu vfpv3-fp16
; GENERIC-FPU-VFPV3-D16-FP16: .fpu vfpv3-d16-fp16
; GENERIC-FPU-VFPV3XD: .fpu vfpv3xd
diff --git a/test/CodeGen/ARM/debugtrap.ll b/test/CodeGen/ARM/debugtrap.ll
index 9ce73939ce56..3d8cdea6cdae 100644
--- a/test/CodeGen/ARM/debugtrap.ll
+++ b/test/CodeGen/ARM/debugtrap.ll
@@ -1,17 +1,17 @@
-; This test ensures the @llvm.debugtrap() call is not removed when generating
-; the 'pop' instruction to restore the callee saved registers on ARM.
-
-; RUN: llc < %s -mtriple=armv7 -O0 -filetype=asm | FileCheck %s
-
-declare void @llvm.debugtrap() nounwind
-declare void @foo() nounwind
-
-define void @test() nounwind {
-entry:
- ; CHECK: bl foo
- ; CHECK-NEXT: pop
- ; CHECK-NEXT: trap
- call void @foo()
- call void @llvm.debugtrap()
- ret void
-}
+; This test ensures the @llvm.debugtrap() call is not removed when generating
+; the 'pop' instruction to restore the callee saved registers on ARM.
+
+; RUN: llc < %s -mtriple=armv7 -O0 -filetype=asm | FileCheck %s
+
+declare void @llvm.debugtrap() nounwind
+declare void @foo() nounwind
+
+define void @test() nounwind {
+entry:
+ ; CHECK: bl foo
+ ; CHECK-NEXT: pop
+ ; CHECK-NEXT: trap
+ call void @foo()
+ call void @llvm.debugtrap()
+ ret void
+}
diff --git a/test/CodeGen/WebAssembly/offset.ll b/test/CodeGen/WebAssembly/offset.ll
index 75a0bc9ab6c6..901801d7dbbe 100644
--- a/test/CodeGen/WebAssembly/offset.ll
+++ b/test/CodeGen/WebAssembly/offset.ll
@@ -17,6 +17,28 @@ define i32 @load_i32_with_folded_offset(i32* %p) {
ret i32 %t
}
+; With an inbounds gep, we can fold an offset.
+
+; CHECK-LABEL: load_i32_with_folded_gep_offset:
+; CHECK: i32.load $push0=, 24($0){{$}}
+define i32 @load_i32_with_folded_gep_offset(i32* %p) {
+ %s = getelementptr inbounds i32, i32* %p, i32 6
+ %t = load i32, i32* %s
+ ret i32 %t
+}
+
+; We can't fold a negative offset though, even with an inbounds gep.
+
+; CHECK-LABEL: load_i32_with_unfolded_gep_negative_offset:
+; CHECK: i32.const $push0=, -24{{$}}
+; CHECK: i32.add $push1=, $0, $pop0{{$}}
+; CHECK: i32.load $push2=, 0($pop1){{$}}
+define i32 @load_i32_with_unfolded_gep_negative_offset(i32* %p) {
+ %s = getelementptr inbounds i32, i32* %p, i32 -6
+ %t = load i32, i32* %s
+ ret i32 %t
+}
+
; Without nuw, and even with nsw, we can't fold an offset.
; CHECK-LABEL: load_i32_with_unfolded_offset:
@@ -31,6 +53,18 @@ define i32 @load_i32_with_unfolded_offset(i32* %p) {
ret i32 %t
}
+; Without inbounds, we can't fold a gep offset.
+
+; CHECK-LABEL: load_i32_with_unfolded_gep_offset:
+; CHECK: i32.const $push0=, 24{{$}}
+; CHECK: i32.add $push1=, $0, $pop0{{$}}
+; CHECK: i32.load $push2=, 0($pop1){{$}}
+define i32 @load_i32_with_unfolded_gep_offset(i32* %p) {
+ %s = getelementptr i32, i32* %p, i32 6
+ %t = load i32, i32* %s
+ ret i32 %t
+}
+
; Same as above but with i64.
; CHECK-LABEL: load_i64_with_folded_offset:
@@ -45,6 +79,28 @@ define i64 @load_i64_with_folded_offset(i64* %p) {
; Same as above but with i64.
+; CHECK-LABEL: load_i64_with_folded_gep_offset:
+; CHECK: i64.load $push0=, 24($0){{$}}
+define i64 @load_i64_with_folded_gep_offset(i64* %p) {
+ %s = getelementptr inbounds i64, i64* %p, i32 3
+ %t = load i64, i64* %s
+ ret i64 %t
+}
+
+; Same as above but with i64.
+
+; CHECK-LABEL: load_i64_with_unfolded_gep_negative_offset:
+; CHECK: i32.const $push0=, -24{{$}}
+; CHECK: i32.add $push1=, $0, $pop0{{$}}
+; CHECK: i64.load $push2=, 0($pop1){{$}}
+define i64 @load_i64_with_unfolded_gep_negative_offset(i64* %p) {
+ %s = getelementptr inbounds i64, i64* %p, i32 -3
+ %t = load i64, i64* %s
+ ret i64 %t
+}
+
+; Same as above but with i64.
+
; CHECK-LABEL: load_i64_with_unfolded_offset:
; CHECK: i32.const $push0=, 24{{$}}
; CHECK: i32.add $push1=, $0, $pop0{{$}}
@@ -57,6 +113,18 @@ define i64 @load_i64_with_unfolded_offset(i64* %p) {
ret i64 %t
}
+; Same as above but with i64.
+
+; CHECK-LABEL: load_i64_with_unfolded_gep_offset:
+; CHECK: i32.const $push0=, 24{{$}}
+; CHECK: i32.add $push1=, $0, $pop0{{$}}
+; CHECK: i64.load $push2=, 0($pop1){{$}}
+define i64 @load_i64_with_unfolded_gep_offset(i64* %p) {
+ %s = getelementptr i64, i64* %p, i32 3
+ %t = load i64, i64* %s
+ ret i64 %t
+}
+
; Same as above but with store.
; CHECK-LABEL: store_i32_with_folded_offset:
@@ -71,6 +139,28 @@ define void @store_i32_with_folded_offset(i32* %p) {
; Same as above but with store.
+; CHECK-LABEL: store_i32_with_folded_gep_offset:
+; CHECK: i32.store $discard=, 24($0), $pop0{{$}}
+define void @store_i32_with_folded_gep_offset(i32* %p) {
+ %s = getelementptr inbounds i32, i32* %p, i32 6
+ store i32 0, i32* %s
+ ret void
+}
+
+; Same as above but with store.
+
+; CHECK-LABEL: store_i32_with_unfolded_gep_negative_offset:
+; CHECK: i32.const $push0=, -24{{$}}
+; CHECK: i32.add $push1=, $0, $pop0{{$}}
+; CHECK: i32.store $discard=, 0($pop1), $pop2{{$}}
+define void @store_i32_with_unfolded_gep_negative_offset(i32* %p) {
+ %s = getelementptr inbounds i32, i32* %p, i32 -6
+ store i32 0, i32* %s
+ ret void
+}
+
+; Same as above but with store.
+
; CHECK-LABEL: store_i32_with_unfolded_offset:
; CHECK: i32.const $push0=, 24{{$}}
; CHECK: i32.add $push1=, $0, $pop0{{$}}
@@ -83,6 +173,18 @@ define void @store_i32_with_unfolded_offset(i32* %p) {
ret void
}
+; Same as above but with store.
+
+; CHECK-LABEL: store_i32_with_unfolded_gep_offset:
+; CHECK: i32.const $push0=, 24{{$}}
+; CHECK: i32.add $push1=, $0, $pop0{{$}}
+; CHECK: i32.store $discard=, 0($pop1), $pop2{{$}}
+define void @store_i32_with_unfolded_gep_offset(i32* %p) {
+ %s = getelementptr i32, i32* %p, i32 6
+ store i32 0, i32* %s
+ ret void
+}
+
; Same as above but with store with i64.
; CHECK-LABEL: store_i64_with_folded_offset:
@@ -97,6 +199,28 @@ define void @store_i64_with_folded_offset(i64* %p) {
; Same as above but with store with i64.
+; CHECK-LABEL: store_i64_with_folded_gep_offset:
+; CHECK: i64.store $discard=, 24($0), $pop0{{$}}
+define void @store_i64_with_folded_gep_offset(i64* %p) {
+ %s = getelementptr inbounds i64, i64* %p, i32 3
+ store i64 0, i64* %s
+ ret void
+}
+
+; Same as above but with store with i64.
+
+; CHECK-LABEL: store_i64_with_unfolded_gep_negative_offset:
+; CHECK: i32.const $push0=, -24{{$}}
+; CHECK: i32.add $push1=, $0, $pop0{{$}}
+; CHECK: i64.store $discard=, 0($pop1), $pop2{{$}}
+define void @store_i64_with_unfolded_gep_negative_offset(i64* %p) {
+ %s = getelementptr inbounds i64, i64* %p, i32 -3
+ store i64 0, i64* %s
+ ret void
+}
+
+; Same as above but with store with i64.
+
; CHECK-LABEL: store_i64_with_unfolded_offset:
; CHECK: i32.const $push0=, 24{{$}}
; CHECK: i32.add $push1=, $0, $pop0{{$}}
@@ -109,6 +233,18 @@ define void @store_i64_with_unfolded_offset(i64* %p) {
ret void
}
+; Same as above but with store with i64.
+
+; CHECK-LABEL: store_i64_with_unfolded_gep_offset:
+; CHECK: i32.const $push0=, 24{{$}}
+; CHECK: i32.add $push1=, $0, $pop0{{$}}
+; CHECK: i64.store $discard=, 0($pop1), $pop2{{$}}
+define void @store_i64_with_unfolded_gep_offset(i64* %p) {
+ %s = getelementptr i64, i64* %p, i32 3
+ store i64 0, i64* %s
+ ret void
+}
+
; When loading from a fixed address, materialize a zero.
; CHECK-LABEL: load_i32_from_numeric_address
@@ -159,6 +295,17 @@ define i32 @load_i8_s_with_folded_offset(i8* %p) {
ret i32 %u
}
+; Fold a gep offset into a sign-extending load.
+
+; CHECK-LABEL: load_i8_s_with_folded_gep_offset:
+; CHECK: i32.load8_s $push0=, 24($0){{$}}
+define i32 @load_i8_s_with_folded_gep_offset(i8* %p) {
+ %s = getelementptr inbounds i8, i8* %p, i32 24
+ %t = load i8, i8* %s
+ %u = sext i8 %t to i32
+ ret i32 %u
+}
+
; Fold an offset into a zero-extending load.
; CHECK-LABEL: load_i8_u_with_folded_offset:
@@ -172,6 +319,17 @@ define i32 @load_i8_u_with_folded_offset(i8* %p) {
ret i32 %u
}
+; Fold a gep offset into a zero-extending load.
+
+; CHECK-LABEL: load_i8_u_with_folded_gep_offset:
+; CHECK: i32.load8_u $push0=, 24($0){{$}}
+define i32 @load_i8_u_with_folded_gep_offset(i8* %p) {
+ %s = getelementptr inbounds i8, i8* %p, i32 24
+ %t = load i8, i8* %s
+ %u = zext i8 %t to i32
+ ret i32 %u
+}
+
; Fold an offset into a truncating store.
; CHECK-LABEL: store_i8_with_folded_offset:
@@ -183,3 +341,43 @@ define void @store_i8_with_folded_offset(i8* %p) {
store i8 0, i8* %s
ret void
}
+
+; Fold a gep offset into a truncating store.
+
+; CHECK-LABEL: store_i8_with_folded_gep_offset:
+; CHECK: i32.store8 $discard=, 24($0), $pop0{{$}}
+define void @store_i8_with_folded_gep_offset(i8* %p) {
+ %s = getelementptr inbounds i8, i8* %p, i32 24
+ store i8 0, i8* %s
+ ret void
+}
+
+; Fold the offsets when lowering aggregate loads and stores.
+
+; CHECK-LABEL: aggregate_load_store:
+; CHECK: i32.load $2=, 0($0){{$}}
+; CHECK: i32.load $3=, 4($0){{$}}
+; CHECK: i32.load $4=, 8($0){{$}}
+; CHECK: i32.load $push0=, 12($0){{$}}
+; CHECK: i32.store $discard=, 12($1), $pop0{{$}}
+; CHECK: i32.store $discard=, 8($1), $4{{$}}
+; CHECK: i32.store $discard=, 4($1), $3{{$}}
+; CHECK: i32.store $discard=, 0($1), $2{{$}}
+define void @aggregate_load_store({i32,i32,i32,i32}* %p, {i32,i32,i32,i32}* %q) {
+ ; volatile so that things stay in order for the tests above
+ %t = load volatile {i32,i32,i32,i32}, {i32, i32,i32,i32}* %p
+ store volatile {i32,i32,i32,i32} %t, {i32, i32,i32,i32}* %q
+ ret void
+}
+
+; Fold the offsets when lowering aggregate return values.
+
+; CHECK-LABEL: aggregate_return:
+; CHECK: i32.const $push0=, 0{{$}}
+; CHECK: i32.store $push1=, 12($0), $pop0{{$}}
+; CHECK: i32.store $push2=, 8($0), $pop1{{$}}
+; CHECK: i32.store $push3=, 4($0), $pop2{{$}}
+; CHECK: i32.store $discard=, 0($0), $pop3{{$}}
+define {i32,i32,i32,i32} @aggregate_return() {
+ ret {i32,i32,i32,i32} zeroinitializer
+}
diff --git a/test/CodeGen/WinEH/wineh-cloning.ll b/test/CodeGen/WinEH/wineh-cloning.ll
index c13e0a163641..3c1793a3bd7f 100644
--- a/test/CodeGen/WinEH/wineh-cloning.ll
+++ b/test/CodeGen/WinEH/wineh-cloning.ll
@@ -2,6 +2,7 @@
declare i32 @__CxxFrameHandler3(...)
declare i32 @__C_specific_handler(...)
+declare void @ProcessCLRException(...)
declare void @f()
@@ -369,6 +370,50 @@ unreachable:
unreachable
}
+define void @test14() personality void (...)* @ProcessCLRException {
+entry:
+ invoke void @f()
+ to label %cont unwind label %cleanup
+cont:
+ invoke void @f()
+ to label %exit unwind label %switch.outer
+cleanup:
+ %cleanpad = cleanuppad within none []
+ invoke void @f() [ "funclet" (token %cleanpad) ]
+ to label %cleanret unwind label %switch.inner
+switch.inner:
+ %cs.inner = catchswitch within %cleanpad [label %pad.inner] unwind to caller
+pad.inner:
+ %cp.inner = catchpad within %cs.inner [i32 1]
+ catchret from %cp.inner to label %join
+cleanret:
+ cleanupret from %cleanpad unwind to caller
+switch.outer:
+ %cs.outer = catchswitch within none [label %pad.outer] unwind to caller
+pad.outer:
+ %cp.outer = catchpad within %cs.outer [i32 2]
+ catchret from %cp.outer to label %join
+join:
+ %phi = phi i32 [ 1, %pad.inner ], [ 2, %pad.outer ]
+ call void @llvm.foo(i32 %phi)
+ unreachable
+exit:
+ ret void
+}
+; Both catchrets target %join, but the catchret from %cp.inner
+; returns to %cleanpad and the catchret from %cp.outer returns to the
+; main function, so %join needs to get cloned and one of the cleanuprets
+; needs to be updated to target the clone
+; CHECK-LABEL: define void @test14()
+; CHECK: catchret from %cp.inner to label %[[Clone1:.+]]
+; CHECK: catchret from %cp.outer to label %[[Clone2:.+]]
+; CHECK: [[Clone1]]:
+; CHECK-NEXT: call void @llvm.foo(i32 1)
+; CHECK-NEXT: unreachable
+; CHECK: [[Clone2]]:
+; CHECK-NEXT: call void @llvm.foo(i32 2)
+; CHECK-NEXT: unreachable
+
;; Debug info (from test12)
; Make sure the DISubprogram doesn't get cloned
diff --git a/test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll b/test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll
index a74aa2dd4623..dfb98bb1ab39 100644
--- a/test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll
+++ b/test/CodeGen/X86/2009-06-03-Win64SpillXMM.ll
@@ -1,9 +1,7 @@
; RUN: llc -mcpu=generic -mtriple=x86_64-mingw32 < %s | FileCheck %s
-; CHECK: pushq %rbp
-; CHECK: subq $32, %rsp
-; CHECK: leaq 32(%rsp), %rbp
-; CHECK: movaps %xmm8, -16(%rbp)
-; CHECK: movaps %xmm7, -32(%rbp)
+; CHECK: subq $40, %rsp
+; CHECK: movaps %xmm8, 16(%rsp)
+; CHECK: movaps %xmm7, (%rsp)
define i32 @a() nounwind {
entry:
diff --git a/test/CodeGen/X86/2011-11-30-or.ll b/test/CodeGen/X86/2011-11-30-or.ll
index 4260e817b415..8378a022eab7 100644
--- a/test/CodeGen/X86/2011-11-30-or.ll
+++ b/test/CodeGen/X86/2011-11-30-or.ll
@@ -2,13 +2,13 @@
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
target triple = "x86_64-apple-macosx10.6.6"
-
-; Test that the order of operands is correct
-; CHECK: select_func
-; CHECK: pblendvb {{LCPI0_[0-9]*}}(%rip), %xmm1
-; CHECK: ret
-
-define void @select_func(<8 x i16> %in) {
+
+; Test that the order of operands is correct
+; CHECK: select_func
+; CHECK: pblendvb {{LCPI0_[0-9]*}}(%rip), %xmm1
+; CHECK: ret
+
+define void @select_func(<8 x i16> %in) {
entry:
%c.lobit.i.i.i = ashr <8 x i16> %in, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%and.i56.i.i.i = and <8 x i16> %c.lobit.i.i.i, <i16 25, i16 8, i16 65, i16 25, i16 8, i16 95, i16 15, i16 45>
diff --git a/test/CodeGen/X86/avx-cast.ll b/test/CodeGen/X86/avx-cast.ll
index b4798f159455..34c5dfaa0162 100644
--- a/test/CodeGen/X86/avx-cast.ll
+++ b/test/CodeGen/X86/avx-cast.ll
@@ -1,38 +1,27 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; Prefer a blend instruction to a vinsert128 instruction because blends
; are simpler (no lane changes) and therefore will have equal or better
; performance.
define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp {
-; AVX1-LABEL: castA:
-; AVX1: vxorps %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: castA:
-; AVX2: vxorps %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: retq
-
-entry:
+; AVX-LABEL: castA:
+; AVX: ## BB#0:
+; AVX-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT: retq
%shuffle.i = shufflevector <4 x float> %m, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
ret <8 x float> %shuffle.i
}
define <4 x double> @castB(<2 x double> %m) nounwind uwtable readnone ssp {
-; AVX1-LABEL: castB:
-; AVX1: vxorpd %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: castB:
-; AVX2: vxorpd %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
-; AVX2-NEXT: retq
-
-entry:
+; AVX-LABEL: castB:
+; AVX: ## BB#0:
+; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX-NEXT: retq
%shuffle.i = shufflevector <2 x double> %m, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
ret <4 x double> %shuffle.i
}
@@ -41,16 +30,16 @@ entry:
define <4 x i64> @castC(<2 x i64> %m) nounwind uwtable readnone ssp {
; AVX1-LABEL: castC:
-; AVX1: vxorps %xmm1, %xmm1, %xmm1
+; AVX1: ## BB#0:
+; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: castC:
-; AVX2: vpxor %ymm1, %ymm1, %ymm1
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: retq
-
-entry:
%shuffle.i = shufflevector <2 x i64> %m, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
ret <4 x i64> %shuffle.i
}
@@ -59,43 +48,28 @@ entry:
; vzeroupper before the return, so just check for the absence of shuffles.
define <4 x float> @castD(<8 x float> %m) nounwind uwtable readnone ssp {
-; AVX1-LABEL: castD:
-; AVX1-NOT: extract
-; AVX1-NOT: blend
-;
-; AVX2-LABEL: castD:
-; AVX2-NOT: extract
-; AVX2-NOT: blend
-
-entry:
+; AVX-LABEL: castD:
+; AVX: ## BB#0:
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%shuffle.i = shufflevector <8 x float> %m, <8 x float> %m, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x float> %shuffle.i
}
define <2 x i64> @castE(<4 x i64> %m) nounwind uwtable readnone ssp {
-; AVX1-LABEL: castE:
-; AVX1-NOT: extract
-; AVX1-NOT: blend
-;
-; AVX2-LABEL: castE:
-; AVX2-NOT: extract
-; AVX2-NOT: blend
-
-entry:
+; AVX-LABEL: castE:
+; AVX: ## BB#0:
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%shuffle.i = shufflevector <4 x i64> %m, <4 x i64> %m, <2 x i32> <i32 0, i32 1>
ret <2 x i64> %shuffle.i
}
define <2 x double> @castF(<4 x double> %m) nounwind uwtable readnone ssp {
-; AVX1-LABEL: castF:
-; AVX1-NOT: extract
-; AVX1-NOT: blend
-;
-; AVX2-LABEL: castF:
-; AVX2-NOT: extract
-; AVX2-NOT: blend
-
-entry:
+; AVX-LABEL: castF:
+; AVX: ## BB#0:
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%shuffle.i = shufflevector <4 x double> %m, <4 x double> %m, <2 x i32> <i32 0, i32 1>
ret <2 x double> %shuffle.i
}
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 764e13638485..5a17cdb29216 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -6419,3 +6419,126 @@ define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x
ret <8 x i64> %res5
}
+declare <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64>, i8, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlq $255, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpsrlq $255, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpsrlq $255, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i8 255, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i8 255, <8 x i64> %x2, i8 -1)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i8 255, <8 x i64> zeroinitializer, i8 %x3)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res3, %res2
+ ret <8 x i64> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i8, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpsrld $255, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpsrld $255, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpsrld $255, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> %x2, i16 -1)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> zeroinitializer, i16 %x3)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res3, %res2
+ ret <16 x i32> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32>, i8, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_psra_di_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpsrad $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpsrad $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpsrad $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i8 3, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 -1)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res3, %res2
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64>, i8, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_psra_qi_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsraq $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpsraq $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpsraq $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i8 3, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 -1)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res3, %res2
+ ret <8 x i64> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32>, i8, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_psll_di_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpslld $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpslld $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpslld $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i8 3, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 -1)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res3, %res2
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64>, i8, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsllq $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpsllq $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpsllq $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i8 3, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 -1)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res3, %res2
+ ret <8 x i64> %res4
+}
+
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 5f3d16d4efbb..064652aa470d 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -2906,3 +2906,63 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i8 %x1, <
%res4 = add <32 x i16> %res3, %res2
ret <32 x i16> %res4
}
+
+declare <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrlv32hi:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpsrlvw %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16>, <8 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_psra_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_w_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsraw %xmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpsraw %xmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT: vpsraw %xmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16>, i8, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_psra_wi_512(<32 x i16> %x0, i8 %x1, <32 x i16> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_wi_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpsraw $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpsraw $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpsraw $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i8 3, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index 1db6756c23a8..6b2cb432f1cd 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -4591,3 +4591,126 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i8 %x1, <
%res4 = add <16 x i16> %res3, %res2
ret <16 x i16> %res4
}
+
+declare <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_psrlv16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrlv16_hi:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_psrlv8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrlv8_hi:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_psra_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16>, i8, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_psra_wi_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_wi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsraw $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpsraw $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpsraw $3, %xmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i8 3, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16>, <8 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_psra_w_256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16>, i8, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_psra_wi_256(<16 x i16> %x0, i8 %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_wi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpsraw $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpsraw $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpsraw $3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i8 3, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
diff --git a/test/CodeGen/X86/avx512cd-intrinsics.ll b/test/CodeGen/X86/avx512cd-intrinsics.ll
index 29f17bbc0190..febd3d69dd18 100644
--- a/test/CodeGen/X86/avx512cd-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cd-intrinsics.ll
@@ -1,18 +1,18 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s
-
-define <16 x i32> @test_x86_vbroadcastmw_512(i16 %a0) {
- ; CHECK: test_x86_vbroadcastmw_512
- ; CHECK: vpbroadcastmw2d %k0, %zmm0
- %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0) ;
- ret <16 x i32> %res
-}
-declare <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16)
-
-define <8 x i64> @test_x86_broadcastmb_512(i8 %a0) {
- ; CHECK: test_x86_broadcastmb_512
- ; CHECK: vpbroadcastmb2q %k0, %zmm0
- %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0) ;
- ret <8 x i64> %res
-}
-declare <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8)
-
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s
+
+define <16 x i32> @test_x86_vbroadcastmw_512(i16 %a0) {
+ ; CHECK: test_x86_vbroadcastmw_512
+ ; CHECK: vpbroadcastmw2d %k0, %zmm0
+ %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0) ;
+ ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16)
+
+define <8 x i64> @test_x86_broadcastmb_512(i8 %a0) {
+ ; CHECK: test_x86_broadcastmb_512
+ ; CHECK: vpbroadcastmb2q %k0, %zmm0
+ %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0) ;
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8)
+
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index d9e8728c5ca6..8ab34bd8c436 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -5801,3 +5801,589 @@ define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256(<4 x i32> %x0, <8 x
%res5 = add <8 x i32> %res3, %res4
ret <8 x i32> %res5
}
+
+declare <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_psrl_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64>, <2 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64>, i8, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i8 255, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i8 255, <2 x i64> %x2, i8 -1)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i8 255, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res2, %res3
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64>, i8, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i8 255, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i8 255, <4 x i64> %x2, i8 -1)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i8 255, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res2, %res3
+ ret <4 x i64> %res4
+}
+declare <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+define <4 x i32>@test_int_x86_avx512_mask_psrl_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32>, <4 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res2, %res3
+ ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32>, i8, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrld $255, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpsrld $255, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpsrld $255, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i8 255, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i8 255, <4 x i32> %x2, i8 -1)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i8 255, <4 x i32> zeroinitializer, i8 %x3)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res2, %res3
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32>, i8, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrld $255, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpsrld $255, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpsrld $255, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i8 255, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i8 255, <8 x i32> %x2, i8 -1)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i8 255, <8 x i32> zeroinitializer, i8 %x3)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res2, %res3
+ ret <8 x i32> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i8, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpsrld $255, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpsrld $255, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpsrld $255, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> %x2, i16 -1)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> zeroinitializer, i16 %x3)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res2, %res3
+ ret <16 x i32> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_psrlv2_di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrlv2_di:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psrlv4_di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrlv4_di:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_psrlv4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrlv4_si:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_psrlv8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrlv8_si:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_psra_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32>, <4 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_psra_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32>, i8, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_psra_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrad $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpsrad $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpsrad $3, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32>, i8, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_psra_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsrad $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpsrad $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpsrad $3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_psra_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64>, <2 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psra_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64>, i8, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_psra_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsraq $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpsraq $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpsraq $3, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64>, i8, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psra_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsraq $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpsraq $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpsraq $3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
+
+declare <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_psll_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32>, <4 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_psll_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32>, i8, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_psll_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpslld $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpslld $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpslld $3, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32>, i8, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_psll_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpslld $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpslld $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpslld $3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64>, <2 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psll_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64>, i8, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_psll_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsllq $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vpsllq $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpsllq $3, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res3, %res2
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64>, i8, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_psll_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpsllq $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vpsllq $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpsllq $3, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
diff --git a/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/test/CodeGen/X86/cmpxchg-clobber-flags.ll
index e21ba2a14cf5..1665360e4990 100644
--- a/test/CodeGen/X86/cmpxchg-clobber-flags.ll
+++ b/test/CodeGen/X86/cmpxchg-clobber-flags.ll
@@ -1,11 +1,14 @@
-; RUN: llc -verify-machineinstrs -mtriple=i386-linux-gnu %s -o - | FileCheck %s -check-prefix=i386
-; RUN: llc -verify-machineinstrs -mtriple=i386-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=i386f
+; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s -check-prefix=i386
+; RUN: llc -mtriple=i386-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=i386f
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s -check-prefix=x8664
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=x8664
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s -check-prefix=x8664-sahf
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu -mattr=+sahf -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=x8664-sahf
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu -mcpu=corei7 %s -o - | FileCheck %s -check-prefix=x8664-sahf
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s -check-prefix=x8664
+; RUN: llc -mtriple=x86_64-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=x8664
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s -check-prefix=x8664-sahf
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=x8664-sahf
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=corei7 %s -o - | FileCheck %s -check-prefix=x8664-sahf
+
+; TODO: Reenable verify-machineinstr once the if (!AXDead) // FIXME
+; in X86InstrInfo::copyPhysReg() is resolved.
declare i32 @foo()
declare i32 @bar(i64)
@@ -58,9 +61,11 @@ define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) {
; x8664-sahf-NEXT: popq %rax
; x8664-sahf-NEXT: movq %rax, %rdi
; x8664-sahf-NEXT: callq bar
+; x8664-sahf-NEXT: pushq %rax
; x8664-sahf-NEXT: movq [[FLAGS]], %rax
; x8664-sahf-NEXT: addb $127, %al
; x8664-sahf-NEXT: sahf
+; x8664-sahf-NEXT: popq %rax
; x8664-sahf-NEXT: jne
%cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst
@@ -161,9 +166,11 @@ define i32 @test_feed_cmov(i32* %addr, i32 %desired, i32 %new) {
; x8664-sahf-LABEL: test_feed_cmov:
; x8664-sahf: cmpxchgl
-; x8664-sahf: seto %al
+; x8664-sahf: pushq %rax
+; x8664-sahf-NEXT: seto %al
; x8664-sahf-NEXT: lahf
; x8664-sahf-NEXT: movq %rax, [[FLAGS:%.*]]
+; x8664-sahf-NEXT: popq %rax
; x8664-sahf-NEXT: callq foo
; x8664-sahf-NEXT: pushq %rax
; x8664-sahf-NEXT: movq [[FLAGS]], %rax
diff --git a/test/CodeGen/X86/copy-eflags.ll b/test/CodeGen/X86/copy-eflags.ll
new file mode 100644
index 000000000000..796c1ecd8c71
--- /dev/null
+++ b/test/CodeGen/X86/copy-eflags.ll
@@ -0,0 +1,54 @@
+; RUN: llc -o - %s | FileCheck %s
+; This tests for the problem originally reported in http://llvm.org/PR25951
+target triple = "i686-unknown-linux-gnu"
+
+@b = common global i8 0, align 1
+@c = common global i32 0, align 4
+@a = common global i8 0, align 1
+@d = common global i8 0, align 1
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+
+; CHECK-LABEL: func:
+; This tests whether eax is properly saved/restored around the lahf/sahf
+; instruction sequences.
+define i32 @func() {
+entry:
+ %bval = load i8, i8* @b
+ %inc = add i8 %bval, 1
+ store i8 %inc, i8* @b
+ %cval = load i32, i32* @c
+ %inc1 = add nsw i32 %cval, 1
+ store i32 %inc1, i32* @c
+ %aval = load i8, i8* @a
+ %inc2 = add i8 %aval, 1
+ store i8 %inc2, i8* @a
+; Copy flags produced by the incb of %inc1 to a register, need to save+restore
+; eax around it. The flags will be reused by %tobool.
+; CHECK: pushl %eax
+; CHECK: seto %al
+; CHECK: lahf
+; CHECK: movl %eax, [[REG:%[a-z]+]]
+; CHECK: popl %eax
+ %cmp = icmp eq i8 %aval, %bval
+ %conv5 = zext i1 %cmp to i8
+ store i8 %conv5, i8* @d
+ %tobool = icmp eq i32 %inc1, 0
+; We restore flags with an 'addb, sahf' sequence, need to save+restore eax
+; around it.
+; CHECK: pushl %eax
+; CHECK: movl [[REG]], %eax
+; CHECK: addb $127, %al
+; CHECK: sahf
+; CHECK: popl %eax
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+ %conv6 = sext i8 %inc to i32
+ %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %conv6)
+ br label %if.end
+
+if.end:
+ ret i32 0
+}
+
+declare i32 @printf(i8* nocapture readonly, ...)
diff --git a/test/CodeGen/X86/divrem8_ext.ll b/test/CodeGen/X86/divrem8_ext.ll
index ec367c86526d..b38797e2d9dd 100644
--- a/test/CodeGen/X86/divrem8_ext.ll
+++ b/test/CodeGen/X86/divrem8_ext.ll
@@ -97,4 +97,23 @@ define i64 @test_srem_sext64_ah(i8 %x, i8 %y) {
ret i64 %2
}
+define i64 @pr25754(i8 %a, i8 %c) {
+; CHECK-LABEL: pr25754
+; CHECK: movzbl {{.+}}, %eax
+; CHECK: divb
+; CHECK: movzbl %ah, %ecx
+; CHECK: movzbl %al, %eax
+; CHECK-32: addl %ecx, %eax
+; CHECK-32: sbbl %edx, %edx
+; CHECK-32: andl $1, %edx
+; CHECK-64: addq %rcx, %rax
+; CHECK: ret
+ %r1 = urem i8 %a, %c
+ %d1 = udiv i8 %a, %c
+ %r2 = zext i8 %r1 to i64
+ %d2 = zext i8 %d1 to i64
+ %ret = add i64 %r2, %d2
+ ret i64 %ret
+}
+
@z = external global i8
diff --git a/test/CodeGen/X86/fold-load-unops.ll b/test/CodeGen/X86/fold-load-unops.ll
index bedda3f297da..d2b03dde8319 100644
--- a/test/CodeGen/X86/fold-load-unops.ll
+++ b/test/CodeGen/X86/fold-load-unops.ll
@@ -2,17 +2,19 @@
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
-; Verify that we're folding the load into the math instruction.
+; Verify we fold loads into unary sse intrinsics only when optimizing for size
define float @rcpss(float* %a) {
; SSE-LABEL: rcpss:
; SSE: # BB#0:
-; SSE-NEXT: rcpss (%rdi), %xmm0
+; SSE-NEXT: movss (%rdi), %xmm0
+; SSE-NEXT: rcpss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: rcpss:
; AVX: # BB#0:
-; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovss (%rdi), %xmm0
+; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load float, float* %a
%ins = insertelement <4 x float> undef, float %ld, i32 0
@@ -24,12 +26,14 @@ define float @rcpss(float* %a) {
define float @rsqrtss(float* %a) {
; SSE-LABEL: rsqrtss:
; SSE: # BB#0:
-; SSE-NEXT: rsqrtss (%rdi), %xmm0
+; SSE-NEXT: movss (%rdi), %xmm0
+; SSE-NEXT: rsqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: rsqrtss:
; AVX: # BB#0:
-; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovss (%rdi), %xmm0
+; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load float, float* %a
%ins = insertelement <4 x float> undef, float %ld, i32 0
@@ -41,12 +45,14 @@ define float @rsqrtss(float* %a) {
define float @sqrtss(float* %a) {
; SSE-LABEL: sqrtss:
; SSE: # BB#0:
-; SSE-NEXT: sqrtss (%rdi), %xmm0
+; SSE-NEXT: movss (%rdi), %xmm0
+; SSE-NEXT: sqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sqrtss:
; AVX: # BB#0:
-; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovss (%rdi), %xmm0
+; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load float, float* %a
%ins = insertelement <4 x float> undef, float %ld, i32 0
@@ -58,12 +64,14 @@ define float @sqrtss(float* %a) {
define double @sqrtsd(double* %a) {
; SSE-LABEL: sqrtsd:
; SSE: # BB#0:
-; SSE-NEXT: sqrtsd (%rdi), %xmm0
+; SSE-NEXT: movsd (%rdi), %xmm0
+; SSE-NEXT: sqrtsd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sqrtsd:
; AVX: # BB#0:
-; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovsd (%rdi), %xmm0
+; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load double, double* %a
%ins = insertelement <2 x double> undef, double %ld, i32 0
@@ -72,9 +80,75 @@ define double @sqrtsd(double* %a) {
ret double %ext
}
+define float @rcpss_size(float* %a) optsize {
+; SSE-LABEL: rcpss_size:
+; SSE: # BB#0:
+; SSE-NEXT: rcpss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: rcpss_size:
+; AVX: # BB#0:
+; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load float, float* %a
+ %ins = insertelement <4 x float> undef, float %ld, i32 0
+ %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ins)
+ %ext = extractelement <4 x float> %res, i32 0
+ ret float %ext
+}
+
+define float @rsqrtss_size(float* %a) optsize {
+; SSE-LABEL: rsqrtss_size:
+; SSE: # BB#0:
+; SSE-NEXT: rsqrtss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: rsqrtss_size:
+; AVX: # BB#0:
+; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load float, float* %a
+ %ins = insertelement <4 x float> undef, float %ld, i32 0
+ %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ins)
+ %ext = extractelement <4 x float> %res, i32 0
+ ret float %ext
+}
+
+define float @sqrtss_size(float* %a) optsize{
+; SSE-LABEL: sqrtss_size:
+; SSE: # BB#0:
+; SSE-NEXT: sqrtss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sqrtss_size:
+; AVX: # BB#0:
+; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load float, float* %a
+ %ins = insertelement <4 x float> undef, float %ld, i32 0
+ %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ins)
+ %ext = extractelement <4 x float> %res, i32 0
+ ret float %ext
+}
+
+define double @sqrtsd_size(double* %a) optsize {
+; SSE-LABEL: sqrtsd_size:
+; SSE: # BB#0:
+; SSE-NEXT: sqrtsd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sqrtsd_size:
+; AVX: # BB#0:
+; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load double, double* %a
+ %ins = insertelement <2 x double> undef, double %ld, i32 0
+ %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ins)
+ %ext = extractelement <2 x double> %res, i32 0
+ ret double %ext
+}
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
-
diff --git a/test/CodeGen/X86/fpcmp-soft-fp.ll b/test/CodeGen/X86/fpcmp-soft-fp.ll
index 58d57017d18a..dac468e5cbf0 100644
--- a/test/CodeGen/X86/fpcmp-soft-fp.ll
+++ b/test/CodeGen/X86/fpcmp-soft-fp.ll
@@ -1,127 +1,127 @@
-; RUN: llc < %s -march=x86 -mcpu=pentium -mtriple=x86-linux-gnu -float-abi=soft | FileCheck %s
-
-define i1 @test1(double %d) #0 {
-entry:
- %cmp = fcmp ule double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test1:
-; CHECK: calll __gtdf2
-; CHECK: setle
-; CHECK: retl
-
-define i1 @test2(double %d) #0 {
-entry:
- %cmp = fcmp ult double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test2:
-; CHECK: calll __gedf2
-; CHECK: sets
-; CHECK: retl
-
-define i1 @test3(double %d) #0 {
-entry:
- %cmp = fcmp ugt double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test3:
-; CHECK: calll __ledf2
-; CHECK: setg
-; CHECK: retl
-
-define i1 @test4(double %d) #0 {
-entry:
- %cmp = fcmp uge double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test4:
-; CHECK: calll __ltdf2
-; CHECK: setns
-; CHECK: retl
-
-define i1 @test5(double %d) #0 {
-entry:
- %cmp = fcmp ole double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test5:
-; CHECK: calll __ledf2
-; CHECK: setle
-; CHECK: retl
-
-define i1 @test6(double %d) #0 {
-entry:
- %cmp = fcmp olt double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test6:
-; CHECK: calll __ltdf2
-; CHECK: sets
-; CHECK: retl
-
-define i1 @test7(double %d) #0 {
-entry:
- %cmp = fcmp ogt double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test7:
-; CHECK: calll __gtdf2
-; CHECK: setg
-; CHECK: retl
-
-define i1 @test8(double %d) #0 {
-entry:
- %cmp = fcmp oge double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test8:
-; CHECK: calll __gedf2
-; CHECK: setns
-; CHECK: retl
-
-define i1 @test9(double %d) #0 {
-entry:
- %cmp = fcmp oeq double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test9:
-; CHECK: calll __eqdf2
-; CHECK: sete
-; CHECK: retl
-
-define i1 @test10(double %d) #0 {
-entry:
- %cmp = fcmp ueq double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test10:
-; CHECK: calll __eqdf2
-; CHECK: sete
-; CHECK: calll __unorddf2
-; CHECK: setne
-; CHECK: retl
-
-define i1 @test11(double %d) #0 {
-entry:
- %cmp = fcmp one double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test11:
-; CHECK: calll __gtdf2
-; CHECK: setg
-; CHECK: calll __ltdf2
-; CHECK: sets
-; CHECK: retl
-
-define i1 @test12(double %d) #0 {
-entry:
- %cmp = fcmp une double %d, 0.000000e+00
- ret i1 %cmp
-}
-; CHECK-LABEL: test12:
-; CHECK: calll __nedf2
-; CHECK: setne
-; CHECK: retl
-
-attributes #0 = { "use-soft-float"="true" }
+; RUN: llc < %s -march=x86 -mcpu=pentium -mtriple=x86-linux-gnu -float-abi=soft | FileCheck %s
+
+define i1 @test1(double %d) #0 {
+entry:
+ %cmp = fcmp ule double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test1:
+; CHECK: calll __gtdf2
+; CHECK: setle
+; CHECK: retl
+
+define i1 @test2(double %d) #0 {
+entry:
+ %cmp = fcmp ult double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test2:
+; CHECK: calll __gedf2
+; CHECK: sets
+; CHECK: retl
+
+define i1 @test3(double %d) #0 {
+entry:
+ %cmp = fcmp ugt double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test3:
+; CHECK: calll __ledf2
+; CHECK: setg
+; CHECK: retl
+
+define i1 @test4(double %d) #0 {
+entry:
+ %cmp = fcmp uge double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test4:
+; CHECK: calll __ltdf2
+; CHECK: setns
+; CHECK: retl
+
+define i1 @test5(double %d) #0 {
+entry:
+ %cmp = fcmp ole double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test5:
+; CHECK: calll __ledf2
+; CHECK: setle
+; CHECK: retl
+
+define i1 @test6(double %d) #0 {
+entry:
+ %cmp = fcmp olt double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test6:
+; CHECK: calll __ltdf2
+; CHECK: sets
+; CHECK: retl
+
+define i1 @test7(double %d) #0 {
+entry:
+ %cmp = fcmp ogt double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test7:
+; CHECK: calll __gtdf2
+; CHECK: setg
+; CHECK: retl
+
+define i1 @test8(double %d) #0 {
+entry:
+ %cmp = fcmp oge double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test8:
+; CHECK: calll __gedf2
+; CHECK: setns
+; CHECK: retl
+
+define i1 @test9(double %d) #0 {
+entry:
+ %cmp = fcmp oeq double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test9:
+; CHECK: calll __eqdf2
+; CHECK: sete
+; CHECK: retl
+
+define i1 @test10(double %d) #0 {
+entry:
+ %cmp = fcmp ueq double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test10:
+; CHECK: calll __eqdf2
+; CHECK: sete
+; CHECK: calll __unorddf2
+; CHECK: setne
+; CHECK: retl
+
+define i1 @test11(double %d) #0 {
+entry:
+ %cmp = fcmp one double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test11:
+; CHECK: calll __gtdf2
+; CHECK: setg
+; CHECK: calll __ltdf2
+; CHECK: sets
+; CHECK: retl
+
+define i1 @test12(double %d) #0 {
+entry:
+ %cmp = fcmp une double %d, 0.000000e+00
+ ret i1 %cmp
+}
+; CHECK-LABEL: test12:
+; CHECK: calll __nedf2
+; CHECK: setne
+; CHECK: retl
+
+attributes #0 = { "use-soft-float"="true" }
diff --git a/test/CodeGen/X86/inline-sse.ll b/test/CodeGen/X86/inline-sse.ll
index 78d6b762b5e5..08819b858293 100644
--- a/test/CodeGen/X86/inline-sse.ll
+++ b/test/CodeGen/X86/inline-sse.ll
@@ -21,11 +21,9 @@ define void @nop() nounwind {
;
; X64-LABEL: nop:
; X64: # BB#0:
-; X64-NEXT: subq $24, %rsp
; X64-NEXT: #APP
; X64-NEXT: #NO_APP
-; X64-NEXT: movaps %xmm0, (%rsp)
-; X64-NEXT: addq $24, %rsp
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: retq
%1 = alloca <4 x float>, align 16
%2 = call <4 x float> asm "", "=x,~{dirflag},~{fpsr},~{flags}"()
diff --git a/test/CodeGen/X86/insertelement-zero.ll b/test/CodeGen/X86/insertelement-zero.ll
new file mode 100644
index 000000000000..4e582de22a1f
--- /dev/null
+++ b/test/CodeGen/X86/insertelement-zero.ll
@@ -0,0 +1,539 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+define <2 x double> @insert_v2f64_z1(<2 x double> %a) {
+; SSE-LABEL: insert_v2f64_z1:
+; SSE: # BB#0:
+; SSE-NEXT: xorpd %xmm1, %xmm1
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_v2f64_z1:
+; AVX: # BB#0:
+; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT: retq
+ %1 = insertelement <2 x double> %a, double 0.0, i32 0
+ ret <2 x double> %1
+}
+
+define <4 x double> @insert_v4f64_0zz3(<4 x double> %a) {
+; SSE-LABEL: insert_v4f64_0zz3:
+; SSE: # BB#0:
+; SSE-NEXT: xorpd %xmm2, %xmm2
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_v4f64_0zz3:
+; AVX: # BB#0:
+; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],xmm1[0]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %1 = insertelement <4 x double> %a, double 0.0, i32 1
+ %2 = insertelement <4 x double> %1, double 0.0, i32 2
+ ret <4 x double> %2
+}
+
+define <2 x i64> @insert_v2i64_z1(<2 x i64> %a) {
+; SSE2-LABEL: insert_v2i64_z1:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorpd %xmm1, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v2i64_z1:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorpd %xmm1, %xmm1
+; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v2i64_z1:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorpd %xmm1, %xmm1
+; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v2i64_z1:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: pinsrq $0, %rax, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: insert_v2i64_z1:
+; AVX: # BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vpinsrq $0, %rax, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = insertelement <2 x i64> %a, i64 0, i32 0
+ ret <2 x i64> %1
+}
+
+define <4 x i64> @insert_v4i64_01z3(<4 x i64> %a) {
+; SSE2-LABEL: insert_v4i64_01z3:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorpd %xmm2, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v4i64_01z3:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorpd %xmm2, %xmm2
+; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v4i64_01z3:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorpd %xmm2, %xmm2
+; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v4i64_01z3:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: pinsrq $0, %rax, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_v4i64_01z3:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: vpinsrq $0, %rax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_v4i64_01z3:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: vpinsrq $0, %rax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %1 = insertelement <4 x i64> %a, i64 0, i32 2
+ ret <4 x i64> %1
+}
+
+define <4 x float> @insert_v4f32_01z3(<4 x float> %a) {
+; SSE2-LABEL: insert_v4f32_01z3:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v4f32_01z3:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v4f32_01z3:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v4f32_01z3:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: insert_v4f32_01z3:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX-NEXT: retq
+ %1 = insertelement <4 x float> %a, float 0.0, i32 2
+ ret <4 x float> %1
+}
+
+define <8 x float> @insert_v8f32_z12345z7(<8 x float> %a) {
+; SSE2-LABEL: insert_v8f32_z12345z7:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v8f32_z12345z7:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm2, %xmm2
+; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v8f32_z12345z7:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm2, %xmm2
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v8f32_z12345z7:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: insert_v8f32_z12345z7:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %1 = insertelement <8 x float> %a, float 0.0, i32 0
+ %2 = insertelement <8 x float> %1, float 0.0, i32 6
+ ret <8 x float> %2
+}
+
+define <4 x i32> @insert_v4i32_01z3(<4 x i32> %a) {
+; SSE2-LABEL: insert_v4i32_01z3:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v4i32_01z3:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorl %eax, %eax
+; SSE3-NEXT: movd %eax, %xmm1
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v4i32_01z3:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorl %eax, %eax
+; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v4i32_01z3:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: pinsrd $2, %eax, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: insert_v4i32_01z3:
+; AVX: # BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = insertelement <4 x i32> %a, i32 0, i32 2
+ ret <4 x i32> %1
+}
+
+define <8 x i32> @insert_v8i32_z12345z7(<8 x i32> %a) {
+; SSE2-LABEL: insert_v8i32_z12345z7:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v8i32_z12345z7:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm2, %xmm2
+; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE3-NEXT: xorl %eax, %eax
+; SSE3-NEXT: movd %eax, %xmm2
+; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v8i32_z12345z7:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm2, %xmm2
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSSE3-NEXT: xorl %eax, %eax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v8i32_z12345z7:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: pinsrd $0, %eax, %xmm0
+; SSE41-NEXT: pinsrd $2, %eax, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_v8i32_z12345z7:
+; AVX1: # BB#0:
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: vpinsrd $0, %eax, %xmm0, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_v8i32_z12345z7:
+; AVX2: # BB#0:
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %1 = insertelement <8 x i32> %a, i32 0, i32 0
+ %2 = insertelement <8 x i32> %1, i32 0, i32 6
+ ret <8 x i32> %2
+}
+
+define <8 x i16> @insert_v8i16_z12345z7(<8 x i16> %a) {
+; SSE-LABEL: insert_v8i16_z12345z7:
+; SSE: # BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: pinsrw $0, %eax, %xmm0
+; SSE-NEXT: pinsrw $6, %eax, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_v8i16_z12345z7:
+; AVX: # BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = insertelement <8 x i16> %a, i16 0, i32 0
+ %2 = insertelement <8 x i16> %1, i16 0, i32 6
+ ret <8 x i16> %2
+}
+
+define <16 x i16> @insert_v16i16_z12345z789ABZDEz(<16 x i16> %a) {
+; SSE-LABEL: insert_v16i16_z12345z789ABZDEz:
+; SSE: # BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: pinsrw $0, %eax, %xmm0
+; SSE-NEXT: pinsrw $6, %eax, %xmm0
+; SSE-NEXT: pinsrw $7, %eax, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: insert_v16i16_z12345z789ABZDEz:
+; AVX1: # BB#0:
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_v16i16_z12345z789ABZDEz:
+; AVX2: # BB#0:
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %1 = insertelement <16 x i16> %a, i16 0, i32 0
+ %2 = insertelement <16 x i16> %1, i16 0, i32 6
+ %3 = insertelement <16 x i16> %2, i16 0, i32 15
+ ret <16 x i16> %3
+}
+
+define <16 x i8> @insert_v16i8_z123456789ABZDEz(<16 x i8> %a) {
+; SSE2-LABEL: insert_v16i8_z123456789ABZDEz:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm1
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; SSE2-NEXT: pandn %xmm2, %xmm1
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v16i8_z123456789ABZDEz:
+; SSE3: # BB#0:
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE3-NEXT: pand %xmm1, %xmm0
+; SSE3-NEXT: xorl %eax, %eax
+; SSE3-NEXT: movd %eax, %xmm2
+; SSE3-NEXT: pandn %xmm2, %xmm1
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; SSE3-NEXT: pand %xmm1, %xmm0
+; SSE3-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; SSE3-NEXT: pandn %xmm2, %xmm1
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v16i8_z123456789ABZDEz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSSE3-NEXT: xorl %eax, %eax
+; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: por %xmm2, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v16i8_z123456789ABZDEz:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: pinsrb $0, %eax, %xmm0
+; SSE41-NEXT: pinsrb $15, %eax, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: insert_v16i8_z123456789ABZDEz:
+; AVX: # BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = insertelement <16 x i8> %a, i8 0, i32 0
+ %2 = insertelement <16 x i8> %1, i8 0, i32 15
+ ret <16 x i8> %2
+}
+
+define <32 x i8> @insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz(<32 x i8> %a) {
+; SSE2-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm4, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
+; SSE3: # BB#0:
+; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE3-NEXT: pand %xmm2, %xmm0
+; SSE3-NEXT: xorl %eax, %eax
+; SSE3-NEXT: movd %eax, %xmm3
+; SSE3-NEXT: pandn %xmm3, %xmm2
+; SSE3-NEXT: por %xmm2, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; SSE3-NEXT: pand %xmm2, %xmm0
+; SSE3-NEXT: movdqa %xmm3, %xmm4
+; SSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
+; SSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; SSE3-NEXT: pand %xmm5, %xmm1
+; SSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; SSE3-NEXT: pandn %xmm3, %xmm5
+; SSE3-NEXT: por %xmm5, %xmm1
+; SSE3-NEXT: pand %xmm2, %xmm1
+; SSE3-NEXT: pandn %xmm4, %xmm2
+; SSE3-NEXT: por %xmm2, %xmm0
+; SSE3-NEXT: por %xmm2, %xmm1
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSSE3-NEXT: xorl %eax, %eax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: por %xmm3, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128]
+; SSSE3-NEXT: pshufb %xmm3, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
+; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero,xmm1[15]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0],zero
+; SSSE3-NEXT: por %xmm2, %xmm1
+; SSSE3-NEXT: pshufb %xmm3, %xmm1
+; SSSE3-NEXT: por %xmm4, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: pinsrb $0, %eax, %xmm0
+; SSE41-NEXT: pinsrb $15, %eax, %xmm0
+; SSE41-NEXT: pinsrb $14, %eax, %xmm1
+; SSE41-NEXT: pinsrb $15, %eax, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
+; AVX1: # BB#0:
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
+; AVX2: # BB#0:
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %1 = insertelement <32 x i8> %a, i8 0, i32 0
+ %2 = insertelement <32 x i8> %1, i8 0, i32 15
+ %3 = insertelement <32 x i8> %2, i8 0, i32 30
+ %4 = insertelement <32 x i8> %3, i8 0, i32 31
+ ret <32 x i8> %4
+}
diff --git a/test/CodeGen/X86/insertps-combine.ll b/test/CodeGen/X86/insertps-combine.ll
new file mode 100644
index 000000000000..655f8f49f838
--- /dev/null
+++ b/test/CodeGen/X86/insertps-combine.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+define <4 x float> @shuffle_v4f32_0z27(<4 x float> %x, <4 x float> %a) {
+; SSE-LABEL: shuffle_v4f32_0z27:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0z27:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
+; AVX-NEXT: retq
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+ %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+ %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+ ret <4 x float> %vecinit5
+}
+
+define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %xyzw, <4 x float> %abcd) {
+; SSE-LABEL: shuffle_v4f32_0zz4:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0zz4:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-NEXT: retq
+ %vecext = extractelement <4 x float> %xyzw, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit1, float 0.000000e+00, i32 2
+ %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %abcd, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuffle_v4f32_0z24(<4 x float> %xyzw, <4 x float> %abcd) {
+; SSE-LABEL: shuffle_v4f32_0z24:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0z24:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-NEXT: retq
+ %vecext = extractelement <4 x float> %xyzw, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
+ %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %xyzw, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+ %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %abcd, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x float> %vecinit5
+}
+
+define <4 x float> @shuffle_v4f32_0zz0(float %a) {
+; SSE-LABEL: shuffle_v4f32_0zz0:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,0]
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0zz0:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX-NEXT: retq
+ %vecinit = insertelement <4 x float> undef, float %a, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit1, float 0.000000e+00, i32 2
+ %vecinit3 = insertelement <4 x float> %vecinit2, float %a, i32 3
+ ret <4 x float> %vecinit3
+}
+
+define <4 x float> @shuffle_v4f32_0z6z(<4 x float> %A, <4 x float> %B) {
+; SSE-LABEL: shuffle_v4f32_0z6z:
+; SSE: # BB#0:
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0z6z:
+; AVX: # BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
+; AVX-NEXT: retq
+ %vecext = extractelement <4 x float> %A, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
+ %vecext2 = extractelement <4 x float> %B, i32 2
+ %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
+ ret <4 x float> %vecinit4
+}
diff --git a/test/CodeGen/X86/materialize-one.ll b/test/CodeGen/X86/materialize-one.ll
new file mode 100644
index 000000000000..49da8008b88c
--- /dev/null
+++ b/test/CodeGen/X86/materialize-one.ll
@@ -0,0 +1,100 @@
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64
+
+define i32 @one32() optsize {
+entry:
+ ret i32 1
+
+; CHECK32-LABEL: one32
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: incl %eax
+; CHECK32-NEXT: ret
+
+; FIXME: Figure out the best approach in 64-bit mode.
+; CHECK64-LABEL: one32
+; CHECK64: movl $1, %eax
+; CHECK64-NEXT: retq
+}
+
+define i32 @minus_one32() optsize {
+entry:
+ ret i32 -1
+
+; CHECK32-LABEL: minus_one32
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: decl %eax
+; CHECK32-NEXT: ret
+}
+
+define i16 @one16() optsize {
+entry:
+ ret i16 1
+
+; CHECK32-LABEL: one16
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: incl %eax
+; CHECK32-NEXT: retl
+}
+
+define i16 @minus_one16() optsize {
+entry:
+ ret i16 -1
+
+; CHECK32-LABEL: minus_one16
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: decl %eax
+; CHECK32-NEXT: retl
+}
+
+define i32 @test_rematerialization() optsize {
+entry:
+ ; Materialize -1 (thiscall forces it into %ecx).
+ tail call x86_thiscallcc void @f(i32 -1)
+
+ ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
+ ; spilling it to the stack.
+ tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+
+ ; -1 should be re-materialized here instead of getting spilled above.
+ ret i32 -1
+
+; CHECK32-LABEL: test_rematerialization
+; CHECK32: xorl %ecx, %ecx
+; CHECK32-NEXT: decl %ecx
+; CHECK32: calll
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: decl %eax
+; CHECK32-NOT: %eax
+; CHECK32: retl
+}
+
+define i32 @test_rematerialization2(i32 %x) optsize {
+entry:
+ ; Materialize -1 (thiscall forces it into %ecx).
+ tail call x86_thiscallcc void @f(i32 -1)
+
+ ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
+ ; spilling it to the stack.
+ tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+
+ ; Define eflags.
+ %a = icmp ne i32 %x, 123
+ %b = zext i1 %a to i32
+ ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags.
+ ; It must therefore not use the xor-dec lowering.
+ %c = select i1 %a, i32 %b, i32 -1
+ ret i32 %c
+
+; CHECK32-LABEL: test_rematerialization2
+; CHECK32: xorl %ecx, %ecx
+; CHECK32-NEXT: decl %ecx
+; CHECK32: calll
+; CHECK32: cmpl
+; CHECK32: setne
+; CHECK32-NOT: xorl
+; CHECK32: movl $-1
+; CHECK32: cmov
+; CHECK32: retl
+}
+
+declare x86_thiscallcc void @f(i32)
diff --git a/test/CodeGen/X86/materialize.ll b/test/CodeGen/X86/materialize.ll
deleted file mode 100644
index 695bf0fa5b98..000000000000
--- a/test/CodeGen/X86/materialize.ll
+++ /dev/null
@@ -1,184 +0,0 @@
-; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64
-; RUN: llc -mtriple=x86_64-pc-win32 -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECKWIN64
-
-define i32 @one32_nooptsize() {
-entry:
- ret i32 1
-
-; When not optimizing for size, use mov.
-; CHECK32-LABEL: one32_nooptsize:
-; CHECK32: movl $1, %eax
-; CHECK32-NEXT: retl
-; CHECK64-LABEL: one32_nooptsize:
-; CHECK64: movl $1, %eax
-; CHECK64-NEXT: retq
-}
-
-define i32 @one32() optsize {
-entry:
- ret i32 1
-
-; CHECK32-LABEL: one32:
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: incl %eax
-; CHECK32-NEXT: retl
-
-; FIXME: Figure out the best approach in 64-bit mode.
-; CHECK64-LABEL: one32:
-; CHECK64: movl $1, %eax
-; CHECK64-NEXT: retq
-}
-
-define i32 @one32_minsize() minsize {
-entry:
- ret i32 1
-
-; On 32-bit, xor-inc is preferred over push-pop.
-; CHECK32-LABEL: one32_minsize:
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: incl %eax
-; CHECK32-NEXT: retl
-
-; On 64-bit we don't do xor-inc yet, so push-pop it is. Note that we have to
-; pop into a 64-bit register even when we just need 32 bits.
-; CHECK64-LABEL: one32_minsize:
-; CHECK64: pushq $1
-; CHECK64: .cfi_adjust_cfa_offset 8
-; CHECK64: popq %rax
-; CHECK64: .cfi_adjust_cfa_offset -8
-; CHECK64-NEXT: retq
-}
-
-define i64 @one64_minsize() minsize {
-entry:
- ret i64 1
-; On 64-bit we don't do xor-inc yet, so push-pop it is.
-; CHECK64-LABEL: one64_minsize:
-; CHECK64: pushq $1
-; CHECK64: .cfi_adjust_cfa_offset 8
-; CHECK64: popq %rax
-; CHECK64: .cfi_adjust_cfa_offset -8
-; CHECK64-NEXT: retq
-
-; On Win64 we can't adjust the stack unless there's a frame pointer.
-; CHECKWIN64-LABEL: one64_minsize:
-; CHECKWIN64: movl $1, %eax
-; CHECKWIN64-NEXT: retq
-}
-
-define i32 @minus_one32() optsize {
-entry:
- ret i32 -1
-
-; CHECK32-LABEL: minus_one32:
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: decl %eax
-; CHECK32-NEXT: retl
-}
-
-define i32 @minus_one32_minsize() minsize {
-entry:
- ret i32 -1
-
-; xor-dec is preferred over push-pop.
-; CHECK32-LABEL: minus_one32_minsize:
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: decl %eax
-; CHECK32-NEXT: retl
-}
-
-define i16 @one16() optsize {
-entry:
- ret i16 1
-
-; CHECK32-LABEL: one16:
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: incl %eax
-; CHECK32-NEXT: retl
-}
-
-define i16 @minus_one16() optsize {
-entry:
- ret i16 -1
-
-; CHECK32-LABEL: minus_one16:
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: decl %eax
-; CHECK32-NEXT: retl
-}
-
-define i32 @minus_five32() minsize {
-entry:
- ret i32 -5
-
-; CHECK32-LABEL: minus_five32:
-; CHECK32: pushl $-5
-; CHECK32: popl %eax
-; CHECK32: retl
-}
-
-define i64 @minus_five64() minsize {
-entry:
- ret i64 -5
-
-; CHECK64-LABEL: minus_five64:
-; CHECK64: pushq $-5
-; CHECK64: .cfi_adjust_cfa_offset 8
-; CHECK64: popq %rax
-; CHECK64: .cfi_adjust_cfa_offset -8
-; CHECK64: retq
-}
-
-define i32 @rematerialize_minus_one() optsize {
-entry:
- ; Materialize -1 (thiscall forces it into %ecx).
- tail call x86_thiscallcc void @f(i32 -1)
-
- ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
- ; spilling it to the stack.
- tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
-
- ; -1 should be re-materialized here instead of getting spilled above.
- ret i32 -1
-
-; CHECK32-LABEL: rematerialize_minus_one
-; CHECK32: xorl %ecx, %ecx
-; CHECK32-NEXT: decl %ecx
-; CHECK32: calll
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: decl %eax
-; CHECK32-NOT: %eax
-; CHECK32: retl
-}
-
-define i32 @rematerialize_minus_one_eflags(i32 %x) optsize {
-entry:
- ; Materialize -1 (thiscall forces it into %ecx).
- tail call x86_thiscallcc void @f(i32 -1)
-
- ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
- ; spilling it to the stack.
- tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
-
- ; Define eflags.
- %a = icmp ne i32 %x, 123
- %b = zext i1 %a to i32
- ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags.
- ; It must therefore not use the xor-dec lowering.
- %c = select i1 %a, i32 %b, i32 -1
- ret i32 %c
-
-; CHECK32-LABEL: rematerialize_minus_one_eflags
-; CHECK32: xorl %ecx, %ecx
-; CHECK32-NEXT: decl %ecx
-; CHECK32: calll
-; CHECK32: cmpl
-; CHECK32: setne
-; CHECK32-NOT: xorl
-; CHECK32: movl $-1
-; CHECK32: cmov
-; CHECK32: retl
-}
-
-declare x86_thiscallcc void @f(i32)
diff --git a/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
index bf457814079c..441fb02a89e6 100644
--- a/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
+++ b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
@@ -1,5 +1,8 @@
-; RUN: llc -verify-machineinstrs -mtriple=i386-linux-gnu %s -o - | FileCheck %s
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s
+; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s
+
+; TODO: Reenable verify-machineinstrs once the if (!AXDead) // FIXME in
+; X86InstrInfo::copyPhysReg() is resolved.
; The peephole optimizer can elide some physical register copies such as
; EFLAGS. Make sure the flags are used directly, instead of needlessly using
diff --git a/test/CodeGen/X86/pku.ll b/test/CodeGen/X86/pku.ll
new file mode 100644
index 000000000000..8568cf43abc0
--- /dev/null
+++ b/test/CodeGen/X86/pku.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
+declare i32 @llvm.x86.rdpkru()
+declare void @llvm.x86.wrpkru(i32)
+
+define void @test_x86_wrpkru(i32 %src) {
+; CHECK-LABEL: test_x86_wrpkru:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: wrpkru
+; CHECK-NEXT: retq
+ call void @llvm.x86.wrpkru(i32 %src)
+ ret void
+}
+
+define i32 @test_x86_rdpkru() {
+; CHECK-LABEL: test_x86_rdpkru:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: rdpkru
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.x86.rdpkru()
+ ret i32 %res
+}
diff --git a/test/CodeGen/X86/powi.ll b/test/CodeGen/X86/powi.ll
index 17d3e3e7d33c..88b5f4eb14b0 100644
--- a/test/CodeGen/X86/powi.ll
+++ b/test/CodeGen/X86/powi.ll
@@ -29,9 +29,9 @@ define double @pow_wrapper_optsize(double %a) optsize {
define double @pow_wrapper_minsize(double %a) minsize {
; CHECK-LABEL: pow_wrapper_minsize:
; CHECK: # BB#0:
-; CHECK-NEXT: movl $128, %edi
+; CHECK-NEXT: movl $15, %edi
; CHECK-NEXT: jmp
- %ret = tail call double @llvm.powi.f64(double %a, i32 128) nounwind ; <double> [#uses=1]
+ %ret = tail call double @llvm.powi.f64(double %a, i32 15) nounwind ; <double> [#uses=1]
ret double %ret
}
diff --git a/test/CodeGen/X86/pr11415.ll b/test/CodeGen/X86/pr11415.ll
index 73c497014116..6c32a2206a7e 100644
--- a/test/CodeGen/X86/pr11415.ll
+++ b/test/CodeGen/X86/pr11415.ll
@@ -4,17 +4,15 @@
; defining %0 before it was read. This caused us to omit the
; movq -8(%rsp), %rdx
-; CHECK: pushq %rax
; CHECK: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: movq %rax, (%rsp)
-; CHECK-NEXT: movq (%rsp), %rdx
+; CHECK-NEXT: movq %rax, -8(%rsp)
+; CHECK-NEXT: movq -8(%rsp), %rdx
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: movq %rdx, %rax
-; CHECK-NEXT: movq %rdx, (%rsp)
-; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: movq %rdx, -8(%rsp)
; CHECK-NEXT: ret
define i64 @foo() {
diff --git a/test/CodeGen/X86/pr21792.ll b/test/CodeGen/X86/pr21792.ll
index 59866c090543..f6dca609bc05 100644
--- a/test/CodeGen/X86/pr21792.ll
+++ b/test/CodeGen/X86/pr21792.ll
@@ -1,41 +1,41 @@
-; RUN: llc -mtriple=x86_64-linux -mcpu=corei7 < %s | FileCheck %s
-; This fixes a missing cases in the MI scheduler's constrainLocalCopy exposed by
-; PR21792
-
-@stuff = external constant [256 x double], align 16
-
-define void @func(<4 x float> %vx) {
-entry:
- %tmp2 = bitcast <4 x float> %vx to <2 x i64>
- %and.i = and <2 x i64> %tmp2, <i64 8727373547504, i64 8727373547504>
- %tmp3 = bitcast <2 x i64> %and.i to <4 x i32>
- %index.sroa.0.0.vec.extract = extractelement <4 x i32> %tmp3, i32 0
- %idx.ext = sext i32 %index.sroa.0.0.vec.extract to i64
- %add.ptr = getelementptr inbounds i8, i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext
- %tmp4 = bitcast i8* %add.ptr to double*
- %index.sroa.0.4.vec.extract = extractelement <4 x i32> %tmp3, i32 1
- %idx.ext5 = sext i32 %index.sroa.0.4.vec.extract to i64
- %add.ptr6 = getelementptr inbounds i8, i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext5
- %tmp5 = bitcast i8* %add.ptr6 to double*
- %index.sroa.0.8.vec.extract = extractelement <4 x i32> %tmp3, i32 2
- %idx.ext14 = sext i32 %index.sroa.0.8.vec.extract to i64
- %add.ptr15 = getelementptr inbounds i8, i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext14
- %tmp6 = bitcast i8* %add.ptr15 to double*
- %index.sroa.0.12.vec.extract = extractelement <4 x i32> %tmp3, i32 3
- %idx.ext19 = sext i32 %index.sroa.0.12.vec.extract to i64
- %add.ptr20 = getelementptr inbounds i8, i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext19
- %tmp7 = bitcast i8* %add.ptr20 to double*
- %add.ptr46 = getelementptr inbounds i8, i8* bitcast (double* getelementptr inbounds ([256 x double], [256 x double]* @stuff, i64 0, i64 1) to i8*), i64 %idx.ext
- %tmp16 = bitcast i8* %add.ptr46 to double*
- %add.ptr51 = getelementptr inbounds i8, i8* bitcast (double* getelementptr inbounds ([256 x double], [256 x double]* @stuff, i64 0, i64 1) to i8*), i64 %idx.ext5
- %tmp17 = bitcast i8* %add.ptr51 to double*
- call void @toto(double* %tmp4, double* %tmp5, double* %tmp6, double* %tmp7, double* %tmp16, double* %tmp17)
- ret void
-; CHECK-LABEL: func:
-; CHECK: pextrq $1, %xmm0,
-; CHECK-NEXT: movd %xmm0, %r[[AX:..]]
-; CHECK-NEXT: movslq %e[[AX]],
-; CHECK-NEXT: sarq $32, %r[[AX]]
-}
-
-declare void @toto(double*, double*, double*, double*, double*, double*)
+; RUN: llc -mtriple=x86_64-linux -mcpu=corei7 < %s | FileCheck %s
+; This fixes a missing cases in the MI scheduler's constrainLocalCopy exposed by
+; PR21792
+
+@stuff = external constant [256 x double], align 16
+
+define void @func(<4 x float> %vx) {
+entry:
+ %tmp2 = bitcast <4 x float> %vx to <2 x i64>
+ %and.i = and <2 x i64> %tmp2, <i64 8727373547504, i64 8727373547504>
+ %tmp3 = bitcast <2 x i64> %and.i to <4 x i32>
+ %index.sroa.0.0.vec.extract = extractelement <4 x i32> %tmp3, i32 0
+ %idx.ext = sext i32 %index.sroa.0.0.vec.extract to i64
+ %add.ptr = getelementptr inbounds i8, i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext
+ %tmp4 = bitcast i8* %add.ptr to double*
+ %index.sroa.0.4.vec.extract = extractelement <4 x i32> %tmp3, i32 1
+ %idx.ext5 = sext i32 %index.sroa.0.4.vec.extract to i64
+ %add.ptr6 = getelementptr inbounds i8, i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext5
+ %tmp5 = bitcast i8* %add.ptr6 to double*
+ %index.sroa.0.8.vec.extract = extractelement <4 x i32> %tmp3, i32 2
+ %idx.ext14 = sext i32 %index.sroa.0.8.vec.extract to i64
+ %add.ptr15 = getelementptr inbounds i8, i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext14
+ %tmp6 = bitcast i8* %add.ptr15 to double*
+ %index.sroa.0.12.vec.extract = extractelement <4 x i32> %tmp3, i32 3
+ %idx.ext19 = sext i32 %index.sroa.0.12.vec.extract to i64
+ %add.ptr20 = getelementptr inbounds i8, i8* bitcast ([256 x double]* @stuff to i8*), i64 %idx.ext19
+ %tmp7 = bitcast i8* %add.ptr20 to double*
+ %add.ptr46 = getelementptr inbounds i8, i8* bitcast (double* getelementptr inbounds ([256 x double], [256 x double]* @stuff, i64 0, i64 1) to i8*), i64 %idx.ext
+ %tmp16 = bitcast i8* %add.ptr46 to double*
+ %add.ptr51 = getelementptr inbounds i8, i8* bitcast (double* getelementptr inbounds ([256 x double], [256 x double]* @stuff, i64 0, i64 1) to i8*), i64 %idx.ext5
+ %tmp17 = bitcast i8* %add.ptr51 to double*
+ call void @toto(double* %tmp4, double* %tmp5, double* %tmp6, double* %tmp7, double* %tmp16, double* %tmp17)
+ ret void
+; CHECK-LABEL: func:
+; CHECK: pextrq $1, %xmm0,
+; CHECK-NEXT: movd %xmm0, %r[[AX:..]]
+; CHECK-NEXT: movslq %e[[AX]],
+; CHECK-NEXT: sarq $32, %r[[AX]]
+}
+
+declare void @toto(double*, double*, double*, double*, double*, double*)
diff --git a/test/CodeGen/X86/pr24139.ll b/test/CodeGen/X86/pr24139.ll
index fbe55abcbf7c..ec56345ba648 100644
--- a/test/CodeGen/X86/pr24139.ll
+++ b/test/CodeGen/X86/pr24139.ll
@@ -1,148 +1,148 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
-
-; Check that we do not get excessive spilling from splitting of constant live ranges.
-
-; CHECK-LABEL: PR24139:
-; CHECK: # 16-byte Spill
-; CHECK-NOT: # 16-byte Spill
-; CHECK: retq
-
-define <2 x double> @PR24139(<2 x double> %arg, <2 x double> %arg1, <2 x double> %arg2) {
- %tmp = bitcast <2 x double> %arg to <4 x float>
- %tmp3 = fmul <4 x float> %tmp, <float 0x3FE45F3060000000, float 0x3FE45F3060000000, float 0x3FE45F3060000000, float 0x3FE45F3060000000>
- %tmp4 = bitcast <2 x double> %arg to <4 x i32>
- %tmp5 = and <4 x i32> %tmp4, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
- %tmp6 = or <4 x i32> %tmp5, <i32 1056964608, i32 1056964608, i32 1056964608, i32 1056964608>
- %tmp7 = bitcast <4 x i32> %tmp6 to <4 x float>
- %tmp8 = fadd <4 x float> %tmp3, %tmp7
- %tmp9 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %tmp8) #2
- %tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64>
- %tmp11 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp9) #2
- %tmp12 = fmul <4 x float> %tmp11, <float 0x3FF921FB40000000, float 0x3FF921FB40000000, float 0x3FF921FB40000000, float 0x3FF921FB40000000>
- %tmp13 = fsub <4 x float> %tmp, %tmp12
- %tmp14 = fmul <4 x float> %tmp11, <float 0x3E74442D00000000, float 0x3E74442D00000000, float 0x3E74442D00000000, float 0x3E74442D00000000>
- %tmp15 = fsub <4 x float> %tmp13, %tmp14
- %tmp16 = fmul <4 x float> %tmp15, %tmp15
- %tmp17 = fmul <4 x float> %tmp15, %tmp16
- %tmp18 = fmul <4 x float> %tmp16, <float 0xBF56493260000000, float 0xBF56493260000000, float 0xBF56493260000000, float 0xBF56493260000000>
- %tmp19 = fadd <4 x float> %tmp18, <float 0x3FA55406C0000000, float 0x3FA55406C0000000, float 0x3FA55406C0000000, float 0x3FA55406C0000000>
- %tmp20 = fmul <4 x float> %tmp16, <float 0xBF29918DC0000000, float 0xBF29918DC0000000, float 0xBF29918DC0000000, float 0xBF29918DC0000000>
- %tmp21 = fadd <4 x float> %tmp20, <float 0x3F81106840000000, float 0x3F81106840000000, float 0x3F81106840000000, float 0x3F81106840000000>
- %tmp22 = fmul <4 x float> %tmp16, %tmp19
- %tmp23 = fadd <4 x float> %tmp22, <float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000>
- %tmp24 = fmul <4 x float> %tmp16, %tmp21
- %tmp25 = fadd <4 x float> %tmp24, <float 0xBFC5555420000000, float 0xBFC5555420000000, float 0xBFC5555420000000, float 0xBFC5555420000000>
- %tmp26 = fmul <4 x float> %tmp16, %tmp23
- %tmp27 = fadd <4 x float> %tmp26, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
- %tmp28 = fmul <4 x float> %tmp17, %tmp25
- %tmp29 = fadd <4 x float> %tmp15, %tmp28
- %tmp30 = and <2 x i64> %tmp10, <i64 4294967297, i64 4294967297>
- %tmp31 = bitcast <2 x i64> %tmp30 to <4 x i32>
- %tmp32 = icmp eq <4 x i32> %tmp31, zeroinitializer
- %tmp33 = sext <4 x i1> %tmp32 to <4 x i32>
- %tmp34 = bitcast <4 x i32> %tmp33 to <4 x float>
- %tmp35 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp27, <4 x float> %tmp29, <4 x float> %tmp34) #2
- %tmp36 = and <2 x i64> %tmp10, <i64 8589934594, i64 8589934594>
- %tmp37 = bitcast <2 x i64> %tmp36 to <4 x i32>
- %tmp38 = icmp eq <4 x i32> %tmp37, zeroinitializer
- %tmp39 = sext <4 x i1> %tmp38 to <4 x i32>
- %tmp40 = bitcast <4 x float> %tmp35 to <4 x i32>
- %tmp41 = xor <4 x i32> %tmp40, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
- %tmp42 = bitcast <4 x i32> %tmp41 to <4 x float>
- %tmp43 = bitcast <4 x i32> %tmp39 to <4 x float>
- %tmp44 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp42, <4 x float> %tmp35, <4 x float> %tmp43) #2
- %tmp45 = bitcast <2 x double> %arg1 to <4 x float>
- %tmp46 = fmul <4 x float> %tmp45, <float 0x3FE45F3060000000, float 0x3FE45F3060000000, float 0x3FE45F3060000000, float 0x3FE45F3060000000>
- %tmp47 = bitcast <2 x double> %arg1 to <4 x i32>
- %tmp48 = and <4 x i32> %tmp47, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
- %tmp49 = or <4 x i32> %tmp48, <i32 1056964608, i32 1056964608, i32 1056964608, i32 1056964608>
- %tmp50 = bitcast <4 x i32> %tmp49 to <4 x float>
- %tmp51 = fadd <4 x float> %tmp46, %tmp50
- %tmp52 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %tmp51) #2
- %tmp53 = bitcast <4 x i32> %tmp52 to <2 x i64>
- %tmp54 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp52) #2
- %tmp55 = fmul <4 x float> %tmp54, <float 0x3FF921FB40000000, float 0x3FF921FB40000000, float 0x3FF921FB40000000, float 0x3FF921FB40000000>
- %tmp56 = fsub <4 x float> %tmp45, %tmp55
- %tmp57 = fmul <4 x float> %tmp54, <float 0x3E74442D00000000, float 0x3E74442D00000000, float 0x3E74442D00000000, float 0x3E74442D00000000>
- %tmp58 = fsub <4 x float> %tmp56, %tmp57
- %tmp59 = fmul <4 x float> %tmp58, %tmp58
- %tmp60 = fmul <4 x float> %tmp58, %tmp59
- %tmp61 = fmul <4 x float> %tmp59, <float 0xBF56493260000000, float 0xBF56493260000000, float 0xBF56493260000000, float 0xBF56493260000000>
- %tmp62 = fadd <4 x float> %tmp61, <float 0x3FA55406C0000000, float 0x3FA55406C0000000, float 0x3FA55406C0000000, float 0x3FA55406C0000000>
- %tmp63 = fmul <4 x float> %tmp59, <float 0xBF29918DC0000000, float 0xBF29918DC0000000, float 0xBF29918DC0000000, float 0xBF29918DC0000000>
- %tmp64 = fadd <4 x float> %tmp63, <float 0x3F81106840000000, float 0x3F81106840000000, float 0x3F81106840000000, float 0x3F81106840000000>
- %tmp65 = fmul <4 x float> %tmp59, %tmp62
- %tmp66 = fadd <4 x float> %tmp65, <float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000>
- %tmp67 = fmul <4 x float> %tmp59, %tmp64
- %tmp68 = fadd <4 x float> %tmp67, <float 0xBFC5555420000000, float 0xBFC5555420000000, float 0xBFC5555420000000, float 0xBFC5555420000000>
- %tmp69 = fmul <4 x float> %tmp59, %tmp66
- %tmp70 = fadd <4 x float> %tmp69, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
- %tmp71 = fmul <4 x float> %tmp60, %tmp68
- %tmp72 = fadd <4 x float> %tmp58, %tmp71
- %tmp73 = and <2 x i64> %tmp53, <i64 4294967297, i64 4294967297>
- %tmp74 = bitcast <2 x i64> %tmp73 to <4 x i32>
- %tmp75 = icmp eq <4 x i32> %tmp74, zeroinitializer
- %tmp76 = sext <4 x i1> %tmp75 to <4 x i32>
- %tmp77 = bitcast <4 x i32> %tmp76 to <4 x float>
- %tmp78 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp70, <4 x float> %tmp72, <4 x float> %tmp77) #2
- %tmp79 = and <2 x i64> %tmp53, <i64 8589934594, i64 8589934594>
- %tmp80 = bitcast <2 x i64> %tmp79 to <4 x i32>
- %tmp81 = icmp eq <4 x i32> %tmp80, zeroinitializer
- %tmp82 = sext <4 x i1> %tmp81 to <4 x i32>
- %tmp83 = bitcast <4 x float> %tmp78 to <4 x i32>
- %tmp84 = xor <4 x i32> %tmp83, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
- %tmp85 = bitcast <4 x i32> %tmp84 to <4 x float>
- %tmp86 = bitcast <4 x i32> %tmp82 to <4 x float>
- %tmp87 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp85, <4 x float> %tmp78, <4 x float> %tmp86) #2
- %tmp88 = fadd <4 x float> %tmp44, %tmp87
- %tmp89 = bitcast <2 x double> %arg2 to <4 x float>
- %tmp90 = fmul <4 x float> %tmp89, <float 0x3FE45F3060000000, float 0x3FE45F3060000000, float 0x3FE45F3060000000, float 0x3FE45F3060000000>
- %tmp91 = bitcast <2 x double> %arg2 to <4 x i32>
- %tmp92 = and <4 x i32> %tmp91, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
- %tmp93 = or <4 x i32> %tmp92, <i32 1056964608, i32 1056964608, i32 1056964608, i32 1056964608>
- %tmp94 = bitcast <4 x i32> %tmp93 to <4 x float>
- %tmp95 = fadd <4 x float> %tmp90, %tmp94
- %tmp96 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %tmp95) #2
- %tmp97 = bitcast <4 x i32> %tmp96 to <2 x i64>
- %tmp98 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp96) #2
- %tmp99 = fmul <4 x float> %tmp98, <float 0x3FF921FB40000000, float 0x3FF921FB40000000, float 0x3FF921FB40000000, float 0x3FF921FB40000000>
- %tmp100 = fsub <4 x float> %tmp89, %tmp99
- %tmp101 = fmul <4 x float> %tmp98, <float 0x3E74442D00000000, float 0x3E74442D00000000, float 0x3E74442D00000000, float 0x3E74442D00000000>
- %tmp102 = fsub <4 x float> %tmp100, %tmp101
- %tmp103 = fmul <4 x float> %tmp102, %tmp102
- %tmp104 = fmul <4 x float> %tmp102, %tmp103
- %tmp105 = fmul <4 x float> %tmp103, <float 0xBF56493260000000, float 0xBF56493260000000, float 0xBF56493260000000, float 0xBF56493260000000>
- %tmp106 = fadd <4 x float> %tmp105, <float 0x3FA55406C0000000, float 0x3FA55406C0000000, float 0x3FA55406C0000000, float 0x3FA55406C0000000>
- %tmp107 = fmul <4 x float> %tmp103, <float 0xBF29918DC0000000, float 0xBF29918DC0000000, float 0xBF29918DC0000000, float 0xBF29918DC0000000>
- %tmp108 = fadd <4 x float> %tmp107, <float 0x3F81106840000000, float 0x3F81106840000000, float 0x3F81106840000000, float 0x3F81106840000000>
- %tmp109 = fmul <4 x float> %tmp103, %tmp106
- %tmp110 = fadd <4 x float> %tmp109, <float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000, float 0xBFDFFFFBE0000000>
- %tmp111 = fmul <4 x float> %tmp103, %tmp108
- %tmp112 = fadd <4 x float> %tmp111, <float 0xBFC5555420000000, float 0xBFC5555420000000, float 0xBFC5555420000000, float 0xBFC5555420000000>
- %tmp113 = fmul <4 x float> %tmp103, %tmp110
- %tmp114 = fadd <4 x float> %tmp113, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
- %tmp115 = fmul <4 x float> %tmp104, %tmp112
- %tmp116 = fadd <4 x float> %tmp102, %tmp115
- %tmp117 = and <2 x i64> %tmp97, <i64 4294967297, i64 4294967297>
- %tmp118 = bitcast <2 x i64> %tmp117 to <4 x i32>
- %tmp119 = icmp eq <4 x i32> %tmp118, zeroinitializer
- %tmp120 = sext <4 x i1> %tmp119 to <4 x i32>
- %tmp121 = bitcast <4 x i32> %tmp120 to <4 x float>
- %tmp122 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp114, <4 x float> %tmp116, <4 x float> %tmp121) #2
- %tmp123 = and <2 x i64> %tmp97, <i64 8589934594, i64 8589934594>
- %tmp124 = bitcast <2 x i64> %tmp123 to <4 x i32>
- %tmp125 = icmp eq <4 x i32> %tmp124, zeroinitializer
- %tmp126 = sext <4 x i1> %tmp125 to <4 x i32>
- %tmp127 = bitcast <4 x float> %tmp122 to <4 x i32>
- %tmp128 = xor <4 x i32> %tmp127, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
- %tmp129 = bitcast <4 x i32> %tmp128 to <4 x float>
- %tmp130 = bitcast <4 x i32> %tmp126 to <4 x float>
- %tmp131 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %tmp129, <4 x float> %tmp122, <4 x float> %tmp130) #2
- %tmp132 = fadd <4 x float> %tmp88, %tmp131
- %tmp133 = bitcast <4 x float> %tmp132 to <2 x double>
- ret <2 x double> %tmp133
-}
-
-declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x floa