aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDimitry Andric <dim@FreeBSD.org>2017-05-17 20:22:39 +0000
committerDimitry Andric <dim@FreeBSD.org>2017-05-17 20:22:39 +0000
commit7af96fb3afd6725a2824a0a5ca5dad34e5e0b056 (patch)
tree6661ffbabf869009597684462f5a3df3beccc952
parent6b3f41ed88e8e440e11a4fbf20b6600529f80049 (diff)
downloadsrc-7af96fb3afd6725a2824a0a5ca5dad34e5e0b056.tar.gz
src-7af96fb3afd6725a2824a0a5ca5dad34e5e0b056.zip
Vendor import of llvm trunk r303291:vendor/llvm/llvm-trunk-r303291
Notes
Notes: svn path=/vendor/llvm/dist/; revision=318414 svn path=/vendor/llvm/llvm-trunk-r303291/; revision=318415; tag=vendor/llvm/llvm-trunk-r303291
-rw-r--r--include/llvm/ADT/APInt.h4
-rw-r--r--include/llvm/ADT/BitVector.h273
-rw-r--r--include/llvm/ADT/PostOrderIterator.h33
-rw-r--r--include/llvm/ADT/PriorityWorklist.h15
-rw-r--r--include/llvm/ADT/SCCIterator.h10
-rw-r--r--include/llvm/ADT/Sequence.h21
-rw-r--r--include/llvm/ADT/SetVector.h22
-rw-r--r--include/llvm/ADT/SmallBitVector.h13
-rw-r--r--include/llvm/ADT/SmallPtrSet.h30
-rw-r--r--include/llvm/ADT/SmallVector.h45
-rw-r--r--include/llvm/ADT/SparseBitVector.h20
-rw-r--r--include/llvm/ADT/SparseMultiSet.h39
-rw-r--r--include/llvm/ADT/SparseSet.h22
-rw-r--r--include/llvm/ADT/StringExtras.h22
-rw-r--r--include/llvm/ADT/StringMap.h88
-rw-r--r--include/llvm/ADT/StringRef.h20
-rw-r--r--include/llvm/ADT/StringSet.h17
-rw-r--r--include/llvm/ADT/TinyPtrVector.h14
-rw-r--r--include/llvm/ADT/UniqueVector.h15
-rw-r--r--include/llvm/Analysis/ProfileSummaryInfo.h15
-rw-r--r--include/llvm/DebugInfo/CodeView/CVTypeVisitor.h32
-rw-r--r--include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h15
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFAttribute.h4
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h18
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h8
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h38
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFDie.h44
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFFormValue.h11
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h16
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFRelocMap.h11
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFUnit.h12
-rw-r--r--include/llvm/DebugInfo/PDB/Native/TpiStream.h1
-rw-r--r--include/llvm/IR/IntrinsicsPowerPC.td2
-rw-r--r--include/llvm/Target/GlobalISel/SelectionDAGCompat.td1
-rw-r--r--lib/Analysis/DependenceAnalysis.cpp33
-rw-r--r--lib/Analysis/InlineCost.cpp42
-rw-r--r--lib/Analysis/InstructionSimplify.cpp18
-rw-r--r--lib/Analysis/ProfileSummaryInfo.cpp2
-rw-r--r--lib/Analysis/ScalarEvolution.cpp43
-rw-r--r--lib/CodeGen/AggressiveAntiDepBreaker.cpp5
-rw-r--r--lib/CodeGen/AsmPrinter/CodeViewDebug.cpp2
-rw-r--r--lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp3
-rw-r--r--lib/CodeGen/GlobalISel/IRTranslator.cpp5
-rw-r--r--lib/CodeGen/MachineVerifier.cpp2
-rw-r--r--lib/CodeGen/RegAllocGreedy.cpp11
-rw-r--r--lib/CodeGen/SelectionDAG/DAGCombiner.cpp28
-rw-r--r--lib/CodeGen/SpillPlacement.cpp4
-rw-r--r--lib/CodeGen/StackColoring.cpp6
-rw-r--r--lib/CodeGen/TargetLoweringBase.cpp2
-rw-r--r--lib/CodeGen/TargetPassConfig.cpp24
-rw-r--r--lib/CodeGen/TargetRegisterInfo.cpp3
-rw-r--r--lib/DebugInfo/CodeView/CVTypeDumper.cpp22
-rw-r--r--lib/DebugInfo/CodeView/CVTypeVisitor.cpp99
-rw-r--r--lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp10
-rw-r--r--lib/DebugInfo/CodeView/TypeDumpVisitor.cpp3
-rw-r--r--lib/DebugInfo/CodeView/TypeStreamMerger.cpp17
-rw-r--r--lib/DebugInfo/DWARF/DWARFContext.cpp61
-rw-r--r--lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp3
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp10
-rw-r--r--lib/Support/CrashRecoveryContext.cpp130
-rw-r--r--lib/Support/Unix/Path.inc30
-rw-r--r--lib/Target/AArch64/AArch64FrameLowering.cpp3
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.cpp341
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.h1
-rw-r--r--lib/Target/AArch64/AArch64TargetTransformInfo.cpp25
-rw-r--r--lib/Target/AArch64/AArch64TargetTransformInfo.h3
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp30
-rw-r--r--lib/Target/AMDGPU/AMDGPUSubtarget.h4
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.cpp3
-rw-r--r--lib/Target/AMDGPU/SIInstrInfo.cpp14
-rw-r--r--lib/Target/AMDGPU/SIRegisterInfo.cpp26
-rw-r--r--lib/Target/AMDGPU/VOP3Instructions.td20
-rw-r--r--lib/Target/ARM/ARMInstructionSelector.cpp15
-rw-r--r--lib/Target/ARM/Thumb1FrameLowering.cpp3
-rw-r--r--lib/Target/Mips/MipsDelaySlotFiller.cpp2
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.cpp23
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp45
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.h1
-rw-r--r--lib/Target/PowerPC/PPCInstr64Bit.td4
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.cpp13
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.td12
-rw-r--r--lib/Target/SystemZ/SystemZTargetTransformInfo.cpp34
-rw-r--r--lib/Target/WebAssembly/WebAssemblyRegColoring.cpp3
-rw-r--r--lib/Target/WebAssembly/known_gcc_test_failures.txt3
-rw-r--r--lib/Target/X86/X86.td3
-rw-r--r--lib/Target/X86/X86FixupLEAs.cpp269
-rw-r--r--lib/Target/X86/X86InstructionSelector.cpp66
-rw-r--r--lib/Target/X86/X86LegalizerInfo.cpp5
-rw-r--r--lib/Target/X86/X86Subtarget.h6
-rw-r--r--lib/Target/X86/X86TargetMachine.cpp2
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.cpp18
-rw-r--r--lib/Transforms/Coroutines/CoroFrame.cpp28
-rw-r--r--lib/Transforms/InstCombine/InstCombineInternal.h21
-rw-r--r--lib/Transforms/InstCombine/InstructionCombining.cpp47
-rw-r--r--lib/Transforms/Scalar/LICM.cpp2
-rw-r--r--lib/Transforms/Scalar/LoopIdiomRecognize.cpp16
-rw-r--r--lib/Transforms/Scalar/LoopStrengthReduce.cpp3
-rw-r--r--lib/Transforms/Scalar/NewGVN.cpp68
-rw-r--r--lib/Transforms/Scalar/Reassociate.cpp2
-rw-r--r--lib/Transforms/Scalar/SimpleLoopUnswitch.cpp37
-rw-r--r--test/Analysis/CostModel/SystemZ/div-pow2.ll154
-rw-r--r--test/Analysis/CostModel/X86/bitreverse.ll69
-rw-r--r--test/Analysis/CostModel/X86/ctbits-cost.ll587
-rw-r--r--test/Analysis/CostModel/X86/ctlz.ll233
-rw-r--r--test/Analysis/CostModel/X86/ctpop.ll133
-rw-r--r--test/Analysis/CostModel/X86/cttz.ll233
-rw-r--r--test/CodeGen/AArch64/aarch64-addv.ll63
-rw-r--r--test/CodeGen/AArch64/aarch64-minmaxv.ll424
-rw-r--r--test/CodeGen/AArch64/arm64-vabs.ll42
-rw-r--r--test/CodeGen/AArch64/ldst-zero.ll23
-rw-r--r--test/CodeGen/AArch64/misched-stp.ll35
-rw-r--r--test/CodeGen/AMDGPU/fmax3.ll103
-rw-r--r--test/CodeGen/AMDGPU/fmin3.ll102
-rw-r--r--test/CodeGen/AMDGPU/global-constant.ll4
-rw-r--r--test/CodeGen/AMDGPU/immv216.ll6
-rw-r--r--test/CodeGen/AMDGPU/max3.ll91
-rw-r--r--test/CodeGen/AMDGPU/min3.ll133
-rw-r--r--test/CodeGen/AMDGPU/packed-op-sel.ll266
-rw-r--r--test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll16
-rw-r--r--test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll2
-rw-r--r--test/CodeGen/ARM/dag-combine-ldst.ll2
-rw-r--r--test/CodeGen/MSP430/vararg.ll1
-rw-r--r--test/CodeGen/Mips/msa/bmzi_bmnzi.ll8
-rw-r--r--test/CodeGen/PowerPC/atomic-2.ll14
-rw-r--r--test/CodeGen/PowerPC/atomics-indexed.ll14
-rw-r--r--test/CodeGen/PowerPC/atomics-regression.ll64
-rw-r--r--test/CodeGen/PowerPC/atomics.ll14
-rw-r--r--test/CodeGen/PowerPC/ppcf128sf.ll8
-rw-r--r--test/CodeGen/PowerPC/save-bp.ll54
-rw-r--r--test/CodeGen/PowerPC/save-cr-ppc32svr4.ll46
-rw-r--r--test/CodeGen/PowerPC/save-crbp-ppc32svr4.ll57
-rw-r--r--test/CodeGen/SPARC/32abi.ll16
-rw-r--r--test/CodeGen/SPARC/64abi.ll18
-rw-r--r--test/CodeGen/SystemZ/swift-return.ll6
-rw-r--r--test/CodeGen/Thumb/stack-access.ll10
-rw-r--r--test/CodeGen/Thumb2/ldr-str-imm12.ll4
-rw-r--r--test/CodeGen/X86/GlobalISel/add-scalar.ll94
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-add.mir69
-rw-r--r--test/CodeGen/X86/GlobalISel/regbankselect-X32.mir36
-rw-r--r--test/CodeGen/X86/GlobalISel/select-add-x32.mir63
-rw-r--r--test/CodeGen/X86/arg-copy-elide.ll7
-rw-r--r--test/CodeGen/X86/leaFixup32.mir508
-rw-r--r--test/CodeGen/X86/leaFixup64.mir1041
-rw-r--r--test/CodeGen/X86/nontemporal.ll72
-rw-r--r--test/CodeGen/X86/psubus.ll1263
-rw-r--r--test/CodeGen/X86/store-narrow.ll5
-rw-r--r--test/CodeGen/X86/swift-return.ll6
-rw-r--r--test/CodeGen/X86/win32-spill-xmm.ll2
-rw-r--r--test/CodeGen/X86/win64_sibcall.ll4
-rw-r--r--test/CodeGen/X86/win64_vararg.ll4
-rw-r--r--test/CodeGen/X86/x86-64-ms_abi-vararg.ll4
-rw-r--r--test/ExecutionEngine/RuntimeDyld/X86/ELF_x86-64_debug_frame.s20
-rw-r--r--test/Feature/optnone-llc.ll1
-rw-r--r--test/MC/AMDGPU/vop3-gfx9.s24
-rw-r--r--test/TableGen/GlobalISelEmitter.td75
-rw-r--r--test/Transforms/Coroutines/coro-catchswitch.ll88
-rw-r--r--test/Transforms/Inline/inline-hot-callee.ll10
-rw-r--r--test/Transforms/InstCombine/canonicalize_branch.ll513
-rw-r--r--test/Transforms/InstCombine/debuginfo-skip.ll44
-rw-r--r--test/Transforms/InstSimplify/AndOrXor.ll12
-rw-r--r--test/Transforms/LoopVectorize/AArch64/pr33053.ll56
-rw-r--r--test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll26
-rw-r--r--test/Transforms/NewGVN/pr32934.ll1
-rw-r--r--test/Transforms/SLPVectorizer/AArch64/gather-root.ll40
-rw-r--r--tools/llvm-pdbdump/Analyze.cpp15
-rw-r--r--tools/llvm-pdbdump/LLVMOutputStyle.cpp32
-rw-r--r--tools/llvm-pdbdump/PdbYaml.cpp14
-rw-r--r--tools/llvm-pdbdump/YamlTypeDumper.cpp80
-rw-r--r--unittests/ADT/BitVectorTest.cpp184
-rw-r--r--unittests/Analysis/ProfileSummaryInfoTest.cpp8
-rw-r--r--unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp1
-rw-r--r--unittests/DebugInfo/PDB/TypeServerHandlerTest.cpp20
-rw-r--r--unittests/Support/BinaryStreamTest.cpp29
-rw-r--r--unittests/Support/CMakeLists.txt1
-rw-r--r--unittests/Support/CrashRecoveryTest.cpp83
-rw-r--r--utils/TableGen/AsmMatcherEmitter.cpp4
-rw-r--r--utils/TableGen/GlobalISelEmitter.cpp124
-rwxr-xr-xutils/lit/lit/main.py12
-rw-r--r--utils/lit/lit/run.py103
179 files changed, 5401 insertions, 5242 deletions
diff --git a/include/llvm/ADT/APInt.h b/include/llvm/ADT/APInt.h
index 94fbd1a29bf9..894e5571f8ad 100644
--- a/include/llvm/ADT/APInt.h
+++ b/include/llvm/ADT/APInt.h
@@ -1067,9 +1067,7 @@ public:
/// \returns the bit value at bitPosition
bool operator[](unsigned bitPosition) const {
assert(bitPosition < getBitWidth() && "Bit position out of bounds!");
- return (maskBit(bitPosition) &
- (isSingleWord() ? U.VAL : U.pVal[whichWord(bitPosition)])) !=
- 0;
+ return (maskBit(bitPosition) & getWord(bitPosition)) != 0;
}
/// @}
diff --git a/include/llvm/ADT/BitVector.h b/include/llvm/ADT/BitVector.h
index 4a2af7cd68a6..e68ef5f53d10 100644
--- a/include/llvm/ADT/BitVector.h
+++ b/include/llvm/ADT/BitVector.h
@@ -15,6 +15,7 @@
#define LLVM_ADT_BITVECTOR_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <cassert>
@@ -26,6 +27,50 @@
namespace llvm {
+/// ForwardIterator for the bits that are set.
+/// Iterators get invalidated when resize / reserve is called.
+template <typename BitVectorT> class const_set_bits_iterator_impl {
+ const BitVectorT &Parent;
+ int Current = 0;
+
+ void advance() {
+ assert(Current != -1 && "Trying to advance past end.");
+ Current = Parent.find_next(Current);
+ }
+
+public:
+ const_set_bits_iterator_impl(const BitVectorT &Parent, int Current)
+ : Parent(Parent), Current(Current) {}
+ explicit const_set_bits_iterator_impl(const BitVectorT &Parent)
+ : const_set_bits_iterator_impl(Parent, Parent.find_first()) {}
+ const_set_bits_iterator_impl(const const_set_bits_iterator_impl &) = default;
+
+ const_set_bits_iterator_impl operator++(int) {
+ auto Prev = *this;
+ advance();
+ return Prev;
+ }
+
+ const_set_bits_iterator_impl &operator++() {
+ advance();
+ return *this;
+ }
+
+ unsigned operator*() const { return Current; }
+
+ bool operator==(const const_set_bits_iterator_impl &Other) const {
+ assert(&Parent == &Other.Parent &&
+ "Comparing iterators from different BitVectors");
+ return Current == Other.Current;
+ }
+
+ bool operator!=(const const_set_bits_iterator_impl &Other) const {
+ assert(&Parent == &Other.Parent &&
+ "Comparing iterators from different BitVectors");
+ return Current != Other.Current;
+ }
+};
+
class BitVector {
typedef unsigned long BitWord;
@@ -73,6 +118,18 @@ public:
}
};
+ typedef const_set_bits_iterator_impl<BitVector> const_set_bits_iterator;
+ typedef const_set_bits_iterator set_iterator;
+
+ const_set_bits_iterator set_bits_begin() const {
+ return const_set_bits_iterator(*this);
+ }
+ const_set_bits_iterator set_bits_end() const {
+ return const_set_bits_iterator(*this, -1);
+ }
+ iterator_range<const_set_bits_iterator> set_bits() const {
+ return make_range(set_bits_begin(), set_bits_end());
+ }
/// BitVector default ctor - Creates an empty bitvector.
BitVector() : Size(0) {}
@@ -146,138 +203,164 @@ public:
return !any();
}
- /// find_first - Returns the index of the first set bit, -1 if none
- /// of the bits are set.
- int find_first() const {
- for (unsigned i = 0; i < NumBitWords(size()); ++i)
- if (Bits[i] != 0)
- return i * BITWORD_SIZE + countTrailingZeros(Bits[i]);
- return -1;
- }
-
- /// find_last - Returns the index of the last set bit, -1 if none of the bits
- /// are set.
- int find_last() const {
- if (Size == 0)
+ /// find_first_in - Returns the index of the first set bit in the range
+ /// [Begin, End). Returns -1 if all bits in the range are unset.
+ int find_first_in(unsigned Begin, unsigned End) const {
+ assert(Begin <= End && End <= Size);
+ if (Begin == End)
return -1;
- unsigned N = NumBitWords(size());
- assert(N > 0);
+ unsigned FirstWord = Begin / BITWORD_SIZE;
+ unsigned LastWord = (End - 1) / BITWORD_SIZE;
- unsigned i = N - 1;
- while (i > 0 && Bits[i] == BitWord(0))
- --i;
+ // Check subsequent words.
+ for (unsigned i = FirstWord; i <= LastWord; ++i) {
+ BitWord Copy = Bits[i];
- return int((i + 1) * BITWORD_SIZE - countLeadingZeros(Bits[i])) - 1;
- }
+ if (i == FirstWord) {
+ unsigned FirstBit = Begin % BITWORD_SIZE;
+ Copy &= maskTrailingZeros<BitWord>(FirstBit);
+ }
- /// find_first_unset - Returns the index of the first unset bit, -1 if all
- /// of the bits are set.
- int find_first_unset() const {
- for (unsigned i = 0; i < NumBitWords(size()); ++i)
- if (Bits[i] != ~0UL) {
- unsigned Result = i * BITWORD_SIZE + countTrailingOnes(Bits[i]);
- return Result < size() ? Result : -1;
+ if (i == LastWord) {
+ unsigned LastBit = (End - 1) % BITWORD_SIZE;
+ Copy &= maskTrailingOnes<BitWord>(LastBit + 1);
}
+ if (Copy != 0)
+ return i * BITWORD_SIZE + countTrailingZeros(Copy);
+ }
return -1;
}
- /// find_last_unset - Returns the index of the last unset bit, -1 if all of
- /// the bits are set.
- int find_last_unset() const {
- if (Size == 0)
+ /// find_last_in - Returns the index of the last set bit in the range
+ /// [Begin, End). Returns -1 if all bits in the range are unset.
+ int find_last_in(unsigned Begin, unsigned End) const {
+ assert(Begin <= End && End <= Size);
+ if (Begin == End)
return -1;
- const unsigned N = NumBitWords(size());
- assert(N > 0);
+ unsigned LastWord = (End - 1) / BITWORD_SIZE;
+ unsigned FirstWord = Begin / BITWORD_SIZE;
- unsigned i = N - 1;
- BitWord W = Bits[i];
+ for (unsigned i = LastWord + 1; i >= FirstWord + 1; --i) {
+ unsigned CurrentWord = i - 1;
- // The last word in the BitVector has some unused bits, so we need to set
- // them all to 1 first. Set them all to 1 so they don't get treated as
- // valid unset bits.
- unsigned UnusedCount = BITWORD_SIZE - size() % BITWORD_SIZE;
- W |= maskLeadingOnes<BitWord>(UnusedCount);
+ BitWord Copy = Bits[CurrentWord];
+ if (CurrentWord == LastWord) {
+ unsigned LastBit = (End - 1) % BITWORD_SIZE;
+ Copy &= maskTrailingOnes<BitWord>(LastBit + 1);
+ }
- while (W == ~BitWord(0) && --i > 0)
- W = Bits[i];
+ if (CurrentWord == FirstWord) {
+ unsigned FirstBit = Begin % BITWORD_SIZE;
+ Copy &= maskTrailingZeros<BitWord>(FirstBit);
+ }
+
+ if (Copy != 0)
+ return (CurrentWord + 1) * BITWORD_SIZE - countLeadingZeros(Copy) - 1;
+ }
- return int((i + 1) * BITWORD_SIZE - countLeadingOnes(W)) - 1;
+ return -1;
}
- /// find_next - Returns the index of the next set bit following the
- /// "Prev" bit. Returns -1 if the next set bit is not found.
- int find_next(unsigned Prev) const {
- ++Prev;
- if (Prev >= Size)
+ /// find_first_unset_in - Returns the index of the first unset bit in the
+ /// range [Begin, End). Returns -1 if all bits in the range are set.
+ int find_first_unset_in(unsigned Begin, unsigned End) const {
+ assert(Begin <= End && End <= Size);
+ if (Begin == End)
return -1;
- unsigned WordPos = Prev / BITWORD_SIZE;
- unsigned BitPos = Prev % BITWORD_SIZE;
- BitWord Copy = Bits[WordPos];
- // Mask off previous bits.
- Copy &= maskTrailingZeros<BitWord>(BitPos);
-
- if (Copy != 0)
- return WordPos * BITWORD_SIZE + countTrailingZeros(Copy);
+ unsigned FirstWord = Begin / BITWORD_SIZE;
+ unsigned LastWord = (End - 1) / BITWORD_SIZE;
// Check subsequent words.
- for (unsigned i = WordPos+1; i < NumBitWords(size()); ++i)
- if (Bits[i] != 0)
- return i * BITWORD_SIZE + countTrailingZeros(Bits[i]);
+ for (unsigned i = FirstWord; i <= LastWord; ++i) {
+ BitWord Copy = Bits[i];
+
+ if (i == FirstWord) {
+ unsigned FirstBit = Begin % BITWORD_SIZE;
+ Copy |= maskTrailingOnes<BitWord>(FirstBit);
+ }
+
+ if (i == LastWord) {
+ unsigned LastBit = (End - 1) % BITWORD_SIZE;
+ Copy |= maskTrailingZeros<BitWord>(LastBit + 1);
+ }
+ if (Copy != ~0UL) {
+ unsigned Result = i * BITWORD_SIZE + countTrailingOnes(Copy);
+ return Result < size() ? Result : -1;
+ }
+ }
return -1;
}
- /// find_next_unset - Returns the index of the next unset bit following the
- /// "Prev" bit. Returns -1 if all remaining bits are set.
- int find_next_unset(unsigned Prev) const {
- ++Prev;
- if (Prev >= Size)
+ /// find_last_unset_in - Returns the index of the last unset bit in the
+ /// range [Begin, End). Returns -1 if all bits in the range are set.
+ int find_last_unset_in(unsigned Begin, unsigned End) const {
+ assert(Begin <= End && End <= Size);
+ if (Begin == End)
return -1;
- unsigned WordPos = Prev / BITWORD_SIZE;
- unsigned BitPos = Prev % BITWORD_SIZE;
- BitWord Copy = Bits[WordPos];
- // Mask in previous bits.
- BitWord Mask = (1 << BitPos) - 1;
- Copy |= Mask;
+ unsigned LastWord = (End - 1) / BITWORD_SIZE;
+ unsigned FirstWord = Begin / BITWORD_SIZE;
- if (Copy != ~0UL)
- return next_unset_in_word(WordPos, Copy);
+ for (unsigned i = LastWord + 1; i >= FirstWord + 1; --i) {
+ unsigned CurrentWord = i - 1;
- // Check subsequent words.
- for (unsigned i = WordPos + 1; i < NumBitWords(size()); ++i)
- if (Bits[i] != ~0UL)
- return next_unset_in_word(i, Bits[i]);
+ BitWord Copy = Bits[CurrentWord];
+ if (CurrentWord == LastWord) {
+ unsigned LastBit = (End - 1) % BITWORD_SIZE;
+ Copy |= maskTrailingZeros<BitWord>(LastBit + 1);
+ }
+
+ if (CurrentWord == FirstWord) {
+ unsigned FirstBit = Begin % BITWORD_SIZE;
+ Copy |= maskTrailingOnes<BitWord>(FirstBit);
+ }
+
+ if (Copy != ~0UL) {
+ unsigned Result =
+ (CurrentWord + 1) * BITWORD_SIZE - countLeadingOnes(Copy) - 1;
+ return Result < Size ? Result : -1;
+ }
+ }
return -1;
}
+ /// find_first - Returns the index of the first set bit, -1 if none
+ /// of the bits are set.
+ int find_first() const { return find_first_in(0, Size); }
+
+ /// find_last - Returns the index of the last set bit, -1 if none of the bits
+ /// are set.
+ int find_last() const { return find_last_in(0, Size); }
+
+ /// find_next - Returns the index of the next set bit following the
+ /// "Prev" bit. Returns -1 if the next set bit is not found.
+ int find_next(unsigned Prev) const { return find_first_in(Prev + 1, Size); }
+
/// find_prev - Returns the index of the first set bit that precedes the
/// the bit at \p PriorTo. Returns -1 if all previous bits are unset.
- int find_prev(unsigned PriorTo) const {
- if (PriorTo == 0)
- return -1;
+ int find_prev(unsigned PriorTo) const { return find_last_in(0, PriorTo); }
- --PriorTo;
+ /// find_first_unset - Returns the index of the first unset bit, -1 if all
+ /// of the bits are set.
+ int find_first_unset() const { return find_first_unset_in(0, Size); }
- unsigned WordPos = PriorTo / BITWORD_SIZE;
- unsigned BitPos = PriorTo % BITWORD_SIZE;
- BitWord Copy = Bits[WordPos];
- // Mask off next bits.
- Copy &= maskTrailingOnes<BitWord>(BitPos + 1);
+ /// find_next_unset - Returns the index of the next unset bit following the
+ /// "Prev" bit. Returns -1 if all remaining bits are set.
+ int find_next_unset(unsigned Prev) const {
+ return find_first_unset_in(Prev + 1, Size);
+ }
- if (Copy != 0)
- return (WordPos + 1) * BITWORD_SIZE - countLeadingZeros(Copy) - 1;
+ /// find_last_unset - Returns the index of the last unset bit, -1 if all of
+ /// the bits are set.
+ int find_last_unset() const { return find_last_unset_in(0, Size); }
- // Check previous words.
- for (unsigned i = 1; i <= WordPos; ++i) {
- unsigned Index = WordPos - i;
- if (Bits[Index] == 0)
- continue;
- return (Index + 1) * BITWORD_SIZE - countLeadingZeros(Bits[Index]) - 1;
- }
- return -1;
+ /// find_prev_unset - Returns the index of the first unset bit that precedes
+ /// the bit at \p PriorTo. Returns -1 if all previous bits are set.
+ int find_prev_unset(unsigned PriorTo) {
+ return find_last_unset_in(0, PriorTo);
}
/// clear - Removes all bits from the bitvector. Does not change capacity.
diff --git a/include/llvm/ADT/PostOrderIterator.h b/include/llvm/ADT/PostOrderIterator.h
index 8fc08eb252eb..a179d29956b1 100644
--- a/include/llvm/ADT/PostOrderIterator.h
+++ b/include/llvm/ADT/PostOrderIterator.h
@@ -96,24 +96,14 @@ template <class GraphT,
class po_iterator
: public std::iterator<std::forward_iterator_tag, typename GT::NodeRef>,
public po_iterator_storage<SetType, ExtStorage> {
- typedef std::iterator<std::forward_iterator_tag, typename GT::NodeRef> super;
- typedef typename GT::NodeRef NodeRef;
- typedef typename GT::ChildIteratorType ChildItTy;
+ using super = std::iterator<std::forward_iterator_tag, typename GT::NodeRef>;
+ using NodeRef = typename GT::NodeRef;
+ using ChildItTy = typename GT::ChildIteratorType;
// VisitStack - Used to maintain the ordering. Top = current block
// First element is basic block pointer, second is the 'next child' to visit
std::vector<std::pair<NodeRef, ChildItTy>> VisitStack;
- void traverseChild() {
- while (VisitStack.back().second != GT::child_end(VisitStack.back().first)) {
- NodeRef BB = *VisitStack.back().second++;
- if (this->insertEdge(Optional<NodeRef>(VisitStack.back().first), BB)) {
- // If the block is not visited...
- VisitStack.push_back(std::make_pair(BB, GT::child_begin(BB)));
- }
- }
- }
-
po_iterator(NodeRef BB) {
this->insertEdge(Optional<NodeRef>(), BB);
VisitStack.push_back(std::make_pair(BB, GT::child_begin(BB)));
@@ -134,8 +124,18 @@ class po_iterator
: po_iterator_storage<SetType, ExtStorage>(S) {
} // End is when stack is empty.
+ void traverseChild() {
+ while (VisitStack.back().second != GT::child_end(VisitStack.back().first)) {
+ NodeRef BB = *VisitStack.back().second++;
+ if (this->insertEdge(Optional<NodeRef>(VisitStack.back().first), BB)) {
+ // If the block is not visited...
+ VisitStack.push_back(std::make_pair(BB, GT::child_begin(BB)));
+ }
+ }
+ }
+
public:
- typedef typename super::pointer pointer;
+ using pointer = typename super::pointer;
// Provide static "constructors"...
static po_iterator begin(GraphT G) {
@@ -286,7 +286,8 @@ inverse_post_order_ext(const T &G, SetType &S) {
template<class GraphT, class GT = GraphTraits<GraphT>>
class ReversePostOrderTraversal {
- typedef typename GT::NodeRef NodeRef;
+ using NodeRef = typename GT::NodeRef;
+
std::vector<NodeRef> Blocks; // Block list in normal PO order
void Initialize(NodeRef BB) {
@@ -294,7 +295,7 @@ class ReversePostOrderTraversal {
}
public:
- typedef typename std::vector<NodeRef>::reverse_iterator rpo_iterator;
+ using rpo_iterator = typename std::vector<NodeRef>::reverse_iterator;
ReversePostOrderTraversal(GraphT G) { Initialize(GT::getEntryNode(G)); }
diff --git a/include/llvm/ADT/PriorityWorklist.h b/include/llvm/ADT/PriorityWorklist.h
index 3198dd438700..35891e931801 100644
--- a/include/llvm/ADT/PriorityWorklist.h
+++ b/include/llvm/ADT/PriorityWorklist.h
@@ -17,13 +17,14 @@
#define LLVM_ADT_PRIORITYWORKLIST_H
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Compiler.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
+#include <iterator>
+#include <type_traits>
#include <vector>
namespace llvm {
@@ -55,11 +56,11 @@ template <typename T, typename VectorT = std::vector<T>,
typename MapT = DenseMap<T, ptrdiff_t>>
class PriorityWorklist {
public:
- typedef T value_type;
- typedef T key_type;
- typedef T& reference;
- typedef const T& const_reference;
- typedef typename MapT::size_type size_type;
+ using value_type = T;
+ using key_type = T;
+ using reference = T&;
+ using const_reference = const T&;
+ using size_type = typename MapT::size_type;
/// Construct an empty PriorityWorklist
PriorityWorklist() = default;
diff --git a/include/llvm/ADT/SCCIterator.h b/include/llvm/ADT/SCCIterator.h
index 9a8a7b168fce..734a58f87da2 100644
--- a/include/llvm/ADT/SCCIterator.h
+++ b/include/llvm/ADT/SCCIterator.h
@@ -1,4 +1,4 @@
-//===---- ADT/SCCIterator.h - Strongly Connected Comp. Iter. ----*- C++ -*-===//
+//===- ADT/SCCIterator.h - Strongly Connected Comp. Iter. -------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -43,10 +43,10 @@ template <class GraphT, class GT = GraphTraits<GraphT>>
class scc_iterator : public iterator_facade_base<
scc_iterator<GraphT, GT>, std::forward_iterator_tag,
const std::vector<typename GT::NodeRef>, ptrdiff_t> {
- typedef typename GT::NodeRef NodeRef;
- typedef typename GT::ChildIteratorType ChildItTy;
- typedef std::vector<NodeRef> SccTy;
- typedef typename scc_iterator::reference reference;
+ using NodeRef = typename GT::NodeRef;
+ using ChildItTy = typename GT::ChildIteratorType;
+ using SccTy = std::vector<NodeRef>;
+ using reference = typename scc_iterator::reference;
/// Element of VisitStack during DFS.
struct StackElement {
diff --git a/include/llvm/ADT/Sequence.h b/include/llvm/ADT/Sequence.h
index 5d36831cc128..3d4a897bf9a9 100644
--- a/include/llvm/ADT/Sequence.h
+++ b/include/llvm/ADT/Sequence.h
@@ -13,27 +13,31 @@
///
//===----------------------------------------------------------------------===//
-#ifndef LLVM_ADT_SEQ_H
-#define LLVM_ADT_SEQ_H
+#ifndef LLVM_ADT_SEQUENCE_H
+#define LLVM_ADT_SEQUENCE_H
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
+#include <algorithm>
+#include <iterator>
+#include <utility>
namespace llvm {
namespace detail {
+
template <typename ValueT>
class value_sequence_iterator
: public iterator_facade_base<value_sequence_iterator<ValueT>,
std::random_access_iterator_tag,
const ValueT> {
- typedef typename value_sequence_iterator::iterator_facade_base BaseT;
+ using BaseT = typename value_sequence_iterator::iterator_facade_base;
ValueT Value;
public:
- typedef typename BaseT::difference_type difference_type;
- typedef typename BaseT::reference reference;
+ using difference_type = typename BaseT::difference_type;
+ using reference = typename BaseT::reference;
value_sequence_iterator() = default;
value_sequence_iterator(const value_sequence_iterator &) = default;
@@ -65,7 +69,8 @@ public:
reference operator*() const { return Value; }
};
-} // End detail namespace.
+
+} // end namespace detail
template <typename ValueT>
iterator_range<detail::value_sequence_iterator<ValueT>> seq(ValueT Begin,
@@ -74,6 +79,6 @@ iterator_range<detail::value_sequence_iterator<ValueT>> seq(ValueT Begin,
detail::value_sequence_iterator<ValueT>(End));
}
-}
+} // end namespace llvm
-#endif
+#endif // LLVM_ADT_SEQUENCE_H
diff --git a/include/llvm/ADT/SetVector.h b/include/llvm/ADT/SetVector.h
index 13378aa3a04e..04ed52fc543f 100644
--- a/include/llvm/ADT/SetVector.h
+++ b/include/llvm/ADT/SetVector.h
@@ -40,17 +40,17 @@ template <typename T, typename Vector = std::vector<T>,
typename Set = DenseSet<T>>
class SetVector {
public:
- typedef T value_type;
- typedef T key_type;
- typedef T& reference;
- typedef const T& const_reference;
- typedef Set set_type;
- typedef Vector vector_type;
- typedef typename vector_type::const_iterator iterator;
- typedef typename vector_type::const_iterator const_iterator;
- typedef typename vector_type::const_reverse_iterator reverse_iterator;
- typedef typename vector_type::const_reverse_iterator const_reverse_iterator;
- typedef typename vector_type::size_type size_type;
+ using value_type = T;
+ using key_type = T;
+ using reference = T&;
+ using const_reference = const T&;
+ using set_type = Set;
+ using vector_type = Vector;
+ using iterator = typename vector_type::const_iterator;
+ using const_iterator = typename vector_type::const_iterator;
+ using reverse_iterator = typename vector_type::const_reverse_iterator;
+ using const_reverse_iterator = typename vector_type::const_reverse_iterator;
+ using size_type = typename vector_type::size_type;
/// \brief Construct an empty SetVector
SetVector() = default;
diff --git a/include/llvm/ADT/SmallBitVector.h b/include/llvm/ADT/SmallBitVector.h
index 0eeacc162543..0ff427066959 100644
--- a/include/llvm/ADT/SmallBitVector.h
+++ b/include/llvm/ADT/SmallBitVector.h
@@ -134,6 +134,19 @@ private:
}
public:
+ typedef const_set_bits_iterator_impl<SmallBitVector> const_set_bits_iterator;
+ typedef const_set_bits_iterator set_iterator;
+
+ const_set_bits_iterator set_bits_begin() const {
+ return const_set_bits_iterator(*this);
+ }
+ const_set_bits_iterator set_bits_end() const {
+ return const_set_bits_iterator(*this, -1);
+ }
+ iterator_range<const_set_bits_iterator> set_bits() const {
+ return make_range(set_bits_begin(), set_bits_end());
+ }
+
/// Creates an empty bitvector.
SmallBitVector() : X(1) {}
diff --git a/include/llvm/ADT/SmallPtrSet.h b/include/llvm/ADT/SmallPtrSet.h
index 196ab6338047..b49d216e0b6e 100644
--- a/include/llvm/ADT/SmallPtrSet.h
+++ b/include/llvm/ADT/SmallPtrSet.h
@@ -27,15 +27,13 @@
#include <iterator>
#include <utility>
-#if LLVM_ENABLE_ABI_BREAKING_CHECKS
namespace llvm {
+
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS
template <class T = void> struct ReverseIterate { static bool value; };
template <class T> bool ReverseIterate<T>::value = false;
-}
#endif
-namespace llvm {
-
/// SmallPtrSetImplBase - This is the common code shared among all the
/// SmallPtrSet<>'s, which is almost everything. SmallPtrSet has two modes, one
/// for small and one for large sets.
@@ -92,7 +90,7 @@ protected:
}
public:
- typedef unsigned size_type;
+ using size_type = unsigned;
SmallPtrSetImplBase &operator=(const SmallPtrSetImplBase &) = delete;
@@ -273,14 +271,14 @@ protected:
/// SmallPtrSetIterator - This implements a const_iterator for SmallPtrSet.
template<typename PtrTy>
class SmallPtrSetIterator : public SmallPtrSetIteratorImpl {
- typedef PointerLikeTypeTraits<PtrTy> PtrTraits;
+ using PtrTraits = PointerLikeTypeTraits<PtrTy>;
public:
- typedef PtrTy value_type;
- typedef PtrTy reference;
- typedef PtrTy pointer;
- typedef std::ptrdiff_t difference_type;
- typedef std::forward_iterator_tag iterator_category;
+ using value_type = PtrTy;
+ using reference = PtrTy;
+ using pointer = PtrTy;
+ using difference_type = std::ptrdiff_t;
+ using iterator_category = std::forward_iterator_tag;
explicit SmallPtrSetIterator(const void *const *BP, const void *const *E)
: SmallPtrSetIteratorImpl(BP, E) {}
@@ -351,8 +349,8 @@ struct RoundUpToPowerOfTwo {
template <typename PtrType>
class SmallPtrSetImpl : public SmallPtrSetImplBase {
using ConstPtrType = typename add_const_past_pointer<PtrType>::type;
- typedef PointerLikeTypeTraits<PtrType> PtrTraits;
- typedef PointerLikeTypeTraits<ConstPtrType> ConstPtrTraits;
+ using PtrTraits = PointerLikeTypeTraits<PtrType>;
+ using ConstPtrTraits = PointerLikeTypeTraits<ConstPtrType>;
protected:
// Constructors that forward to the base.
@@ -365,8 +363,8 @@ protected:
: SmallPtrSetImplBase(SmallStorage, SmallSize) {}
public:
- typedef SmallPtrSetIterator<PtrType> iterator;
- typedef SmallPtrSetIterator<PtrType> const_iterator;
+ using iterator = SmallPtrSetIterator<PtrType>;
+ using const_iterator = SmallPtrSetIterator<PtrType>;
SmallPtrSetImpl(const SmallPtrSetImpl &) = delete;
@@ -431,7 +429,7 @@ class SmallPtrSet : public SmallPtrSetImpl<PtrType> {
// DenseSet<> instead if you expect many elements in the set.
static_assert(SmallSize <= 32, "SmallSize should be small");
- typedef SmallPtrSetImpl<PtrType> BaseT;
+ using BaseT = SmallPtrSetImpl<PtrType>;
// Make sure that SmallSize is a power of two, round up if not.
enum { SmallSizePowTwo = RoundUpToPowerOfTwo<SmallSize>::Val };
diff --git a/include/llvm/ADT/SmallVector.h b/include/llvm/ADT/SmallVector.h
index b9588214023c..bd24eab93b50 100644
--- a/include/llvm/ADT/SmallVector.h
+++ b/include/llvm/ADT/SmallVector.h
@@ -71,7 +71,7 @@ private:
// Allocate raw space for N elements of type T. If T has a ctor or dtor, we
// don't want it to be automatically run, so we need to represent the space as
// something else. Use an array of char of sufficient alignment.
- typedef AlignedCharArrayUnion<T> U;
+ using U = AlignedCharArrayUnion<T>;
U FirstEl;
// Space after 'FirstEl' is clobbered, do not add any instance vars after it.
@@ -96,19 +96,19 @@ protected:
void setEnd(T *P) { this->EndX = P; }
public:
- typedef size_t size_type;
- typedef ptrdiff_t difference_type;
- typedef T value_type;
- typedef T *iterator;
- typedef const T *const_iterator;
+ using size_type = size_t;
+ using difference_type = ptrdiff_t;
+ using value_type = T;
+ using iterator = T *;
+ using const_iterator = const T *;
- typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
- typedef std::reverse_iterator<iterator> reverse_iterator;
+ using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+ using reverse_iterator = std::reverse_iterator<iterator>;
- typedef T &reference;
- typedef const T &const_reference;
- typedef T *pointer;
- typedef const T *const_pointer;
+ using reference = T &;
+ using const_reference = const T &;
+ using pointer = T *;
+ using const_pointer = const T *;
// forward iterator creation methods.
LLVM_ATTRIBUTE_ALWAYS_INLINE
@@ -319,12 +319,12 @@ public:
/// reduce code duplication based on the SmallVector 'N' template parameter.
template <typename T>
class SmallVectorImpl : public SmallVectorTemplateBase<T, isPodLike<T>::value> {
- typedef SmallVectorTemplateBase<T, isPodLike<T>::value > SuperClass;
+ using SuperClass = SmallVectorTemplateBase<T, isPodLike<T>::value>;
public:
- typedef typename SuperClass::iterator iterator;
- typedef typename SuperClass::const_iterator const_iterator;
- typedef typename SuperClass::size_type size_type;
+ using iterator = typename SuperClass::iterator;
+ using const_iterator = typename SuperClass::const_iterator;
+ using size_type = typename SuperClass::size_type;
protected:
// Default ctor - Initialize to empty.
@@ -845,8 +845,7 @@ class SmallVector : public SmallVectorImpl<T> {
SmallVectorStorage<T, N> Storage;
public:
- SmallVector() : SmallVectorImpl<T>(N) {
- }
+ SmallVector() : SmallVectorImpl<T>(N) {}
explicit SmallVector(size_t Size, const T &Value = T())
: SmallVectorImpl<T>(N) {
@@ -883,16 +882,16 @@ public:
SmallVectorImpl<T>::operator=(::std::move(RHS));
}
- const SmallVector &operator=(SmallVector &&RHS) {
- SmallVectorImpl<T>::operator=(::std::move(RHS));
- return *this;
- }
-
SmallVector(SmallVectorImpl<T> &&RHS) : SmallVectorImpl<T>(N) {
if (!RHS.empty())
SmallVectorImpl<T>::operator=(::std::move(RHS));
}
+ const SmallVector &operator=(SmallVector &&RHS) {
+ SmallVectorImpl<T>::operator=(::std::move(RHS));
+ return *this;
+ }
+
const SmallVector &operator=(SmallVectorImpl<T> &&RHS) {
SmallVectorImpl<T>::operator=(::std::move(RHS));
return *this;
diff --git a/include/llvm/ADT/SparseBitVector.h b/include/llvm/ADT/SparseBitVector.h
index a82cef6028f9..4cbf40c76805 100644
--- a/include/llvm/ADT/SparseBitVector.h
+++ b/include/llvm/ADT/SparseBitVector.h
@@ -1,4 +1,4 @@
-//===- llvm/ADT/SparseBitVector.h - Efficient Sparse BitVector -*- C++ -*- ===//
+//===- llvm/ADT/SparseBitVector.h - Efficient Sparse BitVector --*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -41,8 +41,8 @@ namespace llvm {
template <unsigned ElementSize = 128> struct SparseBitVectorElement {
public:
- typedef unsigned long BitWord;
- typedef unsigned size_type;
+ using BitWord = unsigned long;
+ using size_type = unsigned;
enum {
BITWORD_SIZE = sizeof(BitWord) * CHAR_BIT,
BITWORDS_PER_ELEMENT = (ElementSize + BITWORD_SIZE - 1) / BITWORD_SIZE,
@@ -100,7 +100,7 @@ public:
Bits[Idx / BITWORD_SIZE] |= 1L << (Idx % BITWORD_SIZE);
}
- bool test_and_set (unsigned Idx) {
+ bool test_and_set(unsigned Idx) {
bool old = test(Idx);
if (!old) {
set(Idx);
@@ -254,9 +254,9 @@ public:
template <unsigned ElementSize = 128>
class SparseBitVector {
- typedef std::list<SparseBitVectorElement<ElementSize>> ElementList;
- typedef typename ElementList::iterator ElementListIter;
- typedef typename ElementList::const_iterator ElementListConstIter;
+ using ElementList = std::list<SparseBitVectorElement<ElementSize>>;
+ using ElementListIter = typename ElementList::iterator;
+ using ElementListConstIter = typename ElementList::const_iterator;
enum {
BITWORD_SIZE = SparseBitVectorElement<ElementSize>::BITWORD_SIZE
};
@@ -421,14 +421,12 @@ class SparseBitVector {
};
public:
- typedef SparseBitVectorIterator iterator;
+ using iterator = SparseBitVectorIterator;
SparseBitVector() {
CurrElementIter = Elements.begin();
}
- ~SparseBitVector() = default;
-
// SparseBitVector copy ctor.
SparseBitVector(const SparseBitVector &RHS) {
ElementListConstIter ElementIter = RHS.Elements.begin();
@@ -440,6 +438,8 @@ public:
CurrElementIter = Elements.begin ();
}
+ ~SparseBitVector() = default;
+
// Clear.
void clear() {
Elements.clear();
diff --git a/include/llvm/ADT/SparseMultiSet.h b/include/llvm/ADT/SparseMultiSet.h
index 08da4b68ebaa..b3a413aa3aa5 100644
--- a/include/llvm/ADT/SparseMultiSet.h
+++ b/include/llvm/ADT/SparseMultiSet.h
@@ -1,4 +1,4 @@
-//===--- llvm/ADT/SparseMultiSet.h - Sparse multiset ------------*- C++ -*-===//
+//===- llvm/ADT/SparseMultiSet.h - Sparse multiset --------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -101,7 +101,7 @@ class SparseMultiSet {
unsigned Prev;
unsigned Next;
- SMSNode(ValueT D, unsigned P, unsigned N) : Data(D), Prev(P), Next(N) { }
+ SMSNode(ValueT D, unsigned P, unsigned N) : Data(D), Prev(P), Next(N) {}
/// List tails have invalid Nexts.
bool isTail() const {
@@ -118,8 +118,8 @@ class SparseMultiSet {
bool isValid() const { return Prev != INVALID; }
};
- typedef typename KeyFunctorT::argument_type KeyT;
- typedef SmallVector<SMSNode, 8> DenseT;
+ using KeyT = typename KeyFunctorT::argument_type;
+ using DenseT = SmallVector<SMSNode, 8>;
DenseT Dense;
SparseT *Sparse = nullptr;
unsigned Universe = 0;
@@ -183,12 +183,12 @@ class SparseMultiSet {
}
public:
- typedef ValueT value_type;
- typedef ValueT &reference;
- typedef const ValueT &const_reference;
- typedef ValueT *pointer;
- typedef const ValueT *const_pointer;
- typedef unsigned size_type;
+ using value_type = ValueT;
+ using reference = ValueT &;
+ using const_reference = const ValueT &;
+ using pointer = ValueT *;
+ using const_pointer = const ValueT *;
+ using size_type = unsigned;
SparseMultiSet() = default;
SparseMultiSet(const SparseMultiSet &) = delete;
@@ -227,7 +227,7 @@ public:
unsigned SparseIdx;
iterator_base(SMSPtrTy P, unsigned I, unsigned SI)
- : SMS(P), Idx(I), SparseIdx(SI) { }
+ : SMS(P), Idx(I), SparseIdx(SI) {}
/// Whether our iterator has fallen outside our dense vector.
bool isEnd() const {
@@ -248,11 +248,11 @@ public:
void setNext(unsigned N) { SMS->Dense[Idx].Next = N; }
public:
- typedef std::iterator<std::bidirectional_iterator_tag, ValueT> super;
- typedef typename super::value_type value_type;
- typedef typename super::difference_type difference_type;
- typedef typename super::pointer pointer;
- typedef typename super::reference reference;
+ using super = std::iterator<std::bidirectional_iterator_tag, ValueT>;
+ using value_type = typename super::value_type;
+ using difference_type = typename super::difference_type;
+ using pointer = typename super::pointer;
+ using reference = typename super::reference;
reference operator*() const {
assert(isKeyed() && SMS->sparseIndex(SMS->Dense[Idx].Data) == SparseIdx &&
@@ -308,11 +308,12 @@ public:
return I;
}
};
- typedef iterator_base<SparseMultiSet *> iterator;
- typedef iterator_base<const SparseMultiSet *> const_iterator;
+
+ using iterator = iterator_base<SparseMultiSet *>;
+ using const_iterator = iterator_base<const SparseMultiSet *>;
// Convenience types
- typedef std::pair<iterator, iterator> RangePair;
+ using RangePair = std::pair<iterator, iterator>;
/// Returns an iterator past this container. Note that such an iterator cannot
/// be decremented, but will compare equal to other end iterators.
diff --git a/include/llvm/ADT/SparseSet.h b/include/llvm/ADT/SparseSet.h
index 00c18c743219..25ade8831922 100644
--- a/include/llvm/ADT/SparseSet.h
+++ b/include/llvm/ADT/SparseSet.h
@@ -1,4 +1,4 @@
-//===--- llvm/ADT/SparseSet.h - Sparse set ----------------------*- C++ -*-===//
+//===- llvm/ADT/SparseSet.h - Sparse set ------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -125,9 +125,9 @@ class SparseSet {
!std::numeric_limits<SparseT>::is_signed,
"SparseT must be an unsigned integer type");
- typedef typename KeyFunctorT::argument_type KeyT;
- typedef SmallVector<ValueT, 8> DenseT;
- typedef unsigned size_type;
+ using KeyT = typename KeyFunctorT::argument_type;
+ using DenseT = SmallVector<ValueT, 8>;
+ using size_type = unsigned;
DenseT Dense;
SparseT *Sparse = nullptr;
unsigned Universe = 0;
@@ -135,11 +135,11 @@ class SparseSet {
SparseSetValFunctor<KeyT, ValueT, KeyFunctorT> ValIndexOf;
public:
- typedef ValueT value_type;
- typedef ValueT &reference;
- typedef const ValueT &const_reference;
- typedef ValueT *pointer;
- typedef const ValueT *const_pointer;
+ using value_type = ValueT;
+ using reference = ValueT &;
+ using const_reference = const ValueT &;
+ using pointer = ValueT *;
+ using const_pointer = const ValueT *;
SparseSet() = default;
SparseSet(const SparseSet &) = delete;
@@ -168,8 +168,8 @@ public:
}
// Import trivial vector stuff from DenseT.
- typedef typename DenseT::iterator iterator;
- typedef typename DenseT::const_iterator const_iterator;
+ using iterator = typename DenseT::iterator;
+ using const_iterator = typename DenseT::const_iterator;
const_iterator begin() const { return Dense.begin(); }
const_iterator end() const { return Dense.end(); }
diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h
index 1c109be3bab3..e22a3f688c40 100644
--- a/include/llvm/ADT/StringExtras.h
+++ b/include/llvm/ADT/StringExtras.h
@@ -1,4 +1,4 @@
-//===-- llvm/ADT/StringExtras.h - Useful string functions -------*- C++ -*-===//
+//===- llvm/ADT/StringExtras.h - Useful string functions --------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,12 +15,18 @@
#define LLVM_ADT_STRINGEXTRAS_H
#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/DataTypes.h"
#include <iterator>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <string>
+#include <utility>
namespace llvm {
-class raw_ostream;
+
template<typename T> class SmallVectorImpl;
+class raw_ostream;
/// hexdigit - Return the hexadecimal character for the
/// given number \p X (which should be less than 16).
@@ -128,7 +134,6 @@ static inline std::string utostr(uint64_t X, bool isNeg = false) {
return std::string(BufPtr, std::end(Buffer));
}
-
static inline std::string itostr(int64_t X) {
if (X < 0)
return utostr(static_cast<uint64_t>(-X), true);
@@ -261,13 +266,14 @@ template <typename A1, typename... Args>
inline size_t join_items_size(const A1 &A, Args &&... Items) {
return join_one_item_size(A) + join_items_size(std::forward<Args>(Items)...);
}
-}
+
+} // end namespace detail
/// Joins the strings in the range [Begin, End), adding Separator between
/// the elements.
template <typename IteratorT>
inline std::string join(IteratorT Begin, IteratorT End, StringRef Separator) {
- typedef typename std::iterator_traits<IteratorT>::iterator_category tag;
+ using tag = typename std::iterator_traits<IteratorT>::iterator_category;
return detail::join_impl(Begin, End, Separator, tag());
}
@@ -295,6 +301,6 @@ inline std::string join_items(Sep Separator, Args &&... Items) {
return Result;
}
-} // End llvm namespace
+} // end namespace llvm
-#endif
+#endif // LLVM_ADT_STRINGEXTRAS_H
diff --git a/include/llvm/ADT/StringMap.h b/include/llvm/ADT/StringMap.h
index c36fda7d6906..d573148665a1 100644
--- a/include/llvm/ADT/StringMap.h
+++ b/include/llvm/ADT/StringMap.h
@@ -1,4 +1,4 @@
-//===--- StringMap.h - String Hash table map interface ----------*- C++ -*-===//
+//===- StringMap.h - String Hash table map interface ------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -16,25 +16,23 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/PointerLikeTypeTraits.h"
+#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <initializer_list>
-#include <new>
+#include <iterator>
#include <utility>
namespace llvm {
- template<typename ValueT>
- class StringMapConstIterator;
- template<typename ValueT>
- class StringMapIterator;
- template <typename ValueT> class StringMapKeyIterator;
- template<typename ValueTy>
- class StringMapEntry;
+template<typename ValueTy> class StringMapConstIterator;
+template<typename ValueTy> class StringMapIterator;
+template<typename ValueTy> class StringMapKeyIterator;
/// StringMapEntryBase - Shared base class of StringMapEntry instances.
class StringMapEntryBase {
@@ -53,17 +51,15 @@ protected:
// Array of NumBuckets pointers to entries, null pointers are holes.
// TheTable[NumBuckets] contains a sentinel value for easy iteration. Followed
// by an array of the actual hash values as unsigned integers.
- StringMapEntryBase **TheTable;
- unsigned NumBuckets;
- unsigned NumItems;
- unsigned NumTombstones;
+ StringMapEntryBase **TheTable = nullptr;
+ unsigned NumBuckets = 0;
+ unsigned NumItems = 0;
+ unsigned NumTombstones = 0;
unsigned ItemSize;
protected:
explicit StringMapImpl(unsigned itemSize)
- : TheTable(nullptr),
- // Initialize the map with zero buckets to allocation.
- NumBuckets(0), NumItems(0), NumTombstones(0), ItemSize(itemSize) {}
+ : ItemSize(itemSize) {}
StringMapImpl(StringMapImpl &&RHS)
: TheTable(RHS.TheTable), NumBuckets(RHS.NumBuckets),
NumItems(RHS.NumItems), NumTombstones(RHS.NumTombstones),
@@ -225,9 +221,10 @@ class StringMap : public StringMapImpl {
AllocatorTy Allocator;
public:
- typedef StringMapEntry<ValueTy> MapEntryTy;
+ using MapEntryTy = StringMapEntry<ValueTy>;
StringMap() : StringMapImpl(static_cast<unsigned>(sizeof(MapEntryTy))) {}
+
explicit StringMap(unsigned InitialSize)
: StringMapImpl(InitialSize, static_cast<unsigned>(sizeof(MapEntryTy))) {}
@@ -248,12 +245,6 @@ public:
StringMap(StringMap &&RHS)
: StringMapImpl(std::move(RHS)), Allocator(std::move(RHS.Allocator)) {}
- StringMap &operator=(StringMap RHS) {
- StringMapImpl::swap(RHS);
- std::swap(Allocator, RHS.Allocator);
- return *this;
- }
-
StringMap(const StringMap &RHS) :
StringMapImpl(static_cast<unsigned>(sizeof(MapEntryTy))),
Allocator(RHS.Allocator) {
@@ -289,16 +280,37 @@ public:
// not worthwhile.
}
+ StringMap &operator=(StringMap RHS) {
+ StringMapImpl::swap(RHS);
+ std::swap(Allocator, RHS.Allocator);
+ return *this;
+ }
+
+ ~StringMap() {
+ // Delete all the elements in the map, but don't reset the elements
+ // to default values. This is a copy of clear(), but avoids unnecessary
+ // work not required in the destructor.
+ if (!empty()) {
+ for (unsigned I = 0, E = NumBuckets; I != E; ++I) {
+ StringMapEntryBase *Bucket = TheTable[I];
+ if (Bucket && Bucket != getTombstoneVal()) {
+ static_cast<MapEntryTy*>(Bucket)->Destroy(Allocator);
+ }
+ }
+ }
+ free(TheTable);
+ }
+
AllocatorTy &getAllocator() { return Allocator; }
const AllocatorTy &getAllocator() const { return Allocator; }
- typedef const char* key_type;
- typedef ValueTy mapped_type;
- typedef StringMapEntry<ValueTy> value_type;
- typedef size_t size_type;
+ using key_type = const char*;
+ using mapped_type = ValueTy;
+ using value_type = StringMapEntry<ValueTy>;
+ using size_type = size_t;
- typedef StringMapConstIterator<ValueTy> const_iterator;
- typedef StringMapIterator<ValueTy> iterator;
+ using const_iterator = StringMapConstIterator<ValueTy>;
+ using iterator = StringMapIterator<ValueTy>;
iterator begin() {
return iterator(TheTable, NumBuckets == 0);
@@ -313,7 +325,7 @@ public:
return const_iterator(TheTable+NumBuckets, true);
}
- llvm::iterator_range<StringMapKeyIterator<ValueTy>> keys() const {
+ iterator_range<StringMapKeyIterator<ValueTy>> keys() const {
return make_range(StringMapKeyIterator<ValueTy>(begin()),
StringMapKeyIterator<ValueTy>(end()));
}
@@ -433,21 +445,6 @@ public:
erase(I);
return true;
}
-
- ~StringMap() {
- // Delete all the elements in the map, but don't reset the elements
- // to default values. This is a copy of clear(), but avoids unnecessary
- // work not required in the destructor.
- if (!empty()) {
- for (unsigned I = 0, E = NumBuckets; I != E; ++I) {
- StringMapEntryBase *Bucket = TheTable[I];
- if (Bucket && Bucket != getTombstoneVal()) {
- static_cast<MapEntryTy*>(Bucket)->Destroy(Allocator);
- }
- }
- }
- free(TheTable);
- }
};
template <typename DerivedTy, typename ValueTy>
@@ -542,7 +539,6 @@ class StringMapKeyIterator
public:
StringMapKeyIterator() = default;
-
explicit StringMapKeyIterator(StringMapConstIterator<ValueTy> Iter)
: base(std::move(Iter)) {}
diff --git a/include/llvm/ADT/StringRef.h b/include/llvm/ADT/StringRef.h
index ce48f6d3bad3..4b25f56432df 100644
--- a/include/llvm/ADT/StringRef.h
+++ b/include/llvm/ADT/StringRef.h
@@ -1,4 +1,4 @@
-//===--- StringRef.h - Constant String Reference Wrapper --------*- C++ -*-===//
+//===- StringRef.h - Constant String Reference Wrapper ----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,16 +15,18 @@
#include "llvm/Support/Compiler.h"
#include <algorithm>
#include <cassert>
+#include <cstddef>
#include <cstring>
#include <limits>
+#include <type_traits>
#include <string>
#include <utility>
namespace llvm {
- template <typename T>
- class SmallVectorImpl;
+
class APInt;
class hash_code;
+ template <typename T> class SmallVectorImpl;
class StringRef;
/// Helper functions for StringRef::getAsInteger.
@@ -46,10 +48,11 @@ namespace llvm {
/// general safe to store a StringRef.
class StringRef {
public:
- typedef const char *iterator;
- typedef const char *const_iterator;
static const size_t npos = ~size_t(0);
- typedef size_t size_type;
+
+ using iterator = const char *;
+ using const_iterator = const char *;
+ using size_type = size_t;
private:
/// The start of the string, in an external buffer.
@@ -906,6 +909,7 @@ namespace llvm {
// StringRefs can be treated like a POD type.
template <typename T> struct isPodLike;
template <> struct isPodLike<StringRef> { static const bool value = true; };
-}
-#endif
+} // end namespace llvm
+
+#endif // LLVM_ADT_STRINGREF_H
diff --git a/include/llvm/ADT/StringSet.h b/include/llvm/ADT/StringSet.h
index c32c2a497438..9af44c07df79 100644
--- a/include/llvm/ADT/StringSet.h
+++ b/include/llvm/ADT/StringSet.h
@@ -1,4 +1,4 @@
-//===--- StringSet.h - The LLVM Compiler Driver -----------------*- C++ -*-===//
+//===- StringSet.h - The LLVM Compiler Driver -------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,13 +15,19 @@
#define LLVM_ADT_STRINGSET_H
#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include <cassert>
+#include <initializer_list>
+#include <utility>
namespace llvm {
/// StringSet - A wrapper for StringMap that provides set-like functionality.
- template <class AllocatorTy = llvm::MallocAllocator>
- class StringSet : public llvm::StringMap<char, AllocatorTy> {
- typedef llvm::StringMap<char, AllocatorTy> base;
+ template <class AllocatorTy = MallocAllocator>
+ class StringSet : public StringMap<char, AllocatorTy> {
+ using base = StringMap<char, AllocatorTy>;
+
public:
StringSet() = default;
StringSet(std::initializer_list<StringRef> S) {
@@ -40,6 +46,7 @@ namespace llvm {
base::insert(std::make_pair(*It, '\0'));
}
};
-}
+
+} // end namespace llvm
#endif // LLVM_ADT_STRINGSET_H
diff --git a/include/llvm/ADT/TinyPtrVector.h b/include/llvm/ADT/TinyPtrVector.h
index ca43b6046193..79740713f75b 100644
--- a/include/llvm/ADT/TinyPtrVector.h
+++ b/include/llvm/ADT/TinyPtrVector.h
@@ -30,9 +30,9 @@ namespace llvm {
template <typename EltTy>
class TinyPtrVector {
public:
- typedef SmallVector<EltTy, 4> VecTy;
- typedef typename VecTy::value_type value_type;
- typedef PointerUnion<EltTy, VecTy *> PtrUnion;
+ using VecTy = SmallVector<EltTy, 4>;
+ using value_type = typename VecTy::value_type;
+ using PtrUnion = PointerUnion<EltTy, VecTy *>;
private:
PtrUnion Val;
@@ -167,10 +167,10 @@ public:
return Val.template get<VecTy*>()->size();
}
- typedef EltTy *iterator;
- typedef const EltTy *const_iterator;
- typedef std::reverse_iterator<iterator> reverse_iterator;
- typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+ using iterator = EltTy *;
+ using const_iterator = const EltTy *;
+ using reverse_iterator = std::reverse_iterator<iterator>;
+ using const_reverse_iterator = std::reverse_iterator<const_iterator>;
iterator begin() {
if (Val.template is<EltTy>())
diff --git a/include/llvm/ADT/UniqueVector.h b/include/llvm/ADT/UniqueVector.h
index e1ab4b56023f..b17fb2392baf 100644
--- a/include/llvm/ADT/UniqueVector.h
+++ b/include/llvm/ADT/UniqueVector.h
@@ -1,4 +1,4 @@
-//===-- llvm/ADT/UniqueVector.h ---------------------------------*- C++ -*-===//
+//===- llvm/ADT/UniqueVector.h ----------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -24,16 +24,15 @@ namespace llvm {
/// Entries can be fetched using operator[] with the entry ID.
template<class T> class UniqueVector {
public:
- typedef typename std::vector<T> VectorType;
- typedef typename VectorType::iterator iterator;
- typedef typename VectorType::const_iterator const_iterator;
+ using VectorType = typename std::vector<T>;
+ using iterator = typename VectorType::iterator;
+ using const_iterator = typename VectorType::const_iterator;
private:
// Map - Used to handle the correspondence of entry to ID.
std::map<T, unsigned> Map;
// Vector - ID ordered vector of entries. Entries can be indexed by ID - 1.
- //
VectorType Vector;
public:
@@ -68,7 +67,6 @@ public:
}
/// operator[] - Returns a reference to the entry with the specified ID.
- ///
const T &operator[](unsigned ID) const {
assert(ID-1 < size() && "ID is 0 or out of range!");
return Vector[ID - 1];
@@ -87,21 +85,18 @@ public:
const_iterator end() const { return Vector.end(); }
/// size - Returns the number of entries in the vector.
- ///
size_t size() const { return Vector.size(); }
/// empty - Returns true if the vector is empty.
- ///
bool empty() const { return Vector.empty(); }
/// reset - Clears all the entries.
- ///
void reset() {
Map.clear();
Vector.resize(0, 0);
}
};
-} // End of namespace llvm
+} // end namespace llvm
#endif // LLVM_ADT_UNIQUEVECTOR_H
diff --git a/include/llvm/Analysis/ProfileSummaryInfo.h b/include/llvm/Analysis/ProfileSummaryInfo.h
index c5f97083af4d..6aaabe1d1889 100644
--- a/include/llvm/Analysis/ProfileSummaryInfo.h
+++ b/include/llvm/Analysis/ProfileSummaryInfo.h
@@ -55,6 +55,21 @@ public:
ProfileSummaryInfo(ProfileSummaryInfo &&Arg)
: M(Arg.M), Summary(std::move(Arg.Summary)) {}
+ /// \brief Returns true if profile summary is available.
+ bool hasProfileSummary() { return computeSummary(); }
+
+ /// \brief Returns true if module \c M has sample profile.
+ bool hasSampleProfile() {
+ return hasProfileSummary() &&
+ Summary->getKind() == ProfileSummary::PSK_Sample;
+ }
+
+ /// \brief Returns true if module \c M has instrumentation profile.
+ bool hasInstrumentationProfile() {
+ return hasProfileSummary() &&
+ Summary->getKind() == ProfileSummary::PSK_Instr;
+ }
+
/// Handle the invalidation of this information.
///
/// When used as a result of \c ProfileSummaryAnalysis this method will be
diff --git a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h
index f3122f0bf7f0..6d9f345755ab 100644
--- a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h
+++ b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h
@@ -28,7 +28,7 @@ public:
Error visitTypeRecord(CVType &Record, TypeIndex Index);
Error visitTypeRecord(CVType &Record);
- Error visitMemberRecord(CVMemberRecord &Record);
+ Error visitMemberRecord(CVMemberRecord Record);
/// Visits the type records in Data. Sets the error flag on parse failures.
Error visitTypeStream(const CVTypeArray &Types);
@@ -47,6 +47,36 @@ private:
TinyPtrVector<TypeServerHandler *> Handlers;
};
+enum VisitorDataSource {
+ VDS_BytesPresent, // The record bytes are passed into the the visitation
+ // function. The algorithm should first deserialize them
+ // before passing them on through the pipeline.
+ VDS_BytesExternal // The record bytes are not present, and it is the
+ // responsibility of the visitor callback interface to
+ // supply the bytes.
+};
+
+Error visitTypeRecord(CVType &Record, TypeIndex Index,
+ TypeVisitorCallbacks &Callbacks,
+ VisitorDataSource Source = VDS_BytesPresent,
+ TypeServerHandler *TS = nullptr);
+Error visitTypeRecord(CVType &Record, TypeVisitorCallbacks &Callbacks,
+ VisitorDataSource Source = VDS_BytesPresent,
+ TypeServerHandler *TS = nullptr);
+
+Error visitMemberRecord(CVMemberRecord Record, TypeVisitorCallbacks &Callbacks,
+ VisitorDataSource Source = VDS_BytesPresent);
+Error visitMemberRecord(TypeLeafKind Kind, ArrayRef<uint8_t> Record,
+ TypeVisitorCallbacks &Callbacks);
+
+Error visitMemberRecordStream(ArrayRef<uint8_t> FieldList,
+ TypeVisitorCallbacks &Callbacks);
+
+Error visitTypeStream(const CVTypeArray &Types, TypeVisitorCallbacks &Callbacks,
+ TypeServerHandler *TS = nullptr);
+Error visitTypeStream(CVTypeRange Types, TypeVisitorCallbacks &Callbacks,
+ TypeServerHandler *TS = nullptr);
+
} // end namespace codeview
} // end namespace llvm
diff --git a/include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h b/include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h
index 35a8010f1163..21288df89be2 100644
--- a/include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h
+++ b/include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h
@@ -11,13 +11,10 @@
#define LLVM_DEBUGINFO_CODEVIEW_RANDOMACCESSTYPEVISITOR_H
#include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
-#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
#include "llvm/DebugInfo/CodeView/TypeIndex.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
#include "llvm/Support/Error.h"
namespace llvm {
@@ -73,18 +70,6 @@ private:
/// The database visitor which adds new records to the database.
TypeDatabaseVisitor DatabaseVisitor;
- /// The deserializer which deserializes new records.
- TypeDeserializer Deserializer;
-
- /// The visitation callback pipeline to use. By default this contains a
- /// deserializer and a type database visitor. But the callback specified
- /// in the constructor is also added.
- TypeVisitorCallbackPipeline Pipeline;
-
- /// The visitor used to visit the internal pipeline for deserialization and
- /// database maintenance.
- CVTypeVisitor InternalVisitor;
-
/// A vector mapping type indices to type offset. For every record that has
/// been visited, contains the absolute offset of that record in the record
/// array.
diff --git a/include/llvm/DebugInfo/DWARF/DWARFAttribute.h b/include/llvm/DebugInfo/DWARF/DWARFAttribute.h
index 5919aaddea40..c3953b62d780 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFAttribute.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFAttribute.h
@@ -31,10 +31,10 @@ struct DWARFAttribute {
dwarf::Attribute Attr;
/// The form and value for this attribute.
DWARFFormValue Value;
-
+
DWARFAttribute(uint32_t O, dwarf::Attribute A = dwarf::Attribute(0),
dwarf::Form F = dwarf::Form(0)) : Attr(A), Value(F) {}
-
+
bool isValid() const {
return Offset != 0 && Attr != dwarf::Attribute(0);
}
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h b/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h
index 40eb7e9a8836..2d82104ea098 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h
@@ -22,19 +22,19 @@ class raw_ostream;
class DWARFDebugArangeSet {
public:
struct Header {
- // The total length of the entries for that set, not including the length
- // field itself.
+ /// The total length of the entries for that set, not including the length
+ /// field itself.
uint32_t Length;
- // The offset from the beginning of the .debug_info section of the
- // compilation unit entry referenced by the table.
+ /// The offset from the beginning of the .debug_info section of the
+ /// compilation unit entry referenced by the table.
uint32_t CuOffset;
- // The DWARF version number.
+ /// The DWARF version number.
uint16_t Version;
- // The size in bytes of an address on the target architecture. For segmented
- // addressing, this is the size of the offset portion of the address.
+ /// The size in bytes of an address on the target architecture. For segmented
+ /// addressing, this is the size of the offset portion of the address.
uint8_t AddrSize;
- // The size in bytes of a segment descriptor on the target architecture.
- // If the target system uses a flat address space, this value is 0.
+ /// The size in bytes of a segment descriptor on the target architecture.
+ /// If the target system uses a flat address space, this value is 0.
uint8_t SegSize;
};
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h b/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h
index c06771d6afb4..2237aa361d18 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h
@@ -28,7 +28,7 @@ private:
void clear();
void extract(DataExtractor DebugArangesData);
- // Call appendRange multiple times and then call construct.
+ /// Call appendRange multiple times and then call construct.
void appendRange(uint32_t CUOffset, uint64_t LowPC, uint64_t HighPC);
void construct();
@@ -58,9 +58,9 @@ private:
return LowPC < other.LowPC;
}
- uint64_t LowPC; // Start of address range.
- uint32_t Length; // End of address range (not including this address).
- uint32_t CUOffset; // Offset of the compile unit or die.
+ uint64_t LowPC; /// Start of address range.
+ uint32_t Length; /// End of address range (not including this address).
+ uint32_t CUOffset; /// Offset of the compile unit or die.
};
struct RangeEndpoint {
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
index 23a573b7a9fa..95ec1be62a79 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
@@ -33,31 +33,31 @@ typedef std::vector<DWARFAddressRange> DWARFAddressRangesVector;
class DWARFDebugRangeList {
public:
struct RangeListEntry {
- // A beginning address offset. This address offset has the size of an
- // address and is relative to the applicable base address of the
- // compilation unit referencing this range list. It marks the beginning
- // of an address range.
+ /// A beginning address offset. This address offset has the size of an
+ /// address and is relative to the applicable base address of the
+ /// compilation unit referencing this range list. It marks the beginning
+ /// of an address range.
uint64_t StartAddress;
- // An ending address offset. This address offset again has the size of
- // an address and is relative to the applicable base address of the
- // compilation unit referencing this range list. It marks the first
- // address past the end of the address range. The ending address must
- // be greater than or equal to the beginning address.
+ /// An ending address offset. This address offset again has the size of
+ /// an address and is relative to the applicable base address of the
+ /// compilation unit referencing this range list. It marks the first
+ /// address past the end of the address range. The ending address must
+ /// be greater than or equal to the beginning address.
uint64_t EndAddress;
- // The end of any given range list is marked by an end of list entry,
- // which consists of a 0 for the beginning address offset
- // and a 0 for the ending address offset.
+ /// The end of any given range list is marked by an end of list entry,
+ /// which consists of a 0 for the beginning address offset
+ /// and a 0 for the ending address offset.
bool isEndOfListEntry() const {
return (StartAddress == 0) && (EndAddress == 0);
}
- // A base address selection entry consists of:
- // 1. The value of the largest representable address offset
- // (for example, 0xffffffff when the size of an address is 32 bits).
- // 2. An address, which defines the appropriate base address for
- // use in interpreting the beginning and ending address offsets of
- // subsequent entries of the location list.
+ /// A base address selection entry consists of:
+ /// 1. The value of the largest representable address offset
+ /// (for example, 0xffffffff when the size of an address is 32 bits).
+ /// 2. An address, which defines the appropriate base address for
+ /// use in interpreting the beginning and ending address offsets of
+ /// subsequent entries of the location list.
bool isBaseAddressSelectionEntry(uint8_t AddressSize) const {
assert(AddressSize == 4 || AddressSize == 8);
if (AddressSize == 4)
@@ -68,7 +68,7 @@ public:
};
private:
- // Offset in .debug_ranges section.
+ /// Offset in .debug_ranges section.
uint32_t Offset;
uint8_t AddressSize;
std::vector<RangeListEntry> Entries;
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDie.h b/include/llvm/DebugInfo/DWARF/DWARFDie.h
index ee06125ea278..ca94a90fabfc 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDie.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDie.h
@@ -24,10 +24,10 @@
#include <iterator>
namespace llvm {
-
+
class DWARFUnit;
class raw_ostream;
-
+
//===----------------------------------------------------------------------===//
/// Utility class that carries the DWARF compile/type unit and the debug info
/// entry in an object.
@@ -47,7 +47,7 @@ class DWARFDie {
public:
DWARFDie() = default;
DWARFDie(DWARFUnit *Unit, const DWARFDebugInfoEntry * D) : U(Unit), Die(D) {}
-
+
bool isValid() const { return U && Die; }
explicit operator bool() const { return isValid(); }
const DWARFDebugInfoEntry *getDebugInfoEntry() const { return Die; }
@@ -68,7 +68,7 @@ public:
assert(isValid() && "must check validity prior to calling");
return Die->getOffset();
}
-
+
dwarf::Tag getTag() const {
auto AbbrevDecl = getAbbreviationDeclarationPtr();
if (AbbrevDecl)
@@ -80,7 +80,7 @@ public:
assert(isValid() && "must check validity prior to calling");
return Die->hasChildren();
}
-
+
/// Returns true for a valid DIE that terminates a sibling chain.
bool isNULL() const {
return getAbbreviationDeclarationPtr() == nullptr;
@@ -97,13 +97,13 @@ public:
/// \returns a valid DWARFDie instance if this object has a parent or an
/// invalid DWARFDie instance if it doesn't.
DWARFDie getParent() const;
-
+
/// Get the sibling of this DIE object.
///
/// \returns a valid DWARFDie instance if this object has a sibling or an
/// invalid DWARFDie instance if it doesn't.
DWARFDie getSibling() const;
-
+
/// Get the first child of this DIE object.
///
/// \returns a valid DWARFDie instance if this object has children or an
@@ -113,7 +113,7 @@ public:
return DWARFDie(U, Die + 1);
return DWARFDie();
}
-
+
/// Dump the DIE and all of its attributes to the supplied stream.
///
/// \param OS the stream to use for output.
@@ -121,7 +121,7 @@ public:
/// children.
/// \param indent the number of characters to indent each line that is output.
void dump(raw_ostream &OS, unsigned recurseDepth, unsigned indent = 0) const;
-
+
/// Extract the specified attribute from this DIE.
///
/// Extract an attribute value from this DIE only. This call doesn't look
@@ -132,7 +132,7 @@ public:
/// \returns an optional DWARFFormValue that will have the form value if the
/// attribute was successfully extracted.
Optional<DWARFFormValue> find(dwarf::Attribute Attr) const;
-
+
/// Extract the first value of any attribute in Attrs from this DIE.
///
/// Extract the first attribute that matches from this DIE only. This call
@@ -180,7 +180,7 @@ public:
///
/// \returns anm optional absolute section offset value for the attribute.
Optional<uint64_t> getRangesBaseAttribute() const;
-
+
/// Get the DW_AT_high_pc attribute value as an address.
///
/// In DWARF version 4 and later the high PC can be encoded as an offset from
@@ -196,7 +196,7 @@ public:
/// Retrieves DW_AT_low_pc and DW_AT_high_pc from CU.
/// Returns true if both attributes are present.
bool getLowAndHighPC(uint64_t &LowPC, uint64_t &HighPC) const;
-
+
/// Get the address ranges for this DIE.
///
/// Get the hi/low PC range if both attributes are available or exrtracts the
@@ -208,7 +208,7 @@ public:
/// \returns a address range vector that might be empty if no address range
/// information is available.
DWARFAddressRangesVector getAddressRanges() const;
-
+
/// Get all address ranges for any DW_TAG_subprogram DIEs in this DIE or any
/// of its children.
///
@@ -218,19 +218,19 @@ public:
///
/// \param Ranges the addres range vector to fill in.
void collectChildrenAddressRanges(DWARFAddressRangesVector &Ranges) const;
-
+
bool addressRangeContainsAddress(const uint64_t Address) const;
-
+
/// If a DIE represents a subprogram (or inlined subroutine), returns its
/// mangled name (or short name, if mangled is missing). This name may be
/// fetched from specification or abstract origin for this subprogram.
/// Returns null if no name is found.
const char *getSubroutineName(DINameKind Kind) const;
-
+
/// Return the DIE name resolving DW_AT_sepcification or DW_AT_abstract_origin
/// references if necessary. Returns null if no name is found.
const char *getName(DINameKind Kind) const;
-
+
/// Returns the declaration line (start line) for a DIE, assuming it specifies
/// a subprogram. This may be fetched from specification or abstract origin
/// for this subprogram by resolving DW_AT_sepcification or
@@ -251,21 +251,21 @@ public:
/// there is no DW_AT_GNU_discriminator attribute in this DIE.
void getCallerFrame(uint32_t &CallFile, uint32_t &CallLine,
uint32_t &CallColumn, uint32_t &CallDiscriminator) const;
-
+
class attribute_iterator;
/// Get an iterator range to all attributes in the current DIE only.
///
/// \returns an iterator range for the attributes of the current DIE.
iterator_range<attribute_iterator> attributes() const;
-
+
class iterator;
-
+
iterator begin() const;
iterator end() const;
iterator_range<iterator> children() const;
};
-
+
class DWARFDie::attribute_iterator :
public iterator_facade_base<attribute_iterator, std::forward_iterator_tag,
const DWARFAttribute> {
@@ -275,7 +275,7 @@ class DWARFDie::attribute_iterator :
DWARFAttribute AttrValue;
/// The attribute index within the abbreviation declaration in Die.
uint32_t Index;
-
+
/// Update the attribute index and attempt to read the attribute value. If the
/// attribute is able to be read, update AttrValue and the Index member
/// variable. If the attribute value is not able to be read, an appropriate
diff --git a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
index f3516ebdecba..a30e0be9c3c3 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
@@ -49,9 +49,9 @@ private:
const uint8_t *data = nullptr;
};
- dwarf::Form Form; // Form for this value.
- ValueType Value; // Contains all data for the form.
- const DWARFUnit *U = nullptr; // Remember the DWARFUnit at extract time.
+ dwarf::Form Form; /// Form for this value.
+ ValueType Value; /// Contains all data for the form.
+ const DWARFUnit *U = nullptr; /// Remember the DWARFUnit at extract time.
public:
DWARFFormValue(dwarf::Form F = dwarf::Form(0)) : Form(F) {}
@@ -72,11 +72,14 @@ public:
const DWARFUnit *getUnit() const { return U; }
void dump(raw_ostream &OS) const;
- /// \brief extracts a value in data at offset *offset_ptr.
+ /// Extracts a value in \p Data at offset \p *OffsetPtr.
///
/// The passed DWARFUnit is allowed to be nullptr, in which
/// case no relocation processing will be performed and some
/// kind of forms that depend on Unit information are disallowed.
+ /// \param Data The DataExtractor to use.
+ /// \param OffsetPtr The offset within DataExtractor where the data starts.
+ /// \param U The optional DWARFUnit supplying information for some forms.
/// \returns whether the extraction succeeded.
bool extractValue(const DataExtractor &Data, uint32_t *OffsetPtr,
const DWARFUnit *U);
diff --git a/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h b/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h
index 7a52218663b9..8d1ac5c83c23 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h
@@ -29,25 +29,25 @@ class DWARFGdbIndex {
uint32_t ConstantPoolOffset;
struct CompUnitEntry {
- uint64_t Offset; // Offset of a CU in the .debug_info section.
- uint64_t Length; // Length of that CU.
+ uint64_t Offset; /// Offset of a CU in the .debug_info section.
+ uint64_t Length; /// Length of that CU.
};
SmallVector<CompUnitEntry, 0> CuList;
struct AddressEntry {
- uint64_t LowAddress; // The low address.
- uint64_t HighAddress; // The high address.
- uint32_t CuIndex; // The CU index.
+ uint64_t LowAddress; /// The low address.
+ uint64_t HighAddress; /// The high address.
+ uint32_t CuIndex; /// The CU index.
};
SmallVector<AddressEntry, 0> AddressArea;
struct SymTableEntry {
- uint32_t NameOffset; // Offset of the symbol's name in the constant pool.
- uint32_t VecOffset; // Offset of the CU vector in the constant pool.
+ uint32_t NameOffset; /// Offset of the symbol's name in the constant pool.
+ uint32_t VecOffset; /// Offset of the CU vector in the constant pool.
};
SmallVector<SymTableEntry, 0> SymbolTable;
- // Each value is CU index + attributes.
+ /// Each value is CU index + attributes.
SmallVector<std::pair<uint32_t, SmallVector<uint32_t, 0>>, 0>
ConstantPoolVectors;
diff --git a/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h b/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
index f1e03bb4c2e1..ec0397a0fb09 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
@@ -17,15 +17,14 @@
namespace llvm {
struct RelocAddrEntry {
- uint8_t Width;
int64_t Value;
};
-// In place of applying the relocations to the data we've read from disk we use
-// a separate mapping table to the side and checking that at locations in the
-// dwarf where we expect relocated values. This adds a bit of complexity to the
-// dwarf parsing/extraction at the benefit of not allocating memory for the
-// entire size of the debug info sections.
+/// In place of applying the relocations to the data we've read from disk we use
+/// a separate mapping table to the side and checking that at locations in the
+/// dwarf where we expect relocated values. This adds a bit of complexity to the
+/// dwarf parsing/extraction at the benefit of not allocating memory for the
+/// entire size of the debug info sections.
typedef DenseMap<uint64_t, RelocAddrEntry> RelocAddrMap;
} // end namespace llvm
diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
index 68e541bac73c..c15e27f36a8b 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
@@ -111,7 +111,7 @@ private:
class DWARFUnit {
DWARFContext &Context;
- // Section containing this DWARFUnit.
+ /// Section containing this DWARFUnit.
const DWARFSection &InfoSection;
const DWARFDebugAbbrev *Abbrev;
@@ -133,12 +133,12 @@ class DWARFUnit {
uint8_t UnitType;
uint8_t AddrSize;
uint64_t BaseAddr;
- // The compile unit debug information entry items.
+ /// The compile unit debug information entry items.
std::vector<DWARFDebugInfoEntry> DieArray;
- // Map from range's start address to end address and corresponding DIE.
- // IntervalMap does not support range removal, as a result, we use the
- // std::map::upper_bound for address range lookup.
+ /// Map from range's start address to end address and corresponding DIE.
+ /// IntervalMap does not support range removal, as a result, we use the
+ /// std::map::upper_bound for address range lookup.
std::map<uint64_t, std::pair<uint64_t, DWARFDie>> AddrDieMap;
typedef iterator_range<std::vector<DWARFDebugInfoEntry>::iterator>
die_iterator_range;
@@ -189,7 +189,7 @@ public:
AddrOffsetSectionBase = Base;
}
- // Recursively update address to Die map.
+ /// Recursively update address to Die map.
void updateAddressDieMap(DWARFDie Die);
void setRangesSection(const DWARFSection *RS, uint32_t Base) {
diff --git a/include/llvm/DebugInfo/PDB/Native/TpiStream.h b/include/llvm/DebugInfo/PDB/Native/TpiStream.h
index 4579cbf4227b..c5549983ed43 100644
--- a/include/llvm/DebugInfo/PDB/Native/TpiStream.h
+++ b/include/llvm/DebugInfo/PDB/Native/TpiStream.h
@@ -51,6 +51,7 @@ public:
HashTable &getHashAdjusters();
codeview::CVTypeRange types(bool *HadError) const;
+ const codeview::CVTypeArray &typeArray() const { return TypeRecords; }
Error commit();
diff --git a/include/llvm/IR/IntrinsicsPowerPC.td b/include/llvm/IR/IntrinsicsPowerPC.td
index 64240a929782..6321bb81b8cb 100644
--- a/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1132,4 +1132,6 @@ def int_ppc_tsuspend : GCCBuiltin<"__builtin_tsuspend">,
def int_ppc_ttest : GCCBuiltin<"__builtin_ttest">,
Intrinsic<[llvm_i64_ty], [], []>;
+
+def int_ppc_cfence : Intrinsic<[], [llvm_anyint_ty], []>;
}
diff --git a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
index a06c67fe814c..071ec2edb538 100644
--- a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
+++ b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
@@ -62,6 +62,7 @@ def : GINodeEquiv<G_FMUL, fmul>;
def : GINodeEquiv<G_FDIV, fdiv>;
def : GINodeEquiv<G_FREM, frem>;
def : GINodeEquiv<G_FPOW, fpow>;
+def : GINodeEquiv<G_INTRINSIC, intrinsic_wo_chain>;
def : GINodeEquiv<G_BR, br>;
// Specifies the GlobalISel equivalents for SelectionDAG's ComplexPattern.
diff --git a/lib/Analysis/DependenceAnalysis.cpp b/lib/Analysis/DependenceAnalysis.cpp
index a4672efeedd6..e4d58bf1b4eb 100644
--- a/lib/Analysis/DependenceAnalysis.cpp
+++ b/lib/Analysis/DependenceAnalysis.cpp
@@ -2984,7 +2984,7 @@ bool DependenceInfo::propagate(const SCEV *&Src, const SCEV *&Dst,
SmallVectorImpl<Constraint> &Constraints,
bool &Consistent) {
bool Result = false;
- for (int LI = Loops.find_first(); LI >= 0; LI = Loops.find_next(LI)) {
+ for (unsigned LI : Loops.set_bits()) {
DEBUG(dbgs() << "\t Constraint[" << LI << "] is");
DEBUG(Constraints[LI].dump(dbgs()));
if (Constraints[LI].isDistance())
@@ -3266,7 +3266,7 @@ bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst,
// For debugging purposes, dump a small bit vector to dbgs().
static void dumpSmallBitVector(SmallBitVector &BV) {
dbgs() << "{";
- for (int VI = BV.find_first(); VI >= 0; VI = BV.find_next(VI)) {
+ for (unsigned VI : BV.set_bits()) {
dbgs() << VI;
if (BV.find_next(VI) >= 0)
dbgs() << ' ';
@@ -3506,7 +3506,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
NewConstraint.setAny(SE);
// test separable subscripts
- for (int SI = Separable.find_first(); SI >= 0; SI = Separable.find_next(SI)) {
+ for (unsigned SI : Separable.set_bits()) {
DEBUG(dbgs() << "testing subscript " << SI);
switch (Pair[SI].Classification) {
case Subscript::ZIV:
@@ -3545,14 +3545,14 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
SmallVector<Constraint, 4> Constraints(MaxLevels + 1);
for (unsigned II = 0; II <= MaxLevels; ++II)
Constraints[II].setAny(SE);
- for (int SI = Coupled.find_first(); SI >= 0; SI = Coupled.find_next(SI)) {
+ for (unsigned SI : Coupled.set_bits()) {
DEBUG(dbgs() << "testing subscript group " << SI << " { ");
SmallBitVector Group(Pair[SI].Group);
SmallBitVector Sivs(Pairs);
SmallBitVector Mivs(Pairs);
SmallBitVector ConstrainedLevels(MaxLevels + 1);
SmallVector<Subscript *, 4> PairsInGroup;
- for (int SJ = Group.find_first(); SJ >= 0; SJ = Group.find_next(SJ)) {
+ for (unsigned SJ : Group.set_bits()) {
DEBUG(dbgs() << SJ << " ");
if (Pair[SJ].Classification == Subscript::SIV)
Sivs.set(SJ);
@@ -3564,7 +3564,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
DEBUG(dbgs() << "}\n");
while (Sivs.any()) {
bool Changed = false;
- for (int SJ = Sivs.find_first(); SJ >= 0; SJ = Sivs.find_next(SJ)) {
+ for (unsigned SJ : Sivs.set_bits()) {
DEBUG(dbgs() << "testing subscript " << SJ << ", SIV\n");
// SJ is an SIV subscript that's part of the current coupled group
unsigned Level;
@@ -3588,7 +3588,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
DEBUG(dbgs() << " propagating\n");
DEBUG(dbgs() << "\tMivs = ");
DEBUG(dumpSmallBitVector(Mivs));
- for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) {
+ for (unsigned SJ : Mivs.set_bits()) {
// SJ is an MIV subscript that's part of the current coupled group
DEBUG(dbgs() << "\tSJ = " << SJ << "\n");
if (propagate(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops,
@@ -3622,7 +3622,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
}
// test & propagate remaining RDIVs
- for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) {
+ for (unsigned SJ : Mivs.set_bits()) {
if (Pair[SJ].Classification == Subscript::RDIV) {
DEBUG(dbgs() << "RDIV test\n");
if (testRDIV(Pair[SJ].Src, Pair[SJ].Dst, Result))
@@ -3635,7 +3635,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
// test remaining MIVs
// This code is temporary.
// Better to somehow test all remaining subscripts simultaneously.
- for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) {
+ for (unsigned SJ : Mivs.set_bits()) {
if (Pair[SJ].Classification == Subscript::MIV) {
DEBUG(dbgs() << "MIV test\n");
if (testMIV(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops, Result))
@@ -3647,9 +3647,8 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
// update Result.DV from constraint vector
DEBUG(dbgs() << " updating\n");
- for (int SJ = ConstrainedLevels.find_first(); SJ >= 0;
- SJ = ConstrainedLevels.find_next(SJ)) {
- if (SJ > (int)CommonLevels)
+ for (unsigned SJ : ConstrainedLevels.set_bits()) {
+ if (SJ > CommonLevels)
break;
updateDirection(Result.DV[SJ - 1], Constraints[SJ]);
if (Result.DV[SJ - 1].Direction == Dependence::DVEntry::NONE)
@@ -3859,7 +3858,7 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep,
NewConstraint.setAny(SE);
// test separable subscripts
- for (int SI = Separable.find_first(); SI >= 0; SI = Separable.find_next(SI)) {
+ for (unsigned SI : Separable.set_bits()) {
switch (Pair[SI].Classification) {
case Subscript::SIV: {
unsigned Level;
@@ -3886,12 +3885,12 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep,
SmallVector<Constraint, 4> Constraints(MaxLevels + 1);
for (unsigned II = 0; II <= MaxLevels; ++II)
Constraints[II].setAny(SE);
- for (int SI = Coupled.find_first(); SI >= 0; SI = Coupled.find_next(SI)) {
+ for (unsigned SI : Coupled.set_bits()) {
SmallBitVector Group(Pair[SI].Group);
SmallBitVector Sivs(Pairs);
SmallBitVector Mivs(Pairs);
SmallBitVector ConstrainedLevels(MaxLevels + 1);
- for (int SJ = Group.find_first(); SJ >= 0; SJ = Group.find_next(SJ)) {
+ for (unsigned SJ : Group.set_bits()) {
if (Pair[SJ].Classification == Subscript::SIV)
Sivs.set(SJ);
else
@@ -3899,7 +3898,7 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep,
}
while (Sivs.any()) {
bool Changed = false;
- for (int SJ = Sivs.find_first(); SJ >= 0; SJ = Sivs.find_next(SJ)) {
+ for (unsigned SJ : Sivs.set_bits()) {
// SJ is an SIV subscript that's part of the current coupled group
unsigned Level;
const SCEV *SplitIter = nullptr;
@@ -3914,7 +3913,7 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep,
}
if (Changed) {
// propagate, possibly creating new SIVs and ZIVs
- for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) {
+ for (unsigned SJ : Mivs.set_bits()) {
// SJ is an MIV subscript that's part of the current coupled group
if (propagate(Pair[SJ].Src, Pair[SJ].Dst,
Pair[SJ].Loops, Constraints, Result.Consistent)) {
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index 44c14cb17c22..4702569126c6 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -669,21 +669,33 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
Threshold = MaxIfValid(Threshold, Params.HintThreshold);
if (PSI) {
BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
- if (PSI->isHotCallSite(CS, CallerBFI)) {
- DEBUG(dbgs() << "Hot callsite.\n");
- Threshold = Params.HotCallSiteThreshold.getValue();
- } else if (PSI->isFunctionEntryHot(&Callee)) {
- DEBUG(dbgs() << "Hot callee.\n");
- // If callsite hotness can not be determined, we may still know
- // that the callee is hot and treat it as a weaker hint for threshold
- // increase.
- Threshold = MaxIfValid(Threshold, Params.HintThreshold);
- } else if (PSI->isColdCallSite(CS, CallerBFI)) {
- DEBUG(dbgs() << "Cold callsite.\n");
- Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
- } else if (PSI->isFunctionEntryCold(&Callee)) {
- DEBUG(dbgs() << "Cold callee.\n");
- Threshold = MinIfValid(Threshold, Params.ColdThreshold);
+ // FIXME: After switching to the new passmanager, simplify the logic below
+ // by checking only the callsite hotness/coldness. The check for CallerBFI
+ // exists only because we do not have BFI available with the old PM.
+ //
+ // Use callee's hotness information only if we have no way of determining
+ // callsite's hotness information. Callsite hotness can be determined if
+ // sample profile is used (which adds hotness metadata to calls) or if
+ // caller's BlockFrequencyInfo is available.
+ if (CallerBFI || PSI->hasSampleProfile()) {
+ if (PSI->isHotCallSite(CS, CallerBFI)) {
+ DEBUG(dbgs() << "Hot callsite.\n");
+ Threshold = Params.HotCallSiteThreshold.getValue();
+ } else if (PSI->isColdCallSite(CS, CallerBFI)) {
+ DEBUG(dbgs() << "Cold callsite.\n");
+ Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
+ }
+ } else {
+ if (PSI->isFunctionEntryHot(&Callee)) {
+ DEBUG(dbgs() << "Hot callee.\n");
+ // If callsite hotness can not be determined, we may still know
+ // that the callee is hot and treat it as a weaker hint for threshold
+ // increase.
+ Threshold = MaxIfValid(Threshold, Params.HintThreshold);
+ } else if (PSI->isFunctionEntryCold(&Callee)) {
+ DEBUG(dbgs() << "Cold callee.\n");
+ Threshold = MinIfValid(Threshold, Params.ColdThreshold);
+ }
}
}
}
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 5728887cc1e9..5652248a60ce 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -1752,6 +1752,24 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
(A == Op0 || B == Op0))
return Op0;
+ // A mask that only clears known zeros of a shifted value is a no-op.
+ Value *X;
+ const APInt *Mask;
+ const APInt *ShAmt;
+ if (match(Op1, m_APInt(Mask))) {
+ // If all bits in the inverted and shifted mask are clear:
+ // and (shl X, ShAmt), Mask --> shl X, ShAmt
+ if (match(Op0, m_Shl(m_Value(X), m_APInt(ShAmt))) &&
+ (~(*Mask)).lshr(*ShAmt).isNullValue())
+ return Op0;
+
+ // If all bits in the inverted and shifted mask are clear:
+ // and (lshr X, ShAmt), Mask --> lshr X, ShAmt
+ if (match(Op0, m_LShr(m_Value(X), m_APInt(ShAmt))) &&
+ (~(*Mask)).shl(*ShAmt).isNullValue())
+ return Op0;
+ }
+
// A & (-A) = A if A is a power of two or zero.
if (match(Op0, m_Neg(m_Specific(Op1))) ||
match(Op1, m_Neg(m_Specific(Op0)))) {
diff --git a/lib/Analysis/ProfileSummaryInfo.cpp b/lib/Analysis/ProfileSummaryInfo.cpp
index 502f4205b689..12b86daa602b 100644
--- a/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/lib/Analysis/ProfileSummaryInfo.cpp
@@ -75,7 +75,7 @@ ProfileSummaryInfo::getProfileCount(const Instruction *Inst,
return None;
assert((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) &&
"We can only get profile count for call/invoke instruction.");
- if (computeSummary() && Summary->getKind() == ProfileSummary::PSK_Sample) {
+ if (hasSampleProfile()) {
// In sample PGO mode, check if there is a profile metadata on the
// instruction. If it is present, determine hotness solely based on that,
// since the sampled entry count may not be accurate.
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 800354d2f5b4..a746ddfd7a63 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -629,19 +629,19 @@ static int CompareSCEVComplexity(
const SCEVAddRecExpr *LA = cast<SCEVAddRecExpr>(LHS);
const SCEVAddRecExpr *RA = cast<SCEVAddRecExpr>(RHS);
- // If there is a dominance relationship between the loops, sort by the
- // dominance. Otherwise, sort by depth. We require such order in getAddExpr.
+ // There is always a dominance between two recs that are used by one SCEV,
+ // so we can safely sort recs by loop header dominance. We require such
+ // order in getAddExpr.
const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop();
if (LLoop != RLoop) {
const BasicBlock *LHead = LLoop->getHeader(), *RHead = RLoop->getHeader();
assert(LHead != RHead && "Two loops share the same header?");
if (DT.dominates(LHead, RHead))
return 1;
- else if (DT.dominates(RHead, LHead))
- return -1;
- unsigned LDepth = LLoop->getLoopDepth(), RDepth = RLoop->getLoopDepth();
- if (LDepth != RDepth)
- return (int)LDepth - (int)RDepth;
+ else
+ assert(DT.dominates(RHead, LHead) &&
+ "No dominance between recurrences used by one SCEV?");
+ return -1;
}
// Addrec complexity grows with operand count.
@@ -2512,22 +2512,23 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
SmallVector<const SCEV *, 4> AddRecOps(AddRec->op_begin(),
AddRec->op_end());
for (; OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
- ++OtherIdx)
- if (const auto *OtherAddRec = dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx]))
- if (OtherAddRec->getLoop() == AddRecLoop) {
- for (unsigned i = 0, e = OtherAddRec->getNumOperands();
- i != e; ++i) {
- if (i >= AddRecOps.size()) {
- AddRecOps.append(OtherAddRec->op_begin()+i,
- OtherAddRec->op_end());
- break;
- }
- SmallVector<const SCEV *, 2> TwoOps = {
- AddRecOps[i], OtherAddRec->getOperand(i)};
- AddRecOps[i] = getAddExpr(TwoOps, SCEV::FlagAnyWrap, Depth + 1);
+ ++OtherIdx) {
+ const auto *OtherAddRec = cast<SCEVAddRecExpr>(Ops[OtherIdx]);
+ if (OtherAddRec->getLoop() == AddRecLoop) {
+ for (unsigned i = 0, e = OtherAddRec->getNumOperands();
+ i != e; ++i) {
+ if (i >= AddRecOps.size()) {
+ AddRecOps.append(OtherAddRec->op_begin()+i,
+ OtherAddRec->op_end());
+ break;
}
- Ops.erase(Ops.begin() + OtherIdx); --OtherIdx;
+ SmallVector<const SCEV *, 2> TwoOps = {
+ AddRecOps[i], OtherAddRec->getOperand(i)};
+ AddRecOps[i] = getAddExpr(TwoOps, SCEV::FlagAnyWrap, Depth + 1);
}
+ Ops.erase(Ops.begin() + OtherIdx); --OtherIdx;
+ }
+ }
// Step size has changed, so we cannot guarantee no self-wraparound.
Ops[Idx] = getAddRecExpr(AddRecOps, AddRecLoop, SCEV::FlagAnyWrap);
return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index 3a57772cc7f5..43b245c66400 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -128,8 +128,7 @@ AggressiveAntiDepBreaker::AggressiveAntiDepBreaker(
}
DEBUG(dbgs() << "AntiDep Critical-Path Registers:");
- DEBUG(for (int r = CriticalPathSet.find_first(); r != -1;
- r = CriticalPathSet.find_next(r))
+ DEBUG(for (unsigned r : CriticalPathSet.set_bits())
dbgs() << " " << TRI->getName(r));
DEBUG(dbgs() << '\n');
}
@@ -571,7 +570,7 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
DEBUG({
dbgs() << " ::";
- for (int r = BV.find_first(); r != -1; r = BV.find_next(r))
+ for (unsigned r : BV.set_bits())
dbgs() << " " << TRI->getName(r);
dbgs() << "\n";
});
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 98163bffb60b..7d945690e9c3 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -501,7 +501,7 @@ void CodeViewDebug::emitTypeInformation() {
Error E = Reader.readArray(Types, Reader.getLength());
if (!E) {
TypeVisitorCallbacks C;
- E = CVTypeVisitor(C).visitTypeStream(Types);
+ E = codeview::visitTypeStream(Types, C);
}
if (E) {
logAllUnhandledErrors(std::move(E), errs(), "error: ");
diff --git a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
index 22fd7bb46056..20e1467b30c3 100644
--- a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
+++ b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
@@ -209,8 +209,7 @@ void llvm::calculateDbgValueHistory(const MachineFunction *MF,
} else if (MO.isRegMask()) {
// If this is a register mask operand, clobber all debug values in
// non-CSRs.
- for (int I = ChangingRegs.find_first(); I != -1;
- I = ChangingRegs.find_next(I)) {
+ for (unsigned I : ChangingRegs.set_bits()) {
// Don't consider SP to be clobbered by register masks.
if (unsigned(I) != SP && TRI->isPhysicalRegister(I) &&
MO.clobbersPhysReg(I)) {
diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 811858f136eb..77dfb13ac1f2 100644
--- a/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1129,6 +1129,11 @@ void IRTranslator::finalizeFunction() {
ValToVReg.clear();
FrameIndices.clear();
MachinePreds.clear();
+ // MachineIRBuilder::DebugLoc can outlive the DILocation it holds. Clear it
+ // to avoid accessing free’d memory (in runOnMachineFunction) and to avoid
+ // destroying it twice (in ~IRTranslator() and ~LLVMContext())
+ EntryBuilder = MachineIRBuilder();
+ CurBuilder = MachineIRBuilder();
}
bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index ab433273b189..b53b002f55a6 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -760,7 +760,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
const MachineFrameInfo &MFI = MF->getFrameInfo();
BitVector PR = MFI.getPristineRegs(*MF);
- for (int I = PR.find_first(); I>0; I = PR.find_next(I)) {
+ for (unsigned I : PR.set_bits()) {
for (MCSubRegIterator SubRegs(I, TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs)
regsLive.insert(*SubRegs);
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index 06500289c971..47d726f6da7a 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -285,8 +285,7 @@ class RAGreedy : public MachineFunctionPass,
// Set B[i] = C for every live bundle where B[i] was NoCand.
unsigned getBundles(SmallVectorImpl<unsigned> &B, unsigned C) {
unsigned Count = 0;
- for (int i = LiveBundles.find_first(); i >= 0;
- i = LiveBundles.find_next(i))
+ for (unsigned i : LiveBundles.set_bits())
if (B[i] == NoCand) {
B[i] = C;
Count++;
@@ -1162,9 +1161,8 @@ bool RAGreedy::calcCompactRegion(GlobalSplitCandidate &Cand) {
}
DEBUG({
- for (int i = Cand.LiveBundles.find_first(); i>=0;
- i = Cand.LiveBundles.find_next(i))
- dbgs() << " EB#" << i;
+ for (int i : Cand.LiveBundles.set_bits())
+ dbgs() << " EB#" << i;
dbgs() << ".\n";
});
return true;
@@ -1482,8 +1480,7 @@ unsigned RAGreedy::calculateRegionSplitCost(LiveInterval &VirtReg,
DEBUG({
dbgs() << ", total = "; MBFI->printBlockFreq(dbgs(), Cost)
<< " with bundles";
- for (int i = Cand.LiveBundles.find_first(); i>=0;
- i = Cand.LiveBundles.find_next(i))
+ for (int i : Cand.LiveBundles.set_bits())
dbgs() << " EB#" << i;
dbgs() << ".\n";
});
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index caf5cb497a71..0ccee175abfb 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13087,14 +13087,28 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
}
}
- // If this is a store followed by a store with the same value to the same
- // location, then the store is dead/noop.
if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
- if (ST1->getBasePtr() == Ptr && ST->getMemoryVT() == ST1->getMemoryVT() &&
- ST1->getValue() == Value && ST->isUnindexed() && !ST->isVolatile() &&
- ST1->isUnindexed() && !ST1->isVolatile()) {
- // The store is dead, remove it.
- return Chain;
+ if (ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() &&
+ !ST1->isVolatile() && ST1->getBasePtr() == Ptr &&
+ ST->getMemoryVT() == ST1->getMemoryVT()) {
+ // If this is a store followed by a store with the same value to the same
+ // location, then the store is dead/noop.
+ if (ST1->getValue() == Value) {
+ // The store is dead, remove it.
+ return Chain;
+ }
+
+ // If this is a store who's preceeding store to the same location
+ // and no one other node is chained to that store we can effectively
+ // drop the store. Do not remove stores to undef as they may be used as
+ // data sinks.
+ if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
+ !ST1->getBasePtr().isUndef()) {
+ // ST1 is fully overwritten and can be elided. Combine with it's chain
+ // value.
+ CombineTo(ST1, ST1->getChain());
+ return SDValue();
+ }
}
}
diff --git a/lib/CodeGen/SpillPlacement.cpp b/lib/CodeGen/SpillPlacement.cpp
index f10c98ef4e50..43cbf4add0f8 100644
--- a/lib/CodeGen/SpillPlacement.cpp
+++ b/lib/CodeGen/SpillPlacement.cpp
@@ -310,7 +310,7 @@ void SpillPlacement::addLinks(ArrayRef<unsigned> Links) {
bool SpillPlacement::scanActiveBundles() {
RecentPositive.clear();
- for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n)) {
+ for (unsigned n : ActiveNodes->set_bits()) {
update(n);
// A node that must spill, or a node without any links is not going to
// change its value ever again, so exclude it from iterations.
@@ -365,7 +365,7 @@ SpillPlacement::finish() {
// Write preferences back to ActiveNodes.
bool Perfect = true;
- for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n))
+ for (unsigned n : ActiveNodes->set_bits())
if (!nodes[n].preferReg()) {
ActiveNodes->reset(n);
Perfect = false;
diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp
index f51d959a089a..86a16187fcb6 100644
--- a/lib/CodeGen/StackColoring.cpp
+++ b/lib/CodeGen/StackColoring.cpp
@@ -703,12 +703,10 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
// Create the interval of the blocks that we previously found to be 'alive'.
BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB];
- for (int pos = MBBLiveness.LiveIn.find_first(); pos != -1;
- pos = MBBLiveness.LiveIn.find_next(pos)) {
+ for (unsigned pos : MBBLiveness.LiveIn.set_bits()) {
Starts[pos] = Indexes->getMBBStartIdx(&MBB);
}
- for (int pos = MBBLiveness.LiveOut.find_first(); pos != -1;
- pos = MBBLiveness.LiveOut.find_next(pos)) {
+ for (unsigned pos : MBBLiveness.LiveOut.set_bits()) {
Finishes[pos] = Indexes->getMBBEndIdx(&MBB);
}
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 39aa946fa840..5f63fd4320bb 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -1312,7 +1312,7 @@ TargetLoweringBase::findRepresentativeClass(const TargetRegisterInfo *TRI,
// Find the first legal register class with the largest spill size.
const TargetRegisterClass *BestRC = RC;
- for (int i = SuperRegRC.find_first(); i >= 0; i = SuperRegRC.find_next(i)) {
+ for (unsigned i : SuperRegRC.set_bits()) {
const TargetRegisterClass *SuperRC = TRI->getRegClass(i);
// We want the largest possible spill size.
if (TRI->getSpillSize(*SuperRC) <= TRI->getSpillSize(*BestRC))
diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp
index e6c5d8753b83..9724cb074584 100644
--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -564,6 +564,14 @@ void TargetPassConfig::addISelPrepare() {
addPass(createVerifierPass());
}
+/// -regalloc=... command line option.
+static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
+static cl::opt<RegisterRegAlloc::FunctionPassCtor, false,
+ RegisterPassParser<RegisterRegAlloc> >
+RegAlloc("regalloc",
+ cl::init(&useDefaultRegisterAllocator),
+ cl::desc("Register allocator to use"));
+
/// Add the complete set of target-independent postISel code generator passes.
///
/// This can be read as the standard order of major LLVM CodeGen stages. Stages
@@ -625,8 +633,12 @@ void TargetPassConfig::addMachinePasses() {
// including phi elimination and scheduling.
if (getOptimizeRegAlloc())
addOptimizedRegAlloc(createRegAllocPass(true));
- else
+ else {
+ if (RegAlloc != &useDefaultRegisterAllocator &&
+ RegAlloc != &createFastRegisterAllocator)
+ report_fatal_error("Must use fast (default) register allocator for unoptimized regalloc.");
addFastRegAlloc(createRegAllocPass(false));
+ }
// Run post-ra passes.
addPostRegAlloc();
@@ -759,19 +771,12 @@ MachinePassRegistry RegisterRegAlloc::Registry;
/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultRegisterAllocatorFlag;
-static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
+
static RegisterRegAlloc
defaultRegAlloc("default",
"pick register allocator based on -O option",
useDefaultRegisterAllocator);
-/// -regalloc=... command line option.
-static cl::opt<RegisterRegAlloc::FunctionPassCtor, false,
- RegisterPassParser<RegisterRegAlloc> >
-RegAlloc("regalloc",
- cl::init(&useDefaultRegisterAllocator),
- cl::desc("Register allocator to use"));
-
static void initializeDefaultRegisterAllocatorOnce() {
RegisterRegAlloc::FunctionPassCtor Ctor = RegisterRegAlloc::getDefault();
@@ -781,7 +786,6 @@ static void initializeDefaultRegisterAllocatorOnce() {
}
}
-
/// Instantiate the default register allocator pass for this target for either
/// the optimized or unoptimized allocation path. This will be added to the pass
/// manager by addFastRegAlloc in the unoptimized case or addOptimizedRegAlloc
diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp
index f6e4c17d514c..41ec082a24cf 100644
--- a/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/lib/CodeGen/TargetRegisterInfo.cpp
@@ -50,8 +50,7 @@ bool TargetRegisterInfo::checkAllSuperRegsMarked(const BitVector &RegisterSet,
ArrayRef<MCPhysReg> Exceptions) const {
// Check that all super registers of reserved regs are reserved as well.
BitVector Checked(getNumRegs());
- for (int Reg = RegisterSet.find_first(); Reg>=0;
- Reg = RegisterSet.find_next(Reg)) {
+ for (unsigned Reg : RegisterSet.set_bits()) {
if (Checked[Reg])
continue;
for (MCSuperRegIterator SR(Reg, this); SR.isValid(); ++SR) {
diff --git a/lib/DebugInfo/CodeView/CVTypeDumper.cpp b/lib/DebugInfo/CodeView/CVTypeDumper.cpp
index bcc8218d9446..02e1682f76e7 100644
--- a/lib/DebugInfo/CodeView/CVTypeDumper.cpp
+++ b/lib/DebugInfo/CodeView/CVTypeDumper.cpp
@@ -11,7 +11,6 @@
#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
-#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
#include "llvm/Support/BinaryByteStream.h"
@@ -21,38 +20,23 @@ using namespace llvm::codeview;
Error CVTypeDumper::dump(const CVType &Record, TypeVisitorCallbacks &Dumper) {
TypeDatabaseVisitor DBV(TypeDB);
- TypeDeserializer Deserializer;
TypeVisitorCallbackPipeline Pipeline;
- Pipeline.addCallbackToPipeline(Deserializer);
Pipeline.addCallbackToPipeline(DBV);
Pipeline.addCallbackToPipeline(Dumper);
- CVTypeVisitor Visitor(Pipeline);
- if (Handler)
- Visitor.addTypeServerHandler(*Handler);
-
CVType RecordCopy = Record;
- if (auto EC = Visitor.visitTypeRecord(RecordCopy))
- return EC;
- return Error::success();
+ return codeview::visitTypeRecord(RecordCopy, Pipeline, VDS_BytesPresent,
+ Handler);
}
Error CVTypeDumper::dump(const CVTypeArray &Types,
TypeVisitorCallbacks &Dumper) {
TypeDatabaseVisitor DBV(TypeDB);
- TypeDeserializer Deserializer;
TypeVisitorCallbackPipeline Pipeline;
- Pipeline.addCallbackToPipeline(Deserializer);
Pipeline.addCallbackToPipeline(DBV);
Pipeline.addCallbackToPipeline(Dumper);
- CVTypeVisitor Visitor(Pipeline);
- if (Handler)
- Visitor.addTypeServerHandler(*Handler);
-
- if (auto EC = Visitor.visitTypeStream(Types))
- return EC;
- return Error::success();
+ return codeview::visitTypeStream(Types, Pipeline, Handler);
}
Error CVTypeDumper::dump(ArrayRef<uint8_t> Data, TypeVisitorCallbacks &Dumper) {
diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
index b6ed0453d9c4..0f7f5f667790 100644
--- a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
+++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
@@ -59,13 +59,8 @@ static Expected<TypeServer2Record> deserializeTypeServerRecord(CVType &Record) {
};
TypeServer2Record R(TypeRecordKind::TypeServer2);
- TypeDeserializer Deserializer;
StealTypeServerVisitor Thief(R);
- TypeVisitorCallbackPipeline Pipeline;
- Pipeline.addCallbackToPipeline(Deserializer);
- Pipeline.addCallbackToPipeline(Thief);
- CVTypeVisitor Visitor(Pipeline);
- if (auto EC = Visitor.visitTypeRecord(Record))
+ if (auto EC = visitTypeRecord(Record, Thief))
return std::move(EC);
return R;
@@ -178,7 +173,7 @@ static Error visitMemberRecord(CVMemberRecord &Record,
return Error::success();
}
-Error CVTypeVisitor::visitMemberRecord(CVMemberRecord &Record) {
+Error CVTypeVisitor::visitMemberRecord(CVMemberRecord Record) {
return ::visitMemberRecord(Record, Callbacks);
}
@@ -224,3 +219,93 @@ Error CVTypeVisitor::visitFieldListMemberStream(ArrayRef<uint8_t> Data) {
BinaryStreamReader SR(S);
return visitFieldListMemberStream(SR);
}
+
+namespace {
+struct FieldListVisitHelper {
+ FieldListVisitHelper(TypeVisitorCallbacks &Callbacks, ArrayRef<uint8_t> Data,
+ VisitorDataSource Source)
+ : Stream(Data, llvm::support::little), Reader(Stream),
+ Deserializer(Reader),
+ Visitor((Source == VDS_BytesPresent) ? Pipeline : Callbacks) {
+ if (Source == VDS_BytesPresent) {
+ Pipeline.addCallbackToPipeline(Deserializer);
+ Pipeline.addCallbackToPipeline(Callbacks);
+ }
+ }
+
+ BinaryByteStream Stream;
+ BinaryStreamReader Reader;
+ FieldListDeserializer Deserializer;
+ TypeVisitorCallbackPipeline Pipeline;
+ CVTypeVisitor Visitor;
+};
+
+struct VisitHelper {
+ VisitHelper(TypeVisitorCallbacks &Callbacks, VisitorDataSource Source,
+ TypeServerHandler *TS)
+ : Visitor((Source == VDS_BytesPresent) ? Pipeline : Callbacks) {
+ if (TS)
+ Visitor.addTypeServerHandler(*TS);
+ if (Source == VDS_BytesPresent) {
+ Pipeline.addCallbackToPipeline(Deserializer);
+ Pipeline.addCallbackToPipeline(Callbacks);
+ }
+ }
+
+ TypeDeserializer Deserializer;
+ TypeVisitorCallbackPipeline Pipeline;
+ CVTypeVisitor Visitor;
+};
+}
+
+Error llvm::codeview::visitTypeRecord(CVType &Record, TypeIndex Index,
+ TypeVisitorCallbacks &Callbacks,
+ VisitorDataSource Source,
+ TypeServerHandler *TS) {
+ VisitHelper Helper(Callbacks, Source, TS);
+ return Helper.Visitor.visitTypeRecord(Record, Index);
+}
+
+Error llvm::codeview::visitTypeRecord(CVType &Record,
+ TypeVisitorCallbacks &Callbacks,
+ VisitorDataSource Source,
+ TypeServerHandler *TS) {
+ VisitHelper Helper(Callbacks, Source, TS);
+ return Helper.Visitor.visitTypeRecord(Record);
+}
+
+Error llvm::codeview::visitMemberRecordStream(ArrayRef<uint8_t> FieldList,
+ TypeVisitorCallbacks &Callbacks) {
+ CVTypeVisitor Visitor(Callbacks);
+ return Visitor.visitFieldListMemberStream(FieldList);
+}
+
+Error llvm::codeview::visitMemberRecord(CVMemberRecord Record,
+ TypeVisitorCallbacks &Callbacks,
+ VisitorDataSource Source) {
+ FieldListVisitHelper Helper(Callbacks, Record.Data, Source);
+ return Helper.Visitor.visitMemberRecord(Record);
+}
+
+Error llvm::codeview::visitMemberRecord(TypeLeafKind Kind,
+ ArrayRef<uint8_t> Record,
+ TypeVisitorCallbacks &Callbacks) {
+ CVMemberRecord R;
+ R.Data = Record;
+ R.Kind = Kind;
+ return visitMemberRecord(R, Callbacks, VDS_BytesPresent);
+}
+
+Error llvm::codeview::visitTypeStream(const CVTypeArray &Types,
+ TypeVisitorCallbacks &Callbacks,
+ TypeServerHandler *TS) {
+ VisitHelper Helper(Callbacks, VDS_BytesPresent, TS);
+ return Helper.Visitor.visitTypeStream(Types);
+}
+
+Error llvm::codeview::visitTypeStream(CVTypeRange Types,
+ TypeVisitorCallbacks &Callbacks,
+ TypeServerHandler *TS) {
+ VisitHelper Helper(Callbacks, VDS_BytesPresent, TS);
+ return Helper.Visitor.visitTypeStream(Types);
+}
diff --git a/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp b/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp
index 4cb9acbe07d9..704d1131108a 100644
--- a/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp
+++ b/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp
@@ -9,6 +9,7 @@
#include "llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
#include "llvm/DebugInfo/CodeView/TypeServerHandler.h"
#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
@@ -20,9 +21,7 @@ RandomAccessTypeVisitor::RandomAccessTypeVisitor(
const CVTypeArray &Types, uint32_t NumRecords,
PartialOffsetArray PartialOffsets)
: Database(NumRecords), Types(Types), DatabaseVisitor(Database),
- InternalVisitor(Pipeline), PartialOffsets(PartialOffsets) {
- Pipeline.addCallbackToPipeline(Deserializer);
- Pipeline.addCallbackToPipeline(DatabaseVisitor);
+ PartialOffsets(PartialOffsets) {
KnownOffsets.resize(Database.capacity());
}
@@ -38,8 +37,7 @@ Error RandomAccessTypeVisitor::visitTypeIndex(TypeIndex TI,
assert(Database.contains(TI));
auto &Record = Database.getTypeRecord(TI);
- CVTypeVisitor V(Callbacks);
- return V.visitTypeRecord(Record, TI);
+ return codeview::visitTypeRecord(Record, TI, Callbacks);
}
Error RandomAccessTypeVisitor::visitRangeForType(TypeIndex TI) {
@@ -78,7 +76,7 @@ Error RandomAccessTypeVisitor::visitRange(TypeIndex Begin, uint32_t BeginOffset,
while (Begin != End) {
assert(!Database.contains(Begin));
- if (auto EC = InternalVisitor.visitTypeRecord(*RI, Begin))
+ if (auto EC = codeview::visitTypeRecord(*RI, Begin, DatabaseVisitor))
return EC;
KnownOffsets[Begin.toArrayIndex()] = BeginOffset;
diff --git a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
index 27a6e0987886..9485c9cfedff 100644
--- a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
+++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
@@ -216,8 +216,7 @@ Error TypeDumpVisitor::visitMemberEnd(CVMemberRecord &Record) {
Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
FieldListRecord &FieldList) {
- CVTypeVisitor Visitor(*this);
- if (auto EC = Visitor.visitFieldListMemberStream(FieldList.Data))
+ if (auto EC = codeview::visitMemberRecordStream(FieldList.Data, *this))
return EC;
return Error::success();
diff --git a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
index aad20ae6dda1..51f24fa3f135 100644
--- a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
+++ b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
@@ -361,8 +361,7 @@ Error TypeStreamMerger::visitKnownRecord(CVType &, FieldListRecord &R) {
// Visit the members inside the field list.
HadUntranslatedMember = false;
FieldListBuilder.begin();
- CVTypeVisitor Visitor(*this);
- if (auto EC = Visitor.visitFieldListMemberStream(R.Data))
+ if (auto EC = codeview::visitMemberRecordStream(R.Data, *this))
return EC;
// Write the record if we translated all field list members.
@@ -440,18 +439,9 @@ Error TypeStreamMerger::visitUnknownType(CVType &Rec) {
Error TypeStreamMerger::mergeStream(const CVTypeArray &Types) {
assert(IndexMap.empty());
- TypeVisitorCallbackPipeline Pipeline;
LastError = Error::success();
- TypeDeserializer Deserializer;
- Pipeline.addCallbackToPipeline(Deserializer);
- Pipeline.addCallbackToPipeline(*this);
-
- CVTypeVisitor Visitor(Pipeline);
- if (Handler)
- Visitor.addTypeServerHandler(*Handler);
-
- if (auto EC = Visitor.visitTypeStream(Types))
+ if (auto EC = codeview::visitTypeStream(Types, *this, Handler))
return EC;
// If we found bad indices but no other errors, try doing another pass and see
@@ -466,7 +456,8 @@ Error TypeStreamMerger::mergeStream(const CVTypeArray &Types) {
IsSecondPass = true;
NumBadIndices = 0;
CurIndex = TypeIndex(TypeIndex::FirstNonSimpleIndex);
- if (auto EC = Visitor.visitTypeStream(Types))
+
+ if (auto EC = codeview::visitTypeStream(Types, *this, Handler))
return EC;
assert(NumBadIndices <= BadIndicesRemaining &&
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index 59a060d143ff..61e75a2b56ab 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -1086,49 +1086,32 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
continue;
}
+ if (Section.relocation_begin() == Section.relocation_end())
+ continue;
+
std::map<SymbolRef, uint64_t> AddrCache;
- if (Section.relocation_begin() != Section.relocation_end()) {
- uint64_t SectionSize = RelocatedSection->getSize();
- for (const RelocationRef &Reloc : Section.relocations()) {
- // FIXME: it's not clear how to correctly handle scattered
- // relocations.
- if (isRelocScattered(Obj, Reloc))
- continue;
+ for (const RelocationRef &Reloc : Section.relocations()) {
+ // FIXME: it's not clear how to correctly handle scattered
+ // relocations.
+ if (isRelocScattered(Obj, Reloc))
+ continue;
- Expected<uint64_t> SymAddrOrErr =
- getSymbolAddress(Obj, Reloc, L, AddrCache);
- if (!SymAddrOrErr) {
- errs() << toString(SymAddrOrErr.takeError()) << '\n';
- continue;
- }
+ Expected<uint64_t> SymAddrOrErr =
+ getSymbolAddress(Obj, Reloc, L, AddrCache);
+ if (!SymAddrOrErr) {
+ errs() << toString(SymAddrOrErr.takeError()) << '\n';
+ continue;
+ }
- object::RelocVisitor V(Obj);
- object::RelocToApply R(V.visit(Reloc.getType(), Reloc, *SymAddrOrErr));
- if (V.error()) {
- SmallString<32> Name;
- Reloc.getTypeName(Name);
- errs() << "error: failed to compute relocation: "
- << Name << "\n";
- continue;
- }
- uint64_t Address = Reloc.getOffset();
- if (Address + R.Width > SectionSize) {
- errs() << "error: " << R.Width << "-byte relocation starting "
- << Address << " bytes into section " << name << " which is "
- << SectionSize << " bytes long.\n";
- continue;
- }
- if (R.Width > 8) {
- errs() << "error: can't handle a relocation of more than 8 bytes at "
- "a time.\n";
- continue;
- }
- DEBUG(dbgs() << "Writing " << format("%p", R.Value)
- << " at " << format("%p", Address)
- << " with width " << format("%d", R.Width)
- << "\n");
- Map->insert({Address, {(uint8_t)R.Width, R.Value}});
+ object::RelocVisitor V(Obj);
+ object::RelocToApply R(V.visit(Reloc.getType(), Reloc, *SymAddrOrErr));
+ if (V.error()) {
+ SmallString<32> Name;
+ Reloc.getTypeName(Name);
+ errs() << "error: failed to compute relocation: " << Name << "\n";
+ continue;
}
+ Map->insert({Reloc.getOffset(), {R.Value}});
}
}
}
diff --git a/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp b/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp
index 629f3e80b0ed..cb783cf4fea7 100644
--- a/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp
+++ b/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp
@@ -55,9 +55,8 @@ PDBTypeServerHandler::handleInternal(PDBFile &File,
auto ExpectedTpi = File.getPDBTpiStream();
if (!ExpectedTpi)
return ExpectedTpi.takeError();
- CVTypeVisitor Visitor(Callbacks);
- if (auto EC = Visitor.visitTypeStream(ExpectedTpi->types(nullptr)))
+ if (auto EC = codeview::visitTypeStream(ExpectedTpi->typeArray(), Callbacks))
return std::move(EC);
return true;
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index e9a4b71c903d..ab86e5d6a0fd 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -705,7 +705,7 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj,
unsigned Alignment = (unsigned)Alignment64 & 0xffffffffL;
unsigned PaddingSize = 0;
unsigned StubBufSize = 0;
- bool IsRequired = isRequiredForExecution(Section) || ProcessAllSections;
+ bool IsRequired = isRequiredForExecution(Section);
bool IsVirtual = Section.isVirtual();
bool IsZeroInit = isZeroInit(Section);
bool IsReadOnly = isReadOnlyData(Section);
@@ -745,8 +745,8 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj,
Alignment = std::max(Alignment, getStubAlignment());
// Some sections, such as debug info, don't need to be loaded for execution.
- // Leave those where they are.
- if (IsRequired) {
+ // Process those only if explicitly requested.
+ if (IsRequired || ProcessAllSections) {
Allocate = DataSize + PaddingSize + StubBufSize;
if (!Allocate)
Allocate = 1;
@@ -790,6 +790,10 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj,
Sections.push_back(
SectionEntry(Name, Addr, DataSize, Allocate, (uintptr_t)pData));
+ // Debug info sections are linked as if their load address was zero
+ if (!IsRequired)
+ Sections.back().setLoadAddress(0);
+
if (Checker)
Checker->registerSection(Obj.getFileName(), SectionID);
diff --git a/lib/Support/CrashRecoveryContext.cpp b/lib/Support/CrashRecoveryContext.cpp
index 98865f5e065e..bd38dd88201f 100644
--- a/lib/Support/CrashRecoveryContext.cpp
+++ b/lib/Support/CrashRecoveryContext.cpp
@@ -78,6 +78,9 @@ static bool gCrashRecoveryEnabled = false;
static ManagedStatic<sys::ThreadLocal<const CrashRecoveryContext>>
tlIsRecoveringFromCrash;
+static void installExceptionOrSignalHandlers();
+static void uninstallExceptionOrSignalHandlers();
+
CrashRecoveryContextCleanup::~CrashRecoveryContextCleanup() {}
CrashRecoveryContext::~CrashRecoveryContext() {
@@ -113,6 +116,23 @@ CrashRecoveryContext *CrashRecoveryContext::GetCurrent() {
return CRCI->CRC;
}
+void CrashRecoveryContext::Enable() {
+ sys::ScopedLock L(*gCrashRecoveryContextMutex);
+ // FIXME: Shouldn't this be a refcount or something?
+ if (gCrashRecoveryEnabled)
+ return;
+ gCrashRecoveryEnabled = true;
+ installExceptionOrSignalHandlers();
+}
+
+void CrashRecoveryContext::Disable() {
+ sys::ScopedLock L(*gCrashRecoveryContextMutex);
+ if (!gCrashRecoveryEnabled)
+ return;
+ gCrashRecoveryEnabled = false;
+ uninstallExceptionOrSignalHandlers();
+}
+
void CrashRecoveryContext::registerCleanup(CrashRecoveryContextCleanup *cleanup)
{
if (!cleanup)
@@ -140,30 +160,70 @@ CrashRecoveryContext::unregisterCleanup(CrashRecoveryContextCleanup *cleanup) {
delete cleanup;
}
-#ifdef LLVM_ON_WIN32
+#if defined(_MSC_VER)
+// If _MSC_VER is defined, we must have SEH. Use it if it's available. It's way
+// better than VEH. Vectored exception handling catches all exceptions happening
+// on the thread with installed exception handlers, so it can interfere with
+// internal exception handling of other libraries on that thread. SEH works
+// exactly as you would expect normal exception handling to work: it only
+// catches exceptions if they would bubble out from the stack frame with __try /
+// __except.
-#include "Windows/WindowsSupport.h"
+static void installExceptionOrSignalHandlers() {}
+static void uninstallExceptionOrSignalHandlers() {}
-// On Windows, we can make use of vectored exception handling to
-// catch most crashing situations. Note that this does mean
-// we will be alerted of exceptions *before* structured exception
-// handling has the opportunity to catch it. But that isn't likely
-// to cause problems because nowhere in the project is SEH being
-// used.
+bool CrashRecoveryContext::RunSafely(function_ref<void()> Fn) {
+ if (!gCrashRecoveryEnabled) {
+ Fn();
+ return true;
+ }
+
+ bool Result = true;
+ __try {
+ Fn();
+ } __except (1) { // Catch any exception.
+ Result = false;
+ }
+ return Result;
+}
+
+#else // !_MSC_VER
+
+#if defined(LLVM_ON_WIN32)
+// This is a non-MSVC compiler, probably mingw gcc or clang without
+// -fms-extensions. Use vectored exception handling (VEH).
+//
+// On Windows, we can make use of vectored exception handling to catch most
+// crashing situations. Note that this does mean we will be alerted of
+// exceptions *before* structured exception handling has the opportunity to
+// catch it. Unfortunately, this causes problems in practice with other code
+// running on threads with LLVM crash recovery contexts, so we would like to
+// eventually move away from VEH.
//
-// Vectored exception handling is built on top of SEH, and so it
-// works on a per-thread basis.
+// Vectored works on a per-thread basis, which is an advantage over
+// SetUnhandledExceptionFilter. SetUnhandledExceptionFilter also doesn't have
+// any native support for chaining exception handlers, but VEH allows more than
+// one.
//
// The vectored exception handler functionality was added in Windows
// XP, so if support for older versions of Windows is required,
// it will have to be added.
-//
-// If we want to support as far back as Win2k, we could use the
-// SetUnhandledExceptionFilter API, but there's a risk of that
-// being entirely overwritten (it's not a chain).
+
+#include "Windows/WindowsSupport.h"
static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo)
{
+ // DBG_PRINTEXCEPTION_WIDE_C is not properly defined on all supported
+ // compilers and platforms, so we define it manually.
+ constexpr ULONG DbgPrintExceptionWideC = 0x4001000AL;
+ switch (ExceptionInfo->ExceptionRecord->ExceptionCode)
+ {
+ case DBG_PRINTEXCEPTION_C:
+ case DbgPrintExceptionWideC:
+ case 0x406D1388: // set debugger thread name
+ return EXCEPTION_CONTINUE_EXECUTION;
+ }
+
// Lookup the current thread local recovery object.
const CrashRecoveryContextImpl *CRCI = CurrentContext->get();
@@ -192,14 +252,7 @@ static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo)
// non-NULL, valid VEH handles, or NULL.
static sys::ThreadLocal<const void> sCurrentExceptionHandle;
-void CrashRecoveryContext::Enable() {
- sys::ScopedLock L(*gCrashRecoveryContextMutex);
-
- if (gCrashRecoveryEnabled)
- return;
-
- gCrashRecoveryEnabled = true;
-
+static void installExceptionOrSignalHandlers() {
// We can set up vectored exception handling now. We will install our
// handler as the front of the list, though there's no assurances that
// it will remain at the front (another call could install itself before
@@ -208,14 +261,7 @@ void CrashRecoveryContext::Enable() {
sCurrentExceptionHandle.set(handle);
}
-void CrashRecoveryContext::Disable() {
- sys::ScopedLock L(*gCrashRecoveryContextMutex);
-
- if (!gCrashRecoveryEnabled)
- return;
-
- gCrashRecoveryEnabled = false;
-
+static void uninstallExceptionOrSignalHandlers() {
PVOID currentHandle = const_cast<PVOID>(sCurrentExceptionHandle.get());
if (currentHandle) {
// Now we can remove the vectored exception handler from the chain
@@ -226,7 +272,7 @@ void CrashRecoveryContext::Disable() {
}
}
-#else
+#else // !LLVM_ON_WIN32
// Generic POSIX implementation.
//
@@ -278,14 +324,7 @@ static void CrashRecoverySignalHandler(int Signal) {
const_cast<CrashRecoveryContextImpl*>(CRCI)->HandleCrash();
}
-void CrashRecoveryContext::Enable() {
- sys::ScopedLock L(*gCrashRecoveryContextMutex);
-
- if (gCrashRecoveryEnabled)
- return;
-
- gCrashRecoveryEnabled = true;
-
+static void installExceptionOrSignalHandlers() {
// Setup the signal handler.
struct sigaction Handler;
Handler.sa_handler = CrashRecoverySignalHandler;
@@ -297,20 +336,13 @@ void CrashRecoveryContext::Enable() {
}
}
-void CrashRecoveryContext::Disable() {
- sys::ScopedLock L(*gCrashRecoveryContextMutex);
-
- if (!gCrashRecoveryEnabled)
- return;
-
- gCrashRecoveryEnabled = false;
-
+static void uninstallExceptionOrSignalHandlers() {
// Restore the previous signal handlers.
for (unsigned i = 0; i != NumSignals; ++i)
sigaction(Signals[i], &PrevActions[i], nullptr);
}
-#endif
+#endif // !LLVM_ON_WIN32
bool CrashRecoveryContext::RunSafely(function_ref<void()> Fn) {
// If crash recovery is disabled, do nothing.
@@ -328,6 +360,8 @@ bool CrashRecoveryContext::RunSafely(function_ref<void()> Fn) {
return true;
}
+#endif // !_MSC_VER
+
void CrashRecoveryContext::HandleCrash() {
CrashRecoveryContextImpl *CRCI = (CrashRecoveryContextImpl *) Impl;
assert(CRCI && "Crash recovery context never initialized!");
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index cdea09be41e0..fa28ba1b6ab6 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -103,16 +103,13 @@
#define STATVFS_F_FLAG(vfs) (vfs).f_flags
#endif
-#if defined(__FreeBSD__) || defined(__NetBSD__)
-#include <sys/sysctl.h>
-#endif
-
using namespace llvm;
namespace llvm {
namespace sys {
namespace fs {
-#if defined(__Bitrig__) || defined(__OpenBSD__) || defined(__minix) || \
+#if defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
+ defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__) || \
defined(__linux__) || defined(__CYGWIN__) || defined(__DragonFly__) || \
defined(_AIX)
static int
@@ -167,7 +164,7 @@ getprogpath(char ret[PATH_MAX], const char *bin)
free(pv);
return nullptr;
}
-#endif // Bitrig || OpenBSD || minix || linux || CYGWIN || DragonFly || AIX
+#endif // __FreeBSD__ || __NetBSD__ || __FreeBSD_kernel__
/// GetMainExecutable - Return the path to the main executable, given the
/// value of argv[0] from program startup.
@@ -183,24 +180,9 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
if (realpath(exe_path, link_path))
return link_path;
}
-#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__)
- int mib[4];
- mib[0] = CTL_KERN;
-#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
- mib[1] = KERN_PROC;
- mib[2] = KERN_PROC_PATHNAME;
- mib[3] = -1;
-#else
- mib[1] = KERN_PROC_ARGS;
- mib[2] = -1;
- mib[3] = KERN_PROC_PATHNAME;
-#endif
- char exe_path[PATH_MAX];
- size_t cb = sizeof(exe_path);
- if (sysctl(mib, 4, exe_path, &cb, NULL, 0) == 0)
- return exe_path;
-#elif defined(__Bitrig__) || defined(__OpenBSD__) || defined(__minix) || \
- defined(__DragonFly__) || defined(_AIX)
+#elif defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
+ defined(__OpenBSD__) || defined(__minix) || defined(__DragonFly__) || \
+ defined(__FreeBSD_kernel__) || defined(_AIX)
char exe_path[PATH_MAX];
if (getprogpath(exe_path, argv0) != NULL)
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index dc916c034661..1aec602a2a36 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1158,8 +1158,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
}
DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:";
- for (int Reg = SavedRegs.find_first(); Reg != -1;
- Reg = SavedRegs.find_next(Reg))
+ for (unsigned Reg : SavedRegs.set_bits())
dbgs() << ' ' << PrintReg(Reg, RegInfo);
dbgs() << "\n";);
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4f7c2e122390..1af36086ad90 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -553,7 +553,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
- setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
@@ -659,6 +658,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+ // Vector reductions
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+ }
+ for (MVT VT : MVT::fp_valuetypes()) {
+ setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
+ }
+
setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
// Likewise, narrowing and extending vector loads/stores aren't handled
@@ -2606,6 +2618,14 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerMUL(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ return LowerVECREDUCE(Op, DAG);
}
}
@@ -7128,6 +7148,47 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
return Cmp;
}
+static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
+ SelectionDAG &DAG) {
+ SDValue VecOp = ScalarOp.getOperand(0);
+ auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
+ DAG.getConstant(0, DL, MVT::i64));
+}
+
+SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ switch (Op.getOpcode()) {
+ case ISD::VECREDUCE_ADD:
+ return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
+ case ISD::VECREDUCE_SMAX:
+ return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
+ case ISD::VECREDUCE_SMIN:
+ return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
+ case ISD::VECREDUCE_UMAX:
+ return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
+ case ISD::VECREDUCE_UMIN:
+ return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
+ case ISD::VECREDUCE_FMAX: {
+ assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag");
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
+ DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
+ Op.getOperand(0));
+ }
+ case ISD::VECREDUCE_FMIN: {
+ assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag");
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
+ DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
+ Op.getOperand(0));
+ }
+ default:
+ llvm_unreachable("Unhandled reduction");
+ }
+}
+
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
@@ -9490,266 +9551,6 @@ static SDValue performSTORECombine(SDNode *N,
return SDValue();
}
-/// This function handles the log2-shuffle pattern produced by the
-/// LoopVectorizer for the across vector reduction. It consists of
-/// log2(NumVectorElements) steps and, in each step, 2^(s) elements
-/// are reduced, where s is an induction variable from 0 to
-/// log2(NumVectorElements).
-static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
- unsigned Op,
- SelectionDAG &DAG) {
- EVT VTy = OpV->getOperand(0).getValueType();
- if (!VTy.isVector())
- return SDValue();
-
- int NumVecElts = VTy.getVectorNumElements();
- if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
- if (NumVecElts != 4)
- return SDValue();
- } else {
- if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
- return SDValue();
- }
-
- int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
- SDValue PreOp = OpV;
- // Iterate over each step of the across vector reduction.
- for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) {
- SDValue CurOp = PreOp.getOperand(0);
- SDValue Shuffle = PreOp.getOperand(1);
- if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) {
- // Try to swap the 1st and 2nd operand as add and min/max instructions
- // are commutative.
- CurOp = PreOp.getOperand(1);
- Shuffle = PreOp.getOperand(0);
- if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
- return SDValue();
- }
-
- // Check if the input vector is fed by the operator we want to handle,
- // except the last step; the very first input vector is not necessarily
- // the same operator we are handling.
- if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1)))
- return SDValue();
-
- // Check if it forms one step of the across vector reduction.
- // E.g.,
- // %cur = add %1, %0
- // %shuffle = vector_shuffle %cur, <2, 3, u, u>
- // %pre = add %cur, %shuffle
- if (Shuffle.getOperand(0) != CurOp)
- return SDValue();
-
- int NumMaskElts = 1 << CurStep;
- ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask();
- // Check mask values in each step.
- // We expect the shuffle mask in each step follows a specific pattern
- // denoted here by the <M, U> form, where M is a sequence of integers
- // starting from NumMaskElts, increasing by 1, and the number integers
- // in M should be NumMaskElts. U is a sequence of UNDEFs and the number
- // of undef in U should be NumVecElts - NumMaskElts.
- // E.g., for <8 x i16>, mask values in each step should be :
- // step 0 : <1,u,u,u,u,u,u,u>
- // step 1 : <2,3,u,u,u,u,u,u>
- // step 2 : <4,5,6,7,u,u,u,u>
- for (int i = 0; i < NumVecElts; ++i)
- if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) ||
- (i >= NumMaskElts && !(Mask[i] < 0)))
- return SDValue();
-
- PreOp = CurOp;
- }
- unsigned Opcode;
- bool IsIntrinsic = false;
-
- switch (Op) {
- default:
- llvm_unreachable("Unexpected operator for across vector reduction");
- case ISD::ADD:
- Opcode = AArch64ISD::UADDV;
- break;
- case ISD::SMAX:
- Opcode = AArch64ISD::SMAXV;
- break;
- case ISD::UMAX:
- Opcode = AArch64ISD::UMAXV;
- break;
- case ISD::SMIN:
- Opcode = AArch64ISD::SMINV;
- break;
- case ISD::UMIN:
- Opcode = AArch64ISD::UMINV;
- break;
- case ISD::FMAXNUM:
- Opcode = Intrinsic::aarch64_neon_fmaxnmv;
- IsIntrinsic = true;
- break;
- case ISD::FMINNUM:
- Opcode = Intrinsic::aarch64_neon_fminnmv;
- IsIntrinsic = true;
- break;
- }
- SDLoc DL(N);
-
- return IsIntrinsic
- ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0),
- DAG.getConstant(Opcode, DL, MVT::i32), PreOp)
- : DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
- DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp),
- DAG.getConstant(0, DL, MVT::i64));
-}
-
-/// Target-specific DAG combine for the across vector min/max reductions.
-/// This function specifically handles the final clean-up step of the vector
-/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle
-/// pattern, which narrows down and finds the final min/max value from all
-/// elements of the vector.
-/// For example, for a <16 x i8> vector :
-/// svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
-/// %smax0 = smax %arr, svn0
-/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u>
-/// %smax1 = smax %smax0, %svn1
-/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-/// %smax2 = smax %smax1, svn2
-/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-/// %sc = setcc %smax2, %svn3, gt
-/// %n0 = extract_vector_elt %sc, #0
-/// %n1 = extract_vector_elt %smax2, #0
-/// %n2 = extract_vector_elt $smax2, #1
-/// %result = select %n0, %n1, n2
-/// becomes :
-/// %1 = smaxv %0
-/// %result = extract_vector_elt %1, 0
-static SDValue
-performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
- const AArch64Subtarget *Subtarget) {
- if (!Subtarget->hasNEON())
- return SDValue();
-
- SDValue N0 = N->getOperand(0);
- SDValue IfTrue = N->getOperand(1);
- SDValue IfFalse = N->getOperand(2);
-
- // Check if the SELECT merges up the final result of the min/max
- // from a vector.
- if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
- return SDValue();
-
- // Expect N0 is fed by SETCC.
- SDValue SetCC = N0.getOperand(0);
- EVT SetCCVT = SetCC.getValueType();
- if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() ||
- SetCCVT.getVectorElementType() != MVT::i1)
- return SDValue();
-
- SDValue VectorOp = SetCC.getOperand(0);
- unsigned Op = VectorOp->getOpcode();
- // Check if the input vector is fed by the operator we want to handle.
- if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN &&
- Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM)
- return SDValue();
-
- EVT VTy = VectorOp.getValueType();
- if (!VTy.isVector())
- return SDValue();
-
- if (VTy.getSizeInBits() < 64)
- return SDValue();
-
- EVT EltTy = VTy.getVectorElementType();
- if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
- if (EltTy != MVT::f32)
- return SDValue();
- } else {
- if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
- return SDValue();
- }
-
- // Check if extracting from the same vector.
- // For example,
- // %sc = setcc %vector, %svn1, gt
- // %n0 = extract_vector_elt %sc, #0
- // %n1 = extract_vector_elt %vector, #0
- // %n2 = extract_vector_elt $vector, #1
- if (!(VectorOp == IfTrue->getOperand(0) &&
- VectorOp == IfFalse->getOperand(0)))
- return SDValue();
-
- // Check if the condition code is matched with the operator type.
- ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
- if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) ||
- (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) ||
- (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) ||
- (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) ||
- (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE &&
- CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT &&
- CC != ISD::SETGE) ||
- (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE &&
- CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT &&
- CC != ISD::SETLE))
- return SDValue();
-
- // Expect to check only lane 0 from the vector SETCC.
- if (!isNullConstant(N0.getOperand(1)))
- return SDValue();
-
- // Expect to extract the true value from lane 0.
- if (!isNullConstant(IfTrue.getOperand(1)))
- return SDValue();
-
- // Expect to extract the false value from lane 1.
- if (!isOneConstant(IfFalse.getOperand(1)))
- return SDValue();
-
- return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG);
-}
-
-/// Target-specific DAG combine for the across vector add reduction.
-/// This function specifically handles the final clean-up step of the vector
-/// add reduction produced by the LoopVectorizer. It is the log2-shuffle
-/// pattern, which adds all elements of a vector together.
-/// For example, for a <4 x i32> vector :
-/// %1 = vector_shuffle %0, <2,3,u,u>
-/// %2 = add %0, %1
-/// %3 = vector_shuffle %2, <1,u,u,u>
-/// %4 = add %2, %3
-/// %result = extract_vector_elt %4, 0
-/// becomes :
-/// %0 = uaddv %0
-/// %result = extract_vector_elt %0, 0
-static SDValue
-performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG,
- const AArch64Subtarget *Subtarget) {
- if (!Subtarget->hasNEON())
- return SDValue();
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
-
- // Check if the input vector is fed by the ADD.
- if (N0->getOpcode() != ISD::ADD)
- return SDValue();
-
- // The vector extract idx must constant zero because we only expect the final
- // result of the reduction is placed in lane 0.
- if (!isNullConstant(N1))
- return SDValue();
-
- EVT VTy = N0.getValueType();
- if (!VTy.isVector())
- return SDValue();
-
- EVT EltTy = VTy.getVectorElementType();
- if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
- return SDValue();
-
- if (VTy.getSizeInBits() < 64)
- return SDValue();
-
- return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG);
-}
/// Target-specific DAG combine function for NEON load/store intrinsics
/// to merge base address updates.
@@ -10428,12 +10229,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performBitcastCombine(N, DCI, DAG);
case ISD::CONCAT_VECTORS:
return performConcatVectorsCombine(N, DCI, DAG);
- case ISD::SELECT: {
- SDValue RV = performSelectCombine(N, DCI);
- if (!RV.getNode())
- RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget);
- return RV;
- }
+ case ISD::SELECT:
+ return performSelectCombine(N, DCI);
case ISD::VSELECT:
return performVSelectCombine(N, DCI.DAG);
case ISD::LOAD:
@@ -10455,8 +10252,6 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performNVCASTCombine(N);
case ISD::INSERT_VECTOR_ELT:
return performPostLD1Combine(N, DCI, true);
- case ISD::EXTRACT_VECTOR_ELT:
- return performAcrossLaneAddReductionCombine(N, DAG, Subtarget);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
@@ -10676,6 +10471,14 @@ void AArch64TargetLowering::ReplaceNodeResults(
case ISD::BITCAST:
ReplaceBITCASTResults(N, Results, DAG);
return;
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
+ return;
+
case AArch64ISD::SADDV:
ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
return;
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 89db566c219c..ecc2517fb288 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -568,6 +568,7 @@ private:
SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
std::vector<SDNode *> *Created) const override;
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 7c6f55c06bce..43569af04347 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -769,3 +769,28 @@ unsigned AArch64TTIImpl::getMinPrefetchStride() {
unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
return ST->getMaxPrefetchIterationsAhead();
}
+
+bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const {
+ assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+ switch (Opcode) {
+ case Instruction::FAdd:
+ case Instruction::FMul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Mul:
+ return false;
+ case Instruction::Add:
+ return ScalarBits * Ty->getVectorNumElements() >= 128;
+ case Instruction::ICmp:
+ return (ScalarBits < 64) &&
+ (ScalarBits * Ty->getVectorNumElements() >= 128);
+ case Instruction::FCmp:
+ return Flags.NoNaN;
+ default:
+ llvm_unreachable("Unhandled reduction opcode");
+ }
+ return false;
+}
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 280d97f3c502..d0299149c38c 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -145,6 +145,9 @@ public:
bool shouldExpandReduction(const IntrinsicInst *II) const {
return false;
}
+
+ bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const;
/// @}
};
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 7c99752b881f..c3ac796a0a44 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1707,10 +1707,38 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
// FIXME: Look for on separate components
if (Src.getOpcode() == ISD::FNEG) {
- Mods |= (SISrcMods::NEG | SISrcMods::NEG_HI);
+ Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
Src = Src.getOperand(0);
}
+ if (Src.getOpcode() == ISD::BUILD_VECTOR) {
+ unsigned VecMods = Mods;
+
+ SDValue Lo = Src.getOperand(0);
+ SDValue Hi = Src.getOperand(1);
+
+ if (Lo.getOpcode() == ISD::FNEG) {
+ Lo = Lo.getOperand(0);
+ Mods ^= SISrcMods::NEG;
+ }
+
+ if (Hi.getOpcode() == ISD::FNEG) {
+ Hi = Hi.getOperand(0);
+ Mods ^= SISrcMods::NEG_HI;
+ }
+
+ if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
+ // Really a scalar input. Just select from the low half of the register to
+ // avoid packing.
+
+ Src = Lo;
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+ }
+
+ Mods = VecMods;
+ }
+
// Packed instructions do not have abs modifiers.
// FIXME: Handle abs/neg of individual components.
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index bed7d326b3dd..e543cae07ada 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -289,6 +289,10 @@ public:
return getGeneration() >= GFX9;
}
+ bool hasMin3Max3_16() const {
+ return getGeneration() >= GFX9;
+ }
+
bool hasCARRY() const {
return (getGeneration() >= EVERGREEN);
}
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 48a14e4dbea2..286be355bc14 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4491,7 +4491,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
- VT != MVT::f64) {
+ VT != MVT::f64 &&
+ ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
// max(max(a, b), c) -> max3(a, b, c)
// min(min(a, b), c) -> min3(a, b, c)
if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 065fd09eb356..38a16b525a75 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -765,7 +765,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
.addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
- .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
+ .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
// Add the scratch resource registers as implicit uses because we may end up
// needing them, and need to ensure that the reserved registers are
// correctly handled.
@@ -796,7 +796,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(isKill)) // data
.addFrameIndex(FrameIndex) // addr
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
- .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addReg(MFI->getFrameOffsetReg()) // scratch_offset
.addImm(0) // offset
.addMemOperand(MMO);
}
@@ -869,7 +869,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
.addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
- .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
+ .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
if (ST.hasScalarStores()) {
// m0 is used for offset to scalar stores if used to spill.
@@ -892,10 +892,10 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
- .addFrameIndex(FrameIndex) // vaddr
- .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
- .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
- .addImm(0) // offset
+ .addFrameIndex(FrameIndex) // vaddr
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getFrameOffsetReg()) // scratch_offset
+ .addImm(0) // offset
.addMemOperand(MMO);
}
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 8820e294562b..06cfc95be96a 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -654,11 +654,11 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
if (Offset != 0) {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
- .addReg(MFI->getScratchWaveOffsetReg())
+ .addReg(MFI->getFrameOffsetReg())
.addImm(Offset);
} else {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
- .addReg(MFI->getScratchWaveOffsetReg());
+ .addReg(MFI->getFrameOffsetReg());
}
BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
@@ -715,11 +715,11 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
= MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
EltSize, MinAlign(Align, EltSize * i));
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
- .addReg(TmpReg, RegState::Kill) // src
- .addFrameIndex(Index) // vaddr
- .addReg(MFI->getScratchRSrcReg()) // srrsrc
- .addReg(MFI->getScratchWaveOffsetReg()) // soffset
- .addImm(i * 4) // offset
+ .addReg(TmpReg, RegState::Kill) // src
+ .addFrameIndex(Index) // vaddr
+ .addReg(MFI->getScratchRSrcReg()) // srrsrc
+ .addReg(MFI->getFrameOffsetReg()) // soffset
+ .addImm(i * 4) // offset
.addMemOperand(MMO);
}
}
@@ -806,11 +806,11 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
if (Offset != 0) {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
- .addReg(MFI->getScratchWaveOffsetReg())
+ .addReg(MFI->getFrameOffsetReg())
.addImm(Offset);
} else {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
- .addReg(MFI->getScratchWaveOffsetReg());
+ .addReg(MFI->getFrameOffsetReg());
}
auto MIB =
@@ -853,10 +853,10 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
MinAlign(Align, EltSize * i));
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
- .addFrameIndex(Index) // vaddr
- .addReg(MFI->getScratchRSrcReg()) // srsrc
- .addReg(MFI->getScratchWaveOffsetReg()) // soffset
- .addImm(i * 4) // offset
+ .addFrameIndex(Index) // vaddr
+ .addReg(MFI->getScratchRSrcReg()) // srsrc
+ .addReg(MFI->getFrameOffsetReg()) // soffset
+ .addImm(i * 4) // offset
.addMemOperand(MMO);
auto MIB =
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index ffa6c60d6b1f..c0b5069948fb 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -300,10 +300,19 @@ def V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+
def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmed3>;
def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmed3>;
def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumed3>;
-}
+
+def V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmin3>;
+def V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmin3>;
+def V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumin3>;
+
+def V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmax3>;
+def V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmax3>;
+def V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumax3>;
+} // End SubtargetPredicate = isGFX9
//===----------------------------------------------------------------------===//
@@ -509,6 +518,15 @@ defm V_OR3_B32 : VOP3_Real_vi <0x202>;
defm V_PACK_B32_F16 : VOP3_Real_vi <0x2a0>;
defm V_XAD_U32 : VOP3_Real_vi <0x1f3>;
+
+defm V_MIN3_F16 : VOP3_Real_vi <0x1f4>;
+defm V_MIN3_I16 : VOP3_Real_vi <0x1f5>;
+defm V_MIN3_U16 : VOP3_Real_vi <0x1f6>;
+
+defm V_MAX3_F16 : VOP3_Real_vi <0x1f7>;
+defm V_MAX3_I16 : VOP3_Real_vi <0x1f8>;
+defm V_MAX3_U16 : VOP3_Real_vi <0x1f9>;
+
defm V_MED3_F16 : VOP3_Real_vi <0x1fa>;
defm V_MED3_I16 : VOP3_Real_vi <0x1fb>;
defm V_MED3_U16 : VOP3_Real_vi <0x1fc>;
diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp
index 8c680cdf9b47..b1f059835ff5 100644
--- a/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -345,25 +345,10 @@ bool ARMInstructionSelector::select(MachineInstr &I) const {
I.setDesc(TII.get(COPY));
return selectCopy(I, TII, MRI, TRI, RBI);
}
- case G_ADD:
case G_GEP:
I.setDesc(TII.get(ARM::ADDrr));
MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
break;
- case G_SUB:
- I.setDesc(TII.get(ARM::SUBrr));
- MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
- break;
- case G_MUL:
- if (TII.getSubtarget().hasV6Ops()) {
- I.setDesc(TII.get(ARM::MUL));
- } else {
- assert(TII.getSubtarget().useMulOps() && "Unsupported target");
- I.setDesc(TII.get(ARM::MULv5));
- MIB->getOperand(0).setIsEarlyClobber(true);
- }
- MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
- break;
case G_FRAME_INDEX:
// Add 0 to the given frame index and hope it will eventually be folded into
// the user(s).
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index d0fd366ab9ed..1a17d4e33e4f 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -571,8 +571,7 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
GPRsNoLRSP.reset(ARM::LR);
GPRsNoLRSP.reset(ARM::SP);
GPRsNoLRSP.reset(ARM::PC);
- for (int Register = GPRsNoLRSP.find_first(); Register != -1;
- Register = GPRsNoLRSP.find_next(Register)) {
+ for (unsigned Register : GPRsNoLRSP.set_bits()) {
if (!UsedRegs.contains(Register)) {
// Remember the first pop-friendly register and exit.
if (PopFriendly.test(Register)) {
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index ae58c26e145a..1597057ad63f 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -386,7 +386,7 @@ void RegDefsUses::setCallerSaved(const MachineInstr &MI) {
void RegDefsUses::setUnallocatableRegs(const MachineFunction &MF) {
BitVector AllocSet = TRI.getAllocatableSet(MF);
- for (int R = AllocSet.find_first(); R != -1; R = AllocSet.find_next(R))
+ for (unsigned R : AllocSet.set_bits())
for (MCRegAliasIterator AI(R, &TRI, false); AI.isValid(); ++AI)
AllocSet.set(*AI);
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 40bfe3a449f7..57a1d373c88c 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -1765,31 +1765,36 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
// Check whether the frame pointer register is allocated. If so, make sure it
// is spilled to the correct offset.
if (needsFP(MF)) {
- HasGPSaveArea = true;
-
int FI = PFI->getFramePointerSaveIndex();
assert(FI && "No Frame Pointer Save Slot!");
-
MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+ // FP is R31/X31, so no need to update MinGPR/MinG8R.
+ HasGPSaveArea = true;
}
if (PFI->usesPICBase()) {
- HasGPSaveArea = true;
-
int FI = PFI->getPICBasePointerSaveIndex();
assert(FI && "No PIC Base Pointer Save Slot!");
-
MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+
+ MinGPR = std::min<unsigned>(MinGPR, PPC::R30);
+ HasGPSaveArea = true;
}
const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
if (RegInfo->hasBasePointer(MF)) {
- HasGPSaveArea = true;
-
int FI = PFI->getBasePointerSaveIndex();
assert(FI && "No Base Pointer Save Slot!");
-
MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+
+ unsigned BP = RegInfo->getBaseRegister(MF);
+ if (PPC::G8RCRegClass.contains(BP)) {
+ MinG8R = std::min<unsigned>(MinG8R, BP);
+ HasG8SaveArea = true;
+ } else if (PPC::GPRCRegClass.contains(BP)) {
+ MinGPR = std::min<unsigned>(MinGPR, BP);
+ HasGPSaveArea = true;
+ }
}
// General register save area starts right below the Floating-point
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 17bdd595da10..144aea850833 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -410,6 +410,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// To handle counter-based loop conditions.
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+
// Comparisons that require checking two conditions.
setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
@@ -8184,6 +8189,26 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return Flags;
}
+SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
+ SelectionDAG &DAG) const {
+ // SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to
+ // the beginning of the argument list.
+ int ArgStart = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1;
+ SDLoc DL(Op);
+ switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) {
+ case Intrinsic::ppc_cfence: {
+ assert(Subtarget.isPPC64() && "Only 64-bit is supported for now.");
+ return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other,
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64,
+ Op.getOperand(ArgStart + 1))),
+ 0);
+ }
+ default:
+ break;
+ }
+ return SDValue();
+}
+
SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -8649,6 +8674,9 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
// Frame & Return address.
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+
+ case ISD::INTRINSIC_VOID:
+ return LowerINTRINSIC_VOID(Op, DAG);
}
}
@@ -8753,12 +8781,19 @@ Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
- if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord))
+ if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
+ // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
+ // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
+ // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
+ if (isa<LoadInst>(Inst) && Subtarget.isPPC64())
+ return Builder.CreateCall(
+ Intrinsic::getDeclaration(
+ Builder.GetInsertBlock()->getParent()->getParent(),
+ Intrinsic::ppc_cfence, {Inst->getType()}),
+ {Inst});
+ // FIXME: Can use isync for rmw operation.
return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
- // FIXME: this is too conservative, a dependent branch + isync is enough.
- // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
- // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
- // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
+ }
return nullptr;
}
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 4fc744257262..acb77943b118 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -905,6 +905,7 @@ namespace llvm {
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index a8433919f0f3..a3f894c81a01 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -983,6 +983,10 @@ def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg),
[(set i64:$rD,
(PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>,
isPPC64;
+
+let isBarrier = 1, isPseudo = 1, Defs = [CR7], Itinerary = IIC_LdStSync in
+def CFENCE8 : Pseudo<(outs), (ins g8rc:$cr), "#CFENCE8", []>;
+
def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g),
(ADD8TLS $in, tglobaltlsaddr:$g)>;
def ADDIStlsgdHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 790a8902b3d2..3afcec1248d5 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -1873,6 +1873,8 @@ PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
}
bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ auto &MBB = *MI.getParent();
+ auto DL = MI.getDebugLoc();
switch (MI.getOpcode()) {
case TargetOpcode::LOAD_STACK_GUARD: {
assert(Subtarget.isTargetLinux() &&
@@ -1920,6 +1922,17 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(Opcode));
return true;
}
+ case PPC::CFENCE8: {
+ auto Val = MI.getOperand(0).getReg();
+ BuildMI(MBB, MI, DL, get(PPC::CMPW), PPC::CR7).addReg(Val).addReg(Val);
+ BuildMI(MBB, MI, DL, get(PPC::CTRL_DEP))
+ .addImm(PPC::PRED_NE_MINUS)
+ .addReg(PPC::CR7)
+ .addImm(1);
+ MI.setDesc(get(PPC::ISYNC));
+ MI.RemoveOperand(0);
+ return true;
+ }
}
return false;
}
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 1af5e7f28342..0766cfe4a987 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -1223,9 +1223,15 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
// FIXME: should be able to write a pattern for PPCcondbranch, but can't use
// a two-value operand where a dag node expects two operands. :(
let isCodeGenOnly = 1 in {
- def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst),
- "b${cond:cc}${cond:pm} ${cond:reg}, $dst"
- /*[(PPCcondbranch crrc:$crS, imm:$opc, bb:$dst)]*/>;
+ class BCC_class : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst),
+ "b${cond:cc}${cond:pm} ${cond:reg}, $dst"
+ /*[(PPCcondbranch crrc:$crS, imm:$opc, bb:$dst)]*/>;
+ def BCC : BCC_class;
+
+ // The same as BCC, except that it's not a terminator. Used for introducing
+ // control flow dependency without creating new blocks.
+ let isTerminator = 0 in def CTRL_DEP : BCC_class;
+
def BCCA : BForm<16, 1, 0, (outs), (ins pred:$cond, abscondbrtarget:$dst),
"b${cond:cc}a${cond:pm} ${cond:reg}, $dst">;
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index f56b238f91e6..6a3dc6799c43 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -325,6 +325,30 @@ int SystemZTTIImpl::getArithmeticInstrCost(
unsigned ScalarBits = Ty->getScalarSizeInBits();
+ // Div with a constant which is a power of 2 will be converted by
+ // DAGCombiner to use shifts. With vector shift-element instructions, a
+ // vector sdiv costs about as much as a scalar one.
+ const unsigned SDivCostEstimate = 4;
+ bool SDivPow2 = false;
+ bool UDivPow2 = false;
+ if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv) &&
+ Args.size() == 2) {
+ const ConstantInt *CI = nullptr;
+ if (const Constant *C = dyn_cast<Constant>(Args[1])) {
+ if (C->getType()->isVectorTy())
+ CI = dyn_cast_or_null<const ConstantInt>(C->getSplatValue());
+ else
+ CI = dyn_cast<const ConstantInt>(C);
+ }
+ if (CI != nullptr &&
+ (CI->getValue().isPowerOf2() || (-CI->getValue()).isPowerOf2())) {
+ if (Opcode == Instruction::SDiv)
+ SDivPow2 = true;
+ else
+ UDivPow2 = true;
+ }
+ }
+
if (Ty->isVectorTy()) {
assert (ST->hasVector() && "getArithmeticInstrCost() called with vector type.");
unsigned VF = Ty->getVectorNumElements();
@@ -333,10 +357,13 @@ int SystemZTTIImpl::getArithmeticInstrCost(
// These vector operations are custom handled, but are still supported
// with one instruction per vector, regardless of element size.
if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
- Opcode == Instruction::AShr) {
+ Opcode == Instruction::AShr || UDivPow2) {
return NumVectors;
}
+ if (SDivPow2)
+ return (NumVectors * SDivCostEstimate);
+
// These FP operations are supported with a single vector instruction for
// double (base implementation assumes float generally costs 2). For
// FP128, the scalar cost is 1, and there is no overhead since the values
@@ -395,6 +422,11 @@ int SystemZTTIImpl::getArithmeticInstrCost(
// 2 * ipm sequences ; xor ; shift ; compare
return 7;
+ if (UDivPow2)
+ return 1;
+ if (SDivPow2)
+ return SDivCostEstimate;
+
// An extra extension for narrow types is needed.
if ((Opcode == Instruction::SDiv || Opcode == Instruction::SRem))
// sext of op(s) for narrow types
diff --git a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
index 5fd4a8d1949e..ba39b6cdb568 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
@@ -140,8 +140,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
// Check if it's possible to reuse any of the used colors.
if (!MRI->isLiveIn(Old))
- for (int C(UsedColors.find_first()); C != -1;
- C = UsedColors.find_next(C)) {
+ for (unsigned C : UsedColors.set_bits()) {
if (MRI->getRegClass(SortedIntervals[C]->reg) != RC)
continue;
for (LiveInterval *OtherLI : Assignments[C])
diff --git a/lib/Target/WebAssembly/known_gcc_test_failures.txt b/lib/Target/WebAssembly/known_gcc_test_failures.txt
index 8e8e5fd1eff1..54619589c341 100644
--- a/lib/Target/WebAssembly/known_gcc_test_failures.txt
+++ b/lib/Target/WebAssembly/known_gcc_test_failures.txt
@@ -33,9 +33,6 @@ built-in-setjmp.c
pr60003.c
# Error in the program / unsupported by Clang.
-scal-to-vec1.c
-scal-to-vec2.c
-scal-to-vec3.c
20000822-1.c
20010209-1.c
20010605-1.c
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 3a421fe77392..784c3a6557ff 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -235,8 +235,6 @@ def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
"LEA instruction needs inputs at AG stage">;
def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
"LEA instruction with certain arguments is slow">;
-def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
- "LEA instruction with 3 ops or certain registers is slow">;
def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
"INC and DEC instructions are slower than ADD and SUB">;
def FeatureSoftFloat
@@ -482,7 +480,6 @@ def SNBFeatures : ProcessorFeatures<[], [
FeatureXSAVE,
FeatureXSAVEOPT,
FeatureLAHFSAHF,
- FeatureSlow3OpsLEA,
FeatureFastScalarFSQRT,
FeatureFastSHLDRotate
]>;
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 9f649dad8bc0..2cd4c1a3e7b3 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -27,26 +27,20 @@
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
-namespace llvm {
-void initializeFixupLEAPassPass(PassRegistry &);
-}
-
-#define FIXUPLEA_DESC "X86 LEA Fixup"
-#define FIXUPLEA_NAME "x86-fixup-LEAs"
-
-#define DEBUG_TYPE FIXUPLEA_NAME
+#define DEBUG_TYPE "x86-fixup-LEAs"
STATISTIC(NumLEAs, "Number of LEA instructions created");
namespace {
class FixupLEAPass : public MachineFunctionPass {
enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
-
+ static char ID;
/// \brief Loop over all of the instructions in the basic block
/// replacing applicable instructions with LEA instructions,
/// where appropriate.
bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
+ StringRef getPassName() const override { return "X86 LEA Fixup"; }
/// \brief Given a machine register, look for the instruction
/// which writes it in the current basic block. If found,
@@ -68,22 +62,6 @@ class FixupLEAPass : public MachineFunctionPass {
void processInstructionForSLM(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI);
-
- /// \brief Given a LEA instruction which is unprofitable
- /// on SNB+ try to replace it with other instructions.
- /// According to Intel's Optimization Reference Manual:
- /// " For LEA instructions with three source operands and some specific
- /// situations, instruction latency has increased to 3 cycles, and must
- /// dispatch via port 1:
- /// - LEA that has all three source operands: base, index, and offset
- /// - LEA that uses base and index registers where the base is EBP, RBP,
- /// or R13
- /// - LEA that uses RIP relative addressing mode
- /// - LEA that uses 16-bit addressing mode "
- /// This function currently handles the first 2 cases only.
- MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI,
- MachineFunction::iterator MFI);
-
/// \brief Look for LEAs that add 1 to reg or subtract 1 from reg
/// and convert them to INC or DEC respectively.
bool fixupIncDec(MachineBasicBlock::iterator &I,
@@ -107,13 +85,7 @@ class FixupLEAPass : public MachineFunctionPass {
MachineBasicBlock::iterator &MBBI) const;
public:
- static char ID;
-
- StringRef getPassName() const override { return FIXUPLEA_DESC; }
-
- FixupLEAPass() : MachineFunctionPass(ID) {
- initializeFixupLEAPassPass(*PassRegistry::getPassRegistry());
- }
+ FixupLEAPass() : MachineFunctionPass(ID) {}
/// \brief Loop over all of the basic blocks,
/// replacing instructions by equivalent LEA instructions
@@ -132,11 +104,8 @@ private:
bool OptIncDec;
bool OptLEA;
};
-}
-
char FixupLEAPass::ID = 0;
-
-INITIALIZE_PASS(FixupLEAPass, FIXUPLEA_NAME, FIXUPLEA_DESC, false, false)
+}
MachineInstr *
FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
@@ -199,7 +168,7 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
MF = &Func;
const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize();
- OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA();
+ OptLEA = ST.LEAusesAG() || ST.slowLEA();
if (!OptLEA && !OptIncDec)
return false;
@@ -273,64 +242,9 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
return MachineBasicBlock::iterator();
}
-static inline bool isLEA(const int Opcode) {
- return Opcode == X86::LEA16r || Opcode == X86::LEA32r ||
- Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
-}
-
-static inline bool isInefficientLEAReg(unsigned int Reg) {
- return Reg == X86::EBP || Reg == X86::RBP || Reg == X86::R13;
-}
-
-static inline bool isRegOperand(const MachineOperand &Op) {
- return Op.isReg() && Op.getReg() != X86::NoRegister;
-}
-/// hasIneffecientLEARegs - LEA that uses base and index registers
-/// where the base is EBP, RBP, or R13
-static inline bool hasInefficientLEABaseReg(const MachineOperand &Base,
- const MachineOperand &Index) {
- return Base.isReg() && isInefficientLEAReg(Base.getReg()) &&
- isRegOperand(Index);
-}
-
-static inline bool hasLEAOffset(const MachineOperand &Offset) {
- return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal();
-}
-
-// LEA instruction that has all three operands: offset, base and index
-static inline bool isThreeOperandsLEA(const MachineOperand &Base,
- const MachineOperand &Index,
- const MachineOperand &Offset) {
- return isRegOperand(Base) && isRegOperand(Index) && hasLEAOffset(Offset);
-}
-
-static inline int getADDrrFromLEA(int LEAOpcode) {
- switch (LEAOpcode) {
- default:
- llvm_unreachable("Unexpected LEA instruction");
- case X86::LEA16r:
- return X86::ADD16rr;
- case X86::LEA32r:
- return X86::ADD32rr;
- case X86::LEA64_32r:
- case X86::LEA64r:
- return X86::ADD64rr;
- }
-}
-
-static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) {
- bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm());
- switch (LEAOpcode) {
- default:
- llvm_unreachable("Unexpected LEA instruction");
- case X86::LEA16r:
- return IsInt8 ? X86::ADD16ri8 : X86::ADD16ri;
- case X86::LEA32r:
- case X86::LEA64_32r:
- return IsInt8 ? X86::ADD32ri8 : X86::ADD32ri;
- case X86::LEA64r:
- return IsInt8 ? X86::ADD64ri8 : X86::ADD64ri32;
- }
+static inline bool isLEA(const int opcode) {
+ return opcode == X86::LEA16r || opcode == X86::LEA32r ||
+ opcode == X86::LEA64r || opcode == X86::LEA64_32r;
}
/// isLEASimpleIncOrDec - Does this LEA have one these forms:
@@ -423,8 +337,8 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p,
void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) {
MachineInstr &MI = *I;
- const int Opcode = MI.getOpcode();
- if (!isLEA(Opcode))
+ const int opcode = MI.getOpcode();
+ if (!isLEA(opcode))
return;
if (MI.getOperand(5).getReg() != 0 || !MI.getOperand(4).isImm() ||
!TII->isSafeToClobberEFLAGS(*MFI, I))
@@ -436,142 +350,53 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
return;
if (MI.getOperand(2).getImm() > 1)
return;
+ int addrr_opcode, addri_opcode;
+ switch (opcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
+ case X86::LEA16r:
+ addrr_opcode = X86::ADD16rr;
+ addri_opcode = X86::ADD16ri;
+ break;
+ case X86::LEA32r:
+ addrr_opcode = X86::ADD32rr;
+ addri_opcode = X86::ADD32ri;
+ break;
+ case X86::LEA64_32r:
+ case X86::LEA64r:
+ addrr_opcode = X86::ADD64rr;
+ addri_opcode = X86::ADD64ri32;
+ break;
+ }
DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
DEBUG(dbgs() << "FixLEA: Replaced by: ";);
MachineInstr *NewMI = nullptr;
+ const MachineOperand &Dst = MI.getOperand(0);
// Make ADD instruction for two registers writing to LEA's destination
if (SrcR1 != 0 && SrcR2 != 0) {
- const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(Opcode));
- const MachineOperand &Src = MI.getOperand(SrcR1 == DstR ? 3 : 1);
- NewMI =
- BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src);
+ const MachineOperand &Src1 = MI.getOperand(SrcR1 == DstR ? 1 : 3);
+ const MachineOperand &Src2 = MI.getOperand(SrcR1 == DstR ? 3 : 1);
+ NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addrr_opcode))
+ .add(Dst)
+ .add(Src1)
+ .add(Src2);
+ MFI->insert(I, NewMI);
DEBUG(NewMI->dump(););
}
// Make ADD instruction for immediate
if (MI.getOperand(4).getImm() != 0) {
- const MCInstrDesc &ADDri =
- TII->get(getADDriFromLEA(Opcode, MI.getOperand(4)));
const MachineOperand &SrcR = MI.getOperand(SrcR1 == DstR ? 1 : 3);
- NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR)
+ NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addri_opcode))
+ .add(Dst)
.add(SrcR)
.addImm(MI.getOperand(4).getImm());
+ MFI->insert(I, NewMI);
DEBUG(NewMI->dump(););
}
if (NewMI) {
MFI->erase(I);
- I = NewMI;
- }
-}
-
-MachineInstr *
-FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
- MachineFunction::iterator MFI) {
-
- const int LEAOpcode = MI.getOpcode();
- if (!isLEA(LEAOpcode))
- return nullptr;
-
- const MachineOperand &Dst = MI.getOperand(0);
- const MachineOperand &Base = MI.getOperand(1);
- const MachineOperand &Scale = MI.getOperand(2);
- const MachineOperand &Index = MI.getOperand(3);
- const MachineOperand &Offset = MI.getOperand(4);
- const MachineOperand &Segment = MI.getOperand(5);
-
- if (!(isThreeOperandsLEA(Base, Index, Offset) ||
- hasInefficientLEABaseReg(Base, Index)) ||
- !TII->isSafeToClobberEFLAGS(*MFI, MI) ||
- Segment.getReg() != X86::NoRegister)
- return nullptr;
-
- unsigned int DstR = Dst.getReg();
- unsigned int BaseR = Base.getReg();
- unsigned int IndexR = Index.getReg();
- unsigned SSDstR =
- (LEAOpcode == X86::LEA64_32r) ? getX86SubSuperRegister(DstR, 64) : DstR;
- bool IsScale1 = Scale.getImm() == 1;
- bool IsInefficientBase = isInefficientLEAReg(BaseR);
- bool IsInefficientIndex = isInefficientLEAReg(IndexR);
-
- // Skip these cases since it takes more than 2 instructions
- // to replace the LEA instruction.
- if (IsInefficientBase && SSDstR == BaseR && !IsScale1)
- return nullptr;
- if (LEAOpcode == X86::LEA64_32r && IsInefficientBase &&
- (IsInefficientIndex || !IsScale1))
- return nullptr;
-
- const DebugLoc DL = MI.getDebugLoc();
- const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode));
- const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset));
-
- DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump(););
- DEBUG(dbgs() << "FixLEA: Replaced by: ";);
-
- // First try to replace LEA with one or two (for the 3-op LEA case)
- // add instructions:
- // 1.lea (%base,%index,1), %base => add %index,%base
- // 2.lea (%base,%index,1), %index => add %base,%index
- if (IsScale1 && (DstR == BaseR || DstR == IndexR)) {
- const MachineOperand &Src = DstR == BaseR ? Index : Base;
- MachineInstr *NewMI =
- BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src);
- DEBUG(NewMI->dump(););
- // Create ADD instruction for the Offset in case of 3-Ops LEA.
- if (hasLEAOffset(Offset)) {
- NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
- DEBUG(NewMI->dump(););
- }
- return NewMI;
- }
- // If the base is inefficient try switching the index and base operands,
- // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction:
- // lea offset(%base,%index,scale),%dst =>
- // lea (%base,%index,scale); add offset,%dst
- if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) {
- MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
- .add(Dst)
- .add(IsInefficientBase ? Index : Base)
- .add(Scale)
- .add(IsInefficientBase ? Base : Index)
- .addImm(0)
- .add(Segment);
- DEBUG(NewMI->dump(););
- // Create ADD instruction for the Offset in case of 3-Ops LEA.
- if (hasLEAOffset(Offset)) {
- NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
- DEBUG(NewMI->dump(););
- }
- return NewMI;
- }
- // Handle the rest of the cases with inefficient base register:
- assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!");
- assert(IsInefficientBase && "efficient base should be handled already!");
-
- // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst
- if (IsScale1 && !hasLEAOffset(Offset)) {
- TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, Base.isKill());
- DEBUG(MI.getPrevNode()->dump(););
-
- MachineInstr *NewMI =
- BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index);
- DEBUG(NewMI->dump(););
- return NewMI;
+ I = static_cast<MachineBasicBlock::iterator>(NewMI);
}
- // lea offset(%base,%index,scale), %dst =>
- // lea offset( ,%index,scale), %dst; add %base,%dst
- MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
- .add(Dst)
- .addReg(0)
- .add(Scale)
- .add(Index)
- .add(Offset)
- .add(Segment);
- DEBUG(NewMI->dump(););
-
- NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base);
- DEBUG(NewMI->dump(););
- return NewMI;
}
bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
@@ -585,16 +410,8 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
if (OptLEA) {
if (MF.getSubtarget<X86Subtarget>().isSLM())
processInstructionForSLM(I, MFI);
-
- else {
- if (MF.getSubtarget<X86Subtarget>().slow3OpsLEA()) {
- if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) {
- MFI->erase(I);
- I = NewMI;
- }
- } else
- processInstruction(I, MFI);
- }
+ else
+ processInstruction(I, MFI);
}
}
return false;
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index de58d719acb4..5eb5ad52840a 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -19,6 +19,7 @@
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -72,6 +73,9 @@ private:
bool selectCmp(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
+ bool selectUadde(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+
const X86TargetMachine &TM;
const X86Subtarget &STI;
const X86InstrInfo &TII;
@@ -243,6 +247,8 @@ bool X86InstructionSelector::select(MachineInstr &I) const {
return true;
if (selectCmp(I, MRI, MF))
return true;
+ if (selectUadde(I, MRI, MF))
+ return true;
return false;
}
@@ -564,6 +570,66 @@ bool X86InstructionSelector::selectCmp(MachineInstr &I,
return true;
}
+bool X86InstructionSelector::selectUadde(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ if (I.getOpcode() != TargetOpcode::G_UADDE)
+ return false;
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned CarryOutReg = I.getOperand(1).getReg();
+ const unsigned Op0Reg = I.getOperand(2).getReg();
+ const unsigned Op1Reg = I.getOperand(3).getReg();
+ unsigned CarryInReg = I.getOperand(4).getReg();
+
+ const LLT DstTy = MRI.getType(DstReg);
+
+ if (DstTy != LLT::scalar(32))
+ return false;
+
+ // find CarryIn def instruction.
+ MachineInstr *Def = MRI.getVRegDef(CarryInReg);
+ while (Def->getOpcode() == TargetOpcode::G_TRUNC) {
+ CarryInReg = Def->getOperand(1).getReg();
+ Def = MRI.getVRegDef(CarryInReg);
+ }
+
+ unsigned Opcode;
+ if (Def->getOpcode() == TargetOpcode::G_UADDE) {
+ // carry set by prev ADD.
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), X86::EFLAGS)
+ .addReg(CarryInReg);
+
+ if (!RBI.constrainGenericRegister(CarryInReg, X86::GR32RegClass, MRI))
+ return false;
+
+ Opcode = X86::ADC32rr;
+ } else if (auto val = getConstantVRegVal(CarryInReg, MRI)) {
+ // carry is constant, support only 0.
+ if (*val != 0)
+ return false;
+
+ Opcode = X86::ADD32rr;
+ } else
+ return false;
+
+ MachineInstr &AddInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode), DstReg)
+ .addReg(Op0Reg)
+ .addReg(Op1Reg);
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), CarryOutReg)
+ .addReg(X86::EFLAGS);
+
+ if (!constrainSelectedInstRegOperands(AddInst, TII, TRI, RBI) ||
+ !RBI.constrainGenericRegister(CarryOutReg, X86::GR32RegClass, MRI))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+}
+
InstructionSelector *
llvm::createX86InstructionSelector(const X86TargetMachine &TM,
X86Subtarget &Subtarget,
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index cf26238c0239..8ce240714f17 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -59,6 +59,11 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
for (auto Ty : {s8, s16, s32})
setAction({BinOp, Ty}, Legal);
+ for (unsigned Op : {G_UADDE}) {
+ setAction({Op, s32}, Legal);
+ setAction({Op, 1, s1}, Legal);
+ }
+
for (unsigned MemOp : {G_LOAD, G_STORE}) {
for (auto Ty : {s8, s16, s32, p0})
setAction({MemOp, Ty}, Legal);
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 02be95e2e556..de1514243aeb 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -253,11 +253,6 @@ protected:
/// True if the LEA instruction with certain arguments is slow
bool SlowLEA;
- /// True if the LEA instruction has all three source operands: base, index,
- /// and offset or if the LEA instruction uses base and index registers where
- /// the base is EBP, RBP,or R13
- bool Slow3OpsLEA;
-
/// True if INC and DEC instructions are slow when writing to flags
bool SlowIncDec;
@@ -495,7 +490,6 @@ public:
bool callRegIndirect() const { return CallRegIndirect; }
bool LEAusesAG() const { return LEAUsesAG; }
bool slowLEA() const { return SlowLEA; }
- bool slow3OpsLEA() const { return Slow3OpsLEA; }
bool slowIncDec() const { return SlowIncDec; }
bool hasCDI() const { return HasCDI; }
bool hasPFI() const { return HasPFI; }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index c6a90725d89c..9a82e6e50463 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -61,7 +61,6 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
namespace llvm {
void initializeWinEHStatePassPass(PassRegistry &);
-void initializeFixupLEAPassPass(PassRegistry &);
void initializeX86ExecutionDepsFixPass(PassRegistry &);
} // end namespace llvm
@@ -76,7 +75,6 @@ extern "C" void LLVMInitializeX86Target() {
initializeWinEHStatePassPass(PR);
initializeFixupBWInstPassPass(PR);
initializeEvexToVexInstPassPass(PR);
- initializeFixupLEAPassPass(PR);
initializeX86ExecutionDepsFixPass(PR);
}
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 80e18161a94b..8566bd91c89e 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1392,6 +1392,16 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
// CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
// CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
// CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
+ static const CostTblEntry AVX512BWCostTbl[] = {
+ { ISD::BITREVERSE, MVT::v8i64, 5 },
+ { ISD::BITREVERSE, MVT::v16i32, 5 },
+ { ISD::BITREVERSE, MVT::v32i16, 5 },
+ { ISD::BITREVERSE, MVT::v64i8, 5 },
+ };
+ static const CostTblEntry AVX512CostTbl[] = {
+ { ISD::BITREVERSE, MVT::v8i64, 36 },
+ { ISD::BITREVERSE, MVT::v16i32, 24 },
+ };
static const CostTblEntry XOPCostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 4 },
{ ISD::BITREVERSE, MVT::v8i32, 4 },
@@ -1550,6 +1560,14 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
MVT MTy = LT.second;
// Attempt to lookup cost.
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
if (ST->hasXOP())
if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
return LT.first * Entry->Cost;
diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp
index 4480220f2cd4..417d57f7625b 100644
--- a/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -347,6 +347,27 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape,
return FrameTy;
}
+// We need to make room to insert a spill after initial PHIs, but before
+// catchswitch instruction. Placing it before violates the requirement that
+// catchswitch, like all other EHPads must be the first nonPHI in a block.
+//
+// Split away catchswitch into a separate block and insert in its place:
+//
+// cleanuppad <InsertPt> cleanupret.
+//
+// cleanupret instruction will act as an insert point for the spill.
+static Instruction *splitBeforeCatchSwitch(CatchSwitchInst *CatchSwitch) {
+ BasicBlock *CurrentBlock = CatchSwitch->getParent();
+ BasicBlock *NewBlock = CurrentBlock->splitBasicBlock(CatchSwitch);
+ CurrentBlock->getTerminator()->eraseFromParent();
+
+ auto *CleanupPad =
+ CleanupPadInst::Create(CatchSwitch->getParentPad(), {}, "", CurrentBlock);
+ auto *CleanupRet =
+ CleanupReturnInst::Create(CleanupPad, NewBlock, CurrentBlock);
+ return CleanupRet;
+}
+
// Replace all alloca and SSA values that are accessed across suspend points
// with GetElementPointer from coroutine frame + loads and stores. Create an
// AllocaSpillBB that will become the new entry block for the resume parts of
@@ -437,8 +458,11 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
InsertPt = NewBB->getTerminator();
} else if (dyn_cast<PHINode>(CurrentValue)) {
// Skip the PHINodes and EH pads instructions.
- InsertPt =
- &*cast<Instruction>(E.def())->getParent()->getFirstInsertionPt();
+ BasicBlock *DefBlock = cast<Instruction>(E.def())->getParent();
+ if (auto *CSI = dyn_cast<CatchSwitchInst>(DefBlock->getTerminator()))
+ InsertPt = splitBeforeCatchSwitch(CSI);
+ else
+ InsertPt = &*DefBlock->getFirstInsertionPt();
} else {
// For all other values, the spill is placed immediately after
// the definition.
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index 1424f61fe701..f88a2c6acc3f 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -74,6 +74,27 @@ static inline unsigned getComplexity(Value *V) {
return isa<Constant>(V) ? (isa<UndefValue>(V) ? 0 : 1) : 2;
}
+/// Predicate canonicalization reduces the number of patterns that need to be
+/// matched by other transforms. For example, we may swap the operands of a
+/// conditional branch or select to create a compare with a canonical (inverted)
+/// predicate which is then more likely to be matched with other values.
+static inline bool isCanonicalPredicate(CmpInst::Predicate Pred) {
+ switch (Pred) {
+ case CmpInst::ICMP_NE:
+ case CmpInst::ICMP_ULE:
+ case CmpInst::ICMP_SLE:
+ case CmpInst::ICMP_UGE:
+ case CmpInst::ICMP_SGE:
+ // TODO: There are 16 FCMP predicates. Should others be (not) canonical?
+ case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_OGE:
+ return false;
+ default:
+ return true;
+ }
+}
+
/// \brief Add one to a Constant
static inline Constant *AddOne(Constant *C) {
return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1));
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 65b1148cb03b..7ed9fd566b37 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2210,37 +2210,17 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
return &BI;
}
- // Canonicalize fcmp_one -> fcmp_oeq
- FCmpInst::Predicate FPred; Value *Y;
- if (match(&BI, m_Br(m_OneUse(m_FCmp(FPred, m_Value(X), m_Value(Y))),
- TrueDest, FalseDest))) {
- // TODO: Why are we only transforming these 3 predicates?
- if (FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE ||
- FPred == FCmpInst::FCMP_OGE) {
- FCmpInst *Cond = cast<FCmpInst>(BI.getCondition());
- Cond->setPredicate(FCmpInst::getInversePredicate(FPred));
-
- // Swap Destinations and condition.
- BI.swapSuccessors();
- Worklist.Add(Cond);
- return &BI;
- }
- }
-
- // Canonicalize icmp_ne -> icmp_eq
- ICmpInst::Predicate IPred;
- if (match(&BI, m_Br(m_OneUse(m_ICmp(IPred, m_Value(X), m_Value(Y))),
- TrueDest, FalseDest))) {
- if (IPred == ICmpInst::ICMP_NE || IPred == ICmpInst::ICMP_ULE ||
- IPred == ICmpInst::ICMP_SLE || IPred == ICmpInst::ICMP_UGE ||
- IPred == ICmpInst::ICMP_SGE) {
- ICmpInst *Cond = cast<ICmpInst>(BI.getCondition());
- Cond->setPredicate(ICmpInst::getInversePredicate(IPred));
- // Swap Destinations and condition.
- BI.swapSuccessors();
- Worklist.Add(Cond);
- return &BI;
- }
+ // Canonicalize, for example, icmp_ne -> icmp_eq or fcmp_one -> fcmp_oeq.
+ CmpInst::Predicate Pred;
+ if (match(&BI, m_Br(m_OneUse(m_Cmp(Pred, m_Value(), m_Value())), TrueDest,
+ FalseDest)) &&
+ !isCanonicalPredicate(Pred)) {
+ // Swap destinations and condition.
+ CmpInst *Cond = cast<CmpInst>(BI.getCondition());
+ Cond->setPredicate(CmpInst::getInversePredicate(Pred));
+ BI.swapSuccessors();
+ Worklist.Add(Cond);
+ return &BI;
}
return nullptr;
@@ -3053,7 +3033,10 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
}
}
- InstrsForInstCombineWorklist.push_back(Inst);
+ // Skip processing debug intrinsics in InstCombine. Processing these call instructions
+ // consumes non-trivial amount of time and provides no value for the optimization.
+ if (!isa<DbgInfoIntrinsic>(Inst))
+ InstrsForInstCombineWorklist.push_back(Inst);
}
// Recursively visit successors. If this is a branch or switch on a
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 340c81fed0fd..37b9c4b1094e 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -546,7 +546,7 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
// If there are escaping uses of invariant.start instruction, the load maybe
// non-invariant.
if (!II || II->getIntrinsicID() != Intrinsic::invariant_start ||
- II->hasNUsesOrMore(1))
+ !II->use_empty())
continue;
unsigned InvariantSizeInBits =
cast<ConstantInt>(II->getArgOperand(0))->getSExtValue() * 8;
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 6693a26e8890..cb6223b070a6 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1292,13 +1292,15 @@ bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
BasicBlock *PH = CurLoop->getLoopPreheader();
Value *InitX = PhiX->getIncomingValueForBlock(PH);
// If we check X != 0 before entering the loop we don't need a zero
- // check in CTLZ intrinsic.
- if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
- if (BranchInst *PreCondBr =
- dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
- if (matchCondition(PreCondBr, PH) == InitX)
- ZeroCheck = true;
- }
+ // check in CTLZ intrinsic, but only if Cnt Phi is not used outside of the
+ // loop (if it is used we count CTLZ(X >> 1)).
+ if (!IsCntPhiUsedOutsideLoop)
+ if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
+ if (BranchInst *PreCondBr =
+ dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
+ if (matchCondition(PreCondBr, PH) == InitX)
+ ZeroCheck = true;
+ }
// Check if CTLZ intrinsic is profitable. Assume it is always profitable
// if we delete the loop (the loop has only 6 instructions):
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index ccedb98d7fa1..bd1f21c69eba 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -3902,8 +3902,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
// Compute the difference between the two.
int64_t Imm = (uint64_t)JImm - M->first;
- for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
- LUIdx = UsedByIndices.find_next(LUIdx))
+ for (unsigned LUIdx : UsedByIndices.set_bits())
// Make a memo of this use, offset, and register tuple.
if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index 5e0a705782ea..0e7572f8d2e5 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -642,6 +642,7 @@ private:
void updateProcessedCount(Value *V);
void verifyMemoryCongruency() const;
void verifyIterationSettled(Function &F);
+ void verifyStoreExpressions() const;
bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const;
BasicBlock *getBlockForValue(Value *V) const;
void deleteExpression(const Expression *E) const;
@@ -2003,7 +2004,6 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
// If it's not a memory use, set the MemoryAccess equivalence
auto *InstMA = dyn_cast_or_null<MemoryDef>(MSSA->getMemoryAccess(I));
- bool InstWasMemoryLeader = InstMA && OldClass->getMemoryLeader() == InstMA;
if (InstMA)
moveMemoryToNewCongruenceClass(I, InstMA, OldClass, NewClass);
ValueToClass[I] = NewClass;
@@ -2029,31 +2029,6 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
if (OldClass->getStoredValue())
OldClass->setStoredValue(nullptr);
}
- // If we destroy the old access leader and it's a store, we have to
- // effectively destroy the congruence class. When it comes to scalars,
- // anything with the same value is as good as any other. That means that
- // one leader is as good as another, and as long as you have some leader for
- // the value, you are good.. When it comes to *memory states*, only one
- // particular thing really represents the definition of a given memory
- // state. Once it goes away, we need to re-evaluate which pieces of memory
- // are really still equivalent. The best way to do this is to re-value
- // number things. The only way to really make that happen is to destroy the
- // rest of the class. In order to effectively destroy the class, we reset
- // ExpressionToClass for each by using the ValueToExpression mapping. The
- // members later get marked as touched due to the leader change. We will
- // create new congruence classes, and the pieces that are still equivalent
- // will end back together in a new class. If this becomes too expensive, it
- // is possible to use a versioning scheme for the congruence classes to
- // avoid the expressions finding this old class. Note that the situation is
- // different for memory phis, becuase they are evaluated anew each time, and
- // they become equal not by hashing, but by seeing if all operands are the
- // same (or only one is reachable).
- if (OldClass->getStoreCount() > 0 && InstWasMemoryLeader) {
- DEBUG(dbgs() << "Kicking everything out of class " << OldClass->getID()
- << " because MemoryAccess leader changed");
- for (auto Member : *OldClass)
- ExpressionToClass.erase(ValueToExpression.lookup(Member));
- }
OldClass->setLeader(getNextValueLeader(OldClass));
OldClass->resetNextLeader();
markValueLeaderChangeTouched(OldClass);
@@ -2062,7 +2037,6 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
// Perform congruence finding on a given value numbering expression.
void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
- ValueToExpression[I] = E;
// This is guaranteed to return something, since it will at least find
// TOP.
@@ -2132,6 +2106,18 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
if (auto *CI = dyn_cast<CmpInst>(I))
markPredicateUsersTouched(CI);
}
+ // If we changed the class of the store, we want to ensure nothing finds the
+ // old store expression. In particular, loads do not compare against stored
+ // value, so they will find old store expressions (and associated class
+ // mappings) if we leave them in the table.
+ if (ClassChanged && isa<StoreExpression>(E)) {
+ auto *OldE = ValueToExpression.lookup(I);
+ // It could just be that the old class died. We don't want to erase it if we
+ // just moved classes.
+ if (OldE && isa<StoreExpression>(OldE) && !OldE->equals(*E))
+ ExpressionToClass.erase(OldE);
+ }
+ ValueToExpression[I] = E;
}
// Process the fact that Edge (from, to) is reachable, including marking
@@ -2651,6 +2637,30 @@ void NewGVN::verifyIterationSettled(Function &F) {
#endif
}
+// Verify that for each store expression in the expression to class mapping,
+// only the latest appears, and multiple ones do not appear.
+// Because loads do not use the stored value when doing equality with stores,
+// if we don't erase the old store expressions from the table, a load can find
+// a no-longer valid StoreExpression.
+void NewGVN::verifyStoreExpressions() const {
+#ifndef NDEBUG
+ DenseSet<std::pair<const Value *, const Value *>> StoreExpressionSet;
+ for (const auto &KV : ExpressionToClass) {
+ if (auto *SE = dyn_cast<StoreExpression>(KV.first)) {
+ // Make sure a version that will conflict with loads is not already there
+ auto Res =
+ StoreExpressionSet.insert({SE->getOperand(0), SE->getMemoryLeader()});
+ assert(Res.second &&
+ "Stored expression conflict exists in expression table");
+ auto *ValueExpr = ValueToExpression.lookup(SE->getStoreInst());
+ assert(ValueExpr && ValueExpr->equals(*SE) &&
+ "StoreExpression in ExpressionToClass is not latest "
+ "StoreExpression for value");
+ }
+ }
+#endif
+}
+
// This is the main value numbering loop, it iterates over the initial touched
// instruction set, propagating value numbers, marking things touched, etc,
// until the set of touched instructions is completely empty.
@@ -2668,8 +2678,7 @@ void NewGVN::iterateTouchedInstructions() {
// TODO: As we hit a new block, we should push and pop equalities into a
// table lookupOperandLeader can use, to catch things PredicateInfo
// might miss, like edge-only equivalences.
- for (int InstrNum = TouchedInstructions.find_first(); InstrNum != -1;
- InstrNum = TouchedInstructions.find_next(InstrNum)) {
+ for (unsigned InstrNum : TouchedInstructions.set_bits()) {
// This instruction was found to be dead. We don't bother looking
// at it again.
@@ -2776,6 +2785,7 @@ bool NewGVN::runGVN() {
iterateTouchedInstructions();
verifyMemoryCongruency();
verifyIterationSettled(F);
+ verifyStoreExpressions();
Changed |= eliminateInstructions(F);
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index ef29d4141600..53320bff0883 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -1922,7 +1922,7 @@ Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) {
// User must be a binary operator with one or more uses.
Instruction *User = I->user_back();
- if (!isa<BinaryOperator>(User) || !User->hasNUsesOrMore(1))
+ if (!isa<BinaryOperator>(User) || User->use_empty())
return nullptr;
unsigned UserOpcode = User->getOpcode();
diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 4f608c97147d..b32a61a7e8f8 100644
--- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -1,4 +1,4 @@
-//===-- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow --------===//
+//===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,25 +7,41 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GenericDomTree.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
#define DEBUG_TYPE "simple-loop-unswitch"
@@ -174,7 +190,7 @@ static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
// When the loop exit is directly unswitched we just need to update the
// incoming basic block. We loop to handle weird cases with repeated
// incoming blocks, but expect to typically only have one operand here.
- for (auto i : llvm::seq<int>(0, PN->getNumOperands())) {
+ for (auto i : seq<int>(0, PN->getNumOperands())) {
assert(PN->getIncomingBlock(i) == &OldExitingBB &&
"Found incoming block different from unique predecessor!");
PN->setIncomingBlock(i, &OldPH);
@@ -688,9 +704,11 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
}
namespace {
+
class SimpleLoopUnswitchLegacyPass : public LoopPass {
public:
static char ID; // Pass ID, replacement for typeid
+
explicit SimpleLoopUnswitchLegacyPass() : LoopPass(ID) {
initializeSimpleLoopUnswitchLegacyPassPass(
*PassRegistry::getPassRegistry());
@@ -703,7 +721,8 @@ public:
getLoopAnalysisUsage(AU);
}
};
-} // namespace
+
+} // end anonymous namespace
bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
if (skipLoop(L))
diff --git a/test/Analysis/CostModel/SystemZ/div-pow2.ll b/test/Analysis/CostModel/SystemZ/div-pow2.ll
new file mode 100644
index 000000000000..9ef2dd71e8fa
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/div-pow2.ll
@@ -0,0 +1,154 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+
+; Scalar sdiv
+
+define i64 @fun0(i64 %a) {
+ %r = sdiv i64 %a, 2
+ ret i64 %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i64 %a, 2
+}
+
+define i64 @fun1(i64 %a) {
+ %r = sdiv i64 %a, -4
+ ret i64 %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i64 %a, -4
+}
+
+define i32 @fun2(i32 %a) {
+ %r = sdiv i32 %a, 8
+ ret i32 %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i32 %a, 8
+}
+
+define i32 @fun3(i32 %a) {
+ %r = sdiv i32 %a, -16
+ ret i32 %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i32 %a, -16
+}
+
+define i16 @fun4(i16 %a) {
+ %r = sdiv i16 %a, 32
+ ret i16 %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i16 %a, 32
+}
+
+define i16 @fun5(i16 %a) {
+ %r = sdiv i16 %a, -64
+ ret i16 %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i16 %a, -64
+}
+
+define i8 @fun6(i8 %a) {
+ %r = sdiv i8 %a, 64
+ ret i8 %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i8 %a, 64
+}
+
+define i8 @fun7(i8 %a) {
+ %r = sdiv i8 %a, -128
+ ret i8 %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i8 %a, -128
+}
+
+
+; Vector sdiv
+
+define <2 x i64> @fun8(<2 x i64> %a) {
+ %r = sdiv <2 x i64> %a, <i64 2, i64 2>
+ ret <2 x i64> %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <2 x i64> %a, <i64 2, i64 2>
+}
+
+define <2 x i64> @fun9(<2 x i64> %a) {
+ %r = sdiv <2 x i64> %a, <i64 -4, i64 -4>
+ ret <2 x i64> %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <2 x i64> %a, <i64 -4, i64 -4>
+}
+
+define <4 x i32> @fun10(<4 x i32> %a) {
+ %r = sdiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+ ret <4 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+}
+
+define <4 x i32> @fun11(<4 x i32> %a) {
+ %r = sdiv <4 x i32> %a, <i32 -16, i32 -16, i32 -16, i32 -16>
+ ret <4 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <4 x i32> %a, <i32 -16
+}
+
+define <8 x i16> @fun12(<8 x i16> %a) {
+ %r = sdiv <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+ ret <8 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <8 x i16> %a, <i16 32
+}
+
+define <8 x i16> @fun13(<8 x i16> %a) {
+ %r = sdiv <8 x i16> %a, <i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64>
+ ret <8 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <8 x i16> %a, <i16 -64
+}
+
+define <16 x i8> @fun14(<16 x i8> %a) {
+ %r = sdiv <16 x i8> %a, <i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64>
+ ret <16 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <16 x i8> %a, <i8 64
+}
+
+define <16 x i8> @fun15(<16 x i8> %a) {
+ %r = sdiv <16 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+ ret <16 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <16 x i8> %a, <i8 -128
+}
+
+; Scalar udiv
+
+define i64 @fun16(i64 %a) {
+ %r = udiv i64 %a, 2
+ ret i64 %r
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %r = udiv i64 %a, 2
+}
+
+define i32 @fun17(i32 %a) {
+ %r = udiv i32 %a, 8
+ ret i32 %r
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %r = udiv i32 %a, 8
+}
+
+define i16 @fun18(i16 %a) {
+ %r = udiv i16 %a, 32
+ ret i16 %r
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %r = udiv i16 %a, 32
+}
+
+define i8 @fun19(i8 %a) {
+ %r = udiv i8 %a, 128
+ ret i8 %r
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %r = udiv i8 %a, -128
+}
+
+; Vector udiv
+
+define <2 x i64> @fun20(<2 x i64> %a) {
+ %r = udiv <2 x i64> %a, <i64 2, i64 2>
+ ret <2 x i64> %r
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %r = udiv <2 x i64> %a, <i64 2
+}
+
+define <4 x i32> @fun21(<4 x i32> %a) {
+ %r = udiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+ ret <4 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %r = udiv <4 x i32> %a, <i32 8
+}
+
+define <8 x i16> @fun22(<8 x i16> %a) {
+ %r = udiv <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+ ret <8 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %r = udiv <8 x i16> %a, <i16 32
+}
+
+define <16 x i8> @fun23(<16 x i8> %a) {
+ %r = udiv <16 x i8> %a, <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>
+ ret <16 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %r = udiv <16 x i8> %a, <i8 -128
+}
diff --git a/test/Analysis/CostModel/X86/bitreverse.ll b/test/Analysis/CostModel/X86/bitreverse.ll
index 8d5e1421eb82..9321b7323b57 100644
--- a/test/Analysis/CostModel/X86/bitreverse.ll
+++ b/test/Analysis/CostModel/X86/bitreverse.ll
@@ -2,10 +2,14 @@
; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=X86 -check-prefix=SSE42
; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=X86 -check-prefix=AVX
; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=X86 -check-prefix=AVX2
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -mcpu=knl -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=X86 -check-prefix=AVX512 -check-prefix=AVX512F
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -mcpu=skx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=X86 -check-prefix=AVX512 -check-prefix=AVX512BW
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=pentium4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=X64 -check-prefix=SSE2
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=X64 -check-prefix=SSE42
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=X64 -check-prefix=AVX
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=X64 -check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=knl -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=X64 -check-prefix=AVX512 -check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=X64 -check-prefix=AVX512 -check-prefix=AVX512BW
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX2
@@ -64,12 +68,18 @@ declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>)
declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>)
declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>)
+declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>)
+declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>)
+declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>)
+declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>)
+
define <2 x i64> @var_bitreverse_v2i64(<2 x i64> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_v2i64':
; SSE2: Found an estimated cost of 29 for instruction: %bitreverse
; SSE42: Found an estimated cost of 5 for instruction: %bitreverse
; AVX: Found an estimated cost of 5 for instruction: %bitreverse
; AVX2: Found an estimated cost of 5 for instruction: %bitreverse
+; AVX512: Found an estimated cost of 5 for instruction: %bitreverse
; XOP: Found an estimated cost of 1 for instruction: %bitreverse
%bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
ret <2 x i64> %bitreverse
@@ -81,17 +91,32 @@ define <4 x i64> @var_bitreverse_v4i64(<4 x i64> %a) {
; SSE42: Found an estimated cost of 10 for instruction: %bitreverse
; AVX: Found an estimated cost of 12 for instruction: %bitreverse
; AVX2: Found an estimated cost of 5 for instruction: %bitreverse
+; AVX512: Found an estimated cost of 5 for instruction: %bitreverse
; XOP: Found an estimated cost of 4 for instruction: %bitreverse
%bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
ret <4 x i64> %bitreverse
}
+define <8 x i64> @var_bitreverse_v8i64(<8 x i64> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_v8i64':
+; SSE2: Found an estimated cost of 116 for instruction: %bitreverse
+; SSE42: Found an estimated cost of 20 for instruction: %bitreverse
+; AVX: Found an estimated cost of 24 for instruction: %bitreverse
+; AVX2: Found an estimated cost of 10 for instruction: %bitreverse
+; AVX512F: Found an estimated cost of 36 for instruction: %bitreverse
+; AVX512BW: Found an estimated cost of 5 for instruction: %bitreverse
+; XOP: Found an estimated cost of 8 for instruction: %bitreverse
+ %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
+ ret <8 x i64> %bitreverse
+}
+
define <4 x i32> @var_bitreverse_v4i32(<4 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_v4i32':
; SSE2: Found an estimated cost of 27 for instruction: %bitreverse
; SSE42: Found an estimated cost of 5 for instruction: %bitreverse
; AVX: Found an estimated cost of 5 for instruction: %bitreverse
; AVX2: Found an estimated cost of 5 for instruction: %bitreverse
+; AVX512: Found an estimated cost of 5 for instruction: %bitreverse
; XOP: Found an estimated cost of 1 for instruction: %bitreverse
%bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
ret <4 x i32> %bitreverse
@@ -103,17 +128,32 @@ define <8 x i32> @var_bitreverse_v8i32(<8 x i32> %a) {
; SSE42: Found an estimated cost of 10 for instruction: %bitreverse
; AVX: Found an estimated cost of 12 for instruction: %bitreverse
; AVX2: Found an estimated cost of 5 for instruction: %bitreverse
+; AVX512: Found an estimated cost of 5 for instruction: %bitreverse
; XOP: Found an estimated cost of 4 for instruction: %bitreverse
%bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
ret <8 x i32> %bitreverse
}
+define <16 x i32> @var_bitreverse_v16i32(<16 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_v16i32':
+; SSE2: Found an estimated cost of 108 for instruction: %bitreverse
+; SSE42: Found an estimated cost of 20 for instruction: %bitreverse
+; AVX: Found an estimated cost of 24 for instruction: %bitreverse
+; AVX2: Found an estimated cost of 10 for instruction: %bitreverse
+; AVX512F: Found an estimated cost of 24 for instruction: %bitreverse
+; AVX512BW: Found an estimated cost of 5 for instruction: %bitreverse
+; XOP: Found an estimated cost of 8 for instruction: %bitreverse
+ %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
+ ret <16 x i32> %bitreverse
+}
+
define <8 x i16> @var_bitreverse_v8i16(<8 x i16> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_v8i16':
; SSE2: Found an estimated cost of 27 for instruction: %bitreverse
; SSE42: Found an estimated cost of 5 for instruction: %bitreverse
; AVX: Found an estimated cost of 5 for instruction: %bitreverse
; AVX2: Found an estimated cost of 5 for instruction: %bitreverse
+; AVX512: Found an estimated cost of 5 for instruction: %bitreverse
; XOP: Found an estimated cost of 1 for instruction: %bitreverse
%bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
ret <8 x i16> %bitreverse
@@ -125,17 +165,32 @@ define <16 x i16> @var_bitreverse_v16i16(<16 x i16> %a) {
; SSE42: Found an estimated cost of 10 for instruction: %bitreverse
; AVX: Found an estimated cost of 12 for instruction: %bitreverse
; AVX2: Found an estimated cost of 5 for instruction: %bitreverse
+; AVX512: Found an estimated cost of 5 for instruction: %bitreverse
; XOP: Found an estimated cost of 4 for instruction: %bitreverse
%bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
ret <16 x i16> %bitreverse
}
+define <32 x i16> @var_bitreverse_v32i16(<32 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_v32i16':
+; SSE2: Found an estimated cost of 108 for instruction: %bitreverse
+; SSE42: Found an estimated cost of 20 for instruction: %bitreverse
+; AVX: Found an estimated cost of 24 for instruction: %bitreverse
+; AVX2: Found an estimated cost of 10 for instruction: %bitreverse
+; AVX512F: Found an estimated cost of 10 for instruction: %bitreverse
+; AVX512BW: Found an estimated cost of 5 for instruction: %bitreverse
+; XOP: Found an estimated cost of 8 for instruction: %bitreverse
+ %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
+ ret <32 x i16> %bitreverse
+}
+
define <16 x i8> @var_bitreverse_v16i8(<16 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_v16i8':
; SSE2: Found an estimated cost of 20 for instruction: %bitreverse
; SSE42: Found an estimated cost of 5 for instruction: %bitreverse
; AVX: Found an estimated cost of 5 for instruction: %bitreverse
; AVX2: Found an estimated cost of 5 for instruction: %bitreverse
+; AVX512: Found an estimated cost of 5 for instruction: %bitreverse
; XOP: Found an estimated cost of 1 for instruction: %bitreverse
%bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
ret <16 x i8> %bitreverse
@@ -147,7 +202,21 @@ define <32 x i8> @var_bitreverse_v32i8(<32 x i8> %a) {
; SSE42: Found an estimated cost of 10 for instruction: %bitreverse
; AVX: Found an estimated cost of 12 for instruction: %bitreverse
; AVX2: Found an estimated cost of 5 for instruction: %bitreverse
+; AVX512: Found an estimated cost of 5 for instruction: %bitreverse
; XOP: Found an estimated cost of 4 for instruction: %bitreverse
%bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
ret <32 x i8> %bitreverse
}
+
+define <64 x i8> @var_bitreverse_v64i8(<64 x i8> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_v64i8':
+; SSE2: Found an estimated cost of 80 for instruction: %bitreverse
+; SSE42: Found an estimated cost of 20 for instruction: %bitreverse
+; AVX: Found an estimated cost of 24 for instruction: %bitreverse
+; AVX2: Found an estimated cost of 10 for instruction: %bitreverse
+; AVX512F: Found an estimated cost of 10 for instruction: %bitreverse
+; AVX512BW: Found an estimated cost of 5 for instruction: %bitreverse
+; XOP: Found an estimated cost of 8 for instruction: %bitreverse
+ %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
+ ret <64 x i8> %bitreverse
+}
diff --git a/test/Analysis/CostModel/X86/ctbits-cost.ll b/test/Analysis/CostModel/X86/ctbits-cost.ll
deleted file mode 100644
index aaf092c7b1d7..000000000000
--- a/test/Analysis/CostModel/X86/ctbits-cost.ll
+++ /dev/null
@@ -1,587 +0,0 @@
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=pentium4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE2 -check-prefix=NOPOPCNT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE42 -check-prefix=POPCNT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1 -check-prefix=POPCNT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2 -check-prefix=POPCNT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1 -check-prefix=POPCNT
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2 -check-prefix=POPCNT
-
-; Verify the cost of scalar population count instructions.
-
-declare i64 @llvm.ctpop.i64(i64)
-declare i32 @llvm.ctpop.i32(i32)
-declare i16 @llvm.ctpop.i16(i16)
-declare i8 @llvm.ctpop.i8(i8)
-
-define i64 @var_ctpop_i64(i64 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctpop_i64':
-; NOPOPCNT: Found an estimated cost of 4 for instruction: %ctpop
-; POPCNT: Found an estimated cost of 1 for instruction: %ctpop
- %ctpop = call i64 @llvm.ctpop.i64(i64 %a)
- ret i64 %ctpop
-}
-
-define i32 @var_ctpop_i32(i32 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctpop_i32':
-; NOPOPCNT: Found an estimated cost of 4 for instruction: %ctpop
-; POPCNT: Found an estimated cost of 1 for instruction: %ctpop
- %ctpop = call i32 @llvm.ctpop.i32(i32 %a)
- ret i32 %ctpop
-}
-
-define i16 @var_ctpop_i16(i16 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctpop_i16':
-; NOPOPCNT: Found an estimated cost of 4 for instruction: %ctpop
-; POPCNT: Found an estimated cost of 1 for instruction: %ctpop
- %ctpop = call i16 @llvm.ctpop.i16(i16 %a)
- ret i16 %ctpop
-}
-
-define i8 @var_ctpop_i8(i8 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctpop_i8':
-; NOPOPCNT: Found an estimated cost of 4 for instruction: %ctpop
-; POPCNT: Found an estimated cost of 1 for instruction: %ctpop
- %ctpop = call i8 @llvm.ctpop.i8(i8 %a)
- ret i8 %ctpop
-}
-
-; Verify the cost of vector population count instructions.
-
-declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
-declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
-declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
-declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)
-
-declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>)
-declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
-declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>)
-declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>)
-
-define <2 x i64> @var_ctpop_v2i64(<2 x i64> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v2i64':
-; SSE2: Found an estimated cost of 12 for instruction: %ctpop
-; SSE42: Found an estimated cost of 7 for instruction: %ctpop
-; AVX: Found an estimated cost of 7 for instruction: %ctpop
- %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
- ret <2 x i64> %ctpop
-}
-
-define <4 x i64> @var_ctpop_v4i64(<4 x i64> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v4i64':
-; SSE2: Found an estimated cost of 24 for instruction: %ctpop
-; SSE42: Found an estimated cost of 14 for instruction: %ctpop
-; AVX1: Found an estimated cost of 16 for instruction: %ctpop
-; AVX2: Found an estimated cost of 7 for instruction: %ctpop
- %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
- ret <4 x i64> %ctpop
-}
-
-define <4 x i32> @var_ctpop_v4i32(<4 x i32> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v4i32':
-; SSE2: Found an estimated cost of 15 for instruction: %ctpop
-; SSE42: Found an estimated cost of 11 for instruction: %ctpop
-; AVX: Found an estimated cost of 11 for instruction: %ctpop
- %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
- ret <4 x i32> %ctpop
-}
-
-define <8 x i32> @var_ctpop_v8i32(<8 x i32> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v8i32':
-; SSE2: Found an estimated cost of 30 for instruction: %ctpop
-; SSE42: Found an estimated cost of 22 for instruction: %ctpop
-; AVX1: Found an estimated cost of 24 for instruction: %ctpop
-; AVX2: Found an estimated cost of 11 for instruction: %ctpop
- %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
- ret <8 x i32> %ctpop
-}
-
-define <8 x i16> @var_ctpop_v8i16(<8 x i16> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v8i16':
-; SSE2: Found an estimated cost of 13 for instruction: %ctpop
-; SSE42: Found an estimated cost of 9 for instruction: %ctpop
-; AVX: Found an estimated cost of 9 for instruction: %ctpop
- %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
- ret <8 x i16> %ctpop
-}
-
-define <16 x i16> @var_ctpop_v16i16(<16 x i16> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v16i16':
-; SSE2: Found an estimated cost of 26 for instruction: %ctpop
-; SSE42: Found an estimated cost of 18 for instruction: %ctpop
-; AVX1: Found an estimated cost of 20 for instruction: %ctpop
-; AVX2: Found an estimated cost of 9 for instruction: %ctpop
- %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a)
- ret <16 x i16> %ctpop
-}
-
-define <16 x i8> @var_ctpop_v16i8(<16 x i8> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v16i8':
-; SSE2: Found an estimated cost of 10 for instruction: %ctpop
-; SSE42: Found an estimated cost of 6 for instruction: %ctpop
-; AVX: Found an estimated cost of 6 for instruction: %ctpop
- %ctpop = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
- ret <16 x i8> %ctpop
-}
-
-define <32 x i8> @var_ctpop_v32i8(<32 x i8> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v32i8':
-; SSE2: Found an estimated cost of 20 for instruction: %ctpop
-; SSE42: Found an estimated cost of 12 for instruction: %ctpop
-; AVX1: Found an estimated cost of 14 for instruction: %ctpop
-; AVX2: Found an estimated cost of 6 for instruction: %ctpop
- %ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a)
- ret <32 x i8> %ctpop
-}
-
-; Verify the cost of scalar leading zero count instructions.
-
-declare i64 @llvm.ctlz.i64(i64, i1)
-declare i32 @llvm.ctlz.i32(i32, i1)
-declare i16 @llvm.ctlz.i16(i16, i1)
-declare i8 @llvm.ctlz.i8(i8, i1)
-
-define i64 @var_ctlz_i64(i64 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_i64':
-; CHECK: Found an estimated cost of 1 for instruction: %ctlz
- %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 0)
- ret i64 %ctlz
-}
-
-define i64 @var_ctlz_i64u(i64 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_i64u':
-; CHECK: Found an estimated cost of 1 for instruction: %ctlz
- %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 1)
- ret i64 %ctlz
-}
-
-define i32 @var_ctlz_i32(i32 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_i32':
-; CHECK: Found an estimated cost of 1 for instruction: %ctlz
- %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 0)
- ret i32 %ctlz
-}
-
-define i32 @var_ctlz_i32u(i32 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_i32u':
-; CHECK: Found an estimated cost of 1 for instruction: %ctlz
- %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 1)
- ret i32 %ctlz
-}
-
-define i16 @var_ctlz_i16(i16 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_i16':
-; CHECK: Found an estimated cost of 1 for instruction: %ctlz
- %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 0)
- ret i16 %ctlz
-}
-
-define i16 @var_ctlz_i16u(i16 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_i16u':
-; CHECK: Found an estimated cost of 1 for instruction: %ctlz
- %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 1)
- ret i16 %ctlz
-}
-
-define i8 @var_ctlz_i8(i8 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_i8':
-; CHECK: Found an estimated cost of 1 for instruction: %ctlz
- %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 0)
- ret i8 %ctlz
-}
-
-define i8 @var_ctlz_i8u(i8 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_i8u':
-; CHECK: Found an estimated cost of 1 for instruction: %ctlz
- %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 1)
- ret i8 %ctlz
-}
-
-; Verify the cost of vector leading zero count instructions.
-
-declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1)
-declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
-declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1)
-declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1)
-
-declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1)
-declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1)
-declare <16 x i16> @llvm.ctlz.v16i16(<16 x i16>, i1)
-declare <32 x i8> @llvm.ctlz.v32i8(<32 x i8>, i1)
-
-define <2 x i64> @var_ctlz_v2i64(<2 x i64> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v2i64':
-; SSE2: Found an estimated cost of 25 for instruction: %ctlz
-; SSE42: Found an estimated cost of 23 for instruction: %ctlz
-; AVX: Found an estimated cost of 23 for instruction: %ctlz
- %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 0)
- ret <2 x i64> %ctlz
-}
-
-define <2 x i64> @var_ctlz_v2i64u(<2 x i64> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v2i64u':
-; SSE2: Found an estimated cost of 25 for instruction: %ctlz
-; SSE42: Found an estimated cost of 23 for instruction: %ctlz
-; AVX: Found an estimated cost of 23 for instruction: %ctlz
- %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 1)
- ret <2 x i64> %ctlz
-}
-
-define <4 x i64> @var_ctlz_v4i64(<4 x i64> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i64':
-; SSE2: Found an estimated cost of 50 for instruction: %ctlz
-; SSE42: Found an estimated cost of 46 for instruction: %ctlz
-; AVX1: Found an estimated cost of 48 for instruction: %ctlz
-; AVX2: Found an estimated cost of 23 for instruction: %ctlz
- %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 0)
- ret <4 x i64> %ctlz
-}
-
-define <4 x i64> @var_ctlz_v4i64u(<4 x i64> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i64u':
-; SSE2: Found an estimated cost of 50 for instruction: %ctlz
-; SSE42: Found an estimated cost of 46 for instruction: %ctlz
-; AVX1: Found an estimated cost of 48 for instruction: %ctlz
-; AVX2: Found an estimated cost of 23 for instruction: %ctlz
- %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 1)
- ret <4 x i64> %ctlz
-}
-
-define <4 x i32> @var_ctlz_v4i32(<4 x i32> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i32':
-; SSE2: Found an estimated cost of 26 for instruction: %ctlz
-; SSE42: Found an estimated cost of 18 for instruction: %ctlz
-; AVX: Found an estimated cost of 18 for instruction: %ctlz
- %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 0)
- ret <4 x i32> %ctlz
-}
-
-define <4 x i32> @var_ctlz_v4i32u(<4 x i32> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i32u':
-; SSE2: Found an estimated cost of 26 for instruction: %ctlz
-; SSE42: Found an estimated cost of 18 for instruction: %ctlz
-; AVX: Found an estimated cost of 18 for instruction: %ctlz
- %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 1)
- ret <4 x i32> %ctlz
-}
-
-define <8 x i32> @var_ctlz_v8i32(<8 x i32> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i32':
-; SSE2: Found an estimated cost of 52 for instruction: %ctlz
-; SSE42: Found an estimated cost of 36 for instruction: %ctlz
-; AVX1: Found an estimated cost of 38 for instruction: %ctlz
-; AVX2: Found an estimated cost of 18 for instruction: %ctlz
- %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 0)
- ret <8 x i32> %ctlz
-}
-
-define <8 x i32> @var_ctlz_v8i32u(<8 x i32> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i32u':
-; SSE2: Found an estimated cost of 52 for instruction: %ctlz
-; SSE42: Found an estimated cost of 36 for instruction: %ctlz
-; AVX1: Found an estimated cost of 38 for instruction: %ctlz
-; AVX2: Found an estimated cost of 18 for instruction: %ctlz
- %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 1)
- ret <8 x i32> %ctlz
-}
-
-define <8 x i16> @var_ctlz_v8i16(<8 x i16> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i16':
-; SSE2: Found an estimated cost of 20 for instruction: %ctlz
-; SSE42: Found an estimated cost of 14 for instruction: %ctlz
-; AVX: Found an estimated cost of 14 for instruction: %ctlz
- %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 0)
- ret <8 x i16> %ctlz
-}
-
-define <8 x i16> @var_ctlz_v8i16u(<8 x i16> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i16u':
-; SSE2: Found an estimated cost of 20 for instruction: %ctlz
-; SSE42: Found an estimated cost of 14 for instruction: %ctlz
-; AVX: Found an estimated cost of 14 for instruction: %ctlz
- %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 1)
- ret <8 x i16> %ctlz
-}
-
-define <16 x i16> @var_ctlz_v16i16(<16 x i16> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i16':
-; SSE2: Found an estimated cost of 40 for instruction: %ctlz
-; SSE42: Found an estimated cost of 28 for instruction: %ctlz
-; AVX1: Found an estimated cost of 30 for instruction: %ctlz
-; AVX2: Found an estimated cost of 14 for instruction: %ctlz
- %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 0)
- ret <16 x i16> %ctlz
-}
-
-define <16 x i16> @var_ctlz_v16i16u(<16 x i16> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i16u':
-; SSE2: Found an estimated cost of 40 for instruction: %ctlz
-; SSE42: Found an estimated cost of 28 for instruction: %ctlz
-; AVX1: Found an estimated cost of 30 for instruction: %ctlz
-; AVX2: Found an estimated cost of 14 for instruction: %ctlz
- %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 1)
- ret <16 x i16> %ctlz
-}
-
-define <16 x i8> @var_ctlz_v16i8(<16 x i8> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i8':
-; SSE2: Found an estimated cost of 17 for instruction: %ctlz
-; SSE42: Found an estimated cost of 9 for instruction: %ctlz
-; AVX: Found an estimated cost of 9 for instruction: %ctlz
- %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 0)
- ret <16 x i8> %ctlz
-}
-
-define <16 x i8> @var_ctlz_v16i8u(<16 x i8> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i8u':
-; SSE2: Found an estimated cost of 17 for instruction: %ctlz
-; SSE42: Found an estimated cost of 9 for instruction: %ctlz
-; AVX: Found an estimated cost of 9 for instruction: %ctlz
- %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 1)
- ret <16 x i8> %ctlz
-}
-
-define <32 x i8> @var_ctlz_v32i8(<32 x i8> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v32i8':
-; SSE2: Found an estimated cost of 34 for instruction: %ctlz
-; SSE42: Found an estimated cost of 18 for instruction: %ctlz
-; AVX1: Found an estimated cost of 20 for instruction: %ctlz
-; AVX2: Found an estimated cost of 9 for instruction: %ctlz
- %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 0)
- ret <32 x i8> %ctlz
-}
-
-define <32 x i8> @var_ctlz_v32i8u(<32 x i8> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v32i8u':
-; SSE2: Found an estimated cost of 34 for instruction: %ctlz
-; SSE42: Found an estimated cost of 18 for instruction: %ctlz
-; AVX1: Found an estimated cost of 20 for instruction: %ctlz
-; AVX2: Found an estimated cost of 9 for instruction: %ctlz
- %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 1)
- ret <32 x i8> %ctlz
-}
-
-; Verify the cost of scalar trailing zero count instructions.
-
-declare i64 @llvm.cttz.i64(i64, i1)
-declare i32 @llvm.cttz.i32(i32, i1)
-declare i16 @llvm.cttz.i16(i16, i1)
-declare i8 @llvm.cttz.i8(i8, i1)
-
-define i64 @var_cttz_i64(i64 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_i64':
-; CHECK: Found an estimated cost of 1 for instruction: %cttz
- %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 0)
- ret i64 %cttz
-}
-
-define i64 @var_cttz_i64u(i64 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_i64u':
-; CHECK: Found an estimated cost of 1 for instruction: %cttz
- %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 1)
- ret i64 %cttz
-}
-
-define i32 @var_cttz_i32(i32 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_i32':
-; CHECK: Found an estimated cost of 1 for instruction: %cttz
- %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 0)
- ret i32 %cttz
-}
-
-define i32 @var_cttz_i32u(i32 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_i32u':
-; CHECK: Found an estimated cost of 1 for instruction: %cttz
- %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 1)
- ret i32 %cttz
-}
-
-define i16 @var_cttz_i16(i16 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_i16':
-; CHECK: Found an estimated cost of 1 for instruction: %cttz
- %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 0)
- ret i16 %cttz
-}
-
-define i16 @var_cttz_i16u(i16 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_i16u':
-; CHECK: Found an estimated cost of 1 for instruction: %cttz
- %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 1)
- ret i16 %cttz
-}
-
-define i8 @var_cttz_i8(i8 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_i8':
-; CHECK: Found an estimated cost of 1 for instruction: %cttz
- %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 0)
- ret i8 %cttz
-}
-
-define i8 @var_cttz_i8u(i8 %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_i8u':
-; CHECK: Found an estimated cost of 1 for instruction: %cttz
- %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 1)
- ret i8 %cttz
-}
-
-; Verify the cost of vector trailing zero count instructions.
-
-declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1)
-declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1)
-declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>, i1)
-declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>, i1)
-
-declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1)
-declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1)
-declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1)
-declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>, i1)
-
-define <2 x i64> @var_cttz_v2i64(<2 x i64> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_v2i64':
-; SSE2: Found an estimated cost of 14 for instruction: %cttz
-; SSE42: Found an estimated cost of 10 for instruction: %cttz
-; AVX: Found an estimated cost of 10 for instruction: %cttz
- %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 0)
- ret <2 x i64> %cttz
-}
-
-define <2 x i64> @var_cttz_v2i64u(<2 x i64> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_v2i64u':
-; SSE2: Found an estimated cost of 14 for instruction: %cttz
-; SSE42: Found an estimated cost of 10 for instruction: %cttz
-; AVX: Found an estimated cost of 10 for instruction: %cttz
- %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 1)
- ret <2 x i64> %cttz
-}
-
-define <4 x i64> @var_cttz_v4i64(<4 x i64> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_v4i64':
-; SSE2: Found an estimated cost of 28 for instruction: %cttz
-; SSE42: Found an estimated cost of 20 for instruction: %cttz
-; AVX1: Found an estimated cost of 22 for instruction: %cttz
-; AVX2: Found an estimated cost of 10 for instruction: %cttz
- %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 0)
- ret <4 x i64> %cttz
-}
-
-define <4 x i64> @var_cttz_v4i64u(<4 x i64> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_v4i64u':
-; SSE2: Found an estimated cost of 28 for instruction: %cttz
-; SSE42: Found an estimated cost of 20 for instruction: %cttz
-; AVX1: Found an estimated cost of 22 for instruction: %cttz
-; AVX2: Found an estimated cost of 10 for instruction: %cttz
- %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 1)
- ret <4 x i64> %cttz
-}
-
-define <4 x i32> @var_cttz_v4i32(<4 x i32> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_v4i32':
-; SSE2: Found an estimated cost of 18 for instruction: %cttz
-; SSE42: Found an estimated cost of 14 for instruction: %cttz
-; AVX: Found an estimated cost of 14 for instruction: %cttz
- %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 0)
- ret <4 x i32> %cttz
-}
-
-define <4 x i32> @var_cttz_v4i32u(<4 x i32> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_v4i32u':
-; SSE2: Found an estimated cost of 18 for instruction: %cttz
-; SSE42: Found an estimated cost of 14 for instruction: %cttz
-; AVX: Found an estimated cost of 14 for instruction: %cttz
- %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 1)
- ret <4 x i32> %cttz
-}
-
-define <8 x i32> @var_cttz_v8i32(<8 x i32> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_v8i32':
-; SSE2: Found an estimated cost of 36 for instruction: %cttz
-; SSE42: Found an estimated cost of 28 for instruction: %cttz
-; AVX1: Found an estimated cost of 30 for instruction: %cttz
-; AVX2: Found an estimated cost of 14 for instruction: %cttz
- %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 0)
- ret <8 x i32> %cttz
-}
-
-define <8 x i32> @var_cttz_v8i32u(<8 x i32> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_v8i32u':
-; SSE2: Found an estimated cost of 36 for instruction: %cttz
-; SSE42: Found an estimated cost of 28 for instruction: %cttz
-; AVX1: Found an estimated cost of 30 for instruction: %cttz
-; AVX2: Found an estimated cost of 14 for instruction: %cttz
- %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 1)
- ret <8 x i32> %cttz
-}
-
-define <8 x i16> @var_cttz_v8i16(<8 x i16> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_v8i16':
-; SSE2: Found an estimated cost of 16 for instruction: %cttz
-; SSE42: Found an estimated cost of 12 for instruction: %cttz
-; AVX: Found an estimated cost of 12 for instruction: %cttz
- %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 0)
- ret <8 x i16> %cttz
-}
-
-define <8 x i16> @var_cttz_v8i16u(<8 x i16> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_v8i16u':
-; SSE2: Found an estimated cost of 16 for instruction: %cttz
-; SSE42: Found an estimated cost of 12 for instruction: %cttz
-; AVX: Found an estimated cost of 12 for instruction: %cttz
- %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 1)
- ret <8 x i16> %cttz
-}
-
-define <16 x i16> @var_cttz_v16i16(<16 x i16> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_v16i16':
-; SSE2: Found an estimated cost of 32 for instruction: %cttz
-; SSE42: Found an estimated cost of 24 for instruction: %cttz
-; AVX1: Found an estimated cost of 26 for instruction: %cttz
-; AVX2: Found an estimated cost of 12 for instruction: %cttz
- %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 0)
- ret <16 x i16> %cttz
-}
-
-define <16 x i16> @var_cttz_v16i16u(<16 x i16> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_v16i16u':
-; SSE2: Found an estimated cost of 32 for instruction: %cttz
-; SSE42: Found an estimated cost of 24 for instruction: %cttz
-; AVX1: Found an estimated cost of 26 for instruction: %cttz
-; AVX2: Found an estimated cost of 12 for instruction: %cttz
- %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 1)
- ret <16 x i16> %cttz
-}
-
-define <16 x i8> @var_cttz_v16i8(<16 x i8> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_v16i8':
-; SSE2: Found an estimated cost of 13 for instruction: %cttz
-; SSE42: Found an estimated cost of 9 for instruction: %cttz
-; AVX: Found an estimated cost of 9 for instruction: %cttz
- %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 0)
- ret <16 x i8> %cttz
-}
-
-define <16 x i8> @var_cttz_v16i8u(<16 x i8> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_v16i8u':
-; SSE2: Found an estimated cost of 13 for instruction: %cttz
-; SSE42: Found an estimated cost of 9 for instruction: %cttz
-; AVX: Found an estimated cost of 9 for instruction: %cttz
- %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 1)
- ret <16 x i8> %cttz
-}
-
-define <32 x i8> @var_cttz_v32i8(<32 x i8> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_v32i8':
-; SSE2: Found an estimated cost of 26 for instruction: %cttz
-; SSE42: Found an estimated cost of 18 for instruction: %cttz
-; AVX1: Found an estimated cost of 20 for instruction: %cttz
-; AVX2: Found an estimated cost of 9 for instruction: %cttz
- %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 0)
- ret <32 x i8> %cttz
-}
-
-define <32 x i8> @var_cttz_v32i8u(<32 x i8> %a) {
-; CHECK: 'Cost Model Analysis' for function 'var_cttz_v32i8u':
-; SSE2: Found an estimated cost of 26 for instruction: %cttz
-; SSE42: Found an estimated cost of 18 for instruction: %cttz
-; AVX1: Found an estimated cost of 20 for instruction: %cttz
-; AVX2: Found an estimated cost of 9 for instruction: %cttz
- %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 1)
- ret <32 x i8> %cttz
-}
diff --git a/test/Analysis/CostModel/X86/ctlz.ll b/test/Analysis/CostModel/X86/ctlz.ll
new file mode 100644
index 000000000000..2c97da15aee5
--- /dev/null
+++ b/test/Analysis/CostModel/X86/ctlz.ll
@@ -0,0 +1,233 @@
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=pentium4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE2 -check-prefix=NOPOPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE42 -check-prefix=POPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1 -check-prefix=POPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2 -check-prefix=POPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1 -check-prefix=POPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2 -check-prefix=POPCNT
+
+; Verify the cost of scalar leading zero count instructions.
+
+declare i64 @llvm.ctlz.i64(i64, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i8 @llvm.ctlz.i8(i8, i1)
+
+define i64 @var_ctlz_i64(i64 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_i64':
+; CHECK: Found an estimated cost of 1 for instruction: %ctlz
+ %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 0)
+ ret i64 %ctlz
+}
+
+define i64 @var_ctlz_i64u(i64 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_i64u':
+; CHECK: Found an estimated cost of 1 for instruction: %ctlz
+ %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 1)
+ ret i64 %ctlz
+}
+
+define i32 @var_ctlz_i32(i32 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_i32':
+; CHECK: Found an estimated cost of 1 for instruction: %ctlz
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 0)
+ ret i32 %ctlz
+}
+
+define i32 @var_ctlz_i32u(i32 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_i32u':
+; CHECK: Found an estimated cost of 1 for instruction: %ctlz
+ %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 1)
+ ret i32 %ctlz
+}
+
+define i16 @var_ctlz_i16(i16 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_i16':
+; CHECK: Found an estimated cost of 1 for instruction: %ctlz
+ %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 0)
+ ret i16 %ctlz
+}
+
+define i16 @var_ctlz_i16u(i16 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_i16u':
+; CHECK: Found an estimated cost of 1 for instruction: %ctlz
+ %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 1)
+ ret i16 %ctlz
+}
+
+define i8 @var_ctlz_i8(i8 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_i8':
+; CHECK: Found an estimated cost of 1 for instruction: %ctlz
+ %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 0)
+ ret i8 %ctlz
+}
+
+define i8 @var_ctlz_i8u(i8 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_i8u':
+; CHECK: Found an estimated cost of 1 for instruction: %ctlz
+ %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 1)
+ ret i8 %ctlz
+}
+
+; Verify the cost of vector leading zero count instructions.
+
+declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1)
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
+declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1)
+declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1)
+
+declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1)
+declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1)
+declare <16 x i16> @llvm.ctlz.v16i16(<16 x i16>, i1)
+declare <32 x i8> @llvm.ctlz.v32i8(<32 x i8>, i1)
+
+define <2 x i64> @var_ctlz_v2i64(<2 x i64> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v2i64':
+; SSE2: Found an estimated cost of 25 for instruction: %ctlz
+; SSE42: Found an estimated cost of 23 for instruction: %ctlz
+; AVX: Found an estimated cost of 23 for instruction: %ctlz
+ %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 0)
+ ret <2 x i64> %ctlz
+}
+
+define <2 x i64> @var_ctlz_v2i64u(<2 x i64> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v2i64u':
+; SSE2: Found an estimated cost of 25 for instruction: %ctlz
+; SSE42: Found an estimated cost of 23 for instruction: %ctlz
+; AVX: Found an estimated cost of 23 for instruction: %ctlz
+ %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 1)
+ ret <2 x i64> %ctlz
+}
+
+define <4 x i64> @var_ctlz_v4i64(<4 x i64> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i64':
+; SSE2: Found an estimated cost of 50 for instruction: %ctlz
+; SSE42: Found an estimated cost of 46 for instruction: %ctlz
+; AVX1: Found an estimated cost of 48 for instruction: %ctlz
+; AVX2: Found an estimated cost of 23 for instruction: %ctlz
+ %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 0)
+ ret <4 x i64> %ctlz
+}
+
+define <4 x i64> @var_ctlz_v4i64u(<4 x i64> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i64u':
+; SSE2: Found an estimated cost of 50 for instruction: %ctlz
+; SSE42: Found an estimated cost of 46 for instruction: %ctlz
+; AVX1: Found an estimated cost of 48 for instruction: %ctlz
+; AVX2: Found an estimated cost of 23 for instruction: %ctlz
+ %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 1)
+ ret <4 x i64> %ctlz
+}
+
+define <4 x i32> @var_ctlz_v4i32(<4 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i32':
+; SSE2: Found an estimated cost of 26 for instruction: %ctlz
+; SSE42: Found an estimated cost of 18 for instruction: %ctlz
+; AVX: Found an estimated cost of 18 for instruction: %ctlz
+ %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 0)
+ ret <4 x i32> %ctlz
+}
+
+define <4 x i32> @var_ctlz_v4i32u(<4 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i32u':
+; SSE2: Found an estimated cost of 26 for instruction: %ctlz
+; SSE42: Found an estimated cost of 18 for instruction: %ctlz
+; AVX: Found an estimated cost of 18 for instruction: %ctlz
+ %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 1)
+ ret <4 x i32> %ctlz
+}
+
+define <8 x i32> @var_ctlz_v8i32(<8 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i32':
+; SSE2: Found an estimated cost of 52 for instruction: %ctlz
+; SSE42: Found an estimated cost of 36 for instruction: %ctlz
+; AVX1: Found an estimated cost of 38 for instruction: %ctlz
+; AVX2: Found an estimated cost of 18 for instruction: %ctlz
+ %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 0)
+ ret <8 x i32> %ctlz
+}
+
+define <8 x i32> @var_ctlz_v8i32u(<8 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i32u':
+; SSE2: Found an estimated cost of 52 for instruction: %ctlz
+; SSE42: Found an estimated cost of 36 for instruction: %ctlz
+; AVX1: Found an estimated cost of 38 for instruction: %ctlz
+; AVX2: Found an estimated cost of 18 for instruction: %ctlz
+ %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 1)
+ ret <8 x i32> %ctlz
+}
+
+define <8 x i16> @var_ctlz_v8i16(<8 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i16':
+; SSE2: Found an estimated cost of 20 for instruction: %ctlz
+; SSE42: Found an estimated cost of 14 for instruction: %ctlz
+; AVX: Found an estimated cost of 14 for instruction: %ctlz
+ %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 0)
+ ret <8 x i16> %ctlz
+}
+
+define <8 x i16> @var_ctlz_v8i16u(<8 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i16u':
+; SSE2: Found an estimated cost of 20 for instruction: %ctlz
+; SSE42: Found an estimated cost of 14 for instruction: %ctlz
+; AVX: Found an estimated cost of 14 for instruction: %ctlz
+ %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 1)
+ ret <8 x i16> %ctlz
+}
+
+define <16 x i16> @var_ctlz_v16i16(<16 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i16':
+; SSE2: Found an estimated cost of 40 for instruction: %ctlz
+; SSE42: Found an estimated cost of 28 for instruction: %ctlz
+; AVX1: Found an estimated cost of 30 for instruction: %ctlz
+; AVX2: Found an estimated cost of 14 for instruction: %ctlz
+ %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 0)
+ ret <16 x i16> %ctlz
+}
+
+define <16 x i16> @var_ctlz_v16i16u(<16 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i16u':
+; SSE2: Found an estimated cost of 40 for instruction: %ctlz
+; SSE42: Found an estimated cost of 28 for instruction: %ctlz
+; AVX1: Found an estimated cost of 30 for instruction: %ctlz
+; AVX2: Found an estimated cost of 14 for instruction: %ctlz
+ %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 1)
+ ret <16 x i16> %ctlz
+}
+
+define <16 x i8> @var_ctlz_v16i8(<16 x i8> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i8':
+; SSE2: Found an estimated cost of 17 for instruction: %ctlz
+; SSE42: Found an estimated cost of 9 for instruction: %ctlz
+; AVX: Found an estimated cost of 9 for instruction: %ctlz
+ %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 0)
+ ret <16 x i8> %ctlz
+}
+
+define <16 x i8> @var_ctlz_v16i8u(<16 x i8> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i8u':
+; SSE2: Found an estimated cost of 17 for instruction: %ctlz
+; SSE42: Found an estimated cost of 9 for instruction: %ctlz
+; AVX: Found an estimated cost of 9 for instruction: %ctlz
+ %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 1)
+ ret <16 x i8> %ctlz
+}
+
+define <32 x i8> @var_ctlz_v32i8(<32 x i8> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v32i8':
+; SSE2: Found an estimated cost of 34 for instruction: %ctlz
+; SSE42: Found an estimated cost of 18 for instruction: %ctlz
+; AVX1: Found an estimated cost of 20 for instruction: %ctlz
+; AVX2: Found an estimated cost of 9 for instruction: %ctlz
+ %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 0)
+ ret <32 x i8> %ctlz
+}
+
+define <32 x i8> @var_ctlz_v32i8u(<32 x i8> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v32i8u':
+; SSE2: Found an estimated cost of 34 for instruction: %ctlz
+; SSE42: Found an estimated cost of 18 for instruction: %ctlz
+; AVX1: Found an estimated cost of 20 for instruction: %ctlz
+; AVX2: Found an estimated cost of 9 for instruction: %ctlz
+ %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 1)
+ ret <32 x i8> %ctlz
+}
diff --git a/test/Analysis/CostModel/X86/ctpop.ll b/test/Analysis/CostModel/X86/ctpop.ll
new file mode 100644
index 000000000000..f072cbaec492
--- /dev/null
+++ b/test/Analysis/CostModel/X86/ctpop.ll
@@ -0,0 +1,133 @@
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=pentium4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE2 -check-prefix=NOPOPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE42 -check-prefix=POPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1 -check-prefix=POPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2 -check-prefix=POPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1 -check-prefix=POPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2 -check-prefix=POPCNT
+
+; Verify the cost of scalar population count instructions.
+
+declare i64 @llvm.ctpop.i64(i64)
+declare i32 @llvm.ctpop.i32(i32)
+declare i16 @llvm.ctpop.i16(i16)
+declare i8 @llvm.ctpop.i8(i8)
+
+define i64 @var_ctpop_i64(i64 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctpop_i64':
+; NOPOPCNT: Found an estimated cost of 4 for instruction: %ctpop
+; POPCNT: Found an estimated cost of 1 for instruction: %ctpop
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %a)
+ ret i64 %ctpop
+}
+
+define i32 @var_ctpop_i32(i32 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctpop_i32':
+; NOPOPCNT: Found an estimated cost of 4 for instruction: %ctpop
+; POPCNT: Found an estimated cost of 1 for instruction: %ctpop
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %a)
+ ret i32 %ctpop
+}
+
+define i16 @var_ctpop_i16(i16 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctpop_i16':
+; NOPOPCNT: Found an estimated cost of 4 for instruction: %ctpop
+; POPCNT: Found an estimated cost of 1 for instruction: %ctpop
+ %ctpop = call i16 @llvm.ctpop.i16(i16 %a)
+ ret i16 %ctpop
+}
+
+define i8 @var_ctpop_i8(i8 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctpop_i8':
+; NOPOPCNT: Found an estimated cost of 4 for instruction: %ctpop
+; POPCNT: Found an estimated cost of 1 for instruction: %ctpop
+ %ctpop = call i8 @llvm.ctpop.i8(i8 %a)
+ ret i8 %ctpop
+}
+
+; Verify the cost of vector population count instructions.
+
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
+declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
+declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
+declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)
+
+declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>)
+declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
+declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>)
+declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>)
+
+define <2 x i64> @var_ctpop_v2i64(<2 x i64> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v2i64':
+; SSE2: Found an estimated cost of 12 for instruction: %ctpop
+; SSE42: Found an estimated cost of 7 for instruction: %ctpop
+; AVX: Found an estimated cost of 7 for instruction: %ctpop
+ %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+ ret <2 x i64> %ctpop
+}
+
+define <4 x i64> @var_ctpop_v4i64(<4 x i64> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v4i64':
+; SSE2: Found an estimated cost of 24 for instruction: %ctpop
+; SSE42: Found an estimated cost of 14 for instruction: %ctpop
+; AVX1: Found an estimated cost of 16 for instruction: %ctpop
+; AVX2: Found an estimated cost of 7 for instruction: %ctpop
+ %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
+ ret <4 x i64> %ctpop
+}
+
+define <4 x i32> @var_ctpop_v4i32(<4 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v4i32':
+; SSE2: Found an estimated cost of 15 for instruction: %ctpop
+; SSE42: Found an estimated cost of 11 for instruction: %ctpop
+; AVX: Found an estimated cost of 11 for instruction: %ctpop
+ %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
+ ret <4 x i32> %ctpop
+}
+
+define <8 x i32> @var_ctpop_v8i32(<8 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v8i32':
+; SSE2: Found an estimated cost of 30 for instruction: %ctpop
+; SSE42: Found an estimated cost of 22 for instruction: %ctpop
+; AVX1: Found an estimated cost of 24 for instruction: %ctpop
+; AVX2: Found an estimated cost of 11 for instruction: %ctpop
+ %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
+ ret <8 x i32> %ctpop
+}
+
+define <8 x i16> @var_ctpop_v8i16(<8 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v8i16':
+; SSE2: Found an estimated cost of 13 for instruction: %ctpop
+; SSE42: Found an estimated cost of 9 for instruction: %ctpop
+; AVX: Found an estimated cost of 9 for instruction: %ctpop
+ %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
+ ret <8 x i16> %ctpop
+}
+
+define <16 x i16> @var_ctpop_v16i16(<16 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v16i16':
+; SSE2: Found an estimated cost of 26 for instruction: %ctpop
+; SSE42: Found an estimated cost of 18 for instruction: %ctpop
+; AVX1: Found an estimated cost of 20 for instruction: %ctpop
+; AVX2: Found an estimated cost of 9 for instruction: %ctpop
+ %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a)
+ ret <16 x i16> %ctpop
+}
+
+define <16 x i8> @var_ctpop_v16i8(<16 x i8> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v16i8':
+; SSE2: Found an estimated cost of 10 for instruction: %ctpop
+; SSE42: Found an estimated cost of 6 for instruction: %ctpop
+; AVX: Found an estimated cost of 6 for instruction: %ctpop
+ %ctpop = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
+ ret <16 x i8> %ctpop
+}
+
+define <32 x i8> @var_ctpop_v32i8(<32 x i8> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v32i8':
+; SSE2: Found an estimated cost of 20 for instruction: %ctpop
+; SSE42: Found an estimated cost of 12 for instruction: %ctpop
+; AVX1: Found an estimated cost of 14 for instruction: %ctpop
+; AVX2: Found an estimated cost of 6 for instruction: %ctpop
+ %ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a)
+ ret <32 x i8> %ctpop
+}
diff --git a/test/Analysis/CostModel/X86/cttz.ll b/test/Analysis/CostModel/X86/cttz.ll
new file mode 100644
index 000000000000..5d3c59b60232
--- /dev/null
+++ b/test/Analysis/CostModel/X86/cttz.ll
@@ -0,0 +1,233 @@
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=pentium4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE2 -check-prefix=NOPOPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE42 -check-prefix=POPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1 -check-prefix=POPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2 -check-prefix=POPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1 -check-prefix=POPCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2 -check-prefix=POPCNT
+
+; Verify the cost of scalar trailing zero count instructions.
+
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i8 @llvm.cttz.i8(i8, i1)
+
+define i64 @var_cttz_i64(i64 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_i64':
+; CHECK: Found an estimated cost of 1 for instruction: %cttz
+ %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 0)
+ ret i64 %cttz
+}
+
+define i64 @var_cttz_i64u(i64 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_i64u':
+; CHECK: Found an estimated cost of 1 for instruction: %cttz
+ %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 1)
+ ret i64 %cttz
+}
+
+define i32 @var_cttz_i32(i32 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_i32':
+; CHECK: Found an estimated cost of 1 for instruction: %cttz
+ %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 0)
+ ret i32 %cttz
+}
+
+define i32 @var_cttz_i32u(i32 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_i32u':
+; CHECK: Found an estimated cost of 1 for instruction: %cttz
+ %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 1)
+ ret i32 %cttz
+}
+
+define i16 @var_cttz_i16(i16 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_i16':
+; CHECK: Found an estimated cost of 1 for instruction: %cttz
+ %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 0)
+ ret i16 %cttz
+}
+
+define i16 @var_cttz_i16u(i16 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_i16u':
+; CHECK: Found an estimated cost of 1 for instruction: %cttz
+ %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 1)
+ ret i16 %cttz
+}
+
+define i8 @var_cttz_i8(i8 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_i8':
+; CHECK: Found an estimated cost of 1 for instruction: %cttz
+ %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 0)
+ ret i8 %cttz
+}
+
+define i8 @var_cttz_i8u(i8 %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_i8u':
+; CHECK: Found an estimated cost of 1 for instruction: %cttz
+ %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 1)
+ ret i8 %cttz
+}
+
+; Verify the cost of vector trailing zero count instructions.
+
+declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1)
+declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1)
+declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>, i1)
+declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>, i1)
+
+declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1)
+declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1)
+declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1)
+declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>, i1)
+
+define <2 x i64> @var_cttz_v2i64(<2 x i64> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v2i64':
+; SSE2: Found an estimated cost of 14 for instruction: %cttz
+; SSE42: Found an estimated cost of 10 for instruction: %cttz
+; AVX: Found an estimated cost of 10 for instruction: %cttz
+ %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 0)
+ ret <2 x i64> %cttz
+}
+
+define <2 x i64> @var_cttz_v2i64u(<2 x i64> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v2i64u':
+; SSE2: Found an estimated cost of 14 for instruction: %cttz
+; SSE42: Found an estimated cost of 10 for instruction: %cttz
+; AVX: Found an estimated cost of 10 for instruction: %cttz
+ %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 1)
+ ret <2 x i64> %cttz
+}
+
+define <4 x i64> @var_cttz_v4i64(<4 x i64> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v4i64':
+; SSE2: Found an estimated cost of 28 for instruction: %cttz
+; SSE42: Found an estimated cost of 20 for instruction: %cttz
+; AVX1: Found an estimated cost of 22 for instruction: %cttz
+; AVX2: Found an estimated cost of 10 for instruction: %cttz
+ %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 0)
+ ret <4 x i64> %cttz
+}
+
+define <4 x i64> @var_cttz_v4i64u(<4 x i64> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v4i64u':
+; SSE2: Found an estimated cost of 28 for instruction: %cttz
+; SSE42: Found an estimated cost of 20 for instruction: %cttz
+; AVX1: Found an estimated cost of 22 for instruction: %cttz
+; AVX2: Found an estimated cost of 10 for instruction: %cttz
+ %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 1)
+ ret <4 x i64> %cttz
+}
+
+define <4 x i32> @var_cttz_v4i32(<4 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v4i32':
+; SSE2: Found an estimated cost of 18 for instruction: %cttz
+; SSE42: Found an estimated cost of 14 for instruction: %cttz
+; AVX: Found an estimated cost of 14 for instruction: %cttz
+ %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 0)
+ ret <4 x i32> %cttz
+}
+
+define <4 x i32> @var_cttz_v4i32u(<4 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v4i32u':
+; SSE2: Found an estimated cost of 18 for instruction: %cttz
+; SSE42: Found an estimated cost of 14 for instruction: %cttz
+; AVX: Found an estimated cost of 14 for instruction: %cttz
+ %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 1)
+ ret <4 x i32> %cttz
+}
+
+define <8 x i32> @var_cttz_v8i32(<8 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v8i32':
+; SSE2: Found an estimated cost of 36 for instruction: %cttz
+; SSE42: Found an estimated cost of 28 for instruction: %cttz
+; AVX1: Found an estimated cost of 30 for instruction: %cttz
+; AVX2: Found an estimated cost of 14 for instruction: %cttz
+ %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 0)
+ ret <8 x i32> %cttz
+}
+
+define <8 x i32> @var_cttz_v8i32u(<8 x i32> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v8i32u':
+; SSE2: Found an estimated cost of 36 for instruction: %cttz
+; SSE42: Found an estimated cost of 28 for instruction: %cttz
+; AVX1: Found an estimated cost of 30 for instruction: %cttz
+; AVX2: Found an estimated cost of 14 for instruction: %cttz
+ %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 1)
+ ret <8 x i32> %cttz
+}
+
+define <8 x i16> @var_cttz_v8i16(<8 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v8i16':
+; SSE2: Found an estimated cost of 16 for instruction: %cttz
+; SSE42: Found an estimated cost of 12 for instruction: %cttz
+; AVX: Found an estimated cost of 12 for instruction: %cttz
+ %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 0)
+ ret <8 x i16> %cttz
+}
+
+define <8 x i16> @var_cttz_v8i16u(<8 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v8i16u':
+; SSE2: Found an estimated cost of 16 for instruction: %cttz
+; SSE42: Found an estimated cost of 12 for instruction: %cttz
+; AVX: Found an estimated cost of 12 for instruction: %cttz
+ %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 1)
+ ret <8 x i16> %cttz
+}
+
+define <16 x i16> @var_cttz_v16i16(<16 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v16i16':
+; SSE2: Found an estimated cost of 32 for instruction: %cttz
+; SSE42: Found an estimated cost of 24 for instruction: %cttz
+; AVX1: Found an estimated cost of 26 for instruction: %cttz
+; AVX2: Found an estimated cost of 12 for instruction: %cttz
+ %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 0)
+ ret <16 x i16> %cttz
+}
+
+define <16 x i16> @var_cttz_v16i16u(<16 x i16> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v16i16u':
+; SSE2: Found an estimated cost of 32 for instruction: %cttz
+; SSE42: Found an estimated cost of 24 for instruction: %cttz
+; AVX1: Found an estimated cost of 26 for instruction: %cttz
+; AVX2: Found an estimated cost of 12 for instruction: %cttz
+ %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 1)
+ ret <16 x i16> %cttz
+}
+
+define <16 x i8> @var_cttz_v16i8(<16 x i8> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v16i8':
+; SSE2: Found an estimated cost of 13 for instruction: %cttz
+; SSE42: Found an estimated cost of 9 for instruction: %cttz
+; AVX: Found an estimated cost of 9 for instruction: %cttz
+ %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 0)
+ ret <16 x i8> %cttz
+}
+
+define <16 x i8> @var_cttz_v16i8u(<16 x i8> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v16i8u':
+; SSE2: Found an estimated cost of 13 for instruction: %cttz
+; SSE42: Found an estimated cost of 9 for instruction: %cttz
+; AVX: Found an estimated cost of 9 for instruction: %cttz
+ %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 1)
+ ret <16 x i8> %cttz
+}
+
+define <32 x i8> @var_cttz_v32i8(<32 x i8> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v32i8':
+; SSE2: Found an estimated cost of 26 for instruction: %cttz
+; SSE42: Found an estimated cost of 18 for instruction: %cttz
+; AVX1: Found an estimated cost of 20 for instruction: %cttz
+; AVX2: Found an estimated cost of 9 for instruction: %cttz
+ %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 0)
+ ret <32 x i8> %cttz
+}
+
+define <32 x i8> @var_cttz_v32i8u(<32 x i8> %a) {
+; CHECK: 'Cost Model Analysis' for function 'var_cttz_v32i8u':
+; SSE2: Found an estimated cost of 26 for instruction: %cttz
+; SSE42: Found an estimated cost of 18 for instruction: %cttz
+; AVX1: Found an estimated cost of 20 for instruction: %cttz
+; AVX2: Found an estimated cost of 9 for instruction: %cttz
+ %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 1)
+ ret <32 x i8> %cttz
+}
diff --git a/test/CodeGen/AArch64/aarch64-addv.ll b/test/CodeGen/AArch64/aarch64-addv.ll
index 91797c062b88..e65992e9913d 100644
--- a/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/test/CodeGen/AArch64/aarch64-addv.ll
@@ -1,18 +1,16 @@
; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=generic | FileCheck %s
+; Function Attrs: nounwind readnone
+declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>)
+declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>)
+
define i8 @add_B(<16 x i8>* %arr) {
; CHECK-LABEL: add_B
; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
%bin.rdx = load <16 x i8>, <16 x i8>* %arr
- %rdx.shuf0 = shufflevector <16 x i8> %bin.rdx, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx0 = add <16 x i8> %bin.rdx, %rdx.shuf0
- %rdx.shuf = shufflevector <16 x i8> %bin.rdx0, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef >
- %bin.rdx11 = add <16 x i8> %bin.rdx0, %rdx.shuf
- %rdx.shuf12 = shufflevector <16 x i8> %bin.rdx11, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef>
- %bin.rdx13 = add <16 x i8> %bin.rdx11, %rdx.shuf12
- %rdx.shuf13 = shufflevector <16 x i8> %bin.rdx13, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef>
- %bin.rdx14 = add <16 x i8> %bin.rdx13, %rdx.shuf13
- %r = extractelement <16 x i8> %bin.rdx14, i32 0
+ %r = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> %bin.rdx)
ret i8 %r
}
@@ -20,13 +18,7 @@ define i16 @add_H(<8 x i16>* %arr) {
; CHECK-LABEL: add_H
; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
%bin.rdx = load <8 x i16>, <8 x i16>* %arr
- %rdx.shuf = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef,i32 undef, i32 undef>
- %bin.rdx11 = add <8 x i16> %bin.rdx, %rdx.shuf
- %rdx.shuf12 = shufflevector <8 x i16> %bin.rdx11, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx13 = add <8 x i16> %bin.rdx11, %rdx.shuf12
- %rdx.shuf13 = shufflevector <8 x i16> %bin.rdx13, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx14 = add <8 x i16> %bin.rdx13, %rdx.shuf13
- %r = extractelement <8 x i16> %bin.rdx14, i32 0
+ %r = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> %bin.rdx)
ret i16 %r
}
@@ -34,11 +26,7 @@ define i32 @add_S( <4 x i32>* %arr) {
; CHECK-LABEL: add_S
; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
%bin.rdx = load <4 x i32>, <4 x i32>* %arr
- %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %bin.rdx11 = add <4 x i32> %bin.rdx, %rdx.shuf
- %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %bin.rdx13 = add <4 x i32> %bin.rdx11, %rdx.shuf12
- %r = extractelement <4 x i32> %bin.rdx13, i32 0
+ %r = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %bin.rdx)
ret i32 %r
}
@@ -46,12 +34,12 @@ define i64 @add_D(<2 x i64>* %arr) {
; CHECK-LABEL: add_D
; CHECK-NOT: addv
%bin.rdx = load <2 x i64>, <2 x i64>* %arr
- %rdx.shuf0 = shufflevector <2 x i64> %bin.rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
- %bin.rdx0 = add <2 x i64> %bin.rdx, %rdx.shuf0
- %r = extractelement <2 x i64> %bin.rdx0, i32 0
+ %r = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %bin.rdx)
ret i64 %r
}
+declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
+
define i32 @oversized_ADDV_256(i8* noalias nocapture readonly %arg1, i8* noalias nocapture readonly %arg2) {
; CHECK-LABEL: oversized_ADDV_256
; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
@@ -66,33 +54,16 @@ entry:
%7 = icmp slt <8 x i32> %6, zeroinitializer
%8 = sub nsw <8 x i32> zeroinitializer, %6
%9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
- %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx = add <8 x i32> %9, %rdx.shuf
- %rdx.shuf1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx2 = add <8 x i32> %bin.rdx, %rdx.shuf1
- %rdx.shuf3 = shufflevector <8 x i32> %bin.rdx2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx4 = add <8 x i32> %bin.rdx2, %rdx.shuf3
- %10 = extractelement <8 x i32> %bin.rdx4, i32 0
- ret i32 %10
+ %r = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %9)
+ ret i32 %r
}
+declare i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32>)
+
define i32 @oversized_ADDV_512(<16 x i32>* %arr) {
; CHECK-LABEL: oversized_ADDV_512
; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
%bin.rdx = load <16 x i32>, <16 x i32>* %arr
-
- %rdx.shuf0 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx0 = add <16 x i32> %bin.rdx, %rdx.shuf0
-
- %rdx.shuf = shufflevector <16 x i32> %bin.rdx0, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef >
- %bin.rdx11 = add <16 x i32> %bin.rdx0, %rdx.shuf
-
- %rdx.shuf12 = shufflevector <16 x i32> %bin.rdx11, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef>
- %bin.rdx13 = add <16 x i32> %bin.rdx11, %rdx.shuf12
-
- %rdx.shuf13 = shufflevector <16 x i32> %bin.rdx13, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef>
- %bin.rdx14 = add <16 x i32> %bin.rdx13, %rdx.shuf13
-
- %r = extractelement <16 x i32> %bin.rdx14, i32 0
+ %r = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> %bin.rdx)
ret i32 %r
}
diff --git a/test/CodeGen/AArch64/aarch64-minmaxv.ll b/test/CodeGen/AArch64/aarch64-minmaxv.ll
index 9a56cd6ae7c0..760a8f8419f9 100644
--- a/test/CodeGen/AArch64/aarch64-minmaxv.ll
+++ b/test/CodeGen/AArch64/aarch64-minmaxv.ll
@@ -2,344 +2,148 @@
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16>)
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16>)
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32>)
+
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16>)
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16>)
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32>)
+
+declare float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float>)
+declare float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float>)
+
; CHECK-LABEL: smax_B
; CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.16b
define i8 @smax_B(<16 x i8>* nocapture readonly %arr) {
%arr.load = load <16 x i8>, <16 x i8>* %arr
- %rdx.shuf = shufflevector <16 x i8> %arr.load, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp sgt <16 x i8> %arr.load, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %arr.load, <16 x i8> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp sgt <16 x i8> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp sgt <16 x i8> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp sgt <16 x i8> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt
+ %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> %arr.load)
ret i8 %r
}
; CHECK-LABEL: smax_H
; CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.8h
define i16 @smax_H(<8 x i16>* nocapture readonly %arr) {
- %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr
- %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp23 = icmp sgt <8 x i16> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf
- %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp26 = icmp sgt <8 x i16> %rdx.minmax.select24, %rdx.shuf25
- %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25
- %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp29 = icmp sgt <8 x i16> %rdx.minmax.select27, %rdx.shuf28
- %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0
- %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0
- %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1
- %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt
+ %arr.load = load <8 x i16>, <8 x i16>* %arr
+ %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> %arr.load)
ret i16 %r
}
; CHECK-LABEL: smax_S
; CHECK: smaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
define i32 @smax_S(<4 x i32> * nocapture readonly %arr) {
- %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr
- %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %rdx.minmax.cmp18 = icmp sgt <4 x i32> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf
- %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp21 = icmp sgt <4 x i32> %rdx.minmax.select19, %rdx.shuf20
- %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0
- %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0
- %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1
- %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt
+ %arr.load = load <4 x i32>, <4 x i32>* %arr
+ %r = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %arr.load)
ret i32 %r
}
-; CHECK-LABEL: smax_D
-; CHECK-NOT: smaxv
-define i64 @smax_D(<2 x i64>* nocapture readonly %arr) {
- %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr
- %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
- %rdx.minmax.cmp18 = icmp sgt <2 x i64> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0
- %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0
- %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1
- %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt
- ret i64 %r
-}
-
-
; CHECK-LABEL: umax_B
; CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b
define i8 @umax_B(<16 x i8>* nocapture readonly %arr) {
- %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr
- %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp ugt <16 x i8> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp ugt <16 x i8> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp ugt <16 x i8> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp ugt <16 x i8> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt
+ %arr.load = load <16 x i8>, <16 x i8>* %arr
+ %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> %arr.load)
ret i8 %r
}
; CHECK-LABEL: umax_H
; CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h
define i16 @umax_H(<8 x i16>* nocapture readonly %arr) {
- %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr
- %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp23 = icmp ugt <8 x i16> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf
- %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp26 = icmp ugt <8 x i16> %rdx.minmax.select24, %rdx.shuf25
- %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25
- %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp29 = icmp ugt <8 x i16> %rdx.minmax.select27, %rdx.shuf28
- %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0
- %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0
- %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1
- %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt
+ %arr.load = load <8 x i16>, <8 x i16>* %arr
+ %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> %arr.load)
ret i16 %r
}
; CHECK-LABEL: umax_S
; CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
define i32 @umax_S(<4 x i32>* nocapture readonly %arr) {
- %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr
- %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %rdx.minmax.cmp18 = icmp ugt <4 x i32> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf
- %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp21 = icmp ugt <4 x i32> %rdx.minmax.select19, %rdx.shuf20
- %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0
- %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0
- %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1
- %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt
+ %arr.load = load <4 x i32>, <4 x i32>* %arr
+ %r = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %arr.load)
ret i32 %r
}
-; CHECK-LABEL: umax_D
-; CHECK-NOT: umaxv
-define i64 @umax_D(<2 x i64>* nocapture readonly %arr) {
- %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr
- %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
- %rdx.minmax.cmp18 = icmp ugt <2 x i64> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0
- %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0
- %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1
- %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt
- ret i64 %r
-}
-
-
; CHECK-LABEL: smin_B
; CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.16b
define i8 @smin_B(<16 x i8>* nocapture readonly %arr) {
- %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr
- %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp slt <16 x i8> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp slt <16 x i8> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp slt <16 x i8> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp slt <16 x i8> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt
+ %arr.load = load <16 x i8>, <16 x i8>* %arr
+ %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> %arr.load)
ret i8 %r
}
; CHECK-LABEL: smin_H
; CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.8h
define i16 @smin_H(<8 x i16>* nocapture readonly %arr) {
- %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr
- %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp23 = icmp slt <8 x i16> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf
- %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp26 = icmp slt <8 x i16> %rdx.minmax.select24, %rdx.shuf25
- %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25
- %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp29 = icmp slt <8 x i16> %rdx.minmax.select27, %rdx.shuf28
- %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0
- %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0
- %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1
- %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt
+ %arr.load = load <8 x i16>, <8 x i16>* %arr
+ %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> %arr.load)
ret i16 %r
}
; CHECK-LABEL: smin_S
; CHECK: sminv {{s[0-9]+}}, {{v[0-9]+}}.4s
define i32 @smin_S(<4 x i32>* nocapture readonly %arr) {
- %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr
- %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %rdx.minmax.cmp18 = icmp slt <4 x i32> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf
- %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp21 = icmp slt <4 x i32> %rdx.minmax.select19, %rdx.shuf20
- %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0
- %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0
- %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1
- %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt
+ %arr.load = load <4 x i32>, <4 x i32>* %arr
+ %r = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %arr.load)
ret i32 %r
}
-; CHECK-LABEL: smin_D
-; CHECK-NOT: sminv
-define i64 @smin_D(<2 x i64>* nocapture readonly %arr) {
- %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr
- %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
- %rdx.minmax.cmp18 = icmp slt <2 x i64> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0
- %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0
- %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1
- %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt
- ret i64 %r
-}
-
-
; CHECK-LABEL: umin_B
; CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b
define i8 @umin_B(<16 x i8>* nocapture readonly %arr) {
- %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr
- %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp ult <16 x i8> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp ult <16 x i8> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp ult <16 x i8> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp ult <16 x i8> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt
+ %arr.load = load <16 x i8>, <16 x i8>* %arr
+ %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> %arr.load)
ret i8 %r
}
; CHECK-LABEL: umin_H
; CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h
define i16 @umin_H(<8 x i16>* nocapture readonly %arr) {
- %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr
- %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp23 = icmp ult <8 x i16> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf
- %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp26 = icmp ult <8 x i16> %rdx.minmax.select24, %rdx.shuf25
- %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25
- %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp29 = icmp ult <8 x i16> %rdx.minmax.select27, %rdx.shuf28
- %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0
- %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0
- %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1
- %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt
+ %arr.load = load <8 x i16>, <8 x i16>* %arr
+ %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> %arr.load)
ret i16 %r
}
; CHECK-LABEL: umin_S
; CHECK: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s
define i32 @umin_S(<4 x i32>* nocapture readonly %arr) {
- %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr
- %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %rdx.minmax.cmp18 = icmp ult <4 x i32> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf
- %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp21 = icmp ult <4 x i32> %rdx.minmax.select19, %rdx.shuf20
- %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0
- %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0
- %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1
- %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt
+ %arr.load = load <4 x i32>, <4 x i32>* %arr
+ %r = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %arr.load)
ret i32 %r
}
-; CHECK-LABEL: umin_D
-; CHECK-NOT: uminv
-define i64 @umin_D(<2 x i64>* nocapture readonly %arr) {
- %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr
- %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
- %rdx.minmax.cmp18 = icmp ult <2 x i64> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0
- %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0
- %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1
- %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt
- ret i64 %r
-}
-
; CHECK-LABEL: fmaxnm_S
; CHECK: fmaxnmv
define float @fmaxnm_S(<4 x float>* nocapture readonly %arr) {
- %rdx.minmax.select = load <4 x float>, <4 x float>* %arr
- %rdx.shuf = shufflevector <4 x float> %rdx.minmax.select, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %rdx.minmax.cmp = fcmp fast oge <4 x float> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select1 = select <4 x i1> %rdx.minmax.cmp, <4 x float> %rdx.minmax.select, <4 x float> %rdx.shuf
- %rdx.shuf1 = shufflevector <4 x float> %rdx.minmax.select1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp1 = fcmp fast oge <4 x float> %rdx.minmax.select1, %rdx.shuf1
- %rdx.minmax.cmp1.elt = extractelement <4 x i1> %rdx.minmax.cmp1, i32 0
- %rdx.minmax.select1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 0
- %rdx.shuf1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 1
- %r = select i1 %rdx.minmax.cmp1.elt, float %rdx.minmax.select1.elt, float %rdx.shuf1.elt
+ %arr.load = load <4 x float>, <4 x float>* %arr
+ %r = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %arr.load)
ret float %r
}
; CHECK-LABEL: fminnm_S
; CHECK: fminnmv
define float @fminnm_S(<4 x float>* nocapture readonly %arr) {
- %rdx.minmax.select = load <4 x float>, <4 x float>* %arr
- %rdx.shuf = shufflevector <4 x float> %rdx.minmax.select, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %rdx.minmax.cmp = fcmp fast ole <4 x float> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select1 = select <4 x i1> %rdx.minmax.cmp, <4 x float> %rdx.minmax.select, <4 x float> %rdx.shuf
- %rdx.shuf1 = shufflevector <4 x float> %rdx.minmax.select1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp1 = fcmp fast ole <4 x float> %rdx.minmax.select1, %rdx.shuf1
- %rdx.minmax.cmp1.elt = extractelement <4 x i1> %rdx.minmax.cmp1, i32 0
- %rdx.minmax.select1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 0
- %rdx.shuf1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 1
- %r = select i1 %rdx.minmax.cmp1.elt, float %rdx.minmax.select1.elt, float %rdx.shuf1.elt
+ %arr.load = load <4 x float>, <4 x float>* %arr
+ %r = call nnan float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %arr.load)
ret float %r
}
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16>)
+
define i16 @oversized_umax_256(<16 x i16>* nocapture readonly %arr) {
; CHECK-LABEL: oversized_umax_256
; CHECK: umax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: umaxv {{h[0-9]+}}, [[V0]]
- %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr
- %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp ugt <16 x i16> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp ugt <16 x i16> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp ugt <16 x i16> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp ugt <16 x i16> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt
+ %arr.load = load <16 x i16>, <16 x i16>* %arr
+ %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> %arr.load)
ret i16 %r
}
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32>)
+
define i32 @oversized_umax_512(<16 x i32>* nocapture readonly %arr) {
; CHECK-LABEL: oversized_umax_512
; CHECK: umax v
@@ -347,47 +151,23 @@ define i32 @oversized_umax_512(<16 x i32>* nocapture readonly %arr) {
; CHECK-NEXT: umax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-NEXT: umaxv {{s[0-9]+}}, [[V0]]
%arr.load = load <16 x i32>, <16 x i32>* %arr
- %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp ugt <16 x i32> %arr.load, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp ugt <16 x i32> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp ugt <16 x i32> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp ugt <16 x i32> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt
+ %r = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> %arr.load)
ret i32 %r
}
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16>)
+
define i16 @oversized_umin_256(<16 x i16>* nocapture readonly %arr) {
; CHECK-LABEL: oversized_umin_256
; CHECK: umin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: uminv {{h[0-9]+}}, [[V0]]
- %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr
- %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp ult <16 x i16> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp ult <16 x i16> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp ult <16 x i16> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp ult <16 x i16> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt
+ %arr.load = load <16 x i16>, <16 x i16>* %arr
+ %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> %arr.load)
ret i16 %r
}
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32>)
+
define i32 @oversized_umin_512(<16 x i32>* nocapture readonly %arr) {
; CHECK-LABEL: oversized_umin_512
; CHECK: umin v
@@ -395,47 +175,23 @@ define i32 @oversized_umin_512(<16 x i32>* nocapture readonly %arr) {
; CHECK-NEXT: umin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-NEXT: uminv {{s[0-9]+}}, [[V0]]
%arr.load = load <16 x i32>, <16 x i32>* %arr
- %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp ult <16 x i32> %arr.load, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp ult <16 x i32> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp ult <16 x i32> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp ult <16 x i32> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt
+ %r = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> %arr.load)
ret i32 %r
}
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16>)
+
define i16 @oversized_smax_256(<16 x i16>* nocapture readonly %arr) {
; CHECK-LABEL: oversized_smax_256
; CHECK: smax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: smaxv {{h[0-9]+}}, [[V0]]
%arr.load = load <16 x i16>, <16 x i16>* %arr
- %rdx.shuf = shufflevector <16 x i16> %arr.load, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp sgt <16 x i16> %arr.load, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %arr.load, <16 x i16> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp sgt <16 x i16> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp sgt <16 x i16> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp sgt <16 x i16> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt
+ %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> %arr.load)
ret i16 %r
}
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32>)
+
define i32 @oversized_smax_512(<16 x i32>* nocapture readonly %arr) {
; CHECK-LABEL: oversized_smax_512
; CHECK: smax v
@@ -443,47 +199,23 @@ define i32 @oversized_smax_512(<16 x i32>* nocapture readonly %arr) {
; CHECK-NEXT: smax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-NEXT: smaxv {{s[0-9]+}}, [[V0]]
%arr.load = load <16 x i32>, <16 x i32>* %arr
- %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp sgt <16 x i32> %arr.load, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp sgt <16 x i32> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp sgt <16 x i32> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp sgt <16 x i32> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt
+ %r = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> %arr.load)
ret i32 %r
}
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16>)
+
define i16 @oversized_smin_256(<16 x i16>* nocapture readonly %arr) {
; CHECK-LABEL: oversized_smin_256
; CHECK: smin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: sminv {{h[0-9]+}}, [[V0]]
- %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr
- %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp slt <16 x i16> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp slt <16 x i16> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp slt <16 x i16> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp slt <16 x i16> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt
+ %arr.load = load <16 x i16>, <16 x i16>* %arr
+ %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> %arr.load)
ret i16 %r
}
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32>)
+
define i32 @oversized_smin_512(<16 x i32>* nocapture readonly %arr) {
; CHECK-LABEL: oversized_smin_512
; CHECK: smin v
@@ -491,20 +223,6 @@ define i32 @oversized_smin_512(<16 x i32>* nocapture readonly %arr) {
; CHECK-NEXT: smin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-NEXT: sminv {{s[0-9]+}}, [[V0]]
%arr.load = load <16 x i32>, <16 x i32>* %arr
- %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp slt <16 x i32> %arr.load, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp slt <16 x i32> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp slt <16 x i32> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp slt <16 x i32> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt
+ %r = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> %arr.load)
ret i32 %r
}
diff --git a/test/CodeGen/AArch64/arm64-vabs.ll b/test/CodeGen/AArch64/arm64-vabs.ll
index c7b0c33550d0..ff7a0a8300e2 100644
--- a/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/test/CodeGen/AArch64/arm64-vabs.ll
@@ -134,8 +134,10 @@ define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
ret <2 x i64> %tmp4
}
-define i16 @uabdl8h_log2_shuffle(<16 x i8>* %a, <16 x i8>* %b) {
-; CHECK-LABEL: uabdl8h_log2_shuffle
+declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>)
+
+define i16 @uabdl8h_rdx(<16 x i8>* %a, <16 x i8>* %b) {
+; CHECK-LABEL: uabdl8h_rdx
; CHECK: uabdl2.8h
; CHECK: uabdl.8h
%aload = load <16 x i8>, <16 x i8>* %a, align 1
@@ -146,20 +148,14 @@ define i16 @uabdl8h_log2_shuffle(<16 x i8>* %a, <16 x i8>* %b) {
%abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
%ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
%absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
- %rdx.shuf = shufflevector <16 x i16> %absel, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin1.rdx = add <16 x i16> %absel, %rdx.shuf
- %rdx.shufx = shufflevector <16 x i16> %bin1.rdx, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx = add <16 x i16> %bin1.rdx, %rdx.shufx
- %rdx.shuf136 = shufflevector <16 x i16> %bin.rdx, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx137 = add <16 x i16> %bin.rdx, %rdx.shuf136
- %rdx.shuf138 = shufflevector <16 x i16> %bin.rdx137, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx139 = add <16 x i16> %bin.rdx137, %rdx.shuf138
- %reduced_v = extractelement <16 x i16> %bin.rdx139, i16 0
+ %reduced_v = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> %absel)
ret i16 %reduced_v
}
-define i32 @uabdl4s_log2_shuffle(<8 x i16>* %a, <8 x i16>* %b) {
-; CHECK-LABEL: uabdl4s_log2_shuffle
+declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
+
+define i32 @uabdl4s_rdx(<8 x i16>* %a, <8 x i16>* %b) {
+; CHECK-LABEL: uabdl4s_rdx
; CHECK: uabdl2.4s
; CHECK: uabdl.4s
%aload = load <8 x i16>, <8 x i16>* %a, align 1
@@ -170,18 +166,14 @@ define i32 @uabdl4s_log2_shuffle(<8 x i16>* %a, <8 x i16>* %b) {
%abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
%ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
%absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
- %rdx.shuf = shufflevector <8 x i32> %absel, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx = add <8 x i32> %absel, %rdx.shuf
- %rdx.shuf136 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx137 = add <8 x i32> %bin.rdx, %rdx.shuf136
- %rdx.shuf138 = shufflevector <8 x i32> %bin.rdx137, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx139 = add <8 x i32> %bin.rdx137, %rdx.shuf138
- %reduced_v = extractelement <8 x i32> %bin.rdx139, i32 0
+ %reduced_v = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %absel)
ret i32 %reduced_v
}
-define i64 @uabdl2d_log2_shuffle(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
-; CHECK: uabdl2d_log2_shuffle
+declare i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64>)
+
+define i64 @uabdl2d_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
+; CHECK: uabdl2d_rdx
; CHECK: uabdl2.2d
; CHECK: uabdl.2d
%aload = load <4 x i32>, <4 x i32>* %a, align 1
@@ -192,11 +184,7 @@ define i64 @uabdl2d_log2_shuffle(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
%abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
%ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
%absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
- %rdx.shuf136 = shufflevector <4 x i64> %absel, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %bin.rdx137 = add <4 x i64> %absel, %rdx.shuf136
- %rdx.shuf138 = shufflevector <4 x i64> %bin.rdx137, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %bin.rdx139 = add <4 x i64> %bin.rdx137, %rdx.shuf138
- %reduced_v = extractelement <4 x i64> %bin.rdx139, i16 0
+ %reduced_v = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> %absel)
ret i64 %reduced_v
}
diff --git a/test/CodeGen/AArch64/ldst-zero.ll b/test/CodeGen/AArch64/ldst-zero.ll
index 95b92ac70879..7d443a631f91 100644
--- a/test/CodeGen/AArch64/ldst-zero.ll
+++ b/test/CodeGen/AArch64/ldst-zero.ll
@@ -9,9 +9,9 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
; Original test case which exhibited the bug
define void @test1(%struct.tree_common* %t, i32 %code, i8* %type) {
; CHECK-LABEL: test1:
-; CHECK: stp xzr, xzr, [x0, #8]
-; CHECK: stp xzr, x2, [x0]
-; CHECK: str w1, [x0, #16]
+; CHECK-DAG: stp x2, xzr, [x0, #8]
+; CHECK-DAG: str w1, [x0, #16]
+; CHECK-DAG: str xzr, [x0]
entry:
%0 = bitcast %struct.tree_common* %t to i8*
tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 24, i32 8, i1 false)
@@ -25,10 +25,8 @@ entry:
; Store to each struct element instead of using memset
define void @test2(%struct.tree_common* %t, i32 %code, i8* %type) {
; CHECK-LABEL: test2:
-; CHECK: stp xzr, xzr, [x0]
-; CHECK: str wzr, [x0, #16]
-; CHECK: str w1, [x0, #16]
-; CHECK: str x2, [x0, #8]
+; CHECK-DAG: str w1, [x0, #16]
+; CHECK-DAG: stp xzr, x2, [x0]
entry:
%0 = getelementptr inbounds %struct.tree_common, %struct.tree_common* %t, i64 0, i32 0
%1 = getelementptr inbounds %struct.tree_common, %struct.tree_common* %t, i64 0, i32 1
@@ -44,9 +42,9 @@ entry:
; Vector store instead of memset
define void @test3(%struct.tree_common* %t, i32 %code, i8* %type) {
; CHECK-LABEL: test3:
-; CHECK: stp xzr, xzr, [x0, #8]
-; CHECK: stp xzr, x2, [x0]
-; CHECK: str w1, [x0, #16]
+; CHECK-DAG: stp x2, xzr, [x0, #8]
+; CHECK-DAG: str w1, [x0, #16]
+; CHECK-DAG: str xzr, [x0]
entry:
%0 = bitcast %struct.tree_common* %t to <3 x i64>*
store <3 x i64> zeroinitializer, <3 x i64>* %0, align 8
@@ -60,9 +58,8 @@ entry:
; Vector store, then store to vector elements
define void @test4(<3 x i64>* %p, i64 %x, i64 %y) {
; CHECK-LABEL: test4:
-; CHECK: stp xzr, xzr, [x0, #8]
-; CHECK: stp xzr, x2, [x0]
-; CHECK: str x1, [x0, #16]
+; CHECK-DAG: stp x2, x1, [x0, #8]
+; CHECK-DAG: str xzr, [x0]
entry:
store <3 x i64> zeroinitializer, <3 x i64>* %p, align 8
%0 = bitcast <3 x i64>* %p to i64*
diff --git a/test/CodeGen/AArch64/misched-stp.ll b/test/CodeGen/AArch64/misched-stp.ll
index 4ea481cae68e..1c9ea68834c2 100644
--- a/test/CodeGen/AArch64/misched-stp.ll
+++ b/test/CodeGen/AArch64/misched-stp.ll
@@ -1,20 +1,18 @@
; REQUIRES: asserts
-; RUN: llc < %s -mtriple=aarch64 -mcpu=cyclone -mattr=+use-aa -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mcpu=cyclone -mattr=+use-aa -enable-misched -verify-misched -o - | FileCheck %s
; Tests to check that the scheduler dependencies derived from alias analysis are
; correct when we have loads that have been split up so that they can later be
; merged into STP.
-; CHECK: ********** MI Scheduling **********
-; CHECK: test_splat:BB#0 entry
-; CHECK: SU({{[0-9]+}}): STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 3; mem:ST4[%3+8]
-; CHECK: Successors:
-; CHECK-NEXT: ord [[SU1:SU\([0-9]+\)]]
-; CHECK: SU({{[0-9]+}}): STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 2; mem:ST4[%3+4]
-; CHECK: Successors:
-; CHECK-NEXT: ord [[SU2:SU\([0-9]+\)]]
-; CHECK: [[SU1]]: STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 3; mem:ST4[%2]
-; CHECK: [[SU2]]: STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 2; mem:ST4[%1]
+; Now that overwritten stores are elided in SelectionDAG, dependencies
+; are resolved and removed before MISCHED. Check that we have
+; equivalent pair of stp calls as a baseline.
+
+; CHECK-LABEL: test_splat
+; CHECK: ldr [[REG:w[0-9]+]], [x2]
+; CHECK-DAG: stp w0, [[REG]], [x2, #12]
+; CHECK-DAG: stp [[REG]], w1, [x2, #4]
define void @test_splat(i32 %x, i32 %y, i32* %p) {
entry:
%val = load i32, i32* %p, align 4
@@ -35,16 +33,11 @@ entry:
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
%struct.tree_common = type { i8*, i8*, i32 }
-; CHECK: ********** MI Scheduling **********
-; CHECK: test_zero:BB#0 entry
-; CHECK: SU({{[0-9]+}}): STRXui %XZR, %vreg{{[0-9]+}}, 2; mem:ST8[%0+16]
-; CHECK: Successors:
-; CHECK-NEXT: ord [[SU3:SU\([0-9]+\)]]
-; CHECK: SU({{[0-9]+}}): STRXui %XZR, %vreg{{[0-9]+}}, 1; mem:ST8[%0+8]
-; CHECK: Successors:
-; CHECK-NEXT: ord [[SU4:SU\([0-9]+\)]]
-; CHECK: [[SU3]]: STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 4; mem:ST4[%code1]
-; CHECK: [[SU4]]: STRXui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 1; mem:ST8[%type2]
+; CHECK-LABEL: test_zero
+; CHECK-DAG: stp x2, xzr, [x0, #8]
+; CHECK-DAG: str w1, [x0, #16]
+; CHECK-DAG: str xzr, [x0]
+
define void @test_zero(%struct.tree_common* %t, i32 %code, i8* %type) {
entry:
%0 = bitcast %struct.tree_common* %t to i8*
diff --git a/test/CodeGen/AMDGPU/fmax3.ll b/test/CodeGen/AMDGPU/fmax3.ll
index a96eb5db9e2a..2e6d3f3c1e8f 100644
--- a/test/CodeGen/AMDGPU/fmax3.ll
+++ b/test/CodeGen/AMDGPU/fmax3.ll
@@ -1,39 +1,92 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-declare float @llvm.maxnum.f32(float, float) nounwind readnone
-
-; SI-LABEL: {{^}}test_fmax3_olt_0:
-; SI: buffer_load_dword [[REGC:v[0-9]+]]
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
-define amdgpu_kernel void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; GCN-LABEL: {{^}}test_fmax3_olt_0_f32:
+; GCN: buffer_load_dword [[REGC:v[0-9]+]]
+; GCN: buffer_load_dword [[REGB:v[0-9]+]]
+; GCN: buffer_load_dword [[REGA:v[0-9]+]]
+; GCN: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define amdgpu_kernel void @test_fmax3_olt_0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 {
%a = load volatile float, float addrspace(1)* %aptr, align 4
%b = load volatile float, float addrspace(1)* %bptr, align 4
%c = load volatile float, float addrspace(1)* %cptr, align 4
- %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
- %f1 = call float @llvm.maxnum.f32(float %f0, float %c) nounwind readnone
+ %f0 = call float @llvm.maxnum.f32(float %a, float %b)
+ %f1 = call float @llvm.maxnum.f32(float %f0, float %c)
store float %f1, float addrspace(1)* %out, align 4
ret void
}
; Commute operand of second fmax
-; SI-LABEL: {{^}}test_fmax3_olt_1:
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: buffer_load_dword [[REGC:v[0-9]+]]
-; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
-define amdgpu_kernel void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+; GCN-LABEL: {{^}}test_fmax3_olt_1_f32:
+; GCN: buffer_load_dword [[REGB:v[0-9]+]]
+; GCN: buffer_load_dword [[REGA:v[0-9]+]]
+; GCN: buffer_load_dword [[REGC:v[0-9]+]]
+; GCN: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define amdgpu_kernel void @test_fmax3_olt_1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 {
%a = load volatile float, float addrspace(1)* %aptr, align 4
%b = load volatile float, float addrspace(1)* %bptr, align 4
%c = load volatile float, float addrspace(1)* %cptr, align 4
- %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
- %f1 = call float @llvm.maxnum.f32(float %c, float %f0) nounwind readnone
+ %f0 = call float @llvm.maxnum.f32(float %a, float %b)
+ %f1 = call float @llvm.maxnum.f32(float %c, float %f0)
store float %f1, float addrspace(1)* %out, align 4
ret void
}
+
+; GCN-LABEL: {{^}}test_fmax3_olt_0_f16:
+; GCN: buffer_load_ushort [[REGC:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGB:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGA:v[0-9]+]]
+
+; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]],
+; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT]]
+
+; VI: v_max_f16_e32
+; VI: v_max_f16_e32 [[RESULT:v[0-9]+]],
+
+; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_short [[RESULT]],
+define amdgpu_kernel void @test_fmax3_olt_0_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
+ %a = load volatile half, half addrspace(1)* %aptr, align 2
+ %b = load volatile half, half addrspace(1)* %bptr, align 2
+ %c = load volatile half, half addrspace(1)* %cptr, align 2
+ %f0 = call half @llvm.maxnum.f16(half %a, half %b)
+ %f1 = call half @llvm.maxnum.f16(half %f0, half %c)
+ store half %f1, half addrspace(1)* %out, align 2
+ ret void
+}
+
+; Commute operand of second fmax
+; GCN-LABEL: {{^}}test_fmax3_olt_1_f16:
+; GCN: buffer_load_ushort [[REGB:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGA:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGC:v[0-9]+]]
+
+; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]],
+; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT]]
+
+; VI: v_max_f16_e32
+; VI: v_max_f16_e32 [[RESULT:v[0-9]+]],
+
+; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_short [[RESULT]],
+define amdgpu_kernel void @test_fmax3_olt_1_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
+ %a = load volatile half, half addrspace(1)* %aptr, align 2
+ %b = load volatile half, half addrspace(1)* %bptr, align 2
+ %c = load volatile half, half addrspace(1)* %cptr, align 2
+ %f0 = call half @llvm.maxnum.f16(half %a, half %b)
+ %f1 = call half @llvm.maxnum.f16(half %c, half %f0)
+ store half %f1, half addrspace(1)* %out, align 2
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare half @llvm.maxnum.f16(half, half) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/fmin3.ll b/test/CodeGen/AMDGPU/fmin3.ll
index 3183f77f090b..5fc5895c3ecb 100644
--- a/test/CodeGen/AMDGPU/fmin3.ll
+++ b/test/CodeGen/AMDGPU/fmin3.ll
@@ -1,40 +1,90 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-declare float @llvm.minnum.f32(float, float) nounwind readnone
-
-; SI-LABEL: {{^}}test_fmin3_olt_0:
-; SI: buffer_load_dword [[REGC:v[0-9]+]]
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
-define amdgpu_kernel void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; GCN-LABEL: {{^}}test_fmin3_olt_0_f32:
+; GCN: buffer_load_dword [[REGC:v[0-9]+]]
+; GCN: buffer_load_dword [[REGB:v[0-9]+]]
+; GCN: buffer_load_dword [[REGA:v[0-9]+]]
+; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_dword [[RESULT]],
+define amdgpu_kernel void @test_fmin3_olt_0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 {
%a = load volatile float, float addrspace(1)* %aptr, align 4
%b = load volatile float, float addrspace(1)* %bptr, align 4
%c = load volatile float, float addrspace(1)* %cptr, align 4
- %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
- %f1 = call float @llvm.minnum.f32(float %f0, float %c) nounwind readnone
+ %f0 = call float @llvm.minnum.f32(float %a, float %b)
+ %f1 = call float @llvm.minnum.f32(float %f0, float %c)
store float %f1, float addrspace(1)* %out, align 4
ret void
}
; Commute operand of second fmin
-; SI-LABEL: {{^}}test_fmin3_olt_1:
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: buffer_load_dword [[REGC:v[0-9]+]]
-; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
-define amdgpu_kernel void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+; GCN-LABEL: {{^}}test_fmin3_olt_1_f32:
+; GCN: buffer_load_dword [[REGB:v[0-9]+]]
+; GCN: buffer_load_dword [[REGA:v[0-9]+]]
+; GCN: buffer_load_dword [[REGC:v[0-9]+]]
+; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_dword [[RESULT]],
+define amdgpu_kernel void @test_fmin3_olt_1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 {
%a = load volatile float, float addrspace(1)* %aptr, align 4
%b = load volatile float, float addrspace(1)* %bptr, align 4
%c = load volatile float, float addrspace(1)* %cptr, align 4
- %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
- %f1 = call float @llvm.minnum.f32(float %c, float %f0) nounwind readnone
+ %f0 = call float @llvm.minnum.f32(float %a, float %b)
+ %f1 = call float @llvm.minnum.f32(float %c, float %f0)
store float %f1, float addrspace(1)* %out, align 4
ret void
}
+
+; GCN-LABEL: {{^}}test_fmin3_olt_0_f16:
+; GCN: buffer_load_ushort [[REGC:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGB:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGA:v[0-9]+]]
+
+; SI: v_min3_f32 [[RESULT_F32:v[0-9]+]],
+; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT]]
+
+; VI: v_min_f16_e32
+; VI: v_min_f16_e32 [[RESULT:v[0-9]+]],
+
+; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_short [[RESULT]],
+define amdgpu_kernel void @test_fmin3_olt_0_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
+ %a = load volatile half, half addrspace(1)* %aptr, align 2
+ %b = load volatile half, half addrspace(1)* %bptr, align 2
+ %c = load volatile half, half addrspace(1)* %cptr, align 2
+ %f0 = call half @llvm.minnum.f16(half %a, half %b)
+ %f1 = call half @llvm.minnum.f16(half %f0, half %c)
+ store half %f1, half addrspace(1)* %out, align 2
+ ret void
+}
+
+; Commute operand of second fmin
+; GCN-LABEL: {{^}}test_fmin3_olt_1_f16:
+; GCN: buffer_load_ushort [[REGB:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGA:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGC:v[0-9]+]]
+
+; SI: v_min3_f32 [[RESULT_F32:v[0-9]+]],
+; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT]]
+
+; VI: v_min_f16_e32
+; VI: v_min_f16_e32 [[RESULT:v[0-9]+]],
+
+; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_short [[RESULT]],
+define amdgpu_kernel void @test_fmin3_olt_1_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
+ %a = load volatile half, half addrspace(1)* %aptr, align 2
+ %b = load volatile half, half addrspace(1)* %bptr, align 2
+ %c = load volatile half, half addrspace(1)* %cptr, align 2
+ %f0 = call half @llvm.minnum.f16(half %a, half %b)
+ %f1 = call half @llvm.minnum.f16(half %c, half %f0)
+ store half %f1, half addrspace(1)* %out, align 2
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare float @llvm.minnum.f32(float, float) #1
+declare half @llvm.minnum.f16(half, half) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/global-constant.ll b/test/CodeGen/AMDGPU/global-constant.ll
index 80acfcca7082..1898c8fb63ea 100644
--- a/test/CodeGen/AMDGPU/global-constant.ll
+++ b/test/CodeGen/AMDGPU/global-constant.ll
@@ -29,10 +29,10 @@
define amdgpu_kernel void @private_test(i32 %index, float addrspace(1)* %out) {
%ptr = getelementptr [4 x float], [4 x float] addrspace(2) * @private1, i32 0, i32 %index
%val = load float, float addrspace(2)* %ptr
- store float %val, float addrspace(1)* %out
+ store volatile float %val, float addrspace(1)* %out
%ptr2 = getelementptr [4 x float], [4 x float] addrspace(2) * @private2, i32 0, i32 %index
%val2 = load float, float addrspace(2)* %ptr2
- store float %val2, float addrspace(1)* %out
+ store volatile float %val2, float addrspace(1)* %out
ret void
}
diff --git a/test/CodeGen/AMDGPU/immv216.ll b/test/CodeGen/AMDGPU/immv216.ll
index c15a30e3c540..96132d841997 100644
--- a/test/CodeGen/AMDGPU/immv216.ll
+++ b/test/CodeGen/AMDGPU/immv216.ll
@@ -288,9 +288,9 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace
}
; GCN-LABEL: {{^}}commute_add_literal_v2f16:
-; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x64006400
-; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[K]], [[VAL]]
+; GFX9-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[K]], [[VAL]] op_sel_hi:[0,1]{{$}}
; GFX9: buffer_store_dword [[REG]]
; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
diff --git a/test/CodeGen/AMDGPU/max3.ll b/test/CodeGen/AMDGPU/max3.ll
index 4bb4fd46becd..46dcf8e340f4 100644
--- a/test/CodeGen/AMDGPU/max3.ll
+++ b/test/CodeGen/AMDGPU/max3.ll
@@ -1,41 +1,94 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-
-; FUNC-LABEL: @v_test_imax3_sgt_i32
-; SI: v_max3_i32
-define amdgpu_kernel void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; GCN-LABEL: {{^}}v_test_imax3_sgt_i32:
+; GCN: v_max3_i32
+define amdgpu_kernel void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
- %c = load i32, i32 addrspace(1)* %gep2, align 4
+ %a = load i32, i32 addrspace(1)* %gep0
+ %b = load i32, i32 addrspace(1)* %gep1
+ %c = load i32, i32 addrspace(1)* %gep2
%icmp0 = icmp sgt i32 %a, %b
%i0 = select i1 %icmp0, i32 %a, i32 %b
%icmp1 = icmp sgt i32 %i0, %c
%i1 = select i1 %icmp1, i32 %i0, i32 %c
- store i32 %i1, i32 addrspace(1)* %out, align 4
+ store i32 %i1, i32 addrspace(1)* %out
ret void
}
-; FUNC-LABEL: @v_test_umax3_ugt_i32
-; SI: v_max3_u32
-define amdgpu_kernel void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; GCN-LABEL: {{^}}v_test_umax3_ugt_i32:
+; GCN: v_max3_u32
+define amdgpu_kernel void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
- %c = load i32, i32 addrspace(1)* %gep2, align 4
+ %a = load i32, i32 addrspace(1)* %gep0
+ %b = load i32, i32 addrspace(1)* %gep1
+ %c = load i32, i32 addrspace(1)* %gep2
%icmp0 = icmp ugt i32 %a, %b
%i0 = select i1 %icmp0, i32 %a, i32 %b
%icmp1 = icmp ugt i32 %i0, %c
%i1 = select i1 %icmp1, i32 %i0, i32 %c
- store i32 %i1, i32 addrspace(1)* %out, align 4
+ store i32 %i1, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_imax3_sgt_i16:
+; SI: v_max3_i32
+
+; VI: v_max_i16
+; VI: v_max_i16
+
+; GFX9: v_max3_i16
+define amdgpu_kernel void @v_test_imax3_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+ %a = load i16, i16 addrspace(1)* %gep0
+ %b = load i16, i16 addrspace(1)* %gep1
+ %c = load i16, i16 addrspace(1)* %gep2
+ %icmp0 = icmp sgt i16 %a, %b
+ %i0 = select i1 %icmp0, i16 %a, i16 %b
+ %icmp1 = icmp sgt i16 %i0, %c
+ %i1 = select i1 %icmp1, i16 %i0, i16 %c
+ store i16 %i1, i16 addrspace(1)* %out
ret void
}
+
+; GCN-LABEL: {{^}}v_test_umax3_ugt_i16:
+; SI: v_max3_u32
+
+; VI: v_max_u16
+; VI: v_max_u16
+
+; GFX9: v_max3_u16
+define amdgpu_kernel void @v_test_umax3_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+ %a = load i16, i16 addrspace(1)* %gep0
+ %b = load i16, i16 addrspace(1)* %gep1
+ %c = load i16, i16 addrspace(1)* %gep2
+ %icmp0 = icmp ugt i16 %a, %b
+ %i0 = select i1 %icmp0, i16 %a, i16 %b
+ %icmp1 = icmp ugt i16 %i0, %c
+ %i1 = select i1 %icmp1, i16 %i0, i16 %c
+ store i16 %i1, i16 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/min3.ll b/test/CodeGen/AMDGPU/min3.ll
index 59d5d2cdb1aa..e20fb81f2ecf 100644
--- a/test/CodeGen/AMDGPU/min3.ll
+++ b/test/CodeGen/AMDGPU/min3.ll
@@ -1,50 +1,50 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-
-; FUNC-LABEL: @v_test_imin3_slt_i32
-; SI: v_min3_i32
-define amdgpu_kernel void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; GCN-LABEL: {{^}}v_test_imin3_slt_i32:
+; GCN: v_min3_i32
+define amdgpu_kernel void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
- %c = load i32, i32 addrspace(1)* %gep2, align 4
+ %a = load i32, i32 addrspace(1)* %gep0
+ %b = load i32, i32 addrspace(1)* %gep1
+ %c = load i32, i32 addrspace(1)* %gep2
%icmp0 = icmp slt i32 %a, %b
%i0 = select i1 %icmp0, i32 %a, i32 %b
%icmp1 = icmp slt i32 %i0, %c
%i1 = select i1 %icmp1, i32 %i0, i32 %c
- store i32 %i1, i32 addrspace(1)* %outgep, align 4
+ store i32 %i1, i32 addrspace(1)* %outgep
ret void
}
-; FUNC-LABEL: @v_test_umin3_ult_i32
-; SI: v_min3_u32
-define amdgpu_kernel void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; GCN-LABEL: {{^}}v_test_umin3_ult_i32:
+; GCN: v_min3_u32
+define amdgpu_kernel void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
- %c = load i32, i32 addrspace(1)* %gep2, align 4
+ %a = load i32, i32 addrspace(1)* %gep0
+ %b = load i32, i32 addrspace(1)* %gep1
+ %c = load i32, i32 addrspace(1)* %gep2
%icmp0 = icmp ult i32 %a, %b
%i0 = select i1 %icmp0, i32 %a, i32 %b
%icmp1 = icmp ult i32 %i0, %c
%i1 = select i1 %icmp1, i32 %i0, i32 %c
- store i32 %i1, i32 addrspace(1)* %outgep, align 4
+ store i32 %i1, i32 addrspace(1)* %outgep
ret void
}
-; FUNC-LABEL: @v_test_umin_umin_umin
-; SI: v_min_i32
-; SI: v_min3_i32
-define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; GCN-LABEL: {{^}}v_test_umin_umin_umin:
+; GCN: v_min_i32
+; GCN: v_min3_i32
+define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = mul i32 %tid, 2
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -57,10 +57,10 @@ define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 add
%outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
- %c = load i32, i32 addrspace(1)* %gep2, align 4
- %d = load i32, i32 addrspace(1)* %gep3, align 4
+ %a = load i32, i32 addrspace(1)* %gep0
+ %b = load i32, i32 addrspace(1)* %gep1
+ %c = load i32, i32 addrspace(1)* %gep2
+ %d = load i32, i32 addrspace(1)* %gep3
%icmp0 = icmp slt i32 %a, %b
%i0 = select i1 %icmp0, i32 %a, i32 %b
@@ -71,14 +71,14 @@ define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 add
%icmp2 = icmp slt i32 %i0, %i1
%i2 = select i1 %icmp2, i32 %i0, i32 %i1
- store i32 %i2, i32 addrspace(1)* %outgep1, align 4
+ store i32 %i2, i32 addrspace(1)* %outgep1
ret void
}
-; FUNC-LABEL: @v_test_umin3_2_uses
-; SI-NOT: v_min3
-define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; GCN-LABEL: {{^}}v_test_umin3_2_uses:
+; GCN-NOT: v_min3
+define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = mul i32 %tid, 2
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -91,10 +91,10 @@ define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrs
%outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
- %c = load i32, i32 addrspace(1)* %gep2, align 4
- %d = load i32, i32 addrspace(1)* %gep3, align 4
+ %a = load i32, i32 addrspace(1)* %gep0
+ %b = load i32, i32 addrspace(1)* %gep1
+ %c = load i32, i32 addrspace(1)* %gep2
+ %d = load i32, i32 addrspace(1)* %gep3
%icmp0 = icmp slt i32 %a, %b
%i0 = select i1 %icmp0, i32 %a, i32 %b
@@ -105,7 +105,60 @@ define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrs
%icmp2 = icmp slt i32 %i0, %c
%i2 = select i1 %icmp2, i32 %i0, i32 %c
- store i32 %i2, i32 addrspace(1)* %outgep0, align 4
- store i32 %i0, i32 addrspace(1)* %outgep1, align 4
+ store i32 %i2, i32 addrspace(1)* %outgep0
+ store i32 %i0, i32 addrspace(1)* %outgep1
ret void
}
+
+; GCN-LABEL: {{^}}v_test_imin3_slt_i16:
+; SI: v_min3_i32
+
+; VI: v_min_i16
+; VI: v_min_i16
+
+; GFX9: v_min3_i16
+define amdgpu_kernel void @v_test_imin3_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+ %a = load i16, i16 addrspace(1)* %gep0
+ %b = load i16, i16 addrspace(1)* %gep1
+ %c = load i16, i16 addrspace(1)* %gep2
+ %icmp0 = icmp slt i16 %a, %b
+ %i0 = select i1 %icmp0, i16 %a, i16 %b
+ %icmp1 = icmp slt i16 %i0, %c
+ %i1 = select i1 %icmp1, i16 %i0, i16 %c
+ store i16 %i1, i16 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_umin3_ult_i16:
+; SI: v_min3_u32
+
+; VI: v_min_u16
+; VI: v_min_u16
+
+; GFX9: v_min3_u16
+define amdgpu_kernel void @v_test_umin3_ult_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+ %a = load i16, i16 addrspace(1)* %gep0
+ %b = load i16, i16 addrspace(1)* %gep1
+ %c = load i16, i16 addrspace(1)* %gep2
+ %icmp0 = icmp ult i16 %a, %b
+ %i0 = select i1 %icmp0, i16 %a, i16 %b
+ %icmp1 = icmp ult i16 %i0, %c
+ %i1 = select i1 %icmp1, i16 %i0, i16 %c
+ store i16 %i1, i16 addrspace(1)* %outgep
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/packed-op-sel.ll b/test/CodeGen/AMDGPU/packed-op-sel.ll
new file mode 100644
index 000000000000..6ff0c54c33d0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/packed-op-sel.ll
@@ -0,0 +1,266 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s
+
+; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
+define amdgpu_kernel void @fma_vector_vector_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+
+ %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
+ %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.broadcast)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; Apply fneg to broadcasted vector
+; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_scalar_lo:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+
+ %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
+ %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
+ %neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %scalar0.broadcast
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; Apply fneg before broadcast
+; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+
+ %neg.scalar0 = fsub half -0.0, %scalar0
+ %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
+ %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; Apply fneg before and after broadcast, and should cancel out.
+; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_neg_scalar_lo:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
+define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+
+ %neg.scalar0 = fsub half -0.0, %scalar0
+ %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
+ %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
+ %neg.neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %neg.scalar0.broadcast
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.scalar0.broadcast)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; Add scalar, but negate low component
+; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_lo:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+
+ %neg.scalar0 = fsub half -0.0, %scalar0
+ %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
+ %neg.scalar0.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %scalar0, i32 1
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.scalar0)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; Add scalar, but negate high component
+; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+
+ %neg.scalar0 = fsub half -0.0, %scalar0
+ %neg.scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
+ %scalar0.neg.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %neg.scalar0, i32 1
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.neg.scalar0)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; Apply fneg before broadcast with bitcast
+; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_xor_b32_e32 [[NEG_SCALAR0:v[0-9]+]], 0x8000, [[SCALAR0]]
+; GCN-NEXT: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[NEG_SCALAR0]] op_sel_hi:[1,0]{{$}}
+define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+ %neg.scalar0 = fsub half -0.0, %scalar0
+ %neg.scalar0.bc = bitcast half %neg.scalar0 to i16
+
+ %neg.scalar0.vec = insertelement <2 x i16> undef, i16 %neg.scalar0.bc, i32 0
+ %neg.scalar0.broadcast = shufflevector <2 x i16> %neg.scalar0.vec, <2 x i16> undef, <2 x i32> zeroinitializer
+
+ %result = add <2 x i16> %vec0, %neg.scalar0.broadcast
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo_neg_scalar_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR1:v[0-9]+]]
+
+; FIXME: Remove and
+; GCN: v_and_b32_e32 [[SCALAR0]], 0xffff, [[SCALAR0]]
+; GCN: v_xor_b32_e32 [[SCALAR1]], 0x8000, [[SCALAR1]]
+; GCN: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[SCALAR1]], 16, [[SCALAR0]]
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]]{{$}}
+define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+ %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+ %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2
+
+ %neg.scalar1 = fsub half -0.0, %scalar1
+ %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
+ %vec2 = insertelement <2 x half> %vec.ins0, half %neg.scalar1, i32 1
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR1:v[0-9]+]]
+
+; FIXME: Remove and
+; GCN: v_and_b32_e32 [[SCALAR0]], 0xffff, [[SCALAR0]]
+; GCN: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[SCALAR1]], 16, [[SCALAR0]]
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+ %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+ %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2
+
+ %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
+ %vec2 = insertelement <2 x half> %vec.ins0, half %scalar1, i32 1
+ %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll b/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
index 8403dd991360..777eccb00b02 100644
--- a/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
+++ b/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
@@ -20,7 +20,7 @@ bb3: ; preds = %bb, %entry
bb8: ; preds = %bb3
%1 = getelementptr inbounds i8, i8* %0, i32 0
- store i8 0, i8* %1, align 1
+ store volatile i8 0, i8* %1, align 1
%2 = call i32 @ptou() nounwind
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
@@ -35,7 +35,7 @@ bb8: ; preds = %bb3
%7 = or i8 %6, 48
%8 = add i8 %6, 87
%iftmp.5.0.1 = select i1 %5, i8 %7, i8 %8
- store i8 %iftmp.5.0.1, i8* %p8, align 1
+ store volatile i8 %iftmp.5.0.1, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -49,7 +49,7 @@ bb8: ; preds = %bb3
%13 = or i8 %12, 48
%14 = add i8 %12, 87
%iftmp.5.0.2 = select i1 %11, i8 %13, i8 %14
- store i8 %iftmp.5.0.2, i8* %p8, align 1
+ store volatile i8 %iftmp.5.0.2, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -63,7 +63,7 @@ bb8: ; preds = %bb3
%19 = or i8 %18, 48
%20 = add i8 %18, 87
%iftmp.5.0.4 = select i1 %17, i8 %19, i8 %20
- store i8 %iftmp.5.0.4, i8* null, align 1
+ store volatile i8 %iftmp.5.0.4, i8* null, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -74,7 +74,7 @@ bb8: ; preds = %bb3
%22 = urem i32 %21, 10
%23 = icmp ult i32 %22, 10
%iftmp.5.0.5 = select i1 %23, i8 0, i8 %val8
- store i8 %iftmp.5.0.5, i8* %p8, align 1
+ store volatile i8 %iftmp.5.0.5, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -88,7 +88,7 @@ bb8: ; preds = %bb3
%28 = or i8 %27, 48
%29 = add i8 %27, 87
%iftmp.5.0.6 = select i1 %26, i8 %28, i8 %29
- store i8 %iftmp.5.0.6, i8* %p8, align 1
+ store volatile i8 %iftmp.5.0.6, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -102,7 +102,7 @@ bb8: ; preds = %bb3
%34 = or i8 %33, 48
%35 = add i8 %33, 87
%iftmp.5.0.7 = select i1 %32, i8 %34, i8 %35
- store i8 %iftmp.5.0.7, i8* %p8, align 1
+ store volatile i8 %iftmp.5.0.7, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -116,7 +116,7 @@ bb8: ; preds = %bb3
%40 = or i8 %39, 48
%41 = add i8 %39, 87
%iftmp.5.0.8 = select i1 %38, i8 %40, i8 %41
- store i8 %iftmp.5.0.8, i8* null, align 1
+ store volatile i8 %iftmp.5.0.8, i8* null, align 1
br label %bb46
bb46: ; preds = %bb3
diff --git a/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll b/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
index 2a5af6199a34..954860219d19 100644
--- a/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
+++ b/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
@@ -13,7 +13,7 @@ entry:
; CHECK: sub sp, sp, #12
; CHECK: sub sp, sp, #4
; CHECK: add r0, sp, #4
-; CHECK: stm sp, {r0, r1, r2, r3}
+; CHECK: stmib sp, {r1, r2, r3}
%g = alloca i8*
%g1 = bitcast i8** %g to i8*
call void @llvm.va_start(i8* %g1)
diff --git a/test/CodeGen/ARM/dag-combine-ldst.ll b/test/CodeGen/ARM/dag-combine-ldst.ll
index c1960ee6c6e9..077754ef013d 100644
--- a/test/CodeGen/ARM/dag-combine-ldst.ll
+++ b/test/CodeGen/ARM/dag-combine-ldst.ll
@@ -8,7 +8,7 @@
; CHECK-LABEL: {{^}}main
; CHECK: mov [[TMP:r[0-9]+]], #0
; CHECK-NEXT: str [[TMP]], [sp, #4]
-; CHECK-NEXT: str [[TMP]], [sp]
+; CHECK_O0: str [[TMP]], [sp]
; CHECK_O0: ldr [[TMP:r[0-9]+]], [sp]
; CHECK_O0-NEXT: add [[TMP]], [[TMP]], #2
; CHECK_O1-NOT: ldr [[TMP:r[0-9]+]], [sp]
diff --git a/test/CodeGen/MSP430/vararg.ll b/test/CodeGen/MSP430/vararg.ll
index 6c8bceff5de9..a708b89cbd8f 100644
--- a/test/CodeGen/MSP430/vararg.ll
+++ b/test/CodeGen/MSP430/vararg.ll
@@ -25,7 +25,6 @@ define i16 @va_arg(i8* %vl) nounwind {
entry:
; CHECK-LABEL: va_arg:
%vl.addr = alloca i8*, align 2
-; CHECK: mov.w r12, 0(r1)
store i8* %vl, i8** %vl.addr, align 2
; CHECK: mov.w r12, [[REG:r[0-9]+]]
; CHECK-NEXT: add.w #2, [[REG]]
diff --git a/test/CodeGen/Mips/msa/bmzi_bmnzi.ll b/test/CodeGen/Mips/msa/bmzi_bmnzi.ll
index d1cb3c348c73..de62dcd69403 100644
--- a/test/CodeGen/Mips/msa/bmzi_bmnzi.ll
+++ b/test/CodeGen/Mips/msa/bmzi_bmnzi.ll
@@ -9,9 +9,9 @@ entry:
%0 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG1
%1 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG2
%2 = tail call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %0, <16 x i8> %1, i32 240)
- store <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
+ store volatile <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
%3 = tail call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %0, <16 x i8> %1, i32 15)
- store <16 x i8> %3, <16 x i8>* @llvm_mips_bmnzi_b_RES
+ store volatile <16 x i8> %3, <16 x i8>* @llvm_mips_bmnzi_b_RES
%4 = tail call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %0, <16 x i8> %1, i32 170)
store <16 x i8> %4, <16 x i8>* @llvm_mips_bmnzi_b_RES
ret void
@@ -32,9 +32,9 @@ entry:
%0 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG1
%1 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG2
%2 = tail call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %0, <16 x i8> %1, i32 240)
- store <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
+ store volatile <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
%3 = tail call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %0, <16 x i8> %1, i32 15)
- store <16 x i8> %3, <16 x i8>* @llvm_mips_bmnzi_b_RES
+ store volatile <16 x i8> %3, <16 x i8>* @llvm_mips_bmnzi_b_RES
%4 = tail call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %0, <16 x i8> %1, i32 170)
store <16 x i8> %4, <16 x i8>* @llvm_mips_bmnzi_b_RES
ret void
diff --git a/test/CodeGen/PowerPC/atomic-2.ll b/test/CodeGen/PowerPC/atomic-2.ll
index 18715ddb37c6..2039c1f57f17 100644
--- a/test/CodeGen/PowerPC/atomic-2.ll
+++ b/test/CodeGen/PowerPC/atomic-2.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=ppc64 | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BE
-; RUN: llc < %s -march=ppc64le -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LE
-; RUN: llc < %s -march=ppc64 -mcpu=pwr7 | FileCheck %s
-; RUN: llc < %s -march=ppc64 -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-P8U
+; RUN: llc < %s -ppc-asm-full-reg-names -march=ppc64 | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BE
+; RUN: llc < %s -ppc-asm-full-reg-names -march=ppc64le -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LE
+; RUN: llc < %s -ppc-asm-full-reg-names -march=ppc64 -mcpu=pwr7 | FileCheck %s
+; RUN: llc < %s -ppc-asm-full-reg-names -march=ppc64 -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-P8U
define i64 @exchange_and_add(i64* %mem, i64 %val) nounwind {
; CHECK-LABEL: exchange_and_add:
@@ -108,8 +108,10 @@ entry:
; CHECK: @atomic_load
%tmp = load atomic i64, i64* %mem acquire, align 64
; CHECK-NOT: ldarx
-; CHECK: ld
-; CHECK: lwsync
+; CHECK: ld [[VAL:r[0-9]+]]
+; CHECK: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]]
+; CHECK: bne- [[CR]], .+4
+; CHECK: isync
ret i64 %tmp
}
diff --git a/test/CodeGen/PowerPC/atomics-indexed.ll b/test/CodeGen/PowerPC/atomics-indexed.ll
index 7a0dde034d68..cfe15f0061c4 100644
--- a/test/CodeGen/PowerPC/atomics-indexed.ll
+++ b/test/CodeGen/PowerPC/atomics-indexed.ll
@@ -10,16 +10,22 @@
define i8 @load_x_i8_seq_cst([100000 x i8]* %mem) {
; CHECK-LABEL: load_x_i8_seq_cst
; CHECK: sync
-; CHECK: lbzx
-; CHECK: lwsync
+; CHECK: lbzx [[VAL:r[0-9]+]]
+; CHECK-PPC32: lwsync
+; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]]
+; CHECK-PPC64: bne- [[CR]], .+4
+; CHECK-PPC64: isync
%ptr = getelementptr inbounds [100000 x i8], [100000 x i8]* %mem, i64 0, i64 90000
%val = load atomic i8, i8* %ptr seq_cst, align 1
ret i8 %val
}
define i16 @load_x_i16_acquire([100000 x i16]* %mem) {
; CHECK-LABEL: load_x_i16_acquire
-; CHECK: lhzx
-; CHECK: lwsync
+; CHECK: lhzx [[VAL:r[0-9]+]]
+; CHECK-PPC32: lwsync
+; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]]
+; CHECK-PPC64: bne- [[CR]], .+4
+; CHECK-PPC64: isync
%ptr = getelementptr inbounds [100000 x i16], [100000 x i16]* %mem, i64 0, i64 90000
%val = load atomic i16, i16* %ptr acquire, align 2
ret i16 %val
diff --git a/test/CodeGen/PowerPC/atomics-regression.ll b/test/CodeGen/PowerPC/atomics-regression.ll
index 9af82b625532..054d3a4146b0 100644
--- a/test/CodeGen/PowerPC/atomics-regression.ll
+++ b/test/CodeGen/PowerPC/atomics-regression.ll
@@ -23,7 +23,9 @@ define i8 @test2(i8* %ptr) {
; PPC64LE-LABEL: test2:
; PPC64LE: # BB#0:
; PPC64LE-NEXT: lbz 3, 0(3)
-; PPC64LE-NEXT: lwsync
+; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
%val = load atomic i8, i8* %ptr acquire, align 1
ret i8 %val
@@ -35,7 +37,9 @@ define i8 @test3(i8* %ptr) {
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: ori 2, 2, 0
; PPC64LE-NEXT: lbz 3, 0(3)
-; PPC64LE-NEXT: lwsync
+; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
%val = load atomic i8, i8* %ptr seq_cst, align 1
ret i8 %val
@@ -63,7 +67,9 @@ define i16 @test6(i16* %ptr) {
; PPC64LE-LABEL: test6:
; PPC64LE: # BB#0:
; PPC64LE-NEXT: lhz 3, 0(3)
-; PPC64LE-NEXT: lwsync
+; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
%val = load atomic i16, i16* %ptr acquire, align 2
ret i16 %val
@@ -75,7 +81,9 @@ define i16 @test7(i16* %ptr) {
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: ori 2, 2, 0
; PPC64LE-NEXT: lhz 3, 0(3)
-; PPC64LE-NEXT: lwsync
+; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
%val = load atomic i16, i16* %ptr seq_cst, align 2
ret i16 %val
@@ -103,7 +111,9 @@ define i32 @test10(i32* %ptr) {
; PPC64LE-LABEL: test10:
; PPC64LE: # BB#0:
; PPC64LE-NEXT: lwz 3, 0(3)
-; PPC64LE-NEXT: lwsync
+; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
%val = load atomic i32, i32* %ptr acquire, align 4
ret i32 %val
@@ -115,7 +125,9 @@ define i32 @test11(i32* %ptr) {
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: ori 2, 2, 0
; PPC64LE-NEXT: lwz 3, 0(3)
-; PPC64LE-NEXT: lwsync
+; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
%val = load atomic i32, i32* %ptr seq_cst, align 4
ret i32 %val
@@ -143,7 +155,9 @@ define i64 @test14(i64* %ptr) {
; PPC64LE-LABEL: test14:
; PPC64LE: # BB#0:
; PPC64LE-NEXT: ld 3, 0(3)
-; PPC64LE-NEXT: lwsync
+; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
%val = load atomic i64, i64* %ptr acquire, align 8
ret i64 %val
@@ -155,7 +169,9 @@ define i64 @test15(i64* %ptr) {
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: ori 2, 2, 0
; PPC64LE-NEXT: ld 3, 0(3)
-; PPC64LE-NEXT: lwsync
+; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
%val = load atomic i64, i64* %ptr seq_cst, align 8
ret i64 %val
@@ -9544,3 +9560,35 @@ define i64 @test559(i64* %ptr, i64 %val) {
%ret = atomicrmw umin i64* %ptr, i64 %val singlethread seq_cst
ret i64 %ret
}
+
+; The second load should never be scheduled before isync.
+define i32 @test_ordering0(i32* %ptr1, i32* %ptr2) {
+; PPC64LE-LABEL: test_ordering0:
+; PPC64LE: # BB#0:
+; PPC64LE-NEXT: lwz 4, 0(3)
+; PPC64LE-NEXT: cmpw 7, 4, 4
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
+; PPC64LE-NEXT: lwz 3, 0(3)
+; PPC64LE-NEXT: add 3, 4, 3
+; PPC64LE-NEXT: blr
+ %val1 = load atomic i32, i32* %ptr1 acquire, align 4
+ %val2 = load i32, i32* %ptr1
+ %add = add i32 %val1, %val2
+ ret i32 %add
+}
+
+; The second store should never be scheduled before isync.
+define i32 @test_ordering1(i32* %ptr1, i32 %val1, i32* %ptr2) {
+; PPC64LE-LABEL: test_ordering1:
+; PPC64LE: # BB#0:
+; PPC64LE-NEXT: lwz 3, 0(3)
+; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
+; PPC64LE-NEXT: stw 4, 0(5)
+; PPC64LE-NEXT: blr
+ %val2 = load atomic i32, i32* %ptr1 acquire, align 4
+ store i32 %val1, i32* %ptr2
+ ret i32 %val2
+}
diff --git a/test/CodeGen/PowerPC/atomics.ll b/test/CodeGen/PowerPC/atomics.ll
index 2e1eff0f634d..61d54534f5fd 100644
--- a/test/CodeGen/PowerPC/atomics.ll
+++ b/test/CodeGen/PowerPC/atomics.ll
@@ -25,9 +25,12 @@ define i16 @load_i16_monotonic(i16* %mem) {
}
define i32 @load_i32_acquire(i32* %mem) {
; CHECK-LABEL: load_i32_acquire
-; CHECK: lwz
+; CHECK: lwz [[VAL:r[0-9]+]]
%val = load atomic i32, i32* %mem acquire, align 4
-; CHECK: lwsync
+; CHECK-PPC32: lwsync
+; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]]
+; CHECK-PPC64: bne- [[CR]], .+4
+; CHECK-PPC64: isync
ret i32 %val
}
define i64 @load_i64_seq_cst(i64* %mem) {
@@ -35,9 +38,12 @@ define i64 @load_i64_seq_cst(i64* %mem) {
; CHECK: sync
; PPC32: __sync_
; PPC64-NOT: __sync_
-; PPC64: ld
+; PPC64: ld [[VAL:r[0-9]+]]
%val = load atomic i64, i64* %mem seq_cst, align 8
-; CHECK: lwsync
+; CHECK-PPC32: lwsync
+; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]]
+; CHECK-PPC64: bne- [[CR]], .+4
+; CHECK-PPC64: isync
ret i64 %val
}
diff --git a/test/CodeGen/PowerPC/ppcf128sf.ll b/test/CodeGen/PowerPC/ppcf128sf.ll
index 6804b551e572..fde7d48da7c2 100644
--- a/test/CodeGen/PowerPC/ppcf128sf.ll
+++ b/test/CodeGen/PowerPC/ppcf128sf.ll
@@ -14,19 +14,19 @@ entry:
%0 = load ppc_fp128, ppc_fp128* @ld, align 16
%1 = load ppc_fp128, ppc_fp128* @ld2, align 16
%add = fadd ppc_fp128 %0, %1
- store ppc_fp128 %add, ppc_fp128* %c, align 16
+ store volatile ppc_fp128 %add, ppc_fp128* %c, align 16
%2 = load ppc_fp128, ppc_fp128* @ld, align 16
%3 = load ppc_fp128, ppc_fp128* @ld2, align 16
%sub = fsub ppc_fp128 %2, %3
- store ppc_fp128 %sub, ppc_fp128* %c, align 16
+ store volatile ppc_fp128 %sub, ppc_fp128* %c, align 16
%4 = load ppc_fp128, ppc_fp128* @ld, align 16
%5 = load ppc_fp128, ppc_fp128* @ld2, align 16
%mul = fmul ppc_fp128 %4, %5
- store ppc_fp128 %mul, ppc_fp128* %c, align 16
+ store volatile ppc_fp128 %mul, ppc_fp128* %c, align 16
%6 = load ppc_fp128, ppc_fp128* @ld, align 16
%7 = load ppc_fp128, ppc_fp128* @ld2, align 16
%div = fdiv ppc_fp128 %6, %7
- store ppc_fp128 %div, ppc_fp128* %c, align 16
+ store volatile ppc_fp128 %div, ppc_fp128* %c, align 16
ret void
; CHECK-LABEL: __gcc_qadd
diff --git a/test/CodeGen/PowerPC/save-bp.ll b/test/CodeGen/PowerPC/save-bp.ll
new file mode 100644
index 000000000000..1c7e19a1d5cb
--- /dev/null
+++ b/test/CodeGen/PowerPC/save-bp.ll
@@ -0,0 +1,54 @@
+; RUN: llc -march=ppc64 -ppc-always-use-base-pointer < %s | FileCheck %s --check-prefix CHECK --check-prefix PPC64
+; RUN: llc -march=ppc32 -ppc-always-use-base-pointer < %s | FileCheck %s --check-prefix CHECK --check-prefix PPC32
+; RUN: llc -march=ppc32 -ppc-always-use-base-pointer -relocation-model pic < %s | FileCheck %s --check-prefix CHECK --check-prefix PPC32PIC
+
+; CHECK-LABEL: fred:
+
+; Check for saving/restoring frame pointer (X31) and base pointer (X30)
+; on ppc64:
+; PPC64: std 31, -8(1)
+; PPC64: std 30, -16(1)
+; PPC64: ld 31, -8(1)
+; PPC64: ld 30, -16(1)
+
+; Check for saving/restoring frame pointer (R31) and base pointer (R30)
+; on ppc32:
+; PPC32: stwux 1, 1, 0
+; PPC32; addic 0, 0, -4
+; PPC32: stwx 31, 0, 0
+; PPC32: addic 0, 0, -4
+; PPC32: stwx 30, 0, 0
+; The restore sequence:
+; PPC32: lwz 31, 0(1)
+; PPC32: addic 30, 0, 8
+; PPC32: lwz 0, -4(31)
+; PPC32: lwz 30, -8(31)
+; PPC32: mr 1, 31
+; PPC32: mr 31, 0
+
+; Check for saving/restoring frame pointer (R31) and base pointer (R29)
+; on ppc32/pic. This is mostly the same as without pic, except that base
+; pointer is in R29.
+; PPC32PIC: stwux 1, 1, 0
+; PPC32PIC; addic 0, 0, -4
+; PPC32PIC: stwx 31, 0, 0
+; PPC32PIC: addic 0, 0, -8
+; PPC32PIC: stwx 29, 0, 0
+; The restore sequence:
+; PPC32PIC: lwz 31, 0(1)
+; PPC32PIC: addic 29, 0, 12
+; PPC32PIC: lwz 0, -4(31)
+; PPC32PIC: lwz 29, -12(31)
+; PPC32PIC: mr 1, 31
+; PPC32PIC: mr 31, 0
+
+
+target datalayout = "E-m:e-p:32:32-i64:64-n32"
+target triple = "powerpc-unknown-freebsd"
+
+define i64 @fred() local_unnamed_addr #0 {
+entry:
+ ret i64 0
+}
+
+attributes #0 = { norecurse readnone nounwind sspstrong "no-frame-pointer-elim"="true" "target-cpu"="ppc" }
diff --git a/test/CodeGen/PowerPC/save-cr-ppc32svr4.ll b/test/CodeGen/PowerPC/save-cr-ppc32svr4.ll
new file mode 100644
index 000000000000..9fabca186050
--- /dev/null
+++ b/test/CodeGen/PowerPC/save-cr-ppc32svr4.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=ppc32 -relocation-model pic < %s | FileCheck %s
+;
+; Make sure that the CR register is saved correctly on PPC32/SVR4.
+
+; CHECK-LABEL: fred:
+; CHECK: stwu 1, -32(1)
+; CHECK: stw 31, 28(1)
+; CHECK: mr 31, 1
+; CHECK: stw 30, 24(1)
+; CHECK: mfcr [[CR:[0-9]+]]
+; CHECK: stw [[CR]], 20(31)
+
+target datalayout = "E-m:e-p:32:32-i64:64-n32"
+target triple = "powerpc-unknown-freebsd"
+
+; Function Attrs: norecurse nounwind readnone sspstrong
+define i64 @fred(double %a0) local_unnamed_addr #0 {
+b1:
+ %v2 = fcmp olt double %a0, 0x43E0000000000000
+ br i1 %v2, label %b3, label %b7
+
+b3: ; preds = %b1
+ %v4 = fcmp olt double %a0, 0xC3E0000000000000
+ %v5 = fptosi double %a0 to i64
+ %v6 = select i1 %v4, i64 -9223372036854775808, i64 %v5
+ br label %b14
+
+b7: ; preds = %b1
+ %v8 = fcmp olt double %a0, 0x43F0000000000000
+ br i1 %v8, label %b9, label %b11
+
+b9: ; preds = %b7
+ %v10 = fptoui double %a0 to i64
+ br label %b14
+
+b11: ; preds = %b7
+ %v12 = fcmp ogt double %a0, 0.000000e+00
+ %v13 = sext i1 %v12 to i64
+ br label %b14
+
+b14: ; preds = %b11, %b9, %b3
+ %v15 = phi i64 [ %v6, %b3 ], [ %v10, %b9 ], [ %v13, %b11 ]
+ ret i64 %v15
+}
+
+attributes #0 = { norecurse nounwind readnone sspstrong "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "stack-protector-buffer-size"="8" "target-cpu"="ppc" }
diff --git a/test/CodeGen/PowerPC/save-crbp-ppc32svr4.ll b/test/CodeGen/PowerPC/save-crbp-ppc32svr4.ll
new file mode 100644
index 000000000000..b7b3c1ada965
--- /dev/null
+++ b/test/CodeGen/PowerPC/save-crbp-ppc32svr4.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=ppc32 -relocation-model pic < %s | FileCheck %s
+
+; CHECK-LABEL: fred
+; CHECK: stwux 1, 1, 0
+; Save R31..R29 via R0:
+; CHECK: addic 0, 0, -4
+; CHECK: stwx 31, 0, 0
+; CHECK: addic 0, 0, -4
+; CHECK: stwx 30, 0, 0
+; CHECK: addic 0, 0, -4
+; CHECK: stwx 29, 0, 0
+; Set R29 back to the value of R0 from before the updates:
+; CHECK: addic 29, 0, 12
+; Save CR through R12 using R29 as the stack pointer (aligned base pointer).
+; CHECK: mfcr 12
+; CHECK: stw 28, -16(29)
+; CHECK: stw 12, -20(29)
+
+target datalayout = "E-m:e-p:32:32-i64:64-n32"
+target triple = "powerpc-unknown-freebsd"
+
+; Function Attrs: norecurse readnone sspstrong
+define i64 @fred(double %a0) local_unnamed_addr #0 {
+b1:
+ %v2 = alloca i64, align 128
+ store i64 0, i64* %v2
+ %v3 = fcmp olt double %a0, 0x43E0000000000000
+ br i1 %v3, label %b4, label %b8
+
+b4: ; preds = %b1
+ %v5 = fcmp olt double %a0, 0xC3E0000000000000
+ %v6 = fptosi double %a0 to i64
+ store i64 %v6, i64* %v2
+ %v7 = select i1 %v5, i64 -9223372036854775808, i64 %v6
+ br label %b15
+
+b8: ; preds = %b1
+ %v9 = fcmp olt double %a0, 0x43F0000000000000
+ br i1 %v9, label %b10, label %b12
+
+b10: ; preds = %b8
+ %v11 = fptoui double %a0 to i64
+ br label %b15
+
+b12: ; preds = %b8
+ %v13 = fcmp ogt double %a0, 0.000000e+00
+ %v14 = sext i1 %v13 to i64
+ br label %b15
+
+b15: ; preds = %b12, %b10, %b4
+ %v16 = phi i64 [ %v7, %b4 ], [ %v11, %b10 ], [ %v14, %b12 ]
+ %v17 = load i64, i64* %v2
+ %v18 = add i64 %v17, %v16
+ ret i64 %v18
+}
+
+attributes #0 = { norecurse readnone sspstrong "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "stack-protector-buffer-size"="8" "target-cpu"="ppc" }
diff --git a/test/CodeGen/SPARC/32abi.ll b/test/CodeGen/SPARC/32abi.ll
index 09e7a3a09d86..3807f84d4e92 100644
--- a/test/CodeGen/SPARC/32abi.ll
+++ b/test/CodeGen/SPARC/32abi.ll
@@ -25,17 +25,17 @@ define void @intarg(i8 %a0, ; %i0
i32 %a5, ; %i5
i32 signext %a6, ; [%fp+92]
i8* %a7) { ; [%fp+96]
- store i8 %a0, i8* %a4
- store i8 %a1, i8* %a4
+ store volatile i8 %a0, i8* %a4
+ store volatile i8 %a1, i8* %a4
%p16 = bitcast i8* %a4 to i16*
- store i16 %a2, i16* %p16
+ store volatile i16 %a2, i16* %p16
%p32 = bitcast i8* %a4 to i32*
- store i32 %a3, i32* %p32
+ store volatile i32 %a3, i32* %p32
%pp = bitcast i8* %a4 to i8**
- store i8* %a4, i8** %pp
- store i32 %a5, i32* %p32
- store i32 %a6, i32* %p32
- store i8* %a7, i8** %pp
+ store volatile i8* %a4, i8** %pp
+ store volatile i32 %a5, i32* %p32
+ store volatile i32 %a6, i32* %p32
+ store volatile i8* %a7, i8** %pp
ret void
}
diff --git a/test/CodeGen/SPARC/64abi.ll b/test/CodeGen/SPARC/64abi.ll
index b963be2e9853..771cc409554b 100644
--- a/test/CodeGen/SPARC/64abi.ll
+++ b/test/CodeGen/SPARC/64abi.ll
@@ -24,17 +24,17 @@ define void @intarg(i8 %a0, ; %i0
i32 %a5, ; %i5
i32 signext %a6, ; [%fp+BIAS+176]
i8* %a7) { ; [%fp+BIAS+184]
- store i8 %a0, i8* %a4
- store i8 %a1, i8* %a4
+ store volatile i8 %a0, i8* %a4
+ store volatile i8 %a1, i8* %a4
%p16 = bitcast i8* %a4 to i16*
- store i16 %a2, i16* %p16
+ store volatile i16 %a2, i16* %p16
%p32 = bitcast i8* %a4 to i32*
- store i32 %a3, i32* %p32
+ store volatile i32 %a3, i32* %p32
%pp = bitcast i8* %a4 to i8**
- store i8* %a4, i8** %pp
- store i32 %a5, i32* %p32
- store i32 %a6, i32* %p32
- store i8* %a7, i8** %pp
+ store volatile i8* %a4, i8** %pp
+ store volatile i32 %a5, i32* %p32
+ store volatile i32 %a6, i32* %p32
+ store volatile i8* %a7, i8** %pp
ret void
}
@@ -316,7 +316,7 @@ define void @call_ret_i64_pair(i64* %i0) {
%rv = call { i64, i64 } @ret_i64_pair(i32 undef, i32 undef,
i64* undef, i64* undef)
%e0 = extractvalue { i64, i64 } %rv, 0
- store i64 %e0, i64* %i0
+ store volatile i64 %e0, i64* %i0
%e1 = extractvalue { i64, i64 } %rv, 1
store i64 %e1, i64* %i0
ret void
diff --git a/test/CodeGen/SystemZ/swift-return.ll b/test/CodeGen/SystemZ/swift-return.ll
index 69d0e979190c..977816f66bec 100644
--- a/test/CodeGen/SystemZ/swift-return.ll
+++ b/test/CodeGen/SystemZ/swift-return.ll
@@ -189,11 +189,11 @@ define void @consume_i1_ret() {
%v6 = extractvalue { i1, i1, i1, i1 } %call, 2
%v7 = extractvalue { i1, i1, i1, i1 } %call, 3
%val = zext i1 %v3 to i32
- store i32 %val, i32* @var
+ store volatile i32 %val, i32* @var
%val2 = zext i1 %v5 to i32
- store i32 %val2, i32* @var
+ store volatile i32 %val2, i32* @var
%val3 = zext i1 %v6 to i32
- store i32 %val3, i32* @var
+ store volatile i32 %val3, i32* @var
%val4 = zext i1 %v7 to i32
store i32 %val4, i32* @var
ret void
diff --git a/test/CodeGen/Thumb/stack-access.ll b/test/CodeGen/Thumb/stack-access.ll
index 44217aba62d5..533559a67421 100644
--- a/test/CodeGen/Thumb/stack-access.ll
+++ b/test/CodeGen/Thumb/stack-access.ll
@@ -7,13 +7,13 @@ define void @test1(i8** %p) {
%z = alloca i8, align 1
; CHECK: add r1, sp, #8
; CHECK: str r1, [r0]
- store i8* %x, i8** %p, align 4
+ store volatile i8* %x, i8** %p, align 4
; CHECK: add r1, sp, #4
; CHECK: str r1, [r0]
- store i8* %y, i8** %p, align 4
+ store volatile i8* %y, i8** %p, align 4
; CHECK: mov r1, sp
; CHECK: str r1, [r0]
- store i8* %z, i8** %p, align 4
+ store volatile i8* %z, i8** %p, align 4
ret void
}
@@ -24,10 +24,10 @@ define void @test2([1024 x i8]** %p) {
; CHECK: add r1, sp, #1020
; CHECK: adds r1, #4
; CHECK: str r1, [r0]
- store [1024 x i8]* %arr1, [1024 x i8]** %p, align 4
+ store volatile [1024 x i8]* %arr1, [1024 x i8]** %p, align 4
; CHECK: mov r1, sp
; CHECK: str r1, [r0]
- store [1024 x i8]* %arr2, [1024 x i8]** %p, align 4
+ store volatile [1024 x i8]* %arr2, [1024 x i8]** %p, align 4
ret void
}
diff --git a/test/CodeGen/Thumb2/ldr-str-imm12.ll b/test/CodeGen/Thumb2/ldr-str-imm12.ll
index 3e4bd02097ad..c6d00d4c1e11 100644
--- a/test/CodeGen/Thumb2/ldr-str-imm12.ll
+++ b/test/CodeGen/Thumb2/ldr-str-imm12.ll
@@ -50,9 +50,9 @@ bb420: ; preds = %bb20, %bb20
; CHECK: str{{(.w)?}} r{{[0-9]+}}, [sp
; CHECK: str{{(.w)?}} r{{[0-9]+}}, [sp
; CHECK: str{{(.w)?}} r{{[0-9]+}}, [sp
- store %union.rec* null, %union.rec** @zz_hold, align 4
+ store volatile %union.rec* null, %union.rec** @zz_hold, align 4
store %union.rec* null, %union.rec** @zz_res, align 4
- store %union.rec* %x, %union.rec** @zz_hold, align 4
+ store volatile %union.rec* %x, %union.rec** @zz_hold, align 4
%0 = call %union.rec* @Manifest(%union.rec* undef, %union.rec* %env, %struct.STYLE* %style, %union.rec** %bthr, %union.rec** %fthr, %union.rec** %target, %union.rec** %crs, i32 %ok, i32 %need_expand, %union.rec** %enclose, i32 %fcr) nounwind ; <%union.rec*> [#uses=0]
unreachable
diff --git a/test/CodeGen/X86/GlobalISel/add-scalar.ll b/test/CodeGen/X86/GlobalISel/add-scalar.ll
index 553bc2789ff0..85db1c0e7e7a 100644
--- a/test/CodeGen/X86/GlobalISel/add-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/add-scalar.ll
@@ -1,44 +1,94 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: llc -mtriple=i386-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
define i64 @test_add_i64(i64 %arg1, i64 %arg2) {
-; ALL-LABEL: test_add_i64:
-; ALL: # BB#0:
-; ALL-NEXT: leaq (%rsi,%rdi), %rax
-; ALL-NEXT: retq
+; X64-LABEL: test_add_i64:
+; X64: # BB#0:
+; X64-NEXT: leaq (%rsi,%rdi), %rax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_add_i64:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: .Lcfi0:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .Lcfi1:
+; X32-NEXT: .cfi_offset %ebp, -8
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: .Lcfi2:
+; X32-NEXT: .cfi_def_cfa_register %ebp
+; X32-NEXT: pushl %esi
+; X32-NEXT: .Lcfi3:
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: leal 8(%ebp), %ecx
+; X32-NEXT: leal 12(%ebp), %esi
+; X32-NEXT: leal 16(%ebp), %eax
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: leal 20(%ebp), %edx
+; X32-NEXT: movl (%edx), %edx
+; X32-NEXT: addl (%ecx), %eax
+; X32-NEXT: adcl (%esi), %edx
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
%ret = add i64 %arg1, %arg2
ret i64 %ret
}
define i32 @test_add_i32(i32 %arg1, i32 %arg2) {
-; ALL-LABEL: test_add_i32:
-; ALL: # BB#0:
-; ALL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; ALL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; ALL-NEXT: leal (%rsi,%rdi), %eax
-; ALL-NEXT: retq
+; X64-LABEL: test_add_i32:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; X64-NEXT: leal (%rsi,%rdi), %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_add_i32:
+; X32: # BB#0:
+; X32-NEXT: leal 4(%esp), %ecx
+; X32-NEXT: leal 8(%esp), %eax
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: addl (%ecx), %eax
+; X32-NEXT: retl
%ret = add i32 %arg1, %arg2
ret i32 %ret
}
define i16 @test_add_i16(i16 %arg1, i16 %arg2) {
-; ALL-LABEL: test_add_i16:
-; ALL: # BB#0:
-; ALL-NEXT: # kill: %DI<def> %DI<kill> %RDI<def>
-; ALL-NEXT: # kill: %SI<def> %SI<kill> %RSI<def>
-; ALL-NEXT: leal (%rsi,%rdi), %eax
-; ALL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; ALL-NEXT: retq
+; X64-LABEL: test_add_i16:
+; X64: # BB#0:
+; X64-NEXT: # kill: %DI<def> %DI<kill> %RDI<def>
+; X64-NEXT: # kill: %SI<def> %SI<kill> %RSI<def>
+; X64-NEXT: leal (%rsi,%rdi), %eax
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: retq
+;
+; X32-LABEL: test_add_i16:
+; X32: # BB#0:
+; X32-NEXT: leal 4(%esp), %ecx
+; X32-NEXT: leal 8(%esp), %eax
+; X32-NEXT: movzwl (%eax), %eax
+; X32-NEXT: addw (%ecx), %ax
+; X32-NEXT: retl
%ret = add i16 %arg1, %arg2
ret i16 %ret
}
define i8 @test_add_i8(i8 %arg1, i8 %arg2) {
-; ALL-LABEL: test_add_i8:
-; ALL: # BB#0:
-; ALL-NEXT: addb %dil, %sil
-; ALL-NEXT: movl %esi, %eax
-; ALL-NEXT: retq
+; X64-LABEL: test_add_i8:
+; X64: # BB#0:
+; X64-NEXT: addb %dil, %sil
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_add_i8:
+; X32: # BB#0:
+; X32-NEXT: leal 4(%esp), %ecx
+; X32-NEXT: leal 8(%esp), %eax
+; X32-NEXT: movb (%eax), %al
+; X32-NEXT: addb (%ecx), %al
+; X32-NEXT: retl
%ret = add i8 %arg1, %arg2
ret i8 %ret
}
diff --git a/test/CodeGen/X86/GlobalISel/legalize-add.mir b/test/CodeGen/X86/GlobalISel/legalize-add.mir
index 22619cc71033..6a03388da947 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-add.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-add.mir
@@ -1,40 +1,67 @@
-# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s
-
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
--- |
- ; ModuleID = '<stdin>'
- source_filename = "<stdin>"
- target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
- target triple = "x86_64--linux-gnu"
+ define void @test_add_i32() {
+ ret void
+ }
- define i32 @test_add_i32(i32 %arg1, i32 %arg2) {
- %ret = add i32 %arg1, %arg2
- ret i32 %ret
+ define void @test_add_i64() {
+ ret void
}
...
---
name: test_add_i32
+# ALL-LABEL: name: test_add_i32
alignment: 4
legalized: false
regBankSelected: false
-selected: false
-tracksRegLiveness: true
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
+# ALL: %0(s32) = IMPLICIT_DEF
+# ALL-NEXT: %1(s32) = IMPLICIT_DEF
+# ALL-NEXT: %2(s32) = G_ADD %0, %1
+# ALL-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
- liveins: %edi, %esi
- ; CHECK-LABEL: name: test_add_i32
- ; CHECK: [[VAL1:%.*]](s32) = COPY %edi
- ; CHECK: [[VAL2:%.*]](s32) = COPY %esi
- ; CHECK: [[RES:%.*]](s32) = G_ADD [[VAL1:%.*]], [[VAL2:%.*]]
-
- %0(s32) = COPY %edi
- %1(s32) = COPY %esi
+ %0(s32) = IMPLICIT_DEF
+ %1(s32) = IMPLICIT_DEF
%2(s32) = G_ADD %0, %1
- %eax = COPY %2(s32)
- RET 0, implicit %eax
+ RET 0
+
+...
+---
+name: test_add_i64
+# ALL-LABEL: name: test_add_i64
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# X64: %0(s64) = IMPLICIT_DEF
+# X64-NEXT: %1(s64) = IMPLICIT_DEF
+# X64-NEXT: %2(s64) = G_ADD %0, %1
+# X64-NEXT: RET 0
+#
+# X32: %0(s64) = IMPLICIT_DEF
+# X32-NEXT: %1(s64) = IMPLICIT_DEF
+# X32-NEXT: %3(s32), %4(s32) = G_UNMERGE_VALUES %0(s64)
+# X32-NEXT: %5(s32), %6(s32) = G_UNMERGE_VALUES %1(s64)
+# X32-NEXT: %12(s8) = G_CONSTANT i8 0
+# X32-NEXT: %7(s1) = G_TRUNC %12(s8)
+# X32-NEXT: %8(s32), %9(s1) = G_UADDE %3, %5, %7
+# X32-NEXT: %10(s32), %11(s1) = G_UADDE %4, %6, %9
+# X32-NEXT: %2(s64) = G_MERGE_VALUES %8(s32), %10(s32)
+# X32-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ %0(s64) = IMPLICIT_DEF
+ %1(s64) = IMPLICIT_DEF
+ %2(s64) = G_ADD %0, %1
+ RET 0
...
diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-X32.mir b/test/CodeGen/X86/GlobalISel/regbankselect-X32.mir
new file mode 100644
index 000000000000..a115d1fa3255
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/regbankselect-X32.mir
@@ -0,0 +1,36 @@
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=regbankselect %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=FAST
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -regbankselect-greedy -run-pass=regbankselect %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=GREEDY
+
+--- |
+ define void @test_uadde_i32() {
+ ret void
+ }
+
+...
+---
+name: test_uadde_i32
+# CHECK-LABEL: name: test_uadde_i32
+alignment: 4
+legalized: true
+regBankSelected: false
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr }
+# CHECK-NEXT: - { id: 1, class: gpr }
+# CHECK-NEXT: - { id: 2, class: gpr }
+# CHECK-NEXT: - { id: 3, class: gpr }
+# CHECK-NEXT: - { id: 4, class: gpr }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+body: |
+ bb.0 (%ir-block.0):
+ %0(s32) = IMPLICIT_DEF
+ %1(s32) = IMPLICIT_DEF
+ %2(s1) = IMPLICIT_DEF
+ %3(s32), %4(s1) = G_UADDE %0, %1, %2
+ RET 0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-add-x32.mir b/test/CodeGen/X86/GlobalISel/select-add-x32.mir
new file mode 100644
index 000000000000..8710aaa61a21
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-add-x32.mir
@@ -0,0 +1,63 @@
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=X32
+--- |
+ define i64 @test_add_i64(i64 %a, i64 %b) {
+ %r = add i64 %a, %b
+ ret i64 %r
+ }
+
+...
+---
+name: test_add_i64
+# X32-LABEL: name: test_add_i64
+alignment: 4
+legalized: true
+regBankSelected: true
+# X32: registers:
+# X32-NEXT: - { id: 0, class: gr32 }
+# X32-NEXT: - { id: 1, class: gr32 }
+# X32-NEXT: - { id: 2, class: gr32 }
+# X32-NEXT: - { id: 3, class: gr32 }
+# X32-NEXT: - { id: 4, class: gpr }
+# X32-NEXT: - { id: 5, class: gr32 }
+# X32-NEXT: - { id: 6, class: gr32 }
+# X32-NEXT: - { id: 7, class: gr32 }
+# X32-NEXT: - { id: 8, class: gr32 }
+# X32-NEXT: - { id: 9, class: gpr }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+ - { id: 4, class: gpr }
+ - { id: 5, class: gpr }
+ - { id: 6, class: gpr }
+ - { id: 7, class: gpr }
+ - { id: 8, class: gpr }
+ - { id: 9, class: gpr }
+# X32: %0 = IMPLICIT_DEF
+# X32-NEXT: %1 = IMPLICIT_DEF
+# X32-NEXT: %2 = IMPLICIT_DEF
+# X32-NEXT: %3 = IMPLICIT_DEF
+# X32-NEXT: %5 = ADD32rr %0, %2, implicit-def %eflags
+# X32-NEXT: %6 = COPY %eflags
+# X32-NEXT: %eflags = COPY %6
+# X32-NEXT: %7 = ADC32rr %1, %3, implicit-def %eflags, implicit %eflags
+# X32-NEXT: %8 = COPY %eflags
+# X32-NEXT: %eax = COPY %5
+# X32-NEXT: %edx = COPY %7
+# X32-NEXT: RET 0, implicit %eax, implicit %edx
+body: |
+ bb.0 (%ir-block.0):
+ %0(s32) = IMPLICIT_DEF
+ %1(s32) = IMPLICIT_DEF
+ %2(s32) = IMPLICIT_DEF
+ %3(s32) = IMPLICIT_DEF
+ %9(s8) = G_CONSTANT i8 0
+ %4(s1) = G_TRUNC %9(s8)
+ %5(s32), %6(s1) = G_UADDE %0, %2, %4
+ %7(s32), %8(s1) = G_UADDE %1, %3, %6
+ %eax = COPY %5(s32)
+ %edx = COPY %7(s32)
+ RET 0, implicit %eax, implicit %edx
+
+...
diff --git a/test/CodeGen/X86/arg-copy-elide.ll b/test/CodeGen/X86/arg-copy-elide.ll
index b9a2eeeb7f8f..126f5a1c7976 100644
--- a/test/CodeGen/X86/arg-copy-elide.ll
+++ b/test/CodeGen/X86/arg-copy-elide.ll
@@ -253,9 +253,7 @@ entry:
; CHECK: calll _addrof_i32
; CHECK: retl
-
; Don't elide the copy when the alloca is escaped with a store.
-
define void @escape_with_store(i32 %x) {
%x1 = alloca i32
%x2 = alloca i32*
@@ -268,9 +266,8 @@ define void @escape_with_store(i32 %x) {
}
; CHECK-LABEL: _escape_with_store:
-; CHECK-DAG: movl {{.*}}(%esp), %[[reg:[^ ]*]]
-; CHECK-DAG: movl $0, [[offs:[0-9]*]](%esp)
-; CHECK: movl %[[reg]], [[offs]](%esp)
+; CHECK: movl {{.*}}(%esp), %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], [[offs:[0-9]*]](%esp)
; CHECK: calll _addrof_i32
diff --git a/test/CodeGen/X86/leaFixup32.mir b/test/CodeGen/X86/leaFixup32.mir
deleted file mode 100644
index 70aac21c7ff2..000000000000
--- a/test/CodeGen/X86/leaFixup32.mir
+++ /dev/null
@@ -1,508 +0,0 @@
-# RUN: llc -run-pass x86-fixup-LEAs -mcpu=corei7-avx -o - %s | FileCheck %s
---- |
- ; ModuleID = 'test/CodeGen/X86/fixup-lea.ll'
- source_filename = "test/CodeGen/X86/fixup-lea.ll"
- target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
- target triple = "i386"
- ;generated using: llc -stop-after x86-pad-short-functions fixup-lea.ll > leaFinxup32.mir
-
- ;test2add_32: 3 operands LEA32r that can be replaced with 2 add instructions
- ; where ADD32ri8 is chosen
- define i32 @test2add_32() {
- ret i32 0
- }
-
- ;test2add_ebp_32: 3 operands LEA32r that can be replaced with 2 add instructions
- ; where the base is rbp/r13/ebp register
- define i32 @test2add_ebp_32() {
- ret i32 0
- }
-
- ;test1add_ebp_32: 2 operands LEA32r where base register is ebp and can be replaced
- ; with an add instruction
- define i32 @test1add_ebp_32() {
- ret i32 0
- }
-
- ;testleaadd_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions
- define i32 @testleaadd_32() {
- ret i32 0
- }
-
- ;testleaadd_ebp_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions
- ; where the base is ebp register
- define i32 @testleaadd_ebp_32() {
- ret i32 0
- }
-
- ;test1lea_ebp_32: 2 operands LEA32r wher base register is rbp/r13/ebp and can be replaced
- ; with a lea instruction
- define i32 @test1lea_ebp_32() {
- ret i32 0
- }
-
- ;test2addi32_32: 3 operands LEA32r that can be replaced with 2 add instructions where ADD32ri32
- ; is chosen
- define i32 @test2addi32_32() {
- ret i32 0
- }
-
- ;test1mov1add_ebp_32: 2 operands LEA32r that can be replaced with 1 add 1 mov instructions
- ; where the base is rbp/r13/ebp register
- define i32 @test1mov1add_ebp_32() {
- ret i32 0
- }
-
- ;testleaadd_ebp_index_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions
- ; where the base and the index are ebp register and there is offset
- define i32 @testleaadd_ebp_index_32() {
- ret i32 0
- }
-
- ;testleaadd_ebp_index2_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions
- ; where the base and the index are ebp register and there is scale
- define i32 @testleaadd_ebp_index2_32() {
- ret i32 0
- }
-
- ;test_skip_opt_32: 3 operands LEA32r that can not be replaced with 2 instructions
- define i32 @test_skip_opt_32() {
- ret i32 0
- }
-
- ;test_skip_eflags_32: LEA32r that cannot be replaced since its not safe to clobber eflags
- define i32 @test_skip_eflags_32() {
- ret i32 0
- }
-
-...
----
-name: test2add_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%eax' }
- - { reg: '%ebp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp
- ; CHECK: %eax = ADD32rr %eax, killed %ebp
- ; CHECK: %eax = ADD32ri8 %eax, -5
-
- %eax = LEA32r killed %eax, 1, killed %ebp, -5, _
- RETQ %eax
-
-...
----
-name: test2add_ebp_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%eax' }
- - { reg: '%ebp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp
- ; CHECK: %ebp = ADD32rr %ebp, killed %eax
- ; CHECK: %ebp = ADD32ri8 %ebp, -5
-
- %ebp = LEA32r killed %ebp, 1, killed %eax, -5, _
- RETQ %ebp
-
-...
----
-name: test1add_ebp_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%eax' }
- - { reg: '%ebp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp
- ; CHECK: %ebp = ADD32rr %ebp, killed %eax
-
- %ebp = LEA32r killed %ebp, 1, killed %eax, 0, _
- RETQ %ebp
-
-...
----
-name: testleaadd_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%eax' }
- - { reg: '%ebp' }
- - { reg: '%ebx' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp, %esi
- ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0
- ; CHECK: %ebx = ADD32ri8 %ebx, -5
-
- %ebx = LEA32r killed %eax, 1, killed %ebp, -5, _
- RETQ %ebx
-
-...
----
-name: testleaadd_ebp_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%eax' }
- - { reg: '%ebp' }
- - { reg: '%ebx' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp
- ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0, _
- ; CHECK: %ebx = ADD32ri8 %ebx, -5
-
- %ebx = LEA32r killed %ebp, 1, killed %eax, -5, _
- RETQ %ebx
-
-...
----
-name: test1lea_ebp_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%eax' }
- - { reg: '%ebp' }
- - { reg: '%ebx' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp
- ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0, _
-
- %ebx = LEA32r killed %ebp, 1, killed %eax, 0, _
- RETQ %ebx
-
-...
----
-name: test2addi32_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%eax' }
- - { reg: '%ebp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp
- ; CHECK: %eax = ADD32rr %eax, killed %ebp
- ; CHECK: %eax = ADD32ri %eax, 129
-
- %eax = LEA32r killed %eax, 1, killed %ebp, 129, _
- RETQ %eax
-
-...
----
-name: test1mov1add_ebp_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%eax' }
- - { reg: '%ebp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp, %ebx
- ; CHECK: %ebx = MOV32rr killed %ebp
- ; CHECK: %ebx = ADD32rr %ebx, killed %ebp
-
- %ebx = LEA32r killed %ebp, 1, killed %ebp, 0, _
- RETQ %ebx
-
-...
----
-name: testleaadd_ebp_index_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%ebx' }
- - { reg: '%ebp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp, %ebx
- ; CHECK: %ebx = LEA32r _, 1, killed %ebp, 5, _
- ; CHECK: %ebx = ADD32rr %ebx, killed %ebp
-
- %ebx = LEA32r killed %ebp, 1, killed %ebp, 5, _
- RETQ %ebx
-
-...
----
-name: testleaadd_ebp_index2_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%ebx' }
- - { reg: '%ebp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp, %ebx
- ; CHECK: %ebx = LEA32r _, 4, killed %ebp, 5, _
- ; CHECK: %ebx = ADD32rr %ebx, killed %ebp
-
- %ebx = LEA32r killed %ebp, 4, killed %ebp, 5, _
- RETQ %ebx
-
-...
----
-name: test_skip_opt_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%ebx' }
- - { reg: '%ebp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp, %ebx
- ; CHECK: %ebp = LEA32r killed %ebp, 4, killed %ebp, 0, _
-
- %ebp = LEA32r killed %ebp, 4, killed %ebp, 0, _
- RETQ %ebp
-
-...
----
-name: test_skip_eflags_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%ebp' }
- - { reg: '%eax' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp, %ebx
- ; CHECK: %ebx = LEA32r killed %eax, 4, killed %eax, 5, _
- ; CHECK: %ebp = LEA32r killed %ebx, 4, killed %ebx, 0, _
- ; CHECK: %ebp = ADD32ri8 %ebp, 5
-
- CMP32rr %eax, killed %ebx, implicit-def %eflags
- %ebx = LEA32r killed %eax, 4, killed %eax, 5, _
- JE_1 %bb.1, implicit %eflags
- RETQ %ebx
- bb.1:
- liveins: %eax, %ebp, %ebx
- %ebp = LEA32r killed %ebx, 4, killed %ebx, 5, _
- RETQ %ebp
-
-...
-
-
-
diff --git a/test/CodeGen/X86/leaFixup64.mir b/test/CodeGen/X86/leaFixup64.mir
deleted file mode 100644
index 9b0058750598..000000000000
--- a/test/CodeGen/X86/leaFixup64.mir
+++ /dev/null
@@ -1,1041 +0,0 @@
-# RUN: llc -run-pass x86-fixup-LEAs -mcpu=corei7-avx -o - %s | FileCheck %s
---- |
- ; ModuleID = 'lea-2.ll'
- source_filename = "lea-2.ll"
- target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
- ;generated using: llc -stop-after x86-pad-short-functions lea-2.ll > leaFinxup64.mir
-
- ;testleaadd_64_32_1: 3 operands LEA64_32r cannot be replaced with 2 add instructions
- ; but can be replaced with 1 lea + 1 add
- define i32 @testleaadd_64_32_1() {
- ret i32 0
- }
-
- ;testleaadd_rbp_64_32_1: 3 operands LEA64_32r cannot be replaced with 2 add instructions
- ; where the base is rbp/r13/ebp register but it can be replaced with 1 lea + 1 add
- define i32 @testleaadd_rbp_64_32_1() {
- ret i32 0
- }
-
- ;test1lea_rbp_64_32_1: 2 operands LEA64_32r where base register is rbp/r13/ebp and can not
- ; be replaced with an add instruction but can be replaced with 1 lea instruction
- define i32 @test1lea_rbp_64_32_1() {
- ret i32 0
- }
-
- ;test2add_64: 3 operands LEA64r that can be replaced with 2 add instructions
- define i32 @test2add_64() {
- ret i32 0
- }
-
- ;test2add_rbp_64: 3 operands LEA64r that can be replaced with 2 add instructions
- ; where the base is rbp/r13/ebp register
- define i32 @test2add_rbp_64() {
- ret i32 0
- }
-
- ;test1add_rbp_64: 2 operands LEA64r where base register is rbp/r13/ebp and can be replaced
- ; with an add instruction
- define i32 @test1add_rbp_64() {
- ret i32 0
- }
-
- ;testleaadd_64_32: 3 operands LEA64_32r that can be replaced with 1 lea 1 add instructions
- define i32 @testleaadd_64_32() {
- ret i32 0
- }
-
- ;testleaadd_rbp_64_32: 3 operands LEA64_32r that can be replaced with 1 lea 1 add instructions
- ; where the base is rbp/r13/ebp register
- define i32 @testleaadd_rbp_64_32() {
- ret i32 0
- }
-
- ;test1lea_rbp_64_32: 2 operands LEA64_32r where base register is rbp/r13/ebp and can be replaced
- ; with a lea instruction
- define i32 @test1lea_rbp_64_32() {
- ret i32 0
- }
-
- ;testleaadd_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions
- define i32 @testleaadd_64() {
- ret i32 0
- }
-
- ;testleaadd_rbp_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions
- ; where the base is rbp/r13/ebp register
- define i32 @testleaadd_rbp_64() {
- ret i32 0
- }
-
- ;test1lea_rbp_64: 2 operands LEA64r wher base register is rbp/r13/ebp and can be replaced
- ; with a lea instruction
- define i32 @test1lea_rbp_64() {
- ret i32 0
- }
-
- ;test8: dst = base & scale!=1, can't optimize
- define i32 @test8() {
- ret i32 0
- }
-
- ;testleaaddi32_64_32: 3 operands LEA64_32r that can be replaced with 1 lea + 1 add instructions where
- ; ADD64ri32 is chosen
- define i32 @testleaaddi32_64_32() {
- ret i32 0
- }
-
- ;test1mov1add_rbp_64_32: 2 operands LEA64_32r cannot be replaced with 1 add 1 mov instructions
- ; where the base is rbp/r13/ebp register
- define i32 @test1mov1add_rbp_64_32() {
- ret i32 0
- }
-
- ;testleaadd_rbp_index_64_32: 3 operands LEA64_32r that cannot replaced with 1 lea 1 add instructions
- ; where the base and the index are ebp register and there is offset
- define i32 @testleaadd_rbp_index_64_32() {
- ret i32 0
- }
-
- ;testleaadd_rbp_index2_64_32: 3 operands LEA64_32r that cannot replaced with 1 lea 1 add instructions
- ; where the base and the index are ebp register and there is scale
- define i32 @testleaadd_rbp_index2_64_32() {
- ret i32 0
- }
-
- ;test2addi32_64: 3 operands LEA64r that can be replaced with 2 add instructions where ADD64ri32
- ; is chosen
- define i32 @test2addi32_64() {
- ret i32 0
- }
-
- ;test1mov1add_rbp_64: 2 operands LEA64r that can be replaced with 1 add 1 mov instructions
- ; where the base is rbp/r13/ebp register
- define i32 @test1mov1add_rbp_64() {
- ret i32 0
- }
-
- ;testleaadd_rbp_index_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions
- ; where the base and the index are ebp register and there is offset
- define i32 @testleaadd_rbp_index_64() {
- ret i32 0
- }
-
- ;testleaadd_rbp_index2_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions
- ; where the base and the index are ebp register and there is scale
- define i32 @testleaadd_rbp_index2_64() {
- ret i32 0
- }
-
- ;test_skip_opt_64: 3 operands LEA64r that can not be replaced with 2 instructions
- define i32 @test_skip_opt_64() {
- ret i32 0
- }
-
- ;test_skip_eflags_64: LEA64r that cannot be replaced since its not safe to clobber eflags
- define i32 @test_skip_eflags_64() {
- ret i32 0
- }
-
- ;test_skip_opt_64_32: 3 operands LEA64_32r that can not be replaced with 2 instructions
- define i32 @test_skip_opt_64_32() {
- ret i32 0
- }
-
- ;test_skip_eflags_64_32: LEA64_32r that cannot be replaced since its not safe to clobber eflags
- define i32 @test_skip_eflags_64_32() {
- ret i32 0
- }
-
-
-...
----
-name: testleaadd_64_32_1
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %eax = LEA64_32r killed %rax, 1, killed %rbp, 0
- ; CHECK: %eax = ADD32ri8 %eax, -5
-
- %eax = LEA64_32r killed %rax, 1, killed %rbp, -5, _
- RETQ %eax
-
-...
----
-name: testleaadd_rbp_64_32_1
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %ebp = LEA64_32r killed %rax, 1, killed %rbp, 0
- ; CHECK: %ebp = ADD32ri8 %ebp, -5
-
- %ebp = LEA64_32r killed %rbp, 1, killed %rax, -5, _
- RETQ %ebp
-
-...
----
-name: test1lea_rbp_64_32_1
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %ebp = LEA64_32r killed %rax, 1, killed %rbp, 0
-
- %ebp = LEA64_32r killed %rbp, 1, killed %rax, 0, _
- RETQ %ebp
-
-...
----
-name: test2add_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %rax = ADD64rr %rax, killed %rbp
- ; CHECK: %rax = ADD64ri8 %rax, -5
-
- %rax = LEA64r killed %rax, 1, killed %rbp, -5, _
- RETQ %eax
-
-...
----
-name: test2add_rbp_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %rbp = ADD64rr %rbp, killed %rax
- ; CHECK: %rbp = ADD64ri8 %rbp, -5
-
- %rbp = LEA64r killed %rbp, 1, killed %rax, -5, _
- RETQ %ebp
-
-...
----
-name: test1add_rbp_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %rbp = ADD64rr %rbp, killed %rax
-
- %rbp = LEA64r killed %rbp, 1, killed %rax, 0, _
- RETQ %ebp
-
-...
----
-name: testleaadd_64_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
- - { reg: '%rbx' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _
- ; CHECK: %ebx = ADD32ri8 %ebx, -5
-
- %ebx = LEA64_32r killed %rax, 1, killed %rbp, -5, _
- RETQ %ebx
-
-...
----
-name: testleaadd_rbp_64_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
- - { reg: '%rbx' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _
- ; CHECK: %ebx = ADD32ri8 %ebx, -5
-
- %ebx = LEA64_32r killed %rbp, 1, killed %rax, -5, _
- RETQ %ebx
-
-...
----
-name: test1lea_rbp_64_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
- - { reg: '%rbx' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _
-
- %ebx = LEA64_32r killed %rbp, 1, killed %rax, 0, _
- RETQ %ebx
-
-...
----
-name: testleaadd_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
- - { reg: '%rbx' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _
- ; CHECK: %rbx = ADD64ri8 %rbx, -5
-
- %rbx = LEA64r killed %rax, 1, killed %rbp, -5, _
- RETQ %ebx
-
-...
----
-name: testleaadd_rbp_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
- - { reg: '%rbx' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _
- ; CHECK: %rbx = ADD64ri8 %rbx, -5
-
- %rbx = LEA64r killed %rbp, 1, killed %rax, -5, _
- RETQ %ebx
-
-...
----
-name: test1lea_rbp_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
- - { reg: '%rbx' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _
-
- %rbx = LEA64r killed %rbp, 1, killed %rax, 0, _
- RETQ %ebx
-
-...
----
-name: test8
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rdi' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rdi, %rbp
- ; CHECK: %r12 = LEA64r _, 2, killed %r13, 5, _
- ; CHECK: %r12 = ADD64rr %r12, killed %rbp
- %rbp = KILL %rbp, implicit-def %rbp
- %r13 = KILL %rdi, implicit-def %r13
- %r12 = LEA64r killed %rbp, 2, killed %r13, 5, _
- RETQ %r12
-
-...
----
-name: testleaaddi32_64_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %eax = LEA64_32r killed %rax, 1, killed %rbp, 0
- ; CHECK: %eax = ADD32ri %eax, 129
-
- %eax = LEA64_32r killed %rax, 1, killed %rbp, 129, _
- RETQ %eax
-
-...
----
-name: test1mov1add_rbp_64_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp, %rbx
- ; CHECK: %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 0, _
-
- %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 0, _
- RETQ %ebx
-
-...
----
-name: testleaadd_rbp_index_64_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rbx' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp, %rbx
- ; CHECK: %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 5, _
-
- %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 5, _
- RETQ %ebx
-
-...
----
-name: testleaadd_rbp_index2_64_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rbx' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp, %ebx
- ; CHECK: %ebx = LEA64_32r killed %rbp, 4, killed %rbp, 5, _
-
- %ebx = LEA64_32r killed %rbp, 4, killed %rbp, 5, _
- RETQ %ebx
-
-...
----
-name: test2addi32_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %rax = ADD64rr %rax, killed %rbp
- ; CHECK: %rax = ADD64ri32 %rax, 129
-
- %rax = LEA64r killed %rax, 1, killed %rbp, 129, _
- RETQ %eax
-
-...
----
-name: test1mov1add_rbp_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp, %rbx
- ; CHECK: %rbx = MOV64rr killed %rbp
- ; CHECK: %rbx = ADD64rr %rbx, killed %rbp
-
- %rbx = LEA64r killed %rbp, 1, killed %rbp, 0, _
- RETQ %ebx
-
-...
----
-name: testleaadd_rbp_index_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rbx' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp, %rbx
- ; CHECK: %rbx = LEA64r _, 1, killed %rbp, 5, _
- ; CHECK: %rbx = ADD64rr %rbx, killed %rbp
-
- %rbx = LEA64r killed %rbp, 1, killed %rbp, 5, _
- RETQ %ebx
-
-...
----
-name: testleaadd_rbp_index2_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rbx' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp, %rbx
- ; CHECK: %rbx = LEA64r _, 4, killed %rbp, 5, _
- ; CHECK: %rbx = ADD64rr %rbx, killed %rbp
-
- %rbx = LEA64r killed %rbp, 4, killed %rbp, 5, _
- RETQ %ebx
-
-...
----
-name: test_skip_opt_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rbx' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp, %rbx
- ; CHECK: %rbp = LEA64r killed %rbp, 4, killed %rbp, 0, _
-
- %rbp = LEA64r killed %rbp, 4, killed %rbp, 0, _
- RETQ %ebp
-
-...
----
-name: test_skip_eflags_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rbp' }
- - { reg: '%rax' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp, %rbx
- ; CHECK: %rbx = LEA64r killed %rax, 4, killed %rax, 5, _
- ; CHECK: %rbp = LEA64r killed %rbx, 4, killed %rbx, 0, _
- ; CHECK: %rbp = ADD64ri8 %rbp, 5
-
- CMP64rr %rax, killed %rbx, implicit-def %eflags
- %rbx = LEA64r killed %rax, 4, killed %rax, 5, _
- JE_1 %bb.1, implicit %eflags
- RETQ %ebx
- bb.1:
- liveins: %rax, %rbp, %rbx
- %rbp = LEA64r killed %rbx, 4, killed %rbx, 5, _
- RETQ %ebp
-
-...
----
-name: test_skip_opt_64_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rbx' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp, %rbx
- ; CHECK: %ebp = LEA64_32r killed %rbp, 4, killed %rbp, 0, _
-
- %ebp = LEA64_32r killed %rbp, 4, killed %rbp, 0, _
- RETQ %ebp
-
-...
----
-name: test_skip_eflags_64_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rbp' }
- - { reg: '%rax' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp, %rbx
- ; CHECK: %ebx = LEA64_32r killed %rax, 4, killed %rax, 5, _
- ; CHECK: %ebp = LEA64_32r killed %rbx, 4, killed %rbx, 0, _
- ; CHECK: %ebp = ADD32ri8 %ebp, 5
-
- CMP64rr %rax, killed %rbx, implicit-def %eflags
- %ebx = LEA64_32r killed %rax, 4, killed %rax, 5, _
- JE_1 %bb.1, implicit %eflags
- RETQ %ebx
- bb.1:
- liveins: %rax, %rbp, %rbx
- %ebp = LEA64_32r killed %rbx, 4, killed %rbx, 5, _
- RETQ %ebp
-
-...
-
-
-
diff --git a/test/CodeGen/X86/nontemporal.ll b/test/CodeGen/X86/nontemporal.ll
index 33d5caba597c..d49c88724331 100644
--- a/test/CodeGen/X86/nontemporal.ll
+++ b/test/CodeGen/X86/nontemporal.ll
@@ -9,33 +9,29 @@ define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pushl %ebp
; X32-SSE-NEXT: movl %esp, %ebp
-; X32-SSE-NEXT: pushl %esi
; X32-SSE-NEXT: andl $-16, %esp
; X32-SSE-NEXT: subl $16, %esp
; X32-SSE-NEXT: movl 72(%ebp), %eax
; X32-SSE-NEXT: movl 76(%ebp), %ecx
-; X32-SSE-NEXT: movl 12(%ebp), %edx
; X32-SSE-NEXT: movdqa 56(%ebp), %xmm3
; X32-SSE-NEXT: movdqa 40(%ebp), %xmm4
; X32-SSE-NEXT: movdqa 24(%ebp), %xmm5
-; X32-SSE-NEXT: movl 8(%ebp), %esi
-; X32-SSE-NEXT: addps .LCPI0_0, %xmm0
-; X32-SSE-NEXT: movntps %xmm0, (%esi)
-; X32-SSE-NEXT: paddq .LCPI0_1, %xmm2
-; X32-SSE-NEXT: movntdq %xmm2, (%esi)
-; X32-SSE-NEXT: addpd .LCPI0_2, %xmm1
-; X32-SSE-NEXT: movntpd %xmm1, (%esi)
-; X32-SSE-NEXT: paddd .LCPI0_3, %xmm5
-; X32-SSE-NEXT: movntdq %xmm5, (%esi)
-; X32-SSE-NEXT: paddw .LCPI0_4, %xmm4
-; X32-SSE-NEXT: movntdq %xmm4, (%esi)
-; X32-SSE-NEXT: paddb .LCPI0_5, %xmm3
-; X32-SSE-NEXT: movntdq %xmm3, (%esi)
-; X32-SSE-NEXT: movntil %edx, (%esi)
-; X32-SSE-NEXT: movntil %ecx, 4(%esi)
-; X32-SSE-NEXT: movntil %eax, (%esi)
-; X32-SSE-NEXT: leal -4(%ebp), %esp
-; X32-SSE-NEXT: popl %esi
+; X32-SSE-NEXT: movl 8(%ebp), %edx
+; X32-SSE-NEXT: addps {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: movntps %xmm0, (%edx)
+; X32-SSE-NEXT: paddq {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT: movntdq %xmm2, (%edx)
+; X32-SSE-NEXT: addpd {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: movntpd %xmm1, (%edx)
+; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm5
+; X32-SSE-NEXT: movntdq %xmm5, (%edx)
+; X32-SSE-NEXT: paddw {{\.LCPI.*}}, %xmm4
+; X32-SSE-NEXT: movntdq %xmm4, (%edx)
+; X32-SSE-NEXT: paddb {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT: movntdq %xmm3, (%edx)
+; X32-SSE-NEXT: movntil %ecx, 4(%edx)
+; X32-SSE-NEXT: movntil %eax, (%edx)
+; X32-SSE-NEXT: movl %ebp, %esp
; X32-SSE-NEXT: popl %ebp
; X32-SSE-NEXT: retl
;
@@ -43,33 +39,29 @@ define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4
; X32-AVX: # BB#0:
; X32-AVX-NEXT: pushl %ebp
; X32-AVX-NEXT: movl %esp, %ebp
-; X32-AVX-NEXT: pushl %esi
; X32-AVX-NEXT: andl $-16, %esp
; X32-AVX-NEXT: subl $16, %esp
; X32-AVX-NEXT: movl 72(%ebp), %eax
; X32-AVX-NEXT: movl 76(%ebp), %ecx
-; X32-AVX-NEXT: movl 12(%ebp), %edx
; X32-AVX-NEXT: vmovdqa 56(%ebp), %xmm3
; X32-AVX-NEXT: vmovdqa 40(%ebp), %xmm4
; X32-AVX-NEXT: vmovdqa 24(%ebp), %xmm5
-; X32-AVX-NEXT: movl 8(%ebp), %esi
-; X32-AVX-NEXT: vaddps .LCPI0_0, %xmm0, %xmm0
-; X32-AVX-NEXT: vmovntps %xmm0, (%esi)
-; X32-AVX-NEXT: vpaddq .LCPI0_1, %xmm2, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
-; X32-AVX-NEXT: vaddpd .LCPI0_2, %xmm1, %xmm0
-; X32-AVX-NEXT: vmovntpd %xmm0, (%esi)
-; X32-AVX-NEXT: vpaddd .LCPI0_3, %xmm5, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
-; X32-AVX-NEXT: vpaddw .LCPI0_4, %xmm4, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
-; X32-AVX-NEXT: vpaddb .LCPI0_5, %xmm3, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
-; X32-AVX-NEXT: movntil %edx, (%esi)
-; X32-AVX-NEXT: movntil %ecx, 4(%esi)
-; X32-AVX-NEXT: movntil %eax, (%esi)
-; X32-AVX-NEXT: leal -4(%ebp), %esp
-; X32-AVX-NEXT: popl %esi
+; X32-AVX-NEXT: movl 8(%ebp), %edx
+; X32-AVX-NEXT: vaddps {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-AVX-NEXT: vmovntps %xmm0, (%edx)
+; X32-AVX-NEXT: vpaddq {{\.LCPI.*}}, %xmm2, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT: vaddpd {{\.LCPI.*}}, %xmm1, %xmm0
+; X32-AVX-NEXT: vmovntpd %xmm0, (%edx)
+; X32-AVX-NEXT: vpaddd {{\.LCPI.*}}, %xmm5, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT: vpaddw {{\.LCPI.*}}, %xmm4, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT: vpaddb {{\.LCPI.*}}, %xmm3, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT: movntil %ecx, 4(%edx)
+; X32-AVX-NEXT: movntil %eax, (%edx)
+; X32-AVX-NEXT: movl %ebp, %esp
; X32-AVX-NEXT: popl %ebp
; X32-AVX-NEXT: retl
;
diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll
index 35f96eda35e1..a1f1e084d330 100644
--- a/test/CodeGen/X86/psubus.ll
+++ b/test/CodeGen/X86/psubus.ll
@@ -1,219 +1,169 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
-define void @test1(i16* nocapture %head) nounwind {
+define <8 x i16> @test1(<8 x i16> %x) nounwind {
; SSE-LABEL: test1:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
-; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test1:
; AVX: ## BB#0: ## %vector.ph
-; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqu %xmm0, (%rdi)
; AVX-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i16, i16* %head, i64 0
- %1 = bitcast i16* %0 to <8 x i16>*
- %2 = load <8 x i16>, <8 x i16>* %1, align 2
- %3 = icmp slt <8 x i16> %2, zeroinitializer
- %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
- %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
- store <8 x i16> %5, <8 x i16>* %1, align 2
- ret void
+ %0 = icmp slt <8 x i16> %x, zeroinitializer
+ %1 = xor <8 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
+ %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
}
-define void @test2(i16* nocapture %head) nounwind {
+define <8 x i16> @test2(<8 x i16> %x) nounwind {
; SSE-LABEL: test2:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
-; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test2:
; AVX: ## BB#0: ## %vector.ph
-; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqu %xmm0, (%rdi)
; AVX-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i16, i16* %head, i64 0
- %1 = bitcast i16* %0 to <8 x i16>*
- %2 = load <8 x i16>, <8 x i16>* %1, align 2
- %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
- %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
- %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
- store <8 x i16> %5, <8 x i16>* %1, align 2
- ret void
+ %0 = icmp ugt <8 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
+ %1 = add <8 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
+ %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
}
-define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
+define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind {
; SSE-LABEL: test3:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movd %esi, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE-NEXT: movdqu (%rdi), %xmm1
-; SSE-NEXT: psubusw %xmm0, %xmm1
-; SSE-NEXT: movdqu %xmm1, (%rdi)
+; SSE-NEXT: movd %edi, %xmm1
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE-NEXT: psubusw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test3:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovd %esi, %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX1-NEXT: vmovdqu (%rdi), %xmm1
-; AVX1-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX1-NEXT: vmovd %edi, %xmm1
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test3:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovd %esi, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu (%rdi), %xmm1
-; AVX2-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
vector.ph:
%0 = insertelement <8 x i16> undef, i16 %w, i32 0
%broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
- %1 = getelementptr inbounds i16, i16* %head, i64 0
- %2 = bitcast i16* %1 to <8 x i16>*
- %3 = load <8 x i16>, <8 x i16>* %2, align 2
- %4 = icmp ult <8 x i16> %3, %broadcast15
- %5 = sub <8 x i16> %3, %broadcast15
- %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5
- store <8 x i16> %6, <8 x i16>* %2, align 2
- ret void
+ %1 = icmp ult <8 x i16> %x, %broadcast15
+ %2 = sub <8 x i16> %x, %broadcast15
+ %res = select <8 x i1> %1, <8 x i16> zeroinitializer, <8 x i16> %2
+ ret <8 x i16> %res
}
-define void @test4(i8* nocapture %head) nounwind {
+define <16 x i8> @test4(<16 x i8> %x) nounwind {
; SSE-LABEL: test4:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
-; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test4:
; AVX: ## BB#0: ## %vector.ph
-; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqu %xmm0, (%rdi)
; AVX-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i8, i8* %head, i64 0
- %1 = bitcast i8* %0 to <16 x i8>*
- %2 = load <16 x i8>, <16 x i8>* %1, align 1
- %3 = icmp slt <16 x i8> %2, zeroinitializer
- %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
- %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
- store <16 x i8> %5, <16 x i8>* %1, align 1
- ret void
+ %0 = icmp slt <16 x i8> %x, zeroinitializer
+ %1 = xor <16 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+ %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
}
-define void @test5(i8* nocapture %head) nounwind {
+define <16 x i8> @test5(<16 x i8> %x) nounwind {
; SSE-LABEL: test5:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
-; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test5:
; AVX: ## BB#0: ## %vector.ph
-; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqu %xmm0, (%rdi)
; AVX-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i8, i8* %head, i64 0
- %1 = bitcast i8* %0 to <16 x i8>*
- %2 = load <16 x i8>, <16 x i8>* %1, align 1
- %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
- %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
- %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
- store <16 x i8> %5, <16 x i8>* %1, align 1
- ret void
+ %0 = icmp ugt <16 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
+ %1 = add <16 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
+ %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
}
-define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
+define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-LABEL: test6:
; SSE2: ## BB#0: ## %vector.ph
-; SSE2-NEXT: movd %esi, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE2-NEXT: movdqu (%rdi), %xmm1
-; SSE2-NEXT: psubusb %xmm0, %xmm1
-; SSE2-NEXT: movdqu %xmm1, (%rdi)
+; SSE2-NEXT: movd %edi, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-NEXT: psubusb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test6:
; SSSE3: ## BB#0: ## %vector.ph
-; SSSE3-NEXT: movd %esi, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: pshufb %xmm1, %xmm0
-; SSSE3-NEXT: movdqu (%rdi), %xmm1
-; SSSE3-NEXT: psubusb %xmm0, %xmm1
-; SSSE3-NEXT: movdqu %xmm1, (%rdi)
+; SSSE3-NEXT: movd %edi, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pshufb %xmm2, %xmm1
+; SSSE3-NEXT: psubusb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
+; SSE41-LABEL: test6:
+; SSE41: ## BB#0: ## %vector.ph
+; SSE41-NEXT: movd %edi, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pshufb %xmm2, %xmm1
+; SSE41-NEXT: psubusb %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
; AVX1-LABEL: test6:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovd %esi, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqu (%rdi), %xmm1
-; AVX1-NEXT: vpsubusb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX1-NEXT: vmovd %edi, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test6:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovd %esi, %xmm0
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu (%rdi), %xmm1
-; AVX2-NEXT: vpsubusb %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
vector.ph:
%0 = insertelement <16 x i8> undef, i8 %w, i32 0
%broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
- %1 = getelementptr inbounds i8, i8* %head, i64 0
- %2 = bitcast i8* %1 to <16 x i8>*
- %3 = load <16 x i8>, <16 x i8>* %2, align 1
- %4 = icmp ult <16 x i8> %3, %broadcast15
- %5 = sub <16 x i8> %3, %broadcast15
- %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5
- store <16 x i8> %6, <16 x i8>* %2, align 1
- ret void
+ %1 = icmp ult <16 x i8> %x, %broadcast15
+ %2 = sub <16 x i8> %x, %broadcast15
+ %res = select <16 x i1> %1, <16 x i8> zeroinitializer, <16 x i8> %2
+ ret <16 x i8> %res
}
-define void @test7(i16* nocapture %head) nounwind {
+define <16 x i16> @test7(<16 x i16> %x) nounwind {
; SSE-LABEL: test7:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movdqu (%rdi), %xmm0
-; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
-; SSE-NEXT: movdqu %xmm1, 16(%rdi)
-; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test7:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
@@ -221,43 +171,29 @@ define void @test7(i16* nocapture %head) nounwind {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test7:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
-; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i16, i16* %head, i64 0
- %1 = bitcast i16* %0 to <16 x i16>*
- %2 = load <16 x i16>, <16 x i16>* %1, align 2
- %3 = icmp slt <16 x i16> %2, zeroinitializer
- %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
- %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
- store <16 x i16> %5, <16 x i16>* %1, align 2
- ret void
+ %0 = icmp slt <16 x i16> %x, zeroinitializer
+ %1 = xor <16 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
+ %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
}
-define void @test8(i16* nocapture %head) nounwind {
+define <16 x i16> @test8(<16 x i16> %x) nounwind {
; SSE-LABEL: test8:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movdqu (%rdi), %xmm0
-; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
-; SSE-NEXT: movdqu %xmm1, 16(%rdi)
-; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test8:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
@@ -271,48 +207,33 @@ define void @test8(i16* nocapture %head) nounwind {
; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test8:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
-; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i16, i16* %head, i64 0
- %1 = bitcast i16* %0 to <16 x i16>*
- %2 = load <16 x i16>, <16 x i16>* %1, align 2
- %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
- %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
- %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
- store <16 x i16> %5, <16 x i16>* %1, align 2
- ret void
-
+ %0 = icmp ugt <16 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
+ %1 = add <16 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
+ %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
}
-define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
+define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
; SSE-LABEL: test9:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movd %esi, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE-NEXT: movdqu (%rdi), %xmm1
-; SSE-NEXT: movdqu 16(%rdi), %xmm2
-; SSE-NEXT: psubusw %xmm0, %xmm1
-; SSE-NEXT: psubusw %xmm0, %xmm2
-; SSE-NEXT: movdqu %xmm2, 16(%rdi)
-; SSE-NEXT: movdqu %xmm1, (%rdi)
+; SSE-NEXT: movd %edi, %xmm2
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE-NEXT: psubusw %xmm2, %xmm0
+; SSE-NEXT: psubusw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test9:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovd %esi, %xmm2
+; AVX1-NEXT: vmovd %edi, %xmm2
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
; AVX1-NEXT: vpsubw %xmm2, %xmm1, %xmm3
@@ -324,47 +245,33 @@ define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test9:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovd %esi, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT: vmovdqu (%rdi), %ymm1
-; AVX2-NEXT: vpsubusw %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
+; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
vector.ph:
%0 = insertelement <16 x i16> undef, i16 %w, i32 0
%broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
- %1 = getelementptr inbounds i16, i16* %head, i64 0
- %2 = bitcast i16* %1 to <16 x i16>*
- %3 = load <16 x i16>, <16 x i16>* %2, align 2
- %4 = icmp ult <16 x i16> %3, %broadcast15
- %5 = sub <16 x i16> %3, %broadcast15
- %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5
- store <16 x i16> %6, <16 x i16>* %2, align 2
- ret void
+ %1 = icmp ult <16 x i16> %x, %broadcast15
+ %2 = sub <16 x i16> %x, %broadcast15
+ %res = select <16 x i1> %1, <16 x i16> zeroinitializer, <16 x i16> %2
+ ret <16 x i16> %res
}
-define void @test10(i8* nocapture %head) nounwind {
+define <32 x i8> @test10(<32 x i8> %x) nounwind {
; SSE-LABEL: test10:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movdqu (%rdi), %xmm0
-; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE-NEXT: psubusb %xmm2, %xmm0
; SSE-NEXT: psubusb %xmm2, %xmm1
-; SSE-NEXT: movdqu %xmm1, 16(%rdi)
-; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test10:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
@@ -372,44 +279,29 @@ define void @test10(i8* nocapture %head) nounwind {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test10:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
-; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i8, i8* %head, i64 0
- %1 = bitcast i8* %0 to <32 x i8>*
- %2 = load <32 x i8>, <32 x i8>* %1, align 1
- %3 = icmp slt <32 x i8> %2, zeroinitializer
- %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
- %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
- store <32 x i8> %5, <32 x i8>* %1, align 1
- ret void
-
+ %0 = icmp slt <32 x i8> %x, zeroinitializer
+ %1 = xor <32 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+ %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
}
-define void @test11(i8* nocapture %head) nounwind {
+define <32 x i8> @test11(<32 x i8> %x) nounwind {
; SSE-LABEL: test11:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movdqu (%rdi), %xmm0
-; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE-NEXT: psubusb %xmm2, %xmm0
; SSE-NEXT: psubusb %xmm2, %xmm1
-; SSE-NEXT: movdqu %xmm1, 16(%rdi)
-; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test11:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
@@ -423,60 +315,51 @@ define void @test11(i8* nocapture %head) nounwind {
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
-; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i8, i8* %head, i64 0
- %1 = bitcast i8* %0 to <32 x i8>*
- %2 = load <32 x i8>, <32 x i8>* %1, align 1
- %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
- %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
- %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
- store <32 x i8> %5, <32 x i8>* %1, align 1
- ret void
+ %0 = icmp ugt <32 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
+ %1 = add <32 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
+ %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
}
-define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
+define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-LABEL: test12:
; SSE2: ## BB#0: ## %vector.ph
-; SSE2-NEXT: movd %esi, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE2-NEXT: movdqu (%rdi), %xmm1
-; SSE2-NEXT: movdqu 16(%rdi), %xmm2
-; SSE2-NEXT: psubusb %xmm0, %xmm1
-; SSE2-NEXT: psubusb %xmm0, %xmm2
-; SSE2-NEXT: movdqu %xmm2, 16(%rdi)
-; SSE2-NEXT: movdqu %xmm1, (%rdi)
+; SSE2-NEXT: movd %edi, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE2-NEXT: psubusb %xmm2, %xmm0
+; SSE2-NEXT: psubusb %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test12:
; SSSE3: ## BB#0: ## %vector.ph
-; SSSE3-NEXT: movd %esi, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: pshufb %xmm1, %xmm0
-; SSSE3-NEXT: movdqu (%rdi), %xmm1
-; SSSE3-NEXT: movdqu 16(%rdi), %xmm2
-; SSSE3-NEXT: psubusb %xmm0, %xmm1
-; SSSE3-NEXT: psubusb %xmm0, %xmm2
-; SSSE3-NEXT: movdqu %xmm2, 16(%rdi)
-; SSSE3-NEXT: movdqu %xmm1, (%rdi)
+; SSSE3-NEXT: movd %edi, %xmm2
+; SSSE3-NEXT: pxor %xmm3, %xmm3
+; SSSE3-NEXT: pshufb %xmm3, %xmm2
+; SSSE3-NEXT: psubusb %xmm2, %xmm0
+; SSSE3-NEXT: psubusb %xmm2, %xmm1
; SSSE3-NEXT: retq
;
+; SSE41-LABEL: test12:
+; SSE41: ## BB#0: ## %vector.ph
+; SSE41-NEXT: movd %edi, %xmm2
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pshufb %xmm3, %xmm2
+; SSE41-NEXT: psubusb %xmm2, %xmm0
+; SSE41-NEXT: psubusb %xmm2, %xmm1
+; SSE41-NEXT: retq
+;
; AVX1-LABEL: test12:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rdi), %ymm0
-; AVX1-NEXT: vmovd %esi, %xmm1
+; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -489,617 +372,675 @@ define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test12:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovd %esi, %xmm0
-; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX2-NEXT: vmovdqu (%rdi), %ymm1
-; AVX2-NEXT: vpsubusb %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
vector.ph:
%0 = insertelement <32 x i8> undef, i8 %w, i32 0
%broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
- %1 = getelementptr inbounds i8, i8* %head, i64 0
- %2 = bitcast i8* %1 to <32 x i8>*
- %3 = load <32 x i8>, <32 x i8>* %2, align 1
- %4 = icmp ult <32 x i8> %3, %broadcast15
- %5 = sub <32 x i8> %3, %broadcast15
- %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5
- store <32 x i8> %6, <32 x i8>* %2, align 1
- ret void
+ %1 = icmp ult <32 x i8> %x, %broadcast15
+ %2 = sub <32 x i8> %x, %broadcast15
+ %res = select <32 x i1> %1, <32 x i8> zeroinitializer, <32 x i8> %2
+ ret <32 x i8> %res
}
-define void @test13(i16* nocapture %head, i32* nocapture %w) nounwind {
+define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE2-LABEL: test13:
; SSE2: ## BB#0: ## %vector.ph
-; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movdqu (%rsi), %xmm2
-; SSE2-NEXT: movdqu 16(%rsi), %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: psubd %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; SSE2-NEXT: psubd %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,2,2,3]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; SSE2-NEXT: psubd %xmm1, %xmm3
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: movdqu %xmm4, (%rdi)
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: packssdw %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test13:
; SSSE3: ## BB#0: ## %vector.ph
-; SSSE3-NEXT: movdqu (%rdi), %xmm0
-; SSSE3-NEXT: movdqu (%rsi), %xmm2
-; SSSE3-NEXT: movdqu 16(%rsi), %xmm3
; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: psubd %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm4, %xmm3
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm5, %xmm3
+; SSSE3-NEXT: psubd %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm6
; SSSE3-NEXT: pxor %xmm4, %xmm6
-; SSSE3-NEXT: pxor %xmm1, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm5, %xmm6
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0]
-; SSSE3-NEXT: psubd %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: pxor %xmm3, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2
+; SSSE3-NEXT: pshufb %xmm5, %xmm2
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSSE3-NEXT: psubd %xmm1, %xmm3
; SSSE3-NEXT: pshufb %xmm5, %xmm0
-; SSSE3-NEXT: pshufb %xmm5, %xmm1
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSSE3-NEXT: pandn %xmm1, %xmm6
-; SSSE3-NEXT: movdqu %xmm6, (%rdi)
+; SSSE3-NEXT: pshufb %xmm5, %xmm3
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSSE3-NEXT: pandn %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
+; SSE41-LABEL: test13:
+; SSE41: ## BB#0: ## %vector.ph
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm3, %xmm6
+; SSE41-NEXT: psubd %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT: pshufb %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm6
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE41-NEXT: pshufb %xmm1, %xmm6
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; SSE41-NEXT: psubd %xmm2, %xmm4
+; SSE41-NEXT: pshufb %xmm1, %xmm3
+; SSE41-NEXT: pshufb %xmm1, %xmm4
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; SSE41-NEXT: pandn %xmm3, %xmm0
+; SSE41-NEXT: retq
+;
; AVX1-LABEL: test13:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rsi), %ymm0
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm6
-; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpacksswb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test13:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovdqu (%rsi), %ymm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i16, i16* %head, i64 0
- %1 = bitcast i16* %0 to <8 x i16>*
- %2 = load <8 x i16>, <8 x i16>* %1, align 2
- %3 = getelementptr inbounds i32, i32* %w, i64 0
- %4 = bitcast i32* %3 to <8 x i32>*
- %5 = load <8 x i32>, <8 x i32>* %4, align 2
- %6 = zext <8 x i16> %2 to <8 x i32>
- %7 = icmp ult <8 x i32> %6, %5
- %8 = sub <8 x i32> %6, %5
- %9 = trunc <8 x i32> %8 to <8 x i16>
- %10 = select <8 x i1> %7, <8 x i16> zeroinitializer, <8 x i16> %9
- store <8 x i16> %10, <8 x i16>* %1, align 1
- ret void
+ %lhs = zext <8 x i16> %x to <8 x i32>
+ %cond = icmp ult <8 x i32> %lhs, %y
+ %sub = sub <8 x i32> %lhs, %y
+ %trunc = trunc <8 x i32> %sub to <8 x i16>
+ %res = select <8 x i1> %cond, <8 x i16> zeroinitializer, <8 x i16> %trunc
+ ret <8 x i16> %res
}
-define void @test14(i8* nocapture %head, i32* nocapture %w) nounwind {
+define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; SSE2-LABEL: test14:
; SSE2: ## BB#0: ## %vector.ph
-; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movdqu (%rsi), %xmm8
-; SSE2-NEXT: movdqu 16(%rsi), %xmm9
-; SSE2-NEXT: movdqu 32(%rsi), %xmm10
-; SSE2-NEXT: movdqu 48(%rsi), %xmm7
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: psubd %xmm7, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm7
-; SSE2-NEXT: pxor %xmm3, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255]
-; SSE2-NEXT: pand %xmm5, %xmm7
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: psubd %xmm10, %xmm6
-; SSE2-NEXT: pxor %xmm3, %xmm10
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm10
-; SSE2-NEXT: pand %xmm5, %xmm10
-; SSE2-NEXT: packuswb %xmm7, %xmm10
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: psubd %xmm9, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm9
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
-; SSE2-NEXT: pand %xmm5, %xmm9
-; SSE2-NEXT: movdqa %xmm8, %xmm4
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: packuswb %xmm9, %xmm4
-; SSE2-NEXT: packuswb %xmm10, %xmm4
-; SSE2-NEXT: psubd %xmm8, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: packuswb %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: packuswb %xmm1, %xmm2
-; SSE2-NEXT: packuswb %xmm6, %xmm2
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: movdqu %xmm4, (%rdi)
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm6, %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm5, %xmm10
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: pxor %xmm0, %xmm9
+; SSE2-NEXT: psubd %xmm5, %xmm4
+; SSE2-NEXT: pxor %xmm0, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255]
+; SSE2-NEXT: pand %xmm9, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: pxor %xmm0, %xmm7
+; SSE2-NEXT: psubd %xmm10, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm10
+; SSE2-NEXT: pand %xmm9, %xmm10
+; SSE2-NEXT: packuswb %xmm5, %xmm10
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm0, %xmm5
+; SSE2-NEXT: psubd %xmm6, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm9, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm0, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE2-NEXT: pand %xmm9, %xmm0
+; SSE2-NEXT: packuswb %xmm6, %xmm0
+; SSE2-NEXT: packuswb %xmm10, %xmm0
+; SSE2-NEXT: psubd %xmm8, %xmm1
+; SSE2-NEXT: pand %xmm9, %xmm4
+; SSE2-NEXT: pand %xmm9, %xmm3
+; SSE2-NEXT: packuswb %xmm4, %xmm3
+; SSE2-NEXT: pand %xmm9, %xmm2
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test14:
; SSSE3: ## BB#0: ## %vector.ph
-; SSSE3-NEXT: movdqu (%rdi), %xmm0
-; SSSE3-NEXT: movdqu (%rsi), %xmm8
-; SSSE3-NEXT: movdqu 16(%rsi), %xmm9
-; SSSE3-NEXT: movdqu 32(%rsi), %xmm10
-; SSSE3-NEXT: movdqu 48(%rsi), %xmm7
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; SSSE3-NEXT: pxor %xmm7, %xmm7
+; SSSE3-NEXT: movdqa %xmm0, %xmm11
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
+; SSSE3-NEXT: movdqa %xmm11, %xmm8
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm10
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm4, %xmm9
+; SSSE3-NEXT: pxor %xmm7, %xmm9
+; SSSE3-NEXT: psubd %xmm0, %xmm4
; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: psubd %xmm7, %xmm0
-; SSSE3-NEXT: pxor %xmm3, %xmm7
-; SSSE3-NEXT: pxor %xmm3, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT: pxor %xmm7, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6
+; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT: pshufb %xmm9, %xmm6
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm7, %xmm5
+; SSSE3-NEXT: psubd %xmm10, %xmm3
+; SSSE3-NEXT: movdqa %xmm10, %xmm0
+; SSSE3-NEXT: pxor %xmm7, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
+; SSSE3-NEXT: pshufb %xmm9, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm7, %xmm5
+; SSSE3-NEXT: psubd %xmm11, %xmm2
+; SSSE3-NEXT: pxor %xmm7, %xmm11
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm11
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT: pshufb %xmm5, %xmm11
+; SSSE3-NEXT: movdqa %xmm1, %xmm6
+; SSSE3-NEXT: pxor %xmm7, %xmm6
+; SSSE3-NEXT: pxor %xmm8, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
; SSSE3-NEXT: pshufb %xmm5, %xmm7
-; SSSE3-NEXT: movdqa %xmm6, %xmm4
-; SSSE3-NEXT: psubd %xmm10, %xmm6
-; SSSE3-NEXT: pxor %xmm3, %xmm10
-; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm10
-; SSSE3-NEXT: pshufb %xmm5, %xmm10
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: psubd %xmm9, %xmm1
-; SSSE3-NEXT: pxor %xmm3, %xmm9
-; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm9
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm4, %xmm9
-; SSSE3-NEXT: movdqa %xmm8, %xmm5
-; SSSE3-NEXT: pxor %xmm3, %xmm5
-; SSSE3-NEXT: pxor %xmm2, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT: pshufb %xmm4, %xmm5
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
-; SSSE3-NEXT: movsd {{.*#+}} xmm10 = xmm5[0],xmm10[1]
-; SSSE3-NEXT: psubd %xmm8, %xmm2
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pand %xmm3, %xmm6
-; SSSE3-NEXT: packuswb %xmm0, %xmm6
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pand %xmm3, %xmm2
-; SSSE3-NEXT: packuswb %xmm1, %xmm2
-; SSSE3-NEXT: packuswb %xmm6, %xmm2
-; SSSE3-NEXT: andnpd %xmm2, %xmm10
-; SSSE3-NEXT: movupd %xmm10, (%rdi)
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1]
+; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
+; SSSE3-NEXT: psubd %xmm8, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSSE3-NEXT: pand %xmm5, %xmm4
+; SSSE3-NEXT: pand %xmm5, %xmm3
+; SSSE3-NEXT: packuswb %xmm4, %xmm3
+; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: packuswb %xmm2, %xmm1
+; SSSE3-NEXT: packuswb %xmm3, %xmm1
+; SSSE3-NEXT: andnpd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
+; SSE41-LABEL: test14:
+; SSE41: ## BB#0: ## %vector.ph
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,3,0,1]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm4, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm7
+; SSE41-NEXT: psubd %xmm6, %xmm4
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm6
+; SSE41-NEXT: movdqa {{.*#+}} xmm10 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; SSE41-NEXT: pshufb %xmm10, %xmm6
+; SSE41-NEXT: movdqa %xmm3, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm7
+; SSE41-NEXT: psubd %xmm9, %xmm3
+; SSE41-NEXT: pxor %xmm5, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm9
+; SSE41-NEXT: pshufb %xmm10, %xmm9
+; SSE41-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: psubd %xmm0, %xmm1
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE41-NEXT: pshufb %xmm6, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm7
+; SSE41-NEXT: pxor %xmm8, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm5
+; SSE41-NEXT: pshufb %xmm6, %xmm5
+; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4,5,6,7]
+; SSE41-NEXT: psubd %xmm8, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE41-NEXT: pand %xmm5, %xmm4
+; SSE41-NEXT: pand %xmm5, %xmm3
+; SSE41-NEXT: packuswb %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm5, %xmm1
+; SSE41-NEXT: pand %xmm5, %xmm2
+; SSE41-NEXT: packuswb %xmm2, %xmm1
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
; AVX1-LABEL: test14:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rsi), %ymm0
-; AVX1-NEXT: vmovdqu 32(%rsi), %ymm1
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm6, %xmm10, %xmm7
-; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm11
-; AVX1-NEXT: vpxor %xmm6, %xmm9, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm4
-; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm4
+; AVX1-NEXT: vpxor %xmm6, %xmm10, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm11
+; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm4
+; AVX1-NEXT: vpxor %xmm6, %xmm9, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm3
+; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpacksswb %xmm11, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm0, %xmm8, %xmm0
-; AVX1-NEXT: vpsubd %xmm7, %xmm9, %xmm4
-; AVX1-NEXT: vpsubd %xmm1, %xmm10, %xmm1
-; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpsubd %xmm8, %xmm5, %xmm4
+; AVX1-NEXT: vpsubd %xmm9, %xmm1, %xmm1
+; AVX1-NEXT: vpsubd %xmm10, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm2
-; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test14:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovdqu (%rsi), %ymm0
-; AVX2-NEXT: vmovdqu 32(%rsi), %ymm1
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4
-; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm5
-; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm6
+; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm6
; AVX2-NEXT: vpcmpgtd %ymm5, %ymm6, %ymm5
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT: vpacksswb %xmm6, %xmm5, %xmm5
; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm6
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm4
+; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm4
; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
; AVX2-NEXT: vpacksswb %xmm6, %xmm4, %xmm4
-; AVX2-NEXT: vpacksswb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpsubd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpacksswb %xmm4, %xmm5, %xmm4
+; AVX2-NEXT: vpsubd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i8, i8* %head, i64 0
- %1 = bitcast i8* %0 to <16 x i8>*
- %2 = load <16 x i8>, <16 x i8>* %1, align 2
- %3 = getelementptr inbounds i32, i32* %w, i64 0
- %4 = bitcast i32* %3 to <16 x i32>*
- %5 = load <16 x i32>, <16 x i32>* %4, align 2
- %6 = zext <16 x i8> %2 to <16 x i32>
- %7 = icmp ult <16 x i32> %6, %5
- %8 = sub <16 x i32> %6, %5
- %9 = trunc <16 x i32> %8 to <16 x i8>
- %10 = select <16 x i1> %7, <16 x i8> zeroinitializer, <16 x i8> %9
- store <16 x i8> %10, <16 x i8>* %1, align 1
- ret void
+ %rhs = zext <16 x i8> %x to <16 x i32>
+ %cond = icmp ult <16 x i32> %y, %rhs
+ %sub = sub <16 x i32> %y, %rhs
+ %truncsub = trunc <16 x i32> %sub to <16 x i8>
+ %res = select <16 x i1> %cond, <16 x i8> zeroinitializer, <16 x i8> %truncsub
+ ret <16 x i8> %res
}
-define void @test15(i16* nocapture %head, i32* nocapture %w) nounwind {
+define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE2-LABEL: test15:
; SSE2: ## BB#0: ## %vector.ph
-; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movdqu (%rsi), %xmm2
-; SSE2-NEXT: movdqu 16(%rsi), %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: psubd %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; SSE2-NEXT: psubd %xmm2, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE2-NEXT: psubd %xmm1, %xmm3
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: movdqu %xmm1, (%rdi)
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: packssdw %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test15:
; SSSE3: ## BB#0: ## %vector.ph
-; SSSE3-NEXT: movdqu (%rdi), %xmm0
-; SSSE3-NEXT: movdqu (%rsi), %xmm2
-; SSSE3-NEXT: movdqu 16(%rsi), %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: pxor %xmm4, %xmm4
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: psubd %xmm4, %xmm0
+; SSSE3-NEXT: psubd %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm2, %xmm5
+; SSSE3-NEXT: movdqa %xmm1, %xmm6
+; SSSE3-NEXT: pxor %xmm4, %xmm6
; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm4, %xmm5
-; SSSE3-NEXT: movdqa %xmm2, %xmm6
-; SSSE3-NEXT: pxor %xmm3, %xmm6
-; SSSE3-NEXT: pxor %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm3
-; SSSE3-NEXT: pshufb %xmm4, %xmm3
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; SSSE3-NEXT: psubd %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm4, %xmm0
-; SSSE3-NEXT: pshufb %xmm4, %xmm1
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: movdqu %xmm1, (%rdi)
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
+; SSSE3-NEXT: pshufb %xmm2, %xmm4
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSSE3-NEXT: psubd %xmm1, %xmm3
+; SSSE3-NEXT: pshufb %xmm2, %xmm0
+; SSSE3-NEXT: pshufb %xmm2, %xmm3
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSSE3-NEXT: pand %xmm4, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: retq
;
+; SSE41-LABEL: test15:
+; SSE41: ## BB#0: ## %vector.ph
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm4, %xmm1
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT: pshufb %xmm1, %xmm5
+; SSE41-NEXT: movdqa %xmm2, %xmm6
+; SSE41-NEXT: pxor %xmm4, %xmm6
+; SSE41-NEXT: pxor %xmm3, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE41-NEXT: pshufb %xmm1, %xmm4
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE41-NEXT: psubd %xmm2, %xmm3
+; SSE41-NEXT: pshufb %xmm1, %xmm0
+; SSE41-NEXT: pshufb %xmm1, %xmm3
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: retq
+;
; AVX1-LABEL: test15:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rsi), %ymm0
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm6
-; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpacksswb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test15:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovdqu (%rsi), %ymm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i16, i16* %head, i64 0
- %1 = bitcast i16* %0 to <8 x i16>*
- %2 = load <8 x i16>, <8 x i16>* %1, align 2
- %3 = getelementptr inbounds i32, i32* %w, i64 0
- %4 = bitcast i32* %3 to <8 x i32>*
- %5 = load <8 x i32>, <8 x i32>* %4, align 2
- %6 = zext <8 x i16> %2 to <8 x i32>
- %7 = icmp ugt <8 x i32> %6, %5
- %8 = sub <8 x i32> %6, %5
- %9 = trunc <8 x i32> %8 to <8 x i16>
- %10 = select <8 x i1> %7, <8 x i16> %9, <8 x i16> zeroinitializer
- store <8 x i16> %10, <8 x i16>* %1, align 1
- ret void
+ %lhs = zext <8 x i16> %x to <8 x i32>
+ %cond = icmp ugt <8 x i32> %lhs, %y
+ %sub = sub <8 x i32> %lhs, %y
+ %truncsub = trunc <8 x i32> %sub to <8 x i16>
+ %res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
}
-define void @test16(i16* nocapture %head, i32* nocapture %w) nounwind {
+define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE2-LABEL: test16:
; SSE2: ## BB#0: ## %vector.ph
-; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movdqu (%rsi), %xmm2
-; SSE2-NEXT: movdqu 16(%rsi), %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: psubd %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; SSE2-NEXT: psubd %xmm2, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE2-NEXT: psubd %xmm1, %xmm3
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: movdqu %xmm1, (%rdi)
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: packssdw %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test16:
; SSSE3: ## BB#0: ## %vector.ph
-; SSSE3-NEXT: movdqu (%rdi), %xmm0
-; SSSE3-NEXT: movdqu (%rsi), %xmm2
-; SSSE3-NEXT: movdqu 16(%rsi), %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: pxor %xmm4, %xmm4
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: psubd %xmm4, %xmm0
+; SSSE3-NEXT: psubd %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm2, %xmm5
+; SSSE3-NEXT: movdqa %xmm1, %xmm6
+; SSSE3-NEXT: pxor %xmm4, %xmm6
; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm4, %xmm5
-; SSSE3-NEXT: movdqa %xmm2, %xmm6
-; SSSE3-NEXT: pxor %xmm3, %xmm6
-; SSSE3-NEXT: pxor %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm3
-; SSSE3-NEXT: pshufb %xmm4, %xmm3
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; SSSE3-NEXT: psubd %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm4, %xmm0
-; SSSE3-NEXT: pshufb %xmm4, %xmm1
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: movdqu %xmm1, (%rdi)
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
+; SSSE3-NEXT: pshufb %xmm2, %xmm4
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSSE3-NEXT: psubd %xmm1, %xmm3
+; SSSE3-NEXT: pshufb %xmm2, %xmm0
+; SSSE3-NEXT: pshufb %xmm2, %xmm3
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSSE3-NEXT: pand %xmm4, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: retq
;
+; SSE41-LABEL: test16:
+; SSE41: ## BB#0: ## %vector.ph
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm4, %xmm1
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT: pshufb %xmm1, %xmm5
+; SSE41-NEXT: movdqa %xmm2, %xmm6
+; SSE41-NEXT: pxor %xmm4, %xmm6
+; SSE41-NEXT: pxor %xmm3, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE41-NEXT: pshufb %xmm1, %xmm4
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE41-NEXT: psubd %xmm2, %xmm3
+; SSE41-NEXT: pshufb %xmm1, %xmm0
+; SSE41-NEXT: pshufb %xmm1, %xmm3
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: retq
+;
; AVX1-LABEL: test16:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rsi), %ymm0
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm6
-; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpacksswb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test16:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovdqu (%rsi), %ymm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i16, i16* %head, i64 0
- %1 = bitcast i16* %0 to <8 x i16>*
- %2 = load <8 x i16>, <8 x i16>* %1, align 2
- %3 = getelementptr inbounds i32, i32* %w, i64 0
- %4 = bitcast i32* %3 to <8 x i32>*
- %5 = load <8 x i32>, <8 x i32>* %4, align 2
- %6 = zext <8 x i16> %2 to <8 x i32>
- %7 = icmp ult <8 x i32> %5, %6
- %8 = sub <8 x i32> %6, %5
- %9 = trunc <8 x i32> %8 to <8 x i16>
- %10 = select <8 x i1> %7, <8 x i16> %9, <8 x i16> zeroinitializer
- store <8 x i16> %10, <8 x i16>* %1, align 1
- ret void
+ %lhs = zext <8 x i16> %x to <8 x i32>
+ %cond = icmp ult <8 x i32> %y, %lhs
+ %sub = sub <8 x i32> %lhs, %y
+ %truncsub = trunc <8 x i32> %sub to <8 x i16>
+ %res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
}
diff --git a/test/CodeGen/X86/store-narrow.ll b/test/CodeGen/X86/store-narrow.ll
index 16f152d169d3..5e9e1e364fef 100644
--- a/test/CodeGen/X86/store-narrow.ll
+++ b/test/CodeGen/X86/store-narrow.ll
@@ -134,10 +134,7 @@ entry:
@g_16 = internal global i32 -1
; X64-LABEL: test8:
-; X64-NEXT: movl _g_16(%rip), %eax
-; X64-NEXT: movl $0, _g_16(%rip)
-; X64-NEXT: orl $1, %eax
-; X64-NEXT: movl %eax, _g_16(%rip)
+; X64-NEXT: orb $1, _g_16(%rip)
; X64-NEXT: ret
define void @test8() nounwind {
%tmp = load i32, i32* @g_16
diff --git a/test/CodeGen/X86/swift-return.ll b/test/CodeGen/X86/swift-return.ll
index 60e33e62b4ad..0ea176d5d82f 100644
--- a/test/CodeGen/X86/swift-return.ll
+++ b/test/CodeGen/X86/swift-return.ll
@@ -184,11 +184,11 @@ define void @consume_i1_ret() {
%v6 = extractvalue { i1, i1, i1, i1 } %call, 2
%v7 = extractvalue { i1, i1, i1, i1 } %call, 3
%val = zext i1 %v3 to i32
- store i32 %val, i32* @var
+ store volatile i32 %val, i32* @var
%val2 = zext i1 %v5 to i32
- store i32 %val2, i32* @var
+ store volatile i32 %val2, i32* @var
%val3 = zext i1 %v6 to i32
- store i32 %val3, i32* @var
+ store volatile i32 %val3, i32* @var
%val4 = zext i1 %v7 to i32
store i32 %val4, i32* @var
ret void
diff --git a/test/CodeGen/X86/win32-spill-xmm.ll b/test/CodeGen/X86/win32-spill-xmm.ll
index 0db97cfe20f0..c6b163b88b24 100644
--- a/test/CodeGen/X86/win32-spill-xmm.ll
+++ b/test/CodeGen/X86/win32-spill-xmm.ll
@@ -20,7 +20,7 @@ declare void @bar(<16 x float> %a, i32 %b)
; Check that proper alignment of spilled vector does not affect vargs
; CHECK-LABEL: vargs_not_affected
-; CHECK: leal 28(%ebp), %eax
+; CHECK: movl 28(%ebp), %eax
define i32 @vargs_not_affected(<4 x float> %v, i8* %f, ...) {
entry:
%ap = alloca i8*, align 4
diff --git a/test/CodeGen/X86/win64_sibcall.ll b/test/CodeGen/X86/win64_sibcall.ll
index 4bba0e1e0acd..42dd4d31ca9f 100644
--- a/test/CodeGen/X86/win64_sibcall.ll
+++ b/test/CodeGen/X86/win64_sibcall.ll
@@ -12,8 +12,8 @@ entry:
; LINUX: movq $0, -8(%rsp)
%this = alloca %Object addrspace(1)*
- store %Object addrspace(1)* null, %Object addrspace(1)** %this
- store %Object addrspace(1)* %param0, %Object addrspace(1)** %this
+ store volatile %Object addrspace(1)* null, %Object addrspace(1)** %this
+ store volatile %Object addrspace(1)* %param0, %Object addrspace(1)** %this
br label %0
; <label>:0 ; preds = %entry
diff --git a/test/CodeGen/X86/win64_vararg.ll b/test/CodeGen/X86/win64_vararg.ll
index 8d7f2010a541..20386bf36395 100644
--- a/test/CodeGen/X86/win64_vararg.ll
+++ b/test/CodeGen/X86/win64_vararg.ll
@@ -94,9 +94,7 @@ entry:
; CHECK-LABEL: arg4:
; CHECK: pushq
-; va_start:
-; CHECK: leaq 48(%rsp), [[REG_arg4_1:%[a-z]+]]
-; CHECK: movq [[REG_arg4_1]], (%rsp)
+; va_start (optimized away as overwritten by va_arg)
; va_arg:
; CHECK: leaq 52(%rsp), [[REG_arg4_2:%[a-z]+]]
; CHECK: movq [[REG_arg4_2]], (%rsp)
diff --git a/test/CodeGen/X86/x86-64-ms_abi-vararg.ll b/test/CodeGen/X86/x86-64-ms_abi-vararg.ll
index e3436521a5bd..299190e8a595 100644
--- a/test/CodeGen/X86/x86-64-ms_abi-vararg.ll
+++ b/test/CodeGen/X86/x86-64-ms_abi-vararg.ll
@@ -90,9 +90,7 @@ entry:
}
; CHECK-LABEL: arg4:
-; va_start:
-; CHECK: leaq 48(%rsp), [[REG_arg4_1:%[a-z]+]]
-; CHECK: movq [[REG_arg4_1]], (%rsp)
+; va_start (optimized away as overwritten by va_arg)
; va_arg:
; CHECK: leaq 52(%rsp), [[REG_arg4_2:%[a-z]+]]
; CHECK: movq [[REG_arg4_2]], (%rsp)
diff --git a/test/ExecutionEngine/RuntimeDyld/X86/ELF_x86-64_debug_frame.s b/test/ExecutionEngine/RuntimeDyld/X86/ELF_x86-64_debug_frame.s
new file mode 100644
index 000000000000..8f907a6c4991
--- /dev/null
+++ b/test/ExecutionEngine/RuntimeDyld/X86/ELF_x86-64_debug_frame.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj -o %T/ELF_x86-64_debug_frame.o %s
+# RUN: llvm-rtdyld -triple=x86_64-pc-linux -verify -check=%s %T/ELF_x86-64_debug_frame.o
+
+ .text
+ .file "debug_frame_test.c"
+ .align 16, 0x90
+ .type foo,@function
+foo:
+ .cfi_startproc
+ retq
+.Ltmp0:
+ .size foo, .Ltmp0-foo
+ .cfi_endproc
+ .cfi_sections .debug_frame
+
+# Check that .debug_frame is mapped to 0.
+# rtdyld-check: section_addr(ELF_x86-64_debug_frame.o, .debug_frame) = 0
+
+# Check that The relocated FDE's CIE offset also points to zero.
+# rtdyld-check: *{4}(section_addr(ELF_x86-64_debug_frame.o, .debug_frame) + 0x1C) = 0
diff --git a/test/Feature/optnone-llc.ll b/test/Feature/optnone-llc.ll
index 69dc5291226a..2129fc9b8815 100644
--- a/test/Feature/optnone-llc.ll
+++ b/test/Feature/optnone-llc.ll
@@ -42,6 +42,7 @@ attributes #0 = { optnone noinline }
; LLC-Ox-DAG: Skipping pass 'Control Flow Optimizer'
; LLC-Ox-DAG: Skipping pass 'Machine code sinking'
; LLC-Ox-DAG: Skipping pass 'Machine Common Subexpression Elimination'
+; LLC-Ox-DAG: Skipping pass 'Shrink Wrapping analysis'
; LLC-Ox-DAG: Skipping pass 'Machine Copy Propagation Pass'
; LLC-Ox-DAG: Skipping pass 'Machine Instruction Scheduler'
; LLC-Ox-DAG: Skipping pass 'Machine Loop Invariant Code Motion'
diff --git a/test/MC/AMDGPU/vop3-gfx9.s b/test/MC/AMDGPU/vop3-gfx9.s
index 22a0cddceab4..f50d9248e738 100644
--- a/test/MC/AMDGPU/vop3-gfx9.s
+++ b/test/MC/AMDGPU/vop3-gfx9.s
@@ -35,6 +35,30 @@ v_xad_u32 v1, v2, v3, v4
// GFX9: v_xad_u32 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf3,0xd1,0x02,0x07,0x12,0x04]
// NOVI: :1: error: instruction not supported on this GPU
+v_min3_f16 v1, v2, v3, v4
+// GFX9: v_min3_f16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf4,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_min3_i16 v1, v2, v3, v4
+// GFX9: v_min3_i16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf5,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_min3_u16 v1, v2, v3, v4
+// GFX9: v_min3_u16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf6,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_max3_f16 v1, v2, v3, v4
+// GFX9: v_max3_f16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf7,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_max3_i16 v1, v2, v3, v4
+// GFX9: v_max3_i16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf8,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_max3_u16 v1, v2, v3, v4
+// GFX9: v_max3_u16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf9,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
v_med3_f16 v1, v2, v3, v4
// GFX9: v_med3_f16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfa,0xd1,0x02,0x07,0x12,0x04]
// NOVI: :1: error: instruction not supported on this GPU
diff --git a/test/TableGen/GlobalISelEmitter.td b/test/TableGen/GlobalISelEmitter.td
index 9f89602ae4ad..2784e937954a 100644
--- a/test/TableGen/GlobalISelEmitter.td
+++ b/test/TableGen/GlobalISelEmitter.td
@@ -7,6 +7,10 @@ include "llvm/Target/Target.td"
def MyTargetISA : InstrInfo;
def MyTarget : Target { let InstructionSet = MyTargetISA; }
+let TargetPrefix = "mytarget" in {
+def int_mytarget_nop : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+}
+
def R0 : Register<"r0"> { let Namespace = "MyTarget"; }
def GPR32 : RegisterClass<"MyTarget", [i32], 32, (add R0)>;
def GPR32Op : RegisterOperand<GPR32>;
@@ -127,6 +131,37 @@ def : Pat<(select GPR32:$src1, complex:$src2, complex:$src3),
def ADD : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2),
[(set GPR32:$dst, (add GPR32:$src1, GPR32:$src2))]>;
+//===- Test a simple pattern with an intrinsic. ---------------------------===//
+//
+
+// CHECK-LABEL: if ([&]() {
+// CHECK-NEXT: MachineInstr &MI0 = I;
+// CHECK-NEXT: if (MI0.getNumOperands() < 3)
+// CHECK-NEXT: return false;
+// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_INTRINSIC) &&
+// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT: ((/* Operand 1 */ (isOperandImmEqual(MI0.getOperand(1), [[ID:[0-9]+]], MRI)))) &&
+// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(2).getReg(), MRI, TRI)))))) {
+// CHECK-NEXT: // (intrinsic_wo_chain:i32 [[ID]]:iPTR, GPR32:i32:$src1) => (MOV:i32 GPR32:i32:$src1)
+// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::MOV));
+// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
+// CHECK-NEXT: MIB.add(MI0.getOperand(2)/*src1*/);
+// CHECK-NEXT: for (const auto *FromMI : {&MI0, })
+// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
+// CHECK-NEXT: MIB.addMemOperand(MMO);
+// CHECK-NEXT: I.eraseFromParent();
+// CHECK-NEXT: MachineInstr &NewI = *MIB;
+// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
+// CHECK-NEXT: return false;
+// CHECK-NEXT: }()) { return true; }
+
+def MOV : I<(outs GPR32:$dst), (ins GPR32:$src1),
+ [(set GPR32:$dst, (int_mytarget_nop GPR32:$src1))]>;
+
//===- Test a nested instruction match. -----------------------------------===//
// CHECK-LABEL: if ([&]() {
@@ -138,6 +173,8 @@ def ADD : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2),
// CHECK-NEXT: return false;
// CHECK-NEXT: if (!MI0.getOperand(1).isReg())
// CHECK-NEXT: return false;
+// CHECK-NEXT: if (TRI.isPhysicalRegister(MI0.getOperand(1).getReg()))
+// CHECK-NEXT: return false;
// CHECK-NEXT: MachineInstr &MI1 = *MRI.getVRegDef(MI0.getOperand(1).getReg());
// CHECK-NEXT: if (MI1.getNumOperands() < 3)
// CHECK-NEXT: return false;
@@ -180,6 +217,8 @@ def ADD : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2),
// CHECK-NEXT: return false;
// CHECK-NEXT: if (!MI0.getOperand(2).isReg())
// CHECK-NEXT: return false;
+// CHECK-NEXT: if (TRI.isPhysicalRegister(MI0.getOperand(2).getReg()))
+// CHECK-NEXT: return false;
// CHECK-NEXT: MachineInstr &MI1 = *MRI.getVRegDef(MI0.getOperand(2).getReg());
// CHECK-NEXT: if (MI1.getNumOperands() < 3)
// CHECK-NEXT: return false;
@@ -387,6 +426,42 @@ def XOR : I<(outs GPR32:$dst), (ins Z:$src2, GPR32:$src1),
def XORlike : I<(outs GPR32:$dst), (ins m1Z:$src2, GPR32:$src1),
[(set GPR32:$dst, (xor GPR32:$src1, -4))]>;
+//===- Test a simple pattern with multiple operands with defaults. --------===//
+//
+
+// CHECK-LABEL: if ([&]() {
+// CHECK-NEXT: MachineInstr &MI0 = I;
+// CHECK-NEXT: if (MI0.getNumOperands() < 3)
+// CHECK-NEXT: return false;
+// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_XOR) &&
+// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT: ((/* Operand 2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT: (isOperandImmEqual(MI0.getOperand(2), -5, MRI))))) {
+// CHECK-NEXT: // (xor:i32 GPR32:i32:$src1, -5:i32) => (XORManyDefaults:i32 GPR32:i32:$src1)
+// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::XORManyDefaults));
+// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
+// CHECK-NEXT: MIB.addImm(-1);
+// CHECK-NEXT: MIB.addReg(MyTarget::R0);
+// CHECK-NEXT: MIB.addReg(MyTarget::R0);
+// CHECK-NEXT: MIB.add(MI0.getOperand(1)/*src1*/);
+// CHECK-NEXT: for (const auto *FromMI : {&MI0, })
+// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
+// CHECK-NEXT: MIB.addMemOperand(MMO);
+// CHECK-NEXT: I.eraseFromParent();
+// CHECK-NEXT: MachineInstr &NewI = *MIB;
+// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
+// CHECK-NEXT: return false;
+// CHECK-NEXT: }()) { return true; }
+
+// The -5 is just to distinguish it from the other cases.
+def XORManyDefaults : I<(outs GPR32:$dst), (ins m1Z:$src3, Z:$src2, GPR32:$src1),
+ [(set GPR32:$dst, (xor GPR32:$src1, -5))]>;
+
//===- Test a simple pattern with constant immediate operands. ------------===//
//
// This must precede the 3-register variants because constant immediates have
diff --git a/test/Transforms/Coroutines/coro-catchswitch.ll b/test/Transforms/Coroutines/coro-catchswitch.ll
new file mode 100644
index 000000000000..dd06f1280cae
--- /dev/null
+++ b/test/Transforms/Coroutines/coro-catchswitch.ll
@@ -0,0 +1,88 @@
+; Verifies that we can insert the spill for a PHI preceding the catchswitch
+; RUN: opt < %s -coro-split -S | FileCheck %s
+
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+; CHECK-LABEL: define void @f(
+define void @f(i1 %cond) "coroutine.presplit"="1" personality i32 0 {
+entry:
+ %id = call token @llvm.coro.id(i32 8, i8* null, i8* null, i8* null)
+ %size = call i32 @llvm.coro.size.i32()
+ %alloc = call i8* @malloc(i32 %size)
+ %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc)
+ br i1 %cond, label %if.else, label %if.then
+
+if.then:
+ invoke void @may_throw1()
+ to label %coro.ret unwind label %catch.dispatch
+
+if.else:
+ invoke void @may_throw2()
+ to label %coro.ret unwind label %catch.dispatch
+
+catch.dispatch: ; preds = %if.else, %if.then
+ %val = phi i32 [ 1, %if.then ], [ 2, %if.else ]
+ %switch = catchswitch within none [label %catch] unwind label %cleanuppad
+
+; Verifies that we split out the PHI into a separate block
+; added a cleanuppad spill cleanupret unwinding into the catchswitch.
+
+; CHECK: catch.dispatch:
+; CHECK: %val = phi i32 [ 2, %if.else ], [ 1, %if.then ]
+; CHECK: %[[Pad:.+]] = cleanuppad within none []
+; CHECK: %val.spill.addr = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 4
+; CHECK: store i32 %val, i32* %val.spill.addr
+; CHECK: cleanupret from %[[Pad]] unwind label %[[Switch:.+]]
+
+; CHECK: [[Switch]]:
+; CHECK: %switch = catchswitch within none [label %catch] unwind to caller
+
+catch: ; preds = %catch.dispatch
+ %pad = catchpad within %switch [i8* null, i32 64, i8* null]
+ catchret from %pad to label %suspend
+
+suspend:
+ %sp = call i8 @llvm.coro.suspend(token none, i1 false)
+ switch i8 %sp, label %coro.ret [
+ i8 0, label %resume
+ i8 1, label %coro.ret
+ ]
+
+resume: ; preds = %await2.suspend
+ call void @print(i32 %val)
+ br label %coro.ret
+
+coro.ret:
+ call i1 @llvm.coro.end(i8* %hdl, i1 0)
+ ret void
+
+cleanuppad:
+ %cpad = cleanuppad within none []
+ cleanupret from %cpad unwind to caller
+}
+
+; Function Attrs: argmemonly nounwind readonly
+declare token @llvm.coro.id(i32, i8* readnone, i8* nocapture readonly, i8*) #1
+
+; Function Attrs: nounwind
+declare i1 @llvm.coro.alloc(token) #2
+
+; Function Attrs: nobuiltin
+declare i32 @llvm.coro.size.i32() #4
+declare i8* @llvm.coro.begin(token, i8* writeonly) #2
+declare token @llvm.coro.save(i8*)
+declare i8 @llvm.coro.suspend(token, i1)
+
+declare void @may_throw1()
+declare void @may_throw2()
+declare void @print(i32)
+declare noalias i8* @malloc(i32)
+declare void @free(i8*)
+
+declare i1 @llvm.coro.end(i8*, i1) #2
+
+; Function Attrs: nobuiltin nounwind
+
+; Function Attrs: argmemonly nounwind readonly
+declare i8* @llvm.coro.free(token, i8* nocapture readonly) #1
diff --git a/test/Transforms/Inline/inline-hot-callee.ll b/test/Transforms/Inline/inline-hot-callee.ll
index da6e52343b2d..dad57440063b 100644
--- a/test/Transforms/Inline/inline-hot-callee.ll
+++ b/test/Transforms/Inline/inline-hot-callee.ll
@@ -1,10 +1,10 @@
; RUN: opt < %s -inline -inline-threshold=0 -inlinehint-threshold=100 -S | FileCheck %s
-; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -inline-threshold=0 -inlinehint-threshold=100 -S | FileCheck %s
-; This tests that a hot callee gets the (higher) inlinehint-threshold even without
-; inline hints and gets inlined because the cost is less than inlinehint-threshold.
-; A cold callee with identical body does not get inlined because cost exceeds the
-; inline-threshold
+; This tests that a hot callee gets the (higher) inlinehint-threshold even
+; without inline hints and gets inlined because the cost is less than
+; inlinehint-threshold. A cold callee with identical body does not get inlined
+; because cost exceeds the inline-threshold. This test is relevant only when the
+; old pass manager is used.
define i32 @callee1(i32 %x) !prof !21 {
%x1 = add i32 %x, 1
diff --git a/test/Transforms/InstCombine/canonicalize_branch.ll b/test/Transforms/InstCombine/canonicalize_branch.ll
index 29fd51a39ab4..401490879e92 100644
--- a/test/Transforms/InstCombine/canonicalize_branch.ll
+++ b/test/Transforms/InstCombine/canonicalize_branch.ll
@@ -1,69 +1,500 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; Test an already canonical branch to make sure we don't flip those.
-define i32 @test0(i32 %X, i32 %Y) {
- %C = icmp eq i32 %X, %Y
- br i1 %C, label %T, label %F, !prof !0
+define i32 @eq(i32 %X, i32 %Y) {
+; CHECK-LABEL: @eq(
+; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]], !prof !0
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = icmp eq i32 %X, %Y
+ br i1 %C, label %T, label %F, !prof !0
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
+
+define i32 @ne(i32 %X, i32 %Y) {
+; CHECK-LABEL: @ne(
+; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[F:%.*]], label [[T:%.*]], !prof !1
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = icmp ne i32 %X, %Y
+ br i1 %C, label %T, label %F, !prof !1
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
+
+define i32 @ugt(i32 %X, i32 %Y) {
+; CHECK-LABEL: @ugt(
+; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]], !prof !2
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = icmp ugt i32 %X, %Y
+ br i1 %C, label %T, label %F, !prof !2
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
-; CHECK-LABEL: @test0(
-; CHECK: %C = icmp eq i32 %X, %Y
-; CHECK: br i1 %C, label %T, label %F
+define i32 @uge(i32 %X, i32 %Y) {
+; CHECK-LABEL: @uge(
+; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[F:%.*]], label [[T:%.*]], !prof !3
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = icmp uge i32 %X, %Y
+ br i1 %C, label %T, label %F, !prof !3
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
+
+define i32 @ult(i32 %X, i32 %Y) {
+; CHECK-LABEL: @ult(
+; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]], !prof !4
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = icmp ult i32 %X, %Y
+ br i1 %C, label %T, label %F, !prof !4
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
+
+define i32 @ule(i32 %X, i32 %Y) {
+; CHECK-LABEL: @ule(
+; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[F:%.*]], label [[T:%.*]], !prof !5
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = icmp ule i32 %X, %Y
+ br i1 %C, label %T, label %F, !prof !5
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
+define i32 @sgt(i32 %X, i32 %Y) {
+; CHECK-LABEL: @sgt(
+; CHECK-NEXT: [[C:%.*]] = icmp sgt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]], !prof !6
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = icmp sgt i32 %X, %Y
+ br i1 %C, label %T, label %F, !prof !6
T:
- ret i32 12
+ ret i32 12
F:
- ret i32 123
+ ret i32 123
}
-define i32 @test1(i32 %X, i32 %Y) {
- %C = icmp ne i32 %X, %Y
- br i1 %C, label %T, label %F, !prof !1
+define i32 @sge(i32 %X, i32 %Y) {
+; CHECK-LABEL: @sge(
+; CHECK-NEXT: [[C:%.*]] = icmp slt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[F:%.*]], label [[T:%.*]], !prof !7
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = icmp sge i32 %X, %Y
+ br i1 %C, label %T, label %F, !prof !7
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
-; CHECK-LABEL: @test1(
-; CHECK: %C = icmp eq i32 %X, %Y
-; CHECK: br i1 %C, label %F, label %T
+define i32 @slt(i32 %X, i32 %Y) {
+; CHECK-LABEL: @slt(
+; CHECK-NEXT: [[C:%.*]] = icmp slt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]], !prof !8
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = icmp slt i32 %X, %Y
+ br i1 %C, label %T, label %F, !prof !8
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
+define i32 @sle(i32 %X, i32 %Y) {
+; CHECK-LABEL: @sle(
+; CHECK-NEXT: [[C:%.*]] = icmp sgt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[F:%.*]], label [[T:%.*]], !prof !9
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = icmp sle i32 %X, %Y
+ br i1 %C, label %T, label %F, !prof !9
T:
- ret i32 12
+ ret i32 12
F:
- ret i32 123
+ ret i32 123
}
-define i32 @test2(i32 %X, i32 %Y) {
- %C = icmp ule i32 %X, %Y
- br i1 %C, label %T, label %F, !prof !2
+define i32 @f_false(float %X, float %Y) {
+; CHECK-LABEL: @f_false(
+; CHECK-NEXT: br i1 false, label [[T:%.*]], label [[F:%.*]], !prof !10
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = fcmp false float %X, %Y
+ br i1 %C, label %T, label %F, !prof !10
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
-; CHECK-LABEL: @test2(
-; CHECK: %C = icmp ugt i32 %X, %Y
-; CHECK: br i1 %C, label %F, label %T
+define i32 @f_oeq(float %X, float %Y) {
+; CHECK-LABEL: @f_oeq(
+; CHECK-NEXT: [[C:%.*]] = fcmp oeq float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]], !prof !11
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = fcmp oeq float %X, %Y
+ br i1 %C, label %T, label %F, !prof !11
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
+define i32 @f_ogt(float %X, float %Y) {
+; CHECK-LABEL: @f_ogt(
+; CHECK-NEXT: [[C:%.*]] = fcmp ogt float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]], !prof !12
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = fcmp ogt float %X, %Y
+ br i1 %C, label %T, label %F, !prof !12
T:
- ret i32 12
+ ret i32 12
F:
- ret i32 123
+ ret i32 123
}
-define i32 @test3(i32 %X, i32 %Y) {
- %C = icmp uge i32 %X, %Y
- br i1 %C, label %T, label %F, !prof !3
+define i32 @f_oge(float %X, float %Y) {
+; CHECK-LABEL: @f_oge(
+; CHECK-NEXT: [[C:%.*]] = fcmp ult float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[F:%.*]], label [[T:%.*]], !prof !13
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = fcmp oge float %X, %Y
+ br i1 %C, label %T, label %F, !prof !13
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
-; CHECK-LABEL: @test3(
-; CHECK: %C = icmp ult i32 %X, %Y
-; CHECK: br i1 %C, label %F, label %T
+define i32 @f_olt(float %X, float %Y) {
+; CHECK-LABEL: @f_olt(
+; CHECK-NEXT: [[C:%.*]] = fcmp olt float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]], !prof !14
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = fcmp olt float %X, %Y
+ br i1 %C, label %T, label %F, !prof !14
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
+define i32 @f_ole(float %X, float %Y) {
+; CHECK-LABEL: @f_ole(
+; CHECK-NEXT: [[C:%.*]] = fcmp ugt float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[F:%.*]], label [[T:%.*]], !prof !15
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = fcmp ole float %X, %Y
+ br i1 %C, label %T, label %F, !prof !15
T:
- ret i32 12
+ ret i32 12
F:
- ret i32 123
+ ret i32 123
}
-!0 = !{!"branch_weights", i32 1, i32 2}
-!1 = !{!"branch_weights", i32 3, i32 4}
-!2 = !{!"branch_weights", i32 5, i32 6}
-!3 = !{!"branch_weights", i32 7, i32 8}
-; Base case shouldn't change.
-; CHECK: !0 = {{.*}} i32 1, i32 2}
+define i32 @f_one(float %X, float %Y) {
+; CHECK-LABEL: @f_one(
+; CHECK-NEXT: [[C:%.*]] = fcmp ueq float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[F:%.*]], label [[T:%.*]], !prof !16
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = fcmp one float %X, %Y
+ br i1 %C, label %T, label %F, !prof !16
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
+
+define i32 @f_ord(float %X, float %Y) {
+; CHECK-LABEL: @f_ord(
+; CHECK-NEXT: [[C:%.*]] = fcmp ord float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]], !prof !17
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = fcmp ord float %X, %Y
+ br i1 %C, label %T, label %F, !prof !17
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
+
+define i32 @f_uno(float %X, float %Y) {
+; CHECK-LABEL: @f_uno(
+; CHECK-NEXT: [[C:%.*]] = fcmp uno float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]], !prof !18
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = fcmp uno float %X, %Y
+ br i1 %C, label %T, label %F, !prof !18
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
+
+define i32 @f_ueq(float %X, float %Y) {
+; CHECK-LABEL: @f_ueq(
+; CHECK-NEXT: [[C:%.*]] = fcmp ueq float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]], !prof !19
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = fcmp ueq float %X, %Y
+ br i1 %C, label %T, label %F, !prof !19
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
+
+define i32 @f_ugt(float %X, float %Y) {
+; CHECK-LABEL: @f_ugt(
+; CHECK-NEXT: [[C:%.*]] = fcmp ugt float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]], !prof !20
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = fcmp ugt float %X, %Y
+ br i1 %C, label %T, label %F, !prof !20
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
+
+define i32 @f_uge(float %X, float %Y) {
+; CHECK-LABEL: @f_uge(
+; CHECK-NEXT: [[C:%.*]] = fcmp uge float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]], !prof !21
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = fcmp uge float %X, %Y
+ br i1 %C, label %T, label %F, !prof !21
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
+
+define i32 @f_ult(float %X, float %Y) {
+; CHECK-LABEL: @f_ult(
+; CHECK-NEXT: [[C:%.*]] = fcmp ult float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]], !prof !22
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = fcmp ult float %X, %Y
+ br i1 %C, label %T, label %F, !prof !22
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
+
+define i32 @f_ule(float %X, float %Y) {
+; CHECK-LABEL: @f_ule(
+; CHECK-NEXT: [[C:%.*]] = fcmp ule float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]], !prof !23
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = fcmp ule float %X, %Y
+ br i1 %C, label %T, label %F, !prof !23
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
+
+define i32 @f_une(float %X, float %Y) {
+; CHECK-LABEL: @f_une(
+; CHECK-NEXT: [[C:%.*]] = fcmp une float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]], !prof !24
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = fcmp une float %X, %Y
+ br i1 %C, label %T, label %F, !prof !24
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
+
+define i32 @f_true(float %X, float %Y) {
+; CHECK-LABEL: @f_true(
+; CHECK-NEXT: br i1 true, label [[T:%.*]], label [[F:%.*]], !prof !25
+; CHECK: T:
+; CHECK-NEXT: ret i32 12
+; CHECK: F:
+; CHECK-NEXT: ret i32 123
+;
+ %C = fcmp true float %X, %Y
+ br i1 %C, label %T, label %F, !prof !25
+T:
+ ret i32 12
+F:
+ ret i32 123
+}
+
+
+!0 = !{!"branch_weights", i32 0, i32 99}
+!1 = !{!"branch_weights", i32 1, i32 99}
+!2 = !{!"branch_weights", i32 2, i32 99}
+!3 = !{!"branch_weights", i32 3, i32 99}
+!4 = !{!"branch_weights", i32 4, i32 99}
+!5 = !{!"branch_weights", i32 5, i32 99}
+!6 = !{!"branch_weights", i32 6, i32 99}
+!7 = !{!"branch_weights", i32 7, i32 99}
+!8 = !{!"branch_weights", i32 8, i32 99}
+!9 = !{!"branch_weights", i32 9, i32 99}
+!10 = !{!"branch_weights", i32 10, i32 99}
+!11 = !{!"branch_weights", i32 11, i32 99}
+!12 = !{!"branch_weights", i32 12, i32 99}
+!13 = !{!"branch_weights", i32 13, i32 99}
+!14 = !{!"branch_weights", i32 14, i32 99}
+!15 = !{!"branch_weights", i32 15, i32 99}
+!16 = !{!"branch_weights", i32 16, i32 99}
+!17 = !{!"branch_weights", i32 17, i32 99}
+!18 = !{!"branch_weights", i32 18, i32 99}
+!19 = !{!"branch_weights", i32 19, i32 99}
+!20 = !{!"branch_weights", i32 20, i32 99}
+!21 = !{!"branch_weights", i32 21, i32 99}
+!22 = !{!"branch_weights", i32 22, i32 99}
+!23 = !{!"branch_weights", i32 23, i32 99}
+!24 = !{!"branch_weights", i32 24, i32 99}
+!25 = !{!"branch_weights", i32 25, i32 99}
+
; Ensure that the branch metadata is reversed to match the reversals above.
-; CHECK: !1 = {{.*}} i32 4, i32 3}
-; CHECK: !2 = {{.*}} i32 6, i32 5}
-; CHECK: !3 = {{.*}} i32 8, i32 7}
+; CHECK: !0 = {{.*}} i32 0, i32 99}
+; CHECK: !1 = {{.*}} i32 99, i32 1}
+; CHECK: !2 = {{.*}} i32 2, i32 99}
+; CHECK: !3 = {{.*}} i32 99, i32 3}
+; CHECK: !4 = {{.*}} i32 4, i32 99}
+; CHECK: !5 = {{.*}} i32 99, i32 5}
+; CHECK: !6 = {{.*}} i32 6, i32 99}
+; CHECK: !7 = {{.*}} i32 99, i32 7}
+; CHECK: !8 = {{.*}} i32 8, i32 99}
+; CHECK: !9 = {{.*}} i32 99, i32 9}
+; CHECK: !10 = {{.*}} i32 10, i32 99}
+; CHECK: !11 = {{.*}} i32 11, i32 99}
+; CHECK: !12 = {{.*}} i32 12, i32 99}
+; CHECK: !13 = {{.*}} i32 99, i32 13}
+; CHECK: !14 = {{.*}} i32 14, i32 99}
+; CHECK: !15 = {{.*}} i32 99, i32 15}
+; CHECK: !16 = {{.*}} i32 99, i32 16}
+; CHECK: !17 = {{.*}} i32 17, i32 99}
+; CHECK: !18 = {{.*}} i32 18, i32 99}
+; CHECK: !19 = {{.*}} i32 19, i32 99}
+; CHECK: !20 = {{.*}} i32 20, i32 99}
+; CHECK: !21 = {{.*}} i32 21, i32 99}
+; CHECK: !22 = {{.*}} i32 22, i32 99}
+; CHECK: !23 = {{.*}} i32 23, i32 99}
+; CHECK: !24 = {{.*}} i32 24, i32 99}
+; CHECK: !25 = {{.*}} i32 25, i32 99}
+
diff --git a/test/Transforms/InstCombine/debuginfo-skip.ll b/test/Transforms/InstCombine/debuginfo-skip.ll
new file mode 100644
index 000000000000..d2295e29ee46
--- /dev/null
+++ b/test/Transforms/InstCombine/debuginfo-skip.ll
@@ -0,0 +1,44 @@
+; RUN: opt < %s -instcombine -debug -S -o %t 2>&1 | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=CHECK-IR
+; REQUIRES: asserts
+
+; Debug output from InstCombine should not have any @llvm.dbg.* instructions visited
+; CHECK-NOT: call void @llvm.dbg.
+
+; The resulting IR should still have them
+; CHECK-IR: call void @llvm.dbg.
+
+define i32 @foo(i32 %j) #0 !dbg !7 {
+entry:
+ %j.addr = alloca i32, align 4
+ store i32 %j, i32* %j.addr, align 4
+ call void @llvm.dbg.declare(metadata i32* %j.addr, metadata !11, metadata !12), !dbg !13
+ call void @llvm.dbg.value(metadata i32 10, i64 0, metadata !16, metadata !12), !dbg !15
+ %0 = load i32, i32* %j.addr, align 4, !dbg !14
+ ret i32 %0, !dbg !15
+}
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang 5.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "a.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"PIC Level", i32 2}
+!6 = !{!"clang version 5.0.0 (trunk 302918) (llvm/trunk 302925)"}
+!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 2, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{!10, !10}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !DILocalVariable(name: "j", arg: 1, scope: !7, file: !1, line: 2, type: !10)
+!12 = !DIExpression()
+!13 = !DILocation(line: 2, column: 13, scope: !7)
+!14 = !DILocation(line: 5, column: 10, scope: !7)
+!15 = !DILocation(line: 5, column: 3, scope: !7)
+!16 = !DILocalVariable(name: "h", scope: !7, file: !1, line: 4, type: !10)
diff --git a/test/Transforms/InstSimplify/AndOrXor.ll b/test/Transforms/InstSimplify/AndOrXor.ll
index 427ea655fcb2..a9b4e4e5cfcc 100644
--- a/test/Transforms/InstSimplify/AndOrXor.ll
+++ b/test/Transforms/InstSimplify/AndOrXor.ll
@@ -738,8 +738,7 @@ define i32 @test54(i32 %a, i32 %b) {
define i8 @lshr_perfect_mask(i8 %x) {
; CHECK-LABEL: @lshr_perfect_mask(
; CHECK-NEXT: [[SH:%.*]] = lshr i8 %x, 5
-; CHECK-NEXT: [[MASK:%.*]] = and i8 [[SH]], 7
-; CHECK-NEXT: ret i8 [[MASK]]
+; CHECK-NEXT: ret i8 [[SH]]
;
%sh = lshr i8 %x, 5
%mask = and i8 %sh, 7 ; 0x07
@@ -749,8 +748,7 @@ define i8 @lshr_perfect_mask(i8 %x) {
define <2 x i8> @lshr_oversized_mask_splat(<2 x i8> %x) {
; CHECK-LABEL: @lshr_oversized_mask_splat(
; CHECK-NEXT: [[SH:%.*]] = lshr <2 x i8> %x, <i8 5, i8 5>
-; CHECK-NEXT: [[MASK:%.*]] = and <2 x i8> [[SH]], <i8 -121, i8 -121>
-; CHECK-NEXT: ret <2 x i8> [[MASK]]
+; CHECK-NEXT: ret <2 x i8> [[SH]]
;
%sh = lshr <2 x i8> %x, <i8 5, i8 5>
%mask = and <2 x i8> %sh, <i8 135, i8 135> ; 0x87
@@ -771,8 +769,7 @@ define i8 @lshr_undersized_mask(i8 %x) {
define <2 x i8> @shl_perfect_mask_splat(<2 x i8> %x) {
; CHECK-LABEL: @shl_perfect_mask_splat(
; CHECK-NEXT: [[SH:%.*]] = shl <2 x i8> %x, <i8 6, i8 6>
-; CHECK-NEXT: [[MASK:%.*]] = and <2 x i8> [[SH]], <i8 -64, i8 -64>
-; CHECK-NEXT: ret <2 x i8> [[MASK]]
+; CHECK-NEXT: ret <2 x i8> [[SH]]
;
%sh = shl <2 x i8> %x, <i8 6, i8 6>
%mask = and <2 x i8> %sh, <i8 192, i8 192> ; 0xC0
@@ -782,8 +779,7 @@ define <2 x i8> @shl_perfect_mask_splat(<2 x i8> %x) {
define i8 @shl_oversized_mask(i8 %x) {
; CHECK-LABEL: @shl_oversized_mask(
; CHECK-NEXT: [[SH:%.*]] = shl i8 %x, 6
-; CHECK-NEXT: [[MASK:%.*]] = and i8 [[SH]], -61
-; CHECK-NEXT: ret i8 [[MASK]]
+; CHECK-NEXT: ret i8 [[SH]]
;
%sh = shl i8 %x, 6
%mask = and i8 %sh, 195 ; 0xC3
diff --git a/test/Transforms/LoopVectorize/AArch64/pr33053.ll b/test/Transforms/LoopVectorize/AArch64/pr33053.ll
new file mode 100644
index 000000000000..6763940bf98e
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/pr33053.ll