Skip to content

Commit 9e23538

Browse files
committed
[Clang] Remove 3-element vector load and store special handling
Clang has long special-cased 3-element vector loads and stores: they are performed as 4-element operations, and a shufflevector is then used to extract the elements that are actually used. Odd-sized vector codegen should now work reasonably well, so this patch removes that special handling, along with the compiler argument `-fpreserve-vec3-type`.
1 parent bf700c3 commit 9e23538

File tree

8 files changed

+58
-123
lines changed

8 files changed

+58
-123
lines changed

clang/include/clang/Basic/CodeGenOptions.def

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -413,9 +413,6 @@ CODEGENOPT(StrictReturn, 1, 1)
413413
/// Whether emit pseudo probes for sample pgo profile collection.
414414
CODEGENOPT(PseudoProbeForProfiling, 1, 0)
415415

416-
/// Whether 3-component vector type is preserved.
417-
CODEGENOPT(PreserveVec3Type, 1, 0)
418-
419416
CODEGENOPT(NoPLT, 1, 0)
420417

421418
/// Whether to emit all vtables

clang/include/clang/Driver/Options.td

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8190,10 +8190,6 @@ def fhlsl_strict_availability : Flag<["-"], "fhlsl-strict-availability">,
81908190
Group<hlsl_Group>,
81918191
MarshallingInfoFlag<LangOpts<"HLSLStrictAvailability">>;
81928192

8193-
def fpreserve_vec3_type : Flag<["-"], "fpreserve-vec3-type">,
8194-
HelpText<"Preserve 3-component vector type">,
8195-
MarshallingInfoFlag<CodeGenOpts<"PreserveVec3Type">>,
8196-
ImpliedByAnyOf<[hlsl.KeyPath]>;
81978193
def fwchar_type_EQ : Joined<["-"], "fwchar-type=">,
81988194
HelpText<"Select underlying type for wchar_t">,
81998195
Values<"char,short,int">,

clang/lib/CodeGen/ABIInfo.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,14 @@ void ABIInfo::appendAttributeMangling(StringRef AttrStr,
236236
}
237237
}
238238

239+
/// Default implementation of the vector-widening hook.
///
/// \param T   the fixed vector type that is about to be loaded or stored.
/// \param Opt language options; only the HLSL flag is consulted here.
/// \returns a 4-element vector of the same element type when \p T has exactly
///          3 elements and HLSL is not enabled; otherwise \p T unchanged.
llvm::FixedVectorType *
240+
ABIInfo::getOptimalVectorType(llvm::FixedVectorType *T,
241+
const LangOptions &Opt) const {
242+
// Outside HLSL, a 3-element vector is widened to 4 elements.
if (!Opt.HLSL && T->getNumElements() == 3)
243+
return llvm::FixedVectorType::get(T->getElementType(), 4);
244+
// Every other vector type is kept as is.
return T;
245+
}
246+
239247
// Pin the vtable to this file.
240248
SwiftABIInfo::~SwiftABIInfo() = default;
241249

clang/lib/CodeGen/ABIInfo.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class Value;
2020
class LLVMContext;
2121
class DataLayout;
2222
class Type;
23+
class FixedVectorType;
2324
} // namespace llvm
2425

2526
namespace clang {
@@ -123,6 +124,12 @@ class ABIInfo {
123124
raw_ostream &Out) const;
124125
virtual void appendAttributeMangling(StringRef AttrStr,
125126
raw_ostream &Out) const;
127+
128+
/// Returns the optimal vector type based on the given vector type. For
129+
/// example, on certain targets, a vector with 3 elements might be promoted to
130+
/// one with 4 elements to improve performance.
131+
virtual llvm::FixedVectorType *
132+
getOptimalVectorType(llvm::FixedVectorType *T, const LangOptions &Opt) const;
126133
};
127134

128135
/// Target specific hooks for defining how a type should be passed or returned

clang/lib/CodeGen/CGExpr.cpp

Lines changed: 23 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2003,20 +2003,18 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(Address Addr, bool Volatile,
20032003
return EmitFromMemory(V, Ty);
20042004
}
20052005

2006-
// Handle vectors of size 3 like size 4 for better performance.
2007-
const llvm::Type *EltTy = Addr.getElementType();
2008-
const auto *VTy = cast<llvm::FixedVectorType>(EltTy);
2009-
2010-
if (!CGM.getCodeGenOpts().PreserveVec3Type && VTy->getNumElements() == 3) {
2011-
2012-
llvm::VectorType *vec4Ty =
2013-
llvm::FixedVectorType::get(VTy->getElementType(), 4);
2014-
Address Cast = Addr.withElementType(vec4Ty);
2015-
// Now load value.
2016-
llvm::Value *V = Builder.CreateLoad(Cast, Volatile, "loadVec4");
2017-
2018-
// Shuffle vector to get vec3.
2019-
V = Builder.CreateShuffleVector(V, ArrayRef<int>{0, 1, 2}, "extractVec");
2006+
// Handles vectors of sizes that are likely to be expanded to a larger size
2007+
// to optimize performance.
2008+
auto *VTy = cast<llvm::FixedVectorType>(Addr.getElementType());
2009+
auto *NewVecTy = CGM.getABIInfo().getOptimalVectorType(VTy, getLangOpts());
2010+
2011+
if (VTy != NewVecTy) {
2012+
Address Cast = Addr.withElementType(NewVecTy);
2013+
llvm::Value *V = Builder.CreateLoad(Cast, Volatile, "loadVecN");
2014+
unsigned OldNumElements = VTy->getNumElements();
2015+
SmallVector<int, 4> Mask(OldNumElements);
2016+
std::iota(Mask.begin(), Mask.begin() + OldNumElements, 0);
2017+
V = Builder.CreateShuffleVector(V, Mask, "extractVec");
20202018
return EmitFromMemory(V, Ty);
20212019
}
20222020
}
@@ -2146,21 +2144,21 @@ void CodeGenFunction::EmitStoreOfScalar(llvm::Value *Value, Address Addr,
21462144
Addr = Addr.withPointer(Builder.CreateThreadLocalAddress(GV),
21472145
NotKnownNonNull);
21482146

2147+
// Handles vectors of sizes that are likely to be expanded to a larger size
2148+
// to optimize performance.
21492149
llvm::Type *SrcTy = Value->getType();
21502150
if (const auto *ClangVecTy = Ty->getAs<VectorType>()) {
2151-
auto *VecTy = dyn_cast<llvm::FixedVectorType>(SrcTy);
2152-
if (!CGM.getCodeGenOpts().PreserveVec3Type) {
2153-
// Handle vec3 special.
2154-
if (VecTy && !ClangVecTy->isExtVectorBoolType() &&
2155-
cast<llvm::FixedVectorType>(VecTy)->getNumElements() == 3) {
2156-
// Our source is a vec3, do a shuffle vector to make it a vec4.
2157-
Value = Builder.CreateShuffleVector(Value, ArrayRef<int>{0, 1, 2, -1},
2158-
"extractVec");
2159-
SrcTy = llvm::FixedVectorType::get(VecTy->getElementType(), 4);
2151+
if (auto *VecTy = dyn_cast<llvm::FixedVectorType>(SrcTy)) {
2152+
auto *NewVecTy =
2153+
CGM.getABIInfo().getOptimalVectorType(VecTy, getLangOpts());
2154+
if (!ClangVecTy->isExtVectorBoolType() && VecTy != NewVecTy) {
2155+
SmallVector<int, 4> Mask(NewVecTy->getNumElements(), -1);
2156+
// Number only the lanes that exist in the source vector; the widened tail
// must keep the -1 sentinel so those lanes stay undefined padding instead of
// indexing past the source elements.
std::iota(Mask.begin(), Mask.begin() + VecTy->getNumElements(), 0);
2157+
Value = Builder.CreateShuffleVector(Value, Mask, "extractVec");
2158+
SrcTy = NewVecTy;
21602159
}
2161-
if (Addr.getElementType() != SrcTy) {
2160+
if (Addr.getElementType() != SrcTy)
21622161
Addr = Addr.withElementType(SrcTy);
2163-
}
21642162
}
21652163
}
21662164

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,12 @@ class AMDGPUABIInfo final : public DefaultABIInfo {
5252
void computeInfo(CGFunctionInfo &FI) const override;
5353
RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
5454
AggValueSlot Slot) const override;
55+
56+
/// AMDGPU override of the vector-widening hook: the incoming vector type is
/// always returned unchanged, so no vector is widened for this target. The
/// language options argument is ignored.
llvm::FixedVectorType *
57+
getOptimalVectorType(llvm::FixedVectorType *T,
58+
const LangOptions &) const override {
59+
return T;
60+
}
5561
};
5662

5763
bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {

clang/test/CodeGenOpenCL/amdgpu-alignment.cl

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -94,43 +94,43 @@ typedef double __attribute__((ext_vector_type(16))) double16;
9494
// CHECK-LABEL: @local_memory_alignment_global(
9595
// CHECK: store volatile i8 0, ptr addrspace(3) @local_memory_alignment_global.lds_i8, align 1
9696
// CHECK: store volatile <2 x i8> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2i8, align 2
97-
// CHECK: store volatile <4 x i8> <i8 0, i8 0, i8 0, i8 undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3i8, align 4
97+
// CHECK: store volatile <3 x i8> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3i8, align 4
9898
// CHECK: store volatile <4 x i8> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4i8, align 4
9999
// CHECK: store volatile <8 x i8> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8i8, align 8
100100
// CHECK: store volatile <16 x i8> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16i8, align 16
101101
// CHECK: store volatile i16 0, ptr addrspace(3) @local_memory_alignment_global.lds_i16, align 2
102102
// CHECK: store volatile <2 x i16> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2i16, align 4
103-
// CHECK: store volatile <4 x i16> <i16 0, i16 0, i16 0, i16 undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3i16, align 8
103+
// CHECK: store volatile <3 x i16> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3i16, align 8
104104
// CHECK: store volatile <4 x i16> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4i16, align 8
105105
// CHECK: store volatile <8 x i16> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8i16, align 16
106106
// CHECK: store volatile <16 x i16> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16i16, align 32
107107
// CHECK: store volatile i32 0, ptr addrspace(3) @local_memory_alignment_global.lds_i32, align 4
108108
// CHECK: store volatile <2 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2i32, align 8
109-
// CHECK: store volatile <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3i32, align 16
109+
// CHECK: store volatile <3 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3i32, align 16
110110
// CHECK: store volatile <4 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4i32, align 16
111111
// CHECK: store volatile <8 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8i32, align 32
112112
// CHECK: store volatile <16 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16i32, align 64
113113
// CHECK: store volatile i64 0, ptr addrspace(3) @local_memory_alignment_global.lds_i64, align 8
114114
// CHECK: store volatile <2 x i64> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2i64, align 16
115-
// CHECK: store volatile <4 x i64> <i64 0, i64 0, i64 0, i64 undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3i64, align 32
115+
// CHECK: store volatile <3 x i64> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3i64, align 32
116116
// CHECK: store volatile <4 x i64> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4i64, align 32
117117
// CHECK: store volatile <8 x i64> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8i64, align 64
118118
// CHECK: store volatile <16 x i64> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16i64, align 128
119119
// CHECK: store volatile half 0xH0000, ptr addrspace(3) @local_memory_alignment_global.lds_f16, align 2
120120
// CHECK: store volatile <2 x half> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2f16, align 4
121-
// CHECK: store volatile <4 x half> <half 0xH0000, half 0xH0000, half 0xH0000, half undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3f16, align 8
121+
// CHECK: store volatile <3 x half> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3f16, align 8
122122
// CHECK: store volatile <4 x half> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4f16, align 8
123123
// CHECK: store volatile <8 x half> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8f16, align 16
124124
// CHECK: store volatile <16 x half> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16f16, align 32
125125
// CHECK: store volatile float 0.000000e+00, ptr addrspace(3) @local_memory_alignment_global.lds_f32, align 4
126126
// CHECK: store volatile <2 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2f32, align 8
127-
// CHECK: store volatile <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3f32, align 16
127+
// CHECK: store volatile <3 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3f32, align 16
128128
// CHECK: store volatile <4 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4f32, align 16
129129
// CHECK: store volatile <8 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8f32, align 32
130130
// CHECK: store volatile <16 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16f32, align 64
131131
// CHECK: store volatile double 0.000000e+00, ptr addrspace(3) @local_memory_alignment_global.lds_f64, align 8
132132
// CHECK: store volatile <2 x double> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2f64, align 16
133-
// CHECK: store volatile <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3f64, align 32
133+
// CHECK: store volatile <3 x double> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3f64, align 32
134134
// CHECK: store volatile <4 x double> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4f64, align 32
135135
// CHECK: store volatile <8 x double> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8f64, align 64
136136
// CHECK: store volatile <16 x double> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16f64, align 128
@@ -381,43 +381,43 @@ kernel void local_memory_alignment_arg(
381381

382382
// CHECK: store volatile i8 0, ptr addrspace(5) %arraydecay, align 1
383383
// CHECK: store volatile <2 x i8> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 2
384-
// CHECK: store volatile <4 x i8> <i8 0, i8 0, i8 0, i8 undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4
384+
// CHECK: store volatile <3 x i8> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4
385385
// CHECK: store volatile <4 x i8> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4
386386
// CHECK: store volatile <8 x i8> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
387387
// CHECK: store volatile <16 x i8> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
388388
// CHECK: store volatile i16 0, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 2
389389
// CHECK: store volatile <2 x i16> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4
390-
// CHECK: store volatile <4 x i16> <i16 0, i16 0, i16 0, i16 undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
390+
// CHECK: store volatile <3 x i16> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
391391
// CHECK: store volatile <4 x i16> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
392392
// CHECK: store volatile <8 x i16> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
393393
// CHECK: store volatile <16 x i16> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
394394
// CHECK: store volatile i32 0, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4
395395
// CHECK: store volatile <2 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
396-
// CHECK: store volatile <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
396+
// CHECK: store volatile <3 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
397397
// CHECK: store volatile <4 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
398398
// CHECK: store volatile <8 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
399399
// CHECK: store volatile <16 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 64
400400
// CHECK: store volatile i64 0, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
401401
// CHECK: store volatile <2 x i64> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
402-
// CHECK: store volatile <4 x i64> <i64 0, i64 0, i64 0, i64 undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
402+
// CHECK: store volatile <3 x i64> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
403403
// CHECK: store volatile <4 x i64> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
404404
// CHECK: store volatile <8 x i64> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 64
405405
// CHECK: store volatile <16 x i64> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 128
406406
// CHECK: store volatile half 0xH0000, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 2
407407
// CHECK: store volatile <2 x half> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4
408-
// CHECK: store volatile <4 x half> <half 0xH0000, half 0xH0000, half 0xH0000, half undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
408+
// CHECK: store volatile <3 x half> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
409409
// CHECK: store volatile <4 x half> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
410410
// CHECK: store volatile <8 x half> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
411411
// CHECK: store volatile <16 x half> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
412412
// CHECK: store volatile float 0.000000e+00, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4
413413
// CHECK: store volatile <2 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
414-
// CHECK: store volatile <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
414+
// CHECK: store volatile <3 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
415415
// CHECK: store volatile <4 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
416416
// CHECK: store volatile <8 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
417417
// CHECK: store volatile <16 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 64
418418
// CHECK: store volatile double 0.000000e+00, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
419419
// CHECK: store volatile <2 x double> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
420-
// CHECK: store volatile <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
420+
// CHECK: store volatile <3 x double> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
421421
// CHECK: store volatile <4 x double> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
422422
// CHECK: store volatile <8 x double> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 64
423423
// CHECK: store volatile <16 x double> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 128

0 commit comments

Comments
 (0)