Skip to content

Commit e4759ca

Browse files
committed
[Clang] Remove 3-element vector load and store special handling
Clang has long special-cased 3-element vector loads and stores: they are performed as 4-element operations, and a shufflevector is then used to extract the elements that are actually used. Odd-sized vector codegen should now work reasonably well, so this patch removes that special handling, as well as the compiler argument `-fpreserve-vec3-type`.
1 parent a15fedc commit e4759ca

File tree

11 files changed

+70
-129
lines changed

11 files changed

+70
-129
lines changed

clang/include/clang/Basic/CodeGenOptions.def

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -413,9 +413,6 @@ CODEGENOPT(StrictReturn, 1, 1)
413413
/// Whether emit pseudo probes for sample pgo profile collection.
414414
CODEGENOPT(PseudoProbeForProfiling, 1, 0)
415415

416-
/// Whether 3-component vector type is preserved.
417-
CODEGENOPT(PreserveVec3Type, 1, 0)
418-
419416
CODEGENOPT(NoPLT, 1, 0)
420417

421418
/// Whether to emit all vtables

clang/include/clang/Basic/LangOptions.def

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,8 @@ BENIGN_LANGOPT(CheckConstexprFunctionBodies, 1, 1,
532532

533533
LANGOPT(BoundsSafety, 1, 0, "Bounds safety extension for C")
534534

535+
LANGOPT(PreserveVec3Type, 1, 0, "Preserve 3-component vector type")
536+
535537
#undef LANGOPT
536538
#undef COMPATIBLE_LANGOPT
537539
#undef BENIGN_LANGOPT

clang/include/clang/Driver/Options.td

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8223,10 +8223,6 @@ def fhlsl_strict_availability : Flag<["-"], "fhlsl-strict-availability">,
82238223
Group<hlsl_Group>,
82248224
MarshallingInfoFlag<LangOpts<"HLSLStrictAvailability">>;
82258225

8226-
def fpreserve_vec3_type : Flag<["-"], "fpreserve-vec3-type">,
8227-
HelpText<"Preserve 3-component vector type">,
8228-
MarshallingInfoFlag<CodeGenOpts<"PreserveVec3Type">>,
8229-
ImpliedByAnyOf<[hlsl.KeyPath]>;
82308226
def fwchar_type_EQ : Joined<["-"], "fwchar-type=">,
82318227
HelpText<"Select underlying type for wchar_t">,
82328228
Values<"char,short,int">,

clang/lib/Basic/LangOptions.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,8 @@ void LangOptions::setLangDefaults(LangOptions &Opts, Language Lang,
208208

209209
// OpenCL and HLSL have half keyword
210210
Opts.Half = Opts.OpenCL || Opts.HLSL;
211+
212+
Opts.PreserveVec3Type = Opts.HLSL;
211213
}
212214

213215
FPOptions FPOptions::defaultWithoutTrailingStorage(const LangOptions &LO) {

clang/lib/CodeGen/ABIInfo.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,14 @@ void ABIInfo::appendAttributeMangling(StringRef AttrStr,
236236
}
237237
}
238238

239+
llvm::FixedVectorType *
240+
ABIInfo::getOptimalVectorType(llvm::FixedVectorType *T,
241+
const LangOptions &Opt) const {
242+
if (T->getNumElements() == 3 && !Opt.PreserveVec3Type)
243+
return llvm::FixedVectorType::get(T->getElementType(), 4);
244+
return T;
245+
}
246+
239247
// Pin the vtable to this file.
240248
SwiftABIInfo::~SwiftABIInfo() = default;
241249

clang/lib/CodeGen/ABIInfo.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class Value;
2020
class LLVMContext;
2121
class DataLayout;
2222
class Type;
23+
class FixedVectorType;
2324
} // namespace llvm
2425

2526
namespace clang {
@@ -123,6 +124,12 @@ class ABIInfo {
123124
raw_ostream &Out) const;
124125
virtual void appendAttributeMangling(StringRef AttrStr,
125126
raw_ostream &Out) const;
127+
128+
/// Returns the optimal vector type based on the given vector type. For
129+
/// example, on certain targets, a vector with 3 elements might be promoted to
130+
/// one with 4 elements to improve performance.
131+
virtual llvm::FixedVectorType *
132+
getOptimalVectorType(llvm::FixedVectorType *T, const LangOptions &Opt) const;
126133
};
127134

128135
/// Target specific hooks for defining how a type should be passed or returned

clang/lib/CodeGen/CGExpr.cpp

Lines changed: 23 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1998,20 +1998,18 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(Address Addr, bool Volatile,
19981998
return EmitFromMemory(V, Ty);
19991999
}
20002000

2001-
// Handle vectors of size 3 like size 4 for better performance.
2002-
const llvm::Type *EltTy = Addr.getElementType();
2003-
const auto *VTy = cast<llvm::FixedVectorType>(EltTy);
2004-
2005-
if (!CGM.getCodeGenOpts().PreserveVec3Type && VTy->getNumElements() == 3) {
2006-
2007-
llvm::VectorType *vec4Ty =
2008-
llvm::FixedVectorType::get(VTy->getElementType(), 4);
2009-
Address Cast = Addr.withElementType(vec4Ty);
2010-
// Now load value.
2011-
llvm::Value *V = Builder.CreateLoad(Cast, Volatile, "loadVec4");
2012-
2013-
// Shuffle vector to get vec3.
2014-
V = Builder.CreateShuffleVector(V, ArrayRef<int>{0, 1, 2}, "extractVec");
2001+
// Handles vectors of sizes that are likely to be expanded to a larger size
2002+
// to optimize performance.
2003+
auto *VTy = cast<llvm::FixedVectorType>(Addr.getElementType());
2004+
auto *NewVecTy = CGM.getABIInfo().getOptimalVectorType(VTy, getLangOpts());
2005+
2006+
if (VTy != NewVecTy) {
2007+
Address Cast = Addr.withElementType(NewVecTy);
2008+
llvm::Value *V = Builder.CreateLoad(Cast, Volatile, "loadVecN");
2009+
unsigned OldNumElements = VTy->getNumElements();
2010+
SmallVector<int, 4> Mask(OldNumElements);
2011+
std::iota(Mask.begin(), Mask.end(), 0);
2012+
V = Builder.CreateShuffleVector(V, Mask, "extractVec");
20152013
return EmitFromMemory(V, Ty);
20162014
}
20172015
}
@@ -2141,21 +2139,21 @@ void CodeGenFunction::EmitStoreOfScalar(llvm::Value *Value, Address Addr,
21412139
Addr = Addr.withPointer(Builder.CreateThreadLocalAddress(GV),
21422140
NotKnownNonNull);
21432141

2142+
// Handles vectors of sizes that are likely to be expanded to a larger size
2143+
// to optimize performance.
21442144
llvm::Type *SrcTy = Value->getType();
21452145
if (const auto *ClangVecTy = Ty->getAs<VectorType>()) {
2146-
auto *VecTy = dyn_cast<llvm::FixedVectorType>(SrcTy);
2147-
if (!CGM.getCodeGenOpts().PreserveVec3Type) {
2148-
// Handle vec3 special.
2149-
if (VecTy && !ClangVecTy->isExtVectorBoolType() &&
2150-
cast<llvm::FixedVectorType>(VecTy)->getNumElements() == 3) {
2151-
// Our source is a vec3, do a shuffle vector to make it a vec4.
2152-
Value = Builder.CreateShuffleVector(Value, ArrayRef<int>{0, 1, 2, -1},
2153-
"extractVec");
2154-
SrcTy = llvm::FixedVectorType::get(VecTy->getElementType(), 4);
2146+
if (auto *VecTy = dyn_cast<llvm::FixedVectorType>(SrcTy)) {
2147+
auto *NewVecTy =
2148+
CGM.getABIInfo().getOptimalVectorType(VecTy, getLangOpts());
2149+
if (!ClangVecTy->isExtVectorBoolType() && VecTy != NewVecTy) {
2150+
SmallVector<int, 4> Mask(NewVecTy->getNumElements(), -1);
2151+
std::iota(Mask.begin(), Mask.begin() + VecTy->getNumElements(), 0);
2152+
Value = Builder.CreateShuffleVector(Value, Mask, "extractVec");
2153+
SrcTy = NewVecTy;
21552154
}
2156-
if (Addr.getElementType() != SrcTy) {
2155+
if (Addr.getElementType() != SrcTy)
21572156
Addr = Addr.withElementType(SrcTy);
2158-
}
21592157
}
21602158
}
21612159

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,14 @@ class AMDGPUABIInfo final : public DefaultABIInfo {
5252
void computeInfo(CGFunctionInfo &FI) const override;
5353
RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
5454
AggValueSlot Slot) const override;
55+
56+
llvm::FixedVectorType *
57+
getOptimalVectorType(llvm::FixedVectorType *T,
58+
const LangOptions &Opt) const override {
59+
if (T->getNumElements() == 3 && T->getScalarSizeInBits() == 32)
60+
return T;
61+
return DefaultABIInfo::getOptimalVectorType(T, Opt);
62+
}
5563
};
5664

5765
bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {

clang/test/CodeGenCXX/matrix-vector-bit-int.cpp

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,14 @@ using i512x3x3 = _BitInt(512) __attribute__((matrix_type(3, 3)));
1515
// CHECK-NEXT: [[A:%.*]] = alloca <3 x i8>, align 4
1616
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i8>, align 4
1717
// CHECK-NEXT: store i32 [[A_COERCE]], ptr [[A]], align 4
18-
// CHECK-NEXT: [[LOADVEC4:%.*]] = load <4 x i8>, ptr [[A]], align 4
19-
// CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x i8> [[LOADVEC4]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
18+
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i8>, ptr [[A]], align 4
19+
// CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x i8> [[LOADVECN]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
2020
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[A1]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
2121
// CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr [[A_ADDR]], align 4
22-
// CHECK-NEXT: [[LOADVEC42:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4
23-
// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i8> [[LOADVEC42]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
24-
// CHECK-NEXT: [[LOADVEC44:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4
25-
// CHECK-NEXT: [[EXTRACTVEC5:%.*]] = shufflevector <4 x i8> [[LOADVEC44]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
22+
// CHECK-NEXT: [[LOADVECN2:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4
23+
// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i8> [[LOADVECN2]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
24+
// CHECK-NEXT: [[LOADVECN4:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4
25+
// CHECK-NEXT: [[EXTRACTVEC5:%.*]] = shufflevector <4 x i8> [[LOADVECN4]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
2626
// CHECK-NEXT: [[ADD:%.*]] = add <3 x i8> [[EXTRACTVEC3]], [[EXTRACTVEC5]]
2727
// CHECK-NEXT: store <3 x i8> [[ADD]], ptr [[RETVAL]], align 4
2828
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[RETVAL]], align 4
@@ -38,10 +38,10 @@ i8x3 v1(i8x3 a) {
3838
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i32>, align 16
3939
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i32> [[A]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
4040
// CHECK-NEXT: store <4 x i32> [[EXTRACTVEC]], ptr [[A_ADDR]], align 16
41-
// CHECK-NEXT: [[LOADVEC4:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16
42-
// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x i32> [[LOADVEC4]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
43-
// CHECK-NEXT: [[LOADVEC42:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16
44-
// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i32> [[LOADVEC42]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
41+
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16
42+
// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x i32> [[LOADVECN]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
43+
// CHECK-NEXT: [[LOADVECN2:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16
44+
// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i32> [[LOADVECN2]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
4545
// CHECK-NEXT: [[ADD:%.*]] = add <3 x i32> [[EXTRACTVEC1]], [[EXTRACTVEC3]]
4646
// CHECK-NEXT: ret <3 x i32> [[ADD]]
4747
//
@@ -53,14 +53,14 @@ i32x3 v2(i32x3 a) {
5353
// CHECK-SAME: ptr noundef byval(<3 x i512>) align 256 [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
5454
// CHECK-NEXT: [[ENTRY:.*:]]
5555
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i512>, align 256
56-
// CHECK-NEXT: [[LOADVEC4:%.*]] = load <4 x i512>, ptr [[TMP0]], align 256
57-
// CHECK-NEXT: [[A:%.*]] = shufflevector <4 x i512> [[LOADVEC4]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
56+
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i512>, ptr [[TMP0]], align 256
57+
// CHECK-NEXT: [[A:%.*]] = shufflevector <4 x i512> [[LOADVECN]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
5858
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i512> [[A]], <3 x i512> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
5959
// CHECK-NEXT: store <4 x i512> [[EXTRACTVEC]], ptr [[A_ADDR]], align 256
60-
// CHECK-NEXT: [[LOADVEC41:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256
61-
// CHECK-NEXT: [[EXTRACTVEC2:%.*]] = shufflevector <4 x i512> [[LOADVEC41]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
62-
// CHECK-NEXT: [[LOADVEC43:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256
63-
// CHECK-NEXT: [[EXTRACTVEC4:%.*]] = shufflevector <4 x i512> [[LOADVEC43]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
60+
// CHECK-NEXT: [[LOADVECN1:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256
61+
// CHECK-NEXT: [[EXTRACTVEC2:%.*]] = shufflevector <4 x i512> [[LOADVECN1]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
62+
// CHECK-NEXT: [[LOADVECN3:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256
63+
// CHECK-NEXT: [[EXTRACTVEC4:%.*]] = shufflevector <4 x i512> [[LOADVECN3]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
6464
// CHECK-NEXT: [[ADD:%.*]] = add <3 x i512> [[EXTRACTVEC2]], [[EXTRACTVEC4]]
6565
// CHECK-NEXT: ret <3 x i512> [[ADD]]
6666
//

clang/test/CodeGenOpenCL/amdgpu-alignment.cl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ typedef double __attribute__((ext_vector_type(16))) double16;
106106
// CHECK: store volatile <16 x i16> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16i16, align 32
107107
// CHECK: store volatile i32 0, ptr addrspace(3) @local_memory_alignment_global.lds_i32, align 4
108108
// CHECK: store volatile <2 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2i32, align 8
109-
// CHECK: store volatile <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3i32, align 16
109+
// CHECK: store volatile <3 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3i32, align 16
110110
// CHECK: store volatile <4 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4i32, align 16
111111
// CHECK: store volatile <8 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8i32, align 32
112112
// CHECK: store volatile <16 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16i32, align 64
@@ -124,7 +124,7 @@ typedef double __attribute__((ext_vector_type(16))) double16;
124124
// CHECK: store volatile <16 x half> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16f16, align 32
125125
// CHECK: store volatile float 0.000000e+00, ptr addrspace(3) @local_memory_alignment_global.lds_f32, align 4
126126
// CHECK: store volatile <2 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2f32, align 8
127-
// CHECK: store volatile <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3f32, align 16
127+
// CHECK: store volatile <3 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3f32, align 16
128128
// CHECK: store volatile <4 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4f32, align 16
129129
// CHECK: store volatile <8 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8f32, align 32
130130
// CHECK: store volatile <16 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16f32, align 64
@@ -393,7 +393,7 @@ kernel void local_memory_alignment_arg(
393393
// CHECK: store volatile <16 x i16> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
394394
// CHECK: store volatile i32 0, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4
395395
// CHECK: store volatile <2 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
396-
// CHECK: store volatile <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
396+
// CHECK: store volatile <3 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
397397
// CHECK: store volatile <4 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
398398
// CHECK: store volatile <8 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
399399
// CHECK: store volatile <16 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 64
@@ -411,7 +411,7 @@ kernel void local_memory_alignment_arg(
411411
// CHECK: store volatile <16 x half> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
412412
// CHECK: store volatile float 0.000000e+00, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4
413413
// CHECK: store volatile <2 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
414-
// CHECK: store volatile <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
414+
// CHECK: store volatile <3 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
415415
// CHECK: store volatile <4 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
416416
// CHECK: store volatile <8 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
417417
// CHECK: store volatile <16 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 64

clang/test/CodeGenOpenCL/preserve_vec3.cl

Lines changed: 0 additions & 77 deletions
This file was deleted.

0 commit comments

Comments
 (0)