Skip to content

[Clang] Remove 3-element vector load and store special handling #104661

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions clang/include/clang/Basic/CodeGenOptions.def
Original file line number Diff line number Diff line change
Expand Up @@ -413,9 +413,6 @@ CODEGENOPT(StrictReturn, 1, 1)
/// Whether emit pseudo probes for sample pgo profile collection.
CODEGENOPT(PseudoProbeForProfiling, 1, 0)

/// Whether 3-component vector type is preserved.
CODEGENOPT(PreserveVec3Type, 1, 0)

CODEGENOPT(NoPLT, 1, 0)

/// Whether to emit all vtables
Expand Down
2 changes: 2 additions & 0 deletions clang/include/clang/Basic/LangOptions.def
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,8 @@ BENIGN_LANGOPT(CheckConstexprFunctionBodies, 1, 1,

LANGOPT(BoundsSafety, 1, 0, "Bounds safety extension for C")

LANGOPT(PreserveVec3Type, 1, 0, "Preserve 3-component vector type")

#undef LANGOPT
#undef COMPATIBLE_LANGOPT
#undef BENIGN_LANGOPT
Expand Down
4 changes: 0 additions & 4 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -8240,10 +8240,6 @@ def fhlsl_strict_availability : Flag<["-"], "fhlsl-strict-availability">,
Group<hlsl_Group>,
MarshallingInfoFlag<LangOpts<"HLSLStrictAvailability">>;

def fpreserve_vec3_type : Flag<["-"], "fpreserve-vec3-type">,
HelpText<"Preserve 3-component vector type">,
MarshallingInfoFlag<CodeGenOpts<"PreserveVec3Type">>,
ImpliedByAnyOf<[hlsl.KeyPath]>;
def fwchar_type_EQ : Joined<["-"], "fwchar-type=">,
HelpText<"Select underlying type for wchar_t">,
Values<"char,short,int">,
Expand Down
2 changes: 2 additions & 0 deletions clang/lib/Basic/LangOptions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,8 @@ void LangOptions::setLangDefaults(LangOptions &Opts, Language Lang,

// OpenCL and HLSL have half keyword
Opts.Half = Opts.OpenCL || Opts.HLSL;

Opts.PreserveVec3Type = Opts.HLSL;
}

FPOptions FPOptions::defaultWithoutTrailingStorage(const LangOptions &LO) {
Expand Down
8 changes: 8 additions & 0 deletions clang/lib/CodeGen/ABIInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,14 @@ void ABIInfo::appendAttributeMangling(StringRef AttrStr,
}
}

llvm::FixedVectorType *
ABIInfo::getOptimalVectorMemoryType(llvm::FixedVectorType *T,
const LangOptions &Opt) const {
if (T->getNumElements() == 3 && !Opt.PreserveVec3Type)
return llvm::FixedVectorType::get(T->getElementType(), 4);
return T;
}

// Pin the vtable to this file.
SwiftABIInfo::~SwiftABIInfo() = default;

Expand Down
8 changes: 8 additions & 0 deletions clang/lib/CodeGen/ABIInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class Value;
class LLVMContext;
class DataLayout;
class Type;
class FixedVectorType;
} // namespace llvm

namespace clang {
Expand Down Expand Up @@ -123,6 +124,13 @@ class ABIInfo {
raw_ostream &Out) const;
virtual void appendAttributeMangling(StringRef AttrStr,
raw_ostream &Out) const;

/// Returns the optimal vector memory type based on the given vector type. For
/// example, on certain targets, a vector with 3 elements might be promoted to
/// one with 4 elements to improve performance.
virtual llvm::FixedVectorType *
getOptimalVectorMemoryType(llvm::FixedVectorType *T,
const LangOptions &Opt) const;
};

/// Target specific hooks for defining how a type should be passed or returned
Expand Down
49 changes: 24 additions & 25 deletions clang/lib/CodeGen/CGExpr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2002,20 +2002,19 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(Address Addr, bool Volatile,
return EmitFromMemory(V, Ty);
}

// Handle vectors of size 3 like size 4 for better performance.
const llvm::Type *EltTy = Addr.getElementType();
const auto *VTy = cast<llvm::FixedVectorType>(EltTy);

if (!CGM.getCodeGenOpts().PreserveVec3Type && VTy->getNumElements() == 3) {

llvm::VectorType *vec4Ty =
llvm::FixedVectorType::get(VTy->getElementType(), 4);
Address Cast = Addr.withElementType(vec4Ty);
// Now load value.
llvm::Value *V = Builder.CreateLoad(Cast, Volatile, "loadVec4");

// Shuffle vector to get vec3.
V = Builder.CreateShuffleVector(V, ArrayRef<int>{0, 1, 2}, "extractVec");
// Handles vectors of sizes that are likely to be expanded to a larger size
// to optimize performance.
auto *VTy = cast<llvm::FixedVectorType>(Addr.getElementType());
auto *NewVecTy =
CGM.getABIInfo().getOptimalVectorMemoryType(VTy, getLangOpts());

if (VTy != NewVecTy) {
Address Cast = Addr.withElementType(NewVecTy);
llvm::Value *V = Builder.CreateLoad(Cast, Volatile, "loadVecN");
unsigned OldNumElements = VTy->getNumElements();
SmallVector<int, 16> Mask(OldNumElements);
std::iota(Mask.begin(), Mask.end(), 0);
V = Builder.CreateShuffleVector(V, Mask, "extractVec");
return EmitFromMemory(V, Ty);
}
}
Expand Down Expand Up @@ -2145,21 +2144,21 @@ void CodeGenFunction::EmitStoreOfScalar(llvm::Value *Value, Address Addr,
Addr = Addr.withPointer(Builder.CreateThreadLocalAddress(GV),
NotKnownNonNull);

// Handles vectors of sizes that are likely to be expanded to a larger size
// to optimize performance.
llvm::Type *SrcTy = Value->getType();
if (const auto *ClangVecTy = Ty->getAs<VectorType>()) {
auto *VecTy = dyn_cast<llvm::FixedVectorType>(SrcTy);
if (!CGM.getCodeGenOpts().PreserveVec3Type) {
// Handle vec3 special.
if (VecTy && !ClangVecTy->isExtVectorBoolType() &&
cast<llvm::FixedVectorType>(VecTy)->getNumElements() == 3) {
// Our source is a vec3, do a shuffle vector to make it a vec4.
Value = Builder.CreateShuffleVector(Value, ArrayRef<int>{0, 1, 2, -1},
"extractVec");
SrcTy = llvm::FixedVectorType::get(VecTy->getElementType(), 4);
if (auto *VecTy = dyn_cast<llvm::FixedVectorType>(SrcTy)) {
auto *NewVecTy =
CGM.getABIInfo().getOptimalVectorMemoryType(VecTy, getLangOpts());
if (!ClangVecTy->isExtVectorBoolType() && VecTy != NewVecTy) {
SmallVector<int, 16> Mask(NewVecTy->getNumElements(), -1);
std::iota(Mask.begin(), Mask.begin() + VecTy->getNumElements(), 0);
Value = Builder.CreateShuffleVector(Value, Mask, "extractVec");
SrcTy = NewVecTy;
}
if (Addr.getElementType() != SrcTy) {
if (Addr.getElementType() != SrcTy)
Addr = Addr.withElementType(SrcTy);
}
}
}

Expand Down
11 changes: 11 additions & 0 deletions clang/lib/CodeGen/Targets/AMDGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,17 @@ class AMDGPUABIInfo final : public DefaultABIInfo {
void computeInfo(CGFunctionInfo &FI) const override;
RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
AggValueSlot Slot) const override;

llvm::FixedVectorType *
getOptimalVectorMemoryType(llvm::FixedVectorType *T,
const LangOptions &Opt) const override {
// We have legal instructions for 96-bit so 3x32 can be supported.
// FIXME: This check should be a subtarget feature as technically SI doesn't
// support it.
if (T->getNumElements() == 3 && getDataLayout().getTypeSizeInBits(T) == 96)
return T;
return DefaultABIInfo::getOptimalVectorMemoryType(T, Opt);
}
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
Expand Down
32 changes: 16 additions & 16 deletions clang/test/CodeGenCXX/matrix-vector-bit-int.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@ using i512x3x3 = _BitInt(512) __attribute__((matrix_type(3, 3)));
// CHECK-NEXT: [[A:%.*]] = alloca <3 x i8>, align 4
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i8>, align 4
// CHECK-NEXT: store i32 [[A_COERCE]], ptr [[A]], align 4
// CHECK-NEXT: [[LOADVEC4:%.*]] = load <4 x i8>, ptr [[A]], align 4
// CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x i8> [[LOADVEC4]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i8>, ptr [[A]], align 4
// CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x i8> [[LOADVECN]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[A1]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[LOADVEC42:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i8> [[LOADVEC42]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[LOADVEC44:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[EXTRACTVEC5:%.*]] = shufflevector <4 x i8> [[LOADVEC44]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[LOADVECN2:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i8> [[LOADVECN2]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[LOADVECN4:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4
// CHECK-NEXT: [[EXTRACTVEC5:%.*]] = shufflevector <4 x i8> [[LOADVECN4]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[ADD:%.*]] = add <3 x i8> [[EXTRACTVEC3]], [[EXTRACTVEC5]]
// CHECK-NEXT: store <3 x i8> [[ADD]], ptr [[RETVAL]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[RETVAL]], align 4
Expand All @@ -38,10 +38,10 @@ i8x3 v1(i8x3 a) {
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i32>, align 16
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i32> [[A]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x i32> [[EXTRACTVEC]], ptr [[A_ADDR]], align 16
// CHECK-NEXT: [[LOADVEC4:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16
// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x i32> [[LOADVEC4]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[LOADVEC42:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16
// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i32> [[LOADVEC42]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16
// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x i32> [[LOADVECN]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[LOADVECN2:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16
// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i32> [[LOADVECN2]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[ADD:%.*]] = add <3 x i32> [[EXTRACTVEC1]], [[EXTRACTVEC3]]
// CHECK-NEXT: ret <3 x i32> [[ADD]]
//
Expand All @@ -53,14 +53,14 @@ i32x3 v2(i32x3 a) {
// CHECK-SAME: ptr noundef byval(<3 x i512>) align 256 [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i512>, align 256
// CHECK-NEXT: [[LOADVEC4:%.*]] = load <4 x i512>, ptr [[TMP0]], align 256
// CHECK-NEXT: [[A:%.*]] = shufflevector <4 x i512> [[LOADVEC4]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i512>, ptr [[TMP0]], align 256
// CHECK-NEXT: [[A:%.*]] = shufflevector <4 x i512> [[LOADVECN]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i512> [[A]], <3 x i512> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x i512> [[EXTRACTVEC]], ptr [[A_ADDR]], align 256
// CHECK-NEXT: [[LOADVEC41:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256
// CHECK-NEXT: [[EXTRACTVEC2:%.*]] = shufflevector <4 x i512> [[LOADVEC41]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[LOADVEC43:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256
// CHECK-NEXT: [[EXTRACTVEC4:%.*]] = shufflevector <4 x i512> [[LOADVEC43]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[LOADVECN1:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256
// CHECK-NEXT: [[EXTRACTVEC2:%.*]] = shufflevector <4 x i512> [[LOADVECN1]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[LOADVECN3:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256
// CHECK-NEXT: [[EXTRACTVEC4:%.*]] = shufflevector <4 x i512> [[LOADVECN3]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2>
// CHECK-NEXT: [[ADD:%.*]] = add <3 x i512> [[EXTRACTVEC2]], [[EXTRACTVEC4]]
// CHECK-NEXT: ret <3 x i512> [[ADD]]
//
Expand Down
8 changes: 4 additions & 4 deletions clang/test/CodeGenOpenCL/amdgpu-alignment.cl
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ typedef double __attribute__((ext_vector_type(16))) double16;
// CHECK: store volatile <16 x i16> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16i16, align 32
// CHECK: store volatile i32 0, ptr addrspace(3) @local_memory_alignment_global.lds_i32, align 4
// CHECK: store volatile <2 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2i32, align 8
// CHECK: store volatile <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3i32, align 16
// CHECK: store volatile <3 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3i32, align 16
// CHECK: store volatile <4 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4i32, align 16
// CHECK: store volatile <8 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8i32, align 32
// CHECK: store volatile <16 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16i32, align 64
Expand All @@ -124,7 +124,7 @@ typedef double __attribute__((ext_vector_type(16))) double16;
// CHECK: store volatile <16 x half> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16f16, align 32
// CHECK: store volatile float 0.000000e+00, ptr addrspace(3) @local_memory_alignment_global.lds_f32, align 4
// CHECK: store volatile <2 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2f32, align 8
// CHECK: store volatile <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3f32, align 16
// CHECK: store volatile <3 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3f32, align 16
// CHECK: store volatile <4 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4f32, align 16
// CHECK: store volatile <8 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8f32, align 32
// CHECK: store volatile <16 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16f32, align 64
Expand Down Expand Up @@ -393,7 +393,7 @@ kernel void local_memory_alignment_arg(
// CHECK: store volatile <16 x i16> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
// CHECK: store volatile i32 0, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4
// CHECK: store volatile <2 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
// CHECK: store volatile <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
// CHECK: store volatile <3 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
// CHECK: store volatile <4 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
// CHECK: store volatile <8 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
// CHECK: store volatile <16 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 64
Expand All @@ -411,7 +411,7 @@ kernel void local_memory_alignment_arg(
// CHECK: store volatile <16 x half> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
// CHECK: store volatile float 0.000000e+00, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4
// CHECK: store volatile <2 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8
// CHECK: store volatile <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
// CHECK: store volatile <3 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
// CHECK: store volatile <4 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16
// CHECK: store volatile <8 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32
// CHECK: store volatile <16 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 64
Expand Down
Loading
Loading