From c74896f34d85b5dd888620ecd54153627d62bd16 Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Fri, 1 Sep 2023 10:26:05 +0000
Subject: [PATCH] [AArch64][SME] Create new interface for isSVEAvailable.

When a function is compiled to be in Streaming(-compatible) mode, the full
set of SVE instructions may not be available. This patch adds an interface
to query that and changes the codegen for FADDA (not legal in Streaming-SVE
mode) to instead be expanded for fixed-length vectors, or otherwise not to
code-generate for scalable vectors.

Reviewed By: david-arm

Differential Revision: https://reviews.llvm.org/D156109
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  15 +-
 llvm/lib/Target/AArch64/AArch64Subtarget.cpp  |  18 +--
 llvm/lib/Target/AArch64/AArch64Subtarget.h    |  18 ++-
 .../AArch64/AArch64TargetTransformInfo.cpp    |   3 +-
 .../CodeGen/AArch64/sve-fp-reduce-fadda.ll    |   4 +-
 ...e-streaming-mode-fixed-length-fp-reduce.ll | 129 ++++++++++++------
 6 files changed, 127 insertions(+), 60 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index eff0722e1c77..f042f4f9df5d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1453,7 +1453,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
       setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
       setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
-      setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
+      if (Subtarget->isSVEAvailable())
+        setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
       setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
       setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
       setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
@@ -1507,9 +1508,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::MUL, MVT::v1i64, Custom);
     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
 
-    // NEON doesn't support across-vector reductions, but SVE does.
-    for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
-      setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
+    if (Subtarget->isSVEAvailable()) {
+      // NEON doesn't support across-vector reductions, but SVE does.
+      for (auto VT :
+           {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
+        setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
+    }
 
     if (!Subtarget->isNeonAvailable()) {
       setTruncStoreAction(MVT::v2f32, MVT::v2f16, Custom);
@@ -1867,7 +1871,8 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT,
   setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
   setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
   setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
-  setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
+  setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT,
+                     StreamingSVE ? Expand : Custom);
   setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
   setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
   setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 1aff7e30a0cf..3d2e9304746a 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -484,14 +484,16 @@ void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
 
 bool AArch64Subtarget::useAA() const { return UseAA; }
 
-bool AArch64Subtarget::isNeonAvailable() const {
-  if (!hasNEON())
-    return false;
+bool AArch64Subtarget::isStreamingCompatible() const {
+  return StreamingCompatibleSVEMode || ForceStreamingCompatibleSVE;
+}
 
-  // The 'force-streaming-comaptible-sve' flag overrides the streaming
-  // function attributes.
-  if (ForceStreamingCompatibleSVE.getNumOccurrences() > 0)
-    return !ForceStreamingCompatibleSVE;
+bool AArch64Subtarget::isNeonAvailable() const {
+  return hasNEON() && !isStreaming() && !isStreamingCompatible();
+}
 
-  return !isStreaming() && !isStreamingCompatible();
+bool AArch64Subtarget::isSVEAvailable() const {
+  // FIXME: Also return false if FEAT_FA64 is set, but we can't do this yet
+  // as we don't yet support the feature in LLVM.
+  return hasSVE() && !isStreaming() && !isStreamingCompatible();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 8a1cebe96894..47058b5f4578 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -205,20 +205,28 @@ public:
 
   bool isXRaySupported() const override { return true; }
 
-  /// Returns true if the function has the streaming attribute.
+  /// Returns true if the function has a streaming body.
   bool isStreaming() const { return StreamingSVEMode; }
 
-  /// Returns true if the function has the streaming-compatible attribute.
-  bool isStreamingCompatible() const { return StreamingCompatibleSVEMode; }
+  /// Returns true if the function has a streaming-compatible body.
+  bool isStreamingCompatible() const;
 
   /// Returns true if the target has NEON and the function at runtime is known
   /// to have NEON enabled (e.g. the function is known not to be in streaming-SVE
   /// mode, which disables NEON instructions).
   bool isNeonAvailable() const;
 
+  /// Returns true if the target has SVE and can use the full range of SVE
+  /// instructions, for example because it knows the function is known not to be
+  /// in streaming-SVE mode or when the target has FEAT_FA64 enabled.
+  bool isSVEAvailable() const;
+
   unsigned getMinVectorRegisterBitWidth() const {
-    // Don't assume any minimum vector size when PSTATE.SM may not be 0.
-    if (StreamingSVEMode || StreamingCompatibleSVEMode)
+    // Don't assume any minimum vector size when PSTATE.SM may not be 0, because
+    // we don't yet support streaming-compatible codegen support that we trust
+    // is safe for functions that may be executed in streaming-SVE mode.
+    // By returning '0' here, we disable vectorization.
+    if (!isSVEAvailable() && !isNeonAvailable())
       return 0;
     return MinVectorRegisterBitWidth;
   }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index bee9ec4c7132..c60234fd85b5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1967,8 +1967,7 @@ AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
 
     return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
   case TargetTransformInfo::RGK_ScalableVector:
-    if ((ST->isStreaming() || ST->isStreamingCompatible()) &&
-        !EnableScalableAutovecInStreamingMode)
+    if (!ST->isSVEAvailable() && !EnableScalableAutovecInStreamingMode)
       return TypeSize::getScalable(0);
 
     return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll
index 259f457d3ad2..ffb94716ca51 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve < %s | FileCheck %s
 
-; FIXME: Streaming-compatible SVE doesn't include FADDA, so this shouldn't compile!
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; Streaming-compatible SVE doesn't include FADDA, so this shouldn't compile!
+; RUN: not --crash llc -mattr=+sve -force-streaming-compatible-sve < %s
 
 target triple = "aarch64-linux-gnu"
 
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
index 8675477a7d60..3689fc3a6ee6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
@@ -10,11 +10,14 @@ target triple = "aarch64-unknown-linux-gnu"
 define half @fadda_v4f16(half %start, <4 x half> %a) {
 ; CHECK-LABEL: fadda_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
-; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT:    fadda h0, p0, h0, z1.h
-; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    fadd h0, h0, h1
+; CHECK-NEXT:    mov z2.h, z1.h[1]
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    mov z2.h, z1.h[2]
+; CHECK-NEXT:    mov z1.h, z1.h[3]
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
   ret half %res
@@ -23,11 +26,22 @@ define half @fadda_v4f16(half %start, <4 x half> %a) {
 define half @fadda_v8f16(half %start, <8 x half> %a) {
 ; CHECK-LABEL: fadda_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
-; CHECK-NEXT:    ptrue p0.h, vl8
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
-; CHECK-NEXT:    fadda h0, p0, h0, z1.h
-; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    fadd h0, h0, h1
+; CHECK-NEXT:    mov z2.h, z1.h[1]
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    mov z2.h, z1.h[2]
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    mov z2.h, z1.h[3]
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    mov z2.h, z1.h[4]
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    mov z2.h, z1.h[5]
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    mov z2.h, z1.h[6]
+; CHECK-NEXT:    mov z1.h, z1.h[7]
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
   ret half %res
@@ -36,12 +50,38 @@ define half @fadda_v8f16(half %start, <8 x half> %a) {
 define half @fadda_v16f16(half %start, ptr %a) {
 ; CHECK-LABEL: fadda_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
-; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    fadda h0, p0, h0, z1.h
-; CHECK-NEXT:    fadda h0, p0, h0, z2.h
-; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    fadd h0, h0, h1
+; CHECK-NEXT:    mov z2.h, z1.h[1]
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    mov z2.h, z1.h[2]
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    mov z2.h, z1.h[3]
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    mov z2.h, z1.h[4]
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    mov z2.h, z1.h[5]
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    mov z2.h, z1.h[6]
+; CHECK-NEXT:    mov z1.h, z1.h[7]
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    fadd h0, h0, h1
+; CHECK-NEXT:    ldr q1, [x0, #16]
+; CHECK-NEXT:    mov z2.h, z1.h[1]
+; CHECK-NEXT:    fadd h0, h0, h1
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    mov z2.h, z1.h[2]
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    mov z2.h, z1.h[3]
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    mov z2.h, z1.h[4]
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    mov z2.h, z1.h[5]
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    mov z2.h, z1.h[6]
+; CHECK-NEXT:    mov z1.h, z1.h[7]
+; CHECK-NEXT:    fadd h0, h0, h2
+; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
   %op = load <16 x half>, ptr %a
   %res = call half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
@@ -51,11 +91,10 @@ define half @fadda_v16f16(half %start, ptr %a) {
 define float @fadda_v2f32(float %start, <2 x float> %a) {
 ; CHECK-LABEL: fadda_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
-; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT:    fadda s0, p0, s0, z1.s
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    mov z1.s, z1.s[1]
+; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
   %res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
   ret float %res
@@ -64,11 +103,14 @@ define float @fadda_v2f32(float %start, <2 x float> %a) {
 define float @fadda_v4f32(float %start, <4 x float> %a) {
 ; CHECK-LABEL: fadda_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
-; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
-; CHECK-NEXT:    fadda s0, p0, s0, z1.s
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    mov z2.s, z1.s[1]
+; CHECK-NEXT:    fadd s0, s0, s2
+; CHECK-NEXT:    mov z2.s, z1.s[2]
+; CHECK-NEXT:    mov z1.s, z1.s[3]
+; CHECK-NEXT:    fadd s0, s0, s2
+; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
   %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
   ret float %res
@@ -77,12 +119,22 @@ define float @fadda_v4f32(float %start, <4 x float> %a) {
 define float @fadda_v8f32(float %start, ptr %a) {
 ; CHECK-LABEL: fadda_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
-; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    fadda s0, p0, s0, z1.s
-; CHECK-NEXT:    fadda s0, p0, s0, z2.s
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    mov z2.s, z1.s[1]
+; CHECK-NEXT:    fadd s0, s0, s2
+; CHECK-NEXT:    mov z2.s, z1.s[2]
+; CHECK-NEXT:    mov z1.s, z1.s[3]
+; CHECK-NEXT:    fadd s0, s0, s2
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    ldr q1, [x0, #16]
+; CHECK-NEXT:    mov z2.s, z1.s[1]
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    fadd s0, s0, s2
+; CHECK-NEXT:    mov z2.s, z1.s[2]
+; CHECK-NEXT:    mov z1.s, z1.s[3]
+; CHECK-NEXT:    fadd s0, s0, s2
+; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
   %op = load <8 x float>, ptr %a
   %res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
@@ -102,11 +154,10 @@ define double @fadda_v1f64(double %start, <1 x double> %a) {
 define double @fadda_v2f64(double %start, <2 x double> %a) {
 ; CHECK-LABEL: fadda_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
-; CHECK-NEXT:    fadda d0, p0, d0, z1.d
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    fadd d0, d0, d1
+; CHECK-NEXT:    mov z1.d, z1.d[1]
+; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
   %res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
   ret double %res
@@ -115,12 +166,14 @@ define double @fadda_v2f64(double %start, <2 x double> %a) {
 define double @fadda_v4f64(double %start, ptr %a) {
 ; CHECK-LABEL: fadda_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    fadda d0, p0, d0, z1.d
-; CHECK-NEXT:    fadda d0, p0, d0, z2.d
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    fadd d0, d0, d1
+; CHECK-NEXT:    mov z1.d, z1.d[1]
+; CHECK-NEXT:    fadd d0, d0, d1
+; CHECK-NEXT:    ldr q1, [x0, #16]
+; CHECK-NEXT:    fadd d0, d0, d1
+; CHECK-NEXT:    mov z1.d, z1.d[1]
+; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
   %op = load <4 x double>, ptr %a
   %res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
-- 
Gitee
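
Editor's note, for illustration only (not part of the patch): a minimal IR sketch of the two behaviours the commit message describes, intended to be fed to the same llc invocations as the RUN lines in the tests above (llc -mattr=+sve -force-streaming-compatible-sve). The function names below are made up for this example.

; Fixed-length reduction: with -force-streaming-compatible-sve the
; VECREDUCE_SEQ_FADD node is now marked Expand, so it lowers to the chain of
; scalar fadds seen in the updated CHECK lines instead of selecting FADDA.
;
; Scalable reduction: the node is no longer custom-lowered in this mode, so
; llc is expected to fail on it (hence `not --crash` in the updated RUN line
; of sve-fp-reduce-fadda.ll).
target triple = "aarch64-unknown-linux-gnu"

define half @demo_fixed(half %start, <4 x half> %a) {
  %r = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
  ret half %r
}

define half @demo_scalable(half %start, <vscale x 8 x half> %a) {
  %r = call half @llvm.vector.reduce.fadd.nxv8f16(half %start, <vscale x 8 x half> %a)
  ret half %r
}

declare half @llvm.vector.reduce.fadd.v4f16(half, <4 x half>)
declare half @llvm.vector.reduce.fadd.nxv8f16(half, <vscale x 8 x half>)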