RVV does not have an instruction for performing a horizontal multiply reduction (either integer or floating point). However, a user of clang can explicitly write at least the integer form via the __builtin_reduce_mul construct, and currently we just crash when compiling this. This change converts the crash into a functionally correct scalar loop that processes each element one by one at runtime. This will be slow, but at least correct. Note that, to my knowledge, we can't generate the floating-point form directly from C, but I decided to handle both for completeness while I was here. Written by Claude Code with guidance and review by me.
214 lines
7.8 KiB
C++
214 lines
7.8 KiB
C++
//===- ExpandReductions.cpp - Expand reduction intrinsics -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements IR expansion for reduction intrinsics, allowing targets
// to enable the intrinsics until just before codegen.
//
//===----------------------------------------------------------------------===//
|
#include "llvm/CodeGen/ExpandReductions.h"
|
|
#include "llvm/Analysis/LoopInfo.h"
|
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
|
#include "llvm/CodeGen/Passes.h"
|
|
#include "llvm/IR/Dominators.h"
|
|
#include "llvm/IR/IRBuilder.h"
|
|
#include "llvm/IR/InstIterator.h"
|
|
#include "llvm/IR/IntrinsicInst.h"
|
|
#include "llvm/IR/Intrinsics.h"
|
|
#include "llvm/InitializePasses.h"
|
|
#include "llvm/Pass.h"
|
|
#include "llvm/Transforms/Utils/LoopUtils.h"
|
|
|
|
using namespace llvm;
|
|
|
|
namespace {
|
|
|
|
/// Expand vector-reduction intrinsics in \p F into explicit IR: shuffle
/// sequences for power-of-two fixed-width vectors, ordered scalar chains for
/// strict FP reductions, or runtime scalar loops for scalable vectors.
/// \p DT and \p LI may be null; when provided they are passed through to the
/// loop-based expansion so it can keep them updated.
/// Returns true if any intrinsic was rewritten.
bool expandReductions(Function &F, const TargetTransformInfo *TTI,
                      DominatorTree *DT, LoopInfo *LI) {
  bool Changed = false;
  // Gather candidates first: we erase intrinsics below, so we must not
  // mutate the instruction list while iterating over it.
  SmallVector<IntrinsicInst *, 4> Worklist;
  for (auto &I : instructions(F)) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
      switch (II->getIntrinsicID()) {
      default: break;
      case Intrinsic::vector_reduce_fadd:
      case Intrinsic::vector_reduce_fmul:
      case Intrinsic::vector_reduce_add:
      case Intrinsic::vector_reduce_mul:
      case Intrinsic::vector_reduce_and:
      case Intrinsic::vector_reduce_or:
      case Intrinsic::vector_reduce_xor:
      case Intrinsic::vector_reduce_smax:
      case Intrinsic::vector_reduce_smin:
      case Intrinsic::vector_reduce_umax:
      case Intrinsic::vector_reduce_umin:
      case Intrinsic::vector_reduce_fmax:
      case Intrinsic::vector_reduce_fmin:
        // Only expand reductions the target says it cannot (or prefers not
        // to) lower natively.
        if (TTI->shouldExpandReduction(II))
          Worklist.push_back(II);

        break;
      }
    }
  }

  for (auto *II : Worklist) {
    FastMathFlags FMF =
        isa<FPMathOperator>(II) ? II->getFastMathFlags() : FastMathFlags{};
    Intrinsic::ID ID = II->getIntrinsicID();
    // RK is only meaningful for min/max reductions; for arithmetic ones the
    // helpers ignore it.
    RecurKind RK = getMinMaxReductionRecurKind(ID);
    TargetTransformInfo::ReductionShuffle RS =
        TTI->getPreferredExpandedReductionShuffle(II);

    Value *Rdx = nullptr;
    IRBuilder<> Builder(II);
    // Scope the FMF override to this expansion so we don't leak flags into
    // unrelated IR built later with the same builder settings.
    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
    Builder.setFastMathFlags(FMF);
    switch (ID) {
    default: llvm_unreachable("Unexpected intrinsic!");
    case Intrinsic::vector_reduce_fadd:
    case Intrinsic::vector_reduce_fmul: {
      // FMFs must be attached to the call, otherwise it's an ordered reduction
      // and it can't be handled by generating a shuffle sequence.
      Value *Acc = II->getArgOperand(0);
      Value *Vec = II->getArgOperand(1);
      unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
      if (isa<ScalableVectorType>(Vec->getType())) {
        // Element count is unknown at compile time, so no shuffle tree is
        // possible: emit a runtime loop seeded with the start value Acc.
        Rdx = expandReductionViaLoop(Builder, Vec, RdxOpcode, Acc, DT, LI);
        break;
      }
      if (!FMF.allowReassoc())
        Rdx = getOrderedReduction(Builder, Acc, Vec, RdxOpcode, RK);
      else {
        // The shuffle expansion only handles power-of-two element counts;
        // leave other widths for the backend (II is not erased on continue).
        if (!isPowerOf2_32(
                cast<FixedVectorType>(Vec->getType())->getNumElements()))
          continue;
        Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
        // Fold the start value into the shuffle-reduced result.
        Rdx = Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, Acc, Rdx,
                                  "bin.rdx");
      }
      break;
    }
    case Intrinsic::vector_reduce_and:
    case Intrinsic::vector_reduce_or: {
      // Canonicalize logical or/and reductions:
      // Or reduction for i1 is represented as:
      // %val = bitcast <ReduxWidth x i1> to iReduxWidth
      // %res = cmp ne iReduxWidth %val, 0
      // And reduction for i1 is represented as:
      // %val = bitcast <ReduxWidth x i1> to iReduxWidth
      // %res = cmp eq iReduxWidth %val, 11111
      Value *Vec = II->getArgOperand(0);
      auto *FTy = cast<FixedVectorType>(Vec->getType());
      unsigned NumElts = FTy->getNumElements();
      // Non-power-of-two widths are skipped (intrinsic left in place).
      if (!isPowerOf2_32(NumElts))
        continue;

      if (FTy->getElementType() == Builder.getInt1Ty()) {
        Rdx = Builder.CreateBitCast(Vec, Builder.getIntNTy(NumElts));
        if (ID == Intrinsic::vector_reduce_and) {
          Rdx = Builder.CreateICmpEQ(
              Rdx, ConstantInt::getAllOnesValue(Rdx->getType()));
        } else {
          assert(ID == Intrinsic::vector_reduce_or && "Expected or reduction.");
          Rdx = Builder.CreateIsNotNull(Rdx);
        }
        break;
      }
      // Wider element types use the generic shuffle-tree expansion.
      unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
      Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
      break;
    }
    case Intrinsic::vector_reduce_add:
    case Intrinsic::vector_reduce_mul:
    case Intrinsic::vector_reduce_xor:
    case Intrinsic::vector_reduce_smax:
    case Intrinsic::vector_reduce_smin:
    case Intrinsic::vector_reduce_umax:
    case Intrinsic::vector_reduce_umin: {
      Value *Vec = II->getArgOperand(0);
      unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
      if (isa<ScalableVectorType>(Vec->getType())) {
        // Scalable vectors can't be shuffled down at compile time; expand to
        // a runtime loop starting from the operation's identity element.
        Type *EltTy = Vec->getType()->getScalarType();
        Value *Ident = getReductionIdentity(ID, EltTy, FMF);
        Rdx = expandReductionViaLoop(Builder, Vec, RdxOpcode, Ident, DT, LI);
        break;
      }
      // Shuffle expansion requires a power-of-two fixed width.
      if (!isPowerOf2_32(
              cast<FixedVectorType>(Vec->getType())->getNumElements()))
        continue;
      Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
      break;
    }
    case Intrinsic::vector_reduce_fmax:
    case Intrinsic::vector_reduce_fmin: {
      // We require "nnan" to use a shuffle reduction; "nsz" is implied by the
      // semantics of the reduction.
      Value *Vec = II->getArgOperand(0);
      if (!isPowerOf2_32(
              cast<FixedVectorType>(Vec->getType())->getNumElements()) ||
          !FMF.noNaNs())
        continue;
      unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
      Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
      break;
    }
    }
    // Every path that reaches here produced a replacement value; paths that
    // couldn't expand used `continue` above and keep the intrinsic.
    II->replaceAllUsesWith(Rdx);
    II->eraseFromParent();
    Changed = true;
  }
  return Changed;
}
|
|
|
|
class ExpandReductions : public FunctionPass {
|
|
public:
|
|
static char ID;
|
|
ExpandReductions() : FunctionPass(ID) {}
|
|
|
|
bool runOnFunction(Function &F) override {
|
|
const auto *TTI =&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
|
|
auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
|
|
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
|
|
auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
|
|
auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
|
|
return expandReductions(F, TTI, DT, LI);
|
|
}
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
AU.addRequired<TargetTransformInfoWrapperPass>();
|
|
AU.addPreserved<DominatorTreeWrapperPass>();
|
|
AU.addPreserved<LoopInfoWrapperPass>();
|
|
}
|
|
};
|
|
}
|
|
|
|
// Pass registration for the legacy pass manager; TTI is the only hard
// analysis dependency (DT/LI are used opportunistically).
char ExpandReductions::ID;
INITIALIZE_PASS_BEGIN(ExpandReductions, "expand-reductions",
                      "Expand reduction intrinsics", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(ExpandReductions, "expand-reductions",
                    "Expand reduction intrinsics", false, false)
|
|
|
|
/// Factory for the legacy-PM expand-reductions pass; the caller (the legacy
/// pass manager) takes ownership of the returned pass.
FunctionPass *llvm::createExpandReductionsPass() {
  return new ExpandReductions;
}
|
|
|
|
/// New pass manager entry point: expand reduction intrinsics in \p F.
PreservedAnalyses ExpandReductionsPass::run(Function &F,
                                            FunctionAnalysisManager &AM) {
  // TTI is always computed; DT/LI are fetched only if already cached, since
  // the expansion merely keeps them consistent rather than requiring them.
  const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
  DominatorTree *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
  LoopInfo *LI = AM.getCachedResult<LoopAnalysis>(F);

  bool Changed = expandReductions(F, &TTI, DT, LI);
  if (!Changed)
    return PreservedAnalyses::all();

  // The loop-expansion helper updates DT/LI in place, so both remain valid.
  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  PA.preserve<LoopAnalysis>();
  return PA;
}
|