//===-- CUFAddConstructor.cpp ---------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Optimizer/Builder/BoxValue.h"
#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/CodeGen/Target.h"
#include "flang/Optimizer/CodeGen/TypeConverter.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Dialect/FIRAttr.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/Support/DataLayout.h"
#include "flang/Runtime/CUDA/registration.h"
#include "flang/Runtime/entry-names.h"
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Value.h"
#include "mlir/Pass/Pass.h"
#include "llvm/ADT/SmallVector.h"

namespace fir {
#define GEN_PASS_DEF_CUFADDCONSTRUCTOR
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir

using namespace Fortran::runtime::cuda;

namespace {

static constexpr llvm::StringRef cudaFortranCtorName{
    "__cudaFortranConstructor"};
static constexpr llvm::StringRef managedPtrSuffix{".managed.ptr"};

/// Create an 8-byte pointer global in the __nv_managed_data__ section.
/// The CUDA runtime populates this pointer with the unified memory address
/// when the module is initialized via __cudaInitModule.
static fir::GlobalOp createManagedPointerGlobal(fir::FirOpBuilder &builder,
                                                mlir::ModuleOp mod,
                                                fir::GlobalOp globalOp) {
  mlir::MLIRContext *ctx = mod.getContext();
  std::string ptrGlobalName = (globalOp.getSymName() + managedPtrSuffix).str();
  auto ptrTy = fir::LLVMPointerType::get(ctx, mlir::IntegerType::get(ctx, 8));

  mlir::OpBuilder::InsertionGuard guard(builder);
  builder.setInsertionPointAfter(globalOp);

  llvm::SmallVector<mlir::NamedAttribute> attrs;
  attrs.push_back(
      mlir::NamedAttribute(mlir::StringAttr::get(ctx, "section"),
                           mlir::StringAttr::get(ctx, "__nv_managed_data__")));

  mlir::DenseElementsAttr initAttr = {};
  auto ptrGlobal = fir::GlobalOp::create(
      builder, globalOp.getLoc(), ptrGlobalName, /*isConstant=*/false,
      /*isTarget=*/false, ptrTy, initAttr,
      /*linkName=*/builder.createInternalLinkage(), attrs);

  mlir::Region &region = ptrGlobal.getRegion();
  mlir::Block *block = builder.createBlock(&region);
  builder.setInsertionPointToStart(block);
  mlir::Value zero = fir::ZeroOp::create(builder, globalOp.getLoc(), ptrTy);
  fir::HasValueOp::create(builder, globalOp.getLoc(), zero);

  return ptrGlobal;
}

static bool hasRegisteredGlobals(mlir::ModuleOp mod,
                                 mlir::SymbolTable gpuSymTable) {
  for (fir::GlobalOp globalOp : mod.getOps<fir::GlobalOp>()) {
    auto attr = globalOp.getDataAttrAttr();
    if (!attr)
      continue;
    if (!gpuSymTable.lookup(globalOp.getSymName()))
      continue;
    if (attr.getValue() == cuf::DataAttribute::Managed &&
        !mlir::isa<fir::BaseBoxType>(globalOp.getType()))
      return true;
    switch (attr.getValue()) {
    case cuf::DataAttribute::Device:
    case cuf::DataAttribute::Constant:
    case cuf::DataAttribute::Managed: {
      return true;
    } break;
    default:
      break;
    }
  }
  return false;
}

static bool hasKernel(mlir::gpu::GPUModuleOp gpuMod) {
  for (auto func : gpuMod.getOps<mlir::gpu::GPUFuncOp>())
    if (func.isKernel())
      return true;
  return false;
}

struct CUFAddConstructor
    : public fir::impl::CUFAddConstructorBase<CUFAddConstructor> {

  void runOnOperation() override {
    mlir::ModuleOp mod = getOperation();
    mlir::SymbolTable symTab(mod);
    mlir::OpBuilder opBuilder{mod.getBodyRegion()};
    fir::FirOpBuilder builder(opBuilder, mod);
    fir::KindMapping kindMap{fir::getKindMapping(mod)};
    builder.setInsertionPointToEnd(mod.getBody());
    mlir::Location loc = mod.getLoc();
    auto *ctx = mod.getContext();
    auto voidTy = mlir::LLVM::LLVMVoidType::get(ctx);
    auto idxTy = builder.getIndexType();
    auto funcTy =
        mlir::LLVM::LLVMFunctionType::get(voidTy, {}, /*isVarArg=*/false);
    std::optional<mlir::DataLayout> dl =
        fir::support::getOrSetMLIRDataLayout(mod, /*allowDefaultLayout=*/false);
    if (!dl) {
      mlir::emitError(mod.getLoc(),
                      "data layout attribute is required to perform " +
                          getName() + "pass");
    }

    // Symbol reference to CUFRegisterAllocator.
    builder.setInsertionPointToEnd(mod.getBody());
    auto registerFuncOp = mlir::LLVM::LLVMFuncOp::create(
        builder, loc, RTNAME_STRING(CUFRegisterAllocator), funcTy);
    registerFuncOp.setVisibility(mlir::SymbolTable::Visibility::Private);
    auto cufRegisterAllocatorRef = mlir::SymbolRefAttr::get(
        mod.getContext(), RTNAME_STRING(CUFRegisterAllocator));
    builder.setInsertionPointToEnd(mod.getBody());

    // Create the constructor function that call CUFRegisterAllocator.
    auto func = mlir::LLVM::LLVMFuncOp::create(builder, loc,
                                               cudaFortranCtorName, funcTy);
    func.setLinkage(mlir::LLVM::Linkage::Internal);
    builder.setInsertionPointToStart(func.addEntryBlock(builder));
    mlir::LLVM::CallOp::create(builder, loc, funcTy, cufRegisterAllocatorRef);

    auto gpuMod = symTab.lookup<mlir::gpu::GPUModuleOp>(cudaDeviceModuleName);
    if (gpuMod) {
      mlir::SymbolTable gpuSymTable(gpuMod);
      if (!hasKernel(gpuMod) && !hasRegisteredGlobals(mod, gpuSymTable)) {
        // No kernels and no globals to register means no GPU binary to
        // register. This happens for host TUs that USE a kernel module but
        // don't define any device code.
        mlir::LLVM::ReturnOp::create(builder, loc, mlir::ValueRange{});
        return;
      }

      auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(ctx);
      auto registeredMod = cuf::RegisterModuleOp::create(
          builder, loc, llvmPtrTy,
          mlir::SymbolRefAttr::get(ctx, gpuMod.getName()));

      fir::LLVMTypeConverter typeConverter(mod, /*applyTBAA=*/false,
                                           /*forceUnifiedTBAATree=*/false, *dl);
      // Register kernels
      for (auto func : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
        if (func.isKernel()) {
          auto kernelName = mlir::SymbolRefAttr::get(
              builder.getStringAttr(cudaDeviceModuleName),
              {mlir::SymbolRefAttr::get(builder.getContext(), func.getName())});
          cuf::RegisterKernelOp::create(builder, loc, kernelName,
                                        registeredMod);
        }
      }

      // Register variables
      bool hasNonAllocManagedGlobal = false;
      for (fir::GlobalOp globalOp : mod.getOps<fir::GlobalOp>()) {
        auto attr = globalOp.getDataAttrAttr();
        if (!attr)
          continue;
        if (!gpuSymTable.lookup(globalOp.getSymName()))
          continue;

        bool isNonAllocManagedGlobal =
            attr.getValue() == cuf::DataAttribute::Managed &&
            !mlir::isa<fir::BaseBoxType>(globalOp.getType());

        mlir::func::FuncOp func;
        switch (attr.getValue()) {
        case cuf::DataAttribute::Device:
        case cuf::DataAttribute::Constant:
        case cuf::DataAttribute::Managed: {
          // Global variable name
          std::string gblNameStr = globalOp.getSymbol().getValue().str();
          gblNameStr += '\0';
          mlir::Value gblName = fir::getBase(
              fir::factory::createStringLiteral(builder, loc, gblNameStr));

          // Global variable size
          std::optional<uint64_t> size;
          if (auto boxTy =
                  mlir::dyn_cast<fir::BaseBoxType>(globalOp.getType())) {
            mlir::Type structTy = typeConverter.convertBoxTypeAsStruct(boxTy);
            size = dl->getTypeSizeInBits(structTy) / 8;
          }
          if (!size) {
            size = fir::getTypeSizeAndAlignmentOrCrash(loc, globalOp.getType(),
                                                       *dl, kindMap)
                       .first;
          }
          auto sizeVal = builder.createIntegerConstant(loc, idxTy, *size);

          if (isNonAllocManagedGlobal) {
            hasNonAllocManagedGlobal = true;
            // Non-allocatable managed globals use pointer indirection:
            // a companion pointer in __nv_managed_data__ holds the unified
            // memory address, registered via __cudaRegisterManagedVar.
            fir::GlobalOp ptrGlobal =
                createManagedPointerGlobal(builder, mod, globalOp);
            func = fir::runtime::getRuntimeFunc<mkRTKey(
                CUFRegisterManagedVariable)>(loc, builder);
            auto fTy = func.getFunctionType();
            mlir::Value addr = fir::AddrOfOp::create(
                builder, loc, ptrGlobal.resultType(), ptrGlobal.getSymbol());
            llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
                builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
            fir::CallOp::create(builder, loc, func, args);
          } else {
            func = fir::runtime::getRuntimeFunc<mkRTKey(CUFRegisterVariable)>(
                loc, builder);
            auto fTy = func.getFunctionType();
            mlir::Value addr = fir::AddrOfOp::create(
                builder, loc, globalOp.resultType(), globalOp.getSymbol());
            llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
                builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
            fir::CallOp::create(builder, loc, func, args);
          }
        } break;
        default:
          break;
        }
      }

      if (hasNonAllocManagedGlobal) {
        // Initialize the module after all variables are registered so the
        // runtime populates managed variable unified memory pointers.
        mlir::func::FuncOp initFunc =
            fir::runtime::getRuntimeFunc<mkRTKey(CUFInitModule)>(loc, builder);
        mlir::FunctionType initFTy = initFunc.getFunctionType();
        llvm::SmallVector<mlir::Value> initArgs{fir::runtime::createArguments(
            builder, loc, initFTy, registeredMod)};
        fir::CallOp::create(builder, loc, initFunc, initArgs);
      }
    }
    mlir::LLVM::ReturnOp::create(builder, loc, mlir::ValueRange{});

    // Create the llvm.global_ctor with the function.
    // TODO: We might want to have a utility that retrieve it if already
    // created and adds new functions.
    builder.setInsertionPointToEnd(mod.getBody());
    llvm::SmallVector<mlir::Attribute> funcs;
    funcs.push_back(
        mlir::FlatSymbolRefAttr::get(mod.getContext(), func.getSymName()));
    llvm::SmallVector<int> priorities;
    llvm::SmallVector<mlir::Attribute> data;
    priorities.push_back(0);
    data.push_back(mlir::LLVM::ZeroAttr::get(mod.getContext()));
    mlir::LLVM::GlobalCtorsOp::create(
        builder, mod.getLoc(), builder.getArrayAttr(funcs),
        builder.getI32ArrayAttr(priorities), builder.getArrayAttr(data));
  }
};

} // end anonymous namespace