Add support for non-allocatable module-level CUDA managed variables using pointer indirection through a companion global in `__nv_managed_data__`. The CUDA runtime populates this pointer with the unified memory address via `__cudaRegisterManagedVar` and `__cudaInitModule`.

- Create a `.managed.ptr` companion global in the `__nv_managed_data__` section and register it with `_FortranACUFRegisterManagedVariable`.
- Call `__cudaInitModule` once after all variables are registered, only when non-allocatable managed globals are present, to populate managed pointers.
- Annotate managed globals in `gpu.module` with `nvvm.managed` for PTX `.attribute(.managed)` generation.
- Suppress `cuf.data_transfer` for assignments to/from non-allocatable module managed variables, since `cudaMemcpy` would target the shadow address rather than the actual unified memory.
- Preserve `cuf.data_transfer` for `device_var = managed_var` assignments, where an explicit transfer is still required.

Note: This PR depends on [#189751](https://github.com/llvm/llvm-project/pull/189751) (MLIR: `nvvm.managed` attribute).
139 lines
5.5 KiB
C++
139 lines
5.5 KiB
C++
//===-- CUFOpConversionLate.cpp -------------------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "flang/Optimizer/Builder/CUFCommon.h"
|
|
#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
|
|
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
|
|
#include "flang/Optimizer/CodeGen/TypeConverter.h"
|
|
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
|
|
#include "flang/Optimizer/Dialect/FIRDialect.h"
|
|
#include "flang/Optimizer/Dialect/FIROps.h"
|
|
#include "flang/Optimizer/Dialect/FIRType.h"
|
|
#include "flang/Optimizer/Transforms/Passes.h"
|
|
#include "flang/Runtime/CUDA/common.h"
|
|
#include "flang/Runtime/CUDA/descriptor.h"
|
|
#include "flang/Runtime/allocatable.h"
|
|
#include "flang/Runtime/allocator-registry-consts.h"
|
|
#include "flang/Support/Fortran.h"
|
|
#include "mlir/Conversion/LLVMCommon/Pattern.h"
|
|
#include "mlir/Dialect/DLTI/DLTI.h"
|
|
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
|
|
#include "mlir/Dialect/OpenACC/OpenACC.h"
|
|
#include "mlir/IR/Matchers.h"
|
|
#include "mlir/Pass/Pass.h"
|
|
#include "mlir/Transforms/DialectConversion.h"
|
|
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
|
|
|
namespace fir {
|
|
#define GEN_PASS_DEF_CUFOPCONVERSIONLATE
|
|
#include "flang/Optimizer/Transforms/Passes.h.inc"
|
|
} // namespace fir
|
|
|
|
using namespace fir;
|
|
using namespace mlir;
|
|
using namespace Fortran::runtime;
|
|
using namespace Fortran::runtime::cuda;
|
|
|
|
namespace {
|
|
|
|
/// Produce \p val typed as \p toTy.
///
/// When \p val already has type \p toTy it is returned unchanged and no
/// operation is created. Otherwise a fir::ConvertOp from \p val to \p toTy is
/// created at \p loc through \p rewriter and its result is returned.
static mlir::Value createConvertOp(mlir::PatternRewriter &rewriter,
                                   mlir::Location loc, mlir::Type toTy,
                                   mlir::Value val) {
  const bool alreadyTargetType = val.getType() == toTy;
  if (alreadyTargetType)
    return val;
  return fir::ConvertOp::create(rewriter, loc, toTy, val);
}
|
|
|
|
// Suffix appended to a host global's symbol name to form the name of its
// companion pointer global (e.g. @sym.managed.ptr).
// CUFDeviceAddressOpConversion uses the resulting name to look the companion
// up in the symbol table.
static constexpr llvm::StringRef managedPtrSuffix{".managed.ptr"};
|
|
|
|
struct CUFDeviceAddressOpConversion
|
|
: public mlir::OpRewritePattern<cuf::DeviceAddressOp> {
|
|
using OpRewritePattern::OpRewritePattern;
|
|
|
|
CUFDeviceAddressOpConversion(mlir::MLIRContext *context,
|
|
const mlir::SymbolTable &symtab)
|
|
: OpRewritePattern(context), symTab{symtab} {}
|
|
|
|
mlir::LogicalResult
|
|
matchAndRewrite(cuf::DeviceAddressOp op,
|
|
mlir::PatternRewriter &rewriter) const override {
|
|
auto symName = op.getHostSymbol().getRootReference().getValue();
|
|
if (auto global = symTab.lookup<fir::GlobalOp>(symName)) {
|
|
auto mod = op->getParentOfType<mlir::ModuleOp>();
|
|
mlir::Location loc = op.getLoc();
|
|
|
|
// For non-allocatable managed globals, CUFAddConstructor created a
|
|
// companion pointer global (@sym.managed.ptr) that holds the unified
|
|
// memory address. Load from it instead of calling CUFGetDeviceAddress.
|
|
std::string ptrGlobalName = (symName + managedPtrSuffix).str();
|
|
if (auto ptrGlobal = symTab.lookup<fir::GlobalOp>(ptrGlobalName)) {
|
|
auto ptrRef = fir::AddrOfOp::create(
|
|
rewriter, loc, ptrGlobal.resultType(), ptrGlobal.getSymbol());
|
|
auto rawPtr = fir::LoadOp::create(rewriter, loc, ptrRef);
|
|
auto converted =
|
|
fir::ConvertOp::create(rewriter, loc, op.getType(), rawPtr);
|
|
rewriter.replaceOp(op, converted);
|
|
return success();
|
|
}
|
|
|
|
auto hostAddr = fir::AddrOfOp::create(
|
|
rewriter, loc, fir::ReferenceType::get(global.getType()),
|
|
op.getHostSymbol());
|
|
fir::FirOpBuilder builder(rewriter, mod);
|
|
mlir::func::FuncOp callee =
|
|
fir::runtime::getRuntimeFunc<mkRTKey(CUFGetDeviceAddress)>(loc,
|
|
builder);
|
|
auto fTy = callee.getFunctionType();
|
|
mlir::Value conv =
|
|
createConvertOp(rewriter, loc, fTy.getInput(0), hostAddr);
|
|
mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
|
|
mlir::Value sourceLine =
|
|
fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
|
|
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
|
|
builder, loc, fTy, conv, sourceFile, sourceLine)};
|
|
auto call = fir::CallOp::create(rewriter, loc, callee, args);
|
|
mlir::Value addr = createConvertOp(rewriter, loc, hostAddr.getType(),
|
|
call->getResult(0));
|
|
rewriter.replaceOp(op, addr.getDefiningOp());
|
|
return success();
|
|
}
|
|
return failure();
|
|
}
|
|
|
|
private:
|
|
const mlir::SymbolTable &symTab;
|
|
};
|
|
|
|
class CUFOpConversionLate
|
|
: public fir::impl::CUFOpConversionLateBase<CUFOpConversionLate> {
|
|
using CUFOpConversionLateBase::CUFOpConversionLateBase;
|
|
|
|
public:
|
|
void runOnOperation() override {
|
|
auto *ctx = &getContext();
|
|
mlir::RewritePatternSet patterns(ctx);
|
|
mlir::ConversionTarget target(*ctx);
|
|
mlir::Operation *op = getOperation();
|
|
mlir::ModuleOp module = mlir::dyn_cast<mlir::ModuleOp>(op);
|
|
if (!module)
|
|
return signalPassFailure();
|
|
mlir::SymbolTable symtab(module);
|
|
target.addLegalDialect<fir::FIROpsDialect, mlir::arith::ArithDialect,
|
|
mlir::gpu::GPUDialect>();
|
|
patterns.insert<CUFDeviceAddressOpConversion>(patterns.getContext(),
|
|
symtab);
|
|
if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
|
|
std::move(patterns)))) {
|
|
mlir::emitError(mlir::UnknownLoc::get(ctx),
|
|
"error in CUF op conversion\n");
|
|
signalPassFailure();
|
|
}
|
|
}
|
|
};
|
|
} // namespace
|