Files
llvm-project/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversionLate.cpp
Zhen Wang c794742bd7 [flang][cuda] Support non-allocatable module-level managed variables (#189753)
Add support for non-allocatable module-level CUDA managed variables
using pointer indirection through a companion global in
__nv_managed_data__. The CUDA runtime populates this pointer with the
unified memory address via __cudaRegisterManagedVar and
__cudaInitModule.

- Create a .managed.ptr companion global in the __nv_managed_data__
section and register it with _FortranACUFRegisterManagedVariable
- Call __cudaInitModule once after all variables are registered, only
when non-allocatable managed globals are present, to populate managed
pointers
- Annotate managed globals in gpu.module with nvvm.managed for PTX
.attribute(.managed) generation
- Suppress cuf.data_transfer for assignments to/from non-allocatable
module managed variables, since cudaMemcpy would target the shadow
address rather than the actual unified memory
- Preserve cuf.data_transfer for device_var = managed_var assignments
where explicit transfer is still required

Note: This PR depends on
[#189751](https://github.com/llvm/llvm-project/pull/189751) (MLIR:
nvvm.managed attribute).
2026-04-01 18:43:04 +00:00

139 lines
5.5 KiB
C++

//===-- CUFOpConversionLate.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
#include "flang/Optimizer/CodeGen/TypeConverter.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/Transforms/Passes.h"
#include "flang/Runtime/CUDA/common.h"
#include "flang/Runtime/CUDA/descriptor.h"
#include "flang/Runtime/allocatable.h"
#include "flang/Runtime/allocator-registry-consts.h"
#include "flang/Support/Fortran.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/OpenACC/OpenACC.h"
#include "mlir/IR/Matchers.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
namespace fir {
#define GEN_PASS_DEF_CUFOPCONVERSIONLATE
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir
using namespace fir;
using namespace mlir;
using namespace Fortran::runtime;
using namespace Fortran::runtime::cuda;
namespace {
/// Return \p val as a value of type \p toTy.
///
/// When \p val already has type \p toTy it is returned unchanged and no
/// operation is created; otherwise a fir.convert is emitted at \p loc through
/// \p rewriter and its result is returned.
static mlir::Value createConvertOp(mlir::PatternRewriter &rewriter,
                                   mlir::Location loc, mlir::Type toTy,
                                   mlir::Value val) {
  // Nothing to emit when the types already agree.
  if (val.getType() == toTy)
    return val;
  return fir::ConvertOp::create(rewriter, loc, toTy, val);
}
/// Suffix appended to a host symbol name to form the name of the companion
/// global that CUFDeviceAddressOpConversion looks up in the symbol table.
static constexpr llvm::StringRef managedPtrSuffix = ".managed.ptr";
/// Pattern rewriting `cuf.device_address` into FIR operations that produce the
/// address associated with the referenced host symbol.
///
/// Two lowerings exist, selected by what the symbol table contains:
///  - a companion `<symbol>.managed.ptr` global exists: the address is loaded
///    from that global and converted to the result type of the op;
///  - otherwise: the host address is handed to the CUFGetDeviceAddress runtime
///    function together with the source file name and line number.
///
/// The pattern does not match when the host symbol is not a fir.global.
struct CUFDeviceAddressOpConversion
    : public mlir::OpRewritePattern<cuf::DeviceAddressOp> {
  using OpRewritePattern::OpRewritePattern;

  /// \param context MLIR context the pattern is registered in.
  /// \param symtab  Symbol table used to resolve the host symbol and its
  ///                companion global; only a reference is stored, so it must
  ///                outlive the pattern.
  CUFDeviceAddressOpConversion(mlir::MLIRContext *context,
                               const mlir::SymbolTable &symtab)
      : OpRewritePattern(context), symTab{symtab} {}

  mlir::LogicalResult
  matchAndRewrite(cuf::DeviceAddressOp op,
                  mlir::PatternRewriter &rewriter) const override {
    auto hostName = op.getHostSymbol().getRootReference().getValue();
    auto hostGlobal = symTab.lookup<fir::GlobalOp>(hostName);
    // Only host symbols that resolve to a fir.global are handled here.
    if (!hostGlobal)
      return failure();

    mlir::Location loc = op.getLoc();

    // For non-allocatable managed globals, CUFAddConstructor created a
    // companion pointer global (@sym.managed.ptr) that holds the unified
    // memory address. Load from it instead of calling CUFGetDeviceAddress.
    std::string companionName = (hostName + managedPtrSuffix).str();
    if (auto companion = symTab.lookup<fir::GlobalOp>(companionName)) {
      auto companionAddr = fir::AddrOfOp::create(
          rewriter, loc, companion.resultType(), companion.getSymbol());
      auto unifiedPtr = fir::LoadOp::create(rewriter, loc, companionAddr);
      auto result =
          fir::ConvertOp::create(rewriter, loc, op.getType(), unifiedPtr);
      rewriter.replaceOp(op, result);
      return success();
    }

    // No companion global: query the runtime for the device address.
    auto parentModule = op->getParentOfType<mlir::ModuleOp>();
    auto hostAddr = fir::AddrOfOp::create(
        rewriter, loc, fir::ReferenceType::get(hostGlobal.getType()),
        op.getHostSymbol());
    fir::FirOpBuilder builder(rewriter, parentModule);
    mlir::func::FuncOp runtimeFn =
        fir::runtime::getRuntimeFunc<mkRTKey(CUFGetDeviceAddress)>(loc,
                                                                   builder);
    auto runtimeFnTy = runtimeFn.getFunctionType();

    // Arguments follow the runtime signature: host address, file, line.
    mlir::Value hostAddrArg =
        createConvertOp(rewriter, loc, runtimeFnTy.getInput(0), hostAddr);
    mlir::Value fileArg = fir::factory::locationToFilename(builder, loc);
    mlir::Value lineArg =
        fir::factory::locationToLineNo(builder, loc, runtimeFnTy.getInput(2));
    llvm::SmallVector<mlir::Value> callArgs{fir::runtime::createArguments(
        builder, loc, runtimeFnTy, hostAddrArg, fileArg, lineArg)};
    auto runtimeCall = fir::CallOp::create(rewriter, loc, runtimeFn, callArgs);

    // Bring the runtime result back to the type of the host address.
    mlir::Value deviceAddr = createConvertOp(rewriter, loc, hostAddr.getType(),
                                             runtimeCall->getResult(0));
    rewriter.replaceOp(op, deviceAddr.getDefiningOp());
    return success();
  }

private:
  /// Symbol table of the module being converted (not owned).
  const mlir::SymbolTable &symTab;
};
/// Pass that applies CUFDeviceAddressOpConversion to a module through a
/// partial dialect conversion in which the FIR, arith and GPU dialects are
/// legal.
class CUFOpConversionLate
    : public fir::impl::CUFOpConversionLateBase<CUFOpConversionLate> {
  using CUFOpConversionLateBase::CUFOpConversionLateBase;

public:
  /// Run the conversion on the current operation. The pass is marked as
  /// failed when that operation is not a module or when the conversion
  /// itself fails.
  void runOnOperation() override {
    mlir::Operation *root = getOperation();
    auto module = mlir::dyn_cast<mlir::ModuleOp>(root);
    if (!module) {
      signalPassFailure();
      return;
    }

    auto *ctx = &getContext();
    // The pattern keeps a reference to this table, which stays alive until
    // the conversion below has finished.
    mlir::SymbolTable symtab(module);

    mlir::ConversionTarget target(*ctx);
    target.addLegalDialect<fir::FIROpsDialect, mlir::arith::ArithDialect,
                           mlir::gpu::GPUDialect>();

    mlir::RewritePatternSet patterns(ctx);
    patterns.insert<CUFDeviceAddressOpConversion>(patterns.getContext(),
                                                  symtab);

    if (mlir::succeeded(
            mlir::applyPartialConversion(root, target, std::move(patterns))))
      return;

    mlir::emitError(mlir::UnknownLoc::get(ctx),
                    "error in CUF op conversion\n");
    signalPassFailure();
  }
};
} // namespace