Revert "[flang][OpenMP] Use cuf.alloc for privatization of CUDA Fortran device arrays (#185984)" (#186891)
This reverts commit fb18d570b0.
The reverted PR (#185984) caused compilation failures with allocatable arrays;
it is being reverted for now pending further investigation.
This commit is contained in:
@@ -62,14 +62,6 @@ cuf::DataAttributeAttr
|
||||
translateSymbolCUFDataAttribute(mlir::MLIRContext *mlirContext,
|
||||
const Fortran::semantics::Symbol &sym);
|
||||
|
||||
/// Create a cuf.alloc operation with extents and length parameters elided
|
||||
/// when they are already encoded in the static type.
|
||||
mlir::Value genCUFAlloc(fir::FirOpBuilder &builder, mlir::Location loc,
|
||||
mlir::Type type, llvm::StringRef uniqName,
|
||||
llvm::StringRef bindcName,
|
||||
cuf::DataAttributeAttr dataAttr,
|
||||
mlir::ValueRange lenParams, mlir::ValueRange extents);
|
||||
|
||||
/// Check if the rhs has an implicit conversion. Return the elemental op if
|
||||
/// there is a conversion. Return null otherwise.
|
||||
std::pair<hlfir::ElementalOp, hlfir::ElementalOp>
|
||||
|
||||
@@ -68,24 +68,6 @@ cuf::DataAttributeAttr Fortran::lower::translateSymbolCUFDataAttribute(
|
||||
return cuf::getDataAttribute(mlirContext, cudaAttr);
|
||||
}
|
||||
|
||||
mlir::Value Fortran::lower::genCUFAlloc(fir::FirOpBuilder &builder,
|
||||
mlir::Location loc, mlir::Type type,
|
||||
llvm::StringRef uniqName,
|
||||
llvm::StringRef bindcName,
|
||||
cuf::DataAttributeAttr dataAttr,
|
||||
mlir::ValueRange lenParams,
|
||||
mlir::ValueRange extents) {
|
||||
llvm::SmallVector<mlir::Value> elidedExtents =
|
||||
fir::factory::elideExtentsAlreadyInType(type, extents);
|
||||
llvm::SmallVector<mlir::Value> elidedLenParams =
|
||||
fir::factory::elideLengthsAlreadyInType(type, lenParams);
|
||||
auto idxTy = builder.getIndexType();
|
||||
for (mlir::Value &ext : elidedExtents)
|
||||
ext = builder.createConvert(loc, idxTy, ext);
|
||||
return cuf::AllocOp::create(builder, loc, type, uniqName, bindcName, dataAttr,
|
||||
elidedLenParams, elidedExtents);
|
||||
}
|
||||
|
||||
std::pair<hlfir::ElementalOp, hlfir::ElementalOp>
|
||||
Fortran::lower::isTransferWithConversion(mlir::Value rhs) {
|
||||
auto isCopyElementalOp = [](hlfir::ElementalOp elOp) {
|
||||
|
||||
@@ -760,20 +760,21 @@ static mlir::Value createNewLocal(Fortran::lower::AbstractConverter &converter,
|
||||
cuf::DataAttributeAttr dataAttr =
|
||||
Fortran::lower::translateSymbolCUFDataAttribute(builder.getContext(),
|
||||
ultimateSymbol);
|
||||
if (dataAttr.getValue() == cuf::DataAttribute::Shared) {
|
||||
llvm::SmallVector<mlir::Value> elidedShape =
|
||||
fir::factory::elideExtentsAlreadyInType(ty, shape);
|
||||
auto idxTy = builder.getIndexType();
|
||||
llvm::SmallVector<mlir::Value> indices;
|
||||
for (mlir::Value sh : elidedShape)
|
||||
indices.push_back(builder.createConvert(loc, idxTy, sh));
|
||||
llvm::SmallVector<mlir::Value> indices;
|
||||
llvm::SmallVector<mlir::Value> elidedShape =
|
||||
fir::factory::elideExtentsAlreadyInType(ty, shape);
|
||||
llvm::SmallVector<mlir::Value> elidedLenParams =
|
||||
fir::factory::elideLengthsAlreadyInType(ty, lenParams);
|
||||
auto idxTy = builder.getIndexType();
|
||||
for (mlir::Value sh : elidedShape)
|
||||
indices.push_back(builder.createConvert(loc, idxTy, sh));
|
||||
if (dataAttr.getValue() == cuf::DataAttribute::Shared)
|
||||
return cuf::SharedMemoryOp::create(builder, loc, ty, nm, symNm, lenParams,
|
||||
indices);
|
||||
}
|
||||
|
||||
if (!cuf::isCUDADeviceContext(builder.getRegion()))
|
||||
return Fortran::lower::genCUFAlloc(builder, loc, ty, nm, symNm, dataAttr,
|
||||
lenParams, shape);
|
||||
return cuf::AllocOp::create(builder, loc, ty, nm, symNm, dataAttr,
|
||||
lenParams, indices);
|
||||
}
|
||||
|
||||
// Let the builder do all the heavy lifting.
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
|
||||
#include "flang/Lower/AbstractConverter.h"
|
||||
#include "flang/Lower/Allocatable.h"
|
||||
#include "flang/Lower/CUDA.h"
|
||||
#include "flang/Lower/ConvertVariable.h"
|
||||
#include "flang/Optimizer/Builder/BoxValue.h"
|
||||
#include "flang/Optimizer/Builder/Character.h"
|
||||
@@ -22,14 +21,12 @@
|
||||
#include "flang/Optimizer/Builder/HLFIRTools.h"
|
||||
#include "flang/Optimizer/Builder/Runtime/Derived.h"
|
||||
#include "flang/Optimizer/Builder/Todo.h"
|
||||
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
|
||||
#include "flang/Optimizer/Dialect/FIROps.h"
|
||||
#include "flang/Optimizer/Dialect/FIRType.h"
|
||||
#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
|
||||
#include "flang/Optimizer/HLFIR/HLFIROps.h"
|
||||
#include "flang/Optimizer/Support/FatalError.h"
|
||||
#include "flang/Semantics/symbol.h"
|
||||
#include "flang/Semantics/tools.h"
|
||||
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
|
||||
#include "mlir/IR/Location.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
@@ -49,11 +46,11 @@ static bool hasFinalization(const Fortran::semantics::Symbol &sym) {
|
||||
return false;
|
||||
}
|
||||
|
||||
static void createCleanupRegion(
|
||||
Fortran::lower::AbstractConverter &converter, mlir::Location loc,
|
||||
mlir::Type argType, mlir::Region &cleanupRegion,
|
||||
const Fortran::semantics::Symbol *sym, bool isDoConcurrent,
|
||||
std::optional<cuf::DataAttributeAttr> cudaDataAttr = std::nullopt) {
|
||||
static void createCleanupRegion(Fortran::lower::AbstractConverter &converter,
|
||||
mlir::Location loc, mlir::Type argType,
|
||||
mlir::Region &cleanupRegion,
|
||||
const Fortran::semantics::Symbol *sym,
|
||||
bool isDoConcurrent) {
|
||||
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
|
||||
assert(cleanupRegion.empty());
|
||||
mlir::Block *block = builder.createBlock(&cleanupRegion, cleanupRegion.end(),
|
||||
@@ -112,14 +109,9 @@ static void createCleanupRegion(
|
||||
fir::IfOp::create(builder, loc, isAllocated, /*withElseRegion=*/false);
|
||||
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
|
||||
|
||||
if (cudaDataAttr) {
|
||||
cuf::FreeOp::create(builder, loc, addr, *cudaDataAttr);
|
||||
} else {
|
||||
mlir::Value cast = builder.createConvert(
|
||||
loc, fir::HeapType::get(fir::dyn_cast_ptrEleTy(addr.getType())),
|
||||
addr);
|
||||
fir::FreeMemOp::create(builder, loc, cast);
|
||||
}
|
||||
mlir::Value cast = builder.createConvert(
|
||||
loc, fir::HeapType::get(fir::dyn_cast_ptrEleTy(addr.getType())), addr);
|
||||
fir::FreeMemOp::create(builder, loc, cast);
|
||||
|
||||
builder.setInsertionPointAfter(ifOp);
|
||||
if (isDoConcurrent)
|
||||
@@ -555,31 +547,6 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
|
||||
if (shouldAllocateTempOnStack(boxTy))
|
||||
return createStackTempFromMold(loc, builder, source);
|
||||
|
||||
// For CUDA device arrays that require special allocation (device,
|
||||
// managed, unified, etc.), use cuf.alloc instead of fir.allocmem so
|
||||
// that the private copy lives in device memory.
|
||||
if (sym && Fortran::semantics::NeedCUDAAlloc(sym->GetUltimate())) {
|
||||
cuf::DataAttributeAttr dataAttr =
|
||||
Fortran::lower::translateSymbolCUFDataAttribute(builder.getContext(),
|
||||
sym->GetUltimate());
|
||||
mlir::Type sequenceType =
|
||||
hlfir::getFortranElementOrSequenceType(source.getType());
|
||||
mlir::Value shape = hlfir::genShape(loc, builder, source);
|
||||
auto extents = hlfir::getIndexExtents(loc, builder, shape);
|
||||
mlir::Value alloc = Fortran::lower::genCUFAlloc(
|
||||
builder, loc, sequenceType, /*uniqName=*/"", /*bindcName=*/".tmp",
|
||||
dataAttr, lenParams, extents);
|
||||
auto declareOp = hlfir::DeclareOp::create(
|
||||
builder, loc, alloc, ".tmp", shape, lenParams,
|
||||
/*dummy_scope=*/nullptr, /*storage=*/nullptr, /*storage_offset=*/0,
|
||||
fir::FortranVariableFlagsAttr{}, dataAttr);
|
||||
hlfir::Entity temp{declareOp.getBase()};
|
||||
mlir::OpBuilder::InsertionGuard guard(builder);
|
||||
createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
|
||||
isDoConcurrent, dataAttr);
|
||||
return temp;
|
||||
}
|
||||
|
||||
auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
|
||||
// if needsDealloc, add cleanup region. Always
|
||||
// do this for allocatable boxes because they might have been re-allocated
|
||||
|
||||
@@ -1,31 +0,0 @@
|
||||
! Test that OpenMP privatization of CUDA Fortran device arrays uses cuf.alloc
|
||||
! instead of fir.allocmem so the private copy resides in device memory.
|
||||
|
||||
! RUN: bbc -emit-hlfir -fcuda -fopenmp %s -o - | FileCheck %s
|
||||
|
||||
subroutine omp_private_device_array()
|
||||
implicit none
|
||||
integer(4), device :: a(8)
|
||||
|
||||
!$omp parallel private(a)
|
||||
a(1) = 42
|
||||
!$omp end parallel
|
||||
end subroutine
|
||||
|
||||
! CHECK-LABEL: omp.private {type = private}
|
||||
! CHECK-SAME: @[[PRIVATIZER:.*]] : !fir.box<!fir.array<8xi32>> init {
|
||||
|
||||
! CHECK-NEXT: ^bb0(%[[MOLD:.*]]: !fir.ref<!fir.box<!fir.array<8xi32>>>, %[[PRIV:.*]]: !fir.ref<!fir.box<!fir.array<8xi32>>>):
|
||||
! CHECK-NEXT: %[[C8:.*]] = arith.constant 8 : index
|
||||
! CHECK-NEXT: %[[SHAPE:.*]] = fir.shape %[[C8]]
|
||||
! CHECK-NEXT: %[[ALLOC:.*]] = cuf.alloc !fir.array<8xi32> {bindc_name = ".tmp", data_attr = #cuf.cuda<device>}
|
||||
! CHECK-NEXT: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]](%[[SHAPE]]) {data_attr = #cuf.cuda<device>, uniq_name = ".tmp"}
|
||||
! CHECK: fir.embox
|
||||
! CHECK: fir.store
|
||||
! CHECK-NEXT: omp.yield
|
||||
|
||||
! CHECK: } dealloc {
|
||||
! CHECK-NEXT: ^bb0(%[[DEALLOC_ARG:.*]]: !fir.ref<!fir.box<!fir.array<8xi32>>>):
|
||||
! CHECK: cuf.free %{{.*}} {data_attr = #cuf.cuda<device>}
|
||||
! CHECK: omp.yield
|
||||
! CHECK-NEXT: }
|
||||
Reference in New Issue
Block a user