[flang][OpenMP] Fix crash when a sliced array is specified in a forall within a workshare construct (#170913)
This is a fix for two problems that caused a crash: 1. Thread-local variables sometimes are required to be parallelized. Added a special case to handle this in `LowerWorkshare.cpp:isSafeToParallelize`. 2. Race condition caused by a `nowait` added to the `omp.workshare` if it is the last operation in a block. This allowed multiple threads to execute the `omp.workshare` region concurrently. Since _FortranAPushValue modifies a shared stack, this concurrent access causes a crash. Disable the addition of `nowait` and rely on the implicit barrier at the end of the `omp.workshare` region. Fixes #143330
This commit is contained in:
@@ -16,6 +16,7 @@
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include <flang/Optimizer/Analysis/AliasAnalysis.h>
|
||||
#include <flang/Optimizer/Builder/FIRBuilder.h>
|
||||
#include <flang/Optimizer/Dialect/FIROps.h>
|
||||
#include <flang/Optimizer/Dialect/FIRType.h>
|
||||
@@ -37,6 +38,7 @@
|
||||
#include <mlir/IR/PatternMatch.h>
|
||||
#include <mlir/IR/Value.h>
|
||||
#include <mlir/IR/Visitors.h>
|
||||
#include <mlir/Interfaces/LoopLikeInterface.h>
|
||||
#include <mlir/Interfaces/SideEffectInterfaces.h>
|
||||
#include <mlir/Support/LLVM.h>
|
||||
|
||||
@@ -135,9 +137,123 @@ static bool mustParallelizeOp(Operation *op) {
|
||||
.wasInterrupted();
|
||||
}
|
||||
|
||||
// Determines if a memory reference is thread-local in an OpenMP context.
|
||||
//
|
||||
// This is a best-effort analysis. We cannot definitively determine if code
|
||||
// is inside a parallel region when it's in a function called from that
|
||||
// region. However, we can identify common patterns of thread-local memory:
|
||||
//
|
||||
// 1. Memory allocated via fir.alloca inside the enclosing omp.parallel region
|
||||
// 2. Memory that comes from OpenMP clause block arguments that create
|
||||
// thread-local storage (private, firstprivate, lastprivate, reduction,
|
||||
// linear clauses)
|
||||
//
|
||||
// Returns true if the memory reference appears to be thread-local and thus
|
||||
// safe to parallelize (each thread should access its own copy).
|
||||
static bool isOpenMPThreadLocalMemory(Operation *op, Value mem) {
|
||||
// Use AliasAnalysis to trace through declares, converts, reboxes, etc.
|
||||
// to find the underlying source of the memory reference.
|
||||
fir::AliasAnalysis aliasAnalysis;
|
||||
fir::AliasAnalysis::Source source = aliasAnalysis.getSource(mem);
|
||||
|
||||
// Check if the source is a Value (not a global symbol).
|
||||
mlir::Value sourceValue =
|
||||
llvm::dyn_cast_if_present<mlir::Value>(source.origin.u);
|
||||
if (!sourceValue)
|
||||
return false;
|
||||
|
||||
// Case 1: Memory allocated by fir.alloca inside the enclosing parallel
|
||||
// region is thread-private (each thread gets its own stack allocation).
|
||||
// Note: fir.allocmem is NOT thread-local even inside omp.parallel.
|
||||
if (source.kind == fir::AliasAnalysis::SourceKind::Allocate) {
|
||||
if (auto alloca = sourceValue.getDefiningOp<fir::AllocaOp>()) {
|
||||
if (auto parallelOp = alloca->getParentOfType<omp::ParallelOp>()) {
|
||||
if (op->getParentOfType<omp::ParallelOp>() == parallelOp)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// When the alias analysis encounters an hlfir.declare/fir.declare on a
|
||||
// private clause block argument, it marks the SourceKind as Allocate and
|
||||
// sets the source value to the declare op result (not the block arg).
|
||||
// Trace through the declare to check if the underlying Memref is a
|
||||
// private block argument.
|
||||
Value declMemref;
|
||||
if (auto hlfirDecl = sourceValue.getDefiningOp<hlfir::DeclareOp>())
|
||||
declMemref = hlfirDecl.getMemref();
|
||||
else if (auto firDecl = sourceValue.getDefiningOp<fir::DeclareOp>())
|
||||
declMemref = firDecl.getMemref();
|
||||
if (declMemref) {
|
||||
if (auto blockArg = llvm::dyn_cast<BlockArgument>(declMemref)) {
|
||||
Operation *parentOp = blockArg.getOwner()->getParentOp();
|
||||
if (auto argIface =
|
||||
llvm::dyn_cast<omp::BlockArgOpenMPOpInterface>(parentOp)) {
|
||||
if (llvm::is_contained(argIface.getPrivateBlockArgs(), blockArg))
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Case 2: Memory from OpenMP clause block arguments that create thread-local
|
||||
// storage. These clauses create private copies for each thread:
|
||||
// - private: uninitialized thread-local copy
|
||||
// - firstprivate: thread-local copy initialized from original
|
||||
// - lastprivate: thread-local copy, value copied back after construct
|
||||
// - reduction: thread-local copy for reduction operations
|
||||
// - linear: thread-local copy with linear modification
|
||||
//
|
||||
// Check if the source value is a block argument of an OpenMP operation
|
||||
// that implements BlockArgOpenMPOpInterface.
|
||||
if (auto blockArg = llvm::dyn_cast<BlockArgument>(sourceValue)) {
|
||||
Operation *parentOp = blockArg.getOwner()->getParentOp();
|
||||
if (auto argIface =
|
||||
llvm::dyn_cast<omp::BlockArgOpenMPOpInterface>(parentOp)) {
|
||||
// Check if this block argument corresponds to a privatizing clause.
|
||||
// Private, reduction, and in_reduction clauses create thread-local
|
||||
// memory.
|
||||
auto isInBlockArgs = [&](auto blockArgs) {
|
||||
return llvm::is_contained(blockArgs, blockArg);
|
||||
};
|
||||
|
||||
if (isInBlockArgs(argIface.getPrivateBlockArgs()))
|
||||
return true;
|
||||
if (isInBlockArgs(argIface.getReductionBlockArgs()))
|
||||
return true;
|
||||
if (isInBlockArgs(argIface.getInReductionBlockArgs()))
|
||||
return true;
|
||||
if (isInBlockArgs(argIface.getTaskReductionBlockArgs()))
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns true when `op` may be executed redundantly by every thread in the
// team instead of being wrapped in an omp.single region.
//
// BUG FIX: the original body began with an unconditional
// `return isa<...>(op) || ... isMemoryEffectFree(op);` followed by the same
// condition again as an `if` — the leftover early return made everything
// after it (the whole thread-local-memory special case that this change
// introduces) unreachable dead code. The stray return is removed here.
static bool isSafeToParallelize(Operation *op) {
  if (isa<hlfir::DeclareOp>(op) || isa<fir::DeclareOp>(op) ||
      isMemoryEffectFree(op))
    return true;

  // Thread-local variables allocated in the OpenMP parallel region or coming
  // from privatizing clauses are private to each thread and thus safe (and
  // sometimes required) to parallelize. If the compiler wraps such load/stores
  // in an omp.single block, only one thread updates its local copy, while
  // all other threads read uninitialized data (see issue #143330).
  // Use MemoryEffectOpInterface to check all memory effects generically.
  // This also handles hlfir.assign which is present when the pass runs before
  // HLFIR lowering.
  if (auto memEffects = dyn_cast<MemoryEffectOpInterface>(op)) {
    SmallVector<MemoryEffects::EffectInstance> effects;
    memEffects.getEffects(effects);
    // Parallelize only if the op has at least one effect and every effect
    // targets memory we can prove is thread-local.
    if (!effects.empty() &&
        llvm::all_of(effects, [&](const MemoryEffects::EffectInstance &effect) {
          Value val = effect.getValue();
          return val && isOpenMPThreadLocalMemory(op, val);
        }))
      return true;
  }

  return false;
}
|
||||
|
||||
/// Simple shallow copies suffice for our purposes in this pass, so we implement
|
||||
@@ -335,6 +451,13 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
|
||||
|
||||
for (auto [i, opOrSingle] : llvm::enumerate(regions)) {
|
||||
bool isLast = i + 1 == regions.size();
|
||||
// Make sure shared runtime calls are synchronized: disable `nowait`
|
||||
// insertion, and rely on the implicit barrier at the end of the
|
||||
// omp.workshare block. This applies to any loop-like operation
|
||||
// (fir.do_loop, fir.iterate_while, fir.do_concurrent.loop, etc.)
|
||||
// because iterations could overlap if nowait is used.
|
||||
if (isa<LoopLikeOpInterface>(block.getParentOp()))
|
||||
isLast = false;
|
||||
if (std::holds_alternative<SingleRegion>(opOrSingle)) {
|
||||
OpBuilder singleBuilder(sourceRegion.getContext());
|
||||
Block *singleBlock = new Block();
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
!===----------------------------------------------------------------------===!
! This directory can be used to add Integration tests involving multiple
! stages of the compiler (for eg. from Fortran to LLVM IR). It should not
! contain executable tests. We should only add tests here sparingly and only
! if there is no other way to test. Repeat this message in each test that is
! added to this directory and sub-directories.
!===----------------------------------------------------------------------===!

! Regression test for https://github.com/llvm/llvm-project/issues/143330
! Verify that forall constructs with sliced arrays inside workshare are
! correctly lowered without causing race conditions or crashes.

!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s --check-prefix HLFIR
!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s --check-prefix FIR
!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s --check-prefix LLVM

subroutine workshare_forall_sliced(a1)
  integer :: a1(2,1,3)
  !$omp parallel
  !$omp workshare
  forall (n1=1:1)
    forall (n2=1:3)
      a1(:,n1,n2) = a1(:,n1,n2)+1
    end forall
  end forall
  !$omp end workshare
  !$omp end parallel
end subroutine

! HLFIR-LABEL: func.func @_QPworkshare_forall_sliced
! HLFIR: omp.parallel {
! HLFIR: omp.workshare {
! HLFIR: hlfir.forall
! HLFIR: hlfir.forall
! HLFIR: hlfir.region_assign
! HLFIR: omp.terminator
! HLFIR: }
! HLFIR: omp.terminator
! HLFIR: }

! After workshare lowering, the forall should be in omp.single (since it
! contains operations that are not safe to parallelize across threads).
! The key fix is that this compiles without crashing and the runtime
! executes correctly.

! FIR-LABEL: func.func @_QPworkshare_forall_sliced
! FIR: omp.parallel {
! FIR: omp.single
! FIR: fir.do_loop
! FIR: fir.do_loop
! FIR: omp.barrier
! FIR: omp.terminator
! FIR: }

! Verify LLVM IR is generated successfully (the original issue caused crashes)
! LLVM-LABEL: define {{.*}}workshare_forall_sliced
! LLVM: call {{.*}}@__kmpc_fork_call
||||
294
flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
Normal file
294
flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
Normal file
@@ -0,0 +1,294 @@
|
||||
// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s

// Tests for thread-local memory handling in workshare lowering (#143330):
// 1. Thread-local variables (from fir.alloca in omp.parallel or from OpenMP
//    private/reduction clauses) should be parallelized, not wrapped in omp.single
// 2. nowait should not be added to omp.single when inside loop-like operations
//    that contain omp.workshare.loop_wrapper


// Check that fir.alloca inside omp.parallel creates thread-local memory,
// and stores to it should be parallelized (not wrapped in omp.single).

// CHECK-LABEL: func.func @thread_local_alloca_store
func.func @thread_local_alloca_store() {
  omp.parallel {
    // The alloca is inside omp.parallel, so it's thread-local
    %alloca = fir.alloca i32
    omp.workshare {
      %c1 = arith.constant 1 : i32
      // This store should NOT be in omp.single because %alloca is thread-local
      fir.store %c1 to %alloca : !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}

// CHECK: omp.parallel {
// CHECK-NEXT: %[[ALLOCA:.*]] = fir.alloca i32
// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : i32
// CHECK-NEXT: fir.store %[[C1]] to %[[ALLOCA]] : !fir.ref<i32>
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }


// Check that memory accessed through fir.declare is also recognized as thread-local
// when the underlying alloca is in the parallel region.

// CHECK-LABEL: func.func @thread_local_with_declare
func.func @thread_local_with_declare() {
  omp.parallel {
    %alloca = fir.alloca i32
    %declare = fir.declare %alloca {uniq_name = "local_var"} : (!fir.ref<i32>) -> !fir.ref<i32>
    omp.workshare {
      %c42 = arith.constant 42 : i32
      // Store through declare should still be recognized as thread-local
      fir.store %c42 to %declare : !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}

// CHECK: omp.parallel {
// CHECK-NEXT: %[[ALLOCA:.*]] = fir.alloca i32
// CHECK-NEXT: %[[DECLARE:.*]] = fir.declare %[[ALLOCA]]
// CHECK-NEXT: %[[C42:.*]] = arith.constant 42 : i32
// CHECK-NEXT: fir.store %[[C42]] to %[[DECLARE]] : !fir.ref<i32>
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }


// Check that private clause block arguments are recognized as thread-local.

omp.private {type = private} @x_private : i32

// CHECK-LABEL: func.func @private_clause_thread_local
func.func @private_clause_thread_local(%arg0: !fir.ref<i32>) {
  omp.parallel private(@x_private %arg0 -> %priv_arg : !fir.ref<i32>) {
    omp.workshare {
      %c10 = arith.constant 10 : i32
      // Store to private variable should NOT be in omp.single
      fir.store %c10 to %priv_arg : !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}

// CHECK: omp.parallel private(@x_private %{{.*}} -> %[[PRIV_ARG:.*]] : !fir.ref<i32>) {
// CHECK-NEXT: %[[C10:.*]] = arith.constant 10 : i32
// CHECK-NEXT: fir.store %[[C10]] to %[[PRIV_ARG]] : !fir.ref<i32>
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }


// Check that hlfir.assign to a private clause variable through hlfir.declare
// is recognized as thread-local. The alias analysis marks private items as
// SourceKind::Allocate with the declare result as source value, so we need
// to trace through the declare to find the private block argument.

// CHECK-LABEL: func.func @hlfir_assign_private_clause
func.func @hlfir_assign_private_clause(%arg0: !fir.ref<i32>) {
  omp.parallel private(@x_private %arg0 -> %priv_arg : !fir.ref<i32>) {
    %decl:2 = hlfir.declare %priv_arg {uniq_name = "x"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
    omp.workshare {
      %c1 = arith.constant 1 : i32
      // hlfir.assign to private variable should NOT be in omp.single
      hlfir.assign %c1 to %decl#0 : i32, !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}

// CHECK: omp.parallel private(@x_private %{{.*}} -> %[[PRIV_ARG:.*]] : !fir.ref<i32>) {
// CHECK-NEXT: %[[DECL:.*]]:2 = hlfir.declare %[[PRIV_ARG]]
// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : i32
// CHECK-NEXT: hlfir.assign %[[C1]] to %[[DECL]]#0 : i32, !fir.ref<i32>
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }


// Check that reduction clause block arguments are recognized as thread-local.

omp.declare_reduction @add_reduction_i32 : i32 init {
^bb0(%arg0: i32):
  %c0 = arith.constant 0 : i32
  omp.yield(%c0 : i32)
} combiner {
^bb0(%arg0: i32, %arg1: i32):
  %0 = arith.addi %arg0, %arg1 : i32
  omp.yield(%0 : i32)
}

// CHECK-LABEL: func.func @reduction_clause_thread_local
func.func @reduction_clause_thread_local(%arg0: !fir.ref<i32>) {
  omp.parallel reduction(@add_reduction_i32 %arg0 -> %red_arg : !fir.ref<i32>) {
    omp.workshare {
      %c5 = arith.constant 5 : i32
      // Store to reduction variable should NOT be in omp.single
      fir.store %c5 to %red_arg : !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}

// CHECK: omp.parallel reduction(@add_reduction_i32 %{{.*}} -> %[[RED_ARG:.*]] : !fir.ref<i32>) {
// CHECK-NEXT: %[[C5:.*]] = arith.constant 5 : i32
// CHECK-NEXT: fir.store %[[C5]] to %[[RED_ARG]] : !fir.ref<i32>
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }


// Check that nowait is NOT added to omp.single when inside fir.do_loop
// that contains omp.workshare.loop_wrapper. This prevents race conditions
// when multiple threads execute different loop iterations concurrently.
// The workshare.loop_wrapper triggers recursive parallelization of the loop body.

// CHECK-LABEL: func.func @no_nowait_in_loop_with_workshare_wrapper
func.func @no_nowait_in_loop_with_workshare_wrapper(%arg0: !fir.ref<i32>) {
  omp.parallel {
    omp.workshare {
      %c1 = arith.constant 1 : index
      %c10 = arith.constant 10 : index
      fir.do_loop %i = %c1 to %c10 step %c1 {
        // This side-effecting op will be wrapped in omp.single without nowait
        "test.side_effect"(%arg0) : (!fir.ref<i32>) -> ()
        // The workshare.loop_wrapper triggers recursive processing of the loop
        omp.workshare.loop_wrapper {
          omp.loop_nest (%j) : index = (%c1) to (%c10) inclusive step (%c1) {
            "test.inner"() : () -> ()
            omp.yield
          }
        }
      }
      omp.terminator
    }
    omp.terminator
  }
  return
}

// The omp.single inside the loop should NOT have nowait
// CHECK: omp.parallel {
// CHECK: fir.do_loop
// CHECK: omp.single {
// CHECK: "test.side_effect"
// CHECK: omp.terminator
// CHECK-NEXT: }
// CHECK: omp.wsloop {
// CHECK: }
// CHECK: omp.barrier
// CHECK: }


// Check that thread-local store inside a loop with workshare.loop_wrapper
// is correctly parallelized (not wrapped in omp.single).

// CHECK-LABEL: func.func @thread_local_store_in_loop_with_wrapper
func.func @thread_local_store_in_loop_with_wrapper() {
  omp.parallel {
    %alloca = fir.alloca i32
    omp.workshare {
      %c1 = arith.constant 1 : index
      %c10 = arith.constant 10 : index
      fir.do_loop %i = %c1 to %c10 step %c1 {
        %c99 = arith.constant 99 : i32
        // Store to thread-local alloca should NOT be in omp.single
        fir.store %c99 to %alloca : !fir.ref<i32>
        omp.workshare.loop_wrapper {
          omp.loop_nest (%j) : index = (%c1) to (%c10) inclusive step (%c1) {
            "test.inner"() : () -> ()
            omp.yield
          }
        }
      }
      omp.terminator
    }
    omp.terminator
  }
  return
}

// CHECK: omp.parallel {
// CHECK-NEXT: %[[ALLOCA:.*]] = fir.alloca i32
// CHECK: fir.do_loop
// The store should be outside omp.single
// CHECK: %[[C99:.*]] = arith.constant 99 : i32
// CHECK-NEXT: fir.store %[[C99]] to %[[ALLOCA]] : !fir.ref<i32>
// CHECK: omp.wsloop {
// CHECK: }
// CHECK: omp.barrier
// CHECK: }


// Check that non-thread-local memory is still wrapped in omp.single.
// This is the baseline case to ensure we haven't broken normal behavior.

// CHECK-LABEL: func.func @non_thread_local_needs_single
func.func @non_thread_local_needs_single(%arg0: !fir.ref<i32>) {
  omp.parallel {
    omp.workshare {
      %c1 = arith.constant 1 : i32
      // arg0 is shared memory, store must be in omp.single
      fir.store %c1 to %arg0 : !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}

// CHECK: omp.parallel {
// CHECK-NEXT: omp.single nowait {
// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : i32
// CHECK-NEXT: fir.store %[[C1]] to %{{.*}} : !fir.ref<i32>
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }


// Check that loads from thread-local alloca combined with stores are parallelized.

// CHECK-LABEL: func.func @thread_local_load_and_store
func.func @thread_local_load_and_store() {
  omp.parallel {
    %alloca = fir.alloca i32
    omp.workshare {
      %c1 = arith.constant 1 : i32
      fir.store %c1 to %alloca : !fir.ref<i32>
      %val = fir.load %alloca : !fir.ref<i32>
      fir.store %val to %alloca : !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}

// All operations on thread-local memory should be outside omp.single

// CHECK: omp.parallel {
// CHECK-NEXT: %[[ALLOCA:.*]] = fir.alloca i32
// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : i32
// CHECK-NEXT: fir.store %[[C1]] to %[[ALLOCA]] : !fir.ref<i32>
// CHECK-NEXT: %[[VAL:.*]] = fir.load %[[ALLOCA]] : !fir.ref<i32>
// CHECK-NEXT: fir.store %[[VAL]] to %[[ALLOCA]] : !fir.ref<i32>
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
Reference in New Issue
Block a user