[flang][OpenMP] Fix crash when a sliced array is specified in a forall within a workshare construct (#170913)

This is a fix for two problems that caused a crash:

1. Operations on thread-local variables sometimes are required to be parallelized.
Added a special case to handle this in
`LowerWorkshare.cpp:isSafeToParallelize`.
2. Race condition caused by a `nowait` added to the `omp.workshare` if
it is the last operation in a block. This allowed multiple threads to
execute the `omp.workshare` region concurrently. Since
_FortranAPushValue modifies a shared stack, this concurrent access
causes a crash. Disable the addition of `nowait` and rely on the
implicit barrier at the end of the `omp.workshare` region.

Fixes #143330
This commit is contained in:
Carlos Seo
2026-03-05 09:59:20 -03:00
committed by GitHub
parent c7ddb30552
commit 0f5e9bee83
3 changed files with 476 additions and 2 deletions

View File

@@ -16,6 +16,7 @@
//
//===----------------------------------------------------------------------===//
#include <flang/Optimizer/Analysis/AliasAnalysis.h>
#include <flang/Optimizer/Builder/FIRBuilder.h>
#include <flang/Optimizer/Dialect/FIROps.h>
#include <flang/Optimizer/Dialect/FIRType.h>
@@ -37,6 +38,7 @@
#include <mlir/IR/PatternMatch.h>
#include <mlir/IR/Value.h>
#include <mlir/IR/Visitors.h>
#include <mlir/Interfaces/LoopLikeInterface.h>
#include <mlir/Interfaces/SideEffectInterfaces.h>
#include <mlir/Support/LLVM.h>
@@ -135,9 +137,123 @@ static bool mustParallelizeOp(Operation *op) {
.wasInterrupted();
}
// Determines if a memory reference is thread-local in an OpenMP context.
//
// This is a best-effort analysis. We cannot definitively determine if code
// is inside a parallel region when it's in a function called from that
// region. However, we can identify common patterns of thread-local memory:
//
// 1. Memory allocated via fir.alloca inside the enclosing omp.parallel region
// 2. Memory that comes from OpenMP clause block arguments that create
//    thread-local storage (private, firstprivate, lastprivate, reduction,
//    linear clauses)
//
// Returns true if the memory reference appears to be thread-local and thus
// safe to parallelize (each thread should access its own copy).
static bool isOpenMPThreadLocalMemory(Operation *op, Value mem) {
  // Trace through declares, converts, reboxes, etc. with alias analysis to
  // find the underlying source of the memory reference.
  fir::AliasAnalysis aliasAnalysis;
  fir::AliasAnalysis::Source src = aliasAnalysis.getSource(mem);

  // Only Value-based origins can be thread-local; global symbols are not.
  auto origin = llvm::dyn_cast_if_present<mlir::Value>(src.origin.u);
  if (!origin)
    return false;

  // Helper: the BlockArgOpenMPOpInterface op owning `arg`'s block, or null.
  auto getArgIface = [](BlockArgument arg) {
    return llvm::dyn_cast<omp::BlockArgOpenMPOpInterface>(
        arg.getOwner()->getParentOp());
  };

  if (src.kind == fir::AliasAnalysis::SourceKind::Allocate) {
    // Case 1: memory allocated by fir.alloca inside the same enclosing
    // omp.parallel region as `op` is thread-private (each thread gets its
    // own stack allocation).
    // Note: fir.allocmem is NOT thread-local even inside omp.parallel.
    if (auto allocaOp = origin.getDefiningOp<fir::AllocaOp>())
      if (auto enclosingParallel = allocaOp->getParentOfType<omp::ParallelOp>())
        if (op->getParentOfType<omp::ParallelOp>() == enclosingParallel)
          return true;

    // When the alias analysis encounters an hlfir.declare/fir.declare on a
    // private clause block argument, it marks the SourceKind as Allocate and
    // sets the source value to the declare op result (not the block arg).
    // Trace through the declare to check if the underlying memref is a
    // private block argument.
    Value declaredMem;
    if (auto hlfirDecl = origin.getDefiningOp<hlfir::DeclareOp>())
      declaredMem = hlfirDecl.getMemref();
    else if (auto firDecl = origin.getDefiningOp<fir::DeclareOp>())
      declaredMem = firDecl.getMemref();
    if (declaredMem)
      if (auto arg = llvm::dyn_cast<BlockArgument>(declaredMem))
        if (auto iface = getArgIface(arg))
          if (llvm::is_contained(iface.getPrivateBlockArgs(), arg))
            return true;
  }

  // Case 2: memory from OpenMP clause block arguments that create
  // thread-local storage. These clauses create private copies per thread:
  //  - private:      uninitialized thread-local copy
  //  - firstprivate: thread-local copy initialized from original
  //  - lastprivate:  thread-local copy, value copied back after construct
  //  - reduction:    thread-local copy for reduction operations
  //  - linear:       thread-local copy with linear modification
  //
  // Check whether the source value is a block argument of an operation
  // implementing BlockArgOpenMPOpInterface and corresponds to a privatizing
  // clause (private, reduction, in_reduction, task_reduction).
  if (auto arg = llvm::dyn_cast<BlockArgument>(origin))
    if (auto iface = getArgIface(arg))
      if (llvm::is_contained(iface.getPrivateBlockArgs(), arg) ||
          llvm::is_contained(iface.getReductionBlockArgs(), arg) ||
          llvm::is_contained(iface.getInReductionBlockArgs(), arg) ||
          llvm::is_contained(iface.getTaskReductionBlockArgs(), arg))
        return true;

  return false;
}
// Returns true when `op` may be executed by every thread of the team instead
// of being wrapped in an omp.single region.
//
// NOTE: the pre-patch single-statement body (`return isa<...> ||
// isMemoryEffectFree(op);`) had been left in above the new logic, making the
// thread-local-memory path unreachable; it is removed here.
static bool isSafeToParallelize(Operation *op) {
  // Declarations and memory-effect-free operations can be freely replicated.
  if (isa<hlfir::DeclareOp>(op) || isa<fir::DeclareOp>(op) ||
      isMemoryEffectFree(op))
    return true;
  // Thread-local variables allocated in the OpenMP parallel region or coming
  // from privatizing clauses are private to each thread and thus safe (and
  // sometimes required) to parallelize. If the compiler wraps such load/stores
  // in an omp.single block, only one thread updates its local copy, while
  // all other threads read uninitialized data (see issue #143330).
  // Use MemoryEffectOpInterface to check all memory effects generically.
  // This also handles hlfir.assign which is present when the pass runs before
  // HLFIR lowering.
  if (auto memEffects = dyn_cast<MemoryEffectOpInterface>(op)) {
    SmallVector<MemoryEffects::EffectInstance> effects;
    memEffects.getEffects(effects);
    if (!effects.empty() &&
        llvm::all_of(effects, [&](const MemoryEffects::EffectInstance &effect) {
          // Effects without an associated value (e.g. on unknown memory)
          // cannot be proven thread-local.
          Value val = effect.getValue();
          return val && isOpenMPThreadLocalMemory(op, val);
        }))
      return true;
  }
  return false;
}
/// Simple shallow copies suffice for our purposes in this pass, so we implement
@@ -335,6 +451,13 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
for (auto [i, opOrSingle] : llvm::enumerate(regions)) {
bool isLast = i + 1 == regions.size();
// Make sure shared runtime calls are synchronized: disable `nowait`
// insertion, and rely on the implicit barrier at the end of the
// omp.workshare block. This applies to any loop-like operation
// (fir.do_loop, fir.iterate_while, fir.do_concurrent.loop, etc.)
// because iterations could overlap if nowait is used.
if (isa<LoopLikeOpInterface>(block.getParentOp()))
isLast = false;
if (std::holds_alternative<SingleRegion>(opOrSingle)) {
OpBuilder singleBuilder(sourceRegion.getContext());
Block *singleBlock = new Block();

View File

@@ -0,0 +1,57 @@
!===----------------------------------------------------------------------===!
! This directory can be used to add Integration tests involving multiple
! stages of the compiler (for eg. from Fortran to LLVM IR). It should not
! contain executable tests. We should only add tests here sparingly and only
! if there is no other way to test. Repeat this message in each test that is
! added to this directory and sub-directories.
!===----------------------------------------------------------------------===!
! Regression test for https://github.com/llvm/llvm-project/issues/143330
! Verify that forall constructs with sliced arrays inside workshare are
! correctly lowered without causing race conditions or crashes.
!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s --check-prefix HLFIR
!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s --check-prefix FIR
!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s --check-prefix LLVM
! Nested foralls assigning through sliced array sections (a1(:,n1,n2)) inside
! an omp.workshare region — the pattern from issue #143330 that previously
! crashed during workshare lowering.
subroutine workshare_forall_sliced(a1)
  integer :: a1(2,1,3)
  !$omp parallel
  !$omp workshare
  forall (n1=1:1)
    forall (n2=1:3)
      ! Element-wise increment of a rank-1 slice of the rank-3 array.
      a1(:,n1,n2) = a1(:,n1,n2)+1
    end forall
  end forall
  !$omp end workshare
  !$omp end parallel
end subroutine
! HLFIR-LABEL: func.func @_QPworkshare_forall_sliced
! HLFIR: omp.parallel {
! HLFIR: omp.workshare {
! HLFIR: hlfir.forall
! HLFIR: hlfir.forall
! HLFIR: hlfir.region_assign
! HLFIR: omp.terminator
! HLFIR: }
! HLFIR: omp.terminator
! HLFIR: }
! After workshare lowering, the forall should be in omp.single (since it
! contains operations that are not safe to parallelize across threads).
! The key fix is that this compiles without crashing and the runtime
! executes correctly.
! FIR-LABEL: func.func @_QPworkshare_forall_sliced
! FIR: omp.parallel {
! FIR: omp.single
! FIR: fir.do_loop
! FIR: fir.do_loop
! FIR: omp.barrier
! FIR: omp.terminator
! FIR: }
! Verify LLVM IR is generated successfully (the original issue caused crashes)
! LLVM-LABEL: define {{.*}}workshare_forall_sliced
! LLVM: call {{.*}}@__kmpc_fork_call

View File

@@ -0,0 +1,294 @@
// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
// Tests for thread-local memory handling in workshare lowering (#143330):
// 1. Thread-local variables (from fir.alloca in omp.parallel or from OpenMP
// private/reduction clauses) should be parallelized, not wrapped in omp.single
// 2. nowait should not be added to omp.single when inside loop-like operations
// that contain omp.workshare.loop_wrapper
// Check that fir.alloca inside omp.parallel creates thread-local memory,
// and stores to it should be parallelized (not wrapped in omp.single).
// CHECK-LABEL: func.func @thread_local_alloca_store
func.func @thread_local_alloca_store() {
  omp.parallel {
    // The alloca is inside omp.parallel, so it's thread-local
    %alloca = fir.alloca i32
    omp.workshare {
      %c1 = arith.constant 1 : i32
      // This store should NOT be in omp.single because %alloca is thread-local
      fir.store %c1 to %alloca : !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}
// Expected lowering: the store stays directly inside omp.parallel (no
// omp.single), followed by a barrier where the workshare region ended.
// CHECK: omp.parallel {
// CHECK-NEXT: %[[ALLOCA:.*]] = fir.alloca i32
// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : i32
// CHECK-NEXT: fir.store %[[C1]] to %[[ALLOCA]] : !fir.ref<i32>
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
// Check that memory accessed through fir.declare is also recognized as thread-local
// when the underlying alloca is in the parallel region.
// CHECK-LABEL: func.func @thread_local_with_declare
func.func @thread_local_with_declare() {
  omp.parallel {
    %alloca = fir.alloca i32
    %declare = fir.declare %alloca {uniq_name = "local_var"} : (!fir.ref<i32>) -> !fir.ref<i32>
    omp.workshare {
      %c42 = arith.constant 42 : i32
      // Store through declare should still be recognized as thread-local
      fir.store %c42 to %declare : !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}
// Expected: alias analysis traces the declare back to the alloca, so the
// store is parallelized rather than wrapped in omp.single.
// CHECK: omp.parallel {
// CHECK-NEXT: %[[ALLOCA:.*]] = fir.alloca i32
// CHECK-NEXT: %[[DECLARE:.*]] = fir.declare %[[ALLOCA]]
// CHECK-NEXT: %[[C42:.*]] = arith.constant 42 : i32
// CHECK-NEXT: fir.store %[[C42]] to %[[DECLARE]] : !fir.ref<i32>
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
// Check that private clause block arguments are recognized as thread-local.
// Privatizer used by this test and by @hlfir_assign_private_clause below.
omp.private {type = private} @x_private : i32
// CHECK-LABEL: func.func @private_clause_thread_local
func.func @private_clause_thread_local(%arg0: !fir.ref<i32>) {
  omp.parallel private(@x_private %arg0 -> %priv_arg : !fir.ref<i32>) {
    omp.workshare {
      %c10 = arith.constant 10 : i32
      // Store to private variable should NOT be in omp.single
      fir.store %c10 to %priv_arg : !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}
// Expected: the store to the private copy is emitted directly in the
// parallel region, with no omp.single wrapper.
// CHECK: omp.parallel private(@x_private %{{.*}} -> %[[PRIV_ARG:.*]] : !fir.ref<i32>) {
// CHECK-NEXT: %[[C10:.*]] = arith.constant 10 : i32
// CHECK-NEXT: fir.store %[[C10]] to %[[PRIV_ARG]] : !fir.ref<i32>
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
// Check that hlfir.assign to a private clause variable through hlfir.declare
// is recognized as thread-local. The alias analysis marks private items as
// SourceKind::Allocate with the declare result as source value, so we need
// to trace through the declare to find the private block argument.
// CHECK-LABEL: func.func @hlfir_assign_private_clause
func.func @hlfir_assign_private_clause(%arg0: !fir.ref<i32>) {
  omp.parallel private(@x_private %arg0 -> %priv_arg : !fir.ref<i32>) {
    %decl:2 = hlfir.declare %priv_arg {uniq_name = "x"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
    omp.workshare {
      %c1 = arith.constant 1 : i32
      // hlfir.assign to private variable should NOT be in omp.single
      hlfir.assign %c1 to %decl#0 : i32, !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}
// Expected: the assign survives unwrapped in the parallel region.
// CHECK: omp.parallel private(@x_private %{{.*}} -> %[[PRIV_ARG:.*]] : !fir.ref<i32>) {
// CHECK-NEXT: %[[DECL:.*]]:2 = hlfir.declare %[[PRIV_ARG]]
// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : i32
// CHECK-NEXT: hlfir.assign %[[C1]] to %[[DECL]]#0 : i32, !fir.ref<i32>
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
// Check that reduction clause block arguments are recognized as thread-local.
// Simple integer-add reduction used by the test below.
omp.declare_reduction @add_reduction_i32 : i32 init {
^bb0(%arg0: i32):
  %c0 = arith.constant 0 : i32
  omp.yield(%c0 : i32)
} combiner {
^bb0(%arg0: i32, %arg1: i32):
  %0 = arith.addi %arg0, %arg1 : i32
  omp.yield(%0 : i32)
}
// CHECK-LABEL: func.func @reduction_clause_thread_local
func.func @reduction_clause_thread_local(%arg0: !fir.ref<i32>) {
  omp.parallel reduction(@add_reduction_i32 %arg0 -> %red_arg : !fir.ref<i32>) {
    omp.workshare {
      %c5 = arith.constant 5 : i32
      // Store to reduction variable should NOT be in omp.single
      fir.store %c5 to %red_arg : !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}
// Expected: the store to the per-thread reduction copy is not wrapped.
// CHECK: omp.parallel reduction(@add_reduction_i32 %{{.*}} -> %[[RED_ARG:.*]] : !fir.ref<i32>) {
// CHECK-NEXT: %[[C5:.*]] = arith.constant 5 : i32
// CHECK-NEXT: fir.store %[[C5]] to %[[RED_ARG]] : !fir.ref<i32>
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
// Check that nowait is NOT added to omp.single when inside fir.do_loop
// that contains omp.workshare.loop_wrapper. This prevents race conditions
// when multiple threads execute different loop iterations concurrently.
// The workshare.loop_wrapper triggers recursive parallelization of the loop body.
// CHECK-LABEL: func.func @no_nowait_in_loop_with_workshare_wrapper
func.func @no_nowait_in_loop_with_workshare_wrapper(%arg0: !fir.ref<i32>) {
  omp.parallel {
    omp.workshare {
      %c1 = arith.constant 1 : index
      %c10 = arith.constant 10 : index
      fir.do_loop %i = %c1 to %c10 step %c1 {
        // This side-effecting op will be wrapped in omp.single without nowait
        "test.side_effect"(%arg0) : (!fir.ref<i32>) -> ()
        // The workshare.loop_wrapper triggers recursive processing of the loop
        // (it is lowered to omp.wsloop, as the patterns below expect).
        omp.workshare.loop_wrapper {
          omp.loop_nest (%j) : index = (%c1) to (%c10) inclusive step (%c1) {
            "test.inner"() : () -> ()
            omp.yield
          }
        }
      }
      omp.terminator
    }
    omp.terminator
  }
  return
}
// The omp.single inside the loop should NOT have nowait
// (note the plain `omp.single {` below — no nowait clause).
// CHECK: omp.parallel {
// CHECK: fir.do_loop
// CHECK: omp.single {
// CHECK: "test.side_effect"
// CHECK: omp.terminator
// CHECK-NEXT: }
// CHECK: omp.wsloop {
// CHECK: }
// CHECK: omp.barrier
// CHECK: }
// Check that thread-local store inside a loop with workshare.loop_wrapper
// is correctly parallelized (not wrapped in omp.single).
// CHECK-LABEL: func.func @thread_local_store_in_loop_with_wrapper
func.func @thread_local_store_in_loop_with_wrapper() {
  omp.parallel {
    %alloca = fir.alloca i32
    omp.workshare {
      %c1 = arith.constant 1 : index
      %c10 = arith.constant 10 : index
      fir.do_loop %i = %c1 to %c10 step %c1 {
        %c99 = arith.constant 99 : i32
        // Store to thread-local alloca should NOT be in omp.single
        fir.store %c99 to %alloca : !fir.ref<i32>
        // The wrapper is lowered to omp.wsloop, as the patterns below expect.
        omp.workshare.loop_wrapper {
          omp.loop_nest (%j) : index = (%c1) to (%c10) inclusive step (%c1) {
            "test.inner"() : () -> ()
            omp.yield
          }
        }
      }
      omp.terminator
    }
    omp.terminator
  }
  return
}
// CHECK: omp.parallel {
// CHECK-NEXT: %[[ALLOCA:.*]] = fir.alloca i32
// CHECK: fir.do_loop
// The store should be outside omp.single
// CHECK: %[[C99:.*]] = arith.constant 99 : i32
// CHECK-NEXT: fir.store %[[C99]] to %[[ALLOCA]] : !fir.ref<i32>
// CHECK: omp.wsloop {
// CHECK: }
// CHECK: omp.barrier
// CHECK: }
// Check that non-thread-local memory is still wrapped in omp.single.
// This is the baseline case to ensure we haven't broken normal behavior.
// CHECK-LABEL: func.func @non_thread_local_needs_single
func.func @non_thread_local_needs_single(%arg0: !fir.ref<i32>) {
  omp.parallel {
    omp.workshare {
      %c1 = arith.constant 1 : i32
      // arg0 is shared memory, store must be in omp.single
      fir.store %c1 to %arg0 : !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}
// Expected: the store lands in omp.single with nowait (no loop-like parent
// suppresses it here), followed by an omp.barrier.
// CHECK: omp.parallel {
// CHECK-NEXT: omp.single nowait {
// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : i32
// CHECK-NEXT: fir.store %[[C1]] to %{{.*}} : !fir.ref<i32>
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
// Check that loads from thread-local alloca combined with stores are parallelized.
// CHECK-LABEL: func.func @thread_local_load_and_store
func.func @thread_local_load_and_store() {
  omp.parallel {
    %alloca = fir.alloca i32
    omp.workshare {
      %c1 = arith.constant 1 : i32
      // Store, load, and store again: every memory effect here is on the
      // same thread-local alloca, so none of them needs omp.single.
      fir.store %c1 to %alloca : !fir.ref<i32>
      %val = fir.load %alloca : !fir.ref<i32>
      fir.store %val to %alloca : !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}
// All operations on thread-local memory should be outside omp.single
// CHECK: omp.parallel {
// CHECK-NEXT: %[[ALLOCA:.*]] = fir.alloca i32
// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : i32
// CHECK-NEXT: fir.store %[[C1]] to %[[ALLOCA]] : !fir.ref<i32>
// CHECK-NEXT: %[[VAL:.*]] = fir.load %[[ALLOCA]] : !fir.ref<i32>
// CHECK-NEXT: fir.store %[[VAL]] to %[[ALLOCA]] : !fir.ref<i32>
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }