[flang][OpenMP] Fix crash when a sliced array is specified in a forall within a workshare construct (#170913)

This is a fix for two problems that caused a crash:

1. Operations on thread-local variables sometimes are required to be parallelized.
Added a special case to handle this in
`LowerWorkshare.cpp:isSafeToParallelize`.
2. Race condition caused by a `nowait` added to the `omp.workshare` if
it is the last operation in a block. This allowed multiple threads to
execute the `omp.workshare` region concurrently. Since
_FortranAPushValue modifies a shared stack, this concurrent access
causes a crash. Disable the addition of `nowait` and rely on the
implicit barrier at the end of the `omp.workshare` region.

Fixes #143330
This commit is contained in:
Carlos Seo
2026-03-05 09:59:20 -03:00
committed by GitHub
parent c7ddb30552
commit 0f5e9bee83
3 changed files with 476 additions and 2 deletions

View File

@@ -16,6 +16,7 @@
//
//===----------------------------------------------------------------------===//
#include <flang/Optimizer/Analysis/AliasAnalysis.h>
#include <flang/Optimizer/Builder/FIRBuilder.h>
#include <flang/Optimizer/Dialect/FIROps.h>
#include <flang/Optimizer/Dialect/FIRType.h>
@@ -37,6 +38,7 @@
#include <mlir/IR/PatternMatch.h>
#include <mlir/IR/Value.h>
#include <mlir/IR/Visitors.h>
#include <mlir/Interfaces/LoopLikeInterface.h>
#include <mlir/Interfaces/SideEffectInterfaces.h>
#include <mlir/Support/LLVM.h>
@@ -135,9 +137,123 @@ static bool mustParallelizeOp(Operation *op) {
.wasInterrupted();
}
// Determines if a memory reference is thread-local in an OpenMP context.
//
// This is a best-effort analysis. We cannot definitively determine if code
// is inside a parallel region when it's in a function called from that
// region. However, we can identify common patterns of thread-local memory:
//
// 1. Memory allocated via fir.alloca inside the enclosing omp.parallel region
// 2. Memory that comes from OpenMP clause block arguments that create
//    thread-local storage (private, firstprivate, lastprivate, reduction,
//    linear clauses)
//
// Returns true if the memory reference appears to be thread-local and thus
// safe to parallelize (each thread should access its own copy).
static bool isOpenMPThreadLocalMemory(Operation *op, Value mem) {
  // Trace through declares, converts, reboxes, etc. with alias analysis to
  // find the underlying source of the memory reference.
  fir::AliasAnalysis aliasAnalysis;
  fir::AliasAnalysis::Source src = aliasAnalysis.getSource(mem);

  // Only Value-based origins can be thread-local; global symbols are not.
  auto origin = llvm::dyn_cast_if_present<mlir::Value>(src.origin.u);
  if (!origin)
    return false;

  // Helper: the BlockArgOpenMPOpInterface op owning `arg`'s block, or null.
  auto getArgIface = [](BlockArgument arg) {
    return llvm::dyn_cast<omp::BlockArgOpenMPOpInterface>(
        arg.getOwner()->getParentOp());
  };

  if (src.kind == fir::AliasAnalysis::SourceKind::Allocate) {
    // Case 1: memory allocated by fir.alloca inside the same enclosing
    // omp.parallel region as `op` is thread-private (each thread gets its
    // own stack allocation).
    // Note: fir.allocmem is NOT thread-local even inside omp.parallel.
    if (auto allocaOp = origin.getDefiningOp<fir::AllocaOp>())
      if (auto enclosingParallel = allocaOp->getParentOfType<omp::ParallelOp>())
        if (op->getParentOfType<omp::ParallelOp>() == enclosingParallel)
          return true;

    // When the alias analysis encounters an hlfir.declare/fir.declare on a
    // private clause block argument, it marks the SourceKind as Allocate and
    // sets the source value to the declare op result (not the block arg).
    // Trace through the declare to check if the underlying memref is a
    // private block argument.
    Value declaredMem;
    if (auto hlfirDecl = origin.getDefiningOp<hlfir::DeclareOp>())
      declaredMem = hlfirDecl.getMemref();
    else if (auto firDecl = origin.getDefiningOp<fir::DeclareOp>())
      declaredMem = firDecl.getMemref();
    if (declaredMem)
      if (auto arg = llvm::dyn_cast<BlockArgument>(declaredMem))
        if (auto iface = getArgIface(arg))
          if (llvm::is_contained(iface.getPrivateBlockArgs(), arg))
            return true;
  }

  // Case 2: memory from OpenMP clause block arguments that create
  // thread-local storage. These clauses create private copies per thread:
  //  - private:      uninitialized thread-local copy
  //  - firstprivate: thread-local copy initialized from original
  //  - lastprivate:  thread-local copy, value copied back after construct
  //  - reduction:    thread-local copy for reduction operations
  //  - linear:       thread-local copy with linear modification
  //
  // Check whether the source value is a block argument of an operation
  // implementing BlockArgOpenMPOpInterface and corresponds to a privatizing
  // clause (private, reduction, in_reduction, task_reduction).
  if (auto arg = llvm::dyn_cast<BlockArgument>(origin))
    if (auto iface = getArgIface(arg))
      if (llvm::is_contained(iface.getPrivateBlockArgs(), arg) ||
          llvm::is_contained(iface.getReductionBlockArgs(), arg) ||
          llvm::is_contained(iface.getInReductionBlockArgs(), arg) ||
          llvm::is_contained(iface.getTaskReductionBlockArgs(), arg))
        return true;

  return false;
}
// Returns true when `op` may be executed by every thread of the team instead
// of being wrapped in an omp.single region.
//
// NOTE: the pre-patch single-statement body (`return isa<...> ||
// isMemoryEffectFree(op);`) had been left in above the new logic, making the
// thread-local-memory path unreachable; it is removed here.
static bool isSafeToParallelize(Operation *op) {
  // Declarations and memory-effect-free operations can be freely replicated.
  if (isa<hlfir::DeclareOp>(op) || isa<fir::DeclareOp>(op) ||
      isMemoryEffectFree(op))
    return true;
  // Thread-local variables allocated in the OpenMP parallel region or coming
  // from privatizing clauses are private to each thread and thus safe (and
  // sometimes required) to parallelize. If the compiler wraps such load/stores
  // in an omp.single block, only one thread updates its local copy, while
  // all other threads read uninitialized data (see issue #143330).
  // Use MemoryEffectOpInterface to check all memory effects generically.
  // This also handles hlfir.assign which is present when the pass runs before
  // HLFIR lowering.
  if (auto memEffects = dyn_cast<MemoryEffectOpInterface>(op)) {
    SmallVector<MemoryEffects::EffectInstance> effects;
    memEffects.getEffects(effects);
    if (!effects.empty() &&
        llvm::all_of(effects, [&](const MemoryEffects::EffectInstance &effect) {
          // Effects without an associated value (e.g. on unknown memory)
          // cannot be proven thread-local.
          Value val = effect.getValue();
          return val && isOpenMPThreadLocalMemory(op, val);
        }))
      return true;
  }
  return false;
}
/// Simple shallow copies suffice for our purposes in this pass, so we implement
@@ -335,6 +451,13 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
for (auto [i, opOrSingle] : llvm::enumerate(regions)) {
bool isLast = i + 1 == regions.size();
// Make sure shared runtime calls are synchronized: disable `nowait`
// insertion, and rely on the implicit barrier at the end of the
// omp.workshare block. This applies to any loop-like operation
// (fir.do_loop, fir.iterate_while, fir.do_concurrent.loop, etc.)
// because iterations could overlap if nowait is used.
if (isa<LoopLikeOpInterface>(block.getParentOp()))
isLast = false;
if (std::holds_alternative<SingleRegion>(opOrSingle)) {
OpBuilder singleBuilder(sourceRegion.getContext());
Block *singleBlock = new Block();

View File

@@ -0,0 +1,57 @@
!===----------------------------------------------------------------------===!
! This directory can be used to add Integration tests involving multiple
! stages of the compiler (for eg. from Fortran to LLVM IR). It should not
! contain executable tests. We should only add tests here sparingly and only
! if there is no other way to test. Repeat this message in each test that is
! added to this directory and sub-directories.
!===----------------------------------------------------------------------===!
! Regression test for https://github.com/llvm/llvm-project/issues/143330
! Verify that forall constructs with sliced arrays inside workshare are
! correctly lowered without causing race conditions or crashes.
!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s --check-prefix HLFIR
!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s --check-prefix FIR
!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s --check-prefix LLVM
! Nested foralls assigning through sliced array sections (a1(:,n1,n2)) inside
! an omp.workshare region — the pattern from issue #143330 that previously
! crashed during workshare lowering.
subroutine workshare_forall_sliced(a1)
  integer :: a1(2,1,3)
  !$omp parallel
  !$omp workshare
  forall (n1=1:1)
    forall (n2=1:3)
      ! Element-wise increment of a rank-1 slice of the rank-3 array.
      a1(:,n1,n2) = a1(:,n1,n2)+1
    end forall
  end forall
  !$omp end workshare
  !$omp end parallel
end subroutine
! HLFIR-LABEL: func.func @_QPworkshare_forall_sliced
! HLFIR: omp.parallel {
! HLFIR: omp.workshare {
! HLFIR: hlfir.forall
! HLFIR: hlfir.forall
! HLFIR: hlfir.region_assign
! HLFIR: omp.terminator
! HLFIR: }
! HLFIR: omp.terminator
! HLFIR: }
! After workshare lowering, the forall should be in omp.single (since it
! contains operations that are not safe to parallelize across threads).
! The key fix is that this compiles without crashing and the runtime
! executes correctly.
! FIR-LABEL: func.func @_QPworkshare_forall_sliced
! FIR: omp.parallel {
! FIR: omp.single
! FIR: fir.do_loop
! FIR: fir.do_loop
! FIR: omp.barrier
! FIR: omp.terminator
! FIR: }
! Verify LLVM IR is generated successfully (the original issue caused crashes)
! LLVM-LABEL: define {{.*}}workshare_forall_sliced
! LLVM: call {{.*}}@__kmpc_fork_call

View File

@@ -0,0 +1,294 @@
// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
// Tests for thread-local memory handling in workshare lowering (#143330):
// 1. Thread-local variables (from fir.alloca in omp.parallel or from OpenMP
// private/reduction clauses) should be parallelized, not wrapped in omp.single
// 2. nowait should not be added to omp.single when inside loop-like operations
// that contain omp.workshare.loop_wrapper
// Check that fir.alloca inside omp.parallel creates thread-local memory,
// and stores to it should be parallelized (not wrapped in omp.single).
// CHECK-LABEL: func.func @thread_local_alloca_store
func.func @thread_local_alloca_store() {
  omp.parallel {
    // The alloca is inside omp.parallel, so it's thread-local
    %alloca = fir.alloca i32
    omp.workshare {
      %c1 = arith.constant 1 : i32
      // This store should NOT be in omp.single because %alloca is thread-local
      fir.store %c1 to %alloca : !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}
// Expected lowering: the store stays directly inside omp.parallel (no
// omp.single), followed by a barrier where the workshare region ended.
// CHECK: omp.parallel {
// CHECK-NEXT: %[[ALLOCA:.*]] = fir.alloca i32
// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : i32
// CHECK-NEXT: fir.store %[[C1]] to %[[ALLOCA]] : !fir.ref<i32>
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
// Check that memory accessed through fir.declare is also recognized as thread-local
// when the underlying alloca is in the parallel region.
// CHECK-LABEL: func.func @thread_local_with_declare
func.func @thread_local_with_declare() {
  omp.parallel {
    %alloca = fir.alloca i32
    %declare = fir.declare %alloca {uniq_name = "local_var"} : (!fir.ref<i32>) -> !fir.ref<i32>
    omp.workshare {
      %c42 = arith.constant 42 : i32
      // Store through declare should still be recognized as thread-local
      fir.store %c42 to %declare : !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}
// Expected: alias analysis traces the declare back to the alloca, so the
// store is parallelized rather than wrapped in omp.single.
// CHECK: omp.parallel {
// CHECK-NEXT: %[[ALLOCA:.*]] = fir.alloca i32
// CHECK-NEXT: %[[DECLARE:.*]] = fir.declare %[[ALLOCA]]
// CHECK-NEXT: %[[C42:.*]] = arith.constant 42 : i32
// CHECK-NEXT: fir.store %[[C42]] to %[[DECLARE]] : !fir.ref<i32>
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
// Check that private clause block arguments are recognized as thread-local.
// Privatizer used by this test and by @hlfir_assign_private_clause below.
omp.private {type = private} @x_private : i32
// CHECK-LABEL: func.func @private_clause_thread_local
func.func @private_clause_thread_local(%arg0: !fir.ref<i32>) {
  omp.parallel private(@x_private %arg0 -> %priv_arg : !fir.ref<i32>) {
    omp.workshare {
      %c10 = arith.constant 10 : i32
      // Store to private variable should NOT be in omp.single
      fir.store %c10 to %priv_arg : !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}
// Expected: the store to the private copy is emitted directly in the
// parallel region, with no omp.single wrapper.
// CHECK: omp.parallel private(@x_private %{{.*}} -> %[[PRIV_ARG:.*]] : !fir.ref<i32>) {
// CHECK-NEXT: %[[C10:.*]] = arith.constant 10 : i32
// CHECK-NEXT: fir.store %[[C10]] to %[[PRIV_ARG]] : !fir.ref<i32>
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
// Check that hlfir.assign to a private clause variable through hlfir.declare
// is recognized as thread-local. The alias analysis marks private items as
// SourceKind::Allocate with the declare result as source value, so we need
// to trace through the declare to find the private block argument.
// CHECK-LABEL: func.func @hlfir_assign_private_clause
func.func @hlfir_assign_private_clause(%arg0: !fir.ref<i32>) {
  omp.parallel private(@x_private %arg0 -> %priv_arg : !fir.ref<i32>) {
    %decl:2 = hlfir.declare %priv_arg {uniq_name = "x"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
    omp.workshare {
      %c1 = arith.constant 1 : i32
      // hlfir.assign to private variable should NOT be in omp.single
      hlfir.assign %c1 to %decl#0 : i32, !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}
// Expected: the assign survives unwrapped in the parallel region.
// CHECK: omp.parallel private(@x_private %{{.*}} -> %[[PRIV_ARG:.*]] : !fir.ref<i32>) {
// CHECK-NEXT: %[[DECL:.*]]:2 = hlfir.declare %[[PRIV_ARG]]
// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : i32
// CHECK-NEXT: hlfir.assign %[[C1]] to %[[DECL]]#0 : i32, !fir.ref<i32>
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
// Check that reduction clause block arguments are recognized as thread-local.
// Simple integer-add reduction used by the test below.
omp.declare_reduction @add_reduction_i32 : i32 init {
^bb0(%arg0: i32):
  %c0 = arith.constant 0 : i32
  omp.yield(%c0 : i32)
} combiner {
^bb0(%arg0: i32, %arg1: i32):
  %0 = arith.addi %arg0, %arg1 : i32
  omp.yield(%0 : i32)
}
// CHECK-LABEL: func.func @reduction_clause_thread_local
func.func @reduction_clause_thread_local(%arg0: !fir.ref<i32>) {
  omp.parallel reduction(@add_reduction_i32 %arg0 -> %red_arg : !fir.ref<i32>) {
    omp.workshare {
      %c5 = arith.constant 5 : i32
      // Store to reduction variable should NOT be in omp.single
      fir.store %c5 to %red_arg : !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}
// Expected: the store to the per-thread reduction copy is not wrapped.
// CHECK: omp.parallel reduction(@add_reduction_i32 %{{.*}} -> %[[RED_ARG:.*]] : !fir.ref<i32>) {
// CHECK-NEXT: %[[C5:.*]] = arith.constant 5 : i32
// CHECK-NEXT: fir.store %[[C5]] to %[[RED_ARG]] : !fir.ref<i32>
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
// Check that nowait is NOT added to omp.single when inside fir.do_loop
// that contains omp.workshare.loop_wrapper. This prevents race conditions
// when multiple threads execute different loop iterations concurrently.
// The workshare.loop_wrapper triggers recursive parallelization of the loop body.
// CHECK-LABEL: func.func @no_nowait_in_loop_with_workshare_wrapper
func.func @no_nowait_in_loop_with_workshare_wrapper(%arg0: !fir.ref<i32>) {
  omp.parallel {
    omp.workshare {
      %c1 = arith.constant 1 : index
      %c10 = arith.constant 10 : index
      fir.do_loop %i = %c1 to %c10 step %c1 {
        // This side-effecting op will be wrapped in omp.single without nowait
        "test.side_effect"(%arg0) : (!fir.ref<i32>) -> ()
        // The workshare.loop_wrapper triggers recursive processing of the loop
        // (it is lowered to omp.wsloop, as the patterns below expect).
        omp.workshare.loop_wrapper {
          omp.loop_nest (%j) : index = (%c1) to (%c10) inclusive step (%c1) {
            "test.inner"() : () -> ()
            omp.yield
          }
        }
      }
      omp.terminator
    }
    omp.terminator
  }
  return
}
// The omp.single inside the loop should NOT have nowait
// (note the plain `omp.single {` below — no nowait clause).
// CHECK: omp.parallel {
// CHECK: fir.do_loop
// CHECK: omp.single {
// CHECK: "test.side_effect"
// CHECK: omp.terminator
// CHECK-NEXT: }
// CHECK: omp.wsloop {
// CHECK: }
// CHECK: omp.barrier
// CHECK: }
// Check that thread-local store inside a loop with workshare.loop_wrapper
// is correctly parallelized (not wrapped in omp.single).
// CHECK-LABEL: func.func @thread_local_store_in_loop_with_wrapper
func.func @thread_local_store_in_loop_with_wrapper() {
  omp.parallel {
    %alloca = fir.alloca i32
    omp.workshare {
      %c1 = arith.constant 1 : index
      %c10 = arith.constant 10 : index
      fir.do_loop %i = %c1 to %c10 step %c1 {
        %c99 = arith.constant 99 : i32
        // Store to thread-local alloca should NOT be in omp.single
        fir.store %c99 to %alloca : !fir.ref<i32>
        // The wrapper is lowered to omp.wsloop, as the patterns below expect.
        omp.workshare.loop_wrapper {
          omp.loop_nest (%j) : index = (%c1) to (%c10) inclusive step (%c1) {
            "test.inner"() : () -> ()
            omp.yield
          }
        }
      }
      omp.terminator
    }
    omp.terminator
  }
  return
}
// CHECK: omp.parallel {
// CHECK-NEXT: %[[ALLOCA:.*]] = fir.alloca i32
// CHECK: fir.do_loop
// The store should be outside omp.single
// CHECK: %[[C99:.*]] = arith.constant 99 : i32
// CHECK-NEXT: fir.store %[[C99]] to %[[ALLOCA]] : !fir.ref<i32>
// CHECK: omp.wsloop {
// CHECK: }
// CHECK: omp.barrier
// CHECK: }
// Check that non-thread-local memory is still wrapped in omp.single.
// This is the baseline case to ensure we haven't broken normal behavior.
// CHECK-LABEL: func.func @non_thread_local_needs_single
func.func @non_thread_local_needs_single(%arg0: !fir.ref<i32>) {
  omp.parallel {
    omp.workshare {
      %c1 = arith.constant 1 : i32
      // arg0 is shared memory, store must be in omp.single
      fir.store %c1 to %arg0 : !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}
// Expected: the store lands in omp.single with nowait (no loop-like parent
// suppresses it here), followed by an omp.barrier.
// CHECK: omp.parallel {
// CHECK-NEXT: omp.single nowait {
// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : i32
// CHECK-NEXT: fir.store %[[C1]] to %{{.*}} : !fir.ref<i32>
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
// Check that loads from thread-local alloca combined with stores are parallelized.
// CHECK-LABEL: func.func @thread_local_load_and_store
func.func @thread_local_load_and_store() {
  omp.parallel {
    %alloca = fir.alloca i32
    omp.workshare {
      %c1 = arith.constant 1 : i32
      // Store, load, and store again: every memory effect here is on the
      // same thread-local alloca, so none of them needs omp.single.
      fir.store %c1 to %alloca : !fir.ref<i32>
      %val = fir.load %alloca : !fir.ref<i32>
      fir.store %val to %alloca : !fir.ref<i32>
      omp.terminator
    }
    omp.terminator
  }
  return
}
// All operations on thread-local memory should be outside omp.single
// CHECK: omp.parallel {
// CHECK-NEXT: %[[ALLOCA:.*]] = fir.alloca i32
// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : i32
// CHECK-NEXT: fir.store %[[C1]] to %[[ALLOCA]] : !fir.ref<i32>
// CHECK-NEXT: %[[VAL:.*]] = fir.load %[[ALLOCA]] : !fir.ref<i32>
// CHECK-NEXT: fir.store %[[VAL]] to %[[ALLOCA]] : !fir.ref<i32>
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }