From 04ec6544d4e0cebfabc9a55c2e4ce4d4c023100f Mon Sep 17 00:00:00 2001
From: Amr Hesham
Date: Thu, 30 Apr 2026 20:26:09 +0200
Subject: [PATCH] [CIR] Implement emitStoreThroughLValue for ExtVectorType (#194127)

Implement emitStoreThroughLValue for ExtVectorType

Issue https://github.com/llvm/llvm-project/issues/192311
---
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp          | 89 +++++++++++++++++++
 clang/lib/CIR/CodeGen/CIRGenFunction.h        |  6 +-
 clang/test/CIR/CodeGen/vector-ext-element.cpp | 63 +++++++++++++
 3 files changed, 156 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index ef1aacb9779f..2959dc567da0 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -303,6 +303,92 @@ Address CIRGenFunction::emitPointerWithAlignment(const Expr *expr,
                                  /*forPointeeType=*/true, baseInfo);
 }
 
+void CIRGenFunction::emitStoreThroughExtVectorComponentLValue(RValue src,
+                                                              LValue dst) {
+  auto getScalarSizeInBits = [&](mlir::Type ty) -> unsigned {
+    mlir::Type scalarTy = mlir::isa<cir::VectorType>(ty)
+                              ? mlir::cast<cir::VectorType>(ty).getElementType()
+                              : ty;
+    cir::CIRDataLayout dl = cgm.getDataLayout();
+    return dl.getTypeSizeInBits(scalarTy).getFixedValue();
+  };
+
+  mlir::Value srcVal = src.getValue();
+  Address dstAddr = dst.getExtVectorAddress();
+  if (getScalarSizeInBits(dstAddr.getElementType()) >
+      getScalarSizeInBits(srcVal.getType())) {
+    cgm.errorNYI(
+        dst.getPointer().getLoc(),
+        "emitStoreThroughExtVectorComponentLValue: dstTySize > srcTySize");
+    return;
+  }
+
+  if (getLangOpts().HLSL) {
+    cgm.errorNYI(dst.getPointer().getLoc(),
+                 "emitStoreThroughExtVectorComponentLValue: HLSL");
+    return;
+  }
+
+  // This access turns into a read/modify/write of the vector. Load the input
+  // value now.
+  mlir::Location loc = dst.getExtVectorPointer().getLoc();
+
+  mlir::ArrayAttr elts = dst.getExtVectorElts();
+
+  mlir::Value vec = builder.createLoad(loc, dstAddr, dst.isVolatile());
+  if (const auto *vecTy = dst.getType()->getAs<clang::VectorType>()) {
+    unsigned numSrcElts = vecTy->getNumElements();
+    unsigned numDstElts = cast<cir::VectorType>(vec.getType()).getSize();
+    if (numDstElts == numSrcElts) {
+      // Use a shuffle vector when the src and destination have the same
+      // number of elements, and restore the vector mask since it is on the
+      // side where it will be stored.
+      SmallVector<int64_t, 4> mask(numDstElts);
+      for (unsigned i = 0; i != numDstElts; ++i)
+        mask[getAccessedFieldNo(i, elts)] = i;
+
+      vec = builder.createVecShuffle(loc, srcVal, mask);
+    } else if (numDstElts > numSrcElts) {
+      // Extend the source vector to the same length and then shuffle it
+      // into the destination.
+      // FIXME: since we're shuffling with poison, can we just use the indices
+      // into that? This could be simpler.
+      SmallVector<int64_t, 4> extMask(numDstElts, -1);
+      std::iota(extMask.begin(), extMask.begin() + numSrcElts, 0);
+
+      mlir::Value extSrcVal = builder.createVecShuffle(loc, srcVal, extMask);
+
+      // Build the identity mask.
+      SmallVector<int64_t, 4> mask(numDstElts);
+      std::iota(mask.begin(), mask.begin() + numDstElts, 0);
+
+      // When the vector size is odd and .odd or .hi is used, the last element
+      // of the Elts constant array will be one past the size of the vector.
+      // Ignore the last element here, if it is greater than the mask size.
+      if ((unsigned)getAccessedFieldNo(numSrcElts - 1, elts) == mask.size())
+        numSrcElts--;
+
+      // Modify the entries that get shuffled in from the source.
+      for (unsigned i = 0; i != numSrcElts; ++i)
+        mask[getAccessedFieldNo(i, elts)] = i + numDstElts;
+
+      vec = builder.createVecShuffle(loc, vec, extSrcVal, mask);
+    } else {
+      // We should never shorten the vector.
+      llvm_unreachable("unexpected shorten vector length");
+    }
+  } else {
+    // If the src is a scalar (not a vector), and the target is a vector, it
+    // must be updating one element.
+    unsigned inIdx = getAccessedFieldNo(0, elts);
+    cir::ConstantOp elt = builder.getSInt64(inIdx, loc);
+    vec = cir::VecInsertOp::create(builder, loc, vec, srcVal, elt);
+  }
+
+  builder.createStore(loc, vec, dst.getExtVectorAddress(),
+                      dst.isVolatileQualified());
+}
+
 void CIRGenFunction::emitStoreThroughLValue(RValue src, LValue dst,
                                             bool isInit) {
   if (!dst.isSimple()) {
@@ -317,6 +403,9 @@ void CIRGenFunction::emitStoreThroughLValue(RValue src, LValue dst,
     return;
   }
 
+  if (dst.isExtVectorElt())
+    return emitStoreThroughExtVectorComponentLValue(src, dst);
+
   assert(dst.isBitField() && "Unknown LValue type");
   emitStoreThroughBitfieldLValue(src, dst);
   return;
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index aadd2ff74165..3905c154e472 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -2070,9 +2070,11 @@ public:
                          bool isInit = false, bool isNontemporal = false);
   void emitStoreOfScalar(mlir::Value value, LValue lvalue, bool isInit);
 
+  void emitStoreThroughExtVectorComponentLValue(RValue src, LValue dst);
+
   /// Store the specified rvalue into the specified
-  /// lvalue, where both are guaranteed to the have the same type, and that type
-  /// is 'Ty'.
+  /// lvalue, where both are guaranteed to have the same type, and that
+  /// type is 'Ty'.
   void emitStoreThroughLValue(RValue src, LValue dst, bool isInit = false);
 
   mlir::Value emitStoreThroughBitfieldLValue(RValue src, LValue dstresult);
diff --git a/clang/test/CIR/CodeGen/vector-ext-element.cpp b/clang/test/CIR/CodeGen/vector-ext-element.cpp
index 26c94e03a625..1d071a583f5d 100644
--- a/clang/test/CIR/CodeGen/vector-ext-element.cpp
+++ b/clang/test/CIR/CodeGen/vector-ext-element.cpp
@@ -339,3 +339,66 @@ void array_subscript_expr_with_element_expr_base() {
 // OGCG: %[[VEC_MEMBER_EXPR:.*]] = getelementptr inbounds i32, ptr %[[A_ADDR]], i64 0
 // OGCG: %[[VEC_ELEM_PTR:.*]] = getelementptr inbounds i32, ptr %[[VEC_MEMBER_EXPR]], i64 1
 // OGCG: store i32 2, ptr %[[VEC_ELEM_PTR]], align 4
+
+void store_src_dest_same_size() {
+  vi4 a;
+  vi2 b;
+  b.xy = a.xy;
+}
+
+// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a"]
+// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.vector<2 x !s32i>, !cir.ptr<!cir.vector<2 x !s32i>>, ["b"]
+// CIR: %[[TMP_A:.*]] = cir.load {{.*}} %[[A_ADDR]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+// CIR: %[[POISON:.*]] = cir.const #cir.poison : !cir.vector<4 x !s32i>
+// CIR: %[[SHUFFLE_A:.*]] = cir.vec.shuffle(%[[TMP_A]], %[[POISON]] : !cir.vector<4 x !s32i>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x !s32i>
+// CIR: %[[TMP_B:.*]] = cir.load {{.*}} %[[B_ADDR]] : !cir.ptr<!cir.vector<2 x !s32i>>, !cir.vector<2 x !s32i>
+// CIR: %[[POISON:.*]] = cir.const #cir.poison : !cir.vector<2 x !s32i>
+// CIR: %[[RESULT:.*]] = cir.vec.shuffle(%[[SHUFFLE_A]], %[[POISON]] : !cir.vector<2 x !s32i>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x !s32i>
+// CIR: cir.store {{.*}} %[[RESULT]], %[[B_ADDR]] : !cir.vector<2 x !s32i>, !cir.ptr<!cir.vector<2 x !s32i>>
+
+// LLVM: %[[A_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[B_ADDR:.*]] = alloca <2 x i32>, i64 1, align 8
+// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16
+// LLVM: %[[SHUFFLE_A:.*]] = shufflevector <4 x i32> %[[TMP_A]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+// LLVM: %[[TMP_B:.*]] = load <2 x i32>, ptr %[[B_ADDR]], align 8
+// LLVM: %[[RESULT:.*]] = shufflevector <2 x i32> %[[SHUFFLE_A]], <2 x i32> poison, <2 x i32> <i32 0, i32 1>
+// LLVM: store <2 x i32> %[[RESULT]], ptr %[[B_ADDR]], align 8
+
+// OGCG: %[[A_ADDR:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[B_ADDR:.*]] = alloca <2 x i32>, align 8
+// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16
+// OGCG: %[[SHUFFLE_A:.*]] = shufflevector <4 x i32> %[[TMP_A]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+// OGCG: %[[TMP_B:.*]] = load <2 x i32>, ptr %[[B_ADDR]], align 8
+// OGCG: %[[RESULT:.*]] = shufflevector <2 x i32> %[[SHUFFLE_A]], <2 x i32> poison, <2 x i32> <i32 0, i32 1>
+// OGCG: store <2 x i32> %[[RESULT]], ptr %[[B_ADDR]], align 8
+
+void store_src_dest_not_same_size() {
+  vi4 a;
+  vi2 b;
+  a.lo = b;
+}
+
+// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a"]
+// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.vector<2 x !s32i>, !cir.ptr<!cir.vector<2 x !s32i>>, ["b"]
+// CIR: %[[TMP_B:.*]] = cir.load {{.*}} %[[B_ADDR]] : !cir.ptr<!cir.vector<2 x !s32i>>, !cir.vector<2 x !s32i>
+// CIR: %[[TMP_A:.*]] = cir.load {{.*}} %[[A_ADDR]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+// CIR: %[[POISON:.*]] = cir.const #cir.poison : !cir.vector<2 x !s32i>
+// CIR: %[[SHUFFLE_B:.*]] = cir.vec.shuffle(%[[TMP_B]], %[[POISON]] : !cir.vector<2 x !s32i>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<-1> : !s32i, #cir.int<-1> : !s32i] : !cir.vector<4 x !s32i>
+// CIR: %[[RESULT:.*]] = cir.vec.shuffle(%[[TMP_A]], %[[SHUFFLE_B]] : !cir.vector<4 x !s32i>) [#cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i] : !cir.vector<4 x !s32i>
+// CIR: cir.store {{.*}} %[[RESULT]], %[[A_ADDR]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+
+// LLVM: %[[A_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[B_ADDR:.*]] = alloca <2 x i32>, i64 1, align 8
+// LLVM: %[[TMP_B:.*]] = load <2 x i32>, ptr %[[B_ADDR]], align 8
+// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16
+// LLVM: %[[SHUFFLE_A:.*]] = shufflevector <2 x i32> %[[TMP_B]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+// LLVM: %[[RESULT:.*]] = shufflevector <4 x i32> %[[TMP_A]], <4 x i32> %[[SHUFFLE_A]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+// LLVM: store <4 x i32> %[[RESULT]], ptr %[[A_ADDR]], align 16
+
+// OGCG: %[[A_ADDR:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[B_ADDR:.*]] = alloca <2 x i32>, align 8
+// OGCG: %[[TMP_B:.*]] = load <2 x i32>, ptr %[[B_ADDR]], align 8
+// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16
+// OGCG: %[[SHUFFLE_A:.*]] = shufflevector <2 x i32> %[[TMP_B]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+// OGCG: %[[RESULT:.*]] = shufflevector <4 x i32> %[[TMP_A]], <4 x i32> %[[SHUFFLE_A]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+// OGCG: store <4 x i32> %[[RESULT]], ptr %[[A_ADDR]], align 16
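
For reference, here is a minimal standalone example (illustrative only, not part of the patch) that exercises the same store paths as the new tests. The vi4/vi2 typedefs mirror the ones used in vector-ext-element.cpp, and the final scalar store is an assumed example of the cir.vec.insert path, which the new code handles but the added tests do not cover:

typedef int vi4 __attribute__((ext_vector_type(4)));
typedef int vi2 __attribute__((ext_vector_type(2)));

void example() {
  vi4 a;
  vi2 b;

  // Same element count on both sides: the stored rvalue is shuffled so each
  // source element lands in its accessed field, then the whole two-element
  // value is written back (read/modify/write of 'b').
  b.xy = a.xy;

  // Destination vector wider than the source: 'b' is first widened with
  // poison lanes (-1 mask entries), then blended into the loaded value of
  // 'a' by a second shuffle ([4, 5, 2, 3] in the test above) before the store.
  a.lo = b;

  // Scalar stored into a single accessed element: lowered with cir.vec.insert
  // rather than a shuffle (assumed path; not covered by the new tests).
  a.x = 1;
}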