@@ -556,3 +556,124 @@ exit: ; preds = %for.body
556
556
%add.lcssa = phi i32 [ %add , %for.body ]
557
557
ret i32 %add.lcssa
558
558
}
559
+
560
+ ; Make sure that if there are several reductions in the loop, the order of the invariant stores sunk out of the loop is preserved.
561
+ ; FIXME: This test currently shows incorrect behavior; it will be fixed in the following patch.
562
+ ; See https://github.com/llvm/llvm-project/issues/64047
563
+ define void @reduc_add_mul_store_same_ptr (ptr %dst , ptr readonly %src ) {
564
+ ; CHECK-LABEL: define void @reduc_add_mul_store_same_ptr
565
+ ; CHECK: middle.block:
566
+ ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
567
+ ; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
568
+ ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
569
+ ; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4
570
+ ; The loop below keeps an add reduction and a mul reduction over %src and stores both to %dst each iteration, add first and mul second; the expected lines above list the mul store before the add store, i.e. the reverse of the loop's store order.
571
+ entry:
572
+ br label %for.body
573
+
574
+ for.body:
575
+ %sum = phi i32 [ 0 , %entry ], [ %sum.next , %for.body ] ; running sum, starts at 0
576
+ %mul = phi i32 [ 1 , %entry ], [ %mul.next , %for.body ] ; running product, starts at 1
577
+ %iv = phi i64 [ 0 , %entry ], [ %iv.next , %for.body ] ; induction variable, counts up from 0
578
+ %gep.src = getelementptr inbounds i32 , ptr %src , i64 %iv ; address of the i32 element %src[%iv]
579
+ %0 = load i32 , ptr %gep.src , align 4 ; element fed to both reductions
580
+ %sum.next = add nsw i32 %sum , %0
581
+ store i32 %sum.next , ptr %dst , align 4 ; first store to %dst in the iteration: the sum
582
+ %mul.next = mul nsw i32 %mul , %0
583
+ store i32 %mul.next , ptr %dst , align 4 ; second store to the same %dst: the product overwrites the sum
584
+ %iv.next = add nuw nsw i64 %iv , 1
585
+ %exitcond = icmp eq i64 %iv.next , 1000 ; leave the loop after 1000 iterations
586
+ br i1 %exitcond , label %exit , label %for.body
587
+
588
+ exit:
589
+ ret void
590
+ }
591
+
592
+ define void @reduc_mul_add_store_same_ptr (ptr %dst , ptr readonly %src ) {
593
+ ; CHECK-LABEL: define void @reduc_mul_add_store_same_ptr
594
+ ; CHECK: middle.block:
595
+ ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
596
+ ; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
597
+ ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
598
+ ; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4
599
+ ; The loop below keeps a mul reduction and an add reduction over %src and stores both to %dst each iteration, mul first and add second; the expected lines above list the stores in that same mul-then-add order.
600
+ entry:
601
+ br label %for.body
602
+
603
+ for.body:
604
+ %sum = phi i32 [ 0 , %entry ], [ %sum.next , %for.body ] ; running sum, starts at 0
605
+ %mul = phi i32 [ 1 , %entry ], [ %mul.next , %for.body ] ; running product, starts at 1
606
+ %iv = phi i64 [ 0 , %entry ], [ %iv.next , %for.body ] ; induction variable, counts up from 0
607
+ %gep.src = getelementptr inbounds i32 , ptr %src , i64 %iv ; address of the i32 element %src[%iv]
608
+ %0 = load i32 , ptr %gep.src , align 4 ; element fed to both reductions
609
+ %mul.next = mul nsw i32 %mul , %0
610
+ store i32 %mul.next , ptr %dst , align 4 ; first store to %dst in the iteration: the product
611
+ %sum.next = add nsw i32 %sum , %0
612
+ store i32 %sum.next , ptr %dst , align 4 ; second store to the same %dst: the sum overwrites the product
613
+ %iv.next = add nuw nsw i64 %iv , 1
614
+ %exitcond = icmp eq i64 %iv.next , 1000 ; leave the loop after 1000 iterations
615
+ br i1 %exitcond , label %exit , label %for.body
616
+
617
+ exit:
618
+ ret void
619
+ }
620
+
621
+ ; Same as above, but the stores go to two different pointers that may alias.
622
+ ; FIXME: This test currently shows incorrect behavior; it will be fixed in the following patch.
623
+ define void @reduc_add_mul_store_different_ptr (ptr %dst1 , ptr %dst2 , ptr readonly %src ) {
624
+ ; CHECK-LABEL: define void @reduc_add_mul_store_different_ptr
625
+ ; CHECK: middle.block:
626
+ ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
627
+ ; CHECK-NEXT: store i32 [[TMP2]], ptr %dst2, align 4
628
+ ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
629
+ ; CHECK-NEXT: store i32 [[TMP4]], ptr %dst1, align 4
630
+ ; The loop below stores the add reduction to %dst1 and then the mul reduction to %dst2 each iteration (neither pointer is marked noalias); the expected lines above list the %dst2 (mul) store before the %dst1 (add) store, i.e. the reverse of the loop's store order.
631
+ entry:
632
+ br label %for.body
633
+
634
+ for.body:
635
+ %sum = phi i32 [ 0 , %entry ], [ %sum.next , %for.body ] ; running sum, starts at 0
636
+ %mul = phi i32 [ 1 , %entry ], [ %mul.next , %for.body ] ; running product, starts at 1
637
+ %iv = phi i64 [ 0 , %entry ], [ %iv.next , %for.body ] ; induction variable, counts up from 0
638
+ %gep.src = getelementptr inbounds i32 , ptr %src , i64 %iv ; address of the i32 element %src[%iv]
639
+ %0 = load i32 , ptr %gep.src , align 4 ; element fed to both reductions
640
+ %sum.next = add nsw i32 %sum , %0
641
+ store i32 %sum.next , ptr %dst1 , align 4 ; first store in the iteration: the sum goes to %dst1
642
+ %mul.next = mul nsw i32 %mul , %0
643
+ store i32 %mul.next , ptr %dst2 , align 4 ; second store in the iteration: the product goes to %dst2
644
+ %iv.next = add nuw nsw i64 %iv , 1
645
+ %exitcond = icmp eq i64 %iv.next , 1000 ; leave the loop after 1000 iterations
646
+ br i1 %exitcond , label %exit , label %for.body
647
+
648
+ exit:
649
+ ret void
650
+ }
651
+
652
+ define void @reduc_mul_add_store_different_ptr (ptr %dst1 , ptr %dst2 , ptr readonly %src ) {
653
+ ; CHECK-LABEL: define void @reduc_mul_add_store_different_ptr
654
+ ; CHECK: middle.block:
655
+ ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
656
+ ; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4
657
+ ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
658
+ ; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4
659
+ ; The loop below stores the mul reduction to %dst1 and then the add reduction to %dst2 each iteration (neither pointer is marked noalias); the expected lines above list the stores in that same order, %dst1 (mul) then %dst2 (add).
660
+ entry:
661
+ br label %for.body
662
+
663
+ for.body:
664
+ %sum = phi i32 [ 0 , %entry ], [ %sum.next , %for.body ] ; running sum, starts at 0
665
+ %mul = phi i32 [ 1 , %entry ], [ %mul.next , %for.body ] ; running product, starts at 1
666
+ %iv = phi i64 [ 0 , %entry ], [ %iv.next , %for.body ] ; induction variable, counts up from 0
667
+ %gep.src = getelementptr inbounds i32 , ptr %src , i64 %iv ; address of the i32 element %src[%iv]
668
+ %0 = load i32 , ptr %gep.src , align 4 ; element fed to both reductions
669
+ %mul.next = mul nsw i32 %mul , %0
670
+ store i32 %mul.next , ptr %dst1 , align 4 ; first store in the iteration: the product goes to %dst1
671
+ %sum.next = add nsw i32 %sum , %0
672
+ store i32 %sum.next , ptr %dst2 , align 4 ; second store in the iteration: the sum goes to %dst2
673
+ %iv.next = add nuw nsw i64 %iv , 1
674
+ %exitcond = icmp eq i64 %iv.next , 1000 ; leave the loop after 1000 iterations
675
+ br i1 %exitcond , label %exit , label %for.body
676
+
677
+ exit:
678
+ ret void
679
+ }
0 commit comments