@@ -211,10 +211,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vmovq %xmm1, 16(%rcx)
@@ -228,10 +228,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0
 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-FP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -245,10 +245,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-FCP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -262,10 +262,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT: vmovq %xmm1, 16(%rcx)
@@ -279,10 +279,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512-FCP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -296,10 +296,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-NEXT: vmovq %xmm1, 16(%rcx)
@@ -313,10 +313,10 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,ymm2[2,3,10,11],zero,zero,ymm2[4,5,12,13,20,21],zero,zero,zero,zero,ymm2[22,23,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-FCP-NEXT: vmovq %xmm1, 16(%rcx)
@@ -330,12 +330,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
-; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vmovq %xmm1, 16(%rcx)
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
+; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512BW-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512BW-NEXT: vmovdqa %xmm2, (%rcx)
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
@@ -345,12 +344,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-FCP-NEXT: vmovq %xmm1, 16(%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rcx)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
@@ -360,12 +358,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-BW-NEXT: vmovq %xmm1, 16(%rcx)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rcx)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
@@ -375,12 +372,11 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, 16(%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,4,16,1,5,17,2,6,18,3,7,19,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, 16(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
   %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64